RIPE demonstrates that keypoint detection and description can be learned with reinforcement learning from image pairs only: no depth, no pose, and no artificial augmentation are required.
RIPE demonstrates that keypoint detection and description can be learned using only image pairs. A positively labeled image pair (i.e., one in which a sufficient number of the underlying 3D scene points appear in both images) contains enough implicit information to guide the learning process. Leveraging the epipolar constraint prevents collapse to trivial solutions and provides a strong reward signal. Despite relying on this much weaker training signal, RIPE performs on par with fully supervised extractors.
% arXiv preprint. Only the acronym in the title is brace-protected so that
% bibliography styles remain free to apply their own casing to the rest.
@article{ripe2025,
  author        = {K{\"u}nzel, Johannes and Hilsmann, Anna and Eisert, Peter},
  title         = {{RIPE}: Reinforcement Learning on Unlabeled Image Pairs for Robust Keypoint Extraction},
  journal       = {arXiv},
  year          = {2025},
  eprint        = {2507.04839},
  archiveprefix = {arXiv},
}