% Conference paper (ECCV 2024, Springer LNCS series): audio-visual egocentric gaze anticipation.
% Conventions used in this entry:
%   - personal names are stored as "Last, First" so name parsing is unambiguous;
%   - literal percent signs in free-text fields are written as \% so they cannot
%     act as TeX comment characters when a style prints the field;
%   - acronyms that must keep their case are brace-protected.
% NOTE(review): address holds a country name; BibTeX expects the publisher's city -- confirm against the publisher record.
% NOTE(review): the LNCS volume number is not recorded here -- add a volume field once verified.
@inproceedings{1bf439745dda4c91866ca96d455bd99c,
  author    = {Lai, Bolin and Ryan, Fiona and Jia, Wenqi and Liu, Miao and Rehg, James M.},
  title     = {Listen to Look Into the Future: Audio-Visual Egocentric Gaze Anticipation},
  booktitle = {Computer Vision -- {ECCV} 2024 - 18th European Conference, Proceedings},
  editor    = {Leonardis, Ale{\v s} and Ricci, Elisa and Roth, Stefan and Russakovsky, Olga and Sattler, Torsten and Varol, G{\"u}l},
  series    = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
  pages     = {192--210},
  publisher = {Springer},
  address   = {Germany},
  year      = {2025},
  doi       = {10.1007/978-3-031-72673-6_11},
  isbn      = {9783031726729},
  language  = {English (US)},
  keywords  = {Audio-Visual Learning, Egocentric Vision, Gaze Behavior},
  note      = {Portions of this work were supported in part by a gift from Meta and a grant from the Toyota Research Institute University 2.0 program. The second author is supported by an NSF Graduate Research Fellowship.; 18th European Conference on Computer Vision, ECCV 2024 ; Conference date: 29-09-2024 Through 04-10-2024},
  abstract  = {Egocentric gaze anticipation serves as a key building block for the emerging capability of Augmented Reality. Notably, gaze behavior is driven by both visual cues and audio signals during daily activities. Motivated by this observation, we introduce the first model that leverages both the video and audio modalities for egocentric gaze anticipation. Specifically, we propose a Contrastive Spatial-Temporal Separable (CSTS) fusion approach that adopts two modules to separately capture audio-visual correlations in spatial and temporal dimensions, and applies a contrastive loss on the re-weighted audio-visual features from fusion modules for representation learning. We conduct extensive ablation studies and thorough analysis using two egocentric video datasets: Ego4D and Aria, to validate our model design. We demonstrate that audio improves the performance by +2.5\% and +2.4\% on the two datasets. Our model also outperforms the prior state-of-the-art methods by at least +1.9\% and +1.6\%. Moreover, we provide visualizations to show the gaze anticipation results and share additional insights into audio-visual representation learning. The code and data split are available on our website (https://bolinlai.github.io/CSTS-EgoGazeAnticipation/).},
}