@comment{
  ICASSP 2025 conference paper by Tavares, Ayres, Wang and Smaragdis.
  Conference name and dates are stored in eventtitle/eventdate (ISO 8601);
  styles that do not know these fields ignore them.
  The note field holds only the funding acknowledgement.
}
@inproceedings{f7fbec0121ae4e5cb751304e420ade19,
  author     = {Tavares, Tiago Fernandes and Ayres, Fabio Jos{\'e} and Wang, Zhepei and Smaragdis, Paris},
  title      = {On Class Separability Pitfalls In Audio-Text Contrastive Zero-Shot Learning},
  editor     = {Rao, Bhaskar D. and Trancoso, Isabel and Sharma, Gaurav and Mehta, Neelesh B.},
  booktitle  = {2025 {IEEE} International Conference on Acoustics, Speech, and Signal Processing, {ICASSP} 2025 - Proceedings},
  series     = {{ICASSP}, {IEEE} International Conference on Acoustics, Speech and Signal Processing - Proceedings},
  publisher  = {Institute of Electrical and Electronics Engineers Inc.},
  address    = {United States},
  year       = 2025,
  month      = apr,
  doi        = {10.1109/ICASSP49660.2025.10888245},
  eventtitle = {2025 {IEEE} International Conference on Acoustics, Speech, and Signal Processing, {ICASSP} 2025},
  eventdate  = {2025-04-06/2025-04-11},
  language   = {English (US)},
  keywords   = {Audio-Text Multimodal Embedding, Contrastive Learning, Data Leakage, Pre-training, Zero-shot Learning},
  abstract   = {Recent advances in audio-text cross-modal contrastive learning have shown its potential towards zero-shot learning. One possibility for this is by projecting item embeddings from pre-trained backbone neural networks into a cross-modal space in which item similarity can be calculated in either domain. This process relies on a strong unimodal pretraining of the backbone networks, and on a data-intensive training task for the projectors. These two processes can be biased by unintentional data leakage, which can arise from using supervised learning in pretraining or from inadvertently training the cross-modal projection using labels from the zero-shot learning evaluation. In this study, we show that a significant part of the measured zero-shot learning accuracy is due to strengths inherited from the audio and text backbones, that is, they are not learned in the cross-modal domain and are not transferred from one modality to another.},
  note       = {Authors thank the Insper-UIUC Research Partnership for funding.},
}