@inproceedings{24cc5778dcc74a95a8aa7bd98719f571,
  title     = {Semantic Aligned Multi-modal Transformer for Vision-Language Understanding: A Preliminary Study on Visual {QA}},
  author    = {Ding, Han and Li, {Li Erran} and Hu, Zhiting and Xu, Yi and Hakkani-Tur, Dilek and Du, Zheng and Zeng, Belinda},
  editor    = {Zadeh, Amir and Morency, Louis-Philippe and Liang, {Paul Pu} and Ross, Candace and Salakhutdinov, Ruslan and Poria, Soujanya and Cambria, Erik and Shi, Kelly},
  booktitle = {Multimodal Artificial Intelligence, MAI Workshop 2021 - Proceedings of the 3rd Workshop},
  series    = {Multimodal Artificial Intelligence, MAI Workshop 2021 - Proceedings of the 3rd Workshop},
  publisher = {Association for Computational Linguistics (ACL)},
  year      = {2021},
  month     = jun,
  pages     = {74--78},
  doi       = {10.18653/v1/2021.maiworkshop-1.11},
  language  = {English (US)},
  abstract  = {Recent vision-language understanding approaches adopt a multi-modal transformer pre-training and finetuning paradigm. Prior work learns representations of text tokens and visual features with cross-attention mechanisms and captures the alignment solely based on indirect signals. In this work, we propose to enhance the alignment mechanism by incorporating image scene graph structures as the bridge between the two modalities, and learning with new contrastive objectives. In our preliminary study on the challenging compositional visual question answering task, we show the proposed approach achieves improved results, demonstrating potentials to enhance vision-language understanding.},
  note      = {Publisher Copyright: {\textcopyright} 2021 Association for Computational Linguistics; 3rd NAACL Workshop on Multimodal Artificial Intelligence, MAI Workshop 2021; Conference date: 06-06-2021},
}