% EMNLP 2018 conference paper by Pampari et al. introducing the emrQA corpus.
@inproceedings{49c2884f96a9493daeb9efa5cb7fdedf,
  author    = {Pampari, Anusri and Raghavan, Preethi and Liang, Jennifer and Peng, Jian},
  title     = {{emrQA}: A Large Corpus for Question Answering on Electronic Medical Records},
  abstract  = {We propose a novel methodology to generate domain-specific large-scale question answering (QA) datasets by re-purposing existing annotations for other NLP tasks. We demonstrate an instance of this methodology in generating a large-scale QA dataset for electronic medical records by leveraging existing expert annotations on clinical notes for various NLP tasks from the community shared i2b2 datasets. The resulting corpus (emrQA) has 1 million questions-logical form and 400,000+ question-answer evidence pairs. We characterize the dataset and explore its learning potential by training baseline models for question to logical form and question to answer mapping.},
  editor    = {Riloff, Ellen and Chiang, David and Hockenmaier, Julia and Tsujii, Jun'ichi},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, {EMNLP} 2018},
  publisher = {Association for Computational Linguistics},
  pages     = {2357--2368},
  year      = {2018},
  language  = {English (US)},
  note      = {Publisher Copyright: {\textcopyright} 2018 Association for Computational Linguistics; 2018 Conference on Empirical Methods in Natural Language Processing, EMNLP 2018; Conference date: 31-10-2018 Through 04-11-2018},
}