@inproceedings{49c2884f96a9493daeb9efa5cb7fdedf,
title = "EMRQA: A large corpus for question answering on electronic medical records",
abstract = "We propose a novel methodology to generate domain-specific large-scale question answering (QA) datasets by re-purposing existing annotations for other NLP tasks. We demonstrate an instance of this methodology in generating a large-scale QA dataset for electronic medical records by leveraging existing expert annotations on clinical notes for various NLP tasks from the community shared i2b2 datasets§. The resulting corpus (emrQA) has 1 million questions-logical form and 400,000+ question-answer evidence pairs. We characterize the dataset and explore its learning potential by training baseline models for question to logical form and question to answer mapping.",
author = "Anusri Pampari and Preethi Raghavan and Jennifer Liang and Jian Peng",
note = "Funding Information: This project is partially funded by Sloan Research Fellowship, PhRMA Foundation Award in Informatics, and NSF Career Award (1652815). The authors would like to thank Siddharth Patwardhan for his valuable feedback in formatting the paper. Publisher Copyright: {\textcopyright} 2018 Association for Computational Linguistics; 2018 Conference on Empirical Methods in Natural Language Processing, EMNLP 2018 ; Conference date: 31-10-2018 Through 04-11-2018",
year = "2018",
language = "English (US)",
series = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, EMNLP 2018",
publisher = "Association for Computational Linguistics",
pages = "2357--2368",
editor = "Ellen Riloff and David Chiang and Julia Hockenmaier and Jun'ichi Tsujii",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, EMNLP 2018",
}