@inproceedings{bcfa4dfe3d8845f59d99f57b41f75036,
  author    = {Hu, Yuerong and Jiang, Ming and Underwood, Ted and Downie, J. Stephen},
  title     = {Improving Digital Libraries' Provision of Digital Humanities Datasets: A Case Study of {HTRC} Literature Dataset},
  booktitle = {{JCDL} 2020 - Proceedings of the {ACM/IEEE} Joint Conference on Digital Libraries in 2020},
  series    = {Proceedings of the {ACM/IEEE} Joint Conference on Digital Libraries},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  pages     = {405--408},
  year      = {2020},
  month     = aug,
  day       = {1},
  doi       = {10.1145/3383583.3398621},
  language  = {English (US)},
  keywords  = {Cultural analytics, Datasets, Digital humanities, Digital libraries},
  abstract  = {This paper investigates the limitations and challenges of the curated datasets provided by digital libraries in support of digital humanities research. This presented work provides a use case utilizing an English literature dataset of 178,381 volumes curated by the HathiTrust Research Center (HTRC) for measuring the change of three literature genres. These volumes were selected from over 17 million digitized items in the HathiTrust Digital Library. We demonstrate our methods and workflow for improving the representativeness and scholarly usability of the existing datasets. We analyzed and effectively overcame three common limitations: duplicate volumes, uneven distribution of data and OCR errors. We suggest that stakeholders of digital libraries should flag and address these limitations to improve their provisions' usability in the context of digital humanities research.},
  note      = {Publisher Copyright: {\textcopyright} 2020. ACM ISBN.; 2020 ACM/IEEE-CS Joint Conference on Digital Libraries, JCDL 2020 ; Conference date: 01-08-2020 Through 05-08-2020},
}