% Conference paper in the JCDL 2019 proceedings.
% Funding and copyright text are kept in the non-standard fields `funding`
% and `copyright`, which standard styles ignore, so they do not print in the
% reference list. The conference name and dates are in `eventtitle` and
% `eventdate`. `internal-note` is an ignored field holding reviewer TODOs.
@inproceedings{bcbb310918d642a28c249cb9a05dea1b,
  author        = {Lu{\v c}i{\'c}, Ana and Burke, Robin and Shanahan, John},
  title         = {Unsupervised clustering with smoothing for detecting paratext boundaries in scanned documents},
  editor        = {Bonn, Maria and Wu, Dan and Downie, Stephen J. and Martaus, Alain},
  booktitle     = {Proceedings - 2019 {ACM/IEEE} Joint Conference on Digital Libraries, {JCDL} 2019},
  series        = {Proceedings of the {ACM/IEEE} Joint Conference on Digital Libraries},
  pages         = {53--56},
  publisher     = {Institute of Electrical and Electronics Engineers Inc.},
  address       = {United States},
  year          = {2019},
  month         = jun,
  doi           = {10.1109/JCDL.2019.00018},
  language      = {English (US)},
  eventtitle    = {19th {ACM/IEEE} Joint Conference on Digital Libraries, {JCDL} 2019},
  eventdate     = {2019-06-02/2019-06-06},
  keywords      = {Digital libraries, Non consumptive analytics, Text mining},
  abstract      = {Digital humanities scholars are developing new techniques of literary study using non-consumptive processing of large collections of scanned text. A crucial step in working with such collections is to separate the main text of a work from the surrounding paratext, the content of which may distort word counts, location references, sentiment scores, and other important outputs. Simple heuristic methods have been devised, but are not accurate for some texts and some methodological needs. This study describes a method for paratext detection based on smoothed unsupervised clustering. We show that this method is more accurate than simple heuristics, especially for non-fiction works, and edited works with larger amounts of paratext. We also show that a more accurate detection of paratext boundaries improves the accuracy of subsequent text processing, as exemplified by a readability metric.},
  funding       = {This work was supported in part by the National Endowment for the Humanities (Grant DH-248600-16). The authors would also like to thank the HathiTrust Research Center for support of this work, and members of the Reading Chicago Reading project for their many contributions.},
  copyright     = {{\textcopyright} 2019 IEEE.},
  internal-note = {TODO confirm against the published proceedings: (1) given-name order of editor Downie, exported as "Stephen J."; (2) address holds a country, not the publisher's city.},
}