% Article record: "Navigating the storm" (Digital Scholarship in the Humanities, 2017).
% Title acronyms are brace-protected so sentence-casing styles keep their capitalisation.
% The note field carries funding text as exported; LaTeX specials in it are escaped so it
% typesets. NOTE(review): this note prints in the bibliography under standard styles --
% confirm that is wanted, or move the text to a non-printing field such as annote.
@article{acf827cb769e490cb419928042e8bce8,
title = "Navigating the storm: {IMPACT}, {eMOP}, and agile steering standards",
abstract = "This article discusses two major initiatives tasked with developing tools to improve optical character recognition (OCR) or the mechanical keying of texts that are digitally available only as page images. The two initiatives are the IMProving ACcess to Text Project in Europe and the Early Modern OCR Project in the USA. Because of dealing with a multilayered problem like OCR technologies and having to collaborate with radically interdisciplinary and international team members, the two projects developed techniques that we call Agile Project Management, outlined in this essay with rationales for their use.",
author = "Mandell, {Laura C.} and Neudecker, Clemens and Antonacopoulos, Apostolos and Grumbach, Elizabeth and Auvil, Loretta and Christy, {Matthew J.} and Heil, {Jacob A.} and Samuelson, Todd",
note = "Funding Information: Critical to the processing and analysis of the several thousands of document pages in the IMPACT data set was the new Aletheia software tool (Clausner et al., 2011). Aletheia started from the idea of creating a semi-automated layout and text correction tool and was developed into a fully functional, complete document-analysis and recognition toolkit, now used by several groups and commercial organizations. Aletheia enables the complete analysis of the image content of a document page, including pixel-based enhancement of a region (e.g. paragraph, word, glyph), and manual entry or automated recognition (via Tesseract) of the textual content. It also allows the annotation of any entity on the page and its detailed description. Following the completion of the IMPACT project, Aletheia has been in continuous development and has been used by eMOP, as will be fully described below. eMOP ran from 2012 to 2015. In Fall 2012, Texas A\&M University received a \$734,000 grant from the Andrew W. Mellon Foundation for eMOP. eMOP{\textquoteright}s objective was to make machine readable, or improve the readability for, 45 million pages of text from two major proprietary databases: Eighteenth Century Collections Online and Early English Books Online. Generally, eMOP intends to improve the visibility of early modern texts by making their contents fully searchable. The current paradigm of searching special collections for early modern materials by either metadata alone or {\textquoteleft}dirty{\textquoteright} OCR is inefficient for scholarly research (Mandell 2013). In the grant document, we described eMOP{\textquoteright}s main deliverables: {\textbullet} We intend to publish an open-source OCR workflow at grant end. This workflow will contain access to an early modern font database, customization guidelines for the Tesseract OCR engine, post-processing and diagnostic algorithms, and crowdsourcing and {\textquoteleft}scholar-sourcing{\textquoteright} (to use Brian Geiger{\textquoteright}s phrase) correction tools.",
year = "2017",
month = apr,
day = "1",
doi = "10.1093/llc/fqv062",
language = "English (US)",
volume = "32",
pages = "189--194",
journal = "Digital Scholarship in the Humanities",
issn = "2055-7671",
publisher = "Oxford University Press",
number = "1",
}