% Jiang et al. (2021), JCDL 2021 conference paper: evaluates how OCR noise
% affects BERT's encoding of semantic features of digitized library texts.
% The copyright / eventtitle / eventdate fields carry the publisher notice and
% conference details; standard classic BibTeX styles ignore these fields, so
% they are retained as metadata without being printed as a free-form note.
@inproceedings{8f0109f472b3414c8452e1b10b8c5972,
  author     = {Jiang, Ming and Hu, Yuerong and Worthey, Glen and Dubnicek, Ryan C. and Underwood, Ted and Downie, J. Stephen},
  title      = {Evaluating {BERT's} Encoding of Intrinsic Semantic Features of {OCR'd} Digital Library Collections},
  booktitle  = {Proceedings - 2021 {ACM/IEEE} Joint Conference on Digital Libraries, {JCDL} 2021},
  editor     = {Downie, J. Stephen and McKay, Dana and Suleman, Hussein and Nichols, David M. and Poursardar, Faryaneh},
  series     = {Proceedings of the {ACM/IEEE} Joint Conference on Digital Libraries},
  publisher  = {Institute of Electrical and Electronics Engineers Inc.},
  address    = {United States},
  pages      = {308--309},
  year       = {2021},
  doi        = {10.1109/JCDL52503.2021.00045},
  language   = {English (US)},
  keywords   = {BERT Evaluation, Data Curation, Digital Humanities, Digital Libraries, HathiTrust, Intrinsic Semantic Features, Optical Character Recognition, Parallel Corpus, Word Embeddings},
  abstract   = {The uncertainty caused by optical character recognition (OCR) noise has been a primary barrier for digital libraries (DL) to promote their curated datasets for research purposes, particularly when the datasets are fed into advanced language models with less transparency. To shed some light on this issue, this study evaluates the impacts of OCR noise on BERT models for encoding the intrinsic semantic features of OCR'd texts. Specifically, we encoded chapterwise paired OCR'd texts and their cleaned counterparts extracted from books in six domains using BERT pre-trained and fine-tune models respectively. Given the encoded text features, we further calculated the cosine similarity between any two chapters and used normalized discounted cumulative gain (NDCG) [1] to measure BERT variants' capabilities to preserve narrative coherence and semantic relevance among texts. Our empirical results show that (1) BERT embeddings can encode and preserve texts' intrinsic semantic features (i.e., relevance and coherence); and (2) such capabilities are comparatively robust against OCR noise. This should help alleviate some DL users' concerns regarding applying contextualized word embeddings to encode chapter-level or even document-level OCR'd text information, which benefits promoting scholarly use of DL collections. Our research also demonstrates how texts' intrinsic semantic features can be used for evaluating the impacts of OCR noise on advanced language models, which is an underdeveloped and promising direction for future work.},
  copyright  = {{\textcopyright} 2021 IEEE},
  eventtitle = {21st {ACM/IEEE} Joint Conference on Digital Libraries, {JCDL} 2021},
  eventdate  = {2021-09-27/2021-09-30},
}