% Conference paper on word-spotting search over digitized handwritten archives,
% published in the 2012 SC Companion proceedings.
% The opaque hexadecimal citation key is kept unchanged so existing citations
% still resolve. The full proceedings title lives in booktitle; acronyms are
% brace-protected so sentence-casing styles do not lowercase them.
@inproceedings{d27890645ebd4781a6a8e52027b2ad19,
  author    = {Diesendruck, Liana and Marini, Luigi and Kooper, Rob and Kejriwal, Mayank and McHenry, Kenton},
  title     = {Digitization and search: A non-traditional use of {HPC}},
  booktitle = {Proceedings - 2012 {SC} Companion: High Performance Computing, Networking Storage and Analysis, {SCC} 2012},
  pages     = {1460--1462},
  year      = {2012},
  doi       = {10.1109/SC.Companion.2012.259},
  isbn      = {9780769549569},
  language  = {English (US)},
  keywords  = {Big Data, Digitization, Indexing Text},
  note      = {Conference date: 10--16 November 2012},
  abstract  = {We describe our efforts to provide a form of automated search of handwritten content for digitized document archives. To carry out the search we use a computer vision technique called word spotting. A form of content based image retrieval, it avoids the still difficult task of directly recognizing text by allowing a user to search using a query image containing handwritten text and ranking a database of images in terms of those that contain more similar looking content. In order to make this search capability available on an archive three computationally expensive pre-processing steps are required. We augment this automated portion of the process with a passive crowd sourcing element that mines queries from the systems users in order to then improve the results of future queries. We benchmark the proposed framework on 1930s Census data, a collection of roughly 3.6 million forms and 7 billion individual units of information.},
}