% Brown Dog paper, PEARC 2018 (ACM ICPS). The citation key is kept as exported
% so existing citations keep resolving.
% NOTE(review): the exported author list contained a truncated name,
% "M. Christopher"; it is restored below as "Navarro, Christopher M." --
% please confirm against the publisher record for the DOI.
% NOTE(review): the conference dates in the note field said 2017; corrected to
% 2018 to agree with the year/month/day fields and the PEARC 2018 event name.
@inproceedings{98b15d79a1ec41169f100ea264ffed4b,
  author    = {Satheesan, Sandeep Puthanveetil and
               Galewsky, Benjamin and
               Lee, Jong and
               Navarro, Christopher M. and
               Zhang, Bing and
               Alameda, Jay and
               Jansen, Gregory and
               Marciano, Richard and
               Schmidt, Arthur and
               Zhao, Yan and
               Bradley, Shannon and
               Kooper, Rob and
               Marini, Luigi and
               Slavenas, Marcus and
               Zharnitsky, Inna and
               Dietze, Michael and
               Kumar, Praveen and
               Minsker, Barbara S. and
               Sullivan, William C. and
               McHenry, Kenton},
  title     = {{Brown Dog}: Making the Digital World a Better Place, a Few Files at a Time},
  booktitle = {Practice and Experience in Advanced Research Computing 2018},
  series    = {ACM International Conference Proceeding Series},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2018},
  month     = jul,
  day       = {22},
  isbn      = {9781450364461},
  doi       = {10.1145/3219104.3219132},
  language  = {English (US)},
  keywords  = {API gateway, Auto-curation, Big data, Data conversion, Data curation, Data cyberinfrastructure, Data transformation, Data wrangling, Metadata extraction, Orchestration, Provenance, Unstructured data},
  abstract  = {Brown Dog is a data transformation service for auto-curation of long-tail data. In this digital age, we have more data available for analysis than ever and this trend will only increase. According to most estimates, 70-80\% of this data is unstructured, and together with unsupported data formats and inaccessible software tools, in essence, this data is not either easily accessible or usable to its owners in a meaningful way. Brown Dog aims at making this data more accessible and usable by auto-curation and indexing, leveraging existing and novel data transformation tools. In this paper, we discuss the recent major component improvements to Brown Dog including transformation tools called extractors and converters; desktop, web and terminal-based clients which perform data transformations; libraries written in multiple programming languages which integrate with existing software and extend their data curation capabilities; an online tool store for users to contribute, manage and share data transformation tools and receive credit for developing them; cyberinfrastructure for deploying the system on diverse computing platforms leveraging scalability via Docker swarm; workflow management service for creatively integrating existing transformations to generate custom, reproducible workflows which meet research needs, and its data management capabilities. This paper also discusses data transformation tools developed to support some scientific and allied use cases, thereby benefiting researchers in diverse domains. Finally, we briefly discuss our future directions with regard to production deployments as well as how users can access Brown Dog to manage their un-curated unstructured data.},
  note      = {Funding Information: This material is based upon work supported by the National Science Foundation under Grant No. ACI-1261582. Publisher Copyright: {\textcopyright} 2018 Copyright held by the owner/author(s). Publication rights licensed to the Association for Computing Machinery. 2018 Practice and Experience in Advanced Research Computing Conference: Seamless Creativity, PEARC 2018; Conference date: 22-07-2018 through 26-07-2018},
}