% Workshop paper (DIDC 2016) introducing IGen, the Illinois Genomics Execution
% Environment. The citation key is the exporter-generated identifier and is
% kept unchanged so that existing \cite commands continue to resolve.
% The funding acknowledgement is stored in the non-standard "funding" field,
% which standard styles ignore, so it is not printed in the reference list.
% "address" holds the publisher's location, not the conference venue.
@inproceedings{c4f664bebe0248a9bed35f07c9ce5b4a,
  author    = {Banerjee, Subho S. and Athreya, Arjun P. and Mainzer, Liudmila S. and Jongeneel, C. Victor and Hwu, Wen Mei and Kalbarczyk, Zbigniew T. and Iyer, Ravishankar K.},
  title     = {Efficient and scalable workflows for genomic analyses},
  booktitle = {{DIDC} 2016 -- Proceedings of the {ACM} International Workshop on Data-Intensive Distributed Computing},
  pages     = {27--36},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2016},
  month     = jun,
  day       = {1},
  doi       = {10.1145/2912152.2912156},
  language  = {English (US)},
  keywords  = {Bioinformatics, Design, Genomics, Measurement, Performance},
  abstract  = {Recent growth in the volume of DNA sequence data and the associated computational costs of extracting meaningful information warrant the need for efficient computational systems at scale. In this work, we propose the Illinois Genomics Execution Environment (IGen), a framework for efficient and scalable genome analyses. The design philosophy of IGen is based on algorithmic analysis and extensive measurements on compute- and data-intensive genomic analyses workflows (such as variant discovery and genotyping analysis) executed on high-performance and cloud computing infrastructures. IGen leverages the advantages of existing designs and proposes new software improvements to overcome the inefficiencies we observe in our measurements. Based on these composite improvements, we demonstrate that IGen is able to accelerate the alignment from 13.1 hours to 10.8 hours (1.2×) and the variant calling from 10.1 hours to 1.25 hours (8×) on a single node, and its modular design scales efficiently in a parallel computing environment.},
  funding   = {This research was supported by several grants: in part by the National Science Foundation under Grant No. CNS 13-37732; in part by the Blue Waters sustained-petascale computing project supported by the National Science Foundation (awards OCI-0725070 and ACI-1238993) and the state of Illinois; and in part by IBM Faculty Awards to Profs. Hwu and Iyer. We gratefully acknowledge the support of Dr. Volodymyr Kindratenko and the Innovative Systems Laboratory at NCSA for facilitating our profiling experiments.},
  note      = {6th ACM International Workshop on Data-Intensive Distributed Computing, DIDC 2016; Conference date: 01-06-2016},
}