% LogDiver paper in the FTXS 2015 workshop proceedings (part of HPDC 2015), pp. 11--18.
% The proceedings title is carried by booktitle only; it is not a publication
% series, so no series field is given. address is the publisher's location.
% Words in title/booktitle that must keep their capitalisation are braced.
@inproceedings{e54d6ba2fdd14718ae6c3f59b192e269,
title = "{LogDiver}: A tool for measuring resilience of extreme-scale systems and applications",
abstract = "This paper presents LogDiver, a tool for the analysis of application-level resiliency in extreme-scale computing systems. The tool has been implemented to handle data generated by system monitoring tools in Blue Waters, the petascale machine in production at the University of Illinois' National Center for Supercomputing Applications. The tool is able: i) to filter, extract, and classify error data from different sources of information, such as system logs, hardware sensors and workload logs; ii) to extract signals from the categorized errors; iii) to consolidate user application data and decode application and job exit status, highlighting the reasons for the application/job exit; and iv) to correlate application failures with errors using a mix of empirical and analytical techniques. To the best of our knowledge, this is the first tool capable of measuring application-level resiliency in extreme-scale machines. We also demonstrate the power of the tool by showing that XK applications are more vulnerable to failures when compared to XE applications.",
keywords = "B.8.1 [Performance and Reliability]: Reliability, Fault-Tolerance - HPC applications; Log Analysis, Testing",
author = "{Di Martino}, Catello and Jha, Saurabh and Kramer, William T. and Kalbarczyk, Zbigniew T. and Iyer, Ravishankar K.",
note = "Publisher Copyright: {\textcopyright} 2015 ACM.; 5th Workshop on Fault Tolerance for HPC at eXtreme Scale, FTXS 2015 ; Conference date: 15-06-2015",
year = "2015",
month = jun,
day = "15",
doi = "10.1145/2751504.2751511",
language = "English (US)",
publisher = "Association for Computing Machinery",
pages = "11--18",
booktitle = "{FTXS} 2015 - Proceedings of the 2015 Workshop on Fault Tolerance for {HPC} at {eXtreme} Scale, Part of {HPDC} 2015",
address = "New York, NY, USA",
}