% FaultSight workshop paper, FTXS 2019 (held in conjunction with SC 2019).
% The citation key is an opaque hash; it is kept unchanged so existing citations still resolve.
% The full proceedings title lives in booktitle; it was previously split between booktitle and series.
% NOTE(review): address holds a country rather than the publisher's city -- confirm and replace.
@inproceedings{0dfde8a337ff4741b8cd8501f04273ce,
  author    = {Horn, Einar and Fulp, Dakota and Calhoun, Jon and Olson, Luke},
  title     = {{FaultSight}: A fault analysis tool for {HPC} researchers},
  booktitle = {Proceedings of {FTXS} 2019: Fault Tolerance for {HPC} at {eXtreme} Scale Workshop - Held in conjunction with {SC} 2019: The International Conference for High Performance Computing, Networking, Storage and Analysis},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2019},
  month     = nov,
  pages     = {21--30},
  doi       = {10.1109/FTXS49593.2019.00008},
  keywords  = {Fault-analysis, Fault-analysis-tool, Fault-injection, Fault-tolerance, Resiliency, Soft-error-analysis},
  abstract  = {System reliability is expected to be a significant challenge for future extreme-scale systems. Poor reliability results in a higher frequency of interruptions in high-performance computer (HPC) applications due to system/application crashes or data corruption due to soft errors. In response, application level error detection and recovery schemes are devised to mitigate the impact of these interruptions. Evaluating these schemes and the reliability of an application requires the analysis of thousands of fault injection trials, resulting in tedious and time-consuming process. Furthermore, there is no one data analysis tool that can work with all of the fault injection frameworks currently in use. In this paper, we present FaultSight, a fault injection analysis tool capable of efficiently assisting in the analysis of HPC application reliability as well as the effectiveness of resiliency schemes. FaultSight is designed to be flexible and work with data coming from a variety of fault injection frameworks. The effectiveness of FaultSight is demonstrated by exploring the reliability of different versions of the Matrix-Matrix Multiplication kernel using two different fault injection tools. In addition, the detection and recovery schemes are highlighted for the HPCCG mini-app.},
  language  = {English (US)},
  note      = {Publisher Copyright: {\textcopyright} 2019 IEEE.; 9th IEEE/ACM Workshop on Fault Tolerance for HPC at eXtreme Scale, FTXS 2019 ; Conference date: 22-11-2019},
}