% Conference paper entry. Funding and copyright text is kept in non-standard
% fields (ignored by standard styles) rather than in "note", which styles
% print verbatim in the rendered bibliography.
@inproceedings{ec339478e3b0422696e250dc6b9f8f6a,
  author     = {Calhoun, Jon and Snir, Marc and Olson, Luke N. and Gropp, William D.},
  title      = {Towards a more complete understanding of {SDC} propagation},
  booktitle  = {{HPDC} 2017 - Proceedings of the 26th International Symposium on High-Performance Parallel and Distributed Computing},
  eventtitle = {26th {ACM} International Symposium on High-Performance Parallel and Distributed Computing, {HPDC} 2017},
  eventdate  = {2017-06-26/2017-06-30},
  pages      = {131--142},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  year       = {2017},
  month      = jun,
  day        = {26},
  doi        = {10.1145/3078597.3078617},
  language   = {English (US)},
  keywords   = {Error Detection, Error Propagation, Error Recovery, Reliability, Silent Data Corruption},
  abstract   = {With the rate of errors that can silently effect an application's state/output expected to increase on future HPC machines, numerous application-level detection and recovery schemes have been proposed. Recovery is more efficient when errors are contained and affect only part of the computation's state. Containment is usually achieved by verifying all information leaking out of a statically defined containment domain, which is an expensive procedure. Alternatively, error propagation can be analyzed to bound the domain that is affected by a detected error. This paper investigates how silent data corruption (SDC) due to soft errors propagates through three HPC applications: HPCCG, Jacobi, and CoMD. To allow for more detailed view of error propagation, the paper tracks propagation at the instruction and application variable level. The impact of detection latency on error propagation is shown along with an application's ability to recover. Finally, the impact of compiler optimizations are explored along with the impact of local problem size on error propagation.},
  funding    = {We would like to our reviewers and shepherd for their helpful and insightful comments on improving the quality of this paper. This work was sponsored by the Air Force Office of Scientific Research under grant FA9550-12-1-0478. This work was supported in part by the Office of Advanced Scientific Computing Research, Office of Science, U.S. Department of Energy award DE-FG02-13ER26138/DE-SC0010049. This material is based upon work supported by the National Science Foundation under Grant No. SHF-1617488. This research is part of the Blue Waters sustained-petascale computing project, which is supported by the National Science Foundation (awards OCI-0725070 and ACI-1238993) and the state of Illinois. Blue Waters is a joint effort of the University of Illinois at Urbana-Champaign and its National Center for Supercomputing Applications.},
  copyright  = {{\textcopyright} 2017 Association for Computing Machinery.},
}