% Conference paper from the HPDC 2017 proceedings (ACM).
% The proceedings name is stored once, in booktitle; address is the
% publisher's location. The abstract and note are kept verbatim.
@inproceedings{ec339478e3b0422696e250dc6b9f8f6a,
  author    = {Calhoun, Jon and Snir, Marc and Olson, Luke N. and Gropp, William D.},
  title     = {Towards a More Complete Understanding of {SDC} Propagation},
  booktitle = {{HPDC} 2017 - Proceedings of the 26th International Symposium on High-Performance Parallel and Distributed Computing},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  pages     = {131--142},
  year      = {2017},
  month     = jun,
  day       = {26},
  doi       = {10.1145/3078597.3078617},
  language  = {English (US)},
  keywords  = {Error Detection, Error Propagation, Error Recovery, Reliability, Silent Data Corruption},
  note      = {Publisher Copyright: {\textcopyright} 2017 Association for Computing Machinery.; 26th ACM International Symposium on High-Performance Parallel and Distributed Computing, HPDC 2017 ; Conference date: 26-06-2017 Through 30-06-2017},
  abstract  = {With the rate of errors that can silently effect an application's state/output expected to increase on future HPC machines, numerous application-level detection and recovery schemes have been proposed. Recovery is more efficient when errors are contained and affect only part of the computation's state. Containment is usually achieved by verifying all information leaking out of a statically defined containment domain, which is an expensive procedure. Alternatively, error propagation can be analyzed to bound the domain that is affected by a detected error. This paper investigates how silent data corruption (SDC) due to soft errors propagates through three HPC applications: HPCCG, Jacobi, and CoMD. To allow for more detailed view of error propagation, the paper tracks propagation at the instruction and application variable level. The impact of detection latency on error propagation is shown along with an application's ability to recover. Finally, the impact of compiler optimizations are explored along with the impact of local problem size on error propagation.},
}