@comment{
  Conference paper, IEEE CLUSTER 2015 (two-page paper, pp. 541--542).
  Cleanup notes for maintainers:
    - Author names are stored in the unambiguous "Last, First" form.
    - The conference name and dates that used to sit in the note field are
      stored in eventtitle / eventdate (used by biblatex, not printed by the
      standard BibTeX styles), so they no longer appear as trailing free text.
    - The copyright notice and the publisher country are kept in the
      non-standard fields copyright and publisher-country, which standard
      styles do not print; the address field is reserved for the publisher
      city, which the exported record does not provide.
    - year / month / day hold the publication date, not the event date.
}
@inproceedings{cb8e31e590714d279bb741cb6578e6c5,
  author            = {Calhoun, Jon and Snir, Marc and Olson, Luke and Garzaran, Maria},
  title             = {Understanding the propagation of error due to a silent data corruption in a sparse matrix vector multiply},
  booktitle         = {Proceedings - 2015 {IEEE} International Conference on Cluster Computing, {CLUSTER} 2015},
  series            = {Proceedings - {IEEE} International Conference on Cluster Computing, {ICCC}},
  eventtitle        = {{IEEE} International Conference on Cluster Computing, {CLUSTER} 2015},
  eventdate         = {2015-09-08/2015-09-11},
  publisher         = {Institute of Electrical and Electronics Engineers Inc.},
  publisher-country = {United States},
  pages             = {541--542},
  year              = {2015},
  month             = oct,
  day               = {26},
  doi               = {10.1109/CLUSTER.2015.101},
  language          = {English (US)},
  keywords          = {Error Propagation, Silent Data Corruption},
  copyright         = {Publisher Copyright: {\textcopyright} 2015 IEEE.},
  abstract          = {With the rate of errors that silently effect an application's state/output expected to increase in future HPC machines, numerous mitigation schemes have been proposed, but little work has been done investigating why these schemes detect some error while other is masked. This paper investigates how silent data corruption (SDC) propagates through a sparse matrix vector multiply (SpMV), a fundamental HPC computation kernel. We discover that analyzing the mathematics of the SpMV limits understanding of SDC propagation. We achieve a more complete understanding by investigating how SDC propagates in a SpMV as it is expressed in machine instructions.},
}