% Conference paper (inproceedings) record; citation key and entry type are unchanged.
% Percent signs in the abstract are escaped as \% so the field stays valid LaTeX
% if a bibliography style (or Biber's .bbl output) typesets or carries it.
% NOTE(review): `address` holds a country rather than the publisher's city, and
% `day` is not a standard BibTeX field -- both kept as exported; confirm before relying on them.
@inproceedings{1ac4f279148d429ebc83a4fb78173287,
  author    = {Wang, Chen and Dryden, Nikoli and Cappello, Franck and Snir, Marc},
  title     = {Neural Network Based Silent Error Detector},
  booktitle = {Proceedings - 2018 {IEEE} International Conference on Cluster Computing, {CLUSTER} 2018},
  series    = {Proceedings - {IEEE} International Conference on Cluster Computing, {ICCC}},
  pages     = {168--178},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2018},
  month     = oct,
  day       = {29},
  doi       = {10.1109/CLUSTER.2018.00035},
  language  = {English (US)},
  keywords  = {Exascale computing, Fault tolerance, Silent data corruption},
  abstract  = {As we move toward exascale platforms, silent data corruptions (SDC) are likely to occur more frequently. Such errors can lead to incorrect results. Attempts have been made to use generic algorithms to detect such errors. Such detectors have demonstrated high precision and recall for detecting errors, but only if they run immediately after an error has been injected. In this paper, we propose a neural network detector that can detect SDCs even multiple iterations after they were injected. We have evaluated our detector with 6 FLASH applications and 2 Mantevo mini-apps. Experiments show that our detector can detect more than 89\% of SDCs with a false positive rate of less than 2\%.},
  note      = {Publisher Copyright: {\textcopyright} 2018 IEEE.; 2018 IEEE International Conference on Cluster Computing, CLUSTER 2018 ; Conference date: 10-09-2018 Through 13-09-2018},
}