% Conference paper from the 2018 IEEE CLUSTER proceedings.
% The citation key is the exporter-generated identifier and is kept unchanged
% so that existing citations of this entry continue to resolve.
% The acknowledgement text, copyright line and conference dates that the
% exporter had packed into a printed note are stored in the separate
% funding, copyright and eventdate fields below; the event name itself is
% already carried by booktitle. Percent signs in the abstract are escaped so
% the text stays valid LaTeX if a style prints it.
@inproceedings{1ac4f279148d429ebc83a4fb78173287,
  author    = {Wang, Chen and Dryden, Nikoli and Cappello, Franck and Snir, Marc},
  title     = {Neural Network Based Silent Error Detector},
  booktitle = {Proceedings - 2018 {IEEE} International Conference on Cluster Computing, {CLUSTER} 2018},
  series    = {Proceedings - {IEEE} International Conference on Cluster Computing, {ICCC}},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  pages     = {168--178},
  year      = {2018},
  month     = oct,
  day       = {29},
  doi       = {10.1109/CLUSTER.2018.00035},
  language  = {English (US)},
  keywords  = {Exascale computing, Fault tolerance, Silent data corruption},
  abstract  = {As we move toward exascale platforms, silent data corruptions (SDC) are likely to occur more frequently. Such errors can lead to incorrect results. Attempts have been made to use generic algorithms to detect such errors. Such detectors have demonstrated high precision and recall for detecting errors, but only if they run immediately after an error has been injected. In this paper, we propose a neural network detector that can detect SDCs even multiple iterations after they were injected. We have evaluated our detector with 6 FLASH applications and 2 Mantevo mini-apps. Experiments show that our detector can detect more than 89\% of SDCs with a false positive rate of less than 2\%.},
  eventdate = {2018-09-10/2018-09-13},
  funding   = {We perform our experiments on Blue Waters, a Cray supercomputer managed by the National Center for Supercomputing Applications and supported by the National Science Foundation and the University of Illinois. Each compute node has 2 AMD 6276 Interlagos CPUs and 64 GB of RAM. The neural network is trained and evaluated on Nvidia DGX-1 at Argonne JLSE. The DGX-1 is equipped with 8 Tesla P100 GPUs. This research was supported by NSF SHF award number: 1617488 and by the U.S. Department of Energy, DOE Office of Science under contract number DE-AC02-06CH11357. It used compute resources at ALCF and NCSA. We thank Dr. Anshu Dubey and Dr. Sheng Di for their gracious help.},
  copyright = {{\textcopyright} 2018 IEEE},
}