% Poster paper from PPoPP 2022 describing Sentinel (see abstract).
% Maintenance notes:
%   - Acronyms in title/booktitle/series are brace-protected so that styles
%     applying sentence casing do not lowercase them.
%   - The percent sign in the abstract is escaped; an unescaped one would be
%     read as a LaTeX comment by any style that typesets the abstract.
%   - address is the publisher's location, not the conference venue.
%   - day and language are not standard BibTeX fields and are ignored by the
%     standard styles; they are kept as exported.
@inproceedings{f200d466667c414297f8889fc55d93ae,
  author    = {Huang, Yafan and Guo, Shengjian and Di, Sheng and Li, Guanpeng and Cappello, Franck},
  title     = {{POSTER}: Hardening Selective Protection across Multiple Program Inputs for {HPC} Applications},
  booktitle = {{PPoPP} 2022 - Proceedings of the 27th {ACM SIGPLAN} Symposium on Principles and Practice of Parallel Programming},
  series    = {Proceedings of the {ACM SIGPLAN} Symposium on Principles and Practice of Parallel Programming, {PPOPP}},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2022},
  month     = apr,
  day       = {2},
  pages     = {437--438},
  doi       = {10.1145/3503221.3508414},
  keywords  = {compiler, error resilience, fault injection, high performance computing},
  abstract  = {With the ever-shrinking size of transistors and increasing scale of applications, silent data corruptions (SDCs) have become a common yet serious issue in HPC applications. Selective instruction duplication (SID) is a popular fault-tolerance technique that can obtain a high SDC coverage with low-performance overhead, as it selects the most vulnerable parts of a program for protection with priority. However, existing studies of SID are confined to single program input in the evaluation, assuming that the error resilience of the program remains similar across inputs, leading to a drastic loss of SDC coverage from SID when the protected program runs different inputs. Hence, we proposed Sentinel, an automated compiler-based framework to mitigate the loss of SDC coverage. Evaluation results show that Sentinel can effectively mitigate the loss of SDC coverage (up to 97.00\%) across multiple inputs, which significantly hardens existing SID techniques.},
  language  = {English (US)},
  note      = {The material was supported by the U.S. Department of Energy, Office of Science under contract DE-AC02-06CH11357.; 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP 2022; Conference date: 2--6 April 2022},
}