Conference paper on the ACR checkpoint/restart framework (SC 2013).
NOTE(review): the DOI prefix 10.1145 is normally ACM's, while the publisher
field says IEEE Computer Society -- confirm against the published record.

@inproceedings{e9e5e3ef3d9643c0a1dffb4506ed7e2b,
  author    = {Ni, Xiang and Meneses, Esteban and Jain, Nikhil and Kal{\'e}, Laxmikant V.},
  title     = {{ACR}: Automatic checkpoint/restart for soft and hard error protection},
  booktitle = {Proceedings of {SC} 2013},
  series    = {International Conference for High Performance Computing, Networking, Storage and Analysis, SC},
  publisher = {IEEE Computer Society},
  year      = {2013},
  isbn      = {9781450323789},
  doi       = {10.1145/2503210.2503266},
  language  = {English (US)},
  keywords  = {Checkpoint/restart, Fault-tolerance, Redundancy, Silent data corruption},
  abstract  = {As machines increase in scale, many researchers have predicted that failure rates will correspondingly increase. Soft errors do not inhibit execution, but may silently generate incorrect results. Recent trends have shown that soft error rates are increasing, and hence they must be detected and handled to maintain correctness. We present a holistic methodology for automatically detecting and recovering from soft or hard faults with minimal application intervention. This is demonstrated by ACR: an automatic checkpoint/restart framework that performs application replication and automatically adapts the checkpoint period using online information about the current failure rate. ACR performs an application- and user-oblivious recovery. We empirically test ACR by injecting failures that follow different distributions for five applications and show low overhead when scaled to 131,072 cores. We also analyze the interaction between soft and hard errors and propose three recovery schemes that explore the trade-off between performance and reliability requirements.},
  note      = {2013 International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2013 ; Conference date: 17-11-2013 Through 22-11-2013},
}