% Conference paper from the 2010 IEEE EIT conference (see booktitle).
% There is deliberately no `series` field: the exported value only repeated
% `booktitle`, which makes standard styles print the venue twice.
% NOTE(review): month/day (29 Nov) differ from the conference dates given in
% `note` (20-22 May 2010); presumably the online publication date -- confirm.
@inproceedings{6f5da1662f00410eb320c1589dd169c6,
  author    = {Solano-Quinde, Lizandro D. and Bode, Brett M. and Somani, Arun K.},
  title     = {Coarse grain computation-communication overlap for efficient application-level checkpointing for {GPUs}},
  booktitle = {2010 {IEEE} International Conference on Electro/Information Technology, {EIT2010}},
  year      = {2010},
  month     = nov,
  day       = {29},
  isbn      = {9781424468751},
  doi       = {10.1109/EIT.2010.5612125},
  language  = {English (US)},
  keywords  = {CUDA, Checkpoint, Fault tolerance, GPU, Tesla},
  abstract  = {Graphics Processing Units (GPUs) are increasingly used to solve non-graphical scientific problems. However, it has been shown that the reliability of GPUs is a concern because of the occurrence of soft and hard errors. Checkpoint/restart is the most commonly used technique to achieve fault tolerance in the presence of failures. This work presents an application-level checkpoint scheme for systems composed of GPUs. Our scheme exploits the benefits of the divide-and-conquer technique and of the communication-computation overlapping to improve the execution time and checkpoint overhead. By dividing the problem and checkpointing in n subprocesses, we show that our scheme improves the checkpoint overhead by a factor of n. We also show that dividing the problem with finer granularity is not beneficial.},
  note      = {Conference date: 20-05-2010 through 22-05-2010},
}