% Reed, Lu and Mendes (2004): "Big systems and big reliability challenges",
% Advances in Parallel Computing, vol. 13, pp. 729--736.
%
% Maintenance notes for this entry:
%  - All author names use the unambiguous "Last, First" form, with no braces
%    around the initials, so that styles can parse and format them uniformly.
%  - The exported funding/body text is stored in `annote` instead of `note`:
%    standard styles print `note` verbatim in the bibliography, which is not
%    wanted for a paragraph of this length.
%  - NOTE(review): number = {C} is kept exactly as exported. It looks like a
%    database placeholder rather than a real issue number -- confirm against
%    the publisher record before relying on it.
@article{d18a003268894a47a0e05be0b876fd3f,
  author    = {Reed, D. A. and Lu, C. and Mendes, C. L.},
  title     = {Big systems and big reliability challenges},
  journal   = {Advances in Parallel Computing},
  volume    = {13},
  number    = {C},
  pages     = {729--736},
  year      = {2004},
  publisher = {Elsevier},
  issn      = {0927-5452},
  doi       = {10.1016/S0927-5452(04)80089-3},
  language  = {English (US)},
  annote    = {Funding Information: Hardware failures are exacerbated by programming models that have limited support for fault-tolerance. For scientific applications, MPI is the most popular parallel programming model. However, the MPI standard does not specify mechanisms or interfaces for fault-tolerance - normally, all of an MPI application's tasks are terminated when any of the underlying nodes fails or becomes inaccessible. Given the standard domain decompositions and data distributions used in message-based parallel programs, there are few alternatives to this approach. *This work was supported in part by Contract No. 74837-001-0349 from the Regents of University of California (Los Alamos National Laboratory) to William Marsh Rice University, by the National Science Foundation under grant EIA-99-75020, and by the NSF Alliance PACI Cooperative Agreement.},
}