% HydEE paper, published in the IPDPS 2012 proceedings.
% Acronyms in title/booktitle are brace-protected so that sentence-casing
% bibliography styles keep their capitalization. The proceedings name is
% stored once, in booktitle; conference dates are recorded in the note field.
@inproceedings{1180b633a8454189baa2464f1aa866b7,
  author    = {Guermouche, Amina and Ropars, Thomas and Snir, Marc and Cappello, Franck},
  title     = {{HydEE}: Failure containment without event logging for large scale send-deterministic {MPI} applications},
  booktitle = {Proceedings of the 2012 {IEEE} 26th International Parallel and Distributed Processing Symposium, {IPDPS} 2012},
  year      = {2012},
  pages     = {1216--1227},
  doi       = {10.1109/IPDPS.2012.111},
  isbn      = {9780769546759},
  language  = {English (US)},
  keywords  = {High performance computing, MPI, failure containment, fault tolerance, send-determinism},
  note      = {2012 IEEE 26th International Parallel and Distributed Processing Symposium, IPDPS 2012 ; Conference date: 21-05-2012 Through 25-05-2012},
  abstract  = {High performance computing will probably reach exascale in this decade. At this scale, mean time between failures is expected to be a few hours. Existing fault tolerant protocols for message passing applications will not be efficient anymore since they either require a global restart after a failure (check pointing protocols) or result in huge memory occupation (message logging). Hybrid fault tolerant protocols overcome these limits by dividing applications processes into clusters and applying a different protocol within and between clusters. Combining coordinated check pointing inside the clusters and message logging for the inter-cluster messages allows confining the consequences of a failure to a single cluster, while logging only a subset of the messages. However, in existing hybrid protocols, event logging is required for all application messages to ensure a correct execution after a failure. This can significantly impair failure free performance. In this paper, we propose HydEE, a hybrid rollback-recovery protocol for send-deterministic message passing applications, that provides failure containment without logging any event, and only a subset of the application messages. We prove that HydEE can handle multiple concurrent failures by relying on the send-deterministic execution model. Experimental evaluations of our implementation of HydEE in the MPICH2 library show that it introduces almost no overhead on failure free execution.},
}