% Conference paper (IEEE Cluster 2006) on fault injection into a fault-tolerant MPI implementation using FAIL.
% Acronyms in the title are brace-protected so sentence-casing styles keep their capitalisation.
% NOTE(review): the address field holds a country, not a publisher city -- confirm and replace if the city is known.
@inproceedings{8663bdcd0c2e405f9c61923d13a9f29f,
  author    = {Hoarau, William and Lemarinier, Pierre and Herault, Thomas and Rodriguez, Eric and Tixeuil, S{\'e}bastien and Cappello, Franck},
  title     = {{FAIL-MPI}: How fault-tolerant is fault-tolerant {MPI}?},
  booktitle = {2006 IEEE International Conference on Cluster Computing, Cluster 2006},
  series    = {Proceedings - IEEE International Conference on Cluster Computing, ICCC},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2006},
  doi       = {10.1109/CLUSTR.2006.311851},
  isbn      = {1424403286},
  language  = {English (US)},
  note      = {2006 IEEE International Conference on Cluster Computing, Cluster 2006 ; Conference date: 25-09-2006 Through 28-09-2006},
  abstract  = {One of the topics of paramount importance in the development of Cluster and Grid middleware is the impact of faults since their occurrence in Grid infrastructures and in large-scale distributed systems is common. MPI (Message Passing Interface) is a popular abstraction for programming distributed and parallel applications. FAIL (FAult Injection Language) is an abstract language for fault occurrence description capable of expressing complex and realistic fault scenarios. In this paper, we investigate the possibility of using FAIL to inject faults in a fault-tolerant MPI implementation. Our middleware, FAIL-MPI, is used to carry quantitative and qualitative faults and stress testing.},
}