% Conference paper, IEEE CLUSTER 2015.
% The literal percent sign in the abstract is escaped, and the publisher name
% (which contains the word "and") is double-braced, so the entry is safe under
% both classic BibTeX and biblatex/Biber.
@inproceedings{ea7c14de6a1d496cb403e601ddf81a06,
  author    = {Martsinkevich, Tatiana and Subasi, Omer and Unsal, Osman and Labarta, Jesus and Cappello, Franck},
  title     = {Fault-tolerant protocol for hybrid task-parallel message-passing applications},
  booktitle = {Proceedings - 2015 {IEEE} International Conference on Cluster Computing, {CLUSTER} 2015},
  series    = {Proceedings - {IEEE} International Conference on Cluster Computing, {ICCC}},
  pages     = {563--570},
  publisher = {{Institute of Electrical and Electronics Engineers Inc.}},
  address   = {United States},
  year      = {2015},
  month     = oct,
  day       = {26},
  doi       = {10.1109/CLUSTER.2015.104},
  language  = {English (US)},
  keywords  = {Checkpointing, Dataflow model, Fault tolerance, High performance computing, Message logging, Task-based programming model},
  abstract  = {We present a fault-tolerant protocol for task-parallel message-passing applications to mitigate transient errors. The protocol requires the restart only of the task that experienced the error and transparently handles any MPI calls inside the task. The protocol is implemented in Nanos - a dataflow runtime for task-based OmpSs programming model - and the PMPI profiling layer to fully support hybrid OmpSs+MPI applications. In our experiments we demonstrate that our fault-tolerant solution has a reasonable overhead, with a maximum observed overhead of 4.5\%. We also show that fine-grained parallelization is important for hiding the overheads related to the protocol as well as the recovery of tasks.},
  note      = {Publisher Copyright: {\textcopyright} 2015 IEEE. IEEE International Conference on Cluster Computing, CLUSTER 2015; Conference date: 08-09-2015 through 11-09-2015},
}