% Conference paper from the ICCCN 2010 proceedings.
% Acronyms ({HPC}, {ICCCN}) are brace-protected so that styles which apply
% sentence casing keep them capitalised. Author names use the unambiguous
% "Last, First" form. The citation key is kept unchanged so existing \cite
% commands continue to resolve.
@inproceedings{e77ae9f0851b45c3bab89caa55ee26d8,
  author    = {Cappello, Franck and Guermouche, Amina and Snir, Marc},
  title     = {On communication determinism in parallel {HPC} applications},
  booktitle = {2010 Proceedings of 19th International Conference on Computer Communications and Networks, {ICCCN} 2010},
  series    = {Proceedings - International Conference on Computer Communications and Networks, {ICCCN}},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2010},
  isbn      = {9781424471164},
  doi       = {10.1109/ICCCN.2010.5560143},
  language  = {English (US)},
  abstract  = {Current fault tolerant protocols for high performance computing parallel applications have two major drawbacks: either they require to restart all processes even in the case of only a single process failure or they have a high performance overhead in fault free situation. As a consequence none of existing generic fault tolerant protocols matches needs of HPC applications and surprisingly, there is no fault tolerant protocol dedicated to them. One way to design better fault tolerant protocols for HPC applications is to explore and take advantage of their specific characteristics. In particular we suspect that most of them present some form of determinism in communication patterns. Communication determinism can play an important role in the design of new fault tolerant protocols by reducing their complexity. In this paper, we explore the communication determinism in 27 HPC parallel applications that are representative of production workloads in large scale centers. We show that most of these applications have deterministic or send-deterministic communication patterns.},
}