% ParaCrash paper (Sun, Huang, Snir), SC 2021 proceedings.
% Acronyms are brace-protected so sentence-casing styles keep their capitals.
% `day` and `language` are not standard BibTeX fields; classic styles ignore them.
@inproceedings{5b84e0be81f64c3cba35cbe176f1cd09,
  author    = {Sun, Jinghan and Huang, Jian and Snir, Marc},
  title     = {Pinpointing crash-consistency bugs in the {HPC} {I/O} Stack: A cross-layer approach},
  booktitle = {Proceedings of {SC} 2021},
  series    = {International Conference for High Performance Computing, Networking, Storage and Analysis, {SC}},
  publisher = {IEEE Computer Society},
  year      = {2021},
  month     = nov,
  day       = {14},
  doi       = {10.1145/3458817.3476144},
  keywords  = {Crash consistency, I/O library, Parallel file systems},
  language  = {English (US)},
  abstract  = {We present ParaCrash, a testing framework for studying crash recovery in a typical HPC I/O stack, and demonstrate its use by identifying 15 new crash-consistency bugs in various parallel file systems (PFS) and I/O libraries. ParaCrash uses a ``golden version'' approach to test the entire HPC I/O stack: storage state after recovery from a crash is correct if it matches the state that can be achieved by a partial execution with no crashes. It supports systematic testing of a multilayered I/O stack while properly identifying the layer responsible for the bugs.},
  note      = {Publisher Copyright: {\textcopyright} 2021 IEEE Computer Society. All rights reserved.; 33rd International Conference for High Performance Computing, Networking, Storage and Analysis: Science and Beyond, SC 2021 ; Conference date: 14-11-2021 Through 19-11-2021},
}