% Workshop paper from DRBSD-5 2019 (held in conjunction with SC 2019) on lossy
% compressed sub-iteration checkpointing in NWChem.
% The complete proceedings title is stored in booktitle; note carries the funding
% acknowledgement (once), the publisher copyright line, and the event details.
@inproceedings{dcf0a0506f8643c4af67aa28f0ee0683,
  author    = {Reza, Tasmia and Keipert, Kristopher and Di, Sheng and Liang, Xin and Calhoun, Jon and Cappello, Franck},
  title     = {Analyzing the performance and accuracy of lossy checkpointing on sub-iteration of {NWChem}},
  booktitle = {Proceedings of {DRBSD-5} 2019: 5th International Workshop on Data Analysis and Reduction for Big Scientific Data -- Held in conjunction with {SC} 2019: The International Conference for High Performance Computing, Networking, Storage and Analysis},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2019},
  month     = nov,
  pages     = {23--27},
  doi       = {10.1109/DRBSD-549595.2019.00009},
  language  = {English (US)},
  keywords  = {Checkpoint-restart, Coupled-cluster singles and doubles, Lossy data compression, NWChem},
  abstract  = {Future exascale systems are expected to be characterized by more frequent failures than current petascale systems. This places increased importance on the application to minimize the amount of time wasted due to recomputation when recovering from a checkpoint. Typically, HPC applications checkpoint at iteration boundaries. However, for applications that have a high per-iteration cost, checkpointing inside the iteration limits the amount of re-computation. This paper analyzes the performance and accuracy of using lossy compressed checkpointing in the computational chemistry application NWChem. Our results indicate that lossy compression is an effective tool for reducing the sub-iteration checkpoint size. Moreover, compression error tolerances that yield acceptable deviation in accuracy and iteration count are quantified.},
  note      = {Funding Information: This material is based upon work supported by the National Science Foundation under Grant No. SHF-1910197 and Grant No. SHF-1619253. This work is supported by the US Department of Energy under subaward No. 9F-60179. This research was supported by the Exascale Computing Project (ECP), Project Number: 17-SC-20-SC, a collaborative effort of two DOE organizations - the Office of Science and the National Nuclear Security Administration, responsible for the planning and preparation of a capable exascale ecosystem, including software, applications, hardware, advanced system engineering and early testbed platforms, to support the nation{\textquoteright}s exascale computing imperative. The material was supported by the U.S. Department of Energy, Office of Science, under contract DE-AC02-06CH11357. We acknowledge the computing resources provided on Bebop, which is operated by the Laboratory Computing Resource Center at Argonne National Laboratory. Publisher Copyright: {\textcopyright} 2019 IEEE.; 5th IEEE/ACM International Workshop on Data Analysis and Reduction for Big Scientific Data, DRBSD-5 2019; Conference date: 17-11-2019},
}