@inproceedings{c3f11dcfa76742cda04d3dab25344a1a,
title = "GPU-aware Communication with UCX in Parallel Programming Models: Charm++, MPI, and Python",
abstract = "As an increasing number of leadership-class systems embrace GPU accelerators in the race towards exascale, efficient communication of GPU data is becoming one of the most critical components of high-performance computing. For developers of parallel programming models, implementing support for GPU-aware communication using native APIs for GPUs such as CUDA can be a daunting task as it requires considerable effort with little guarantee of performance. In this work, we demonstrate the capability of the Unified Communication X (UCX) framework to compose a GPU-aware communication layer that serves multiple parallel programming models of the Charm++ ecosystem: Charm++, Adaptive MPI (AMPI), and Charm4py. We demonstrate the performance impact of our designs with microbenchmarks adapted from the OSU benchmark suite, obtaining improvements in latency of up to 10.2x, 11.7x, and 17.4x in Charm++, AMPI, and Charm4py, respectively. We also observe increases in bandwidth of up to 9.6x in Charm++, 10x in AMPI, and 10.5x in Charm4py. We show the potential impact of our designs on real-world applications by evaluating a proxy application for the Jacobi iterative method, improving the communication performance by up to 12.4x in Charm++, 12.8x in AMPI, and 19.7x in Charm4py.",
keywords = "AMPI, Charm++, Charm4py, CUDA-aware MPI, GPU communication, Python, UCX",
author = "Choi, Jaemin and Fink, Zane and White, Sam and Bhat, Nitin and Richards, {David F.} and Kale, {Laxmikant V.}",
note = "LLNL-CONF-819099. This work was performed under the auspices of the U.S. Department of Energy (DOE) by Lawrence Livermore National Laboratory under Contract DE-AC52-07NA27344. The authors thank the UCX developer team, including Akshay Venkatesh, Devendar Bureddy, and Yossi Itigin, for their assistance with technical issues on the Summit supercomputer. This research used resources of the Oak Ridge Leadership Computing Facility at Oak Ridge National Laboratory, which is supported by the Office of Science of the U.S. DOE under Contract No. DE-AC05-00OR22725, and was supported by the Exascale Computing Project (17-SC-20-SC), a collaborative effort of the U.S. DOE Office of Science and the National Nuclear Security Administration. Publisher Copyright: {\textcopyright} 2021 IEEE. Conference: 2021 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW 2021); conference date: 17-05-2021.",
year = "2021",
month = jun,
doi = "10.1109/IPDPSW52791.2021.00079",
language = "English (US)",
series = "2021 IEEE International Parallel and Distributed Processing Symposium Workshops, IPDPSW 2021 - In conjunction with IEEE IPDPS 2021",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
pages = "479--488",
booktitle = "2021 IEEE International Parallel and Distributed Processing Symposium Workshops, IPDPSW 2021 - In conjunction with IEEE IPDPS 2021",
address = "United States",
}