% Conference paper from ICS 2020 on end-to-end performance modeling of
% distributed GPU applications (DOI 10.1145/3392717.3392737).
% The exported citation key is kept unchanged so existing citations still resolve.
% Funding acknowledgements and the copyright line are stored in the
% funding and copyright fields (unknown to classic BibTeX styles, hence
% ignored) rather than in note, which styles print verbatim in the
% reference list. The conference dates are recorded in eventdate (ISO range).
@inproceedings{6f792e9dd1384b35989dbb37f0995f1b,
  author    = {Choi, Jaemin and Richards, David F. and Kale, Laxmikant V. and Bhatele, Abhinav},
  title     = {End-to-end performance modeling of distributed {GPU} applications},
  booktitle = {Proceedings of the 34th {ACM} International Conference on Supercomputing, {ICS} 2020},
  series    = {Proceedings of the International Conference on Supercomputing},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2020},
  month     = jun,
  day       = {29},
  eventdate = {2020-06-29/2020-07-02},
  doi       = {10.1145/3392717.3392737},
  language  = {English (US)},
  keywords  = {GPU computing, communication, performance modeling, trace-driven simulation},
  abstract  = {With the growing number of GPU-based supercomputing platforms and GPU-enabled applications, the ability to accurately model the performance of such applications is becoming increasingly important. Most current performance models for GPU-enabled applications are limited to single node performance. In this work, we propose a methodology for end-to-end performance modeling of distributed GPU applications. Our work strives to create performance models that are both accurate and easily applicable to any distributed GPU application. We combine trace-driven simulation of MPI communication using the TraceR-CODES framework with a profiling-based roofline model for GPU kernels. We make substantial modifications to these models to capture the complex effects of both on-node and off-node networks in today's multi-GPU supercomputers. We validate our model against empirical data from GPU platforms and also vary tunable parameters of our model to observe how they might affect application performance.},
  funding   = {This research used resources of the Oak Ridge Leadership Computing Facility at the Oak Ridge National Laboratory, which is supported by the Office of Science of the U.S. DOE under Contract No. DE-AC05-00OR22725. This work used the Extreme Science and Engineering Discovery Environment (XSEDE), which is supported by National Science Foundation grant number ACI-1548562. Specifically, it used the Bridges system, which is supported by NSF award number ACI-1445606, at the Pittsburgh Supercomputing Center. This work was performed under the auspices of the U.S. Department of Energy (DOE) by Lawrence Livermore National Laboratory under Contract DE-AC52-07NA27344 (LLNL-CONF-809401). This research was supported by the Exascale Computing Project (17-SC-20-SC), a collaborative effort of the U.S. DOE Office of Science and the National Nuclear Security Administration. This work was supported by funding provided by the University of Maryland College Park Foundation.},
  copyright = {Publisher Copyright: {\textcopyright} 2020 ACM.},
}