@inproceedings{f3af577064914265ace0b4dd19c595dc,
title = "High-Performance GPU-to-CPU Transpilation and Optimization via High-Level Parallel Constructs",
abstract = "While parallelism remains the main source of performance, architectural implementations and programming models change with each new hardware generation, often leading to costly application re-engineering. Most tools for performance portability require manual and costly application porting to yet another programming model. We propose an alternative approach that automatically translates programs written in one programming model (CUDA) into another (CPU threads), based on Polygeist/MLIR. Our approach includes a representation of parallel constructs that allows conventional compiler transformations to apply transparently and without modification, and that enables parallelism-specific optimizations. We evaluate our framework by transpiling and optimizing the CUDA Rodinia benchmark suite for a multi-core CPU and achieve a 58\% geomean speedup over handwritten OpenMP code. Further, we show how CUDA kernels from PyTorch can efficiently run and scale on the CPU-only supercomputer Fugaku without user intervention. Our PyTorch compatibility layer, which makes use of transpiled CUDA PyTorch kernels, outperforms PyTorch's native CPU backend by 2.7×.",
keywords = "barrier synchronization, CUDA, MLIR, Polygeist",
author = "Moses, {William S.} and Ivanov, {Ivan R.} and Jens Domke and Toshio Endo and Johannes Doerfert and Oleksandr Zinenko",
note = "Thanks to Valentin Churavy of MIT and Albert Cohen of Google for discussions about transformations within MLIR. Thanks to Douglas Kogut, Jiahao Li, and Bojan Serafimov for discussions about parallel optimizations in compilers. Many thanks to Cosmin Oancea for helping with the final version. William S. Moses was supported in part by a DOE Computational Sciences Graduate Fellowship DE-SC0019323, in part by Los Alamos National Laboratories grant 531711, and in part by the United States Air Force Research Laboratory and the United States Air Force Artificial Intelligence Accelerator and was accomplished under Cooperative Agreement Number FA8750-19-2-1000. Johannes Doerfert was supported in part by the Exascale Computing Project (17-SC-20-SC), a collaborative effort of two U.S. Department of Energy organizations (Office of Science and the National Nuclear Security Administration) responsible for the planning and preparation of a capable exascale ecosystem, including software, applications, hardware, advanced system engineering, and early testbed platforms, in support of the nation's exascale computing imperative, and in part by the Lawrence Livermore National Security, LLC ({"}LLNS{"}) via MPO No. B642066, LLNL-CONF-843530. This work was supported in part by the Japan Society for the Promotion of Science KAKENHI Grant Number 19H04119 and by the Japanese New Energy and Industrial Technology Development Organization (NEDO). The views and conclusions contained in this document are those of the authors and should not be interpreted as representing the official policies, either expressed or implied, of the United States Air Force or the U.S. Government, or Lawrence Livermore National Security, LLC, neither of whom nor any of their employees make any endorsements, express or implied warranties or representations or assume any legal liability or responsibility for the accuracy, completeness, or usefulness of the information contained herein. The U.S. Government is authorized to reproduce and distribute reprints for Government purposes notwithstanding any copyright notation herein.; 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming, PPoPP 2023; Conference date: 25-02-2023 through 01-03-2023",
year = "2023",
month = feb,
day = "25",
doi = "10.1145/3572848.3577475",
language = "English (US)",
series = "Proceedings of the ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPoPP",
publisher = "Association for Computing Machinery",
pages = "119--134",
booktitle = "PPoPP 2023 - Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming",
address = "United States",
}