@inproceedings{3bfa96a9f8ca467db84014358cea7204,
  author        = {Dryden, Nikoli and Maruyama, Naoya and Moon, Tim and Benson, Tom and Yoo, Andy and Snir, Marc and {Van Essen}, Brian},
  title         = {{Aluminum}: An Asynchronous, {GPU}-Aware Communication Library Optimized for Large-Scale Training of Deep Neural Networks on {HPC} Systems},
  booktitle     = {Proceedings of {MLHPC} 2018},
  series        = {Proceedings of {MLHPC} 2018: Machine Learning in {HPC} Environments, Held in conjunction with {SC} 2018: The International Conference for High Performance Computing, Networking, Storage and Analysis},
  publisher     = {Institute of Electrical and Electronics Engineers Inc.},
  address       = {United States},
  pages         = {1--13},
  year          = {2018},
  month         = jul,
  day           = {2},
  doi           = {10.1109/MLHPC.2018.8638639},
  keywords      = {Collective algorithms, Communication optimization, Deep learning, HPC, Machine learning},
  abstract      = {We identify communication as a major bottleneck for training deep neural networks on large-scale GPU clusters, taking over 10x as long as computation. To reduce this overhead, we discuss techniques to overlap communication and computation as much as possible. This leads to much of the communication being latency-bound instead of bandwidth-bound, and we find that using a combination of latency- and bandwidth-optimized allreduce algorithms significantly reduces communication costs. We also discuss a semantic mismatch between MPI and CUDA that increases overheads and limits asynchrony, and propose a solution that enables communication to be aware of CUDA streams. We implement these optimizations in the open-source Aluminum communication library, enabling optimized, asynchronous, GPU-aware communication. Aluminum demonstrates improved performance in benchmarks and end-to-end training of deep networks, for both strong and weak scaling.},
  note          = {Publisher Copyright: {\textcopyright} 2018 IEEE.; 2018 IEEE/ACM Machine Learning in HPC Environments, MLHPC 2018 ; Conference date: 12-11-2018},
  language      = {English (US)},
  internal-note = {NOTE(review): month=jul/day=2 conflicts with the conference date (12 Nov 2018) recorded in note -- verify against the publisher record before changing},
}