@inproceedings{845c0a42c888486e97a08b284a640644,
title = "DeepSpeed-Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale",
abstract = "The landscape of transformer model inference is increasingly diverse in model size, model characteristics, latency and throughput requirements, hardware requirements, etc. With such diversity, designing a versatile inference system is challenging. DeepSpeed-Inference addresses these challenges by (1) a multi-GPU inference solution to minimize latency while maximizing throughput for both dense and sparse transformers when the model fits in aggregate GPU memory, and (2) a heterogeneous inference solution that leverages CPU/NVMe/GPU memory to enable high-throughput inference for models larger than aggregate GPU memory. DeepSpeed-Inference reduces latency by 6.4× and increases throughput by 1.5× over the state-of-the-art. It enables trillion-parameter-scale inference under real-time latency constraints by leveraging hundreds of GPUs, an unprecedented scale for inference. It can run inference on models 25× larger than GPU-only solutions allow, while delivering a high throughput of 84 TFLOPS (over 50% of A6000 peak).",
keywords = "Deep Learning, DeepSpeed, Distributed Inference, Mixture of Experts, PyTorch, Transformer models",
author = "Aminabadi, {Reza Yazdani} and Samyam Rajbhandari and Awan, {Ammar Ahmad} and Cheng Li and Du Li and Elton Zheng and Olatunji Ruwase and Shaden Smith and Minjia Zhang and Jeff Rasley and Yuxiong He",
note = "Publisher Copyright: {\textcopyright} 2022 IEEE; 2022 International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2022; Conference date: 13-11-2022 through 18-11-2022",
year = "2022",
doi = "10.1109/SC41404.2022.00051",
language = "English (US)",
series = "International Conference for High Performance Computing, Networking, Storage and Analysis, SC",
publisher = "IEEE Computer Society",
booktitle = "Proceedings of SC 2022",
address = "United States",
}