% Conference paper describing the HAL deep-learning cluster (PEARC 2020 proceedings).
% Acronyms are brace-protected so sentence-casing styles keep their capitals.
% "address" is the publisher's location, not the conference venue; the event
% name and dates are carried in "note".
@inproceedings{afbb91697fb74d6f82f0fafb73c1cdbc,
  author    = {Kindratenko, Volodymyr and Mu, Dawei and Zhan, Yan and Maloney, John and Hashemi, Sayed Hadi and Rabe, Benjamin and Xu, Ke and Campbell, Roy and Peng, Jian and Gropp, William},
  title     = {{HAL}: Computer System for Scalable Deep Learning},
  booktitle = {{PEARC} 2020 - Practice and Experience in Advanced Research Computing 2020},
  series    = {{ACM} International Conference Proceeding Series},
  pages     = {41--48},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2020},
  month     = jul,
  day       = {26},
  doi       = {10.1145/3311790.3396649},
  language  = {English (US)},
  keywords  = {cluster architecture, deep learning, high-performance computing},
  abstract  = {We describe the design, deployment and operation of a computer system built to efficiently run deep learning frameworks. The system consists of 16 IBM POWER9 servers with 4 NVIDIA V100 GPUs each, interconnected with Mellanox EDR InfiniBand fabric, and a DDN all-flash storage array. The system is tailored towards efficient execution of the IBM Watson Machine Learning enterprise software stack that combines popular open-source deep learning frameworks. We build a custom management software stack to enable an efficient use of the system by a diverse community of users and provide guides and recipes for running deep learning workloads at scale utilizing all available GPUs. We demonstrate scaling of a PyTorch and TensorFlow based deep neural networks to produce state-of-the-art performance results.},
  note      = {Publisher Copyright: {\textcopyright} 2020 ACM; 2020 Conference on Practice and Experience in Advanced Research Computing: Catch the Wave, PEARC 2020; Conference date: 27-07-2020 Through 31-07-2020},
}