% HotOS 2021 workshop paper by Li, Phanishayee, Murray and Kim.
% The citation key is the exporter-generated identifier and is kept as-is so
% existing citations keep resolving.
% Event name/dates and the copyright line are stored in eventtitle/eventdate
% and copyright rather than in note, so they are not printed verbatim in the
% bibliography (classic styles ignore these fields; biblatex reads the
% event fields).
@inproceedings{33eed615454c4e129273b9742563b7c5,
  author     = {Li, Youjie and Phanishayee, Amar and Murray, Derek and Kim, {Nam Sung}},
  title      = {Doing more with less: Training large {DNN} models on commodity servers for the masses},
  booktitle  = {{HotOS} 2021 - Proceedings of the 2021 Workshop on Hot Topics in Operating Systems},
  pages      = {119--127},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  year       = {2021},
  month      = jun,
  day        = {1},
  doi        = {10.1145/3458336.3465289},
  language   = {English (US)},
  eventtitle = {18th Workshop on Hot Topics in Operating Systems, {HotOS} 2021},
  eventdate  = {2021-06-01/2021-06-03},
  copyright  = {{\textcopyright} 2021 ACM},
  abstract   = {Deep neural networks (DNNs) have grown exponentially in complexity and size over the past decade, leaving only the elite who have access to massive datacenter-based resources with the ability to develop and train such models. One of the main challenges for the long tail of researchers who might have access to only limited resources (e.g., a single multi-GPU server) is limited GPU memory capacity compared to model size. The problem is so acute that the memory requirement of training large DNN models can often exceed the aggregate capacity of all available GPUs on commodity servers; this problem only gets worse with the trend of ever-growing model sizes. Current solutions that rely on virtualizing GPU memory (by swapping to/from CPU memory) incur excessive swapping overhead. In this paper, we advocate rethinking how DNN frameworks schedule computation and move data to push the boundaries of training large models efficiently on modest multi-GPU deployments.},
}