% Conference paper on the DeepCPU library, OpML 2019 (pp. 5--7).
% The citation key is the exporter-generated identifier and is kept unchanged
% so that existing \cite commands continue to resolve.
% No `series` field: the exported value was a verbatim copy of `booktitle`,
% which makes standard styles print the proceedings title twice.
% `month` is derived from the conference date recorded in `note` (20-05-2019).
@inproceedings{2b50c45948f84be6bfd06a9a349b3bfd,
  author    = {Zhang, Minjia and Rajbhandari, Samyam and Wang, Wenhan and Zheng, Elton and Ruwase, Olatunji and Rasley, Jeff and Li, Jason and Wang, Junhua and He, Yuxiong},
  title     = {Accelerating large scale deep learning inference through {DeepCPU} at {Microsoft}},
  booktitle = {Proceedings of the 2019 {USENIX} Conference on Operational Machine Learning, {OpML} 2019},
  publisher = {USENIX Association},
  year      = {2019},
  month     = may,
  pages     = {5--7},
  language  = {English (US)},
  note      = {2019 USENIX Conference on Operational Machine Learning, OpML 2019 ; Conference date: 20-05-2019},
  abstract  = {The application of deep learning models presents significant improvement to many Microsoft services and products. In this paper, we introduce our experience and methodology of developing and applying the DeepCPU library for serving DL models in production at large scale with remarkable latency improvement and infrastructure cost reduction. We describe two ways to use the library, through customized optimization or framework integration, targeting different scenarios.},
}