% Chen, Banerjee, Kalbarczyk, Iyer: "Machine learning for load balancing in
% the Linux kernel", workshop paper in the APSys 2020 proceedings (ACM).
% The citation key is kept unchanged (it looks like an export-generated
% identifier) so that existing \cite commands continue to resolve.
% `note` carries only the event name and dates; the funding statement lives in
% the non-standard `funding` field, which standard styles ignore.
@inproceedings{b45be921136e42d7b991aa727dba33af,
  author    = {Chen, Jingde and Banerjee, Subho S. and Kalbarczyk, Zbigniew T. and Iyer, Ravishankar K.},
  title     = {Machine learning for load balancing in the {Linux} kernel},
  booktitle = {{APSys} 2020 - Proceedings of the 2020 {ACM} {SIGOPS} Asia-Pacific Workshop on Systems},
  series    = {{APSys} '20},
  pages     = {67--74},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2020},
  month     = aug,
  day       = {24},
  doi       = {10.1145/3409963.3410492},
  language  = {English (US)},
  keywords  = {Linux kernel, completely fair scheduler, load balancing, machine learning, neural network, operating system},
  abstract  = {The OS load balancing algorithm governs the performance gains provided by a multiprocessor computer system. The Linux's Completely Fair Scheduler (CFS) scheduler tracks process loads by average CPU utilization to balance workload between processor cores. That approach maximizes the utilization of processing time but overlooks the contention for lower-level hardware resources. In servers running compute-intensive workloads, an imbalanced need for limited computing resources hinders execution performance. This paper solves the above problem using a machine learning (ML)-based resource-aware load balancer. We describe (1) low-overhead methods for collecting training data; (2) an ML model based on a multi-layer perceptron model that imitates the CFS load balancer based on the collected training data; and (3) an in-kernel implementation of inference on the model. Our experiments demonstrate that the proposed model has an accuracy of 99\% in making migration decisions and while only increasing the latency by 1.9 $\mu$s.},
  note      = {11th ACM SIGOPS Asia-Pacific Workshop on Systems (APSys 2020), 24--25 August 2020},
  funding   = {This research was supported in part by the National Science Foundation (NSF) under Grant Nos. CNS 13-37732 and CNS 16-24790; by the IBM-ILLINOIS Center for Cognitive Computing Systems Research (C3SR), a research collaboration that is part of the IBM AI Horizon Network; and by Intel through equipment donations.},
}