@article{M8D0809AA,
  title    = "Dynamic Resource Adjustment Operator Based on Autoscaling for Improving Distributed Training Job Performance on Kubernetes",
  journal  = "KIPS Transactions on Computer and Communication Systems",
  year     = "2022",
  issn     = "2287-5891",
  doi      = "10.3745/KTCCS.2022.11.7.205",
  author   = "Jinwon Jeong and Heonchang Yu",
  keywords = "Kubeflow, Kubernetes, Distributed Deep Learning Training, Resource Adjustment Operator",
  abstract = "One of the many tools used for distributed deep learning training is Kubeflow, which runs on Kubernetes, a container orchestration tool. TensorFlow jobs can be managed using the existing operator provided by Kubeflow. However, for distributed deep learning training jobs based on the parameter server architecture, the scheduling policy used by the existing operator does not consider the task affinity of the distributed training job and does not provide the ability to dynamically allocate or release resources. This can lead to long job completion times and a low resource utilization rate. Therefore, in this paper we propose a new operator that efficiently schedules distributed deep learning training jobs to minimize job completion time and increase the resource utilization rate. We implemented the new operator by modifying the existing operator and conducted experiments to evaluate its performance. The experimental results showed that our scheduling policy reduced the average job completion time by up to 84% and increased the average CPU utilization rate by up to 92%."
}