@article{M244C88F4,
  title    = "Hybrid All-Reduce Strategy with Layer Overlapping for Reducing Communication Overhead in Distributed Deep Learning",
  journal  = "KIPS Transactions on Computer and Communication Systems",
  year     = "2021",
  issn     = "2287-5891",
  doi      = "10.3745/KTCCS.2021.10.7.191",
  author   = "Daehyun Kim and Sangho Yeo and Sangyoon Oh",
  keywords = "Distributed Deep Learning, Synchronization, Layer Overlapping, Allreduce",
  abstract = "Because training datasets have grown large and models have become deeper to achieve high accuracy, deep neural network training requires a great deal of computation and takes too long on a single node. Distributed deep learning has therefore been proposed to reduce training time by spreading the computation across multiple nodes. In this study, we propose a hybrid all-reduce strategy that considers the characteristics of each layer, combined with a communication-computation overlapping technique, for the synchronization of distributed deep learning. Because a convolution layer has fewer parameters than a fully-connected layer and is located in the upper part of the network, only a short overlapping time is available, so butterfly all-reduce is used to synchronize the convolution layers. The fully-connected layers, on the other hand, are synchronized using ring all-reduce. Empirical experiments on PyTorch show that the proposed scheme reduces training time by up to 33% compared to the PyTorch baseline."
}