@article{10.1007/s10994-021-06064-w,
  year     = {2021},
  title    = {{ZipLine}: an optimized algorithm for the elastic bulk synchronous parallel model},
  author   = {Zhao, Xing and Papagelis, Manos and An, Aijun and Chen, Bao Xin and Liu, Junfeng and Hu, Yonggang},
  journal  = {Machine Learning},
  issn     = {0885-6125},
  doi      = {10.1007/s10994-021-06064-w},
  abstract = {The bulk synchronous parallel (BSP) is a celebrated synchronization model for general-purpose parallel computing that has been successfully employed for distributed training of deep learning models. A shortcoming of BSP is that it requires workers to wait for the straggler at every iteration. Employing BSP therefore increases the waiting time of the faster workers in a cluster and prolongs the overall training time. To ameliorate this shortcoming of BSP, we propose ElasticBSP, a model that relaxes BSP's strict synchronization requirement by allowing delayed synchronization, thereby minimizing the waiting time. ElasticBSP offers more flexibility and adaptability during the training phase, without sacrificing the accuracy of the trained model. ElasticBSP is realized by an algorithm named ZipLine, which consists of two phases: first, it estimates at run time, for each worker, the end time points of its future iterations; then a one-pass algorithm over the estimated time points of all workers quickly computes an optimal future time point for synchronization. We provide theoretical results about the correctness and performance of the ZipLine algorithm. Furthermore, we propose algorithmic and implementation optimizations of ZipLine, namely ZipLineOpt and ZipLineOptBS, which reduce the time complexity of ZipLine to linearithmic time. A thorough experimental evaluation demonstrates that our proposed ElasticBSP model, materialized by the proposed optimized ZipLine variants, converges faster and to a higher accuracy than the predominant BSP. The paper focuses on optimizing synchronization scheduling over a parameter server architecture; this is orthogonal to other types of optimization, such as learning rate optimization.},
  pages    = {1--37}
}
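
Note: a minimal Python sketch of the one-pass scan described in the abstract, left here as a plain-text annotation outside the entry. The function name zipline and the input format pred_ends are illustrative assumptions, not the paper's implementation. This straightforward version recomputes the window minimum at every step; the paper's ZipLineOpt and ZipLineOptBS variants bring the whole computation down to linearithmic time.

def zipline(pred_ends):
    # pred_ends[w] is an ascending list of predicted end times of
    # worker w's future iterations (input format is an assumption).
    # Returns (spread, start, end): the tightest time window containing
    # one predicted iteration end per worker; synchronizing at `end`
    # minimizes how long the faster workers sit idle.
    events = sorted((t, w) for w, times in enumerate(pred_ends) for t in times)
    latest = {}   # most recently seen predicted end time per worker
    best = None
    for t, w in events:
        latest[w] = t
        if len(latest) == len(pred_ends):       # every worker represented
            lo = min(latest.values())           # earliest end in the window
            if best is None or t - lo < best[0]:
                best = (t - lo, lo, t)
    return best

# Example: three workers with diverging iteration speeds.
print(zipline([[10, 20, 30], [12, 24, 36], [15, 30, 45]]))
# -> (5, 10, 15): syncing near t=15 leaves the least idle time.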