@conference{Li2019Understanding,
  title = {Understanding Scalability and Fine-Grain Parallelism of Synchronous Data Parallel Training},
  booktitle = {2019 IEEE/ACM Workshop on Machine Learning in High Performance Computing Environments (MLHPC)},
  year = {2019},
  month = {2019-11},
  publisher = {IEEE},
  organization = {IEEE},
  address = {Denver, CO},
  abstract = {In the age of big data, deep learning has emerged as a powerful tool to extract insight and exploit its value, both in industry and in scientific applications. With the increasing complexity of learning models and growing amounts of training data, data-parallel approaches based on frequent all-reduce synchronization steps are increasingly popular. Despite the fact that high-performance computing (HPC) technologies have been designed to address such patterns efficiently, the behavior of data-parallel approaches on HPC platforms is not well understood. To address this issue, in this paper we study the behavior of Horovod, a popular data-parallel approach that relies on MPI, on Theta, a pre-Exascale machine at Argonne National Laboratory. Using two representative applications, we explore two aspects: (1) how performance and scalability are affected by important parameters such as the number of nodes, number of workers, threads per node, and batch size; (2) how computational phases are interleaved with all-reduce communication phases at fine granularity, and what consequences this interleaving has in terms of potential bottlenecks. Our findings show that pipelining of back-propagation, gradient reduction, and weight updates only partially mitigates the effects of stragglers during all-reduce. Furthermore, there can be significant delays between weight updates, which can be leveraged to mask the overhead of additional background operations that are coupled with the training.},
  doi = {10.1109/MLHPC49564.2019.00006},
  author = {Jiali Li and Bogdan Nicolae and Justin M. Wozniak and George Bosilca}
}