@article {, title = {An international survey on MPI users}, journal = {Parallel Computing}, volume = {108}, year = {2021}, month = {2021-12}, abstract = {The Message Passing Interface (MPI) plays a crucial part in the parallel computing ecosystem, a driving force behind many of the high-performance computing (HPC) successes. To maintain its relevance to the user community{\textemdash}and in particular to the growing HPC community at large{\textemdash}the MPI standard needs to identify and understand the MPI users{\textquoteright} concerns and expectations, and adapt accordingly to continue to efficiently bridge the gap between users and hardware. This questionnaire survey was conducted using two online questionnaire frameworks and has gathered more than 850 answers from 42 countries since February 2019. Some of the preceding surveys of MPI use are questionnaire surveys like ours, while others were conducted either by analyzing MPI programs to reveal static behavior or by using profiling tools to analyze the dynamic runtime behavior of MPI jobs. Our survey differs from other questionnaire surveys in its larger number of participants and wider geographic spread. As a result, it is possible to illustrate the current status of MPI users more accurately and with a wider geographical distribution. In this report, we show some interesting findings, compare the results with preceding studies when possible, and provide some recommendations for the MPI Forum based on the findings.}, keywords = {message passing interface, MPI, survey}, doi = {10.1016/j.parco.2021.102853}, url = {https://www.sciencedirect.com/science/article/abs/pii/S0167819121000983}, author = {Atsushi Hori and Emmanuel Jeannot and George Bosilca and Takahiro Ogura and Balazs Gerofi and Jie Yin and Yutaka Ishikawa} } @article {1460, title = {Overhead of Using Spare Nodes}, journal = {The International Journal of High Performance Computing Applications}, year = {2020}, month = {2020-02}, abstract = {With the increasing fault rate on high-end supercomputers, the topic of fault tolerance has been gathering attention. To cope with this situation, various fault-tolerance techniques are under investigation; these include user-level, algorithm-based fault-tolerance techniques and parallel execution environments that enable jobs to continue following node failure. Even with these techniques, some programs with static load balancing, such as stencil computation, may underperform after a failure recovery. Even when spare nodes are present, they are not always substituted for failed nodes in an effective way. This article considers the questions of how spare nodes should be allocated, how to substitute them for faulty nodes, and how much the communication performance is affected by such a substitution. The third question stems from the modification of the rank mapping by node substitutions, which can incur additional message collisions. In a stencil computation, rank mapping is done in a straightforward way on a Cartesian network without incurring any message collisions. However, once a substitution has occurred, the optimal node-rank mapping may be destroyed. Therefore, these questions must be answered in a way that minimizes the degradation of communication performance. In this article, several spare node allocation and failed node substitution methods will be proposed, analyzed, and compared in terms of communication performance following the substitution. The proposed substitution methods are named sliding methods.
The sliding methods are analyzed using a simulation program we developed and evaluated on the K computer, Blue Gene/Q (BG/Q), and TSUBAME 2.5. It will be shown that when failures occur, the stencil communication performance on the K computer and BG/Q can be slowed by a factor of around 10, depending on the number of node failures. The barrier performance on the K computer can be cut in half. On BG/Q, barrier performance can be slowed by a factor of 10. Further, it will also be shown that almost no such communication performance degradation can be seen on TSUBAME 2.5. This is because TSUBAME 2.5 has an InfiniBand network connected in a fat-tree topology, while the K computer and BG/Q have dedicated Cartesian networks. Thus, the communication performance degradation depends on network characteristics.}, keywords = {communication performance, fault mitigation, fault tolerance, sliding method, spare node}, issn = {1094-3420}, doi = {10.1177/1094342020901885}, url = {https://journals.sagepub.com/doi/10.1177/1094342020901885}, author = {Atsushi Hori and Kazumi Yoshinaga and Thomas Herault and Aurelien Bouteiller and George Bosilca and Yutaka Ishikawa} } @article {, title = {A Report of the MPI International Survey (Poster)}, year = {2020}, month = {2020-09}, publisher = {EuroMPI/USA {\textquoteright}20: 27th European MPI Users{\textquoteright} Group Meeting}, address = {Austin, TX}, author = {Atsushi Hori and Takahiro Ogura and Balazs Gerofi and Jie Yin and Yutaka Ishikawa and Emmanuel Jeannot and George Bosilca} } @article {1301, title = {Comparing the Performance of Rigid, Moldable, and Grid-Shaped Applications on Failure-Prone HPC Platforms}, journal = {Parallel Computing}, volume = {85}, year = {2019}, month = {2019-07}, pages = {1{\textendash}12}, doi = {10.1016/j.parco.2019.02.002}, author = {Valentin Le F{\`e}vre and Thomas Herault and Yves Robert and Aurelien Bouteiller and Atsushi Hori and George Bosilca and Jack Dongarra} } @inbook {1302, title = {System Software for Many-Core and Multi-Core Architectures}, booktitle = {Advanced Software Technologies for Post-Peta Scale Computing: The Japanese Post-Peta CREST Research Project}, year = {2019}, pages = {59{\textendash}75}, publisher = {Springer Singapore}, organization = {Springer Singapore}, address = {Singapore}, abstract = {In this project, software technologies for post-peta-scale computing were explored. More specifically, OS technologies for heterogeneous architectures, lightweight threads, scalable I/O, and fault mitigation were investigated. As for the OS technologies, Partitioned Virtual Address Space (PVAS), a new parallel execution model for many-core CPUs, was proposed. For heterogeneous architectures, where a multi-core CPU and a many-core CPU are connected via an I/O bus, Multiple-PVAS, an extension of PVAS that provides a unified virtual address space spanning the multi-core and many-core CPUs, was proposed. PVAS was also enhanced to support multiple processes whose context switches take place at the user level (User-Level Processes, ULP). For scalable I/O, EARTH, a set of optimization techniques for MPI collective I/O, was proposed. Lastly, for fault mitigation, User-Level Fault Mitigation (ULFM) was improved with a faster agreement process, and sliding methods that substitute spare nodes for failed nodes were proposed.
The funding for this project ended in 2016; however, many of the proposed technologies are still being actively developed.}, isbn = {978-981-13-1924-2}, doi = {10.1007/978-981-13-1924-2_4}, url = {https://doi.org/10.1007/978-981-13-1924-2_4}, author = {Atsushi Hori and Yuichi Tsujita and Akio Shimada and Kazumi Yoshinaga and Mitaro Namiki and Go Fukazawa and Mikiko Sato and George Bosilca and Aurelien Bouteiller and Thomas Herault}, editor = {Mitsuhisa Sato} } @conference {1214, title = {Do moldable applications perform better on failure-prone HPC platforms?}, booktitle = {11th Workshop on Resiliency in High Performance Computing in Clusters, Clouds, and Grids}, series = {LNCS}, year = {2018}, month = {2018-08}, publisher = {Springer Verlag}, organization = {Springer Verlag}, address = {Turin, Italy}, abstract = {This paper compares the performance of different approaches to tolerating failures using checkpoint/restart when executed on large-scale failure-prone platforms. We study (i) Rigid applications, which use a constant number of processors throughout execution; (ii) Moldable applications, which can use a different number of processors after each restart following a fail-stop error; and (iii) GridShaped applications, which are moldable applications restricted to rectangular processor grids (such as many dense linear algebra kernels). For each application type, we compute the optimal number of failures to tolerate before relinquishing the current allocation and waiting until a new resource can be allocated, and we determine the optimal yield that can be achieved. We instantiate our performance model with a realistic application scenario and make it publicly available for further usage.}, author = {Valentin Le F{\`e}vre and George Bosilca and Aurelien Bouteiller and Thomas Herault and Atsushi Hori and Yves Robert and Jack Dongarra} }