@article {, title = {Callback-based completion notification using MPI Continuations}, journal = {Parallel Computing}, volume = {21238566}, year = {2021}, month = {Jan-05-2021}, pages = {102793}, abstract = {Asynchronous programming models (APM) are gaining more and more traction, allowing applications to expose the available concurrency to a runtime system tasked with coordinating the execution. While MPI has long provided support for multi-threaded communication and nonblocking operations, it falls short of adequately supporting APMs as correctly and efficiently handling MPI communication in different models is still a challenge. We have previously proposed an extension to the MPI standard providing operation completion notifications using callbacks, so-called MPI Continuations. This interface is flexible enough to accommodate a wide range of different APMs. In this paper, we present an extension to the previously described interface that allows for finer control of the behavior of the MPI Continuations interface. We then present some of our first experiences in using the interface in the context of different applications, including the NAS parallel benchmarks, the PaRSEC task-based runtime system, and a load-balancing scheme within an adaptive mesh refinement solver called ExaHyPE. We show that the interface, implemented inside Open MPI, enables low-latency, high-throughput completion notifications that outperform solutions implemented in the application space.}, keywords = {MPI, MPI Continuations, OmpSs, OpenMP, parsec, TAMPI, Task-based programming models}, issn = {01678191}, doi = {10.1016/j.parco.2021.102793}, url = {https://www.sciencedirect.com/science/article/abs/pii/S0167819121000466?via\%3Dihub}, author = {Schuchart, Joseph and Samfass, Philipp and Niethammer, Christoph and Gracia, Jos{\'e} and George Bosilca} } @article {1201, title = {Evaluation of Dataflow Programming Models for Electronic Structure Theory}, journal = {Concurrency and Computation: Practice and Experience: Special Issue on Parallel and Distributed Algorithms}, volume = {2018}, year = {2018}, month = {2018-05}, pages = {1{\textendash}20}, abstract = {Dataflow programming models have been growing in popularity as a means to deliver a good balance between performance and portability in the post-petascale era. In this paper, we evaluate different dataflow programming models for electronic structure methods and compare them in terms of programmability, resource utilization, and scalability. In particular, we evaluate two programming paradigms for expressing scientific applications in a dataflow form: (1) explicit dataflow, where the dataflow is specified explicitly by the developer, and (2) implicit dataflow, where a task scheduling runtime derives the dataflow using per-task data-access information embedded in a serial program. We discuss our findings and present a thorough experimental analysis using methods from the NWChem quantum chemistry application as our case study, and OpenMP, StarPU, and PaRSEC as the task-based runtimes that enable the different forms of dataflow execution. Furthermore, we derive an abstract model to explore the limits of the different dataflow programming paradigms.}, keywords = {CCSD, coupled cluster methods, dataflow, NWChem, OpenMP, parsec, StarPU, task-based runtime}, doi = {https://doi.org/10.1002/cpe.4490}, author = {Heike Jagode and Anthony Danalis and Reazul Hoque and Mathieu Faverge and Jack Dongarra} } @article {1176, title = {Argobots: A Lightweight Low-Level Threading and Tasking Framework}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = {2017}, month = {2017-10}, abstract = {In the past few decades, a number of user-level threading and tasking models have been proposed in the literature to address the shortcomings of OS-level threads, primarily with respect to cost and flexibility. Current state-of-the-art user-level threading and tasking models, however, are either too specific to applications or architectures or are not as powerful or flexible. In this paper, we present Argobots, a lightweight, low-level threading and tasking framework that is designed as a portable and performant substrate for high-level programming models or runtime systems. Argobots offers a carefully designed execution model that balances generality of functionality with providing a rich set of controls to allow specialization by the user or high-level programming model. We describe the design, implementation, and optimization of Argobots and present integrations with three example high-level models: OpenMP, MPI, and co-located I/O service. Evaluations show that (1) Argobots outperforms existing generic threading runtimes; (2) our OpenMP runtime offers more efficient interoperability capabilities than production OpenMP runtimes do; (3) when MPI interoperates with Argobots instead of Pthreads, it enjoys reduced synchronization costs and better latency hiding capabilities; and (4) I/O service with Argobots reduces interference with co-located applications, achieving performance competitive with that of the Pthreads version.}, keywords = {Argobots, context switch, I/O, interoperability, lightweight, MPI, OpenMP, stackable scheduler, tasklet, user-level thread}, doi = {10.1109/TPDS.2017.2766062}, url = {http://ieeexplore.ieee.org/document/8082139/}, author = {Sangmin Seo and Abdelhalim Amer and Pavan Balaji and Cyril Bordage and George Bosilca and Alex Brooks and Philip Carns and Adrian Castello and Damien Genet and Thomas Herault and Shintaro Iwasaki and Prateek Jindal and Sanjay Kale and Sriram Krishnamoorthy and Jonathan Lifflander and Huiwei Lu and Esteban Meneses and Mar Snir and Yanhua Sun and Kenjiro Taura and Pete Beckman} }