% Bibliography entries, one field per line. Text outside entries (such as these
% percent-prefixed notes) is ignored by BibTeX and Biber.
% Conventions used below: authors in "von Last, First" form, month as a
% three-letter macro, bare DOIs without resolver prefix, en-dash page ranges.

% Conference paper (IPDPS 2021): block-sparse tensor contraction on multi-GPU
% nodes using the PaRSEC runtime.
% NOTE(review): the exported entry had an empty citation key; the key below is
% newly assigned -- adjust to the project's key scheme if one exists.
@inproceedings{herault2021blocksparse,
  author       = {Herault, Thomas and
                  Robert, Yves and
                  Bosilca, George and
                  Harrison, Robert and
                  Lewis, Cannada and
                  Valeev, Edward and
                  Dongarra, Jack},
  title        = {Distributed-Memory {Multi-GPU} Block-Sparse Tensor Contraction for Electronic Structure},
  booktitle    = {35th {IEEE} International Parallel \& Distributed Processing Symposium ({IPDPS} 2021)},
  year         = {2021},
  month        = may,
  publisher    = {IEEE},
  organization = {IEEE},
  address      = {Portland, OR},
  url          = {https://hal.inria.fr/hal-02970659/document},
  keywords     = {block-sparse matrix multiplication, distributed-memory, Electronic structure, multi-GPU node, parsec, tensor contraction},
  abstract     = {Many domains of scientific simulation (chemistry, condensed matter physics, data science) increasingly eschew dense tensors for block-sparse tensors, sometimes with additional structure (recursive hierarchy, rank sparsity, etc.). Distributed-memory parallel computation with block-sparse tensorial data is paramount to minimize the time-to-solution (e.g., to study dynamical problems or for real-time analysis) and to accommodate problems of realistic size that are too large to fit into the host/device memory of a single node equipped with accelerators. Unfortunately, computation with such irregular data structures is a poor match to the dominant imperative, bulk-synchronous parallel programming model. In this paper, we focus on the critical element of block-sparse tensor algebra, namely binary tensor contraction, and report on an efficient and scalable implementation using the task-focused PaRSEC runtime. High performance of the block-sparse tensor contraction on the Summit supercomputer is demonstrated for synthetic data as well as for real data involved in electronic structure simulations of unprecedented size.},
}

% Workshop paper (ESPM2 2020): the Template Task Graph (TTG) API of TESSE.
% NOTE(review): the exported entry had an empty citation key; the key below is
% newly assigned -- adjust to the project's key scheme if one exists.
@inproceedings{bosilca2020ttg,
  author       = {Bosilca, George and
                  Harrison, Robert and
                  Herault, Thomas and
                  Javanmard, Mohammad Mahdi and
                  Nookala, Poornima and
                  Valeev, Edward},
  title        = {The Template Task Graph ({TTG}) - An Emerging Practical Dataflow Programming Paradigm for Scientific Simulation at Extreme Scale},
  booktitle    = {2020 {IEEE/ACM} 5th International Workshop on Extreme Scale Programming Models and Middleware ({ESPM2})},
  year         = {2020},
  month        = nov,
  publisher    = {IEEE},
  organization = {IEEE},
  doi          = {10.1109/ESPM251964.2020.00011},
  keywords     = {dag, dataflow, exascale, graph, High-performance computing, workflow},
  abstract     = {We describe TESSE, an emerging general-purpose, open-source software ecosystem that attacks the twin challenges of programmer productivity and portable performance for advanced scientific applications on modern high-performance computers. TESSE builds upon and extends the PaRSEC DAG/-dataflow runtime with new Domain Specific Languages (DSL) and new integration capabilities. Motivating this work is our belief that such a dataflow model, perhaps with applications composed in domain specific languages, can overcome many of the challenges faced by a wide variety of irregular applications that are poorly served by current programming and execution models. Two such applications from many-body physics and applied mathematics are briefly explored. This paper focuses upon the Template Task Graph (TTG), which is TESSE's main C++ API that provides a powerful work/data-flow programming model. Algorithms on spatial trees, block-sparse tensors, and wave fronts are used to illustrate the API and associated concepts, as well as to compare with related approaches.},
}

% ICL technical report ICL-UT-18-13 (2018). The export stored the report series
% in "journal" and the issuing body in "publisher"; they are mapped here to the
% techreport fields "type" and "institution" (the latter is required).
@techreport{1280,
  author       = {Bosilca, George and
                  Genet, Damien and
                  Harrison, Robert and
                  Herault, Thomas and
                  Javanmard, Mohammad Mahdi and
                  Peng, Chong and
                  Valeev, Edward},
  title        = {Tensor Contraction on Distributed Hybrid Architectures using a Task-Based Runtime System},
  type         = {Innovative Computing Laboratory Technical Report},
  number       = {ICL-UT-18-13},
  institution  = {University of Tennessee},
  year         = {2018},
  month        = dec,
  abstract     = {The need for predictive simulation of electronic structure in chemistry and materials science calls for fast/reduced-scaling formulations of quantum n-body methods that replace the traditional dense tensors with element-, block-, rank-, and block-rank-sparse (data-sparse) tensors. The resulting, highly irregular data structures are a poor match to imperative, bulk-synchronous parallel programming style due to the dynamic nature of the problem and to the lack of clear domain decomposition to guarantee a fair load-balance. TESSE runtime and the associated programming model aim to support performance-portable composition of applications involving irregular and dynamically changing data. In this paper we report an implementation of irregular dense tensor contraction in a paradigmatic electronic structure application based on the TESSE extension of PaRSEC, a distributed hybrid task runtime system, and analyze the resulting performance on a distributed memory cluster of multi-GPU nodes. Unprecedented strong scaling and promising efficiency indicate a viable future for task-based programming of complete production-quality reduced scaling models of electronic structure.},
}

% Journal article (2011): the IESP roadmap.
% NOTE(review): the journal name was exported as "International Journal of High
% Performance Computing"; it is restored to the full title that the DOI is
% believed to belong to -- confirm against the publisher record.
@article{icl:643,
  author       = {Dongarra, Jack and
                  Beckman, Pete and
                  Moore, Terry and
                  Aerts, Patrick and
                  Aloisio, Giovanni and
                  Andre, Jean-Claude and
                  Barkai, David and
                  Berthou, Jean-Yves and
                  Boku, Taisuke and
                  Braunschweig, Bertrand and
                  Cappello, Franck and
                  Chapman, Barbara and
                  Chi, Xuebin and
                  Choudhary, Alok and
                  Dosanjh, Sudip and
                  Dunning, Thom and
                  Fiore, Sandro and
                  Geist, Al and
                  Gropp, Bill and
                  Harrison, Robert and
                  Hereld, Mark and
                  Heroux, Michael and
                  Hoisie, Adolfy and
                  Hotta, Koh and
                  Jin, Zhong and
                  Ishikawa, Yutaka and
                  Johnson, Fred and
                  Kale, Sanjay and
                  Kenway, Richard and
                  Keyes, David and
                  Kramer, Bill and
                  Labarta, Jesus and
                  Lichnewsky, Alain and
                  Lippert, Thomas and
                  Lucas, Bob and
                  MacCabe, Barney and
                  Matsuoka, Satoshi and
                  Messina, Paul and
                  Michielse, Peter and
                  Mohr, Bernd and
                  Mueller, Matthias S. and
                  Nagel, Wolfgang E. and
                  Nakashima, Hiroshi and
                  Papka, Michael E. and
                  Reed, Dan and
                  Sato, Mitsuhisa and
                  Seidel, Ed and
                  Shalf, John and
                  Skinner, David and
                  Snir, Marc and
                  Sterling, Thomas and
                  Stevens, Rick and
                  Streitz, Fred and
                  Sugar, Bob and
                  Sumimoto, Shinji and
                  Tang, William and
                  Taylor, John and
                  Thakur, Rajeev and
                  Trefethen, Anne and
                  Valero, Mateo and
                  van der Steen, Aad and
                  Vetter, Jeffrey and
                  Williams, Peg and
                  Wisniewski, Robert and
                  Yelick, Kathy},
  title        = {The {International Exascale Software Project} Roadmap},
  journal      = {The International Journal of High Performance Computing Applications},
  volume       = {25},
  number       = {1},
  pages        = {3--60},
  year         = {2011},
  month        = jan,
  doi          = {10.1177/1094342010391989},
  abstract     = {Over the last 20 years, the open-source community has provided more and more software on which the world's high-performance computing systems depend for performance and productivity. The community has invested millions of dollars and years of effort to build key components. However, although the investments in these separate software elements have been tremendously valuable, a great deal of productivity has also been lost because of the lack of planning, coordination, and key integration of technologies necessary to make them work together smoothly and efficiently, both within individual petascale systems and between different systems. It seems clear that this completely uncoordinated development model will not provide the software needed to support the unprecedented parallelism required for peta/exascale computation on millions of cores, or the flexibility required to exploit new hardware models and features, such as transactional memory, speculative execution, and graphics processing units. This report describes the work of the community to prepare for the challenges of exascale computing, ultimately combining their efforts in a coordinated International Exascale Software Project.},
}

% Oak Ridge National Laboratory report (2004) on the Cray X1 evaluation.
% The export typed this as an article with journal "Oak Ridge National
% Laboratory Report" and a garbled volume "/-2004/13"; it is recorded here as a
% technical report issued by that laboratory.
% NOTE(review): the report number prefix "ORNL/TM" is reconstructed from the
% garbled value -- confirm against the laboratory's report listing.
@techreport{icl:236,
  author       = {Agarwal, Pratul and
                  Alexander, R. A. and
                  Apra, E. and
                  Balay, Satish and
                  Bland, Arthur S. and
                  Colgan, James and
                  D'Azevedo, Eduardo and
                  Dongarra, Jack and
                  Dunigan, Tom and
                  Fahey, Mark and
                  Geist, Al and
                  Gordon, M. and
                  Harrison, Robert and
                  Kaushik, Dinesh and
                  Krishnakumar, M. and
                  Luszczek, Piotr and
                  Mezzacapa, Tony and
                  Nichols, Jeff and
                  Nieplocha, Jarek and
                  Oliker, Leonid and
                  Packwood, T. and
                  Pindzola, M. and
                  Schulthess, Thomas C. and
                  Vetter, Jeffrey and
                  White, James B and
                  Windus, T. and
                  Worley, Patrick H. and
                  Zacharia, Thomas},
  title        = {{Cray} {X1} Evaluation Status Report},
  number       = {ORNL/TM-2004/13},
  institution  = {Oak Ridge National Laboratory},
  year         = {2004},
  month        = jan,
}