@booklet {, title = {CholeskyQR with Randomization and Pivoting for Tall Matrices (CQRRPT)}, year = {2024}, month = {2024-02}, publisher = {arXiv}, abstract = {This paper develops and analyzes a new algorithm for QR decomposition with column pivoting (QRCP) of rectangular matrices with large row counts. The algorithm combines methods from randomized numerical linear algebra in a particularly careful way in order to accelerate both pivot decisions for the input matrix and the process of decomposing the pivoted matrix into the QR form. The source of the latter acceleration is the use of randomized preconditioning and CholeskyQR. Comprehensive analysis is provided in both exact and finite-precision arithmetic to characterize the algorithm{\textquoteright}s rank-revealing properties and its numerical stability under probabilistic assumptions on the sketching operator. An implementation of the proposed algorithm is described and made available inside the open-source RandLAPACK library, which itself relies on RandBLAS, also available in open-source form. Experiments with this implementation on an Intel Xeon Gold 6248R CPU demonstrate order-of-magnitude speedups relative to LAPACK{\textquoteright}s standard function for QRCP, and comparable performance to a specialized algorithm for unpivoted QR of tall matrices, which lacks the strong rank-revealing properties of the proposed method.}, url = {https://arxiv.org/abs/2311.08316}, author = {Maksim Melnichenko and Oleg Balabanov and Riley Murray and James Demmel and Michael W. Mahoney and Piotr Luszczek} } @booklet {, title = {Generalizing Random Butterfly Transforms to Arbitrary Matrix Sizes}, year = {2023}, month = {2023-12}, publisher = {arXiv}, abstract = {Parker and L{\^e} introduced random butterfly transforms (RBTs) as a preprocessing technique to replace pivoting in dense LU factorization. Unfortunately, their FFT-like recursive structure restricts the dimensions of the matrix. Furthermore, on multi-node systems, efficient management of the communication overheads restricts the matrix{\textquoteright}s distribution even more. To remove these limitations, we have generalized the RBT to arbitrary matrix sizes by truncating the dimensions of each layer in the transform. We extended Parker{\textquoteright}s theoretical analysis to the generalized RBT, showing in particular that, in exact arithmetic, Gaussian elimination with no pivoting will succeed with probability 1 after transforming a matrix with full-depth RBTs. Furthermore, we experimentally show that these generalized transforms improve performance over Parker{\textquoteright}s formulation by up to 62\% while retaining the ability to replace pivoting.
This generalized RBT is available in the SLATE numerical software library.}, url = {https://arxiv.org/abs/2312.09376}, author = {Neil Lindquist and Piotr Luszczek and Jack Dongarra} } @conference {, title = {Using Additive Modifications in LU Factorization Instead of Pivoting}, booktitle = {37th ACM International Conference on Supercomputing (ICS{\textquoteright}23)}, year = {2023}, month = {2023-06}, publisher = {ACM}, organization = {ACM}, address = {Orlando, FL}, doi = {10.1145/3577193.3593731}, author = {Neil Lindquist and Piotr Luszczek and Jack Dongarra} } @techreport {, title = {Analysis of the Communication and Computation Cost of FFT Libraries towards Exascale}, journal = {ICL Technical Report}, number = {ICL-UT-22-07}, year = {2022}, month = {2022-07}, publisher = {Innovative Computing Laboratory}, author = {Alan Ayala and Stanimire Tomov and Piotr Luszczek and Sebastien Cayrols and Gerald Ragghianti and Jack Dongarra} } @techreport {, title = {FFT Benchmark Performance Experiments on Systems Targeting Exascale}, journal = {ICL Technical Report}, number = {ICL-UT-22-02}, year = {2022}, month = {2022-03}, author = {Alan Ayala and Stanimire Tomov and Piotr Luszczek and Sebastien Cayrols and Gerald Ragghianti and Jack Dongarra} } @techreport {, title = {PAQR: Pivoting Avoiding QR factorization}, journal = {ICL Technical Report}, number = {ICL-UT-22-06}, year = {2022}, month = {2022-06}, abstract = {The solution of linear least-squares problems is at the heart of many scientific and engineering applications. While any method able to minimize the backward error of such problems is considered numerically stable, the theory states that the forward error depends on the condition number of the matrix in the system of equations. On the one hand, the QR factorization is an efficient method to solve such problems, but the solutions it produces may have large forward errors when the matrix is deficient. On the other hand, QR with column pivoting (QRCP) is able to produce smaller forward errors on deficient matrices, but its cost is prohibitive compared to QR. The aim of this paper is to propose PAQR, an alternative solution method with the same cost as (or smaller than) QR and as accurate as QRCP in practical cases, for the solution of rank-deficient linear least-squares problems. After presenting the algorithm and its implementations on different architectures, we compare its accuracy and performance results on a variety of application problems.}, author = {Wissam M. Sid-Lakhdar and Sebastien Cayrols and Daniel Bielich and Ahmad Abdelfattah and Piotr Luszczek and Mark Gates and Stanimire Tomov and Hans Johansen and David Williams-Young and Timothy A. Davis and Jack Dongarra} } @techreport {, title = {Randomized Numerical Linear Algebra: A Perspective on the Field with an Eye to Software}, journal = {University of California, Berkeley EECS Technical Report}, number = {UCB/EECS-2022-258}, year = {2022}, month = {2022-11}, publisher = {University of California, Berkeley}, abstract = {Randomized numerical linear algebra {\textendash} RandNLA, for short {\textendash} concerns the use of randomization as a resource to develop improved algorithms for large-scale linear algebra computations. The origins of contemporary RandNLA lay in theoretical computer science, where it blossomed from a simple idea: randomization provides an avenue for computing approximate solutions to linear algebra problems more efficiently than deterministic algorithms.
This idea proved fruitful in, and was largely driven by, the development of scalable algorithms for machine learning and statistical data analysis applications. However, the true potential of RandNLA only came into focus once it began to integrate with the fields of numerical analysis and {\textquotedblleft}classical{\textquotedblright} numerical linear algebra. Through the efforts of many individuals, randomized algorithms have been developed that provide full control over the accuracy of their solutions and that are every bit as reliable as algorithms that might be found in libraries such as LAPACK. The spectrum of possibilities offered by RandNLA has created a virtuous cycle of contributions by numerical analysts, statisticians, theoretical computer scientists, and the machine learning community. Recent years have even seen the incorporation of certain RandNLA methods into MATLAB, the NAG Library, and NVIDIA{\textquoteright}s cuSOLVER. In view of these developments, we believe the time is ripe to accelerate the adoption of RandNLA in the scientific community. In particular, we believe the community stands to benefit significantly from a suitably defined {\textquotedblleft}RandBLAS{\textquotedblright} and {\textquotedblleft}RandLAPACK,{\textquotedblright} to serve as standard libraries for RandNLA, in much the same way that BLAS and LAPACK serve as standards for deterministic linear algebra. This monograph surveys the field of RandNLA as a step toward building meaningful RandBLAS and RandLAPACK libraries. Section 1 begins by setting scope and design principles for RandLAPACK and summarizing subsequent sections of the monograph. Section 2 focuses on RandBLAS, which is to be responsible for sketching. Details of functionality suitable for RandLAPACK are covered in the five sections that follow. Specifically, Sections 3 to 5 cover least squares and optimization, low-rank approximation, and other select problems that are well-understood in how they benefit from randomized algorithms. The remaining sections {\textendash} on statistical leverage scores (Section 6) and tensor computations (Section 7) {\textendash} read more like traditional surveys. The different flavor of these latter sections reflects how, in our assessment, the literature on these topics is still maturing. We provide a substantial amount of pseudo-code and supplementary material over the course of five appendices. Much of the pseudo-code has been tested via publicly available MATLAB and Python implementations.}, keywords = {Randomized algorithms}, doi = {10.48550/arXiv.2302.11474}, url = {https://www2.eecs.berkeley.edu/Pubs/TechRpts/2022/EECS-2022-258.html}, author = {Riley Murray and James Demmel and Michael W. Mahoney and N. Benjamin Erichson and Maksim Melnichenko and Osman Asif Malik and Laura Grigori and Piotr Luszczek and Micha{\l} Derezi{\'n}ski and Miles E. Lopes and Tianyu Liang and Hengrui Luo and Jack Dongarra} } @conference {, title = {Surrogate ML/AI Model Benchmarking for FAIR Principles{\textquoteright} Conformance}, booktitle = {2022 IEEE High Performance Extreme Computing Conference (HPEC)}, year = {2022}, month = {2022-09}, publisher = {IEEE}, organization = {IEEE}, abstract = {We present a benchmarking platform for surrogate ML/AI models that enables the essential properties of open science and allows the models to be findable, accessible, interoperable, and reusable.
We also present a use case of cloud cover modeling, analysis, and experimental testing based on a large dataset of multi-spectral satellite sensor data. We use this particular evaluation to highlight the plethora of choices that must be resolved across the life cycle of supporting scientific workflows with data-driven models, which must first be trained to satisfactory accuracy and later monitored during field usage to provide proper feedback into both computational results and future data-model improvements. Unlike traditional testing, performance, or analysis efforts, we focus exclusively on science-oriented metrics as the relevant figures of merit.}, keywords = {Analytical models, Benchmark testing, Cloud computing, Computational modeling, Data models, Measurement, Satellites}, doi = {10.1109/HPEC55821.2022.9926401}, url = {https://ieeexplore.ieee.org/document/9926401/}, author = {Piotr Luszczek and Cade Brown} } @conference {, title = {Threshold Pivoting for Dense LU Factorization}, booktitle = {ScalAH22: 13th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Heterogeneous Systems}, year = {2022}, month = {2022-11}, publisher = {IEEE}, organization = {IEEE}, address = {Dallas, Texas}, abstract = {LU factorization is a key approach for solving large, dense systems of linear equations. Partial row pivoting is commonly used to ensure numerical stability; however, the data movement needed for the row interchanges can reduce performance. To improve this, we propose using threshold pivoting to find pivots almost as good as those selected by partial pivoting but that result in less data movement. Our theoretical analysis bounds the element growth similarly to partial pivoting; however, it also shows that the growth of threshold pivoting for a given matrix cannot be bounded by that of partial pivoting and vice versa. Additionally, we experimentally tested the approach on the Summit supercomputer. Threshold pivoting improved performance by up to 32\% without a significant effect on accuracy. For a more aggressive configuration with up to one digit of accuracy lost, the improvement was as high as 44\%.}, doi = {10.1109/ScalAH56622.2022.00010}, author = {Neil Lindquist and Mark Gates and Piotr Luszczek and Jack Dongarra} } @article {, title = {Accelerating Restarted GMRES with Mixed Precision Arithmetic}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = {2021}, month = {2021-06}, abstract = {The generalized minimum residual method (GMRES) is a commonly used iterative Krylov solver for sparse, non-symmetric systems of linear equations. As with other iterative solvers, data movement dominates its run time. To improve this performance, we propose running GMRES in reduced precision with key operations remaining in full precision. Additionally, we provide theoretical results linking the convergence of finite precision GMRES with classical Gram-Schmidt with reorthogonalization (CGSR) and its infinite precision counterpart, which helps justify the convergence of this method to double-precision accuracy. We tested the mixed-precision approach with a variety of matrices and preconditioners on a GPU-accelerated node. Excluding the incomplete LU factorization without fill-in (ILU(0)) preconditioner, we achieved average speedups ranging from 8 to 61 percent relative to comparable double-precision implementations, with the simpler preconditioners achieving the higher speedups.
}, keywords = {Convergence, Error correction, iterative methods, Kernel, linear systems, Stability analysis}, doi = {10.1109/TPDS.2021.3090757}, author = {Neil Lindquist and Piotr Luszczek and Jack Dongarra} } @techreport {, title = {Interim Report on Benchmarking FFT Libraries on High Performance Systems}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-21-03}, year = {2021}, month = {2021-07}, publisher = {University of Tennessee}, type = {ICL Tech Report}, abstract = {The Fast Fourier Transform (FFT) is used in many applications such as molecular dynamics, spectrum estimation, fast convolution and correlation, signal modulation, and many wireless multimedia applications. FFTs are also heavily used in ECP applications, such as EXAALT, Copa, ExaSky-HACC, ExaWind, WarpX, and many others. As these applications{\textquoteright} accuracy and speed depend on the performance of the FFTs, we designed an FFT benchmark to measure performance and scalability of currently available FFT packages and present the results from a pre-Exascale platform. Our benchmarking also stresses the overall capacity of the system interconnect; thus, it may be considered an indicator of the bisection bandwidth, communication contention noise, and the software overheads in MPI collectives that are of interest to many other ECP applications and libraries. This FFT benchmarking project aims to show the strengths and weaknesses of multiple FFT libraries and to indicate what can be done to improve their performance. In particular, we believe that the benchmarking results could help design and implement a fast and robust FFT library for 2D and 3D inputs, while targeting large-scale heterogeneous systems with multicore processors and hardware accelerators that are co-designed in tandem with ECP applications. Our work involves studying and analyzing state-of-the-art FFT software both from vendors and available as open-source codes to better understand their performance.}, author = {Alan Ayala and Stanimire Tomov and Piotr Luszczek and Sebastien Cayrols and Gerald Ragghianti and Jack Dongarra} } @inbook {, title = {An Introduction to High Performance Computing and Its Intersection with Advances in Modeling Rare Earth Elements and Actinides}, booktitle = {Rare Earth Elements and Actinides: Progress in Computational Science Applications}, volume = {1388}, year = {2021}, month = {2021-10}, pages = {3-53}, publisher = {American Chemical Society}, organization = {American Chemical Society}, chapter = {1}, address = {Washington, DC}, abstract = {Computationally driven solutions in nuclear and radiochemistry heavily depend on efficient modeling of Rare Earth Elements (REEs) and actinides. Accurate modeling of REEs and actinides faces challenges stemming from the limitations of an imbalanced hardware-software ecosystem and the resulting inefficient use of High Performance Computing (HPC). This chapter provides a historical perspective on the evolution of HPC hardware, its intersectionality with domain sciences, the importance of benchmarks for performance, and an overview of challenges and advances in modeling REEs and actinides. This chapter intends to provide an introduction for researchers at the intersection of scientific computing, software development for HPC, and applied computational modeling of REEs and actinides. The chapter is structured in five sections.
First, the Introduction includes subsections focusing on the Importance of REEs and Actinides (1.1), Hardware, Software, and the HPC Ecosystem (1.2), and Electronic Structure Modeling of REEs and Actinides (1.3). Second, a section in High Performance Computing focuses on the TOP500 (2.1), HPC Performance (2.2), HPC Benchmarks: Processing, Bandwidth, and Latency (2.3), and HPC Benchmarks and their Relationship to Chemical Modeling (2.4). Third, the Software Challenges and Advances focus on NWChem/NWChemEx (3.1), MADNESS (3.2), and MPQC (3.3). The fourth section provides a short overview of Artificial Intelligence in HPC applications relevant to nuclear and radiochemistry. The fifth section illustrates A Protocol to Evaluate Complexation Preferences in Separations of REEs and Actinides through Computational Modeling.}, keywords = {actinide, Computational modeling, HPC, REE}, isbn = {ISBN13: 9780841298255 eISBN: 9780841298248}, doi = {10.1021/bk-2021-1388.ch001}, url = {https://pubs.acs.org/doi/10.1021/bk-2021-1388.ch001}, author = {Deborah A. Penchoff and Edward Valeev and Heike Jagode and Piotr Luszczek and Anthony Danalis and George Bosilca and Robert J. Harrison and Jack Dongarra and Theresa L. Windus} } @book {, title = {Lecture Notes in Computer Science: High Performance Computing}, volume = {12761}, year = {2021}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, abstract = {This book constitutes the refereed post-conference proceedings of 9 workshops held at the 35th International ISC High Performance 2021 Conference, in Frankfurt, Germany, in June-July 2021: Second International Workshop on the Application of Machine Learning Techniques to Computational Fluid Dynamics and Solid Mechanics Simulations and Analysis; HPC-IODC: HPC I/O in the Data Center Workshop; Compiler-assisted Correctness Checking and Performance Optimization for HPC; Machine Learning on HPC Systems; 4th International Workshop on Interoperability of Supercomputing and Cloud Technologies; 2nd International Workshop on Monitoring and Operational Data Analytics; 16th Workshop on Virtualization in High--Performance Cloud Computing; Deep Learning on Supercomputers; 5th International Workshop on In Situ Visualization. The 35 papers included in this volume were carefully reviewed and selected. They cover all aspects of research, development, and application of large-scale, high performance experimental and commercial systems. Topics include high-performance computing (HPC), computer architecture and hardware, programming models, system software, performance analysis and modeling, compiler analysis and optimization techniques, software sustainability, scientific applications, deep learning.}, isbn = {978-3-030-90538-5}, doi = {10.1007/978-3-030-90539-2}, author = {Heike Jagode and Anzt, Hartwig and Ltaief, Hatem and Piotr Luszczek} } @article {, title = {Materials fingerprinting classification}, journal = {Computer Physics Communications}, year = {2021}, month = {Jan-05-2021}, pages = {108019}, abstract = {Significant progress in many classes of materials could be made with the availability of experimentally-derived large datasets composed of atomic identities and three-dimensional coordinates. Methods for visualizing the local atomic structure, such as atom probe tomography (APT), which routinely generate datasets comprised of millions of atoms, are an important step in realizing this goal. 
However, state-of-the-art APT instruments generate noisy and sparse datasets that provide information about elemental type, but obscure atomic structures, thus limiting their subsequent value for materials discovery. The application of a materials fingerprinting process, a machine learning algorithm coupled with topological data analysis, provides an avenue by which heretofore unprecedented structural information can be extracted from an APT dataset. As a proof of concept, the material fingerprint is applied to high-entropy alloy APT datasets containing body-centered cubic (BCC) and face-centered cubic (FCC) crystal structures. A local atomic configuration centered on an arbitrary atom is assigned a topological descriptor, with which it can be characterized as a BCC or FCC lattice with near perfect accuracy, despite the inherent noise in the dataset. This successful identification of a fingerprint is a crucial first step in the development of algorithms that can extract more nuanced information, such as chemical ordering, from existing datasets of complex materials.}, keywords = {Atom probe tomography, High entropy alloy, Machine Learning, Materials discovery, Topological data analysis}, issn = {0010-4655}, doi = {10.1016/j.cpc.2021.108019}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0010465521001314}, author = {Adam Spannaus and Kody J. H. Law and Piotr Luszczek and Farzana Nasrin and Cassie Putman Micucci and Peter K. Liaw and Louis J. Santodonato and David J. Keffer and Vasileios Maroulas} } @techreport {, title = {Mixed-Precision Algorithm for Finding Selected Eigenvalues and Eigenvectors of Symmetric and Hermitian Matrices}, journal = {ICL Technical Report}, number = {ICL-UT-21-05}, year = {2021}, month = {2021-08}, abstract = {As new hardware is being equipped with powerful low-precision capabilities driven primarily by the needs of the burgeoning field of Artificial Intelligence (AI), mixed-precision algorithms are now showing far greater potential and attracting renewed interest in the scientific computing community. Multi-precision methods commonly follow an approximate-iterate scheme by first obtaining an approximate solution from a low-precision factorization and solve. They then iteratively refine the solution to the desired accuracy, which is often as high as what is possible with traditional approaches. While targeting symmetric and Hermitian eigenvalue problems of the form $Ax = \lambda x$, we revisit the SICE algorithm proposed by Dongarra et al. By applying the Sherman-Morrison formula to the diagonally-shifted tridiagonal systems, we propose an updated SICE-SM algorithm. By incorporating the latest two-stage algorithms from the PLASMA and MAGMA software libraries for numerical linear algebra, we achieved up to a 3.6{\texttimes} speedup using the mixed-precision eigensolver with the blocked SICE-SM algorithm for iterative refinement when compared with full double complex precision solvers for cases where a portion of the eigenvalues and eigenvectors is requested.}, keywords = {eigenvalue solver, hardware accelerators, mixed-precision algorithms}, author = {Yaohung M.
Tsai and Piotr Luszczek and Jack Dongarra} } @techreport {, title = {P1673R3: A Free Function Linear Algebra Interface Based on the BLAS}, journal = {ISO JTC1 SC22 WG21}, number = {P1673R3}, year = {2021}, month = {2021-04}, publisher = {ISO}, type = {standard}, abstract = {We believe this proposal is complementary to P1385, a proposal for a C++ Standard linear algebra library that introduces matrix and vector classes and overloaded arithmetic operators. In fact, we think that our proposal would make a natural foundation for a library like what P1385 proposes. However, a free function interface -- which clearly separates algorithms from data structures -- more naturally allows for a richer set of operations such as what the BLAS provides. A natural extension of the present proposal would include accepting P1385{\textquoteright}s matrix and vector objects as input for the algorithms proposed here. A straightforward way to do that would be for P1385{\textquoteright}s matrix and vector objects to make views of their data available as basic_mdspan.}, keywords = {C++, linear algebra}, url = {http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p1673r3.pdf}, author = {Mark Hoemmen and Daisy Hollman and Christian Trott and Daniel Sunderland and Nevin Liber and Li-Ta Lo and Damien Lebrun-Grandie and Graham Lopez and Peter Caday and Sarah Knepper and Piotr Luszczek and Timothy Costa} } @article {, title = {A Set of Batched Basic Linear Algebra Subprograms and LAPACK Routines}, journal = {ACM Transactions on Mathematical Software (TOMS)}, volume = {47}, number = {3}, year = {2021}, pages = {1{\textendash}23}, abstract = {This article describes a standard API for a set of Batched Basic Linear Algebra Subprograms (Batched BLAS or BBLAS). The focus is on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. The matrices are grouped together in uniformly sized groups, with just one group if all the matrices are of equal size. The aim is to provide more efficient, but portable, implementations of algorithms on high-performance many-core platforms. These include multicore and many-core CPU processors, GPUs and coprocessors, and other hardware accelerators with floating-point compute facility. As well as the standard types of single and double precision, we also include half and quadruple precision in the standard. In particular, half precision is used in many very large-scale applications, such as those associated with machine learning.}, keywords = {Computations on matrices, Mathematical analysis, Mathematics of computing, Numerical analysis}, doi = {10.1145/3431921}, author = {Ahmad Abdelfattah and Timothy Costa and Jack Dongarra and Mark Gates and Azzam Haidar and Sven Hammarling and Nicholas J. Higham and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and others} } @inproceedings {, title = {Task-graph scheduling extensions for efficient synchronization and communication}, journal = {Proceedings of the ACM International Conference on Supercomputing}, year = {2021}, pages = {88{\textendash}101}, abstract = {Task graphs have been studied for decades as a foundation for scheduling irregular parallel applications and incorporated into many programming models, including OpenMP. While many high-performance parallel libraries are based on task graphs, they also have additional scheduling requirements, such as synchronization within inner levels of data parallelism and internal blocking communications.
In this paper, we extend task-graph scheduling to support efficient synchronization and communication within tasks. Compared to past work, our scheduler avoids deadlock and oversubscription of worker threads, and refines victim selection to increase the overlap of sibling tasks. To the best of our knowledge, our approach is the first to combine gang-scheduling and work-stealing in a single runtime. Our approach has been evaluated on the SLATE high-performance linear algebra library. Relative to the LLVM OMP runtime, our runtime demonstrates performance improvements of up to 13.82\%, 15.2\%, and 36.94\% for LU, QR, and Cholesky, respectively, evaluated across different configurations related to matrix size, number of nodes, and use of CPUs vs. GPUs.}, keywords = {Compilers, Computing methodologies, Parallel computing methodologies, Parallel programming languages, Runtime environments, Software and its engineering, Software notations and tools}, doi = {10.1145/3447818.3461616}, author = {Seonmyeong Bak and Oscar Hernandez and Mark Gates and Piotr Luszczek and Vivek Sarkar} } @article {, title = {Translational process: Mathematical software perspective}, journal = {Journal of Computational Science}, volume = {52}, year = {2021}, pages = {101216}, abstract = {Each successive generation of computer architecture has brought new challenges to achieving high performance mathematical solvers, necessitating development and analysis of new algorithms, which are then embodied in software libraries. These libraries hide architectural details from applications, allowing them to achieve a level of portability across platforms from desktops to world-class high performance computing (HPC) systems. Thus there has been an informal translational computer science process of developing algorithms and distributing them in open source software libraries for adoption by applications and vendors. With the move to exascale, increasing intentionality about this process will benefit the long-term sustainability of the scientific software stack.}, keywords = {communication avoiding algorithms, DATAFLOW scheduling runtimes, hardware accelerators}, doi = {10.1016/j.jocs.2020.101216}, author = {Jack Dongarra and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @article {, title = {Clover: Computational Libraries Optimized via Exascale Research}, year = {2020}, month = {2020-02}, publisher = {2020 Exascale Computing Project Annual Meeting}, address = {Houston, TX}, author = {Mark Gates and Stanimire Tomov and Hartwig Anzt and Piotr Luszczek and Jack Dongarra} } @conference {1478, title = {Communication Avoiding 2D Stencil Implementations over PaRSEC Task-Based Runtime}, booktitle = {2020 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, year = {2020}, month = {2020-05}, publisher = {IEEE}, organization = {IEEE}, address = {New Orleans, LA}, abstract = {Stencil computation and the general sparse matrix-vector product (SpMV) are key components of many algorithms, such as geometric multigrid and Krylov solvers. But their low arithmetic intensity means that memory bandwidth and network latency will be the performance-limiting factors. The current architectural trend favors computations over bandwidth, worsening the already unfavorable imbalance.
Previous work approached stencil kernel optimization either by improving memory bandwidth usage or by providing a Communication Avoiding (CA) scheme to minimize network latency in repeated sparse vector multiplication by replicating remote work in order to delay communications on the critical path. Focusing on minimizing the communication bottleneck in distributed stencil computation, in this study we combine a CA scheme with the computation and communication overlapping that is inherent in a dataflow task-based runtime system such as PaRSEC to demonstrate their combined benefits. We implemented the 2D five-point stencil (Jacobi iteration) in PETSc, and over PaRSEC in two flavors: full communications (base-PaRSEC) and CA-PaRSEC, which operate directly on a 2D compute grid. Our results running on two clusters, NaCL and Stampede2, indicate that we can achieve a 2{\texttimes} speedup over the standard SpMV solution implemented in PETSc, and in certain cases when kernel execution does not dominate the execution time, the CA-PaRSEC version achieved up to 57\% and 33\% speedup over the base-PaRSEC implementation on NaCL and Stampede2, respectively.}, doi = {https://doi.org/10.1109/IPDPSW50202.2020.00127}, author = {Yu Pei and Qinglei Cao and George Bosilca and Piotr Luszczek and Victor Eijkhout and Jack Dongarra} } @conference {1470, title = {Docker Container based PaaS Cloud Computing Comprehensive Benchmarks using LAPACK}, booktitle = {Computer Modeling and Intelligent Systems CMIS-2020}, year = {2020}, month = {2020-03}, address = {Zaporizhzhia}, abstract = {The Platform as a Service (PaaS) cloud computing model is now widely implemented within Docker containers. Docker uses operating system level virtualization to deliver software in packages called containers. Containers are isolated from one another and comprise all the required software, including the operating system API, libraries, and configuration files. Given such advantageous integrity, one may question Docker{\textquoteright}s performance. The present paper applies the LAPACK package, which is widely used for performance benchmarking of supercomputers, to collect and compare benchmarks of Docker on Linux Ubuntu and MS Windows platforms. After a brief overview of Docker and LAPACK, a series of Docker images containing LAPACK is created and run, and abundant benchmarks are obtained and represented in tabular and graphical form. From the final discussion, we conclude that Docker runs with nearly the same performance on both Linux and Windows platforms; the slowdown does not exceed some ten percent, though Docker performance on Windows is essentially limited by the amount of RAM allocated to the Docker Engine.}, keywords = {docker containers, software containers}, author = {Dmitry Zaitsev and Piotr Luszczek} } @conference {1482, title = {Improving the Performance of the GMRES Method using Mixed-Precision Techniques}, booktitle = {Smoky Mountains Computational Sciences \& Engineering Conference (SMC2020)}, year = {2020}, month = {2020-08}, abstract = {The GMRES method is used to solve sparse, non-symmetric systems of linear equations arising from many scientific applications. The solver performance within a single node is memory bound, due to the low arithmetic intensity of its computational kernels. To reduce the amount of data movement, and thus improve performance, we investigated the effect of using a mix of single and double precision while retaining double-precision accuracy.
Previous efforts have explored reduced precision in the preconditioner, but the use of reduced precision in the solver itself has received limited attention. We found that GMRES only needs double precision in computing the residual and updating the approximate solution to achieve double-precision accuracy, although it must restart after each improvement of single-precision accuracy. This finding holds for the tested orthogonalization schemes: Modified Gram-Schmidt (MGS) and Classical Gram-Schmidt with Re-orthogonalization (CGSR). Furthermore, our mixed-precision GMRES, when restarted at least once, performed 19\% and 24\% faster on average than double-precision GMRES for MGS and CGSR, respectively. Our implementation uses generic programming techniques to ease the burden of coding implementations for different data types. Our use of the Kokkos library allowed us to exploit parallelism and optimize data management. Additionally, KokkosKernels was used when producing performance results. In conclusion, using a mix of single and double precision in GMRES can improve performance while retaining double-precision accuracy.}, keywords = {Kokkos, Krylov subspace methods, linear algebra, mixed precision}, author = {Neil Lindquist and Piotr Luszczek and Jack Dongarra} } @inbook {1213, title = {Interoperable Convergence of Storage, Networking, and Computation}, booktitle = {Advances in Information and Communication: Proceedings of the 2019 Future of Information and Communication Conference (FICC)}, number = {2}, year = {2020}, pages = {667-690}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, abstract = {In every form of digital store-and-forward communication, intermediate forwarding nodes are computers, with attendant memory and processing resources. This has inevitably stimulated efforts to create a wide-area infrastructure that goes beyond simple store-and-forward to create a platform that makes more general and varied use of the potential of this collection of increasingly powerful nodes. Historically, these efforts predate the advent of globally routed packet networking. The desire for a converged infrastructure of this kind has only intensified over the last 30 years, as memory, storage, and processing resources have increased in both density and speed while simultaneously decreasing in cost. Although there is a general consensus that it should be possible to define and deploy such a dramatically more capable wide-area platform, a great deal of investment in research prototypes has yet to produce a credible candidate architecture. 
Drawing on technical analysis, historical examples, and case studies, we present an argument for the hypothesis that in order to realize a distributed system with the kind of convergent generality and deployment scalability that might qualify as "future-defining," we must build it from a small set of simple, generic, and limited abstractions of the low level resources (processing, storage and network) of its intermediate nodes.}, keywords = {active networks, distributed cloud, distributed processing, distributed storage, edge computing, network convergence, network layering, scalability}, isbn = {978-3-030-12385-7}, author = {Micah Beck and Terry Moore and Piotr Luszczek and Anthony Danalis}, editor = {Kohei Arai and Rahul Bhatia} } @article {, title = {The PLASMA Library on CORAL Systems and Beyond (Poster)}, year = {2020}, month = {2020-02}, publisher = {2020 Exascale Computing Project Annual Meeting}, address = {Houston, TX}, author = {Piotr Luszczek and Jack Dongarra} } @techreport {, title = {Prospectus for the Next LAPACK and ScaLAPACK Libraries: Basic ALgebra LIbraries for Sustainable Technology with Interdisciplinary Collaboration (BALLISTIC)}, journal = {LAPACK Working Notes}, number = {297, ICL-UT-20-07}, year = {2020}, month = {2020/07}, publisher = {University of Tennessee}, abstract = {The convergence of several unprecedented changes, including formidable new system design constraints and revolutionary levels of heterogeneity, has made it clear that much of the essential software infrastructure of computational science and engineering is, or will soon be, obsolete. Math libraries have historically been in the vanguard of software that must be adapted first to such changes, both because these low-level workhorses are so critical to the accuracy and performance of so many different types of applications, and because they have proved to be outstanding vehicles for finding and implementing solutions to the problems that novel architectures pose. Under the Basic ALgebra LIbraries for Sustainable Technology with Interdisciplinary Collaboration (BALLISTIC) project, the principal designers of the Linear Algebra PACKage (LAPACK) and the Scalable Linear Algebra PACKage (ScaLAPACK), the combination of which is abbreviated Sca/LAPACK, aim to enhance and update these libraries for the ongoing revolution in processor architecture, system design, and application requirements by incorporating them into a layered package of software components{\textemdash}the BALLISTIC ecosystem{\textemdash}that provides users seamless access to state-of-the-art solver implementations through familiar and improved Sca/LAPACK interfaces.}, author = {James Demmel and Jack Dongarra and Julie Langou and Julien Langou and Piotr Luszczek and Michael Mahoney} } @conference {, title = {Replacing Pivoting in Distributed Gaussian Elimination with Randomized Techniques}, booktitle = {2020 IEEE/ACM 11th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems (ScalA)}, year = {2020}, month = {2020-11}, publisher = {IEEE}, organization = {IEEE}, address = {Atlanta, GA}, abstract = {Gaussian elimination is a key technique for solving dense, non-symmetric systems of linear equations. Pivoting is used to ensure numerical stability but can introduce significant overheads. We propose replacing pivoting with recursive butterfly transforms (RBTs) and iterative refinement. RBTs use an FFT-like structure and randomized elements to provide an efficient, two-sided preconditioner for factoring. 
This approach was implemented and tested using Software for Linear Algebra Targeting Exascale (SLATE). In numerical experiments, our implementation was more robust than Gaussian elimination with no pivoting (GENP) but failed to solve all the problems solvable with Gaussian elimination with partial pivoting (GEPP). Furthermore, the proposed solver was able to outperform GEPP when distributed on GPU-accelerated nodes.}, keywords = {linear systems, Randomized algorithms}, author = {Neil Lindquist and Piotr Luszczek and Jack Dongarra} } @conference {, title = {Scalable Data Generation for Evaluating Mixed-Precision Solvers}, booktitle = {2020 IEEE High Performance Extreme Computing Conference (HPEC)}, year = {2020}, month = {2020-09}, publisher = {IEEE}, organization = {IEEE}, address = {Waltham, MA, USA}, abstract = {We present techniques for generating data for mixed-precision solvers that allow those solvers to be tested in a scalable manner. Our techniques focus on mixed-precision hardware and software where both the solver and the hardware can take advantage of mixing multiple floating-point precision formats. This makes it possible to take advantage of the recently released generation of hardware platforms that focus on ML and DNN workloads but can also be utilized for HPC applications if a new breed of algorithms is combined with the custom floating-point formats to deliver performance levels beyond the standard IEEE data types while delivering a comparable accuracy of the results.}, doi = {https://doi.org/10.1109/HPEC43674.2020.9286145}, author = {Piotr Luszczek and Yaohung Tsai and Neil Lindquist and Hartwig Anzt and Jack Dongarra} } @article {, title = {A Set of Batched Basic Linear Algebra Subprograms}, journal = {ACM Transactions on Mathematical Software}, year = {2020}, month = {2020-10}, abstract = {This paper describes a standard API for a set of Batched Basic Linear Algebra Subprograms (Batched BLAS or BBLAS). The focus is on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. The matrices are grouped together in uniformly sized groups, with just one group if all the matrices are of equal size. The aim is to provide more efficient, but portable, implementations of algorithms on high-performance many-core platforms. These include multicore and many-core CPU processors, GPUs and coprocessors, and other hardware accelerators with floating-point compute facility. As well as the standard types of single and double precision, we also include half and quadruple precision in the standard. In particular, half precision is used in many very large-scale applications, such as those associated with machine learning.}, author = {Ahmad Abdelfattah and Timothy Costa and Jack Dongarra and Mark Gates and Azzam Haidar and Sven Hammarling and Nicholas J. Higham and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Mawussi Zounon} } @techreport {, title = {A Survey of Numerical Methods Utilizing Mixed Precision Arithmetic}, journal = {SLATE Working Notes}, number = {15, ICL-UT-20-08}, year = {2020}, month = {2020-07}, publisher = {University of Tennessee}, type = {SLATE Working Notes}, author = {Ahmad Abdelfattah and Hartwig Anzt and Erik Boman and Erin Carson and Terry Cojean and Jack Dongarra and Mark Gates and Thomas Gruetzmacher and Nicholas J.
Higham and Sherry Li and Neil Lindquist and Yang Liu and Jennifer Loe and Piotr Luszczek and Pratik Nayak and Sri Pranesh and Siva Rajamanickam and Tobias Ribizel and Barry Smith and Kasia Swirydowicz and Stephen Thomas and Stanimire Tomov and Yaohung Tsai and Ichitaro Yamazaki and Ulrike Meier Yang} } @article {, title = {Translational Process: Mathematical Software Perspective}, journal = {Journal of Computational Science}, year = {2020}, month = {2020-09}, abstract = {Each successive generation of computer architecture has brought new challenges to achieving high performance mathematical solvers, necessitating development and analysis of new algorithms, which are then embodied in software libraries. These libraries hide architectural details from applications, allowing them to achieve a level of portability across platforms from desktops to world-class high performance computing (HPC) systems. Thus there has been an informal translational computer science process of developing algorithms and distributing them in open source software libraries for adoption by applications and vendors. With the move to exascale, increasing intentionality about this process will benefit the long-term sustainability of the scientific software stack.}, keywords = {communication avoiding algorithms, DATAFLOW scheduling runtimes, hardware accelerators}, doi = {https://doi.org/10.1016/j.jocs.2020.101216}, author = {Jack Dongarra and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @techreport {, title = {Translational Process: Mathematical Software Perspective}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-20-11}, year = {2020}, month = {2020-08}, abstract = {Each successive generation of computer architecture has brought new challenges to achieving high performance mathematical solvers, necessitating development and analysis of new algorithms, which are then embodied in software libraries. These libraries hide architectural details from applications, allowing them to achieve a level of portability across platforms from desktops to world-class high performance computing (HPC) systems. Thus there has been an informal translational computer science process of developing algorithms and distributing them in open source software libraries for adoption by applications and vendors. With the move to exascale, increasing intentionality about this process will benefit the long-term sustainability of the scientific software stack.}, keywords = {communication avoiding algorithms, data flow scheduling runtimes, hardware accelerators}, author = {Jack Dongarra and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @article {, title = {Using Quantized Integer in LU Factorization with Partial Pivoting (Poster)}, year = {2020}, month = {2020-02}, publisher = {SIAM Conference on Parallel Processing for Scientific Computing (SIAM PP20)}, address = {Seattle, WA}, abstract = {Quantization is a common technique to speed up deep learning inference. It uses integers with a shared scalar to represent a set of equally spaced numbers. The quantized integer method has shown great success in compressing deep learning models, reducing the computation cost without losing too much accuracy. New application-specific hardware and specialized CPU extension instructions like Intel AVX-512 VNNI provide the capability to perform integer MADD (multiply and add) operations efficiently.
In this poster, we would like to show our preliminary results of using quantized integers for LU factorization with partial pivoting. Using Int32, the backward error can be better than that of single precision. However, quantized integers have the same limited-range issue as FP16: they do not work directly for large matrices because large numbers occur in the factored U. We will show some possible solutions and how we would like to apply this quantized-integer technique to other numerical linear algebra applications.}, author = {Yaohung Tsai and Piotr Luszczek and Jack Dongarra} } @techreport {1399, title = {A Collection of White Papers from the BDEC2 Workshop in Poznan, Poland}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-19-10}, year = {2019}, month = {2019-05}, publisher = {University of Tennessee, Knoxville}, author = {Gabriel Antoniu and Alexandru Costan and Ovidiu Marcu and Maria S. P{\'e}rez and Nenad Stojanovic and Rosa M. Badia and Miguel V{\'a}zquez and Sergi Girona and Micah Beck and Terry Moore and Piotr Luszczek and Ezra Kissel and Martin Swany and Geoffrey Fox and Vibhatha Abeykoon and Selahattin Akkas and Kannan Govindarajan and Gurhan Gunduz and Supun Kamburugamuve and Niranda Perera and Ahmet Uyar and Pulasthi Wickramasinghe and Chathura Widanage and Maria Girone and Toshihiro Hanawa and Richard Moreno and Ariel Oleksiak and Martin Swany and Ryousei Takano and M.P. van Haarlem and J. van Leeuwen and J.B.R. Oonk and T. Shimwell and L.V.E. Koopmans} } @conference {1441, title = {Increasing Accuracy of Iterative Refinement in Limited Floating-Point Arithmetic on Half-Precision Accelerators}, booktitle = {IEEE High Performance Extreme Computing Conference (HPEC 2019), Best Paper Finalist}, year = {2019}, month = {2019-09}, publisher = {IEEE}, organization = {IEEE}, address = {Waltham, MA}, abstract = {The emergence of deep learning as a leading computational workload for machine learning tasks on large-scale cloud infrastructure installations has led to a plethora of accelerator hardware releases. However, the reduced precision and range of the floating-point numbers on these new platforms make it a non-trivial task to leverage these unprecedented advances in computational power for numerical linear algebra operations that come with a guarantee of robust error bounds. In order to address these concerns, we present a number of strategies that can be used to increase the accuracy of limited-precision iterative refinement. By limited precision, we mean 16-bit floating-point formats that are implemented in modern hardware accelerators and are not necessarily compliant with the IEEE half-precision specification. We include an explanation of the broader context and connections to established IEEE floating-point standards and existing high-performance computing (HPC) benchmarks. We also present a new formulation of LU factorization that we call signed square root LU, which produces more numerically balanced L and U factors that directly address the problems of the limited range of the low-precision storage formats. The experimental results indicate that it is possible to recover substantial amounts of the accuracy in the system solution that would otherwise be lost. Previously, this could only be achieved by using iterative refinement based on single-precision floating-point arithmetic.
The discussion will also explore the numerical stability issues that are important for robust linear solvers on these new hardware platforms.}, author = {Piotr Luszczek and Ichitaro Yamazaki and Jack Dongarra} } @article {1270, title = {PLASMA: Parallel Linear Algebra Software for Multicore Using OpenMP}, journal = {ACM Transactions on Mathematical Software}, volume = {45}, year = {2019}, month = {2019-06}, doi = {https://doi.org/10.1145/3264491}, author = {Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Panruo Wu and Ichitaro Yamazaki and Asim YarKhan and Maksims Abalenkovs and Negin Bagherpour and Sven Hammarling and Jakub Sistek} } @conference {1378, title = {Software-Defined Events through PAPI}, booktitle = {2019 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, year = {2019}, month = {2019-05}, publisher = {IEEE}, organization = {IEEE}, address = {Rio de Janeiro, Brazil}, abstract = {PAPI has been used for almost two decades as an abstraction and standardization layer for profiling hardware-specific performance metrics. However, application developers{\textemdash}and profiling software packages{\textemdash}are quite often interested in information beyond hardware counters, such as the behavior of libraries used by the software that is being profiled. So far, accessing this information has required interfacing directly with the libraries on a case-by-case basis, or low-level binary instrumentation. In this paper, we introduce the new Software-Defined Event (SDE) component of PAPI, which aims to enable PAPI to serve as an abstraction and standardization layer for events that originate in software layers as well. Extending PAPI to include SDEs enables monitoring of both types of performance events{\textemdash}hardware- and software-related events{\textemdash}in a uniform way, through the same consistent PAPI interface. Furthermore, implementing SDE as a PAPI component means that the new API is aimed only at the library developers who wish to export events from within their libraries. The API for reading PAPI events{\textemdash}both hardware and software{\textemdash}remains the same, so all legacy codes and tools that use PAPI will not only continue to work, but they will automatically be able to read SDEs wherever those are available. The goal of this paper is threefold. First, we outline our design decisions regarding the functionality we offer through the new SDE interface, and offer simple examples of usage. Second, we illustrate how those events can be utilized by different software packages, specifically, by showcasing their use in the task-based runtime PaRSEC, and the HPCG supercomputing benchmark. Third, we provide a thorough performance analysis of the overhead that results from monitoring different types of SDEs, and showcase the negligible overhead of using PAPI SDE even in cases of extremely heavy use.}, doi = {https://doi.org/10.1109/IPDPSW.2019.00069}, author = {Anthony Danalis and Heike Jagode and Thomas Herault and Piotr Luszczek and Jack Dongarra} } @article {1271, title = {Autotuning Numerical Dense Linear Algebra for Batched Computation With GPU Hardware Accelerators}, journal = {Proceedings of the IEEE}, volume = {106}, year = {2018}, month = {2018-11}, pages = {2040{\textendash}2055}, abstract = {Computational problems in engineering and scientific disciplines often rely on the solution of many instances of small systems of linear equations, which are called batched solves.
In this paper, we focus on the important variants of both batch Cholesky factorization and subsequent substitution. The former requires the linear system matrices to be symmetric positive definite (SPD). We describe the implementation and automated performance engineering of these kernels that implement the factorization and the two substitutions. Our target platforms are graphics processing units (GPUs), which over the past decade have become an attractive high-performance computing (HPC) target for solvers of linear systems of equations. Due to their throughput-oriented design, GPUs exhibit the highest processing rates among the available processors. However, without careful design and coding, this speed is mostly restricted to large matrix sizes. We show an automated exploration of the implementation space as well as a new data layout for the batched class of SPD solvers. Our tests involve the solution of many thousands of linear SPD systems of exactly the same size. The primary focus of our techniques is on the individual matrices in the batch that have dimensions ranging from 5-by-5 up to 100-by-100. We compare our autotuned solvers against state-of-the-art solvers, such as those provided through NVIDIA channels and publicly available in the optimized MAGMA library. The observed performance is competitive and, in many practical cases, superior. The advantage of the presented methodology lies in achieving these results in a portable manner across matrix storage formats and GPU hardware architecture platforms.}, keywords = {Dense numerical linear algebra, performance autotuning}, doi = {10.1109/JPROC.2018.2868961}, author = {Jack Dongarra and Mark Gates and Jakub Kurzak and Piotr Luszczek and Yaohung Tsai} } @article {1277, title = {Autotuning Techniques for Performance-Portable Point Set Registration in 3D}, journal = {Supercomputing Frontiers and Innovations}, volume = {5}, number = {4}, year = {2018}, month = {2018-12}, chapter = {42}, abstract = {We present an autotuning approach applied to exhaustive performance engineering of the EM-ICP algorithm for the point set registration problem with a known reference. We were able to achieve progressively higher performance levels through a variety of code transformations and an automated procedure of generating a large number of implementation variants. Furthermore, we managed to exploit code patterns that are not common when only attempting manual optimization but which, in our tests, yielded better performance for the chosen registration algorithm. Finally, we also show how we maintained high performance levels in a portable fashion across a wide range of hardware platforms including multicore, manycore coprocessors, and accelerators. Each of these hardware classes differs substantially from the others and, consequently, cannot reliably be mastered by a single developer in the short time required to deliver a close-to-optimal implementation. We assert in our concluding remarks that our methodology as well as the presented tools provide a valid automation system for software optimization tasks on modern HPC hardware.}, doi = {10.14529/jsfi180404}, author = {Piotr Luszczek and Jakub Kurzak and Ichitaro Yamazaki and David Keffer and Vasileios Maroulas and Jack Dongarra} } @article {1300, title = {Batched BLAS (Basic Linear Algebra Subprograms) 2018 Specification}, year = {2018}, month = {2018-07}, abstract = {This document describes an API for Batch Basic Linear Algebra Subprograms (Batched BLAS or BBLAS).
We focus on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. The extensions beyond the original BLAS standard are considered that specify a programming interface not only for routines with uniformly-sized matrices and/or vectors but also for the situation where the sizes vary. The aim is to provide more efficient, but portable, implementations of algorithms on high-performance manycore platforms. These include multicore and many-core CPU processors; GPUs and coprocessors; as well as other hardware accelerators with floating-point compute facility.}, author = {Jack Dongarra and Iain Duff and Mark Gates and Azzam Haidar and Sven Hammarling and Nicholas J. Higham and Jonathan Hogg and Pedro Valero Lara and Piotr Luszczek and Mawussi Zounon and Samuel D. Relton and Stanimire Tomov and Timothy Costa and Sarah Knepper} } @techreport {1203, title = {Implementation of the C++ API for Batch BLAS}, journal = {SLATE Working Notes}, number = {07, ICL-UT-18-04}, year = {2018}, month = {2018-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Ahmad Abdelfattah and Mark Gates and Jakub Kurzak and Piotr Luszczek and Jack Dongarra} } @techreport {1228, title = {Linear Systems Performance Report}, journal = {SLATE Working Notes}, number = {08, ICL-UT-18-08}, year = {2018}, month = {2018-09}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Jakub Kurzak and Mark Gates and Ichitaro Yamazaki and Ali Charara and Asim YarKhan and Jamie Finney and Gerald Ragghianti and Piotr Luszczek and Jack Dongarra} } @techreport {1191, title = {Parallel BLAS Performance Report}, journal = {SLATE Working Notes}, number = {05, ICL-UT-18-01}, year = {2018}, month = {2018-04}, publisher = {University of Tennessee}, author = {Jakub Kurzak and Mark Gates and Asim YarKhan and Ichitaro Yamazaki and Panruo Wu and Piotr Luszczek and Jamie Finney and Jack Dongarra} } @techreport {1206, title = {Parallel Norms Performance Report}, journal = {SLATE Working Notes}, number = {06, ICL-UT-18-06}, year = {2018}, month = {2018-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Jakub Kurzak and Mark Gates and Asim YarKhan and Ichitaro Yamazaki and Piotr Luszczek and Jamie Finney and Jack Dongarra} } @article {1258, title = {The Singular Value Decomposition: Anatomy of Optimizing an Algorithm for Extreme Scale}, journal = {SIAM Review}, volume = {60}, year = {2018}, month = {2018-11}, pages = {808{\textendash}865}, abstract = {The computation of the singular value decomposition, or SVD, has a long history with many improvements over the years, both in its implementations and algorithmically. Here, we survey the evolution of SVD algorithms for dense matrices, discussing the motivation and performance impacts of changes. There are two main branches of dense SVD methods: bidiagonalization and Jacobi. Bidiagonalization methods started with the implementation by Golub and Reinsch in Algol60, which was subsequently ported to Fortran in the EISPACK library, and was later more efficiently implemented in the LINPACK library, targeting contemporary vector machines. To address cache-based memory hierarchies, the SVD algorithm was reformulated to use Level 3 BLAS in the LAPACK library. 
To address new architectures, ScaLAPACK was introduced to take advantage of distributed computing, and MAGMA was developed for accelerators such as GPUs. Algorithmically, the divide and conquer and MRRR algorithms were developed to reduce the number of operations. Still, these methods remained memory bound, so two-stage algorithms were developed to reduce memory operations and increase the computational intensity, with efficient implementations in PLASMA, DPLASMA, and MAGMA. Jacobi methods started with the two-sided method of Kogbetliantz and the one-sided method of Hestenes. They have likewise had many developments, including parallel and block versions and preconditioning to improve convergence. In this paper, we investigate the impact of these changes by testing various historical and current implementations on a common, modern multicore machine and a distributed computing platform. We show that algorithmic and implementation improvements have increased the speed of the SVD by several orders of magnitude, while using up to 40 times less energy.}, keywords = {bidiagonal matrix, bisection, Divide and conquer, Hestenes method, Jacobi method, Kogbetliantz method, MRRR, QR iteration, Singular value decomposition, SVD}, issn = {0036-1445}, doi = {10.1137/17M1117732}, url = {https://epubs.siam.org/doi/10.1137/17M1117732}, author = {Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki} } @article {1448, title = {Task Based Cholesky Decomposition on Xeon Phi Architectures using OpenMP}, journal = {International Journal of Computational Science and Engineering (IJCSE)}, volume = {17}, number = {3}, year = {2018}, month = {2018-10}, abstract = {The increasing number of computational cores in modern many-core processors, as represented by the Intel Xeon Phi architectures, has created the need for an open-source, high performance and scalable task-based dense linear algebra package that can efficiently use this type of many-core hardware. In this paper, we examined the design modifications necessary when porting PLASMA, a task-based dense linear algebra library, to run effectively on two generations of Intel{\textquoteright}s Xeon Phi architecture, known as Knights Corner (KNC) and Knights Landing (KNL). First, we modified PLASMA{\textquoteright}s tiled Cholesky decomposition to use OpenMP tasks for its scheduling mechanism to enable Xeon Phi compatibility. We then compared the performance of our modified code to that of the original dynamic scheduler running on an Intel Xeon Sandy Bridge CPU. Finally, we looked at the performance of the OpenMP tiled Cholesky decomposition on Knights Corner and Knights Landing processors. We detail the optimisations required to obtain performance on these platforms and compare with the highly tuned Intel MKL math library.}, doi = {10.1504/IJCSE.2018.095851}, author = {Joseph Dorris and Asim YarKhan and Jakub Kurzak and Piotr Luszczek and Jack Dongarra} } @conference {1169, title = {Autotuning Batch Cholesky Factorization in CUDA with Interleaved Layout of Matrices}, booktitle = {Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, year = {2017}, month = {2017-06}, publisher = {IEEE}, organization = {IEEE}, address = {Orlando, FL}, abstract = {Batch matrix operations address the case of solving the same linear algebra problem for a very large number of very small matrices.
In this paper, we focus on implementing the batch Cholesky factorization in CUDA, in single precision arithmetic, for NVIDIA GPUs. Specifically, we look into the benefits of using noncanonical data layouts, where consecutive memory locations store elements with the same row and column index in a set of consecutive matrices. We discuss a number of different implementation options and tuning parameters. We demonstrate superior performance to traditional implementations for the case of very small matrices.}, keywords = {batch computation, Cholesky Factorization, data layout, GPU computing, numerical linear algebra}, doi = {10.1109/IPDPSW.2017.18}, author = {Mark Gates and Jakub Kurzak and Piotr Luszczek and Yu Pei and Jack Dongarra} } @inbook {1167, title = {Bringing High Performance Computing to Big Data Algorithms}, booktitle = {Handbook of Big Data Technologies}, year = {2017}, publisher = {Springer}, organization = {Springer}, isbn = {978-3-319-49339-8}, doi = {10.1007/978-3-319-49340-4}, author = {Hartwig Anzt and Jack Dongarra and Mark Gates and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki} } @techreport {1175, title = {C++ API for Batch BLAS}, journal = {SLATE Working Notes}, number = {04, ICL-UT-17-12}, year = {2017}, month = {2017-12}, publisher = {University of Tennessee}, author = {Ahmad Abdelfattah and Konstantin Arturov and Cris Cecka and Jack Dongarra and Chip Freitag and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Panruo Wu} } @techreport {1081, title = {C++ API for BLAS and LAPACK}, journal = {SLATE Working Notes}, number = {02, ICL-UT-17-03}, year = {2017}, note = {Revision 02-21-2018}, month = {2017-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Mark Gates and Piotr Luszczek and Ahmad Abdelfattah and Jakub Kurzak and Jack Dongarra and Konstantin Arturov and Cris Cecka and Chip Freitag} } @techreport {1138, title = {The Case for Directive Programming for Accelerator Autotuner Optimization}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-17-07}, year = {2017}, month = {2017-10}, publisher = {University of Tennessee}, abstract = {In this work, we present the use of compiler pragma directives for parallelizing autotuning of specialized compute kernels for hardware accelerators. These constructs parallelize the source code that prunes a generated search space with a large number of constraints for an autotuning infrastructure.
For better performance, we studied optimizations aimed at minimizing the run time. We also studied the behavior of the parallel load balance and the speedup on four different machines: x86, Xeon Phi, ARMv8, and POWER8.}, author = {Diana Fayad and Jakub Kurzak and Piotr Luszczek and Panruo Wu and Jack Dongarra} } @article {1131, title = {Comparing performance of s-step and pipelined GMRES on distributed-memory multicore CPUs}, year = {2017}, month = {2017-07}, publisher = {SIAM Annual Meeting}, address = {Pittsburgh, Pennsylvania}, author = {Ichitaro Yamazaki and Mark Hoemmen and Piotr Luszczek and Jack Dongarra} } @article {1165, title = {Design and Implementation of the PULSAR Programming System for Large Scale Computing}, journal = {Supercomputing Frontiers and Innovations}, volume = {4}, year = {2017}, abstract = {The objective of the PULSAR project was to design a programming model suitable for large scale machines with complex memory hierarchies, and to deliver a prototype implementation of a runtime system supporting that model. PULSAR tackled the challenge by proposing a programming model based on systolic processing and virtualization. The PULSAR programming model is quite simple, with point-to-point channels as the main communication abstraction. The runtime implementation is very lightweight and fully distributed, and provides multithreading, message-passing and multi-GPU offload capabilities. Performance evaluation shows good scalability up to one thousand nodes with one thousand GPU accelerators.}, doi = {10.14529/jsfi170101}, url = {http://superfri.org/superfri/article/view/121/210}, author = {Jakub Kurzak and Piotr Luszczek and Ichitaro Yamazaki and Yves Robert and Jack Dongarra} } @techreport {1133, title = {Designing SLATE: Software for Linear Algebra Targeting Exascale}, journal = {SLATE Working Notes}, number = {03, ICL-UT-17-06}, year = {2017}, month = {2017-10}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Jakub Kurzak and Panruo Wu and Mark Gates and Ichitaro Yamazaki and Piotr Luszczek and Gerald Ragghianti and Jack Dongarra} } @inproceedings {1011, title = {Improving Performance of GMRES by Reducing Communication and Pipelining Global Collectives}, journal = {Proceedings of The 18th IEEE International Workshop on Parallel and Distributed Scientific and Engineering Computing (PDSEC 2017), Best Paper Award}, year = {2017}, month = {2017-06}, address = {Orlando, FL}, abstract = {We compare the performance of pipelined and s-step GMRES, respectively referred to as l-GMRES and s-GMRES, on distributed multicore CPUs. Compared to standard GMRES, s-GMRES requires fewer all-reduces, while l-GMRES overlaps the all-reduces with computation. To combine the best features of the two algorithms, we propose another variant, (l, t)-GMRES, that not only does fewer global all-reduces than standard GMRES, but also overlaps those all-reduces with other work. We implemented the thread-parallelism and communication-overlap in two different ways. The first uses nonblocking MPI collectives with thread-parallel computational kernels. The second relies on a shared-memory task scheduler. In our experiments, (l, t)-GMRES performed better than l-GMRES by factors of up to 1.67{\texttimes}.
In addition, though we used only 50 nodes, when the latency cost became significant, our variant performed up to 1.22{\texttimes} better than s-GMRES by hiding all-reduces.}, doi = {10.1109/IPDPSW.2017.65}, author = {Ichitaro Yamazaki and Mark Hoemmen and Piotr Luszczek and Jack Dongarra} } @techreport {1130, title = {MAGMA-sparse Interface Design Whitepaper}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-17-05}, year = {2017}, month = {2017-09}, type = {Technical Report}, abstract = {In this report we describe the logic and interface we develop for the MAGMA-sparse library to allow for easy integration as a third-party library into a top-level software ecosystem. The design choices are based on extensive consultation with other software library developers, in particular the Trilinos software development team. The interface documentation is at this point not exhaustive, but a first proposal for setting a standard. Although the interface description targets the MAGMA-sparse software module, we hope that the design choices carry beyond this specific library, and are attractive for adoption in other packages. This report is not intended as a static document, but will be updated over time to reflect the agile software development in the ECP 1.3.3.11 STMS11-PEEKS project.}, author = {Hartwig Anzt and Erik Boman and Jack Dongarra and Goran Flegar and Mark Gates and Mike Heroux and Mark Hoemmen and Jakub Kurzak and Piotr Luszczek and Sivasankaran Rajamanickam and Stanimire Tomov and Stephen Wood and Ichitaro Yamazaki} } @techreport {1173, title = {PLASMA 17 Performance Report}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-17-11}, year = {2017}, month = {2017-06}, publisher = {University of Tennessee}, abstract = {PLASMA (Parallel Linear Algebra for Multicore Architectures) is a dense linear algebra package at the forefront of multicore computing. PLASMA is designed to deliver the highest possible performance from a system with multiple sockets of multicore processors. PLASMA achieves this objective by combining state-of-the-art solutions in parallel algorithms, scheduling, and software engineering. PLASMA currently offers a collection of routines for solving linear systems of equations and least squares problems.}, author = {Maksims Abalenkovs and Negin Bagherpour and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Samuel Relton and Jakub Sistek and David Stevens and Panruo Wu and Ichitaro Yamazaki and Asim YarKhan and Mawussi Zounon} } @techreport {1172, title = {PLASMA 17.1 Functionality Report}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-17-10}, year = {2017}, month = {2017-06}, publisher = {University of Tennessee}, abstract = {PLASMA (Parallel Linear Algebra for Multicore Architectures) is a dense linear algebra package at the forefront of multicore computing. PLASMA is designed to deliver the highest possible performance from a system with multiple sockets of multicore processors. PLASMA achieves this objective by combining state-of-the-art solutions in parallel algorithms, scheduling, and software engineering.
PLASMA currently offers a collection of routines for solving linear systems of equations and least squares problems.}, author = {Maksims Abalenkovs and Negin Bagherpour and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Samuel Relton and Jakub Sistek and David Stevens and Panruo Wu and Ichitaro Yamazaki and Asim YarKhan and Mawussi Zounon} } @techreport {1080, title = {Roadmap for the Development of a Linear Algebra Library for Exascale Computing: SLATE: Software for Linear Algebra Targeting Exascale}, journal = {SLATE Working Notes}, number = {01, ICL-UT-17-02}, year = {2017}, month = {2017-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Ahmad Abdelfattah and Hartwig Anzt and Aurelien Bouteiller and Anthony Danalis and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Stephen Wood and Panruo Wu and Ichitaro Yamazaki and Asim YarKhan} } @conference {, title = {Scaling Point Set Registration in 3D Across Thread Counts on Multicore and Hardware Accelerator Platforms through Autotuning for Large Scale Analysis of Scientific Point Clouds}, booktitle = {IEEE International Workshop on Benchmarking, Performance Tuning and Optimization for Big Data Applications (BPOD 2017)}, year = {2017}, month = {2017-12}, publisher = {IEEE}, organization = {IEEE}, address = {Boston, MA}, abstract = {In this article, we present an autotuning approach applied to systematic performance engineering of the EM-ICP (Expectation-Maximization Iterative Closest Point) algorithm for the point set registration problem. We show how we were able to exceed the performance achieved by the reference code through multiple dependence transformations and an automated procedure of generating and evaluating numerous implementation variants. Furthermore, we also managed to exploit code transformations that are not common during manual optimization but yielded better performance in our tests for the EM-ICP algorithm. Finally, we maintained high levels of performance in a portable fashion across a wide range of HPC hardware platforms including multicore, many-core, and GPU-based accelerators. More importantly, the results indicate a consistently high performance level and the ability to move the task of data analysis through point-set registration to any modern compute platform without the concern of inferior asymptotic efficiency.}, doi = {10.1109/BigData.2017.8258258}, author = {Piotr Luszczek and Jakub Kurzak and Ichitaro Yamazaki and David Keffer and Jack Dongarra} } @conference {, title = {Towards Numerical Benchmark for Half-Precision Floating Point Arithmetic}, booktitle = {2017 IEEE High Performance Extreme Computing Conference (HPEC)}, year = {2017}, month = {2017-09}, publisher = {IEEE}, organization = {IEEE}, address = {Waltham, MA}, abstract = {With the NVIDIA Tegra Jetson X1 and Pascal P100 GPUs, NVIDIA introduced hardware-based computation on FP16 numbers, also called half-precision arithmetic. In this talk, we will introduce the steps required to build a viable benchmark for this new arithmetic format. This will include the connections to established IEEE floating point standards and existing HPC benchmarks.
The discussion will focus on performance and numerical stability issues that are important for this kind of benchmarking and how they relate to NVIDIA platforms.}, doi = {10.1109/HPEC.2017.8091031}, author = {Piotr Luszczek and Jakub Kurzak and Ichitaro Yamazaki and Jack Dongarra} } @article {, title = {With Extreme Computing, the Rules Have Changed}, journal = {Computing in Science \& Engineering}, volume = {19}, year = {2017}, month = {2017-05}, pages = {52-62}, abstract = {On the eve of exascale computing, traditional wisdom no longer applies. High-performance computing is gone as we know it. This article discusses a range of new algorithmic techniques emerging in the context of exascale computing, many of which defy the common wisdom of high-performance computing and are considered unorthodox, but could turn out to be a necessity in the near future.}, doi = {10.1109/MCSE.2017.48}, author = {Jack Dongarra and Stanimire Tomov and Piotr Luszczek and Jakub Kurzak and Mark Gates and Ichitaro Yamazaki and Hartwig Anzt and Azzam Haidar and Ahmad Abdelfattah} } @conference {961, title = {Hessenberg Reduction with Transient Error Resilience on GPU-Based Hybrid Architectures}, booktitle = {30th IEEE International Parallel \& Distributed Processing Symposium (IPDPS)}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {Graphics Processing Units (GPUs) have been seeing widespread adoption in the field of scientific computing, owing to the performance gains provided on computation-intensive applications. In this paper, we present the design and implementation of a Hessenberg reduction algorithm immune to simultaneous soft-errors, capable of taking advantage of hybrid GPU-CPU platforms. These soft-errors are detected and corrected on the fly, preventing the propagation of the error to the rest of the data. Our design is at the intersection of several fault-tolerant techniques and employs the algorithm-based fault tolerance technique, diskless checkpointing, and reverse computation to achieve its goal. By utilizing the idle time of the CPUs, and by overlapping both host-side and GPU-side workloads, we minimize the resilience overhead. Experimental results have validated our design decisions as our algorithm introduced less than 2\% performance overhead compared to the optimized, but fault-prone, hybrid Hessenberg reduction.}, author = {Yulu Jia and Piotr Luszczek and Jack Dongarra} } @conference {939, title = {Heterogeneous Streaming}, booktitle = {The Sixth International Workshop on Accelerators and Hybrid Exascale Systems (AsHES), IPDPS 2016}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {This paper introduces a new heterogeneous streaming library called hetero Streams (hStreams). We show how a simple FIFO streaming model can be applied to heterogeneous systems that include manycore coprocessors and multicore CPUs. This model supports concurrency across nodes, among tasks within a node, and between data transfers and computation. We give examples for different approaches, show how the implementation can be layered, analyze overheads among layers, and apply those models to parallelize applications using simple, intuitive interfaces. We compare the features and versatility of hStreams, OpenMP, CUDA Streams, and OmpSs.
We show how the use of hStreams makes it easier for scientists to identify tasks and expose concurrency among them, and how it enables tuning experts and runtime systems to tailor execution for different heterogeneous targets. Practical application examples are taken from the field of numerical linear algebra, commercial structural simulation software, and a seismic processing application.}, keywords = {plasma}, author = {Chris J. Newburn and Gaurav Bansal and Michael Wood and Luis Crivelli and Judit Planas and Alejandro Duran and Paulo Souza and Leonardo Borges and Piotr Luszczek and Stanimire Tomov and Jack Dongarra and Hartwig Anzt and Mark Gates and Azzam Haidar and Yulu Jia and Khairul Kabir and Ichitaro Yamazaki and Jesus Labarta} } @article {984, title = {High Performance Conjugate Gradient Benchmark: A New Metric for Ranking High Performance Computing Systems}, journal = {International Journal of High Performance Computing Applications}, volume = {30}, year = {2016}, month = {2016-02}, pages = {3-10}, issn = {1094-3420}, doi = {10.1177/1094342015593158}, url = {http://hpc.sagepub.com/cgi/doi/10.1177/1094342015593158}, author = {Jack Dongarra and Michael A. Heroux and Piotr Luszczek} } @article {1472, title = {Linear Algebra Software for Large-Scale Accelerated Multicore Computing}, journal = {Acta Numerica}, volume = {25}, year = {2016}, month = {2016-05}, pages = {1-160}, abstract = {Many crucial scientific computing applications, ranging from national security to medical advances, rely on high-performance linear algebra algorithms and technologies, underscoring their importance and broad impact. Here we present the state-of-the-art design and implementation practices for the acceleration of the predominant linear algebra algorithms on large-scale accelerated multicore systems. Examples are given with fundamental dense linear algebra algorithms {\textendash} from the LU, QR, Cholesky, and LDLT factorizations needed for solving linear systems of equations, to eigenvalue and singular value decomposition (SVD) problems. The implementations presented are readily available via the open-source PLASMA and MAGMA libraries, which represent the next-generation modernization of the popular LAPACK library for accelerated multicore systems. To generate the extreme level of parallelism needed for the efficient use of these systems, algorithms of interest are redesigned and then split into well-chosen computational tasks. The task execution is scheduled over the computational components of a hybrid system of multicore CPUs with GPU accelerators and/or Xeon Phi coprocessors, using either static scheduling or light-weight runtime systems. The use of light-weight runtime systems keeps scheduling overheads low, similar to static scheduling, while enabling the expression of parallelism through sequential-like code. This simplifies the development effort and allows exploration of the unique strengths of the various hardware components.
Finally, we emphasize the development of innovative linear algebra algorithms using three technologies {\textendash} mixed precision arithmetic, batched operations, and asynchronous iterations {\textendash} that are currently of high interest for accelerated multicore systems.}, doi = {10.1017/S0962492916000015}, author = {Ahmad Abdelfattah and Hartwig Anzt and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Asim YarKhan} } @techreport {, title = {MAGMA Batched: A Batched BLAS Approach for Small Matrix Factorizations and Applications on GPUs}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-16-02}, year = {2016}, month = {2016-08}, publisher = {University of Tennessee}, abstract = {A particularly challenging class of problems arising in many applications, called batched problems, involves linear algebra operations on many small-sized matrices. We proposed and designed batched BLAS (Basic Linear Algebra Subroutines), Level-2 GEMV and Level-3 GEMM, to solve them. We illustrate how batched GEMV and GEMM can assist batched advanced factorizations (e.g., bi-diagonalization) and other BLAS routines (e.g., triangular solve) in achieving optimal performance on GPUs. Our solutions achieved up to 2.8-3{\texttimes} speedups compared to CUBLAS and MKL solutions, wherever possible. We illustrated the batched methodology on a real-world hydrodynamic application by reformulating the tensor operations into batched BLAS GEMV and GEMM operations. A 2.5{\texttimes} speedup and a 1.4{\texttimes} greenup are obtained by changing 10\% of the code. We accelerated and scaled it on the Titan supercomputer to 4096 nodes.}, author = {Tingxing Dong and Azzam Haidar and Piotr Luszczek and Stanimire Tomov and Ahmad Abdelfattah and Jack Dongarra} } @article {, title = {A New Metric for Ranking High-Performance Computing Systems}, journal = {National Science Review}, volume = {3}, year = {2016}, month = {2016-01}, pages = {30-35}, doi = {10.1093/nsr/nwv084}, author = {Jack Dongarra and Michael A. Heroux and Piotr Luszczek} } @article {986, title = {Porting the PLASMA Numerical Library to the OpenMP Standard}, journal = {International Journal of Parallel Programming}, year = {2016}, month = {2016-06}, abstract = {PLASMA is a numerical library intended as a successor to LAPACK for solving problems in dense linear algebra on multicore processors. PLASMA relies on the QUARK scheduler for efficient multithreading of algorithms expressed in a serial fashion. QUARK is a superscalar scheduler and implements automatic parallelization by tracking data dependencies and resolving data hazards at runtime. Recently, this type of scheduling has been incorporated in the OpenMP standard, which allows PLASMA to transition from the proprietary solution offered by QUARK to the standard solution offered by OpenMP.
This article studies the feasibility of such a transition.}, issn = {0885-7458}, doi = {10.1007/s10766-016-0441-6}, url = {http://link.springer.com/10.1007/s10766-016-0441-6}, author = {Asim YarKhan and Jakub Kurzak and Piotr Luszczek and Jack Dongarra} } @conference {962, title = {Search Space Generation and Pruning System for Autotuners}, booktitle = {30th IEEE International Parallel \& Distributed Processing Symposium (IPDPS)}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {This work tackles two simultaneous challenges faced by autotuners: the ease of describing a complex, multidimensional search space, and the speed of evaluating that space, while applying a multitude of pruning constraints. This article presents a declarative notation for describing a search space and a translation system for conversion to a standard C code for fast and multithreaded, as necessary, evaluation. The notation is Python-based and thus simple in syntax and easy to assimilate by the user interested in tuning rather than learning a new programming language. A large number of dimensions and a large number of pruning constraints may be expressed with little effort. The system is discussed in the context of autotuning the canonical matrix multiplication kernel for NVIDIA GPUs, where the search space has 15 dimensions and involves application of 10 complex pruning constraints. The speed of evaluation is compared against generators created using an imperative programming style in various scripting and compiled languages.}, author = {Piotr Luszczek and Mark Gates and Jakub Kurzak and Anthony Danalis and Jack Dongarra} } @article {866, title = {Acceleration of GPU-based Krylov solvers via Data Transfer Reduction}, journal = {International Journal of High Performance Computing Applications}, year = {2015}, author = {Hartwig Anzt and William Sawyer and Stanimire Tomov and Piotr Luszczek and Jack Dongarra} } @conference {843, title = {Batched Matrix Computations on Hardware Accelerators}, booktitle = {EuroMPI/Asia 2015 Workshop}, year = {2015}, month = {2015-09}, address = {Bordeaux, France}, abstract = {Scientific applications require solvers that work on many small-size problems that are independent from each other. At the same time, the high-end hardware evolves rapidly and becomes ever more throughput-oriented and thus there is an increasing need for an effective approach to develop energy-efficient, high-performance codes for these small matrix problems that we call batched factorizations. The many applications that need this functionality could especially benefit from the use of GPUs, which currently are four to five times more energy efficient than multicore CPUs on important scientific workloads. This paper, consequently, describes the development of the most common, one-sided factorizations: Cholesky, LU, and QR for a set of small dense matrices. The algorithms we present together with their implementations are, by design, inherently parallel. In particular, our approach is based on representing the process as a sequence of batched BLAS routines that are executed entirely on a GPU.
Importantly, this is unlike the LAPACK and the hybrid MAGMA factorization algorithms that work under drastically different assumptions of hardware design and efficiency of execution of the various computational kernels involved in the implementation. Thus, our approach is more efficient than what works for a combination of multicore CPUs and GPUs for the problem sizes of interest in the application use cases. The paradigm whereby a single chip (a GPU or a CPU) factorizes a single problem at a time is not at all efficient in our applications{\textquoteright} context. We illustrate all these claims through a detailed performance analysis. With the help of profiling and tracing tools, we guide our development of batched factorizations to achieve up to two-fold speedup and three-fold better energy efficiency as compared against our highly optimized batched CPU implementations based on the MKL library. The tested system featured two sockets of Intel Sandy Bridge CPUs. Compared to the batched LU factorization featured in the CUBLAS library for GPUs, we achieve a speedup as high as 2.5{\texttimes} on the NVIDIA K40 GPU.}, author = {Azzam Haidar and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @article {858, title = {Batched matrix computations on hardware accelerators based on GPUs}, journal = {International Journal of High Performance Computing Applications}, year = {2015}, month = {2015-02}, abstract = {Scientific applications require solvers that work on many small-size problems that are independent from each other. At the same time, the high-end hardware evolves rapidly and becomes ever more throughput-oriented and thus there is an increasing need for an effective approach to develop energy-efficient, high-performance codes for these small matrix problems that we call batched factorizations. The many applications that need this functionality could especially benefit from the use of GPUs, which currently are four to five times more energy efficient than multicore CPUs on important scientific workloads. This paper, consequently, describes the development of the most common, one-sided factorizations, Cholesky, LU, and QR, for a set of small dense matrices. The algorithms we present together with their implementations are, by design, inherently parallel. In particular, our approach is based on representing the process as a sequence of batched BLAS routines that are executed entirely on a GPU. Importantly, this is unlike the LAPACK and the hybrid MAGMA factorization algorithms that work under drastically different assumptions of hardware design and efficiency of execution of the various computational kernels involved in the implementation. Thus, our approach is more efficient than what works for a combination of multicore CPUs and GPUs for the problem sizes of interest in the application use cases. The paradigm whereby a single chip (a GPU or a CPU) factorizes a single problem at a time is not at all efficient in our applications{\textquoteright} context. We illustrate all of these claims through a detailed performance analysis. With the help of profiling and tracing tools, we guide our development of batched factorizations to achieve up to two-fold speedup and three-fold better energy efficiency as compared against our highly optimized batched CPU implementations based on the MKL library.
The tested system featured two sockets of Intel Sandy Bridge CPUs. Compared with the batched LU factorization featured in the CUBLAS library for GPUs, we achieve a speedup as high as 2.5{\texttimes} on the NVIDIA K40 GPU.}, keywords = {batched factorization, hardware accelerators, numerical linear algebra, numerical software libraries, one-sided factorization algorithms}, doi = {10.1177/1094342014567546}, author = {Azzam Haidar and Tingxing Dong and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @conference {928, title = {Cholesky Across Accelerators}, booktitle = {17th IEEE International Conference on High Performance Computing and Communications (HPCC 2015)}, year = {2015}, month = {2015-08}, publisher = {IEEE}, organization = {IEEE}, address = {Elizabeth, NJ}, author = {Asim YarKhan and Azzam Haidar and Chongxiao Cao and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @conference {894, title = {Efficient Eigensolver Algorithms on Accelerator Based Architectures}, booktitle = {2015 SIAM Conference on Applied Linear Algebra (SIAM LA)}, year = {2015}, month = {2015-10}, publisher = {SIAM}, organization = {SIAM}, address = {Atlanta, GA}, abstract = {The enormous gap between the high-performance capabilities of GPUs and the slow interconnect between them has made the development of numerical software that is scalable across multiple GPUs extremely challenging. We describe a successful methodology on how to address the challenges {\textendash} starting from our algorithm design, kernel optimization and tuning, to our programming model {\textendash} in the development of a scalable high-performance symmetric eigenvalue and singular value solver.}, author = {Azzam Haidar and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @article {982, title = {Experiences in Autotuning Matrix Multiplication for Energy Minimization on GPUs}, journal = {Concurrency and Computation: Practice and Experience}, volume = {27}, year = {2015}, month = {2015-12}, pages = {5096-5113}, abstract = {In this paper, we report extensive results and analysis of autotuning the computationally intensive graphics processing units kernel for dense matrix{\textendash}matrix multiplication in double precision. In contrast to traditional autotuning and/or optimization for runtime performance only, we also take the energy efficiency into account. For kernels achieving equal performance, we show significant differences in their energy balance. We also identify the memory throughput as the most influential metric that trades off performance and energy efficiency.
As a result, the performance-optimal case ends up not being the most efficient kernel in overall resource use.}, keywords = {Autotuning, energy efficiency, hardware accelerators, matrix multiplication, power}, doi = {10.1002/cpe.3516}, url = {http://doi.wiley.com/10.1002/cpe.3516}, author = {Hartwig Anzt and Blake Haugen and Jakub Kurzak and Piotr Luszczek and Jack Dongarra} } @conference {887, title = {Flexible Linear Algebra Development and Scheduling with Cholesky Factorization}, booktitle = {17th IEEE International Conference on High Performance Computing and Communications}, year = {2015}, month = {2015-08}, address = {Newark, NJ}, abstract = {Modern high performance computing environments are composed of networks of compute nodes that often contain a variety of heterogeneous compute resources, such as multicore CPUs, GPUs, and coprocessors. One challenge faced by domain scientists is how to efficiently use all these distributed, heterogeneous resources. In order to use the GPUs effectively, the workload parallelism needs to be much greater than the parallelism for a multicore CPU. On the other hand, a Xeon Phi coprocessor will work most effectively with a degree of parallelism between that of GPUs and multicore CPUs. Additionally, effectively using distributed memory nodes brings out another level of complexity where the workload must be carefully partitioned over the nodes. In this work we are using a lightweight runtime environment to handle many of the complexities in such distributed, heterogeneous systems. The runtime environment uses task-superscalar concepts to enable the developer to write serial code while providing parallel execution. The task-programming model allows the developer to write resource-specialization code, so that each resource gets the appropriately sized workload-grain. Our task programming abstraction enables the developer to write a single algorithm that will execute efficiently across the distributed heterogeneous machine. We demonstrate the effectiveness of our approach with performance results for dense linear algebra applications, specifically the Cholesky factorization.}, author = {Azzam Haidar and Asim YarKhan and Chongxiao Cao and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @conference {864, title = {Framework for Batched and GPU-resident Factorization Algorithms to Block Householder Transformations}, booktitle = {ISC High Performance}, year = {2015}, month = {2015-07}, publisher = {Springer}, organization = {Springer}, address = {Frankfurt, Germany}, author = {Azzam Haidar and Tingxing Dong and Stanimire Tomov and Piotr Luszczek and Jack Dongarra} } @article {923, title = {High-Performance Conjugate-Gradient Benchmark: A New Metric for Ranking High-Performance Computing Systems}, journal = {The International Journal of High Performance Computing Applications}, year = {2015}, abstract = {We describe a new high-performance conjugate-gradient (HPCG) benchmark. HPCG is composed of computations and data-access patterns commonly found in scientific applications.
HPCG strives for a better correlation to existing codes from the computational science domain and to be representative of their performance. HPCG is meant to help drive the computer system design and implementation in directions that will better impact future performance improvement.}, keywords = {Additive Schwarz, HPC Benchmarking, Multigrid smoothing, Preconditioned Conjugate Gradient, Validation and Verification}, doi = {10.1177/1094342015593158}, author = {Jack Dongarra and Michael A. Heroux and Piotr Luszczek} } @article {829, title = {HPC Programming on Intel Many-Integrated-Core Hardware with MAGMA Port to Xeon Phi}, journal = {Scientific Programming}, volume = {23}, year = {2015}, month = {2015-01}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms for multicore systems with Intel Xeon Phi coprocessors. In particular, we consider algorithms for solving linear systems. Further, we give an overview of the MAGMA MIC library, an open source, high performance library that incorporates the developments presented and, in general, provides the DLA functionality of the popular LAPACK library to heterogeneous architectures of multicore CPUs with coprocessors. The LAPACK-compliance simplifies the use of the MAGMA MIC library in applications, while providing them with portably performant DLA. High performance is obtained through use of the high-performance BLAS, hardware-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities. Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components. Our methodology and programming techniques are incorporated into the MAGMA MIC API, which abstracts the application developer from the specifics of the Xeon Phi architecture and is therefore applicable to algorithms beyond the scope of DLA.}, keywords = {communication and computation overlap, dynamic runtime scheduling using dataflow dependences, hardware accelerators and coprocessors, Intel Xeon Phi processor, Many Integrated Cores, numerical linear algebra}, issn = {1058-9244}, doi = {10.3233/SPR-140404}, author = {Azzam Haidar and Jack Dongarra and Khairul Kabir and Mark Gates and Piotr Luszczek and Stanimire Tomov and Yulu Jia} } @techreport {920, title = {HPCG Benchmark: A New Metric for Ranking High Performance Computing Systems}, journal = {University of Tennessee Computer Science Technical Report}, number = {ut-eecs-15-736}, year = {2015}, month = {2015-01}, publisher = {University of Tennessee}, abstract = {We describe a new high performance conjugate gradient (HPCG) benchmark. HPCG is composed of computations and data access patterns commonly found in scientific applications. HPCG strives for a better correlation to existing codes from the computational science domain and to be representative of their performance. HPCG is meant to help drive the computer system design and implementation in directions that will better impact future performance improvement.}, keywords = {Additive Schwarz, HPC Benchmarking, Multigrid smoothing, Preconditioned Conjugate Gradient, Validation and Verification}, url = {http://www.eecs.utk.edu/resources/library/file/1047/ut-eecs-15-736.pdf}, author = {Jack Dongarra and Michael A.
Heroux and Piotr Luszczek} } @conference {888, title = {MAGMA Embedded: Towards a Dense Linear Algebra Library for Energy Efficient Extreme Computing}, booktitle = {2015 IEEE High Performance Extreme Computing Conference (HPEC {\textquoteright}15), Best Paper Award}, year = {2015}, month = {2015-09}, publisher = {IEEE}, organization = {IEEE}, address = {Waltham, MA}, abstract = {Embedded computing, not only in large systems like drones and hybrid vehicles, but also in small portable devices like smart phones and watches, gets more extreme to meet ever-increasing demands for extended and improved functionalities. This, combined with the typical constraints on low power consumption and small size, makes the design of numerical libraries for embedded systems challenging. In this paper, we present the design and implementation of embedded-system-aware algorithms that target these challenges in the area of dense linear algebra. We consider the fundamental problems of solving linear systems of equations and least squares problems, using the LU, QR, and Cholesky factorizations, and illustrate our results, both in terms of performance and energy efficiency, on the Jetson TK1 development kit. We developed performance optimizations for both small and large problems. In contrast to the corresponding LAPACK algorithms, the new designs target the use of many-cores, readily available now even in mobile devices like the Jetson TK1, e.g., featuring 192 CUDA cores. The implementations presented will form the core of a MAGMA Embedded library, to be released as part of the MAGMA libraries.}, author = {Azzam Haidar and Stanimire Tomov and Piotr Luszczek and Jack Dongarra} } @article {1347, title = {MAGMA MIC: Optimizing Linear Algebra for Intel Xeon Phi}, year = {2015}, month = {2015-06}, publisher = {ISC High Performance (ISC15), Intel Booth Presentation}, address = {Frankfurt, Germany}, author = {Hartwig Anzt and Jack Dongarra and Mark Gates and Azzam Haidar and Khairul Kabir and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki} } @conference {916, title = {Optimization for Performance and Energy for Batched Matrix Computations on GPUs}, booktitle = {8th Workshop on General Purpose Processing Using GPUs (GPGPU 8)}, year = {2015}, month = {2015-02}, publisher = {ACM}, organization = {ACM}, address = {San Francisco, CA}, abstract = {As modern hardware keeps evolving, an increasingly effective approach to develop energy-efficient and high-performance solvers is to design them to work on many small-size independent problems. Many applications already need this functionality, especially for GPUs, which are known to be currently about four to five times more energy efficient than multicore CPUs. We describe the development of the main one-sided factorizations that work for a set of small dense matrices in parallel, and we illustrate our techniques on the LU and Cholesky factorizations. We refer to this mode of operation as a batched factorization. Our approach is based on representing the algorithms as a sequence of batched BLAS routines for GPU-only execution. The goal of avoiding multicore CPU use, e.g., as in the hybrid CPU-GPU algorithms, is to exclusively benefit from the GPU{\textquoteright}s significantly higher energy efficiency, as well as from the removal of the costly CPU-to-GPU communications. Furthermore, we do not use a single symmetric multiprocessor (on the GPU) to factorize a single problem at a time.
We illustrate how our performance analysis and the use of profiling and tracing tools guided the development and optimization of batched factorizations to achieve up to 2-fold speedup and 3-fold better energy efficiency compared to our highly optimized batched CPU implementations based on the MKL library (when using two sockets of Intel Sandy Bridge CPUs). Compared to a batched LU factorization featured in the CUBLAS library for GPUs, we achieved up to a 2.5{\texttimes} speedup on the K40 GPU.}, keywords = {batched factorization, hardware accelerators, numerical linear algebra, numerical software libraries, one-sided factorization algorithms}, doi = {10.1145/2716282.2716288}, author = {Azzam Haidar and Tingxing Dong and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @article {936, title = {Parallel Programming Models for Dense Linear Algebra on Heterogeneous Systems}, journal = {Supercomputing Frontiers and Innovations}, volume = {2}, number = {4}, year = {2015}, month = {2015-10}, abstract = {We present a review of the current best practices in parallel programming models for dense linear algebra (DLA) on heterogeneous architectures. We consider multicore CPUs, stand-alone manycore coprocessors, GPUs, and combinations of these. Of interest is the evolution of the programming models for DLA libraries {\textendash} in particular, the evolution from the popular LAPACK and ScaLAPACK libraries to their modernized counterparts PLASMA (for multicore CPUs) and MAGMA (for heterogeneous architectures), as well as other programming models and libraries. Besides providing insights into the programming techniques of the libraries considered, we outline our view of the current strengths and weaknesses of their programming models {\textendash} especially in regard to hardware trends and ease of programming high-performance numerical software that current applications need {\textendash} in order to motivate work and future directions for the next generation of parallel programming models for high-performance linear algebra libraries on heterogeneous systems.}, keywords = {dense linear algebra, gpu, HPC, Multicore, plasma, Programming models, runtime}, doi = {10.14529/jsfi1504}, author = {Maksims Abalenkovs and Ahmad Abdelfattah and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki and Asim YarKhan} } @conference {884, title = {Performance of Random Sampling for Computing Low-rank Approximations of a Dense Matrix on GPUs}, booktitle = {The International Conference for High Performance Computing, Networking, Storage and Analysis (SC15)}, year = {2015}, month = {2015-11}, publisher = {ACM}, organization = {ACM}, address = {Austin, TX}, author = {Theo Mary and Ichitaro Yamazaki and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @conference {885, title = {Randomized Algorithms to Update Partial Singular Value Decomposition on a Hybrid CPU/GPU Cluster}, booktitle = {The International Conference for High Performance Computing, Networking, Storage and Analysis (SC15)}, year = {2015}, month = {2015-11}, publisher = {ACM}, organization = {ACM}, address = {Austin, TX}, author = {Ichitaro Yamazaki and Jakub Kurzak and Piotr Luszczek and Jack Dongarra} } @article {699, title = {A Survey of Recent Developments in Parallel Implementations of Gaussian Elimination}, journal = {Concurrency and Computation: Practice and Experience}, volume = {27}, year = {2015}, month = {2015-04}, pages = {1292-1309}, abstract = {Gaussian
elimination is a canonical linear algebra procedure for solving linear systems of equations. In the last few years, the algorithm has received a lot of attention in an attempt to improve its parallel performance. This article surveys recent developments in parallel implementations of Gaussian elimination for shared-memory architectures. Five different flavors are investigated. Three of them are based on different strategies for pivoting: partial pivoting, incremental pivoting, and tournament pivoting. The fourth one replaces pivoting with the Partial Random Butterfly Transformation, and finally, an implementation without pivoting is used as a performance baseline. The technique of iterative refinement is applied to recover numerical accuracy when necessary. All parallel implementations are produced using dynamic, superscalar, runtime scheduling and tile matrix layout. Results on two multisocket multicore systems are presented. Performance and numerical accuracy are analyzed.}, keywords = {Gaussian elimination, lu factorization, Multicore, parallel, plasma, shared memory}, doi = {10.1002/cpe.3306}, author = {Simplice Donfack and Jack Dongarra and Mathieu Faverge and Mark Gates and Jakub Kurzak and Piotr Luszczek and Ichitaro Yamazaki} } @conference {844, title = {Towards Batched Linear Solvers on Accelerated Hardware Platforms}, booktitle = {8th Workshop on General Purpose Processing Using GPUs (GPGPU 8) co-located with PPOPP 2015}, year = {2015}, month = {2015-02}, publisher = {ACM}, organization = {ACM}, address = {San Francisco, CA}, abstract = {As hardware evolves, an increasingly effective approach to develop energy-efficient, high-performance solvers is to design them to work on many small and independent problems. Indeed, many applications already need this functionality, especially for GPUs, which are known to be currently about four to five times more energy efficient than multicore CPUs for every floating-point operation. In this paper, we describe the development of the main one-sided factorizations {\textendash} LU, QR, and Cholesky {\textendash} that are needed for a set of small dense matrices to work in parallel. We refer to such algorithms as batched factorizations. Our approach is based on representing the algorithms as a sequence of batched BLAS routines for GPU-contained execution. Note that this is similar in functionality to the LAPACK and the hybrid MAGMA algorithms for large-matrix factorizations. But it is different from a straightforward approach, whereby each of the GPU{\textquoteright}s symmetric multiprocessors factorizes a single problem at a time. We illustrate how our performance analysis together with the profiling and tracing tools guided the development of batched factorizations to achieve up to 2-fold speedup and 3-fold better energy efficiency compared to our highly optimized batched CPU implementations based on the MKL library on a two-socket Intel Sandy Bridge server.
Compared to a batched LU factorization featured in NVIDIA{\textquoteright}s CUBLAS library for GPUs, we achieve up to a 2.5-fold speedup on the K40 GPU.}, keywords = {batched factorization, hardware accelerators, numerical linear algebra, numerical software libraries, one-sided factorization algorithms}, author = {Azzam Haidar and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @inproceedings {959, title = {Weighted Dynamic Scheduling with Many Parallelism Grains for Offloading of Numerical Workloads to Multiple Varied Accelerators}, journal = {Proceedings of the 6th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems (ScalA{\textquoteright}15)}, volume = {No. 5}, year = {2015}, month = {2015-11}, publisher = {ACM}, address = {Austin, TX}, abstract = {A wide variety of heterogeneous compute resources are available to modern computers, including multiple sockets containing multicore CPUs, one or more GPUs of varying power, and coprocessors such as the Intel Xeon Phi. The challenge faced by domain scientists is how to efficiently and productively use these varied resources. For example, in order to use GPUs effectively, the workload must have a greater degree of parallelism than a workload designed for a multicore CPU. The domain scientist would have to design and schedule an application in multiple degrees of parallelism and task grain sizes in order to obtain efficient performance from the resources. We propose a productive programming model starting from serial code, which achieves parallelism and scalability by using a task-superscalar runtime environment to adapt the computation to the available resources. The adaptation is done at multiple points, including multi-level data partitioning, adaptive task grain sizes, and dynamic task scheduling. The effectiveness of this approach for utilizing multi-way heterogeneous hardware resources is demonstrated by implementing dense linear algebra applications.}, keywords = {dataflow scheduling, hardware accelerators, multi-grain parallelism}, author = {Azzam Haidar and Yulu Jia and Piotr Luszczek and Stanimire Tomov and Asim YarKhan and Jack Dongarra} } @inbook {780, title = {Accelerating Numerical Dense Linear Algebra Calculations with GPUs}, booktitle = {Numerical Computations with GPUs}, year = {2014}, pages = {3-28}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, chapter = {1}, isbn = {978-3-319-06547-2}, doi = {10.1007/978-3-319-06548-9_1}, author = {Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki} } @article {698, title = {Achieving numerical accuracy and high performance using recursive tile LU factorization with partial pivoting}, journal = {Concurrency and Computation: Practice and Experience}, volume = {26}, year = {2014}, month = {2014-05}, pages = {1408-1431}, chapter = {1408}, abstract = {The LU factorization is an important numerical algorithm for solving systems of linear equations in science and engineering and is a characteristic of many dense linear algebra computations. For example, it has become the de facto numerical algorithm implemented within the LINPACK benchmark to rank the most powerful supercomputers in the world, collected by the TOP500 website. Multicore processors continue to present challenges to the development of fast and robust numerical software due to the increasing levels of hardware parallelism and widening gap between core and memory speeds.
In this context, the difficulty in developing new algorithms for the scientific community resides in the combination of two goals: achieving high performance while maintaining the accuracy of the numerical algorithm. This paper proposes a new approach for computing the LU factorization in parallel on multicore architectures, which not only improves the overall performance but also sustains the numerical quality of the standard LU factorization algorithm with partial pivoting. While the update of the trailing submatrix is computationally intensive and highly parallel, the inherently problematic portion of the LU factorization is the panel factorization due to its memory-bound characteristic as well as the atomicity of selecting the appropriate pivots. Our approach uses a parallel fine-grained recursive formulation of the panel factorization step and implements the update of the trailing submatrix with the tile algorithm. Based on conflict-free partitioning of the data and lockless synchronization mechanisms, our implementation lets the overall computation flow naturally without contention. The dynamic runtime system called QUARK is then able to schedule tasks with heterogeneous granularities and to transparently introduce algorithmic lookahead. The performance results of our implementation are competitive compared to the currently available software packages and libraries. For example, it is up to 40\% faster when compared to the equivalent Intel MKL routine and up to threefold faster than LAPACK with multithreaded Intel MKL BLAS.}, keywords = {factorization, parallel linear algebra, plasma, recursion, shared memory synchronization, threaded parallelism}, doi = {10.1002/cpe.3110}, url = {http://doi.wiley.com/10.1002/cpe.3110}, author = {Jack Dongarra and Mathieu Faverge and Hatem Ltaief and Piotr Luszczek} } @conference {836, title = {clMAGMA: High Performance Dense Linear Algebra with OpenCL}, booktitle = {International Workshop on OpenCL}, year = {2014}, month = {2014-05}, address = {Bristol University, England}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms in OpenCL. In particular, these are linear system solvers and eigenvalue problem solvers. Further, we give an overview of the clMAGMA library, an open source, high performance OpenCL library that incorporates the developments presented, and in general provides to heterogeneous architectures the DLA functionality of the popular LAPACK library. The LAPACK-compliance and use of OpenCL simplify the use of clMAGMA in applications, while providing them with portably performant DLA. High performance is obtained through use of the high-performance OpenCL BLAS, hardware and OpenCL-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities.
Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components.}, author = {Chongxiao Cao and Jack Dongarra and Peng Du and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @conference {710, title = {Design and Implementation of a Large Scale Tree-Based QR Decomposition Using a 3D Virtual Systolic Array and a Lightweight Runtime}, booktitle = {Workshop on Large-Scale Parallel Processing, IPDPS 2014}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {A systolic array provides an alternative computing paradigm to the von Neumann architecture. Though its hardware implementation has failed as a paradigm to design integrated circuits in the past, we are now discovering that the systolic array as a software virtualization layer can lead to an extremely scalable execution paradigm. To demonstrate this scalability, in this paper, we design and implement a 3D virtual systolic array to compute a tile QR decomposition of a tall-and-skinny dense matrix. Our implementation is based on a state-of-the-art algorithm that factorizes a panel based on a tree reduction. Using a runtime developed as a part of the Parallel Ultra Light Systolic Array Runtime (PULSAR) project, we demonstrate on a Cray-XT5 machine how our virtual systolic array can be mapped to a large-scale machine and obtain excellent parallel performance. This is an important contribution since such a QR decomposition is used, for example, to compute a least squares solution of an overdetermined system, which arises in many scientific and engineering problems.}, keywords = {dataflow, message-passing, multithreading, QR decomposition, runtime, systolic array}, author = {Ichitaro Yamazaki and Jakub Kurzak and Piotr Luszczek and Jack Dongarra} } @conference {765, title = {Heterogeneous Acceleration for Linear Algebra in Multi-Coprocessor Environments}, booktitle = {VECPAR 2014}, year = {2014}, month = {2014-06}, address = {Eugene, OR}, abstract = {We present an efficient and scalable programming model for the development of linear algebra in heterogeneous multi-coprocessor environments. The model incorporates some of the current best design and implementation practices for the heterogeneous acceleration of dense linear algebra (DLA). Examples are given for the basic linear system solvers {\textendash} the LU, QR, and Cholesky factorizations. To generate the extreme level of parallelism needed for the efficient use of coprocessors, algorithms of interest are redesigned and then split into well-chosen computational tasks. The tasks{\textquoteright} execution is scheduled over the computational components of a hybrid system of multi-core CPUs and coprocessors using a light-weight runtime system. The use of light-weight runtime systems keeps scheduling overhead low, while enabling the expression of parallelism through otherwise sequential code.
This simplifies the development efforts and allows the exploration of the unique strengths of the various hardware components.}, keywords = {Computer science, factorization, Heterogeneous systems, Intel Xeon Phi, linear algebra}, author = {Azzam Haidar and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @article {760, title = {Looking Back at Dense Linear Algebra Software}, journal = {Journal of Parallel and Distributed Computing}, volume = {74}, year = {2014}, month = {2014-07}, pages = {2548{\textendash}2560}, chapter = {2548}, abstract = {Over the years, computational physics and chemistry served as an ongoing source of problems that demanded ever-increasing performance from hardware as well as the software that ran on top of it. Most of these problems could be translated into solutions for systems of linear equations: the very topic of numerical linear algebra. Seemingly then, a set of efficient linear solvers could be solving important scientific problems for years to come. We argue that dramatic changes in hardware designs, precipitated by the shifting nature of the marketplace of computer hardware, had a continuous effect on the software for numerical linear algebra. The extraction of high percentages of peak performance continues to require adaptation of software. If the past history of this adaptive nature of linear algebra software is any guide, then the future theme will feature changes as well {\textendash} changes aimed at harnessing the incredible advances of the evolving hardware infrastructure.}, keywords = {decompositional approach, dense linear algebra, parallel algorithms}, doi = {10.1016/j.jpdc.2013.10.005}, author = {Piotr Luszczek and Jakub Kurzak and Jack Dongarra} } @conference {859, title = {LU Factorization of Small Matrices: Accelerating Batched DGETRF on the GPU}, booktitle = {16th IEEE International Conference on High Performance Computing and Communications (HPCC)}, year = {2014}, month = {2014-08}, publisher = {IEEE}, organization = {IEEE}, address = {Paris, France}, abstract = {Gaussian Elimination is commonly used to solve dense linear systems in scientific models. In a large number of applications, a need arises to solve many small-size problems instead of a few large linear systems. The size of each of these small linear systems depends, for example, on the number of ordinary differential equations (ODEs) used in the model, and can be on the order of hundreds of unknowns. To efficiently exploit the computing power of modern accelerator hardware, these linear systems are processed in batches. To improve the numerical stability of Gaussian Elimination, at least partial pivoting is required, most often accomplished with row pivoting. However, row pivoting can result in a severe performance penalty on GPUs because it brings in thread divergence and non-coalesced memory accesses. The state-of-the-art libraries for linear algebra that target GPUs, such as MAGMA, focus on large matrix sizes. They change the data layout by transposing the matrix to avoid these divergence and non-coalescing penalties. However, the data movement associated with transposition is very expensive for small matrices. In this paper, we propose a batched LU factorization for GPUs by using a multi-level blocked right-looking algorithm that preserves the data layout but minimizes the penalty of partial pivoting.
Our batched LU achieves up to a 2.5-fold speedup when compared to the alternative CUBLAS solutions on a K40c GPU and a 3.6-fold speedup over MKL on a node of the Titan supercomputer at ORNL in a nuclear reaction network simulation.}, author = {Tingxing Dong and Azzam Haidar and Piotr Luszczek and James Harris and Stanimire Tomov and Jack Dongarra} } @article {831, title = {Model-Driven One-Sided Factorizations on Multicore, Accelerated Systems}, journal = {Supercomputing Frontiers and Innovations}, volume = {1}, year = {2014}, abstract = {Hardware heterogeneity of the HPC platforms is no longer considered unusual but has instead become the most viable way forward towards Exascale. In fact, the multitude of heterogeneous resources available to modern computers are designed for different workloads, and their efficient use is closely aligned with the specialized role envisaged by their design. Commonly, in order to efficiently use such GPU resources, the workload in question must have a much greater degree of parallelism than workloads often associated with multicore processors (CPUs). Available GPU variants differ in their internal architecture and, as a result, are capable of handling workloads of varying degrees of complexity and a range of computational patterns. This vast array of applicable workloads will likely lead to an ever-accelerating mixing of multicore CPUs and GPUs in multi-user environments with the ultimate goal of offering adequate computing facilities for a wide range of scientific and technical workloads. In the following paper, we present a research prototype that uses a lightweight runtime environment to manage the resource-specific workloads, and to control the dataflow and parallel execution in hybrid systems. Our lightweight runtime environment uses task superscalar concepts to enable the developer to write serial code while providing parallel execution. This concept is reminiscent of dataflow and systolic architectures in its conceptualization of a workload as a set of side-effect-free tasks that pass data items whenever the associated work assignment has been completed. Additionally, our task abstractions and their parametrization enable uniformity in the algorithmic development across all the heterogeneous resources without sacrificing precious compute cycles. We include performance results for dense linear algebra functions, which demonstrate the practicality and effectiveness of our approach that is aptly capable of full utilization of a wide range of accelerator hardware.}, keywords = {dense linear algebra, hardware accelerators, task superscalar scheduling}, doi = {http://dx.doi.org/10.14529/jsfi1401}, author = {Jack Dongarra and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Asim YarKhan} } @conference {839, title = {New Algorithm for Computing Eigenvectors of the Symmetric Eigenvalue Problem}, booktitle = {Workshop on Parallel and Distributed Scientific and Engineering Computing, IPDPS 2014 (Best Paper)}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {We describe a design and implementation of a multi-stage algorithm for computing eigenvectors of a dense symmetric matrix. We show that reformulating the existing algorithms is beneficial in terms of performance even if that doubles the computational complexity. Through detailed analysis, we show that the effect of the increase in the asymptotic operation count may be compensated by a much improved performance rate.
Our performance results indicate that our approach achieves very good speedup and scalability even when directly compared with the existing state-of-the-art software.}, doi = {10.1109/IPDPSW.2014.130}, author = {Azzam Haidar and Piotr Luszczek and Jack Dongarra} } @conference {833, title = {Optimizing Krylov Subspace Solvers on Graphics Processing Units}, booktitle = {Fourth International Workshop on Accelerators and Hybrid Exascale Systems (AsHES), IPDPS 2014}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {Krylov subspace solvers are often the method of choice when solving sparse linear systems iteratively. At the same time, hardware accelerators such as graphics processing units (GPUs) continue to offer significant floating point performance gains for matrix and vector computations through easy-to-use libraries of computational kernels. However, as these libraries are usually composed of a well-optimized but limited set of linear algebra operations, applications that use them often fail to leverage the full potential of the accelerator. In this paper we target the acceleration of the BiCGSTAB solver for GPUs, showing that significant improvement can be achieved by reformulating the method and developing application-specific kernels instead of using the generic CUBLAS library provided by NVIDIA. We propose an implementation that benefits from a significantly reduced number of kernel launches and GPU-host communication events, by means of increased data locality and a simultaneous reduction of multiple scalar products. Using experimental data, we show that, depending on the dominance of the untouched sparse matrix-vector products, significant performance improvements can be achieved compared to a reference implementation based on the CUBLAS library. We feel that such optimizations are crucial for the subsequent development of high-level sparse linear algebra libraries.}, author = {Stanimire Tomov and Piotr Luszczek and Ichitaro Yamazaki and Jack Dongarra and Hartwig Anzt and William Sawyer} } @conference {828, title = {Performance and Portability with OpenCL for Throughput-Oriented HPC Workloads Across Accelerators, Coprocessors, and Multicore Processors}, booktitle = {5th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems (ScalA {\textquoteright}14)}, year = {2014}, month = {2014-11}, publisher = {IEEE}, organization = {IEEE}, address = {New Orleans, LA}, abstract = {Ever since accelerators and coprocessors became the mainstream hardware for throughput-oriented HPC workloads, various programming techniques have been proposed to increase productivity in terms of both performance and ease of use. We evaluate these aspects of OpenCL on a number of hardware platforms for an important subset of dense linear algebra operations that are relevant to a wide range of scientific applications. Our findings indicate that OpenCL portability has improved since our previous publication, and many new and surprising usage scenarios are possible that rival those available after decades of software development on the CPUs.
The combined performance-portability metric, even though not promised by the OpenCL standard, reflects the need for tuning performance-critical operations during the porting process, and we show how a large portion of the available efficiency is lost if the tuning is not done correctly.}, doi = {10.1109/ScalA.2014.8}, author = {Azzam Haidar and Chongxiao Cao and Ichitaro Yamazaki and Jack Dongarra and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @techreport {827, title = {PULSAR Users{\textquoteright} Guide, Parallel Ultra-Light Systolic Array Runtime}, journal = {University of Tennessee EECS Technical Report}, number = {UT-EECS-14-733}, year = {2014}, month = {2014-11}, publisher = {University of Tennessee}, abstract = {PULSAR version 2.0, released in November 2014, is a complete programming platform for large-scale distributed memory systems with multicore processors and hardware accelerators. PULSAR provides a simple abstraction layer over multithreading, message passing, and multi-GPU, multi-stream programming. PULSAR offers a general-purpose programming model, suitable for a wide range of scientific and engineering applications. PULSAR was inspired by systolic arrays, popularized by Hsiang-Tsung Kung and Charles E. Leiserson.}, author = {Jack Dongarra and Jakub Kurzak and Piotr Luszczek and Ichitaro Yamazaki} } @conference {809, title = {Unified Development for Mixed Multi-GPU and Multi-Coprocessor Environments using a Lightweight Runtime Environment}, booktitle = {IPDPS 2014}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {Many of the heterogeneous resources available to modern computers are designed for different workloads. In order to efficiently use GPU resources, the workload must have a greater degree of parallelism than a workload designed for multicore CPUs. And conceptually, the Intel Xeon Phi coprocessors are capable of handling workloads somewhere in between the two. This multitude of applicable workloads will likely lead to mixing multicore CPUs, GPUs, and Intel coprocessors in multi-user environments that must offer adequate computing facilities for a wide range of workloads. In this work, we are using a lightweight runtime environment to manage the resource-specific workload, and to control the dataflow and parallel execution in two-way hybrid systems. The lightweight runtime environment uses task superscalar concepts to enable the developer to write serial code while providing parallel execution. In addition, our task abstractions enable unified algorithmic development across all the heterogeneous resources. We provide performance results for dense linear algebra applications, demonstrating the effectiveness of our approach and full utilization of a wide variety of accelerator hardware.}, keywords = {algorithms, Computer science, CUDA, Heterogeneous systems, Intel Xeon Phi, linear algebra, nVidia, Tesla K20, Tesla M2090}, author = {Azzam Haidar and Chongxiao Cao and Jack Dongarra and Piotr Luszczek and Stanimire Tomov} } @article {icl:702, title = {BlackjackBench: Portable Hardware Characterization with Automated Results Analysis}, journal = {The Computer Journal}, year = {2013}, month = {2013-03}, abstract = {DARPA{\textquoteright}s AACE project aimed to develop Architecture Aware Compiler Environments. Such a compiler automatically characterizes the targeted hardware and optimizes the application codes accordingly.
We present the BlackjackBench suite, a collection of portable micro-benchmarks that automate system characterization, plus statistical analysis techniques for interpreting the results. The BlackjackBench benchmarks discover the effective sizes and speeds of the hardware environment rather than the often unattainable peak values. We aim at hardware characteristics that can be observed by running executables generated by existing compilers from standard C codes. We characterize the memory hierarchy, including cache sharing and non-uniform memory access characteristics of the system, properties of the processing cores affecting the instruction execution speed, and the length of the operating system scheduler time slot. We show how these features of modern multicores can be discovered programmatically. We also show how the features could potentially interfere with each other, resulting in incorrect interpretation of the results, and how established classification and statistical analysis techniques can reduce experimental noise and aid automatic interpretation of results. We show how effective hardware metrics from our probes allow guided tuning of computational kernels that outperform an autotuning library further tuned by the hardware vendor.}, keywords = {hardware characterization, micro-benchmarks, statistical analysis}, doi = {10.1093/comjnl/bxt057}, author = {Anthony Danalis and Piotr Luszczek and Gabriel Marin and Jeffrey Vetter and Jack Dongarra} } @techreport {681, title = {clMAGMA: High Performance Dense Linear Algebra with OpenCL}, journal = {University of Tennessee Technical Report (Lawn 275)}, number = {UT-CS-13-706}, year = {2013}, month = {2013-03}, publisher = {University of Tennessee}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms in OpenCL. In particular, these are linear system solvers and eigenvalue problem solvers. Further, we give an overview of the clMAGMA library, an open source, high performance OpenCL library that incorporates the developments presented, and in general provides to heterogeneous architectures the DLA functionality of the popular LAPACK library. The LAPACK-compliance and use of OpenCL simplify the use of clMAGMA in applications, while providing them with portably performant DLA. High performance is obtained through use of the high-performance OpenCL BLAS, hardware and OpenCL-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities. Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components.}, author = {Chongxiao Cao and Jack Dongarra and Peng Du and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @inproceedings {716, title = {CPU-GPU Hybrid Bidiagonal Reduction With Soft Error Resilience}, journal = {ScalA {\textquoteright}13 Proceedings of the Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems}, year = {2013}, month = {2013-11}, address = {Montpellier, France}, abstract = {Soft errors pose a real challenge to applications running on modern hardware as the feature size becomes smaller and the integration density increases for both the modern processors and the memory chips.
Soft errors manifest themselves as bit-flips that alter the user{\textquoteright}s data, and numerical software is a category of software that is sensitive to such data changes. In this paper, we present a design of a bidiagonal reduction algorithm that is resilient to soft errors, and we also describe its implementation on hybrid CPU-GPU architectures. Our fault-tolerant algorithm employs Algorithm Based Fault Tolerance, combined with reverse computation, to detect, locate, and correct soft errors. The tests were performed on a Sandy Bridge CPU coupled with an NVIDIA Kepler GPU. The included experiments show that our resilient bidiagonal reduction algorithm adds very little overhead compared to the non-resilient code. At a matrix size of 10110{\texttimes}10110, our algorithm only has a performance overhead of 1.085\% when one error occurs, and 0.354\% when no errors occur.}, author = {Yulu Jia and Piotr Luszczek and George Bosilca and Jack Dongarra} } @article {icl:698, title = {Dense Linear Algebra on Distributed Heterogeneous Hardware with a Symbolic DAG Approach}, journal = {Scalable Computing and Communications: Theory and Practice}, year = {2013}, month = {2013-03}, pages = {699-735}, publisher = {John Wiley \& Sons}, author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Thomas Herault and Piotr Luszczek and Jack Dongarra}, editor = {Samee Khan and Lin-Wang Wang and Albert Zomaya} } @article {759, title = {High Performance Bidiagonal Reduction using Tile Algorithms on Homogeneous Multicore Architectures}, journal = {ACM Transactions on Mathematical Software (TOMS)}, volume = {39}, number = {16}, year = {2013}, abstract = {This article presents a new high-performance bidiagonal reduction (BRD) for homogeneous multicore architectures. This article is an extension of the high-performance tridiagonal reduction implemented by the same authors [Luszczek et al., IPDPS 2011] to the BRD case. The BRD is the first step toward computing the singular value decomposition of a matrix, which is one of the most important algorithms in numerical linear algebra due to its broad impact in computational science. The high performance of the BRD described in this article comes from the combination of four important features: (1) tile algorithms with tile data layout, which provide an efficient data representation in main memory; (2) a two-stage reduction approach that allows casting most of the computation during the first stage (reduction to band form) into calls to Level 3 BLAS and reduces the memory traffic during the second stage (reduction from band to bidiagonal form) by using high-performance kernels optimized for cache reuse; (3) a data dependence translation layer that maps the general algorithm with column-major data layout into the tile data layout; and (4) a dynamic runtime system that efficiently schedules the newly implemented kernels across the processing units and ensures that the data dependencies are not violated. A detailed analysis is provided to understand the critical impact of the tile size on the total execution time, which also corresponds to the matrix bandwidth size after the reduction of the first stage. The performance results show a significant improvement over currently established alternatives.
The new high-performance BRD achieves up to a 30-fold speedup on a 16-core Intel Xeon machine with a 12000{\texttimes}12000 matrix size against state-of-the-art open source and commercial numerical software packages, namely LAPACK compiled with optimized and multithreaded BLAS from MKL, as well as Intel MKL version 10.2.}, keywords = {algorithms, bidiagonal reduction, bulge chasing, data translation layer, dynamic scheduling, high performance kernels, performance, tile algorithms, two-stage approach}, doi = {10.1145/2450153.2450154}, author = {Hatem Ltaeif and Piotr Luszczek and Jack Dongarra} } @inbook {754, title = {HPC Challenge: Design, History, and Implementation Highlights}, booktitle = {Contemporary High Performance Computing: From Petascale Toward Exascale}, year = {2013}, publisher = {Taylor and Francis}, organization = {Taylor and Francis}, chapter = {2}, address = {Boca Raton, FL}, keywords = {exascale, hpc challenge, hpcc}, isbn = {978-1-4665-6834-1}, author = {Jack Dongarra and Piotr Luszczek} } @techreport {682, title = {Implementing a systolic algorithm for QR factorization on multicore clusters with PaRSEC}, journal = {Lawn 277}, number = {UT-CS-13-709}, year = {2013}, month = {2013-05}, abstract = {This article introduces a new systolic algorithm for QR factorization, and its implementation on a supercomputing cluster of multicore nodes. The algorithm targets a virtual 3D array and requires only local communications. The implementation of the algorithm uses threads at the node level, and MPI for inter-node communications. The complexity of the implementation is addressed with the PaRSEC software, which takes as input a parametrized dependence graph derived from the algorithm and only requires the user to decide, at a high level, the allocation of tasks to nodes. We show that the new algorithm exhibits competitive performance with state-of-the-art QR routines on a supercomputer called Kraken, which shows that high-level programming environments, such as PaRSEC, provide a viable alternative to enhance the production of quality software on complex and hierarchical architectures.}, author = {Guillaume Aupy and Mathieu Faverge and Yves Robert and Jakub Kurzak and Piotr Luszczek and Jack Dongarra} } @techreport {704, title = {An Improved Parallel Singular Value Algorithm and Its Implementation for Multicore Hardware}, journal = {University of Tennessee Computer Science Technical Report (also LAWN 283)}, number = {ut-eecs-13-720}, year = {2013}, month = {2013-10}, publisher = {University of Tennessee}, keywords = {lapack, plasma, scalapack}, author = {Azzam Haidar and Piotr Luszczek and Jakub Kurzak and Jack Dongarra} } @conference {696, title = {An Improved Parallel Singular Value Algorithm and Its Implementation for Multicore Hardware}, booktitle = {Supercomputing 2013}, year = {2013}, month = {2013-11}, address = {Denver, CO}, author = {Azzam Haidar and Piotr Luszczek and Jakub Kurzak and Jack Dongarra} } @article {icl:693, title = {LU Factorization with Partial Pivoting for a Multicore System with Accelerators}, journal = {IEEE Transactions on Parallel and Distributed Systems}, volume = {24}, year = {2013}, month = {2013-08}, pages = {1613-1621}, chapter = {1613}, abstract = {LU factorization with partial pivoting is a canonical numerical procedure and the main component of the high performance LINPACK benchmark. This paper presents an implementation of the algorithm for a hybrid, shared memory, system with standard CPU cores and GPU accelerators.
The difficulty of implementing the algorithm for such a system lies in the disproportion between the computational power of the CPUs and that of the GPUs, and in the meager bandwidth of the communication link between their memory systems. An additional challenge comes from the memory-bound and synchronization-rich nature of the panel factorization component of the block LU algorithm, imposed by the use of partial pivoting. The challenges are tackled with the use of a data layout geared toward complex memory hierarchies, autotuning of GPU kernels, fine-grain parallelization of memory-bound CPU operations, and dynamic scheduling of tasks to different devices. Performance in excess of one TeraFLOPS is achieved using four AMD Magny Cours CPUs and four NVIDIA Fermi GPUs.}, keywords = {accelerator, Gaussian elimination, gpu, lu factorization, manycore, Multicore, partial pivoting, plasma}, doi = {http://doi.ieeecomputersociety.org/10.1109/TPDS.2012.242}, author = {Jakub Kurzak and Piotr Luszczek and Jack Dongarra} } @article {icl:704, title = {Multithreading in the PLASMA Library}, journal = {Multi and Many-Core Processing: Architecture, Programming, Algorithms, \& Applications}, year = {2013}, month = {2013-00}, publisher = {Taylor \& Francis}, keywords = {plasma}, author = {Jakub Kurzak and Piotr Luszczek and Asim YarKhan and Mathieu Faverge and Julien Langou and Henricus Bouwmeester and Jack Dongarra}, editor = {Mohamed Ahmed and Reda Ammar and Sanguthevar Rajasekaran} } @conference {705, title = {Parallel Reduction to Hessenberg Form with Algorithm-Based Fault Tolerance}, booktitle = {International Conference for High Performance Computing, Networking, Storage and Analysis, IEEE-SC 2013}, year = {2013}, month = {2013-11}, address = {Denver, CO}, abstract = {This paper studies the resilience of a two-sided factorization and presents a generic algorithm-based approach capable of making two-sided factorizations resilient. We establish the theoretical proof of the correctness and the numerical stability of the approach in the context of a Hessenberg Reduction (HR) and present the scalability and performance results of a practical implementation. Our method is a hybrid algorithm combining an Algorithm Based Fault Tolerance (ABFT) technique with diskless checkpointing to fully protect the data. We protect the trailing and the initial part of the matrix with checksums, and protect finished panels in the panel scope with diskless checkpoints. Compared with the original HR (the ScaLAPACK PDGEHRD routine), our fault-tolerant algorithm introduces very little overhead, and maintains the same level of scalability. We prove that the overhead shows a decreasing trend as the size of the matrix or the size of the process grid increases.}, author = {Yulu Jia and George Bosilca and Piotr Luszczek and Jack Dongarra} } @conference {753, title = {Portable HPC Programming on Intel Many-Integrated-Core Hardware with MAGMA Port to Xeon Phi}, booktitle = {PPAM 2013}, year = {2013}, month = {2013-09}, address = {Warsaw, Poland}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms for multicore with Intel Xeon Phi Coprocessors. In particular, we consider algorithms for solving linear systems.
Further, we give an overview of the MAGMA MIC library, an open source, high performance library that incorporates the developments presented, and in general provides to heterogeneous architectures of multicore with coprocessors the DLA functionality of the popular LAPACK library. The LAPACK-compliance simplifies the use of the MAGMA MIC library in applications, while providing them with portably performant DLA. High performance is obtained through use of the high-performance BLAS, hardware-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities. Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components. Our methodology and programming techniques are incorporated into the MAGMA MIC API, which abstracts the application developer from the specifics of the Xeon Phi architecture and is therefore applicable to algorithms beyond the scope of DLA.}, keywords = {magma, mic, xeon phi}, author = {Jack Dongarra and Mark Gates and Azzam Haidar and Yulu Jia and Khairul Kabir and Piotr Luszczek and Stanimire Tomov} } @inbook {695, title = {Scalable Dense Linear Algebra on Heterogeneous Hardware}, booktitle = {HPC: Transition Towards Exascale Processing, in the series Advances in Parallel Computing}, year = {2013}, abstract = {The design of systems exceeding 1 Pflop/s and the push toward 1 Eflop/s forced a dramatic shift in hardware design. Various physical and engineering constraints resulted in the introduction of massive parallelism and functional hybridization with the use of accelerator units. This paradigm change brings about a serious challenge for application developers, as the management of multicore proliferation and heterogeneity rests on software. It is reasonable to expect that this situation will not change in the foreseeable future. This chapter presents a methodology of dealing with this issue in three common scenarios. In the context of shared-memory multicore installations, we show how high performance and scalability go hand in hand when the well-known linear algebra algorithms are recast in terms of Directed Acyclic Graphs (DAGs), which are then transparently scheduled at runtime inside the Parallel Linear Algebra Software for Multicore Architectures (PLASMA) project. Similarly, Matrix Algebra on GPU and Multicore Architectures (MAGMA) schedules DAG-driven computations on multicore processors and accelerators. Finally, Distributed PLASMA (DPLASMA) takes the approach to distributed-memory machines with the use of automatic dependence analysis and the Direct Acyclic Graph Engine (DAGuE) to deliver high performance at the scale of many thousands of cores.}, author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Thomas Herault and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @article {755, title = {Soft Error Resilient QR Factorization for Hybrid System with GPGPU}, journal = {Journal of Computational Science}, volume = {4}, year = {2013}, month = {2013-11}, pages = {457{\textendash}464}, abstract = {The general purpose graphics processing units (GPGPUs) are increasingly deployed for scientific computing due to their performance advantages over CPUs.
As a result, fault tolerance has become a more serious concern than it was during the period when GPGPUs were used exclusively for graphics applications. Using GPUs and CPUs together in a hybrid computing system increases flexibility and performance but also increases the possibility of the computations being affected by soft errors, for example, in the form of bit flips. In this work, we propose a soft error resilient algorithm for QR factorization on such hybrid systems. Our contributions include: (1) a checkpointing and recovery mechanism for the left-factor Q whose performance is scalable on hybrid systems; (2) optimized Givens rotation utilities on GPGPUs to efficiently reduce an upper Hessenberg matrix to an upper triangular form for the protection of the right factor R; and (3) a recovery algorithm based on QR update on GPGPUs. Experimental results show that our fault tolerant QR factorization can successfully detect and recover from soft errors in the entire matrix with little overhead on hybrid systems with GPGPUs.}, keywords = {gpgpu, gpu, magma}, doi = {http://dx.doi.org/10.1016/j.jocs.2013.01.004}, author = {Peng Du and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @techreport {685, title = {Transient Error Resilient Hessenberg Reduction on GPU-based Hybrid Architectures}, journal = {UT-CS-13-712}, year = {2013}, month = {2013-06}, publisher = {University of Tennessee Computer Science Technical Report}, abstract = {Graphics Processing Units (GPUs) are gaining widespread usage in the field of scientific computing owing to the performance boost GPUs bring to computation-intensive applications. The typical configuration is to integrate GPUs and CPUs in the same system, where the CPUs handle the control flow and part of the computation workload, and the GPUs serve as accelerators that carry out the bulk of the data-parallel compute workload. In this paper we design and implement a soft error resilient Hessenberg reduction algorithm on GPU-based hybrid platforms. Our design employs an algorithm-based fault tolerance technique, diskless checkpointing, and reverse computation. We detect and correct soft errors on-line without delaying the detection and correction to the end of the factorization. By utilizing idle time of the CPUs and overlapping both host-side and GPU-side workloads, we minimize the observed overhead. Experimental results validate our design philosophy. Our algorithm introduces less than 2\% performance overhead compared to the non-fault tolerant hybrid Hessenberg reduction algorithm.}, author = {Yulu Jia and Piotr Luszczek and Jack Dongarra} } @conference {icl:692, title = {Virtual Systolic Array for QR Decomposition}, booktitle = {15th Workshop on Advances in Parallel and Distributed Computational Models, IEEE International Parallel \& Distributed Processing Symposium (IPDPS 2013)}, year = {2013}, month = {2013-05}, publisher = {IEEE}, organization = {IEEE}, address = {Boston, MA}, abstract = {Systolic arrays offer a very attractive, data-centric execution model as an alternative to the von Neumann architecture. Hardware implementations of systolic arrays turned out not to be viable solutions in the past. This article shows how the systolic design principles can be applied to a software solution to deliver an algorithm with unprecedented strong scaling capabilities. A systolic array for the QR decomposition is developed, and a virtualization layer is used for mapping the algorithm to a large distributed memory system.
Strong scaling properties, superior to existing solutions, are discovered.}, keywords = {dataflow programming, message passing, multi-core, QR decomposition, roofline model, systolic array}, doi = {10.1109/IPDPS.2013.119}, author = {Jakub Kurzak and Piotr Luszczek and Mark Gates and Ichitaro Yamazaki and Jack Dongarra} } @techreport {688, title = {On Algorithmic Variants of Parallel Gaussian Elimination: Comparison of Implementations in Terms of Performance and Numerical Properties}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-CS-13-715}, year = {2012}, month = {2013-07}, abstract = {Gaussian elimination is a canonical linear algebra procedure for solving linear systems of equations. In the last few years, the algorithm has received a lot of attention in an attempt to improve its parallel performance. This article surveys recent developments in parallel implementations of Gaussian elimination. Five different flavors are investigated. Three of them are based on different strategies for pivoting: partial pivoting, incremental pivoting, and tournament pivoting. The fourth one replaces pivoting with the Random Butterfly Transformation, and finally, an implementation without pivoting is used as a performance baseline. The technique of iterative refinement is applied to recover numerical accuracy when necessary. All parallel implementations are produced using dynamic, superscalar, runtime scheduling and tile matrix layout. Results on two multi-socket multicore systems are presented. Performance and numerical accuracy are analyzed.}, author = {Simplice Donfack and Jack Dongarra and Mathieu Faverge and Mark Gates and Jakub Kurzak and Piotr Luszczek and Ichitaro Yamazaki} } @inproceedings {icl:712, title = {Anatomy of a Globally Recursive Embedded LINPACK Benchmark}, journal = {2012 IEEE High Performance Extreme Computing Conference}, year = {2012}, month = {2012-09}, pages = {1-6}, address = {Waltham, MA}, abstract = {We present a complete bottom-up implementation of an embedded LINPACK benchmark on iPad 2. We use a novel formulation of LU factorization that is recursive and parallel at the global scope. We believe our new algorithm presents an alternative to existing linear algebra parallelization techniques, such as master-worker and DAG-based approaches. We show an assembly API that allows a much higher level of abstraction and provides rapid code development within the confines of a mobile device SDK.
We use performance modeling to help with the limitations of the device and the limited access to the device from a development environment not geared for HPC application tuning.}, isbn = {978-1-4673-1577-7}, doi = {10.1109/HPEC.2012.6408679}, author = {Piotr Luszczek and Jack Dongarra} } @article {icl:695, title = {A Comprehensive Study of Task Coalescing for Selecting Parallelism Granularity in a Two-Stage Bidiagonal Reduction}, journal = {IPDPS 2012}, year = {2012}, month = {2012-05}, address = {Shanghai, China}, author = {Azzam Haidar and Hatem Ltaeif and Piotr Luszczek and Jack Dongarra} } @article {icl:703, title = {Dense Linear Algebra on Accelerated Multicore Hardware}, journal = {High Performance Scientific Computing: Algorithms and Applications}, year = {2012}, month = {2012-00}, publisher = {Springer-Verlag}, address = {London, UK}, author = {Jack Dongarra and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov}, editor = {Michael Berry and others} } @inproceedings {icl:711, title = {Energy Footprint of Advanced Dense Numerical Linear Algebra using Tile Algorithms on Multicore Architecture}, journal = {The 2nd International Conference on Cloud and Green Computing (submitted)}, year = {2012}, month = {2012-11}, address = {Xiangtan, Hunan, China}, author = {Jack Dongarra and Hatem Ltaeif and Piotr Luszczek and Vincent M Weaver} } @article {icl:707, title = {Enhancing Parallelism of Tile Bidiagonal Transformation on Multicore Architectures using Tree Reduction}, journal = {Lecture Notes in Computer Science}, volume = {7203}, year = {2012}, month = {2012-09}, pages = {661-670}, author = {Hatem Ltaeif and Piotr Luszczek and Jack Dongarra} } @article {icl:725, title = {From CUDA to OpenCL: Towards a Performance-portable Solution for Multi-platform GPU Programming}, journal = {Parallel Computing}, volume = {38}, number = {8}, year = {2012}, month = {2012-08}, pages = {391-407}, author = {Peng Du and Rick Weber and Piotr Luszczek and Stanimire Tomov and Gregory D. Peterson and Jack Dongarra} } @article {icl:723, title = {GPU-Accelerated Asynchronous Error Correction for Mixed Precision Iterative Refinement}, journal = {EuroPar 2012 (also LAWN 260)}, year = {2012}, month = {2012-08}, address = {Rhodes Island, Greece}, author = {Hartwig Anzt and Piotr Luszczek and Jack Dongarra and Vincent Heuveline} } @article {icl:708, title = {High Performance Dense Linear System Solver with Resilience to Multiple Soft Errors}, journal = {ICCS 2012}, year = {2012}, month = {2012-06}, address = {Omaha, NE}, author = {Peng Du and Piotr Luszczek and Jack Dongarra} } @article {icl:706, title = {HPC Challenge: Design, History, and Implementation Highlights}, journal = {On the Road to Exascale Computing: Contemporary Architectures in High Performance Computing (to appear)}, year = {2012}, month = {2012-00}, publisher = {Chapman \& Hall/CRC Press}, author = {Jack Dongarra and Piotr Luszczek}, editor = {Jeffrey Vetter} } @article {icl:705, title = {Looking Back at Dense Linear Algebra Software}, journal = {Perspectives on Parallel and Distributed Processing: Looking Back and What{\textquoteright}s Ahead (to appear)}, year = {2012}, month = {2012-00}, author = {Piotr Luszczek and Jakub Kurzak and Jack Dongarra}, editor = {Viktor K.
Prasanna and Yves Robert and Per Stenstr{\"o}m} } @article {1354, title = {MAGMA MIC: Linear Algebra Library for Intel Xeon Phi Coprocessors}, year = {2012}, month = {2012-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC12)}, address = {Salt Lake City, UT}, author = {Jack Dongarra and Mark Gates and Yulu Jia and Khairul Kabir and Piotr Luszczek and Stanimire Tomov} } @inproceedings {icl:689, title = {Measuring Energy and Power with PAPI}, journal = {International Workshop on Power-Aware Systems and Architectures}, year = {2012}, month = {2012-09}, address = {Pittsburgh, PA}, abstract = {Energy and power consumption are becoming critical metrics in the design and usage of high performance systems. We have extended the Performance API (PAPI) analysis library to measure and report energy and power values. These values are reported using the existing PAPI API, allowing code previously instrumented for performance counters to also measure power and energy. Higher level tools that build on PAPI will automatically gain support for power and energy readings when used with the newest version of PAPI. We describe in detail the types of energy and power readings available through PAPI. We support external power meters, as well as values provided internally by recent CPUs and GPUs. Measurements are provided directly to the instrumented process, allowing immediate code analysis in real time. We provide examples showing results that can be obtained with our infrastructure.}, keywords = {papi}, doi = {10.1109/ICPPW.2012.39}, author = {Vincent M Weaver and Matt Johnson and Kiran Kasichayanula and James Ralph and Piotr Luszczek and Dan Terpstra and Shirley Moore} } @article {icl:686, title = {Power Aware Computing on GPUs}, journal = {SAAHPC {\textquoteright}12 (Best Paper Award)}, year = {2012}, month = {2012-07}, address = {Argonne, IL}, keywords = {magma}, author = {Kiran Kasichayanula and Dan Terpstra and Piotr Luszczek and Stanimire Tomov and Shirley Moore and Gregory D. 
Peterson} } @article {icl:718, title = {Preliminary Results of Autotuning GEMM Kernels for the NVIDIA Kepler Architecture}, journal = {LAWN 267}, year = {2012}, month = {2012-00}, author = {Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @inproceedings {icl:665, title = {Programming the LU Factorization for a Multicore System with Accelerators}, journal = {Proceedings of VECPAR{\textquoteright}12}, year = {2012}, month = {2012-04}, address = {Kobe, Japan}, keywords = {plasma, quark}, author = {Jakub Kurzak and Piotr Luszczek and Mathieu Faverge and Jack Dongarra} } @techreport {icl:660, title = {Achieving Numerical Accuracy and High Performance using Recursive Tile LU Factorization}, journal = {University of Tennessee Computer Science Technical Report (also as a LAWN)}, number = {ICL-UT-11-08}, year = {2011}, month = {2011-09}, keywords = {plasma, quark}, author = {Jack Dongarra and Mathieu Faverge and Hatem Ltaeif and Piotr Luszczek} } @inproceedings {icl:591, title = {BlackjackBench: Hardware Characterization with Portable Micro-Benchmarks and Automatic Statistical Analysis of Results}, journal = {IEEE International Parallel and Distributed Processing Symposium (submitted)}, year = {2011}, month = {2011-05}, address = {Anchorage, AK}, author = {Anthony Danalis and Piotr Luszczek and Gabriel Marin and Jeffrey Vetter and Jack Dongarra} } @article {icl:587, title = {Changes in Dense Linear Algebra Kernels - Decades Long Perspective}, journal = {in Solving the Schr{\"o}dinger Equation: Has everything been tried? (to appear)}, year = {2011}, month = {2011-00}, publisher = {Imperial College Press}, author = {Piotr Luszczek and Jakub Kurzak and Jack Dongarra}, editor = {P. Popelier} } @inproceedings {icl:616, title = {Evaluation of the HPC Challenge Benchmarks in Virtualized Environments}, journal = {6th Workshop on Virtualization in High-Performance Cloud Computing}, year = {2011}, month = {2011-08}, address = {Bordeaux, France}, keywords = {hpcc}, author = {Piotr Luszczek and Eric Meek and Shirley Moore and Dan Terpstra and Vincent M Weaver and Jack Dongarra} } @inproceedings {icl:611, title = {Exploiting Fine-Grain Parallelism in Recursive LU Factorization}, journal = {Proceedings of PARCO{\textquoteright}11}, number = {ICL-UT-11-04}, year = {2011}, month = {2011-04}, address = {Gent, Belgium}, keywords = {plasma}, author = {Jack Dongarra and Mathieu Faverge and Hatem Ltaeif and Piotr Luszczek} } @inproceedings {icl:676, title = {Flexible Development of Dense Linear Algebra Algorithms on Massively Parallel Architectures with DPLASMA}, journal = {Proceedings of the Workshops of the 25th IEEE International Symposium on Parallel and Distributed Processing (IPDPS 2011 Workshops)}, year = {2011}, month = {2011-05}, pages = {1432-1441}, publisher = {IEEE}, address = {Anchorage, Alaska, USA}, keywords = {dague, dplasma, parsec}, author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Mathieu Faverge and Azzam Haidar and Thomas Herault and Jakub Kurzak and Julien Langou and Pierre Lemariner and Hatem Ltaeif and Piotr Luszczek and Asim YarKhan and Jack Dongarra} } @techreport {icl:662, title = {GPU-Accelerated Asynchronous Error Correction for Mixed Precision Iterative Refinement}, journal = {University of Tennessee Computer Science Technical Report UT-CS-11-690 (also Lawn 260)}, year = {2011}, month = {2011-12}, keywords = {magma}, author = {Hartwig Anzt and Piotr Luszczek and Jack Dongarra and Vincent Heuveline} } @techreport {icl:629, title = {High Performance
Bidiagonal Reduction using Tile Algorithms on Homogeneous Multicore Architectures}, journal = {University of Tennessee Computer Science Technical Report, UT-CS-11-673, (also Lawn 247)}, year = {2011}, month = {2011-05}, keywords = {plasma}, author = {Hatem Ltaeif and Piotr Luszczek and Jack Dongarra} } @article {icl:622, title = {High Performance Dense Linear System Solver with Soft Error Resilience}, journal = {IEEE Cluster 2011}, year = {2011}, month = {2011-09}, address = {Austin, TX}, keywords = {ft-la}, author = {Peng Du and Piotr Luszczek and Jack Dongarra} } @inproceedings {icl:658, title = {High Performance Matrix Inversion Based on LU Factorization for Multicore Architectures}, journal = {Proceedings of MTAGS11}, year = {2011}, month = {2011-11}, address = {Seattle, WA}, author = {Jack Dongarra and Mathieu Faverge and Hatem Ltaeif and Piotr Luszczek} } @inproceedings {icl:621, title = {Profiling High Performance Dense Linear Algebra Algorithms on Multicore Architectures for Power and Energy Efficiency}, journal = {International Conference on Energy-Aware High Performance Computing (EnA-HPC 2011)}, year = {2011}, month = {2011-09}, address = {Hamburg, Germany}, keywords = {mumi}, author = {Hatem Ltaeif and Piotr Luszczek and Jack Dongarra} } @techreport {icl:625, title = {Soft Error Resilient QR Factorization for Hybrid System}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-CS-11-675}, year = {2011}, month = {2011-07}, address = {Knoxville, TN}, keywords = {ft-la}, author = {Peng Du and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @article {icl:635, title = {Soft Error Resilient QR Factorization for Hybrid System}, journal = {UT-CS-11-675 (also LAPACK Working Note $\#$252)}, number = {ICL-CS-11-675}, year = {2011}, month = {2011-07}, keywords = {magma}, author = {Peng Du and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @article {icl:642, title = {Soft Error Resilient QR Factorization for Hybrid System with GPGPU}, journal = {Journal of Computational Science}, year = {2011}, month = {2011-11}, publisher = {Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems at SC11}, address = {Seattle, WA}, keywords = {ft-la}, author = {Peng Du and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @inproceedings {icl:592, title = {Two-stage Tridiagonal Reduction for Dense Symmetric Matrices using Tile Algorithms on Multicore Architectures}, journal = {IEEE International Parallel and Distributed Processing Symposium (submitted)}, year = {2011}, month = {2011-05}, address = {Anchorage, AK}, author = {Piotr Luszczek and Hatem Ltaeif and Jack Dongarra} } @techreport {icl:536, title = {Analysis of Various Scalar, Vector, and Parallel Implementations of RandomAccess}, journal = {Innovative Computing Laboratory (ICL) Technical Report}, number = {ICL-UT-10-03}, year = {2010}, month = {2010-06}, keywords = {hpcc}, author = {Piotr Luszczek and Jack Dongarra} } @techreport {icl:563, title = {Distributed Dense Numerical Linear Algebra Algorithms on Massively Parallel Architectures: DPLASMA}, journal = {University of Tennessee Computer Science Technical Report, UT-CS-10-660}, year = {2010}, month = {2010-09}, keywords = {dague, dplasma, parsec, plasma}, author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Mathieu Faverge and Azzam Haidar and Thomas Herault and Jakub Kurzak and Julien Langou and Pierre Lemariner and Hatem Ltaeif and Piotr Luszczek and Asim YarKhan and Jack Dongarra} } @techreport 
{icl:529, title = {Distributed-Memory Task Execution and Dependence Tracking within DAGuE and the DPLASMA Project}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-10-02}, year = {2010}, month = {2010-00}, keywords = {dague, plasma}, author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Mathieu Faverge and Azzam Haidar and Thomas Herault and Jakub Kurzak and Julien Langou and Pierre Lemariner and Hatem Ltaeif and Piotr Luszczek and Asim YarKhan and Jack Dongarra} } @inproceedings {icl:562, title = {Mixed-Tool Performance Analysis on Hybrid Multicore Architectures}, journal = {First International Workshop on Parallel Software Tools and Tool Infrastructures (PSTI 2010)}, year = {2010}, month = {2010-09}, address = {San Diego, CA}, keywords = {magma}, author = {Peng Du and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @inproceedings {icl:549, title = {OpenCL Evaluation for Numerical Linear Algebra Library Development}, journal = {Symposium on Application Accelerators in High-Performance Computing (SAAHPC {\textquoteright}10)}, year = {2010}, month = {2010-07}, address = {Knoxville, TN}, keywords = {magma}, author = {Peng Du and Piotr Luszczek and Jack Dongarra} } @techreport {icl:578, title = {Reducing the time to tune parallel dense linear algebra routines with partial execution and performance modelling}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-CS-10-661}, year = {2010}, month = {2010-10}, keywords = {hpcc}, author = {Jack Dongarra and Piotr Luszczek} } @article {icl:620, title = {Using MAGMA with PGI Fortran}, journal = {PGI Insider}, year = {2010}, month = {2010-11}, keywords = {magma}, author = {Stanimire Tomov and Mathieu Faverge and Piotr Luszczek and Jack Dongarra} } @article {, title = {Accelerating Scientific Computations with Mixed Precision Algorithms}, journal = {Computer Physics Communications}, volume = {180}, year = {2009}, month = {2009-12}, pages = {2526-2533}, abstract = {On modern architectures, the performance of 32-bit operations is often at least twice that of 64-bit operations. By using a combination of 32-bit and 64-bit floating point arithmetic, the performance of many dense and sparse linear algebra algorithms can be significantly enhanced while maintaining the 64-bit accuracy of the resulting solution. The approach presented here can apply not only to conventional processors but also to other technologies such as Field Programmable Gate Arrays (FPGA), Graphical Processing Units (GPU), and the STI Cell BE processor.
Results on modern processor architectures and the STI Cell BE are presented.}, doi = {https://doi.org/10.1016/j.cpc.2008.11.005}, author = {Marc Baboulin and Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Julie Langou and Julien Langou and Piotr Luszczek and Stanimire Tomov} } @article {1352, title = {Numerical Linear Algebra on Emerging Architectures: The PLASMA and MAGMA Projects}, year = {2009}, month = {2009-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC09)}, address = {Portland, OR}, author = {Emmanuel Agullo and James Demmel and Jack Dongarra and Bilel Hadri and Jakub Kurzak and Julien Langou and Hatem Ltaeif and Piotr Luszczek and Rajib Nath and Stanimire Tomov and Asim YarKhan and Vasily Volkov} } @inproceedings {icl:486, title = {Numerical Linear Algebra on Emerging Architectures: The PLASMA and MAGMA Projects}, journal = {Journal of Physics: Conference Series}, volume = {180}, year = {2009}, month = {2009-00}, keywords = {magma, plasma}, author = {Emmanuel Agullo and James Demmel and Jack Dongarra and Bilel Hadri and Jakub Kurzak and Julien Langou and Hatem Ltaeif and Piotr Luszczek and Stanimire Tomov} } @article {icl:502, title = {Parallel Programming in MATLAB}, journal = {The International Journal of High Performance Computing Applications}, volume = {23}, number = {3}, year = {2009}, month = {2009-07}, pages = {277-283}, keywords = {lfc, plasma}, author = {Piotr Luszczek} } @article {icl:451, title = {DARPA{\textquoteright}s HPCS Program: History, Models, Tools, Languages}, journal = {in Advances in Computers}, volume = {72}, year = {2008}, month = {2008-01}, publisher = {Elsevier}, author = {Jack Dongarra and Robert Graybill and William Harrod and Robert Lucas and Ewing Lusk and Piotr Luszczek and Janice McMahon and Allan Snavely and Jeffrey Vetter and Katherine Yelick and Sadaf Alam and Roy Campbell and Laura Carrington and Tzu-Yi Chen and Omid Khalili and Jeremy Meredith and Mustafa Tikir}, editor = {M. Zelkowitz} } @article {icl:449, title = {Exploiting Mixed Precision Floating Point Hardware in Scientific Computations}, journal = {in High Performance Computing and Grids in Action}, year = {2008}, month = {2008-01}, publisher = {IOS Press}, address = {Amsterdam}, author = {Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Julien Langou and Julie Langou and Piotr Luszczek and Stanimire Tomov}, editor = {Lucio Grandinetti} } @article {icl:450, title = {How Elegant Code Evolves With Hardware: The Case Of Gaussian Elimination}, journal = {in Beautiful Code Leading Programmers Explain How They Think (Chapter 14)}, year = {2008}, month = {2008-01}, pages = {243-282}, author = {Jack Dongarra and Piotr Luszczek}, editor = {Andy Oram and G.
@techreport {icl:427, title = {HPCS Library Study Effort}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-CS-08-617}, year = {2008}, month = {2008-01}, author = {Jack Dongarra and James Demmel and Parry Husbands and Piotr Luszczek} } @article {icl:436, title = {The LINPACK Benchmark: Past, Present, and Future}, journal = {Concurrency and Computation: Practice and Experience}, volume = {15}, year = {2008}, month = {2008-00}, pages = {803-820}, keywords = {hpl}, author = {Jack Dongarra and Piotr Luszczek and Antoine Petitet} } @article {icl:444, title = {The PlayStation 3 for High Performance Scientific Computing}, journal = {Computing in Science and Engineering}, year = {2008}, month = {2008-01}, pages = {80-83}, author = {Jakub Kurzak and Alfredo Buttari and Piotr Luszczek and Jack Dongarra} } @techreport {icl:406, title = {The PlayStation 3 for High Performance Scientific Computing}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-CS-08-608}, year = {2008}, month = {2008-01}, author = {Jakub Kurzak and Alfredo Buttari and Piotr Luszczek and Jack Dongarra} } @article {icl:424, title = {Using Mixed Precision for Sparse Matrix Computations to Enhance the Performance while Achieving 64-bit Accuracy}, journal = {ACM Transactions on Mathematical Software}, volume = {34}, number = {4}, year = {2008}, month = {2008-00}, pages = {17-22}, keywords = {plasma}, author = {Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov} } @article {icl:392, title = {Exploiting Mixed Precision Floating Point Hardware in Scientific Computations}, journal = {High Performance Computing and Grids in Action (to appear)}, year = {2007}, month = {2007-00}, publisher = {IOS Press}, address = {Amsterdam}, author = {Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Julien Langou and Julie Langou and Piotr Luszczek and Stanimire Tomov}, editor = {Lucio Grandinetti} } @article {icl:398, title = {High Performance Development for High End Computing with Python Language Wrapper (PLW)}, journal = {International Journal of High Performance Computing Applications}, volume = {21}, number = {3}, year = {2007}, month = {2007-00}, pages = {360-369}, author = {Jack Dongarra and Piotr Luszczek} } @article {icl:365, title = {How Elegant Code Evolves With Hardware: The Case Of Gaussian Elimination}, journal = {Beautiful Code: Leading Programmers Explain How They Think}, year = {2007}, month = {2007-06}, publisher = {O{\textquoteright}Reilly Media, Inc.}, author = {Jack Dongarra and Piotr Luszczek}, editor = {Andy Oram and Greg Wilson} } @article {icl:395, title = {Mixed Precision Iterative Refinement Techniques for the Solution of Dense Linear Systems}, journal = {International Journal of High Performance Computing Applications (to appear)}, year = {2007}, month = {2007-08}, author = {Alfredo Buttari and Jack Dongarra and Julien Langou and Julie Langou and Piotr Luszczek and Jakub Kurzak} } @techreport {icl:364, title = {SCOP3: A Rough Guide to Scientific Computing On the PlayStation 3}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-CS-07-595}, year = {2007}, month = {2007-00}, keywords = {multi-core}, author = {Alfredo Buttari and Piotr Luszczek and Jakub Kurzak and Jack Dongarra and George Bosilca} }
@techreport {icl:317, title = {Exploiting the Performance of 32 bit Floating Point Arithmetic in Obtaining 64 bit Accuracy}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-CS-06-574, LAPACK Working Note $\#$175}, year = {2006}, month = {2006-04}, keywords = {iter-ref}, author = {Julien Langou and Julie Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Jack Dongarra} } @article {icl:320, title = {High Performance Development for High End Computing with Python Language Wrapper (PLW)}, journal = {International Journal of High Performance Computing Applications (to appear)}, year = {2006}, month = {2006-00}, keywords = {hpcc, lfc}, author = {Piotr Luszczek} } @inproceedings {icl:337, title = {The HPC Challenge (HPCC) Benchmark Suite}, booktitle = {SC06 Conference Tutorial}, year = {2006}, month = {2006-11}, publisher = {IEEE}, address = {Tampa, Florida}, keywords = {hpcc, hpcchallenge}, author = {Piotr Luszczek and David Bailey and Jack Dongarra and Jeremy Kepner and Robert Lucas and Rolf Rabenseifner and Daisuke Takahashi} } @inproceedings {icl:369, title = {The Impact of Multicore on Math Software}, booktitle = {PARA 2006}, year = {2006}, month = {2006-06}, address = {Ume{\aa}, Sweden}, keywords = {plasma}, author = {Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Julien Langou and Piotr Luszczek and Stanimire Tomov} } @inproceedings {icl:370, title = {Prospectus for the Next LAPACK and ScaLAPACK Libraries}, booktitle = {PARA 2006}, year = {2006}, month = {2006-06}, address = {Ume{\aa}, Sweden}, author = {James Demmel and Jack Dongarra and B. Parlett and William Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye Li and Osni Marques and Jason E. Riedy and Christof Voemel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov} } @article {icl:332, title = {Self Adapting Numerical Software (SANS) Effort}, journal = {IBM Journal of Research and Development}, volume = {50}, number = {2/3}, year = {2006}, month = {2006-01}, pages = {223-238}, keywords = {gco}, author = {George Bosilca and Zizhong Chen and Jack Dongarra and Victor Eijkhout and Graham Fagg and Erika Fuentes and Julien Langou and Piotr Luszczek and Jelena Pjesivac-Grbovic and Keith Seymour and Haihang You and Sathish Vadhiyar} } @article {icl:304, title = {HPC Challenge v1.x Benchmark Suite}, journal = {SC|05 Tutorial - S13}, year = {2005}, month = {2005-11}, address = {Seattle, Washington}, keywords = {hpcc}, author = {Piotr Luszczek and David Koester} } @article {icl:282, title = {Introduction to the HPC Challenge Benchmark Suite}, year = {2005}, month = {2005-03}, keywords = {hpcc, hpcchallenge}, author = {Piotr Luszczek and Jack Dongarra and David Koester and Rolf Rabenseifner and Bob Lucas and Jeremy Kepner and John McCalpin and David Bailey and Daisuke Takahashi} } @techreport {icl:253, title = {Introduction to the HPC Challenge Benchmark Suite}, journal = {ICL Technical Report}, number = {ICL-UT-05-01}, year = {2005}, note = {Also appears as CS Dept. Tech Report UT-CS-05-544}, month = {2005-01}, keywords = {hpcc, hpcchallenge}, author = {Jack Dongarra and Piotr Luszczek} }
@article {icl:236, title = {Cray X1 Evaluation Status Report}, journal = {Oak Ridge National Laboratory Report}, volume = {ORNL/TM-2004/13}, year = {2004}, month = {2004-01}, author = {Pratul Agarwal and R. A. Alexander and E. Apra and Satish Balay and Arthur S. Bland and James Colgan and Eduardo D{\textquoteright}Azevedo and Jack Dongarra and Tom Dunigan and Mark Fahey and Al Geist and M. Gordon and Robert Harrison and Dinesh Kaushik and M. Krishnakumar and Piotr Luszczek and Tony Mezzacappa and Jeff Nichols and Jarek Nieplocha and Leonid Oliker and T. Packwood and M. Pindzola and Thomas C. Schulthess and Jeffrey Vetter and James B. White and T. Windus and Patrick H. Worley and Thomas Zacharia} } @inproceedings {icl:201, title = {Design of an Interactive Environment for Numerically Intensive Parallel Linear Algebra Calculations}, booktitle = {International Conference on Computational Science}, year = {2004}, month = {2004-06}, publisher = {Springer Verlag}, address = {Poland}, keywords = {lacsi, lfc}, doi = {10.1007/978-3-540-25944-2_35}, author = {Piotr Luszczek and Jack Dongarra}, editor = {Marian Bubak and Geert Dick van Albada and Peter M. Sloot and Jack Dongarra} } @inproceedings {icl:142, title = {LAPACK for Clusters Project: An Example of Self Adapting Numerical Software}, booktitle = {Proceedings of the 37th Annual Hawaii International Conference on System Sciences (HICSS {\textquoteright}04)}, volume = {9}, year = {2004}, month = {2004-01}, pages = {90282}, address = {Big Island, Hawaii}, keywords = {lacsi, lfc}, author = {Zizhong Chen and Jack Dongarra and Piotr Luszczek and Kenneth Roche} } @article {icl:136, title = {Self Adapting Software for Numerical Linear Algebra and LAPACK for Clusters}, journal = {Parallel Computing}, volume = {29}, number = {11-12}, year = {2003}, month = {2003-11}, pages = {1723-1743}, keywords = {lacsi, lfc, sans}, author = {Zizhong Chen and Jack Dongarra and Piotr Luszczek and Kenneth Roche} } @techreport {icl:209, title = {Self Adapting Software for Numerical Linear Algebra and LAPACK for Clusters (LAPACK Working Note 160)}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-CS-03-499}, year = {2003}, month = {2003-01}, keywords = {lacsi}, author = {Zizhong Chen and Jack Dongarra and Piotr Luszczek and Kenneth Roche} } @article {icl:81, title = {Recursive Approach in Sparse Matrix LU Factorization}, journal = {Scientific Programming}, volume = {9}, number = {1}, year = {2001}, month = {2001-00}, pages = {51-60}, author = {Jack Dongarra and Victor Eijkhout and Piotr Luszczek} } @inproceedings {icl:38, title = {Recursive Approach in Sparse Matrix LU Factorization}, booktitle = {Proceedings of 1st SGI Users Conference}, year = {2000}, month = {2000-01}, pages = {409-418}, publisher = {ACC Cyfronet UMM}, address = {Cracow, Poland}, author = {Jack Dongarra and Victor Eijkhout and Piotr Luszczek} }