@techreport{tsai2021mixedprecision,
  title       = {Mixed-Precision Algorithm for Finding Selected Eigenvalues and Eigenvectors of Symmetric and {Hermitian} Matrices},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  type        = {ICL Technical Report},
  number      = {ICL-UT-21-05},
  year        = {2021},
  month       = aug,
  abstract    = {As the new hardware is being equipped with powerful low-precision capabilities driven primarily by the needs of the burgeoning field of Artificial Intelligence (AI), mixed-precision algorithms are now showing far greater potential and renewed interest in the scientific computing community. The multi-precision methods commonly follow an approximate-iterate scheme by first obtaining the approximate solution from a low-precision factorization and solve. Then, they iteratively refine the solution to the desired accuracy that is often as high as what is possible with traditional approaches. While targeting symmetric and Hermitian eigenvalue problems of the form $Ax = \lambda x$, we revisit the SICE algorithm proposed by Dongarra et al. By applying the Sherman-Morrison formula on the diagonally-shifted tridiagonal systems, we propose an updated SICE-SM algorithm. By incorporating the latest two-stage algorithms from the PLASMA and MAGMA software libraries for numerical linear algebra, we achieved up to 3.6x speedup using the mixed-precision eigensolver with the blocked SICE-SM algorithm for iterative refinement when compared with full double complex precision solvers for the cases with a portion of eigenvalues and eigenvectors requested.},
  keywords    = {eigenvalue solver, hardware accelerators, mixed-precision algorithms},
  author      = {Tsai, Yaohung M. and Luszczek, Piotr and Dongarra, Jack},
}

@article{dongarra2021translational,
  title    = {Translational Process: Mathematical Software Perspective},
  journal  = {Journal of Computational Science},
  volume   = {52},
  year     = {2021},
  pages    = {101216},
  abstract = {Each successive generation of computer architecture has brought new challenges to achieving high performance mathematical solvers, necessitating development and analysis of new algorithms, which are then embodied in software libraries. These libraries hide architectural details from applications, allowing them to achieve a level of portability across platforms from desktops to world-class high performance computing (HPC) systems. Thus there has been an informal translational computer science process of developing algorithms and distributing them in open source software libraries for adoption by applications and vendors. With the move to exascale, increasing intentionality about this process will benefit the long-term sustainability of the scientific software stack.},
  keywords = {communication avoiding algorithms, dataflow scheduling runtimes, hardware accelerators},
  doi      = {10.1016/j.jocs.2020.101216},
  author   = {Dongarra, Jack and Gates, Mark and Luszczek, Piotr and Tomov, Stanimire},
}

@techreport{dongarra2020translationaltr,
  title       = {Translational Process: Mathematical Software Perspective},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  type        = {Innovative Computing Laboratory Technical Report},
  number      = {ICL-UT-20-11},
  year        = {2020},
  month       = aug,
  abstract    = {Each successive generation of computer architecture has brought new challenges to achieving high performance mathematical solvers, necessitating development and analysis of new algorithms, which are then embodied in software libraries. These libraries hide architectural details from applications, allowing them to achieve a level of portability across platforms from desktops to world-class high performance computing (HPC) systems. Thus there has been an informal translational computer science process of developing algorithms and distributing them in open source software libraries for adoption by applications and vendors. With the move to exascale, increasing intentionality about this process will benefit the long-term sustainability of the scientific software stack.},
  keywords    = {communication avoiding algorithms, dataflow scheduling runtimes, hardware accelerators},
  author      = {Dongarra, Jack and Gates, Mark and Luszczek, Piotr and Tomov, Stanimire},
}

@article{dongarra2020translationalonline,
  title         = {Translational Process: Mathematical Software Perspective},
  journal       = {Journal of Computational Science},
  year          = {2020},
  month         = sep,
  abstract      = {Each successive generation of computer architecture has brought new challenges to achieving high performance mathematical solvers, necessitating development and analysis of new algorithms, which are then embodied in software libraries. These libraries hide architectural details from applications, allowing them to achieve a level of portability across platforms from desktops to world-class high performance computing (HPC) systems. Thus there has been an informal translational computer science process of developing algorithms and distributing them in open source software libraries for adoption by applications and vendors. With the move to exascale, increasing intentionality about this process will benefit the long-term sustainability of the scientific software stack.},
  keywords      = {communication avoiding algorithms, dataflow scheduling runtimes, hardware accelerators},
  doi           = {10.1016/j.jocs.2020.101216},
  author        = {Dongarra, Jack and Gates, Mark and Luszczek, Piotr and Tomov, Stanimire},
  internal-note = {Advance online version; likely duplicate of dongarra2021translational (same DOI) -- confirm before keeping both},
}

@article{1266,
  title    = {Accelerating the {SVD} Bi-Diagonalization of a Batch of Small Matrices Using {GPUs}},
  journal  = {Journal of Computational Science},
  volume   = {26},
  year     = {2018},
  month    = may,
  pages    = {237--245},
  abstract = {The acceleration of many small-sized linear algebra problems has become extremely challenging for current many-core architectures, and in particular GPUs. Standard interfaces have been proposed for some of these problems, called batched problems, so that they get targeted for optimization and used in a standard way in applications, calling them directly from highly optimized, standard numerical libraries, like (batched) BLAS and LAPACK. While most of the developments have been for one-sided factorizations and solvers, many important applications -- from big data analytics to information retrieval, low-rank approximations for solvers and preconditioners -- require two-sided factorizations, and most notably the SVD factorization. To address these needs and the parallelization challenges related to them, we developed a number of new batched computing techniques and designed batched Basic Linear Algebra Subroutines (BLAS) routines, and in particular the Level-2 BLAS GEMV and the Level-3 BLAS GEMM routines, to solve them. We propose a device functions-based methodology and big-tile setting techniques in our batched BLAS design. The different optimization techniques result in many software versions that must be tuned, for which we adopt an auto-tuning strategy to automatically derive the optimized instances of the routines. We illustrate our batched BLAS approach to optimize batched SVD bi-diagonalization progressively on GPUs. The progression is illustrated on an NVIDIA K40c GPU, and also, ported and presented on AMD Fiji Nano GPU, using AMD's Heterogeneous-Compute Interface for Portability (HIP) C++ runtime API. We demonstrate achieving 80\% of the theoretically achievable peak performance for the overall algorithm, and significant acceleration of the Level-2 BLAS GEMV and Level-3 BLAS GEMM needed compared to vendor-optimized libraries on GPUs and multicore CPUs. The optimization techniques in this paper are applicable to the other two-sided factorizations as well.},
  keywords = {Batched, Eigenvalue and singular value problems, hardware accelerators, numerical linear algebra, Two-sided factorization algorithms},
  doi      = {10.1016/j.jocs.2018.01.007},
  author   = {Dong, Tingxing and Haidar, Azzam and Tomov, Stanimire and Dongarra, Jack},
}

@article{858,
  title    = {Batched Matrix Computations on Hardware Accelerators Based on {GPUs}},
  journal  = {International Journal of High Performance Computing Applications},
  year     = {2015},
  month    = feb,
  abstract = {Scientific applications require solvers that work on many small size problems that are independent from each other. At the same time, the high-end hardware evolves rapidly and becomes ever more throughput-oriented and thus there is an increasing need for an effective approach to develop energy-efficient, high-performance codes for these small matrix problems that we call batched factorizations. The many applications that need this functionality could especially benefit from the use of GPUs, which currently are four to five times more energy efficient than multicore CPUs on important scientific workloads. This paper, consequently, describes the development of the most common, one-sided factorizations, Cholesky, LU, and QR, for a set of small dense matrices. The algorithms we present together with their implementations are, by design, inherently parallel. In particular, our approach is based on representing the process as a sequence of batched BLAS routines that are executed entirely on a GPU. Importantly, this is unlike the LAPACK and the hybrid MAGMA factorization algorithms that work under drastically different assumptions of hardware design and efficiency of execution of the various computational kernels involved in the implementation. Thus, our approach is more efficient than what works for a combination of multicore CPUs and GPUs for the problems sizes of interest of the application use cases. The paradigm where upon a single chip (a GPU or a CPU) factorizes a single problem at a time is not at all efficient in our applications' context. We illustrate all of these claims through a detailed performance analysis. With the help of profiling and tracing tools, we guide our development of batched factorizations to achieve up to two-fold speedup and three-fold better energy efficiency as compared against our highly optimized batched CPU implementations based on the MKL library. The tested system featured two sockets of Intel Sandy Bridge CPUs; compared with the batched LU factorizations featured in the CUBLAS library for GPUs, we achieve as high as a 2.5$\times$ speedup on the NVIDIA K40 GPU.},
  keywords = {batched factorization, hardware accelerators, numerical linear algebra, numerical software libraries, one-sided factorization algorithms},
  doi      = {10.1177/1094342014567546},
  author   = {Haidar, Azzam and Dong, Tingxing and Luszczek, Piotr and Tomov, Stanimire and Dongarra, Jack},
}

@article{982,
  title    = {Experiences in Autotuning Matrix Multiplication for Energy Minimization on {GPUs}},
  journal  = {Concurrency and Computation: Practice and Experience},
  volume   = {27},
  year     = {2015},
  month    = oct,
  pages    = {5096--5113},
  abstract = {In this paper, we report extensive results and analysis of autotuning the computationally intensive graphics processing units kernel for dense matrix-matrix multiplication in double precision. In contrast to traditional autotuning and/or optimization for runtime performance only, we also take the energy efficiency into account. For kernels achieving equal performance, we show significant differences in their energy balance. We also identify the memory throughput as the most influential metric that trades off performance and energy efficiency. As a result, the performance optimal case ends up not being the most efficient kernel in overall resource use.},
  keywords = {Autotuning, energy efficiency, hardware accelerators, matrix multiplication, power},
  doi      = {10.1002/cpe.3516},
  author   = {Anzt, Hartwig and Haugen, Blake and Kurzak, Jakub and Luszczek, Piotr and Dongarra, Jack},
}

@inproceedings{916,
  title        = {Optimization for Performance and Energy for Batched Matrix Computations on {GPUs}},
  booktitle    = {8th Workshop on General Purpose Processing Using {GPUs} ({GPGPU} 8)},
  year         = {2015},
  month        = feb,
  publisher    = {ACM},
  organization = {ACM},
  address      = {San Francisco, CA},
  abstract     = {As modern hardware keeps evolving, an increasingly effective approach to develop energy efficient and high-performance solvers is to design them to work on many small size independent problems. Many applications already need this functionality, especially for GPUs, which are known to be currently about four to five times more energy efficient than multicore CPUs. We describe the development of the main one-sided factorizations that work for a set of small dense matrices in parallel, and we illustrate our techniques on the LU and Cholesky factorizations. We refer to this mode of operation as a batched factorization. Our approach is based on representing the algorithms as a sequence of batched BLAS routines for GPU-only execution. The goal of avoiding multicore CPU use, e.g., as in the hybrid CPU-GPU algorithms, is to exclusively benefit from the GPU's significantly higher energy efficiency, as well as from the removal of the costly CPU-to-GPU communications. Furthermore, we do not use a single symmetric multiprocessor (on the GPU) to factorize a single problem at a time. We illustrate how our performance analysis and the use of profiling and tracing tools guided the development and optimization of batched factorizations to achieve up to 2-fold speedup and 3-fold better energy efficiency compared to our highly optimized batched CPU implementations based on the MKL library (when using two sockets of Intel Sandy Bridge CPUs). Compared to a batched LU factorization featured in the CUBLAS library for GPUs, we achieved up to 2.5$\times$ speedup on the K40 GPU.},
  keywords     = {batched factorization, hardware accelerators, numerical linear algebra, numerical software libraries, one-sided factorization algorithms},
  doi          = {10.1145/2716282.2716288},
  author       = {Haidar, Azzam and Dong, Tingxing and Luszczek, Piotr and Tomov, Stanimire and Dongarra, Jack},
}

@inproceedings{844,
  title        = {Towards Batched Linear Solvers on Accelerated Hardware Platforms},
  booktitle    = {8th Workshop on General Purpose Processing Using {GPUs} ({GPGPU} 8) co-located with {PPoPP} 2015},
  year         = {2015},
  month        = feb,
  publisher    = {ACM},
  organization = {ACM},
  address      = {San Francisco, CA},
  abstract     = {As hardware evolves, an increasingly effective approach to develop energy efficient, high-performance solvers, is to design them to work on many small and independent problems. Indeed, many applications already need this functionality, especially for GPUs, which are known to be currently about four to five times more energy efficient than multicore CPUs for every floating-point operation. In this paper, we describe the development of the main one-sided factorizations: LU, QR, and Cholesky; that are needed for a set of small dense matrices to work in parallel. We refer to such algorithms as batched factorizations. Our approach is based on representing the algorithms as a sequence of batched BLAS routines for GPU-contained execution. Note that this is similar in functionality to the LAPACK and the hybrid MAGMA algorithms for large-matrix factorizations. But it is different from a straightforward approach, whereby each of GPU's symmetric multiprocessors factorizes a single problem at a time. We illustrate how our performance analysis together with the profiling and tracing tools guided the development of batched factorizations to achieve up to 2-fold speedup and 3-fold better energy efficiency compared to our highly optimized batched CPU implementations based on the MKL library on a two-sockets, Intel Sandy Bridge server. Compared to a batched LU factorization featured in the NVIDIA's CUBLAS library for GPUs, we achieve up to 2.5-fold speedup on the K40 GPU.},
  keywords     = {batched factorization, hardware accelerators, numerical linear algebra, numerical software libraries, one-sided factorization algorithms},
  author       = {Haidar, Azzam and Luszczek, Piotr and Tomov, Stanimire and Dongarra, Jack},
}

@inproceedings{959,
  title     = {Weighted Dynamic Scheduling with Many Parallelism Grains for Offloading of Numerical Workloads to Multiple Varied Accelerators},
  booktitle = {Proceedings of the 6th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems (ScalA'15)},
  number    = {5},
  year      = {2015},
  month     = nov,
  publisher = {ACM},
  address   = {Austin, TX},
  abstract  = {A wide variety of heterogeneous compute resources are available to modern computers, including multiple sockets containing multicore CPUs, one-or-more GPUs of varying power, and coprocessors such as the Intel Xeon Phi. The challenge faced by domain scientists is how to efficiently and productively use these varied resources. For example, in order to use GPUs effectively, the workload must have a greater degree of parallelism than a workload designed for a multicore-CPU. The domain scientist would have to design and schedule an application in multiple degrees of parallelism and task grain sizes in order to obtain efficient performance from the resources. We propose a productive programming model starting from serial code, which achieves parallelism and scalability by using a task-superscalar runtime environment to adapt the computation to the available resources. The adaptation is done at multiple points, including multi-level data partitioning, adaptive task grain sizes, and dynamic task scheduling. The effectiveness of this approach for utilizing multi-way heterogeneous hardware resources is demonstrated by implementing dense linear algebra applications.},
  keywords  = {dataflow scheduling, hardware accelerators, multi-grain parallelism},
  author    = {Haidar, Azzam and Jia, Yulu and Luszczek, Piotr and Tomov, Stanimire and YarKhan, Asim and Dongarra, Jack},
}

@article{831,
  title    = {Model-Driven One-Sided Factorizations on Multicore, Accelerated Systems},
  journal  = {Supercomputing Frontiers and Innovations},
  volume   = {1},
  year     = {2014},
  abstract = {Hardware heterogeneity of the HPC platforms is no longer considered unusual but instead has become the most viable way forward towards Exascale. In fact, the multitude of the heterogeneous resources available to modern computers are designed for different workloads and their efficient use is closely aligned with the specialized role envisaged by their design. Commonly in order to efficiently use such GPU resources, the workload in question must have a much greater degree of parallelism than workloads often associated with multicore processors (CPUs). Available GPU variants differ in their internal architecture and, as a result, are capable of handling workloads of varying degrees of complexity and a range of computational patterns. This vast array of applicable workloads will likely lead to an ever accelerated mixing of multicore-CPUs and GPUs in multi-user environments with the ultimate goal of offering adequate computing facilities for a wide range of scientific and technical workloads. In the following paper, we present a research prototype that uses a lightweight runtime environment to manage the resource-specific workloads, and to control the dataflow and parallel execution in hybrid systems. Our lightweight runtime environment uses task superscalar concepts to enable the developer to write serial code while providing parallel execution. This concept is reminiscent of dataflow and systolic architectures in its conceptualization of a workload as a set of side-effect-free tasks that pass data items whenever the associated work assignment have been completed. Additionally, our task abstractions and their parametrization enable uniformity in the algorithmic development across all the heterogeneous resources without sacrificing precious compute cycles. We include performance results for dense linear algebra functions which demonstrate the practicality and effectiveness of our approach that is aptly capable of full utilization of a wide range of accelerator hardware.},
  keywords = {dense linear algebra, hardware accelerators, task superscalar scheduling},
  doi      = {10.14529/jsfi1401},
  author   = {Dongarra, Jack and Haidar, Azzam and Kurzak, Jakub and Luszczek, Piotr and Tomov, Stanimire and YarKhan, Asim},
}