@conference {, title = {Mixed Precision Algebraic Multigrid on GPUs}, booktitle = {Parallel Processing and Applied Mathematics (PPAM 2022)}, volume = {13826}, year = {2023}, month = {2023-04}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Cham}, abstract = {In this paper, we present the first GPU-native platform-portable algebraic multigrid (AMG) implementation that allows the user to employ different precision formats on the distinct multigrid levels. The AMG we present uses an aggregation-size-2 parallel graph matching as the coarsening strategy. The implementation provides a high level of flexibility in configuring the bottom-level solver and the precision format for the distinct levels. We present convergence and performance results on GPUs from AMD, Intel, and NVIDIA, and compare against corresponding functionality available in other libraries.}, keywords = {Algebraic multigrid, GPUs, mixed precision, Portability}, isbn = {978-3-031-30441-5}, doi = {10.1007/978-3-031-30442-2_9}, url = {https://link.springer.com/10.1007/978-3-031-30442-2}, author = {Tsai, Yu-Hsiang Mike and Beams, Natalie and Anzt, Hartwig}, editor = {Wyrzykowski, Roman and Dongarra, Jack and Deelman, Ewa and Karczewski, Konrad} } @article {, title = {A survey of numerical linear algebra methods utilizing mixed-precision arithmetic}, journal = {The International Journal of High Performance Computing Applications}, volume = {35}, number = {4}, year = {2021}, pages = {344{\textendash}369}, abstract = {The efficient utilization of mixed-precision numerical linear algebra algorithms can offer attractive acceleration to scientific computing applications. Especially with the hardware integration of low-precision special-function units designed for machine learning applications, the traditional numerical algorithms community urgently needs to reconsider the floating-point formats used in the distinct operations to efficiently leverage the available compute power. In this work, we provide a comprehensive survey of mixed-precision numerical linear algebra routines, including the underlying concepts, theoretical background, and experimental results for both dense and sparse linear algebra problems.}, keywords = {GPUs, High-performance computing, linear algebra, Mixed-precision arithmetic, numerical mathematics}, doi = {10.1177/10943420211003313}, author = {Abdelfattah, Ahmad and Anzt, Hartwig and Boman, Erik G and Carson, Erin and Cojean, Terry and Dongarra, Jack and Fox, Alyson and Gates, Mark and Higham, Nicholas J and Li, Xiaoye S and others} } @conference {940, title = {On the Development of Variable Size Batched Computation for Heterogeneous Parallel Architectures}, booktitle = {The 17th IEEE International Workshop on Parallel and Distributed Scientific and Engineering Computing (PDSEC 2016), IPDPS 2016}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {
Many scientific applications, ranging from national security to medical advances, require solving a number of relatively small, independent problems. As the size of each individual problem does not provide sufficient parallelism for the underlying hardware, especially accelerators, these problems must be solved concurrently as a batch in order to saturate the hardware with enough work, hence the name batched computation. A possible simplification is to assume a uniform size for all problems. However, real applications do not necessarily satisfy such an assumption. Consequently, an efficient solution for variable-size batched computations is required.
This paper proposes a foundation for high performance variable-size batched matrix computation based on Graphics Processing Units (GPUs). Being throughput-oriented processors, GPUs favor regular computation with little divergence among threads in order to achieve high performance. Therefore, the development of high performance numerical software for this kind of problem is challenging. As a case study, we developed efficient batched Cholesky factorization algorithms for relatively small matrices of different sizes. However, most of the strategies and the software developed, and in particular a set of variable-size batched BLAS kernels, can be used in many other dense matrix factorizations, large-scale sparse direct multifrontal solvers, and applications. We propose new interfaces and mechanisms to handle the irregular computation pattern on the GPU. To the authors{\textquoteright} knowledge, this is the first attempt to develop high performance software for this class of problems. Using a K40c GPU, our performance tests show speedups of up to 2.5{\texttimes} against two Sandy Bridge CPUs (8-core each) running the Intel MKL library.
}, keywords = {batched computation, GPUs, variable small sizes}, author = {Abdelfattah, Ahmad and Haidar, Azzam and Tomov, Stanimire and Dongarra, Jack} } @conference {943, title = {Performance Tuning and Optimization Techniques of Fixed and Variable Size Batched Cholesky Factorization on GPUs}, booktitle = {International Conference on Computational Science (ICCS{\textquoteright}16)}, year = {2016}, month = {2016-06}, address = {San Diego, CA}, abstract = {Solving a large number of relatively small linear systems has recently drawn more attention in the HPC community, due to the importance of such computational workloads in many scientific applications, including sparse multifrontal solvers. Modern hardware accelerators and their architectures require a set of optimization techniques that are very different from the ones used in solving one relatively large matrix. In order to impose concurrency on such throughput-oriented architectures, a common practice is to batch the solution of these matrices as one task offloaded to the underlying hardware, rather than solving them individually.
This paper presents a high performance batched Cholesky factorization on large sets of relatively small matrices using Graphics Processing Units (GPUs), and addresses both fixed- and variable-size batched problems. We investigate various algorithm designs and optimization techniques, and show that it is essential to combine kernel design with performance tuning in order to achieve the best possible performance. We compare our approaches against state-of-the-art CPU solutions as well as GPU-based solutions using existing libraries, and show that, on a K40c GPU for example, our kernels are more than 2{\texttimes} faster.
}, keywords = {batched computation, Cholesky Factorization, GPUs, Tuning}, author = {Abdelfattah, Ahmad and Haidar, Azzam and Tomov, Stanimire and Dongarra, Jack} }