@conference {, title = {Scalable Data Generation for Evaluating Mixed-Precision Solvers}, booktitle = {2020 IEEE High Performance Extreme Computing Conference (HPEC)}, year = {2020}, month = {2020-09}, publisher = {IEEE}, organization = {IEEE}, address = {Waltham, MA, USA}, abstract = {We present techniques for generating data for mixed-precision solvers that allow those solvers to be tested in a scalable manner. Our techniques focus on mixed-precision hardware and software where both the solver and the hardware can take advantage of mixing multiple floating-point formats. This allows taking advantage of the recently released generation of hardware platforms that focus on ML and DNN workloads but can also be utilized for HPC applications if a new breed of algorithms is combined with the custom floating-point formats to deliver performance levels beyond the standard IEEE data types while delivering comparable accuracy of the results.}, doi = {https://doi.org/10.1109/HPEC43674.2020.9286145}, author = {Piotr Luszczek and Yaohung Tsai and Neil Lindquist and Hartwig Anzt and Jack Dongarra} }

@techreport {, title = {A Survey of Numerical Methods Utilizing Mixed Precision Arithmetic}, journal = {SLATE Working Notes}, number = {15, ICL-UT-20-08}, year = {2020}, month = {2020-07}, publisher = {University of Tennessee}, type = {SLATE Working Notes}, author = {Ahmad Abdelfattah and Hartwig Anzt and Erik Boman and Erin Carson and Terry Cojean and Jack Dongarra and Mark Gates and Thomas Gruetzmacher and Nicholas J. Higham and Sherry Li and Neil Lindquist and Yang Liu and Jennifer Loe and Piotr Luszczek and Pratik Nayak and Sri Pranesh and Siva Rajamanickam and Tobias Ribizel and Barry Smith and Kasia Swirydowicz and Stephen Thomas and Stanimire Tomov and Yaohung Tsai and Ichitaro Yamazaki and Ulrike Meier Yang} }

@article {, title = {Using Quantized Integer in LU Factorization with Partial Pivoting (Poster)}, year = {2020}, month = {2020-02}, publisher = {SIAM Conference on Parallel Processing for Scientific Computing (SIAM PP20)}, address = {Seattle, WA}, abstract = {Quantization is a common technique for speeding up deep learning inference. It uses integers with a shared scalar to represent a set of equally spaced numbers. The quantized-integer method has shown great success in compressing deep learning models, reducing the computational cost without losing too much accuracy. New application-specific hardware and specialized CPU instruction-set extensions such as Intel AVX-512 VNNI provide the capability to perform integer MADD (multiply and add) operations efficiently. In this poster, we show our preliminary results of using quantized integers for LU factorization with partial pivoting. Using Int32, the backward error can outperform that of single precision. However, quantized integers have the same limited-range issue as FP16: the method does not work directly for large matrices, because large entries occur in the factored U. We will show some possible solutions and how we would like to apply this quantized-integer technique to other numerical linear algebra applications.}, author = {Yaohung Tsai and Piotr Luszczek and Jack Dongarra} }

@conference {1373, title = {Massively Parallel Automated Software Tuning}, booktitle = {48th International Conference on Parallel Processing (ICPP 2019)}, year = {2019}, month = {2019-08}, publisher = {ACM Press}, organization = {ACM Press}, address = {Kyoto, Japan}, abstract = {This article presents an implementation of a distributed autotuning engine developed as part of the Bench-testing OpenN Software Autotuning Infrastructure project. The system is geared towards performance optimization of computational kernels for graphics processing units and allows for the deployment of vast autotuning sweeps to massively parallel machines. The software implements dynamic work scheduling to distributed-memory resources, takes advantage of multithreading for parallel compilation, and dispatches kernel launches to multiple accelerators. This paper lays out the main design principles of the system and discusses the basic mechanics of the initial implementation. Preliminary performance results are presented, the challenges encountered are discussed, and future directions are outlined.}, doi = {https://doi.org/10.1145/3337821.3337908}, author = {Jakub Kurzak and Yaohung Tsai and Mark Gates and Ahmad Abdelfattah and Jack Dongarra} }

@article {1271, title = {Autotuning Numerical Dense Linear Algebra for Batched Computation With GPU Hardware Accelerators}, journal = {Proceedings of the IEEE}, volume = {106}, year = {2018}, month = {2018-11}, pages = {2040{\textendash}2055}, abstract = {Computational problems in engineering and scientific disciplines often rely on the solution of many instances of small systems of linear equations, which are called batched solves. In this paper, we focus on the important variants of both batch Cholesky factorization and subsequent substitution. The former requires the linear system matrices to be symmetric positive definite (SPD). We describe the implementation and automated performance engineering of these kernels that implement the factorization and the two substitutions. Our target platforms are graphics processing units (GPUs), which over the past decade have become an attractive high-performance computing (HPC) target for solvers of linear systems of equations. Due to their throughput-oriented design, GPUs exhibit the highest processing rates among the available processors. However, without careful design and coding, this speed is mostly restricted to large matrix sizes. We show an automated exploration of the implementation space as well as a new data layout for the batched class of SPD solvers. Our tests involve the solution of many thousands of linear SPD systems of exactly the same size. The primary focus of our techniques is on the individual matrices in the batch, which have dimensions ranging from 5-by-5 up to 100-by-100. We compare our autotuned solvers against state-of-the-art solvers such as those provided through NVIDIA channels and publicly available in the optimized MAGMA library. The observed performance is competitive and, for many practical cases, many times superior. The advantage of the presented methodology lies in achieving these results in a portable manner across matrix storage formats and GPU hardware architectures.}, keywords = {Dense numerical linear algebra, performance autotuning}, doi = {10.1109/JPROC.2018.2868961}, author = {Jack Dongarra and Mark Gates and Jakub Kurzak and Piotr Luszczek and Yaohung Tsai} }
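A minimal sketch, in Python/NumPy, of the shared-scale integer quantization idea described in the SIAM PP20 poster abstract above: a set of floating-point values is represented by integers that all share one scaling factor. The function names, the symmetric round-to-nearest scheme, and the default 8-bit width are illustrative assumptions for this sketch, not the authors' LU-factorization code.

import numpy as np

def quantize(x, bits=8):
    # Map a float array to integers sharing a single scale factor.
    # Symmetric round-to-nearest scheme is an assumption of this sketch,
    # not the authors' implementation.
    qmax = 2 ** (bits - 1) - 1               # e.g., 127 for 8-bit integers
    scale = np.max(np.abs(x)) / qmax         # one scalar shared by all entries
    q = np.rint(x / scale).astype(np.int32)
    return q, scale

def dequantize(q, scale):
    # Recover approximate floats from the shared-scale integers.
    return q.astype(np.float64) * scale

rng = np.random.default_rng(0)
a = rng.standard_normal((4, 4))
q, s = quantize(a)
a_hat = dequantize(q, s)
print("max abs error:", np.max(np.abs(a - a_hat)))   # roughly bounded by scale/2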