@misc{melnichenko2024cqrrpt,
  title         = {{CholeskyQR} with Randomization and Pivoting for Tall Matrices ({CQRRPT})},
  author        = {Melnichenko, Maksim and Balabanov, Oleg and Murray, Riley and Demmel, James and Mahoney, Michael W. and Luszczek, Piotr},
  year          = {2024},
  month         = feb,
  eprint        = {2311.08316},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2311.08316},
  abstract      = {This paper develops and analyzes a new algorithm for QR decomposition with column pivoting (QRCP) of rectangular matrices with large row counts. The algorithm combines methods from randomized numerical linear algebra in a particularly careful way in order to accelerate both pivot decisions for the input matrix and the process of decomposing the pivoted matrix into the QR form. The source of the latter acceleration is a use of randomized preconditioning and CholeskyQR. Comprehensive analysis is provided in both exact and finite-precision arithmetic to characterize the algorithm{\textquoteright}s rank-revealing properties and its numerical stability granted probabilistic assumptions of the sketching operator. An implementation of the proposed algorithm is described and made available inside the open-source RandLAPACK library, which itself relies on RandBLAS - also available in open-source format. Experiments with this implementation on an Intel Xeon Gold 6248R CPU demonstrate order-of-magnitude speedups relative to LAPACK{\textquoteright}s standard function for QRCP, and comparable performance to a specialized algorithm for unpivoted QR of tall matrices, which lacks the strong rank-revealing properties of the proposed method.},
}

@techreport{murray2022randnla,
  title       = {Randomized Numerical Linear Algebra: A Perspective on the Field with an Eye to Software},
  author      = {Murray, Riley and Demmel, James and Mahoney, Michael W. and Erichson, N. Benjamin and Melnichenko, Maksim and Malik, Osman Asif and Grigori, Laura and Luszczek, Piotr and Derezi{\'n}ski, Micha{\l} and Lopes, Miles E. and Liang, Tianyu and Luo, Hengrui and Dongarra, Jack},
  institution = {University of California, Berkeley},
  type        = {{EECS} Technical Report},
  number      = {UCB/EECS-2022-258},
  year        = {2022},
  month       = nov,
  doi         = {10.48550/arXiv.2302.11474},
  url         = {https://www2.eecs.berkeley.edu/Pubs/TechRpts/2022/EECS-2022-258.html},
  keywords    = {Randomized algorithms},
  abstract    = {Randomized numerical linear algebra {\textendash} RandNLA, for short {\textendash} concerns the use of randomization as a resource to develop improved algorithms for large-scale linear algebra computations. The origins of contemporary RandNLA lay in theoretical computer science, where it blossomed from a simple idea: randomization provides an avenue for computing approximate solutions to linear algebra problems more efficiently than deterministic algorithms. This idea proved fruitful in and was largely driven by the development of scalable algorithms for machine learning and statistical data analysis applications. However, the true potential of RandNLA only came into focus once it began to integrate with the fields of numerical analysis and {\textquotedblleft}classical{\textquotedblright} numerical linear algebra. Through the efforts of many individuals, randomized algorithms have been developed that provide full control over the accuracy of their solutions and that are every bit as reliable as algorithms that might be found in libraries such as LAPACK. The spectrum of possibilities offered by RandNLA has created a virtuous cycle of contributions by numerical analysts, statisticians, theoretical computer scientists, and the machine learning community. Recent years have even seen the incorporation of certain RandNLA methods into MATLAB, the NAG Library, and NVIDIA{\textquoteright}s cuSOLVER. In view of these developments, we believe the time is ripe to accelerate the adoption of RandNLA in the scientific community. In particular, we believe the community stands to benefit significantly from a suitably defined {\textquotedblleft}RandBLAS{\textquotedblright} and {\textquotedblleft}RandLAPACK,{\textquotedblright} to serve as standard libraries for RandNLA, in much the same way that BLAS and LAPACK serve as standards for deterministic linear algebra. This monograph surveys the field of RandNLA as a step toward building meaningful RandBLAS and RandLAPACK libraries. Section 1 begins by setting scope and design principles for RandLAPACK and summarizing subsequent sections of the monograph. Section 2 focuses on RandBLAS, which is to be responsible for sketching. Details of functionality suitable for RandLAPACK are covered in the five sections that follow. Specifically, Sections 3 to 5 cover least squares and optimization, low-rank approximation, and other select problems that are well-understood in how they benefit from randomized algorithms. The remaining sections {\textendash} on statistical leverage scores (Section 6) and tensor computations (Section 7) {\textendash} read more like traditional surveys. The different flavor of these latter sections reflects how, in our assessment, the literature on these topics is still maturing. We provide a substantial amount of pseudo-code and supplementary material over the course of five appendices. Much of the pseudo-code has been tested via publicly available Matlab and Python implementations.},
}

@techreport{demmel2020ballistic,
  title       = {Prospectus for the Next {LAPACK} and {ScaLAPACK} Libraries: Basic {ALgebra} {LIbraries} for Sustainable Technology with Interdisciplinary Collaboration ({BALLISTIC})},
  author      = {Demmel, James and Dongarra, Jack and Langou, Julie and Langou, Julien and Luszczek, Piotr and Mahoney, Michael},
  institution = {University of Tennessee},
  type        = {{LAPACK} Working Note},
  number      = {297, ICL-UT-20-07},
  year        = {2020},
  month       = jul,
  abstract    = {The convergence of several unprecedented changes, including formidable new system design constraints and revolutionary levels of heterogeneity, has made it clear that much of the essential software infrastructure of computational science and engineering is, or will soon be, obsolete. Math libraries have historically been in the vanguard of software that must be adapted first to such changes, both because these low-level workhorses are so critical to the accuracy and performance of so many different types of applications, and because they have proved to be outstanding vehicles for finding and implementing solutions to the problems that novel architectures pose. Under the Basic ALgebra LIbraries for Sustainable Technology with Interdisciplinary Collaboration (BALLISTIC) project, the principal designers of the Linear Algebra PACKage (LAPACK) and the Scalable Linear Algebra PACKage (ScaLAPACK), the combination of which is abbreviated Sca/LAPACK, aim to enhance and update these libraries for the ongoing revolution in processor architecture, system design, and application requirements by incorporating them into a layered package of software components{\textemdash}the BALLISTIC ecosystem{\textemdash}that provides users seamless access to state-of-the-art solver implementations through familiar and improved Sca/LAPACK interfaces.},
}

@techreport{1194,
  title       = {Bidiagonal {SVD} Computation via an Associated Tridiagonal Eigenproblem},
  author      = {Marques, Osni and Demmel, James and Vasconcelos, Paulo B.},
  institution = {University of Tennessee},
  type        = {{LAPACK} Working Note},
  number      = {LAWN 295, ICL-UT-18-02},
  year        = {2018},
  month       = apr,
  abstract    = {In this paper, we present an algorithm for the singular value decomposition (SVD) of a bidiagonal matrix by means of the eigenpairs of an associated symmetric tridiagonal matrix. The algorithm is particularly suited for the computation of a subset of singular values and corresponding vectors. We focus on a sequential version of the algorithm, and discuss special cases and implementation details. We use a large set of bidiagonal matrices to assess the accuracy of the implementation in single and double precision, as well as to identify potential shortcomings. We show that the algorithm can be up to three orders of magnitude faster than existing algorithms, which are limited to the computation of a full SVD. We also show time comparisons of an implementation that uses the strategy discussed in the paper as a building block for the computation of the SVD of general matrices.},
}

@article{815,
  title    = {Communication-Avoiding Symmetric-Indefinite Factorization},
  author   = {Ballard, Grey and Becker, Dulceneia and Demmel, James and Dongarra, Jack and Druinsky, Alex and Peled, I. and Schwartz, Oded and Toledo, Sivan and Yamazaki, Ichitaro},
  journal  = {SIAM Journal on Matrix Analysis and Applications},
  volume   = {35},
  pages    = {1364--1406},
  year     = {2014},
  month    = jul,
  keywords = {plasma},
  abstract = {We describe and analyze a novel symmetric triangular factorization algorithm. The algorithm is essentially a block version of Aasen{\textquoteright}s triangular tridiagonalization. It factors a dense symmetric matrix {$A$} as the product {$A = P L T L^T P^T$} where {$P$} is a permutation matrix, {$L$} is lower triangular, and {$T$} is block tridiagonal and banded. The algorithm is the first symmetric-indefinite communication-avoiding factorization: it performs an asymptotically optimal amount of communication in a two-level memory hierarchy for almost any cache-line size. Adaptations of the algorithm to parallel computers are likely to be communication efficient as well; one such adaptation has been recently published. The current paper describes the algorithm, proves that it is numerically stable, and proves that it is communication optimal.},
}

@article{icl:691,
  title    = {Implementing a Blocked {Aasen}{\textquoteright}s Algorithm with a Dynamic Scheduler on Multicore Architectures},
  author   = {Yamazaki, Ichitaro and Becker, Dulceneia and Dongarra, Jack and Druinsky, Alex and Peled, I. and Toledo, Sivan and Ballard, Grey and Demmel, James and Schwartz, Oded},
  journal  = {IPDPS 2013 (submitted)},
  year     = {2013},
  address  = {Boston, MA},
  abstract = {Factorization of a dense symmetric indefinite matrix is a key computational kernel in many scientific and engineering simulations. However, there is no scalable factorization algorithm that takes advantage of the symmetry and guarantees numerical stability through pivoting at the same time. This is because such an algorithm exhibits many of the fundamental challenges in parallel programming like irregular data accesses and irregular task dependencies. In this paper, we address these challenges in a tiled implementation of a blocked Aasen{\textquoteright}s algorithm using a dynamic scheduler. To fully exploit the limited parallelism in this left-looking algorithm, we study several performance enhancing techniques; e.g., parallel reduction to update a panel, tall-skinny LU factorization algorithms to factorize the panel, and a parallel implementation of symmetric pivoting. Our performance results on up to 48 AMD Opteron processors demonstrate that our implementation obtains speedups of up to 2.8 over MKL, while losing only one or two digits in the computed residual norms.},
}

@inbook{747,
  title     = {{LAPACK}},
  author    = {Bai, Zhaojun and Demmel, James and Dongarra, Jack and Langou, Julien and Wang, Jenny},
  booktitle = {Handbook of Linear Algebra},
  edition   = {Second},
  publisher = {CRC Press},
  address   = {Boca Raton, FL},
  year      = {2013},
  isbn      = {9781466507289},
  abstract  = {With a substantial amount of new material, the Handbook of Linear Algebra, Second Edition provides comprehensive coverage of linear algebra concepts, applications, and computational software packages in an easy-to-use format. It guides you from the very elementary aspects of the subject to the frontiers of current research. Along with revisions and updates throughout, the second edition of this bestseller includes 20 new chapters.},
}

@article{icl:568,
  title   = {Accelerating Time-To-Solution for Computational Science and Engineering},
  author  = {Demmel, James and Dongarra, Jack and Fox, Armando and Williams, Sam and Volkov, Vasily and Yelick, Katherine},
  journal = {SciDAC Review},
  year    = {2009},
}

@misc{1352,
  title        = {Numerical Linear Algebra on Emerging Architectures: The {PLASMA} and {MAGMA} Projects},
  author       = {Agullo, Emmanuel and Demmel, James and Dongarra, Jack and Hadri, Bilel and Kurzak, Jakub and Langou, Julien and Ltaief, Hatem and Luszczek, Piotr and Nath, Rajib and Tomov, Stanimire and YarKhan, Asim and Volkov, Vasily},
  howpublished = {The International Conference for High Performance Computing, Networking, Storage, and Analysis ({SC09})},
  address      = {Portland, OR},
  year         = {2009},
  month        = nov,
}

@inproceedings{icl:486,
  title     = {Numerical Linear Algebra on Emerging Architectures: The {PLASMA} and {MAGMA} Projects},
  author    = {Agullo, Emmanuel and Demmel, James and Dongarra, Jack and Hadri, Bilel and Kurzak, Jakub and Langou, Julien and Ltaief, Hatem and Luszczek, Piotr and Tomov, Stanimire},
  booktitle = {Journal of Physics: Conference Series},
  volume    = {180},
  year      = {2009},
  keywords  = {magma, plasma},
}

@misc{1353,
  title        = {Enhancing the Performance of Dense Linear Algebra Solvers on {GPUs} (in the {MAGMA} Project)},
  author       = {Baboulin, Marc and Demmel, James and Dongarra, Jack and Tomov, Stanimire and Volkov, Vasily},
  howpublished = {The International Conference for High Performance Computing, Networking, Storage, and Analysis ({SC08})},
  address      = {Austin, TX},
  year         = {2008},
  month        = nov,
}

@techreport{icl:427,
  title       = {{HPCS} Library Study Effort},
  author      = {Dongarra, Jack and Demmel, James and Husbands, Parry and Luszczek, Piotr},
  institution = {University of Tennessee},
  type        = {Computer Science Technical Report},
  number      = {UT-CS-08-617},
  year        = {2008},
  month       = jan,
}

@inproceedings{icl:370,
  title     = {Prospectus for the Next {LAPACK} and {ScaLAPACK} Libraries},
  author    = {Demmel, James and Dongarra, Jack and Parlett, B. and Kahan, William and Gu, Ming and Bindel, David and Hida, Yozo and Li, Xiaoye and Marques, Osni and Riedy, Jason E. and Voemel, Christof and Langou, Julien and Luszczek, Piotr and Kurzak, Jakub and Buttari, Alfredo and Tomov, Stanimire},
  booktitle = {PARA 2006},
  address   = {Ume{\aa}, Sweden},
  year      = {2006},
  month     = jun,
}

@techreport{icl:260,
  title       = {{LAPACK} 2005 Prospectus: Reliable and Scalable Software for Linear Algebra Computations on High End Computers},
  author      = {Demmel, James and Dongarra, Jack},
  institution = {University of Tennessee},
  type        = {{LAPACK} Working Note},
  number      = {164},
  year        = {2005},
  month       = jan,
}

@inproceedings{icl:238,
  title     = {Self Adapting Linear Algebra Algorithms and Software},
  author    = {Demmel, James and Dongarra, Jack and Eijkhout, Victor and Fuentes, Erika and Petitet, Antoine and Vuduc, Rich and Whaley, Clint and Yelick, Katherine},
  booktitle = {IEEE Proceedings (to appear)},
  year      = {2004},
  keywords  = {salsa, sans},
}

@article{icl:125,
  title   = {An Updated Set of Basic Linear Algebra Subprograms ({BLAS})},
  author  = {Blackford, Susan and Demmel, James and Dongarra, Jack and Duff, Iain and Hammarling, Sven and Henry, Greg and Heroux, Michael and Kaufman, Linda and Lumsdaine, Andrew and Petitet, Antoine and Pozo, Roldan and Remington, Karin and Whaley, Clint},
  journal = {ACM Transactions on Mathematical Software},
  volume  = {28},
  number  = {2},
  pages   = {135--151},
  year    = {2002},
  month   = dec,
  doi     = {10.1145/567806.567807},
}

@article{icl:6,
  title   = {Basic Linear Algebra Subprograms ({BLAS})},
  author  = {Blackford, Susan and Demmel, James and Dongarra, Jack and Duff, Iain and Hammarling, Sven and Henry, Greg and Heroux, Michael and Kaufman, Linda and Lumsdaine, Andrew and Petitet, Antoine and Pozo, Roldan and Remington, Karin and Whaley, Clint},
  journal = {(an update), submitted to ACM TOMS},
  year    = {2001},
  month   = feb,
}

@book{icl:50,
  title     = {{LAPACK} Users{\textquoteright} Guide},
  author    = {Anderson, Ed and Bai, Zhaojun and Bischof, Christian and Blackford, Susan and Demmel, James and Dongarra, Jack and Du Croz, Jeremy and Greenbaum, Anne and Hammarling, Sven and McKenney, Alan and Sorensen, Danny},
  edition   = {Third},
  publisher = {Society for Industrial and Applied Mathematics},
  address   = {Philadelphia, PA},
  year      = {1999},
  month     = jan,
}