@article {985,
	title = {Performance optimization of Sparse Matrix-Vector Multiplication for multi-component {PDE}-based applications using {GPUs}},
	journal = {Concurrency and Computation: Practice and Experience},
	volume = {28},
	year = {2016},
	month = may,
	pages = {3447--3465},
	abstract = {Simulations of many multi-component PDE-based applications, such as petroleum reservoirs or reacting flows, are dominated by the solution, on each time step and within each Newton step, of large sparse linear systems. The standard solver is a preconditioned Krylov method. Along with application of the preconditioner, memory-bound Sparse Matrix-Vector Multiplication (SpMV) is the most time-consuming operation in such solvers. Multi-species models produce Jacobians with a dense block structure, where the block size can be as large as a few dozen. Failing to exploit this dense block structure vastly underutilizes hardware capable of delivering high performance on dense BLAS operations. This paper presents a GPU-accelerated SpMV kernel for block-sparse matrices. Dense matrix-vector multiplications within the sparse-block structure leverage optimization techniques from the KBLAS library, a high performance library for dense BLAS kernels. The design ideas of KBLAS can be applied to block-sparse matrices. Furthermore, a technique is proposed to balance the workload among thread blocks when there are large variations in the lengths of nonzero rows. Multi-GPU performance is highlighted. The proposed SpMV kernel outperforms existing state-of-the-art implementations using matrices with real structures from different applications.},
	doi = {10.1002/cpe.3874},
	url = {http://onlinelibrary.wiley.com/doi/10.1002/cpe.3874/full},
	author = {Ahmad Abdelfattah and Hatem Ltaief and David Keyes and Jack Dongarra}
}
@article {698,
	title = {Achieving numerical accuracy and high performance using recursive tile {LU} factorization with partial pivoting},
	journal = {Concurrency and Computation: Practice and Experience},
	volume = {26},
	year = {2014},
	month = may,
	pages = {1408--1431},
	abstract = {The LU factorization is an important numerical algorithm for solving systems of linear equations in science and engineering and is a characteristic of many dense linear algebra computations. For example, it has become the de facto numerical algorithm implemented within the LINPACK benchmark to rank the most powerful supercomputers in the world, collected by the TOP500 website. Multicore processors continue to present challenges to the development of fast and robust numerical software due to the increasing levels of hardware parallelism and widening gap between core and memory speeds. In this context, the difficulty in developing new algorithms for the scientific community resides in the combination of two goals: achieving high performance while maintaining the accuracy of the numerical algorithm. This paper proposes a new approach for computing the LU factorization in parallel on multicore architectures, which not only improves the overall performance but also sustains the numerical quality of the standard LU factorization algorithm with partial pivoting. While the update of the trailing submatrix is computationally intensive and highly parallel, the inherently problematic portion of the LU factorization is the panel factorization due to its memory-bound characteristic as well as the atomicity of selecting the appropriate pivots. Our approach uses a parallel fine-grained recursive formulation of the panel factorization step and implements the update of the trailing submatrix with the tile algorithm. Based on conflict-free partitioning of the data and lockless synchronization mechanisms, our implementation lets the overall computation flow naturally without contention. The dynamic runtime system called QUARK is then able to schedule tasks with heterogeneous granularities and to transparently introduce algorithmic lookahead. The performance results of our implementation are competitive compared to the currently available software packages and libraries. 
For example, it is up to 40\% faster when compared to the equivalent Intel MKL routine and up to threefold faster than LAPACK with multithreaded Intel MKL BLAS.},
	keywords = {factorization, parallel linear algebra, plasma, recursion, shared memory synchronization, threaded parallelism},
	doi = {10.1002/cpe.3110},
	url = {http://doi.wiley.com/10.1002/cpe.3110},
	author = {Jack Dongarra and Mathieu Faverge and Hatem Ltaief and Piotr Luszczek}
}
@article {759,
	title = {High Performance Bidiagonal Reduction using Tile Algorithms on Homogeneous Multicore Architectures},
	journal = {ACM Transactions on Mathematical Software (TOMS)},
	volume = {39},
	number = {16},
	year = {2013},
	abstract = {This article presents a new high-performance bidiagonal reduction (BRD) for homogeneous multicore architectures. This article is an extension of the high-performance tridiagonal reduction implemented by the same authors [Luszczek et al., IPDPS 2011] to the BRD case. The BRD is the first step toward computing the singular value decomposition of a matrix, which is one of the most important algorithms in numerical linear algebra due to its broad impact in computational science. The high performance of the BRD described in this article comes from the combination of four important features: (1) tile algorithms with tile data layout, which provide an efficient data representation in main memory; (2) a two-stage reduction approach that allows to cast most of the computation during the first stage (reduction to band form) into calls to Level 3 BLAS and reduces the memory traffic during the second stage (reduction from band to bidiagonal form) by using high-performance kernels optimized for cache reuse; (3) a data dependence translation layer that maps the general algorithm with column-major data layout into the tile data layout; and (4) a dynamic runtime system that efficiently schedules the newly implemented kernels across the processing units and ensures that the data dependencies are not violated. A detailed analysis is provided to understand the critical impact of the tile size on the total execution time, which also corresponds to the matrix bandwidth size after the reduction of the first stage. The performance results show a significant improvement over currently established alternatives. The new high-performance BRD achieves up to a 30-fold speedup on a 16-core Intel Xeon machine with a 12000{\texttimes} 12000 matrix size against the state-of-the-art open source and commercial numerical software packages, namely LAPACK, compiled with optimized and multithreaded BLAS from MKL as well as Intel MKL version 10.2.},
	keywords = {algorithms, bidiagonal reduction, bulge chasing, data translation layer, dynamic scheduling, high performance kernels, performance, tile algorithms, two-stage approach},
	doi = {10.1145/2450153.2450154},
	author = {Hatem Ltaief and Piotr Luszczek and Jack Dongarra}
}
@inproceedings {icl:695,
	title = {A Comprehensive Study of Task Coalescing for Selecting Parallelism Granularity in a Two-Stage Bidiagonal Reduction},
	booktitle = {IPDPS 2012},
	year = {2012},
	month = may,
	address = {Shanghai, China},
	author = {Azzam Haidar and Hatem Ltaief and Piotr Luszczek and Jack Dongarra}
}
@inproceedings {icl:711,
	title = {Energy Footprint of Advanced Dense Numerical Linear Algebra using Tile Algorithms on Multicore Architecture},
	year = {2012},
	month = nov,
	address = {Xiangtan, Hunan, China},
	author = {Jack Dongarra and Hatem Ltaief and Piotr Luszczek and Vincent M Weaver}
}
@article {icl:707,
	title = {Enhancing Parallelism of Tile Bidiagonal Transformation on Multicore Architectures using Tree Reduction},
	journal = {Lecture Notes in Computer Science},
	volume = {7203},
	year = {2012},
	month = sep,
	pages = {661--670},
	author = {Hatem Ltaief and Piotr Luszczek and Jack Dongarra}
}
@misc {icl:730,
	title = {Matrices Over Runtime Systems at Exascale},
	howpublished = {Poster, Supercomputing {\textquoteright}12},
	year = {2012},
	month = nov,
	address = {Salt Lake City, Utah},
	author = {Emmanuel Agullo and George Bosilca and Cedric Castagn{\`e}de and Jack Dongarra and Hatem Ltaief and Stanimire Tomov}
}
@inproceedings {icl:709,
	title = {Optimizing Memory-Bound Numerical Kernels on {GPU} Hardware Accelerators},
	booktitle = {VECPAR 2012},
	year = {2012},
	month = jul,
	address = {Kobe, Japan},
	author = {Ahmad Abdelfattah and Jack Dongarra and David Keyes and Hatem Ltaief}
}
@inproceedings {icl:710,
	title = {Power Profiling of {Cholesky} and {QR} Factorizations on Distributed Memory Systems},
	year = {2012},
	month = sep,
	address = {Hamburg, Germany},
	author = {George Bosilca and Jack Dongarra and Hatem Ltaief}
}
@article {icl:726,
	title = {Toward High Performance Divide and Conquer Eigensolver for Dense Symmetric Matrices},
	journal = {SIAM Journal on Scientific Computing},
	year = {2012},
	month = jul,
	note = {Accepted},
	author = {Azzam Haidar and Hatem Ltaief and Jack Dongarra}
}
@techreport {icl:660,
	title = {Achieving Numerical Accuracy and High Performance using Recursive Tile {LU} Factorization},
	number = {ICL-UT-11-08},
	year = {2011},
	month = sep,
	keywords = {plasma, quark},
	author = {Jack Dongarra and Mathieu Faverge and Hatem Ltaief and Piotr Luszczek}
}
@techreport {icl:631,
	title = {Analysis of Dynamically Scheduled Tile Algorithms for Dense Linear Algebra on Multicore Architectures},
	year = {2011},
	month = mar,
	keywords = {plasma, quark},
	author = {Azzam Haidar and Hatem Ltaief and Asim YarKhan and Jack Dongarra}
}
@inproceedings {icl:611,
	title = {Exploiting Fine-Grain Parallelism in Recursive {LU} Factorization},
	number = {ICL-UT-11-04},
	year = {2011},
	month = apr,
	address = {Gent, Belgium},
	keywords = {plasma},
	author = {Jack Dongarra and Mathieu Faverge and Hatem Ltaief and Piotr Luszczek}
}
@inproceedings {icl:676,
	title = {Flexible Development of Dense Linear Algebra Algorithms on Massively Parallel Architectures with {DPLASMA}},
	year = {2011},
	month = may,
	pages = {1432--1441},
	publisher = {IEEE},
	address = {Anchorage, Alaska, USA},
	keywords = {dague, dplasma, parsec},
	author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Mathieu Faverge and Azzam Haidar and Thomas Herault and Jakub Kurzak and Julien Langou and Pierre Lemarinier and Hatem Ltaief and Piotr Luszczek and Asim YarKhan and Jack Dongarra}
}
@techreport {icl:629,
	title = {High Performance Bidiagonal Reduction using Tile Algorithms on Homogeneous Multicore Architectures},
	year = {2011},
	month = may,
	keywords = {plasma},
	author = {Hatem Ltaief and Piotr Luszczek and Jack Dongarra}
}
@inproceedings {icl:658,
	title = {High Performance Matrix Inversion Based on {LU} Factorization for Multicore Architectures},
	year = {2011},
	month = nov,
	address = {Seattle, WA},
	author = {Jack Dongarra and Mathieu Faverge and Hatem Ltaief and Piotr Luszczek}
}
@incollection {icl:653,
	title = {A Hybridization Methodology for High-Performance Linear Algebra Software for {GPUs}},
	booktitle = {{GPU} Computing Gems, Jade Edition},
	volume = {2},
	year = {2011},
	pages = {473--484},
	publisher = {Elsevier},
	keywords = {magma, morse},
	author = {Emmanuel Agullo and Cedric Augonnet and Jack Dongarra and Hatem Ltaief and Raymond Namyst and Samuel Thibault and Stanimire Tomov},
	editor = {Wen-mei W. Hwu}
}
@inproceedings {icl:599,
	title = {{LU} Factorization for Accelerator-Based Systems},
	booktitle = {IEEE/ACS AICCSA 2011},
	year = {2011},
	month = dec,
	address = {Sharm-El-Sheikh, Egypt},
	keywords = {magma, morse},
	author = {Emmanuel Agullo and Cedric Augonnet and Jack Dongarra and Mathieu Faverge and Julien Langou and Hatem Ltaief and Stanimire Tomov}
}
@inproceedings {icl:657,
	title = {Parallel Reduction to Condensed Forms for Symmetric Eigenvalue Problems using Aggregated Fine-Grained and Memory-Aware Kernels},
	year = {2011},
	month = nov,
	address = {Seattle, WA},
	keywords = {plasma, quark},
	author = {Azzam Haidar and Hatem Ltaief and Jack Dongarra}
}
@techreport {icl:627,
	title = {Parallel Reduction to Condensed Forms for Symmetric Eigenvalue Problems using Aggregated Fine-Grained and Memory-Aware Kernels},
	year = {2011},
	month = aug,
	author = {Azzam Haidar and Hatem Ltaief and Jack Dongarra}
}
@inproceedings {icl:621,
	title = {Profiling High Performance Dense Linear Algebra Algorithms on Multicore Architectures for Power and Energy Efficiency},
	year = {2011},
	month = sep,
	address = {Hamburg, Germany},
	keywords = {mumi},
	author = {Hatem Ltaief and Piotr Luszczek and Jack Dongarra}
}
@article {icl:604,
	title = {Toward High Performance Divide and Conquer Eigensolver for Dense Symmetric Matrices},
	journal = {SIAM Journal on Scientific Computing},
	year = {2011},
	note = {Submitted},
	author = {Azzam Haidar and Hatem Ltaief and Jack Dongarra}
}
@inproceedings {icl:592,
	title = {Two-stage Tridiagonal Reduction for Dense Symmetric Matrices using Tile Algorithms on Multicore Architectures},
	year = {2011},
	month = may,
	address = {Anchorage, AK},
	author = {Piotr Luszczek and Hatem Ltaief and Jack Dongarra}
}
@article {icl:533,
	title = {Analysis of Dynamically Scheduled Tile Algorithms for Dense Linear Algebra on Multicore Architectures},
	journal = {Concurrency and Computation: Practice and Experience},
	year = {2010},
	month = nov,
	note = {Submitted},
	keywords = {plasma, quark},
	author = {Azzam Haidar and Hatem Ltaief and Asim YarKhan and Jack Dongarra}
}
@inproceedings {icl:523,
	title = {Dense Linear Algebra Solvers for Multicore with {GPU} Accelerators},
	year = {2010},
	pages = {1--8},
	address = {Atlanta, GA},
	abstract = {Solving dense linear systems of equations is a fundamental problem in scientific computing. Numerical simulations involving complex systems represented in terms of unknown variables and relations between them often lead to linear systems of equations that must be solved as fast as possible. We describe current efforts toward the development of these critical solvers in the area of dense linear algebra (DLA) for multicore with GPU accelerators. We describe how to code/develop solvers to effectively use the high computing power available in these new and emerging hybrid architectures. The approach taken is based on hybridization techniques in the context of Cholesky, LU, and QR factorizations. We use a high-level parallel programming model and leverage existing software infrastructure, e.g. optimized BLAS for CPU and GPU, and LAPACK for sequential CPU processing. Included also are architecture and algorithm-specific optimizations for standard solvers as well as mixed-precision iterative refinement solvers. The new algorithms, depending on the hardware configuration and routine parameters, can lead to orders of magnitude acceleration when compared to the same algorithms on standard multicore architectures that do not contain GPU accelerators. The newly developed DLA solvers are integrated and freely available through the MAGMA library.},
	doi = {10.1109/IPDPSW.2010.5470941},
	author = {Stanimire Tomov and Rajib Nath and Hatem Ltaief and Jack Dongarra}
}
@techreport {icl:563,
	title = {Distributed Dense Numerical Linear Algebra Algorithms on Massively Parallel Architectures: {DPLASMA}},
	year = {2010},
	month = sep,
	keywords = {dague, dplasma, parsec, plasma},
	author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Mathieu Faverge and Azzam Haidar and Thomas Herault and Jakub Kurzak and Julien Langou and Pierre Lemarinier and Hatem Ltaief and Piotr Luszczek and Asim YarKhan and Jack Dongarra}
}
@techreport {icl:529,
	title = {Distributed-Memory Task Execution and Dependence Tracking within {DAGuE} and the {DPLASMA} Project},
	number = {ICL-UT-10-02},
	year = {2010},
	keywords = {dague, plasma},
	author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Mathieu Faverge and Azzam Haidar and Thomas Herault and Jakub Kurzak and Julien Langou and Pierre Lemarinier and Hatem Ltaief and Piotr Luszczek and Asim YarKhan and Jack Dongarra}
}
@techreport {icl:585,
	title = {Faster, Cheaper, Better - A Hybridization Methodology to Develop Linear Algebra Software for {GPUs}},
	number = {230},
	year = {2010},
	keywords = {magma, morse},
	author = {Emmanuel Agullo and Cedric Augonnet and Jack Dongarra and Hatem Ltaief and Raymond Namyst and Samuel Thibault and Stanimire Tomov}
}
@article {icl:526,
	title = {Hybrid Multicore {Cholesky} Factorization with Multiple {GPU} Accelerators},
	journal = {IEEE Transactions on Parallel and Distributed Systems},
	year = {2010},
	month = mar,
	note = {Submitted},
	keywords = {magma, plasma},
	author = {Hatem Ltaief and Stanimire Tomov and Rajib Nath and Jack Dongarra}
}
@article {icl:569,
	title = {Parallel Band Two-Sided Matrix Bidiagonalization for Multicore Architectures},
	journal = {IEEE Transactions on Parallel and Distributed Systems},
	year = {2010},
	month = apr,
	pages = {417--423},
	author = {Hatem Ltaief and Jakub Kurzak and Jack Dongarra}
}
@inproceedings {icl:577,
	title = {{QR} Factorization on a Multicore Node Enhanced with Multiple {GPU} Accelerators},
	number = {ICL-UT-10-04},
	year = {2010},
	month = oct,
	address = {Anchorage, AK},
	keywords = {magma, morse, plasma},
	author = {Emmanuel Agullo and Cedric Augonnet and Jack Dongarra and Mathieu Faverge and Hatem Ltaief and Samuel Thibault and Stanimire Tomov}
}
@inproceedings {icl:521,
	title = {A Scalable High Performant {Cholesky} Factorization for Multicore with {GPU} Accelerators},
	booktitle = {Proc. of VECPAR{\textquoteright}10},
	year = {2010},
	month = jun,
	address = {Berkeley, CA},
	note = {To appear},
	keywords = {magma, plasma},
	author = {Hatem Ltaief and Stanimire Tomov and Rajib Nath and Peng Du and Jack Dongarra}
}
@techreport {icl:530,
	title = {Scalable Tile Communication-Avoiding {QR} Factorization on Multicore Cluster Systems},
	number = {10-653},
	internal-note = {Report-number prefix looks truncated in the exported record (it began with a stray en dash); confirm the full number against the report itself},
	year = {2010},
	month = apr,
	keywords = {plasma},
	author = {Fengguang Song and Hatem Ltaief and Bilel Hadri and Jack Dongarra}
}
@inproceedings {icl:559,
	title = {Scalable Tile Communication-Avoiding {QR} Factorization on Multicore Cluster Systems},
	booktitle = {SC{\textquoteright}10},
	year = {2010},
	month = nov,
	publisher = {ACM SIGARCH/ IEEE Computer Society},
	address = {New Orleans, LA},
	keywords = {plasma},
	author = {Fengguang Song and Hatem Ltaief and Bilel Hadri and Jack Dongarra}
}
@misc {1362,
	title = {Scheduling {Cholesky} Factorization on Multicore Architectures with {GPU} Accelerators},
	howpublished = {Poster, 2010 Symposium on Application Accelerators in High-Performance Computing (SAAHPC{\textquoteright}10)},
	year = {2010},
	month = jul,
	address = {Knoxville, TN},
	author = {Emmanuel Agullo and Cedric Augonnet and Jack Dongarra and Hatem Ltaief and Raymond Namyst and Rajib Nath and Jean Roman and Samuel Thibault and Stanimire Tomov}
}
@article {icl:540,
	title = {Scheduling Dense Linear Algebra Operations on Multicore Processors},
	journal = {Concurrency and Computation: Practice and Experience},
	volume = {22},
	number = {1},
	year = {2010},
	month = jan,
	pages = {15--44},
	keywords = {gridpac, plasma},
	author = {Jakub Kurzak and Hatem Ltaief and Jack Dongarra and Rosa M. Badia}
}
@article {icl:473,
	title = {Scheduling Two-sided Transformations using Tile Algorithms on Multicore Architectures},
	journal = {Journal of Scientific Computing},
	volume = {18},
	number = {1},
	year = {2010},
	pages = {33--50},
	keywords = {plasma},
	author = {Hatem Ltaief and Jakub Kurzak and Jack Dongarra and Rosa M. Badia}
}
@inproceedings {icl:488,
	title = {Comparative Study of One-Sided Factorizations with Multiple Software Packages on Multi-Core Hardware},
	year = {2009},
	author = {Emmanuel Agullo and Bilel Hadri and Hatem Ltaief and Jack Dongarra}
}
@inproceedings {icl:518,
	title = {Dependency-Driven Scheduling of Dense Matrix Factorizations on Shared-Memory Systems},
	booktitle = {PPAM 2009},
	year = {2009},
	month = sep,
	address = {Poland},
	author = {Jakub Kurzak and Hatem Ltaief and Jack Dongarra and Rosa M. Badia}
}
@article {icl:524,
	title = {Enhancing Parallelism of Tile {QR} Factorization for Multicore Architectures},
	journal = {IEEE Transactions on Parallel and Distributed Systems},
	year = {2009},
	month = dec,
	note = {Submitted},
	keywords = {plasma},
	author = {Bilel Hadri and Hatem Ltaief and Emmanuel Agullo and Jack Dongarra}
}
@misc {1352,
	title = {Numerical Linear Algebra on Emerging Architectures: The {PLASMA} and {MAGMA} Projects},
	howpublished = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC09)},
	year = {2009},
	month = nov,
	address = {Portland, OR},
	author = {Emmanuel Agullo and James Demmel and Jack Dongarra and Bilel Hadri and Jakub Kurzak and Julien Langou and Hatem Ltaief and Piotr Luszczek and Rajib Nath and Stanimire Tomov and Asim YarKhan and Vasily Volkov}
}
@inproceedings {icl:486,
	title = {Numerical Linear Algebra on Emerging Architectures: The {PLASMA} and {MAGMA} Projects},
	volume = {180},
	year = {2009},
	keywords = {magma, plasma},
	author = {Emmanuel Agullo and James Demmel and Jack Dongarra and Bilel Hadri and Jakub Kurzak and Julien Langou and Hatem Ltaief and Piotr Luszczek and Stanimire Tomov}
}
@misc {1365,
	title = {Numerical Linear Algebra on Hybrid Architectures: Recent Developments in the {MAGMA} Project},
	howpublished = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC09)},
	year = {2009},
	month = nov,
	address = {Portland, Oregon},
	author = {Rajib Nath and Jack Dongarra and Stanimire Tomov and Hatem Ltaief and Peng Du}
}
@article {icl:489,
	title = {Parallel Band Two-Sided Matrix Bidiagonalization for Multicore Architectures},
	journal = {IEEE Transactions on Parallel and Distributed Systems},
	year = {2009},
	month = may,
	note = {To appear},
	author = {Hatem Ltaief and Jakub Kurzak and Jack Dongarra}
}
@techreport {icl:495,
	title = {Scheduling Linear Algebra Operations on Multicore Processors},
	year = {2009},
	author = {Jakub Kurzak and Hatem Ltaief and Jack Dongarra and Rosa M. Badia}
}
@article {icl:510,
	title = {Scheduling Linear Algebra Operations on Multicore Processors},
	journal = {Concurrency and Computation: Practice and Experience},
	year = {2009},
	note = {To appear},
	keywords = {plasma},
	author = {Jakub Kurzak and Hatem Ltaief and Jack Dongarra and Rosa M. Badia}
}
@techreport {icl:487,
	title = {Tall and Skinny {QR} Matrix Factorization Using Tile Algorithms on Multicore Architectures},
	number = {ICL-UT-09-03},
	year = {2009},
	month = sep,
	keywords = {plasma},
	author = {Bilel Hadri and Hatem Ltaief and Emmanuel Agullo and Jack Dongarra}
}
@inproceedings {icl:522,
	title = {Tile {QR} Factorization with Parallel Panel Processing for Multicore Architectures},
	year = {2009},
	month = dec,
	address = {Atlanta, GA},
	keywords = {plasma},
	author = {Bilel Hadri and Hatem Ltaief and Emmanuel Agullo and Jack Dongarra}
}
@techreport {icl:431,
	title = {Parallel Block {Hessenberg} Reduction using Algorithms-By-Tiles for Multicore Architectures Revisited},
	year = {2008},
	month = aug,
	keywords = {plasma},
	author = {Hatem Ltaief and Jakub Kurzak and Jack Dongarra}
}