@techreport{dong2016magmabatched,
  title       = {MAGMA Batched: A Batched BLAS Approach for Small Matrix Factorizations and Applications on GPUs},
  type        = {Innovative Computing Laboratory Technical Report},
  number      = {ICL-UT-16-02},
  year        = {2016},
  month       = aug,
  institution = {University of Tennessee},
  abstract    = {A particularly challenging class of problems arising in many applications, called batched problems, involves linear algebra operations on many small-sized matrices. We propose and design batched BLAS (Basic Linear Algebra Subroutines) routines, Level-2 GEMV and Level-3 GEMM, to solve them. We illustrate how batched GEMV and GEMM can assist advanced batched factorizations (e.g., bi-diagonalization) and other BLAS routines (e.g., triangular solve) in achieving optimal performance on GPUs. Our solutions achieve speedups of up to 2.8--3{\texttimes} over the corresponding CUBLAS and MKL solutions, where comparison is possible. We illustrate the batched methodology on a real-world hydrodynamic application by reformulating its tensor operations into batched BLAS GEMV and GEMM operations. A 2.5{\texttimes} speedup and a 1.4{\texttimes} greenup (energy-efficiency improvement) are obtained by changing only 10\% of the code. We accelerated and scaled the application on the Titan supercomputer to 4,096 nodes.},
  author      = {Tingxing Dong and Azzam Haidar and Piotr Luszczek and Stanimire Tomov and Ahmad Abdelfattah and Jack Dongarra}
}
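
For context on the batched BLAS interface the abstract refers to, the following is a minimal illustrative sketch, not taken from the report, of one batched double-precision GEMM call through CUBLAS, the vendor baseline the report benchmarks against. The matrix size, batch count, and uninitialized matrix data are assumptions made for illustration only.

/* Minimal sketch, not from the report: a single batched DGEMM call via
 * CUBLAS. Sizes, batch count, and matrix contents are illustrative. */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(void) {
    const int n = 32;        /* small matrices, the regime batched BLAS targets */
    const int batch = 1000;  /* assumed batch count */
    const double alpha = 1.0, beta = 0.0;

    /* One device buffer per matrix; pointer arrays are built on the host first. */
    double **hA = malloc(batch * sizeof *hA);
    double **hB = malloc(batch * sizeof *hB);
    double **hC = malloc(batch * sizeof *hC);
    for (int i = 0; i < batch; ++i) {
        cudaMalloc((void **)&hA[i], (size_t)n * n * sizeof(double));
        cudaMalloc((void **)&hB[i], (size_t)n * n * sizeof(double));
        cudaMalloc((void **)&hC[i], (size_t)n * n * sizeof(double));
    }

    /* cublasDgemmBatched expects the pointer arrays themselves on the device. */
    double **dA, **dB, **dC;
    cudaMalloc((void **)&dA, batch * sizeof *dA);
    cudaMalloc((void **)&dB, batch * sizeof *dB);
    cudaMalloc((void **)&dC, batch * sizeof *dC);
    cudaMemcpy(dA, hA, batch * sizeof *hA, cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, batch * sizeof *hB, cudaMemcpyHostToDevice);
    cudaMemcpy(dC, hC, batch * sizeof *hC, cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    /* C_i = alpha * A_i * B_i + beta * C_i for every matrix i in the batch. */
    cublasStatus_t st = cublasDgemmBatched(
        handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
        &alpha, (const double *const *)dA, n,
                (const double *const *)dB, n,
        &beta,  dC, n, batch);
    printf("cublasDgemmBatched status: %d\n", (int)st);

    cublasDestroy(handle);
    /* Per-matrix and pointer-array frees omitted for brevity. */
    return 0;
}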