@article{anzt2021ginkgo,
  title     = {Ginkgo: A Sparse Linear Algebra Library for HPC},
  year      = {2021},
  month     = {2021-04},
  publisher = {2021 ECP Annual Meeting},
  author    = {Hartwig Anzt and Natalie Beams and Terry Cojean and Fritz G{\"o}bel and Thomas Gr{\"u}tzmacher and Aditya Kashi and Pratik Nayak and Tobias Ribizel and Yuhsiang M. Tsai}
}

@conference{anzt2020a100,
  title        = {Evaluating the Performance of NVIDIA{\textquoteright}s A100 Ampere GPU for Sparse and Batched Computations},
  booktitle    = {2020 IEEE/ACM Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS)},
  year         = {2020},
  month        = {2020-11},
  publisher    = {IEEE},
  organization = {IEEE},
  abstract     = {GPU accelerators have become an important backbone for scientific high-performance computing, and the performance advances obtained from adopting new GPU hardware are significant. In this paper we take a first look at NVIDIA{\textquoteright}s newest server-line GPU, the A100 architecture, part of the Ampere generation. Specifically, we assess its performance for sparse and batch computations, as these routines are relied upon in many scientific applications, and compare to the performance of the previous GPU generation.},
  keywords     = {Batched linear algebra, NVIDIA A100 GPU, sparse linear algebra, Sparse Matrix Vector Product},
  author       = {Hartwig Anzt and Yuhsiang M. Tsai and Ahmad Abdelfattah and Terry Cojean and Jack Dongarra}
}

@article{anzt2020loadbalancing,
  title    = {Load-Balancing Sparse Matrix Vector Product Kernels on GPUs},
  journal  = {ACM Transactions on Parallel Computing},
  volume   = {7},
  year     = {2020},
  month    = {2020-03},
  abstract = {Efficient processing of irregular matrices on Single Instruction, Multiple Data (SIMD)-type architectures is a persistent challenge. Resolving it requires innovations in the development of data formats, computational techniques, and implementations that strike a balance between thread divergence, which is inherent for irregular matrices, and padding, which alleviates the performance-detrimental thread divergence but introduces artificial overheads. To this end, in this article, we address the challenge of designing high-performance sparse matrix-vector product (SpMV) kernels for NVIDIA Graphics Processing Units (GPUs). We present a compressed sparse row (CSR) format suitable for unbalanced matrices. We also provide a load-balancing kernel for the coordinate (COO) matrix format and extend it to a hybrid algorithm that stores part of the matrix in the SIMD-friendly Ellpack (ELL) format. The ratio between the ELL and the COO part is determined using a theoretical analysis of the nonzeros-per-row distribution. For the over 2,800 test matrices available in the SuiteSparse matrix collection, we compare the performance against SpMV kernels provided by NVIDIA{\textquoteright}s cuSPARSE library and a heavily tuned sliced ELL (SELL-P) kernel that prevents unnecessary padding by considering the irregular matrices as a combination of matrix blocks stored in ELL format.},
  doi      = {10.1145/3380930},
  author   = {Hartwig Anzt and Terry Cojean and Yen-Chen Chen and Jack Dongarra and Goran Flegar and Pratik Nayak and Stanimire Tomov and Yuhsiang M. Tsai and Weichung Wang}
}

@conference{tsai2020amdnvidia,
  title        = {Sparse Linear Algebra on AMD and NVIDIA GPUs{\textemdash}The Race is On},
  booktitle    = {ISC High Performance},
  year         = {2020},
  month        = {2020-06},
  publisher    = {Springer},
  organization = {Springer},
  abstract     = {Efficiently processing sparse matrices is a central and performance-critical part of many scientific simulation codes.
                  Recognizing the adoption of manycore accelerators in HPC, we evaluate in this paper the performance of the currently best sparse matrix-vector product (SpMV) implementations on high-end GPUs from AMD and NVIDIA. Specifically, we optimize SpMV kernels for the CSR, COO, ELL, and HYB formats, taking the hardware characteristics of the latest GPU technologies into account. We compare the performance of our kernels for 2,800 test matrices against AMD{\textquoteright}s hipSPARSE library and NVIDIA{\textquoteright}s cuSPARSE library, and ultimately assess how the GPU technologies from AMD and NVIDIA compare in terms of SpMV performance.},
  keywords     = {AMD, GPUs, NVIDIA, sparse matrix vector product (SpMV)},
  doi          = {10.1007/978-3-030-50743-5_16},
  author       = {Yuhsiang M. Tsai and Terry Cojean and Hartwig Anzt}
}

@conference{1318,
  title        = {Towards Continuous Benchmarking},
  booktitle    = {Platform for Advanced Scientific Computing Conference (PASC 2019)},
  year         = {2019},
  month        = {2019-06},
  publisher    = {ACM Press},
  organization = {ACM Press},
  address      = {Zurich, Switzerland},
  abstract     = {We present an automated performance evaluation framework that enables an automated workflow for testing and performance evaluation of software libraries. Integrating this component into an ecosystem enables sustainable software development, as a community effort, via a web application for interactively evaluating the performance of individual software components. The performance evaluation tool is based exclusively on web technologies, which removes the burden of downloading performance data or installing additional software. We employ this framework for the Ginkgo software ecosystem, but the framework can be used with essentially any software project, including the comparison between different software libraries. The Continuous Integration (CI) framework of Ginkgo is also extended to automatically run a benchmark suite on predetermined HPC systems, store the state of the machine and the environment along with the compiled binaries, and collect results in a publicly accessible performance data repository based on Git. The Ginkgo Performance Explorer (GPE) can be used to retrieve the performance data from the repository and visualize it in a web browser. GPE also implements an interface that allows users to write scripts, archived in a Git repository, to extract particular data, compute particular metrics, and visualize them in many different formats (as specified by the script). The combination of these approaches creates a workflow that enables performance reproducibility and software sustainability of scientific software. In this paper, we present example scripts that extract and visualize performance data for Ginkgo{\textquoteright}s SpMV kernels, allowing users to identify the optimal kernel for specific problem characteristics.},
  isbn         = {9781450367707},
  doi          = {10.1145/3324989.3325719},
  author       = {Hartwig Anzt and Yen-Chen Chen and Terry Cojean and Jack Dongarra and Goran Flegar and Pratik Nayak and Enrique S. Quintana-Orti and Yuhsiang M. Tsai and Weichung Wang}
}
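
Two of the entries above (the load-balancing SpMV article and the AMD/NVIDIA comparison) rely on splitting a sparse matrix into a SIMD-friendly ELL part and a COO overflow part. The C++ sketch below only illustrates that hybrid idea on the CPU: the names (Csr, Ell, Coo, split_hybrid, hybrid_spmv), the fixed per-row threshold k, and the toy matrix are hypothetical stand-ins and do not reflect Ginkgo's actual data structures or the papers' threshold analysis, which derives the ELL/COO ratio from the nonzeros-per-row distribution.

// hybrid_split_sketch.cpp -- illustrative only; not Ginkgo's HYB implementation.
#include <cstddef>
#include <iostream>
#include <vector>

// Minimal CSR container for the sketch.
struct Csr {
    std::vector<std::size_t> row_ptr;  // size num_rows + 1
    std::vector<std::size_t> col_idx;
    std::vector<double> val;
};

// ELL part: at most k entries per row, padded to a rectangular layout.
struct Ell {
    std::size_t k = 0;                 // entries kept per row
    std::vector<std::size_t> col_idx;  // num_rows * k, padded with column 0
    std::vector<double> val;           // num_rows * k, padded with 0.0
};

// COO part: overflow entries of rows longer than k.
struct Coo {
    std::vector<std::size_t> row_idx, col_idx;
    std::vector<double> val;
};

// Split a CSR matrix at a per-row threshold k. In the papers the ELL/COO ratio
// is derived from the nonzeros-per-row distribution; here k is simply given.
void split_hybrid(const Csr& a, std::size_t num_rows, std::size_t k, Ell& ell, Coo& coo) {
    ell.k = k;
    ell.col_idx.assign(num_rows * k, 0);
    ell.val.assign(num_rows * k, 0.0);
    for (std::size_t row = 0; row < num_rows; ++row) {
        std::size_t stored = 0;
        for (std::size_t nz = a.row_ptr[row]; nz < a.row_ptr[row + 1]; ++nz) {
            if (stored < k) {  // first k entries of the row go to ELL
                ell.col_idx[row * k + stored] = a.col_idx[nz];
                ell.val[row * k + stored] = a.val[nz];
                ++stored;
            } else {           // overflow goes to COO
                coo.row_idx.push_back(row);
                coo.col_idx.push_back(a.col_idx[nz]);
                coo.val.push_back(a.val[nz]);
            }
        }
    }
}

// y = A * x, computed as the ELL contribution plus the COO contribution.
void hybrid_spmv(const Ell& ell, const Coo& coo, std::size_t num_rows,
                 const std::vector<double>& x, std::vector<double>& y) {
    y.assign(num_rows, 0.0);
    for (std::size_t row = 0; row < num_rows; ++row) {
        for (std::size_t j = 0; j < ell.k; ++j) {
            y[row] += ell.val[row * ell.k + j] * x[ell.col_idx[row * ell.k + j]];
        }
    }
    for (std::size_t nz = 0; nz < coo.val.size(); ++nz) {
        y[coo.row_idx[nz]] += coo.val[nz] * x[coo.col_idx[nz]];
    }
}

int main() {
    // 3x3 example: row 0 has 3 nonzeros, rows 1 and 2 have 1 each; threshold k = 1.
    Csr a{{0, 3, 4, 5}, {0, 1, 2, 1, 2}, {1, 2, 3, 4, 5}};
    Ell ell; Coo coo;
    split_hybrid(a, 3, 1, ell, coo);
    std::vector<double> x{1, 1, 1}, y;
    hybrid_spmv(ell, coo, 3, x, y);
    std::cout << y[0] << " " << y[1] << " " << y[2] << "\n";  // expected: 6 4 5
}

In the load-balancing article this decomposition is executed by GPU kernels, with the overflow part handled by the load-balanced COO kernel that the hybrid algorithm extends.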
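
Several of the entries benchmark SpMV kernels shipped with the Ginkgo library itself. As a rough orientation only, the sketch below shows what a single SpMV through Ginkgo's LinOp interface can look like; the file names are placeholders, and the exact calling convention varies between Ginkgo releases (older versions require wrapping the smart-pointer arguments in gko::lend() for apply and write).

// spmv_example.cpp -- a rough sketch of one SpMV via Ginkgo; version-dependent.
#include <ginkgo/ginkgo.hpp>
#include <fstream>

int main() {
    // Run on the reference (sequential CPU) executor; a CUDA or HIP executor
    // could be substituted to target NVIDIA or AMD GPUs.
    auto exec = gko::ReferenceExecutor::create();

    // Read the matrix and the input vector from Matrix Market files (placeholders).
    auto A = gko::read<gko::matrix::Csr<double, int>>(std::ifstream("A.mtx"), exec);
    auto b = gko::read<gko::matrix::Dense<double>>(std::ifstream("b.mtx"), exec);

    // Allocate the result with matching dimensions and compute x = A * b.
    auto x = gko::matrix::Dense<double>::create(
        exec, gko::dim<2>{A->get_size()[0], b->get_size()[1]});
    A->apply(b, x);

    // Write the result back out in Matrix Market format.
    std::ofstream out("x.mtx");
    gko::write(out, x);
}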