@conference {, title = {Using Additive Modifications in LU Factorization Instead of Pivoting}, booktitle = {37th ACM International Conference on Supercomputing (ICS{\textquoteright}23)}, year = {2023}, month = {2023-06}, publisher = {ACM}, organization = {ACM}, address = {Orlando, FL}, doi = {10.1145/3577193.3593731}, author = {Neil Lindquist and Piotr Luszczek and Jack Dongarra} } @article {, title = {Using Ginkgo{\textquoteright}s memory accessor for improving the accuracy of memory-bound low precision BLAS}, journal = {Software: Practice and Experience}, volume = {53}, year = {2023}, month = {2023-01}, pages = {81 - 98}, issn = {0038-0644}, doi = {10.1002/spe.3041}, url = {https://doi.org/10.1002/spe.3041}, author = {Gr{\"u}tzmacher, Thomas and Anzt, Hartwig and Quintana-Ort{\'\i}, Enrique S.} } @article {, title = {Using long vector extensions for MPI reductions}, journal = {Parallel Computing}, volume = {109}, year = {2022}, month = {2022-03}, pages = {102871}, abstract = {The modern CPU{\textquoteright}s design, including its deep memory hierarchies and SIMD/vectorization capabilities, has a more significant impact on algorithms{\textquoteright} efficiency than the modest frequency increase observed recently. The recent introduction of wide vector instruction set extensions (AVX and SVE) has made vectorization a critical software component for increasing efficiency and closing the gap to peak performance. In this paper, we investigate the impact of vectorizing MPI reduction operations. We propose an implementation of the predefined MPI reduction operations using vector intrinsics (AVX and SVE) to improve their time-to-solution. The evaluation of the resulting software stack under different scenarios demonstrates that the approach is not only efficient but also generalizable to many vector architectures. Experiments conducted on varied architectures (Intel Xeon Gold, AMD Zen 2, and Arm A64FX) show that the proposed vector-extension-optimized reduction operations significantly reduce the completion time of collective communication reductions. With these optimizations, we achieve higher memory bandwidth and increased efficiency for local computations, which directly benefit the overall cost of collective reductions and the applications based on them.}, issn = {01678191}, doi = {10.1016/j.parco.2021.102871}, url = {https://www.sciencedirect.com/science/article/pii/S0167819121001137}, author = {Zhong, Dong and Cao, Qinglei and George Bosilca and Dongarra, Jack} } @conference {, title = {Using Advanced Vector Extensions AVX-512 for MPI Reduction}, booktitle = {EuroMPI/USA {\textquoteright}20: 27th European MPI Users{\textquoteright} Group Meeting}, year = {2020}, month = {2020-09}, address = {Austin, TX}, abstract = {As the scale of high-performance computing (HPC) systems continues to grow, researchers have devoted themselves to exploring increasing levels of parallelism to achieve optimal performance. The modern CPU{\textquoteright}s design, including its hierarchical memory and SIMD/vectorization capabilities, governs algorithms{\textquoteright} efficiency. The recent introduction of wide vector instruction set extensions (AVX and SVE) has made vectorization critically important for increasing efficiency and closing the gap to peak performance.
In this paper, we propose an implementation of the predefined MPI reduction operations utilizing AVX, AVX2, and AVX-512 intrinsics to provide vector-based reduction operations and to improve the time-to-solution of these predefined MPI reduction operations. With these optimizations, we achieve higher efficiency for local computations, which directly benefits the overall cost of collective reductions. The evaluation of the resulting software stack under different scenarios demonstrates that the solution is at the same time generic and efficient. Experiments conducted on an Intel Xeon Gold cluster show that our AVX-512-optimized reduction operations achieve a 10X performance benefit over the Open MPI default for MPI local reductions.}, keywords = {Instruction level parallelism, Intel AVX2/AVX-512, Long vector extension, MPI reduction operation, Single instruction multiple data, Vector operation}, doi = {https://doi.org/10.1145/3416315.3416316}, author = {Dong Zhong and Qinglei Cao and George Bosilca and Jack Dongarra} } @article {, title = {Using Advanced Vector Extensions AVX-512 for MPI Reduction (Poster)}, year = {2020}, month = {2020-09}, publisher = {EuroMPI/USA {\textquoteright}20: 27th European MPI Users{\textquoteright} Group Meeting}, address = {Austin, TX}, author = {Dong Zhong and George Bosilca and Qinglei Cao and Jack Dongarra} } @conference {1484, title = {Using Arm Scalable Vector Extension to Optimize Open MPI}, booktitle = {20th IEEE/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID 2020)}, year = {2020}, month = {2020-05}, publisher = {IEEE/ACM}, organization = {IEEE/ACM}, address = {Melbourne, Australia}, abstract = {As the scale of high-performance computing (HPC) systems continues to grow, increasing levels of parallelism must be exploited to achieve optimal performance. As recent processors support wide vector extensions, vectorization becomes much more important for exploiting the potential peak performance of the target architecture. Novel processor architectures, such as the Armv8-A architecture, introduce the Scalable Vector Extension (SVE) - an optional separate architectural extension with a new set of A64 instruction encodings, which enables even greater parallelism. In this paper, we analyze the usage and performance of the SVE instructions in the Arm SVE vector Instruction Set Architecture (ISA), and utilize those instructions to improve memcpy and various local reduction operations. Furthermore, we propose new strategies to improve the performance of MPI operations, including datatype packing/unpacking and MPI reduction. With these optimizations, we not only provide higher parallelism on a single node, but also achieve a more efficient communication scheme for message exchange. The resulting efforts have been implemented in the context of Open MPI, providing efficient and scalable SVE support and extending the possible uses of SVE to a wider range of programming and execution paradigms.
The evaluation of the resulting software stack under different scenarios, with both a simulator and Fujitsu{\textquoteright}s A64FX processor, demonstrates that the solution is at the same time generic and efficient.}, keywords = {ARMIE, datatype pack and unpack, local reduction, non-contiguous accesses, SVE, Vector Length Agnostic}, doi = {https://doi.org/10.1109/CCGrid49817.2020.00-71}, author = {Dong Zhong and Pavel Shamis and Qinglei Cao and George Bosilca and Jack Dongarra} } @article {, title = {Using Quantized Integer in LU Factorization with Partial Pivoting (Poster)}, year = {2020}, month = {2020-02}, publisher = {SIAM Conference on Parallel Processing for Scientific Computing (SIAM PP20)}, address = {Seattle, WA}, abstract = {Quantization is a common technique to speed up deep learning inference. It uses integers with a shared scalar to represent a set of equally spaced numbers. The quantized integer method has shown great success in compressing deep learning models, reducing the computation cost without losing too much accuracy. New application-specific hardware and specialized CPU extension instructions like Intel AVX-512 VNNI provide the capability to perform integer MADD (multiply and add) operations efficiently. In this poster, we show our preliminary results of using quantized integers for LU factorization with partial pivoting. Using Int32, the backward error can outperform single precision. However, quantized integers have a limited-range issue similar to FP16, so they do not work directly for large matrices because large numbers occur in the factored U. We show some possible solutions to this problem and how we plan to apply the quantized integer technique to other numerical linear algebra applications.}, author = {Yaohung Tsai and Piotr Luszczek and Jack Dongarra} } @article {1390, title = {Understanding Native Event Semantics}, year = {2019}, month = {2019-04}, publisher = {9th JLESC Workshop}, address = {Knoxville, TN}, author = {Anthony Danalis and Heike Jagode and Daniel Barry and Jack Dongarra} } @conference {, title = {Understanding Scalability and Fine-Grain Parallelism of Synchronous Data Parallel Training}, booktitle = {2019 IEEE/ACM Workshop on Machine Learning in High Performance Computing Environments (MLHPC)}, year = {2019}, month = {2019-11}, publisher = {IEEE}, organization = {IEEE}, address = {Denver, CO}, abstract = {In the age of big data, deep learning has emerged as a powerful tool to extract insight and exploit its value, both in industry and in scientific applications. With the increasing complexity of learning models and amounts of training data, data-parallel approaches based on frequent all-reduce synchronization steps are increasingly popular. Despite the fact that high-performance computing (HPC) technologies have been designed to address such patterns efficiently, the behavior of data-parallel approaches on HPC platforms is not well understood. To address this issue, in this paper we study the behavior of Horovod, a popular data-parallel approach that relies on MPI, on Theta, a pre-Exascale machine at Argonne National Laboratory. Using two representative applications, we explore two aspects: (1) how performance and scalability are affected by important parameters such as the number of nodes, number of workers, threads per node, and batch size; (2) how computational phases are interleaved with all-reduce communication phases at fine granularity and what consequences this interleaving has in terms of potential bottlenecks.
Our findings show that pipelining of back-propagation, gradient reduction, and weight updates mitigates the effects of stragglers during all-reduce only partially. Furthermore, there can be significant delays between weight updates, which can be leveraged to mask the overhead of additional background operations that are coupled with the training.}, doi = {https://doi.org/10.1109/MLHPC49564.2019.00006}, author = {Jiali Li and Bogdan Nicolae and Justin M. Wozniak and George Bosilca} } @article {1332, title = {Using GPU FP16 Tensor Cores Arithmetic to Accelerate Mixed-Precision Iterative Refinement Solvers and Reduce Energy Consumption}, year = {2018}, month = {2018-06}, publisher = {ISC High Performance (ISC18), Best Poster Award}, address = {Frankfurt, Germany}, author = {Azzam Haidar and Stanimire Tomov and Ahmad Abdelfattah and Mawussi Zounon and Jack Dongarra} } @conference {1265, title = {Using GPU FP16 Tensor Cores Arithmetic to Accelerate Mixed-Precision Iterative Refinement Solvers and Reduce Energy Consumption}, booktitle = {ISC High Performance (ISC{\textquoteright}18), Best Poster}, year = {2018}, month = {2018-06}, address = {Frankfurt, Germany}, author = {Azzam Haidar and Stanimire Tomov and Ahmad Abdelfattah and Mawussi Zounon and Jack Dongarra} } @article {1221, title = {Using Jacobi Iterations and Blocking for Solving Sparse Triangular Systems in Incomplete Factorization Preconditioning}, journal = {Journal of Parallel and Distributed Computing}, volume = {119}, year = {2018}, month = {2018-11}, pages = {219{\textendash}230}, abstract = {When using incomplete factorization preconditioners with an iterative method to solve large sparse linear systems, each application of the preconditioner involves solving two sparse triangular systems. These triangular systems are challenging to solve efficiently on computers with high levels of concurrency. On such computers, it has recently been proposed to use Jacobi iterations, which are highly parallel, to approximately solve the triangular systems from incomplete factorizations. The effectiveness of this approach, however, is problem-dependent: the Jacobi iterations may not always converge quickly enough for all problems. Thus, as a necessary and important step to evaluate this approach, we experimentally test the approach on a large number of realistic symmetric positive definite problems. We also show that by using block Jacobi iterations, we can extend the range of problems for which such an approach can be effective. For block Jacobi iterations, it is essential for the blocking to be cognizant of the matrix structure.}, doi = {https://doi.org/10.1016/j.jpdc.2018.04.017}, author = {Edmond Chow and Hartwig Anzt and Jennifer Scott and Jack Dongarra} } @conference {1127, title = {Using Software-Based Performance Counters to Expose Low-Level Open MPI Performance Information}, booktitle = {EuroMPI}, year = {2017}, month = {2017-09}, publisher = {ACM}, organization = {ACM}, address = {Chicago, IL}, abstract = {This paper details the implementation and usage of software-based performance counters to understand the performance of a particular implementation of the MPI standard, Open MPI. Such counters can expose intrinsic features of the software stack that are not otherwise available in a generic and portable way. The PMPI interface is useful for instrumenting MPI applications at the user level; however, it is insufficient for providing meaningful internal MPI performance details.
While the Peruse interface provides more detailed information on state changes within Open MPI, it has not seen widespread adoption. We introduce a simple low-level approach that instruments the Open MPI code at key locations to provide fine-grained MPI performance metrics. We evaluate the overhead associated with adding these counters to Open MPI as well as their use in determining bottlenecks and areas for improvement both in user code and the MPI implementation itself.}, keywords = {MPI, Performance Counters, Profiling, Tools}, isbn = {978-1-4503-4849-2/17/09}, doi = {https://doi.org/10.1145/3127024.3127039}, url = {https://dl.acm.org/citation.cfm?id=3127024}, author = {David Eberius and Thananon Patinyasakdikul and George Bosilca} } @article {995, title = {Updating Incomplete Factorization Preconditioners for Model Order Reduction}, journal = {Numerical Algorithms}, volume = {73}, number = {3}, year = {2016}, month = {2016-02}, pages = {611{\textendash}630}, abstract = {When solving a sequence of related linear systems by iterative methods, it is common to reuse the preconditioner for several systems, and then to recompute the preconditioner when the matrix has changed significantly. Rather than recomputing the preconditioner from scratch, it is potentially more efficient to update the previous preconditioner. Unfortunately, it is not always known how to update a preconditioner, for example, when the preconditioner is an incomplete factorization. A recently proposed iterative algorithm for computing incomplete factorizations, however, is able to exploit an initial guess, unlike existing algorithms for incomplete factorizations. By treating a previous factorization as an initial guess to this algorithm, an incomplete factorization may thus be updated. We use a sequence of problems from model order reduction. Experimental results using an optimized GPU implementation show that updating a previous factorization can be inexpensive and effective, making solving sequences of linear systems a potential niche problem for the iterative incomplete factorization algorithm.}, keywords = {key publication}, doi = {10.1007/s11075-016-0110-2}, author = {Hartwig Anzt and Edmond Chow and Jens Saak and Jack Dongarra} } @inproceedings {1309, title = {UCX: An Open Source Framework for HPC Network APIs and Beyond}, journal = {2015 IEEE 23rd Annual Symposium on High-Performance Interconnects}, year = {2015}, month = {Aug}, pages = {40-43}, publisher = {IEEE}, address = {Santa Clara, CA, USA}, abstract = {This paper presents Unified Communication X (UCX), a set of network APIs and their implementations for high throughput computing. UCX comes from the combined effort of national laboratories, industry, and academia to design and implement a high-performing and highly-scalable network stack for next generation applications and systems. UCX design provides the ability to tailor its APIs and network functionality to suit a wide variety of application domains and hardware. We envision these APIs to satisfy the networking needs of many programming models such as Message Passing Interface (MPI), OpenSHMEM, Partitioned Global Address Space (PGAS) languages, task-based paradigms and I/O bound applications. To evaluate the design we implement the APIs and protocols, and measure the performance of overhead-critical network primitives fundamental for implementing many parallel programming models and system libraries. 
Our results show that the latency, bandwidth, and message rate achieved by the portable UCX prototype are very close to those of the underlying driver. With UCX, we achieved a message exchange latency of 0.89 us, a bandwidth of 6138.5 MB/s, and a message rate of 14 million messages per second. As far as we know, this is the highest bandwidth and message rate achieved by any publicly known network stack on this hardware.}, keywords = {application program interfaces, Bandwidth, Electronics packaging, Hardware, high throughput computing, highly-scalable network stack, HPC, HPC network APIs, I/O bound applications, Infiniband, input-output programs, Libraries, Memory management, message passing, message passing interface, Middleware, MPI, open source framework, OpenSHMEM, parallel programming, parallel programming models, partitioned global address space languages, PGAS, PGAS languages, Programming, protocols, public domain software, RDMA, system libraries, task-based paradigms, UCX, Unified Communication X}, isbn = {978-1-4673-9160-3}, doi = {10.1109/HOTI.2015.13}, author = {P. Shamis and Manjunath Gorentla Venkata and M. Graham Lopez and M. B. Baker and O. Hernandez and Y. Itigin and M. Dubman and G. Shainer and R. L. Graham and L. Liss and Y. Shahar and S. Potluri and D. Rossetti and D. Becker and D. Poole and C. Lamb and S. Kumar and C. Stunkel and George Bosilca and Aurelien Bouteiller} } @conference {809, title = {Unified Development for Mixed Multi-GPU and Multi-Coprocessor Environments using a Lightweight Runtime Environment}, booktitle = {IPDPS 2014}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {Many of the heterogeneous resources available to modern computers are designed for different workloads. In order to efficiently use GPU resources, the workload must have a greater degree of parallelism than a workload designed for multicore CPUs. Conceptually, the Intel Xeon Phi coprocessors are capable of handling workloads somewhere in between the two. This multitude of applicable workloads will likely lead to mixing multicore CPUs, GPUs, and Intel coprocessors in multi-user environments that must offer adequate computing facilities for a wide range of workloads. In this work, we are using a lightweight runtime environment to manage the resource-specific workload and to control the dataflow and parallel execution in two-way hybrid systems. The lightweight runtime environment uses task superscalar concepts to enable the developer to write serial code while providing parallel execution. In addition, our task abstractions enable unified algorithmic development across all the heterogeneous resources.
We provide performance results for dense linear algebra applications, demonstrating the effectiveness of our approach and full utilization of a wide variety of accelerator hardware.}, keywords = {algorithms, Computer science, CUDA, Heterogeneous systems, Intel Xeon Phi, linear algebra, nVidia, Tesla K20, Tesla M2090}, author = {Azzam Haidar and Chongxiao Cao and Jack Dongarra and Piotr Luszczek and Stanimire Tomov} } @article {826, title = {Unveiling the Performance-energy Trade-off in Iterative Linear System Solvers for Multithreaded Processors}, journal = {Concurrency and Computation: Practice and Experience}, volume = {27}, year = {2014}, month = {2014-09}, pages = {885-904}, chapter = {885}, abstract = {In this paper, we analyze the interactions occurring in the performance-power-energy triangle for the execution of a pivotal numerical algorithm, the iterative conjugate gradient (CG) method, on a diverse collection of parallel multithreaded architectures. This analysis is especially timely in a decade where the power wall has arisen as a major obstacle to building faster processors. Moreover, the CG method has recently been proposed as a complement to the LINPACK benchmark, as this iterative method is argued to be more archetypical of the performance of today{\textquoteright}s scientific and engineering applications. To gain insight into the benefits of hands-on optimizations, we include runtime and energy efficiency results both for out-of-the-box usage relying exclusively on compiler optimizations and for implementations manually optimized for the target architectures, which range from general-purpose and digital signal multicore processors to manycore graphics processing units, all representative of current multithreaded systems.}, keywords = {CG, CPUs, energy efficiency, GPUs, low-power architectures}, doi = {10.1002/cpe.3341}, url = {http://dx.doi.org/10.1002/cpe.3341}, author = {Jos{\'e} I. Aliaga and Hartwig Anzt and Maribel Castillo and Juan C. Fern{\'a}ndez and Germ{\'a}n Le{\'o}n and Joaqu{\'\i}n P{\'e}rez and Enrique S. Quintana-Orti} } @conference {768, title = {Utilizing Dataflow-based Execution for Coupled Cluster Methods}, booktitle = {2014 IEEE International Conference on Cluster Computing}, number = {ICL-UT-14-02}, year = {2014}, month = {2014-09}, publisher = {IEEE}, organization = {IEEE}, address = {Madrid, Spain}, abstract = {Computational chemistry is one of the driving forces of High Performance Computing. In particular, many-body methods, such as the Coupled Cluster (CC) methods of the quantum chemistry package NWCHEM, are of great interest to the applied chemistry community. Harnessing large fractions of the processing power of modern large-scale computing platforms has become increasingly difficult. With the increase in scale, complexity, and heterogeneity of modern platforms, traditional programming models fail to deliver the expected performance scalability. On our way to Exascale and with these extremely hybrid platforms, dataflow-based programming models may be the only viable way of achieving and maintaining computation at scale. In this paper, we discuss a dataflow-based programming model and its applicability to NWCHEM{\textquoteright}s CC methods. Our dataflow version of the CC kernels breaks down the algorithm into fine-grained tasks with explicitly defined data dependencies.
As a result, many of the traditional synchronization points can be eliminated, allowing for a dynamic reshaping of the execution based on the ongoing availability of computational resources. We build this experiment using PARSEC {\textendash} a task-based dataflow-driven execution engine {\textendash} that enables efficient task scheduling on distributed systems, providing a desirable portability layer for application developers.}, author = {Heike McCraw and Anthony Danalis and George Bosilca and Jack Dongarra and Karol Kowalski and Theresa Windus} } @article {748, title = {Unified Model for Assessing Checkpointing Protocols at Extreme-Scale}, journal = {Concurrency and Computation: Practice and Experience}, year = {2013}, month = {2013-11}, abstract = {In this paper, we present a unified model for several well-known checkpoint/restart protocols. The proposed model is generic enough to encompass both extremes of the checkpoint/restart space, from coordinated approaches to a variety of uncoordinated checkpoint strategies (with message logging). We identify a set of crucial parameters, instantiate them, and compare the expected efficiency of the fault tolerant protocols, for a given application/platform pair. We then propose a detailed analysis of several scenarios, including some of the most powerful currently available high performance computing platforms, as well as anticipated Exascale designs. The results of this analytical comparison are corroborated by a comprehensive set of simulations. Altogether, they outline comparative behaviors of checkpoint strategies at very large scale, thereby providing insight that is hardly accessible to direct experimentation.}, doi = {10.1002/cpe.3173}, author = {George Bosilca and Aurelien Bouteiller and Elisabeth Brunet and Franck Cappello and Jack Dongarra and Amina Guermouche and Thomas Herault and Yves Robert and Frederic Vivien and Dounia Zaidouni} } @techreport {icl:716, title = {Unified Model for Assessing Checkpointing Protocols at Extreme-Scale}, journal = {University of Tennessee Computer Science Technical Report (also LAWN 269)}, number = {UT-CS-12-697}, year = {2012}, month = {2012-06}, author = {George Bosilca and Aurelien Bouteiller and Elisabeth Brunet and Franck Cappello and Jack Dongarra and Amina Guermouche and Thomas Herault and Yves Robert and Frederic Vivien and Dounia Zaidouni} } @inproceedings {icl:736, title = {User Level Failure Mitigation in MPI}, journal = {Euro-Par 2012: Parallel Processing Workshops}, volume = {7640}, year = {2012}, month = {2012-08}, pages = {499-504}, publisher = {Springer Berlin Heidelberg}, address = {Rhodes Island, Greece}, keywords = {ftmpi}, author = {Wesley Bland}, editor = {Ioannis Caragiannis and Michael Alexander and Rosa M. Badia and Mario Cannataro and Alexandru Costan and Marco Danelutto and Frederic Desprez and Bettina Krammer and Sahuquillo, J. and Stephen L. Scott and J. 
Weidendorfer} } @inproceedings {icl:593, title = {A Unified HPC Environment for Hybrid Manycore/GPU Distributed Systems}, journal = {IEEE International Parallel and Distributed Processing Symposium (submitted)}, year = {2011}, month = {2011-05}, address = {Anchorage, AK}, keywords = {dague}, author = {George Bosilca and Aurelien Bouteiller and Thomas Herault and Pierre Lemariner and Narapat Ohm Saengpatsa and Stanimire Tomov and Jack Dongarra} } @article {icl:618, title = {User-Defined Events for Hardware Performance Monitoring}, journal = {Procedia Computer Science}, volume = {4}, year = {2011}, month = {2011-05}, pages = {2096-2104}, publisher = {Elsevier}, abstract = {PAPI is a widely used cross-platform interface to hardware performance counters. PAPI currently supports native events, which are those provided by a given platform, and preset events, which are pre-defined events thought to be common across platforms. Presets are currently mapped and defined at the time that PAPI is compiled and installed. The idea of user-defined events is to allow users to define their own metrics and to have those metrics mapped to events on a platform without the need to re-install PAPI. User-defined events can be defined in terms of native, preset, and previously defined user-defined events. The user can combine events and constants in an arbitrary expression to define a new metric and give a name to the new metric. This name can then be specified as a PAPI event in a PAPI library call the same way as native and preset events. End-user tools such as TAU and Scalasca that use PAPI can also use the user-defined metrics. Users can publish their metric definitions so that other users can use them as well. We present several examples of how user-defined events can be used for performance analysis and modeling.}, keywords = {mumi, papi}, doi = {https://doi.org/10.1016/j.procs.2011.04.229}, author = {Shirley Moore and James Ralph} } @article {icl:620, title = {Using MAGMA with PGI Fortran}, journal = {PGI Insider}, year = {2010}, month = {2010-11}, keywords = {magma}, author = {Stanimire Tomov and Mathieu Faverge and Piotr Luszczek and Jack Dongarra} } @article {icl:544, title = {Using multiple levels of parallelism to enhance the performance of domain decomposition solvers}, journal = {Parallel Computing}, volume = {36}, number = {5-6}, year = {2010}, month = {2010-00}, pages = {285-296}, publisher = {Elsevier journals}, author = {Luc Giraud and Azzam Haidar and Stephane Pralet}, editor = {Costas Bekas and Pascua D{\textquoteright}Ambra and Ananth Grama and Yousef Saad and Petko Yanev} } @inproceedings {icl:412, title = {Usage of the Scalasca Toolset for Scalable Performance Analysis of Large-scale Parallel Applications}, journal = {Proceedings of the 2nd International Workshop on Tools for High Performance Computing}, year = {2008}, month = {2008-01}, pages = {157-167}, publisher = {Springer}, address = {Stuttgart, Germany}, keywords = {point}, author = {Felix Wolf and Brian Wylie and Erika Abraham and Wolfgang Frings and Karl F{\"u}rlinger and Markus Geimer and Marc-Andre Hermanns and Bernd Mohr and Shirley Moore and Matthias Pfeifer}, editor = {Michael Resch and Rainer Keller and Valentin Himmler and Bettina Krammer and A Schulz} } @techreport {icl:430, title = {Using dual techniques to derive componentwise and mixed condition numbers for a linear functional of a linear least squares solution}, journal = {University of Tennessee Computer Science Technical Report, UT-CS-08-622 (also LAPACK Working Note 
207)}, year = {2008}, month = {2008-01}, author = {Marc Baboulin and Serge Gratton} } @article {icl:424, title = {Using Mixed Precision for Sparse Matrix Computations to Enhance the Performance while Achieving 64-bit Accuracy}, journal = {ACM Transactions on Mathematical Software}, volume = {34}, number = {4}, year = {2008}, month = {2008-00}, pages = {17-22}, keywords = {plasma}, author = {Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov} } @article {icl:401, title = {The Use of Bulk States to Accelerate the Band Edge State Calculation of a Semiconductor Quantum Dot}, journal = {Journal of Computational Physics}, volume = {223}, year = {2007}, month = {2007-00}, pages = {774-782}, author = {Christof Voemel and Stanimire Tomov and Lin-Wang Wang and Osni Marques and Jack Dongarra} } @inproceedings {icl:382, title = {On Using Incremental Profiling for the Performance Analysis of Shared Memory Parallel Applications}, journal = {Proceedings of the 13th International Euro-Par Conference on Parallel Processing (Euro-Par {\textquoteright}07)}, year = {2007}, month = {2007-01}, publisher = {Springer LNCS}, address = {Rennes, France}, keywords = {kojak}, author = {Karl F{\"u}rlinger and Jack Dongarra and Michael Gerndt} } @article {icl:326, title = {The use of bulk states to accelerate the band edge state calculation of a semiconductor quantum dot}, journal = {Journal of Computational Physics (submitted)}, year = {2006}, month = {2006-01}, keywords = {doe-nano}, author = {Christof Voemel and Stanimire Tomov and Lin-Wang Wang and Osni Marques and Jack Dongarra} } @article {icl:125, title = {An Updated Set of Basic Linear Algebra Subprograms (BLAS)}, journal = {ACM Transactions on Mathematical Software}, volume = {28}, number = {2}, year = {2002}, month = {2002-12}, pages = {135-151}, doi = {10.1145/567806.567807}, author = {Susan Blackford and James Demmel and Jack Dongarra and Iain Duff and Sven Hammarling and Greg Henry and Michael Heroux and Linda Kaufman and Andrew Lumsdaine and Antoine Petitet and Roldan Pozo and Karin Remington and Clint Whaley} } @techreport {icl:96, title = {Users{\textquoteright} Guide to NetSolve v1.4.1}, journal = {ICL Technical Report}, number = {ICL-UT-02-05}, year = {2002}, month = {2002-06}, keywords = {netsolve}, author = {Sudesh Agrawal and Dorian Arnold and Susan Blackford and Jack Dongarra and Michelle Miller and Kiran Sagi and Zhiao Shi and Keith Seymour and Sathish Vadhiyar} } @conference {icl:11, title = {Using PAPI for Hardware Performance Monitoring on Linux Systems}, booktitle = {Conference on Linux Clusters: The HPC Revolution}, year = {2001}, month = {2001-06}, publisher = {Linux Clusters Institute}, organization = {Linux Clusters Institute}, address = {Urbana, Illinois}, abstract = {PAPI is a specification of a cross-platform interface to hardware performance counters on modern microprocessors. These counters exist as a small set of registers that count events, which are occurrences of specific signals related to a processor{\textquoteright}s function. Monitoring these events has a variety of uses in application performance analysis and tuning. The PAPI specification consists of a standard set of events deemed most relevant for application performance tuning, as well as both high-level and low-level sets of routines for accessing the counters.
The high-level interface simply provides the ability to start, stop, and read sets of events, and is intended for the acquisition of simple but accurate measurements by application engineers. The fully programmable low-level interface provides sophisticated options for controlling the counters, such as setting thresholds for interrupt on overflow, as well as access to all native counting modes and events, and is intended for third-party tool writers or users with more sophisticated needs. PAPI has been implemented on a number of platforms, including Linux/x86 and Linux/IA-64. The Linux/x86 implementation requires a kernel patch that provides a driver for the hardware counters. The driver memory-maps the counter registers into user space and allows virtualizing the counters on a per-process or per-thread basis. The kernel patch is being proposed for inclusion in the main Linux tree. The PAPI library provides access on Linux platforms not only to the standard set of events mentioned above but also to all the Linux/x86 and Linux/IA-64 native events. PAPI has been installed and is in use, either directly or through incorporation into third-party end-user performance analysis tools, on a number of Linux clusters, including the New Mexico LosLobos cluster and Linux clusters at NCSA and the University of Tennessee being used for the GrADS (Grid Application Development Software) project.}, keywords = {papi}, author = {Jack Dongarra and Kevin London and Shirley Moore and Phil Mucci and Dan Terpstra} }