% FFT-related publications: technical reports, conference papers, talks and
% posters, listed newest first (2022 down to 2019).
%
% Citation keys: numeric keys are existing identifiers and must stay stable.
% Author-year keys were assigned to entries that previously had an empty key.
%
% NOTE(review): for the two 2022 reports ICL-UT-22-02 and ICL-UT-22-04 the
% institution was filled in from the sibling report in the same series
% (ICL-UT-22-07); confirm against the report cover pages.
% NOTE(review): the journal field on technical reports is not used by the
% standard styles; it is kept only to preserve the series name.

@techreport{ayala2022fftcost,
  author       = {Ayala, Alan and Tomov, Stanimire and Luszczek, Piotr and Cayrols, Sebastien and Ragghianti, Gerald and Dongarra, Jack},
  title        = {Analysis of the Communication and Computation Cost of {FFT} Libraries towards Exascale},
  institution  = {Innovative Computing Laboratory},
  journal      = {ICL Technical Report},
  number       = {ICL-UT-22-07},
  year         = {2022},
  month        = jul,
}

@techreport{ayala2022fftbenchmark,
  author       = {Ayala, Alan and Tomov, Stanimire and Luszczek, Piotr and Cayrols, Sebastien and Ragghianti, Gerald and Dongarra, Jack},
  title        = {{FFT} Benchmark Performance Experiments on Systems Targeting Exascale},
  institution  = {Innovative Computing Laboratory},
  journal      = {ICL Technical Report},
  number       = {ICL-UT-22-02},
  year         = {2022},
  month        = mar,
}

@techreport{cayrols2022mixedfft,
  author       = {Cayrols, Sebastien and Li, Jiali and Bosilca, George and Tomov, Stanimire and Ayala, Alan and Dongarra, Jack},
  title        = {Mixed precision and approximate {3D} {FFTs}: Speed for accuracy trade-off with {GPU}-aware {MPI} and run-time data compression},
  institution  = {Innovative Computing Laboratory},
  journal      = {ICL Technical Report},
  number       = {ICL-UT-22-04},
  year         = {2022},
  month        = may,
  keywords     = {All-to-all, Approximate FFTs, ECP, heFFTe, Lossy compression, mixed-precision algorithms, MPI},
}

@misc{ayala2021gtc,
  author       = {Ayala, Alan and Tomov, Stanimire and Haidar, Azzam and Stoyanov, M. and Cayrols, Sebastien and Li, Jiali and Bosilca, George and Dongarra, Jack},
  title        = {Accelerating {FFT} towards Exascale Computing},
  howpublished = {NVIDIA GPU Technology Conference (GTC2021)},
  year         = {2021},
}

@techreport{ayala2021interim,
  author       = {Ayala, Alan and Tomov, Stanimire and Luszczek, Piotr and Cayrols, Sebastien and Ragghianti, Gerald and Dongarra, Jack},
  title        = {Interim Report on Benchmarking {FFT} Libraries on High Performance Systems},
  institution  = {University of Tennessee},
  journal      = {Innovative Computing Laboratory Technical Report},
  type         = {ICL Tech Report},
  number       = {ICL-UT-21-03},
  year         = {2021},
  month        = jul,
  abstract     = {The Fast Fourier Transform (FFT) is used in many applications such as molecular dynamics, spectrum estimation, fast convolution and correlation, signal modulation, and many wireless multimedia applications. FFTs are also heavily used in ECP applications, such as EXAALT, Copa, ExaSky-HACC, ExaWind, WarpX, and many others. As these applications{\textquoteright} accuracy and speed depend on the performance of the FFTs, we designed an FFT benchmark to measure performance and scalability of currently available FFT packages and present the results from a pre-Exascale platform. Our benchmarking also stresses the overall capacity of system interconnect; thus, it may be considered as an indicator of the bisection bandwidth, communication contention noise, and the software overheads in MPI collectives that are of interest to many other ECP applications and libraries. This FFT benchmarking project aims to show the strengths and weaknesses of multiple FFT libraries and to indicate what can be done to improve their performance. In particular, we believe that the benchmarking results could help design and implement a fast and robust FFT library for 2D and 3D inputs, while targeting large-scale heterogeneous systems with multicore processors and hardware accelerators that are co-designed in tandem with ECP applications. Our work involves studying and analyzing state-of-the-art FFT software both from vendors and available as open-source codes to better understand their performance.},
}

@inproceedings{ayala2021scalability,
  author       = {Ayala, Alan and Tomov, Stanimire and Stoyanov, Miroslav and Dongarra, Jack},
  title        = {Scalability Issues in {FFT} Computation},
  booktitle    = {International Conference on Parallel Computing Technologies},
  pages        = {279--287},
  publisher    = {Springer},
  year         = {2021},
  isbn         = {978-3-030-86359-3},
  doi          = {10.1007/978-3-030-86359-3_21},
  keywords     = {Hybrid systems, Parallel FFT, scalability},
  abstract     = {The fast Fourier transform (FFT) is one of the most important tools in mathematics, and it is widely required by several applications of science and engineering. State-of-the-art parallel implementations of the FFT algorithm, based on Cooley-Tukey developments, are known to be communication-bound, which causes critical issues when scaling the computational and architectural capabilities. In this paper, we study the main performance bottleneck of FFT computations on hybrid CPU and GPU systems at large-scale. We provide numerical simulations and potential acceleration techniques that can be easily integrated into FFT distributed libraries. We present different experiments on performance scalability and runtime analysis on the world{\textquoteright}s most powerful supercomputers today: Summit, using up to 6,144 NVIDIA V100 GPUs, and Fugaku, using more than one million Fujitsu A64FX cores.},
}

@techreport{1461,
  author       = {Tomov, Stanimire and Ayala, Alan and Haidar, Azzam and Dongarra, Jack},
  title        = {{FFT-ECP} {API} and High-Performance Library Prototype for {2-D} and {3-D} {FFTs} on Large-Scale Heterogeneous Systems with {GPUs}},
  institution  = {Innovative Computing Laboratory, University of Tennessee},
  journal      = {ECP Milestone Report},
  type         = {ECP WBS 2.3.3.13 Milestone Report},
  number       = {FFT-ECP STML13-27},
  year         = {2020},
  month        = jan,
  note         = {revision 01-2020},
}

@inproceedings{1481,
  author       = {Ayala, Alan and Tomov, Stanimire and Haidar, Azzam and Dongarra, Jack},
  title        = {{heFFTe}: Highly Efficient {FFT} for Exascale},
  booktitle    = {International Conference on Computational Science (ICCS 2020)},
  address      = {Amsterdam, Netherlands},
  year         = {2020},
  month        = jun,
  doi          = {10.1007/978-3-030-50371-0_19},
  keywords     = {exascale, FFT, gpu, scalable algorithm},
  abstract     = {Exascale computing aspires to meet the increasing demands from large scientific applications. Software targeting exascale is typically designed for heterogeneous architectures; henceforth, it is not only important to develop well-designed software, but also make it aware of the hardware architecture and efficiently exploit its power. Currently, several and diverse applications, such as those part of the Exascale Computing Project (ECP) in the United States, rely on efficient computation of the Fast Fourier Transform (FFT). In this context, we present the design and implementation of heFFTe (Highly Efficient FFT for Exascale) library, which targets the upcoming exascale supercomputers. We provide highly (linearly) scalable GPU kernels that achieve more than 40{\texttimes} speedup with respect to local kernels from CPU state-of-the-art libraries, and over 2{\texttimes} speedup for the whole FFT computation. A communication model for parallel FFTs is also provided to analyze the bottleneck for large-scale problems. We show experiments obtained on Summit supercomputer at Oak Ridge National Laboratory, using up to 24,576 IBM Power9 cores and 6,144 NVIDIA V-100 GPUs.},
}

@misc{ayala2020heffte-gtc,
  author       = {Ayala, Alan and Tomov, Stanimire and Haidar, Azzam and Dongarra, Jack},
  title        = {{heFFTe}: Highly Efficient {FFT} for Exascale (Poster)},
  howpublished = {NVIDIA GPU Technology Conference (GTC2020)},
  year         = {2020},
  month        = oct,
}

@misc{ayala2020heffte-siampp,
  author       = {Ayala, Alan and Tomov, Stanimire and Haidar, Azzam and Dongarra, Jack},
  title        = {{heFFTe}: Highly Efficient {FFT} for Exascale (Poster)},
  howpublished = {SIAM Conference on Parallel Processing for Scientific Computing (SIAM PP20)},
  address      = {Seattle, WA},
  year         = {2020},
  month        = feb,
  abstract     = {Considered one of the top 10 algorithms of the 20th century, the Fast Fourier Transform (FFT) is widely used by applications in science and engineering. Large scale parallel applications targeting exascale, such as those part of the DOE Exascale Computing Project (ECP), are designed for heterogeneous architectures and, currently, more than a dozen ECP applications use FFTs in their codes. To address the applications{\textquoteright} needs, we developed the highly efficient FFTs for exascale (heFFTe) library. The heFFTe library release features very good weak and strong scalability and performance that is close to 90\% of the roofline peak performance. We present these performance results on the Summit supercomputer. heFFTe is also integrated in a number of applications and we present how the overall performance gets improved by using heFFTe. Performance model, limitations, and challenges are discussed for current and upcoming computer architectures.},
}

@misc{ayala2020heffte-ecp,
  author       = {Ayala, Alan and Tomov, Stanimire and Dongarra, Jack and Haidar, Azzam},
  title        = {{heFFTe}: Highly Efficient {FFT} for Exascale (Poster)},
  howpublished = {2020 Exascale Computing Project Annual Meeting},
  address      = {Houston, TX},
  year         = {2020},
  month        = feb,
}

@techreport{1322,
  author       = {Tomov, Stanimire and Haidar, Azzam and Ayala, Alan and Schultz, Daniel and Dongarra, Jack},
  title        = {Design and Implementation for {FFT-ECP} on Distributed Accelerated Systems},
  institution  = {University of Tennessee},
  journal      = {Innovative Computing Laboratory Technical Report},
  type         = {ECP WBS 2.3.3.09 Milestone Report},
  number       = {ICL-UT-19-05},
  year         = {2019},
  month        = apr,
}

@misc{1329,
  author       = {Tomov, Stanimire and Haidar, Azzam and Ayala, Alan and Schultz, Daniel and Dongarra, Jack},
  title        = {{FFT-ECP} Fast {Fourier} Transform},
  howpublished = {2019 ECP Annual Meeting (Research Poster)},
  address      = {Houston, TX},
  year         = {2019},
  month        = jan,
}

@techreport{1401,
  author       = {Tomov, Stanimire and Haidar, Azzam and Ayala, Alan and Shaiek, Hejer and Dongarra, Jack},
  title        = {{FFT-ECP} Implementation Optimizations and Features Phase},
  institution  = {University of Tennessee},
  journal      = {Innovative Computing Laboratory Technical Report},
  number       = {ICL-UT-19-12},
  year         = {2019},
  month        = oct,
}

@article{1385,
  author       = {Shaiek, Hejer and Tomov, Stanimire and Ayala, Alan and Haidar, Azzam and Dongarra, Jack},
  title        = {{GPUDirect} {MPI} Communications and Optimizations to Accelerate {FFTs} on Exascale Systems},
  journal      = {EuroMPI{\textquoteright}19 Posters, Zurich, Switzerland},
  number       = {icl-ut-19-06},
  year         = {2019},
  month        = sep,
  publisher    = {ICL},
  type         = {Extended Abstract},
  keywords     = {CUDA-Aware MPI, ECP, FFT, FFT-ECP, gpu, GPUDirect},
  abstract     = {Fast Fourier transforms (FFTs) are used in applications ranging from molecular dynamics and spectrum estimation to machine learning, fast convolution and correlation, signal modulation, wireless multimedia applications, and others. However, FFTs are memory bound, and therefore, to accelerate them, it is crucial to avoid and optimize the FFTs{\textquoteright} communications. To this end, we present a 3-D FFT design for distributed graphics processing unit (GPU) systems that: (1) efficiently uses GPUs{\textquoteright} high bandwidth, (2) reduces global communications algorithmically, when possible, and (3) employs GPUDirect technologies as well as MPI optimizations in the development of high-performance FFTs for large-scale GPU-accelerated systems. We show that these developments and optimizations lead to very good strong scalability and a performance that is close to 90\% of the theoretical peak.},
}

@inproceedings{1403,
  author       = {Ayala, Alan and Tomov, Stanimire and Luo, Xi and Shaiek, Hejer and Haidar, Azzam and Bosilca, George and Dongarra, Jack},
  title        = {Impacts of {Multi-GPU} {MPI} Collective Communications on Large {FFT} Computation},
  booktitle    = {Workshop on Exascale MPI (ExaMPI) at SC19},
  address      = {Denver, CO},
  year         = {2019},
  month        = nov,
  keywords     = {Collective MPI, Exascale applications, FFT, Heterogeneous systems, scalable},
}