% Bibliography database: one field per line, citation keys unchanged.
% Conventions used below:
%   - author/editor names are stored as "Last, First";
%   - month uses the standard three-letter macros; entries whose export
%     carried an unknown month ("00") have no month field;
%   - page ranges use a double hyphen;
%   - publication-status qualifiers (to appear, submitted, presented)
%     live in the note field rather than in the venue name;
%   - acronyms and proper nouns in titles are brace-protected.
% NOTE(review): institution for the "ICL Technical Report" entries is
% taken from the UT prefix of the report numbers -- confirm.

@article{icl:567,
  author   = {Angskun, Thara and Fagg, Graham and Bosilca, George and
              Pjesivac-Grbovic, Jelena and Dongarra, Jack},
  title    = {Self-Healing Network for Scalable Fault-Tolerant Runtime Environments},
  journal  = {Future Generation Computer Systems},
  volume   = {26},
  number   = {3},
  pages    = {479--485},
  year     = {2010},
  month    = mar,
}

@article{icl:357,
  author    = {Pjesivac-Grbovic, Jelena and Bosilca, George and Fagg, Graham and
               Angskun, Thara and Dongarra, Jack},
  title     = {Decision Trees and {MPI} Collective Algorithm Selection Problem},
  journal   = {Euro-Par 2007},
  pages     = {105--115},
  year      = {2007},
  month     = aug,
  publisher = {Springer},
  address   = {Rennes, France},
  keywords  = {ftmpi},
}

@article{icl:356,
  author    = {Pjesivac-Grbovic, Jelena and Bosilca, George and Fagg, Graham and
               Angskun, Thara and Dongarra, Jack},
  title     = {{MPI} Collective Algorithm Selection and Quadtree Encoding},
  journal   = {Parallel Computing (Special Edition: EuroPVM/MPI 2006)},
  year      = {2007},
  publisher = {Elsevier},
  keywords  = {ftmpi},
}

@incollection{875,
  author    = {Hoefler, Torsten and Squyres, Jeffrey M. and Fagg, Graham and
               Bosilca, George and Rehm, Wolfgang and Lumsdaine, Andrew},
  title     = {A New Approach to {MPI} Collective Communication Implementations},
  booktitle = {Distributed and Parallel Systems},
  pages     = {45--54},
  year      = {2007},
  publisher = {Springer US},
  keywords  = {Automatic Selection, Collective Operation, Framework, Message Passing (MPI), Open MPI},
  isbn      = {978-0-387-69857-1},
  doi       = {10.1007/978-0-387-69858-8_5},
  abstract  = {Recent research into the optimization of collective MPI operations
               has resulted in a wide variety of algorithms and corresponding
               implementations, each typically only applicable in a relatively
               narrow scope: on a specific architecture, on a specific network,
               with a specific number of processes, with a specific data size
               and/or data-type -- or any combination of these (or other)
               factors. This situation presents an enormous challenge to
               portable MPI implementations which are expected to provide
               optimized collective operation performance on all platforms.
               Many portable implementations have attempted to provide a token
               number of algorithms that are intended to realize good
               performance on most systems. However, many platform
               configurations are still left without well-tuned collective
               operations. This paper presents a proposal for a framework that
               will allow a wide variety of collective algorithm implementations
               and a flexible, multi-tiered selection process for choosing which
               implementation to use when an application invokes an MPI
               collective function.},
}

@article{icl:358,
  author    = {Pjesivac-Grbovic, Jelena and Angskun, Thara and Bosilca, George and
               Fagg, Graham and Gabriel, Edgar and Dongarra, Jack},
  title     = {Performance Analysis of {MPI} Collective Operations},
  journal   = {Cluster Computing},
  volume    = {10},
  number    = {2},
  pages     = {127--143},
  year      = {2007},
  month     = jun,
  publisher = {Springer Netherlands},
  keywords  = {ftmpi},
}

@inproceedings{icl:354,
  author    = {Angskun, Thara and Bosilca, George and Fagg, Graham and
               Pjesivac-Grbovic, Jelena and Dongarra, Jack},
  title     = {Reliability Analysis of Self-Healing Network using Discrete-Event Simulation},
  booktitle = {Proceedings of Seventh IEEE International Symposium on Cluster Computing and the Grid (CCGrid '07)},
  pages     = {437--444},
  year      = {2007},
  month     = may,
  publisher = {IEEE Computer Society},
  keywords  = {ftmpi},
}

@article{icl:315,
  author   = {Fagg, Graham and Pjesivac-Grbovic, Jelena and Bosilca, George and
              Angskun, Thara and Dongarra, Jack},
  title    = {Flexible collective communication tuning architecture applied to {Open MPI}},
  journal  = {2006 Euro PVM/MPI},
  year     = {2006},
  month    = jan,
  address  = {Bonn, Germany},
  note     = {submitted},
  keywords = {ftmpi},
}

@article{icl:322,
  author    = {Dewolfs, David and Broeckhove, Jan and Sunderam, Vaidy and Fagg, Graham},
  title     = {{FT-MPI}, Fault-Tolerant Metacomputing and Generic Name Services: A Case Study},
  journal   = {Lecture Notes in Computer Science},
  volume    = {4192},
  number    = {ICL-UT-06-14},
  pages     = {133--140},
  year      = {2006},
  publisher = {Springer Berlin / Heidelberg},
  keywords  = {ftmpi},
}

@article{icl:650,
  author  = {Keller, Rainer and Bosilca, George and Fagg, Graham and
             Resch, Michael and Dongarra, Jack},
  title   = {Implementation and Usage of the {PERUSE}-Interface in {Open MPI}},
  journal = {Euro PVM/MPI 2006},
  year    = {2006},
  month   = sep,
  address = {Bonn, Germany},
}

@techreport{icl:314,
  author      = {Pjesivac-Grbovic, Jelena and Fagg, Graham and Angskun, Thara and
                 Bosilca, George and Dongarra, Jack},
  title       = {{MPI} Collective Algorithm Selection and Quadtree Encoding},
  institution = {University of Tennessee},
  type        = {ICL Technical Report},
  number      = {ICL-UT-06-11},
  year        = {2006},
  keywords    = {ftmpi},
}

@article{icl:323,
  author    = {Pjesivac-Grbovic, Jelena and Fagg, Graham and Angskun, Thara and
               Bosilca, George and Dongarra, Jack},
  title     = {{MPI} Collective Algorithm Selection and Quadtree Encoding},
  journal   = {Lecture Notes in Computer Science},
  volume    = {4192},
  number    = {ICL-UT-06-13},
  pages     = {40--48},
  year      = {2006},
  month     = sep,
  publisher = {Springer Berlin / Heidelberg},
  keywords  = {ftmpi},
}

@inproceedings{icl:310,
  author    = {Tang, Yuan and Fagg, Graham and Dongarra, Jack},
  title     = {Proposal of {MPI} operation level Checkpoint/Rollback and one implementation},
  booktitle = {Proceedings of IEEE CCGrid 2006},
  year      = {2006},
  month     = jan,
  publisher = {IEEE Computer Society},
  keywords  = {HARNESS/FT-PI},
}

@article{icl:316,
  author   = {Angskun, Thara and Fagg, Graham and Bosilca, George and
              Pjesivac-Grbovic, Jelena and Dongarra, Jack},
  title    = {Scalable Fault Tolerant Protocol for Parallel Runtime Environments},
  journal  = {2006 Euro PVM/MPI},
  number   = {ICL-UT-06-12},
  year     = {2006},
  address  = {Bonn, Germany},
  keywords = {ftmpi},
}

@article{icl:332,
  author   = {Bosilca, George and Chen, Zizhong and Dongarra, Jack and
              Eijkhout, Victor and Fagg, Graham and Fuentes, Erika and
              Langou, Julien and Luszczek, Piotr and Pjesivac-Grbovic, Jelena and
              Seymour, Keith and You, Haihang and Vadhiyar, Sathish},
  title    = {Self Adapting Numerical Software {SANS} Effort},
  journal  = {IBM Journal of Research and Development},
  volume   = {50},
  number   = {2/3},
  pages    = {223--238},
  year     = {2006},
  month    = jan,
  keywords = {gco},
}

@inproceedings{icl:330,
  author    = {Angskun, Thara and Fagg, Graham and Bosilca, George and
               Pjesivac-Grbovic, Jelena and Dongarra, Jack},
  title     = {Self-Healing Network for Scalable Fault Tolerant Runtime Environments},
  booktitle = {DAPSYS 2006, 6th Austrian-Hungarian Workshop on Distributed and Parallel Systems},
  year      = {2006},
  month     = jan,
  address   = {Innsbruck, Austria},
}

@inproceedings{icl:293,
  author    = {Cronk, David and Fagg, Graham and Emeny, Susan and Tucker, Scott},
  title     = {Dynamic Process Management for Pipelined Applications},
  booktitle = {Proceedings of DoD HPCMP UGC 2005},
  year      = {2005},
  month     = jan,
  publisher = {IEEE},
  address   = {Nashville, TN},
  note      = {to appear},
}

@inproceedings{icl:265,
  author    = {Chen, Zizhong and Fagg, Graham and Gabriel, Edgar and
               Langou, Julien and Angskun, Thara and Bosilca, George and
               Dongarra, Jack},
  title     = {Fault Tolerant High Performance Computing by a Coding Approach},
  booktitle = {Proceedings of ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  year      = {2005},
  month     = jan,
  address   = {Chicago, Illinois},
  note      = {to appear},
  keywords  = {ftmpi, grads, lacsi, sans},
}

@inproceedings{icl:280,
  author    = {Bosilca, George and Dongarra, Jack and Fagg, Graham and Langou, Julien},
  title     = {Hash Functions for Datatype Signatures in {MPI}},
  booktitle = {Proceedings of 12th European Parallel Virtual Machine and Message Passing Interface Conference - Euro PVM/MPI},
  volume    = {3666},
  pages     = {76--83},
  year      = {2005},
  month     = sep,
  publisher = {Springer-Verlag Berlin},
  address   = {Sorrento (Naples), Italy},
  editor    = {Di Martino, Beniamino},
  keywords  = {ftmpi},
}

@inproceedings{icl:249,
  author    = {Pjesivac-Grbovic, Jelena and Angskun, Thara and Bosilca, George and
               Fagg, Graham and Gabriel, Edgar and Dongarra, Jack},
  title     = {Performance Analysis of {MPI} Collective Operations},
  booktitle = {4th International Workshop on Performance Modeling, Evaluation, and Optimization of Parallel and Distributed Systems (PMEO-PDS '05)},
  year      = {2005},
  month     = apr,
  address   = {Denver, Colorado},
  keywords  = {ftmpi},
}

@article{icl:306,
  author   = {Pjesivac-Grbovic, Jelena and Angskun, Thara and Bosilca, George and
              Fagg, Graham and Gabriel, Edgar and Dongarra, Jack},
  title    = {Performance Analysis of {MPI} Collective Operations},
  journal  = {Cluster Computing Journal},
  year     = {2005},
  month    = jan,
  note     = {to appear},
  keywords = {ftmpi},
}

@inproceedings{icl:279,
  author    = {Fagg, Graham and Angskun, Thara and Bosilca, George and
               Pjesivac-Grbovic, Jelena and Dongarra, Jack},
  title     = {Scalable Fault Tolerant {MPI}: Extending the Recovery Algorithm},
  booktitle = {Proceedings of 12th European Parallel Virtual Machine and Message Passing Interface Conference - Euro PVM/MPI},
  volume    = {3666},
  pages     = {67},
  year      = {2005},
  month     = sep,
  publisher = {Springer-Verlag Berlin},
  address   = {Sorrento (Naples), Italy},
  editor    = {Di Martino, Beniamino},
  keywords  = {ftmpi},
}

@techreport{icl:261,
  author      = {Vadhiyar, Sathish and Fagg, Graham and Dongarra, Jack},
  title       = {Towards an Accurate Model for Collective Communications},
  institution = {University of Tennessee},
  type        = {ICL Technical Report},
  number      = {ICL-UT-05-03},
  year        = {2005},
  month       = jan,
}

@article{icl:241,
  author   = {Fagg, Graham and Dongarra, Jack},
  title    = {Building and using a Fault Tolerant {MPI} implementation},
  journal  = {International Journal of High Performance Applications and Supercomputing},
  year     = {2004},
  note     = {to appear},
  keywords = {ftmpi, lacsi, sans},
}

@inproceedings{icl:230,
  author    = {Fagg, Graham and Gabriel, Edgar and Bosilca, George and
               Angskun, Thara and Chen, Zizhong and Pjesivac-Grbovic, Jelena and
               London, Kevin and Dongarra, Jack},
  title     = {Extending the {MPI} Specification for Process Fault Tolerance on High Performance Computing Systems},
  booktitle = {Proceedings of ISC2004},
  year      = {2004},
  month     = jun,
  address   = {Heidelberg, Germany},
  note      = {to appear},
  keywords  = {ftmpi, lacsi},
}

@article{icl:240,
  author   = {Fagg, Graham and Gabriel, Edgar and Chen, Zizhong and
              Angskun, Thara and Bosilca, George and Pjesivac-Grbovic, Jelena and
              Dongarra, Jack},
  title    = {Process Fault-Tolerance: Semantics, Design and Applications for High Performance Computing},
  journal  = {International Journal for High Performance Applications and Supercomputing},
  year     = {2004},
  month    = apr,
  note     = {to appear},
  keywords = {ftmpi, lacsi},
}

@article{icl:167,
  author   = {Vadhiyar, Sathish and Fagg, Graham and Dongarra, Jack},
  title    = {Towards an Accurate Model for Collective Communications},
  journal  = {International Journal of High Performance Applications, Special Issue: Automatic Performance Tuning},
  volume   = {18},
  number   = {1},
  pages    = {159--167},
  year     = {2004},
  month    = jan,
  keywords = {lacsi},
}

@article{icl:145,
  author    = {Gabriel, Edgar and Fagg, Graham and Dongarra, Jack},
  title     = {Evaluating The Performance Of {MPI-2} Dynamic Communicators And One-Sided Communication},
  journal   = {Lecture Notes in Computer Science, Recent Advances in Parallel Virtual Machine and Message Passing Interface, 10th European PVM/MPI User's Group Meeting},
  volume    = {2840},
  pages     = {88--97},
  year      = {2003},
  month     = sep,
  publisher = {Springer-Verlag, Berlin},
  address   = {Venice, Italy},
  keywords  = {ftmpi},
}

@inproceedings{icl:153,
  author    = {Fagg, Graham and Gabriel, Edgar and Chen, Zizhong and
               Angskun, Thara and Bosilca, George and Bukovsky, Antonin and
               Dongarra, Jack},
  title     = {Fault Tolerant Communication Library and Applications for High Performance Computing},
  booktitle = {Los Alamos Computer Science Institute (LACSI) Symposium 2003},
  year      = {2003},
  month     = oct,
  address   = {Santa Fe, NM},
  note      = {presented},
  keywords  = {ftmpi, lacsi},
}

@inproceedings{icl:144,
  author    = {Gabriel, Edgar and Fagg, Graham and Bukovsky, Antonin and
               Angskun, Thara and Dongarra, Jack},
  title     = {A Fault-Tolerant Communication Library for Grid Environments},
  booktitle = {17th Annual ACM International Conference on Supercomputing (ICS'03) International Workshop on Grid Computing and e-Science},
  year      = {2003},
  month     = jun,
  address   = {San Francisco},
  keywords  = {ftmpi, lacsi},
}

@article{icl:213,
  author  = {Fagg, Graham and Dongarra, Jack},
  title   = {{HARNESS} Fault Tolerant {MPI} Design, Usage and Performance Issues},
  journal = {Future Generation Computer Systems},
  volume  = {18},
  number  = {8},
  pages   = {1127--1142},
  year    = {2002},
  month   = jan,
}

@inproceedings{icl:122,
  author    = {Bassi, Alessandro and Beck, Micah and Fagg, Graham and
               Moore, Terry and Plank, James and Swany, Martin and Wolski, Rich},
  title     = {The {Internet BackPlane Protocol}: A Study in Resource Sharing},
  booktitle = {Proceedings of the second IEEE/ACM International Symposium on Cluster Computing and the Grid (CCGRID 2002)},
  year      = {2002},
  month     = oct,
  address   = {Berlin, Germany},
  keywords  = {ftmpi},
}

@inproceedings{icl:203,
  author    = {Fagg, Graham and Bukovsky, Antonin and Dongarra, Jack},
  title     = {Fault Tolerant {MPI} for the {HARNESS} Meta-Computing System},
  booktitle = {Proceedings of International Conference of Computational Science - ICCS 2001},
  series    = {Lecture Notes in Computer Science},
  volume    = {2073},
  pages     = {355--366},
  year      = {2001},
  publisher = {Springer Verlag},
  address   = {Berlin},
  editor    = {Juliano, Benjoe A. and Renner, R. and Tan, K.},
  keywords  = {ftmpi, harness},
  doi       = {10.1007/3-540-45545-0_44},
}

@article{icl:86,
  author  = {Fagg, Graham and Bukovsky, Antonin and Dongarra, Jack},
  title   = {{HARNESS} and Fault Tolerant {MPI}},
  journal = {Parallel Computing},
  volume  = {27},
  number  = {11},
  pages   = {1479--1496},
  year    = {2001},
  month   = jan,
}

@article{icl:89,
  author   = {Petitet, Antoine and Blackford, Susan and Dongarra, Jack and
              Ellis, Brett and Fagg, Graham and Roche, Kenneth and
              Vadhiyar, Sathish},
  title    = {Numerical Libraries and The Grid},
  journal  = {International Journal of High Performance Applications and Supercomputing},
  volume   = {15},
  number   = {4},
  pages    = {359--374},
  year     = {2001},
  month    = jan,
  keywords = {grads},
}

@techreport{icl:21,
  author      = {Petitet, Antoine and Blackford, Susan and Dongarra, Jack and
                 Ellis, Brett and Fagg, Graham and Roche, Kenneth and
                 Vadhiyar, Sathish},
  title       = {Numerical Libraries and The Grid: The {Grads} Experiments with {ScaLAPACK}},
  institution = {University of Tennessee},
  type        = {Computer Science Technical Report},
  number      = {UT-CS-01-460},
  year        = {2001},
  month       = jan,
  keywords    = {grads, scalapack},
}

@inproceedings{icl:8,
  author    = {Cronk, David and Fagg, Graham and Moore, Shirley},
  title     = {Parallel {I/O} for {EQM} Applications},
  booktitle = {Department of Defense Users' Group Conference Proceedings},
  year      = {2001},
  month     = jun,
  address   = {Biloxi, Mississippi},
  note      = {to appear},
  keywords  = {ftmpi},
}

@article{icl:14,
  author    = {Fagg, Graham and Gabriel, Edgar and Resch, Michael},
  title     = {Parallel {IO} Support for Meta-Computing Applications: {MPI\_Connect} {IO} Applied to {PACX-MPI}},
  journal   = {8th European PVM/MPI User's Group Meeting, Lecture Notes in Computer Science},
  volume    = {2131},
  year      = {2001},
  month     = sep,
  publisher = {Springer Verlag, Berlin},
  address   = {Greece},
  keywords  = {ftmpi},
}

@inproceedings{icl:78,
  author    = {Vadhiyar, Sathish and Fagg, Graham and Dongarra, Jack},
  title     = {Performance Modeling for Self Adapting Collective Communications for {MPI}},
  booktitle = {LACSI Symposium 2001},
  year      = {2001},
  month     = oct,
  address   = {Santa Fe, NM},
  keywords  = {ftmpi},
}

@inproceedings{icl:48,
  author    = {Vadhiyar, Sathish and Fagg, Graham and Dongarra, Jack},
  title     = {Automatically Tuned Collective Communications},
  booktitle = {Proceedings of SuperComputing 2000 (SC'2000)},
  year      = {2000},
  month     = nov,
  address   = {Dallas, TX},
  keywords  = {ftmpi},
}

@inproceedings{icl:43,
  author    = {Fagg, Graham and Dongarra, Jack},
  title     = {{FT-MPI}: Fault Tolerant {MPI}, Supporting Dynamic Applications in a Dynamic World},
  booktitle = {Proceedings of EuroPVM-MPI 2000},
  series    = {Lecture Notes in Computer Science},
  volume    = {1908},
  pages     = {346--353},
  year      = {2000},
  month     = jan,
  publisher = {Springer Verlag},
  address   = {Hungary},
  keywords  = {ftmpi},
}

@article{icl:118,
  author    = {Dongarra, Jack and Fagg, Graham and Hempel, Rolf and Walker, David W.},
  title     = {Message Passing Software Systems},
  journal   = {Encyclopedia of Electrical and Engineering, Supplement 1},
  year      = {2000},
  publisher = {John Wiley \& Sons, Inc.},
  editor    = {Webster, J.},
  keywords  = {ftmpi},
}

@techreport{icl:34,
  author      = {Cronk, David and Ellis, Brett and Fagg, Graham},
  title       = {Metacomputing: An Evaluation of Emerging Systems},
  institution = {University of Tennessee},
  type        = {Computer Science Department Technical Report},
  number      = {UT-CS-00-445},
  year        = {2000},
  month       = jul,
}

@techreport{icl:228,
  author      = {Arnold, Dorian and Browne, Shirley and Dongarra, Jack and
                 Fagg, Graham and Moore, Keith},
  title       = {Secure Remote Access to Numerical Software and Computation Hardware},
  institution = {University of Tennessee},
  type        = {Computer Science Technical Report},
  number      = {UT-CS-00-446},
  year        = {2000},
  month       = jul,
}

@inproceedings{icl:26,
  author    = {Arnold, Dorian and Browne, Shirley and Dongarra, Jack and
               Fagg, Graham and Moore, Keith},
  title     = {Secure Remote Access to Numerical Software and Computational Hardware},
  booktitle = {Proceedings of the DoD HPC Users Group Conference (HPCUG) 2000},
  year      = {2000},
  month     = jun,
  address   = {Albuquerque, NM},
  keywords  = {netsolve},
}

@article{icl:55,
  author   = {Beck, Micah and Dongarra, Jack and Fagg, Graham and Geist, Al and
              Gray, Paul and Kohl, James and Migliardi, Mauro and
              Moore, Keith and Moore, Terry and Papadopoulous, Philip and
              Scott, Stephen L. and Sunderam, Vaidy},
  title    = {{HARNESS}: A Next Generation Distributed Virtual Machine},
  journal  = {International Journal on Future Generation Computer Systems},
  volume   = {15},
  number   = {5-6},
  pages    = {571--582},
  year     = {1999},
  month    = jan,
  keywords = {harness},
}

@article{icl:73,
  author   = {Fagg, Graham and Moore, Keith and Dongarra, Jack},
  title    = {Scalable Networked Information Processing Environment ({SNIPE})},
  journal  = {Journal on Future Generation Computer Systems},
  volume   = {15},
  number   = {5/6},
  pages    = {595--605},
  year     = {1999},
  month    = jan,
  keywords = {harness},
}