@article {1218, title = {Coping with Silent and Fail-Stop Errors at Scale by Combining Replication and Checkpointing}, journal = {Journal of Parallel and Distributed Computing}, volume = {122}, year = {2018}, month = {2018-12}, pages = {209{\textendash}225}, abstract = {This paper provides a model and an analytical study of replication as a technique to cope with silent errors, as well as a mixture of both silent and fail-stop errors on large-scale platforms. Compared with fail-stop errors that are immediately detected when they occur, silent errors require a detection mechanism. To detect silent errors, many application-specific techniques are available, either based on algorithms (e.g., ABFT), invariant preservation or data analytics, but replication remains the most transparent and least intrusive technique. We explore the right level (duplication, triplication or more) of replication for two frameworks: (i) when the platform is subject to only silent errors, and (ii) when the platform is subject to both silent and fail-stop errors. A higher level of replication is more expensive in terms of resource usage but enables to tolerate more errors and to even correct some errors, hence there is a trade-off to be found. Replication is combined with checkpointing and comes with two flavors: process replication and group replication. Process replication applies to message-passing applications with communicating processes. Each process is replicated, and the platform is composed of process pairs, or triplets. Group replication applies to black-box applications, whose parallel execution is replicated several times. The platform is partitioned into two halves (or three thirds). In both scenarios, results are compared before each checkpoint, which is taken only when both results (duplication) or two out of three results (triplication) coincide. Otherwise, one or more silent errors have been detected, and the application rolls back to the last checkpoint, as well as when fail-stop errors have struck. We provide a detailed analytical study for all of these scenarios, with formulas to decide, for each scenario, the optimal parameters as a function of the error rate, checkpoint cost, and platform size. We also report a set of extensive simulation results that nicely corroborates the analytical model.}, keywords = {checkpointing, fail-stop errors, Fault tolerance, High-performance computing, Replication, silent errors}, doi = {https://doi.org/10.1016/j.jpdc.2018.08.002}, author = {Anne Benoit and Aurelien Cavelan and Franck Cappello and Padma Raghavan and Yves Robert and Hongyang Sun} } @conference {930, title = {Optimal Resilience Patterns to Cope with Fail-stop and Silent Errors}, booktitle = {2016 IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {This work focuses on resilience techniques at extreme scale. Many papers deal with fail-stop errors. Many others deal with silent errors (or silent data corruptions). But very few papers deal with fail-stop and silent errors simultaneously. However, HPC applications will obviously have to cope with both error sources. This paper presents a unified framework and optimal algorithmic solutions to this double challenge. Silent errors are handled via verification mechanisms (either partially or fully accurate) and in-memory checkpoints. Fail-stop errors are processed via disk checkpoints. All verification and checkpoint types are combined into computational patterns. We provide a unified model, and a full characterization of the optimal pattern. Our results nicely extend several published solutions and demonstrate how to make use of different techniques to solve the double threat of fail-stop and silent errors. Extensive simulations based on real data confirm the accuracy of the model, and show that patterns that combine all resilience mechanisms are required to provide acceptable overheads.}, keywords = {fail-stop errors, multilevel checkpoint, optimal pattern, resilience, silent errors, verification}, doi = {10.1109/IPDPS.2016.39}, author = {Anne Benoit and Aurelien Cavelan and Yves Robert and Hongyang Sun} }