% Conference paper (PDCAT 2010) presenting the MoLOToF model and the FT-SPMD framework.
% Only acronyms are brace-protected in title/booktitle so styles can apply their own casing.
% NOTE(review): `address` holds the conference venue (Wuhan), not the publisher's city.
@InProceedings{Supelec646,
  author    = {Makassikis, Constantinos and Galtier, Virginie and Vialle, Stephane},
  title     = {A Skeletal-Based Approach for the Development of Fault-Tolerant {SPMD} Applications},
  booktitle = {Proceedings of the 11th International Conference on Parallel and Distributed Computing, Applications and Technologies ({PDCAT})},
  publisher = {IEEE Computer Society},
  address   = {Wuhan, China},
  year      = {2010},
  month     = dec,
  pages     = {239--248},
  isbn      = {978-1-4244-9110-0},
  doi       = {10.1109/PDCAT.2010.89},
  url       = {https://doi.org/10.1109/PDCAT.2010.89},
  abstract  = {Distributing applications over PC clusters to speedup or size-up the execution is now commonplace. Yet efficiently tolerating faults of these systems is a major issue. To ease the addition of checkpoint-based fault tolerance at the application level, we introduce a Model for Low-Overhead Tolerance of Faults (MoLOToF) which is based on structuring applications using fault-tolerant skeletons. MoLOToF also encourages collaborations with the programmer and the execution environment. The skeletons are adapted to specific parallelization paradigms and yield what can be called fault-tolerant algorithmic skeletons. The application of MoLOToF to the SPMD parallelization paradigm results in our proposed FT-SPMD framework. Experiments show that the complexity for developing an application is small and the use of the framework has a small impact on performance. Comparisons with existing system-level checkpoint solutions, namely LAM/MPI and DMTCP, point out that FT-SPMD has a lower runtime overhead while being more robust when a higher level of fault tolerance is required.},
}