Position
Currently I'm working as a Research Assistant Professor
at the
Innovative Computing Laboratory
at the University of Tennessee, Knoxville. I can be reached at 865-974-9375 or in person in my office (Claxton 316).
ULFM - User Level Failure Mitigation is a set of MPI interface extensions enabling Message Passing programs to restore MPI communication capabilities affected by process failures. It supports rebuilding communicators, RMA windows and I/O Files. No particular recovery model is imposed or favored, instead a set of versatile APIs is included that provides support for different recovery styles. The application directs the recovery, so it can pay for the cost of repairing only the necessary MPI objects. The ULFM specification is a crucial infrastructure to enable the deployment of advanced, production quality fault tolerant techniques; it is a versatile solution to improve the efficiency of novel and established fault tolerant techniques. Look at the flyer.
MPICH-V is a research effort with theoretical studies, experimental evaluations and pragmatic implementations aiming to provide an MPI implementation based on MPICH, featuring multiple fault tolerant protocols. MPICH-V provides an automatic fault tolerant MPI library (i.e. a totally unchanged application linked with the mpich-v library is a fault tolerant application).
PaRSEC - Parallel Runtime Scheduling and Execution Controller -
is a generic framework for architecture aware scheduling and management of micro-tasks on distributed many-core heterogeneous architectures. Applications we consider can be expressed as a Directed Acyclic Graph of tasks with labeled edges designating data dependencies. DAGs are represented in a compact problem-size independent format that can be queried on-demand to discover data dependencies in a totally distributed fashion. PaRSEC assigns computation threads to the cores, overlaps communications and computations and uses a dynamic, fully-distributed scheduler based on architectural features such as NUMA nodes and algorithmic features such as data reuse.
The framework includes libraries, a runtime system, and development tools to help application developers tackle the difficult task of porting their applications to highly heterogeneous and diverse environments.
PaRSEC is the underlying infrastructure for the DPLASMA distributed memory, tile algorithm based linear algebra package.
Kabir, K., Haidar, A., Tomov, S., Bouteiller, A., Dongarra, J. "A Framework for Out of Memory SVD Algorithms," ISC High Performance 2017, Springer International Publishing, Frankfurt, Germany, pp. 158-178, June 19-21, 2017 [pdf] [bibtex] @inproceedings{icl:926,
  author      = {Kabir, K. and Haidar, A. and Tomov, S. and Bouteiller, A. and Dongarra, J.},
  title       = {A Framework for Out of Memory SVD Algorithms},
  booktitle   = {ISC High Performance 2017},
  publisher   = {Springer International Publishing},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  pages       = {158--178},
  address     = {Frankfurt, Germany},
  month       = {June},
  year        = {2017}
}
[
hide]
Kabir, K., Haidar, A., Tomov, S., Bouteiller, A., Dongarra, J. "A Framework for Out of Memory SVD Algorithms," ISC High Performance 2017, Springer International Publishing, Frankfurt, Germany, pp. 158-178, June 19-21, 2017 [pdf] [bibtex] @inproceedings{icl:927,
  author      = {Kabir, K. and Haidar, A. and Tomov, S. and Bouteiller, A. and Dongarra, J.},
  title       = {A Framework for Out of Memory SVD Algorithms},
  booktitle   = {ISC High Performance 2017},
  publisher   = {Springer International Publishing},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  pages       = {158--178},
  address     = {Frankfurt, Germany},
  month       = {June},
  year        = {2017}
}
[
hide]
Herault, T., Bouteiller, A., Bosilca, G., Gamell, M., Teranishi, K., Parashar, M., Dongarra, J. "Practical Scalable Consensus for Pseudo-Synchronous Distributed Systems," Supercomputing, Austin, TX, November, 2015 [pdf] [bibtex] @inproceedings{icl:883,
  author      = {Herault, T. and Bouteiller, A. and Bosilca, G. and Gamell, M. and Teranishi, K. and Parashar, M. and Dongarra, J.},
  title       = {Practical Scalable Consensus for Pseudo-Synchronous Distributed Systems},
  booktitle   = {Supercomputing},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  address     = {Austin, TX},
  month       = {November},
  year        = {2015}
}
[
hide]
Wu, W., Bouteiller, A., Bosilca, G., Faverge, M., Dongarra, J. "Hierarchical DAG scheduling for Hybrid Distributed Systems," 29th IEEE International Parallel & Distributed Processing Symposium (IPDPS), IEEE, Hyderabad, India, May, 2015 [pdf] [bibtex] @inproceedings{icl:837,
  author      = {Wu, W. and Bouteiller, A. and Bosilca, G. and Faverge, M. and Dongarra, J.},
  title       = {Hierarchical DAG scheduling for Hybrid Distributed Systems},
  booktitle   = {29th IEEE International Parallel \& Distributed Processing Symposium (IPDPS)},
  publisher   = {IEEE},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  address     = {Hyderabad, India},
  month       = {May},
  year        = {2015}
}
[
hide]
Herault, T., Bouteiller, A., Bosilca, G., Gamell, M., Teranishi, K., Parashar, M., Dongarra, J. "Practical Scalable Consensus for Pseudo-Synchronous Distributed Systems: Formal Proof," University of Tennessee Computer Science Technical Report, ICL-UT-15-01, April, 2015 [pdf] [bibtex] @techreport{icl:865,
  author      = {Herault, T. and Bouteiller, A. and Bosilca, G. and Gamell, M. and Teranishi, K. and Parashar, M. and Dongarra, J.},
  title       = {Practical Scalable Consensus for Pseudo-Synchronous Distributed Systems: Formal Proof},
  type        = {University of Tennessee Computer Science Technical Report},
  number      = {ICL-UT-15-01},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  month       = {April},
  year        = {2015}
}
[
hide]
Danalis, A., Bosilca, G., Bouteiller, A., Herault, T., Dongarra, J. "PTG: an abstraction for unhindered parallelism," Proceedings of the International Workshop on Domain-Specific Languages and High-Level Frameworks for High Performance Computing (WOLFHPC), IEEE Press, New Orleans, Louisiana, Nov 17, 2014 [pdf] [bibtex] @inproceedings{icl:864,
  author      = {Danalis, A. and Bosilca, G. and Bouteiller, A. and Herault, T. and Dongarra, J.},
  title       = {PTG: an abstraction for unhindered parallelism},
  booktitle   = {Proceedings of the International Workshop on Domain-Specific Languages and High-Level Frameworks for High Performance Computing (WOLFHPC)},
  publisher   = {IEEE Press},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  address     = {New Orleans, Louisiana},
  month       = {November},
  year        = {2014}
}
[
hide]
Bosilca, G., Bouteiller, A., Herault, T., Robert, Y., Dongarra, J. "Assessing the Impact of ABFT and Checkpoint Composite Strategies," 16th Workshop on Advances in Parallel and Distributed Computational Models, IPDPS 2014, IEEE, Phoenix, AZ, May, 2014 [pdf] [bibtex] @inproceedings{icl:780,
  author      = {Bosilca, G. and Bouteiller, A. and Herault, T. and Robert, Y. and Dongarra, J.},
  title       = {Assessing the Impact of ABFT and Checkpoint Composite Strategies},
  booktitle   = {16th Workshop on Advances in Parallel and Distributed Computational Models, IPDPS 2014},
  year        = {2014},
  month       = {May},
  address     = {Phoenix, AZ},
  institution = {Innovative Computing Laboratory, University of Tennessee}
}
[
hide]
Bosilca, G., Bouteiller, A., Brunet, E., Cappello, F., Dongarra, J., Guermouche, A., Herault, T., Robert, Y., Vivien, F., Zaidouni, D. "Unified Model for Assessing Checkpointing Protocols at Extreme-Scale," Concurrency and Computation: Practice and Experience, John Wiley & Sons, Ltd., November, 2013 [pdf] [bibtex] @article{icl:785,
  author      = {Bosilca, G. and Bouteiller, A. and Brunet, E. and Cappello, F. and Dongarra, J. and Guermouche, A. and Herault, T. and Robert, Y. and Vivien, F. and Zaidouni, D.},
  title       = {Unified Model for Assessing Checkpointing Protocols at Extreme-Scale},
  journal     = {Concurrency and Computation: Practice and Experience},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  month       = {November},
  year        = {2013}
}
[
hide]
Bosilca, G., Bouteiller, A., Danalis, A., Faverge, M., Herault, T., Dongarra, J. "PaRSEC: Exploiting Heterogeneity to Enhance Scalability," IEEE Computing in Science and Engineering, Vol. 15, No. 6, 36-45, November, 2013 [pdf] [bibtex] @article{icl:786,
  author      = {Bosilca, G. and Bouteiller, A. and Danalis, A. and Faverge, M. and Herault, T. and Dongarra, J.},
  title       = {PaRSEC: Exploiting Heterogeneity to Enhance Scalability},
  journal     = {IEEE Computing in Science and Engineering},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  volume      = {15},
  number      = {6},
  pages       = {36--45},
  month       = {November},
  year        = {2013}
}
[
hide]
Bosilca, G., Bouteiller, A., Herault, T., Robert, Y., and Jack Dongarra "Assessing the impact of {ABFT} and Checkpoint composite strategies," University of Tennessee Computer Science Technical Report, ICL-UT-13-03, September, 2013 [pdf] [bibtex] @techreport{icl:757,
  author      = {Bosilca, G. and Bouteiller, A. and Herault, T. and Robert, Y. and Dongarra, Jack},
  title       = {Assessing the impact of {ABFT} and Checkpoint composite strategies},
  type        = {University of Tennessee Computer Science Technical Report},
  number      = {ICL-UT-13-03},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  month       = {September},
  year        = {2013}
}
[
hide]
Bland, W., Du, P., Bouteiller, A., Herault, T., Bosilca, G., Dongarra, J. "Extending the Scope of the Checkpoint-on-Failure Protocol for Forward Recovery in Standard MPI," Concurrency and Computation: Practice and Experience, July, 2013 [pdf] [bibtex] @article{icl:755,
  author      = {Bland, W. and Du, P. and Bouteiller, A. and Herault, T. and Bosilca, G. and Dongarra, J.},
  title       = {Extending the Scope of the Checkpoint-on-Failure Protocol for Forward Recovery in Standard MPI},
  journal     = {Concurrency and Computation: Practice and Experience},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  month       = {July},
  year        = {2013}
}
[
hide]
Bland, W., Bouteiller, A., Herault, T., Bosilca, G., Dongarra, J. "Post-failure recovery of MPI communication capability: Design and Rationale," International Journal of High Performance Computing Applications, June, 2013 [pdf] [bibtex] @article{icl:756,
  author      = {Bland, W. and Bouteiller, A. and Herault, T. and Bosilca, G. and Dongarra, J.},
  title       = {Post-failure recovery of MPI communication capability: Design and Rationale},
  journal     = {International Journal of High Performance Computing Applications},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  month       = {June},
  year        = {2013}
}
[
hide]
Bland, W., Bouteiller, A., Herault, T., Hursey, J., Bosilca, G., Dongarra, J.J. "An evaluation of User-Level Failure Mitigation support in MPI," Computing, Springer, Vienna, DOI 10.1007/s00607-013-0331-3, 1-14, May, 2013 [pdf] [bibtex] @article{icl:744,
  author      = {Bland, W. and Bouteiller, A. and Herault, T. and Hursey, J. and Bosilca, G. and Dongarra, J. J.},
  title       = {An evaluation of User-Level Failure Mitigation support in MPI},
  journal     = {Computing},
  publisher   = {Springer},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  doi         = {10.1007/s00607-013-0331-3},
  pages       = {1--14},
  address     = {Vienna},
  month       = {May},
  year        = {2013}
}
[
hide]
Bouteiller, A., Cappello, F., Dongarra, J., Guermouche, A., Herault, T., and Robert, Y. "Multi-criteria checkpointing strategies: optimizing response-time versus resource utilization," University of Tennessee Computer Science Technical Report, ICL-UT-13-01, February 15, 2013 [pdf] [bibtex] @techreport{icl:733,
  author      = {Bouteiller, A. and Cappello, F. and Dongarra, J. and Guermouche, A. and Herault, T. and Robert, Y.},
  title       = {Multi-criteria checkpointing strategies: optimizing response-time versus resource utilization},
  type        = {University of Tennessee Computer Science Technical Report},
  number      = {ICL-UT-13-01},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  month       = {February},
  year        = {2013}
}
[
hide]
Ma, T., Bosilca, G., Bouteiller, A., Dongarra, J. "Kernel-assisted and topology-aware MPI collective communications on multi-core/many-core platforms," Journal of Parallel and Distributed Computing, accepted, January, 2013 [pdf] [bibtex] @article{icl:734,
  author      = {Ma, T. and Bosilca, G. and Bouteiller, A. and Dongarra, J.},
  title       = {Kernel-assisted and topology-aware MPI collective communications on multi-core/many-core platforms},
  journal     = {Journal of Parallel and Distributed Computing},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  note        = {Accepted},
  month       = {January},
  year        = {2013}
}
[
hide]
Bosilca, G., Bouteiller, A., Danalis, A., Herault, T., Kurzak, J., Luszczek, P., Tomov, S., and J. Dongarra "Scalable Dense Linear Algebra on Heterogeneous Hardware," HPC: Transition Towards Exascale Processing, in the series Advances in Parallel Computing, IOS Press, 2013 [pdf] [bibtex] @incollection{icl:758,
  author      = {Bosilca, G. and Bouteiller, A. and Danalis, A. and Herault, T. and Kurzak, J. and Luszczek, P. and Tomov, S. and Dongarra, J.},
  title       = {Scalable Dense Linear Algebra on Heterogeneous Hardware},
  booktitle   = {HPC: Transition Towards Exascale Processing},
  series      = {Advances in Parallel Computing},
  publisher   = {IOS Press},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  year        = {2013}
}
[
hide]
Bouteiller, A., Herault, T., Bosilca, G., Dongarra, J. "Correlated Set Coordination in Fault Tolerant Message Logging Protocols," Concurrency and Computation: Practice and Experience, Vol. 25, No. 4, pp. 572-585, 2013 [pdf] [bibtex] @article{icl:787,
  author      = {Bouteiller, A. and Herault, T. and Bosilca, G. and Dongarra, J.},
  title       = {Correlated Set Coordination in Fault Tolerant Message Logging Protocols},
  journal     = {Concurrency and Computation: Practice and Experience},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  volume      = {25},
  number      = {4},
  pages       = {572--585},
  year        = {2013}
}
[
hide]
Bland, W., Bouteiller, A., Herault, T., Hursey, J., Bosilca, G., Dongarra, J. "An Evaluation of User-Level Failure Mitigation Support in MPI," Proceedings of Recent Advances in Message Passing Interface - 19th European MPI Users' Group Meeting, EuroMPI 2012, Springer, Vienna, Austria, September 23 - 26, 2012 [pdf] [bibtex] @inproceedings{icl:680,
  author      = {Bland, W. and Bouteiller, A. and Herault, T. and Hursey, J. and Bosilca, G. and Dongarra, J.},
  title       = {An Evaluation of User-Level Failure Mitigation Support in MPI},
  booktitle   = {Proceedings of Recent Advances in Message Passing Interface - 19th European MPI Users' Group Meeting, EuroMPI 2012},
  year        = {2012},
  month       = {September},
  address     = {Vienna, Austria},
  institution = {Innovative Computing Laboratory, University of Tennessee}
}
[
hide]
Bland, W., Du, P., Bouteiller, A., Herault, T., Bosilca, G., Dongarra, J. "A Checkpoint-on-Failure Protocol for Algorithm-Based Recovery in Standard MPI," 18th International European Conference on Parallel and Distributed Computing (Euro-Par 2012) (Best Paper Award), Christos Kaklamanis, Theodore Papatheodorou and Paul Spirakis eds. Springer-Verlag, Rhodes, Greece, August 27-31, 2012 [pdf] [bibtex] @inproceedings{icl:679,
  author      = {Bland, W. and Du, P. and Bouteiller, A. and Herault, T. and Bosilca, G. and Dongarra, J.},
  title       = {A Checkpoint-on-Failure Protocol for Algorithm-Based Recovery in Standard MPI},
  booktitle   = {18th International European Conference on Parallel and Distributed Computing (Euro-Par 2012)},
  editor      = {Kaklamanis, Christos and Papatheodorou, Theodore and Spirakis, Paul},
  publisher   = {Springer-Verlag},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  address     = {Rhodes, Greece},
  month       = {August},
  year        = {2012},
  note        = {Best Paper Award}
}
[
hide]
Bosilca, G., Bouteiller, A., Brunet, E., Cappello, F., Dongarra, J., Guermouche, A., Herault, T., Robert, Y., Vivien, F., Zaidouni, D. "Unified Model for Assessing Checkpointing Protocols at Extreme-Scale," University of Tennessee Computer Science Technical Report (also LAWN 269), UT-CS-12-697, June, 2012 [pdf] [bibtex] @techreport{icl:716,
  author      = {Bosilca, G. and Bouteiller, A. and Brunet, E. and Cappello, F. and Dongarra, J. and Guermouche, A. and Herault, T. and Robert, Y. and Vivien, F. and Zaidouni, D.},
  title       = {Unified Model for Assessing Checkpointing Protocols at Extreme-Scale},
  type        = {University of Tennessee Computer Science Technical Report},
  number      = {UT-CS-12-697},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  month       = {June},
  year        = {2012},
  note        = {Also LAWN 269}
}
[
hide]
Ma, T., Bosilca, G., Bouteiller, A., Dongarra, J. "HierKNEM: An Adaptive Framework for Kernel-Assisted and Topology-Aware Collective Communications on Many-core Clusters," IPDPS 2012 (Best Paper), Shanghai, China, May, 2012 [pdf] [bibtex] @inproceedings{icl:700,
  author      = {Ma, T. and Bosilca, G. and Bouteiller, A. and Dongarra, J.},
  title       = {HierKNEM: An Adaptive Framework for Kernel-Assisted and Topology-Aware Collective Communications on Many-core Clusters},
  booktitle   = {IPDPS 2012},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  address     = {Shanghai, China},
  month       = {May},
  year        = {2012},
  note        = {Best Paper}
}
[
hide]
Bouteiller, A., Herault, T., Bosilca, G., Dongarra, J. "Correlated Set Coordination in Fault Tolerant Message Logging Protocols," Concurrency and Computation: Practice and Experience (accepted), March, 2012 [bibtex] @article{icl:720,
  author      = {Bouteiller, A. and Herault, T. and Bosilca, G. and Dongarra, J.},
  title       = {Correlated Set Coordination in Fault Tolerant Message Logging Protocols},
  journal     = {Concurrency and Computation: Practice and Experience},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  note        = {Accepted},
  month       = {March},
  year        = {2012}
}
[
hide]
Du, P., Bouteiller, A., Bosilca, G., Herault, T., Dongarra, J. "Algorithm-Based Fault Tolerance for Dense Matrix Factorization," Proceedings of the 17th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPOPP 2012, J. Ramanujam, P. Sadayappan eds. ACM, New Orleans, LA, USA, 225-234, February 25-29, 2012 [pdf] [bibtex] @inproceedings{icl:672,
  author      = {Du, P. and Bouteiller, A. and Bosilca, G. and Herault, T. and Dongarra, J.},
  title       = {Algorithm-Based Fault Tolerance for Dense Matrix Factorization},
  booktitle   = {Proceedings of the 17th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPOPP 2012},
  editor      = {Ramanujam, J. and Sadayappan, P.},
  publisher   = {ACM},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  pages       = {225--234},
  address     = {New Orleans, LA, USA},
  month       = {February},
  year        = {2012}
}
[
hide]
Bland, W., Bosilca, G., Bouteiller, A., Herault, T., Dongarra, J. "A Proposal for User-Level Failure Mitigation in the MPI-3 Standard," University of Tennessee Electrical Engineering and Computer Science Technical Report, ut-cs-12-693, February 24, 2012 [pdf] [bibtex] @techreport{icl:667,
  author      = {Bland, W. and Bosilca, G. and Bouteiller, A. and Herault, T. and Dongarra, J.},
  title       = {A Proposal for User-Level Failure Mitigation in the MPI-3 Standard},
  type        = {University of Tennessee Electrical Engineering and Computer Science Technical Report},
  number      = {ut-cs-12-693},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  month       = {February},
  year        = {2012}
}
[
hide]
Danalis, A., Bouteiller, A., Bosilca, G., Dongarra, J., Herault, T. "From Serial Loops to Parallel Execution on Distributed Systems," PPoPP 2012 (submitted), New Orleans, LA, February, 2012 [pdf] [bibtex] @inproceedings{icl:699,
  author      = {Danalis, A. and Bouteiller, A. and Bosilca, G. and Dongarra, J. and Herault, T.},
  title       = {From Serial Loops to Parallel Execution on Distributed Systems},
  booktitle   = {PPoPP 2012},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  address     = {New Orleans, LA},
  month       = {February},
  year        = {2012},
  note        = {Submitted}
}
[
hide]
Bosilca, G., Bouteiller, A., Danalis, A., Herault, T., Luszczek, P., Dongarra, J. "Dense Linear Algebra on Distributed Heterogeneous Hardware with a Symbolic DAG Approach," Scalable Computing and Communications: Theory and Practice, Khan, S., Wang, L., Zomaya, A. eds. John Wiley & Sons, 699-735, March, 2013 [bibtex] @incollection{icl:698,
  author      = {Bosilca, G. and Bouteiller, A. and Danalis, A. and Herault, T. and Luszczek, P. and Dongarra, J.},
  title       = {Dense Linear Algebra on Distributed Heterogeneous Hardware with a Symbolic DAG Approach},
  booktitle   = {Scalable Computing and Communications: Theory and Practice},
  editor      = {Khan, S. and Wang, L. and Zomaya, A.},
  publisher   = {John Wiley \& Sons},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  pages       = {699--735},
  month       = {March},
  year        = {2013}
}
[
hide]
Bosilca, G., Bouteiller, A., Danalis, A., Herault, T., Lemarinier, P., Dongarra, J. "DAGuE: A generic distributed DAG Engine for High Performance Computing.," Parallel Computing, T. Hoefler eds. Elsevier, Vol. 38, No 1-2, 27-51, 2012 [pdf] [bibtex] @article{icl:670,
  author      = {Bosilca, G. and Bouteiller, A. and Danalis, A. and Herault, T. and Lemarinier, P. and Dongarra, J.},
  title       = {DAGuE: A generic distributed DAG Engine for High Performance Computing},
  journal     = {Parallel Computing},
  publisher   = {Elsevier},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  volume      = {38},
  number      = {1--2},
  pages       = {27--51},
  year        = {2012}
}
[
hide]
Bland, W., Du, P., Bouteiller, A., Herault, T., Bosilca, G., Dongarra, J. "Extending the Scope of the Checkpoint-on-Failure Protocol for Forward Recovery in Standard MPI," University of Tennessee Computer Science Technical Report, ut-cs-12-702, 2012 [pdf] [bibtex] @techreport{icl:724,
  author      = {Bland, W. and Du, P. and Bouteiller, A. and Herault, T. and Bosilca, G. and Dongarra, J.},
  title       = {Extending the Scope of the Checkpoint-on-Failure Protocol for Forward Recovery in Standard MPI},
  type        = {University of Tennessee Computer Science Technical Report},
  number      = {ut-cs-12-702},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  year        = {2012}
}
[
hide]
Ma, T., Bouteiller, A., Bosilca, G., Dongarra, J. "Impact of Kernel-Assisted MPI Communication over Scientific Applications: CPMD and FFTW," 18th EuroMPI, Cotronis, Y., Danalis, A., Nikolopoulos, D., Dongarra, J. eds. Springer, Santorini, Greece, pp. 247-254, September, 2011 [bibtex] @inproceedings{icl:646,
  author      = {Ma, T. and Bouteiller, A. and Bosilca, G. and Dongarra, J.},
  title       = {Impact of Kernel-Assisted MPI Communication over Scientific Applications: CPMD and FFTW},
  booktitle   = {18th EuroMPI},
  editor      = {Cotronis, Y. and Danalis, A. and Nikolopoulos, D. and Dongarra, J.},
  publisher   = {Springer},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  pages       = {247--254},
  address     = {Santorini, Greece},
  month       = {September},
  year        = {2011}
}
[
hide]
Ma, T., Bosilca, G., Bouteiller, A., Goglin, B., Squyres, J., Dongarra, J. "Kernel Assisted Collective Intra-node MPI Communication Among Multi-core and Many-core CPUs," Int'l Conference on Parallel Processing (ICPP '11), Taipei, Taiwan, September, 2011 [bibtex] @inproceedings{icl:649,
  author      = {Ma, T. and Bosilca, G. and Bouteiller, A. and Goglin, B. and Squyres, J. and Dongarra, J.},
  title       = {Kernel Assisted Collective Intra-node MPI Communication Among Multi-core and Many-core CPUs},
  booktitle   = {Int'l Conference on Parallel Processing (ICPP '11)},
  year        = {2011},
  month       = {September},
  address     = {Taipei, Taiwan},
  institution = {Innovative Computing Laboratory, University of Tennessee}
}
[
hide]
Bouteiller, A., Herault, T., Bosilca, G., Dongarra, J. "Correlated Set Coordination in Fault Tolerant Message Logging Protocols," Proceedings of 17th International Conference, Euro-Par 2011, Part II, Emmanuel Jeannot, Raymond Namyst, Jean Roman eds. Springer, Bordeaux, France, LNCS Vol. 6853, 51-64, August 29 - September 2, 2011 [pdf] [bibtex] @inproceedings{icl:673,
  author      = {Bouteiller, A. and Herault, T. and Bosilca, G. and Dongarra, J.},
  title       = {Correlated Set Coordination in Fault Tolerant Message Logging Protocols},
  booktitle   = {Proceedings of 17th International Conference, Euro-Par 2011, Part II},
  editor      = {Jeannot, Emmanuel and Namyst, Raymond and Roman, Jean},
  publisher   = {Springer},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  series      = {LNCS},
  volume      = {6853},
  pages       = {51--64},
  address     = {Bordeaux, France},
  month       = {August},
  year        = {2011}
}
[
hide]
Du, P., Bouteiller, A., Bosilca, G., Herault, T., Dongarra, J. "Algorithm-based Fault Tolerance for Dense Matrix Factorizations," University of Tennessee Computer Science Technical Report, Knoxville, TN, UT-CS-11-676, August 05, 2011 [pdf] [bibtex] @techreport{icl:626,
  author      = {Du, P. and Bouteiller, A. and Bosilca, G. and Herault, T. and Dongarra, J.},
  title       = {Algorithm-based Fault Tolerance for Dense Matrix Factorizations},
  type        = {University of Tennessee Computer Science Technical Report},
  number      = {UT-CS-11-676},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  address     = {Knoxville, TN},
  month       = {August},
  year        = {2011}
}
[
hide]
Bosilca, G., Bouteiller, A., Herault, T., Lemarinier, P., Saengpatsa, N., Tomov, S., Dongarra, J. "Performance Portability of a GPU Enabled Factorization with the DAGuE Framework," IEEE Cluster: workshop on Parallel Programming on Accelerator Clusters (PPAC), June 24, 2011 [pdf] [bibtex] @inproceedings{icl:636,
  author      = {Bosilca, G. and Bouteiller, A. and Herault, T. and Lemarinier, P. and Saengpatsa, N. and Tomov, S. and Dongarra, J.},
  title       = {Performance Portability of a GPU Enabled Factorization with the DAGuE Framework},
  booktitle   = {IEEE Cluster: workshop on Parallel Programming on Accelerator Clusters (PPAC)},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  month       = {June},
  year        = {2011}
}
[
hide]
Bosilca, G., Bouteiller, A., Herault, T., Lemarinier, P., Saengpatsa, N., Tomov, S., Dongarra, J. "A Unified HPC Environment for Hybrid Manycore/GPU Distributed Systems," IEEE International Parallel and Distributed Processing Symposium (submitted), Anchorage, AK, May 16-20, 2011 [bibtex] @inproceedings{icl:593,
  author      = {Bosilca, G. and Bouteiller, A. and Herault, T. and Lemarinier, P. and Saengpatsa, N. and Tomov, S. and Dongarra, J.},
  title       = {A Unified HPC Environment for Hybrid Manycore/GPU Distributed Systems},
  booktitle   = {IEEE International Parallel and Distributed Processing Symposium (submitted)},
  year        = {2011},
  month       = {May},
  address     = {Anchorage, AK},
  institution = {Innovative Computing Laboratory, University of Tennessee}
}
[
hide]
Bosilca, G., Bouteiller, A., Danalis, A., Herault, T., Lemarinier, P., Dongarra, J. "DAGuE: A Generic Distributed DAG Engine for High Performance Computing," Proceedings of the Workshops of the 25th IEEE International Symposium on Parallel and Distributed Processing (IPDPS 2011 Workshops), IEEE, Anchorage, Alaska, USA, 1151-1158, 16-20 May, 2011 [bibtex] @inproceedings{icl:675,
  author      = {Bosilca, G. and Bouteiller, A. and Danalis, A. and Herault, T. and Lemarinier, P. and Dongarra, J.},
  title       = {DAGuE: A Generic Distributed DAG Engine for High Performance Computing},
  booktitle   = {Proceedings of the Workshops of the 25th IEEE International Symposium on Parallel and Distributed Processing (IPDPS 2011 Workshops)},
  publisher   = {IEEE},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  pages       = {1151--1158},
  address     = {Anchorage, Alaska, USA},
  month       = {May},
  year        = {2011}
}
[
hide]
Bosilca, G., Bouteiller, A., Danalis, A., Faverge, M., Haidar, A., Herault, T., Kurzak, J., Langou, J., Lemarinier, P., Ltaief, H., Luszczek, P., YarKhan, A., Dongarra, J. "Flexible Development of Dense Linear Algebra Algorithms on Massively Parallel Architectures with DPLASMA," Proceedings of the Workshops of the 25th IEEE International Symposium on Parallel and Distributed Processing (IPDPS 2011 Workshops), IEEE, Anchorage, Alaska, USA, 1432-1441, 16-20 May, 2011 [pdf] [bibtex] @inproceedings{icl:676,
  author      = {Bosilca, G. and Bouteiller, A. and Danalis, A. and Faverge, M. and Haidar, A. and Herault, T. and Kurzak, J. and Langou, J. and Lemarinier, P. and Ltaief, H. and Luszczek, P. and YarKhan, A. and Dongarra, J.},
  title       = {Flexible Development of Dense Linear Algebra Algorithms on Massively Parallel Architectures with DPLASMA},
  booktitle   = {Proceedings of the Workshops of the 25th IEEE International Symposium on Parallel and Distributed Processing (IPDPS 2011 Workshops)},
  publisher   = {IEEE},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  pages       = {1432--1441},
  address     = {Anchorage, Alaska, USA},
  month       = {May},
  year        = {2011}
}
[
hide]
Ma, T., Bosilca, G., Bouteiller, A., Goglin, B., Squyres, J., Dongarra, J. "Kernel Assisted Collective Intra-node Communication Among Multicore and Manycore CPUs," University of Tennessee Computer Science Technical Report, UT-CS-10-663, November, 2010 [pdf] [bibtex] @techreport{icl:597,
  author      = {Ma, T. and Bosilca, G. and Bouteiller, A. and Goglin, B. and Squyres, J. and Dongarra, J.},
  title       = {Kernel Assisted Collective Intra-node Communication Among Multicore and Manycore CPUs},
  type        = {University of Tennessee Computer Science Technical Report},
  number      = {UT-CS-10-663},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  month       = {November},
  year        = {2010}
}
[
hide]
Bosilca, G., Bouteiller, A., Danalis, A., Faverge, M., Haidar, A., Herault, T., Kurzak, J., Langou, J., Lemarinier, P., Ltaief, H., Luszczek, P., YarKhan, A., Dongarra, J. "Distributed Dense Numerical Linear Algebra Algorithms on Massively Parallel Architectures: DPLASMA," University of Tennessee Computer Science Technical Report, UT-CS-10-660, Sept. 15, 2010 [pdf] [bibtex] @techreport{icl:563,
  author      = {Bosilca, G. and Bouteiller, A. and Danalis, A. and Faverge, M. and Haidar, A. and Herault, T. and Kurzak, J. and Langou, J. and Lemarinier, P. and Ltaief, H. and Luszczek, P. and YarKhan, A. and Dongarra, J.},
  title       = {Distributed Dense Numerical Linear Algebra Algorithms on Massively Parallel Architectures: DPLASMA},
  type        = {University of Tennessee Computer Science Technical Report},
  number      = {UT-CS-10-660},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  month       = {September},
  year        = {2010}
}
[
hide]
Ma, T., Bouteiller, A., Bosilca, G., Dongarra, J. "Locality and Topology aware Intra-node Communication Among Multicore CPUs," Proceedings of the 17th EuroMPI conference, LNCS, Stuttgart, Germany, September, 2010 [pdf] [bibtex] @inproceedings{icl:535,
  author      = {Ma, T. and Bouteiller, A. and Bosilca, G. and Dongarra, J.},
  title       = {Locality and Topology aware Intra-node Communication Among Multicore CPUs},
  booktitle   = {Proceedings of the 17th EuroMPI conference},
  year        = {2010},
  month       = {September},
  address     = {Stuttgart, Germany},
  institution = {Innovative Computing Laboratory, University of Tennessee}
}
[
hide]
Bosilca, G., Bouteiller, A., Herault, T., Lemarinier, P., Dongarra, J. "Dodging the Cost of Unavoidable Memory Copies in Message Logging Protocols," Proceedings of EuroMPI 2010, Jack Dongarra, Michael Resch, Rainer Keller, Edgar Gabriel, eds. Springer, Stuttgart, Germany, September, 2010 [pdf] [bibtex] @inproceedings{icl:534,
  author      = {Bosilca, G. and Bouteiller, A. and Herault, T. and Lemarinier, P. and Dongarra, J.},
  title       = {Dodging the Cost of Unavoidable Memory Copies in Message Logging Protocols},
  booktitle   = {Proceedings of EuroMPI 2010},
  editor      = {Dongarra, Jack and Resch, Michael and Keller, Rainer and Gabriel, Edgar},
  publisher   = {Springer},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  address     = {Stuttgart, Germany},
  month       = {September},
  year        = {2010}
}
[
hide]
Bouteiller, A., Bosilca, G., Dongarra, J. "Redesigning the Message Logging Model for High Performance," Concurrency and Computation: Practice and Experience (online version), June 27, 2010 [pdf] [bibtex] @article{icl:565,
  author      = {Bouteiller, A. and Bosilca, G. and Dongarra, J.},
  title       = {Redesigning the Message Logging Model for High Performance},
  journal     = {Concurrency and Computation: Practice and Experience},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  note        = {Online version},
  month       = {June},
  year        = {2010}
}
[
hide]
Bosilca, G., Bouteiller, A., Danalis, A., Herault, T., Lemarinier, P., Dongarra, J. "DAGuE: A generic distributed DAG engine for high performance computing," Innovative Computing Laboratory Technical Report, ICL-UT-10-01, April 11, 2010 [pdf] [bibtex] @techreport{icl:528,
  author      = {Bosilca, G. and Bouteiller, A. and Danalis, A. and Herault, T. and Lemarinier, P. and Dongarra, J.},
  title       = {DAGuE: A generic distributed DAG engine for high performance computing},
  type        = {Innovative Computing Laboratory Technical Report},
  number      = {ICL-UT-10-01},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  month       = {April},
  year        = {2010}
}
[
hide]
Bosilca, G., Bouteiller, A., Danalis, A., Faverge, M., Haidar, A., Herault, T., Kurzak, J., Langou, J., Lemarinier, P., Ltaief, H., Luszczek, P., YarKhan, A., Dongarra, J. "Distributed-Memory Task Execution and Dependence Tracking within DAGuE and the DPLASMA Project," Innovative Computing Laboratory Technical Report, ICL-UT-10-02, 2010 [pdf] [bibtex] @techreport{icl:529,
  author      = {Bosilca, G. and Bouteiller, A. and Danalis, A. and Faverge, M. and Haidar, A. and Herault, T. and Kurzak, J. and Langou, J. and Lemarinier, P. and Ltaief, H. and Luszczek, P. and YarKhan, A. and Dongarra, J.},
  title       = {Distributed-Memory Task Execution and Dependence Tracking within DAGuE and the DPLASMA Project},
  type        = {Innovative Computing Laboratory Technical Report},
  number      = {ICL-UT-10-02},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  year        = {2010}
}
[
hide]
Bouteiller, A., Ropars, T., Bosilca, G., Morin, C., Dongarra, J. "Reasons for a pessimistic or optimistic message logging protocol in MPI uncoordinated failure, recovery," Cluster Computing and Workshops, 2009. CLUSTER '09. IEEE International Conference on, IEEE, New Orleans, LA, 1-9, August, 2009 [pdf] [bibtex] @inproceedings{icl:863,
  author      = {Bouteiller, A. and Ropars, T. and Bosilca, G. and Morin, C. and Dongarra, J.},
  title       = {Reasons for a pessimistic or optimistic message logging protocol in MPI uncoordinated failure, recovery},
  booktitle   = {Cluster Computing and Workshops, 2009. CLUSTER '09. IEEE International Conference on},
  publisher   = {IEEE},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  pages       = {1--9},
  address     = {New Orleans, LA},
  month       = {August},
  year        = {2009}
}
[
hide]
Bouteiller, A., Desprez, F. "Fault Tolerance Management for a Hierarchical GridRPC Middleware," 8th IEEE International Symposium on Cluster Computing and the Grid (CCGrid 2008), Lyon, France, May 19-22, 2008 [pdf] [bibtex] @inproceedings{icl:428,
  author      = {Bouteiller, A. and Desprez, F.},
  title       = {Fault Tolerance Management for a Hierarchical GridRPC Middleware},
  booktitle   = {8th IEEE International Symposium on Cluster Computing and the Grid (CCGrid 2008)},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  address     = {Lyon, France},
  month       = {May},
  year        = {2008}
}
[
hide]
Bouteiller, A., Bosilca, G., Dongarra, J. "Redesigning the Message Logging Model for High Performance," International Supercomputer Conference (ISC 2008), Dresden, Germany, June 17, 2008 [pdf] [bibtex] @inproceedings{icl:456,
  author      = {Bouteiller, A. and Bosilca, G. and Dongarra, J.},
  title       = {Redesigning the Message Logging Model for High Performance},
  booktitle   = {International Supercomputer Conference (ISC 2008)},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  address     = {Dresden, Germany},
  month       = {June},
  year        = {2008}
}
[
hide]
Bouteiller, A., Bosilca, G., Dongarra, J. "Retrospect: Deterministic Replay of MPI Applications for Interactive Distributed Debugging," Accepted for Euro PVM/MPI 2007, Springer, September, 2007 [bibtex] @inproceedings{icl:353,
  author      = {Bouteiller, A. and Bosilca, G. and Dongarra, J.},
  title       = {Retrospect: Deterministic Replay of MPI Applications for Interactive Distributed Debugging},
  booktitle   = {Euro PVM/MPI 2007},
  publisher   = {Springer},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  month       = {September},
  year        = {2007},
  note        = {Accepted}
}
[
hide]