I am currently an alumnus of the Innovative Computing Laboratory at the
University of Tennessee, where I work on different projects related to
dense linear algebra for hybrid architectures. Before that, I did my PhD
jointly in the Runtime and ScAlApplix teams at INRIA Bordeaux Sud-Ouest on
dynamic scheduling for sparse direct solvers. I defended my thesis under
the direction of Raymond Namyst and Jean Roman in December 2009. In
connection with this work, I also work on different projects to analyze
complex applications on distributed architectures.
PLASMA
The Parallel Linear Algebra for Scalable Multi-core Architectures
(PLASMA) project aims to address the critical and highly disruptive
situation that is facing the Linear Algebra and High Performance
Computing community due to the introduction of multi-core
architectures.
MAGMA
The MAGMA project aims to develop a dense linear algebra library
similar to LAPACK but for heterogeneous/hybrid architectures, starting
with current "Multicore+GPU" systems.
DPLASMA
DAGuE aims at enabling scientific computing on large scale distributed
environments featuring many cores, accelerators and high speed
networks. The framework includes libraries, a runtime system, and
development tools to help application developers tackle the difficult
task of porting their applications to highly heterogeneous and diverse
environments. DPLASMA is a dense linear algebra library based on this
runtime system for distributed and hybrid architectures.
Tile-MAGMA
Tile-MAGMA is an extension to the MAGMA project. It aims to provide a
dense linear algebra library for multi-core nodes enhanced with
multiple GPUs. This project relies on the StarPU
runtime system, the GPU kernels from MAGMA, and the CPU kernels and
algorithms from PLASMA.
Sparse Linear Algebra
PaStiX
The Parallel Sparse matriX package is a scientific library that
provides a high performance MPI/thread solver for very large sparse
linear systems based on direct and block ILU(k) iterative methods.
Murge
Murge is an interface definition for sparse solvers created by
developers from the HIPS and PaStiX projects to provide a common
interface to both solvers. It aims to be as simple as PETSc without
the overhead introduced by the internal structures.
Others
ViTE
ViTE is a tool to visualize execution traces in Paje or OTF format to
help users debug and/or profile parallel applications. It is
open-source software licensed under CeCILL-A.
EZTrace
EZTrace is a tool that aims at automatically generating execution
traces from HPC (High Performance Computing) programs. It generates
execution trace files that can be interpreted by visualization tools
such as ViTE.
M. Faverge.
"Ordonnancement hybride statique-dynamique en algèbre linéaire creuse pour de grands clusters de machines NUMA et multi-coeurs".
PhD thesis,
LaBRI, Université Bordeaux I, Talence,
Talence, France,
2009.
[pdf]
Keyword(s): Sparse.
[bibtex]
% PhD thesis of M. Faverge (2009). Empty template fields are dropped, accents
% are wrapped as BibTeX special characters, and the undefined \pastix macro in
% the abstract is replaced by the plain name PaStiX.
@phdthesis{t:LaBRI::MF09,
  author   = {Faverge, M.},
  title    = {Ordonnancement hybride statique-dynamique en alg{\`e}bre lin{\'e}aire creuse pour de grands clusters de machines {NUMA} et multi-coeurs},
  school   = {LaBRI, Universit{\'e} Bordeaux I},
  address  = {Talence, France},
  year     = {2009},
  keywords = {Sparse},
  abstract = {New supercomputers incorporate many microprocessors which include themselves one or many computational cores. These new architectures induce strongly hierarchical topologies. These are called NUMA architectures. Sparse direct solvers are a basic building block of many numerical simulation algorithms. They need to be adapted to these new architectures with Non Uniform Memory Accesses. We propose to introduce a dynamic scheduling designed for NUMA architectures in the PaStiX solver. The data structures of the solver, as well as the patterns of communication have been modified to meet the needs of these architectures and dynamic scheduling. We are also interested in the dynamic adaptation of the computation grain to use efficiently multi-core architectures and shared memory. Experiments on several numerical test cases will be presented to prove the efficiency of the approach on different architectures.},
  url      = {http://www.labri.fr/~ramet/restricted/these_faverge.pdf.gz},
}
J. Dongarra,
M. Faverge,
T. Herault,
M. Jacquelin,
J. Langou,
and Y. Robert.
"Hierarchical QR factorization algorithms for multi-core clusters".
Parallel Computing,
(0):-,
2013.
[pdf]
[doi:10.1016/j.parco.2013.01.003]
Keyword(s): Multi-core,
QR factorization,
Numerical linear algebra,
Hierarchical architecture,
Distributed memory,
Cluster.
[bibtex]
% Journal article in Parallel Computing. The six repeated keywords fields are
% merged into one, because BibTeX keeps only the first occurrence of a field.
% NOTE(review): the original carried placeholder values (number 0, pages " - ",
% empty volume) that look like an in-press export; they are dropped here.
% Add the final volume/issue/pages once confirmed.
@article{parco12,
  author   = {Dongarra, J. and Faverge, M. and Herault, T. and Jacquelin, M. and Langou, J. and Robert, Y.},
  title    = {Hierarchical {QR} factorization algorithms for multi-core clusters},
  journal  = {Parallel Computing},
  year     = {2013},
  doi      = {10.1016/j.parco.2013.01.003},
  issn     = {0167-8191},
  keywords = {Multi-core, QR factorization, Numerical linear algebra, Hierarchical architecture, Distributed memory, Cluster},
  url      = {http://www.sciencedirect.com/science/article/pii/S0167819113000100},
}
J. Kurzak,
P. Luszczek,
M. Faverge,
and J. Dongarra.
"LU Factorization with Partial Pivoting for a Multicore System with Accelerators".
IEEE Transactions on Parallel and Distributed Systems,
99(PrePrints):1,
2012.
[doi:10.1109/TPDS.2012.242]
[bibtex]
% Journal article in IEEE TPDS. The doi field holds the bare DOI, without the
% resolver URL prefix, so styles can build the link themselves.
% NOTE(review): volume 99, number "PrePrints" and pages 1 look like preprint
% placeholders; confirm and replace with the final values.
@article{10.1109/TPDS.2012.242,
  author    = {Kurzak, J. and Luszczek, P. and Faverge, M. and Dongarra, J.},
  title     = {{LU} Factorization with Partial Pivoting for a Multicore System with Accelerators},
  journal   = {IEEE Transactions on Parallel and Distributed Systems},
  volume    = {99},
  number    = {PrePrints},
  pages     = {1},
  year      = {2012},
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA, USA},
  doi       = {10.1109/TPDS.2012.242},
  issn      = {1045-9219},
}
J. Dongarra,
M. Faverge,
H. Ltaief,
and P. Luszczek.
"Achieving Numerical Accuracy and High Performance using Recursive Tile LU Factorization".
Submitted to Concurrency and Computation: Practice & Experience, LAWN 259,
September 2011.
[bibtex]
% Submitted journal article, also circulated as LAWN 259. The journal field
% holds only the journal name; the submission status and report number move to
% note. The month uses the predefined macro. Empty template fields are dropped.
@article{ccpe11,
  author  = {Dongarra, J. and Faverge, M. and Ltaief, H. and Luszczek, P.},
  title   = {Achieving Numerical Accuracy and High Performance using Recursive Tile {LU} Factorization},
  journal = {Concurrency and Computation: Practice \& Experience},
  year    = {2011},
  month   = sep,
  note    = {Submitted for publication; LAWN 259},
}
J. Kurzak,
P. Luszczek,
A. YarKhan,
M. Faverge,
J. Langou,
H. Bouwmeester,
and J. Dongarra.
"Handbook of Multi and Many-Core Processing: Architecture, Algorithms, Programming, and Applications",
chapter Multithreading in the PLASMA Library.
Chapman and Hall/CRC,
To Be Published 26th March 2014.
[bibtex]
% Chapter with its own authors in an edited handbook, so the incollection type
% is used: title is the chapter title and booktitle is the book title.
% Editors are separated with "and" (commas are parsed as one single name).
% The year is numeric; the publication-date remark moves to note.
@incollection{iclbook11,
  author    = {Kurzak, J. and Luszczek, P. and YarKhan, A. and Faverge, M. and Langou, J. and Bouwmeester, H. and Dongarra, J.},
  title     = {Multithreading in the {PLASMA} Library},
  booktitle = {Handbook of Multi and Many-Core Processing: Architecture, Algorithms, Programming, and Applications},
  editor    = {Ahmed, Mohamed and Ammar, Reda A. and Rajasekaran, Sanguthevar},
  publisher = {Chapman and Hall/CRC},
  year      = {2014},
  note      = {To be published 26th March 2014},
}
J. Dongarra,
M. Faverge,
T. Herault,
J. Langou,
and Y. Robert.
"Hierarchical QR Factorization Algorithms for Multi-core Cluster Systems".
In Parallel Distributed Processing Symposium (IPDPS), 2012 IEEE 26th International,
pages 607-618,
2012.
[doi:10.1109/IPDPS.2012.62]
[bibtex]
% IPDPS 2012 conference paper. The page range uses the BibTeX double hyphen
% and the acronym QR is brace-protected against sentence-casing styles.
@inproceedings{6267863,
  author    = {Dongarra, J. and Faverge, M. and Herault, T. and Langou, J. and Robert, Y.},
  title     = {Hierarchical {QR} Factorization Algorithms for Multi-core Cluster Systems},
  booktitle = {Parallel Distributed Processing Symposium (IPDPS), 2012 IEEE 26th International},
  pages     = {607--618},
  year      = {2012},
  doi       = {10.1109/IPDPS.2012.62},
  issn      = {1530-2075},
}
J. Kurzak,
P. Luszczek,
M. Faverge,
and J. Dongarra.
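"Programming the LU Factorization for a Multicore System with Accelerators".
In Proceedings of the 10th International Meeting on High-Performance Computing for Computational Science (VECPAR'12),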
April 2012.
[pdf]
[bibtex]
% VECPAR'12 conference paper. The proceedings name belongs in booktitle; in
% the journal field an inproceedings entry ignores it and renders "In ,".
% The trailing period is removed from the title because styles add their own.
@inproceedings{icl:665,
  author      = {Kurzak, J. and Luszczek, P. and Faverge, M. and Dongarra, J.},
  title       = {Programming the {LU} Factorization for a Multicore System with Accelerators},
  booktitle   = {Proceedings of the 10th International Meeting on High-Performance Computing for Computational Science ({VECPAR}'12)},
  year        = {2012},
  month       = apr,
  institution = {Innovative Computing Laboratory, University of Tennessee},
  url         = {http://icl.cs.utk.edu/news_pub/submissions/lawn266.pdf},
}
E. Agullo,
C. Augonnet,
J. Dongarra,
M. Faverge,
H. Ltaief,
S. Thibault,
and S. Tomov.
"QR Factorization on a Multicore Node Enhanced with Multiple GPU Accelerators".
In Proceedings of the 25th IEEE International Parallel & Distributed Processing Symposium (IPDPS'11),
Anchorage, United States,
pages 932-943,
May 2011.
[pdf]
[bibtex]
% IPDPS'11 conference paper. Fixes the "United Sttes" typo and replaces the
% French literal "mai" with the predefined month macro. The empty keywords
% field is dropped. institution and journal are kept as they were; standard
% inproceedings styles ignore both.
@inproceedings{icl577,
  author      = {Agullo, E. and Augonnet, C. and Dongarra, J. and Faverge, M. and Ltaief, H. and Thibault, S. and Tomov, S.},
  title       = {{QR} Factorization on a Multicore Node Enhanced with Multiple {GPU} Accelerators},
  booktitle   = {Proceedings of the 25th {IEEE} International Parallel \& Distributed Processing Symposium ({IPDPS'11})},
  address     = {Anchorage, United States},
  pages       = {932--943},
  year        = {2011},
  month       = may,
  institution = {Innovative Computing Laboratory, University of Tennessee},
  journal     = {University of Tennessee Computer Science Technical Report},
  url         = {http://icl.cs.utk.edu/news_pub/submissions/plagma_qr.pdf},
}
G. Bosilca,
A. Bouteiller,
A. Danalis,
M. Faverge,
H. Haidar,
T. Herault,
J. Kurzak,
J. Langou,
P. Lemarinier,
H. Ltaief,
P. Luszczek,
A. YarKhan,
and J. Dongarra.
"Flexible Development of Dense Linear Algebra Algorithms on Massively Parallel Architectures with DPLASMA".
In Proceedings of the 25th IEEE International Symposium on Parallel & Distributed Processing Workshops and Phd Forum (IPDPSW'11), PDSEC 2011,
Anchorage, United States,
pages 1432-1441,
May 2011.
[pdf]
[bibtex]
% IPDPSW'11 (PDSEC 2011) workshop paper. The French literal "mai" is replaced
% with the predefined month macro and DPLASMA is brace-protected in the title.
% institution and journal are kept as they were; standard inproceedings styles
% ignore both.
@inproceedings{icl529,
  author      = {Bosilca, G. and Bouteiller, A. and Danalis, A. and Faverge, M. and Haidar, H. and Herault, T. and Kurzak, J. and Langou, J. and Lemarinier, P. and Ltaief, H. and Luszczek, P. and YarKhan, A. and Dongarra, J.},
  title       = {Flexible Development of Dense Linear Algebra Algorithms on Massively Parallel Architectures with {DPLASMA}},
  booktitle   = {Proceedings of the 25th {IEEE} International Symposium on Parallel \& Distributed Processing Workshops and Phd Forum ({IPDPSW'11}), PDSEC 2011},
  address     = {Anchorage, United States},
  pages       = {1432--1441},
  year        = {2011},
  month       = may,
  institution = {Innovative Computing Laboratory, University of Tennessee},
  journal     = {Innovative Computing Laboratory Technical Report},
  url         = {http://icl.cs.utk.edu/news_pub/submissions/DPLASMA_technical_report.pdf},
}
K. Coulomb,
A. Degomme,
M. Faverge,
and F. Trahay.
"An open-source tool-chain for performance analysis".
In Parallel Tools Workshop,
2011.
[bibtex]
% Workshop paper on an open-source performance-analysis tool-chain
% (Parallel Tools Workshop, 2011).
@inproceedings{icl:659,
  author      = {Coulomb, K. and Degomme, A. and Faverge, M. and Trahay, F.},
  title       = {An open-source tool-chain for performance analysis},
  booktitle   = {Parallel Tools Workshop},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  year        = {2011},
}
J. Dongarra,
M. Faverge,
H. Ltaief,
and P. Luszczek.
"Exploiting Fine-Grain Parallelism in Recursive LU Factorization".
In Proceedings of ParCo 2011,
July 2011.
[bibtex]
% ParCo 2011 conference paper. The month uses the predefined macro rather than
% a quoted literal, and the acronym LU is brace-protected in the title.
@inproceedings{icl:611,
  author      = {Dongarra, J. and Faverge, M. and Ltaief, H. and Luszczek, P.},
  title       = {Exploiting Fine-Grain Parallelism in Recursive {LU} Factorization},
  booktitle   = {Proceedings of ParCo 2011},
  year        = {2011},
  month       = jul,
  institution = {Innovative Computing Laboratory, University of Tennessee},
}
J. Dongarra,
M. Faverge,
H. Ltaief,
and P. Luszczek.
"High Performance Matrix Inversion Based on LU Factorization for Multicore Architectures".
In Proceedings of MTAGS11,
2011.
[bibtex]
% MTAGS11 paper. The acronym LU is brace-protected so sentence-casing styles
% do not lowercase it.
@inproceedings{icl:658,
  author      = {Dongarra, J. and Faverge, M. and Ltaief, H. and Luszczek, P.},
  title       = {High Performance Matrix Inversion Based on {LU} Factorization for Multicore Architectures},
  booktitle   = {Proceedings of MTAGS11},
  year        = {2011},
  institution = {Innovative Computing Laboratory, University of Tennessee},
}
E. Agullo,
C. Augonnet,
J. Dongarra,
M. Faverge,
J. Langou,
H. Ltaief,
and S. Tomov.
"LU Factorization for Accelerator-based Systems".
In Proceedings of the 9th IEEE/ACS International Conference on Computer Systems and Applications (AICCSA'11),
pages 217-224,
December 2010.
Note: Best Paper award.
[pdf]
[bibtex]
% AICCSA conference paper (Best Paper award). The month uses the predefined
% macro and the acronym LU is brace-protected in the title.
% NOTE(review): booktitle says AICCSA'11 while year/month say December 2010;
% confirm which is correct.
@inproceedings{icl:599,
  author      = {Agullo, E. and Augonnet, C. and Dongarra, J. and Faverge, M. and Langou, J. and Ltaief, H. and Tomov, S.},
  title       = {{LU} Factorization for Accelerator-based Systems},
  booktitle   = {Proceedings of the 9th {IEEE/ACS} International Conference on Computer Systems and Applications ({AICCSA}'11)},
  pages       = {217--224},
  year        = {2010},
  month       = dec,
  institution = {Innovative Computing Laboratory, University of Tennessee},
  note        = {Best Paper award},
  url         = {http://icl.cs.utk.edu/news_pub/submissions/plagma_lu.pdf},
}
M. Faverge.
"A NUMA Aware Scheduler for a Parallel Sparse Direct Solver".
In Journées Informatique Massivement Multiprocesseur et Multicoeur,
Rocquencourt, France,
2009.
[pdf]
Keyword(s): Sparse.
[bibtex]
% Talk/paper at the Journees Informatique Massivement Multiprocesseur et
% Multicoeur (2009). Empty template fields are dropped, the accent is wrapped
% as a BibTeX special character, and NUMA is brace-protected.
@inproceedings{c:LaBRI::i3m,
  author    = {Faverge, M.},
  title     = {A {NUMA} Aware Scheduler for a Parallel Sparse Direct Solver},
  booktitle = {Journ{\'e}es Informatique Massivement Multiprocesseur et Multicoeur},
  address   = {Rocquencourt, France},
  year      = {2009},
  keywords  = {Sparse},
  url       = {http://www.labri.fr/~ramet/restricted/i3m.pdf.gz},
}
M. Faverge.
"Dynamic Scheduling for Sparse Direct Solver on NUMA and Multicore Architectures".
In Sparse Days,
Toulouse, France,
2009.
Keyword(s): Sparse.
[bibtex]
% Sparse Days 2009 talk. Empty template fields are dropped and NUMA is
% brace-protected in the title.
@inproceedings{C:LaBRI::sparsedays2009,
  author    = {Faverge, M.},
  title     = {Dynamic Scheduling for Sparse Direct Solver on {NUMA} and Multicore Architectures},
  booktitle = {Sparse Days},
  address   = {Toulouse, France},
  year      = {2009},
  keywords  = {Sparse},
}
M. Faverge.
"Vers un solveur de systèmes linéaires creux adapté aux machines NUMA".
In ACTES RenPar'2009,
Toulouse, France,
2009.
[pdf]
Keyword(s): Sparse.
[bibtex]
M. Faverge,
X. Lacoste,
and P. Ramet.
"A NUMA Aware Scheduler for a Parallel Sparse Direct Solver".
In Proceedings of PMAA'2008,
Neuchâtel, Switzerland,
2008.
Keyword(s): Sparse.
[bibtex]
% PMAA'2008 paper. The address names the country (Switzerland, not the
% adjective "Swiss") with the accent as a BibTeX special character. Empty
% template fields are dropped and NUMA is brace-protected.
@inproceedings{C:LaBRI::PMAA2008b,
  author    = {Faverge, M. and Lacoste, X. and Ramet, P.},
  title     = {A {NUMA} Aware Scheduler for a Parallel Sparse Direct Solver},
  booktitle = {Proceedings of PMAA'2008},
  address   = {Neuch{\^a}tel, Switzerland},
  year      = {2008},
  keywords  = {Sparse},
  abstract  = {Over the past few years, parallel sparse direct solvers made significant progress and are now able to solve efficiently industrial three-dimensional problems with several millions of unknowns. An hybrid MPI-thread implementation of our direct solver PaStiX is already well suited for SMP nodes or new multi-core architectures and drastically reduced the memory overhead and improved scalability. In the context of distributed NUMA architectures, a dynamic scheduler based on a work-stealing algorithm has been developed to fill in communication idle times. On these architectures, it is important to take care of NUMA effects and to preserve memory affinity during the work-stealing. The scheduling of communications also needs to be adapted, especially to ensure the overlap by computations. Experiments on numerical test cases will be presented to prove the efficiency of the approach on NUMA architectures. If memory is not large enough to treat a given problem, disks must be used to store data that cannot fit in memory (out-of-core storage). The idle-times due to disk access have to be managed by our dynamic scheduler to prefetch and save datasets. Thus, we design and study specific scheduling algorithms in this particular context.},
}
M. Faverge and P. Ramet.
"Dynamic Scheduling for sparse direct Solver on NUMA architectures".
In Proceedings of PARA'2008,
Trondheim, Norway,
2008.
[pdf]
Keyword(s): Sparse.
[bibtex]
% PARA'2008 paper. The abstract read "PaStiX1", which looks like a footnote
% marker fused into the solver name; it is restored to "PaStiX". Empty template
% fields are dropped and NUMA is brace-protected.
@inproceedings{C:LaBRI::para08,
  author    = {Faverge, M. and Ramet, P.},
  title     = {Dynamic Scheduling for sparse direct Solver on {NUMA} architectures},
  booktitle = {Proceedings of PARA'2008},
  address   = {Trondheim, Norway},
  year      = {2008},
  keywords  = {Sparse},
  abstract  = {Over the past few years, parallel sparse direct solvers made significant progress and are now able to efficiently work on problems with several millions of equations. This paper presents some improvements on our sparse direct solver PaStiX for distributed Non-Uniform Memory Access architectures. We show results on two preliminary works: a memory allocation scheme more adapted to these architectures and a better overlap of communication by computation. We also present a dynamic scheduler that takes care of memory affinity and data locality.},
  url       = {http://www.labri.fr/~ramet/restricted/para08.pdf.gz},
}
X. Lacoste,
P. Ramet,
M. Faverge,
I. Yamazaki,
and J. Dongarra.
"Sparse direct solvers with accelerators over DAG runtimes".
Rapport de recherche RR-7972,
INRIA,
2012.
[pdf]
[PDF]
[bibtex]
% INRIA research report RR-7972 (HAL id hal-00700066). The acronym DAG is
% brace-protected in the title; all other field values are unchanged.
@techreport{larafayado12,
  author      = {Lacoste, X. and Ramet, P. and Faverge, M. and Yamazaki, I. and Dongarra, J.},
  title       = {Sparse direct solvers with accelerators over {DAG} runtimes},
  institution = {INRIA},
  type        = {Rapport de recherche},
  number      = {RR-7972},
  pages       = {11},
  year        = {2012},
  language    = {Anglais},
  affiliation = {BACCHUS - INRIA Bordeaux - Sud-Ouest , Laboratoire Bordelais de Recherche en Informatique - LaBRI , Innovative Computing Laboratory - ICL},
  hal_id      = {hal-00700066},
  abstract    = {The current trend in the high performance computing shows a dramatic increase in the number of cores on the shared memory compute nodes. Algorithms, especially those related to linear algebra, need to be adapted to these new computer architectures in order to be efficient. PASTIX is a sparse parallel direct solver, that incorporates a dynamic scheduler for strongly hierarchical modern architectures. In this paper, we study the replacement of this internal highly integrated scheduling strategy by two generic runtime frameworks: DAGUE and STARPU. Those runtimes will give the opportunity to execute the factorization tasks graph on emerging computers equipped with accelerators. As for previous work done in dense linear algebra, we present the kernels used for GPU computations inspired by the MAGMA library and the DAG algorithm used with those two runtimes. A comparative study of the performances of the supernodal solver with the three different schedulers is performed on manycore architectures and the improvements obtained with accelerators are presented with the STARPU runtime. These results demonstrate that these DAG runtimes provide uniform programming interfaces to obtain high performance on different architectures on irregular problems as sparse direct factorizations.},
  pdf         = {http://hal.inria.fr/hal-00700066/PDF/RR-7972.pdf},
  url         = {http://hal.inria.fr/hal-00700066},
}
D. Becker,
M. Faverge,
and J. Dongarra.
"Towards a Parallel Tile LDL Factorization for Multicore Architectures".
Technical report,
Innovative Computing Laboratory, University of Tennessee,
2011.
[bibtex]
% ICL technical report. The report series label moves from journal (ignored by
% techreport styles) to type, which is the field that overrides the default
% "Technical report" label. The acronym LDL is brace-protected.
@techreport{icl:610,
  author      = {Becker, D. and Faverge, M. and Dongarra, J.},
  title       = {Towards a Parallel Tile {LDL} Factorization for Multicore Architectures},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  type        = {ICL Technical Report},
  year        = {2011},
}
G. Bosilca,
A. Bouteiller,
A. Danalis,
M. Faverge,
H. Haidar,
T. Herault,
J. Kurzak,
J. Langou,
P. Lemarinier,
H. Ltaief,
P. Luszczek,
A. YarKhan,
and J. Dongarra.
"Distributed Dense Numerical Linear Algebra Algorithms on Massively Parallel Architectures: DPLASMA".
Technical report,
Innovative Computing Laboratory, University of Tennessee,
April 2010.
[pdf]
[bibtex]
% University of Tennessee CS technical report UT-CS-10-660. Fixes the title
% typo "Distibuted". The report series and number move from journal (ignored
% by techreport styles) to type and number. The month uses the predefined
% macro.
@techreport{icl563,
  author      = {Bosilca, G. and Bouteiller, A. and Danalis, A. and Faverge, M. and Haidar, H. and Herault, T. and Kurzak, J. and Langou, J. and Lemarinier, P. and Ltaief, H. and Luszczek, P. and YarKhan, A. and Dongarra, J.},
  title       = {Distributed Dense Numerical Linear Algebra Algorithms on Massively Parallel Architectures: {DPLASMA}},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  type        = {University of Tennessee Computer Science Technical Report},
  number      = {UT-CS-10-660},
  year        = {2010},
  month       = apr,
  url         = {http://icl.cs.utk.edu/news_pub/submissions/ut-cs-10-660.pdf},
}
J. Dongarra,
M. Faverge,
Y. Ishikawa,
R. Namyst,
F. Rue,
and F. Trahay.
"EZTrace: a generic framework for performance analysis".
Technical report,
Innovative Computing Laboratory, University of Tennessee,
December 2010.
Note: Poster at CCGrid 2011.
[bibtex]
% ICL technical report, also presented as a poster at CCGrid 2011. The report
% series label moves from journal (ignored by techreport styles) to type. The
% poster remark stays in note only, instead of being repeated in journal.
% The month uses the predefined macro.
@techreport{icl600,
  author      = {Dongarra, J. and Faverge, M. and Ishikawa, Y. and Namyst, R. and Rue, F. and Trahay, F.},
  title       = {{EZTrace}: a generic framework for performance analysis},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  type        = {ICL Technical Report},
  year        = {2010},
  month       = dec,
  note        = {Poster at CCGrid 2011},
}