MAGMA  1.2.0
Matrix Algebra on GPU and Multicore Architectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
context.h File Reference

Go to the source code of this file.

Classes

struct  magma_context_s

Typedefs

typedef struct magma_context_s magma_context_t

Functions

magma_context_t * magma_init (void *, void *(*func)(void *a), magma_int_t nthread, magma_int_t ncpu, magma_int_t ngpu, magma_int_t argc, char **argv)
void magma_finalize (magma_context_t *cntxt)
void auto_tune (char algorithm, char precision, magma_int_t ncores, magma_int_t ncorespsocket, magma_int_t m, magma_int_t n, magma_int_t *nb, magma_int_t *ob, magma_int_t *ib, magma_int_t *nthreads, magma_int_t *nquarkthreads)

Typedef Documentation


Function Documentation

void auto_tune ( char  algorithm,
char  precision,
magma_int_t  ncores,
magma_int_t  ncorespsocket,
magma_int_t  m,
magma_int_t  n,
magma_int_t *  nb,
magma_int_t *  ob,
magma_int_t *  ib,
magma_int_t *  nthreads,
magma_int_t *  nquarkthreads 
)

Definition at line 15 of file auto_tune.cpp.

{
/*  -- MAGMA (version 1.2.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       May 2012

    Purpose
    =======
    This function initializes tunable parameters to be used for
    subsequent calls to hybrid routines in the MAGMA library.
    The idea is to use the matrix size together with the number of cores
    and the number of cores per socket to do a table lookup for tunable
    parameter values based on existing research results.

    Only the QR table (algorithm == 'q') is implemented below; for any
    other algorithm value, and for m < 0, no output argument is written.

    Arguments
    =========
    algorithm     (input) CHAR
                  'q' QR
                  'l' LU
                  'c' Cholesky

    precision     (input) CHAR
                  's' Single
                  'd' Double
                  'c' Complex single
                  'z' Complex double

    ncores        (input) INTEGER
                  Number of cores

    ncorespsocket (input) INTEGER
                  Number of cores per socket

    m             (input) INTEGER
                  Number of rows

    n             (input) INTEGER
                  Number of columns (not consulted by the QR table)

    nb            (output) INTEGER
                  Block size

    ob            (output) INTEGER
                  Outer block size

    ib            (output) INTEGER
                  Inner block size

    nthreads      (output) INTEGER
                  Number of MAGMA threads

    nquarkthreads (output) INTEGER
                  Number of QUARK threads

    NOTE(review): the clamp "*nthreads = ncores - *nquarkthreads" can
    yield zero or a negative value when ncores <= ncorespsocket; callers
    should confirm that this is acceptable.
    ===================================================================== */

    /* if QR */
    if (algorithm == 'q') {

        /* The best inner block size is always 12 */
        *ib = 12;

        /* The best number of QUARK threads is the number of cores per socket, in general */
        *nquarkthreads = ncorespsocket;

        /* The ranges below are disjoint, so at most one branch runs. */

        /* 0 <= m <= 2080 (m == 0 is included so the outputs are always set) */
        if ((m >= 0) && (m <= 2080)) {
            *nb = 64;
            *ob = 64;
            *nthreads = 2;
        }
        /* 2080 < m <= 3360 */
        else if ((m > 2080) && (m <= 3360)) {
            *nb = 128;
            *ob = 128;
            *nthreads = 6;
            /* Do not ask for more threads in total than there are cores */
            if ((*nthreads + *nquarkthreads) > ncores)
                *nthreads = ncores - *nquarkthreads;
        }
        /* 3360 < m <= 4640 */
        else if ((m > 3360) && (m <= 4640)) {
            *nb = 128;
            *ob = 128;
            *nthreads = 14;
            if ((*nthreads + *nquarkthreads) > ncores)
                *nthreads = ncores - *nquarkthreads;
        }
        /* 4640 < m <= 5920 */
        else if ((m > 4640) && (m <= 5920)) {
            *nb = 128;
            *ob = 160;
            *nthreads = 18;
            if ((*nthreads + *nquarkthreads) > ncores)
                *nthreads = ncores - *nquarkthreads;

            /* ncores = 12; ncorespsocket = 6 */
            if ((ncores == 12) && (ncorespsocket == 6)) {
                *ob = 128;
                *nquarkthreads = 4;
                *nthreads = 8;
            }
        }
        /* 5920 < m <= 7200 */
        else if ((m > 5920) && (m <= 7200)) {
            *nb = 128;
            *ob = 160;
            *nthreads = 22;
            if ((*nthreads + *nquarkthreads) > ncores)
                *nthreads = ncores - *nquarkthreads;

            /* ncores = 12; ncorespsocket = 6 */
            if ((ncores == 12) && (ncorespsocket == 6)) {
                *ob = 128;
                *nquarkthreads = 4;
                *nthreads = 8;
            }
        }
        /* 7200 < m <= 8480 */
        else if ((m > 7200) && (m <= 8480)) {
            *nb = 128;
            *ob = 160;
            *nthreads = 26;
            if ((*nthreads + *nquarkthreads) > ncores)
                *nthreads = ncores - *nquarkthreads;

            /* ncores = 12; ncorespsocket = 6 */
            if ((ncores == 12) && (ncorespsocket == 6)) {
                *ob = 128;
                *nquarkthreads = 3;
                *nthreads = 9;
            }
        }
        /* 8480 < m <= 9760 */
        else if ((m > 8480) && (m <= 9760)) {
            *nb = 128;
            *ob = 160;
            *nthreads = 30;
            if ((*nthreads + *nquarkthreads) > ncores)
                *nthreads = ncores - *nquarkthreads;

            /* ncores = 12; ncorespsocket = 6 */
            if ((ncores == 12) && (ncorespsocket == 6)) {
                *ob = 128;
                *nquarkthreads = 3;
                *nthreads = 9;
                if (precision == 's') {
                    *nb = 192;
                    *ob = 192;
                }
            }
        }
        /* 9760 < m <= 11040 */
        else if ((m > 9760) && (m <= 11040)) {
            *nb = 128;
            *ob = 160;
            *nthreads = 34;
            if ((*nthreads + *nquarkthreads) > ncores)
                *nthreads = ncores - *nquarkthreads;

            /* ncores = 12; ncorespsocket = 6 */
            if ((ncores == 12) && (ncorespsocket == 6)) {
                *ob = 128;
                *nquarkthreads = 2;
                *nthreads = 10;
                if (precision == 's') {
                    *nb = 192;
                    *ob = 192;
                }
            }
        }
        /* 11040 < m <= 12320 */
        else if ((m > 11040) && (m <= 12320)) {
            *nb = 128;
            *ob = 160;
            *nthreads = 36;
            if ((*nthreads + *nquarkthreads) > ncores)
                *nthreads = ncores - *nquarkthreads;

            /* ncores = 12; ncorespsocket = 6 */
            if ((ncores == 12) && (ncorespsocket == 6)) {
                *ob = 224;
                if (precision == 'c') {
                    *ob = 128;
                }
                *nquarkthreads = 2;
                *nthreads = 10;
                if (precision == 's') {
                    *nb = 192;
                    *ob = 192;
                }
            }
        }
        /* 12320 < m <= 13600 */
        else if ((m > 12320) && (m <= 13600)) {
            *nb = 128;
            *ob = 160;
            *nthreads = 42;
            if ((*nthreads + *nquarkthreads) > ncores)
                *nthreads = ncores - *nquarkthreads;
            if (precision == 'z') {
                *ob = 192;
            }

            /* ncores = 12; ncorespsocket = 6 */
            if ((ncores == 12) && (ncorespsocket == 6)) {
                *ob = 224;
                if (precision == 'c') {
                    *ob = 128;
                }
                *nquarkthreads = 2;
                *nthreads = 10;
                if (precision == 's') {
                    *nb = 192;
                    *ob = 224;
                }
            }
        }
        /* 13600 < m <= 15220 */
        else if ((m > 13600) && (m <= 15220)) {
            *nb = 128;
            *ob = 192;
            *nthreads = 42;
            if ((*nthreads + *nquarkthreads) > ncores)
                *nthreads = ncores - *nquarkthreads;
            if (precision == 'd') {
                *ob = 160;
            }

            /* ncores = 12; ncorespsocket = 6 */
            if ((ncores == 12) && (ncorespsocket == 6)) {
                *ob = 224;
                if (precision == 'c') {
                    *ob = 160;
                }
                *nquarkthreads = 2;
                *nthreads = 10;
                if (precision == 's') {
                    *nb = 192;
                    *ob = 224;
                }
            }
        }
        /* 15220 < m <= 16800 */
        else if ((m > 15220) && (m <= 16800)) {
            *nb = 128;
            *ob = 192;
            *nthreads = 42;
            if ((*nthreads + *nquarkthreads) > ncores)
                *nthreads = ncores - *nquarkthreads;
            if (precision == 'd') {
                *nb = 160;
                *ob = 200;
            }
            if (precision == 'c') {
                *ob = 160;
            }

            /* ncores = 12; ncorespsocket = 6 */
            if ((ncores == 12) && (ncorespsocket == 6)) {
                *ob = 224;
                if (precision == 'c') {
                    *ob = 192;
                }
                *nquarkthreads = 2;
                *nthreads = 10;
                if (precision == 's') {
                    *nb = 192;
                    *ob = 224;
                }
            }
        }
        /* 16800 < m */
        else if (m > 16800) {
            *nb = 128;
            *ob = 224;
            *nthreads = 42;
            if ((*nthreads + *nquarkthreads) > ncores)
                *nthreads = ncores - *nquarkthreads;
            if (precision == 'd') {
                *nb = 160;
                *ob = 200;
            }
            if (precision == 'c') {
                *ob = 192;
            }

            /* ncores = 12; ncorespsocket = 6 */
            if ((ncores == 12) && (ncorespsocket == 6)) {
                *ob = 256;
                if (precision == 'c') {
                    *ob = 192;
                }
                *nquarkthreads = 2;
                *nthreads = 10;
                if (precision == 's') {
                    *nb = 192;
                    *ob = 224;
                }
            }
        }
    }
}
void magma_finalize ( magma_context_t *  cntxt )

Here is the caller graph for this function:

magma_context_t* magma_init ( void *  ,
void *(*)(void *a)  func,
magma_int_t  nthread,
magma_int_t  ncpu,
magma_int_t  ngpu,
magma_int_t  argc,
char **  argv 
)

Definition at line 27 of file init.cpp.

{
/*  -- MAGMA (version 1.2.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       May 2012

    Purpose
    =======
    This function initializes the hardware context to be used for
    subsequent calls to routines in the MAGMA library. It allocates the
    context, optionally starts nthread worker threads running func,
    creates the QUARK scheduler when more than one CPU core is requested,
    and creates a CUDA context plus CUBLAS when a GPU is requested.

    Arguments
    =========
    params   (input) pointer copied into the t_params record handed to
             every worker thread.

    func     (input) entry point passed to pthread_create for each worker.

    NTHREAD  (input) INTEGER
             Number of worker threads to start; no thread is started when
             it is not positive.

    NCPU     (input) INTEGER
             Number of CPU cores to be used in the computations.
             Values below 1 are treated as 1.

    NGPU     (input) INTEGER
             Number of GPUs to be used in the computations.
             Negative values are treated as 0; values above 1 are
             reduced to 1 with a message on stdout.

    ARGC,
    ARGV     (input) command line; "-b <value>" sets context->nb.

    Returns the newly allocated context. The process exits with status -1
    if CUDA or CUBLAS cannot be initialized.

    NOTE(review): i, context and params are used but not declared in this
    listing - presumably declared in lines not shown here; confirm.
    NOTE(review): the results of malloc and pthread_create are not
    checked, and neither the thread handles nor the t_params records are
    stored anywhere or freed in this function.
    ===================================================================== */

    t_params **tp = NULL;
    pthread_t *thread = NULL;

    context = (magma_context *)malloc(sizeof(magma_context));

    /* Start the workers. The bookkeeping arrays are allocated only inside
       this guard so that a zero or negative nthread never reaches malloc
       as a size. */
    if (nthread > 0) {
        tp     = (t_params **)malloc(sizeof(t_params *) * nthread);
        thread = (pthread_t *)malloc(sizeof(pthread_t) * nthread);
        for (i = 0; i < nthread; i++) {
            tp[i] = (t_params *)malloc(sizeof(t_params));
            tp[i]->params = params;
            tp[i]->tid = i;
            pthread_create(&thread[i], NULL, func, (void *)tp[i]);
        }
    }

    /* Normalize the requested resource counts */
    if (ncpu <= 1)
        ncpu = 1;
    if (ngpu <= 0)
        ngpu = 0;

    context->num_cores = ncpu;
    context->num_gpus = ngpu;

    if (ncpu > 1) {
        /* Initialize the QUARK scheduler */
        context->quark = QUARK_New(ncpu);
    }

    if (ngpu > 1) {
        printf("The requested number of GPUs is not yet supported.\n\n");
        printf("The number of GPUs set to one.\n\n");
        context->num_gpus = 1;
        /* Also lower the local count; otherwise the single-GPU setup
           below is skipped and no CUDA context is ever created even
           though num_gpus reports one GPU. */
        ngpu = 1;
    }

    if (ngpu == 1) {
        CUdevice dev;
        context->gpu_context = (CUcontext *)malloc(ngpu * sizeof(CUcontext));

        /* For now we use by default device 0, always */
        if (CUDA_SUCCESS != cuInit(0)) {
            fprintf(stderr, "CUDA: Not initialized\n" );
            exit(-1);
        }
        if (CUDA_SUCCESS != cuDeviceGet(&dev, 0)) {
            fprintf(stderr, "CUDA: Cannot get the device\n");
            exit(-1);
        }
        if (CUDA_SUCCESS != cuCtxCreate(&context->gpu_context[0], 0, dev)) {
            fprintf(stderr, "CUDA: Cannot create the context\n");
            exit(-1);
        }
        if (CUDA_SUCCESS != cublasInit()) {
            fprintf(stderr, "CUBLAS: Not initialized\n");
            exit(-1);
        }
    }

    /* -1 marks "no block size given on the command line" */
    context->nb = -1;
    for (i = 1; i < argc; i++) {
        /* Consume the value only when one actually follows "-b"; a
           trailing "-b" is ignored instead of reading past the end of
           the argument list. */
        if ((strcmp("-b", argv[i]) == 0) && (i + 1 < argc))
            context->nb = atoi(argv[++i]);
    }

    return context;
}

Here is the caller graph for this function: