MAGMA  1.2.0
Matrix Algebra on GPU and Multicore Architectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
spotrf2_ooc.cpp File Reference
#include "common_magma.h"
#include "../testing/flops.h"
Include dependency graph for spotrf2_ooc.cpp:

Go to the source code of this file.

Macros

#define PRECISION_s
#define magma_dgemm   magmablas_dgemm
#define magma_dtrsm   magmablas_dtrsm
#define PRECISION_s
#define FLOPS(n)   ( FMULS_POTRF(n) + FADDS_POTRF(n) )
#define A(i, j)   (a +(j)*lda + (i))
#define dA(d, i, j)   (dwork[(d)]+(j)*lddla + (i))
#define dT(d, i, j)   (dt[(d)] +(j)*ldda + (i))
#define dAup(d, i, j)   (dwork[(d)]+(j)*NB + (i))
#define dTup(d, i, j)   (dt[(d)] +(j)*nb + (i))

Functions

magma_int_t magma_shtodpo (int num_gpus, char *uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, float *h_A, magma_int_t lda, float **d_lA, magma_int_t ldda, cudaStream_t **stream, magma_int_t *info)
magma_int_t magma_sdtohpo (int num_gpus, char *uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magma_int_t NB, float *a, magma_int_t lda, float **work, magma_int_t ldda, cudaStream_t **stream, magma_int_t *info)
magma_int_t magma_spotrf3_mgpu (int num_gpus, char uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, float **d_lA, magma_int_t ldda, float **d_lP, magma_int_t lddlp, float *work, magma_int_t ldwrk, cudaStream_t **streaml, magma_int_t *info)
magma_int_t magma_spotrf2_ooc (magma_int_t num_gpus0, char uplo, magma_int_t n, float *a, magma_int_t lda, magma_int_t *info)

Macro Definition Documentation

#define A(i, j)    (a +(j)*lda + (i))

Definition at line 51 of file spotrf2_ooc.cpp.

#define dA(d, i, j)    (dwork[(d)]+(j)*lddla + (i))

Definition at line 52 of file spotrf2_ooc.cpp.

#define dAup(d, i, j)    (dwork[(d)]+(j)*NB + (i))

Definition at line 54 of file spotrf2_ooc.cpp.

#define dT(d, i, j)    (dt[(d)] +(j)*ldda + (i))

Definition at line 53 of file spotrf2_ooc.cpp.

#define dTup(d, i, j)    (dt[(d)] +(j)*nb + (i))

Definition at line 55 of file spotrf2_ooc.cpp.

#define FLOPS (   n)    ( FMULS_POTRF(n) + FADDS_POTRF(n) )

Definition at line 33 of file spotrf2_ooc.cpp.

#define magma_dgemm   magmablas_dgemm

Definition at line 16 of file spotrf2_ooc.cpp.

#define magma_dtrsm   magmablas_dtrsm

Definition at line 17 of file spotrf2_ooc.cpp.

#define PRECISION_s

Definition at line 29 of file spotrf2_ooc.cpp.

#define PRECISION_s

Definition at line 29 of file spotrf2_ooc.cpp.


Function Documentation

magma_int_t magma_sdtohpo ( int  num_gpus,
char *  uplo,
magma_int_t  m,
magma_int_t  n,
magma_int_t  off_i,
magma_int_t  off_j,
magma_int_t  nb,
magma_int_t  NB,
float *  a,
magma_int_t  lda,
float **  work,
magma_int_t  ldda,
cudaStream_t **  stream,
magma_int_t *  info
)

Here is the caller graph for this function:

magma_int_t magma_shtodpo ( int  num_gpus,
char *  uplo,
magma_int_t  m,
magma_int_t  n,
magma_int_t  off_i,
magma_int_t  off_j,
magma_int_t  nb,
float *  h_A,
magma_int_t  lda,
float **  d_lA,
magma_int_t  ldda,
cudaStream_t **  stream,
magma_int_t *  info
)

Here is the caller graph for this function:

magma_int_t magma_spotrf2_ooc ( magma_int_t  num_gpus0,
char  uplo,
magma_int_t  n,
float *  a,
magma_int_t  lda,
magma_int_t *  info
)

Definition at line 58 of file spotrf2_ooc.cpp.

References __func__, A, dA, dAup, dt, dT, dTup, dwork, FLOPS, get_current_time(), GetTimerValue(), lapackf77_lsame, lapackf77_spotrf(), MAGMA_ERR_DEVICE_ALLOC, MAGMA_ERR_HOST_ALLOC, magma_free(), magma_free_host(), magma_get_dpotrf_nb(), magma_queue_create(), magma_queue_destroy(), MAGMA_S_NEG_ONE, MAGMA_S_ONE, magma_sdtohpo(), magma_setdevice(), magma_sgemm(), magma_shtodpo(), magma_smalloc(), magma_smalloc_host(), magma_spotrf3_mgpu(), magma_ssetmatrix_async(), magma_ssyrk(), MAGMA_SUCCESS, magma_xerbla(), MagmaLower, MagmaNoTrans, MagmaTrans, MagmaUpper, max, min, uplo, and codegen::work.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
SPOTRF_OOC computes the Cholesky factorization of a real symmetric
positive definite matrix A. This version does not require work
space on the GPU passed as input. GPU memory is allocated in the
routine. The matrix A may not fit entirely in the GPU memory.
The factorization has the form
A = U**T * U, if UPLO = 'U', or
A = L * L**T, if UPLO = 'L',
where U is an upper triangular matrix and L is lower triangular.
This is the block version of the algorithm, calling Level 3 BLAS.
Arguments
=========
UPLO (input) CHARACTER*1
= 'U': Upper triangle of A is stored;
= 'L': Lower triangle of A is stored.
N (input) INTEGER
The order of the matrix A. N >= 0.
A (input/output) REAL array, dimension (LDA,N)
On entry, the symmetric matrix A. If UPLO = 'U', the leading
N-by-N upper triangular part of A contains the upper
triangular part of the matrix A, and the strictly lower
triangular part of A is not referenced. If UPLO = 'L', the
leading N-by-N lower triangular part of A contains the lower
triangular part of the matrix A, and the strictly upper
triangular part of A is not referenced.
On exit, if INFO = 0, the factor U or L from the Cholesky
factorization A = U**T * U or A = L * L**T.
Higher performance is achieved if A is in pinned memory, e.g.
allocated using magma_malloc_host.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,N).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occurred, such as memory allocation failed.
> 0: if INFO = i, the leading minor of order i is not
positive definite, and the factorization could not be
completed.
===================================================================== */
/* NOTE(review): this listing appears to have lost lines during extraction
   (several calls below start in the middle of their argument list, and some
   variables are read without a visible assignment). The notes marked
   NOTE(review) point at each such place; confirm against the original
   spotrf2_ooc.cpp before relying on this text. */
/* Local variables */
float c_one = MAGMA_S_ONE;
float c_neg_one = MAGMA_S_NEG_ONE;
/* NOTE(review): dwork, dt, n_local and stream are all sized for 4 devices,
   and the loops below index them with d < num_gpus. No check of num_gpus0
   against 4 is visible in this function - confirm callers guarantee it. */
float *dwork[4], *dt[4], *work;
/* NUL-terminated copy of uplo so it can be passed to lapackf77_lsame. */
char uplo_[2] = {uplo, 0};
magma_int_t ldda, lddla, ldwrk, nb, iinfo, n_local[4], J2, d, num_gpus;
/* NOTE(review): these loop/size variables are function-local statics, so
   they keep their values between calls - presumably not intended to be
   called concurrently; confirm. */
static magma_int_t j, jj, jb, jb1, jb2, jb3, J, JB, NB, MB;
float d_one = 1.0;
float d_neg_one = -1.0;
/* Non-zero when uplo selects the upper-triangular variant. */
long int upper = lapackf77_lsame(uplo_, "U");
/* Type of totalMem follows the CUDA_VERSION-dependent declaration below. */
#if CUDA_VERSION > 3010
size_t totalMem;
#else
unsigned int totalMem;
#endif
CUdevice dev;
/* Three streams per device; also a function-local static. */
static cudaStream_t stream[4][3];
//#define ROW_MAJOR_PROFILE
#ifdef ROW_MAJOR_PROFILE
magma_timestr_t start, end, start0, end0;
float chol_time = 1.0;
#endif
/* Argument checks: info is set to minus the position of the first bad
   argument (uplo = 1, n = 2, lda = 4). */
*info = 0;
if ((! upper) && (! lapackf77_lsame(uplo_, "L"))) {
*info = -1;
} else if (n < 0) {
*info = -2;
} else if (lda < max(1,n)) {
*info = -4;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
/* Quick return */
if ( n == 0 )
return *info;
/* NOTE(review): nb is read from here on, but no assignment to nb is visible
   in this listing. magma_get_dpotrf_nb() is in this function's reference
   list, so the initialisation was presumably dropped - confirm. */
/* Use at most ceil(n/nb) devices, i.e. no more devices than block columns. */
if( num_gpus0 > n/nb ) {
num_gpus = n/nb;
if( n%nb != 0 ) num_gpus ++;
} else {
num_gpus = num_gpus0;
}
/* This value of ldda is only used to size NB a few lines below; ldda is
   reassigned to ((n+31)/32)*32 before the device allocations. */
ldda = n/(nb*num_gpus);
if( n%(nb*num_gpus) != 0 ) ldda++;
ldda = num_gpus*((nb*ldda+31)/32)*32;
/* figure out NB */
/* Queries device 0 only and converts its total memory from bytes to a count
   of floats. The return codes of both driver calls are not checked. */
cuDeviceGet( &dev, 0);
cuDeviceTotalMem( &totalMem, dev );
totalMem /= sizeof(float);
MB = n; /* number of rows in the big panel */
NB = (magma_int_t)(num_gpus*(0.8*totalMem/ldda-2*nb)); /* number of columns in the big panel */
if( NB >= n ) {
#ifdef CHECK_SPOTRF_OOC
printf( " * still fit in GPU memory.\n" );
#endif
NB = n;
} else {
#ifdef CHECK_SPOTRF_OOC
printf( " * don't fit in GPU memory.\n" );
#endif
NB = (NB / nb) * nb; /* making sure it's divisible by nb */
}
#ifdef CHECK_SPOTRF_OOC
if( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d).\n",n,NB,nb );
else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d).\n",n,NB,nb );
fflush(stdout);
#endif
/* Leading dimensions rounded up to a multiple of 32. */
ldda = ((n+31)/32)*32;
lddla = ((nb*(1+n/(nb*num_gpus))+31)/32)*32;
/* One allocation per device: the first 2*nb*ldda floats are addressed
   through dt[d], the remaining NB*lddla floats through dwork[d]. */
for (d=0; d<num_gpus; d++ ) {
if (MAGMA_SUCCESS != magma_smalloc( &dt[d], NB*lddla + 2*nb*ldda )) {
/* NOTE(review): as listed, *info is still 0 here and buffers/streams of
   earlier devices are not released. MAGMA_ERR_DEVICE_ALLOC is in the
   reference list, so an assignment was presumably dropped - confirm. */
return *info;
}
dwork[d] = &dt[d][2*nb*ldda];
magma_queue_create( &stream[d][0] );
magma_queue_create( &stream[d][1] );
magma_queue_create( &stream[d][2] );
}
#ifdef ROW_MAJOR_PROFILE
start0 = get_current_time();
#endif
ldwrk = n;
if (MAGMA_SUCCESS != magma_smalloc_host( &work, ldwrk*nb )) {
/* NOTE(review): as listed, *info is still 0 here and the device buffers and
   streams created above are not released. MAGMA_ERR_HOST_ALLOC is in the
   reference list, so an assignment was presumably dropped - confirm. */
return *info;
}
/* Trivial block size, or a block at least as large as the matrix: factor on
   the host with LAPACK (the buffers allocated above are still freed at the
   end of the function). */
if (nb <= 1 || nb >= n) {
lapackf77_spotrf(uplo_, &n, a, &lda, info);
} else {
/* Use hybrid blocked code. */
if (upper) {
/* =========================================================== *
* Compute the Cholesky factorization A = U'*U. *
* big panel is divided by block-row and distributed in block *
* column cyclic format */
/* for each big-panel */
for( J=0; J<n; J+=NB ) {
JB = min(NB,n-J);
jb = min(JB,nb);
/* Use at most ceil((n-J)/nb) devices for the remaining part of the matrix. */
if( num_gpus0 > (n-J)/nb ) {
num_gpus = (n-J)/nb;
if( (n-J)%nb != 0 ) num_gpus ++;
} else {
num_gpus = num_gpus0;
}
/* load the new big-panel by block-rows */
magma_shtodpo( num_gpus, &uplo, JB, n, J, J, nb, a, lda, dwork, NB, (cudaStream_t **)stream, &iinfo);
#ifdef ROW_MAJOR_PROFILE
start = get_current_time();
#endif
/* update with the previous big-panels */
for( j=0; j<J; j+=nb ) {
/* upload the diagonal of big panel */
for( d=0; d<num_gpus; d++ ) {
/* NOTE(review): the next two lines are the tail of a call whose head is
   missing from this listing (presumably magma_ssetmatrix_async, which is in
   the reference list) - confirm. */
A(j, J), lda,
dTup(d, 0, J), nb, stream[d][0] );
n_local[d] = 0;
}
/* upload off-diagonals */
for( jj=J+JB; jj<n; jj+=nb ) {
/* Device index for this block column: blocks are assigned to devices in
   round-robin order starting at column J. */
d = ((jj-J)/nb)%num_gpus;
jb2 = min(nb, n-jj);
/* NOTE(review): call head missing from this listing, as above. */
A(j, jj), lda,
dTup(d, 0, J+JB+n_local[d]), nb, stream[d][0] );
n_local[d] += jb2;
}
/* update the current big-panel using the previous block-row */
jb3 = nb; //min(nb,J-j); // number of columns in this previous block-column (nb)
for( jj=0; jj<JB; jj+=nb ) { /* diagonal */
d = (jj/nb)%num_gpus;
J2 = (jj/(nb*num_gpus))*nb;
jb1 = min(JB,jj+nb); // first row in the next block-row
jb2 = min(nb,JB-jj); // number of rows in this current block-row
jb = jj; //jb1-jb2; // number of columns in the off-diagonal blocks (jj)
/* NOTE(review): the following lines are the argument tails of two calls
   whose heads are missing from this listing (magma_sgemm and magma_ssyrk
   are in the reference list; the second tail also looks incomplete) -
   confirm against the original source. */
jb, jb2, nb,
c_neg_one, dTup(d, 0, J ), nb,
dTup(d, 0, J+jb), nb,
c_one, dAup(d, 0, J2), NB);
d_neg_one, dTup(d, 0, J+jb), nb,
d_one, dAup(d, jb, J2), NB);
}
if( n > J+JB ) { /* off-diagonal */
for( d=0; d<num_gpus; d++ ) {
/* local number of columns in the big panel */
n_local[d] = (((n-J)/nb)/num_gpus)*nb;
if (d < ((n-J)/nb)%num_gpus)
n_local[d] += nb;
else if (d == ((n-J)/nb)%num_gpus)
n_local[d] += (n-J)%nb;
/* local number of columns in diagonal */
n_local[d] -= ((JB/nb)/num_gpus)*nb;
if (d < (JB/nb)%num_gpus)
n_local[d] -= nb;
/* J2: local column offset on device d just past its share of the JB
   diagonal columns. */
J2 = nb*(JB/(nb*num_gpus));
if( d < (JB/nb)%num_gpus ) J2+=nb;
/* NOTE(review): call head missing from this listing (presumably
   magma_sgemm) - confirm. */
JB, n_local[d], nb,
c_neg_one, dTup(d, 0, J ), nb,
dTup(d, 0, J+JB), nb,
c_one, dAup(d, 0, J2), NB);
}
}
} /* end of updates with previous rows */
/* factor the big panel */
magma_spotrf3_mgpu(num_gpus, uplo, JB, n-J, J, J, nb, dwork, NB, dt, ldda, a, lda, (cudaStream_t **)stream, &iinfo);
if( iinfo != 0 ) {
/* Shift the panel-relative index by the panel start J before reporting. */
*info = J+iinfo;
break;
}
#ifdef ROW_MAJOR_PROFILE
/* NOTE(review): end is read here but no assignment to end is visible in
   this listing - confirm. */
chol_time += GetTimerValue(start, end);
#endif
/* upload the off-diagonal (and diagonal!!!) big panel */
magma_sdtohpo(num_gpus, &uplo, JB, n, J, J, nb, NB, a, lda, dwork, NB, (cudaStream_t **)stream, &iinfo);
}
} else {
/* ========================================================= *
* Compute the Cholesky factorization A = L*L'. */
/* for each big-panel */
for( J=0; J<n; J+=NB ) {
JB = min(NB,n-J);
/* Use at most ceil((n-J)/nb) devices for the remaining part of the matrix. */
if( num_gpus0 > (n-J)/nb ) {
num_gpus = (n-J)/nb;
if( (n-J)%nb != 0 ) num_gpus ++;
} else {
num_gpus = num_gpus0;
}
/* load the new big-panel by block-columns */
magma_shtodpo( num_gpus, &uplo, n, JB, J, J, nb, a, lda, dwork, lddla, (cudaStream_t **)stream, &iinfo);
/* update with the previous big-panels */
#ifdef ROW_MAJOR_PROFILE
start = get_current_time();
#endif
for( j=0; j<J; j+=nb ) {
/* upload the diagonal of big panel */
for( d=0; d<num_gpus; d++ ) {
/* NOTE(review): the next two lines are the tail of a call whose head is
   missing from this listing (presumably magma_ssetmatrix_async) - confirm. */
A(J, j), lda,
dT(d, J, 0), ldda, stream[d][0] );
n_local[d] = 0;
}
/* upload off-diagonals */
for( jj=J+JB; jj<n; jj+=nb ) {
d = ((jj-J)/nb)%num_gpus;
jb2 = min(nb, n-jj);
/* NOTE(review): call head missing from this listing, as above. */
A(jj, j), lda,
dT(d, J+JB+n_local[d], 0), ldda, stream[d][0] );
n_local[d] += jb2;
}
/* update the current big-panel using the previous block-row */
jb3 = nb; //min(nb,J-j);
for( jj=0; jj<JB; jj+=nb ) { /* diagonal */
d = (jj/nb)%num_gpus;
J2 = (jj/(nb*num_gpus))*nb;
jb1 = min(JB,jj+nb);
jb2 = min(nb,JB-jj);
jb = jj; //jb1-jb2;
/* NOTE(review): the following lines are the argument tails of two calls
   whose heads are missing from this listing (magma_sgemm and magma_ssyrk
   are in the reference list; the second tail also looks incomplete) -
   confirm against the original source. */
jb2, jb, nb,
c_neg_one, dT(d, J+jb, 0), ldda,
dT(d, J, 0), ldda,
c_one, dA(d, J2, 0), lddla);
d_neg_one, dT(d, J+jb, 0), ldda,
d_one, dA(d, J2, jb ), lddla);
}
if( n > J+JB ) { /* off-diagonal */
for( d=0; d<num_gpus; d++ ) {
/* local number of columns in the big panel */
n_local[d] = (((n-J)/nb)/num_gpus)*nb;
if (d < ((n-J)/nb)%num_gpus)
n_local[d] += nb;
else if (d == ((n-J)/nb)%num_gpus)
n_local[d] += (n-J)%nb;
/* local number of columns in diagonal */
n_local[d] -= ((JB/nb)/num_gpus)*nb;
if (d < (JB/nb)%num_gpus)
n_local[d] -= nb;
/* J2: local row offset on device d just past its share of the JB
   diagonal rows. */
J2 = nb*(JB/(nb*num_gpus));
if( d < (JB/nb)%num_gpus ) J2+=nb;
/* NOTE(review): call head missing from this listing (presumably
   magma_sgemm) - confirm. */
n_local[d], JB, nb,
c_neg_one, dT(d, J+JB, 0), ldda,
dT(d, J, 0), ldda,
c_one, dA(d, J2, 0), lddla);
}
}
}
/* factor the big panel */
magma_spotrf3_mgpu(num_gpus, uplo, n-J, JB, J, J, nb, dwork, lddla, dt, ldda, a, lda, (cudaStream_t **)stream, &iinfo);
if( iinfo != 0 ) {
/* Shift the panel-relative index by the panel start J before reporting. */
*info = J+iinfo;
break;
}
#ifdef ROW_MAJOR_PROFILE
/* NOTE(review): end is read here but no assignment to end is visible in
   this listing - confirm. */
chol_time += GetTimerValue(start, end);
#endif
/* upload the off-diagonal big panel */
//magma_sdtohpo( num_gpus, &uplo, n, JB, J, J, nb, NB, a, lda, dwork, lddla, (cudaStream_t **)stream, &iinfo);
magma_sdtohpo( num_gpus, &uplo, n, JB, J, J, nb, JB, a, lda, dwork, lddla, (cudaStream_t **)stream, &iinfo);
} /* end of for J */
} /* if upper */
} /* if nb */
#ifdef ROW_MAJOR_PROFILE
end0 = get_current_time();
#endif
/* num_gpus may have been reduced inside the panel loops, so recompute it
   with the same formula used before the allocations; the loop below then
   releases the buffer and streams of every device that was set up. */
if( num_gpus0 > n/nb ) {
num_gpus = n/nb;
if( n%nb != 0 ) num_gpus ++;
} else {
num_gpus = num_gpus0;
}
for (d=0; d<num_gpus; d++ ) {
/* dwork[d] points into the dt[d] allocation, so only dt[d] is freed. */
magma_free( dt[d] );
magma_queue_destroy( stream[d][0] );
magma_queue_destroy( stream[d][1] );
magma_queue_destroy( stream[d][2] );
}
magma_free_host( work );
#ifdef ROW_MAJOR_PROFILE
printf("\n n=%d NB=%d nb=%d\n",n,NB,nb);
printf(" Without memory allocation: %f / %f = %f GFlop/s\n", FLOPS((float)n)/1000000, GetTimerValue(start0, end0),
FLOPS((float)n)/(1000000*GetTimerValue(start0, end0)));
printf(" Performance %f / %f = %f GFlop/s\n", FLOPS((float)n)/1000000, chol_time, FLOPS( (float)n ) / (1000000*chol_time));
#endif
return *info;
} /* magma_spotrf2_ooc */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_spotrf3_mgpu ( int  num_gpus,
char  uplo,
magma_int_t  m,
magma_int_t  n,
magma_int_t  off_i,
magma_int_t  off_j,
magma_int_t  nb,
float **  d_lA,
magma_int_t  ldda,
float **  d_lP,
magma_int_t  lddlp,
float *  work,
magma_int_t  ldwrk,
cudaStream_t **  streaml,
magma_int_t *  info
)

Here is the caller graph for this function: