{
/*
 * Body of a blocked, multi-GPU Cholesky-style driver; the signature is outside
 * this view, so parameter meanings below are taken only from how they are used.
 *
 * NOTE(review): the visible text appears to have statements missing (an
 * `else if` chain with no leading `if`, call argument lists with no call name,
 * loop bodies that contain only a `return`). The comments describe what the
 * visible statements do; anything else is marked as an assumption.
 *
 * Result: the value of *info is returned. Visible assignments are 0 (start),
 * -1 / -2 / -4 from argument checks, and J+iinfo when magma_spotrf3_mgpu
 * reports a non-zero iinfo.
 */
char uplo_[2] = {
uplo, 0};
magma_int_t ldda, lddla, ldwrk, nb, iinfo, n_local[4], J2, d, num_gpus;
/* NOTE(review): these loop/blocking variables are `static`, so they persist
   between calls and are shared by all callers -- confirm this is intended. */
static magma_int_t j, jj, jb, jb1, jb2, jb3, J, JB, NB, MB;
/* Scalars of type float; the initialisers are double literals. */
float d_one = 1.0;
float d_neg_one = -1.0;
/* The type used for the device-memory size depends on the CUDA version. */
#if CUDA_VERSION > 3010
size_t totalMem;
#else
unsigned int totalMem;
#endif
CUdevice dev;
/* 4 x 3 array of streams, also `static`; indexed below as stream[d][0].
   Presumably one row per GPU (n_local is sized 4 as well) -- TODO confirm. */
static cudaStream_t stream[4][3];
#ifdef ROW_MAJOR_PROFILE
float chol_time = 1.0;
#endif
/* Argument checking: *info starts at 0 and is set to a negative code on a bad
   argument. NOTE(review): the condition guarding `*info = -1` is not visible. */
*info = 0;
*info = -1;
} else if (n < 0) {
*info = -2;
}
else if (lda <
max(1,n)) {
*info = -4;
}
/* Any non-zero code ends the routine immediately. */
if (*info != 0) {
return *info;
}
/* Quick return for an empty problem. */
if ( n == 0 )
return *info;
/* Use at most ceil(n/nb) GPUs, otherwise the requested num_gpus0.
   NOTE(review): no assignment to nb is visible before this point. */
if( num_gpus0 > n/nb ) {
num_gpus = n/nb;
if( n%nb != 0 ) num_gpus ++;
} else {
num_gpus = num_gpus0;
}
/* ldda = num_gpus * (nb * ceil(n / (nb*num_gpus)) rounded up to a multiple of 32).
   This value is overwritten further down before any visible use. */
ldda = n/(nb*num_gpus);
if( n%(nb*num_gpus) != 0 ) ldda++;
ldda = num_gpus*((nb*ldda+31)/32)*32;
/* Query device 0 and divide its reported total memory by sizeof(float).
   The return codes of both driver calls are not checked. */
cuDeviceGet( &dev, 0);
cuDeviceTotalMem( &totalMem, dev );
totalMem /= sizeof(float);
MB = n;
/* Choose the outer block size NB: clamp to n, or round down to a multiple of nb.
   NOTE(review): NB is compared here but no assignment to it is visible above;
   presumably it was derived from totalMem -- TODO confirm. */
if( NB >= n ) {
#ifdef CHECK_SPOTRF_OOC
printf( " * still fit in GPU memory.\n" );
#endif
NB = n;
} else {
#ifdef CHECK_SPOTRF_OOC
printf( " * don't fit in GPU memory.\n" );
#endif
NB = (NB / nb) * nb;
}
#ifdef CHECK_SPOTRF_OOC
if( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d).\n",n,NB,nb );
else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d).\n",n,NB,nb );
fflush(stdout);
#endif
/* Leading dimensions: ldda is n rounded up to a multiple of 32; lddla is
   nb*(1 + n/(nb*num_gpus)) rounded up to a multiple of 32. */
ldda = ((n+31)/32)*32;
lddla = ((nb*(1+n/(nb*num_gpus))+31)/32)*32;
/* Per-GPU setup. Only an early return and the dwork assignment are visible;
   presumably device selection/allocation and its failure check belong here. */
for (d=0; d<num_gpus; d++ ) {
return *info;
}
/* dwork[d] points 2*nb*ldda elements into dt[d]. */
dwork[d] = &dt[d][2*nb*ldda];
}
#ifdef ROW_MAJOR_PROFILE
#endif
ldwrk = n;
return *info;
}
/* When nb <= 1 or nb >= n the visible branch is empty (presumably the
   unblocked path); otherwise the blocked algorithm below runs. */
if (nb <= 1 || nb >= n) {
} else {
/* `upper` is not declared in this view; it selects between the two variants. */
if (upper) {
/* Outer loop over the matrix in steps of NB.
   NOTE(review): JB is used below but no assignment to it is visible. */
for( J=0; J<n; J+=NB ) {
/* Use at most ceil((n-J)/nb) GPUs for the remaining part. */
if( num_gpus0 > (n-J)/nb ) {
num_gpus = (n-J)/nb;
if( (n-J)%nb != 0 ) num_gpus ++;
} else {
num_gpus = num_gpus0;
}
/* magma_shtodpo is defined elsewhere; presumably a host-to-device transfer of
   the current panel -- TODO confirm. Its iinfo result is not examined here. */
magma_shtodpo( num_gpus, &
uplo, JB, n, J, J, nb, a, lda, dwork, NB, (cudaStream_t **)stream, &iinfo);
#ifdef ROW_MAJOR_PROFILE
#endif
/* Sweep over everything before J in steps of nb. */
for( j=0; j<J; j+=nb ) {
for( d=0; d<num_gpus; d++ ) {
/* Tail of a call whose leading part is not visible; it targets dTup(d, 0, J)
   on stream[d][0]. */
dTup(d, 0, J), nb, stream[d][0] );
n_local[d] = 0;
}
/* Walk the blocks after the current panel; the GPU index is the block index
   relative to J, modulo num_gpus. n_local[d] accumulates jb2 per block. */
for( jj=J+JB; jj<n; jj+=nb ) {
d = ((jj-J)/nb)%num_gpus;
/* Tail of a call whose leading part is not visible. */
dTup(d, 0, J+JB+n_local[d]), nb, stream[d][0] );
n_local[d] += jb2;
}
jb3 = nb;
/* For each nb-wide block inside the panel: d is the owning GPU index and J2 is
   nb times the number of complete rounds over the GPUs before this block. */
for( jj=0; jj<JB; jj+=nb ) {
d = (jj/nb)%num_gpus;
J2 = (jj/(nb*num_gpus))*nb;
jb = jj;
/* Argument tails of two calls whose names are not visible. NOTE(review):
   c_neg_one / c_one are not declared in this view (d_neg_one / d_one are). */
jb, jb2, nb,
c_neg_one,
dTup(d, 0, J ), nb,
c_one,
dAup(d, 0, J2), NB);
d_neg_one,
dTup(d, 0, J+jb), nb,
d_one,
dAup(d, jb, J2), NB);
}
/* Only when something remains after the current panel. */
if( n > J+JB ) {
for( d=0; d<num_gpus; d++ ) {
/* n_local[d]: the (n-J)/nb whole blocks are divided evenly over the GPUs, GPUs
   with index below the remainder get one more block, and the GPU whose index
   equals the remainder gets the (n-J)%nb leftover. */
n_local[d] = (((n-J)/nb)/num_gpus)*nb;
if (d < ((n-J)/nb)%num_gpus)
n_local[d] += nb;
else if (d == ((n-J)/nb)%num_gpus)
n_local[d] += (n-J)%nb;
/* Subtract this GPU's share of the JB panel, computed the same way. */
n_local[d] -= ((JB/nb)/num_gpus)*nb;
if (d < (JB/nb)%num_gpus)
n_local[d] -= nb;
/* J2: this GPU's share of the JB panel, used as the offset passed to dAup. */
J2 = nb*(JB/(nb*num_gpus));
if( d < (JB/nb)%num_gpus ) J2+=nb;
/* Argument tail of a call whose name is not visible. */
JB, n_local[d], nb,
c_neg_one,
dTup(d, 0, J ), nb,
c_one,
dAup(d, 0, J2), NB);
}
}
}
/* Call the multi-GPU routine (defined elsewhere) for the part starting at J.
   A non-zero iinfo is reported as J+iinfo in *info and stops the outer loop. */
magma_spotrf3_mgpu(num_gpus,
uplo, JB, n-J, J, J, nb, dwork, NB, dt, ldda, a, lda, (cudaStream_t **)stream, &iinfo);
if( iinfo != 0 ) {
*info = J+iinfo;
break;
}
#ifdef ROW_MAJOR_PROFILE
#endif
/* magma_sdtohpo is defined elsewhere; presumably the device-to-host counterpart
   of magma_shtodpo -- TODO confirm. Its iinfo result is not examined here. */
magma_sdtohpo(num_gpus, &
uplo, JB, n, J, J, nb, NB, a, lda, dwork, NB, (cudaStream_t **)stream, &iinfo);
}
} else {
/* Other variant: same structure as above with the row/column roles of the
   arguments swapped, dT/dA in place of dTup/dAup, and lddla/ldda as the
   leading dimensions. */
for( J=0; J<n; J+=NB ) {
/* Use at most ceil((n-J)/nb) GPUs for the remaining part. */
if( num_gpus0 > (n-J)/nb ) {
num_gpus = (n-J)/nb;
if( (n-J)%nb != 0 ) num_gpus ++;
} else {
num_gpus = num_gpus0;
}
/* Presumably a host-to-device transfer (see the note in the other branch). */
magma_shtodpo( num_gpus, &
uplo, n, JB, J, J, nb, a, lda, dwork, lddla, (cudaStream_t **)stream, &iinfo);
#ifdef ROW_MAJOR_PROFILE
#endif
/* Sweep over everything before J in steps of nb. */
for( j=0; j<J; j+=nb ) {
for( d=0; d<num_gpus; d++ ) {
/* Tail of a call whose leading part is not visible. */
dT(d, J, 0), ldda, stream[d][0] );
n_local[d] = 0;
}
/* GPU index is the block index relative to J, modulo num_gpus. */
for( jj=J+JB; jj<n; jj+=nb ) {
d = ((jj-J)/nb)%num_gpus;
/* Tail of a call whose leading part is not visible. */
dT(d, J+JB+n_local[d], 0), ldda, stream[d][0] );
n_local[d] += jb2;
}
jb3 = nb;
/* Per nb-wide block inside the panel: owning GPU index d and offset J2. */
for( jj=0; jj<JB; jj+=nb ) {
d = (jj/nb)%num_gpus;
J2 = (jj/(nb*num_gpus))*nb;
jb = jj;
/* Argument tails of two calls whose names are not visible. */
jb2, jb, nb,
c_neg_one,
dT(d, J+jb, 0), ldda,
c_one,
dA(d, J2, 0), lddla);
d_neg_one,
dT(d, J+jb, 0), ldda,
d_one,
dA(d, J2, jb ), lddla);
}
/* Only when something remains after the current panel. */
if( n > J+JB ) {
for( d=0; d<num_gpus; d++ ) {
/* Same n_local / J2 computation as in the other branch: this GPU's share of
   (n-J) minus its share of JB, and its share of JB as the offset. */
n_local[d] = (((n-J)/nb)/num_gpus)*nb;
if (d < ((n-J)/nb)%num_gpus)
n_local[d] += nb;
else if (d == ((n-J)/nb)%num_gpus)
n_local[d] += (n-J)%nb;
n_local[d] -= ((JB/nb)/num_gpus)*nb;
if (d < (JB/nb)%num_gpus)
n_local[d] -= nb;
J2 = nb*(JB/(nb*num_gpus));
if( d < (JB/nb)%num_gpus ) J2+=nb;
/* Argument tail of a call whose name is not visible. */
n_local[d], JB, nb,
c_neg_one,
dT(d, J+JB, 0), ldda,
c_one,
dA(d, J2, 0), lddla);
}
}
}
/* A non-zero iinfo is reported as J+iinfo in *info and stops the outer loop. */
magma_spotrf3_mgpu(num_gpus,
uplo, n-J, JB, J, J, nb, dwork, lddla, dt, ldda, a, lda, (cudaStream_t **)stream, &iinfo);
if( iinfo != 0 ) {
*info = J+iinfo;
break;
}
#ifdef ROW_MAJOR_PROFILE
#endif
/* Presumably a device-to-host transfer (see the note in the other branch). */
magma_sdtohpo( num_gpus, &
uplo, n, JB, J, J, nb, JB, a, lda, dwork, lddla, (cudaStream_t **)stream, &iinfo);
}
}
}
#ifdef ROW_MAJOR_PROFILE
#endif
/* Recompute the GPU count for the whole matrix (same rule as at the top). */
if( num_gpus0 > n/nb ) {
num_gpus = n/nb;
if( n%nb != 0 ) num_gpus ++;
} else {
num_gpus = num_gpus0;
}
/* The per-GPU loop body is empty in the visible text; presumably per-device
   cleanup belongs here -- TODO confirm. */
for (d=0; d<num_gpus; d++ ) {
}
/* Optional profiling output. NOTE(review): the first printf has no visible
   closing parenthesis, and start0 / end0 are not declared in this view. */
#ifdef ROW_MAJOR_PROFILE
printf("\n n=%d NB=%d nb=%d\n",n,NB,nb);
printf(
" Without memory allocation: %f / %f = %f GFlop/s\n",
FLOPS((
float)n)/1000000,
GetTimerValue(start0, end0),
printf(
" Performance %f / %f = %f GFlop/s\n",
FLOPS((
float)n)/1000000, chol_time,
FLOPS( (
float)n ) / (1000000*chol_time));
#endif
return *info;
}