{
/*
 * Pointer-arithmetic shorthands. Each macro expands in place, so it relies on
 * the names a, lda, dAT, dPT, nb and ldn_local being visible where it is used.
 *
 *   A(r, c)          -> a + c*lda + r
 *   inAT(gpu, bi, bj) -> dAT[gpu] + bi*nb*ldn_local + bj*nb
 *   inPT(gpu, bi, bj) -> dPT[gpu] + bi*nb*nb        + bj*nb
 *
 * The row/column and block-index arguments are parenthesised, so compound
 * expressions may be passed for them. The gpu argument is used as a bare
 * array subscript.
 */
#define A(r, c)            (a + (c)*lda + (r))
#define inAT(gpu, bi, bj)  (dAT[gpu] + (bi)*nb*ldn_local + (bj)*nb)
#define inPT(gpu, bi, bj)  (dPT[gpu] + (bi)*nb*nb + (bj)*nb)
#ifdef PROFILE
/*
 * Profiling-only state and operation-count helpers; everything in this block
 * disappears when PROFILE is not defined.
 *
 * flops is assigned later from FLOPS(m, n); the time_* accumulators start at 0.
 */
float flops;
float time_rmajor  = 0;
float time_rmajor2 = 0;
float time_rmajor3 = 0;
float time_mem     = 0;

/*
 * Closed-form counts for an rows_-by-cols_ problem (presumably the number of
 * multiplications and additions of an LU factorization, going by the macro
 * names -- confirm against the reference formulas). Both macros pick the
 * branch keyed on the smaller dimension; the second branch is the first with
 * the two arguments swapped. Arguments are evaluated more than once, so they
 * must be free of side effects.
 */
#define FMULS_GETRF(rows_, cols_) \
    ( ((rows_) < (cols_)) \
      ? (0.5 * (rows_) * ((rows_) * ((cols_) - (1./3.) * (rows_) - 1. ) + (cols_)) + (2. / 3.) * (rows_)) \
      : (0.5 * (cols_) * ((cols_) * ((rows_) - (1./3.) * (cols_) - 1. ) + (rows_)) + (2. / 3.) * (cols_)) )

#define FADDS_GETRF(rows_, cols_) \
    ( ((rows_) < (cols_)) \
      ? (0.5 * (rows_) * ((rows_) * ((cols_) - (1./3.) * (rows_) ) - (cols_)) + (1. / 6.) * (rows_)) \
      : (0.5 * (cols_) * ((cols_) * ((rows_) - (1./3.) * (cols_) ) - (rows_)) + (1. / 6.) * (cols_)) )

/* Precision selector defined right here, so the unweighted FLOPS form below is
 * used unless PRECISION_z or PRECISION_c has also been defined elsewhere. */
#define PRECISION_s

/*
 * FLOPS(rows_, cols_): total operation count. When neither PRECISION_z nor
 * PRECISION_c is defined it is the plain sum of the two counts; otherwise the
 * multiplication count is weighted by 6 and the addition count by 2.
 */
#if !defined(PRECISION_z) && !defined(PRECISION_c)
#define FLOPS(rows_, cols_) ( FMULS_GETRF(rows_, cols_) + FADDS_GETRF(rows_, cols_) )
#else
#define FLOPS(rows_, cols_) ( 6. * FMULS_GETRF(rows_, cols_) + 2. * FADDS_GETRF(rows_, cols_) )
#endif
#endif
float *dAT[4], *
dA[4], *dPT[4];
magma_int_t iinfo = 0, nb, maxm, n_local[4], ldn_local;
#if CUDA_VERSION > 3010
size_t totalMem;
#else
unsigned int totalMem;
#endif
CUdevice dev;
static cudaStream_t stream[4][2];
*info = 0;
if (m < 0)
*info = -1;
else if (n < 0)
*info = -2;
*info = -4;
if (*info != 0) {
return *info;
}
if (m == 0 || n == 0)
return *info;
maxm = ((m + 31)/32)*32;
cuDeviceGet( &dev, 0);
cuDeviceTotalMem( &totalMem, dev );
totalMem /= sizeof(float);
char * ngr_nb_char = getenv("MAGMA_NGR_NB");
if( ngr_nb_char != NULL ) NB =
max( nb,
min( NB, atoi(ngr_nb_char) ) );
if( num_gpus0 > ceil((float)NB/nb) ) {
num_gpus = (int)ceil((float)NB/nb);
} else {
num_gpus = num_gpus0;
}
if( num_gpus*NB >= n ) {
#ifdef CHECK_SGETRF_OOC
printf( " * still fit in GPU memory.\n" );
#endif
NB = n;
} else {
#ifdef CHECK_SGETRF_OOC
printf( " * don't fit in GPU memory.\n" );
#endif
NB = num_gpus*NB;
NB =
max(nb,(NB / nb) * nb);
}
#ifdef CHECK_SGETRF_OOC
if( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d).\n",n,NB,nb );
else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d).\n",n,NB,nb );
fflush(stdout);
#endif
if ( (nb <= 1) || (nb >=
min(m,n)) ) {
} else {
#ifdef PROFILE
#endif
n_local[0] = (NB/nb)/num_gpus;
if( NB%(nb*num_gpus) != 0 ) n_local[0] ++;
n_local[0] *= nb;
ldn_local = ((n_local[0]+31)/32)*32;
for( d=0; d<num_gpus; d++ ) {
return *info;
}
dPT[d] = dA[d] + nb*maxm;
dAT[d] = dA[d] + h*nb*maxm;
for( ii=0; ii<h; ii++ ) {
}
}
#ifdef PROFILE
printf(
" memory-allocation time: %e\n",
GetTimerValue(start, end)/1000.0 );
#endif
for( I=0; I<n; I+=NB ) {
M = m;
maxm = ((M + 31)/32)*32;
if( num_gpus0 > ceil((float)N/nb) ) {
num_gpus = (int)ceil((float)N/nb);
} else {
num_gpus = num_gpus0;
}
for( d=0; d<num_gpus; d++ ) {
n_local[d] = ((N/nb)/num_gpus)*nb;
if (d < (N/nb)%num_gpus)
n_local[d] += nb;
else if (d == (N/nb)%num_gpus)
n_local[d] += N%nb;
}
ldn_local = ((n_local[0]+31)/32)*32;
#ifdef PROFILE
#endif
dAT, ldn_local, 0, dA, maxm, M, N, nb);
#ifdef PROFILE
#endif
for( offset = 0; offset<
min(m,I); offset+=NB )
{
NBk =
min( m-offset, NB );
for( d=0; d<num_gpus; d++ ) {
dA[d], (maxm-offset), stream[d][0] );
}
for( d=0; d<num_gpus; d++ ) {
}
for( jj=0, ib=offset/nb; jj<NBk; jj+=nb, ib++ )
{
ii = offset+jj;
rows = maxm - ii;
for( d=0; d<num_gpus; d++ ) {
if( jj+nb < NBk )
dA[d], (rows-nb), stream[d][0] );
n_local[d], nb, c_one,
inPT(d,0,0), nb,
inAT(d,ib,0), ldn_local );
if( M > ii+nb ) {
n_local[d], M-(ii+nb), nb, c_neg_one,
inAT(d,ib,0), ldn_local,
inPT(d,1,0), nb, c_one,
inAT(d,ib+1,0), ldn_local );
}
}
}
}
if( M > I ) {
magma_sgetrf1_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, &a[I*lda], lda,
(cudaStream_t **)stream, &iinfo);
if( iinfo < 0 ) {
*info = iinfo;
break;
} else if( iinfo != 0 ) {
*info = iinfo + I * NB;
}
for( ii=I; ii<
min(I+N,m); ii++ ) ipiv[ii] += I;
}
#ifdef PROFILE
#endif
magmablas_sgetmatrix_transpose3(num_gpus, (cudaStream_t **)stream, dAT, ldn_local,
A(0,I), lda, dA, maxm, M, N, nb);
#ifdef PROFILE
#endif
}
#ifdef PROFILE
flops =
FLOPS( (
float)m, (
float)n ) / 1000000;
printf(" NB=%d nb=%d\n",NB,nb);
printf(" memcopy and transpose %e seconds\n",time_mem );
printf(
" total time %e seconds\n",
GetTimerValue(start0,end)/1000.0);
printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / time_rmajor, time_rmajor /1000.0);
printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / time_rmajor3, time_rmajor3/1000.0);
printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / time_rmajor2, time_rmajor2/1000.0);
printf(
" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops /
GetTimerValue(start, end),
GetTimerValue(start,end)/1000.0);
#endif
for( d=0; d<num_gpus0; d++ ) {
for( ii=0; ii<h; ii++ ) {
}
}
}
return *info;
}