MAGMA  1.2.0
Matrix Algebra on GPU and Multicore Architectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
sgetrf3_ooc.cpp File Reference
#include "common_magma.h"
Include dependency graph for sgetrf3_ooc.cpp:

Go to the source code of this file.

Macros

#define PRECISION_s
#define magma_sgemm   magmablas_sgemm
#define magma_strsm   magmablas_strsm
#define A(i, j)   (a + (j)*lda + (i))
#define inAT(d, i, j)   (dAT[d] + (i)*nb*ldn_local + (j)*nb)
#define inPT(d, i, j)   (dPT[d] + (i)*nb*nb + (j)*nb)

Functions

void magmablas_spermute_long3 (float *dAT, int lda, int *ipiv, int nb, int ind)
magma_int_t magma_sgetrf1_gpu (magma_int_t m, magma_int_t n, float *dAT, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info)
magma_int_t magma_sgetrf1_mgpu (magma_int_t num_gpus, magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset, float **d_lAT, magma_int_t lddat, magma_int_t *ipiv, float **d_lAP, float *work, magma_int_t lddwork, cudaStream_t **stream, magma_int_t *info)
void magmablas_sgetmatrix_transpose3 (magma_int_t num_gpus, cudaStream_t **stream, float **dat, int ldda, float *ha, int lda, float **dB, int lddb, int m, int n, int nb)
void magmablas_ssetmatrix_transpose3 (magma_int_t num_gpus, cudaStream_t **stream, float *ha, int lda, float **dat, int ldda, int starti, float **dB, int lddb, int m, int n, int nb)
magma_int_t magma_sgetrf3_ooc (magma_int_t num_gpus0, magma_int_t m, magma_int_t n, float *a, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)
magma_int_t magma_sgetrf2_piv (magma_int_t num_gpus0, magma_int_t m, magma_int_t n, float *a, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info)

Macro Definition Documentation

#define A (   i,
  j 
)    (a + (j)*lda + (i))
#define inAT (   d,
  i,
  j 
)    (dAT[d] + (i)*nb*ldn_local + (j)*nb)
#define inPT (   d,
  i,
  j 
)    (dPT[d] + (i)*nb*nb + (j)*nb)
#define magma_sgemm   magmablas_sgemm

Definition at line 17 of file sgetrf3_ooc.cpp.

#define magma_strsm   magmablas_strsm

Definition at line 18 of file sgetrf3_ooc.cpp.

#define PRECISION_s

Definition at line 15 of file sgetrf3_ooc.cpp.


Function Documentation

magma_int_t magma_sgetrf1_gpu ( magma_int_t  m,
magma_int_t  n,
float *  dAT,
magma_int_t  ldda,
magma_int_t *  ipiv,
magma_int_t *  info 
)
magma_int_t magma_sgetrf1_mgpu ( magma_int_t  num_gpus,
magma_int_t  m,
magma_int_t  n,
magma_int_t  nb,
magma_int_t  offset,
float **  d_lAT,
magma_int_t  lddat,
magma_int_t *  ipiv,
float **  d_lAP,
float *  work,
magma_int_t  lddwork,
cudaStream_t **  stream,
magma_int_t *  info 
)

Definition at line 32 of file sgetrf1_mgpu.cpp.

References __func__, inAT, lapackf77_sgetrf(), magma_device_sync(), magma_queue_create(), magma_queue_destroy(), magma_queue_sync(), MAGMA_S_NEG_ONE, MAGMA_S_ONE, magma_setdevice(), magma_sgemm, magma_sgetmatrix(), magma_sgetmatrix_async(), magma_ssetmatrix_async(), magma_strsm(), magma_xerbla(), magmablas_spermute_long2(), magmablas_spermute_long3(), magmablas_stranspose(), magmablas_stranspose2(), magmablas_strsm(), MagmaNoTrans, MagmaRight, MagmaUnit, MagmaUpper, max, and min.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
November 2010
Purpose
=======
SGETRF computes an LU factorization of a general M-by-N matrix A
using partial pivoting with row interchanges.
The factorization has the form
A = P * L * U
where P is a permutation matrix, L is lower triangular with unit
diagonal elements (lower trapezoidal if m > n), and U is upper
triangular (upper trapezoidal if m < n).
This is the right-looking Level 3 BLAS version of the algorithm.
Arguments
=========
NUM_GPUS
(input) INTEGER
The number of GPUS to be used for the factorization.
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) REAL array on the GPU, dimension (LDDA,N).
On entry, the M-by-N matrix to be factored.
On exit, the factors L and U from the factorization
A = P*L*U; the unit diagonal elements of L are not stored.
LDDA (input) INTEGER
The leading dimension of the array A. LDDA >= max(1,M).
IPIV (output) INTEGER array, dimension (min(M,N))
The pivot indices; for 1 <= i <= min(M,N), row i of the
matrix was interchanged with row IPIV(i).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occurred, such as a memory allocation failure.
> 0: if INFO = i, U(i,i) is exactly zero. The factorization
has been completed, but the factor U is exactly
singular, and division by zero will occur if it is used
to solve a system of equations.
===================================================================== */
#define inAT(id,i,j) (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb)
float c_one = MAGMA_S_ONE;
float c_neg_one = MAGMA_S_NEG_ONE;
magma_int_t iinfo, n_local[4];
magma_int_t maxm, mindim;
magma_int_t i, d, rows, cols, s, ldpan[4];
magma_int_t id, i_local, i_local2, nb0, nb1;
float *d_panel[4], *panel_local[4];
static cudaStream_t streaml[4][2];
/* Check arguments */
*info = 0;
if (m < 0)
*info = -2;
else if (n < 0)
*info = -3;
else if (num_gpus*lddat < max(1,n))
*info = -5;
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
/* Quick return if possible */
if (m == 0 || n == 0)
return *info;
/* Function Body */
mindim = min(m, n);
//nb = magma_get_sgetrf_nb(m);
if( num_gpus > ceil((float)n/nb) ) {
printf( " * too many GPUs for the matrix size, using %d GPUs\n",num_gpus );
*info = -1;
return *info;
}
{
/* Use hybrid blocked code. */
maxm = ((m + 31)/32)*32;
/* allocate workspace for each GPU */
for(i=0; i<num_gpus; i++){
/* local-n and local-ld */
n_local[i] = ((n/nb)/num_gpus)*nb;
if (i < (n/nb)%num_gpus)
n_local[i] += nb;
else if (i == (n/nb)%num_gpus)
n_local[i] += n%nb;
d_panel[i] = &(d_lAP[i][nb*maxm]);
/* streams */
magma_queue_create( &streaml[i][0] );
magma_queue_create( &streaml[i][1] );
}
s = mindim / nb;
for( i=0; i<s; i++ )
{
/* Set the GPU number that holds the current panel */
id = i%num_gpus;
/* Set the local index where the current panel is */
i_local = i/num_gpus;
cols = maxm - i*nb;
rows = m - i*nb;
/* start sending the panel to cpu */
magmablas_stranspose( d_lAP[id], cols, inAT(id,i,i_local), lddat, nb, cols );
d_lAP[id], cols,
work, lddwork, streaml[id][1] );
/* make sure that gpu queue is empty */
/* the remaining updates */
if ( i>0 ){
/* id-th gpu update the remaining matrix */
n_local[id] - (i_local+1)*nb, nb,
c_one, panel_local[id], ldpan[id],
inAT(id,i-1,i_local+1), lddat );
n_local[id]-(i_local+1)*nb, rows, nb,
c_neg_one, inAT(id,i-1,i_local+1), lddat,
&(panel_local[id][nb*ldpan[id]]), ldpan[id],
c_one, inAT(id,i, i_local+1), lddat );
}
/* synchronize i-th panel from id-th gpu into work */
magma_queue_sync( streaml[id][1] );
/* i-th panel factorization */
lapackf77_sgetrf( &rows, &nb, work, &lddwork, ipiv+i*nb, &iinfo);
if ( (*info == 0) && (iinfo > 0) ) {
*info = iinfo + i*nb;
//break;
}
/* start sending the panel to all the gpus */
for( d=0; d<num_gpus; d++ ) {
work, lddwork,
d_lAP[d], maxm, streaml[d][0] );
}
for( d=0; d<num_gpus; d++ ) {
/* apply the pivoting */
if( d == 0 )
magmablas_spermute_long2( inAT(d,0,0), lddat, ipiv, nb, i*nb );
else
magmablas_spermute_long3( inAT(d,0,0), lddat, ipiv, nb, i*nb );
/* storage for panel */
if( d == id ) {
/* the panel belongs to this gpu */
panel_local[d] = inAT(d,i,i_local);
ldpan[d] = lddat;
/* next column */
i_local2 = i_local+1;
} else {
/* the panel belongs to another gpu */
panel_local[d] = d_panel[d];
ldpan[d] = nb;
/* next column */
i_local2 = i_local;
if( d < id ) i_local2 ++;
}
/* the size of the next column */
if ( s > (i+1) ) {
nb0 = nb;
} else { /* no look-ahead for the remaining columns for now */
nb0 = n_local[d]-nb*(s/num_gpus);
if( d < s%num_gpus ) nb0 -= nb;
}
if( d == (i+1)%num_gpus) {
/* owns the next column, look-ahead the column */
nb1 = nb0;
} else {
/* update the entire trailing matrix */
nb1 = n_local[d] - i_local2*nb;
}
/* synchronization */
magma_queue_sync( streaml[d][0] );
magmablas_stranspose2(panel_local[d], ldpan[d], d_lAP[d], maxm, cols, nb);
/* gpu updating the trailing matrix */
nb1, nb, c_one,
panel_local[d], ldpan[d],
inAT(d, i, i_local2), lddat);
nb1, m-(i+1)*nb, nb,
c_neg_one, inAT(d, i, i_local2), lddat,
&(panel_local[d][nb*ldpan[d]]), ldpan[d],
c_one, inAT(d, i+1, i_local2), lddat );
} /* end of gpu updates */
} /* end of for i=0..s-1 */
/* Set the GPU number that holds the last panel */
id = s%num_gpus;
/* Set the local index where the last panel is */
i_local = s/num_gpus;
/* size of the last diagonal-block */
nb0 = min(m - s*nb, n - s*nb);
rows = m - s*nb;
cols = maxm - s*nb;
if( nb0 > 0 ) {
/* send the last panel to cpu (no look-ahead for the remaining columns) */
magmablas_stranspose2( d_lAP[id], maxm, inAT(id,s,i_local), lddat, nb0, rows);
magma_sgetmatrix( rows, nb0, d_lAP[id], maxm, work, lddwork );
/* make sure that gpu queue is empty */
/* factor on cpu */
lapackf77_sgetrf( &rows, &nb0, work, &lddwork, ipiv+s*nb, &iinfo);
if ( (*info == 0) && (iinfo > 0) )
*info = iinfo + s*nb;
/* start sending the factor to gpus */
for( d=0; d<num_gpus; d++ ) {
i_local2 = i_local;
if( d < id ) i_local2 ++;
if( d == id || n_local[d] > i_local2*nb )
{
work, lddwork,
d_lAP[d], maxm, streaml[d][0] );
}
}
}
/* clean up */
for( d=0; d<num_gpus; d++ ) {
if( nb0 > 0 ) {
if( d == 0 )
magmablas_spermute_long2( inAT(d,0,0), lddat, ipiv, nb0, s*nb );
else
magmablas_spermute_long3( inAT(d,0,0), lddat, ipiv, nb0, s*nb );
i_local2 = i_local;
if( d < id ) i_local2++;
if( d == id ) {
/* the panel belongs to this gpu */
panel_local[d] = inAT(d,s,i_local);
/* next column */
nb1 = n_local[d] - i_local*nb-nb0;
magma_queue_sync( streaml[d][0] );
magmablas_stranspose2( panel_local[d], lddat, d_lAP[d], maxm, rows, nb0);
if( nb1 > 0 )
nb1, nb0, c_one,
panel_local[d], lddat,
inAT(d,s,i_local)+nb0, lddat);
} else if( n_local[d] > i_local2*nb ) {
/* the panel belongs to another gpu */
panel_local[d] = d_panel[d];
/* next column */
nb1 = n_local[d] - i_local2*nb;
magma_queue_sync( streaml[d][0] );
magmablas_stranspose2( panel_local[d], nb0, d_lAP[d], maxm, rows, nb0);
nb1, nb0, c_one,
panel_local[d], nb0,
inAT(d,s,i_local2), lddat);
}
}
//magma_device_sync();
magma_queue_destroy( streaml[d][0] );
magma_queue_destroy( streaml[d][1] );
} /* end of for d=0,..,num_gpus-1 */
}
return *info;
/* End of MAGMA_SGETRF1_MGPU */
}

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgetrf2_piv ( magma_int_t  num_gpus0,
magma_int_t  m,
magma_int_t  n,
float *  a,
magma_int_t  lda,
magma_int_t *  ipiv,
magma_int_t *  info 
)

Definition at line 381 of file sgetrf3_ooc.cpp.

References lapackf77_slaswp(), magma_get_sgetrf_nb(), max, and min.

{
magma_int_t nb, h = 2, num_gpus;
magma_int_t NB, I, k1, k2, incx, minmn, maxm;
*info = 0;
if (m < 0)
*info = -1;
else if (n < 0)
*info = -2;
else if (lda < max(1,m))
*info = -4;
if (*info != 0)
return *info;
/* Quick return if possible */
if (m == 0 || n == 0)
return *info;
/* initialize nb */
maxm = ((m + 31)/32)*32;
/* figure out NB */
#if CUDA_VERSION > 3010
size_t totalMem;
#else
unsigned int totalMem;
#endif
CUdevice dev;
cuDeviceGet( &dev, 0);
cuDeviceTotalMem( &totalMem, dev );
totalMem /= sizeof(float);
/* number of columns in the big panel */
NB = (magma_int_t)(0.8*totalMem/maxm-h*nb);
//NB = (magma_int_t)min(n,num_gpus*(0.8*totalMem/maxm-h*nb));
//NB = (magma_int_t)min(n,(num_gpus*0.8*totalMem/(maxm))-2*nb);
char * ngr_nb_char = getenv("MAGMA_NGR_NB");
if( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) );
if( num_gpus0 > ceil((float)NB/nb) ) {
num_gpus = (int)ceil((float)NB/nb);
} else {
num_gpus = num_gpus0;
}
if( num_gpus*NB >= n ) {
#ifdef CHECK_SGETRF_OOC
printf( " * still fit in GPU memory.\n" );
#endif
NB = n;
} else {
#ifdef CHECK_SGETRF_OOC
printf( " * don't fit in GPU memory.\n" );
#endif
NB = num_gpus*NB;
NB = max(nb,(NB / nb) * nb); /* making sure it's divisible by nb (x64) */
}
minmn = min(m,n);
for( I=0; I<minmn-NB; I+=NB ) {
k1 = 1+I+NB;
k2 = minmn;
incx = 1;
lapackf77_slaswp(&NB, &a[I*lda], &lda, &k1, &k2, ipiv, &incx);
}
return *info;
} /* magma_sgetrf2_piv */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_sgetrf3_ooc ( magma_int_t  num_gpus0,
magma_int_t  m,
magma_int_t  n,
float *  a,
magma_int_t  lda,
magma_int_t *  ipiv,
magma_int_t *  info 
)

Definition at line 56 of file sgetrf3_ooc.cpp.

References __func__, A, dA, FLOPS, get_current_time(), GetTimerValue(), inAT, inPT, lapackf77_sgetrf(), MAGMA_ERR_DEVICE_ALLOC, magma_free(), magma_get_sgetrf_nb(), magma_queue_create(), magma_queue_destroy(), magma_queue_sync(), MAGMA_S_NEG_ONE, MAGMA_S_ONE, magma_setdevice(), magma_sgemm, magma_sgetrf1_mgpu(), magma_smalloc(), magma_ssetmatrix_async(), magma_strsm, MAGMA_SUCCESS, magma_xerbla(), magmablas_sgetmatrix_transpose3(), magmablas_spermute_long3(), magmablas_ssetmatrix_transpose3(), magmablas_stranspose2(), MagmaNoTrans, MagmaRight, MagmaUnit, MagmaUpper, max, and min.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
November 2010
Purpose
=======
SGETRF_OOC computes an LU factorization of a general M-by-N matrix A
using partial pivoting with row interchanges. This version does not
require work space on the GPU passed as input. GPU memory is allocated
in the routine. The matrix may not fit entirely in the GPU memory.
The factorization has the form
A = P * L * U
where P is a permutation matrix, L is lower triangular with unit
diagonal elements (lower trapezoidal if m > n), and U is upper
triangular (upper trapezoidal if m < n).
This is the right-looking Level 3 BLAS version of the algorithm.
Note: The factorization of each big panel is done by calling the multiple-GPU interface.
Pivots are applied on GPU within the big panel.
Arguments
=========
M (input) INTEGER
The number of rows of the matrix A. M >= 0.
N (input) INTEGER
The number of columns of the matrix A. N >= 0.
A (input/output) REAL array, dimension (LDA,N)
On entry, the M-by-N matrix to be factored.
On exit, the factors L and U from the factorization
A = P*L*U; the unit diagonal elements of L are not stored.
Higher performance is achieved if A is in pinned memory, e.g.
allocated using magma_malloc_host.
LDA (input) INTEGER
The leading dimension of the array A. LDA >= max(1,M).
IPIV (output) INTEGER array, dimension (min(M,N))
The pivot indices; for 1 <= i <= min(M,N), row i of the
matrix was interchanged with row IPIV(i).
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
or another error occurred, such as a memory allocation failure.
> 0: if INFO = i, U(i,i) is exactly zero. The factorization
has been completed, but the factor U is exactly
singular, and division by zero will occur if it is used
to solve a system of equations.
===================================================================== */
#define A(i,j) (a + (j)*lda + (i))
#define inAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb)
#define inPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb)
/* Flops formula */
//#define PROFILE
#ifdef PROFILE
float flops, time_rmajor = 0, time_rmajor2 = 0, time_rmajor3 = 0, time_mem = 0;
magma_timestr_t start, start1, start2, end1, end, start0 = get_current_time();
#define FMULS_GETRF(__m, __n) ( ((__m) < (__n)) ? (0.5 * (__m) * ((__m) * ((__n) - (1./3.) * (__m) - 1. ) + (__n)) + (2. / 3.) * (__m)) \
: (0.5 * (__n) * ((__n) * ((__m) - (1./3.) * (__n) - 1. ) + (__m)) + (2. / 3.) * (__n)) )
#define FADDS_GETRF(__m, __n) ( ((__m) < (__n)) ? (0.5 * (__m) * ((__m) * ((__n) - (1./3.) * (__m) ) - (__n)) + (1. / 6.) * (__m)) \
: (0.5 * (__n) * ((__n) * ((__m) - (1./3.) * (__n) ) - (__m)) + (1. / 6.) * (__n)) )
#define PRECISION_s
#if defined(PRECISION_z) || defined(PRECISION_c)
#define FLOPS(m, n) ( 6. * FMULS_GETRF(m, n) + 2. * FADDS_GETRF(m, n) )
#else
#define FLOPS(m, n) ( FMULS_GETRF(m, n) + FADDS_GETRF(m, n) )
#endif
#endif
float *dAT[4], *dA[4], *dPT[4];
float c_one = MAGMA_S_ONE;
float c_neg_one = MAGMA_S_NEG_ONE;
magma_int_t iinfo = 0, nb, maxm, n_local[4], ldn_local;
magma_int_t N, M, NB, NBk, I, d, num_gpus;
magma_int_t i, ii, jj, h = 2, offset, ib, rows, s;
#if CUDA_VERSION > 3010
size_t totalMem;
#else
unsigned int totalMem;
#endif
CUdevice dev;
static cudaStream_t stream[4][2];
*info = 0;
if (m < 0)
*info = -1;
else if (n < 0)
*info = -2;
else if (lda < max(1,m))
*info = -4;
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
/* Quick return if possible */
if (m == 0 || n == 0)
return *info;
/* initialize nb */
maxm = ((m + 31)/32)*32;
/* figure out NB */
cuDeviceGet( &dev, 0);
cuDeviceTotalMem( &totalMem, dev );
totalMem /= sizeof(float);
/* number of columns in the big panel */
NB = (magma_int_t)(0.8*totalMem/maxm-h*nb);
char * ngr_nb_char = getenv("MAGMA_NGR_NB");
if( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) );
if( num_gpus0 > ceil((float)NB/nb) ) {
num_gpus = (int)ceil((float)NB/nb);
} else {
num_gpus = num_gpus0;
}
if( num_gpus*NB >= n ) {
#ifdef CHECK_SGETRF_OOC
printf( " * still fit in GPU memory.\n" );
#endif
NB = n;
} else {
#ifdef CHECK_SGETRF_OOC
printf( " * don't fit in GPU memory.\n" );
#endif
NB = num_gpus*NB;
NB = max(nb,(NB / nb) * nb); /* making sure it's divisible by nb (x64) */
}
#ifdef CHECK_SGETRF_OOC
if( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d).\n",n,NB,nb );
else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d).\n",n,NB,nb );
fflush(stdout);
#endif
if ( (nb <= 1) || (nb >= min(m,n)) ) {
/* Use CPU code for scalar or one tile. */
lapackf77_sgetrf(&m, &n, a, &lda, ipiv, info);
} else {
/* Use hybrid blocked code. */
/* allocate memory on GPU to store the big panel */
#ifdef PROFILE
start = get_current_time();
#endif
n_local[0] = (NB/nb)/num_gpus;
if( NB%(nb*num_gpus) != 0 ) n_local[0] ++;
n_local[0] *= nb;
ldn_local = ((n_local[0]+31)/32)*32;
for( d=0; d<num_gpus; d++ ) {
if (MAGMA_SUCCESS != magma_smalloc( &dA[d], (h*nb + ldn_local)*maxm )) {
return *info;
}
dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */
dAT[d] = dA[d] + h*nb*maxm;
for( ii=0; ii<h; ii++ ) {
magma_queue_create( &stream[d][ii] );
}
}
#ifdef PROFILE
printf( " memory-allocation time: %e\n",GetTimerValue(start, end)/1000.0 );
start = get_current_time();
#endif
for( I=0; I<n; I+=NB ) {
M = m;
N = min( NB, n-I ); /* number of columns in this big panel */
s = min(max(m-I,0),N)/nb; /* number of small block-columns in this big panel */
maxm = ((M + 31)/32)*32;
if( num_gpus0 > ceil((float)N/nb) ) {
num_gpus = (int)ceil((float)N/nb);
} else {
num_gpus = num_gpus0;
}
for( d=0; d<num_gpus; d++ ) {
n_local[d] = ((N/nb)/num_gpus)*nb;
if (d < (N/nb)%num_gpus)
n_local[d] += nb;
else if (d == (N/nb)%num_gpus)
n_local[d] += N%nb;
}
ldn_local = ((n_local[0]+31)/32)*32;
#ifdef PROFILE
start2 = get_current_time();
#endif
/* upload the next big panel into GPU, transpose (A->A'), and pivot it */
magmablas_ssetmatrix_transpose3(num_gpus, (cudaStream_t **)stream, A(0,I), lda,
dAT, ldn_local, 0, dA, maxm, M, N, nb);
//magmablas_shtodt3(num_gpus, (cudaStream_t **)stream, A(0,I), lda,
// dAT, ldn_local, dA, maxm, M, N, nb, h);
//magmablas_shtodt4(num_gpus, (cudaStream_t **)stream, A(0,I), lda,
// dAT, ldn_local, dA, maxm, M, N, nb, h/2);
#ifdef PROFILE
start1 = get_current_time();
#endif
/* == --------------------------------------------------------------- == */
/* == loop around the previous big-panels to update the new big-panel == */
for( offset = 0; offset<min(m,I); offset+=NB )
{
NBk = min( m-offset, NB );
/* start sending the first tile from the previous big-panels to gpus */
for( d=0; d<num_gpus; d++ ) {
magma_ssetmatrix_async( (M-offset), nb,
A(offset,offset), lda,
dA[d], (maxm-offset), stream[d][0] );
}
/* applying the pivot from the previous big-panel */
for( d=0; d<num_gpus; d++ ) {
magmablas_spermute_long3( inAT(d,0,0), ldn_local, ipiv, NBk, offset );
}
/* == going through each block-column of previous big-panels == */
for( jj=0, ib=offset/nb; jj<NBk; jj+=nb, ib++ )
{
ii = offset+jj;
rows = maxm - ii;
for( d=0; d<num_gpus; d++ ) {
/* upload the previous block-column to GPU */
magma_queue_sync( stream[d][0] );
/* transpose the previous column */
magmablas_stranspose2( inPT(d,0,0), nb, dA[d], rows, M-ii, nb);
/* start sending the next column */
if( jj+nb < NBk )
magma_ssetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb),
A(ii+nb,ii+nb), lda,
dA[d], (rows-nb), stream[d][0] );
/* update with the block column */
n_local[d], nb, c_one, inPT(d,0,0), nb, inAT(d,ib,0), ldn_local );
if( M > ii+nb ) {
n_local[d], M-(ii+nb), nb, c_neg_one, inAT(d,ib,0), ldn_local,
inPT(d,1,0), nb, c_one, inAT(d,ib+1,0), ldn_local );
}
} /* end of for each gpu */
} /* end of for each block-column in a big-panel */
} /* end of for each previous big-panels */
/* calling magma-gpu interface to panel-factorize the big panel */
if( M > I ) {
magma_sgetrf1_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, &a[I*lda], lda,
(cudaStream_t **)stream, &iinfo);
if( iinfo < 0 ) {
*info = iinfo;
break;
} else if( iinfo != 0 ) {
*info = iinfo + I * NB;
//break;
}
/* adjust pivots */
for( ii=I; ii<min(I+N,m); ii++ ) ipiv[ii] += I;
}
#ifdef PROFILE
end1 = get_current_time();
time_rmajor += GetTimerValue(start1, end1);
time_rmajor3 += GetTimerValue(start2, end1);
time_mem += (GetTimerValue(start2, end1)-GetTimerValue(start1, end1))/1000.0;
#endif
/* download the current big panel to CPU */
magmablas_sgetmatrix_transpose3(num_gpus, (cudaStream_t **)stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb);
#ifdef PROFILE
end1 = get_current_time();
time_rmajor2 += GetTimerValue(start1, end1);
#endif
} /* end of for */
#ifdef PROFILE
flops = FLOPS( (float)m, (float)n ) / 1000000;
printf(" NB=%d nb=%d\n",NB,nb);
printf(" memcopy and transpose %e seconds\n",time_mem );
printf(" total time %e seconds\n",GetTimerValue(start0,end)/1000.0);
printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / time_rmajor, time_rmajor /1000.0);
printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / time_rmajor3, time_rmajor3/1000.0);
printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / time_rmajor2, time_rmajor2/1000.0);
printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / GetTimerValue(start, end), GetTimerValue(start,end)/1000.0);
#endif
for( d=0; d<num_gpus0; d++ ) {
magma_free( dA[d] );
for( ii=0; ii<h; ii++ ) {
magma_queue_destroy( stream[d][ii] );
}
}
}
return *info;
} /* magma_sgetrf3_ooc */

Here is the call graph for this function:

Here is the caller graph for this function:

void magmablas_sgetmatrix_transpose3 ( magma_int_t  num_gpus,
cudaStream_t **  stream,
float **  dat,
int  ldda,
float *  ha,
int  lda,
float **  dB,
int  lddb,
int  m,
int  n,
int  nb 
)

Here is the caller graph for this function:

void magmablas_spermute_long3 ( float *  dAT,
int  lda,
int *  ipiv,
int  nb,
int  ind 
)
void magmablas_ssetmatrix_transpose3 ( magma_int_t  num_gpus,
cudaStream_t **  stream,
float *  ha,
int  lda,
float **  dat,
int  ldda,
int  starti,
float **  dB,
int  lddb,
int  m,
int  n,
int  nb 
)

Here is the caller graph for this function: