{
// NOTE(review): this view begins at the opening brace of a function body. The
// signature and the declarations of most names used below (m, n, nb, offset,
// num_gpus, d_lAT, d_lAP, lddat, work, lddwork, info, iinfo, n_local, ldpan,
// mindim, c_one, c_neg_one, ...) are not visible here.
// NOTE(review): several statements below appear only as bare argument lists
// terminated by ");" with no callee name in view, and two "if( d == 0 ) else"
// pairs have no branch bodies. Those lines are kept verbatim and only
// annotated; the missing text must be restored from the full source.

// inAT(id,i,j): pointer d_lAT[id] advanced by ((offset)+(i)*nb)*lddat + (j)*nb.
// The macro expands using offset, nb, lddat and d_lAT from the enclosing scope.
#define inAT(id,i,j) (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb)
// Per-device pointer tables with a fixed capacity of 4 entries.
// NOTE(review): loops below index these with values bounded only by num_gpus;
// no check that num_gpus <= 4 is visible in this view -- confirm one exists.
cuFloatComplex *d_panel[4], *panel_local[4];
// Two streams per device slot (indices [..][0] and [..][1] are both used below).
// NOTE(review): the array is static and no stream creation is visible here --
// confirm where these handles are initialized.
static cudaStream_t streaml[4][2];
// Argument validation: *info is set to a negative code for the first failing
// test (-2 for m < 0, -3 for n < 0, -5 for num_gpus*lddat < max(1,n)).
*info = 0;
if (m < 0)
*info = -2;
else if (n < 0)
*info = -3;
else if (num_gpus*lddat <
max(1,n))
*info = -5;
// Any validation failure returns the code immediately.
if (*info != 0) {
return *info;
}
// Nothing to do for an empty dimension; *info is still 0 at this point.
if (m == 0 || n == 0)
return *info;
// Reject the case where num_gpus exceeds ceil(n/nb).
// NOTE(review): the message says "using %d GPUs" but the code sets *info = -1
// and returns instead of continuing -- confirm which behavior is intended.
if( num_gpus > ceil((float)n/nb) ) {
printf( " * too many GPUs for the matrix size, using %d GPUs\n",num_gpus );
*info = -1;
return *info;
}
{
// maxm is m rounded up to the next multiple of 32.
maxm = ((m + 31)/32)*32;
// Per-device column counts: every device gets ((n/nb)/num_gpus)*nb columns;
// devices with index below (n/nb)%num_gpus get one extra nb, and the device
// whose index equals (n/nb)%num_gpus gets the n%nb remainder.
for(i=0; i<num_gpus; i++){
n_local[i] = ((n/nb)/num_gpus)*nb;
if (i < (n/nb)%num_gpus)
n_local[i] += nb;
else if (i == (n/nb)%num_gpus)
n_local[i] += n%nb;
// d_panel[i] points nb*maxm elements past the start of d_lAP[i].
d_panel[i] = &(d_lAP[i][nb*maxm]);
}
// s = number of full nb-sized steps in mindim.
// NOTE(review): mindim is not assigned in this view -- presumably min(m,n); confirm.
s = mindim / nb;
for( i=0; i<s; i++ )
{
// Step i maps to device id = i%num_gpus, at position i_local = i/num_gpus
// among that device's steps.
id = i%num_gpus;
i_local = i/num_gpus;
// Extents remaining below step i: cols uses the padded maxm, rows uses m.
cols = maxm - i*nb;
rows = m - i*nb;
// NOTE(review): orphaned argument list -- the callee line is missing here.
d_lAP[id], cols,
work, lddwork, streaml[
id][1] );
if ( i>0 ){
// NOTE(review): two orphaned argument lists follow. Judging only from the
// argument shapes (c_one with a panel and one inAT target; then
// c_neg_one / c_one with two inputs and one inAT target) these are
// presumably a triangular solve and a matrix-multiply update for the
// previous step -- TODO confirm against the full source.
n_local[id] - (i_local+1)*nb, nb,
c_one, panel_local[id], ldpan[id],
inAT(
id,i-1,i_local+1), lddat );
n_local[id]-(i_local+1)*nb, rows, nb,
c_neg_one,
inAT(
id,i-1,i_local+1), lddat,
&(panel_local[id][nb*ldpan[id]]), ldpan[id],
c_one,
inAT(
id,i, i_local+1), lddat );
}
// Record only the first positive iinfo, shifted by the step offset i*nb.
// NOTE(review): the statement that assigns iinfo is not visible in this view.
if ( (*info == 0) && (iinfo > 0) ) {
*info = iinfo + i*nb;
}
// NOTE(review): per-device loop whose body is an orphaned argument list
// (d_lAP[d], maxm, stream 0 of device d); the callee line is missing.
for( d=0; d<num_gpus; d++ ) {
d_lAP[d], maxm, streaml[d][0] );
}
for( d=0; d<num_gpus; d++ ) {
// NOTE(review): both branches of this if/else are missing in this view.
if( d == 0 )
else
if( d == id ) {
// Device id uses the location inAT(d,i,i_local) directly as its panel,
// with leading dimension lddat, and continues from local block i_local+1.
panel_local[d] =
inAT(d,i,i_local);
ldpan[d] = lddat;
i_local2 = i_local+1;
} else {
// Other devices use their d_panel buffer with leading dimension nb.
// Devices with index below id start one local block further on.
panel_local[d] = d_panel[d];
ldpan[d] = nb;
i_local2 = i_local;
if( d < id ) i_local2 ++;
}
// nb0: nb while another full step follows step i; otherwise what is left
// of n_local[d] after nb*(s/num_gpus), reduced by nb when d < s%num_gpus.
if ( s > (i+1) ) {
nb0 = nb;
} else {
nb0 = n_local[d]-nb*(s/num_gpus);
if( d < s%num_gpus ) nb0 -= nb;
}
// nb1: the device that maps to step i+1 uses nb0; every other device uses
// all of its local columns from block i_local2 onward.
if( d == (i+1)%num_gpus) {
nb1 = nb0;
} else {
nb1 = n_local[d] - i_local2*nb;
}
// NOTE(review): two orphaned argument lists follow (same shapes as the
// pair inside "if ( i>0 )" above); the callee lines are missing.
nb1, nb, c_one,
panel_local[d], ldpan[d],
inAT(d, i, i_local2), lddat);
nb1, m-(i+1)*nb, nb,
c_neg_one,
inAT(d, i, i_local2), lddat,
&(panel_local[d][nb*ldpan[d]]), ldpan[d],
c_one,
inAT(d, i+1, i_local2), lddat );
}
}
// After the s full steps: device and local position for index s, and the
// size nb0 of what remains, min(m - s*nb, n - s*nb).
id = s%num_gpus;
i_local = s/num_gpus;
nb0 =
min(m - s*nb, n - s*nb);
rows = m - s*nb;
cols = maxm - s*nb;
if( nb0 > 0 ) {
// Same first-failure bookkeeping as inside the loop, with offset s*nb.
// NOTE(review): the statement that produces iinfo here is not visible.
if ( (*info == 0) && (iinfo > 0) )
*info = iinfo + s*nb;
for( d=0; d<num_gpus; d++ ) {
// Devices with index below id start one local block further on.
i_local2 = i_local;
if( d < id ) i_local2 ++;
// Only device id, or devices that still have columns past i_local2*nb,
// take part.
if( d == id || n_local[d] > i_local2*nb )
{
// NOTE(review): orphaned argument list -- the callee line is missing.
d_lAP[d], maxm, streaml[d][0] );
}
}
}
for( d=0; d<num_gpus; d++ ) {
if( nb0 > 0 ) {
// NOTE(review): both branches of this if/else are missing in this view; as
// the text stands, the assignment after "else" would bind to that else.
if( d == 0 )
else
i_local2 = i_local;
if( d < id ) i_local2++;
if( d == id ) {
// Device id: panel is the location inAT(d,s,i_local); nb1 is what is left
// of its local columns after i_local*nb and the nb0 remainder.
panel_local[d] =
inAT(d,s,i_local);
nb1 = n_local[d] - i_local*nb-nb0;
// NOTE(review): the guarded statement is an orphaned argument list; the
// callee line is missing.
if( nb1 > 0 )
nb1, nb0, c_one,
panel_local[d], lddat,
inAT(d,s,i_local)+nb0, lddat);
} else if( n_local[d] > i_local2*nb ) {
// Other devices with remaining columns use d_panel[d]; the leading
// dimension passed in the fragment below is nb0.
panel_local[d] = d_panel[d];
nb1 = n_local[d] - i_local2*nb;
// NOTE(review): orphaned argument list -- the callee line is missing.
nb1, nb0, c_one,
panel_local[d], nb0,
inAT(d,s,i_local2), lddat);
}
}
}
}
// Returns the value stored in *info: 0, a negative argument code set above, or
// the first recorded positive iinfo plus its step offset.
return *info;
}