{
#define dlA(gpu,a_1,a_2) ( dlA[gpu]+(a_2)*(ldda) + (a_1))
#define work_ref(a_1) ( work + (a_1))
#define hwork ( work + (nb)*(m))
#define hwrk_ref(a_1) ( local_work + (a_1))
#define lhwrk ( local_work + (nb)*(m))
float *
dwork[4], *panel[4], *local_work;
magma_int_t i, j, k, ldwork, lddwork, old_i, old_ib, rows;
float ctime, dtime;
int panel_gpunum=-1, i_local, n_local[4], la_gpu, displacement;
*info = 0;
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
}
else if (ldda <
max(1,m)) {
*info = -4;
}
if (*info != 0) {
return *info;
}
if (k == 0)
return *info;
displacement = n * nb;
lwork =
max((m+n+64) * nb,n*m+n*nb);
lhwork = lwork - (m)*nb;
for(i=0; i<num_gpus; i++){
#ifdef MultiGPUs
#endif
return *info;
}
}
for(i=0; i<num_gpus; i++){
n_local[i] = ((n/nb)/num_gpus)*nb;
if (i < (n/nb)%num_gpus)
n_local[i] += nb;
else if (i == (n/nb)%num_gpus)
n_local[i] += n%nb;
}
for(i=0; i<num_gpus; i++){
#ifdef MultiGPUs
#endif
}
return *info;
}
static cudaStream_t streaml[4][2];
cudaEvent_t start[4], stop[4][10];
for(i=0; i<num_gpus; i++){
#ifdef MultiGPUs
#endif
for(j=0; j<10; j++)
}
for(j=0; j<num_gpus; j++){
}
nbmin = 2;
nx = nb;
ldwork = m;
lddwork= n;
if (nb >= nbmin && nb < k && nx < k) {
old_i = 0; old_ib = nb;
for (i = 0; i < k-nx; i += nb)
{
panel_gpunum = (i/nb)%num_gpus;
i_local = i/(nb*num_gpus)*nb;
rows = m -i;
#ifdef MultiGPUs
#endif
dlA(panel_gpunum, i, i_local), ldda,
hwrk_ref(i), ldwork, streaml[panel_gpunum][1] );
if (i>0){
la_gpu = panel_gpunum;
#ifdef MultiGPUs
#endif
m-old_i, n_local[la_gpu]-i_local-old_ib, old_ib,
panel[la_gpu], ldda, dwork[la_gpu], lddwork,
dlA(la_gpu, old_i, i_local+old_ib), ldda,
dwork[la_gpu]+old_ib, lddwork);
la_gpu = ((i-nb)/nb)%num_gpus;
#ifdef MultiGPUs
#endif
panel[la_gpu], ldda, streaml[la_gpu][0] );
}
#ifdef MultiGPUs
#endif
&rows, &ib,
hwrk_ref(i), &ldwork, tau+i, lhwrk, &ib);
for(j=0; j<num_gpus; j++)
{
#ifdef MultiGPUs
#endif
if (j == panel_gpunum)
panel[j] =
dlA(j, i, i_local);
else
panel[j] = dwork[j]+displacement;
panel[j], ldda, streaml[j][0] );
}
for(j=0; j<num_gpus; j++)
{
#ifdef MultiGPUs
#endif
}
if (i>0){
for(j=0; j<num_gpus; j++){
}
}
if (i + ib < n)
{
for(j=0; j<num_gpus; j++)
{
#ifdef MultiGPUs
#endif
lhwrk, ib,
dwork[j], lddwork, streaml[j][0] );
}
if (i+nb < k-nx)
{
la_gpu = (panel_gpunum+1)%num_gpus;
int i_loc = (i+nb)/(nb*num_gpus)*nb;
for(j=0; j<num_gpus; j++){
#ifdef MultiGPUs
#endif
if (j==la_gpu)
rows, ib, ib,
panel[j], ldda, dwork[j], lddwork,
dlA(j, i, i_loc), ldda, dwork[j]+ib, lddwork);
else if (j<=panel_gpunum)
rows, n_local[j]-i_local-ib, ib,
panel[j], ldda, dwork[j], lddwork,
dlA(j, i, i_local+ib), ldda, dwork[j]+ib, lddwork);
else
rows, n_local[j]-i_local, ib,
panel[j], ldda, dwork[j], lddwork,
dlA(j, i, i_local), ldda, dwork[j]+ib, lddwork);
}
}
else {
la_gpu = (panel_gpunum+1)%num_gpus;
int i_loc = (i+nb)/(nb*num_gpus)*nb;
#ifdef MultiGPUs
#endif
rows, n_local[la_gpu]-i_loc, ib,
panel[la_gpu], ldda, dwork[la_gpu], lddwork,
dlA(la_gpu, i, i_loc), ldda, dwork[la_gpu]+ib, lddwork);
#ifdef MultiGPUs
#endif
dlA(panel_gpunum, i, i_local), ldda );
}
old_i = i;
old_ib = ib;
}
}
} else {
i = 0;
}
for(j=0; j<num_gpus; j++){
#ifdef MultiGPUs
#endif
}
if (i < k) {
ib = n-i;
rows = m-i;
lhwork = lwork - rows*ib;
panel_gpunum = (panel_gpunum+1)%num_gpus;
int i_loc = (i)/(nb*num_gpus)*nb;
#ifdef MultiGPUs
#endif
if (i == 0) {
} else {
dlA(panel_gpunum, i, i_loc), ldda,
lhwrk, rows );
}
lhwork = lwork - rows*ib;
lapackf77_sgeqrf(&rows, &ib, lhwrk, &rows, tau+i, lhwrk+ib*rows, &lhwork, info);
if (i == 0) {
} else {
lhwrk, rows,
dlA(panel_gpunum, i, i_loc), ldda );
}
}
for(i=0; i<num_gpus; i++){
#ifdef MultiGPUs
#endif
}
return *info;
}