{
/*
 * Body of a routine whose signature is outside this view.
 *
 * What the visible code does:
 *   - validates m, n, lda and lwork, reporting a negative code through *info;
 *   - creates two CUDA streams and allocates one device buffer `da`
 *     (plus a trailing workspace `dwork` carved out of the same allocation);
 *   - walks the columns in steps of `nb`, moving pieces of `a` between host
 *     and device with pitched asynchronous copies;
 *   - destroys the streams and frees the device buffer before falling off
 *     the end.
 *
 * NOTE(review): several statements in this body are argument lists with no
 * call head, or calls with no closing parenthesis, so this chunk does not
 * parse as shown. The compute calls that belong there are not visible here;
 * confirm against the original source before changing anything.
 */

/* Pointer to host element at offset a_1 within stride-`lda` line a_2 of `a`. */
#define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1))
/* Same addressing for the device copy `da`, using stride `ldda`. */
#define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1))
/* Panel counter; incremented once per panel and used to index qr_params->t. */
int cnt=-1;
int i, k, lddwork, old_i, old_ib;
int nbmin, nx, ib, ldda;
*info = 0;
/* NOTE(review): lwkopt is computed but never read in the visible code. */
int lwkopt = n * nb;
/* lwork == -1 is treated as a query: it exempts the lwork size check below. */
long int lquery = (lwork == -1);
/* Argument checks: the first failing test sets *info to -1, -2, -4 or -7. */
if (m < 0) {
*info = -1;
} else if (n < 0) {
*info = -2;
}
else if (lda <
max(1,m)) {
*info = -4;
}
else if (lwork <
max(1,n) && ! lquery) {
*info = -7;
}
/* NOTE(review): every branch below is empty, so a failed argument check or a
   workspace query does not stop execution here. `k` is also read without any
   visible assignment (it is only declared above). Presumably early returns
   and the initialisation of `k` were lost; verify. */
if (*info != 0) {
}
else if (lquery)
if (k == 0) {
}
cublasStatus status;
/* NOTE(review): the array is `static` yet the streams are created and
   destroyed on every call, and the cudaStreamCreate results are not checked. */
static cudaStream_t stream[2];
cudaStreamCreate(&stream[0]);
cudaStreamCreate(&stream[1]);
nbmin = 2;
nx = nb;
/* Round n and m up to the next multiple of 32 for the device strides. */
lddwork = ((n+31)/32)*32;
ldda = ((m+31)/32)*32;
cuDoubleComplex *da;
/* One allocation of n*ldda + nb*lddwork elements: matrix copy plus workspace. */
status = cublasAlloc((n)*ldda + nb*lddwork, sizeof(cuDoubleComplex), (void**)&da);
if (status != CUBLAS_STATUS_SUCCESS) {
/* NOTE(review): this return skips the cudaStreamDestroy calls at the end of
   the body, so the two streams created above are not released on this path. */
*info = -8;
return 0;
}
/* Workspace starts right after the first ldda*n elements of `da`. */
cuDoubleComplex *
dwork = da + ldda*(n);
/* Blocked path: taken only when nb >= nbmin and both nb and nx are below k. */
if (nb >= nbmin && nb < k && nx < k) {
/* Queue on stream[0] a host-to-device copy of (n-nb) lines of m elements,
   starting at line nb; the first nb lines are not uploaded here. */
cudaMemcpy2DAsync(
da_ref(0,nb), ldda*
sizeof(cuDoubleComplex),
a_ref(0,nb), lda *
sizeof(cuDoubleComplex),
sizeof(cuDoubleComplex)*(m), (n-nb),
cudaMemcpyHostToDevice,stream[0]);
old_i = 0; old_ib = nb;
for (i = 0; i < k-nx; i += nb) {
if (i>0){
/* NOTE(review): `ib` is used as the line count of both copies below but has
   no visible assignment before this point; presumably a per-iteration
   assignment was lost. Verify. */
/* Device-to-host on stream[1]: ib lines of (m-i) elements from (i,i). */
cudaMemcpy2DAsync(
a_ref(i,i), lda *
sizeof(cuDoubleComplex),
da_ref(i,i), ldda*
sizeof(cuDoubleComplex),
sizeof(cuDoubleComplex)*(m-i), ib,
cudaMemcpyDeviceToHost,stream[1]);
/* Device-to-host on stream[0]: ib lines of i elements from (0,i). */
cudaMemcpy2DAsync(
a_ref(0,i), lda *
sizeof(cuDoubleComplex),
da_ref(0,i), ldda*
sizeof(cuDoubleComplex),
sizeof(cuDoubleComplex)*i, ib,
cudaMemcpyDeviceToHost,stream[0]);
/* NOTE(review): the next three lines are an argument list with no call head;
   the arguments refer to the previous panel (old_i, old_ib). */
m-old_i, n-old_i-2*old_ib, old_ib,
da_ref(old_i, old_i), ldda, dwork, lddwork,
da_ref(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork);
}
/* Block the host until everything queued on stream[1] has completed. */
cudaStreamSynchronize(stream[1]);
int rows = m-i;
cnt++;
/* As written, this stores qr_params->ib into cntxt->nb. */
cntxt->
nb = qr_params->
ib;
/* NOTE(review): the next five lines are two argument-list tails with no call
   head; the second one addresses the cnt-th nb*nb slot of qr_params->t. */
tau+i,
work, &lwork, info);
&rows, &ib,
a_ref(i,i), &lda, tau+i, qr_params->
t+cnt*nb*nb, &ib);
if (cnt < qr_params->np_gpu) {
}
/* NOTE(review): this cublasSetMatrix call is cut off after its third
   argument and runs straight into the `if` below. */
cublasSetMatrix(rows, ib, sizeof(cuDoubleComplex),
if (qr_params->
flag == 1)
if (i + ib < n) {
/* Upload the ib-by-ib block at qr_params->t + cnt*nb*nb to the start of dwork. */
cublasSetMatrix(ib, ib,
sizeof(cuDoubleComplex), qr_params->
t+cnt*nb*nb, ib, dwork, lddwork);
/* NOTE(review): both branches below are argument lists with no call head.
   They differ only in the second argument: ib versus n-i-ib. */
if (i+ib < k-nx)
rows, ib, ib,
da_ref(i, i ), ldda, dwork, lddwork,
da_ref(i, i+ib), ldda, dwork+ib, lddwork);
else
rows, n-i-ib, ib,
da_ref(i, i ), ldda, dwork, lddwork,
da_ref(i, i+ib), ldda, dwork+ib, lddwork);
/* Remember this panel; the i>0 branch of the next iteration reads these. */
old_i = i;
old_ib = ib;
}
}
} else {
/* Blocked path not taken: start the remaining-lines step from 0. */
i = 0;
}
/* Remaining lines from i onward are handled as a single panel of width n-i. */
if (i < k)
{
ib = n-i;
/* NOTE(review): this cublasGetMatrix call is cut off after its third
   argument and runs straight into the declaration below. */
if (i!=0)
cublasGetMatrix(m, ib, sizeof(cuDoubleComplex),
int rows = m-i;
cnt++;
if (cnt < qr_params->np_gpu)
{
/* NOTE(review): argument list with no call head; `ib2` has no visible
   declaration or assignment in this chunk. */
&rows, &ib2,
a_ref(i,i), &lda, tau+i, qr_params->
t+cnt*nb*nb, &ib2);
}
}
/* Release the per-call streams and the device buffer. */
cudaStreamDestroy( stream[0] );
cudaStreamDestroy( stream[1] );
cublasFree(da);
}