MAGMA  1.2.0
MatrixAlgebraonGPUandMulticoreArchitectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
cunmqr_m.cpp File Reference
#include "common_magma.h"
Include dependency graph for cunmqr_m.cpp:

Go to the source code of this file.

Macros

#define N_MAX_GPU   8
#define A(i, j)   ( a+(j)*lda + (i))
#define C(i, j)   ( c+(j)*ldc + (i))
#define dC(gpui, i, j)   (dw[gpui]+(j)*lddc + (i))
#define dA_c(gpui, ind, i, j)   (dw[gpui] + n_l*lddc + (ind)*lddar*lddac + (i) + (j)*lddac)
#define dA_r(gpui, ind, i, j)   (dw[gpui] + n_l*lddc + (ind)*lddar*lddac + (i) + (j)*lddar)
#define dt(gpui, ind)   (dw[gpui] + n_l*lddc + 2*lddac*lddar + (ind)*(nb+1)*nb)
#define dwork(gpui, ind)   (dw[gpui] + n_l*lddc + 2*lddac*lddar + 2*(nb+1)*nb + (ind)*lddwork*nb)

Functions

void magmablas_csetdiag1subdiag0_stream (char uplo, int k, int nb, cuFloatComplex *A, int lda, cudaStream_t stream)
magma_int_t magma_cunmqr_m (magma_int_t nrgpu, const char side, const char trans, magma_int_t m, magma_int_t n, magma_int_t k, cuFloatComplex *a, magma_int_t lda, cuFloatComplex *tau, cuFloatComplex *c, magma_int_t ldc, cuFloatComplex *work, magma_int_t lwork, magma_int_t *info)

Macro Definition Documentation

#define A (   i,
 
)    ( a+(j)*lda + (i))

Definition at line 17 of file cunmqr_m.cpp.

#define C (   i,
 
)    ( c+(j)*ldc + (i))

Definition at line 18 of file cunmqr_m.cpp.

#define dA_c (   gpui,
  ind,
  i,
 
)    (dw[gpui] + n_l*lddc + (ind)*lddar*lddac + (i) + (j)*lddac)

Definition at line 21 of file cunmqr_m.cpp.

#define dA_r (   gpui,
  ind,
  i,
 
)    (dw[gpui] + n_l*lddc + (ind)*lddar*lddac + (i) + (j)*lddar)

Definition at line 22 of file cunmqr_m.cpp.

#define dC (   gpui,
  i,
 
)    (dw[gpui]+(j)*lddc + (i))

Definition at line 20 of file cunmqr_m.cpp.

#define dt (   gpui,
  ind 
)    (dw[gpui] + n_l*lddc + 2*lddac*lddar + (ind)*(nb+1)*nb)

Definition at line 23 of file cunmqr_m.cpp.

#define dwork (   gpui,
  ind 
)    (dw[gpui] + n_l*lddc + 2*lddac*lddar + 2*(nb+1)*nb + (ind)*lddwork*nb)

Definition at line 24 of file cunmqr_m.cpp.

#define N_MAX_GPU   8

Definition at line 13 of file cunmqr_m.cpp.


Function Documentation

magma_int_t magma_cunmqr_m ( magma_int_t  nrgpu,
const char  side,
const char  trans,
magma_int_t  m,
magma_int_t  n,
magma_int_t  k,
cuFloatComplex *  a,
magma_int_t  lda,
cuFloatComplex *  tau,
cuFloatComplex *  c,
magma_int_t  ldc,
cuFloatComplex *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 31 of file cunmqr_m.cpp.

References __func__, A, C, dA_c, dC, dt, dwork, lapackf77_clarft(), lapackf77_cunmqr(), lapackf77_lsame, MAGMA_C_ONE, MAGMA_C_SET2REAL, magma_cgetmatrix_async(), magma_clarfb_gpu(), magma_cmalloc(), magma_csetmatrix_async(), MAGMA_ERR_DEVICE_ALLOC, magma_free(), magma_getdevice(), magma_queue_create(), magma_queue_destroy(), magma_queue_sync(), magma_setdevice(), MAGMA_SUCCESS, magma_xerbla(), magmablas_csetdiag1subdiag0_stream(), magmablasSetKernelStream(), MagmaColumnwise, MagmaForward, max, min, N_MAX_GPU, side, and trans.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
CUNMQR overwrites the general complex M-by-N matrix C with
SIDE = 'L' SIDE = 'R'
TRANS = 'N': Q * C C * Q
TRANS = 'T': Q**H * C C * Q**H
where Q is a complex orthogonal matrix defined as the product of k
elementary reflectors
Q = H(1) H(2) . . . H(k)
as returned by CGEQRF. Q is of order M if SIDE = 'L' and of order N
if SIDE = 'R'.
Arguments
=========
SIDE (input) CHARACTER*1
= 'L': apply Q or Q**H from the Left;
= 'R': apply Q or Q**H from the Right.
TRANS (input) CHARACTER*1
= 'N': No transpose, apply Q;
= 'T': Transpose, apply Q**H.
M (input) INTEGER
The number of rows of the matrix C. M >= 0.
N (input) INTEGER
The number of columns of the matrix C. N >= 0.
K (input) INTEGER
The number of elementary reflectors whose product defines
the matrix Q.
If SIDE = 'L', M >= K >= 0;
if SIDE = 'R', N >= K >= 0.
A (input) COMPLEX array, dimension (LDA,K)
The i-th column must contain the vector which defines the
elementary reflector H(i), for i = 1,2,...,k, as returned by
CGEQRF in the first k columns of its array argument A.
LDA (input) INTEGER
The leading dimension of the array A.
If SIDE = 'L', LDA >= max(1,M);
if SIDE = 'R', LDA >= max(1,N).
TAU (input) COMPLEX array, dimension (K)
TAU(i) must contain the scalar factor of the elementary
reflector H(i), as returned by CGEQRF.
C (input/output) COMPLEX array, dimension (LDC,N)
On entry, the M-by-N matrix C.
On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.
LDC (input) INTEGER
The leading dimension of the array C. LDC >= max(1,M).
WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(0) returns the optimal LWORK.
LWORK (input) INTEGER
The dimension of the array WORK.
If SIDE = 'L', LWORK >= max(1,N);
if SIDE = 'R', LWORK >= max(1,M).
For optimum performance LWORK >= N*NB if SIDE = 'L', and
LWORK >= M*NB if SIDE = 'R', where NB is the optimal
blocksize.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued by XERBLA.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
===================================================================== */
cuFloatComplex c_one = MAGMA_C_ONE;
char side_[2] = {side, 0};
char trans_[2] = {trans, 0};
cuFloatComplex* dw[N_MAX_GPU];
cudaStream_t stream [N_MAX_GPU][2];
magma_int_t ind_c, kb;
cuFloatComplex t[4160]; /* was [65][64] */
magma_int_t i1, i2, i3, ib, nb, nq, nw;
magma_int_t left, notran, lquery;
magma_int_t iinfo, lwkopt;
magma_int_t igpu = 0;
int gpu_b;
magma_getdevice(&gpu_b);
*info = 0;
left = lapackf77_lsame(side_, "L");
notran = lapackf77_lsame(trans_, "N");
lquery = (lwork == -1);
/* NQ is the order of Q and NW is the minimum dimension of WORK */
if (left) {
nq = m;
nw = n;
} else {
nq = n;
nw = m;
}
if (! left && ! lapackf77_lsame(side_, "R")) {
*info = -1;
} else if (! notran && ! lapackf77_lsame(trans_, "T")) {
*info = -2;
} else if (m < 0) {
*info = -3;
} else if (n < 0) {
*info = -4;
} else if (k < 0 || k > nq) {
*info = -5;
} else if (lda < max(1,nq)) {
*info = -7;
} else if (ldc < max(1,m)) {
*info = -10;
} else if (lwork < max(1,nw) && ! lquery) {
*info = -12;
}
if (*info == 0)
{
/* Determine the block size. NB may be at most NBMAX, where NBMAX
is used to define the local array T. */
nb = 64;
lwkopt = max(1,nw) * nb;
MAGMA_C_SET2REAL( work[0], lwkopt );
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery) {
return *info;
}
/* Quick return if possible */
if (m == 0 || n == 0 || k == 0) {
work[0] = c_one;
return *info;
}
magma_int_t lddc = m;
magma_int_t lddac = nq;
magma_int_t lddar =nb;
magma_int_t lddwork = nw;
magma_int_t n_l = (n-1)/nrgpu+1; // local n
for (igpu = 0; igpu < nrgpu; ++igpu){
if (MAGMA_SUCCESS != magma_cmalloc( &dw[igpu], (n_l*lddc + 2*lddac*lddar + 2*(nb + 1 + lddwork)*nb) )) {
printf("%d: size: %ld\n", igpu, (n_l*lddc + 2*lddac*lddar + (nb+1+lddwork)*nb)*sizeof(cuFloatComplex));
magma_xerbla( __func__, -(*info) );
return *info;
}
magma_queue_create( &stream[igpu][0] );
magma_queue_create( &stream[igpu][1] );
}
if (nb >= k)
{
/* Use CPU code */
lapackf77_cunmqr(side_, trans_, &m, &n, &k, a, &lda, tau,
c, &ldc, work, &lwork, &iinfo);
}
else
{
/* Use hybrid CPU-MGPU code */
if (left) {
//copy C to mgpus
for (igpu = 0; igpu < nrgpu; ++igpu){
kb = min(n_l, n-igpu*n_l);
C(0, igpu*n_l), ldc,
dC(igpu, 0, 0), lddc, stream[igpu][0] );
}
if ( !notran ) {
i1 = 0;
i2 = k;
i3 = nb;
} else {
i1 = (k - 1) / nb * nb;
i2 = 0;
i3 = -nb;
}
kb = min(nb, k-i1);
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_csetmatrix_async( (nq-i1), kb,
A(i1, i1), lda,
dA_c(igpu, 0, i1, 0), lddac, stream[igpu][0] );
}
ind_c = 0;
for (i = i1; i3 < 0 ? i >= i2 : i < i2; i += i3)
{
ib = min(nb, k - i);
/* Form the triangular factor of the block reflector
H = H(i) H(i+1) . . . H(i+ib-1) */
i__4 = nq - i;
lapackf77_clarft("F", "C", &i__4, &ib, A(i, i), &lda,
&tau[i], t, &ib);
/* H or H' is applied to C(1:m,i:n) */
/* Apply H or H'; First copy T to the GPU */
for (igpu = 0; igpu < nrgpu; ++igpu){
t, ib,
dt(igpu, ind_c), ib, stream[igpu][ind_c] );
magma_queue_sync( stream[igpu][ind_c] ); // Makes sure that we can change t next iteration.
}
// start the copy of next A panel
kb = min(nb, k - i - i3);
if (kb > 0 && i+i3 >= 0){
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_csetmatrix_async( (i__4-i3), kb,
A(i+i3, i+i3), lda,
dA_c(igpu, (ind_c+1)%2, i+i3, 0), lddac, stream[igpu][(ind_c+1)%2] );
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
// Put 0s in the upper triangular part of dA;
magmablas_csetdiag1subdiag0_stream('L', ib, ib, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][ind_c]);
magmablasSetKernelStream(stream[igpu][ind_c]);
m-i, n_l, ib,
dA_c(igpu, ind_c, i, 0), lddac, dt(igpu, ind_c), ib,
dC(igpu, i, 0), lddc,
dwork(igpu, ind_c), lddwork);
}
ind_c = (ind_c+1)%2;
}
//copy C from mgpus
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][0] );
magma_queue_sync( stream[igpu][1] );
kb = min(n_l, n-igpu*n_l);
dC(igpu, 0, 0), lddc,
C(0, igpu*n_l), ldc, stream[igpu][0] );
}
} else {
fprintf(stderr, "The case (side == right) is not implemented\n");
return *info;
/*if ( notran ) {
i1 = 0;
i2 = k;
i3 = nb;
} else {
i1 = (k - 1) / nb * nb;
i2 = 0;
i3 = -nb;
}
mi = m;
ic = 0;
for (i = i1; i3 < 0 ? i >= i2 : i < i2; i += i3)
{
ib = min(nb, k - i);
// Form the triangular factor of the block reflector
// H = H(i) H(i+1) . . . H(i+ib-1)
i__4 = nq - i;
lapackf77_clarft("F", "C", &i__4, &ib, A(i, i), &lda,
&tau[i], t, &ib);
// 1) copy the panel from A to the GPU, and
// 2) Put 0s in the upper triangular part of dA;
magma_csetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda );
magmablas_csetdiag1subdiag0('L', ib, ib, dA(i, 0), ldda);
// H or H' is applied to C(1:m,i:n)
ni = n - i;
jc = i;
// Apply H or H'; First copy T to the GPU
magma_csetmatrix( ib, ib, t, ib, dt, ib );
magma_clarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
mi, ni, ib,
dA(i, 0), ldda, dt, ib,
dC(ic, jc), lddc,
dwork, lddwork);
}
*/
}
}
MAGMA_C_SET2REAL( work[0], lwkopt );
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][0] );
magma_queue_destroy( stream[igpu][0] );
magma_queue_destroy( stream[igpu][1] );
magma_free( dw[igpu] );
}
return *info;
} /* magma_cunmqr */

Here is the call graph for this function:

Here is the caller graph for this function:

void magmablas_csetdiag1subdiag0_stream ( char  uplo,
int  k,
int  nb,
cuFloatComplex *  A,
int  lda,
cudaStream_t  stream 
)

Here is the caller graph for this function: