MAGMA  1.2.0
MatrixAlgebraonGPUandMulticoreArchitectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
zunmqr_m.cpp File Reference
#include "common_magma.h"
Include dependency graph for zunmqr_m.cpp:

Go to the source code of this file.

Macros

#define N_MAX_GPU   8
#define A(i, j)   ( a+(j)*lda + (i))
#define C(i, j)   ( c+(j)*ldc + (i))
#define dC(gpui, i, j)   (dw[gpui]+(j)*lddc + (i))
#define dA_c(gpui, ind, i, j)   (dw[gpui] + n_l*lddc + (ind)*lddar*lddac + (i) + (j)*lddac)
#define dA_r(gpui, ind, i, j)   (dw[gpui] + n_l*lddc + (ind)*lddar*lddac + (i) + (j)*lddar)
#define dt(gpui, ind)   (dw[gpui] + n_l*lddc + 2*lddac*lddar + (ind)*(nb+1)*nb)
#define dwork(gpui, ind)   (dw[gpui] + n_l*lddc + 2*lddac*lddar + 2*(nb+1)*nb + (ind)*lddwork*nb)

Functions

void magmablas_zsetdiag1subdiag0_stream (char uplo, int k, int nb, cuDoubleComplex *A, int lda, cudaStream_t stream)
magma_int_t magma_zunmqr_m (magma_int_t nrgpu, const char side, const char trans, magma_int_t m, magma_int_t n, magma_int_t k, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *c, magma_int_t ldc, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info)

Macro Definition Documentation

#define A (   i,
 
)    ( a+(j)*lda + (i))

Definition at line 17 of file zunmqr_m.cpp.

#define C (   i,
 
)    ( c+(j)*ldc + (i))

Definition at line 18 of file zunmqr_m.cpp.

#define dA_c (   gpui,
  ind,
  i,
 
)    (dw[gpui] + n_l*lddc + (ind)*lddar*lddac + (i) + (j)*lddac)

Definition at line 21 of file zunmqr_m.cpp.

#define dA_r (   gpui,
  ind,
  i,
 
)    (dw[gpui] + n_l*lddc + (ind)*lddar*lddac + (i) + (j)*lddar)

Definition at line 22 of file zunmqr_m.cpp.

#define dC (   gpui,
  i,
 
)    (dw[gpui]+(j)*lddc + (i))

Definition at line 20 of file zunmqr_m.cpp.

#define dt (   gpui,
  ind 
)    (dw[gpui] + n_l*lddc + 2*lddac*lddar + (ind)*(nb+1)*nb)

Definition at line 23 of file zunmqr_m.cpp.

#define dwork (   gpui,
  ind 
)    (dw[gpui] + n_l*lddc + 2*lddac*lddar + 2*(nb+1)*nb + (ind)*lddwork*nb)

Definition at line 24 of file zunmqr_m.cpp.

#define N_MAX_GPU   8

Definition at line 13 of file zunmqr_m.cpp.


Function Documentation

magma_int_t magma_zunmqr_m ( magma_int_t  nrgpu,
const char  side,
const char  trans,
magma_int_t  m,
magma_int_t  n,
magma_int_t  k,
cuDoubleComplex *  a,
magma_int_t  lda,
cuDoubleComplex *  tau,
cuDoubleComplex *  c,
magma_int_t  ldc,
cuDoubleComplex *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 31 of file zunmqr_m.cpp.

References __func__, A, C, dA_c, dC, dt, dwork, lapackf77_lsame, lapackf77_zlarft, lapackf77_zunmqr, MAGMA_ERR_DEVICE_ALLOC, magma_free(), magma_getdevice(), magma_queue_create(), magma_queue_destroy(), magma_queue_sync(), magma_setdevice(), MAGMA_SUCCESS, magma_xerbla(), MAGMA_Z_ONE, MAGMA_Z_SET2REAL, magma_zgetmatrix_async(), magma_zlarfb_gpu(), magma_zmalloc(), magma_zsetmatrix_async(), magmablas_zsetdiag1subdiag0_stream(), magmablasSetKernelStream(), MagmaColumnwise, MagmaForward, max, min, N_MAX_GPU, side, and trans.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
ZUNMQR overwrites the general complex M-by-N matrix C with
SIDE = 'L' SIDE = 'R'
TRANS = 'N': Q * C C * Q
TRANS = 'T': Q**H * C C * Q**H
where Q is a complex orthogonal matrix defined as the product of k
elementary reflectors
Q = H(1) H(2) . . . H(k)
as returned by ZGEQRF. Q is of order M if SIDE = 'L' and of order N
if SIDE = 'R'.
Arguments
=========
SIDE (input) CHARACTER*1
= 'L': apply Q or Q**H from the Left;
= 'R': apply Q or Q**H from the Right.
TRANS (input) CHARACTER*1
= 'N': No transpose, apply Q;
= 'T': Transpose, apply Q**H.
M (input) INTEGER
The number of rows of the matrix C. M >= 0.
N (input) INTEGER
The number of columns of the matrix C. N >= 0.
K (input) INTEGER
The number of elementary reflectors whose product defines
the matrix Q.
If SIDE = 'L', M >= K >= 0;
if SIDE = 'R', N >= K >= 0.
A (input) COMPLEX_16 array, dimension (LDA,K)
The i-th column must contain the vector which defines the
elementary reflector H(i), for i = 1,2,...,k, as returned by
ZGEQRF in the first k columns of its array argument A.
LDA (input) INTEGER
The leading dimension of the array A.
If SIDE = 'L', LDA >= max(1,M);
if SIDE = 'R', LDA >= max(1,N).
TAU (input) COMPLEX_16 array, dimension (K)
TAU(i) must contain the scalar factor of the elementary
reflector H(i), as returned by ZGEQRF.
C (input/output) COMPLEX_16 array, dimension (LDC,N)
On entry, the M-by-N matrix C.
On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.
LDC (input) INTEGER
The leading dimension of the array C. LDC >= max(1,M).
WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(0) returns the optimal LWORK.
LWORK (input) INTEGER
The dimension of the array WORK.
If SIDE = 'L', LWORK >= max(1,N);
if SIDE = 'R', LWORK >= max(1,M).
For optimum performance LWORK >= N*NB if SIDE = 'L', and
LWORK >= M*NB if SIDE = 'R', where NB is the optimal
blocksize.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued by XERBLA.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
===================================================================== */
cuDoubleComplex c_one = MAGMA_Z_ONE;
char side_[2] = {side, 0};
char trans_[2] = {trans, 0};
cuDoubleComplex* dw[N_MAX_GPU];
cudaStream_t stream [N_MAX_GPU][2];
magma_int_t ind_c, kb;
cuDoubleComplex t[4160]; /* was [65][64] */
magma_int_t i1, i2, i3, ib, nb, nq, nw;
magma_int_t left, notran, lquery;
magma_int_t iinfo, lwkopt;
magma_int_t igpu = 0;
int gpu_b;
magma_getdevice(&gpu_b);
*info = 0;
left = lapackf77_lsame(side_, "L");
notran = lapackf77_lsame(trans_, "N");
lquery = (lwork == -1);
/* NQ is the order of Q and NW is the minimum dimension of WORK */
if (left) {
nq = m;
nw = n;
} else {
nq = n;
nw = m;
}
if (! left && ! lapackf77_lsame(side_, "R")) {
*info = -1;
} else if (! notran && ! lapackf77_lsame(trans_, "T")) {
*info = -2;
} else if (m < 0) {
*info = -3;
} else if (n < 0) {
*info = -4;
} else if (k < 0 || k > nq) {
*info = -5;
} else if (lda < max(1,nq)) {
*info = -7;
} else if (ldc < max(1,m)) {
*info = -10;
} else if (lwork < max(1,nw) && ! lquery) {
*info = -12;
}
if (*info == 0)
{
/* Determine the block size. NB may be at most NBMAX, where NBMAX
is used to define the local array T. */
nb = 64;
lwkopt = max(1,nw) * nb;
MAGMA_Z_SET2REAL( work[0], lwkopt );
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery) {
return *info;
}
/* Quick return if possible */
if (m == 0 || n == 0 || k == 0) {
work[0] = c_one;
return *info;
}
magma_int_t lddc = m;
magma_int_t lddac = nq;
magma_int_t lddar =nb;
magma_int_t lddwork = nw;
magma_int_t n_l = (n-1)/nrgpu+1; // local n
for (igpu = 0; igpu < nrgpu; ++igpu){
if (MAGMA_SUCCESS != magma_zmalloc( &dw[igpu], (n_l*lddc + 2*lddac*lddar + 2*(nb + 1 + lddwork)*nb) )) {
printf("%d: size: %ld\n", igpu, (n_l*lddc + 2*lddac*lddar + (nb+1+lddwork)*nb)*sizeof(cuDoubleComplex));
magma_xerbla( __func__, -(*info) );
return *info;
}
magma_queue_create( &stream[igpu][0] );
magma_queue_create( &stream[igpu][1] );
}
if (nb >= k)
{
/* Use CPU code */
lapackf77_zunmqr(side_, trans_, &m, &n, &k, a, &lda, tau,
c, &ldc, work, &lwork, &iinfo);
}
else
{
/* Use hybrid CPU-MGPU code */
if (left) {
//copy C to mgpus
for (igpu = 0; igpu < nrgpu; ++igpu){
kb = min(n_l, n-igpu*n_l);
C(0, igpu*n_l), ldc,
dC(igpu, 0, 0), lddc, stream[igpu][0] );
}
if ( !notran ) {
i1 = 0;
i2 = k;
i3 = nb;
} else {
i1 = (k - 1) / nb * nb;
i2 = 0;
i3 = -nb;
}
kb = min(nb, k-i1);
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_zsetmatrix_async( (nq-i1), kb,
A(i1, i1), lda,
dA_c(igpu, 0, i1, 0), lddac, stream[igpu][0] );
}
ind_c = 0;
for (i = i1; i3 < 0 ? i >= i2 : i < i2; i += i3)
{
ib = min(nb, k - i);
/* Form the triangular factor of the block reflector
H = H(i) H(i+1) . . . H(i+ib-1) */
i__4 = nq - i;
lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda,
&tau[i], t, &ib);
/* H or H' is applied to C(1:m,i:n) */
/* Apply H or H'; First copy T to the GPU */
for (igpu = 0; igpu < nrgpu; ++igpu){
t, ib,
dt(igpu, ind_c), ib, stream[igpu][ind_c] );
magma_queue_sync( stream[igpu][ind_c] ); // Makes sure that we can change t next iteration.
}
// start the copy of next A panel
kb = min(nb, k - i - i3);
if (kb > 0 && i+i3 >= 0){
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_zsetmatrix_async( (i__4-i3), kb,
A(i+i3, i+i3), lda,
dA_c(igpu, (ind_c+1)%2, i+i3, 0), lddac, stream[igpu][(ind_c+1)%2] );
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
// Put 0s in the upper triangular part of dA;
magmablas_zsetdiag1subdiag0_stream('L', ib, ib, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][ind_c]);
magmablasSetKernelStream(stream[igpu][ind_c]);
m-i, n_l, ib,
dA_c(igpu, ind_c, i, 0), lddac, dt(igpu, ind_c), ib,
dC(igpu, i, 0), lddc,
dwork(igpu, ind_c), lddwork);
}
ind_c = (ind_c+1)%2;
}
//copy C from mgpus
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][0] );
magma_queue_sync( stream[igpu][1] );
kb = min(n_l, n-igpu*n_l);
dC(igpu, 0, 0), lddc,
C(0, igpu*n_l), ldc, stream[igpu][0] );
}
} else {
fprintf(stderr, "The case (side == right) is not implemented\n");
return *info;
/*if ( notran ) {
i1 = 0;
i2 = k;
i3 = nb;
} else {
i1 = (k - 1) / nb * nb;
i2 = 0;
i3 = -nb;
}
mi = m;
ic = 0;
for (i = i1; i3 < 0 ? i >= i2 : i < i2; i += i3)
{
ib = min(nb, k - i);
// Form the triangular factor of the block reflector
// H = H(i) H(i+1) . . . H(i+ib-1)
i__4 = nq - i;
lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda,
&tau[i], t, &ib);
// 1) copy the panel from A to the GPU, and
// 2) Put 0s in the upper triangular part of dA;
magma_zsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda );
magmablas_zsetdiag1subdiag0('L', ib, ib, dA(i, 0), ldda);
// H or H' is applied to C(1:m,i:n)
ni = n - i;
jc = i;
// Apply H or H'; First copy T to the GPU
magma_zsetmatrix( ib, ib, t, ib, dt, ib );
magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
mi, ni, ib,
dA(i, 0), ldda, dt, ib,
dC(ic, jc), lddc,
dwork, lddwork);
}
*/
}
}
MAGMA_Z_SET2REAL( work[0], lwkopt );
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][0] );
magma_queue_destroy( stream[igpu][0] );
magma_queue_destroy( stream[igpu][1] );
magma_free( dw[igpu] );
}
return *info;
} /* magma_zunmqr */

Here is the call graph for this function:

Here is the caller graph for this function:

void magmablas_zsetdiag1subdiag0_stream ( char  uplo,
int  k,
int  nb,
cuDoubleComplex *  A,
int  lda,
cudaStream_t  stream 
)

Here is the caller graph for this function: