MAGMA  1.2.0
MatrixAlgebraonGPUandMulticoreArchitectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
cunmtr_m.cpp File Reference
#include "common_magma.h"
Include dependency graph for cunmtr_m.cpp:

Go to the source code of this file.

Functions

magma_int_t magma_cunmqr_m (magma_int_t nrgpu, const char side, const char trans, magma_int_t m, magma_int_t n, magma_int_t k, cuFloatComplex *a, magma_int_t lda, cuFloatComplex *tau, cuFloatComplex *c, magma_int_t ldc, cuFloatComplex *work, magma_int_t lwork, magma_int_t *info)
magma_int_t magma_cunmtr_m (magma_int_t nrgpu, char side, char uplo, char trans, magma_int_t m, magma_int_t n, cuFloatComplex *a, magma_int_t lda, cuFloatComplex *tau, cuFloatComplex *c, magma_int_t ldc, cuFloatComplex *work, magma_int_t lwork, magma_int_t *info)

Function Documentation

magma_int_t magma_cunmqr_m ( magma_int_t  nrgpu,
const char  side,
const char  trans,
magma_int_t  m,
magma_int_t  n,
magma_int_t  k,
cuFloatComplex *  a,
magma_int_t  lda,
cuFloatComplex *  tau,
cuFloatComplex *  c,
magma_int_t  ldc,
cuFloatComplex *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 31 of file cunmqr_m.cpp.

References __func__, A, C, dA_c, dC, dt, dwork, lapackf77_clarft(), lapackf77_cunmqr(), lapackf77_lsame, MAGMA_C_ONE, MAGMA_C_SET2REAL, magma_cgetmatrix_async(), magma_clarfb_gpu(), magma_cmalloc(), magma_csetmatrix_async(), MAGMA_ERR_DEVICE_ALLOC, magma_free(), magma_getdevice(), magma_queue_create(), magma_queue_destroy(), magma_queue_sync(), magma_setdevice(), MAGMA_SUCCESS, magma_xerbla(), magmablas_csetdiag1subdiag0_stream(), magmablasSetKernelStream(), MagmaColumnwise, MagmaForward, max, min, N_MAX_GPU, side, and trans.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
CUNMQR overwrites the general complex M-by-N matrix C with
SIDE = 'L' SIDE = 'R'
TRANS = 'N': Q * C C * Q
TRANS = 'T': Q**H * C C * Q**H
where Q is a complex orthogonal matrix defined as the product of k
elementary reflectors
Q = H(1) H(2) . . . H(k)
as returned by CGEQRF. Q is of order M if SIDE = 'L' and of order N
if SIDE = 'R'.
Arguments
=========
SIDE (input) CHARACTER*1
= 'L': apply Q or Q**H from the Left;
= 'R': apply Q or Q**H from the Right.
TRANS (input) CHARACTER*1
= 'N': No transpose, apply Q;
= 'T': Transpose, apply Q**H.
M (input) INTEGER
The number of rows of the matrix C. M >= 0.
N (input) INTEGER
The number of columns of the matrix C. N >= 0.
K (input) INTEGER
The number of elementary reflectors whose product defines
the matrix Q.
If SIDE = 'L', M >= K >= 0;
if SIDE = 'R', N >= K >= 0.
A (input) COMPLEX array, dimension (LDA,K)
The i-th column must contain the vector which defines the
elementary reflector H(i), for i = 1,2,...,k, as returned by
CGEQRF in the first k columns of its array argument A.
LDA (input) INTEGER
The leading dimension of the array A.
If SIDE = 'L', LDA >= max(1,M);
if SIDE = 'R', LDA >= max(1,N).
TAU (input) COMPLEX array, dimension (K)
TAU(i) must contain the scalar factor of the elementary
reflector H(i), as returned by CGEQRF.
C (input/output) COMPLEX array, dimension (LDC,N)
On entry, the M-by-N matrix C.
On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.
LDC (input) INTEGER
The leading dimension of the array C. LDC >= max(1,M).
WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(0) returns the optimal LWORK.
LWORK (input) INTEGER
The dimension of the array WORK.
If SIDE = 'L', LWORK >= max(1,N);
if SIDE = 'R', LWORK >= max(1,M).
For optimum performance LWORK >= N*NB if SIDE = 'L', and
LWORK >= M*NB if SIDE = 'R', where NB is the optimal
blocksize.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued by XERBLA.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
===================================================================== */
cuFloatComplex c_one = MAGMA_C_ONE;
char side_[2] = {side, 0};
char trans_[2] = {trans, 0};
cuFloatComplex* dw[N_MAX_GPU];
cudaStream_t stream [N_MAX_GPU][2];
magma_int_t ind_c, kb;
cuFloatComplex t[4160]; /* was [65][64] */
magma_int_t i1, i2, i3, ib, nb, nq, nw;
magma_int_t left, notran, lquery;
magma_int_t iinfo, lwkopt;
magma_int_t igpu = 0;
int gpu_b;
magma_getdevice(&gpu_b);
*info = 0;
left = lapackf77_lsame(side_, "L");
notran = lapackf77_lsame(trans_, "N");
lquery = (lwork == -1);
/* NQ is the order of Q and NW is the minimum dimension of WORK */
if (left) {
nq = m;
nw = n;
} else {
nq = n;
nw = m;
}
if (! left && ! lapackf77_lsame(side_, "R")) {
*info = -1;
} else if (! notran && ! lapackf77_lsame(trans_, "T")) {
*info = -2;
} else if (m < 0) {
*info = -3;
} else if (n < 0) {
*info = -4;
} else if (k < 0 || k > nq) {
*info = -5;
} else if (lda < max(1,nq)) {
*info = -7;
} else if (ldc < max(1,m)) {
*info = -10;
} else if (lwork < max(1,nw) && ! lquery) {
*info = -12;
}
if (*info == 0)
{
/* Determine the block size. NB may be at most NBMAX, where NBMAX
is used to define the local array T. */
nb = 64;
lwkopt = max(1,nw) * nb;
MAGMA_C_SET2REAL( work[0], lwkopt );
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery) {
return *info;
}
/* Quick return if possible */
if (m == 0 || n == 0 || k == 0) {
work[0] = c_one;
return *info;
}
magma_int_t lddc = m;
magma_int_t lddac = nq;
magma_int_t lddar =nb;
magma_int_t lddwork = nw;
magma_int_t n_l = (n-1)/nrgpu+1; // local n
for (igpu = 0; igpu < nrgpu; ++igpu){
if (MAGMA_SUCCESS != magma_cmalloc( &dw[igpu], (n_l*lddc + 2*lddac*lddar + 2*(nb + 1 + lddwork)*nb) )) {
printf("%d: size: %ld\n", igpu, (n_l*lddc + 2*lddac*lddar + (nb+1+lddwork)*nb)*sizeof(cuFloatComplex));
magma_xerbla( __func__, -(*info) );
return *info;
}
magma_queue_create( &stream[igpu][0] );
magma_queue_create( &stream[igpu][1] );
}
if (nb >= k)
{
/* Use CPU code */
lapackf77_cunmqr(side_, trans_, &m, &n, &k, a, &lda, tau,
c, &ldc, work, &lwork, &iinfo);
}
else
{
/* Use hybrid CPU-MGPU code */
if (left) {
//copy C to mgpus
for (igpu = 0; igpu < nrgpu; ++igpu){
kb = min(n_l, n-igpu*n_l);
C(0, igpu*n_l), ldc,
dC(igpu, 0, 0), lddc, stream[igpu][0] );
}
if ( !notran ) {
i1 = 0;
i2 = k;
i3 = nb;
} else {
i1 = (k - 1) / nb * nb;
i2 = 0;
i3 = -nb;
}
kb = min(nb, k-i1);
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_csetmatrix_async( (nq-i1), kb,
A(i1, i1), lda,
dA_c(igpu, 0, i1, 0), lddac, stream[igpu][0] );
}
ind_c = 0;
for (i = i1; i3 < 0 ? i >= i2 : i < i2; i += i3)
{
ib = min(nb, k - i);
/* Form the triangular factor of the block reflector
H = H(i) H(i+1) . . . H(i+ib-1) */
i__4 = nq - i;
lapackf77_clarft("F", "C", &i__4, &ib, A(i, i), &lda,
&tau[i], t, &ib);
/* H or H' is applied to C(1:m,i:n) */
/* Apply H or H'; First copy T to the GPU */
for (igpu = 0; igpu < nrgpu; ++igpu){
t, ib,
dt(igpu, ind_c), ib, stream[igpu][ind_c] );
magma_queue_sync( stream[igpu][ind_c] ); // Makes sure that we can change t next iteration.
}
// start the copy of next A panel
kb = min(nb, k - i - i3);
if (kb > 0 && i+i3 >= 0){
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_csetmatrix_async( (i__4-i3), kb,
A(i+i3, i+i3), lda,
dA_c(igpu, (ind_c+1)%2, i+i3, 0), lddac, stream[igpu][(ind_c+1)%2] );
}
}
for (igpu = 0; igpu < nrgpu; ++igpu){
// Put 0s in the upper triangular part of dA;
magmablas_csetdiag1subdiag0_stream('L', ib, ib, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][ind_c]);
magmablasSetKernelStream(stream[igpu][ind_c]);
m-i, n_l, ib,
dA_c(igpu, ind_c, i, 0), lddac, dt(igpu, ind_c), ib,
dC(igpu, i, 0), lddc,
dwork(igpu, ind_c), lddwork);
}
ind_c = (ind_c+1)%2;
}
//copy C from mgpus
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][0] );
magma_queue_sync( stream[igpu][1] );
kb = min(n_l, n-igpu*n_l);
dC(igpu, 0, 0), lddc,
C(0, igpu*n_l), ldc, stream[igpu][0] );
}
} else {
fprintf(stderr, "The case (side == right) is not implemented\n");
return *info;
/*if ( notran ) {
i1 = 0;
i2 = k;
i3 = nb;
} else {
i1 = (k - 1) / nb * nb;
i2 = 0;
i3 = -nb;
}
mi = m;
ic = 0;
for (i = i1; i3 < 0 ? i >= i2 : i < i2; i += i3)
{
ib = min(nb, k - i);
// Form the triangular factor of the block reflector
// H = H(i) H(i+1) . . . H(i+ib-1)
i__4 = nq - i;
lapackf77_clarft("F", "C", &i__4, &ib, A(i, i), &lda,
&tau[i], t, &ib);
// 1) copy the panel from A to the GPU, and
// 2) Put 0s in the upper triangular part of dA;
magma_csetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda );
magmablas_csetdiag1subdiag0('L', ib, ib, dA(i, 0), ldda);
// H or H' is applied to C(1:m,i:n)
ni = n - i;
jc = i;
// Apply H or H'; First copy T to the GPU
magma_csetmatrix( ib, ib, t, ib, dt, ib );
magma_clarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
mi, ni, ib,
dA(i, 0), ldda, dt, ib,
dC(ic, jc), lddc,
dwork, lddwork);
}
*/
}
}
MAGMA_C_SET2REAL( work[0], lwkopt );
for (igpu = 0; igpu < nrgpu; ++igpu){
magma_queue_sync( stream[igpu][0] );
magma_queue_destroy( stream[igpu][0] );
magma_queue_destroy( stream[igpu][1] );
magma_free( dw[igpu] );
}
return *info;
} /* magma_cunmqr */

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_cunmtr_m ( magma_int_t  nrgpu,
char  side,
char  uplo,
char  trans,
magma_int_t  m,
magma_int_t  n,
cuFloatComplex *  a,
magma_int_t  lda,
cuFloatComplex *  tau,
cuFloatComplex *  c,
magma_int_t  ldc,
cuFloatComplex *  work,
magma_int_t  lwork,
magma_int_t info 
)

Definition at line 26 of file cunmtr_m.cpp.

References __func__, lapackf77_lsame, MAGMA_C_ONE, MAGMA_C_SET2REAL, magma_cunmqr_m(), magma_xerbla(), max, side, trans, and uplo.

{
/* -- MAGMA (version 1.2.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
May 2012
Purpose
=======
CUNMTR overwrites the general complex M-by-N matrix C with
SIDE = 'L' SIDE = 'R'
TRANS = 'N': Q * C C * Q
TRANS = 'T': Q**H * C C * Q**H
where Q is a complex orthogonal matrix of order nq, with nq = m if
SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
nq-1 elementary reflectors, as returned by SSYTRD:
if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
Arguments
=========
SIDE (input) CHARACTER*1
= 'L': apply Q or Q**H from the Left;
= 'R': apply Q or Q**H from the Right.
UPLO (input) CHARACTER*1
= 'U': Upper triangle of A contains elementary reflectors
from SSYTRD;
= 'L': Lower triangle of A contains elementary reflectors
from SSYTRD.
TRANS (input) CHARACTER*1
= 'N': No transpose, apply Q;
= 'T': Transpose, apply Q**H.
M (input) INTEGER
The number of rows of the matrix C. M >= 0.
N (input) INTEGER
The number of columns of the matrix C. N >= 0.
A (input) COMPLEX array, dimension
(LDA,M) if SIDE = 'L'
(LDA,N) if SIDE = 'R'
The vectors which define the elementary reflectors, as
returned by SSYTRD.
LDA (input) INTEGER
The leading dimension of the array A.
LDA >= max(1,M) if SIDE = 'L'; LDA >= max(1,N) if SIDE = 'R'.
TAU (input) COMPLEX array, dimension
(M-1) if SIDE = 'L'
(N-1) if SIDE = 'R'
TAU(i) must contain the scalar factor of the elementary
reflector H(i), as returned by SSYTRD.
C (input/output) COMPLEX array, dimension (LDC,N)
On entry, the M-by-N matrix C.
On exit, C is overwritten by Q*C or Q\*\*H*C or C*Q\*\*H or C*Q.
LDC (input) INTEGER
The leading dimension of the array C. LDC >= max(1,M).
WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
LWORK (input) INTEGER
The dimension of the array WORK.
If SIDE = 'L', LWORK >= max(1,N);
if SIDE = 'R', LWORK >= max(1,M).
For optimum performance LWORK >= N*NB if SIDE = 'L', and
LWORK >= M*NB if SIDE = 'R', where NB is the optimal
blocksize.
If LWORK = -1, then a workspace query is assumed; the routine
only calculates the optimal size of the WORK array, returns
this value as the first entry of the WORK array, and no error
message related to LWORK is issued.
INFO (output) INTEGER
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
===================================================================== */
cuFloatComplex c_one = MAGMA_C_ONE;
char side_[2] = {side, 0};
char uplo_[2] = {uplo, 0};
char trans_[2] = {trans, 0};
static magma_int_t i1, i2, nb, mi, ni, nq, nw;
long int left, upper, lquery;
static magma_int_t iinfo;
static magma_int_t lwkopt;
*info = 0;
left = lapackf77_lsame(side_, "L");
upper = lapackf77_lsame(uplo_, "U");
lquery = lwork == -1;
/* NQ is the order of Q and NW is the minimum dimension of WORK */
if (left) {
nq = m;
nw = n;
} else {
nq = n;
nw = m;
}
if (! left && ! lapackf77_lsame(side_, "R")) {
*info = -1;
} else if (! upper && ! lapackf77_lsame(uplo_, "L")) {
*info = -2;
} else if (! lapackf77_lsame(trans_, "N") &&
! lapackf77_lsame(trans_, "C")) {
*info = -3;
} else if (m < 0) {
*info = -4;
} else if (n < 0) {
*info = -5;
} else if (lda < max(1,nq)) {
*info = -7;
} else if (ldc < max(1,m)) {
*info = -10;
} else if (lwork < max(1,nw) && ! lquery) {
*info = -12;
}
if (*info == 0)
{
nb = 32;
lwkopt = max(1,nw) * nb;
MAGMA_C_SET2REAL( work[0], lwkopt );
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
else if (lquery) {
return *info;
}
/* Quick return if possible */
if (m == 0 || n == 0 || nq == 1) {
work[0] = c_one;
return *info;
}
if (left) {
mi = m - 1;
ni = n;
} else {
mi = m;
ni = n - 1;
}
if (upper)
{
/* Q was determined by a call to SSYTRD with UPLO = 'U' */
i__2 = nq - 1;
printf("cunmtr_m upper case not implemented");
exit(-1);
//lapackf77_cunmql(side_, trans_, &mi, &ni, &i__2, &a[lda], &lda,
// tau, c, &ldc, work, &lwork, &iinfo);
//magma_cunmql_m(nrgpu, side, trans, mi, ni, i__2, &a[lda], lda, tau,
// c, ldc, work, lwork, &iinfo);
}
else
{
/* Q was determined by a call to SSYTRD with UPLO = 'L' */
if (left) {
i1 = 1;
i2 = 0;
} else {
i1 = 0;
i2 = 1;
}
i__2 = nq - 1;
magma_cunmqr_m(nrgpu, side, trans, mi, ni, i__2, &a[1], lda, tau,
&c[i1 + i2 * ldc], ldc, work, lwork, &iinfo);
}
MAGMA_C_SET2REAL( work[0], lwkopt );
return *info;
} /* magma_cunmtr */

Here is the call graph for this function:

Here is the caller graph for this function: