MAGMA  magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
zunmqr_gpu.cpp File Reference
#include "common_magma.h"
Include dependency graph for zunmqr_gpu.cpp:

Go to the source code of this file.

Macros

#define dA(a_1, a_2)   (dA + (a_1) + (a_2)*ldda)
 
#define dC(a_1, a_2)   (dC + (a_1) + (a_2)*lddc)
 
#define dT(a_1)   (dT + (a_1)*nb)
 

Functions

magma_int_t magma_zunmqr_gpu (char side, char trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex *dC, magma_int_t lddc, magmaDoubleComplex *hwork, magma_int_t lwork, magmaDoubleComplex *dT, magma_int_t nb, magma_int_t *info)
 

Macro Definition Documentation

#define dA (   a_1,
  a_2 
)    (dA + (a_1) + (a_2)*ldda)
#define dC (   a_1,
  a_2 
)    (dC + (a_1) + (a_2)*lddc)
#define dT (   a_1)    (dT + (a_1)*nb)

Function Documentation

magma_int_t magma_zunmqr_gpu ( char  side,
char  trans,
magma_int_t  m,
magma_int_t  n,
magma_int_t  k,
magmaDoubleComplex *  dA,
magma_int_t  ldda,
magmaDoubleComplex *  tau,
magmaDoubleComplex *  dC,
magma_int_t  lddc,
magmaDoubleComplex *  hwork,
magma_int_t  lwork,
magmaDoubleComplex *  dT,
magma_int_t  nb,
magma_int_t info 
)

Definition at line 17 of file zunmqr_gpu.cpp.

References __func__, dA, dC, dT, dwork, hwork, lapackf77_lsame, lapackf77_zunmqr, magma_xerbla(), MAGMA_Z_MAKE, MAGMA_Z_ONE, magma_zgetmatrix, magma_zlarfb_gpu(), magma_zsetmatrix, MagmaColumnwise, MagmaConjTransStr, MagmaForward, max, and min.

25 {
26 /* -- MAGMA (version 1.4.0) --
27  Univ. of Tennessee, Knoxville
28  Univ. of California, Berkeley
29  Univ. of Colorado, Denver
30  August 2013
31 
32  Purpose
33  =======
34  ZUNMQR_GPU overwrites the general complex M-by-N matrix C with
35 
36  SIDE = 'L' SIDE = 'R'
37  TRANS = 'N': Q * C C * Q
38  TRANS = 'T': Q**H * C C * Q**H
39 
40  where Q is a complex orthogonal matrix defined as the product of k
41  elementary reflectors
42 
43  Q = H(1) H(2) . . . H(k)
44 
45  as returned by ZGEQRF. Q is of order M if SIDE = 'L' and of order N
46  if SIDE = 'R'.
47 
48  Arguments
49  =========
50  SIDE (input) CHARACTER*1
51  = 'L': apply Q or Q**H from the Left;
52  = 'R': apply Q or Q**H from the Right.
53 
54  TRANS (input) CHARACTER*1
55  = 'N': No transpose, apply Q;
56  = 'T': Transpose, apply Q**H.
57 
58  M (input) INTEGER
59  The number of rows of the matrix C. M >= 0.
60 
61  N (input) INTEGER
62  The number of columns of the matrix C. N >= 0.
63 
64  K (input) INTEGER
65  The number of elementary reflectors whose product defines
66  the matrix Q.
67  If SIDE = 'L', M >= K >= 0;
68  if SIDE = 'R', N >= K >= 0.
69 
70  DA (input) COMPLEX_16 array on the GPU, dimension (LDDA,K)
71  The i-th column must contain the vector which defines the
72  elementary reflector H(i), for i = 1,2,...,k, as returned by
73  ZGEQRF in the first k columns of its array argument DA.
74  DA is modified by the routine but restored on exit.
75 
76  LDDA (input) INTEGER
77  The leading dimension of the array DA.
78  If SIDE = 'L', LDDA >= max(1,M);
79  if SIDE = 'R', LDDA >= max(1,N).
80 
81  TAU (input) COMPLEX_16 array, dimension (K)
82  TAU(i) must contain the scalar factor of the elementary
83  reflector H(i), as returned by ZGEQRF.
84 
85  DC (input/output) COMPLEX_16 array on the GPU, dimension (LDDC,N)
86  On entry, the M-by-N matrix C.
87  On exit, C is overwritten by Q*C or Q**H * C or C * Q**H or C*Q.
88 
89  LDDC (input) INTEGER
90  The leading dimension of the array DC. LDDC >= max(1,M).
91 
92  HWORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
93 
94  Currently, zgetrs_gpu assumes that on exit, hwork contains the last
95  block of A and C. This will change and *should not be relied on*!
96 
97  LWORK (input) INTEGER
98  The dimension of the array HWORK.
99  LWORK >= (M-K+NB)*(N+NB) + N*NB if SIDE = 'L', and
100  LWORK >= (N-K+NB)*(M+NB) + M*NB if SIDE = 'R',
101  where NB is the given blocksize.
102 
103  If LWORK = -1, then a workspace query is assumed; the routine
104  only calculates the optimal size of the HWORK array, returns
105  this value as the first entry of the HWORK array, and no error
106  message related to LWORK is issued by XERBLA.
107 
108  DT (input) COMPLEX_16 array on the GPU that is the output
109  (the 9th argument) of magma_zgeqrf_gpu.
110 
111  NB (input) INTEGER
112  This is the blocking size that was used in pre-computing DT, e.g.,
113  the blocking size used in magma_zgeqrf_gpu.
114 
115  INFO (output) INTEGER
116  = 0: successful exit
117  < 0: if INFO = -i, the i-th argument had an illegal value
118  ===================================================================== */
119 
120  #define dA(a_1,a_2) (dA + (a_1) + (a_2)*ldda)
121  #define dC(a_1,a_2) (dC + (a_1) + (a_2)*lddc)
122  #define dT(a_1) (dT + (a_1)*nb)
123 
124  magmaDoubleComplex c_one = MAGMA_Z_ONE;
125 
126  char side_[2] = {side, 0};
127  char trans_[2] = {trans, 0};
128 
129  magmaDoubleComplex *dwork;
130  magma_int_t i, lddwork;
131  magma_int_t i1, i2, step, ib, ic, jc, ma, mi, ni, nq, nw;
132  int left, notran, lquery;
133  magma_int_t lwkopt;
134 
135  *info = 0;
136  left = lapackf77_lsame(side_, "L");
137  notran = lapackf77_lsame(trans_, "N");
138  lquery = (lwork == -1);
139 
140  /* NQ is the order of Q and NW is the minimum dimension of WORK */
141  if (left) {
142  nq = m;
143  nw = n;
144  } else {
145  nq = n;
146  nw = m;
147  }
148  lwkopt = (nq - k + nb)*(nw + nb) + nw*nb;
149  hwork[0] = MAGMA_Z_MAKE( lwkopt, 0 );
150 
151  if ( (!left) && (!lapackf77_lsame(side_, "R")) ) {
152  *info = -1;
153  } else if ( (!notran) && (!lapackf77_lsame(trans_, MagmaConjTransStr)) ) {
154  *info = -2;
155  } else if (m < 0) {
156  *info = -3;
157  } else if (n < 0) {
158  *info = -4;
159  } else if (k < 0 || k > nq) {
160  *info = -5;
161  } else if (ldda < max(1,nq)) {
162  *info = -7;
163  } else if (lddc < max(1,m)) {
164  *info = -10;
165  } else if (lwork < lwkopt && ! lquery) {
166  *info = -12;
167  }
168 
169  if (*info != 0) {
170  magma_xerbla( __func__, -(*info) );
171  return *info;
172  }
173  else if (lquery) {
174  return *info;
175  }
176 
177  /* Quick return if possible */
178  if (m == 0 || n == 0 || k == 0) {
179  hwork[0] = c_one;
180  return *info;
181  }
182 
183  lddwork = k;
184  dwork = dT(2*lddwork);
185 
186  if ( (left && (! notran)) || ((! left) && notran) ) {
187  // left trans: Q^T C
188  // right notrans: C Q
189  // multiply from first block, i = 0, to next-to-last block, i < k-nb
190  i1 = 0;
191  i2 = k-nb;
192  step = nb;
193  } else {
194  // left notrans: Q C
195  // right trans: C Q^T
196  // multiply from next-to-last block, i = floor((k-1-nb)/nb)*nb, to first block, i = 0
197  i1 = ((k - 1 - nb) / nb) * nb;
198  i2 = 0;
199  step = -nb;
200  }
201 
202  if (left) {
203  ni = n;
204  jc = 0;
205  } else {
206  mi = m;
207  ic = 0;
208  }
209 
210  /* Use unblocked code to multiply last or only block (cases Q*C or C*Q^T). */
211  // workspace left: A(mi*nb) + C(mi*ni) + work(ni*nb_la) = (m-k-nb)*nb + (m-k-nb)*n + n*nb
212  // workspace right: A(ni*nb) + C(mi*ni) + work(mi*nb_la) = (n-k-nb)*nb + m*(n-k-nb) + m*nb
213  if ( step < 0 ) {
214  // i is beginning of last block
215  i = i1 - step;
216  if ( i >= k ) {
217  i = i1;
218  }
219  ib = k - i;
220  if (left) {
221  // ni=n, jc=0, H or H^T is applied to C(i:m-1,0:n-1)
222  mi = m - i;
223  ma = mi;
224  ic = i;
225  }
226  else {
227  // mi=m, ic=0, H or H^T is applied to C(0:m-1,i:n-1)
228  ni = n - i;
229  ma = ni;
230  jc = i;
231  }
232 
233  magmaDoubleComplex* hA = hwork;
234  magmaDoubleComplex* hC = hwork + ma*ib;
235  magmaDoubleComplex* hW = hwork + ma*ib + mi*ni;
236  magma_int_t lhwork = lwork - (ma*ib + mi*ni);
237 
238  magma_zgetmatrix( ma, ib, dA(i, i ), ldda, hA, ma );
239  magma_zgetmatrix( mi, ni, dC(ic, jc), lddc, hC, mi );
240 
241  lapackf77_zunmqr( side_, trans_,
242  &mi, &ni, &ib,
243  hA, &ma, tau+i,
244  hC, &mi,
245  hW, &lhwork, info );
246 
247  // send the updated part of C back to the GPU
248  magma_zsetmatrix( mi, ni, hC, mi, dC(ic, jc), lddc );
249  }
250 
251  /* Use blocked code to multiply blocks */
252  if (nb < k) {
253  for( i=i1; (step<0 ? i>=i2 : i<i2); i+=step ) {
254  ib = min(nb, k - i);
255  if (left) {
256  // ni=n, jc=0, H or H^T is applied to C(i:m-1,0:n-1)
257  mi = m - i;
258  ic = i;
259  }
260  else {
261  // mi=m, ic=0, H or H^T is applied to C(0:m-1,i:n-1)
262  ni = n - i;
263  jc = i;
264  }
265 
267  mi, ni, ib,
268  dA(i, i ), ldda, dT(i), nb,
269  dC(ic, jc), lddc, dwork, nw );
270  }
271  }
272  else {
273  i = i1;
274  }
275 
276  /* Use unblocked code to multiply the last or only block (cases Q^T*C or C*Q). */
277  if ( step > 0 ) {
278  ib = k-i;
279  if (left) {
280  // ni=n, jc=0, H or H^T is applied to C(i:m-1,0:n-1)
281  mi = m - i;
282  ma = mi;
283  ic = i;
284  }
285  else {
286  // mi=m, ic=0, H or H^T is applied to C(0:m-1,i:n-1)
287  ni = n - i;
288  ma = ni;
289  jc = i;
290  }
291 
292  magmaDoubleComplex* hA = hwork;
293  magmaDoubleComplex* hC = hwork + ma*ib;
294  magmaDoubleComplex* hW = hwork + ma*ib + mi*ni;
295  magma_int_t lhwork = lwork - (ma*ib + mi*ni);
296 
297  magma_zgetmatrix( ma, ib, dA(i, i ), ldda, hA, ma );
298  magma_zgetmatrix( mi, ni, dC(ic, jc), lddc, hC, mi );
299 
300  lapackf77_zunmqr( side_, trans_,
301  &mi, &ni, &ib,
302  hA, &ma, tau+i,
303  hC, &mi,
304  hW, &lhwork, info );
305 
306  // send the updated part of C back to the GPU
307  magma_zsetmatrix( mi, ni, hC, mi, dC(ic, jc), lddc );
308  }
309 
310  // TODO sync. For cases Q*C and C*Q^T, last call is magma_zlarfb_gpu,
311  // which is async magma_gemm calls, so zunmqr can be unfinished.
312 
313  // TODO: zgeqrs_gpu ASSUMES that hwork contains the last block of A and C.
314  // That needs to be fixed, but until then, don't modify hwork[0] here.
315  // In LAPACK: On exit, if INFO = 0, HWORK(1) returns the optimal LWORK.
316  //hwork[0] = MAGMA_Z_MAKE( lwkopt, 0 );
317  return *info;
318 } /* end of magma_zunmqr_gpu */
#define min(a, b)
Definition: common_magma.h:86
#define dA(a_1, a_2)
#define MAGMA_Z_MAKE(r, i)
Definition: magma.h:123
#define __func__
Definition: common_magma.h:65
#define hwork
#define magma_zgetmatrix(m, n, dA_src, ldda, hB_dst, ldb)
Definition: magmablas_z.h:705
int magma_int_t
Definition: magmablas.h:12
magma_int_t magma_zlarfb_gpu(char side, char trans, char direct, char storev, magma_int_t m, magma_int_t n, magma_int_t k, cuDoubleComplex *dv, magma_int_t ldv, cuDoubleComplex *dt, magma_int_t ldt, cuDoubleComplex *dc, magma_int_t ldc, cuDoubleComplex *dowrk, magma_int_t ldwork)
Definition: zlarfb_gpu.cpp:21
#define dwork(dev, i, j)
#define lapackf77_zunmqr
void magma_xerbla(const char *srname, magma_int_t info)
Definition: xerbla.cpp:8
#define lapackf77_lsame
Definition: magma_lapack.h:23
magma_int_t ldda
#define MagmaForward
Definition: magma.h:71
#define dC(a_1, a_2)
#define dT(a_1)
#define MAGMA_Z_ONE
Definition: magma.h:132
#define magma_zsetmatrix(m, n, hA_src, lda, dB_dst, lddb)
Definition: magmablas_z.h:702
#define max(a, b)
Definition: common_magma.h:82
#define MagmaConjTransStr
Definition: magma.h:82
#define MagmaColumnwise
Definition: magma.h:74

Here is the call graph for this function: