MAGMA magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
zunmqr_m.cpp
/*
    -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

       @author Raffaele Solca
       @author Azzam Haidar
       @author Stan Tomov

       @precisions normal z -> s d c

*/
#include "common_magma.h"
#define A(i, j) ( a+(j)*lda + (i))
#define C(i, j) ( c+(j)*ldc + (i))

#define dC(gpui, i, j)        (dw[gpui]+(j)*lddc + (i))
#define dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac)
#define dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar)
#define dt(gpui, ind)         (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb))
#define dwork(gpui, ind)      (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb))
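
/* Layout of each per-GPU workspace dw[gpui] (its size, ldw, is computed below):
     dC        : maxnlocal*lddc   -- this GPU's block-columns of C
     dA_c/dA_r : 2*lddac*lddar    -- double-buffered A panel (column/row views)
     dt        : 2*(nb+1)*nb      -- double-buffered triangular factor T
     dwork     : 2*lddwork*nb     -- double-buffered ZLARFB workspace
   The buffer index `ind` (0 or 1) lets the transfer of the next panel
   overlap the update that is still using the current one. */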

extern "C" {
    void magmablas_zsetdiag1subdiag0_stream(char uplo, int k, int nb, magmaDoubleComplex *A, int lda, magma_queue_t stream);
}

extern "C" magma_int_t
magma_zunmqr_m(magma_int_t nrgpu, char side, char trans,
               magma_int_t m, magma_int_t n, magma_int_t k,
               magmaDoubleComplex *a,    magma_int_t lda,
               magmaDoubleComplex *tau,
               magmaDoubleComplex *c,    magma_int_t ldc,
               magmaDoubleComplex *work, magma_int_t lwork,
               magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    ZUNMQR overwrites the general complex M-by-N matrix C with

                    SIDE = 'L'     SIDE = 'R'
    TRANS = 'N':      Q * C          C * Q
    TRANS = 'T':      Q**H * C       C * Q**H

    where Q is a complex unitary matrix defined as the product of k
    elementary reflectors

          Q = H(1) H(2) . . . H(k)

    as returned by ZGEQRF. Q is of order M if SIDE = 'L' and of order N
    if SIDE = 'R'.

    Arguments
    =========
    NRGPU   (input) INTEGER
            The number of GPUs to use.

    SIDE    (input) CHARACTER*1
            = 'L': apply Q or Q**H from the Left;
            = 'R': apply Q or Q**H from the Right.

    TRANS   (input) CHARACTER*1
            = 'N':  No transpose, apply Q;
            = 'T':  Conjugate transpose, apply Q**H.

    M       (input) INTEGER
            The number of rows of the matrix C. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix C. N >= 0.

    K       (input) INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = 'L', M >= K >= 0;
            if SIDE = 'R', N >= K >= 0.

    A       (input) COMPLEX_16 array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            ZGEQRF in the first k columns of its array argument A.

    LDA     (input) INTEGER
            The leading dimension of the array A.
            If SIDE = 'L', LDA >= max(1,M);
            if SIDE = 'R', LDA >= max(1,N).

    TAU     (input) COMPLEX_16 array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZGEQRF.

    C       (input/output) COMPLEX_16 array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.

    LDC     (input) INTEGER
            The leading dimension of the array C. LDC >= max(1,M).

    WORK    (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(0) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The dimension of the array WORK.
            If SIDE = 'L', LWORK >= max(1,N);
            if SIDE = 'R', LWORK >= max(1,M).
            For optimum performance LWORK >= N*NB if SIDE = 'L', and
            LWORK >= M*NB if SIDE = 'R', where NB is the optimal
            blocksize.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
    =====================================================================   */
    magmaDoubleComplex c_one = MAGMA_Z_ONE;

    char side_[2]  = {side, 0};
    char trans_[2] = {trans, 0};

    magma_int_t nb = 128;
    magmaDoubleComplex *t;
    magma_zmalloc_pinned(&t, nb*nb);
    //printf("calling zunmqr_m with nb=%d\n", (int) nb);

    magmaDoubleComplex* dw[MagmaMaxGPUs];
    magma_queue_t stream [MagmaMaxGPUs][2];
    magma_event_t event  [MagmaMaxGPUs][2];

    magma_int_t ind_c;

    magma_int_t igpu = 0;
    int gpu_b;
    magma_getdevice(&gpu_b);

    *info = 0;

    magma_int_t left   = lapackf77_lsame(side_, "L");
    magma_int_t notran = lapackf77_lsame(trans_, "N");
    magma_int_t lquery = (lwork == -1);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    magma_int_t nq, nw;
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }

    if (! left && ! lapackf77_lsame(side_, "R")) {
        *info = -1;
    } else if (! notran && ! lapackf77_lsame(trans_, "T")) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (lda < max(1,nq)) {
        *info = -7;
    } else if (ldc < max(1,m)) {
        *info = -10;
    } else if (lwork < max(1,nw) && ! lquery) {
        *info = -12;
    }

    magma_int_t lwkopt = max(1,nw) * nb;
    if (*info == 0) {
        MAGMA_Z_SET2REAL( work[0], lwkopt );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        work[0] = c_one;
        return *info;
    }

    if (nb >= k) {
        /* Use CPU code */
        lapackf77_zunmqr(side_, trans_, &m, &n, &k, a, &lda, tau,
                         c, &ldc, work, &lwork, info);
        return *info;
    }

    magma_int_t lddc    = (m+63)/64*64;  // dC rows padded to a multiple of 64
    magma_int_t lddac   = nq;            // leading dim. of the column-wise A panel
    magma_int_t lddar   = nb;            // leading dim. of the row-wise A panel
    magma_int_t lddwork = nw;

    magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 };

    magma_int_t nb_l = 256;
    magma_int_t nbl = (n-1)/nb_l+1; // number of blocks
    magma_int_t maxnlocal = (nbl+nrgpu-1)/nrgpu*nb_l;

    nrgpu = min(nrgpu, (n+nb_l-1)/nb_l); // Don't use a GPU that would hold no data.

    magma_int_t ldw = maxnlocal*lddc           // dC
                    + 2*lddac*lddar            // 2*dA
                    + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork)

    for (igpu = 0; igpu < nrgpu; ++igpu) {
        magma_setdevice(igpu);
        if (MAGMA_SUCCESS != magma_zmalloc( &dw[igpu], ldw )) {
            // Set info before calling xerbla so the reported code is meaningful.
            *info = MAGMA_ERR_DEVICE_ALLOC;
            magma_xerbla( __func__, -(*info) );
            return *info;
        }
        magma_queue_create( &stream[igpu][0] );
        magma_queue_create( &stream[igpu][1] );
        magma_event_create( &event[igpu][0] );
        magma_event_create( &event[igpu][1] );
    }

    /* Use hybrid CPU-MGPU code */
    if (left) {

        // Copy C to the GPUs: block-columns of width nb_l are dealt out
        // round-robin, so GPU igpu owns block-columns igpu, igpu+nrgpu, ...
        for (magma_int_t i = 0; i < nbl; ++i) {
            magma_int_t igpu = i%nrgpu;
            magma_setdevice(igpu);
            magma_int_t kb = min(nb_l, n-i*nb_l);
            magma_zsetmatrix_async( m, kb,
                                    C(0, i*nb_l), ldc,
                                    dC(igpu, 0, i/nrgpu*nb_l), lddc, stream[igpu][0] );
            nlocal[igpu] += kb;
        }

        // Panel traversal order, as in LAPACK ZUNMQR: forward for Q**H*C,
        // backward for Q*C.
        magma_int_t i1, i2, i3;
        if ( !notran ) {
            i1 = 0;
            i2 = k;
            i3 = nb;
        } else {
            i1 = (k - 1) / nb * nb;
            i2 = 0;
            i3 = -nb;
        }

        ind_c = 0;

        for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3)
        {
            // start the copy of the A panel
            magma_int_t kb = min(nb, k - i);
            for (igpu = 0; igpu < nrgpu; ++igpu) {
                magma_setdevice(igpu);
                magma_event_sync(event[igpu][ind_c]); // wait until buffer ind_c can receive new data
                magma_zsetmatrix_async(nq-i, kb,
                                       A(i, i), lda,
                                       dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] );
                // Put 0s in the upper triangular part of dA;
                magmablas_zsetdiag1subdiag0_stream('L', kb, kb, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0]);
            }

            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            magma_int_t nqi = nq - i;
            lapackf77_zlarft("F", "C", &nqi, &kb, A(i, i), &lda,
                             &tau[i], t, &kb);

            /* H or H' is applied to C(1:m,i:n) */

            /* Apply H or H'; first copy T to the GPUs */
            for (igpu = 0; igpu < nrgpu; ++igpu) {
                magma_setdevice(igpu);
                magma_zsetmatrix_async(kb, kb,
                                       t, kb,
                                       dt(igpu, ind_c), kb, stream[igpu][0] );
            }

            for (igpu = 0; igpu < nrgpu; ++igpu) {
                magma_setdevice(igpu);
                magma_queue_sync( stream[igpu][0] ); // make sure the panel and T have arrived
                magmablasSetKernelStream(stream[igpu][1]);
                magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
                                  m-i, nlocal[igpu], kb,
                                  dA_c(igpu, ind_c, i, 0), lddac, dt(igpu, ind_c), kb,
                                  dC(igpu, i, 0), lddc,
                                  dwork(igpu, ind_c), lddwork);
                magma_event_record(event[igpu][ind_c], stream[igpu][1] );
            }

            ind_c = (ind_c+1)%2;
        }

        for (igpu = 0; igpu < nrgpu; ++igpu) {
            magma_setdevice(igpu);
            magma_queue_sync( stream[igpu][1] );
        }

        // Copy C back from the GPUs
        for (magma_int_t i = 0; i < nbl; ++i) {
            magma_int_t igpu = i%nrgpu;
            magma_setdevice(igpu);
            magma_int_t kb = min(nb_l, n-i*nb_l);
            magma_zgetmatrix( m, kb,
                              dC(igpu, 0, i/nrgpu*nb_l), lddc,
                              C(0, i*nb_l), ldc );
//            magma_zgetmatrix_async( m, kb,
//                                    dC(igpu, 0, i/nrgpu*nb_l), lddc,
//                                    C(0, i*nb_l), ldc, stream[igpu][0] );
        }

    } else {

        /* The SIDE = 'R' case was never finished; the single-GPU version
           below is kept, commented out, as reference code. */
        fprintf(stderr, "The case (side == right) is not implemented\n");
        magma_xerbla( __func__, 1 );
        return *info;
        /*
        if ( notran ) {
            i1 = 0;
            i2 = k;
            i3 = nb;
        } else {
            i1 = (k - 1) / nb * nb;
            i2 = 0;
            i3 = -nb;
        }

        mi = m;
        ic = 0;

        for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3)
        {
            ib = min(nb, k - i);

            // Form the triangular factor of the block reflector
            // H = H(i) H(i+1) . . . H(i+ib-1)
            i__4 = nq - i;
            lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda,
                             &tau[i], t, &ib);

            // 1) copy the panel from A to the GPU, and
            // 2) Put 0s in the upper triangular part of dA;
            magma_zsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda );
            magmablas_zsetdiag1subdiag0('L', ib, ib, dA(i, 0), ldda);

            // H or H' is applied to C(1:m,i:n)
            ni = n - i;
            jc = i;

            // Apply H or H'; First copy T to the GPU
            magma_zsetmatrix( ib, ib, t, ib, dt, ib );
            magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
                              mi, ni, ib,
                              dA(i, 0), ldda, dt, ib,
                              dC(ic, jc), lddc,
                              dwork, lddwork);
        }
        */
    }

    MAGMA_Z_SET2REAL( work[0], lwkopt );

    for (igpu = 0; igpu < nrgpu; ++igpu) {
        magma_setdevice(igpu);
        magmablasSetKernelStream(NULL);
        magma_event_destroy( event[igpu][0] );
        magma_event_destroy( event[igpu][1] );
        magma_queue_destroy( stream[igpu][0] );
        magma_queue_destroy( stream[igpu][1] );
        magma_free( dw[igpu] );
    }

    magma_free_pinned( t );  // release the pinned host buffer holding T

    magma_setdevice(gpu_b);

    return *info;
} /* magma_zunmqr_m */
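
Usage sketch (not part of the MAGMA distribution): the hypothetical driver below shows the intended call sequence, a workspace query with LWORK = -1, a ZGEQRF factorization, and the multi-GPU application of Q from the left. The matrix sizes, the random fill, and ngpu = 2 are made up for illustration and assume at least two visible GPUs; magma_init, magma_zmalloc_cpu, MAGMA_Z_MAKE/MAGMA_Z_REAL, and lapackf77_zgeqrf are the standard MAGMA 1.x host-side entry points.

#include <stdio.h>
#include <stdlib.h>
#include "magma.h"
#include "magma_lapack.h"

int main(void)
{
    magma_init();

    magma_int_t ngpu = 2;                        /* hypothetical GPU count */
    magma_int_t m = 1024, n = 512, k = 512, info;
    magma_int_t lda = m, ldc = m;
    magmaDoubleComplex *A, *C, *tau, *work, query;

    magma_zmalloc_cpu( &A,   lda*k );
    magma_zmalloc_cpu( &C,   ldc*n );
    magma_zmalloc_cpu( &tau, k );

    /* Fill A and C with random data (illustration only). */
    for (magma_int_t j = 0; j < lda*k; ++j)
        A[j] = MAGMA_Z_MAKE( rand() / (double) RAND_MAX, 0. );
    for (magma_int_t j = 0; j < ldc*n; ++j)
        C[j] = MAGMA_Z_MAKE( rand() / (double) RAND_MAX, 0. );

    /* Workspace query: lwork = -1 only computes the optimal size,
       returned in the first entry of work (here `query`). */
    magma_zunmqr_m( ngpu, 'L', 'N', m, n, k, A, lda, tau, C, ldc,
                    &query, -1, &info );
    magma_int_t lwork = (magma_int_t) MAGMA_Z_REAL( query );
    magma_zmalloc_cpu( &work, lwork );

    /* Factor A = Q*R; the unmqr workspace (>= n*nb) also suffices for ZGEQRF here. */
    lapackf77_zgeqrf( &m, &k, A, &lda, tau, work, &lwork, &info );

    /* C := Q * C, with the block-columns of C spread over ngpu GPUs. */
    magma_zunmqr_m( ngpu, 'L', 'N', m, n, k, A, lda, tau, C, ldc,
                    work, lwork, &info );
    if (info != 0)
        fprintf( stderr, "magma_zunmqr_m returned info = %d\n", (int) info );

    magma_free_cpu( A );   magma_free_cpu( C );
    magma_free_cpu( tau ); magma_free_cpu( work );
    magma_finalize();
    return 0;
}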