MAGMA  magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
cunmqr_m.cpp File Reference
#include "common_magma.h"
Include dependency graph for cunmqr_m.cpp:

Go to the source code of this file.

Macros

#define A(i, j)   ( a+(j)*lda + (i))
 
#define C(i, j)   ( c+(j)*ldc + (i))
 
#define dC(gpui, i, j)   (dw[gpui]+(j)*lddc + (i))
 
#define dA_c(gpui, ind, i, j)   (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac)
 
#define dA_r(gpui, ind, i, j)   (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar)
 
#define dt(gpui, ind)   (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb))
 
#define dwork(gpui, ind)   (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb))
 

Functions

void magmablas_csetdiag1subdiag0_stream (char uplo, int k, int nb, magmaFloatComplex *A, int lda, magma_queue_t stream)
 
magma_int_t magma_cunmqr_m (magma_int_t nrgpu, char side, char trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *a, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *c, magma_int_t ldc, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info)
 

Macro Definition Documentation

#define A (   i,
  j 
)    ( a+(j)*lda + (i))

Definition at line 17 of file cunmqr_m.cpp.

#define C (   i,
  j 
)    ( c+(j)*ldc + (i))

Definition at line 18 of file cunmqr_m.cpp.

#define dA_c (   gpui,
  ind,
  i,
  j 
)    (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac)

Definition at line 21 of file cunmqr_m.cpp.

#define dA_r (   gpui,
  ind,
  i,
  j 
)    (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar)

Definition at line 22 of file cunmqr_m.cpp.

#define dC (   gpui,
  i,
  j 
)    (dw[gpui]+(j)*lddc + (i))

Definition at line 20 of file cunmqr_m.cpp.

#define dt (   gpui,
  ind 
)    (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb))

Definition at line 23 of file cunmqr_m.cpp.

#define dwork (   gpui,
  ind 
)    (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb))

Definition at line 24 of file cunmqr_m.cpp.

Function Documentation

magma_int_t magma_cunmqr_m ( magma_int_t  nrgpu,
char  side,
char  trans,
magma_int_t  m,
magma_int_t  n,
magma_int_t  k,
magmaFloatComplex *  a,
magma_int_t  lda,
magmaFloatComplex *  tau,
magmaFloatComplex *  c,
magma_int_t  ldc,
magmaFloatComplex *  work,
magma_int_t  lwork,
98 magma_int_t *  info 
)

Definition at line 31 of file cunmqr_m.cpp.

References __func__, A, C, dA_c, dC, dt, dwork, lapackf77_clarft(), lapackf77_cunmqr(), lapackf77_lsame, MAGMA_C_ONE, MAGMA_C_SET2REAL, magma_cgetmatrix, magma_clarfb_gpu(), magma_cmalloc(), magma_cmalloc_pinned(), magma_csetmatrix_async, MAGMA_ERR_DEVICE_ALLOC, magma_event_create(), magma_event_destroy(), magma_event_record(), magma_event_sync(), magma_free, magma_getdevice(), magma_queue_create, magma_queue_destroy, magma_queue_sync, magma_setdevice(), MAGMA_SUCCESS, magma_xerbla(), magmablas_csetdiag1subdiag0_stream(), magmablasSetKernelStream(), MagmaColumnwise, MagmaForward, MagmaMaxGPUs, max, and min.

38 {
39 /* -- MAGMA (version 1.4.0) --
40  Univ. of Tennessee, Knoxville
41  Univ. of California, Berkeley
42  Univ. of Colorado, Denver
43  August 2013
44 
45  Purpose
46  =======
47  CUNMQR overwrites the general complex M-by-N matrix C with
48 
49  SIDE = 'L' SIDE = 'R'
50  TRANS = 'N': Q * C C * Q
51  TRANS = 'T': Q**H * C C * Q**H
52 
53  where Q is a complex orthogonal matrix defined as the product of k
54  elementary reflectors
55 
56  Q = H(1) H(2) . . . H(k)
57 
58  as returned by CGEQRF. Q is of order M if SIDE = 'L' and of order N
59  if SIDE = 'R'.
60 
61  Arguments
62  =========
63  SIDE (input) CHARACTER*1
64  = 'L': apply Q or Q**H from the Left;
65  = 'R': apply Q or Q**H from the Right.
66 
67  TRANS (input) CHARACTER*1
68  = 'N': No transpose, apply Q;
69  = 'T': Transpose, apply Q**H.
70 
71  M (input) INTEGER
72  The number of rows of the matrix C. M >= 0.
73 
74  N (input) INTEGER
75  The number of columns of the matrix C. N >= 0.
76 
77  K (input) INTEGER
78  The number of elementary reflectors whose product defines
79  the matrix Q.
80  If SIDE = 'L', M >= K >= 0;
81  if SIDE = 'R', N >= K >= 0.
82 
83  A (input) COMPLEX array, dimension (LDA,K)
84  The i-th column must contain the vector which defines the
85  elementary reflector H(i), for i = 1,2,...,k, as returned by
86  CGEQRF in the first k columns of its array argument A.
87 
88  LDA (input) INTEGER
89  The leading dimension of the array A.
90  If SIDE = 'L', LDA >= max(1,M);
91  if SIDE = 'R', LDA >= max(1,N).
92 
93  TAU (input) COMPLEX array, dimension (K)
94  TAU(i) must contain the scalar factor of the elementary
95  reflector H(i), as returned by CGEQRF.
96 
97  C (input/output) COMPLEX array, dimension (LDC,N)
98  On entry, the M-by-N matrix C.
99  On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.
100 
101  LDC (input) INTEGER
102  The leading dimension of the array C. LDC >= max(1,M).
103 
104  WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
105  On exit, if INFO = 0, WORK(0) returns the optimal LWORK.
106 
107  LWORK (input) INTEGER
108  The dimension of the array WORK.
109  If SIDE = 'L', LWORK >= max(1,N);
110  if SIDE = 'R', LWORK >= max(1,M).
111  For optimum performance LWORK >= N*NB if SIDE = 'L', and
112  LWORK >= M*NB if SIDE = 'R', where NB is the optimal
113  blocksize.
114 
115  If LWORK = -1, then a workspace query is assumed; the routine
116  only calculates the optimal size of the WORK array, returns
117  this value as the first entry of the WORK array, and no error
118  message related to LWORK is issued by XERBLA.
119 
120  INFO (output) INTEGER
121  = 0: successful exit
122  < 0: if INFO = -i, the i-th argument had an illegal value
123  ===================================================================== */
124  magmaFloatComplex c_one = MAGMA_C_ONE;
125 
126  char side_[2] = {side, 0};
127  char trans_[2] = {trans, 0};
128 
129  magma_int_t nb = 128;
130  magmaFloatComplex *t ;
131  magma_cmalloc_pinned (&t, nb*nb);
132  //printf("calling cunmqr_m with nb=%d\n", (int) nb);
133 
134  magmaFloatComplex* dw[MagmaMaxGPUs];
135  magma_queue_t stream [MagmaMaxGPUs][2];
136  magma_event_t event [MagmaMaxGPUs][2];
137 
138  magma_int_t ind_c;
139 
140  magma_int_t igpu = 0;
141  int gpu_b;
142  magma_getdevice(&gpu_b);
143 
144  *info = 0;
145 
146  magma_int_t left = lapackf77_lsame(side_, "L");
147  magma_int_t notran = lapackf77_lsame(trans_, "N");
148  magma_int_t lquery = (lwork == -1);
149 
150  /* NQ is the order of Q and NW is the minimum dimension of WORK */
151  magma_int_t nq, nw;
152  if (left) {
153  nq = m;
154  nw = n;
155  } else {
156  nq = n;
157  nw = m;
158  }
159 
160 
161  if (! left && ! lapackf77_lsame(side_, "R")) {
162  *info = -1;
163  } else if (! notran && ! lapackf77_lsame(trans_, "T")) {
164  *info = -2;
165  } else if (m < 0) {
166  *info = -3;
167  } else if (n < 0) {
168  *info = -4;
169  } else if (k < 0 || k > nq) {
170  *info = -5;
171  } else if (lda < max(1,nq)) {
172  *info = -7;
173  } else if (ldc < max(1,m)) {
174  *info = -10;
175  } else if (lwork < max(1,nw) && ! lquery) {
176  *info = -12;
177  }
178 
179  magma_int_t lwkopt = max(1,nw) * nb;
180  if (*info == 0)
181  {
182  MAGMA_C_SET2REAL( work[0], lwkopt );
183  }
184 
185  if (*info != 0) {
186  magma_xerbla( __func__, -(*info) );
187  return *info;
188  }
189  else if (lquery) {
190  return *info;
191  }
192 
193  /* Quick return if possible */
194  if (m == 0 || n == 0 || k == 0) {
195  work[0] = c_one;
196  return *info;
197  }
198 
199  if (nb >= k)
200  {
201  /* Use CPU code */
202  lapackf77_cunmqr(side_, trans_, &m, &n, &k, a, &lda, tau,
203  c, &ldc, work, &lwork, info);
204  return *info;
205  }
206 
207  magma_int_t lddc = (m+63)/64*64;
208  magma_int_t lddac = nq;
209  magma_int_t lddar = nb;
210  magma_int_t lddwork = nw;
211 
212  magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 };
213 
214  magma_int_t nb_l=256;
215  magma_int_t nbl = (n-1)/nb_l+1; // number of blocks
216  magma_int_t maxnlocal = (nbl+nrgpu-1)/nrgpu*nb_l;
217 
218  nrgpu = min(nrgpu, (n+nb_l-1)/nb_l); // Don't use GPU that will not have data.
219 
220  magma_int_t ldw = maxnlocal*lddc // dC
221  + 2*lddac*lddar // 2*dA
222  + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork)
223 
224  for (igpu = 0; igpu < nrgpu; ++igpu){
225  magma_setdevice(igpu);
226  if (MAGMA_SUCCESS != magma_cmalloc( &dw[igpu], ldw)) {
227 
228  magma_xerbla( __func__, -(*info) );
229  *info = MAGMA_ERR_DEVICE_ALLOC;
230 
231  return *info;
232  }
233  magma_queue_create( &stream[igpu][0] );
234  magma_queue_create( &stream[igpu][1] );
235  magma_event_create( &event[igpu][0] );
236  magma_event_create( &event[igpu][1] );
237  }
238 
239  /* Use hybrid CPU-MGPU code */
240  if (left) {
241 
242  //copy C to mgpus
243  for (magma_int_t i = 0; i < nbl; ++i){
244  magma_int_t igpu = i%nrgpu;
245  magma_setdevice(igpu);
246  magma_int_t kb = min(nb_l, n-i*nb_l);
247  magma_csetmatrix_async( m, kb,
248  C(0, i*nb_l), ldc,
249  dC(igpu, 0, i/nrgpu*nb_l), lddc, stream[igpu][0] );
250  nlocal[igpu] += kb;
251  }
252 
253  magma_int_t i1, i2, i3;
254  if ( !notran ) {
255  i1 = 0;
256  i2 = k;
257  i3 = nb;
258  } else {
259  i1 = (k - 1) / nb * nb;
260  i2 = 0;
261  i3 = -nb;
262  }
263 
264  ind_c = 0;
265 
266  for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3)
267  {
268  // start the copy of A panel
269  magma_int_t kb = min(nb, k - i);
270  for (igpu = 0; igpu < nrgpu; ++igpu){
271  magma_setdevice(igpu);
272  magma_event_sync(event[igpu][ind_c]); // check if the new data can be copied
273  magma_csetmatrix_async(nq-i, kb,
274  A(i, i), lda,
275  dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] );
276  // Put 0s in the upper triangular part of dA;
277  magmablas_csetdiag1subdiag0_stream('L', kb, kb, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0]);
278  }
279 
280  /* Form the triangular factor of the block reflector
281  H = H(i) H(i+1) . . . H(i+ib-1) */
282  magma_int_t nqi = nq - i;
283  lapackf77_clarft("F", "C", &nqi, &kb, A(i, i), &lda,
284  &tau[i], t, &kb);
285 
286  /* H or H' is applied to C(1:m,i:n) */
287 
288  /* Apply H or H'; First copy T to the GPU */
289  for (igpu = 0; igpu < nrgpu; ++igpu){
290  magma_setdevice(igpu);
291  magma_csetmatrix_async(kb, kb,
292  t, kb,
293  dt(igpu, ind_c), kb, stream[igpu][0] );
294  }
295 
296  for (igpu = 0; igpu < nrgpu; ++igpu){
297  magma_setdevice(igpu);
298  magma_queue_sync( stream[igpu][0] ); // check if the data was copied
299  magmablasSetKernelStream(stream[igpu][1]);
300  magma_clarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
301  m-i, nlocal[igpu], kb,
302  dA_c(igpu, ind_c, i, 0), lddac, dt(igpu, ind_c), kb,
303  dC(igpu, i, 0), lddc,
304  dwork(igpu, ind_c), lddwork);
305  magma_event_record(event[igpu][ind_c], stream[igpu][1] );
306  }
307 
308  ind_c = (ind_c+1)%2;
309  }
310 
311  for (igpu = 0; igpu < nrgpu; ++igpu){
312  magma_setdevice(igpu);
313  magma_queue_sync( stream[igpu][1] );
314  }
315 
316  //copy C from mgpus
317  for (magma_int_t i = 0; i < nbl; ++i){
318  magma_int_t igpu = i%nrgpu;
319  magma_setdevice(igpu);
320  magma_int_t kb = min(nb_l, n-i*nb_l);
321  magma_cgetmatrix( m, kb,
322  dC(igpu, 0, i/nrgpu*nb_l), lddc,
323  C(0, i*nb_l), ldc );
324 // magma_cgetmatrix_async( m, kb,
325 // dC(igpu, 0, i/nrgpu*nb_l), lddc,
326 // C(0, i*nb_l), ldc, stream[igpu][0] );
327  }
328 
329  } else {
330 
331  fprintf(stderr, "The case (side == right) is not implemented\n");
332  magma_xerbla( __func__, 1 );
333  return *info;
334  /*
335  if ( notran ) {
336  i1 = 0;
337  i2 = k;
338  i3 = nb;
339  } else {
340  i1 = (k - 1) / nb * nb;
341  i2 = 0;
342  i3 = -nb;
343  }
344 
345  mi = m;
346  ic = 0;
347 
348  for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3)
349  {
350  ib = min(nb, k - i);
351 
352  // Form the triangular factor of the block reflector
353  // H = H(i) H(i+1) . . . H(i+ib-1)
354  i__4 = nq - i;
355  lapackf77_clarft("F", "C", &i__4, &ib, A(i, i), &lda,
356  &tau[i], t, &ib);
357 
358  // 1) copy the panel from A to the GPU, and
359  // 2) Put 0s in the upper triangular part of dA;
360  magma_csetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda );
361  magmablas_csetdiag1subdiag0('L', ib, ib, dA(i, 0), ldda);
362 
363 
364  // H or H' is applied to C(1:m,i:n)
365  ni = n - i;
366  jc = i;
367 
368  // Apply H or H'; First copy T to the GPU
369  magma_csetmatrix( ib, ib, t, ib, dt, ib );
370  magma_clarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
371  mi, ni, ib,
372  dA(i, 0), ldda, dt, ib,
373  dC(ic, jc), lddc,
374  dwork, lddwork);
375  }
376  */
377  }
378 
379  MAGMA_C_SET2REAL( work[0], lwkopt );
380 
381  for (igpu = 0; igpu < nrgpu; ++igpu){
382  magma_setdevice(igpu);
383  magmablasSetKernelStream(NULL);
384  magma_event_destroy( event[igpu][0] );
385  magma_event_destroy( event[igpu][1] );
386  magma_queue_destroy( stream[igpu][0] );
387  magma_queue_destroy( stream[igpu][1] );
388  magma_free( dw[igpu] );
389  }
390 
391  magma_setdevice(gpu_b);
392 
393  return *info;
394 } /* magma_cunmqr */
#define min(a, b)
Definition: common_magma.h:86
magma_int_t magma_clarfb_gpu(char side, char trans, char direct, char storev, magma_int_t m, magma_int_t n, magma_int_t k, const magmaFloatComplex *dv, magma_int_t ldv, const magmaFloatComplex *dt, magma_int_t ldt, magmaFloatComplex *dc, magma_int_t ldc, magmaFloatComplex *dwork, magma_int_t ldwork)
Definition: clarfb_gpu.cpp:15
#define magma_queue_create(queuePtr)
Definition: magma.h:113
#define __func__
Definition: common_magma.h:65
#define C(i, j)
Definition: cunmqr_m.cpp:18
#define dwork(gpui, ind)
Definition: cunmqr_m.cpp:24
void magma_event_destroy(magma_event_t event)
static magma_err_t magma_cmalloc(magmaFloatComplex_ptr *ptrPtr, size_t n)
Definition: magma.h:79
#define MAGMA_ERR_DEVICE_ALLOC
Definition: magma_types.h:276
#define magma_free(ptr)
Definition: magma.h:57
int magma_int_t
Definition: magmablas.h:12
#define dA_c(gpui, ind, i, j)
Definition: cunmqr_m.cpp:21
#define magma_queue_destroy(queue)
Definition: magma.h:116
cublasStatus_t magmablasSetKernelStream(magma_queue_t stream)
#define magma_cgetmatrix(m, n, dA_src, ldda, hB_dst, ldb)
Definition: magmablas_c.h:705
void magma_setdevice(magma_device_t dev)
void lapackf77_cunmqr(const char *side, const char *trans, const magma_int_t *m, const magma_int_t *n, const magma_int_t *k, const magmaFloatComplex *A, const magma_int_t *lda, const magmaFloatComplex *tau, magmaFloatComplex *C, const magma_int_t *ldc, magmaFloatComplex *work, const magma_int_t *lwork, magma_int_t *info)
void magma_getdevice(magma_device_t *dev)
void magma_xerbla(const char *srname, magma_int_t info)
Definition: xerbla.cpp:8
#define MagmaMaxGPUs
Definition: magma_types.h:255
#define lapackf77_lsame
Definition: magma_lapack.h:23
void magma_event_record(magma_event_t event, magma_queue_t queue)
void magma_event_sync(magma_event_t event)
#define dC(gpui, i, j)
Definition: cunmqr_m.cpp:20
#define A(i, j)
Definition: cunmqr_m.cpp:17
void lapackf77_clarft(const char *direct, const char *storev, const magma_int_t *n, const magma_int_t *k, magmaFloatComplex *V, const magma_int_t *ldv, const magmaFloatComplex *tau, magmaFloatComplex *T, const magma_int_t *ldt)
static magma_err_t magma_cmalloc_pinned(magmaFloatComplex **ptrPtr, size_t n)
Definition: magma.h:91
#define MAGMA_C_ONE
Definition: magma.h:154
#define MagmaForward
Definition: magma.h:71
#define MAGMA_SUCCESS
Definition: magma.h:106
void magma_event_create(magma_event_t *eventPtr)
void magmablas_csetdiag1subdiag0_stream(char uplo, int k, int nb, magmaFloatComplex *A, int lda, magma_queue_t stream)
#define dt(gpui, ind)
Definition: cunmqr_m.cpp:23
#define MAGMA_C_SET2REAL(v, t)
Definition: magma.h:137
#define magma_csetmatrix_async(m, n, hA_src, lda, dB_dst, lddb, queue)
Definition: magmablas_c.h:711
#define max(a, b)
Definition: common_magma.h:82
#define MagmaColumnwise
Definition: magma.h:74
#define magma_queue_sync(queue)
Definition: magma.h:119

Here is the call graph for this function:

Here is the caller graph for this function:

void magmablas_csetdiag1subdiag0_stream ( char  uplo,
int  k,
int  nb,
magmaFloatComplex *  A,
int  lda,
magma_queue_t  stream 
)

Here is the caller graph for this function: