MAGMA  1.2.0
MatrixAlgebraonGPUandMulticoreArchitectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
cunmqr_m.cpp
Go to the documentation of this file.
1 /*
2  -- MAGMA (version 1.2.0) --
3  Univ. of Tennessee, Knoxville
4  Univ. of California, Berkeley
5  Univ. of Colorado, Denver
6  May 2012
7 
8  @author Stan Tomov
9 
10  @generated c Thu May 10 22:27:00 2012
11 
12 */
13 #define N_MAX_GPU 8
14 
15 #include "common_magma.h"
16 
17 #define A(i, j) ( a+(j)*lda + (i))
18 #define C(i, j) ( c+(j)*ldc + (i))
19 
20 #define dC(gpui, i, j) (dw[gpui]+(j)*lddc + (i))
21 #define dA_c(gpui, ind, i, j) (dw[gpui] + n_l*lddc + (ind)*lddar*lddac + (i) + (j)*lddac)
22 #define dA_r(gpui, ind, i, j) (dw[gpui] + n_l*lddc + (ind)*lddar*lddac + (i) + (j)*lddar)
23 #define dt(gpui, ind) (dw[gpui] + n_l*lddc + 2*lddac*lddar + (ind)*(nb+1)*nb)
24 #define dwork(gpui, ind) (dw[gpui] + n_l*lddc + 2*lddac*lddar + 2*(nb+1)*nb + (ind)*lddwork*nb)
25 
26 extern"C"{
27  void magmablas_csetdiag1subdiag0_stream(char uplo, int k, int nb, cuFloatComplex *A, int lda, cudaStream_t stream);
28 }
29 
30 extern "C" magma_int_t
31 magma_cunmqr_m(magma_int_t nrgpu, const char side, const char trans,
33  cuFloatComplex *a, magma_int_t lda,
34  cuFloatComplex *tau,
35  cuFloatComplex *c, magma_int_t ldc,
36  cuFloatComplex *work, magma_int_t lwork,
37  magma_int_t *info)
38 {
39 /* -- MAGMA (version 1.2.0) --
40  Univ. of Tennessee, Knoxville
41  Univ. of California, Berkeley
42  Univ. of Colorado, Denver
43  May 2012
44 
45  Purpose
46  =======
47  CUNMQR overwrites the general complex M-by-N matrix C with
48 
49  SIDE = 'L' SIDE = 'R'
50  TRANS = 'N': Q * C C * Q
51  TRANS = 'T': Q**H * C C * Q**H
52 
53  where Q is a complex orthogonal matrix defined as the product of k
54  elementary reflectors
55 
56  Q = H(1) H(2) . . . H(k)
57 
58  as returned by CGEQRF. Q is of order M if SIDE = 'L' and of order N
59  if SIDE = 'R'.
60 
61  Arguments
62  =========
63  SIDE (input) CHARACTER*1
64  = 'L': apply Q or Q**H from the Left;
65  = 'R': apply Q or Q**H from the Right.
66 
67  TRANS (input) CHARACTER*1
68  = 'N': No transpose, apply Q;
69  = 'T': Transpose, apply Q**H.
70 
71  M (input) INTEGER
72  The number of rows of the matrix C. M >= 0.
73 
74  N (input) INTEGER
75  The number of columns of the matrix C. N >= 0.
76 
77  K (input) INTEGER
78  The number of elementary reflectors whose product defines
79  the matrix Q.
80  If SIDE = 'L', M >= K >= 0;
81  if SIDE = 'R', N >= K >= 0.
82 
83  A (input) COMPLEX array, dimension (LDA,K)
84  The i-th column must contain the vector which defines the
85  elementary reflector H(i), for i = 1,2,...,k, as returned by
86  CGEQRF in the first k columns of its array argument A.
87 
88  LDA (input) INTEGER
89  The leading dimension of the array A.
90  If SIDE = 'L', LDA >= max(1,M);
91  if SIDE = 'R', LDA >= max(1,N).
92 
93  TAU (input) COMPLEX array, dimension (K)
94  TAU(i) must contain the scalar factor of the elementary
95  reflector H(i), as returned by CGEQRF.
96 
97  C (input/output) COMPLEX array, dimension (LDC,N)
98  On entry, the M-by-N matrix C.
99  On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.
100 
101  LDC (input) INTEGER
102  The leading dimension of the array C. LDC >= max(1,M).
103 
104  WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
105  On exit, if INFO = 0, WORK(0) returns the optimal LWORK.
106 
107  LWORK (input) INTEGER
108  The dimension of the array WORK.
109  If SIDE = 'L', LWORK >= max(1,N);
110  if SIDE = 'R', LWORK >= max(1,M).
111  For optimum performance LWORK >= N*NB if SIDE = 'L', and
112  LWORK >= M*NB if SIDE = 'R', where NB is the optimal
113  blocksize.
114 
115  If LWORK = -1, then a workspace query is assumed; the routine
116  only calculates the optimal size of the WORK array, returns
117  this value as the first entry of the WORK array, and no error
118  message related to LWORK is issued by XERBLA.
119 
120  INFO (output) INTEGER
121  = 0: successful exit
122  < 0: if INFO = -i, the i-th argument had an illegal value
123  ===================================================================== */
124  cuFloatComplex c_one = MAGMA_C_ONE;
125 
126  char side_[2] = {side, 0};
127  char trans_[2] = {trans, 0};
128 
129  cuFloatComplex* dw[N_MAX_GPU];
130  cudaStream_t stream [N_MAX_GPU][2];
131 
132  magma_int_t ind_c, kb;
133 
134  magma_int_t i__4;
135  magma_int_t i;
136  cuFloatComplex t[4160]; /* was [65][64] */
137  magma_int_t i1, i2, i3, ib, nb, nq, nw;
138  magma_int_t left, notran, lquery;
139  magma_int_t iinfo, lwkopt;
140 
141  magma_int_t igpu = 0;
142 
143  int gpu_b;
144  magma_getdevice(&gpu_b);
145 
146  *info = 0;
147  left = lapackf77_lsame(side_, "L");
148  notran = lapackf77_lsame(trans_, "N");
149  lquery = (lwork == -1);
150 
151  /* NQ is the order of Q and NW is the minimum dimension of WORK */
152  if (left) {
153  nq = m;
154  nw = n;
155  } else {
156  nq = n;
157  nw = m;
158  }
159  if (! left && ! lapackf77_lsame(side_, "R")) {
160  *info = -1;
161  } else if (! notran && ! lapackf77_lsame(trans_, "T")) {
162  *info = -2;
163  } else if (m < 0) {
164  *info = -3;
165  } else if (n < 0) {
166  *info = -4;
167  } else if (k < 0 || k > nq) {
168  *info = -5;
169  } else if (lda < max(1,nq)) {
170  *info = -7;
171  } else if (ldc < max(1,m)) {
172  *info = -10;
173  } else if (lwork < max(1,nw) && ! lquery) {
174  *info = -12;
175  }
176 
177  if (*info == 0)
178  {
179  /* Determine the block size. NB may be at most NBMAX, where NBMAX
180  is used to define the local array T. */
181  nb = 64;
182  lwkopt = max(1,nw) * nb;
183  MAGMA_C_SET2REAL( work[0], lwkopt );
184  }
185 
186  if (*info != 0) {
187  magma_xerbla( __func__, -(*info) );
188  return *info;
189  }
190  else if (lquery) {
191  return *info;
192  }
193 
194  /* Quick return if possible */
195  if (m == 0 || n == 0 || k == 0) {
196  work[0] = c_one;
197  return *info;
198  }
199 
200  magma_int_t lddc = m;
201  magma_int_t lddac = nq;
202  magma_int_t lddar =nb;
203  magma_int_t lddwork = nw;
204 
205  magma_int_t n_l = (n-1)/nrgpu+1; // local n
206 
207  for (igpu = 0; igpu < nrgpu; ++igpu){
208  magma_setdevice(igpu);
209  if (MAGMA_SUCCESS != magma_cmalloc( &dw[igpu], (n_l*lddc + 2*lddac*lddar + 2*(nb + 1 + lddwork)*nb) )) {
210  printf("%d: size: %ld\n", igpu, (n_l*lddc + 2*lddac*lddar + (nb+1+lddwork)*nb)*sizeof(cuFloatComplex));
211  magma_xerbla( __func__, -(*info) );
212  *info = MAGMA_ERR_DEVICE_ALLOC;
213  return *info;
214  }
215  magma_queue_create( &stream[igpu][0] );
216  magma_queue_create( &stream[igpu][1] );
217  }
218 
219  if (nb >= k)
220  {
221  /* Use CPU code */
222  lapackf77_cunmqr(side_, trans_, &m, &n, &k, a, &lda, tau,
223  c, &ldc, work, &lwork, &iinfo);
224  }
225  else
226  {
227  /* Use hybrid CPU-MGPU code */
228  if (left) {
229 
230  //copy C to mgpus
231  for (igpu = 0; igpu < nrgpu; ++igpu){
232  magma_setdevice(igpu);
233  kb = min(n_l, n-igpu*n_l);
234  magma_csetmatrix_async( m, kb,
235  C(0, igpu*n_l), ldc,
236  dC(igpu, 0, 0), lddc, stream[igpu][0] );
237  }
238 
239  if ( !notran ) {
240  i1 = 0;
241  i2 = k;
242  i3 = nb;
243  } else {
244  i1 = (k - 1) / nb * nb;
245  i2 = 0;
246  i3 = -nb;
247  }
248 
249  kb = min(nb, k-i1);
250  for (igpu = 0; igpu < nrgpu; ++igpu){
251  magma_setdevice(igpu);
252  magma_csetmatrix_async( (nq-i1), kb,
253  A(i1, i1), lda,
254  dA_c(igpu, 0, i1, 0), lddac, stream[igpu][0] );
255  }
256  ind_c = 0;
257 
258  for (i = i1; i3 < 0 ? i >= i2 : i < i2; i += i3)
259  {
260  ib = min(nb, k - i);
261  /* Form the triangular factor of the block reflector
262  H = H(i) H(i+1) . . . H(i+ib-1) */
263  i__4 = nq - i;
264  lapackf77_clarft("F", "C", &i__4, &ib, A(i, i), &lda,
265  &tau[i], t, &ib);
266 
267  /* H or H' is applied to C(1:m,i:n) */
268 
269  /* Apply H or H'; First copy T to the GPU */
270  for (igpu = 0; igpu < nrgpu; ++igpu){
271  magma_setdevice(igpu);
272  magma_csetmatrix_async( ib, ib,
273  t, ib,
274  dt(igpu, ind_c), ib, stream[igpu][ind_c] );
275 
276  magma_queue_sync( stream[igpu][ind_c] ); // Makes sure that we can change t next iteration.
277  }
278 
279  // start the copy of next A panel
280  kb = min(nb, k - i - i3);
281  if (kb > 0 && i+i3 >= 0){
282  for (igpu = 0; igpu < nrgpu; ++igpu){
283  magma_setdevice(igpu);
284  magma_csetmatrix_async( (i__4-i3), kb,
285  A(i+i3, i+i3), lda,
286  dA_c(igpu, (ind_c+1)%2, i+i3, 0), lddac, stream[igpu][(ind_c+1)%2] );
287  }
288  }
289 
290  for (igpu = 0; igpu < nrgpu; ++igpu){
291  magma_setdevice(igpu);
292 
293  // Put 0s in the upper triangular part of dA;
294  magmablas_csetdiag1subdiag0_stream('L', ib, ib, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][ind_c]);
295 
296  magmablasSetKernelStream(stream[igpu][ind_c]);
298  m-i, n_l, ib,
299  dA_c(igpu, ind_c, i, 0), lddac, dt(igpu, ind_c), ib,
300  dC(igpu, i, 0), lddc,
301  dwork(igpu, ind_c), lddwork);
302  }
303 
304  ind_c = (ind_c+1)%2;
305  }
306 
307  //copy C from mgpus
308  for (igpu = 0; igpu < nrgpu; ++igpu){
309  magma_setdevice(igpu);
310  magma_queue_sync( stream[igpu][0] );
311  magma_queue_sync( stream[igpu][1] );
312  kb = min(n_l, n-igpu*n_l);
313  magma_cgetmatrix_async( m, kb,
314  dC(igpu, 0, 0), lddc,
315  C(0, igpu*n_l), ldc, stream[igpu][0] );
316  }
317 
318  } else {
319 
320  fprintf(stderr, "The case (side == right) is not implemented\n");
321  magma_xerbla( __func__, 1 );
322  return *info;
323 
324  /*if ( notran ) {
325  i1 = 0;
326  i2 = k;
327  i3 = nb;
328  } else {
329  i1 = (k - 1) / nb * nb;
330  i2 = 0;
331  i3 = -nb;
332  }
333 
334  mi = m;
335  ic = 0;
336 
337  for (i = i1; i3 < 0 ? i >= i2 : i < i2; i += i3)
338  {
339  ib = min(nb, k - i);
340 
341  // Form the triangular factor of the block reflector
342  // H = H(i) H(i+1) . . . H(i+ib-1)
343  i__4 = nq - i;
344  lapackf77_clarft("F", "C", &i__4, &ib, A(i, i), &lda,
345  &tau[i], t, &ib);
346 
347  // 1) copy the panel from A to the GPU, and
348  // 2) Put 0s in the upper triangular part of dA;
349  magma_csetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda );
350  magmablas_csetdiag1subdiag0('L', ib, ib, dA(i, 0), ldda);
351 
352 
353  // H or H' is applied to C(1:m,i:n)
354  ni = n - i;
355  jc = i;
356 
357  // Apply H or H'; First copy T to the GPU
358  magma_csetmatrix( ib, ib, t, ib, dt, ib );
359  magma_clarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
360  mi, ni, ib,
361  dA(i, 0), ldda, dt, ib,
362  dC(ic, jc), lddc,
363  dwork, lddwork);
364  }
365  */
366  }
367  }
368  MAGMA_C_SET2REAL( work[0], lwkopt );
369 
370  for (igpu = 0; igpu < nrgpu; ++igpu){
371  magma_setdevice(igpu);
373  magma_queue_sync( stream[igpu][0] );
374  magma_queue_destroy( stream[igpu][0] );
375  magma_queue_destroy( stream[igpu][1] );
376  magma_free( dw[igpu] );
377  }
378 
379  magma_setdevice(gpu_b);
380 
381  return *info;
382 } /* magma_cunmqr */
383 
384