MAGMA  magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
zlahr2_m.cpp File Reference
#include "common_magma.h"
Include dependency graph for zlahr2_m.cpp:

Go to the source code of this file.

Macros

#define PRECISION_z
 
#define A(i, j)   ( A + (i) + (j)*lda)
 
#define Y(i, j)   ( Y + (i) + (j)*ldy)
 
#define T(i, j)   ( T + (i) + (j)*ldt)
 
#define dA(d, i, j)   (data->A [d] + (i) + (j)*ldda)
 
#define dTi(d)   (data->Ti[d])
 
#define dV(d, i, j)   (data->V [d] + (i) + (j)*ldv )
 
#define dVd(d, i, j)   (data->Vd[d] + (i) + (j)*ldvd)
 
#define dY(d, i, j)   (data->Y [d] + (i) + (j)*ldda)
 

Functions

magma_int_t magma_zlahr2_m (magma_int_t n, magma_int_t k, magma_int_t nb, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *T, magma_int_t ldt, magmaDoubleComplex *Y, magma_int_t ldy, struct zgehrd_data *data)
 

Macro Definition Documentation

#define A (   i,
  j 
)    ( A + (i) + (j)*lda)
#define dA (   d,
  i,
  j 
)    (data->A [d] + (i) + (j)*ldda)
#define dTi (   d)    (data->Ti[d])
#define dV (   d,
  i,
  j 
)    (data->V [d] + (i) + (j)*ldv )
#define dVd (   d,
  i,
  j 
)    (data->Vd[d] + (i) + (j)*ldvd)
#define dY (   d,
  i,
  j 
)    (data->Y [d] + (i) + (j)*ldda)
#define PRECISION_z

Definition at line 13 of file zlahr2_m.cpp.

#define T (   i,
  j 
)    ( T + (i) + (j)*ldt)
#define Y (   i,
  j 
)    ( Y + (i) + (j)*ldy)

Function Documentation

magma_int_t magma_zlahr2_m ( magma_int_t  n,
magma_int_t  k,
magma_int_t  nb,
magmaDoubleComplex *  A,
magma_int_t  lda,
magmaDoubleComplex *  tau,
magmaDoubleComplex *  T,
magma_int_t  ldt,
magmaDoubleComplex *  Y,
magma_int_t  ldy,
struct zgehrd_data * data 
)

Definition at line 16 of file zlahr2_m.cpp.

References __func__, A, blasf77_zaxpy, blasf77_zcopy, blasf77_zgemv, blasf77_ztrmv, dA, dTi, dV, dVd, dY, lapackf77_zlacgv, lapackf77_zlarfg, lapackf77_zlaset, zgehrd_data::ldda, zgehrd_data::ldv, zgehrd_data::ldvd, magma_indices_1D_bcyclic(), magma_queue_sync, magma_setdevice(), magma_xerbla(), MAGMA_Z_NEG_ONE, MAGMA_Z_NEGATE, MAGMA_Z_ONE, MAGMA_Z_ZERO, magma_zgemm(), magma_zgemv(), magma_zgetmatrix_async, magma_zgetvector_async, magma_zsetmatrix_async, magma_zsetvector_async, magmablas_zlacpy(), magmablas_zlaset(), magmablasSetKernelStream(), MagmaUpperLower, max, zgehrd_data::ngpu, zgehrd_data::streams, T, and Y.

23 {
24 /* -- MAGMA (version 1.4.0) --
25  Univ. of Tennessee, Knoxville
26  Univ. of California, Berkeley
27  Univ. of Colorado, Denver
28  August 2013
29 
30  Purpose
31  =======
32  ZLAHR2 reduces the first NB columns of a complex general n-BY-(n-k+1)
33  matrix A so that elements below the k-th subdiagonal are zero. The
34  reduction is performed by an orthogonal similarity transformation
35  Q' * A * Q. The routine returns the matrices V and T which determine
36  Q as a block reflector I - V*T*V', and also the matrix Y = A * V.
37  (Note this is different than LAPACK, which computes Y = A * V * T.)
38 
39  This is an auxiliary routine called by ZGEHRD.
40 
41  Arguments
42  =========
43  N (input) INTEGER
44  The order of the matrix A.
45 
46  K (input) INTEGER
47  The offset for the reduction. Elements below the k-th
48  subdiagonal in the first NB columns are reduced to zero.
49  K < N.
50 
51  NB (input) INTEGER
52  The number of columns to be reduced.
53 
54  A (input/output) COMPLEX_16 array, dimension (LDA,N-K+1)
55  On entry, the n-by-(n-k+1) general matrix A.
56  On exit, the elements on and above the k-th subdiagonal in
57  the first NB columns are overwritten with the corresponding
58  elements of the reduced matrix; the elements below the k-th
59  subdiagonal, with the array TAU, represent the matrix Q as a
60  product of elementary reflectors. The other columns of A are
61  unchanged. See Further Details.
62 
63  LDA (input) INTEGER
64  The leading dimension of the array A. LDA >= max(1,N).
65 
66  TAU (output) COMPLEX_16 array, dimension (NB)
67  The scalar factors of the elementary reflectors. See Further
68  Details.
69 
70  T (output) COMPLEX_16 array, dimension (LDT,NB)
71  The upper triangular matrix T.
72 
73  LDT (input) INTEGER
74  The leading dimension of the array T. LDT >= NB.
75 
76  Y (output) COMPLEX_16 array, dimension (LDY,NB)
77  The n-by-nb matrix Y.
78 
79  LDY (input) INTEGER
80  The leading dimension of the array Y. LDY >= N.
81 
82  dA (input/output) COMPLEX_16 array on the GPU, dimension (LDA,N-K+1)
83  On entry, the n-by-(n-k+1) general matrix A.
84  On exit, the elements in rows K:N of the first NB columns are
85  overwritten with the matrix Y.
86 
87  DV (output) COMPLEX_16 array on the GPU, dimension (N, NB)
88  On exit this contains the Householder vectors of the transformation.
89 
90  Further Details
91  ===============
92  The matrix Q is represented as a product of nb elementary reflectors
93 
94  Q = H(1) H(2) . . . H(nb).
95 
96  Each H(i) has the form
97 
98  H(i) = I - tau * v * v'
99 
100  where tau is a complex scalar, and v is a complex vector with
101  v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
102  A(i+k+1:n,i), and tau in TAU(i).
103 
104  The elements of the vectors v together form the (n-k+1)-by-nb matrix
105  V which is needed, with T and Y, to apply the transformation to the
106  unreduced part of the matrix, using an update of the form:
107  A := (I - V*T*V') * (A - Y*T*V').
108 
109  The contents of A on exit are illustrated by the following example
110  with n = 7, k = 3 and nb = 2:
111 
112  ( a a a a a )
113  ( a a a a a )
114  ( a a a a a )
115  ( h h a a a )
116  ( v1 h a a a )
117  ( v1 v2 a a a )
118  ( v1 v2 a a a )
119 
120  where "a" denotes an element of the original matrix A, h denotes a
121  modified element of the upper Hessenberg matrix H, and vi denotes an
122  element of the vector defining H(i).
123 
124  This implementation follows the hybrid algorithm and notations described in
125 
126  S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
127  form through hybrid GPU-based computing," University of Tennessee Computer
128  Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
129  May 24, 2009.
130  ===================================================================== */
131 
132  #define A( i, j ) ( A + (i) + (j)*lda)
133  #define Y( i, j ) ( Y + (i) + (j)*ldy)
134  #define T( i, j ) ( T + (i) + (j)*ldt)
135  #define dA( d, i, j ) (data->A [d] + (i) + (j)*ldda)
136  #define dTi( d ) (data->Ti[d])
137  #define dV( d, i, j ) (data->V [d] + (i) + (j)*ldv )
138  #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd)
139  #define dY( d, i, j ) (data->Y [d] + (i) + (j)*ldda)
140 
141  magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
142  magmaDoubleComplex c_one = MAGMA_Z_ONE;
143  magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
144  magmaDoubleComplex tmp;
145 
146  magma_int_t ngpu = data->ngpu;
147  magma_int_t ldda = data->ldda;
148  magma_int_t ldv = data->ldv;
149  magma_int_t ldvd = data->ldvd;
150 
151  magma_int_t ione = 1;
152 
153  magma_int_t d, dki1, dn, nblocks, gblock, lblock, lgid;
154  magma_int_t n_k_i_1, n_k;
155  magmaDoubleComplex scale;
156 
157  magma_int_t i;
158  magmaDoubleComplex ei = MAGMA_Z_ZERO;
159 
160  magma_int_t info_data = 0;
161  magma_int_t *info = &info_data;
162  if (n < 0) {
163  *info = -1;
164  } else if (k < 0 || k >= n) {
165  *info = -2;
166  } else if (nb < 1 || nb > n) {
167  *info = -3;
168  } else if (lda < max(1,n)) {
169  *info = -5;
170  } else if (ldt < nb) {
171  *info = -8;
172  } else if (ldy < max(1,n)) {
173  *info = -10;
174  }
175  if (*info != 0) {
176  magma_xerbla( __func__, -(*info) );
177  return *info;
178  }
179 
180  // adjust from 1-based indexing
181  k -= 1;
182 
183  // Function Body
184  if (n <= 1)
185  return 0;
186 
187  // zero out current top block of V on all GPUs
188  for( d = 0; d < ngpu; ++d ) {
189  magma_setdevice( d );
190  magmablasSetKernelStream( data->streams[d] );
191  magmablas_zlaset( MagmaUpperLower, nb, nb, dV(d,k,0), ldv );
192  }
193 
194  // set all Y=0
195  lapackf77_zlaset( "Full", &n, &nb, &c_zero, &c_zero, Y, &ldy );
196 
197  for (i = 0; i < nb; ++i) {
198  n_k_i_1 = n - k - i - 1;
199  n_k = n - k;
200 
201  if (i > 0) {
202  // Finish applying I - V * T * V' on right
203  tmp = MAGMA_Z_NEGATE( tau[i-1] );
204  blasf77_zaxpy( &n_k, &tmp, Y(k,i-1), &ione, A(k,i), &ione );
205 
206  // Apply I - V * T' * V' to this column (call it b) from the
207  // left, using the last column of T as workspace, w.
208  //
209  // Let V = ( V1 ) and b = ( b1 ) (first i-1 rows)
210  // ( V2 ) ( b2 )
211  // where V1 is unit lower triangular
212 
213  // w := b1 = A(k+1:k+i, i)
214  blasf77_zcopy( &i,
215  A(k+1,i), &ione,
216  T(0,nb-1), &ione );
217 
218  // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w
219  blasf77_ztrmv( "Lower", "Conj", "Unit", &i,
220  A(k+1,0), &lda,
221  T(0,nb-1), &ione );
222 
223  // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i)
224  blasf77_zgemv( "Conj", &n_k_i_1, &i,
225  &c_one, A(k+i+1,0), &lda,
226  A(k+i+1,i), &ione,
227  &c_one, T(0,nb-1), &ione );
228 
229  // w := T'*w = T(0:i-1, 0:i-1)' * w
230  blasf77_ztrmv( "Upper", "Conj", "Non-unit", &i,
231  T(0,0), &ldt,
232  T(0,nb-1), &ione );
233 
234  // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w
235  blasf77_zgemv( "No trans", &n_k_i_1, &i,
236  &c_neg_one, A(k+i+1,0), &lda,
237  T(0,nb-1), &ione,
238  &c_one, A(k+i+1,i), &ione );
239 
240  // w := V1*w = VA(k+1:k+i, 0:i-1) * w
241  blasf77_ztrmv( "Lower", "No trans", "Unit", &i,
242  A(k+1,0), &lda,
243  T(0,nb-1), &ione );
244 
245  // b1 := b1 - w = A(k+1:k+i-1, i) - w
246  blasf77_zaxpy( &i,
247  &c_neg_one, T(0,nb-1), &ione,
248  A(k+1,i), &ione );
249 
250  // Restore diagonal element, saved below during previous iteration
251  *A(k+i,i-1) = ei;
252  }
253 
254  // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i)
255  lapackf77_zlarfg( &n_k_i_1,
256  A(k+i+1,i),
257  A(k+i+2,i), &ione, &tau[i] );
258  // Save diagonal element and set to one, to simplify multiplying by V
259  ei = *A(k+i+1,i);
260  *A(k+i+1,i) = c_one;
261 
262  // compute yi = A vi = sum_g A{d} vi{d}
263  nblocks = (n-1) / nb / ngpu + 1;
264  for( d = 0; d < ngpu; ++d ) {
265  magma_setdevice( d );
266  magmablasSetKernelStream( data->streams[d] );
267 
268  // dV(k+i+1:n-1, i) = VA(k+i:n, i)
269  magma_zsetvector_async( n_k_i_1,
270  A(k+i+1,i), 1,
271  dV(d, k+i+1, i), 1, data->streams[d] );
272 
273  // copy column of dV -> dVd, using block cyclic distribution.
274  // This assumes V and Vd have been padded so that
275  // a 2D matrix copy doesn't access them out-of-bounds
276  gblock = k / nb;
277  lblock = gblock / ngpu;
278  lgid = gblock % ngpu;
279  if ( d < lgid ) {
280  lblock += 1;
281  }
282  // treat V as (nb*ngpu) x nblock matrix, and Vd as nb x nblock matrix
283  magmablas_zlacpy( 'F', nb, nblocks-lblock,
284  dV (d, d*nb + lblock*nb*ngpu, i), nb*ngpu,
285  dVd(d, 0 + lblock*nb , i), nb );
286 
287  // convert global indices (k) to local indices (dk)
288  magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn );
289 
290  // dY(k:n, i) = dA(k:n, k+i+1:n) * dV(k+i+1:n, i)
291  // skip if matrix is empty
292  // each GPU copies to different temporary vector in Y,
293  // which are summed in separate loop below
294  if ( dn-dki1 > 0 ) {
295  magma_zgemv( 'N', n-k, dn-dki1,
296  c_one, dA (d, k , dki1), ldda,
297  dVd(d, dki1, i), 1,
298  c_zero, dY (d, k , i), 1 );
299 
300  // copy vector to host, storing in column nb+d of Y
301  // as temporary space (Y has >= nb+ngpu columns)
303  dY(d, k, i), 1,
304  Y(k, nb+d), 1, data->streams[d] );
305  }
306  }
307 
308  // while GPU is doing above Ag*v...
309  // Compute T(0:i,i) = [ -tau T V' vi ]
310  // [ tau ]
311  // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i)
312  scale = MAGMA_Z_NEGATE( tau[i] );
313  blasf77_zgemv( "Conj", &n_k_i_1, &i,
314  &scale, A(k+i+1,0), &lda,
315  A(k+i+1,i), &ione,
316  &c_zero, T(0,i), &ione );
317  // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i)
318  blasf77_ztrmv( "Upper", "No trans", "Non-unit", &i,
319  T(0,0), &ldt,
320  T(0,i), &ione );
321  *T(i,i) = tau[i];
322 
323  // apply reflectors to next column, A(i+1), on right only.
324  // one axpy will be required to finish this, in the next iteration above
325  if ( i > 0 && i+1 < nb ) {
326  // Update next column, A(k:n,i+1), applying Q on right.
327  // One axpy will be required to finish this, in the next iteration
328  // above, after yi is computed.
329  // This updates one more row than LAPACK does (row k),
330  // making block above panel an even multiple of nb.
331  // Use last column of T as workspace, w.
332  magma_int_t i1 = i+1;
333 
334  // If complex, conjugate row of V, and undo afterwards
335  #if defined(PRECISION_z) || defined(PRECISION_c)
336  lapackf77_zlacgv( &i1, A(k+i1,0), &lda );
337  #endif
338  // w = T(0:i, 0:i+1) * VA(k+i+1, 0:i+1)'
339  // T is now rectangular, so we use gemv instead of trmv as in lapack.
340  blasf77_zgemv( "No trans", &i, &i1,
341  &c_one, T(0,0), &ldt,
342  A(k+i1,0), &lda,
343  &c_zero, T(0,nb-1), &ione );
344  #if defined(PRECISION_z) || defined(PRECISION_c)
345  lapackf77_zlacgv( &i1, A(k+i1,0), &lda );
346  #endif
347 
348  // A(k:n, i+1) -= Y(k:n, 0:i) * w
349  blasf77_zgemv( "No trans", &n_k, &i,
350  &c_neg_one, Y(k,0), &ldy,
351  T(0,nb-1), &ione,
352  &c_one, A(k,i1), &ione );
353  }
354 
355  // yi = sum_g yi{d}
356  for( d = 0; d < ngpu; ++d ) {
357  magma_setdevice( d );
358  magma_queue_sync( data->streams[d] );
359  magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn );
360  if ( dn-dki1 > 0 ) {
361  // yi = yi + yi{d}
362  blasf77_zaxpy( &n_k, &c_one, Y(k,nb+d), &ione, Y(k,i), &ione );
363  }
364  }
365  }
366  // Restore diagonal element
367  *A(k+nb,nb-1) = ei;
368 
369  // compute Y = Am V = sum_g Am{d} V{d} --- top part, Y(0:k-1,:)
370  for( d = 0; d < ngpu; ++d ) {
371  magma_setdevice( d );
372  magmablasSetKernelStream( data->streams[d] );
373 
374  // convert global indices (k) to local indices (dk)
375  magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn );
376 
377  // dY(0:k, :) = dA(0:k, k+i+1:n-1) * dV(k+i+1:n-1, :)
378  // skip if matrix is empty
379  // each GPU copies to different temporary block in Y,
380  // which are summed in separate loop below
381  if ( dn-dki1 > 0 ) {
382  magma_zgemm( 'N', 'N', k, nb, dn-dki1,
383  c_one, dA (d, 0 , dki1), ldda,
384  dVd(d, dki1, 0), ldvd,
385  c_zero, dY (d, 0 , 0), ldda );
386 
387  // copy result to host, storing in columns [nb + nb*d : nb + nb*(d+1)] of Y
388  // as temporary space (Y has nb + nb*ngpu columns)
389  magma_zgetmatrix_async( k, nb,
390  dY(d, 0, 0), ldda,
391  Y(0,nb+nb*d), ldy, data->streams[d] );
392  }
393  }
394 
395  // Y = sum_g Y{d}
396  for( d = 0; d < ngpu; ++d ) {
397  magma_setdevice( d );
398  magma_queue_sync( 0 );
399  magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn );
400  if ( dn-dki1 > 0 ) {
401  // Y = Y + Am V
402  for( i = 0; i < nb; ++i ) {
403  blasf77_zaxpy( &k, &c_one, Y(0,nb+nb*d+i), &ione, Y(0,i), &ione );
404  }
405  }
406  }
407 
408  // copy Y and T matrices to GPUs
409  for( d = 0; d < ngpu; ++d ) {
410  magma_setdevice( d );
411  magma_zsetmatrix_async( n, nb, Y, ldy, dY(d, 0, 0), ldda, data->streams[d] );
412  magma_zsetmatrix_async( nb, nb, T, nb, dTi(d), nb, data->streams[d] );
413  }
414 
415  return 0;
416 } // magma_zlahr2
#define MagmaUpperLower
Definition: magma.h:63
#define blasf77_zaxpy
Definition: magma_zlapack.h:23
#define __func__
Definition: common_magma.h:65
#define magma_zgetmatrix_async(m, n, dA_src, ldda, hB_dst, ldb, queue)
Definition: magmablas_z.h:714
magma_int_t ldv
#define MAGMA_Z_NEG_ONE
Definition: magma.h:134
#define dV(d, i, j)
void magmablas_zlaset(magma_int_t m, magma_int_t n, cuDoubleComplex *A, magma_int_t lda)
magma_int_t ldvd
#define magma_zsetvector_async(n, hx_src, incx, dy_dst, incy, queue)
Definition: magmablas_z.h:640
#define dY(d, i, j)
#define lapackf77_zlaset
Definition: magma_zlapack.h:84
int magma_int_t
Definition: magmablas.h:12
#define Y(i, j)
#define MAGMA_Z_NEGATE(a)
Definition: magma_types.h:150
#define T(i, j)
void magma_zgemm(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex alpha, magmaDoubleComplex_const_ptr dA, magma_int_t ldda, magmaDoubleComplex_const_ptr dB, magma_int_t lddb, magmaDoubleComplex beta, magmaDoubleComplex_ptr dC, magma_int_t lddc)
void magmablas_zlacpy(char uplo, magma_int_t m, magma_int_t n, cuDoubleComplex *A, magma_int_t lda, cuDoubleComplex *B, magma_int_t ldb)
cublasStatus_t magmablasSetKernelStream(magma_queue_t stream)
#define magma_zgetvector_async(n, dx_src, incx, hy_dst, incy, queue)
Definition: magmablas_z.h:643
#define dVd(d, i, j)
void magma_setdevice(magma_device_t dev)
void magma_indices_1D_bcyclic(magma_int_t nb, magma_int_t ngpu, magma_int_t dev, magma_int_t j0, magma_int_t j1, magma_int_t *dj0, magma_int_t *dj1)
Definition: auxiliary.cpp:220
void magma_xerbla(const char *srname, magma_int_t info)
Definition: xerbla.cpp:8
#define A(i, j)
#define MAGMA_Z_ZERO
Definition: magma.h:131
magma_int_t ldda
#define dTi(d)
void magma_zgemv(magma_trans_t transA, magma_int_t m, magma_int_t n, magmaDoubleComplex alpha, magmaDoubleComplex_const_ptr dA, magma_int_t ldda, magmaDoubleComplex_const_ptr dx, magma_int_t incx, magmaDoubleComplex beta, magmaDoubleComplex_ptr dy, magma_int_t incy)
#define lapackf77_zlacgv
Definition: magma_zlapack.h:74
#define magma_zsetmatrix_async(m, n, hA_src, lda, dB_dst, lddb, queue)
Definition: magmablas_z.h:711
#define dA(d, i, j)
#define blasf77_zcopy
Definition: magma_zlapack.h:24
magma_queue_t streams[MagmaMaxGPUs]
#define MAGMA_Z_ONE
Definition: magma.h:132
#define blasf77_zgemv
Definition: magma_zlapack.h:34
#define max(a, b)
Definition: common_magma.h:82
#define blasf77_ztrmv
Definition: magma_zlapack.h:46
#define lapackf77_zlarfg
Definition: magma_zlapack.h:79
#define magma_queue_sync(queue)
Definition: magma.h:119

Here is the call graph for this function:

Here is the caller graph for this function: