MAGMA  magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
dlatrd2.cpp
Go to the documentation of this file.
1 /*
2  -- MAGMA (version 1.4.0) --
3  Univ. of Tennessee, Knoxville
4  Univ. of California, Berkeley
5  Univ. of Colorado, Denver
6  August 2013
7 
8  @author Raffaele Solca
9  @author Stan Tomov
10 
11  @generated d Tue Aug 13 16:44:35 2013
12 
13 */
14 #include "common_magma.h"
15 
16 #include <cblas.h>
17 #include <assert.h>
18 
19 #define PRECISION_d
20 
/* A(i, j) / W(i, j): address of element (i, j), zero-based, of the
   column-major arrays a and w. The identifiers a/lda and w/ldw are not
   macro parameters; they are picked up from the scope where the macro is
   expanded. */
21 #define A(i, j) (a+(j)*lda + (i))
22 #define W(i, j) (w+(j)*ldw + (i))
23 
/* dA(i, j) / dW(i, j): same column-major addressing for da and dw, using
   the leading dimensions ldda and lddw from the expansion scope.
   NOTE(review): presumably these are the GPU-resident counterparts of a and
   w (they are used as the device-side argument of the MAGMA copy and SYMV
   calls) -- confirm against the caller. */
24 #define dA(i, j) (da+(j)*ldda + (i))
25 #define dW(i, j) (dw+(j)*lddw + (i))
26 
27 
28 extern "C" magma_int_t
30  double *a, magma_int_t lda,
31  double *e, double *tau,
32  double *w, magma_int_t ldw,
33  double *da, magma_int_t ldda,
34  double *dw, magma_int_t lddw,
35  double *dwork, magma_int_t ldwork)
36 {
37 /* -- MAGMA (version 1.4.0) --
38  Univ. of Tennessee, Knoxville
39  Univ. of California, Berkeley
40  Univ. of Colorado, Denver
41  August 2013
42 
43  Purpose
44  =======
45  DLATRD2 reduces NB rows and columns of a real symmetric matrix A to
46  symmetric tridiagonal form by an orthogonal similarity
47  transformation Q' * A * Q, and returns the matrices V and W which are
48  needed to apply the transformation to the unreduced part of A.
49 
50  If UPLO = 'U', DLATRD reduces the last NB rows and columns of a
51  matrix, of which the upper triangle is supplied;
52  if UPLO = 'L', DLATRD reduces the first NB rows and columns of a
53  matrix, of which the lower triangle is supplied.
54 
55  This is an auxiliary routine called by DSYTRD2_GPU. It uses an
56  accelerated HEMV that needs extra memory.
57 
58  Arguments
59  =========
60  UPLO (input) CHARACTER*1
61  Specifies whether the upper or lower triangular part of the
62  symmetric matrix A is stored:
63  = 'U': Upper triangular
64  = 'L': Lower triangular
65 
66  N (input) INTEGER
67  The order of the matrix A.
68 
69  NB (input) INTEGER
70  The number of rows and columns to be reduced.
71 
72  A (input/output) DOUBLE_PRECISION array, dimension (LDA,N)
73  On entry, the symmetric matrix A. If UPLO = 'U', the leading
74  n-by-n upper triangular part of A contains the upper
75  triangular part of the matrix A, and the strictly lower
76  triangular part of A is not referenced. If UPLO = 'L', the
77  leading n-by-n lower triangular part of A contains the lower
78  triangular part of the matrix A, and the strictly upper
79  triangular part of A is not referenced.
80  On exit:
81  if UPLO = 'U', the last NB columns have been reduced to
82  tridiagonal form, with the diagonal elements overwriting
83  the diagonal elements of A; the elements above the diagonal
84  with the array TAU, represent the orthogonal matrix Q as a
85  product of elementary reflectors;
86  if UPLO = 'L', the first NB columns have been reduced to
87  tridiagonal form, with the diagonal elements overwriting
88  the diagonal elements of A; the elements below the diagonal
89  with the array TAU, represent the orthogonal matrix Q as a
90  product of elementary reflectors.
91  See Further Details.
92 
93  LDA (input) INTEGER
94  The leading dimension of the array A. LDA >= (1,N).
95 
96  E (output) DOUBLE_PRECISION array, dimension (N-1)
97  If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal
98  elements of the last NB columns of the reduced matrix;
99  if UPLO = 'L', E(1:nb) contains the subdiagonal elements of
100  the first NB columns of the reduced matrix.
101 
102  TAU (output) DOUBLE_PRECISION array, dimension (N-1)
103  The scalar factors of the elementary reflectors, stored in
104  TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'.
105  See Further Details.
106 
107  W (output) DOUBLE_PRECISION array, dimension (LDW,NB)
108  The n-by-nb matrix W required to update the unreduced part
109  of A.
110 
111  LDW (input) INTEGER
112  The leading dimension of the array W. LDW >= max(1,N).
113 
114  Further Details
115  ===============
116  If UPLO = 'U', the matrix Q is represented as a product of elementary
117  reflectors
118 
119  Q = H(n) H(n-1) . . . H(n-nb+1).
120 
121  Each H(i) has the form
122 
123  H(i) = I - tau * v * v'
124 
125  where tau is a real scalar, and v is a real vector with
126  v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
127  and tau in TAU(i-1).
128 
129  If UPLO = 'L', the matrix Q is represented as a product of elementary
130  reflectors
131 
132  Q = H(1) H(2) . . . H(nb).
133 
134  Each H(i) has the form
135 
136  H(i) = I - tau * v * v'
137 
138  where tau is a real scalar, and v is a real vector with
139  v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
140  and tau in TAU(i).
141 
142  The elements of the vectors v together form the n-by-nb matrix V
143  which is needed, with W, to apply the transformation to the unreduced
144  part of the matrix, using a symmetric rank-2k update of the form:
145  A := A - V*W' - W*V'.
146 
147  The contents of A on exit are illustrated by the following examples
148  with n = 5 and nb = 2:
149 
150  if UPLO = 'U': if UPLO = 'L':
151 
152  ( a a a v4 v5 ) ( d )
153  ( a a v4 v5 ) ( 1 d )
154  ( a 1 v5 ) ( v1 1 a )
155  ( d 1 ) ( v1 v2 a a )
156  ( d ) ( v1 v2 a a a )
157 
158  where d denotes a diagonal element of the reduced matrix, a denotes
159  an element of the original matrix that is unchanged, and vi denotes
160  an element of the vector defining H(i).
161  ===================================================================== */
162 
163  char uplo_[2] = {uplo, 0};
164 
165  magma_int_t i;
166 
167  double c_neg_one = MAGMA_D_NEG_ONE;
168  double c_one = MAGMA_D_ONE;
169  double c_zero = MAGMA_D_ZERO;
170 
171  double value = MAGMA_D_ZERO;
172 
173  magma_int_t ione = 1;
174 
175  magma_int_t i_n, i_1, iw;
176 
177  double alpha;
178  double *f;
179 
180  if (n <= 0) {
181  return 0;
182  }
183 
184  magma_queue_t stream;
185  magma_queue_create( &stream );
186  magma_dmalloc_cpu( &f, n );
187  assert( f != NULL ); // TODO return error, or allocate outside dlatrd
188 
189  if (lapackf77_lsame(uplo_, "U")) {
190 
191  /* Reduce last NB columns of upper triangle */
192  for (i = n-1; i >= n - nb ; --i) {
193  i_1 = i + 1;
194  i_n = n - i - 1;
195 
196  iw = i - n + nb;
197  if (i < n-1) {
198  /* Update A(1:i,i) */
199  #if defined(PRECISION_z) || defined(PRECISION_c)
200  lapackf77_dlacgv(&i_n, W(i, iw+1), &ldw);
201  #endif
202  blasf77_dgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda,
203  W(i, iw+1), &ldw, &c_one, A(0, i), &ione);
204  #if defined(PRECISION_z) || defined(PRECISION_c)
205  lapackf77_dlacgv(&i_n, W(i, iw+1), &ldw);
206  lapackf77_dlacgv(&i_n, A(i, i+1), &ldw);
207  #endif
208  blasf77_dgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
209  A(i, i+1), &lda, &c_one, A(0, i), &ione);
210  #if defined(PRECISION_z) || defined(PRECISION_c)
211  lapackf77_dlacgv(&i_n, A(i, i+1), &ldw);
212  #endif
213  }
214  if (i > 0) {
215  /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
216 
217  alpha = *A(i-1, i);
218 
219  lapackf77_dlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]);
220 
221  e[i-1] = MAGMA_D_REAL( alpha );
222  MAGMA_D_SET2REAL(*A(i-1, i), 1.);
223 
224  /* Compute W(1:i-1,i) */
225  // 1. Send the block reflector A(0:n-i-1,i) to the GPU
226  magma_dsetvector( i, A(0, i), 1, dA(0, i), 1 );
227 
228  #if (GPUSHMEM < 200)
229  magma_dsymv(MagmaUpper, i, c_one, dA(0, 0), ldda,
230  dA(0, i), ione, c_zero, dW(0, iw), ione);
231  #else
232  magmablas_dsymv2(MagmaUpper, i, c_one, dA(0, 0), ldda,
233  dA(0, i), ione, c_zero, dW(0, iw), ione,
234  dwork, ldwork);
235  #endif
236 
237  // 2. Start putting the result back (asynchronously)
239  dW(0, iw), lddw,
240  W(0, iw) /*test*/, ldw, stream );
241 
242  if (i < n-1) {
243  blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
244  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
245  }
246 
247  // 3. Here is where we need it // TODO find the right place
248  magma_queue_sync( stream );
249 
250  if (i < n-1) {
251  blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
252  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
253 
254  blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
255  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
256 
257  blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
258  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
259  }
260 
261  blasf77_dscal(&i, &tau[i - 1], W(0, iw), &ione);
262 
263  #if defined(PRECISION_z) || defined(PRECISION_c)
264  cblas_ddot_sub( i, W(0,iw), ione, A(0,i), ione, &value );
265  #else
266  value = cblas_ddot( i, W(0,iw), ione, A(0,i), ione );
267  #endif
268  alpha = tau[i - 1] * -0.5f * value;
269  blasf77_daxpy(&i, &alpha, A(0, i), &ione,
270  W(0, iw), &ione);
271  }
272  }
273  }
274  else {
275  /* Reduce first NB columns of lower triangle */
276  for (i = 0; i < nb; ++i) {
277 
278  /* Update A(i:n,i) */
279  i_n = n - i;
280  #if defined(PRECISION_z) || defined(PRECISION_c)
281  lapackf77_dlacgv(&i, W(i, 0), &ldw);
282  #endif
283  blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda,
284  W(i, 0), &ldw, &c_one, A(i, i), &ione);
285  #if defined(PRECISION_z) || defined(PRECISION_c)
286  lapackf77_dlacgv(&i, W(i, 0), &ldw);
287  lapackf77_dlacgv(&i, A(i ,0), &lda);
288  #endif
289  blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw,
290  A(i, 0), &lda, &c_one, A(i, i), &ione);
291  #if defined(PRECISION_z) || defined(PRECISION_c)
292  lapackf77_dlacgv(&i, A(i, 0), &lda);
293  #endif
294 
295  if (i < n-1) {
296  /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
297  i_n = n - i - 1;
298  alpha = *A(i+1, i);
299  lapackf77_dlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]);
300  e[i] = MAGMA_D_REAL( alpha );
301  MAGMA_D_SET2REAL(*A(i+1, i), 1.);
302 
303  /* Compute W(i+1:n,i) */
304  // 1. Send the block reflector A(i+1:n,i) to the GPU
305  magma_dsetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 );
306 
307  #if (GPUSHMEM < 200)
308  magma_dsymv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero,
309  dW(i+1, i), ione);
310  #else
311  magmablas_dsymv2('L', i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero,
312  dW(i+1, i), ione,
313  dwork, ldwork);
314  #endif
315 
316  // 2. Start putting the result back (asynchronously)
317  magma_dgetmatrix_async( i_n, 1,
318  dW(i+1, i), lddw,
319  W(i+1, i), ldw, stream );
320 
321  blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
322  A(i+1, i), &ione, &c_zero, W(0, i), &ione);
323 
324  blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
325  W(0, i), &ione, &c_zero, f, &ione);
326 
327  blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
328  A(i+1, i), &ione, &c_zero, W(0, i), &ione);
329 
330  // 3. Here is where we need it
331  magma_queue_sync( stream );
332 
333  if (i!=0)
334  blasf77_daxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione);
335 
336  blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
337  W(0, i), &ione, &c_one, W(i+1, i), &ione);
338  blasf77_dscal(&i_n, &tau[i], W(i+1,i), &ione);
339  #if defined(PRECISION_z) || defined(PRECISION_c)
340  cblas_ddot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value );
341  #else
342  value = cblas_ddot( i_n, W(i+1,i), ione, A(i+1,i), ione );
343  #endif
344  alpha = tau[i] * -0.5f * value;
345  blasf77_daxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione);
346  }
347  }
348  }
349 
350  magma_free_cpu(f);
351  magma_queue_destroy( stream );
352 
353  return 0;
354 } /* dlatrd */
#define MagmaTransStr
Definition: magma.h:81
#define A(i, j)
Definition: dlatrd2.cpp:21
void lapackf77_dlarfg(const magma_int_t *n, double *alpha, double *x, const magma_int_t *incx, double *tau)
#define min(a, b)
Definition: common_magma.h:86
#define MAGMA_D_ONE
Definition: magma.h:176
#define magma_queue_create(queuePtr)
Definition: magma.h:113
#define MagmaUpper
Definition: magma.h:61
void blasf77_dgemv(const char *transa, const magma_int_t *m, const magma_int_t *n, const double *alpha, const double *A, const magma_int_t *lda, const double *x, const magma_int_t *incx, const double *beta, double *y, const magma_int_t *incy)
magma_int_t ldda
#define magma_dsetvector(n, hx_src, incx, dy_dst, incy)
Definition: magmablas_d.h:634
magma_int_t magmablas_dsymv2(magma_uplo_t uplo, magma_int_t n, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_const_ptr dX, magma_int_t incx, double beta, magmaDouble_ptr dY, magma_int_t incy, magmaDouble_ptr dwork, magma_int_t lwork)
double cblas_ddot(const int N, const double *X, const int incX, const double *Y, const int incY)
int magma_int_t
Definition: magmablas.h:12
#define magma_dgetmatrix_async(m, n, dA_src, ldda, hB_dst, ldb, queue)
Definition: magmablas_d.h:714
#define magma_queue_destroy(queue)
Definition: magma.h:116
void magma_dsymv(magma_uplo_t uplo, magma_int_t n, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_const_ptr dx, magma_int_t incx, double beta, magmaDouble_ptr dy, magma_int_t incy)
#define dA(i, j)
Definition: dlatrd2.cpp:24
#define W(i, j)
Definition: dlatrd2.cpp:22
#define dwork(dev, i, j)
void blasf77_daxpy(const magma_int_t *n, const double *alpha, const double *x, const magma_int_t *incx, double *y, const magma_int_t *incy)
#define MagmaLower
Definition: magma.h:62
#define dW(i, j)
Definition: dlatrd2.cpp:25
void lapackf77_dlacgv(const magma_int_t *n, double *x, const magma_int_t *incx)
magma_int_t magma_dlatrd2(char uplo, magma_int_t n, magma_int_t nb, double *a, magma_int_t lda, double *e, double *tau, double *w, magma_int_t ldw, double *da, magma_int_t ldda, double *dw, magma_int_t lddw, double *dwork, magma_int_t ldwork)
Definition: dlatrd2.cpp:29
#define lapackf77_lsame
Definition: magma_lapack.h:23
#define MAGMA_D_REAL(a)
Definition: magma.h:168
static magma_err_t magma_dmalloc_cpu(double **ptrPtr, size_t n)
Definition: magma.h:84
void blasf77_dscal(const magma_int_t *n, const double *alpha, double *x, const magma_int_t *incx)
#define MAGMA_D_SET2REAL(v, t)
Definition: magma.h:159
#define MAGMA_D_NEG_ONE
Definition: magma.h:178
#define MAGMA_D_ZERO
Definition: magma.h:175
magma_err_t magma_free_cpu(void *ptr)
#define magma_queue_sync(queue)
Definition: magma.h:119