MAGMA  1.2.0 MatrixAlgebraonGPUandMulticoreArchitectures
clatrd.cpp
Go to the documentation of this file.
1 /*
2  -- MAGMA (version 1.2.0) --
3  Univ. of Tennessee, Knoxville
4  Univ. of California, Berkeley
6  May 2012
7
8  @author Stan Tomov
9  @author Raffaele Solca
10
11  @generated c Thu May 10 22:26:57 2012
12
13 */
14 #include "common_magma.h"
15 #include <cblas.h>
16
17 #define PRECISION_c
18
19 #define A(i, j) (a+(j)*lda + (i))
20 #define W(i, j) (w+(j)*ldw + (i))
21
22 #define dA(i, j) (da+(j)*ldda + (i))
23 #define dW(i, j) (dw+(j)*lddw + (i))
24
25 extern "C" magma_int_t
27  cuFloatComplex *a, magma_int_t lda,
28  float *e, cuFloatComplex *tau,
29  cuFloatComplex *w, magma_int_t ldw,
30  cuFloatComplex *da, magma_int_t ldda,
31  cuFloatComplex *dw, magma_int_t lddw)
32 {
33 /* -- MAGMA (version 1.2.0) --
34  Univ. of Tennessee, Knoxville
35  Univ. of California, Berkeley
37  May 2012
38
39  Purpose
40  =======
41  CLATRD reduces NB rows and columns of a complex Hermitian matrix A to
42  Hermitian tridiagonal form by an orthogonal similarity
43  transformation Q' * A * Q, and returns the matrices V and W which are
44  needed to apply the transformation to the unreduced part of A.
45
46  If UPLO = 'U', CLATRD reduces the last NB rows and columns of a
47  matrix, of which the upper triangle is supplied;
48  if UPLO = 'L', CLATRD reduces the first NB rows and columns of a
49  matrix, of which the lower triangle is supplied.
50
51  This is an auxiliary routine called by CHETRD.
52
53  Arguments
54  =========
55  UPLO (input) CHARACTER*1
56  Specifies whether the upper or lower triangular part of the
57  Hermitian matrix A is stored:
58  = 'U': Upper triangular
59  = 'L': Lower triangular
60
61  N (input) INTEGER
62  The order of the matrix A.
63
64  NB (input) INTEGER
65  The number of rows and columns to be reduced.
66
67  A (input/output) COMPLEX array, dimension (LDA,N)
68  On entry, the Hermitian matrix A. If UPLO = 'U', the leading
69  n-by-n upper triangular part of A contains the upper
70  triangular part of the matrix A, and the strictly lower
71  triangular part of A is not referenced. If UPLO = 'L', the
72  leading n-by-n lower triangular part of A contains the lower
73  triangular part of the matrix A, and the strictly upper
74  triangular part of A is not referenced.
75  On exit:
76  if UPLO = 'U', the last NB columns have been reduced to
77  tridiagonal form, with the diagonal elements overwriting
78  the diagonal elements of A; the elements above the diagonal
79  with the array TAU, represent the orthogonal matrix Q as a
80  product of elementary reflectors;
81  if UPLO = 'L', the first NB columns have been reduced to
82  tridiagonal form, with the diagonal elements overwriting
83  the diagonal elements of A; the elements below the diagonal
84  with the array TAU, represent the orthogonal matrix Q as a
85  product of elementary reflectors.
86  See Further Details.
87
88  LDA (input) INTEGER
89  The leading dimension of the array A. LDA >= (1,N).
90
91  E (output) COMPLEX array, dimension (N-1)
92  If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal
93  elements of the last NB columns of the reduced matrix;
94  if UPLO = 'L', E(1:nb) contains the subdiagonal elements of
95  the first NB columns of the reduced matrix.
96
97  TAU (output) COMPLEX array, dimension (N-1)
98  The scalar factors of the elementary reflectors, stored in
99  TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'.
100  See Further Details.
101
102  W (output) COMPLEX array, dimension (LDW,NB)
103  The n-by-nb matrix W required to update the unreduced part
104  of A.
105
106  LDW (input) INTEGER
107  The leading dimension of the array W. LDW >= max(1,N).
108
109  Further Details
110  ===============
111  If UPLO = 'U', the matrix Q is represented as a product of elementary
112  reflectors
113
114  Q = H(n) H(n-1) . . . H(n-nb+1).
115
116  Each H(i) has the form
117
118  H(i) = I - tau * v * v'
119
120  where tau is a complex scalar, and v is a complex vector with
121  v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
122  and tau in TAU(i-1).
123
124  If UPLO = 'L', the matrix Q is represented as a product of elementary
125  reflectors
126
127  Q = H(1) H(2) . . . H(nb).
128
129  Each H(i) has the form
130
131  H(i) = I - tau * v * v'
132
133  where tau is a complex scalar, and v is a complex vector with
134  v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
135  and tau in TAU(i).
136
137  The elements of the vectors v together form the n-by-nb matrix V
138  which is needed, with W, to apply the transformation to the unreduced
139  part of the matrix, using a Hermitian rank-2k update of the form:
140  A := A - V*W' - W*V'.
141
142  The contents of A on exit are illustrated by the following examples
143  with n = 5 and nb = 2:
144
145  if UPLO = 'U': if UPLO = 'L':
146
147  ( a a a v4 v5 ) ( d )
148  ( a a v4 v5 ) ( 1 d )
149  ( a 1 v5 ) ( v1 1 a )
150  ( d 1 ) ( v1 v2 a a )
151  ( d ) ( v1 v2 a a a )
152
153  where d denotes a diagonal element of the reduced matrix, a denotes
154  an element of the original matrix that is unchanged, and vi denotes
155  an element of the vector defining H(i).
156  ===================================================================== */
157
158  char uplo_[2] = {uplo, 0};
159
160  static magma_int_t i;
161
162  cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
163  cuFloatComplex c_one = MAGMA_C_ONE;
164  cuFloatComplex c_zero = MAGMA_C_ZERO;
165
166  #if defined(PRECISION_z) || defined(PRECISION_c)
167  cuFloatComplex value = MAGMA_C_ZERO;
168  #endif
169
170  static magma_int_t ione = 1;
171
172  static magma_int_t i_n, i_1, iw;
173
174  static cuFloatComplex alpha;
175
176  cuFloatComplex *f = (cuFloatComplex *)malloc(n*sizeof(cuFloatComplex ));
177
178  if (n <= 0) {
179  return 0;
180  }
181
182  static cudaStream_t stream;
183  magma_queue_create( &stream );
184
185  if (lapackf77_lsame(uplo_, "U")) {
186
187  /* Reduce last NB columns of upper triangle */
188  for (i = n-1; i >= n - nb ; --i) {
189  i_1 = i + 1;
190  i_n = n - i - 1;
191
192  iw = i - n + nb;
193  if (i < n-1) {
194  /* Update A(1:i,i) */
195  #if defined(PRECISION_z) || defined(PRECISION_c)
196  lapackf77_clacgv(&i_n, W(i, iw+1), &ldw);
197  #endif
198  blasf77_cgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda,
199  W(i, iw+1), &ldw, &c_one, A(0, i), &ione);
200  #if defined(PRECISION_z) || defined(PRECISION_c)
201  lapackf77_clacgv(&i_n, W(i, iw+1), &ldw);
202  lapackf77_clacgv(&i_n, A(i, i+1), &lda);
203  #endif
204  blasf77_cgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
205  A(i, i+1), &lda, &c_one, A(0, i), &ione);
206  #if defined(PRECISION_z) || defined(PRECISION_c)
207  lapackf77_clacgv(&i_n, A(i, i+1), &lda);
208  #endif
209  }
210  if (i > 0) {
211  /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
212
213  alpha = *A(i-1, i);
214
215  lapackf77_clarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]);
216
217  e[i-1] = MAGMA_C_REAL( alpha );
218  MAGMA_C_SET2REAL(*A(i-1, i), 1.);
219
220  /* Compute W(1:i-1,i) */
221  // 1. Send the block reflector A(0:n-i-1,i) to the GPU
222  magma_csetvector( i, A(0, i), 1, dA(0, i), 1 );
223
224  magma_chemv(MagmaUpper, i, c_one, dA(0, 0), ldda,
225  dA(0, i), ione, c_zero, dW(0, iw), ione);
226
227  // 2. Start putting the result back (asynchronously)
229  dW(0, iw), lddw,
230  W(0, iw) /*test*/, ldw, stream );
231
232  if (i < n-1) {
233
234  blasf77_cgemv(MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
235  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
236  }
237
238  // 3. Here is where we need it // TODO find the right place
239  magma_queue_sync( stream );
240
241  if (i < n-1) {
242
243  blasf77_cgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
244  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
245
246  blasf77_cgemv(MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
247  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
248
249  blasf77_cgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
250  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
251  }
252
253  blasf77_cscal(&i, &tau[i - 1], W(0, iw), &ione);
254
255 #if defined(PRECISION_z) || defined(PRECISION_c)
256  cblas_cdotc_sub(i, W(0, iw), ione, A(0, i), ione, &value);
257
258 // blasf77_cdotc(&value, &i, W(0, iw), &ione, A(0, i), &ione);
259  alpha = tau[i - 1] * -.5f * value;
260 #else
261  alpha = tau[i - 1] * -.5f * blasf77_cdotc(&i, W(0, iw), &ione, A(0, i), &ione);
262 #endif
263  blasf77_caxpy(&i, &alpha, A(0, i), &ione,
264  W(0, iw), &ione);
265  }
266  }
267
268  } else {
269
270  /* Reduce first NB columns of lower triangle */
271  for (i = 0; i < nb; ++i)
272  {
273
274  /* Update A(i:n,i) */
275  i_n = n - i;
276  #if defined(PRECISION_z) || defined(PRECISION_c)
277  lapackf77_clacgv(&i, W(i, 0), &ldw);
278  #endif
279  blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda,
280  W(i, 0), &ldw, &c_one, A(i, i), &ione);
281  #if defined(PRECISION_z) || defined(PRECISION_c)
282  lapackf77_clacgv(&i, W(i, 0), &ldw);
283  lapackf77_clacgv(&i, A(i ,0), &lda);
284  #endif
285  blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw,
286  A(i, 0), &lda, &c_one, A(i, i), &ione);
287  #if defined(PRECISION_z) || defined(PRECISION_c)
288  lapackf77_clacgv(&i, A(i, 0), &lda);
289  #endif
290
291  if (i < n-1)
292  {
293  /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
294  i_n = n - i - 1;
295  alpha = *A(i+1, i);
296  lapackf77_clarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]);
297  e[i] = MAGMA_C_REAL( alpha );
298  MAGMA_C_SET2REAL(*A(i+1, i), 1.);
299
300  /* Compute W(i+1:n,i) */
301  // 1. Send the block reflector A(i+1:n,i) to the GPU
302  magma_csetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 );
303
304  magma_chemv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero,
305  dW(i+1, i), ione);
306
307  // 2. Start putting the result back (asynchronously)
308  magma_cgetmatrix_async( i_n, 1,
309  dW(i+1, i), lddw,
310  W(i+1, i), ldw, stream );
311
312  blasf77_cgemv(MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
313  A(i+1, i), &ione, &c_zero, W(0, i), &ione);
314
315  blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
316  W(0, i), &ione, &c_zero, f, &ione);
317
318  blasf77_cgemv(MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
319  A(i+1, i), &ione, &c_zero, W(0, i), &ione);
320
321  // 3. Here is where we need it
322  magma_queue_sync( stream );
323
324  if (i!=0)
325  blasf77_caxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione);
326
327  blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
328  W(0, i), &ione, &c_one, W(i+1, i), &ione);
329  blasf77_cscal(&i_n, &tau[i], W(i+1,i), &ione);
330
331  #if defined(PRECISION_z) || defined(PRECISION_c)
332  /* Comment:
333  To do - move to cblas in cases like this. The commented
334  out version works with MKL but is not a standard interface
335  for other BLAS zdoc implementations
336  */
337
338  cblas_cdotc_sub(i_n, W(i +1, i), ione,
339  A(i +1, i), ione, &value);
340
341  //blasf77_cdotc(&value, &i_n, W(i+1,i), &ione, A(i+1, i), &ione);
342  alpha = tau[i]* -.5f * value;
343  #else
344  alpha = tau[i]* -.5f* blasf77_cdotc(&i_n, W(i+1,i), &ione, A(i+1, i), &ione);
345  #endif
346  blasf77_caxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione);
347  }
348  }
349  }
350
351  free(f);
352  magma_queue_destroy( stream );
353
354  return 0;
355 } /* clatrd_ */
356