MAGMA  1.2.0
MatrixAlgebraonGPUandMulticoreArchitectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
clatrd.cpp
Go to the documentation of this file.
1 /*
2  -- MAGMA (version 1.2.0) --
3  Univ. of Tennessee, Knoxville
4  Univ. of California, Berkeley
5  Univ. of Colorado, Denver
6  May 2012
7 
8  @author Stan Tomov
9  @author Raffaele Solca
10 
11  @generated c Thu May 10 22:26:57 2012
12 
13 */
14 #include "common_magma.h"
15 #include <cblas.h>
16 
17 #define PRECISION_c
18 
19 #define A(i, j) (a+(j)*lda + (i))
20 #define W(i, j) (w+(j)*ldw + (i))
21 
22 #define dA(i, j) (da+(j)*ldda + (i))
23 #define dW(i, j) (dw+(j)*lddw + (i))
24 
25 extern "C" magma_int_t
27  cuFloatComplex *a, magma_int_t lda,
28  float *e, cuFloatComplex *tau,
29  cuFloatComplex *w, magma_int_t ldw,
30  cuFloatComplex *da, magma_int_t ldda,
31  cuFloatComplex *dw, magma_int_t lddw)
32 {
33 /* -- MAGMA (version 1.2.0) --
34  Univ. of Tennessee, Knoxville
35  Univ. of California, Berkeley
36  Univ. of Colorado, Denver
37  May 2012
38 
39  Purpose
40  =======
41  CLATRD reduces NB rows and columns of a complex Hermitian matrix A to
42  Hermitian tridiagonal form by an orthogonal similarity
43  transformation Q' * A * Q, and returns the matrices V and W which are
44  needed to apply the transformation to the unreduced part of A.
45 
46  If UPLO = 'U', CLATRD reduces the last NB rows and columns of a
47  matrix, of which the upper triangle is supplied;
48  if UPLO = 'L', CLATRD reduces the first NB rows and columns of a
49  matrix, of which the lower triangle is supplied.
50 
51  This is an auxiliary routine called by CHETRD.
52 
53  Arguments
54  =========
55  UPLO (input) CHARACTER*1
56  Specifies whether the upper or lower triangular part of the
57  Hermitian matrix A is stored:
58  = 'U': Upper triangular
59  = 'L': Lower triangular
60 
61  N (input) INTEGER
62  The order of the matrix A.
63 
64  NB (input) INTEGER
65  The number of rows and columns to be reduced.
66 
67  A (input/output) COMPLEX array, dimension (LDA,N)
68  On entry, the Hermitian matrix A. If UPLO = 'U', the leading
69  n-by-n upper triangular part of A contains the upper
70  triangular part of the matrix A, and the strictly lower
71  triangular part of A is not referenced. If UPLO = 'L', the
72  leading n-by-n lower triangular part of A contains the lower
73  triangular part of the matrix A, and the strictly upper
74  triangular part of A is not referenced.
75  On exit:
76  if UPLO = 'U', the last NB columns have been reduced to
77  tridiagonal form, with the diagonal elements overwriting
78  the diagonal elements of A; the elements above the diagonal
79  with the array TAU, represent the orthogonal matrix Q as a
80  product of elementary reflectors;
81  if UPLO = 'L', the first NB columns have been reduced to
82  tridiagonal form, with the diagonal elements overwriting
83  the diagonal elements of A; the elements below the diagonal
84  with the array TAU, represent the orthogonal matrix Q as a
85  product of elementary reflectors.
86  See Further Details.
87 
88  LDA (input) INTEGER
89  The leading dimension of the array A. LDA >= (1,N).
90 
91  E (output) COMPLEX array, dimension (N-1)
92  If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal
93  elements of the last NB columns of the reduced matrix;
94  if UPLO = 'L', E(1:nb) contains the subdiagonal elements of
95  the first NB columns of the reduced matrix.
96 
97  TAU (output) COMPLEX array, dimension (N-1)
98  The scalar factors of the elementary reflectors, stored in
99  TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'.
100  See Further Details.
101 
102  W (output) COMPLEX array, dimension (LDW,NB)
103  The n-by-nb matrix W required to update the unreduced part
104  of A.
105 
106  LDW (input) INTEGER
107  The leading dimension of the array W. LDW >= max(1,N).
108 
109  Further Details
110  ===============
111  If UPLO = 'U', the matrix Q is represented as a product of elementary
112  reflectors
113 
114  Q = H(n) H(n-1) . . . H(n-nb+1).
115 
116  Each H(i) has the form
117 
118  H(i) = I - tau * v * v'
119 
120  where tau is a complex scalar, and v is a complex vector with
121  v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
122  and tau in TAU(i-1).
123 
124  If UPLO = 'L', the matrix Q is represented as a product of elementary
125  reflectors
126 
127  Q = H(1) H(2) . . . H(nb).
128 
129  Each H(i) has the form
130 
131  H(i) = I - tau * v * v'
132 
133  where tau is a complex scalar, and v is a complex vector with
134  v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
135  and tau in TAU(i).
136 
137  The elements of the vectors v together form the n-by-nb matrix V
138  which is needed, with W, to apply the transformation to the unreduced
139  part of the matrix, using a Hermitian rank-2k update of the form:
140  A := A - V*W' - W*V'.
141 
142  The contents of A on exit are illustrated by the following examples
143  with n = 5 and nb = 2:
144 
145  if UPLO = 'U': if UPLO = 'L':
146 
147  ( a a a v4 v5 ) ( d )
148  ( a a v4 v5 ) ( 1 d )
149  ( a 1 v5 ) ( v1 1 a )
150  ( d 1 ) ( v1 v2 a a )
151  ( d ) ( v1 v2 a a a )
152 
153  where d denotes a diagonal element of the reduced matrix, a denotes
154  an element of the original matrix that is unchanged, and vi denotes
155  an element of the vector defining H(i).
156  ===================================================================== */
157 
158  char uplo_[2] = {uplo, 0};
159 
160  static magma_int_t i;
161 
162  cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
163  cuFloatComplex c_one = MAGMA_C_ONE;
164  cuFloatComplex c_zero = MAGMA_C_ZERO;
165 
166  #if defined(PRECISION_z) || defined(PRECISION_c)
167  cuFloatComplex value = MAGMA_C_ZERO;
168  #endif
169 
170  static magma_int_t ione = 1;
171 
172  static magma_int_t i_n, i_1, iw;
173 
174  static cuFloatComplex alpha;
175 
176  cuFloatComplex *f = (cuFloatComplex *)malloc(n*sizeof(cuFloatComplex ));
177 
178  if (n <= 0) {
179  return 0;
180  }
181 
182  static cudaStream_t stream;
183  magma_queue_create( &stream );
184 
185  if (lapackf77_lsame(uplo_, "U")) {
186 
187  /* Reduce last NB columns of upper triangle */
188  for (i = n-1; i >= n - nb ; --i) {
189  i_1 = i + 1;
190  i_n = n - i - 1;
191 
192  iw = i - n + nb;
193  if (i < n-1) {
194  /* Update A(1:i,i) */
195  #if defined(PRECISION_z) || defined(PRECISION_c)
196  lapackf77_clacgv(&i_n, W(i, iw+1), &ldw);
197  #endif
198  blasf77_cgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda,
199  W(i, iw+1), &ldw, &c_one, A(0, i), &ione);
200  #if defined(PRECISION_z) || defined(PRECISION_c)
201  lapackf77_clacgv(&i_n, W(i, iw+1), &ldw);
202  lapackf77_clacgv(&i_n, A(i, i+1), &lda);
203  #endif
204  blasf77_cgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
205  A(i, i+1), &lda, &c_one, A(0, i), &ione);
206  #if defined(PRECISION_z) || defined(PRECISION_c)
207  lapackf77_clacgv(&i_n, A(i, i+1), &lda);
208  #endif
209  }
210  if (i > 0) {
211  /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
212 
213  alpha = *A(i-1, i);
214 
215  lapackf77_clarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]);
216 
217  e[i-1] = MAGMA_C_REAL( alpha );
218  MAGMA_C_SET2REAL(*A(i-1, i), 1.);
219 
220  /* Compute W(1:i-1,i) */
221  // 1. Send the block reflector A(0:n-i-1,i) to the GPU
222  magma_csetvector( i, A(0, i), 1, dA(0, i), 1 );
223 
224  magma_chemv(MagmaUpper, i, c_one, dA(0, 0), ldda,
225  dA(0, i), ione, c_zero, dW(0, iw), ione);
226 
227  // 2. Start putting the result back (asynchronously)
229  dW(0, iw), lddw,
230  W(0, iw) /*test*/, ldw, stream );
231 
232  if (i < n-1) {
233 
234  blasf77_cgemv(MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
235  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
236  }
237 
238  // 3. Here is where we need it // TODO find the right place
239  magma_queue_sync( stream );
240 
241  if (i < n-1) {
242 
243  blasf77_cgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
244  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
245 
246  blasf77_cgemv(MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
247  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
248 
249  blasf77_cgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
250  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
251  }
252 
253  blasf77_cscal(&i, &tau[i - 1], W(0, iw), &ione);
254 
255 #if defined(PRECISION_z) || defined(PRECISION_c)
256  cblas_cdotc_sub(i, W(0, iw), ione, A(0, i), ione, &value);
257 
258 // blasf77_cdotc(&value, &i, W(0, iw), &ione, A(0, i), &ione);
259  alpha = tau[i - 1] * -.5f * value;
260 #else
261  alpha = tau[i - 1] * -.5f * blasf77_cdotc(&i, W(0, iw), &ione, A(0, i), &ione);
262 #endif
263  blasf77_caxpy(&i, &alpha, A(0, i), &ione,
264  W(0, iw), &ione);
265  }
266  }
267 
268  } else {
269 
270  /* Reduce first NB columns of lower triangle */
271  for (i = 0; i < nb; ++i)
272  {
273 
274  /* Update A(i:n,i) */
275  i_n = n - i;
276  #if defined(PRECISION_z) || defined(PRECISION_c)
277  lapackf77_clacgv(&i, W(i, 0), &ldw);
278  #endif
279  blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda,
280  W(i, 0), &ldw, &c_one, A(i, i), &ione);
281  #if defined(PRECISION_z) || defined(PRECISION_c)
282  lapackf77_clacgv(&i, W(i, 0), &ldw);
283  lapackf77_clacgv(&i, A(i ,0), &lda);
284  #endif
285  blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw,
286  A(i, 0), &lda, &c_one, A(i, i), &ione);
287  #if defined(PRECISION_z) || defined(PRECISION_c)
288  lapackf77_clacgv(&i, A(i, 0), &lda);
289  #endif
290 
291  if (i < n-1)
292  {
293  /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
294  i_n = n - i - 1;
295  alpha = *A(i+1, i);
296  lapackf77_clarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]);
297  e[i] = MAGMA_C_REAL( alpha );
298  MAGMA_C_SET2REAL(*A(i+1, i), 1.);
299 
300  /* Compute W(i+1:n,i) */
301  // 1. Send the block reflector A(i+1:n,i) to the GPU
302  magma_csetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 );
303 
304  magma_chemv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero,
305  dW(i+1, i), ione);
306 
307  // 2. Start putting the result back (asynchronously)
308  magma_cgetmatrix_async( i_n, 1,
309  dW(i+1, i), lddw,
310  W(i+1, i), ldw, stream );
311 
312  blasf77_cgemv(MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
313  A(i+1, i), &ione, &c_zero, W(0, i), &ione);
314 
315  blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
316  W(0, i), &ione, &c_zero, f, &ione);
317 
318  blasf77_cgemv(MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
319  A(i+1, i), &ione, &c_zero, W(0, i), &ione);
320 
321  // 3. Here is where we need it
322  magma_queue_sync( stream );
323 
324  if (i!=0)
325  blasf77_caxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione);
326 
327  blasf77_cgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
328  W(0, i), &ione, &c_one, W(i+1, i), &ione);
329  blasf77_cscal(&i_n, &tau[i], W(i+1,i), &ione);
330 
331  #if defined(PRECISION_z) || defined(PRECISION_c)
332  /* Comment:
333  To do - move to cblas in cases like this. The commented
334  out version works with MKL but is not a standard interface
335  for other BLAS zdoc implementations
336  */
337 
338  cblas_cdotc_sub(i_n, W(i +1, i), ione,
339  A(i +1, i), ione, &value);
340 
341  //blasf77_cdotc(&value, &i_n, W(i+1,i), &ione, A(i+1, i), &ione);
342  alpha = tau[i]* -.5f * value;
343  #else
344  alpha = tau[i]* -.5f* blasf77_cdotc(&i_n, W(i+1,i), &ione, A(i+1, i), &ione);
345  #endif
346  blasf77_caxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione);
347  }
348  }
349  }
350 
351  free(f);
352  magma_queue_destroy( stream );
353 
354  return 0;
355 } /* clatrd_ */
356