MAGMA  1.2.0
MatrixAlgebraonGPUandMulticoreArchitectures
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
zhegst_m.cpp
Go to the documentation of this file.
1 /*
2  -- MAGMA (version 1.2.0) --
3  Univ. of Tennessee, Knoxville
4  Univ. of California, Berkeley
5  Univ. of Colorado, Denver
6  May 2012
7 
8  @author Raffaele Solca
9 
10  @precisions normal z -> s d c
11 */
12 #define N_MAX_GPU 8
13 
14 #include "common_magma.h"
15 #include "cblas.h"
16 
17 extern "C"
19 
20 #define A(i, j) (a+(j)*nb*lda + (i)*nb)
21 #define B(i, j) (b+(j)*nb*ldb + (i)*nb)
22 
23 #define dA(gpui, i, j) (dw[gpui] + (j)*nb*ldda + (i)*nb)
24 #define dB_c(gpui, i, j) (dw[gpui] + dima*ldda + (i)*nb + (j)*nb*lddbc)
25 #define dB_r(gpui, i, j) (dw[gpui] + dima*ldda + (i)*nb + (j)*nb*lddbr)
26 
27 extern "C" magma_int_t
29  cuDoubleComplex *a, magma_int_t lda,
30  cuDoubleComplex *b, magma_int_t ldb, magma_int_t *info)
31 {
32 /*
33  -- MAGMA (version 1.2.0) --
34  Univ. of Tennessee, Knoxville
35  Univ. of California, Berkeley
36  Univ. of Colorado, Denver
37  May 2012
38 
39 
40  Purpose
41  =======
42 
43  ZHEGST reduces a complex Hermitian-definite generalized
44  eigenproblem to standard form.
45 
46  If ITYPE = 1, the problem is A*x = lambda*B*x,
47  and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H)
48 
49  If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
50  B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H*A*L.
51 
52  B must have been previously factorized as U**H*U or L*L**H by ZPOTRF.
53 
54  Arguments
55  =========
56 
57  ITYPE (input) INTEGER
58  = 1: compute inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H);
59  = 2 or 3: compute U*A*U**H or L**H*A*L.
60 
61  UPLO (input) CHARACTER*1
62  = 'U': Upper triangle of A is stored and B is factored as
63  U**H*U;
64  = 'L': Lower triangle of A is stored and B is factored as
65  L*L**H.
66 
67  N (input) INTEGER
68  The order of the matrices A and B. N >= 0.
69 
70  A (input/output) COMPLEX*16 array, dimension (LDA,N)
71  On entry, the Hermitian matrix A. If UPLO = 'U', the leading
72  N-by-N upper triangular part of A contains the upper
73  triangular part of the matrix A, and the strictly lower
74  triangular part of A is not referenced. If UPLO = 'L', the
75  leading N-by-N lower triangular part of A contains the lower
76  triangular part of the matrix A, and the strictly upper
77  triangular part of A is not referenced.
78 
79  On exit, if INFO = 0, the transformed matrix, stored in the
80  same format as A.
81 
82  LDA (input) INTEGER
83  The leading dimension of the array A. LDA >= max(1,N).
84 
85  B (input) COMPLEX*16 array, dimension (LDB,N)
86  The triangular factor from the Cholesky factorization of B,
87  as returned by ZPOTRF.
88 
89  LDB (input) INTEGER
90  The leading dimension of the array B. LDB >= max(1,N).
91 
92  INFO (output) INTEGER
93  = 0: successful exit
94  < 0: if INFO = -i, the i-th argument had an illegal value
95 
96  =====================================================================*/
97 
98  char uplo_[2] = {uplo, 0};
99  magma_int_t k, kb, j, jb, kb2;
100  magma_int_t ldda, dima, lddbr, lddbc;
101  cuDoubleComplex c_one = MAGMA_Z_ONE;
102  cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
103  cuDoubleComplex c_half = MAGMA_Z_HALF;
104  cuDoubleComplex c_neg_half = MAGMA_Z_NEG_HALF;
105  cuDoubleComplex* dw[N_MAX_GPU];
106  cudaStream_t stream [N_MAX_GPU][3];
107  magma_int_t igpu = 0;
108 
109  int gpu_b;
110  magma_getdevice(&gpu_b);
111 
112  double d_one = 1.0;
113  long int upper = lapackf77_lsame(uplo_, "U");
114 
116 
117  /* Test the input parameters. */
118  *info = 0;
119  if (itype<1 || itype>3){
120  *info = -1;
121  }else if ((! upper) && (! lapackf77_lsame(uplo_, "L"))) {
122  *info = -2;
123  } else if (n < 0) {
124  *info = -3;
125  } else if (lda < max(1,n)) {
126  *info = -5;
127  }else if (ldb < max(1,n)) {
128  *info = -7;
129  }
130  if (*info != 0) {
131  magma_xerbla( __func__, -(*info) );
132  return *info;
133  }
134 
135  /* Quick return */
136  if ( n == 0 )
137  return *info;
138 
139  magma_int_t nbl = (n-1)/nb+1; // number of blocks
140 
141  if ( (itype==1 && upper) || (itype!=1 && !upper) ){
142  ldda = ((nbl-1)/nrgpu+1)*nb;
143  dima = n;
144  } else {
145  ldda = n;
146  dima = ((nbl-1)/nrgpu+1)*nb;
147  }
148  lddbr = 2 * nb;
149  lddbc = n;
150  for (igpu = 0; igpu < nrgpu; ++igpu){
151  magma_setdevice(igpu);
152  if (MAGMA_SUCCESS != magma_zmalloc( &dw[igpu], (dima*ldda + lddbc*lddbr) )) {
153  *info = MAGMA_ERR_DEVICE_ALLOC;
154  return *info;
155  }
156  magma_queue_create( &stream[igpu][0] );
157  magma_queue_create( &stream[igpu][1] );
158  magma_queue_create( &stream[igpu][2] );
159  }
160 
161  /* Use hybrid blocked code */
162 
163  if (itype==1) {
164  if (upper) {
165 
166  /* Compute inv(U')*A*inv(U) */
167 
168  //copy A to mgpus
169  for (k = 0; k < nbl; ++k){
170  igpu = k%nrgpu;
171  magma_setdevice(igpu);
172  kb = min(nb, n-k*nb);
173  magma_zsetmatrix_async( kb, n-k*nb,
174  A(k, k), lda,
175  dA(igpu, k/nrgpu, k), ldda, stream[igpu][0] );
176  }
177  kb= min(n,nb);
178  igpu = 0;
179  magma_setdevice(igpu);
180  // dB_r(0,0) is used to store B(k,k)
181  magma_zsetmatrix_async( kb, kb,
182  B(0, 0), ldb,
183  dB_r(igpu, 0, 0), lddbr, stream[igpu][1] );
184 
185  for(k = 0; k<nbl; ++k){
186  kb= min(n-k*nb,nb);
187  kb2= min(n-(k+1)*nb,nb);
188 
189  if(k+1<nbl){
190  for (igpu = 0; igpu < nrgpu; ++igpu){
191  magma_setdevice(igpu);
192  magma_queue_sync( stream[igpu][0] );
193  magma_zsetmatrix_async( kb, n-(k+1)*nb,
194  B(k, k+1), ldb,
195  dB_r(igpu, 0, k+1), lddbr, stream[igpu][0] );
196  }
197  }
198 
199  igpu = k%nrgpu;
200  magma_setdevice(igpu);
201 
202  magma_queue_sync( stream[igpu][1] ); // Needed, otherwise conflicts reading B(k,k) between hegs2 and cudaMemcpy2D
203  magma_queue_sync( stream[igpu][2] );
204 
205  if(k+1<nbl){
206  magmablasSetKernelStream(stream[igpu][1]);
207  // dB_r(0,0) stores B(k,k)
209  kb, n-(k+1)*nb,
210  c_one, dB_r(igpu, 0, 0), lddbr,
211  dA(igpu, k/nrgpu, k+1), ldda);
212  }
213 
214  lapackf77_zhegs2( &itype, uplo_, &kb, A(k,k), &lda, B(k,k), &ldb, info);
215 printf("hegs2%d\n", k);
216  if (k+1<nbl) {
217  magma_zsetmatrix_async( kb, kb,
218  A(k, k), lda,
219  dA(igpu, k/nrgpu, k), ldda, stream[igpu][0] );
220 
221  magma_queue_sync( stream[igpu][1] );
222  magmablasSetKernelStream(stream[igpu][0]);
223 
225  kb, n-(k+1)*nb,
226  c_neg_half, dA(igpu, k/nrgpu, k), ldda,
227  dB_r(igpu, 0, k+1), lddbr,
228  c_one, dA(igpu, k/nrgpu, k+1), ldda);
229 
230  magma_queue_sync( stream[igpu][0] );
231 
232  magma_zgetmatrix( kb, n-(k+1)*nb,
233  dA(igpu, k/nrgpu, k+1), ldda,
234  A(k, k+1), lda );
235 
236  // send the partially updated panel of dA to each gpu in the second dB block
237  // to overlap hemm computation
238 
239  for (igpu = 0; igpu < nrgpu; ++igpu){
240  magma_setdevice(igpu);
241  magma_zsetmatrix_async( kb, n-(k+1)*nb,
242  A(k, k+1), lda,
243  dB_r(igpu, 1, k+1), lddbr, stream[igpu][0] );
244  }
245 
246  igpu = k%nrgpu;
247  magma_setdevice(igpu);
248  magmablasSetKernelStream(stream[igpu][1]);
249 
251  kb, n-(k+1)*nb,
252  c_neg_half, dA(igpu, k/nrgpu, k), ldda,
253  dB_r(igpu, 0, k+1), lddbr,
254  c_one, dA(igpu, k/nrgpu, k+1), ldda);
255 
256  for (igpu = 0; igpu < nrgpu; ++igpu){
257  magma_queue_sync( stream[igpu][0] );
258  }
259 
260  for (j = k+1; j < nbl; ++j){
261  jb = min(nb, n-j*nb);
262  igpu = j%nrgpu;
263  magma_setdevice(igpu);
264  magmablasSetKernelStream(stream[igpu][(j/nrgpu)%3]);
266  jb, nb,
267  c_neg_one, dB_r(igpu, 1, j), lddbr,
268  dB_r(igpu, 0, j), lddbr,
269  d_one, dA(igpu, j/nrgpu, j), ldda);
270  magma_queue_sync( stream[igpu][((j)/nrgpu)%3] ); // Needed for correctness. Why?
271  if (j == k+1){
272  magma_queue_sync( stream[igpu][(j/nrgpu)%3] );
273  magma_zgetmatrix_async( kb2, kb2,
274  dA(igpu, (k+1)/nrgpu, k+1), ldda,
275  A(k+1, k+1), lda, stream[igpu][2] );
276  // dB_r(0,0) is used to store B(k,k)
277  magma_zsetmatrix_async( kb2, kb2,
278  B(k+1, k+1), ldb,
279  dB_r(igpu, 0, 0), lddbr, stream[igpu][1] );
280  }
281  }
282  for (j = k+1; j < nbl-1; ++j){
283  igpu = j%nrgpu;
284  magma_setdevice(igpu);
285  magmablasSetKernelStream(stream[igpu][0]);
286  magma_zgemm(MagmaConjTrans, MagmaNoTrans, nb, n-(j+1)*nb, nb, c_neg_one, dB_r(igpu, 0, j), lddbr,
287  dB_r(igpu, 1, j+1), lddbr, c_one, dA(igpu, j/nrgpu, j+1), ldda );
288 
289  magma_zgemm(MagmaConjTrans, MagmaNoTrans, nb, n-(j+1)*nb, nb, c_neg_one, dB_r(igpu, 1, j), lddbr,
290  dB_r(igpu, 0, j+1), lddbr, c_one, dA(igpu, j/nrgpu, j+1), ldda );
291  }
292  }
293  }
294 
295  for (igpu = 0; igpu < nrgpu; ++igpu){
296  magma_queue_sync( stream[igpu][0] );
297  magma_queue_sync( stream[igpu][1] );
298  }
299 
300  if (n > nb){
301 
302  magma_int_t nloc[N_MAX_GPU];
303 
304  jb = min(nb, n-nb);
305  for (igpu = 0; igpu < nrgpu; ++igpu){
306  nloc[igpu]=0;
307  magma_setdevice(igpu);
308  magma_zsetmatrix_async( jb, n-nb,
309  B(1, 1), ldb,
310  dB_r(igpu, 1, 1), lddbr, stream[igpu][1] );
311  }
312  for (j = 1; j < nbl; ++j){
313  if ((j+1)*nb < n){
314  jb = min(nb, n-(j+1)*nb);
315  for (igpu = 0; igpu < nrgpu; ++igpu){
316  magma_setdevice(igpu);
317  magma_zsetmatrix_async( jb, n-(j+1)*nb,
318  B(j+1, j+1), ldb,
319  dB_r(igpu, (j+1)%2, j+1), lddbr, stream[igpu][(j+1)%2] );
320  }
321  }
322  jb = min(nb, n-j*nb);
323  nloc[(j-1)%nrgpu] += nb;
324 
325  for (igpu = 0; igpu < nrgpu; ++igpu){
326  magma_setdevice(igpu);
327  magmablasSetKernelStream(stream[igpu][j%2]);
328  magma_ztrsm(MagmaRight, uplo, MagmaNoTrans, MagmaNonUnit, nloc[igpu], jb, c_one, dB_r(igpu, j%2, j), lddbr,
329  dA(igpu, 0, j), ldda );
330  }
331 
332  if ( j < nbl-1 ){
333 
334  for (igpu = 0; igpu < nrgpu; ++igpu){
335  magma_setdevice(igpu);
336  magmablasSetKernelStream(stream[igpu][j%2]);
337  magma_zgemm(MagmaNoTrans, MagmaNoTrans, nloc[igpu], n-(j+1)*nb, nb, c_neg_one, dA(igpu, 0, j), ldda,
338  dB_r(igpu, j%2, j+1), lddbr, c_one, dA(igpu, 0, j+1), ldda );
339  }
340  }
341 
342  for (igpu = 0; igpu < nrgpu; ++igpu){
343  magma_queue_sync( stream[igpu][j%2] );
344  }
345 
346  for (k = 0; k < j; ++k){
347  igpu = k%nrgpu;
348  magma_setdevice(igpu);
349  kb = min(nb, n-k*nb);
350  magma_zgetmatrix_async( kb, jb,
351  dA(igpu, k/nrgpu, j), ldda,
352  A(k, j), lda, stream[igpu][2] );
353  }
354  }
355  }
356 
357 
358  } else {
359  /* Compute inv(L)*A*inv(L') */
360  //copy A to mgpus
361  for (k = 0; k < nbl; ++k){
362  igpu = k%nrgpu;
363  magma_setdevice(igpu);
364  kb = min(nb, n-k*nb);
365  magma_zsetmatrix_async( (n-k*nb), kb,
366  A(k, k), lda,
367  dA(igpu, k, k/nrgpu), ldda, stream[igpu][0] );
368  }
369  kb= min(n,nb);
370  igpu = 0;
371  magma_setdevice(igpu);
372  // dB_c(0,0) is used to store B(k,k)
373  magma_zsetmatrix_async( kb, kb,
374  B(0, 0), ldb,
375  dB_c(igpu, 0, 0), lddbc, stream[igpu][1] );
376 
377  for(k = 0; k<nbl; ++k){
378  kb= min(n-k*nb,nb);
379  kb2= min(n-(k+1)*nb,nb);
380 
381  if(k+1<nbl){
382  for (igpu = 0; igpu < nrgpu; ++igpu){
383  magma_setdevice(igpu);
384  magma_queue_sync( stream[igpu][0] );
385  magma_zsetmatrix_async( (n-(k+1)*nb), kb,
386  B(k+1, k), ldb,
387  dB_c(igpu, k+1, 0), lddbc, stream[igpu][0] );
388  }
389  }
390 
391  igpu = k%nrgpu;
392  magma_setdevice(igpu);
393 
394  magma_queue_sync( stream[igpu][1] ); // Needed, otherwise conflicts reading B(k,k) between hegs2 and cudaMemcpy2D
395  magma_queue_sync( stream[igpu][2] );
396 
397  if(k+1<nbl){
398  magmablasSetKernelStream(stream[igpu][1]);
399  // dB_c(0,0) stores B(k,k)
401  n-(k+1)*nb, kb,
402  c_one, dB_c(igpu, 0, 0), lddbc,
403  dA(igpu, k+1, k/nrgpu), ldda);
404  }
405 
406  lapackf77_zhegs2( &itype, uplo_, &kb, A(k,k), &lda, B(k,k), &ldb, info);
407 
408  if (k+1<nbl) {
409  magma_zsetmatrix_async( kb, kb,
410  A(k, k), lda,
411  dA(igpu, k , k/nrgpu), ldda, stream[igpu][0] );
412 
413  magma_queue_sync( stream[igpu][1] );
414  magmablasSetKernelStream(stream[igpu][0]);
415 
417  n-(k+1)*nb, kb,
418  c_neg_half, dA(igpu, k, k/nrgpu), ldda,
419  dB_c(igpu, k+1, 0), lddbc,
420  c_one, dA(igpu, k+1, k/nrgpu), ldda);
421 
422  magma_queue_sync( stream[igpu][0] );
423 
424  magma_zgetmatrix( n-(k+1)*nb, kb,
425  dA(igpu, k+1, k/nrgpu), ldda,
426  A(k+1, k), lda );
427 
428  // send the partially updated panel of dA to each gpu in the second dB block
429  // to overlap hemm computation
430 
431  for (igpu = 0; igpu < nrgpu; ++igpu){
432  magma_setdevice(igpu);
433  magma_zsetmatrix_async( (n-(k+1)*nb), kb,
434  A(k+1, k), lda,
435  dB_c(igpu, k+1, 1), lddbc, stream[igpu][0] );
436  }
437 
438  igpu = k%nrgpu;
439  magma_setdevice(igpu);
440  magmablasSetKernelStream(stream[igpu][1]);
441 
443  n-(k+1)*nb, kb,
444  c_neg_half, dA(igpu, k, k/nrgpu), ldda,
445  dB_c(igpu, k+1, 0), lddbc,
446  c_one, dA(igpu, k+1, k/nrgpu), ldda);
447 
448  for (igpu = 0; igpu < nrgpu; ++igpu){
449  magma_queue_sync( stream[igpu][0] );
450  }
451 
452  for (j = k+1; j < nbl; ++j){
453  jb = min(nb, n-j*nb);
454  igpu = j%nrgpu;
455  magma_setdevice(igpu);
456  magmablasSetKernelStream(stream[igpu][(j/nrgpu)%3]);
458  jb, nb,
459  c_neg_one, dB_c(igpu, j, 1), lddbc,
460  dB_c(igpu, j, 0), lddbc,
461  d_one, dA(igpu, j, j/nrgpu), ldda);
462  magma_queue_sync( stream[igpu][((j)/nrgpu)%3] ); // Needed for correctness. Why?
463  if (j == k+1){
464  magma_queue_sync( stream[igpu][(j/nrgpu)%3] );
465  magma_zgetmatrix_async( kb2, kb2,
466  dA(igpu, k+1, (k+1)/nrgpu), ldda,
467  A(k+1, k+1), lda, stream[igpu][2] );
468  // dB_c(0,0) is used to store B(k,k)
469  magma_zsetmatrix_async( kb2, kb2,
470  B(k+1, k+1), ldb,
471  dB_c(igpu, 0, 0), lddbc, stream[igpu][1] );
472  }
473  }
474  for (j = k+1; j < nbl-1; ++j){
475  igpu = j%nrgpu;
476  magma_setdevice(igpu);
477  magmablasSetKernelStream(stream[igpu][0]);
478  magma_zgemm(MagmaNoTrans, MagmaConjTrans, n-(j+1)*nb, nb, nb, c_neg_one, dB_c(igpu, j+1, 1), lddbc,
479  dB_c(igpu, j, 0), lddbc, c_one, dA(igpu, j+1, j/nrgpu), ldda );
480 
481  magma_zgemm(MagmaNoTrans, MagmaConjTrans, n-(j+1)*nb, nb, nb, c_neg_one, dB_c(igpu, j+1, 0), lddbc,
482  dB_c(igpu, j, 1), lddbc, c_one, dA(igpu, j+1, j/nrgpu), ldda );
483  }
484  }
485  }
486 
487  for (igpu = 0; igpu < nrgpu; ++igpu){
488  magma_queue_sync( stream[igpu][0] );
489  magma_queue_sync( stream[igpu][1] );
490  }
491 
492  if (n > nb){
493 
494  magma_int_t nloc[N_MAX_GPU];
495 
496  jb = min(nb, n-nb);
497  for (igpu = 0; igpu < nrgpu; ++igpu){
498  nloc[igpu]=0;
499  magma_setdevice(igpu);
500  magma_zsetmatrix_async( (n-nb), jb,
501  B(1, 1), ldb,
502  dB_c(igpu, 1, 1), lddbc, stream[igpu][1] );
503  }
504  for (j = 1; j < nbl; ++j){
505  if ((j+1)*nb < n){
506  jb = min(nb, n-(j+1)*nb);
507  for (igpu = 0; igpu < nrgpu; ++igpu){
508  magma_setdevice(igpu);
509  magma_zsetmatrix_async( (n-(j+1)*nb), jb,
510  B(j+1, j+1), ldb,
511  dB_c(igpu, j+1, (j+1)%2), lddbc, stream[igpu][(j+1)%2] );
512  }
513  }
514  jb = min(nb, n-j*nb);
515  nloc[(j-1)%nrgpu] += nb;
516 
517  for (igpu = 0; igpu < nrgpu; ++igpu){
518  magma_setdevice(igpu);
519  magmablasSetKernelStream(stream[igpu][j%2]);
520  magma_ztrsm(MagmaLeft, uplo, MagmaNoTrans, MagmaNonUnit, jb, nloc[igpu], c_one, dB_c(igpu, j, j%2), lddbc,
521  dA(igpu, j, 0), ldda );
522  }
523 
524  if ( j < nbl-1 ){
525 
526  for (igpu = 0; igpu < nrgpu; ++igpu){
527  magma_setdevice(igpu);
528  magmablasSetKernelStream(stream[igpu][j%2]);
529  magma_zgemm(MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, nloc[igpu], nb, c_neg_one, dB_c(igpu, j+1, j%2), lddbc,
530  dA(igpu, j, 0), ldda, c_one, dA(igpu, j+1, 0), ldda );
531  }
532  }
533 
534  for (igpu = 0; igpu < nrgpu; ++igpu){
535  magma_queue_sync( stream[igpu][j%2] );
536  }
537 
538  for (k = 0; k < j; ++k){
539  igpu = k%nrgpu;
540  magma_setdevice(igpu);
541  kb = min(nb, n-k*nb);
542  magma_zgetmatrix_async( jb, kb,
543  dA(igpu, j, k/nrgpu), ldda,
544  A(j, k), lda, stream[igpu][2] );
545  }
546  }
547  }
548 
549  }
550 
551  } else {
552 
553  if (upper) {
554 
555  printf("zhegst_m: type2 upper not implemented\n");
556  exit(-1);
557 
558  /* Compute U*A*U' */
559 
560 /* for(k = 0; k<n; k+=nb){
561  kb= min(n-k,nb);
562 
563  magma_zgetmatrix_async( kb, kb,
564  dA(k, k), ldda,
565  A(k, k), lda, stream[0] );
566 
567  // Update the upper triangle of A(1:k+kb-1,1:k+kb-1)
568  if(k>0){
569 
570  magma_ztrmm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
571  k, kb,
572  c_one ,dB(0,0), lddb,
573  dA(0,k), ldda);
574 
575  magma_zhemm(MagmaRight, MagmaUpper,
576  k, kb,
577  c_half, dA(k,k), ldda,
578  dB(0,k), lddb,
579  c_one, dA(0, k), ldda);
580 
581  magma_queue_sync( stream[1] );
582 
583  magma_zher2k(MagmaUpper, MagmaNoTrans,
584  k, kb,
585  c_one, dA(0,k), ldda,
586  dB(0,k), lddb,
587  d_one, dA(0,0), ldda);
588 
589  magma_zhemm(MagmaRight, MagmaUpper,
590  k, kb,
591  c_half, dA(k,k), ldda,
592  dB(0,k), lddb,
593  c_one, dA(0, k), ldda);
594 
595  magma_ztrmm(MagmaRight, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
596  k, kb,
597  c_one, dB(k,k), lddb,
598  dA(0,k), ldda);
599 
600  }
601 
602  magma_queue_sync( stream[0] );
603 
604  lapackf77_zhegs2( &itype, uplo_, &kb, A(k, k), &lda, B(k, k), &ldb, info);
605 
606  magma_zsetmatrix_async( kb, kb,
607  A(k, k), lda,
608  dA(k, k), ldda, stream[1] );
609 
610  }
611 
612  magma_queue_sync( stream[1] );
613 */
614  } else {
615 
616  /* Compute L'*A*L */
617 
618  printf("zhegst_m: type2 lower not implemented\n");
619  exit(-1);
620 
621 /* if (n > nb){
622 
623  magma_int_t nloc[N_MAX_GPU];
624  for(igpu = 0; igpu < nrgpu; ++igpu)
625  nloc[igpu] = 0;
626 
627  kb = min(nb, n);
628  for (j = 0; j < nbl; ++j){
629  igpu = j%nrgpu;
630  magma_setdevice(igpu);
631  jb = min(nb, n-j*nb);
632  nloc[igpu] += jb;
633  magma_zsetmatrix_async( jb, kb,
634  A(j, 0), lda,
635  dA(igpu, j/nrgpu, 0), ldda, stream[igpu][0] );
636  }
637  for (igpu = 0; igpu < nrgpu; ++igpu){
638  magma_setdevice(igpu);
639  magma_zsetmatrix_async( kb, kb,
640  B(0, 0), ldb,
641  dB_r(igpu, 0, 0), lddbr, stream[igpu][0] );
642  }
643  for (k = 0; k < nbl-1; ++k){
644  nloc[k%nrgpu] -= nb;
645  if (k < nbl-2){
646  kb = min(nb, n-(k+1)*nb);
647  for (j = k; j < nbl; ++j){
648  igpu = j%nrgpu;
649  magma_setdevice(igpu);
650  jb = min(nb, n-j*nb);
651  magma_zsetmatrix_async( jb, kb,
652  A(j, k+1), lda,
653  dA(igpu, j/nrgpu, k+1), ldda, stream[igpu][(k+1)%2] );
654  }
655  for (igpu = 0; igpu < nrgpu; ++igpu){
656  magma_setdevice(igpu);
657  magma_zsetmatrix_async( kb, (k+1)*nb + kb,
658  B(k+1, 0), ldb,
659  dB_r(igpu, (k+1)%2, 0), lddbr, stream[igpu][(k+1)%2] );
660  }
661  }
662 
663  kb = min(nb, n-k*nb);
664 
665  if (k > 0){
666  for (igpu = 0; igpu < nrgpu; ++igpu){
667  magma_setdevice(igpu);
668  magmablasSetKernelStream(stream[igpu][k%2]);
669  magma_zgemm(MagmaNoTrans, MagmaNoTrans, nloc[igpu], k*nb, kb, c_one, dA(igpu, n-nloc[igpu], k), ldda,
670  dB_r(igpu, k%2, 0), lddbr, c_one, dA(igpu, n-nloc[igpu], 0), ldda );
671  }
672  }
673 
674  for (igpu = 0; igpu < nrgpu; ++igpu){
675  magma_setdevice(igpu);
676  magmablasSetKernelStream(stream[igpu][k%2]);
677  magma_ztrmm(MagmaRight, uplo, MagmaNoTrans, MagmaNonUnit, nloc[igpu], kb, c_one, dB_r(igpu, k%2, k), lddbr,
678  dA(igpu, n-nloc[igpu], k), ldda );
679  }
680 
681  for (igpu = 0; igpu < nrgpu; ++igpu){
682  magma_queue_sync( stream[igpu][k%2] );
683  }
684 
685  }
686 
687  }
688 
690  // put for loop!
691  // put copies!
692 
693  magmablasSetKernelStream(stream[igpu][0]);
694 
695  magma_zhemm(MagmaRight, MagmaLower,
696  kb, k*nb,
697  c_half, dA(igpu, k/nrgpu, 0), ldda,
698  dB_r(igpu, 0, 0), lddbr,
699  c_one, dA(igpu, k/nrgpu, 0), ldda);
700 
701  magma_queue_sync( stream[igpu][0] );
702 
703  magma_zgetmatrix( kb, k*nb,
704  dA(igpu, k/nrgpu, 0), ldda,
705  A(k, 0), lda );
706 
707  // send the partially updated panel of dA to each gpu in the second dB block
708  // to overlap hemm computation
709 
710  for (igpu = 0; igpu < nrgpu; ++igpu){
711  magma_setdevice(igpu);
712  magma_zsetmatrix_async( kb, // ERROR: missing dimension,
713  A(k, 0), lda,
714  dB_r(igpu, 1, 0), lddbr, stream[igpu][0] );
715  }
716 
717  igpu = k%nrgpu;
718  magma_setdevice(igpu);
719  magmablasSetKernelStream(stream[igpu][1]);
720 
721  magma_zhemm(MagmaRight, MagmaLower,
722  n-(k+1)*nb, kb,
723  c_neg_half, dA(igpu, k, k/nrgpu), ldda,
724  dB_c(igpu, k+1, 0), lddbc,
725  c_one, dA(igpu, k+1, k/nrgpu), ldda);
726 
727  for (igpu = 0; igpu < nrgpu; ++igpu){
728  magma_queue_sync( stream[igpu][0] );
729  }
730 
731 
732  //copy B from mgpus
733  for (j = 0; j < nbl; ++j){
734  igpu = j%nrgpu;
735  magma_setdevice(igpu);
736  jb = min(nb, n-j*nb);
737  magma_zgetmatrix_async( jb, n,
738  dA(igpu, j/nrgpu, 0), ldda,
739  A(j, 0), lda, stream[igpu][0] );
740  }
741 
742 *//* for(k = 0; k<n; k+=nb){
743  kb= min(n-k,nb);
744 
745  magma_zgetmatrix_async( kb, kb,
746  dA(k, k), ldda,
747  A(k, k), lda, stream[0] );
748 
749  // Update the lower triangle of A(1:k+kb-1,1:k+kb-1)
750  if(k>0){
751 
752  magma_ztrmm(MagmaRight, MagmaLower, MagmaNoTrans, MagmaNonUnit,
753  kb, k,
754  c_one ,dB(0,0), lddb,
755  dA(k,0), ldda);
756 
757  magma_zhemm(MagmaLeft, MagmaLower,
758  kb, k,
759  c_half, dA(k,k), ldda,
760  dB(k,0), lddb,
761  c_one, dA(k, 0), ldda);
762 
763  magma_queue_sync( stream[1] );
764 
765  magma_zher2k(MagmaLower, MagmaConjTrans,
766  k, kb,
767  c_one, dA(k,0), ldda,
768  dB(k,0), lddb,
769  d_one, dA(0,0), ldda);
770 
771  magma_zhemm(MagmaLeft, MagmaLower,
772  kb, k,
773  c_half, dA(k,k), ldda,
774  dB(k,0), lddb,
775  c_one, dA(k, 0), ldda);
776 
777  magma_ztrmm(MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit,
778  kb, k,
779  c_one, dB(k,k), lddb,
780  dA(k,0), ldda);
781  }
782 
783  magma_queue_sync( stream[0] );
784 
785  lapackf77_zhegs2( &itype, uplo_, &kb, A(k,k), &lda, B(k,k), &ldb, info);
786 
787  magma_zsetmatrix_async( kb, kb,
788  A(k, k), lda,
789  dA(k, k), ldda, stream[1] );
790  }
791 
792  magma_queue_sync( stream[1] );
793  */
794  }
795  }
796 
797  for (igpu = 0; igpu < nrgpu; ++igpu){
798  magma_setdevice(igpu);
800  magma_queue_sync( stream[igpu][2] );
801  magma_queue_destroy( stream[igpu][0] );
802  magma_queue_destroy( stream[igpu][1] );
803  magma_queue_destroy( stream[igpu][2] );
804  magma_free( dw[igpu] );
805  }
806 
807  magma_setdevice(gpu_b);
808 
809  return *info;
810 } /* magma_zhegst_gpu */