14 #include "common_magma.h"
/*
 * Block-addressing helpers.
 *
 * Every macro below expands to plain pointer arithmetic on names that are NOT
 * macro parameters (a, b, dw, nb, lda, ldb, ldda, lddbc, lddbr, dima); those
 * names must be in scope wherever a macro is used.
 *
 * A(i, j) / B(i, j): a (resp. b) advanced by (i)*nb elements plus (j)*nb*lda
 * (resp. ldb) elements, i.e. the start of block (i, j) when blocks are nb wide
 * and lda/ldb is the stride between consecutive columns.
 * NOTE(review): presumably the host matrices in column-major storage -- confirm
 * against the enclosing routine.
 */
20 #define A(i, j) (a+(j)*nb*lda + (i)*nb)
21 #define B(i, j) (b+(j)*nb*ldb + (i)*nb)
/*
 * dA(gpui, i, j): same block arithmetic as A(), applied to dw[gpui] with
 * stride ldda.
 * NOTE(review): dw looks like an array of per-GPU workspace pointers indexed by
 * gpui -- confirm where dw is allocated.
 */
23 #define dA(gpui, i, j) (dw[gpui] + (j)*nb*ldda + (i)*nb)
/*
 * dB_c / dB_r: both start dima*ldda elements past dw[gpui] (i.e. after the
 * region addressed through dA, assuming that region spans dima columns of
 * stride ldda -- TODO confirm). They differ only in the stride used for the
 * block-column index j: lddbc for dB_c, lddbr for dB_r. The same memory is
 * therefore viewed with two different strides; callers must use the variant
 * that matches how the data was written.
 */
24 #define dB_c(gpui, i, j) (dw[gpui] + dima*ldda + (i)*nb + (j)*nb*lddbc)
25 #define dB_r(gpui, i, j) (dw[gpui] + dima*ldda + (i)*nb + (j)*nb*lddbr)
98 char uplo_[2] = {
uplo, 0};
119 if (itype<1 || itype>3){
125 }
else if (lda <
max(1,n)) {
127 }
else if (ldb <
max(1,n)) {
141 if ( (itype==1 && upper) || (itype!=1 && !upper) ){
142 ldda = ((nbl-1)/nrgpu+1)*nb;
146 dima = ((nbl-1)/nrgpu+1)*nb;
150 for (igpu = 0; igpu < nrgpu; ++igpu){
169 for (k = 0; k < nbl; ++k){
172 kb =
min(nb, n-k*nb);
175 dA(igpu, k/nrgpu, k), ldda, stream[igpu][0] );
183 dB_r(igpu, 0, 0), lddbr, stream[igpu][1] );
185 for(k = 0; k<nbl; ++k){
187 kb2=
min(n-(k+1)*nb,nb);
190 for (igpu = 0; igpu < nrgpu; ++igpu){
195 dB_r(igpu, 0, k+1), lddbr, stream[igpu][0] );
210 c_one,
dB_r(igpu, 0, 0), lddbr,
211 dA(igpu, k/nrgpu, k+1), ldda);
215 printf(
"hegs2%d\n", k);
219 dA(igpu, k/nrgpu, k), ldda, stream[igpu][0] );
226 c_neg_half,
dA(igpu, k/nrgpu, k), ldda,
227 dB_r(igpu, 0, k+1), lddbr,
228 c_one,
dA(igpu, k/nrgpu, k+1), ldda);
233 dA(igpu, k/nrgpu, k+1), ldda,
239 for (igpu = 0; igpu < nrgpu; ++igpu){
243 dB_r(igpu, 1, k+1), lddbr, stream[igpu][0] );
252 c_neg_half,
dA(igpu, k/nrgpu, k), ldda,
253 dB_r(igpu, 0, k+1), lddbr,
254 c_one,
dA(igpu, k/nrgpu, k+1), ldda);
256 for (igpu = 0; igpu < nrgpu; ++igpu){
260 for (j = k+1; j < nbl; ++j){
261 jb =
min(nb, n-j*nb);
267 c_neg_one,
dB_r(igpu, 1, j), lddbr,
268 dB_r(igpu, 0, j), lddbr,
269 d_one,
dA(igpu, j/nrgpu, j), ldda);
274 dA(igpu, (k+1)/nrgpu, k+1), ldda,
275 A(k+1, k+1), lda, stream[igpu][2] );
279 dB_r(igpu, 0, 0), lddbr, stream[igpu][1] );
282 for (j = k+1; j < nbl-1; ++j){
286 magma_zgemm(
MagmaConjTrans,
MagmaNoTrans, nb, n-(j+1)*nb, nb, c_neg_one,
dB_r(igpu, 0, j), lddbr,
287 dB_r(igpu, 1, j+1), lddbr, c_one,
dA(igpu, j/nrgpu, j+1), ldda );
289 magma_zgemm(
MagmaConjTrans,
MagmaNoTrans, nb, n-(j+1)*nb, nb, c_neg_one,
dB_r(igpu, 1, j), lddbr,
290 dB_r(igpu, 0, j+1), lddbr, c_one,
dA(igpu, j/nrgpu, j+1), ldda );
295 for (igpu = 0; igpu < nrgpu; ++igpu){
305 for (igpu = 0; igpu < nrgpu; ++igpu){
310 dB_r(igpu, 1, 1), lddbr, stream[igpu][1] );
312 for (j = 1; j < nbl; ++j){
314 jb =
min(nb, n-(j+1)*nb);
315 for (igpu = 0; igpu < nrgpu; ++igpu){
319 dB_r(igpu, (j+1)%2, j+1), lddbr, stream[igpu][(j+1)%2] );
322 jb =
min(nb, n-j*nb);
323 nloc[(j-1)%nrgpu] += nb;
325 for (igpu = 0; igpu < nrgpu; ++igpu){
328 magma_ztrsm(
MagmaRight, uplo,
MagmaNoTrans,
MagmaNonUnit, nloc[igpu], jb, c_one,
dB_r(igpu, j%2, j), lddbr,
329 dA(igpu, 0, j), ldda );
334 for (igpu = 0; igpu < nrgpu; ++igpu){
337 magma_zgemm(
MagmaNoTrans,
MagmaNoTrans, nloc[igpu], n-(j+1)*nb, nb, c_neg_one,
dA(igpu, 0, j), ldda,
338 dB_r(igpu, j%2, j+1), lddbr, c_one,
dA(igpu, 0, j+1), ldda );
342 for (igpu = 0; igpu < nrgpu; ++igpu){
346 for (k = 0; k < j; ++k){
349 kb =
min(nb, n-k*nb);
351 dA(igpu, k/nrgpu, j), ldda,
352 A(k, j), lda, stream[igpu][2] );
361 for (k = 0; k < nbl; ++k){
364 kb =
min(nb, n-k*nb);
367 dA(igpu, k, k/nrgpu), ldda, stream[igpu][0] );
375 dB_c(igpu, 0, 0), lddbc, stream[igpu][1] );
377 for(k = 0; k<nbl; ++k){
379 kb2=
min(n-(k+1)*nb,nb);
382 for (igpu = 0; igpu < nrgpu; ++igpu){
387 dB_c(igpu, k+1, 0), lddbc, stream[igpu][0] );
402 c_one,
dB_c(igpu, 0, 0), lddbc,
403 dA(igpu, k+1, k/nrgpu), ldda);
411 dA(igpu, k , k/nrgpu), ldda, stream[igpu][0] );
418 c_neg_half,
dA(igpu, k, k/nrgpu), ldda,
419 dB_c(igpu, k+1, 0), lddbc,
420 c_one,
dA(igpu, k+1, k/nrgpu), ldda);
425 dA(igpu, k+1, k/nrgpu), ldda,
431 for (igpu = 0; igpu < nrgpu; ++igpu){
435 dB_c(igpu, k+1, 1), lddbc, stream[igpu][0] );
444 c_neg_half,
dA(igpu, k, k/nrgpu), ldda,
445 dB_c(igpu, k+1, 0), lddbc,
446 c_one,
dA(igpu, k+1, k/nrgpu), ldda);
448 for (igpu = 0; igpu < nrgpu; ++igpu){
452 for (j = k+1; j < nbl; ++j){
453 jb =
min(nb, n-j*nb);
459 c_neg_one,
dB_c(igpu, j, 1), lddbc,
460 dB_c(igpu, j, 0), lddbc,
461 d_one,
dA(igpu, j, j/nrgpu), ldda);
466 dA(igpu, k+1, (k+1)/nrgpu), ldda,
467 A(k+1, k+1), lda, stream[igpu][2] );
471 dB_c(igpu, 0, 0), lddbc, stream[igpu][1] );
474 for (j = k+1; j < nbl-1; ++j){
478 magma_zgemm(
MagmaNoTrans,
MagmaConjTrans, n-(j+1)*nb, nb, nb, c_neg_one,
dB_c(igpu, j+1, 1), lddbc,
479 dB_c(igpu, j, 0), lddbc, c_one,
dA(igpu, j+1, j/nrgpu), ldda );
481 magma_zgemm(
MagmaNoTrans,
MagmaConjTrans, n-(j+1)*nb, nb, nb, c_neg_one,
dB_c(igpu, j+1, 0), lddbc,
482 dB_c(igpu, j, 1), lddbc, c_one,
dA(igpu, j+1, j/nrgpu), ldda );
487 for (igpu = 0; igpu < nrgpu; ++igpu){
497 for (igpu = 0; igpu < nrgpu; ++igpu){
502 dB_c(igpu, 1, 1), lddbc, stream[igpu][1] );
504 for (j = 1; j < nbl; ++j){
506 jb =
min(nb, n-(j+1)*nb);
507 for (igpu = 0; igpu < nrgpu; ++igpu){
511 dB_c(igpu, j+1, (j+1)%2), lddbc, stream[igpu][(j+1)%2] );
514 jb =
min(nb, n-j*nb);
515 nloc[(j-1)%nrgpu] += nb;
517 for (igpu = 0; igpu < nrgpu; ++igpu){
520 magma_ztrsm(
MagmaLeft, uplo,
MagmaNoTrans,
MagmaNonUnit, jb, nloc[igpu], c_one,
dB_c(igpu, j, j%2), lddbc,
521 dA(igpu, j, 0), ldda );
526 for (igpu = 0; igpu < nrgpu; ++igpu){
529 magma_zgemm(
MagmaNoTrans,
MagmaNoTrans, n-(j+1)*nb, nloc[igpu], nb, c_neg_one,
dB_c(igpu, j+1, j%2), lddbc,
530 dA(igpu, j, 0), ldda, c_one,
dA(igpu, j+1, 0), ldda );
534 for (igpu = 0; igpu < nrgpu; ++igpu){
538 for (k = 0; k < j; ++k){
541 kb =
min(nb, n-k*nb);
543 dA(igpu, j, k/nrgpu), ldda,
544 A(j, k), lda, stream[igpu][2] );
555 printf(
"zhegst_m: type2 upper not implemented\n");
618 printf(
"zhegst_m: type2 lower not implemented\n");
797 for (igpu = 0; igpu < nrgpu; ++igpu){