14 #include "common_magma.h"
/*
 * Block-addressing helpers. Each macro expands to a pointer expression and
 * relies on names that are NOT macro parameters (a, b, dw, nb, lda, ldb,
 * ldda, lddb, dimb); those must be in scope wherever the macro is used.
 */
/* A(i, j): a + (j)*nb*lda + (i)*nb -- i steps by nb elements, j steps by
 * nb*lda elements from a. Presumably the (i, j) block of nb-sized tiles of
 * the host matrix a with leading dimension lda -- confirm against caller. */
19 #define A(i, j) (a+(j)*nb*lda + (i)*nb)
/* B(i, j): same addressing as A(i, j), applied to b with stride ldb. */
20 #define B(i, j) (b+(j)*nb*ldb + (i)*nb)
/* dB(gpui, i, j): address inside the buffer dw[gpui] at element offset
 * (j)*nb*lddb + (i)*nb. gpui selects which entry of dw is used. */
22 #define dB(gpui, i, j) (dw[gpui] + (j)*nb*lddb + (i)*nb)
/* dA(gpui, i, j): address inside the same dw[gpui] buffer, first shifted by
 * dimb*lddb elements, then by the block offset (i)*nb + (j)*nb*ldda.
 * NOTE(review): the dimb*lddb shift looks like it skips the region addressed
 * through dB so both share one allocation -- confirm against the allocation
 * code. Call sites pass a "% 2" value for one of i/j, which suggests two
 * alternating slots -- TODO confirm. */
24 #define dA(gpui, i, j) (dw[gpui] + dimb*lddb + (i)*nb + (j)*nb*ldda)
167 char side_[2] = {
side, 0};
168 char uplo_[2] = {
uplo, 0};
169 char transa_[2] = {transa, 0};
170 char diag_[2] = {
diag, 0};
211 }
else if (lda <
max(1,nrowa)) {
213 }
else if (ldb <
max(1,m)) {
233 dimb = ((nbl-1)/nrgpu+1)*nb;
242 lddb = ((mbl-1)/nrgpu+1)*nb;
253 for (igpu = 0; igpu < nrgpu; ++igpu){
267 printf(
"dtrsm_m: alpha = 0 not implemented\n");
283 for(igpu = 0; igpu < nrgpu; ++igpu)
287 for (k = 0; k < nbl; ++k){
290 kb =
min(nb, n-k*nb);
294 dB(igpu, 0, k/nrgpu), lddb, stream[igpu][(mbl+1)%2] );
296 jb =
min(nb, m-(mbl-1)*nb);
297 for (igpu = 0; igpu < nrgpu; ++igpu){
301 dA(igpu, 0, (mbl-1)%2), ldda, stream[igpu][(mbl+1)%2] );
303 for (j = mbl-1; j >= 0; --j){
306 for (igpu = 0; igpu < nrgpu; ++igpu){
310 dA(igpu, 0, (j+1)%2), ldda, stream[igpu][(j+1)%2] );
318 jb =
min(nb, m-j*nb);
320 for (igpu = 0; igpu < nrgpu; ++igpu){
323 magma_dtrsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_,
dA(igpu, j, j%2), ldda,
324 dB(igpu, j, 0), lddb );
328 for (igpu = 0; igpu < nrgpu; ++igpu){
331 magma_dgemm(transa,
MagmaNoTrans, j*nb, nloc[igpu], jb, c_neg_one,
dA(igpu, 0, j%2), ldda,
332 dB(igpu, j, 0), lddb, alpha_,
dB(igpu, 0, 0), lddb );
336 for (igpu = 0; igpu < nrgpu; ++igpu){
340 for (k = 0; k < nbl; ++k){
343 kb =
min(nb, n-k*nb);
345 dB(igpu, j, k/nrgpu), lddb,
346 B(j, k), ldb, stream[igpu][2] );
356 for(igpu = 0; igpu < nrgpu; ++igpu)
360 for (k = 0; k < nbl; ++k){
363 kb =
min(nb, n-k*nb);
367 dB(igpu, 0, k/nrgpu), lddb, stream[igpu][0] );
370 for (igpu = 0; igpu < nrgpu; ++igpu){
374 dA(igpu, 0, 0), ldda, stream[igpu][0] );
376 for (j = 0; j < mbl; ++j){
378 jb =
min(nb, m-(j+1)*nb);
379 for (igpu = 0; igpu < nrgpu; ++igpu){
383 dA(igpu, j+1, (j+1)%2), ldda, stream[igpu][(j+1)%2] );
386 jb =
min(nb, m-j*nb);
393 for (igpu = 0; igpu < nrgpu; ++igpu){
396 magma_dtrsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_,
dA(igpu, j, j%2), ldda,
397 dB(igpu, j, 0), lddb );
402 for (igpu = 0; igpu < nrgpu; ++igpu){
405 magma_dgemm(transa,
MagmaNoTrans, m-(j+1)*nb, nloc[igpu], nb, c_neg_one,
dA(igpu, j+1, j%2), ldda,
406 dB(igpu, j, 0), lddb, alpha_,
dB(igpu, j+1, 0), lddb );
410 for (igpu = 0; igpu < nrgpu; ++igpu){
414 for (k = 0; k < nbl; ++k){
417 kb =
min(nb, n-k*nb);
419 dB(igpu, j, k/nrgpu), lddb,
420 B(j, k), ldb, stream[igpu][2] );
436 for(igpu = 0; igpu < nrgpu; ++igpu)
440 for (k = 0; k < nbl; ++k){
443 kb =
min(nb, n-k*nb);
447 dB(igpu, 0, k/nrgpu), lddb, stream[igpu][0] );
450 for (igpu = 0; igpu < nrgpu; ++igpu){
454 dA(igpu, 0, 0), ldda, stream[igpu][0] );
456 for (j = 0; j < mbl; ++j){
458 jb =
min(nb, m-(j+1)*nb);
459 for (igpu = 0; igpu < nrgpu; ++igpu){
463 dA(igpu, (j+1)%2, j+1), ldda, stream[igpu][(j+1)%2] );
466 jb =
min(nb, m-j*nb);
473 for (igpu = 0; igpu < nrgpu; ++igpu){
476 magma_dtrsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_,
dA(igpu, j%2, j), ldda,
477 dB(igpu, j, 0), lddb );
482 for (igpu = 0; igpu < nrgpu; ++igpu){
485 magma_dgemm(transa,
MagmaNoTrans, m-(j+1)*nb, nloc[igpu], nb, c_neg_one,
dA(igpu, j%2, j+1), ldda,
486 dB(igpu, j, 0), lddb, alpha_,
dB(igpu, j+1, 0), lddb );
490 for (igpu = 0; igpu < nrgpu; ++igpu){
494 for (k = 0; k < nbl; ++k){
497 kb =
min(nb, n-k*nb);
499 dB(igpu, j, k/nrgpu), lddb,
500 B(j, k), ldb, stream[igpu][2] );
510 for(igpu = 0; igpu < nrgpu; ++igpu)
514 for (k = 0; k < nbl; ++k){
517 kb =
min(nb, n-k*nb);
521 dB(igpu, 0, k/nrgpu), lddb, stream[igpu][(mbl+1)%2] );
523 jb =
min(nb, m-(mbl-1)*nb);
524 for (igpu = 0; igpu < nrgpu; ++igpu){
528 dA(igpu, (mbl-1)%2, 0), ldda, stream[igpu][(mbl+1)%2] );
530 for (j = mbl-1; j >= 0; --j){
533 for (igpu = 0; igpu < nrgpu; ++igpu){
537 dA(igpu, (j+1)%2, 0), ldda, stream[igpu][(j+1)%2] );
545 jb =
min(nb, m-j*nb);
547 for (igpu = 0; igpu < nrgpu; ++igpu){
550 magma_dtrsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_,
dA(igpu, j%2, j), ldda,
551 dB(igpu, j, 0), lddb );
555 for (igpu = 0; igpu < nrgpu; ++igpu){
558 magma_dgemm(transa,
MagmaNoTrans, j*nb, nloc[igpu], jb, c_neg_one,
dA(igpu, j%2, 0), ldda,
559 dB(igpu, j, 0), lddb, alpha_,
dB(igpu, 0, 0), lddb );
563 for (igpu = 0; igpu < nrgpu; ++igpu){
567 for (k = 0; k < nbl; ++k){
570 kb =
min(nb, n-k*nb);
572 dB(igpu, j, k/nrgpu), lddb,
573 B(j, k), ldb, stream[igpu][2] );
590 for(igpu = 0; igpu < nrgpu; ++igpu)
594 for (j = 0; j < mbl; ++j){
597 jb =
min(nb, m-j*nb);
601 dB(igpu, j/nrgpu, 0), lddb, stream[igpu][0] );
604 for (igpu = 0; igpu < nrgpu; ++igpu){
608 dA(igpu, 0, 0), ldda, stream[igpu][0] );
610 for (k = 0; k < nbl; ++k){
612 kb =
min(nb, n-(k+1)*nb);
613 for (igpu = 0; igpu < nrgpu; ++igpu){
617 dA(igpu, (k+1)%2, k+1), ldda, stream[igpu][(k+1)%2] );
620 kb =
min(nb, n-k*nb);
627 for (igpu = 0; igpu < nrgpu; ++igpu){
630 magma_dtrsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_,
dA(igpu, k%2, k), ldda,
631 dB(igpu, 0, k), lddb );
636 for (igpu = 0; igpu < nrgpu; ++igpu){
639 magma_dgemm(
MagmaNoTrans, transa, mloc[igpu], n-(k+1)*nb, nb, c_neg_one,
dB(igpu, 0, k), lddb,
640 dA(igpu, k%2, k+1), ldda, alpha_,
dB(igpu, 0, k+1), lddb );
644 for (igpu = 0; igpu < nrgpu; ++igpu){
648 for (j = 0; j < mbl; ++j){
651 jb =
min(nb, m-j*nb);
653 dB(igpu, j/nrgpu, k), lddb,
654 B(j, k), ldb, stream[igpu][2] );
663 for(igpu = 0; igpu < nrgpu; ++igpu)
667 for (j = 0; j < mbl; ++j){
670 jb =
min(nb, m-j*nb);
674 dB(igpu, j/nrgpu, 0), lddb, stream[igpu][(nbl+1)%2] );
676 kb =
min(nb, n-(nbl-1)*nb);
677 for (igpu = 0; igpu < nrgpu; ++igpu){
681 dA(igpu, (nbl-1)%2, 0), ldda, stream[igpu][(nbl+1)%2] );
683 for (k = nbl-1; k >= 0; --k){
686 for (igpu = 0; igpu < nrgpu; ++igpu){
690 dA(igpu, (k+1)%2, 0), ldda, stream[igpu][(k+1)%2] );
698 kb =
min(nb, n-k*nb);
700 for (igpu = 0; igpu < nrgpu; ++igpu){
703 magma_dtrsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_,
dA(igpu, k%2, k), ldda,
704 dB(igpu, 0, k), lddb );
708 for (igpu = 0; igpu < nrgpu; ++igpu){
711 magma_dgemm(
MagmaNoTrans, transa, mloc[igpu], k*nb, kb, c_neg_one,
dB(igpu, 0, k), lddb,
712 dA(igpu, k%2, 0), ldda, alpha_,
dB(igpu, 0, 0), lddb );
716 for (igpu = 0; igpu < nrgpu; ++igpu){
720 for (j = 0; j < mbl; ++j){
723 jb =
min(nb, m-j*nb);
725 dB(igpu, j/nrgpu, k), lddb,
726 B(j, k), ldb, stream[igpu][2] );
740 for(igpu = 0; igpu < nrgpu; ++igpu)
744 for (j = 0; j < mbl; ++j){
747 jb =
min(nb, m-j*nb);
751 dB(igpu, j/nrgpu, 0), lddb, stream[igpu][(nbl+1)%2] );
753 kb =
min(nb, n-(nbl-1)*nb);
754 for (igpu = 0; igpu < nrgpu; ++igpu){
758 dA(igpu, 0, (nbl-1)%2), ldda, stream[igpu][(nbl+1)%2] );
760 for (k = nbl-1; k >= 0; --k){
763 for (igpu = 0; igpu < nrgpu; ++igpu){
767 dA(igpu, 0, (k+1)%2), ldda, stream[igpu][(k+1)%2] );
775 kb =
min(nb, n-k*nb);
777 for (igpu = 0; igpu < nrgpu; ++igpu){
780 magma_dtrsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_,
dA(igpu, k, k%2), ldda,
781 dB(igpu, 0, k), lddb );
785 for (igpu = 0; igpu < nrgpu; ++igpu){
788 magma_dgemm(
MagmaNoTrans, transa, mloc[igpu], k*nb, kb, c_neg_one,
dB(igpu, 0, k), lddb,
789 dA(igpu, 0, k%2), ldda, alpha_,
dB(igpu, 0, 0), lddb );
793 for (igpu = 0; igpu < nrgpu; ++igpu){
797 for (j = 0; j < mbl; ++j){
800 jb =
min(nb, m-j*nb);
802 dB(igpu, j/nrgpu, k), lddb,
803 B(j, k), ldb, stream[igpu][2] );
812 for(igpu = 0; igpu < nrgpu; ++igpu)
816 for (j = 0; j < mbl; ++j){
819 jb =
min(nb, m-j*nb);
823 dB(igpu, j/nrgpu, 0), lddb, stream[igpu][0] );
826 for (igpu = 0; igpu < nrgpu; ++igpu){
830 dA(igpu, 0, 0), ldda, stream[igpu][0] );
832 for (k = 0; k < nbl; ++k){
834 kb =
min(nb, n-(k+1)*nb);
835 for (igpu = 0; igpu < nrgpu; ++igpu){
839 dA(igpu, k+1, (k+1)%2), ldda, stream[igpu][(k+1)%2] );
842 kb =
min(nb, n-k*nb);
849 for (igpu = 0; igpu < nrgpu; ++igpu){
852 magma_dtrsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_,
dA(igpu, k, k%2), ldda,
853 dB(igpu, 0, k), lddb );
858 for (igpu = 0; igpu < nrgpu; ++igpu){
861 magma_dgemm(
MagmaNoTrans, transa, mloc[igpu], n-(k+1)*nb, nb, c_neg_one,
dB(igpu, 0, k), lddb,
862 dA(igpu, k+1, k%2), ldda, alpha_,
dB(igpu, 0, k+1), lddb );
866 for (igpu = 0; igpu < nrgpu; ++igpu){
870 for (j = 0; j < mbl; ++j){
873 jb =
min(nb, m-j*nb);
875 dB(igpu, j/nrgpu, k), lddb,
876 B(j, k), ldb, stream[igpu][2] );
885 for (igpu = 0; igpu < nrgpu; ++igpu){