MAGMA 1.2.0 — Matrix Algebra on GPU and Multicore Architectures
dtrsm_m.cpp
Go to the documentation of this file.
1 /*
2  -- MAGMA (version 1.2.0) --
3  Univ. of Tennessee, Knoxville
4  Univ. of California, Berkeley
6  May 2012
7
8  @author Raffaele Solca
9
10  @generated d Thu May 10 22:27:04 2012
11 */
12 #define N_MAX_GPU 8
13
14 #include "common_magma.h"
15
16 extern "C"
18
19 #define A(i, j) (a+(j)*nb*lda + (i)*nb)
20 #define B(i, j) (b+(j)*nb*ldb + (i)*nb)
21
22 #define dB(gpui, i, j) (dw[gpui] + (j)*nb*lddb + (i)*nb)
23
24 #define dA(gpui, i, j) (dw[gpui] + dimb*lddb + (i)*nb + (j)*nb*ldda)
25
26 extern "C" magma_int_t
27 magma_dtrsm_m (magma_int_t nrgpu, char side, char uplo, char transa, char diag,
28  magma_int_t m, magma_int_t n, double alpha, double *a,
29  magma_int_t lda, double *b, magma_int_t ldb)
30 {
31
32 /* Purpose
33  =======
34
35  DTRSM solves one of the matrix equations
36
37  op( A )*X = alpha*B, or X*op( A ) = alpha*B,
38
39  where alpha is a scalar, X and B are m by n matrices, A is a unit, or
40
41  non-unit, upper or lower triangular matrix and op( A ) is one of
42
43
44  op( A ) = A or op( A ) = A' or op( A ) = g( A' ).
45
46  The matrix X is overwritten on B.
47
48  Parameters
49  ==========
50
51  SIDE - CHARACTER*1.
52  On entry, SIDE specifies whether op( A ) appears on the left
53
54  or right of X as follows:
55
56  SIDE = 'L' or 'l' op( A )*X = alpha*B.
57
58  SIDE = 'R' or 'r' X*op( A ) = alpha*B.
59
60  Unchanged on exit.
61
62  UPLO - CHARACTER*1.
63  On entry, UPLO specifies whether the matrix A is an upper or
64
65  lower triangular matrix as follows:
66
67  UPLO = 'U' or 'u' A is an upper triangular matrix.
68
69  UPLO = 'L' or 'l' A is a lower triangular matrix.
70
71  Unchanged on exit.
72
73  TRANSA - CHARACTER*1.
74  On entry, TRANSA specifies the form of op( A ) to be used in
75
76  the matrix multiplication as follows:
77
78  TRANSA = 'N' or 'n' op( A ) = A.
79
80  TRANSA = 'T' or 't' op( A ) = A'.
81
82  TRANSA = 'C' or 'c' op( A ) = g( A' ).
83
84  Unchanged on exit.
85
86  DIAG - CHARACTER*1.
87  On entry, DIAG specifies whether or not A is unit triangular
88
89  as follows:
90
91  DIAG = 'U' or 'u' A is assumed to be unit triangular.
92
93  DIAG = 'N' or 'n' A is not assumed to be unit
94  triangular.
95
96  Unchanged on exit.
97
98  M - INTEGER.
99  On entry, M specifies the number of rows of B. M must be at
100
101  least zero.
102  Unchanged on exit.
103
104  N - INTEGER.
105  On entry, N specifies the number of columns of B. N must be
106
107  at least zero.
108  Unchanged on exit.
109
110  ALPHA - COMPLEX*16 .
111  On entry, ALPHA specifies the scalar alpha. When alpha is
112
113  zero then A is not referenced and B need not be set before
114
115  entry.
116  Unchanged on exit.
117
118  A - COMPLEX*16 array of DIMENSION ( LDA, k ), where k is m
119
120  when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'.
121
122  Before entry with UPLO = 'U' or 'u', the leading k by k
123
124  upper triangular part of the array A must contain the upper
125
126  triangular matrix and the strictly lower triangular part of
127
128  A is not referenced.
129  Before entry with UPLO = 'L' or 'l', the leading k by k
130
131  lower triangular part of the array A must contain the lower
132
133  triangular matrix and the strictly upper triangular part of
134
135  A is not referenced.
136  Note that when DIAG = 'U' or 'u', the diagonal elements of
137
138  A are not referenced either, but are assumed to be unity.
139
140  Unchanged on exit.
141
142  LDA - INTEGER.
143  On entry, LDA specifies the first dimension of A as declared
144
145  in the calling (sub) program. When SIDE = 'L' or 'l' then
146
147  LDA must be at least max( 1, m ), when SIDE = 'R' or 'r'
148
149  then LDA must be at least max( 1, n ).
150  Unchanged on exit.
151
152  B - COMPLEX*16 array of DIMENSION ( LDB, n ).
153  Before entry, the leading m by n part of the array B must
154
155  contain the right-hand side matrix B, and on exit is
156
157  overwritten by the solution matrix X.
158
159  LDB - INTEGER.
160  On entry, LDB specifies the first dimension of B as declared
161
162  in the calling (sub) program. LDB must be at least
163
164  max( 1, m ).
165  Unchanged on exit.*/
166
167  char side_[2] = {side, 0};
168  char uplo_[2] = {uplo, 0};
169  char transa_[2] = {transa, 0};
170  char diag_[2] = {diag, 0};
171  double c_one = MAGMA_D_ONE;
172  double c_neg_one = MAGMA_D_NEG_ONE;
173  double alpha_;
174  double* dw[N_MAX_GPU];
175  cudaStream_t stream [N_MAX_GPU][3];
176  magma_int_t lside;
177  magma_int_t upper;
178  magma_int_t notransp;
179  magma_int_t nrowa;
181  magma_int_t igpu = 0;
182  magma_int_t info;
183  magma_int_t k,j,jj,kb,jb,jjb;
184  magma_int_t ldda, dima, lddb, dimb;
185  int gpu_b;
186  magma_getdevice(&gpu_b);
187
188  lside = lapackf77_lsame(side_, "L");
189  if (lside) {
190  nrowa = m;
191  } else {
192  nrowa = n;
193  }
194  upper = lapackf77_lsame(uplo_, "U");
195  notransp = lapackf77_lsame(transa_, "N");
196
197  info = 0;
198  if (! lside && ! lapackf77_lsame(side_, "R")) {
199  info = 1;
200  } else if (! upper && ! lapackf77_lsame(uplo_, "L")) {
201  info = 2;
202  } else if (! notransp && ! lapackf77_lsame(transa_, "T")
203  && ! lapackf77_lsame(transa_, "C")) {
204  info = 3;
205  } else if (! lapackf77_lsame(diag_, "U") && ! lapackf77_lsame(diag_, "N")) {
206  info = 4;
207  } else if (m < 0) {
208  info = 5;
209  } else if (n < 0) {
210  info = 6;
211  } else if (lda < max(1,nrowa)) {
212  info = 9;
213  } else if (ldb < max(1,m)) {
214  info = 11;
215  }
216
217  if (info != 0) {
218  magma_xerbla( __func__, -info );
219  return info;
220  }
221
222  //Quick return if possible.
223
224  if (n == 0) {
225  return info;
226  }
227
228  magma_int_t nbl = (n-1)/nb+1; // number of blocks in a row
229  magma_int_t mbl = (m-1)/nb+1; // number of blocks in a column
230
231  if (lside) {
232  lddb = m;
233  dimb = ((nbl-1)/nrgpu+1)*nb;
234  if ( notransp ) {
235  ldda = m;
236  dima = 2 * nb;
237  } else {
238  ldda = 2 * nb;
239  dima = m;
240  }
241  } else {
242  lddb = ((mbl-1)/nrgpu+1)*nb;
243  dimb = n;
244  if ( !notransp ) {
245  ldda = n;
246  dima = 2 * nb;
247  } else {
248  ldda = 2 * nb;
249  dima = n;
250  }
251  }
252
253  for (igpu = 0; igpu < nrgpu; ++igpu){
254  magma_setdevice(igpu);
255  if (MAGMA_SUCCESS != magma_dmalloc( &dw[igpu], (dimb*lddb + dima*ldda) )) {
256  info = MAGMA_ERR_DEVICE_ALLOC;
257  return info;
258  }
259  magma_queue_create( &stream[igpu][0] );
260  magma_queue_create( &stream[igpu][1] );
261  magma_queue_create( &stream[igpu][2] );
262  }
263
264  // alpha = 0 case;
265
266  if (MAGMA_D_REAL(alpha) == 0. && MAGMA_D_IMAG(alpha) == 0.) {
267  printf("dtrsm_m: alpha = 0 not implemented\n");
268  exit(-1);
269
270  return info;
271  }
272
273  if (lside) {
274  if (notransp) {
275
276  //Form B := alpha*inv( A )*B
277
278  if (upper) {
279
280  //left upper notranspose
281
282  magma_int_t nloc[N_MAX_GPU];
283  for(igpu = 0; igpu < nrgpu; ++igpu)
284  nloc[igpu] = 0;
285
286  //copy B to mgpus
287  for (k = 0; k < nbl; ++k){
288  igpu = k%nrgpu;
289  magma_setdevice(igpu);
290  kb = min(nb, n-k*nb);
291  nloc[igpu] += kb;
292  magma_dsetmatrix_async( m, kb,
293  B(0, k), ldb,
294  dB(igpu, 0, k/nrgpu), lddb, stream[igpu][(mbl+1)%2] );
295  }
296  jb = min(nb, m-(mbl-1)*nb);
297  for (igpu = 0; igpu < nrgpu; ++igpu){
298  magma_setdevice(igpu);
299  magma_dsetmatrix_async( m, jb,
300  A(0, mbl-1), lda,
301  dA(igpu, 0, (mbl-1)%2), ldda, stream[igpu][(mbl+1)%2] );
302  }
303  for (j = mbl-1; j >= 0; --j){
304  if (j > 0){
305  jb = nb;
306  for (igpu = 0; igpu < nrgpu; ++igpu){
307  magma_setdevice(igpu);
308  magma_dsetmatrix_async( j*nb, jb,
309  A(0, j-1), lda,
310  dA(igpu, 0, (j+1)%2), ldda, stream[igpu][(j+1)%2] );
311  }
312  }
313  if (j==mbl-1)
314  alpha_=alpha;
315  else
316  alpha_= c_one;
317
318  jb = min(nb, m-j*nb);
319
320  for (igpu = 0; igpu < nrgpu; ++igpu){
321  magma_setdevice(igpu);
322  magmablasSetKernelStream(stream[igpu][j%2]);
323  magma_dtrsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_, dA(igpu, j, j%2), ldda,
324  dB(igpu, j, 0), lddb );
325  }
326
327  if (j>0){
328  for (igpu = 0; igpu < nrgpu; ++igpu){
329  magma_setdevice(igpu);
330  magmablasSetKernelStream(stream[igpu][j%2]);
331  magma_dgemm(transa, MagmaNoTrans, j*nb, nloc[igpu], jb, c_neg_one, dA(igpu, 0, j%2), ldda,
332  dB(igpu, j, 0), lddb, alpha_, dB(igpu, 0, 0), lddb );
333  }
334  }
335
336  for (igpu = 0; igpu < nrgpu; ++igpu){
337  magma_queue_sync( stream[igpu][j%2] );
338  }
339
340  for (k = 0; k < nbl; ++k){
341  igpu = k%nrgpu;
342  magma_setdevice(igpu);
343  kb = min(nb, n-k*nb);
344  magma_dgetmatrix_async( jb, kb,
345  dB(igpu, j, k/nrgpu), lddb,
346  B(j, k), ldb, stream[igpu][2] );
347  }
348  }
349
350  }
351  else
352  {
353  //left lower notranspose
354
355  magma_int_t nloc[N_MAX_GPU];
356  for(igpu = 0; igpu < nrgpu; ++igpu)
357  nloc[igpu] = 0;
358
359  //copy B to mgpus
360  for (k = 0; k < nbl; ++k){
361  igpu = k%nrgpu;
362  magma_setdevice(igpu);
363  kb = min(nb, n-k*nb);
364  nloc[igpu] += kb;
365  magma_dsetmatrix_async( m, kb,
366  B(0, k), ldb,
367  dB(igpu, 0, k/nrgpu), lddb, stream[igpu][0] );
368  }
369  jb = min(nb, m);
370  for (igpu = 0; igpu < nrgpu; ++igpu){
371  magma_setdevice(igpu);
372  magma_dsetmatrix_async( m, jb,
373  A(0, 0), lda,
374  dA(igpu, 0, 0), ldda, stream[igpu][0] );
375  }
376  for (j = 0; j < mbl; ++j){
377  if ((j+1)*nb < m){
378  jb = min(nb, m-(j+1)*nb);
379  for (igpu = 0; igpu < nrgpu; ++igpu){
380  magma_setdevice(igpu);
381  magma_dsetmatrix_async( (m-(j+1)*nb), jb,
382  A(j+1, j+1), lda,
383  dA(igpu, j+1, (j+1)%2), ldda, stream[igpu][(j+1)%2] );
384  }
385  }
386  jb = min(nb, m-j*nb);
387
388  if (j==0)
389  alpha_=alpha;
390  else
391  alpha_= c_one;
392
393  for (igpu = 0; igpu < nrgpu; ++igpu){
394  magma_setdevice(igpu);
395  magmablasSetKernelStream(stream[igpu][j%2]);
396  magma_dtrsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_, dA(igpu, j, j%2), ldda,
397  dB(igpu, j, 0), lddb );
398  }
399
400  if ( j < mbl-1 ){
401
402  for (igpu = 0; igpu < nrgpu; ++igpu){
403  magma_setdevice(igpu);
404  magmablasSetKernelStream(stream[igpu][j%2]);
405  magma_dgemm(transa, MagmaNoTrans, m-(j+1)*nb, nloc[igpu], nb, c_neg_one, dA(igpu, j+1, j%2), ldda,
406  dB(igpu, j, 0), lddb, alpha_, dB(igpu, j+1, 0), lddb );
407  }
408  }
409
410  for (igpu = 0; igpu < nrgpu; ++igpu){
411  magma_queue_sync( stream[igpu][j%2] );
412  }
413
414  for (k = 0; k < nbl; ++k){
415  igpu = k%nrgpu;
416  magma_setdevice(igpu);
417  kb = min(nb, n-k*nb);
418  magma_dgetmatrix_async( jb, kb,
419  dB(igpu, j, k/nrgpu), lddb,
420  B(j, k), ldb, stream[igpu][2] );
421  }
422  }
423
424  }
425  }
426  else
427  {
428
429  //Form B := alpha*inv( A' )*B
430
431  if (upper) {
432
433  //left upper transpose or transpose
434
435  magma_int_t nloc[N_MAX_GPU];
436  for(igpu = 0; igpu < nrgpu; ++igpu)
437  nloc[igpu] = 0;
438
439  //copy B to mgpus
440  for (k = 0; k < nbl; ++k){
441  igpu = k%nrgpu;
442  magma_setdevice(igpu);
443  kb = min(nb, n-k*nb);
444  nloc[igpu] += kb;
445  magma_dsetmatrix_async( m, kb,
446  B(0, k), ldb,
447  dB(igpu, 0, k/nrgpu), lddb, stream[igpu][0] );
448  }
449  jb = min(nb, m);
450  for (igpu = 0; igpu < nrgpu; ++igpu){
451  magma_setdevice(igpu);
452  magma_dsetmatrix_async( jb, m,
453  A(0, 0), lda,
454  dA(igpu, 0, 0), ldda, stream[igpu][0] );
455  }
456  for (j = 0; j < mbl; ++j){
457  if ((j+1)*nb < m){
458  jb = min(nb, m-(j+1)*nb);
459  for (igpu = 0; igpu < nrgpu; ++igpu){
460  magma_setdevice(igpu);
461  magma_dsetmatrix_async( jb, m-(j+1)*nb,
462  A(j+1, j+1), lda,
463  dA(igpu, (j+1)%2, j+1), ldda, stream[igpu][(j+1)%2] );
464  }
465  }
466  jb = min(nb, m-j*nb);
467
468  if (j==0)
469  alpha_=alpha;
470  else
471  alpha_= c_one;
472
473  for (igpu = 0; igpu < nrgpu; ++igpu){
474  magma_setdevice(igpu);
475  magmablasSetKernelStream(stream[igpu][j%2]);
476  magma_dtrsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_, dA(igpu, j%2, j), ldda,
477  dB(igpu, j, 0), lddb );
478  }
479
480  if ( j < mbl-1 ){
481
482  for (igpu = 0; igpu < nrgpu; ++igpu){
483  magma_setdevice(igpu);
484  magmablasSetKernelStream(stream[igpu][j%2]);
485  magma_dgemm(transa, MagmaNoTrans, m-(j+1)*nb, nloc[igpu], nb, c_neg_one, dA(igpu, j%2, j+1), ldda,
486  dB(igpu, j, 0), lddb, alpha_, dB(igpu, j+1, 0), lddb );
487  }
488  }
489
490  for (igpu = 0; igpu < nrgpu; ++igpu){
491  magma_queue_sync( stream[igpu][j%2] );
492  }
493
494  for (k = 0; k < nbl; ++k){
495  igpu = k%nrgpu;
496  magma_setdevice(igpu);
497  kb = min(nb, n-k*nb);
498  magma_dgetmatrix_async( jb, kb,
499  dB(igpu, j, k/nrgpu), lddb,
500  B(j, k), ldb, stream[igpu][2] );
501  }
502  }
503  }
504  else
505  {
506
507  //left lower transpose or transpose
508
509  magma_int_t nloc[N_MAX_GPU];
510  for(igpu = 0; igpu < nrgpu; ++igpu)
511  nloc[igpu] = 0;
512
513  //copy B to mgpus
514  for (k = 0; k < nbl; ++k){
515  igpu = k%nrgpu;
516  magma_setdevice(igpu);
517  kb = min(nb, n-k*nb);
518  nloc[igpu] += kb;
519  magma_dsetmatrix_async( m, kb,
520  B(0, k), ldb,
521  dB(igpu, 0, k/nrgpu), lddb, stream[igpu][(mbl+1)%2] );
522  }
523  jb = min(nb, m-(mbl-1)*nb);
524  for (igpu = 0; igpu < nrgpu; ++igpu){
525  magma_setdevice(igpu);
526  magma_dsetmatrix_async( jb, m,
527  A(mbl-1, 0), lda,
528  dA(igpu, (mbl-1)%2, 0), ldda, stream[igpu][(mbl+1)%2] );
529  }
530  for (j = mbl-1; j >= 0; --j){
531  if (j > 0){
532  jb = nb;
533  for (igpu = 0; igpu < nrgpu; ++igpu){
534  magma_setdevice(igpu);
535  magma_dsetmatrix_async( jb, j*nb,
536  A(j-1, 0), lda,
537  dA(igpu, (j+1)%2, 0), ldda, stream[igpu][(j+1)%2] );
538  }
539  }
540  if (j==mbl-1)
541  alpha_=alpha;
542  else
543  alpha_= c_one;
544
545  jb = min(nb, m-j*nb);
546
547  for (igpu = 0; igpu < nrgpu; ++igpu){
548  magma_setdevice(igpu);
549  magmablasSetKernelStream(stream[igpu][j%2]);
550  magma_dtrsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_, dA(igpu, j%2, j), ldda,
551  dB(igpu, j, 0), lddb );
552  }
553
554  if (j>0){
555  for (igpu = 0; igpu < nrgpu; ++igpu){
556  magma_setdevice(igpu);
557  magmablasSetKernelStream(stream[igpu][j%2]);
558  magma_dgemm(transa, MagmaNoTrans, j*nb, nloc[igpu], jb, c_neg_one, dA(igpu, j%2, 0), ldda,
559  dB(igpu, j, 0), lddb, alpha_, dB(igpu, 0, 0), lddb );
560  }
561  }
562
563  for (igpu = 0; igpu < nrgpu; ++igpu){
564  magma_queue_sync( stream[igpu][j%2] );
565  }
566
567  for (k = 0; k < nbl; ++k){
568  igpu = k%nrgpu;
569  magma_setdevice(igpu);
570  kb = min(nb, n-k*nb);
571  magma_dgetmatrix_async( jb, kb,
572  dB(igpu, j, k/nrgpu), lddb,
573  B(j, k), ldb, stream[igpu][2] );
574  }
575  }
576
577  }
578  }
579  }
580  else
581  {
582  if (notransp) {
583
584  //Form B := alpha*B*inv( A ).
585
586  if (upper) {
587
588  //right upper notranspose
589  magma_int_t mloc[N_MAX_GPU];
590  for(igpu = 0; igpu < nrgpu; ++igpu)
591  mloc[igpu] = 0;
592
593  //copy B to mgpus
594  for (j = 0; j < mbl; ++j){
595  igpu = j%nrgpu;
596  magma_setdevice(igpu);
597  jb = min(nb, m-j*nb);
598  mloc[igpu] += jb;
599  magma_dsetmatrix_async( jb, n,
600  B(j, 0), ldb,
601  dB(igpu, j/nrgpu, 0), lddb, stream[igpu][0] );
602  }
603  kb = min(nb, n);
604  for (igpu = 0; igpu < nrgpu; ++igpu){
605  magma_setdevice(igpu);
606  magma_dsetmatrix_async( kb, n,
607  A(0, 0), lda,
608  dA(igpu, 0, 0), ldda, stream[igpu][0] );
609  }
610  for (k = 0; k < nbl; ++k){
611  if ((k+1)*nb < n){
612  kb = min(nb, n-(k+1)*nb);
613  for (igpu = 0; igpu < nrgpu; ++igpu){
614  magma_setdevice(igpu);
615  magma_dsetmatrix_async( kb, n-(k+1)*nb,
616  A(k+1, k+1), lda,
617  dA(igpu, (k+1)%2, k+1), ldda, stream[igpu][(k+1)%2] );
618  }
619  }
620  kb = min(nb, n-k*nb);
621
622  if (k==0)
623  alpha_=alpha;
624  else
625  alpha_= c_one;
626
627  for (igpu = 0; igpu < nrgpu; ++igpu){
628  magma_setdevice(igpu);
629  magmablasSetKernelStream(stream[igpu][k%2]);
630  magma_dtrsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_, dA(igpu, k%2, k), ldda,
631  dB(igpu, 0, k), lddb );
632  }
633
634  if ( k < nbl-1 ){
635
636  for (igpu = 0; igpu < nrgpu; ++igpu){
637  magma_setdevice(igpu);
638  magmablasSetKernelStream(stream[igpu][k%2]);
639  magma_dgemm(MagmaNoTrans, transa, mloc[igpu], n-(k+1)*nb, nb, c_neg_one, dB(igpu, 0, k), lddb,
640  dA(igpu, k%2, k+1), ldda, alpha_, dB(igpu, 0, k+1), lddb );
641  }
642  }
643
644  for (igpu = 0; igpu < nrgpu; ++igpu){
645  magma_queue_sync( stream[igpu][k%2] );
646  }
647
648  for (j = 0; j < mbl; ++j){
649  igpu = j%nrgpu;
650  magma_setdevice(igpu);
651  jb = min(nb, m-j*nb);
652  magma_dgetmatrix_async( jb, kb,
653  dB(igpu, j/nrgpu, k), lddb,
654  B(j, k), ldb, stream[igpu][2] );
655  }
656  }
657  }
658  else
659  {
660
661  //right lower notranspose
662  magma_int_t mloc[N_MAX_GPU];
663  for(igpu = 0; igpu < nrgpu; ++igpu)
664  mloc[igpu] = 0;
665
666  //copy B to mgpus
667  for (j = 0; j < mbl; ++j){
668  igpu = j%nrgpu;
669  magma_setdevice(igpu);
670  jb = min(nb, m-j*nb);
671  mloc[igpu] += jb;
672  magma_dsetmatrix_async( jb, n,
673  B(j, 0), ldb,
674  dB(igpu, j/nrgpu, 0), lddb, stream[igpu][(nbl+1)%2] );
675  }
676  kb = min(nb, n-(nbl-1)*nb);
677  for (igpu = 0; igpu < nrgpu; ++igpu){
678  magma_setdevice(igpu);
679  magma_dsetmatrix_async( kb, n,
680  A(nbl-1, 0), lda,
681  dA(igpu, (nbl-1)%2, 0), ldda, stream[igpu][(nbl+1)%2] );
682  }
683  for (k = nbl-1; k >= 0; --k){
684  if (k > 0){
685  kb = nb;
686  for (igpu = 0; igpu < nrgpu; ++igpu){
687  magma_setdevice(igpu);
688  magma_dsetmatrix_async( kb, k*nb,
689  A(k-1, 0), lda,
690  dA(igpu, (k+1)%2, 0), ldda, stream[igpu][(k+1)%2] );
691  }
692  }
693  if (k==nbl-1)
694  alpha_=alpha;
695  else
696  alpha_= c_one;
697
698  kb = min(nb, n-k*nb);
699
700  for (igpu = 0; igpu < nrgpu; ++igpu){
701  magma_setdevice(igpu);
702  magmablasSetKernelStream(stream[igpu][k%2]);
703  magma_dtrsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_, dA(igpu, k%2, k), ldda,
704  dB(igpu, 0, k), lddb );
705  }
706
707  if (k>0){
708  for (igpu = 0; igpu < nrgpu; ++igpu){
709  magma_setdevice(igpu);
710  magmablasSetKernelStream(stream[igpu][k%2]);
711  magma_dgemm(MagmaNoTrans, transa, mloc[igpu], k*nb, kb, c_neg_one, dB(igpu, 0, k), lddb,
712  dA(igpu, k%2, 0), ldda, alpha_, dB(igpu, 0, 0), lddb );
713  }
714  }
715
716  for (igpu = 0; igpu < nrgpu; ++igpu){
717  magma_queue_sync( stream[igpu][k%2] );
718  }
719
720  for (j = 0; j < mbl; ++j){
721  igpu = j%nrgpu;
722  magma_setdevice(igpu);
723  jb = min(nb, m-j*nb);
724  magma_dgetmatrix_async( jb, kb,
725  dB(igpu, j/nrgpu, k), lddb,
726  B(j, k), ldb, stream[igpu][2] );
727  }
728  }
729  }
730  }
731  else
732  {
733
734  //Form B := alpha*B*inv( A' ).
735
736  if (upper) {
737
738  //right upper transpose or transpose
739  magma_int_t mloc[N_MAX_GPU];
740  for(igpu = 0; igpu < nrgpu; ++igpu)
741  mloc[igpu] = 0;
742
743  //copy B to mgpus
744  for (j = 0; j < mbl; ++j){
745  igpu = j%nrgpu;
746  magma_setdevice(igpu);
747  jb = min(nb, m-j*nb);
748  mloc[igpu] += jb;
749  magma_dsetmatrix_async( jb, n,
750  B(j, 0), ldb,
751  dB(igpu, j/nrgpu, 0), lddb, stream[igpu][(nbl+1)%2] );
752  }
753  kb = min(nb, n-(nbl-1)*nb);
754  for (igpu = 0; igpu < nrgpu; ++igpu){
755  magma_setdevice(igpu);
756  magma_dsetmatrix_async( n, kb,
757  A(0, nbl-1), lda,
758  dA(igpu, 0, (nbl-1)%2), ldda, stream[igpu][(nbl+1)%2] );
759  }
760  for (k = nbl-1; k >= 0; --k){
761  if (k > 0){
762  kb = nb;
763  for (igpu = 0; igpu < nrgpu; ++igpu){
764  magma_setdevice(igpu);
765  magma_dsetmatrix_async( k*nb, kb,
766  A(0, k-1), lda,
767  dA(igpu, 0, (k+1)%2), ldda, stream[igpu][(k+1)%2] );
768  }
769  }
770  if (k==nbl-1)
771  alpha_=alpha;
772  else
773  alpha_= c_one;
774
775  kb = min(nb, n-k*nb);
776
777  for (igpu = 0; igpu < nrgpu; ++igpu){
778  magma_setdevice(igpu);
779  magmablasSetKernelStream(stream[igpu][k%2]);
780  magma_dtrsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_, dA(igpu, k, k%2), ldda,
781  dB(igpu, 0, k), lddb );
782  }
783
784  if (k>0){
785  for (igpu = 0; igpu < nrgpu; ++igpu){
786  magma_setdevice(igpu);
787  magmablasSetKernelStream(stream[igpu][k%2]);
788  magma_dgemm(MagmaNoTrans, transa, mloc[igpu], k*nb, kb, c_neg_one, dB(igpu, 0, k), lddb,
789  dA(igpu, 0, k%2), ldda, alpha_, dB(igpu, 0, 0), lddb );
790  }
791  }
792
793  for (igpu = 0; igpu < nrgpu; ++igpu){
794  magma_queue_sync( stream[igpu][k%2] );
795  }
796
797  for (j = 0; j < mbl; ++j){
798  igpu = j%nrgpu;
799  magma_setdevice(igpu);
800  jb = min(nb, m-j*nb);
801  magma_dgetmatrix_async( jb, kb,
802  dB(igpu, j/nrgpu, k), lddb,
803  B(j, k), ldb, stream[igpu][2] );
804  }
805  }
806  }
807  else
808  {
809
810  //right lower transpose or transpose
811  magma_int_t mloc[N_MAX_GPU];
812  for(igpu = 0; igpu < nrgpu; ++igpu)
813  mloc[igpu] = 0;
814
815  //copy B to mgpus
816  for (j = 0; j < mbl; ++j){
817  igpu = j%nrgpu;
818  magma_setdevice(igpu);
819  jb = min(nb, m-j*nb);
820  mloc[igpu] += jb;
821  magma_dsetmatrix_async( jb, n,
822  B(j, 0), ldb,
823  dB(igpu, j/nrgpu, 0), lddb, stream[igpu][0] );
824  }
825  kb = min(nb, n);
826  for (igpu = 0; igpu < nrgpu; ++igpu){
827  magma_setdevice(igpu);
828  magma_dsetmatrix_async( n, kb,
829  A(0, 0), lda,
830  dA(igpu, 0, 0), ldda, stream[igpu][0] );
831  }
832  for (k = 0; k < nbl; ++k){
833  if ((k+1)*nb < n){
834  kb = min(nb, n-(k+1)*nb);
835  for (igpu = 0; igpu < nrgpu; ++igpu){
836  magma_setdevice(igpu);
837  magma_dsetmatrix_async( (n-(k+1)*nb), kb,
838  A(k+1, k+1), lda,
839  dA(igpu, k+1, (k+1)%2), ldda, stream[igpu][(k+1)%2] );
840  }
841  }
842  kb = min(nb, n-k*nb);
843
844  if (k==0)
845  alpha_=alpha;
846  else
847  alpha_= c_one;
848
849  for (igpu = 0; igpu < nrgpu; ++igpu){
850  magma_setdevice(igpu);
851  magmablasSetKernelStream(stream[igpu][k%2]);
852  magma_dtrsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_, dA(igpu, k, k%2), ldda,
853  dB(igpu, 0, k), lddb );
854  }
855
856  if ( k < nbl-1 ){
857
858  for (igpu = 0; igpu < nrgpu; ++igpu){
859  magma_setdevice(igpu);
860  magmablasSetKernelStream(stream[igpu][k%2]);
861  magma_dgemm(MagmaNoTrans, transa, mloc[igpu], n-(k+1)*nb, nb, c_neg_one, dB(igpu, 0, k), lddb,
862  dA(igpu, k+1, k%2), ldda, alpha_, dB(igpu, 0, k+1), lddb );
863  }
864  }
865
866  for (igpu = 0; igpu < nrgpu; ++igpu){
867  magma_queue_sync( stream[igpu][k%2] );
868  }
869
870  for (j = 0; j < mbl; ++j){
871  igpu = j%nrgpu;
872  magma_setdevice(igpu);
873  jb = min(nb, m-j*nb);
874  magma_dgetmatrix_async( jb, kb,
875  dB(igpu, j/nrgpu, k), lddb,
876  B(j, k), ldb, stream[igpu][2] );
877  }
878  }
879  }
880  }
881
882  }
883
884
885  for (igpu = 0; igpu < nrgpu; ++igpu){
886  magma_setdevice(igpu);
888  magma_queue_sync( stream[igpu][2] );
889  magma_queue_destroy( stream[igpu][0] );
890  magma_queue_destroy( stream[igpu][1] );
891  magma_queue_destroy( stream[igpu][2] );
892  magma_free( dw[igpu] );
893  }
894
895  magma_setdevice(gpu_b);
896
897  return info;
898
899 } /* magma_dtrsm_m */
900
901