MAGMA  magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
ztrsm_m.cpp File Reference
#include "common_magma.h"
Include dependency graph for ztrsm_m.cpp:

Go to the source code of this file.

Macros

#define A(i, j)   (a+(j)*nb*lda + (i)*nb)
 
#define B(i, j)   (b+(j)*nb*ldb + (i)*nb)
 
#define dB(gpui, i, j)   (dw[gpui] + (j)*nb*lddb + (i)*nb)
 
#define dA(gpui, i, j)   (dw[gpui] + dimb*lddb + (i)*nb + (j)*nb*ldda)
 

Functions

magma_int_t magma_get_ztrsm_m_nb ()
 
magma_int_t magma_ztrsm_m (magma_int_t nrgpu, char side, char uplo, char transa, char diag, magma_int_t m, magma_int_t n, magmaDoubleComplex alpha, magmaDoubleComplex *a, magma_int_t lda, magmaDoubleComplex *b, magma_int_t ldb)
 

Macro Definition Documentation

#define A(i, j)   (a+(j)*nb*lda + (i)*nb)

Definition at line 17 of file ztrsm_m.cpp.

#define B(i, j)   (b+(j)*nb*ldb + (i)*nb)

Definition at line 18 of file ztrsm_m.cpp.

#define dA(gpui, i, j)   (dw[gpui] + dimb*lddb + (i)*nb + (j)*nb*ldda)

Definition at line 22 of file ztrsm_m.cpp.

#define dB(gpui, i, j)   (dw[gpui] + (j)*nb*lddb + (i)*nb)

Definition at line 20 of file ztrsm_m.cpp.

Function Documentation

magma_int_t magma_get_ztrsm_m_nb ( )

Definition at line 15 of file ztrsm_m.cpp.

15 { return 128;}

Here is the caller graph for this function:

magma_int_t magma_ztrsm_m ( magma_int_t  nrgpu,
char  side,
char  uplo,
char  transa,
char  diag,
magma_int_t  m,
magma_int_t  n,
magmaDoubleComplex  alpha,
magmaDoubleComplex *  a,
magma_int_t  lda,
magmaDoubleComplex *  b,
magma_int_t  ldb 
)

Definition at line 25 of file ztrsm_m.cpp.

References __func__, A, B, dA, dB, lapackf77_lsame, MAGMA_ERR_DEVICE_ALLOC, magma_free, magma_get_ztrsm_m_nb(), magma_getdevice(), magma_queue_create, magma_queue_destroy, magma_queue_sync, magma_setdevice(), MAGMA_SUCCESS, magma_xerbla(), MAGMA_Z_IMAG, MAGMA_Z_NEG_ONE, MAGMA_Z_ONE, MAGMA_Z_REAL, magma_zgemm(), magma_zgetmatrix_async, magma_zmalloc(), magma_zsetmatrix_async, magma_ztrsm(), magmablasSetKernelStream(), MagmaMaxGPUs, MagmaNoTrans, max, and min.

28 {
29 /* -- MAGMA (version 1.4.0) --
30  Univ. of Tennessee, Knoxville
31  Univ. of California, Berkeley
32  Univ. of Colorado, Denver
33  August 2013
34 
35  Purpose
36  =======
37  ZTRSM solves one of the matrix equations
38  op( A )*X = alpha*B, or X*op( A ) = alpha*B,
39  where alpha is a scalar, X and B are m by n matrices, A is a unit, or
40  non-unit, upper or lower triangular matrix and op( A ) is one of
41 
42  op( A ) = A or op( A ) = A' or op( A ) = conj( A' ).
43 
44  The matrix X is overwritten on B.
45 
46  Parameters
47  ==========
48  SIDE CHARACTER*1.
49  On entry, SIDE specifies whether op( A ) appears on the left
50  or right of X as follows:
51  SIDE = 'L' or 'l' op( A )*X = alpha*B.
52  SIDE = 'R' or 'r' X*op( A ) = alpha*B.
53  Unchanged on exit.
54 
55  UPLO CHARACTER*1.
56  On entry, UPLO specifies whether the matrix A is an upper or
57  lower triangular matrix as follows:
58  UPLO = 'U' or 'u' A is an upper triangular matrix.
59  UPLO = 'L' or 'l' A is a lower triangular matrix.
60  Unchanged on exit.
61 
62  TRANSA CHARACTER*1.
63  On entry, TRANSA specifies the form of op( A ) to be used in
64  the matrix multiplication as follows:
65  TRANSA = 'N' or 'n' op( A ) = A.
66  TRANSA = 'T' or 't' op( A ) = A'.
67  TRANSA = 'C' or 'c' op( A ) = conj( A' ).
68  Unchanged on exit.
69 
70  DIAG CHARACTER*1.
71  On entry, DIAG specifies whether or not A is unit triangular
72  as follows:
73  DIAG = 'U' or 'u' A is assumed to be unit triangular.
74  DIAG = 'N' or 'n' A is not assumed to be unit
75  triangular.
76  Unchanged on exit.
77 
78  M INTEGER.
79  On entry, M specifies the number of rows of B. M must be at
80  least zero.
81  Unchanged on exit.
82 
83  N INTEGER.
84  On entry, N specifies the number of columns of B. N must be
85  at least zero.
86  Unchanged on exit.
87 
88  ALPHA COMPLEX_16 .
89  On entry, ALPHA specifies the scalar alpha. When alpha is
90  zero then A is not referenced and B need not be set before
91  entry.
92  Unchanged on exit.
93 
94  A COMPLEX_16 array of DIMENSION ( LDA, k ), where k is m
95  when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'.
96  Before entry with UPLO = 'U' or 'u', the leading k by k
97  upper triangular part of the array A must contain the upper
98  triangular matrix and the strictly lower triangular part of
99  A is not referenced.
100  Before entry with UPLO = 'L' or 'l', the leading k by k
101  lower triangular part of the array A must contain the lower
102  triangular matrix and the strictly upper triangular part of
103  A is not referenced.
104  Note that when DIAG = 'U' or 'u', the diagonal elements of
105  A are not referenced either, but are assumed to be unity.
106  Unchanged on exit.
107 
108  LDA INTEGER.
109  On entry, LDA specifies the first dimension of A as declared
110  in the calling (sub) program.
111  When SIDE = 'L' or 'l' then LDA >= max( 1, m ),
112  when SIDE = 'R' or 'r' then LDA >= max( 1, n ).
113  Unchanged on exit.
114 
115  B COMPLEX_16 array of DIMENSION ( LDB, n ).
116  Before entry, the leading m by n part of the array B must
117  contain the right-hand side matrix B, and on exit is
118  overwritten by the solution matrix X.
119 
120  LDB INTEGER.
121  On entry, LDB specifies the first dimension of B as declared
122  in the calling (sub) program. LDB must be at least
123  max( 1, m ).
124  Unchanged on exit.
125  ===================================================================== */
126 
127  char side_[2] = {side, 0};
128  char uplo_[2] = {uplo, 0};
129  char transa_[2] = {transa, 0};
130  char diag_[2] = {diag, 0};
131  magmaDoubleComplex c_one = MAGMA_Z_ONE;
132  magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
133  magmaDoubleComplex alpha_;
134  magmaDoubleComplex* dw[MagmaMaxGPUs];
135  magma_queue_t stream [MagmaMaxGPUs][3];
136  magma_int_t lside;
137  magma_int_t upper;
138  magma_int_t notransp;
139  magma_int_t nrowa;
140  magma_int_t nb = magma_get_ztrsm_m_nb();
141  magma_int_t igpu = 0;
142  magma_int_t info;
143  magma_int_t k, j, kb, jb;
144  magma_int_t ldda, dima, lddb, dimb;
145  int gpu_b;
146  magma_getdevice(&gpu_b);
147 
148  lside = lapackf77_lsame(side_, "L");
149  if (lside) {
150  nrowa = m;
151  } else {
152  nrowa = n;
153  }
154  upper = lapackf77_lsame(uplo_, "U");
155  notransp = lapackf77_lsame(transa_, "N");
156 
157  info = 0;
158  if (! lside && ! lapackf77_lsame(side_, "R")) {
159  info = 1;
160  } else if (! upper && ! lapackf77_lsame(uplo_, "L")) {
161  info = 2;
162  } else if (! notransp && ! lapackf77_lsame(transa_, "T")
163  && ! lapackf77_lsame(transa_, "C")) {
164  info = 3;
165  } else if (! lapackf77_lsame(diag_, "U") && ! lapackf77_lsame(diag_, "N")) {
166  info = 4;
167  } else if (m < 0) {
168  info = 5;
169  } else if (n < 0) {
170  info = 6;
171  } else if (lda < max(1,nrowa)) {
172  info = 9;
173  } else if (ldb < max(1,m)) {
174  info = 11;
175  }
176 
177  if (info != 0) {
178  magma_xerbla( __func__, -info );
179  return info;
180  }
181 
182  //Quick return if possible.
183 
184  if (n == 0) {
185  return info;
186  }
187 
188  magma_int_t nbl = (n-1)/nb+1; // number of blocks in a row
189  magma_int_t mbl = (m-1)/nb+1; // number of blocks in a column
190 
191  if (lside) {
192  lddb = m;
193  dimb = ((nbl-1)/nrgpu+1)*nb;
194  if ( notransp ) {
195  ldda = m;
196  dima = 2 * nb;
197  } else {
198  ldda = 2 * nb;
199  dima = m;
200  }
201  } else {
202  lddb = ((mbl-1)/nrgpu+1)*nb;
203  dimb = n;
204  if ( !notransp ) {
205  ldda = n;
206  dima = 2 * nb;
207  } else {
208  ldda = 2 * nb;
209  dima = n;
210  }
211  }
212 
213  for (igpu = 0; igpu < nrgpu; ++igpu){
214  magma_setdevice(igpu);
215  if (MAGMA_SUCCESS != magma_zmalloc( &dw[igpu], (dimb*lddb + dima*ldda) )) {
216  info = MAGMA_ERR_DEVICE_ALLOC;
217  return info;
218  }
219  magma_queue_create( &stream[igpu][0] );
220  magma_queue_create( &stream[igpu][1] );
221  magma_queue_create( &stream[igpu][2] );
222  }
223 
224  // alpha = 0 case;
225 
226  if (MAGMA_Z_REAL(alpha) == 0. && MAGMA_Z_IMAG(alpha) == 0.) {
227  printf("ztrsm_m: alpha = 0 not implemented\n");
228  exit(-1);
229 
230  return info;
231  }
232 
233  if (lside) {
234  if (notransp) {
235 
236  //Form B := alpha*inv( A )*B
237 
238  if (upper) {
239 
240  //left upper notranspose
241 
242  magma_int_t nloc[MagmaMaxGPUs];
243  for(igpu = 0; igpu < nrgpu; ++igpu)
244  nloc[igpu] = 0;
245 
246  //copy B to mgpus
247  for (k = 0; k < nbl; ++k){
248  igpu = k%nrgpu;
249  magma_setdevice(igpu);
250  kb = min(nb, n-k*nb);
251  nloc[igpu] += kb;
252  magma_zsetmatrix_async( m, kb,
253  B(0, k), ldb,
254  dB(igpu, 0, k/nrgpu), lddb, stream[igpu][(mbl+1)%2] );
255  }
256  jb = min(nb, m-(mbl-1)*nb);
257  for (igpu = 0; igpu < nrgpu; ++igpu){
258  magma_setdevice(igpu);
259  magma_zsetmatrix_async( m, jb,
260  A(0, mbl-1), lda,
261  dA(igpu, 0, (mbl-1)%2), ldda, stream[igpu][(mbl+1)%2] );
262  }
263  for (j = mbl-1; j >= 0; --j){
264  if (j > 0){
265  jb = nb;
266  for (igpu = 0; igpu < nrgpu; ++igpu){
267  magma_setdevice(igpu);
268  magma_zsetmatrix_async( j*nb, jb,
269  A(0, j-1), lda,
270  dA(igpu, 0, (j+1)%2), ldda, stream[igpu][(j+1)%2] );
271  }
272  }
273  if (j==mbl-1)
274  alpha_=alpha;
275  else
276  alpha_= c_one;
277 
278  jb = min(nb, m-j*nb);
279 
280  for (igpu = 0; igpu < nrgpu; ++igpu){
281  magma_setdevice(igpu);
282  magmablasSetKernelStream(stream[igpu][j%2]);
283  magma_ztrsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_, dA(igpu, j, j%2), ldda,
284  dB(igpu, j, 0), lddb );
285  }
286 
287  if (j>0){
288  for (igpu = 0; igpu < nrgpu; ++igpu){
289  magma_setdevice(igpu);
290  magmablasSetKernelStream(stream[igpu][j%2]);
291  magma_zgemm(transa, MagmaNoTrans, j*nb, nloc[igpu], jb, c_neg_one, dA(igpu, 0, j%2), ldda,
292  dB(igpu, j, 0), lddb, alpha_, dB(igpu, 0, 0), lddb );
293  }
294  }
295 
296  for (igpu = 0; igpu < nrgpu; ++igpu){
297  magma_queue_sync( stream[igpu][j%2] );
298  }
299 
300  for (k = 0; k < nbl; ++k){
301  igpu = k%nrgpu;
302  magma_setdevice(igpu);
303  kb = min(nb, n-k*nb);
304  magma_zgetmatrix_async( jb, kb,
305  dB(igpu, j, k/nrgpu), lddb,
306  B(j, k), ldb, stream[igpu][2] );
307  }
308  }
309 
310  }
311  else
312  {
313  //left lower notranspose
314 
315  magma_int_t nloc[MagmaMaxGPUs];
316  for(igpu = 0; igpu < nrgpu; ++igpu)
317  nloc[igpu] = 0;
318 
319  //copy B to mgpus
320  for (k = 0; k < nbl; ++k){
321  igpu = k%nrgpu;
322  magma_setdevice(igpu);
323  kb = min(nb, n-k*nb);
324  nloc[igpu] += kb;
325  magma_zsetmatrix_async( m, kb,
326  B(0, k), ldb,
327  dB(igpu, 0, k/nrgpu), lddb, stream[igpu][0] );
328  }
329  jb = min(nb, m);
330  for (igpu = 0; igpu < nrgpu; ++igpu){
331  magma_setdevice(igpu);
332  magma_zsetmatrix_async( m, jb,
333  A(0, 0), lda,
334  dA(igpu, 0, 0), ldda, stream[igpu][0] );
335  }
336  for (j = 0; j < mbl; ++j){
337  if ((j+1)*nb < m){
338  jb = min(nb, m-(j+1)*nb);
339  for (igpu = 0; igpu < nrgpu; ++igpu){
340  magma_setdevice(igpu);
341  magma_zsetmatrix_async( (m-(j+1)*nb), jb,
342  A(j+1, j+1), lda,
343  dA(igpu, j+1, (j+1)%2), ldda, stream[igpu][(j+1)%2] );
344  }
345  }
346  jb = min(nb, m-j*nb);
347 
348  if (j==0)
349  alpha_=alpha;
350  else
351  alpha_= c_one;
352 
353  for (igpu = 0; igpu < nrgpu; ++igpu){
354  magma_setdevice(igpu);
355  magmablasSetKernelStream(stream[igpu][j%2]);
356  magma_ztrsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_, dA(igpu, j, j%2), ldda,
357  dB(igpu, j, 0), lddb );
358  }
359 
360  if ( j < mbl-1 ){
361 
362  for (igpu = 0; igpu < nrgpu; ++igpu){
363  magma_setdevice(igpu);
364  magmablasSetKernelStream(stream[igpu][j%2]);
365  magma_zgemm(transa, MagmaNoTrans, m-(j+1)*nb, nloc[igpu], nb, c_neg_one, dA(igpu, j+1, j%2), ldda,
366  dB(igpu, j, 0), lddb, alpha_, dB(igpu, j+1, 0), lddb );
367  }
368  }
369 
370  for (igpu = 0; igpu < nrgpu; ++igpu){
371  magma_queue_sync( stream[igpu][j%2] );
372  }
373 
374  for (k = 0; k < nbl; ++k){
375  igpu = k%nrgpu;
376  magma_setdevice(igpu);
377  kb = min(nb, n-k*nb);
378  magma_zgetmatrix_async( jb, kb,
379  dB(igpu, j, k/nrgpu), lddb,
380  B(j, k), ldb, stream[igpu][2] );
381  }
382  }
383 
384  }
385  }
386  else
387  {
388 
389  //Form B := alpha*inv( A' )*B
390 
391  if (upper) {
392 
393  //left upper transpose or conj transpose
394 
395  magma_int_t nloc[MagmaMaxGPUs];
396  for(igpu = 0; igpu < nrgpu; ++igpu)
397  nloc[igpu] = 0;
398 
399  //copy B to mgpus
400  for (k = 0; k < nbl; ++k){
401  igpu = k%nrgpu;
402  magma_setdevice(igpu);
403  kb = min(nb, n-k*nb);
404  nloc[igpu] += kb;
405  magma_zsetmatrix_async( m, kb,
406  B(0, k), ldb,
407  dB(igpu, 0, k/nrgpu), lddb, stream[igpu][0] );
408  }
409  jb = min(nb, m);
410  for (igpu = 0; igpu < nrgpu; ++igpu){
411  magma_setdevice(igpu);
412  magma_zsetmatrix_async( jb, m,
413  A(0, 0), lda,
414  dA(igpu, 0, 0), ldda, stream[igpu][0] );
415  }
416  for (j = 0; j < mbl; ++j){
417  if ((j+1)*nb < m){
418  jb = min(nb, m-(j+1)*nb);
419  for (igpu = 0; igpu < nrgpu; ++igpu){
420  magma_setdevice(igpu);
421  magma_zsetmatrix_async( jb, m-(j+1)*nb,
422  A(j+1, j+1), lda,
423  dA(igpu, (j+1)%2, j+1), ldda, stream[igpu][(j+1)%2] );
424  }
425  }
426  jb = min(nb, m-j*nb);
427 
428  if (j==0)
429  alpha_=alpha;
430  else
431  alpha_= c_one;
432 
433  for (igpu = 0; igpu < nrgpu; ++igpu){
434  magma_setdevice(igpu);
435  magmablasSetKernelStream(stream[igpu][j%2]);
436  magma_ztrsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_, dA(igpu, j%2, j), ldda,
437  dB(igpu, j, 0), lddb );
438  }
439 
440  if ( j < mbl-1 ){
441 
442  for (igpu = 0; igpu < nrgpu; ++igpu){
443  magma_setdevice(igpu);
444  magmablasSetKernelStream(stream[igpu][j%2]);
445  magma_zgemm(transa, MagmaNoTrans, m-(j+1)*nb, nloc[igpu], nb, c_neg_one, dA(igpu, j%2, j+1), ldda,
446  dB(igpu, j, 0), lddb, alpha_, dB(igpu, j+1, 0), lddb );
447  }
448  }
449 
450  for (igpu = 0; igpu < nrgpu; ++igpu){
451  magma_queue_sync( stream[igpu][j%2] );
452  }
453 
454  for (k = 0; k < nbl; ++k){
455  igpu = k%nrgpu;
456  magma_setdevice(igpu);
457  kb = min(nb, n-k*nb);
458  magma_zgetmatrix_async( jb, kb,
459  dB(igpu, j, k/nrgpu), lddb,
460  B(j, k), ldb, stream[igpu][2] );
461  }
462  }
463  }
464  else
465  {
466 
467  //left lower transpose or conj transpose
468 
469  magma_int_t nloc[MagmaMaxGPUs];
470  for(igpu = 0; igpu < nrgpu; ++igpu)
471  nloc[igpu] = 0;
472 
473  //copy B to mgpus
474  for (k = 0; k < nbl; ++k){
475  igpu = k%nrgpu;
476  magma_setdevice(igpu);
477  kb = min(nb, n-k*nb);
478  nloc[igpu] += kb;
479  magma_zsetmatrix_async( m, kb,
480  B(0, k), ldb,
481  dB(igpu, 0, k/nrgpu), lddb, stream[igpu][(mbl+1)%2] );
482  }
483  jb = min(nb, m-(mbl-1)*nb);
484  for (igpu = 0; igpu < nrgpu; ++igpu){
485  magma_setdevice(igpu);
486  magma_zsetmatrix_async( jb, m,
487  A(mbl-1, 0), lda,
488  dA(igpu, (mbl-1)%2, 0), ldda, stream[igpu][(mbl+1)%2] );
489  }
490  for (j = mbl-1; j >= 0; --j){
491  if (j > 0){
492  jb = nb;
493  for (igpu = 0; igpu < nrgpu; ++igpu){
494  magma_setdevice(igpu);
495  magma_zsetmatrix_async( jb, j*nb,
496  A(j-1, 0), lda,
497  dA(igpu, (j+1)%2, 0), ldda, stream[igpu][(j+1)%2] );
498  }
499  }
500  if (j==mbl-1)
501  alpha_=alpha;
502  else
503  alpha_= c_one;
504 
505  jb = min(nb, m-j*nb);
506 
507  for (igpu = 0; igpu < nrgpu; ++igpu){
508  magma_setdevice(igpu);
509  magmablasSetKernelStream(stream[igpu][j%2]);
510  magma_ztrsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_, dA(igpu, j%2, j), ldda,
511  dB(igpu, j, 0), lddb );
512  }
513 
514  if (j>0){
515  for (igpu = 0; igpu < nrgpu; ++igpu){
516  magma_setdevice(igpu);
517  magmablasSetKernelStream(stream[igpu][j%2]);
518  magma_zgemm(transa, MagmaNoTrans, j*nb, nloc[igpu], jb, c_neg_one, dA(igpu, j%2, 0), ldda,
519  dB(igpu, j, 0), lddb, alpha_, dB(igpu, 0, 0), lddb );
520  }
521  }
522 
523  for (igpu = 0; igpu < nrgpu; ++igpu){
524  magma_queue_sync( stream[igpu][j%2] );
525  }
526 
527  for (k = 0; k < nbl; ++k){
528  igpu = k%nrgpu;
529  magma_setdevice(igpu);
530  kb = min(nb, n-k*nb);
531  magma_zgetmatrix_async( jb, kb,
532  dB(igpu, j, k/nrgpu), lddb,
533  B(j, k), ldb, stream[igpu][2] );
534  }
535  }
536 
537  }
538  }
539  }
540  else
541  {
542  if (notransp) {
543 
544  //Form B := alpha*B*inv( A ).
545 
546  if (upper) {
547 
548  //right upper notranspose
549  magma_int_t mloc[MagmaMaxGPUs];
550  for(igpu = 0; igpu < nrgpu; ++igpu)
551  mloc[igpu] = 0;
552 
553  //copy B to mgpus
554  for (j = 0; j < mbl; ++j){
555  igpu = j%nrgpu;
556  magma_setdevice(igpu);
557  jb = min(nb, m-j*nb);
558  mloc[igpu] += jb;
559  magma_zsetmatrix_async( jb, n,
560  B(j, 0), ldb,
561  dB(igpu, j/nrgpu, 0), lddb, stream[igpu][0] );
562  }
563  kb = min(nb, n);
564  for (igpu = 0; igpu < nrgpu; ++igpu){
565  magma_setdevice(igpu);
566  magma_zsetmatrix_async( kb, n,
567  A(0, 0), lda,
568  dA(igpu, 0, 0), ldda, stream[igpu][0] );
569  }
570  for (k = 0; k < nbl; ++k){
571  if ((k+1)*nb < n){
572  kb = min(nb, n-(k+1)*nb);
573  for (igpu = 0; igpu < nrgpu; ++igpu){
574  magma_setdevice(igpu);
575  magma_zsetmatrix_async( kb, n-(k+1)*nb,
576  A(k+1, k+1), lda,
577  dA(igpu, (k+1)%2, k+1), ldda, stream[igpu][(k+1)%2] );
578  }
579  }
580  kb = min(nb, n-k*nb);
581 
582  if (k==0)
583  alpha_=alpha;
584  else
585  alpha_= c_one;
586 
587  for (igpu = 0; igpu < nrgpu; ++igpu){
588  magma_setdevice(igpu);
589  magmablasSetKernelStream(stream[igpu][k%2]);
590  magma_ztrsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_, dA(igpu, k%2, k), ldda,
591  dB(igpu, 0, k), lddb );
592  }
593 
594  if ( k < nbl-1 ){
595 
596  for (igpu = 0; igpu < nrgpu; ++igpu){
597  magma_setdevice(igpu);
598  magmablasSetKernelStream(stream[igpu][k%2]);
599  magma_zgemm(MagmaNoTrans, transa, mloc[igpu], n-(k+1)*nb, nb, c_neg_one, dB(igpu, 0, k), lddb,
600  dA(igpu, k%2, k+1), ldda, alpha_, dB(igpu, 0, k+1), lddb );
601  }
602  }
603 
604  for (igpu = 0; igpu < nrgpu; ++igpu){
605  magma_queue_sync( stream[igpu][k%2] );
606  }
607 
608  for (j = 0; j < mbl; ++j){
609  igpu = j%nrgpu;
610  magma_setdevice(igpu);
611  jb = min(nb, m-j*nb);
612  magma_zgetmatrix_async( jb, kb,
613  dB(igpu, j/nrgpu, k), lddb,
614  B(j, k), ldb, stream[igpu][2] );
615  }
616  }
617  }
618  else
619  {
620 
621  //right lower notranspose
622  magma_int_t mloc[MagmaMaxGPUs];
623  for(igpu = 0; igpu < nrgpu; ++igpu)
624  mloc[igpu] = 0;
625 
626  //copy B to mgpus
627  for (j = 0; j < mbl; ++j){
628  igpu = j%nrgpu;
629  magma_setdevice(igpu);
630  jb = min(nb, m-j*nb);
631  mloc[igpu] += jb;
632  magma_zsetmatrix_async( jb, n,
633  B(j, 0), ldb,
634  dB(igpu, j/nrgpu, 0), lddb, stream[igpu][(nbl+1)%2] );
635  }
636  kb = min(nb, n-(nbl-1)*nb);
637  for (igpu = 0; igpu < nrgpu; ++igpu){
638  magma_setdevice(igpu);
639  magma_zsetmatrix_async( kb, n,
640  A(nbl-1, 0), lda,
641  dA(igpu, (nbl-1)%2, 0), ldda, stream[igpu][(nbl+1)%2] );
642  }
643  for (k = nbl-1; k >= 0; --k){
644  if (k > 0){
645  kb = nb;
646  for (igpu = 0; igpu < nrgpu; ++igpu){
647  magma_setdevice(igpu);
648  magma_zsetmatrix_async( kb, k*nb,
649  A(k-1, 0), lda,
650  dA(igpu, (k+1)%2, 0), ldda, stream[igpu][(k+1)%2] );
651  }
652  }
653  if (k==nbl-1)
654  alpha_=alpha;
655  else
656  alpha_= c_one;
657 
658  kb = min(nb, n-k*nb);
659 
660  for (igpu = 0; igpu < nrgpu; ++igpu){
661  magma_setdevice(igpu);
662  magmablasSetKernelStream(stream[igpu][k%2]);
663  magma_ztrsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_, dA(igpu, k%2, k), ldda,
664  dB(igpu, 0, k), lddb );
665  }
666 
667  if (k>0){
668  for (igpu = 0; igpu < nrgpu; ++igpu){
669  magma_setdevice(igpu);
670  magmablasSetKernelStream(stream[igpu][k%2]);
671  magma_zgemm(MagmaNoTrans, transa, mloc[igpu], k*nb, kb, c_neg_one, dB(igpu, 0, k), lddb,
672  dA(igpu, k%2, 0), ldda, alpha_, dB(igpu, 0, 0), lddb );
673  }
674  }
675 
676  for (igpu = 0; igpu < nrgpu; ++igpu){
677  magma_queue_sync( stream[igpu][k%2] );
678  }
679 
680  for (j = 0; j < mbl; ++j){
681  igpu = j%nrgpu;
682  magma_setdevice(igpu);
683  jb = min(nb, m-j*nb);
684  magma_zgetmatrix_async( jb, kb,
685  dB(igpu, j/nrgpu, k), lddb,
686  B(j, k), ldb, stream[igpu][2] );
687  }
688  }
689  }
690  }
691  else
692  {
693 
694  //Form B := alpha*B*inv( A' ).
695 
696  if (upper) {
697 
698  //right upper transpose or conj transpose
699  magma_int_t mloc[MagmaMaxGPUs];
700  for(igpu = 0; igpu < nrgpu; ++igpu)
701  mloc[igpu] = 0;
702 
703  //copy B to mgpus
704  for (j = 0; j < mbl; ++j){
705  igpu = j%nrgpu;
706  magma_setdevice(igpu);
707  jb = min(nb, m-j*nb);
708  mloc[igpu] += jb;
709  magma_zsetmatrix_async( jb, n,
710  B(j, 0), ldb,
711  dB(igpu, j/nrgpu, 0), lddb, stream[igpu][(nbl+1)%2] );
712  }
713  kb = min(nb, n-(nbl-1)*nb);
714  for (igpu = 0; igpu < nrgpu; ++igpu){
715  magma_setdevice(igpu);
716  magma_zsetmatrix_async( n, kb,
717  A(0, nbl-1), lda,
718  dA(igpu, 0, (nbl-1)%2), ldda, stream[igpu][(nbl+1)%2] );
719  }
720  for (k = nbl-1; k >= 0; --k){
721  if (k > 0){
722  kb = nb;
723  for (igpu = 0; igpu < nrgpu; ++igpu){
724  magma_setdevice(igpu);
725  magma_zsetmatrix_async( k*nb, kb,
726  A(0, k-1), lda,
727  dA(igpu, 0, (k+1)%2), ldda, stream[igpu][(k+1)%2] );
728  }
729  }
730  if (k==nbl-1)
731  alpha_=alpha;
732  else
733  alpha_= c_one;
734 
735  kb = min(nb, n-k*nb);
736 
737  for (igpu = 0; igpu < nrgpu; ++igpu){
738  magma_setdevice(igpu);
739  magmablasSetKernelStream(stream[igpu][k%2]);
740  magma_ztrsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_, dA(igpu, k, k%2), ldda,
741  dB(igpu, 0, k), lddb );
742  }
743 
744  if (k>0){
745  for (igpu = 0; igpu < nrgpu; ++igpu){
746  magma_setdevice(igpu);
747  magmablasSetKernelStream(stream[igpu][k%2]);
748  magma_zgemm(MagmaNoTrans, transa, mloc[igpu], k*nb, kb, c_neg_one, dB(igpu, 0, k), lddb,
749  dA(igpu, 0, k%2), ldda, alpha_, dB(igpu, 0, 0), lddb );
750  }
751  }
752 
753  for (igpu = 0; igpu < nrgpu; ++igpu){
754  magma_queue_sync( stream[igpu][k%2] );
755  }
756 
757  for (j = 0; j < mbl; ++j){
758  igpu = j%nrgpu;
759  magma_setdevice(igpu);
760  jb = min(nb, m-j*nb);
761  magma_zgetmatrix_async( jb, kb,
762  dB(igpu, j/nrgpu, k), lddb,
763  B(j, k), ldb, stream[igpu][2] );
764  }
765  }
766  }
767  else
768  {
769 
770  //right lower transpose or conj transpose
771  magma_int_t mloc[MagmaMaxGPUs];
772  for(igpu = 0; igpu < nrgpu; ++igpu)
773  mloc[igpu] = 0;
774 
775  //copy B to mgpus
776  for (j = 0; j < mbl; ++j){
777  igpu = j%nrgpu;
778  magma_setdevice(igpu);
779  jb = min(nb, m-j*nb);
780  mloc[igpu] += jb;
781  magma_zsetmatrix_async( jb, n,
782  B(j, 0), ldb,
783  dB(igpu, j/nrgpu, 0), lddb, stream[igpu][0] );
784  }
785  kb = min(nb, n);
786  for (igpu = 0; igpu < nrgpu; ++igpu){
787  magma_setdevice(igpu);
788  magma_zsetmatrix_async( n, kb,
789  A(0, 0), lda,
790  dA(igpu, 0, 0), ldda, stream[igpu][0] );
791  }
792  for (k = 0; k < nbl; ++k){
793  if ((k+1)*nb < n){
794  kb = min(nb, n-(k+1)*nb);
795  for (igpu = 0; igpu < nrgpu; ++igpu){
796  magma_setdevice(igpu);
797  magma_zsetmatrix_async( (n-(k+1)*nb), kb,
798  A(k+1, k+1), lda,
799  dA(igpu, k+1, (k+1)%2), ldda, stream[igpu][(k+1)%2] );
800  }
801  }
802  kb = min(nb, n-k*nb);
803 
804  if (k==0)
805  alpha_=alpha;
806  else
807  alpha_= c_one;
808 
809  for (igpu = 0; igpu < nrgpu; ++igpu){
810  magma_setdevice(igpu);
811  magmablasSetKernelStream(stream[igpu][k%2]);
812  magma_ztrsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_, dA(igpu, k, k%2), ldda,
813  dB(igpu, 0, k), lddb );
814  }
815 
816  if ( k < nbl-1 ){
817 
818  for (igpu = 0; igpu < nrgpu; ++igpu){
819  magma_setdevice(igpu);
820  magmablasSetKernelStream(stream[igpu][k%2]);
821  magma_zgemm(MagmaNoTrans, transa, mloc[igpu], n-(k+1)*nb, nb, c_neg_one, dB(igpu, 0, k), lddb,
822  dA(igpu, k+1, k%2), ldda, alpha_, dB(igpu, 0, k+1), lddb );
823  }
824  }
825 
826  for (igpu = 0; igpu < nrgpu; ++igpu){
827  magma_queue_sync( stream[igpu][k%2] );
828  }
829 
830  for (j = 0; j < mbl; ++j){
831  igpu = j%nrgpu;
832  magma_setdevice(igpu);
833  jb = min(nb, m-j*nb);
834  magma_zgetmatrix_async( jb, kb,
835  dB(igpu, j/nrgpu, k), lddb,
836  B(j, k), ldb, stream[igpu][2] );
837  }
838  }
839  }
840  }
841 
842  }
843 
844 
845  for (igpu = 0; igpu < nrgpu; ++igpu){
846  magma_setdevice(igpu);
847  magmablasSetKernelStream(NULL);
848  magma_queue_sync( stream[igpu][2] );
849  magma_queue_destroy( stream[igpu][0] );
850  magma_queue_destroy( stream[igpu][1] );
851  magma_queue_destroy( stream[igpu][2] );
852  magma_free( dw[igpu] );
853  }
854 
855  magma_setdevice(gpu_b);
856 
857  return info;
858 
859 } /* magma_ztrsm_m */
#define min(a, b)
Definition: common_magma.h:86
#define magma_queue_create(queuePtr)
Definition: magma.h:113
#define __func__
Definition: common_magma.h:65
#define magma_zgetmatrix_async(m, n, dA_src, ldda, hB_dst, ldb, queue)
Definition: magmablas_z.h:714
#define MAGMA_Z_NEG_ONE
Definition: magma.h:134
static magma_err_t magma_zmalloc(magmaDoubleComplex_ptr *ptrPtr, size_t n)
Definition: magma.h:80
#define MAGMA_ERR_DEVICE_ALLOC
Definition: magma_types.h:276
magma_int_t magma_get_ztrsm_m_nb()
Definition: ztrsm_m.cpp:15
#define dB(gpui, i, j)
Definition: ztrsm_m.cpp:20
#define magma_free(ptr)
Definition: magma.h:57
int magma_int_t
Definition: magmablas.h:12
#define MAGMA_Z_IMAG(a)
Definition: magma.h:125
#define magma_queue_destroy(queue)
Definition: magma.h:116
void magma_zgemm(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex alpha, magmaDoubleComplex_const_ptr dA, magma_int_t ldda, magmaDoubleComplex_const_ptr dB, magma_int_t lddb, magmaDoubleComplex beta, magmaDoubleComplex_ptr dC, magma_int_t lddc)
void magma_ztrsm(magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, magmaDoubleComplex alpha, magmaDoubleComplex_const_ptr dA, magma_int_t ldda, magmaDoubleComplex_ptr dB, magma_int_t lddb)
cublasStatus_t magmablasSetKernelStream(magma_queue_t stream)
void magma_setdevice(magma_device_t dev)
void magma_getdevice(magma_device_t *dev)
void magma_xerbla(const char *srname, magma_int_t info)
Definition: xerbla.cpp:8
#define MagmaMaxGPUs
Definition: magma_types.h:255
#define lapackf77_lsame
Definition: magma_lapack.h:23
magma_int_t ldda
#define B(i, j)
Definition: ztrsm_m.cpp:18
#define MAGMA_SUCCESS
Definition: magma.h:106
#define A(i, j)
Definition: ztrsm_m.cpp:17
#define magma_zsetmatrix_async(m, n, hA_src, lda, dB_dst, lddb, queue)
Definition: magmablas_z.h:711
#define MAGMA_Z_ONE
Definition: magma.h:132
#define MagmaNoTrans
Definition: magma.h:57
#define max(a, b)
Definition: common_magma.h:82
#define dA(gpui, i, j)
Definition: ztrsm_m.cpp:22
#define MAGMA_Z_REAL(a)
Definition: magma.h:124
#define magma_queue_sync(queue)
Definition: magma.h:119

Here is the call graph for this function:

Here is the caller graph for this function: