MAGMA  magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
spotrf3_mgpu.cpp File Reference
#include "common_magma.h"
#include "trace.h"
Include dependency graph for spotrf3_mgpu.cpp:

Go to the source code of this file.

Macros

#define PRECISION_s
 
#define STRSM_WORK
 
#define Alo(i, j)   (a + ((j)+off_j)*lda + (nb*(((i)/nb)%h)+off_i))
 
#define Aup(i, j)   (a +(nb*(((j)/nb)%h)+off_j)*lda + (i+off_i))
 
#define dlA(id, i, j)   (d_lA[(id)] + (j)*ldda + (i))
 
#define dlP(id, i, j, k)   (d_lP[(id)] + (k)*nb*lddp + (j)*lddp + (i))
 
#define dlPT(id, i, j, k)   (d_lP[(id)] + (k)*nb*lddp + (j)*nb + (i))
 
#define dinvA(d, j)   &(d_dinvA[(d)][(j)*trsm_nb*trsm_n])
 
#define dx(d, j)   &(d_x[(d)][(j)*nb*m])
 
#define A(i, j)   (a +(j)*lda + (i))
 
#define dA(d, i, j)   (dwork[(d)]+(j)*ldda + (i))
 

Functions

magma_int_t magma_spotrf3_mgpu (magma_int_t num_gpus, char uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, float *d_lA[], magma_int_t ldda, float *d_lP[], magma_int_t lddp, float *a, magma_int_t lda, magma_int_t h, magma_queue_t stream[][3], magma_event_t event[][5], magma_int_t *info)
 
magma_int_t magma_shtodpo (magma_int_t num_gpus, char *uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, float *a, magma_int_t lda, float *dwork[], magma_int_t ldda, magma_queue_t stream[][3], magma_int_t *info)
 
magma_int_t magma_sdtohpo (magma_int_t num_gpus, char *uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magma_int_t NB, float *a, magma_int_t lda, float *dwork[], magma_int_t ldda, magma_queue_t stream[][3], magma_int_t *info)
 

Macro Definition Documentation

#define A (   i,
  j 
)    (a +(j)*lda + (i))

Definition at line 703 of file spotrf3_mgpu.cpp.

#define Alo (   i,
  j 
)    (a + ((j)+off_j)*lda + (nb*(((i)/nb)%h)+off_i))

Definition at line 24 of file spotrf3_mgpu.cpp.

#define Aup (   i,
  j 
)    (a +(nb*(((j)/nb)%h)+off_j)*lda + (i+off_i))

Definition at line 25 of file spotrf3_mgpu.cpp.

#define dA (   d,
  i,
  j 
)    (dwork[(d)]+(j)*ldda + (i))

Definition at line 704 of file spotrf3_mgpu.cpp.

#define dinvA (   d,
  j 
)    &(d_dinvA[(d)][(j)*trsm_nb*trsm_n])
#define dlA (   id,
  i,
  j 
)    (d_lA[(id)] + (j)*ldda + (i))

Definition at line 27 of file spotrf3_mgpu.cpp.

#define dlP (   id,
  i,
  j,
  k 
)    (d_lP[(id)] + (k)*nb*lddp + (j)*lddp + (i))

Definition at line 28 of file spotrf3_mgpu.cpp.

#define dlPT (   id,
  i,
  j,
  k 
)    (d_lP[(id)] + (k)*nb*lddp + (j)*nb + (i))

Definition at line 29 of file spotrf3_mgpu.cpp.

#define dx (   d,
  j 
)    &(d_x[(d)][(j)*nb*m])
#define PRECISION_s

Definition at line 15 of file spotrf3_mgpu.cpp.

#define STRSM_WORK

Definition at line 18 of file spotrf3_mgpu.cpp.

Function Documentation

magma_int_t magma_sdtohpo ( magma_int_t  num_gpus,
char *  uplo,
magma_int_t  m,
magma_int_t  n,
magma_int_t  off_i,
magma_int_t  off_j,
magma_int_t  nb,
magma_int_t  NB,
float *  a,
magma_int_t  lda,
float *  dwork[],
magma_int_t  ldda,
magma_queue_t  stream[][3],
magma_int_t *  info 
)

Definition at line 766 of file spotrf3_mgpu.cpp.

References A, dA, lapackf77_lsame, magma_queue_sync, magma_setdevice(), magma_sgetmatrix_async, and min.

771 {
772  magma_int_t k;
773  if( lapackf77_lsame(uplo, "U") ) {
774  magma_int_t j, jj, jb, mj;
775 
776  /* go through each column */
777  for (j=off_j+NB; j<n; j+=nb) {
778  jj = (j-off_j)/(nb*num_gpus);
779  k = ((j-off_j)/nb)%num_gpus;
780 
781  jb = min(nb, (n-j));
782  if(j+jb < off_j+m)
783  mj = (j-off_i)+jb;
784  else
785  mj = m;
786 
787  magma_setdevice(k);
788  magma_sgetmatrix_async( mj, jb,
789  dA(k, 0, jj*nb), ldda,
790  A(off_i, j), lda,
791  stream[k][0] );
792  magma_queue_sync( stream[k][0] );
793  }
794  } else {
795  magma_int_t i, ii, ib, ni;
796 
797  /* go through each row */
798  for(i=off_i+NB; i<m; i+=nb){
799  ii = (i-off_i)/(nb*num_gpus);
800  k = ((i-off_i)/nb)%num_gpus;
801 
802  ib = min(nb, (m-i));
803  if(i+ib < off_i+n)
804  ni = (i-off_i)+ib;
805  else
806  ni = n;
807 
808  magma_setdevice(k);
809  magma_sgetmatrix_async( ib, ni,
810  dA(k, ii*nb, 0), ldda,
811  A(i, off_j), lda,
812  stream[k][0] );
813  magma_queue_sync( stream[k][0] );
814  }
815  }
816  /*for( k=0; k<num_gpus; k++ ) {
817  magma_setdevice(k);
818  magma_queue_sync( stream[k][0] );
819  }*/
820  magma_setdevice(0);
821 
822  return *info;
823 }
#define min(a, b)
Definition: common_magma.h:86
#define dA(d, i, j)
int magma_int_t
Definition: magmablas.h:12
#define magma_sgetmatrix_async(m, n, dA_src, ldda, hB_dst, ldb, queue)
Definition: magmablas_s.h:714
magma_int_t ldda
void magma_setdevice(magma_device_t dev)
#define lapackf77_lsame
Definition: magma_lapack.h:23
#define A(i, j)
#define magma_queue_sync(queue)
Definition: magma.h:119

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_shtodpo ( magma_int_t  num_gpus,
char *  uplo,
magma_int_t  m,
magma_int_t  n,
magma_int_t  off_i,
magma_int_t  off_j,
magma_int_t  nb,
float *  a,
magma_int_t  lda,
float *  dwork[],
magma_int_t  ldda,
magma_queue_t  stream[][3],
magma_int_t *  info 
)

Definition at line 707 of file spotrf3_mgpu.cpp.

References A, dA, lapackf77_lsame, magma_queue_sync, magma_setdevice(), magma_ssetmatrix_async, and min.

712 {
713  magma_int_t k;
714  if( lapackf77_lsame(uplo, "U") ) {
715  magma_int_t j, jj, jb, mj;
716 
717  /* go through each column */
718  for (j=off_j; j<n; j+=nb) {
719  jj = (j-off_j)/(nb*num_gpus);
720  k = ((j-off_j)/nb)%num_gpus;
721 
722  jb = min(nb, (n-j));
723  if(j+jb < off_j+m)
724  mj = (j-off_i)+jb;
725  else
726  mj = m;
727 
728  magma_setdevice(k);
729  magma_ssetmatrix_async( mj, jb,
730  A(off_i, j), lda,
731  dA(k, 0, jj*nb), ldda,
732  stream[k][0] );
733  }
734  }
735  else {
736  magma_int_t i, ii, ib, ni;
737 
738  /* go through each row */
739  for(i=off_i; i<m; i+=nb){
740  ii = (i-off_i)/(nb*num_gpus);
741  k = ((i-off_i)/nb)%num_gpus;
742 
743  ib = min(nb, (m-i));
744  if(i+ib < off_i+n)
745  ni = (i-off_i)+ib;
746  else
747  ni = n;
748 
749  magma_setdevice(k);
750  magma_ssetmatrix_async( ib, ni,
751  A(i, off_j), lda,
752  dA(k, ii*nb, 0), ldda,
753  stream[k][0] );
754  }
755  }
756  for( k=0; k<num_gpus; k++ ) {
757  magma_setdevice(k);
758  magma_queue_sync( stream[k][0] );
759  }
760  magma_setdevice(0);
761 
762  return *info;
763 }
#define min(a, b)
Definition: common_magma.h:86
#define dA(d, i, j)
int magma_int_t
Definition: magmablas.h:12
magma_int_t ldda
void magma_setdevice(magma_device_t dev)
#define lapackf77_lsame
Definition: magma_lapack.h:23
#define magma_ssetmatrix_async(m, n, hA_src, lda, dB_dst, lddb, queue)
Definition: magmablas_s.h:711
#define A(i, j)
#define magma_queue_sync(queue)
Definition: magma.h:119

Here is the call graph for this function:

Here is the caller graph for this function:

magma_int_t magma_spotrf3_mgpu ( magma_int_t  num_gpus,
char  uplo,
magma_int_t  m,
magma_int_t  n,
magma_int_t  off_i,
magma_int_t  off_j,
magma_int_t  nb,
float *  d_lA[],
magma_int_t  ldda,
float *  d_lP[],
magma_int_t  lddp,
float *  a,
magma_int_t  lda,
magma_int_t  h,
magma_queue_t  stream[][3],
magma_event_t  event[][5],
magma_int_t *  info 
)

Definition at line 33 of file spotrf3_mgpu.cpp.

References __func__, Alo, Aup, dinvA, dlA, dlP, dlPT, dx, lapackf77_lsame, lapackf77_spotrf(), sgehrd_data::ldda, MAGMA_ERR_DEVICE_ALLOC, magma_event_record(), magma_free, magma_queue_sync, magma_queue_wait_event(), MAGMA_S_NEG_ONE, MAGMA_S_ONE, magma_scopymatrix_async, magma_setdevice(), magma_sgemm(), magma_sgetmatrix_async, magma_smalloc(), magma_ssetmatrix_async, magma_ssyrk(), magma_strsm(), MAGMA_SUCCESS, magma_xerbla(), magmablas_slaset(), magmablas_strsm_work(), magmablasSetKernelStream(), MagmaLeft, MagmaLower, MagmaLowerStr, MagmaMaxGPUs, MagmaNonUnit, MagmaNoTrans, MagmaRight, MagmaTrans, MagmaUpper, MagmaUpperLower, MagmaUpperStr, max, min, trace_cpu_end, trace_cpu_start, trace_finalize, trace_gpu_end, trace_gpu_start, and trace_init.

40 {
41 /* -- MAGMA (version 1.4.0) --
42  Univ. of Tennessee, Knoxville
43  Univ. of California, Berkeley
44  Univ. of Colorado, Denver
45  August 2013
46 
47  Purpose
48  =======
49  SPOTRF computes the Cholesky factorization of a real symmetric
50  positive definite matrix dA.
51  Auxiliary subroutine for spotrf2_ooc. It is multiple gpu interface to compute
52  Cholesky of a "rectangular" matrix.
53 
54  The factorization has the form
55  dA = U**T * U, if UPLO = 'U', or
56  dA = L * L**T, if UPLO = 'L',
57  where U is an upper triangular matrix and L is lower triangular.
58 
59  This is the block version of the algorithm, calling Level 3 BLAS.
60 
61  Arguments
62  =========
63  UPLO (input) CHARACTER*1
64  = 'U': Upper triangle of dA is stored;
65  = 'L': Lower triangle of dA is stored.
66 
67  N (input) INTEGER
68  The order of the matrix dA. N >= 0.
69 
70  dA (input/output) REAL array on the GPU, dimension (LDDA,N)
71  On entry, the symmetric matrix dA. If UPLO = 'U', the leading
72  N-by-N upper triangular part of dA contains the upper
73  triangular part of the matrix dA, and the strictly lower
74  triangular part of dA is not referenced. If UPLO = 'L', the
75  leading N-by-N lower triangular part of dA contains the lower
76  triangular part of the matrix dA, and the strictly upper
77  triangular part of dA is not referenced.
78 
79  On exit, if INFO = 0, the factor U or L from the Cholesky
80  factorization dA = U**T * U or dA = L * L**T.
81 
82  LDDA (input) INTEGER
83  The leading dimension of the array dA. LDDA >= max(1,N).
84  To benefit from coalescent memory accesses LDDA must be
85  dividable by 16.
86 
87  INFO (output) INTEGER
88  = 0: successful exit
89  < 0: if INFO = -i, the i-th argument had an illegal value
90  > 0: if INFO = i, the leading minor of order i is not
91  positive definite, and the factorization could not be
92  completed.
93  ===================================================================== */
94 
95 
96  magma_int_t j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf;
97  char uplo_[2] = {uplo, 0};
98  float c_one = MAGMA_S_ONE;
99  float c_neg_one = MAGMA_S_NEG_ONE;
100  float d_one = 1.0;
101  float d_neg_one = -1.0;
102  int upper = lapackf77_lsame(uplo_, "U");
103  float *dlpanel;
104  magma_int_t n_local[MagmaMaxGPUs], ldpanel;
105  const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2;
106 #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
107  /* used by strsm_work */
108  int trsm_nb = 128;
109  int trsm_n = trsm_nb*((nb+trsm_nb-1)/trsm_nb);
110  float *d_dinvA[MagmaMaxGPUs];
111  float *d_x[MagmaMaxGPUs];
112  #define dinvA(d,j) &(d_dinvA[(d)][(j)*trsm_nb*trsm_n])
113  #define dx(d,j) &(d_x[(d)][(j)*nb*m])
114  /*
115  * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
116  */
117  for( d=0; d<num_gpus; d++ ) {
118  magma_setdevice(d);
119  if ( (MAGMA_SUCCESS != magma_smalloc( &d_dinvA[d], 2*trsm_nb*trsm_n )) ||
120  (MAGMA_SUCCESS != magma_smalloc( &d_x[d], 2*nb*(upper ? n : m) )) ) {
121  *info = MAGMA_ERR_DEVICE_ALLOC;
122  return *info;
123  }
124  }
125  magma_setdevice(0);
126 #endif
127 
128  *info = 0;
129  if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
130  *info = -1;
131  } else if (n < 0) {
132  *info = -2;
133  } else if (!upper && num_gpus*ldda < max(1,n)) {
134  *info = -4;
135  } else if (upper && ldda < max(1,m)) {
136  *info = -4;
137  }
138  if (*info != 0) {
139  magma_xerbla( __func__, -(*info) );
140  return *info;
141  }
142 
143  /* initialization */
144  for( d=0; d<num_gpus; d++ ) {
145  /* local-n and local-ld */
146  if (upper) {
147  n_local[d] = (n/(nb*num_gpus))*nb;
148  if (d < (n/nb)%num_gpus)
149  n_local[d] += nb;
150  else if (d == (n/nb)%num_gpus)
151  n_local[d] += n%nb;
152  } else {
153  n_local[d] = (m/(nb*num_gpus))*nb;
154  if (d < (m/nb)%num_gpus)
155  n_local[d] += nb;
156  else if (d == (m/nb)%num_gpus)
157  n_local[d] += m%nb;
158  }
159  }
160 
161  /* == initialize the trace */
162  trace_init( 1, num_gpus, 3, (CUstream_st**)stream );
163 
164  if (upper)
165  {
166  /* ---------------------------------------------- */
167  /* Upper-triangular case */
168  /* > Compute the Cholesky factorization A = U'*U. */
169  /* ---------------------------------------------- */
170  for (j=0; j<m; j+=nb) {
171 
172  /* Set the GPU number that holds the current panel */
173  id = (j/nb)%num_gpus;
174  buf = (j/nb)%num_gpus; // right now, we have num_gpu buffers, so id and buf are the same..
175 
176  /* Set the local index where the current panel is */
177  j_local = j/(nb*num_gpus);
178  jb = min(nb, (m-j));
179 
180  /* Update the current diagonal block on stream1 */
181  magma_setdevice(id);
182  if( j > 0 ) {
183  magmablasSetKernelStream(stream[id][stream1]);
184  trace_gpu_start( id, stream1, "syrk", "syrk" );
186  d_neg_one, dlA(id, 0, nb*j_local), ldda,
187  d_one, dlA(id, j, nb*j_local), ldda);
188  trace_gpu_end( id, stream1 );
189  }
190 
191  /* send the diagonal to cpu on stream1 */
192  trace_gpu_start( id, stream1, "comm", "D to CPU" );
193  magma_sgetmatrix_async( jb, jb,
194  dlA(id, j, nb*j_local), ldda,
195  Aup(j,j), lda,
196  stream[id][stream1] );
197  trace_gpu_end( id, stream1 );
198 
199  /* update off-diagonal blocks in the panel */
200  if( j > 0 ) {
201  d = (j/nb+1)%num_gpus;
202  for( dd=0; dd<num_gpus; dd++ ) {
203  j_local2 = j_local+1;
204  if( d > id ) j_local2 --;
205  nb0 = nb*j_local2; // number of local columns in the panel, while jb is panel-size (number of rows)
206 
207  if( n_local[d] > nb0 ) {
208  magma_setdevice(d);
209  magmablasSetKernelStream(stream[d][stream2]);
210  if( d == id ) {
211  dlpanel = dlA(d,0,nb*j_local);
212  ldpanel = ldda;
213  // the GPU owns the row from start, and no need of synch.
214  //magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
215  } else {
216  dlpanel = dlP(d,nb,0,buf);
217  ldpanel = lddp;
218  magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
219  }
220  trace_gpu_start( d, stream2, "gemm", "gemm" );
222  jb, n_local[d]-nb0, j,
223  c_neg_one, dlpanel, ldpanel,
224  dlA(d, 0, nb0), ldda,
225  c_one, dlA(d, j, nb0), ldda);
226  trace_gpu_end( d, stream2 );
227  magma_event_record( event[d][2], stream[d][stream2] );
228  }
229  d = (d+1)%num_gpus;
230  }
231  }
232 
233  /* wait for panel and factorize it on cpu */
234  magma_setdevice(id);
235  magma_queue_sync( stream[id][stream1] );
236  trace_cpu_start( 0, "getrf", "getrf" );
237  lapackf77_spotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
238  trace_cpu_end( 0 );
239  if (*info != 0) {
240  *info = *info + j;
241  break;
242  }
243 
244  /* send the diagonal to gpus on stream1 */
245  if ( (j+jb) < n) {
246  d = (j/nb+1)%num_gpus;
247  for( dd=0; dd<num_gpus; dd++ ) {
248  if( d == id ) {
249  dlpanel = dlA(d, j, nb*j_local);
250  ldpanel = ldda;
251  } else {
252  dlpanel = dlP(d,0,0,buf);
253  ldpanel = lddp;
254  }
255  magma_setdevice(d);
256  trace_gpu_start( d, stream1, "comm", "comm" );
257  magma_ssetmatrix_async( jb, jb,
258  Aup(j,j), lda,
259  dlpanel, ldpanel,
260  stream[d][stream1] );
261  trace_gpu_end( d, stream1 );
262  magma_event_record( event[d][1], stream[d][stream1] );
263  d = (d+1)%num_gpus;
264  }
265  } else {
266  magma_setdevice(id);
267  trace_gpu_start( id, stream1, "comm", "comm" );
268  magma_ssetmatrix_async( jb, jb,
269  Aup(j,j), lda,
270  dlA(id, j, nb*j_local), ldda,
271  stream[id][stream1] );
272  trace_gpu_end( id, stream1 );
273  }
274 
275  /* panel-factorize the off-diagonal */
276  if ( (j+jb) < n) {
277  d = (j/nb+1)%num_gpus;
278  for( dd=0; dd<num_gpus; dd++ ) {
279  /* next column */
280  j_local2 = j_local+1;
281  if( d > id ) j_local2--;
282  if( d == id ) {
283  dlpanel = dlA(d,j,nb*j_local);
284  ldpanel = ldda;
285  } else {
286  dlpanel = dlP(d,0,0,buf);
287  ldpanel = lddp;
288  }
289  nb2 = n_local[d] - j_local2*nb;
290 
291  magma_setdevice(d);
292  if( j+jb < m && d == (j/nb+1)%num_gpus ) {
293  /* owns the next column, look-ahead next block on stream1 */
294  nb0 = min(nb, nb2);
295  magmablasSetKernelStream(stream[d][stream1]);
296  magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update
297  trace_gpu_start( d, stream1, "trsm", "trsm" );
298 #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
299  magmablas_slaset( MagmaUpperLower, trsm_nb, trsm_n, dinvA(d,0),trsm_nb );
300  magmablas_slaset( MagmaUpperLower, nb0,jb, dx(d,0),nb0 );
303  jb, nb0, c_one,
304  dlpanel, ldpanel,
305  dlA(d, j, nb*j_local2), ldda,
306  1, dinvA(d,0), dx(d,0) );
307 #else
310  jb, nb0, c_one,
311  dlpanel, ldpanel,
312  dlA(d, j, nb*j_local2), ldda);
313 #endif
314  magma_event_record( event[d][4], stream[d][stream1] );
315  trace_gpu_end( d, stream1 );
316  } else if( nb2 > 0 ) {
317  /* update all the blocks on stream2 */
318  magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor
319  trace_gpu_start( d, stream2, "trsm", "trsm" );
320  magmablasSetKernelStream(stream[d][stream2]);
321 #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
322  magmablas_slaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,0),trsm_nb );
323  magmablas_slaset( MagmaUpperLower, nb2,jb, dx(d,0),nb2 );
326  jb, nb2, c_one,
327  dlpanel, ldpanel,
328  dlA(d, j, nb*j_local2), ldda,
329  1, dinvA(d,0), dx(d,0) );
330 #else
333  jb, nb2, c_one,
334  dlpanel, ldpanel,
335  dlA(d, j, nb*j_local2), ldda);
336 #endif
337  trace_gpu_end( d, stream2 );
338  }
339  d = (d+1)%num_gpus;
340  } /* end of for */
341 
342  /* ========================================================== */
343  if( j+jb < m ) {
344  d = (j/nb+1)%num_gpus;
345  /* next column */
346  j_local2 = j_local+1;
347  if( d > id ) j_local2--;
348  nb0 = min(nb, n_local[d]-nb*j_local2 );
349 
350  /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end). *
351  * so we have the Cholesky factor, but only diagonal submatrix of the big panel, *
352  * on cpu at the end. */
353  int d2, buf2;
354  magma_setdevice(d);
355  /* lookahead done */
356  magma_queue_wait_event( stream[d][stream3], event[d][4] );
357 
358  trace_gpu_start( d, stream3, "comm", "row to CPU" );
359  magma_sgetmatrix_async( (j+jb), nb0,
360  dlA(d, 0, nb*j_local2), ldda,
361  Aup(0,j+jb), lda,
362  stream[d][stream3] );
363  trace_gpu_end( d, stream3 );
364  magma_event_record( event[d][3], stream[d][stream3] );
365  /* needed on pluto */
366  //magma_queue_sync( stream[d][stream3] );
367 
368  /* broadcast rows to gpus on stream2 */
369  buf2 = ((j+jb)/nb)%num_gpus;
370  for( d2=0; d2<num_gpus; d2++ ) {
371  if( d2 != d )
372  {
373  magma_setdevice(d2);
374  trace_gpu_start( d2, stream3, "comm", "row to GPUs" );
375  magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // rows arrived at cpu on stream3
376  magma_ssetmatrix_async( j+jb, nb0,
377  Aup(0,j+jb), lda,
378  dlP(d2,nb,0,buf2), lddp,
379  stream[d2][stream3] );
380  trace_gpu_end( d2, stream3 );
381  magma_event_record( event[d2][0], stream[d2][stream3] );
382  }
383  }
384 
385  /* =========================== */
386  /* update the remaining blocks */
387  nb2 = n_local[d]-(nb*j_local2 + nb0);
388  if( nb2 > 0 ) {
389  if( d == id ) {
390  dlpanel = dlA(d, j, nb*j_local);
391  ldpanel = ldda;
392  } else {
393  dlpanel = dlP(d,0,0,buf);
394  ldpanel = lddp;
395  }
396  magma_setdevice(d);
397  magmablasSetKernelStream(stream[d][stream2]);
398  trace_gpu_start( d, stream2, "trsm", "trsm" );
399 #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
400  int flag = 0;
401  if (flag == 0) {
402  magma_queue_wait_event( stream[d][stream2], event[d][4] ); // lookahead -> diagonal inversion
403  } else {
404  magmablas_slaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,flag),trsm_nb );
405  magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
406  }
407  magmablas_slaset( MagmaUpperLower, nb2,jb, dx(d,1),nb2 );
409  jb, nb2, c_one,
410  dlpanel, ldpanel,
411  dlA(d, j, nb*j_local2+nb0), ldda,
412  flag, dinvA(d,flag), dx(d,1) );
413 #else
414  magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor
416  jb, nb2, c_one,
417  dlpanel, ldpanel,
418  dlA(d, j, nb*j_local2+nb0), ldda);
419 #endif
420  trace_gpu_end( d, stream2 );
421  }
422  }
423  } /* end of strsm */
424  } /* end of for j=1, .., n */
425  } else {
426 
427  /* ---------------------------------------------- */
428  /* Lower-triangular case */
429  /* > Compute the Cholesky factorization A = L*L'. */
430  /* ---------------------------------------------- */
431  for (j=0; j<n; j+=nb) {
432 
433  /* Set the GPU number that holds the current panel */
434  id = (j/nb)%num_gpus;
435  buf = (j/nb)%num_gpus;
436 
437  /* Set the local index where the current panel is */
438  j_local = j/(nb*num_gpus);
439  jb = min(nb, (n-j));
440 
441  /* Update the current diagonal block on stream1 */
442  magma_setdevice(id);
443  if( j > 0 ) {
444  magmablasSetKernelStream(stream[id][stream1]);
446  d_neg_one, dlA(id, nb*j_local, 0), ldda,
447  d_one, dlA(id, nb*j_local, j), ldda);
448  }
449 
450  /* send the diagonal to cpu on stream1 */
451  magma_sgetmatrix_async( jb, jb,
452  dlA(id, nb*j_local, j), ldda,
453  Alo(j,j), lda,
454  stream[id][stream1] );
455 
456  /* update off-diagonal blocks of the panel */
457  if( j > 0 ) {
458  d = (j/nb+1)%num_gpus;
459  for( dd=0; dd<num_gpus; dd++ ) {
460  j_local2 = j_local+1;
461  if( d > id ) j_local2 --;
462  nb0 = nb*j_local2;
463 
464  if( nb0 < n_local[d] ) {
465  magma_setdevice(d);
466  magmablasSetKernelStream(stream[d][stream2]);
467  if( d == id ) {
468  dlpanel = dlA(d, nb*j_local, 0);
469  ldpanel = ldda;
470  } else {
471  dlpanel = dlPT(d,0,nb,buf);
472  ldpanel = nb;
473  magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
474  }
476  n_local[d]-nb0, jb, j,
477  c_neg_one, dlA(d, nb0, 0), ldda,
478  dlpanel, ldpanel,
479  c_one, dlA(d, nb0, j), ldda);
480  magma_event_record( event[d][2], stream[d][stream2] );
481  }
482  d = (d+1)%num_gpus;
483  }
484  }
485 
 486  /* wait for the panel and factorize it on cpu */
487  magma_setdevice(id);
488  magma_queue_sync( stream[id][stream1] );
489  lapackf77_spotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
490  if (*info != 0) {
491  *info = *info + j;
492  break;
493  }
494 
495  /* send the diagonal to gpus on stream1 */
496  if ( (j+jb) < m) {
497  d = (j/nb+1)%num_gpus;
498  for( dd=0; dd<num_gpus; dd++ ) {
499  if( d == id ) {
500  dlpanel = dlA(d, nb*j_local, j);
501  ldpanel = ldda;
502  } else {
503  dlpanel = dlPT(d, 0, 0, buf);
504  ldpanel = nb;
505  }
506  magma_setdevice(d);
507  magma_ssetmatrix_async( jb, jb,
508  Alo(j,j), lda,
509  dlpanel, ldpanel,
510  stream[d][stream1] );
511  magma_event_record( event[d][1], stream[d][stream1] );
512  d = (d+1)%num_gpus;
513  }
514  } else {
515  magma_setdevice(id);
516  magma_ssetmatrix_async( jb, jb,
517  Alo(j,j), lda,
518  dlA(id, nb*j_local, j), ldda,
519  stream[id][stream1] );
520  }
521 
522  /* panel factorize the off-diagonal */
523  if ( (j+jb) < m) {
524  d = (j/nb+1)%num_gpus;
525  for( dd=0; dd<num_gpus; dd++ ) {
526  /* next column */
527  j_local2 = j_local+1;
528  if( d > id ) j_local2--;
529  if( d == id ) {
530  dlpanel = dlA(d, nb*j_local, j);
531  ldpanel = ldda;
532  } else {
533  dlpanel = dlPT(d, 0, 0, buf);
534  ldpanel = nb;
535  }
536  nb2 = n_local[d] - j_local2*nb;
537  nb0 = min(nb, nb2);
538 
539  magma_setdevice(d);
540  if( j+nb < n && d == (j/nb+1)%num_gpus ) { /* owns next column, look-ahead next block on stream1 */
541  if ( j > 0 ) magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update
542  magmablasSetKernelStream(stream[d][stream1]);
543 #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
544  magmablas_slaset( MagmaUpperLower, trsm_nb, trsm_n, dinvA(d,0),trsm_nb );
545  magmablas_slaset( MagmaUpperLower, nb0,jb, dx(d,0),nb0 );
548  nb0, jb, c_one,
549  dlpanel, ldpanel,
550  dlA(d, nb*j_local2, j), ldda,
551  1, dinvA(d,0), dx(d,0) );
552 #else
555  nb0, jb, c_one,
556  dlpanel, ldpanel,
557  dlA(d, nb*j_local2, j), ldda);
558 #endif
559  magma_event_record( event[d][4], stream[d][stream1] );
560  } else if( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */
561  /* update the entire column */
562  magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for the cholesky factor
563  magmablasSetKernelStream(stream[d][stream2]);
564 #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
565  magmablas_slaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,0),trsm_nb );
566  magmablas_slaset( MagmaUpperLower, nb2,jb, dx(d,0),nb2 );
568  nb2, jb, c_one,
569  dlpanel, ldpanel,
570  dlA(d, nb*j_local2, j), ldda,
571  1, dinvA(d,0), dx(d,0) );
572 #else
574  nb2, jb, c_one,
575  dlpanel, ldpanel,
576  dlA(d, nb*j_local2, j), ldda);
577 #endif
578  }
579  d = (d+1)%num_gpus;
580  } /* end for d */
581 
582  /* ========================================================== */
583  if( j+jb < n ) {
584  d = (j/nb+1)%num_gpus;
585  /* next column */
586  j_local2 = j_local+1;
587  if( d > id ) j_local2--;
588  nb0 = min(nb, n_local[d]-nb*j_local2 );
589 
590  /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize). */
591  /* so we have the Cholesky factor on cpu at the end. */
592  int d2, buf2;
593 //#define SPOTRF_DEVICE_TO_DEVICE
594 #ifdef SPOTRF_DEVICE_TO_DEVICE
595  // lookahead done
596 
597  /* broadcast the rows to gpus */
598  buf2 = ((j+jb)/nb)%num_gpus;
599  for( d2=0; d2<num_gpus; d2++ ) {
600  magma_setdevice(d2);
601  magma_queue_wait_event( stream[d2][stream3], event[d][4] );
602  if( d2 != d ) {
603  magma_scopymatrix_async( nb0, j+jb,
604  dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block
605  dlA(d, nb*j_local2, 0), ldda,
606  stream[d2][stream3] );
607  magma_event_record( event[d2][0], stream[d2][stream3] );
608  } else {
609  magma_sgetmatrix_async( nb0, j+jb,
610  dlA(d, nb*j_local2, 0), ldda,
611  Alo(j+jb,0), lda,
612  stream[d][stream3] );
613  }
614  }
615 #else
616  // lookahead done
617  magma_setdevice(d);
618  magma_queue_wait_event( stream[d][stream3], event[d][4] );
619  magma_sgetmatrix_async( nb0, j+jb,
620  dlA(d, nb*j_local2, 0), ldda,
621  Alo(j+jb,0), lda,
622  stream[d][stream3] );
623  magma_event_record( event[d][3], stream[d][stream3] );
624  /* syn on rows on CPU, seem to be needed on Pluto */
625  //magma_queue_sync( stream[d][stream3] );
626 
627  /* broadcast the rows to gpus */
628  buf2 = ((j+jb)/nb)%num_gpus;
629  for( d2=0; d2<num_gpus; d2++ ) {
630  if( d2 != d )
631  {
632  magma_setdevice(d2);
633  magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // getmatrix done
634  magma_ssetmatrix_async( nb0, j+jb,
635  Alo(j+jb,0), lda,
636  dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block
637  stream[d2][stream3] );
638  magma_event_record( event[d2][0], stream[d2][stream3] );
639  }
640  }
641 #endif
642  /* =================================== */
643  /* updates remaining blocks on stream2 */
644  nb2 = n_local[d] - (j_local2*nb + nb0);
645  if( nb2 > 0 ) {
646  if( d == id ) {
647  dlpanel = dlA(d, nb*j_local, j);
648  ldpanel = ldda;
649  } else {
650  dlpanel = dlPT(d,0,0,buf);
651  ldpanel = nb;
652  }
653  magma_setdevice(d);
654  magmablasSetKernelStream(stream[d][stream2]);
655  /* update the remaining blocks in the column */
656 #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
657  int flag = 0;
658  if (flag == 0) {
659  magma_queue_wait_event( stream[d][stream2], event[d][4] ); // lookahead -> diagonal inversion
660  } else {
661  magmablas_slaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,flag),trsm_nb );
662  magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
663  }
664  magmablas_slaset( MagmaUpperLower, nb2,jb, dx(d,1),nb2 );
666  nb2, jb, c_one,
667  dlpanel, ldpanel,
668  dlA(d, nb*j_local2+nb0, j), ldda,
669  flag, dinvA(d,flag), dx(d,1) );
670 #else
671  magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
673  nb2, jb, c_one,
674  dlpanel, ldpanel,
675  dlA(d, nb*j_local2+nb0, j), ldda);
676 #endif
677  }
678  }
679  }
680  }
681  } /* end of else not upper */
682 
683  /* == finalize the trace == */
684  trace_finalize( "spotrf.svg","trace.css" );
685  for( d=0; d<num_gpus; d++ ) {
686  magma_setdevice(d);
687  for( j=0; j<3; j++ ) {
688  magma_queue_sync( stream[d][j] );
689  }
690 #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(STRSM_WORK)
691  magma_free( d_dinvA[d] );
692  magma_free( d_x[d] );
693 #endif
695  }
696  magma_setdevice(0);
697 
698  return *info;
699 } /* magma_spotrf_mgpu */
#define dlP(id, i, j, k)
#define MagmaUpperLower
Definition: magma.h:63
#define min(a, b)
Definition: common_magma.h:86
#define dinvA(d, j)
void magma_sgemm(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, float alpha, magmaFloat_const_ptr dA, magma_int_t ldda, magmaFloat_const_ptr dB, magma_int_t lddb, float beta, magmaFloat_ptr dC, magma_int_t lddc)
#define magma_scopymatrix_async(m, n, dA_src, ldda, dB_dst, lddb, queue)
Definition: magmablas_s.h:717
#define MagmaLeft
Definition: magma.h:68
#define __func__
Definition: common_magma.h:65
#define trace_init(x1, x2, x3, x4)
Definition: trace.h:26
#define MagmaUpper
Definition: magma.h:61
#define MAGMA_ERR_DEVICE_ALLOC
Definition: magma_types.h:276
void magmablas_slaset(magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda)
#define magma_free(ptr)
Definition: magma.h:57
#define MAGMA_S_NEG_ONE
Definition: magma.h:200
int magma_int_t
Definition: magmablas.h:12
#define magma_sgetmatrix_async(m, n, dA_src, ldda, hB_dst, ldb, queue)
Definition: magmablas_s.h:714
#define dx(d, j)
magma_int_t ldda
void magma_queue_wait_event(magma_queue_t queue, magma_event_t event)
cublasStatus_t magmablasSetKernelStream(magma_queue_t stream)
void magma_setdevice(magma_device_t dev)
#define MagmaLower
Definition: magma.h:62
void magma_ssyrk(magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, float alpha, magmaFloat_const_ptr dA, magma_int_t ldda, float beta, magmaFloat_ptr dC, magma_int_t lddc)
#define MAGMA_S_ONE
Definition: magma.h:198
#define trace_gpu_end(x1, x2)
Definition: trace.h:30
void magma_xerbla(const char *srname, magma_int_t info)
Definition: xerbla.cpp:8
#define MagmaMaxGPUs
Definition: magma_types.h:255
#define lapackf77_lsame
Definition: magma_lapack.h:23
void magma_event_record(magma_event_t event, magma_queue_t queue)
void lapackf77_spotrf(const char *uplo, const magma_int_t *n, float *A, const magma_int_t *lda, magma_int_t *info)
void magma_strsm(magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, float alpha, magmaFloat_const_ptr dA, magma_int_t ldda, magmaFloat_ptr dB, magma_int_t lddb)
#define MagmaTrans
Definition: magma.h:58
#define trace_cpu_start(x1, x2, x3)
Definition: trace.h:27
#define dlA(id, i, j)
void magmablas_strsm_work(magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, float alpha, magmaFloat_const_ptr dA, magma_int_t ldda, magmaFloat_ptr db, magma_int_t lddb, int flag, magmaFloat_ptr d_dinvA, magmaFloat_ptr dx)
#define magma_ssetmatrix_async(m, n, hA_src, lda, dB_dst, lddb, queue)
Definition: magmablas_s.h:711
#define Aup(i, j)
#define MagmaLowerStr
Definition: magma.h:85
#define trace_cpu_end(x1)
Definition: trace.h:28
#define Alo(i, j)
#define MagmaNonUnit
Definition: magma.h:65
#define MAGMA_SUCCESS
Definition: magma.h:106
static magma_err_t magma_smalloc(magmaFloat_ptr *ptrPtr, size_t n)
Definition: magma.h:77
#define trace_gpu_start(x1, x2, x3, x4)
Definition: trace.h:29
#define dlPT(id, i, j, k)
#define MagmaRight
Definition: magma.h:69
#define trace_finalize(x1, x2)
Definition: trace.h:31
#define MagmaUpperStr
Definition: magma.h:84
#define MagmaNoTrans
Definition: magma.h:57
#define max(a, b)
Definition: common_magma.h:82
#define magma_queue_sync(queue)
Definition: magma.h:119

Here is the call graph for this function:

Here is the caller graph for this function: