MAGMA  magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
dpotrf3_mgpu.cpp
Go to the documentation of this file.
1 /*
2  -- MAGMA (version 1.4.0) --
3  Univ. of Tennessee, Knoxville
4  Univ. of California, Berkeley
5  Univ. of Colorado, Denver
6  August 2013
7 
8  @generated d Tue Aug 13 16:44:08 2013
9 
10 */
11 #include "common_magma.h"
12 #include "trace.h"
13 
14 /* === Define what BLAS to use ============================================ */
15 #define PRECISION_d
16 
17 #if defined(PRECISION_s) || defined(PRECISION_d)
18 #define DTRSM_WORK
19 //#define magma_dtrsm magmablas_dtrsm
20 #endif
21 /* === End defining what BLAS to use ======================================= */
22 
23 
24 #define Alo(i, j) (a + ((j)+off_j)*lda + (nb*(((i)/nb)%h)+off_i))
25 #define Aup(i, j) (a +(nb*(((j)/nb)%h)+off_j)*lda + (i+off_i))
26 
27 #define dlA(id, i, j) (d_lA[(id)] + (j)*ldda + (i))
28 #define dlP(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*lddp + (i))
29 #define dlPT(id, i, j, k) (d_lP[(id)] + (k)*nb*lddp + (j)*nb + (i))
30 
31 
32 extern "C" magma_int_t
34  magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
35  double *d_lA[], magma_int_t ldda,
36  double *d_lP[], magma_int_t lddp,
37  double *a, magma_int_t lda, magma_int_t h,
38  magma_queue_t stream[][3], magma_event_t event[][5],
39  magma_int_t *info )
40 {
41 /* -- MAGMA (version 1.4.0) --
42  Univ. of Tennessee, Knoxville
43  Univ. of California, Berkeley
44  Univ. of Colorado, Denver
45  August 2013
46 
47  Purpose
48  =======
49  DPOTRF computes the Cholesky factorization of a real symmetric
50  positive definite matrix dA.
51  Auxiliary subroutine for dpotrf2_ooc. It is multiple gpu interface to compute
52  Cholesky of a "rectangular" matrix.
53 
54  The factorization has the form
55  dA = U**T * U, if UPLO = 'U', or
56  dA = L * L**T, if UPLO = 'L',
57  where U is an upper triangular matrix and L is lower triangular.
58 
59  This is the block version of the algorithm, calling Level 3 BLAS.
60 
61  Arguments
62  =========
63  UPLO (input) CHARACTER*1
64  = 'U': Upper triangle of dA is stored;
65  = 'L': Lower triangle of dA is stored.
66 
67  N (input) INTEGER
68  The order of the matrix dA. N >= 0.
69 
70  dA (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N)
71  On entry, the symmetric matrix dA. If UPLO = 'U', the leading
72  N-by-N upper triangular part of dA contains the upper
73  triangular part of the matrix dA, and the strictly lower
74  triangular part of dA is not referenced. If UPLO = 'L', the
75  leading N-by-N lower triangular part of dA contains the lower
76  triangular part of the matrix dA, and the strictly upper
77  triangular part of dA is not referenced.
78 
79  On exit, if INFO = 0, the factor U or L from the Cholesky
80  factorization dA = U**T * U or dA = L * L**T.
81 
82  LDDA (input) INTEGER
83  The leading dimension of the array dA. LDDA >= max(1,N).
84  To benefit from coalescent memory accesses LDDA must be
85  dividable by 16.
86 
87  INFO (output) INTEGER
88  = 0: successful exit
89  < 0: if INFO = -i, the i-th argument had an illegal value
90  > 0: if INFO = i, the leading minor of order i is not
91  positive definite, and the factorization could not be
92  completed.
93  ===================================================================== */
94 
95 
96  magma_int_t j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf;
97  char uplo_[2] = {uplo, 0};
98  double c_one = MAGMA_D_ONE;
99  double c_neg_one = MAGMA_D_NEG_ONE;
100  double d_one = 1.0;
101  double d_neg_one = -1.0;
102  int upper = lapackf77_lsame(uplo_, "U");
103  double *dlpanel;
104  magma_int_t n_local[MagmaMaxGPUs], ldpanel;
105  const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2;
106 #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
107  /* used by dtrsm_work */
108  int trsm_nb = 128;
109  int trsm_n = trsm_nb*((nb+trsm_nb-1)/trsm_nb);
110  double *d_dinvA[MagmaMaxGPUs];
111  double *d_x[MagmaMaxGPUs];
112  #define dinvA(d,j) &(d_dinvA[(d)][(j)*trsm_nb*trsm_n])
113  #define dx(d,j) &(d_x[(d)][(j)*nb*m])
114  /*
115  * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE
116  */
117  for( d=0; d<num_gpus; d++ ) {
118  magma_setdevice(d);
119  if ( (MAGMA_SUCCESS != magma_dmalloc( &d_dinvA[d], 2*trsm_nb*trsm_n )) ||
120  (MAGMA_SUCCESS != magma_dmalloc( &d_x[d], 2*nb*(upper ? n : m) )) ) {
121  *info = MAGMA_ERR_DEVICE_ALLOC;
122  return *info;
123  }
124  }
125  magma_setdevice(0);
126 #endif
127 
128  *info = 0;
129  if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) {
130  *info = -1;
131  } else if (n < 0) {
132  *info = -2;
133  } else if (!upper && num_gpus*ldda < max(1,n)) {
134  *info = -4;
135  } else if (upper && ldda < max(1,m)) {
136  *info = -4;
137  }
138  if (*info != 0) {
139  magma_xerbla( __func__, -(*info) );
140  return *info;
141  }
142 
143  /* initialization */
144  for( d=0; d<num_gpus; d++ ) {
145  /* local-n and local-ld */
146  if (upper) {
147  n_local[d] = (n/(nb*num_gpus))*nb;
148  if (d < (n/nb)%num_gpus)
149  n_local[d] += nb;
150  else if (d == (n/nb)%num_gpus)
151  n_local[d] += n%nb;
152  } else {
153  n_local[d] = (m/(nb*num_gpus))*nb;
154  if (d < (m/nb)%num_gpus)
155  n_local[d] += nb;
156  else if (d == (m/nb)%num_gpus)
157  n_local[d] += m%nb;
158  }
159  }
160 
161  /* == initialize the trace */
162  trace_init( 1, num_gpus, 3, (CUstream_st**)stream );
163 
164  if (upper)
165  {
166  /* ---------------------------------------------- */
167  /* Upper-triangular case */
168  /* > Compute the Cholesky factorization A = U'*U. */
169  /* ---------------------------------------------- */
170  for (j=0; j<m; j+=nb) {
171 
172  /* Set the GPU number that holds the current panel */
173  id = (j/nb)%num_gpus;
174  buf = (j/nb)%num_gpus; // right now, we have num_gpu buffers, so id and buf are the same..
175 
176  /* Set the local index where the current panel is */
177  j_local = j/(nb*num_gpus);
178  jb = min(nb, (m-j));
179 
180  /* Update the current diagonal block on stream1 */
181  magma_setdevice(id);
182  if( j > 0 ) {
183  magmablasSetKernelStream(stream[id][stream1]);
184  trace_gpu_start( id, stream1, "syrk", "syrk" );
186  d_neg_one, dlA(id, 0, nb*j_local), ldda,
187  d_one, dlA(id, j, nb*j_local), ldda);
188  trace_gpu_end( id, stream1 );
189  }
190 
191  /* send the diagonal to cpu on stream1 */
192  trace_gpu_start( id, stream1, "comm", "D to CPU" );
193  magma_dgetmatrix_async( jb, jb,
194  dlA(id, j, nb*j_local), ldda,
195  Aup(j,j), lda,
196  stream[id][stream1] );
197  trace_gpu_end( id, stream1 );
198 
199  /* update off-diagonal blocks in the panel */
200  if( j > 0 ) {
201  d = (j/nb+1)%num_gpus;
202  for( dd=0; dd<num_gpus; dd++ ) {
203  j_local2 = j_local+1;
204  if( d > id ) j_local2 --;
205  nb0 = nb*j_local2; // number of local columns in the panel, while jb is panel-size (number of rows)
206 
207  if( n_local[d] > nb0 ) {
208  magma_setdevice(d);
209  magmablasSetKernelStream(stream[d][stream2]);
210  if( d == id ) {
211  dlpanel = dlA(d,0,nb*j_local);
212  ldpanel = ldda;
213  // the GPU owns the row from start, and no need of synch.
214  //magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
215  } else {
216  dlpanel = dlP(d,nb,0,buf);
217  ldpanel = lddp;
218  magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
219  }
220  trace_gpu_start( d, stream2, "gemm", "gemm" );
222  jb, n_local[d]-nb0, j,
223  c_neg_one, dlpanel, ldpanel,
224  dlA(d, 0, nb0), ldda,
225  c_one, dlA(d, j, nb0), ldda);
226  trace_gpu_end( d, stream2 );
227  magma_event_record( event[d][2], stream[d][stream2] );
228  }
229  d = (d+1)%num_gpus;
230  }
231  }
232 
233  /* wait for panel and factorize it on cpu */
234  magma_setdevice(id);
235  magma_queue_sync( stream[id][stream1] );
236  trace_cpu_start( 0, "getrf", "getrf" );
237  lapackf77_dpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info);
238  trace_cpu_end( 0 );
239  if (*info != 0) {
240  *info = *info + j;
241  break;
242  }
243 
244  /* send the diagonal to gpus on stream1 */
245  if ( (j+jb) < n) {
246  d = (j/nb+1)%num_gpus;
247  for( dd=0; dd<num_gpus; dd++ ) {
248  if( d == id ) {
249  dlpanel = dlA(d, j, nb*j_local);
250  ldpanel = ldda;
251  } else {
252  dlpanel = dlP(d,0,0,buf);
253  ldpanel = lddp;
254  }
255  magma_setdevice(d);
256  trace_gpu_start( d, stream1, "comm", "comm" );
257  magma_dsetmatrix_async( jb, jb,
258  Aup(j,j), lda,
259  dlpanel, ldpanel,
260  stream[d][stream1] );
261  trace_gpu_end( d, stream1 );
262  magma_event_record( event[d][1], stream[d][stream1] );
263  d = (d+1)%num_gpus;
264  }
265  } else {
266  magma_setdevice(id);
267  trace_gpu_start( id, stream1, "comm", "comm" );
268  magma_dsetmatrix_async( jb, jb,
269  Aup(j,j), lda,
270  dlA(id, j, nb*j_local), ldda,
271  stream[id][stream1] );
272  trace_gpu_end( id, stream1 );
273  }
274 
275  /* panel-factorize the off-diagonal */
276  if ( (j+jb) < n) {
277  d = (j/nb+1)%num_gpus;
278  for( dd=0; dd<num_gpus; dd++ ) {
279  /* next column */
280  j_local2 = j_local+1;
281  if( d > id ) j_local2--;
282  if( d == id ) {
283  dlpanel = dlA(d,j,nb*j_local);
284  ldpanel = ldda;
285  } else {
286  dlpanel = dlP(d,0,0,buf);
287  ldpanel = lddp;
288  }
289  nb2 = n_local[d] - j_local2*nb;
290 
291  magma_setdevice(d);
292  if( j+jb < m && d == (j/nb+1)%num_gpus ) {
293  /* owns the next column, look-ahead next block on stream1 */
294  nb0 = min(nb, nb2);
295  magmablasSetKernelStream(stream[d][stream1]);
296  magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update
297  trace_gpu_start( d, stream1, "trsm", "trsm" );
298 #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
299  magmablas_dlaset( MagmaUpperLower, trsm_nb, trsm_n, dinvA(d,0),trsm_nb );
300  magmablas_dlaset( MagmaUpperLower, nb0,jb, dx(d,0),nb0 );
303  jb, nb0, c_one,
304  dlpanel, ldpanel,
305  dlA(d, j, nb*j_local2), ldda,
306  1, dinvA(d,0), dx(d,0) );
307 #else
310  jb, nb0, c_one,
311  dlpanel, ldpanel,
312  dlA(d, j, nb*j_local2), ldda);
313 #endif
314  magma_event_record( event[d][4], stream[d][stream1] );
315  trace_gpu_end( d, stream1 );
316  } else if( nb2 > 0 ) {
317  /* update all the blocks on stream2 */
318  magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor
319  trace_gpu_start( d, stream2, "trsm", "trsm" );
320  magmablasSetKernelStream(stream[d][stream2]);
321 #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
322  magmablas_dlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,0),trsm_nb );
323  magmablas_dlaset( MagmaUpperLower, nb2,jb, dx(d,0),nb2 );
326  jb, nb2, c_one,
327  dlpanel, ldpanel,
328  dlA(d, j, nb*j_local2), ldda,
329  1, dinvA(d,0), dx(d,0) );
330 #else
333  jb, nb2, c_one,
334  dlpanel, ldpanel,
335  dlA(d, j, nb*j_local2), ldda);
336 #endif
337  trace_gpu_end( d, stream2 );
338  }
339  d = (d+1)%num_gpus;
340  } /* end of for */
341 
342  /* ========================================================== */
343  if( j+jb < m ) {
344  d = (j/nb+1)%num_gpus;
345  /* next column */
346  j_local2 = j_local+1;
347  if( d > id ) j_local2--;
348  nb0 = min(nb, n_local[d]-nb*j_local2 );
349 
350  /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end). *
351  * so we have the Cholesky factor, but only diagonal submatrix of the big panel, *
352  * on cpu at the end. */
353  int d2, buf2;
354  magma_setdevice(d);
355  /* lookahead done */
356  magma_queue_wait_event( stream[d][stream3], event[d][4] );
357 
358  trace_gpu_start( d, stream3, "comm", "row to CPU" );
359  magma_dgetmatrix_async( (j+jb), nb0,
360  dlA(d, 0, nb*j_local2), ldda,
361  Aup(0,j+jb), lda,
362  stream[d][stream3] );
363  trace_gpu_end( d, stream3 );
364  magma_event_record( event[d][3], stream[d][stream3] );
365  /* needed on pluto */
366  //magma_queue_sync( stream[d][stream3] );
367 
368  /* broadcast rows to gpus on stream2 */
369  buf2 = ((j+jb)/nb)%num_gpus;
370  for( d2=0; d2<num_gpus; d2++ ) {
371  if( d2 != d )
372  {
373  magma_setdevice(d2);
374  trace_gpu_start( d2, stream3, "comm", "row to GPUs" );
375  magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // rows arrived at cpu on stream3
376  magma_dsetmatrix_async( j+jb, nb0,
377  Aup(0,j+jb), lda,
378  dlP(d2,nb,0,buf2), lddp,
379  stream[d2][stream3] );
380  trace_gpu_end( d2, stream3 );
381  magma_event_record( event[d2][0], stream[d2][stream3] );
382  }
383  }
384 
385  /* =========================== */
386  /* update the remaining blocks */
387  nb2 = n_local[d]-(nb*j_local2 + nb0);
388  if( nb2 > 0 ) {
389  if( d == id ) {
390  dlpanel = dlA(d, j, nb*j_local);
391  ldpanel = ldda;
392  } else {
393  dlpanel = dlP(d,0,0,buf);
394  ldpanel = lddp;
395  }
396  magma_setdevice(d);
397  magmablasSetKernelStream(stream[d][stream2]);
398  trace_gpu_start( d, stream2, "trsm", "trsm" );
399 #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
400  int flag = 0;
401  if (flag == 0) {
402  magma_queue_wait_event( stream[d][stream2], event[d][4] ); // lookahead -> diagonal inversion
403  } else {
404  magmablas_dlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,flag),trsm_nb );
405  magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
406  }
407  magmablas_dlaset( MagmaUpperLower, nb2,jb, dx(d,1),nb2 );
409  jb, nb2, c_one,
410  dlpanel, ldpanel,
411  dlA(d, j, nb*j_local2+nb0), ldda,
412  flag, dinvA(d,flag), dx(d,1) );
413 #else
414  magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor
416  jb, nb2, c_one,
417  dlpanel, ldpanel,
418  dlA(d, j, nb*j_local2+nb0), ldda);
419 #endif
420  trace_gpu_end( d, stream2 );
421  }
422  }
423  } /* end of dtrsm */
424  } /* end of for j=1, .., n */
425  } else {
426 
427  /* ---------------------------------------------- */
428  /* Lower-triangular case */
429  /* > Compute the Cholesky factorization A = L*L'. */
430  /* ---------------------------------------------- */
431  for (j=0; j<n; j+=nb) {
432 
433  /* Set the GPU number that holds the current panel */
434  id = (j/nb)%num_gpus;
435  buf = (j/nb)%num_gpus;
436 
437  /* Set the local index where the current panel is */
438  j_local = j/(nb*num_gpus);
439  jb = min(nb, (n-j));
440 
441  /* Update the current diagonal block on stream1 */
442  magma_setdevice(id);
443  if( j > 0 ) {
444  magmablasSetKernelStream(stream[id][stream1]);
446  d_neg_one, dlA(id, nb*j_local, 0), ldda,
447  d_one, dlA(id, nb*j_local, j), ldda);
448  }
449 
450  /* send the diagonal to cpu on stream1 */
451  magma_dgetmatrix_async( jb, jb,
452  dlA(id, nb*j_local, j), ldda,
453  Alo(j,j), lda,
454  stream[id][stream1] );
455 
456  /* update off-diagonal blocks of the panel */
457  if( j > 0 ) {
458  d = (j/nb+1)%num_gpus;
459  for( dd=0; dd<num_gpus; dd++ ) {
460  j_local2 = j_local+1;
461  if( d > id ) j_local2 --;
462  nb0 = nb*j_local2;
463 
464  if( nb0 < n_local[d] ) {
465  magma_setdevice(d);
466  magmablasSetKernelStream(stream[d][stream2]);
467  if( d == id ) {
468  dlpanel = dlA(d, nb*j_local, 0);
469  ldpanel = ldda;
470  } else {
471  dlpanel = dlPT(d,0,nb,buf);
472  ldpanel = nb;
473  magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu
474  }
476  n_local[d]-nb0, jb, j,
477  c_neg_one, dlA(d, nb0, 0), ldda,
478  dlpanel, ldpanel,
479  c_one, dlA(d, nb0, j), ldda);
480  magma_event_record( event[d][2], stream[d][stream2] );
481  }
482  d = (d+1)%num_gpus;
483  }
484  }
485 
486  /* wait for the panel and factorized it on cpu */
487  magma_setdevice(id);
488  magma_queue_sync( stream[id][stream1] );
489  lapackf77_dpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info);
490  if (*info != 0) {
491  *info = *info + j;
492  break;
493  }
494 
495  /* send the diagonal to gpus on stream1 */
496  if ( (j+jb) < m) {
497  d = (j/nb+1)%num_gpus;
498  for( dd=0; dd<num_gpus; dd++ ) {
499  if( d == id ) {
500  dlpanel = dlA(d, nb*j_local, j);
501  ldpanel = ldda;
502  } else {
503  dlpanel = dlPT(d, 0, 0, buf);
504  ldpanel = nb;
505  }
506  magma_setdevice(d);
507  magma_dsetmatrix_async( jb, jb,
508  Alo(j,j), lda,
509  dlpanel, ldpanel,
510  stream[d][stream1] );
511  magma_event_record( event[d][1], stream[d][stream1] );
512  d = (d+1)%num_gpus;
513  }
514  } else {
515  magma_setdevice(id);
516  magma_dsetmatrix_async( jb, jb,
517  Alo(j,j), lda,
518  dlA(id, nb*j_local, j), ldda,
519  stream[id][stream1] );
520  }
521 
522  /* panel factorize the off-diagonal */
523  if ( (j+jb) < m) {
524  d = (j/nb+1)%num_gpus;
525  for( dd=0; dd<num_gpus; dd++ ) {
526  /* next column */
527  j_local2 = j_local+1;
528  if( d > id ) j_local2--;
529  if( d == id ) {
530  dlpanel = dlA(d, nb*j_local, j);
531  ldpanel = ldda;
532  } else {
533  dlpanel = dlPT(d, 0, 0, buf);
534  ldpanel = nb;
535  }
536  nb2 = n_local[d] - j_local2*nb;
537  nb0 = min(nb, nb2);
538 
539  magma_setdevice(d);
540  if( j+nb < n && d == (j/nb+1)%num_gpus ) { /* owns next column, look-ahead next block on stream1 */
541  if ( j > 0 ) magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update
542  magmablasSetKernelStream(stream[d][stream1]);
543 #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
544  magmablas_dlaset( MagmaUpperLower, trsm_nb, trsm_n, dinvA(d,0),trsm_nb );
545  magmablas_dlaset( MagmaUpperLower, nb0,jb, dx(d,0),nb0 );
548  nb0, jb, c_one,
549  dlpanel, ldpanel,
550  dlA(d, nb*j_local2, j), ldda,
551  1, dinvA(d,0), dx(d,0) );
552 #else
555  nb0, jb, c_one,
556  dlpanel, ldpanel,
557  dlA(d, nb*j_local2, j), ldda);
558 #endif
559  magma_event_record( event[d][4], stream[d][stream1] );
560  } else if( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */
561  /* update the entire column */
562  magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for the cholesky factor
563  magmablasSetKernelStream(stream[d][stream2]);
564 #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
565  magmablas_dlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,0),trsm_nb );
566  magmablas_dlaset( MagmaUpperLower, nb2,jb, dx(d,0),nb2 );
568  nb2, jb, c_one,
569  dlpanel, ldpanel,
570  dlA(d, nb*j_local2, j), ldda,
571  1, dinvA(d,0), dx(d,0) );
572 #else
574  nb2, jb, c_one,
575  dlpanel, ldpanel,
576  dlA(d, nb*j_local2, j), ldda);
577 #endif
578  }
579  d = (d+1)%num_gpus;
580  } /* end for d */
581 
582  /* ========================================================== */
583  if( j+jb < n ) {
584  d = (j/nb+1)%num_gpus;
585  /* next column */
586  j_local2 = j_local+1;
587  if( d > id ) j_local2--;
588  nb0 = min(nb, n_local[d]-nb*j_local2 );
589 
590  /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize). */
591  /* so we have the Cholesky factor on cpu at the end. */
592  int d2, buf2;
593 //#define DPOTRF_DEVICE_TO_DEVICE
594 #ifdef DPOTRF_DEVICE_TO_DEVICE
595  // lookahead done
596 
597  /* broadcast the rows to gpus */
598  buf2 = ((j+jb)/nb)%num_gpus;
599  for( d2=0; d2<num_gpus; d2++ ) {
600  magma_setdevice(d2);
601  magma_queue_wait_event( stream[d2][stream3], event[d][4] );
602  if( d2 != d ) {
603  magma_dcopymatrix_async( nb0, j+jb,
604  dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block
605  dlA(d, nb*j_local2, 0), ldda,
606  stream[d2][stream3] );
607  magma_event_record( event[d2][0], stream[d2][stream3] );
608  } else {
609  magma_dgetmatrix_async( nb0, j+jb,
610  dlA(d, nb*j_local2, 0), ldda,
611  Alo(j+jb,0), lda,
612  stream[d][stream3] );
613  }
614  }
615 #else
616  // lookahead done
617  magma_setdevice(d);
618  magma_queue_wait_event( stream[d][stream3], event[d][4] );
619  magma_dgetmatrix_async( nb0, j+jb,
620  dlA(d, nb*j_local2, 0), ldda,
621  Alo(j+jb,0), lda,
622  stream[d][stream3] );
623  magma_event_record( event[d][3], stream[d][stream3] );
624  /* syn on rows on CPU, seem to be needed on Pluto */
625  //magma_queue_sync( stream[d][stream3] );
626 
627  /* broadcast the rows to gpus */
628  buf2 = ((j+jb)/nb)%num_gpus;
629  for( d2=0; d2<num_gpus; d2++ ) {
630  if( d2 != d )
631  {
632  magma_setdevice(d2);
633  magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // getmatrix done
634  magma_dsetmatrix_async( nb0, j+jb,
635  Alo(j+jb,0), lda,
636  dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block
637  stream[d2][stream3] );
638  magma_event_record( event[d2][0], stream[d2][stream3] );
639  }
640  }
641 #endif
642  /* =================================== */
643  /* updates remaining blocks on stream2 */
644  nb2 = n_local[d] - (j_local2*nb + nb0);
645  if( nb2 > 0 ) {
646  if( d == id ) {
647  dlpanel = dlA(d, nb*j_local, j);
648  ldpanel = ldda;
649  } else {
650  dlpanel = dlPT(d,0,0,buf);
651  ldpanel = nb;
652  }
653  magma_setdevice(d);
654  magmablasSetKernelStream(stream[d][stream2]);
655  /* update the remaining blocks in the column */
656 #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
657  int flag = 0;
658  if (flag == 0) {
659  magma_queue_wait_event( stream[d][stream2], event[d][4] ); // lookahead -> diagonal inversion
660  } else {
661  magmablas_dlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,flag),trsm_nb );
662  magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
663  }
664  magmablas_dlaset( MagmaUpperLower, nb2,jb, dx(d,1),nb2 );
666  nb2, jb, c_one,
667  dlpanel, ldpanel,
668  dlA(d, nb*j_local2+nb0, j), ldda,
669  flag, dinvA(d,flag), dx(d,1) );
670 #else
671  magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received
673  nb2, jb, c_one,
674  dlpanel, ldpanel,
675  dlA(d, nb*j_local2+nb0, j), ldda);
676 #endif
677  }
678  }
679  }
680  }
681  } /* end of else not upper */
682 
683  /* == finalize the trace == */
684  trace_finalize( "dpotrf.svg","trace.css" );
685  for( d=0; d<num_gpus; d++ ) {
686  magma_setdevice(d);
687  for( j=0; j<3; j++ ) {
688  magma_queue_sync( stream[d][j] );
689  }
690 #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(DTRSM_WORK)
691  magma_free( d_dinvA[d] );
692  magma_free( d_x[d] );
693 #endif
695  }
696  magma_setdevice(0);
697 
698  return *info;
699 } /* magma_dpotrf_mgpu */
700 
701 #undef Alo
702 #undef Aup
703 #define A(i, j) (a +(j)*lda + (i))
704 #define dA(d, i, j) (dwork[(d)]+(j)*ldda + (i))
705 
706 extern "C" magma_int_t
708  magma_int_t off_i, magma_int_t off_j, magma_int_t nb,
709  double *a, magma_int_t lda,
710  double *dwork[], magma_int_t ldda,
711  magma_queue_t stream[][3], magma_int_t *info)
712 {
713  magma_int_t k;
714  if( lapackf77_lsame(uplo, "U") ) {
715  magma_int_t j, jj, jb, mj;
716 
717  /* go through each column */
718  for (j=off_j; j<n; j+=nb) {
719  jj = (j-off_j)/(nb*num_gpus);
720  k = ((j-off_j)/nb)%num_gpus;
721 
722  jb = min(nb, (n-j));
723  if(j+jb < off_j+m)
724  mj = (j-off_i)+jb;
725  else
726  mj = m;
727 
728  magma_setdevice(k);
729  magma_dsetmatrix_async( mj, jb,
730  A(off_i, j), lda,
731  dA(k, 0, jj*nb), ldda,
732  stream[k][0] );
733  }
734  }
735  else {
736  magma_int_t i, ii, ib, ni;
737 
738  /* go through each row */
739  for(i=off_i; i<m; i+=nb){
740  ii = (i-off_i)/(nb*num_gpus);
741  k = ((i-off_i)/nb)%num_gpus;
742 
743  ib = min(nb, (m-i));
744  if(i+ib < off_i+n)
745  ni = (i-off_i)+ib;
746  else
747  ni = n;
748 
749  magma_setdevice(k);
750  magma_dsetmatrix_async( ib, ni,
751  A(i, off_j), lda,
752  dA(k, ii*nb, 0), ldda,
753  stream[k][0] );
754  }
755  }
756  for( k=0; k<num_gpus; k++ ) {
757  magma_setdevice(k);
758  magma_queue_sync( stream[k][0] );
759  }
760  magma_setdevice(0);
761 
762  return *info;
763 }
764 
765 extern "C" magma_int_t
767  magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magma_int_t NB,
768  double *a, magma_int_t lda,
769  double *dwork[], magma_int_t ldda,
770  magma_queue_t stream[][3], magma_int_t *info)
771 {
772  magma_int_t k;
773  if( lapackf77_lsame(uplo, "U") ) {
774  magma_int_t j, jj, jb, mj;
775 
776  /* go through each column */
777  for (j=off_j+NB; j<n; j+=nb) {
778  jj = (j-off_j)/(nb*num_gpus);
779  k = ((j-off_j)/nb)%num_gpus;
780 
781  jb = min(nb, (n-j));
782  if(j+jb < off_j+m)
783  mj = (j-off_i)+jb;
784  else
785  mj = m;
786 
787  magma_setdevice(k);
788  magma_dgetmatrix_async( mj, jb,
789  dA(k, 0, jj*nb), ldda,
790  A(off_i, j), lda,
791  stream[k][0] );
792  magma_queue_sync( stream[k][0] );
793  }
794  } else {
795  magma_int_t i, ii, ib, ni;
796 
797  /* go through each row */
798  for(i=off_i+NB; i<m; i+=nb){
799  ii = (i-off_i)/(nb*num_gpus);
800  k = ((i-off_i)/nb)%num_gpus;
801 
802  ib = min(nb, (m-i));
803  if(i+ib < off_i+n)
804  ni = (i-off_i)+ib;
805  else
806  ni = n;
807 
808  magma_setdevice(k);
809  magma_dgetmatrix_async( ib, ni,
810  dA(k, ii*nb, 0), ldda,
811  A(i, off_j), lda,
812  stream[k][0] );
813  magma_queue_sync( stream[k][0] );
814  }
815  }
816  /*for( k=0; k<num_gpus; k++ ) {
817  magma_setdevice(k);
818  magma_queue_sync( stream[k][0] );
819  }*/
820  magma_setdevice(0);
821 
822  return *info;
823 }
824 
825 #undef A
826 #undef dA
827 #undef dlA
828 #undef dlP
829 #undef dlPT
#define MagmaUpperLower
Definition: magma.h:63
#define min(a, b)
Definition: common_magma.h:86
#define MAGMA_D_ONE
Definition: magma.h:176
#define magma_dsetmatrix_async(m, n, hA_src, lda, dB_dst, lddb, queue)
Definition: magmablas_d.h:711
#define dlPT(id, i, j, k)
#define MagmaLeft
Definition: magma.h:68
#define __func__
Definition: common_magma.h:65
#define trace_init(x1, x2, x3, x4)
Definition: trace.h:26
#define MagmaUpper
Definition: magma.h:61
static magma_err_t magma_dmalloc(magmaDouble_ptr *ptrPtr, size_t n)
Definition: magma.h:78
#define MAGMA_ERR_DEVICE_ALLOC
Definition: magma_types.h:276
magma_int_t ldda
#define magma_free(ptr)
Definition: magma.h:57
void magma_dtrsm(magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_ptr dB, magma_int_t lddb)
void magmablas_dlaset(magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaDouble_ptr dA, magma_int_t ldda)
int magma_int_t
Definition: magmablas.h:12
#define magma_dgetmatrix_async(m, n, dA_src, ldda, hB_dst, ldb, queue)
Definition: magmablas_d.h:714
void magmablas_dtrsm_work(magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_ptr db, magma_int_t lddb, int flag, magmaDouble_ptr d_dinvA, magmaDouble_ptr dx)
magma_int_t magma_dpotrf3_mgpu(magma_int_t num_gpus, char uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, double *d_lA[], magma_int_t ldda, double *d_lP[], magma_int_t lddp, double *a, magma_int_t lda, magma_int_t h, magma_queue_t stream[][3], magma_event_t event[][5], magma_int_t *info)
#define Aup(i, j)
#define dA(d, i, j)
#define dwork(dev, i, j)
void magma_queue_wait_event(magma_queue_t queue, magma_event_t event)
cublasStatus_t magmablasSetKernelStream(magma_queue_t stream)
void magma_setdevice(magma_device_t dev)
#define dlA(id, i, j)
#define magma_dcopymatrix_async(m, n, dA_src, ldda, dB_dst, lddb, queue)
Definition: magmablas_d.h:717
#define MagmaLower
Definition: magma.h:62
#define trace_gpu_end(x1, x2)
Definition: trace.h:30
magma_int_t magma_ddtohpo(magma_int_t num_gpus, char *uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magma_int_t NB, double *a, magma_int_t lda, double *work[], magma_int_t ldda, magma_queue_t stream[][3], magma_int_t *info)
#define Alo(i, j)
void magma_xerbla(const char *srname, magma_int_t info)
Definition: xerbla.cpp:8
#define MagmaMaxGPUs
Definition: magma_types.h:255
#define lapackf77_lsame
Definition: magma_lapack.h:23
void magma_event_record(magma_event_t event, magma_queue_t queue)
#define MagmaTrans
Definition: magma.h:58
#define trace_cpu_start(x1, x2, x3)
Definition: trace.h:27
#define MagmaLowerStr
Definition: magma.h:85
magma_int_t magma_dhtodpo(magma_int_t num_gpus, char *uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, double *h_A, magma_int_t lda, double *d_lA[], magma_int_t ldda, magma_queue_t stream[][3], magma_int_t *info)
#define trace_cpu_end(x1)
Definition: trace.h:28
#define MagmaNonUnit
Definition: magma.h:65
#define dlP(id, i, j, k)
#define MAGMA_SUCCESS
Definition: magma.h:106
void magma_dgemm(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, magmaDouble_const_ptr dB, magma_int_t lddb, double beta, magmaDouble_ptr dC, magma_int_t lddc)
#define A(i, j)
#define trace_gpu_start(x1, x2, x3, x4)
Definition: trace.h:29
#define MagmaRight
Definition: magma.h:69
#define dx(d, j)
void lapackf77_dpotrf(const char *uplo, const magma_int_t *n, double *A, const magma_int_t *lda, magma_int_t *info)
#define MAGMA_D_NEG_ONE
Definition: magma.h:178
void magma_dsyrk(magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, double alpha, magmaDouble_const_ptr dA, magma_int_t ldda, double beta, magmaDouble_ptr dC, magma_int_t lddc)
#define trace_finalize(x1, x2)
Definition: trace.h:31
#define MagmaUpperStr
Definition: magma.h:84
#define MagmaNoTrans
Definition: magma.h:57
#define dinvA(d, j)
#define max(a, b)
Definition: common_magma.h:82
#define magma_queue_sync(queue)
Definition: magma.h:119