MAGMA  magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
dsymm_mgpu.cpp
1 /*
2  -- MAGMA (version 1.4.0) --
3  Univ. of Tennessee, Knoxville
4  Univ. of California, Berkeley
5  Univ. of Colorado, Denver
6  August 2013
7 
8  @generated d Tue Aug 13 16:45:23 2013
9  @author Mark Gates
10  @author Azzam Haidar
11 
12  This still has poor performance. Work in progress.
13 */
14 #include "common_magma.h"
15 #include "magma_bulge.h"
16 //#include "trace.h"
17 #include <assert.h>
18 
19 extern "C"
21  char side, char uplo, magma_int_t m, magma_int_t n,
22  double alpha,
23  double *dA[], magma_int_t ldda, magma_int_t offset,
24  double *dB[], magma_int_t lddb,
25  double beta, double *dC[], magma_int_t lddc,
26  double *dwork[], magma_int_t dworksiz,
27  double *C, magma_int_t ldc,
28  double *work[], magma_int_t worksiz,
29  magma_int_t ngpu, magma_int_t nb,
30  magma_queue_t streams[][20], magma_int_t nstream,
31  magma_event_t redevents[][MagmaMaxGPUs*MagmaMaxGPUs+10], magma_int_t nbevents,
33 {
34  #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda)
35  #define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb)
36  #define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc)
37  #define dwork(dev, i, j) (dwork[dev] + (i) + (j)*lddwork)
38  #define C(i, j) (C + (i) + (j)*ldc)
39  //printf("####################################################\n");
40  //printf(" start dsymm \n");
41  //printf("####################################################\n");
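 // Overview of the routine as implemented below: dC := alpha*A*B + beta*dC, where A is
 // symmetric and only its lower triangle is referenced (the symmetrize calls hard-code
 // MagmaLower), distributed 1-D block-cyclically over ngpu GPUs with block size nb and
 // starting at row/column offset of the parent matrix. Diagonal tiles are symmetrized so
 // that plain GEMMs can be used; the product is split into COL GEMMs (blocks on and below
 // the diagonal) and ROW GEMMs (the same blocks used transposed, accumulated in dwork),
 // and the distributed partial results are combined by an all-reduce over the GPUs.
 // beta is applied only on the GPU holding the first block; dC on the other GPUs is
 // zeroed and used purely as an accumulator.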
42 
43  assert( ldda >= m );
44  assert( lddb >= m );
45  assert( lddc >= m );
46  assert( nstream >= ngpu );
47  assert( nbevents >= ngpu*ngpu );
48 
49 
50  double c_one = MAGMA_D_ONE;
51 
52  double *dwork1[MagmaMaxGPUs];
53  double *dwork2[MagmaMaxGPUs];
54 
55 
56  magma_int_t maxgsize = n*m;
57  magma_int_t lddwork = lddc;
58  magma_int_t ldwork = m;
59  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
60  dwork1[dev] = dwork[dev]; // size of dwork1 is n*lddwork
61  dwork2[dev] = dwork[dev]+n*lddwork; // size of dwork2 is maxgsize*ngpu
62  }
63  assert( dworksiz >= (n*lddwork+maxgsize*ngpu) );
64  assert( worksiz >= (n*ldwork) );
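 // In other words, each dwork[dev] must hold at least n*lddc elements (the ROW GEMM
 // accumulator dwork1) plus ngpu*m*n elements (the reduction buffers dwork2), and
 // worksiz must be at least m*n.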
65 
66 
67  magma_device_t cdev;
68  magma_getdevice( &cdev );
69  magma_queue_t cstream;
70  magmablasGetKernelStream(&cstream);
71 
72 
73  magma_int_t dev, devperm, myblk, mycolsize, myblkoffst;
74  magma_int_t gmaster;
75  magma_int_t masterdev, lcdev, lccolsize, myngpu;
76 
77  magma_int_t stdev = (offset/nb)%ngpu;
78  magma_int_t blockoffset = offset % nb;
79  magma_int_t fstblksiz = 0;
80  if(blockoffset>0){
81  fstblksiz = min(m, (nb - blockoffset));
82  }
83  //magma_int_t nbblk = magma_ceildiv(m, nb);
84  magma_int_t nbblk = magma_ceildiv((m+blockoffset), nb);
85  magma_int_t remm = m- fstblksiz;
86  magma_int_t nbblkoffst = offset/nb;
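 // Illustrative example with hypothetical values ngpu=3, nb=64, offset=100, m=1000:
 //   stdev       = (100/64)%3           = 1   (GPU owning the first touched block column)
 //   blockoffset = 100%64               = 36  (offset inside that first block)
 //   fstblksiz   = min(1000, 64-36)     = 28  (rows in the leading partial block)
 //   nbblk       = ceildiv(1000+36, 64) = 17  (block rows spanned by the m rows)
 //   nbblkoffst  = 100/64               = 1   (whole blocks before the offset)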
87 
88 
89  magma_int_t nblstblks = -1;
90  magma_int_t devlstblk = -1;
91  magma_int_t lstblksiz = remm%nb;
92  if(lstblksiz>0){
93  nblstblks = nbblk%ngpu;
94  devlstblk = (nblstblks-1+ngpu)%ngpu;
95  }
96 
97  magma_int_t nbcmplxactive = 0;
98  magma_int_t cmplxisactive[MagmaMaxGPUs];
99  magma_int_t gpuisactive[MagmaMaxGPUs];
100  memset(gpuisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t));
101  memset(cmplxisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t));
102 
103 
104  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
105  magma_setdevice( dev );
106  magmablasSetKernelStream( streams[ dev ][ 0 ] );
107  cudaMemset(dwork(dev,0,0), 0, (lddwork)*(n)*sizeof(double) );
108  // set dC to 0 on every device except the one that
109  // holds block i==0, since that one must scale by beta.
110  if(dev!=stdev){
111  cudaMemset(dC(dev,0,0), 0, (lddc)*(n)*sizeof(double) );
112  }
113  }
114 
115  magma_int_t newoffset = offset;
116  // 1. symmetrize
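 //    Only the lower triangle of A is stored, so the lower part of each diagonal tile is
 //    mirrored into its upper part (dsymmetrize); the GEMMs below can then treat every
 //    tile as a full square block and use plain NoTrans/Trans multiplies.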
117  if(blockoffset>0){
118  newoffset = offset+fstblksiz; // newoffset is now aligned on an nb boundary
119  magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > stdev?1:0);
120  //printf("STDEV %d voici offset %d remm %d myblockoffset %d siz %d \n", stdev, offset, remm, myblkoffst, fstblksiz);
121  magma_setdevice( stdev );
122  magmablasSetKernelStream( streams[ stdev ][ 0 ] );
123  magmablas_dsymmetrize_tiles( MagmaLower, fstblksiz, dA(stdev, offset, myblkoffst*nb+blockoffset), ldda, 1, ngpu*nb, nb );
124  }
125 
126  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
127  magma_int_t newstdev = (newoffset/nb)%ngpu;
128  magma_int_t nbblk = remm/nb; // number of full blocks of size nb; if remm%nb>0 a last partial block of size remm%nb remains
129  magma_int_t myblk = (nbblk/ngpu) + (nbblk%ngpu > ((dev-newstdev+ngpu)%ngpu) ? 1:0 );
130  magma_int_t devperm = (dev-newstdev+ngpu)%ngpu;
131  magma_int_t nbblkoffst = newoffset/nb;
132  magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0);
133  //printf("dev %d devperm %d newoffset %d rowoff %d coloff %d myblk %d \n", dev, devperm, newoffset, newoffset+devperm*nb, myblkoffst*nb, myblk);
134  magma_setdevice( dev );
135  magmablasSetKernelStream( streams[ dev ][ 0 ] );
136  magmablas_dsymmetrize_tiles( MagmaLower, nb, dA(dev, newoffset+devperm*nb, myblkoffst*nb), ldda, myblk, ngpu*nb, nb );
137  if(remm%nb>0){
138  magma_int_t nblstblks = (nbblk+1)%ngpu;
139  magma_int_t devlstblk = (nblstblks-1+ngpu)%ngpu;
140  //printf("==> siz %d devperm %d, devlstblk %d, newoffset+nbblk*nb %d, myblkoffst*nb+ myblk*nb %d\n", remm % nb, devperm, devlstblk, newoffset+nbblk*nb, myblkoffst*nb+ myblk*nb);
141  if(devperm==devlstblk)
142  magmablas_dsymmetrize( MagmaLower, remm % nb, dA(dev, newoffset+nbblk*nb, myblkoffst*nb+ myblk*nb), ldda ); // last partial tile
143  }
144  }
145 
146 
147 
148 
149 /*
150  magma_int_t siz = m+offset;
151  double *R=(double *) malloc(siz*siz*sizeof(double));
152  // collect A back from the GPUs
153  magmablas_dgetmatrix_1D_bcyclic( siz, siz, dA, ldda, R, siz, ngpu, nb );
154  magma_setdevice( 0 );
155  magmablasSetKernelStream( streams[ dev ][ 0 ] );
156  //magma_dgetmatrix( siz, siz, dA[0], ldda, R, siz );
157  FILE *trace_file;
158  trace_file = fopen("AJETE/Aafter", "w");
159  for (int j = 0; j < siz ; j++)
160  for (int i = 0; i < siz ; i++)
161  fprintf(trace_file, "%10d%10d%40.30e\n", i+1, j+1, R[j*siz+i]);
162  fclose(trace_file);
163 return;
164 */
165 
166 
167  // ROW GEMM: transpose a block row and multiply it by a block of B.
168  // If only 1 GPU is used, the ROW GEMM is folded into the
169  // COL GEMM loop (better accuracy observed, and better performance).
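 // For each block row i, the tiles A(ioff:ioff+ib, .) stored below the diagonal are used
 // transposed, which supplies the contributions of the implicit upper part of A. Each GPU
 // accumulates its partial rows into dwork on stream 1, leaving stream 0 free for the COL GEMMs.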
170  if(ngpu>1){
171  for( magma_int_t i = fstblksiz; i < m; i += nb ) {
172  magma_int_t ib = min( nb, m-i ); // block size
173  magma_int_t ioff = i + offset; // start global index in parent matrix
174  //magma_int_t dev = (ioff / nb) % ngpu;
175  magma_int_t nbblkoffst = offset/nb;
176  magma_int_t nbblk = magma_ceildiv(i, nb);
177  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
178 
179 
180  magma_int_t myblk = (nbblk/ngpu) + (nbblk%ngpu > ((dev-stdev+ngpu)%ngpu) ? 1:0 );
181  magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0);
182 
183  magma_int_t myrowsize = myblk * nb;
184  magma_int_t coloffset = myblkoffst*nb;
185  if(dev==stdev) {
186  myrowsize = myrowsize -blockoffset;
187  coloffset = myblkoffst*nb+blockoffset;
188  }
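 // Continuing the hypothetical example above (ngpu=3, nb=64, offset=100), at i = 92:
 // nbblk = ceildiv(92,64) = 2 block columns lie strictly above this block row, so stdev
 // (GPU 1) gets myrowsize = 64-36 = 28, the next GPU gets 64, and the remaining GPU gets
 // myrowsize = 0 and skips the gemm.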
189  //printf("ROW GEMM: voici i %d ib %d ioff %d nbblkoffst %d stdev %d dev %d myblk %d myblkoffset %d coloffset %d rowsize %d\n", i, ib, ioff, nbblkoffst, stdev, dev, myblk, myblkoffst, coloffset, myrowsize);
190  if(myrowsize>0){
191  magma_setdevice( dev );
192  magmablasSetKernelStream( streams[ dev ][ 1 ] );
193  magma_dgemm( MagmaTrans, MagmaNoTrans, myrowsize, n, ib,
194  alpha, dA(dev,ioff,coloffset), ldda,
195  dB(dev,i,0), lddb,
196  c_one, dwork(dev,0,0), lddwork );
197  }
198  }
199  }
200  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
201  magma_setdevice( dev );
202  magma_event_record(redevents[dev][1], streams[dev][1]);
203  }
204  }
205 
206 
207  // COL GEMM
208  // blockoffset is offset within first block; for subsequent blocks it is 0
209  if(blockoffset>0){
210  magma_int_t ib = min( nb-blockoffset, m ); // block size
211  magma_int_t iblock = (offset / nb) / ngpu; // local block id
212  magma_int_t di = iblock*nb+blockoffset; // local index in parent matrix
213  magma_setdevice( stdev );
214  magmablasSetKernelStream( streams[ stdev ][ 0 ] );
215  //printf("DEV %d COL GEMM first ioff %d di %d m %d n %d ib %d \n", stdev, offset, di, m, n, ib);
216  magma_dgemm( MagmaNoTrans, MagmaNoTrans, m, n, ib,
217  alpha, dA(stdev,offset,di), ldda,
218  dB(stdev,0,0), lddb,
219  beta, dC(stdev,0,0), lddc );
220  }
221 
222 
223 
224  // COL GEMM
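 // For each block column i, the owning GPU multiplies the stored (lower) panel
 // A(ioff:offset+m, di:di+ib) by B(i:i+ib, 0:n) and accumulates into dC(i:m, 0:n).
 // Only the very first block applies beta; later blocks accumulate with c_one because
 // dC was either already scaled (on stdev, above) or zeroed at the start.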
225  for( magma_int_t i = fstblksiz; i < m; i += nb ) {
226  magma_int_t ib = min( nb, m-i ); // block size
227  magma_int_t ioff = i + offset; // start global index in parent matrix
228  magma_int_t iblock = (ioff / nb) / ngpu; // local block id
229  magma_int_t dev = (ioff / nb) % ngpu;
230  magma_int_t di = iblock*nb; // local index in parent matrix
231 
232  //printf("DEV %d COL GEMM i %d ioff %d di %d m-i %d n %d ib %d \n", dev, i, ioff, di, m-i, n, ib);
233 
234  magma_setdevice( dev );
235  magmablasSetKernelStream( streams[ dev ][ 0 ] );
236  if(i==0){
237  magma_dgemm( MagmaNoTrans, MagmaNoTrans, m-i, n, ib,
238  alpha, dA(dev,ioff,di), ldda,
239  dB(dev,i,0), lddb,
240  beta, dC(dev,i,0), lddc );
241  }else{
242  magma_dgemm( MagmaNoTrans, MagmaNoTrans, m-i, n, ib,
243  alpha, dA(dev,ioff,di), ldda,
244  dB(dev,i,0), lddb,
245  c_one, dC(dev,i,0), lddc );
246  }
247  magma_event_record(redevents[dev][0], streams[dev][0]);
248  // if only 1 GPU is used, do the ROW GEMM
249  if(ngpu==1){
250  // NOTE: because the COL GEMM writes dC from the diagonal block (row i) downward
251  // and the ROW GEMM writes dC from row 0 to i-1, they could in principle
252  // run in parallel on different streams.
253  //
254  // They must not, however, because the COL GEMM might finish block i and
255  // start block i+1 while the ROW GEMM is still working on block i.
256  // magmablasSetKernelStream( streams[ dev ][ 0 ] );
257  magma_dgemm( MagmaTrans, MagmaNoTrans, i, n, ib,
258  alpha, dA(dev,ioff,offset), ldda,
259  dB(dev,i,0), lddb,
260  c_one, dC(dev,0,0), lddc );
261  }
262  }
263 
264 
265 
266  if(ngpu>1){
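 // Each GPU now folds its ROW GEMM partial sums (held in dwork) into its local dC,
 // placing each nb-row chunk at the global row position of the block column that
 // produced it; the leading chunk on stdev is shortened by blockoffset.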
267  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
268  magma_int_t nbblk = magma_ceildiv((m+blockoffset), nb);
269  magma_int_t nbblkrow = nbblk-1;
270  magma_int_t devperm = (dev-stdev+ngpu)%ngpu;
271  magma_int_t myblk = (nbblkrow/ngpu) + (nbblkrow%ngpu > devperm ? 1:0 );
272  magma_int_t myrowsize = myblk * nb;
273  if(dev==stdev) {
274  myrowsize = myrowsize - blockoffset;
275  }
276 
277  //printf("blockoffset %d nbblkrow %d devperm %d DEV %d RECEIVING myblk %d myrowsize %d\n", blockoffset, nbblkrow, devperm, dev, myblk, myrowsize);
278  if(myrowsize>0){
279  magma_setdevice( dev );
280  magmablasSetKernelStream( streams[ dev ][ 0 ] );
281  magma_queue_wait_event(streams[ dev ][ 0 ], redevents[dev][1]);
282  //magma_queue_sync( streams[ dev ][ 1 ] );
283  // for each dev, add each computed ROW block at its proper placement into dC
284  for( magma_int_t blki = 0; blki < myblk; ++blki){
285  magma_int_t gbblki = (blki*ngpu + devperm)*nb - blockoffset;
286  magma_int_t lcblki = blki*nb;
287  magma_int_t ib = nb;// min(nb, m-gbblki);
288  if(dev==stdev){
289  lcblki = blki*nb-blockoffset;
290  if(blki==0){
291  gbblki = 0;
292  lcblki = 0;
293  ib = nb-blockoffset;
294  }
295  }
296  magmablas_dgeadd(ib, n, c_one,
297  &dwork[dev][lcblki], lddwork,
298  &dC[dev][gbblki] , lddc );
299  }
300  magma_event_record(redevents[dev][0], streams[dev][0]);
301  }
302  }
303  }
304 
305 
306 
307 
308  // ===========================================================
309  // COMMUNICATION ALL_REDUCE_SUM
310  // ===========================================================
311  if(ngpu==1){
312  return;
313  }
314  // INITIALIZE COMM
315  for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
316  masterdev = -1;
317  gnode[cmplxid][MagmaMaxGPUs+1] = -1;
318  myngpu = gnode[cmplxid][MagmaMaxGPUs];
319  for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
320  dev = gnode[cmplxid][idev];
321  devperm = (dev-stdev+ngpu)%ngpu;
322  myblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1:0 );
323  mycolsize = myblk*nb;
324  myblkoffst = nb*((nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0));
325  if(dev==stdev){
326  mycolsize -= blockoffset;
327  myblkoffst += blockoffset; // local index in parent matrix
328  }
329  if((devperm==devlstblk)&&(lstblksiz>0)){
330  mycolsize -= (nb-(remm%nb));
331  }
332  mycolsize = min(mycolsize, m);
333  if(mycolsize>0){
334  gpuisactive[dev] = mycolsize;
335  if(masterdev==-1) {
336  masterdev = dev;
337  nbcmplxactive = nbcmplxactive +1;
338  cmplxisactive[cmplxid] = 1;
339  gnode[cmplxid][MagmaMaxGPUs+1] = masterdev;
340  }
341  }
342  }
343  }
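 // Layout of gnode assumed here: gnode[cmplxid][0..myngpu-1] lists the GPUs of group
 // cmplxid, gnode[cmplxid][MagmaMaxGPUs] holds myngpu, and gnode[cmplxid][MagmaMaxGPUs+1]
 // was just set to the group's master device (-1 if no GPU of the group owns any column here).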
344 /*
345  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
346  magma_setdevice( dev );
347  cudaDeviceSynchronize();
348  }
349 */
350  //*******************************
351  // Each GPU sends its result to the
352  // master of its GPU group ("complex").
353  // The master adds them up, then sends
354  // the partial sum to the masters of the
355  // other groups and receives theirs,
356  // adds those as well, and finally
357  // broadcasts the final result locally
358  // within its own group.
359  //*******************************
360  //printf("=======================================================================\n");
361  //printf(" sending to my master \n");
362  //printf("=======================================================================\n");
363  for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
364  myngpu = gnode[cmplxid][MagmaMaxGPUs];
365  masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
366  //check if this GPU group (complex) is active
367  if(masterdev!=-1){
368  for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
369  dev = gnode[cmplxid][idev];
370  mycolsize = gpuisactive[dev];
371  if(mycolsize>0){
372  // I am an active GPU. if I am not the master, then send my result to my master.
373  // store result on dwork[masterdev][dev*maxgsize]
374  if(dev!=masterdev){
375  magma_setdevice( dev );
376  //printf(" GPU %d sending to my master %d\n", dev, masterdev);
377  // wait until the geadd of my ROW and COL GEMM is done
378  magma_queue_wait_event(streams[ dev ][ 0 ], redevents[dev][0]);
379  // send to the master of my GPU group
380  cudaMemcpy2DAsync(&dwork2[masterdev][maxgsize*dev], m*sizeof(double),
381  &dC[dev][0], lddc*sizeof(double),
382  m*sizeof(double), n,
383  cudaMemcpyDeviceToDevice, streams[dev][0]);
384  magma_event_record(redevents[dev][masterdev], streams[dev][0]);
385  } // end I am not the masterdev
386  }// end if mycolsize>0
387  }// for idev
388  }// end of if masterdev!=-1, meaning the group is active
389  }// for cmplxid
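 // redevents[dev][masterdev] is recorded on dev's stream 0 right after the copy above;
 // in the next phase the master waits on it before adding this GPU's contribution from dwork2.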
390 /*
391  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
392  magma_setdevice( dev );
393  cudaDeviceSynchronize();
394  }
395 */
396 
397  //printf("=======================================================================\n");
398  //printf(" each master do addition of local result and broadcast to other masters \n");
399  //printf("=======================================================================\n");
400  for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
401  myngpu = gnode[cmplxid][MagmaMaxGPUs];
402  masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
403  //check if this GPU group (complex) is active
404  if(masterdev!=-1){
405  magma_setdevice( masterdev );
406  // addition is done on stream 0 sequentially
407  magmablasSetKernelStream( streams[ masterdev ][ 0 ] );
408  // wait until the geadd of my ROW and COL GEMM is done
409  magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][0]);
410  // ========================================
411  // local addition
412  // ========================================
413  for( magma_int_t l = 0; l < myngpu; ++l ) {
414  lcdev = gnode[cmplxid][l];
415  lccolsize = gpuisactive[lcdev];
416  if((lcdev!=masterdev)&&(lccolsize>0)){
417  //printf(" master %d receiving from %d and adding \n", masterdev, lcdev);
418  // this is an active GPU of my group.
419  // wait until I have received its contribution, then do the addition.
420  magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[lcdev][masterdev]);
421  magmablas_dgeadd(m, n, c_one,
422  &dwork2[masterdev][maxgsize*lcdev], m,
423  &dC[masterdev][0] , lddc );
424  }
425  }// for l=1:myngpu
426  // because addition is done sequentially on stream 0,
427  // I have to record this to be able to synch using it
428  magma_event_record(redevents[masterdev][masterdev], streams[masterdev][0]);
429  // ========================================
430  //
431  // ========================================
432  // send to other masters
433  // ========================================
434  for( magma_int_t k = 0; k < nbcmplx; ++k ) {
435  if(k!=cmplxid){
436  gmaster = gnode[k][MagmaMaxGPUs+1];
437  if(gmaster!=-1){ //group is active
438  //The master has to wait until the local addition finishes, then send using the gmaster stream.
439  //Use stream 0 to make it sequential, or stream gmaster to make it parallel.
440  //For now both are the same.
441  //printf(" master %d from cmplx %d sending to other master %d on cmplx %d \n", masterdev, cmplxid, gmaster, k);
442  magma_queue_wait_event(streams[ masterdev ][ gmaster ], redevents[masterdev][masterdev]);
443  cudaMemcpy2DAsync(&dwork2[gmaster][maxgsize*masterdev], m*sizeof(double),
444  &dC[masterdev][0], lddc*sizeof(double),
445  m*sizeof(double), n,
446  cudaMemcpyDeviceToDevice, streams[masterdev][gmaster]);
447  magma_event_record(redevents[masterdev][gmaster], streams[masterdev][gmaster]);
448  magma_event_record(redevents[masterdev][masterdev], streams[masterdev][gmaster]);
449  } // end of gmaster!=-1
450  } // end of k!=cmplxid
451  }// for k = 0: nbcmplx
452  // ========================================
453  }// end of if masterdev!=-1, meaning the group is active
454  }// for cmplxid
455 /*
456  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
457  magma_setdevice( dev );
458  cudaDeviceSynchronize();
459  }
460 */
461  //printf("=======================================================================\n");
462  //printf(" each master wait receiving other masters results, do the addition and broadcast locally \n");
463  //printf("=======================================================================\n");
464  for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
465  myngpu = gnode[cmplxid][MagmaMaxGPUs];
466  masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
467  //check if this GPU group (complex) is active
468  if(masterdev!=-1){
469  magma_setdevice( masterdev );
470  // addition is done on stream 0 sequentially
471  magmablasSetKernelStream( streams[ masterdev ][ 0 ] );
472  // the master has to wait until all the sends to the other masters have finished.
473  magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][masterdev]);
474  // ========================================
475  // addition of results from other masters
476  // ========================================
477  for( magma_int_t k = 0; k < nbcmplx; ++k ) {
478  if(k!=cmplxid){
479  gmaster = gnode[k][MagmaMaxGPUs+1];
480  if(gmaster!=-1){ //group is active
481  //Master has to wait until receiving from gmaster, then do addition using stream 0
482  //printf(" master %d from cmplx %d receiving from other master %d on cmplx %d and adding \n", masterdev, cmplxid, gmaster, k);
483  magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[gmaster][masterdev]);
484  magmablas_dgeadd(m, n, c_one,
485  &dwork2[masterdev][maxgsize*gmaster], m,
486  &dC[masterdev][0] , lddc );
487  } // end of gmaster!=-1
488  } // end of k!=cmplxid
489  }// for k = 0: nbcmplx
490  // because addition is done sequentially on stream 0,
491  // I have to record this to be able to synch using it
492  magma_event_record(redevents[masterdev][masterdev], streams[masterdev][0]);
493  // ========================================
494  // ========================================
495  // local broadcast of final results
496  // ========================================
497  for( magma_int_t l = 0; l < myngpu; ++l ) {
498  lcdev = gnode[cmplxid][l];
499  lccolsize = gpuisactive[lcdev];
500  if((lcdev!=masterdev)&&(lccolsize>0)){
501  // this is an active GPU of my group.
502  // wait until the previous addition is done, i.e. stream 0 has finished, and broadcast sequentially for now.
503  // to make it parallel, use stream lcdev instead of stream 0
504  //printf(" master %d broadcasting local to %d \n", masterdev, lcdev);
505  magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][masterdev]);
506  cudaMemcpy2DAsync(&dC[lcdev][0], lddc*sizeof(double),
507  &dC[masterdev][0], lddc*sizeof(double),
508  m*sizeof(double), n,
509  cudaMemcpyDeviceToDevice, streams[masterdev][0]);
510  magma_event_record(redevents[masterdev][lcdev], streams[masterdev][0]);
511  }
512  }// for l=1:myngpu
513  // ========================================
514  }// end of if masterdev!=-1, meaning the group is active
515  }// for cmplxid
516 /*
517  for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
518  magma_setdevice( dev );
519  cudaDeviceSynchronize();
520  }
521 */
522 
523 
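 // Before restoring the caller's device and stream, make every active GPU's stream 0 wait
 // on both its own accumulation event and the master's broadcast event, so that work queued
 // on these streams afterwards sees the completed dC.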
524  for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
525  myngpu = gnode[cmplxid][MagmaMaxGPUs];
526  masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
527  //check if this GPU group (complex) is active
528  if(masterdev!=-1){
529  for( magma_int_t l = 0; l < myngpu; ++l ) {
530  lcdev = gnode[cmplxid][l];
531  lccolsize = gpuisactive[lcdev];
532  if(lccolsize>0){
533  magma_setdevice( lcdev );
534  magma_queue_wait_event(streams[ lcdev ][ 0 ], redevents[lcdev][0]);
535  magma_queue_wait_event(streams[ lcdev ][ 0 ], redevents[masterdev][lcdev]);
536  }
537  }// for l=1:myngpu
538  }// end of if masterdev!=-1, meaning the group is active
539  }// for cmplxid
540 
541 
542 
543  //printf("****************************************************\n");
544  //printf(" finish dsymm \n");
545  //printf("****************************************************\n");
546 
547  magma_setdevice( cdev );
548  magmablasSetKernelStream( cstream );
549 
550 }