MAGMA  magma-1.4.0
Matrix Algebra on GPU and Multicore Architectures
dsymm_mgpu_spec.cpp File Reference
#include "common_magma.h"
#include "magma_bulge.h"
#include <assert.h>

Macros

#define dA(dev, i, j)   (dA[dev] + (i) + (j)*ldda)
 
#define dB(dev, i, j)   (dB[dev] + (i) + (j)*lddb)
 
#define dC(dev, i, j)   (dC[dev] + (i) + (j)*lddc)
 
#define dwork(dev, i, j)   (dwork[dev] + (i) + (j)*lddwork)
 
#define C(i, j)   (C + (i) + (j)*ldc)
 

Functions

void magmablas_dsymm_mgpu_spec (char side, char uplo, magma_int_t m, magma_int_t n, double alpha, double *dA[], magma_int_t ldda, magma_int_t offset, double *dB[], magma_int_t lddb, double beta, double *dC[], magma_int_t lddc, double *dwork[], magma_int_t dworksiz, double *C, magma_int_t ldc, double *work[], magma_int_t ldwork, magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream, magma_event_t redevents[][MagmaMaxGPUs*MagmaMaxGPUs+10], magma_int_t nbevents, magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2], magma_int_t nbcmplx)
 

Macro Definition Documentation

#define C(i, j)   (C + (i) + (j)*ldc)

#define dA(dev, i, j)   (dA[dev] + (i) + (j)*ldda)

#define dB(dev, i, j)   (dB[dev] + (i) + (j)*lddb)

#define dC(dev, i, j)   (dC[dev] + (i) + (j)*lddc)

#define dwork(dev, i, j)   (dwork[dev] + (i) + (j)*lddwork)
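
All five macros implement the usual column-major addressing: element (i, j) of the array on device dev lives at offset (i) + (j)*ld from that device's base pointer. A minimal standalone sketch of the arithmetic (illustrative only, with a made-up 4x3 tile and leading dimension; not MAGMA code):

    #include <cstdio>

    int main() {
        const int ldda = 4;              // leading dimension (column stride)
        double tile[4*3] = {0};          // one column-major 4x3 tile
        double *dA[1] = { tile };        // per-device pointer array, as in the API

        #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda)

        // element (2,1) sits at offset 2 + 1*4 = 6 from the tile base
        std::printf("offset = %td\n", dA(0,2,1) - dA(0,0,0));   // prints 6
        return 0;
    }

Because the macros shadow the parameter names, dA(dev,i,j) in the listing reads like 2-D indexing while compiling down to plain pointer arithmetic.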

Function Documentation

void magmablas_dsymm_mgpu_spec( char          side,
                                char          uplo,
                                magma_int_t   m,
                                magma_int_t   n,
                                double        alpha,
                                double *      dA[],
                                magma_int_t   ldda,
                                magma_int_t   offset,
                                double *      dB[],
                                magma_int_t   lddb,
                                double        beta,
                                double *      dC[],
                                magma_int_t   lddc,
                                double *      dwork[],
                                magma_int_t   dworksiz,
                                double *      C,
                                magma_int_t   ldc,
                                double *      work[],
                                magma_int_t   ldwork,
                                magma_int_t   ngpu,
                                magma_int_t   nb,
                                magma_queue_t streams[][20],
                                magma_int_t   nstream,
                                magma_event_t redevents[][MagmaMaxGPUs*MagmaMaxGPUs+10],
                                magma_int_t   nbevents,
                                magma_int_t   gnode[MagmaMaxGPUs][MagmaMaxGPUs+2],
                                magma_int_t   nbcmplx )

Definition at line 17 of file dsymm_mgpu_spec.cpp.

References dA, dB, magma_ceildiv(), magma_dgemm(), magma_event_record(), magma_getdevice(), magma_queue_wait_event(), magma_setdevice(), magmablas_dlacpy(), magmablasGetKernelStream(), magmablasSetKernelStream(), MagmaMaxGPUs, MagmaNoTrans, MagmaTrans, and min.
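
The listing below distributes the row blocks of the symmetric matrix cyclically over the GPUs, starting at stdev = (offset/nb)%ngpu, with the first block shortened by blockoffset = offset%nb. A host-only sketch of just that bookkeeping, with made-up sizes (variable names follow the listing; this is illustrative, not part of MAGMA):

    #include <cstdio>

    int main() {
        int ngpu = 3, nb = 64, m = 400, offset = 100;

        int stdev       = (offset/nb) % ngpu;               // GPU owning the first block
        int blockoffset = offset % nb;                      // start row inside that block
        int nbblk       = (m + blockoffset + nb - 1) / nb;  // magma_ceildiv(m+blockoffset, nb)

        for (int dev = 0; dev < ngpu; ++dev) {
            // cyclic position of dev relative to the starting GPU
            int devperm = (dev - stdev + ngpu) % ngpu;
            // floor share, plus one block if dev's position precedes the remainder
            int myblk   = nbblk/ngpu + (nbblk%ngpu > devperm ? 1 : 0);
            std::printf("GPU %d: cyclic position %d, owns %d of %d blocks\n",
                        dev, devperm, myblk, nbblk);
        }
        return 0;
    }

For these sizes the sketch reports 7 blocks split 2/3/2 across the devices, matching the myblk computation in the listing.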

{
    #define dA(dev, i, j)    (dA[dev] + (i) + (j)*ldda)
    #define dB(dev, i, j)    (dB[dev] + (i) + (j)*lddb)
    #define dC(dev, i, j)    (dC[dev] + (i) + (j)*lddc)
    #define dwork(dev, i, j) (dwork[dev] + (i) + (j)*lddwork)
    #define C(i, j)          (C + (i) + (j)*ldc)

    assert( ldda >= m );
    assert( lddb >= m );
    assert( lddc >= m );
    assert( nstream >= ngpu );
    assert( nbevents >= ngpu*ngpu );
    double *dwork1[MagmaMaxGPUs];
    double *dwork2[MagmaMaxGPUs];

    magma_int_t lddwork = lddc;
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        dwork1[dev] = dwork[dev];
        dwork2[dev] = dwork[dev] + n*lddwork;
    }
    assert( dworksiz >= (2*n*lddwork) );
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_t cstream;
    magmablasGetKernelStream( &cstream );

    magma_int_t dev, devperm, myblk, mycolsize, myblkoffst;
    magma_int_t gdev, gcolsize, gmaster, gngpu;
    magma_int_t masterdev, lcdev, lccolsize, myngpu;
    magma_int_t stdev       = (offset/nb)%ngpu;
    magma_int_t blockoffset = offset % nb;
    magma_int_t fstblksiz   = 0;
    if( blockoffset > 0 ){
        fstblksiz = min(m, (nb - blockoffset));
    }
    //magma_int_t nbblk = magma_ceildiv(m, nb);
    magma_int_t nbblk      = magma_ceildiv((m+blockoffset), nb);
    magma_int_t maxgsize   = n*nb*magma_ceildiv(nbblk, ngpu);
    magma_int_t remm       = m - fstblksiz;
    magma_int_t nbblkoffst = offset/nb;

    magma_int_t nblstblks = -1;
    magma_int_t devlstblk = -1;
    magma_int_t lstblksiz = remm%nb;
    if( lstblksiz > 0 ){
        nblstblks = nbblk%ngpu;
        devlstblk = (nblstblks-1+ngpu)%ngpu;
    }
    magma_int_t nbcmplxactive = 0;
    magma_int_t cmplxisactive[MagmaMaxGPUs];
    magma_int_t gpuisactive[MagmaMaxGPUs];
    memset( gpuisactive,   0, MagmaMaxGPUs*sizeof(magma_int_t) );
    memset( cmplxisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t) );
    //*******************************
    // each GPU makes a GEMM with the
    // transpose of its blocks to compute
    // a final portion of X = A*VT
    //*******************************
    /* dB = V*T already ==> dB' = T'*V'
     * computing T'*V'*X is the same as computing (VT)'_i * X_i locally;
     * each GPU then broadcasts its X_i to assemble the full X, which is used
     * to compute W = X - 0.5 * V * T'*V'*X = X - 0.5 * V * dwork3
     */
    if( ngpu == 1 ){
        magma_setdevice( 0 );
        magmablasSetKernelStream( streams[ 0 ][ 0 ] );
        // compute X[me] = A*VT = A[me]^tr * VT;
        magma_dgemm( MagmaTrans, MagmaNoTrans, m, n, m,
                     alpha, dA(0,offset,offset), ldda,
                            dB[0], lddb,
                     beta,  dC[0], lddc );
        return;
    }
    // ngpu > 1
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        masterdev = -1;
        gnode[cmplxid][MagmaMaxGPUs+1] = -1;
        myngpu = gnode[cmplxid][MagmaMaxGPUs];
        for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
            dev        = gnode[cmplxid][idev];
            devperm    = (dev-stdev+ngpu)%ngpu;
            myblk      = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1 : 0);
            mycolsize  = myblk*nb;
            myblkoffst = nb*((nbblkoffst/ngpu) + (nbblkoffst%ngpu > dev ? 1 : 0));
            if( dev == stdev ){
                mycolsize  -= blockoffset;
                myblkoffst += blockoffset;   // local index in parent matrix
            }
            if( (devperm == devlstblk) && (lstblksiz > 0) ){
                mycolsize -= (nb - (remm%nb));
            }
            mycolsize = min(mycolsize, m);

            if( mycolsize > 0 ){
                if( masterdev == -1 ) masterdev = dev;
                //printf("dev %d devperm %d on cmplx %d master %d nbblk %d myblk %d m %d n %d mycolsize %d stdev %d fstblksize %d lastdev %d lastsize %d dA(%d,%d,%d) ==> dwork(%d,%d)\n", dev, devperm, cmplxid, masterdev, nbblk, myblk, m, n, mycolsize, stdev, fstblksiz, devlstblk, remm%nb, dev, offset, myblkoffst, dev, maxgsize*dev);
                gpuisactive[dev] = mycolsize;
                magma_setdevice( dev );
                magmablasSetKernelStream( streams[ dev ][ dev ] );

                magma_dgemm( MagmaTrans, MagmaNoTrans, mycolsize, n, m,
                             alpha, dA(dev,offset,myblkoffst), ldda,
                                    dB(dev,0,0), lddb,
                             beta,  &dwork[dev][maxgsize*dev], mycolsize );
                magma_event_record( redevents[dev][dev*ngpu+dev], streams[dev][dev] );
            }
            if( dev == masterdev ){
                nbcmplxactive = nbcmplxactive + 1;
                cmplxisactive[cmplxid] = 1;
                gnode[cmplxid][MagmaMaxGPUs+1] = masterdev;
            }
        }
    }
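    // Event-indexing convention used from here on: redevents[rec][dst*ngpu + src]
    // is recorded by GPU rec to signal that the dwork slab originating on GPU src
    // is ready for GPU dst. Within a complex the consumer waits on the producer's
    // own event; across complexes a non-master consumer waits on the event its
    // master recorded on the producer's behalf.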
/*
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_queue_sync( streams[ dev ][ dev ] );
    }
*/
    //*******************************
    // each master GPU has the final
    // result, either by receiving it
    // from the CPU or by doing the
    // addition itself, so now it is
    // time to broadcast it over the
    // GPUs of its board.
    //*******************************
    //printf("=======================================================================\n");
    //printf("   sending                                                             \n");
    //printf("=======================================================================\n");
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu    = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
            dev = gnode[cmplxid][idev];
            mycolsize = gpuisactive[dev];
            if( mycolsize > 0 ){
                // I am an active GPU: send my portion locally
                // to all active GPUs of my complex, and globally to the
                // active master of each other complex, which should then
                // send it out to its active slaves.
                magma_setdevice( dev );
                //==============================================
                // sending to the masters of the other active complexes
                //==============================================
                //printf("\n\n**************GPU %d\n ", dev);
                //printf("   GPU %d sending to cmplx masters\n", dev);
                for( magma_int_t k = 0; k < nbcmplx; ++k ) {
                    if( k != cmplxid ){
                        gmaster = gnode[k][MagmaMaxGPUs+1];
                        if( gmaster != -1 ){ // complex is active
                            //printf("   device %d from cmplx %d is sending to master %d on cmplx %d block of size %d event %d\n", dev, cmplxid, gmaster, k, mycolsize, redevents[dev][gmaster*ngpu+dev]);
                            magma_queue_wait_event( streams[ dev ][ gmaster ], redevents[dev][dev*ngpu+dev] );
                            cudaMemcpy2DAsync( &dwork[gmaster][maxgsize*dev], mycolsize*sizeof(double),
                                               &dwork[dev][maxgsize*dev],     mycolsize*sizeof(double),
                                               mycolsize*sizeof(double), n,
                                               cudaMemcpyDeviceToDevice, streams[dev][gmaster] );
                            magma_event_record( redevents[dev][gmaster*ngpu+dev], streams[dev][gmaster] );
                        }
                    }
                }
                //==============================================
                //
                //==============================================
                // sending to the active GPUs of my complex
                //==============================================
                //printf("   GPU %d sending internal\n", dev);
                for( magma_int_t l = 0; l < myngpu; ++l ) {
                    lcdev = gnode[cmplxid][l];
                    lccolsize = gpuisactive[lcdev];
                    if( (lcdev != dev) && (lccolsize > 0) ){
                        //printf("   device %d from cmplx %d is sending internal to dev %d block of size %d event %d\n", dev, cmplxid, lcdev, mycolsize, redevents[dev][lcdev*ngpu+dev]);
                        magma_queue_wait_event( streams[ dev ][ lcdev ], redevents[dev][dev*ngpu+dev] );
                        cudaMemcpy2DAsync( &dwork[lcdev][maxgsize*dev], mycolsize*sizeof(double),
                                           &dwork[dev][maxgsize*dev],   mycolsize*sizeof(double),
                                           mycolsize*sizeof(double), n,
                                           cudaMemcpyDeviceToDevice, streams[dev][lcdev] );
                        magma_event_record( redevents[dev][lcdev*ngpu+dev], streams[dev][lcdev] );
                    }
                }
                //==============================================
            } // end if mycolsize > 0
        } // for idev
    } // for cmplxid

    //printf("=======================================================================\n");
    //printf("   master wait and resend internally                                   \n");
    //printf("=======================================================================\n");
    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu    = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        //==============================================
        // if I am an active master, wait to receive the
        // contributions of the GPUs of the other complexes
        // and forward them locally
        //==============================================
        if( masterdev != -1 ){
            mycolsize = gpuisactive[masterdev];
            magma_setdevice( masterdev );
            //printf("   GPU %d distributing internal\n", masterdev);
            for( magma_int_t k = 0; k < nbcmplx; ++k ) {
                if( k != cmplxid ){
                    gngpu = gnode[k][MagmaMaxGPUs];
                    for( magma_int_t g = 0; g < gngpu; ++g ) {
                        gdev = gnode[k][g];
                        gcolsize = gpuisactive[gdev];
                        // check whether I received from this GPU;
                        // if yes, send it on to my group
                        if( gcolsize > 0 ){
                            magma_queue_wait_event( streams[ masterdev ][ gdev ], redevents[gdev][masterdev*ngpu+gdev] );
                            for( magma_int_t l = 0; l < myngpu; ++l ) {
                                lcdev = gnode[cmplxid][l];
                                lccolsize = gpuisactive[lcdev];
                                if( (lcdev != masterdev) && (lccolsize > 0) ){
                                    //printf("   Master %d on cmplx %d waiting on event %d is distributing internal results of %d to lcdev %d block of size %d event %d\n", masterdev, cmplxid, redevents[gdev][masterdev*ngpu+gdev], gdev, lcdev, gcolsize, redevents[masterdev][lcdev*ngpu+gdev]);
                                    cudaMemcpy2DAsync( &dwork[lcdev][maxgsize*gdev],     gcolsize*sizeof(double),
                                                       &dwork[masterdev][maxgsize*gdev], gcolsize*sizeof(double),
                                                       gcolsize*sizeof(double), n,
                                                       cudaMemcpyDeviceToDevice, streams[masterdev][gdev] );
                                    magma_event_record( redevents[masterdev][lcdev*ngpu+gdev], streams[masterdev][gdev] );
                                }
                            }
                        }
                    }
                }
            }
        } // if active master
        //==============================================
    } // for cmplxid
/*
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_queue_sync( streams[ dev ][ 0 ] );
        for( magma_int_t s = 0; s < ngpu; ++s ) {
            magma_queue_sync( streams[ dev ][ s ] );
        }
    }
*/
    //printf("=======================================================================\n");
    //printf("   distributing                                                        \n");
    //printf("=======================================================================\n");
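    // Row mapping used by both copy loops below: local block blki held by GPU
    // gdev (cyclic position devperm) starts at global row
    // gbblki = (blki*ngpu + devperm)*nb - blockoffset, and the very first
    // block on the starting GPU stdev is shortened to nb - blockoffset rows.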
    magma_int_t lcblki, gbblki, gblk, ib;

    for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) {
        myngpu    = gnode[cmplxid][MagmaMaxGPUs];
        masterdev = gnode[cmplxid][MagmaMaxGPUs+1];
        for( magma_int_t idev = 0; idev < myngpu; ++idev ) {
            dev = gnode[cmplxid][idev];
            mycolsize = gpuisactive[dev];
            if( mycolsize > 0 ){ // I am an active GPU
                //printf("\n\n==============GPU %d collecting\n", dev);
                magma_setdevice( dev );
                // Collect my own results first, since there is nothing to wait
                // to receive; I only wait for my GEMMs to be done.
                // In theory this should be inside the loop below, but CUDA was
                // not able to run it first on all GPUs: on gpu > 0 it was
                // waiting, even though it was on a different stream and should
                // have run. Maybe this is because there are too many function
                // calls and CUDA does not handle that well. In any case, this
                // could be removed once CUDA is able to launch it first
                // without waiting.
                gdev = dev;
                gcolsize = gpuisactive[gdev];
                if( gcolsize > 0 ){
                    devperm = (gdev-stdev+ngpu)%ngpu;
                    gblk    = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1 : 0);
                    magmablasSetKernelStream( streams[ dev ][ gdev ] );
                    magma_queue_wait_event( streams[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev] );
                    //printf("   GPU %d stream %d doing dlacpy\n", dev, streams[ dev ][ gdev ]);
                    for( magma_int_t blki = 0; blki < gblk; ++blki ){
                        gbblki = (blki*ngpu + devperm)*nb - blockoffset;
                        lcblki = blki*nb;
                        ib     = nb; //min(nb, m-gbblki);
                        if( gdev == stdev ){
                            lcblki = blki*nb - blockoffset;
                            if( blki == 0 ){
                                gbblki = 0;
                                lcblki = 0;
                                ib     = nb - blockoffset;
                            }
                        }
                        ib = min(ib, m-gbblki);
                        //printf("   blockoffset %d nbblk %d stdev %d receiving from gdev %d gblk %d gcolsize %d copying blki %d of size ibxn %dx%d from work[%d] to C[%d]\n", blockoffset, nbblk, stdev, gdev, gblk, gcolsize, blki, ib, n, lcblki, gbblki);
                        magmablas_dlacpy( 'A', ib, n, &dwork[dev][maxgsize*gdev+lcblki], gcolsize, &dC[dev][gbblki], lddc );
                    } // end blki
                }

                for( magma_int_t k = 0; k < nbcmplx; ++k ) {
                    gngpu = gnode[k][MagmaMaxGPUs];
                    for( magma_int_t g = 0; g < gngpu; ++g ) {
                        gdev = gnode[k][g];
                        gcolsize = gpuisactive[gdev];
                        // if gcolsize > 0, GPU gdev was active, so I received
                        // from it (or computed) a portion of dwork;
                        // go over its gblk blocks and distribute them into dC.
                        if( gdev != dev ){
                            if( gcolsize > 0 ){
                                devperm = (gdev-stdev+ngpu)%ngpu;
                                gblk    = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1 : 0);
                                magmablasSetKernelStream( streams[ dev ][ gdev ] );
                                if( k == cmplxid ){
                                    // we are in the same complex, so wait on the event issued by gdev for me, citing its id
                                    magma_queue_wait_event( streams[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev] );
                                    //printf("   GPU %d stream %d waiting on event %d to collect from %d the size of gcolsize %d\n", dev, streams[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev], gdev, gcolsize);
                                } else {
                                    // we are in different complexes, so:
                                    // if I am the master, wait on the event issued by gdev for me, citing its id;
                                    // else wait on the event issued by my master for me on behalf of gdev
                                    //printf("   GPU %d stream %d waiting on event %d to collect from %d the size of gcolsize %d\n", dev, streams[ dev ][ gdev ], redevents[masterdev][dev*ngpu+gdev], gdev, gcolsize);
                                    if( dev == masterdev )
                                        magma_queue_wait_event( streams[ dev ][ gdev ], redevents[gdev][dev*ngpu+gdev] );
                                    else
                                        magma_queue_wait_event( streams[ dev ][ gdev ], redevents[masterdev][dev*ngpu+gdev] );
                                }
                                //printf("   GPU %d stream %d doing dlacpy\n", dev, streams[ dev ][ gdev ]);
                                for( magma_int_t blki = 0; blki < gblk; ++blki ){
                                    gbblki = (blki*ngpu + devperm)*nb - blockoffset;
                                    lcblki = blki*nb;
                                    ib     = nb; //min(nb, m-gbblki);
                                    if( gdev == stdev ){
                                        lcblki = blki*nb - blockoffset;
                                        if( blki == 0 ){
                                            gbblki = 0;
                                            lcblki = 0;
                                            ib     = nb - blockoffset;
                                        }
                                    }
                                    ib = min(ib, m-gbblki);
                                    //printf("   blockoffset %d nbblk %d stdev %d receiving from gdev %d gblk %d gcolsize %d copying blki %d of size ibxn %dx%d from work[%d] to C[%d]\n", blockoffset, nbblk, stdev, gdev, gblk, gcolsize, blki, ib, n, lcblki, gbblki);
                                    magmablas_dlacpy( 'A', ib, n, &dwork[dev][maxgsize*gdev+lcblki], gcolsize, &dC[dev][gbblki], lddc );
                                } // end blki
                            } // end gcolsize > 0, meaning gdev is active
                        } // end if gdev != dev
                    } // end loop over the g GPUs of complex k
                } // end loop over the complexes k
            } // end mycolsize > 0, meaning that I am active
        } // end loop over idev of cmplxid
    } // end loop over the complexes
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        cudaDeviceSynchronize();
    }

    // restore the caller's device and its kernel stream
    magma_setdevice( cdev );
    magmablasSetKernelStream( cstream );
}
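
The asserts at the top of the listing pin down what the caller must provide: at least ngpu streams per device, at least ngpu*ngpu events per device, and dworksiz >= 2*n*lddwork workspace elements. A hedged setup sketch for the streams and events (not MAGMA code; MaxGPUs below is a stand-in for MagmaMaxGPUs, and in MAGMA 1.x with the CUDA backend magma_queue_t and magma_event_t are typedefs of cudaStream_t and cudaEvent_t, so plain CUDA runtime calls are used):

    #include <cuda_runtime.h>

    enum { MaxGPUs = 8 };   // stand-in for MagmaMaxGPUs; see magma_types.h

    void setup_streams_and_events( int ngpu,
                                   cudaStream_t streams[][20],
                                   cudaEvent_t  redevents[][MaxGPUs*MaxGPUs+10] )
    {
        for( int dev = 0; dev < ngpu; ++dev ) {
            cudaSetDevice( dev );
            // the routine uses streams[dev][0..ngpu-1], one per peer GPU
            // (assert: nstream >= ngpu)
            for( int s = 0; s < ngpu; ++s )
                cudaStreamCreate( &streams[dev][s] );
            // events indexed redevents[rec][dst*ngpu + src]
            // (assert: nbevents >= ngpu*ngpu)
            for( int e = 0; e < ngpu*ngpu; ++e )
                cudaEventCreateWithFlags( &redevents[dev][e], cudaEventDisableTiming );
        }
    }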
