PULSAR 2.0.0
Parallel Ultra-Light Systolic Array Runtime
svg_trace.c
// svg_trace.c - generation of an SVG execution trace and memory usage report.

#include "svg_trace.h"

// Per-core events.
static int    eventNumCore  [SVG_TRACE_MAX_CORES];
static double eventStartCore[SVG_TRACE_MAX_CORES][SVG_TRACE_MAX_EVENTS];
static double eventStopCore [SVG_TRACE_MAX_CORES][SVG_TRACE_MAX_EVENTS];
static int    eventColorCore[SVG_TRACE_MAX_CORES][SVG_TRACE_MAX_EVENTS];

// Per-device events.
static int         eventNumDevice  [SVG_TRACE_MAX_DEVICES];
static cudaEvent_t evStartDevice   [SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];
static cudaEvent_t evStopDevice    [SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];
static double      eventStartDevice[SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];
static double      eventStopDevice [SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];
static int         eventColorDevice[SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];

// Per-device DMA events.
static int         eventNumDMA  [SVG_TRACE_MAX_DEVICES];
static cudaEvent_t evStartDMA   [SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];
static cudaEvent_t evStopDMA    [SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];
static double      eventStartDMA[SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];
static double      eventStopDMA [SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];
static int         eventColorDMA[SVG_TRACE_MAX_DEVICES][SVG_TRACE_MAX_EVENTS];

// Memory usage.
static long memoryLevelHost;
static long memoryMaxHost;
static long memoryLevelDevice[SVG_TRACE_MAX_DEVICES];
static long memoryMaxDevice  [SVG_TRACE_MAX_DEVICES];
static pthread_spinlock_t memorySpinlockHost;
static pthread_spinlock_t memorySpinlockDevice[SVG_TRACE_MAX_DEVICES];

static void svg_trace_print_cores(
    int mpi_rank, int mpi_size,
    int num_cores, int num_devices,
    double hscale, double vscale, FILE *trace_file);

static void svg_trace_print_devices(
    int mpi_rank, int mpi_size,
    int num_cores, int num_devices,
    double hscale, double vscale, FILE *trace_file);

static void svg_trace_print_dmas(
    int mpi_rank, int mpi_size,
    int num_cores, int num_devices,
    double hscale, double vscale, FILE *trace_file);

static void svg_trace_print_memory(int mpi_rank, int mpi_size, int num_devices);

// Return the current time of day in seconds.
double get_time_of_day()
{
    struct timeval time_val;
    struct timezone time_zone;
    gettimeofday(&time_val, &time_zone);
    return (double)(time_val.tv_sec) + (double)(time_val.tv_usec) / 1000000.0;
}

// Initialize tracing: create the CUDA events used for device and DMA timing
// and the spinlocks protecting the host and device memory counters.
void svg_trace_init(int num_cores, int num_devices)
{
    assert(num_cores <= SVG_TRACE_MAX_CORES);
    assert(num_devices <= SVG_TRACE_MAX_DEVICES);
    // Event and snapshot capacities must be powers of two (they are used as bit masks).
    assert(__builtin_popcount(SVG_TRACE_MAX_EVENTS) == 0x01);
    assert(__builtin_popcount(SVG_TRACE_MAX_MEM_SNAPSHOTS) == 0x01);

    // Initialize host memory level spinlock.
    int retval = pthread_spin_init(
        &memorySpinlockHost, PTHREAD_PROCESS_PRIVATE);
    assert(retval == 0);

    int device;
    // FOR each device.
    for (device = 0; device < num_devices; device++) {
        // Set device.
        cudaError_t error = cudaSetDevice(device);
        assert(error == cudaSuccess);

        int event;
        // Initialize device and DMA events.
        for (event = 0; event < SVG_TRACE_MAX_EVENTS; event++) {
            cudaError_t error1 = cudaEventCreate(&evStartDevice[device][event]);
            cudaError_t error2 = cudaEventCreate(&evStopDevice [device][event]);
            assert(error1 == cudaSuccess);
            assert(error2 == cudaSuccess);
            error1 = cudaEventCreate(&evStartDMA[device][event]);
            error2 = cudaEventCreate(&evStopDMA [device][event]);
            assert(error1 == cudaSuccess);
            assert(error2 == cudaSuccess);
        }
        // Initialize device memory level spinlock.
        int retval = pthread_spin_init(
            &memorySpinlockDevice[device], PTHREAD_PROCESS_PRIVATE);
        assert(retval == 0);
    }
}
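
/*
 * Usage sketch (illustrative only; run_task() and the color values are
 * placeholders, not part of PULSAR). A typical run brackets every task with
 * a start/stop pair and writes the trace once at the end:
 *
 *     svg_trace_init(num_cores, num_devices);
 *
 *     // In each worker thread, for each task:
 *     svg_trace_start_cpu(thread_rank);
 *     run_task();
 *     svg_trace_stop_cpu(thread_rank, 0x00AA00);   // color = 0xRRGGBB fill
 *
 *     // After the last task, on every MPI rank:
 *     svg_trace_finish(num_cores, num_devices);    // rank 0 writes trace_<time>.svg
 */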

// Record the start time of the current CPU event for the given thread.
void svg_trace_start_cpu(int thread_rank)
{
    assert(thread_rank < SVG_TRACE_MAX_CORES);
    eventStartCore[thread_rank][eventNumCore[thread_rank]] = get_time_of_day();
}

// Record the stop time and color of the current CPU event for the given
// thread, then advance the event counter (wrapping at SVG_TRACE_MAX_EVENTS).
void svg_trace_stop_cpu(int thread_rank, int color)
{
    assert(thread_rank < SVG_TRACE_MAX_CORES);
    eventStopCore [thread_rank][eventNumCore[thread_rank]] = get_time_of_day();
    eventColorCore[thread_rank][eventNumCore[thread_rank]] = color;
    eventNumCore[thread_rank]++;
    eventNumCore[thread_rank] &= (SVG_TRACE_MAX_EVENTS-1);
}

// Record a start event for the current GPU in the given stream.
void svg_trace_start_gpu(cudaStream_t stream)
{
    int device;
    cudaGetDevice(&device);
    assert(device < SVG_TRACE_MAX_DEVICES);
    cudaError_t error = cudaEventRecord(
        evStartDevice[device][eventNumDevice[device]], stream);
    assert(error == cudaSuccess);
}

// Record a stop event and color for the current GPU in the given stream,
// then advance the event counter (wrapping at SVG_TRACE_MAX_EVENTS).
void svg_trace_stop_gpu(cudaStream_t stream, int color)
{
    int device;
    cudaGetDevice(&device);
    assert(device < SVG_TRACE_MAX_DEVICES);
    cudaError_t error = cudaEventRecord(
        evStopDevice[device][eventNumDevice[device]], stream);
    assert(error == cudaSuccess);
    eventColorDevice[device][eventNumDevice[device]] = color;
    eventNumDevice[device]++;
    eventNumDevice[device] &= (SVG_TRACE_MAX_EVENTS-1);
}

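/*
 * Usage sketch for device tracing (illustrative only; my_kernel and its launch
 * parameters are placeholders). The events are recorded asynchronously in the
 * stream, so the start/stop calls simply bracket the work queued between them:
 *
 *     svg_trace_start_gpu(stream);
 *     my_kernel<<<grid, block, 0, stream>>>(args);
 *     svg_trace_stop_gpu(stream, 0x3366CC);
 *
 * svg_trace_start_dma()/svg_trace_stop_dma() work the same way and would
 * typically bracket a cudaMemcpyAsync() on the DMA stream.
 */
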
// Record a start event for a DMA transfer on the current GPU in the given stream.
void svg_trace_start_dma(cudaStream_t stream)
{
    int device;
    cudaGetDevice(&device);
    assert(device < SVG_TRACE_MAX_DEVICES);
    cudaError_t error = cudaEventRecord(
        evStartDMA[device][eventNumDMA[device]], stream);
    assert(error == cudaSuccess);
}

// Record a stop event and color for a DMA transfer on the current GPU in the
// given stream, then advance the event counter (wrapping at SVG_TRACE_MAX_EVENTS).
void svg_trace_stop_dma(cudaStream_t stream, int color)
{
    int device;
    cudaGetDevice(&device);
    assert(device < SVG_TRACE_MAX_DEVICES);
    cudaError_t error = cudaEventRecord(
        evStopDMA[device][eventNumDMA[device]], stream);
    assert(error == cudaSuccess);
    eventColorDMA[device][eventNumDMA[device]] = color;
    eventNumDMA[device]++;
    eventNumDMA[device] &= (SVG_TRACE_MAX_EVENTS-1);
}

// Adjust the host memory level by delta bytes (positive for allocation,
// negative for deallocation) and track the maximum.
void svg_trace_memory_host(long delta)
{
    pthread_spin_lock(&memorySpinlockHost);
    memoryLevelHost += delta;
    if (memoryLevelHost > memoryMaxHost)
        memoryMaxHost = memoryLevelHost;
    pthread_spin_unlock(&memorySpinlockHost);
}

// Adjust the memory level of the current device by delta bytes and track the maximum.
void svg_trace_memory_device(long delta)
{
    int device;
    cudaGetDevice(&device);
    assert(device < SVG_TRACE_MAX_DEVICES);
    pthread_spin_lock(&memorySpinlockDevice[device]);
    memoryLevelDevice[device] += delta;
    if (memoryLevelDevice[device] > memoryMaxDevice[device])
        memoryMaxDevice[device] = memoryLevelDevice[device];
    pthread_spin_unlock(&memorySpinlockDevice[device]);
}

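/*
 * Usage sketch for memory tracking (illustrative only). Callers report every
 * allocation with a positive delta and every deallocation with a matching
 * negative delta, so the counter reflects the live memory level:
 *
 *     double *work = (double*)malloc(size);
 *     svg_trace_memory_host((long)size);
 *     ...
 *     free(work);
 *     svg_trace_memory_host(-(long)size);
 *
 * Device allocations (cudaMalloc/cudaFree) would be reported the same way
 * through svg_trace_memory_device().
 */
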
// Finish tracing: compute the scaling factors, write the SVG trace file on
// rank 0, and print the memory usage report.
void svg_trace_finish(int num_cores, int num_devices)
{
    int mpi_rank;
    int mpi_size;
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);

    int core;

    double max_time_in = 0.0;
    // Find the maximum CPU timestamp (relative to the first recorded stop time).
    for (core = 0; core < SVG_TRACE_MAX_CORES; core++) {
        if (eventNumCore[core] == 0)
            continue;
        double time =
            eventStopCore[core][eventNumCore[core]-1] -
            eventStopCore[0][0];
        if (time > max_time_in)
            max_time_in = time;
    }
    double max_time;
#ifdef MPI
    MPI_Reduce(
        &max_time_in, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
#else
    max_time = max_time_in;
#endif
    // Compute the horizontal scaling factors.
    // CPU timestamps are in seconds; CUDA event times are in milliseconds.
    double hscale_cpu = 2000.0 / max_time;
    double hscale_gpu = hscale_cpu / 1000.0;

    // Compute the vertical scaling factor.
    double vscale = 1000.0 / ((num_cores+num_devices*2)*mpi_size);

    FILE *trace_file;
    if (mpi_rank == 0) {
        char trace_file_name[SVG_TRACE_FILE_NAME_SIZE];
        sprintf(trace_file_name, "trace_%d.svg", (int)(time(NULL)));
        trace_file = fopen(trace_file_name, "w");
        assert(trace_file != NULL);
        fprintf(trace_file,
            "<svg width=\"200mm\" height=\"100mm\" viewBox=\"0 0 2000 1000\">\n"
            " <g>\n");
    }
    // Print cores' traces.
    svg_trace_print_cores(
        mpi_rank, mpi_size,
        num_cores, num_devices,
        hscale_cpu, vscale, trace_file);

    // Print devices' traces.
    svg_trace_print_devices(
        mpi_rank, mpi_size,
        num_cores, num_devices,
        hscale_gpu, vscale, trace_file);

    // Print DMAs' traces.
    svg_trace_print_dmas(
        mpi_rank, mpi_size,
        num_cores, num_devices,
        hscale_gpu, vscale, trace_file);

    if (mpi_rank == 0) {
        fprintf(trace_file,
            " </g>\n"
            "</svg>\n");
        fclose(trace_file);
    }
    // Print memory usage info.
    svg_trace_print_memory(mpi_rank, mpi_size, num_devices);
}

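/*
 * Worked example of the scaling (illustrative numbers, not from a real run).
 * The SVG canvas is 2000 x 1000 user units. With one rank, 16 cores, and
 * 2 devices there are 16 + 2*2 = 20 rows, so vscale = 1000/20 = 50 and every
 * rectangle is 0.9*50 = 45 units tall. If the longest core trace spans 10
 * seconds, hscale_cpu = 2000/10 = 200 units per second, so a task running
 * from t = 2.0 s to t = 2.5 s becomes x = 400, width = 100. CUDA event times
 * are reported in milliseconds, hence hscale_gpu = hscale_cpu/1000 = 0.2
 * units per millisecond; a 50 ms kernel is drawn 10 units wide.
 */
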
// Gather the per-core events from all ranks on rank 0 and emit one SVG
// rectangle per event.
static void svg_trace_print_cores(
    int mpi_rank, int mpi_size,
    int num_cores, int num_devices,
    double hscale, double vscale, FILE *trace_file)
{
    int proc;
    int core;
    int event;
    if (mpi_rank == 0) {
        for (proc = 0; proc < mpi_size; proc++) {
            if (proc > 0) {
                // Receive events.
                MPI_Recv(
                    &eventNumCore[0],
                    SVG_TRACE_MAX_CORES, MPI_INT,
                    proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                MPI_Recv(
                    &eventStartCore[0][0],
                    SVG_TRACE_MAX_CORES*SVG_TRACE_MAX_EVENTS, MPI_DOUBLE,
                    proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                MPI_Recv(
                    &eventStopCore[0][0],
                    SVG_TRACE_MAX_CORES*SVG_TRACE_MAX_EVENTS, MPI_DOUBLE,
                    proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                MPI_Recv(
                    &eventColorCore[0][0],
                    SVG_TRACE_MAX_CORES*SVG_TRACE_MAX_EVENTS, MPI_INT,
                    proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            }
            for (core = 0; core < num_cores; core++) {
                for (event = 0; event < eventNumCore[core]; event++) {
                    double start = eventStartCore[core][event] - eventStopCore[0][0];
                    double stop  = eventStopCore [core][event] - eventStopCore[0][0];
                    double width = (stop-start) * hscale;
                    int color = eventColorCore[core][event];
                    // Row index: one row per core, below the rows of the preceding ranks.
                    int thread = proc*(num_cores+2*num_devices);
                    thread += core;
                    fprintf(trace_file,
                        " <rect "
                        "x=\"%.2lf\" y=\"%.0lf\" width=\"%.2lf\" height=\"%.0lf\" "
                        "fill=\"#%06x\" stroke=\"#%06x\" stroke-width=\"0.5\"/>\n",
                        start * hscale,
                        thread * vscale,
                        width < 2.0 ? 2.0 : width,
                        vscale * 0.9,
                        abs(color),
                        color < 0 ? color : 0);
                }
            }
        }
    }
    else {
        // Send events.
        MPI_Send(
            &eventNumCore[0],
            SVG_TRACE_MAX_CORES,
            MPI_INT, 0, 0, MPI_COMM_WORLD);
        MPI_Send(
            &eventStartCore[0][0],
            SVG_TRACE_MAX_CORES*SVG_TRACE_MAX_EVENTS,
            MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
        MPI_Send(
            &eventStopCore[0][0],
            SVG_TRACE_MAX_CORES*SVG_TRACE_MAX_EVENTS,
            MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
        MPI_Send(
            &eventColorCore[0][0],
            SVG_TRACE_MAX_CORES*SVG_TRACE_MAX_EVENTS,
            MPI_INT, 0, 0, MPI_COMM_WORLD);
    }
}

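/*
 * Example of an emitted rectangle (values chosen for illustration): with
 * hscale = 200 and vscale = 50, an event on core 3 of rank 0 that starts
 * 2.0 s after the origin, lasts 0.5 s, and was recorded with color 0x00AA00
 * produces
 *
 *     <rect x="400.00" y="150" width="100.00" height="45" fill="#00aa00" stroke="#000000" stroke-width="0.5"/>
 *
 * The fill is the event color read as 0xRRGGBB; non-negative colors get a
 * black stroke. Rectangles narrower than 2 units are widened to 2 so that
 * very short events remain visible.
 */
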
// Gather the per-device compute events from all ranks on rank 0 and emit one
// SVG rectangle per event. Event zero serves as the time origin of each device.
static void svg_trace_print_devices(
    int mpi_rank, int mpi_size,
    int num_cores, int num_devices,
    double hscale, double vscale, FILE *trace_file)
{
    int event;
    int device;
    // Convert cudaEvent_t to double (milliseconds since the device's first event).
    for (device = 0; device < num_devices; device++) {
        cudaError_t error = cudaSetDevice(device);
        assert(error == cudaSuccess);
        for (event = 0; event < eventNumDevice[device]; event++) {
            float fstart;
            float fstop;
            cudaEventElapsedTime(&fstart,
                evStartDevice[device][0], evStartDevice[device][event]);
            cudaEventElapsedTime(&fstop,
                evStartDevice[device][0], evStopDevice[device][event]);
            double dstart = (double)fstart;
            double dstop = (double)fstop;
            eventStartDevice[device][event] = dstart;
            eventStopDevice[device][event] = dstop;
        }
        eventStartDevice[device][0] = 0.0;
        eventStopDevice[device][0] = 0.0;
    }
    int proc;
    if (mpi_rank == 0) {
        for (proc = 0; proc < mpi_size; proc++) {
            if (proc > 0) {
                // Receive events.
                MPI_Recv(
                    &eventNumDevice[0],
                    SVG_TRACE_MAX_DEVICES, MPI_INT,
                    proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                MPI_Recv(
                    &eventStartDevice[0][0],
                    SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS, MPI_DOUBLE,
                    proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                MPI_Recv(
                    &eventStopDevice[0][0],
                    SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS, MPI_DOUBLE,
                    proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                MPI_Recv(
                    &eventColorDevice[0][0],
                    SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS, MPI_INT,
                    proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            }
            for (device = 0; device < num_devices; device++) {
                for (event = 1; event < eventNumDevice[device]; event++) {
                    double start = eventStartDevice[device][event];
                    double stop  = eventStopDevice [device][event];
                    double width = (stop-start) * hscale;
                    int color = eventColorDevice[device][event];
                    // Row index: device rows follow the core rows of the same rank.
                    int accel = proc*(num_cores+2*num_devices);
                    accel += num_cores+device;
                    fprintf(trace_file,
                        " <rect "
                        "x=\"%.2lf\" y=\"%.0lf\" width=\"%.2lf\" height=\"%.0lf\" "
                        "fill=\"#%06x\" stroke=\"#%06x\" stroke-width=\"0.5\"/>\n",
                        start * hscale,
                        accel * vscale,
                        width < 2.0 ? 2.0 : width,
                        vscale * 0.9,
                        abs(color),
                        color < 0 ? color : 0);
                }
            }
        }
    }
    else {
        // Send events.
        MPI_Send(
            &eventNumDevice[0],
            SVG_TRACE_MAX_DEVICES,
            MPI_INT, 0, 0, MPI_COMM_WORLD);
        MPI_Send(
            &eventStartDevice[0][0],
            SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS,
            MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
        MPI_Send(
            &eventStopDevice[0][0],
            SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS,
            MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
        MPI_Send(
            &eventColorDevice[0][0],
            SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS,
            MPI_INT, 0, 0, MPI_COMM_WORLD);
    }
}

// Gather the per-device DMA events from all ranks on rank 0 and emit one SVG
// rectangle per event. Event zero serves as the time origin of each device.
static void svg_trace_print_dmas(
    int mpi_rank, int mpi_size,
    int num_cores, int num_devices,
    double hscale, double vscale, FILE *trace_file)
{
    int event;
    int device;
    // Convert cudaEvent_t to double (milliseconds since the device's first event).
    for (device = 0; device < num_devices; device++) {
        cudaError_t error = cudaSetDevice(device);
        assert(error == cudaSuccess);
        for (event = 0; event < eventNumDMA[device]; event++) {
            float fstart;
            float fstop;
            cudaEventElapsedTime(&fstart,
                evStartDMA[device][0], evStartDMA[device][event]);
            cudaEventElapsedTime(&fstop,
                evStartDMA[device][0], evStopDMA[device][event]);
            double dstart = (double)fstart;
            double dstop = (double)fstop;
            eventStartDMA[device][event] = dstart;
            eventStopDMA[device][event] = dstop;
        }
        eventStartDMA[device][0] = 0.0;
        eventStopDMA[device][0] = 0.0;
    }
    int proc;
    if (mpi_rank == 0) {
        for (proc = 0; proc < mpi_size; proc++) {
            if (proc > 0) {
                // Receive events.
                MPI_Recv(
                    &eventNumDMA[0],
                    SVG_TRACE_MAX_DEVICES, MPI_INT,
                    proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                MPI_Recv(
                    &eventStartDMA[0][0],
                    SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS, MPI_DOUBLE,
                    proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                MPI_Recv(
                    &eventStopDMA[0][0],
                    SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS, MPI_DOUBLE,
                    proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                MPI_Recv(
                    &eventColorDMA[0][0],
                    SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS, MPI_INT,
                    proc, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            }
            for (device = 0; device < num_devices; device++) {
                for (event = 1; event < eventNumDMA[device]; event++) {
                    double start = eventStartDMA[device][event];
                    double stop  = eventStopDMA [device][event];
                    double width = (stop-start) * hscale;
                    int color = eventColorDMA[device][event];
                    // Row index: DMA rows follow the device rows of the same rank.
                    int accel = proc*(num_cores+2*num_devices);
                    accel += num_cores+num_devices+device;
                    fprintf(trace_file,
                        " <rect "
                        "x=\"%.2lf\" y=\"%.0lf\" width=\"%.2lf\" height=\"%.0lf\" "
                        "fill=\"#%06x\" stroke=\"#%06x\" stroke-width=\"0.5\"/>\n",
                        start * hscale,
                        accel * vscale,
                        width < 2.0 ? 2.0 : width,
                        vscale * 0.9,
                        abs(color),
                        color < 0 ? color : 0);
                }
            }
        }
    }
    else {
        // Send events.
        MPI_Send(
            &eventNumDMA[0],
            SVG_TRACE_MAX_DEVICES,
            MPI_INT, 0, 0, MPI_COMM_WORLD);
        MPI_Send(
            &eventStartDMA[0][0],
            SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS,
            MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
        MPI_Send(
            &eventStopDMA[0][0],
            SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS,
            MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
        MPI_Send(
            &eventColorDMA[0][0],
            SVG_TRACE_MAX_DEVICES*SVG_TRACE_MAX_EVENTS,
            MPI_INT, 0, 0, MPI_COMM_WORLD);
    }
}

// Print the maximum and final host and device memory levels of every rank,
// in human-readable units.
static void svg_trace_print_memory(int mpi_rank, int mpi_size, int num_devices)
{
    int rank;
    printf("\n");
    for (rank = 0; rank < mpi_size; rank++) {
        if (rank == mpi_rank) {

            // Print maximum host memory level.
            if (labs(memoryMaxHost) < 1024)
                printf("Host %d max:\t%ld B\n",
                    rank, memoryMaxHost);
            else if (labs(memoryMaxHost) < 1024*1024)
                printf("Host %d max:\t%.1lf KB\n",
                    rank, memoryMaxHost/1024.0);
            else if (labs(memoryMaxHost) < 1024*1024*1024)
                printf("Host %d max:\t%.1lf MB\n",
                    rank, memoryMaxHost/1024.0/1024.0);
            else
                printf("Host %d max:\t%.1lf GB\n",
                    rank, memoryMaxHost/1024.0/1024.0/1024.0);

            // Print final host memory level.
            if (labs(memoryLevelHost) < 1024)
                printf("Host %d end:\t%ld B\n",
                    rank, memoryLevelHost);
            else if (labs(memoryLevelHost) < 1024*1024)
                printf("Host %d end:\t%.1lf KB\n",
                    rank, memoryLevelHost/1024.0);
            else if (labs(memoryLevelHost) < 1024*1024*1024)
                printf("Host %d end:\t%.1lf MB\n",
                    rank, memoryLevelHost/1024.0/1024.0);
            else
                printf("Host %d end:\t%.1lf GB\n",
                    rank, memoryLevelHost/1024.0/1024.0/1024.0);

            int device;
            for (device = 0; device < num_devices; device++) {

                // Print maximum device memory level.
                if (labs(memoryMaxDevice[device]) < 1024)
                    printf(" Device %d max:\t%ld B\n",
                        device, memoryMaxDevice[device]);
                else if (labs(memoryMaxDevice[device]) < 1024*1024)
                    printf(" Device %d max:\t%.1lf KB\n",
                        device, memoryMaxDevice[device]/1024.0);
                else if (labs(memoryMaxDevice[device]) < 1024*1024*1024)
                    printf(" Device %d max:\t%.1lf MB\n",
                        device, memoryMaxDevice[device]/1024.0/1024.0);
                else
                    printf(" Device %d max:\t%.1lf GB\n",
                        device, memoryMaxDevice[device]/1024.0/1024.0/1024.0);

                // Print final device memory level.
                if (labs(memoryLevelDevice[device]) < 1024)
                    printf(" Device %d end:\t%ld B\n",
                        device, memoryLevelDevice[device]);
                else if (labs(memoryLevelDevice[device]) < 1024*1024)
                    printf(" Device %d end:\t%.1lf KB\n",
                        device, memoryLevelDevice[device]/1024.0);
                else if (labs(memoryLevelDevice[device]) < 1024*1024*1024)
                    printf(" Device %d end:\t%.1lf MB\n",
                        device, memoryLevelDevice[device]/1024.0/1024.0);
                else
                    printf(" Device %d end:\t%.1lf GB\n",
                        device, memoryLevelDevice[device]/1024.0/1024.0/1024.0);
            }
        }
        // Serialize the output: every rank synchronizes after each rank's turn to print.
        MPI_Barrier(MPI_COMM_WORLD);
    }
}