PAPI  5.4.0.0
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
linux-cuda.c
Go to the documentation of this file.
1 /****************************/
2 /* THIS IS OPEN SOURCE CODE */
3 /****************************/
4 
17 #include <dlfcn.h>
18 
19 #include "papi.h"
20 #include "papi_internal.h"
21 #include "papi_vector.h"
22 #include "papi_memory.h"
23 #include "linux-cuda.h"
24 
25 
26 /******** CHANGE PROTOTYPES TO DECLARE CUDA LIBRARY SYMBOLS AS WEAK **********
27  * This is done so that a version of PAPI built with the cuda component can *
28  * be installed on a system which does not have the cuda libraries installed. *
29  * *
30  * If this is done without these prototypes, then all papi services on the *
31  * system without the cuda libraries installed will fail. The PAPI libraries *
32  * contain references to the cuda libraries which are not installed. The *
33  * load of PAPI commands fails because the cuda library references can not be *
34  * resolved. *
35  * *
36  * This also defines pointers to the cuda library functions that we call. *
37  * These function pointers will be resolved with dlopen/dlsym calls at *
38  * component initialization time. The component then calls the cuda library *
39  * functions through these function pointers. *
40  *******************************************************************************/
/* CUDA driver API: the prototypes are re-declared with CUDAAPI redefined to
 * the weak attribute, so these references are weak symbols in this file. */
42 #undef CUDAAPI
43 #define CUDAAPI __attribute__((weak))
44 CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
45 CUresult CUDAAPI cuCtxDestroy(CUcontext);
46 CUresult CUDAAPI cuCtxGetCurrent(CUcontext *);
47 CUresult CUDAAPI cuDeviceGet(CUdevice *, int);
48 CUresult CUDAAPI cuDeviceGetCount(int *);
49 CUresult CUDAAPI cuDeviceGetName(char *, int, CUdevice);
50 CUresult CUDAAPI cuInit(unsigned int);
51 
/* Pointers through which the driver API is called in this file. They are
 * assigned with dlsym() on the libcuda.so handle; cuCtxCreatePtr and
 * cuCtxDestroyPtr are bound to the "_v2" symbol names. */
52 CUresult (*cuCtxCreatePtr)(CUcontext *pctx, unsigned int flags, CUdevice dev);
53 CUresult (*cuCtxDestroyPtr)(CUcontext);
54 CUresult (*cuCtxGetCurrentPtr)(CUcontext *);
55 CUresult (*cuDeviceGetPtr)(CUdevice *, int);
56 CUresult (*cuDeviceGetCountPtr)(int *);
57 CUresult (*cuDeviceGetNamePtr)(char *, int, CUdevice);
58 CUresult (*cuInitPtr)(unsigned int);
59 
/* CUDA runtime API: same scheme, CUDARTAPI is redefined to the weak attribute. */
60 #undef CUDARTAPI
61 #define CUDARTAPI __attribute__((weak))
62 cudaError_t CUDARTAPI cudaFree(void *);
63 cudaError_t CUDARTAPI cudaGetDevice(int *);
64 cudaError_t CUDARTAPI cudaRuntimeGetVersion( int *);
65 cudaError_t CUDARTAPI cudaDriverGetVersion( int *);
66 
/* Pointers through which the runtime API is called; assigned with dlsym()
 * on the libcudart.so handle. */
67 cudaError_t (*cudaFreePtr)(void *);
68 cudaError_t (*cudaGetDevicePtr)(int *);
69 cudaError_t (*cudaRuntimeGetVersionPtr)(int *);
70 cudaError_t (*cudaDriverGetVersionPtr)(int *);
71 
/* CuPTI event API: same scheme, CUPTIAPI is redefined to the weak attribute. */
72 #undef CUPTIAPI
73 #define CUPTIAPI __attribute__((weak))
74 CUptiResult CUPTIAPI cuptiDeviceEnumEventDomains(CUdevice, size_t *, CUpti_EventDomainID *);
75 CUptiResult CUPTIAPI cuptiDeviceGetEventDomainAttribute(CUdevice, CUpti_EventDomainID, CUpti_EventDomainAttribute, size_t *, void *);
76 CUptiResult CUPTIAPI cuptiDeviceGetNumEventDomains(CUdevice, uint32_t *);
77 CUptiResult CUPTIAPI cuptiEventDomainEnumEvents(CUpti_EventDomainID, size_t*, CUpti_EventID *);
78 CUptiResult CUPTIAPI cuptiEventDomainGetNumEvents(CUpti_EventDomainID, uint32_t *);
79 CUptiResult CUPTIAPI cuptiEventGetAttribute(CUpti_EventID, CUpti_EventAttribute, size_t *, void *);
80 CUptiResult CUPTIAPI cuptiEventGroupAddEvent(CUpti_EventGroup, CUpti_EventID);
81 CUptiResult CUPTIAPI cuptiEventGroupCreate(CUcontext, CUpti_EventGroup *, uint32_t);
82 CUptiResult CUPTIAPI cuptiEventGroupDestroy(CUpti_EventGroup);
83 CUptiResult CUPTIAPI cuptiEventGroupDisable(CUpti_EventGroup);
84 CUptiResult CUPTIAPI cuptiEventGroupEnable(CUpti_EventGroup);
85 CUptiResult CUPTIAPI cuptiEventGroupReadAllEvents(CUpti_EventGroup, CUpti_ReadEventFlags, size_t *, uint64_t *, size_t *, CUpti_EventID *, size_t *);
86 CUptiResult CUPTIAPI cuptiEventGroupRemoveAllEvents(CUpti_EventGroup);
87 CUptiResult CUPTIAPI cuptiEventGroupResetAllEvents(CUpti_EventGroup);
88 
/* Pointers through which the CuPTI API is called; assigned with dlsym()
 * on the libcupti.so handle. */
89 CUptiResult (*cuptiDeviceEnumEventDomainsPtr)(CUdevice, size_t *, CUpti_EventDomainID *);
90 CUptiResult (*cuptiDeviceGetEventDomainAttributePtr)(CUdevice, CUpti_EventDomainID, CUpti_EventDomainAttribute, size_t *, void *);
91 CUptiResult (*cuptiDeviceGetNumEventDomainsPtr)(CUdevice, uint32_t *);
92 CUptiResult (*cuptiEventDomainEnumEventsPtr)(CUpti_EventDomainID, size_t*, CUpti_EventID *);
93 CUptiResult (*cuptiEventDomainGetNumEventsPtr)(CUpti_EventDomainID, uint32_t *);
94 CUptiResult (*cuptiEventGetAttributePtr)(CUpti_EventID, CUpti_EventAttribute, size_t *, void *);
95 CUptiResult (*cuptiEventGroupAddEventPtr)(CUpti_EventGroup, CUpti_EventID);
96 CUptiResult (*cuptiEventGroupCreatePtr)(CUcontext, CUpti_EventGroup *, uint32_t);
97 CUptiResult (*cuptiEventGroupDestroyPtr)(CUpti_EventGroup);
98 CUptiResult (*cuptiEventGroupDisablePtr)(CUpti_EventGroup);
99 CUptiResult (*cuptiEventGroupEnablePtr)(CUpti_EventGroup);
100 CUptiResult (*cuptiEventGroupReadAllEventsPtr)(CUpti_EventGroup, CUpti_ReadEventFlags, size_t *, uint64_t *, size_t *, CUpti_EventID *, size_t *);
101 CUptiResult (*cuptiEventGroupRemoveAllEventsPtr)(CUpti_EventGroup);
102 CUptiResult (*cuptiEventGroupResetAllEventsPtr)(CUpti_EventGroup);
103 
104 // file handles used to access cuda libraries with dlopen
// dl1 holds the handle for libcuda.so, dl2 for libcudart.so and dl3 for
// libcupti.so; each stays NULL until the corresponding dlopen() succeeds.
105 static void* dl1 = NULL;
106 static void* dl2 = NULL;
107 static void* dl3 = NULL;
108 
// Opens the three libraries above and fills in the *Ptr function pointers;
// returns PAPI_OK, or PAPI_ENOSUPP after recording a disabled_reason.
109 static int linkCudaLibraries ();
110 
112 
113 
114 /******************************************************************************
115  ******** BEGIN FUNCTIONS USED INTERNALLY SPECIFIC TO THIS COMPONENT *********
116  *****************************************************************************/
117 /*
118  * Specify device(s): Counts number of cuda events available in this system
119  */
120 static int
121 detectDevice( void )
122 {
123  CUresult err;
124  int skipDevice = 0;
125  int id;
126  char deviceName_tmp[PAPI_MIN_STR_LEN] = "init";
127 
128  totalEventCount = 0;
129 
130 /* CUDA initialization */
131  err = (*cuInitPtr)( 0 );
132  if ( err != CUDA_SUCCESS ) {
133  SUBDBG ("Info: Error from cuInit(): %d\n", err);
134  return ( PAPI_ENOSUPP );
135  }
136 
137  /* How many gpgpu devices do we have? */
138  err = (*cuDeviceGetCountPtr)( &deviceCount );
139  CHECK_CU_ERROR( err, "cuDeviceGetCount" );
140  if ( deviceCount == 0 )
141  return ( PAPI_ENOSUPP );
142 
143  /* allocate memory for device data table */
144  device = ( DeviceData_t * ) malloc( sizeof ( DeviceData_t ) * deviceCount );
145  if ( device == NULL ) {
146  perror( "malloc(): Failed to allocate memory to CUDA device table" );
147  return ( PAPI_ENOSUPP );
148  }
149 
150  /* What are the devices? Get Name and # of domains per device */
151  for ( id = 0; id < deviceCount; id++ ) {
152  err = (*cuDeviceGetPtr)( &device[id].dev, id );
153  CHECK_CU_ERROR( err, "cuDeviceGet" );
154 
155  err = (*cuDeviceGetNamePtr)( device[id].name, PAPI_MIN_STR_LEN, device[id].dev );
156  CHECK_CU_ERROR( err, "cuDeviceGetName" );
157 
158  SUBDBG ("Cuda deviceName: %s\n", device[id].name);
159 
160  /* Skip device if there are multiple of the same type
161  and if it has been already added to the list */
162  if ( 0 == strcmp( deviceName_tmp, device[id].name ) ) {
163  skipDevice++;
164  continue;
165  }
166 
167  strcpy( deviceName_tmp, device[id].name );
168 
169  /* enumerate the domains on the device */
170  if ( 0 != enumEventDomains( device[id].dev, id ) )
171  return ( PAPI_ENOSUPP );
172  }
173 
174  deviceCount = deviceCount - skipDevice;
175 
176  /* return number of events provided via CuPTI */
177  return totalEventCount;
178 }
179 
180 
181 /*
182  * Detect supported domains for specified device
183  */
184 static int
185 enumEventDomains( CUdevice dev, int deviceId )
186 {
187  CUptiResult err = CUPTI_SUCCESS;
188  CUpti_EventDomainID *domainId = NULL;
189  uint32_t id = 0;
190  size_t size = 0;
191 
192  device[deviceId].domainCount = 0;
193 
194  /* get number of domains for device dev */
195  err = (*cuptiDeviceGetNumEventDomainsPtr)( dev, &device[deviceId].domainCount );
196  CHECK_CUPTI_ERROR( err, "cuptiDeviceGetNumEventDomains" );
197 
198  if ( device[deviceId].domainCount == 0 ) {
199  printf( "No domain is exposed by dev = %d\n", dev );
200  return -1;
201  }
202 
203  /* CuPTI domain struct */
204  size = sizeof ( CUpti_EventDomainID ) * device[deviceId].domainCount;
205  domainId = ( CUpti_EventDomainID * ) malloc( size );
206  if ( domainId == NULL ) {
207  perror( "malloc(): Failed to allocate memory to CuPTI domain ID" );
208  return -1;
209  }
210  memset( domainId, 0, size );
211 
212  /* PAPI domain struct */
213  device[deviceId].domain =
214  ( DomainData_t * ) malloc( sizeof ( DomainData_t ) *
215  device[deviceId].domainCount );
216  if ( device[deviceId].domain == NULL ) {
217  perror( "malloc(): Failed to allocate memory to PAPI domain struct" );
218  free(domainId);
219  return -1;
220  }
221 
222  /* Enumerates the event domains for a device dev */
223  err = (*cuptiDeviceEnumEventDomainsPtr)( dev, &size, domainId );
224  CHECK_CUPTI_ERROR( err, "cuptiDeviceEnumEventDomains" );
225 
226  /* enum domains */
227  for ( id = 0; id < device[deviceId].domainCount; id++ ) {
228  device[deviceId].domain[id].domainId = domainId[id];
229 
230  /* query domain name */
231  size = PAPI_MIN_STR_LEN;
232 #ifdef CUDA_4_0
233  err = cuptiEventDomainGetAttribute( dev,
234  device[deviceId].domain[id].
235  domainId,
236  CUPTI_EVENT_DOMAIN_ATTR_NAME, &size,
237  ( void * ) device[deviceId].
238  domain[id].name );
239  CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetAttribute" );
240 
241  /* query num of events avaialble in the domain */
242  size = sizeof ( device[deviceId].domain[id].eventCount );
243  err = cuptiEventDomainGetAttribute( dev,
244  device[deviceId].domain[id].
245  domainId,
246  CUPTI_EVENT_DOMAIN_MAX_EVENTS,
247  &size,
248  ( void * ) &device[deviceId].
249  domain[id].eventCount );
250  CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetAttribute" );
251 
252  /* enumerate the events for the domain[id] on the device dev */
253  if ( 0 != enumEvents( dev, deviceId, id ) )
254  return -1;
255 #else
256  err = (*cuptiDeviceGetEventDomainAttributePtr)( dev,
257  device[deviceId].domain[id].domainId,
258  CUPTI_EVENT_DOMAIN_ATTR_NAME, &size,
259  ( void * ) device[deviceId].domain[id].name );
260  CHECK_CUPTI_ERROR( err, "cuptiDeviceGetEventDomainAttribute" );
261 
262  /* query num of events avaialble in the domain */
263  err = (*cuptiEventDomainGetNumEventsPtr)( device[deviceId].domain[id].domainId,
264  &device[deviceId].domain[id].eventCount );
265  CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetNumEvents" );
266 
267  /* enumerate the events for the domain[id] on the device deviceId */
268  if ( 0 != enumEvents( deviceId, id ) )
269  return -1;
270 #endif
271  }
272 
273  totalDomainCount += device[deviceId].domainCount;
274  free( domainId );
275  return 0;
276 }
277 
278 
279 /*
280  * Detect supported events for specified device domain
281  */
282 #ifdef CUDA_4_0
283 static int
284 enumEvents( CUdevice dev, int deviceId, int domainId )
285 #else
286 static int
287 enumEvents( int deviceId, int domainId )
288 #endif
289 {
290  CUptiResult err = CUPTI_SUCCESS;
291  CUpti_EventID *eventId = NULL;
292  size_t size = 0;
293  uint32_t id = 0;
294 
295  /* CuPTI event struct */
296  size =
297  sizeof ( CUpti_EventID ) * device[deviceId].domain[domainId].eventCount;
298  eventId = ( CUpti_EventID * ) malloc( size );
299  if ( eventId == NULL ) {
300  perror( "malloc(): Failed to allocate memory to CuPTI event ID" );
301  return -1;
302  }
303  memset( eventId, 0, size );
304 
305  /* PAPI event struct */
306  device[deviceId].domain[domainId].event =
307  ( EventData_t * ) malloc( sizeof ( EventData_t ) *
308  device[deviceId].domain[domainId].
309  eventCount );
310  if ( device[deviceId].domain[domainId].event == NULL ) {
311  perror( "malloc(): Failed to allocate memory to PAPI event struct" );
312  free(eventId);
313  return -1;
314  }
315 
316  /* enumerate the events for the domain[domainId] on the device[deviceId] */
317 #ifdef CUDA_4_0
318  err =
319  (*cuptiEventDomainEnumEventsPtr)( dev,
320  ( CUpti_EventDomainID ) device[deviceId].
321  domain[domainId].domainId, &size, eventId );
322 #else
323  err =
324  (*cuptiEventDomainEnumEventsPtr)( ( CUpti_EventDomainID ) device[deviceId].
325  domain[domainId].domainId, &size, eventId );
326 #endif
327  CHECK_CUPTI_ERROR( err, "cuptiEventDomainEnumEvents" );
328 
329  /* query event info */
330  for ( id = 0; id < device[deviceId].domain[domainId].eventCount; id++ ) {
331  device[deviceId].domain[domainId].event[id].eventId = eventId[id];
332 
333  /* query event name */
334  size = PAPI_MIN_STR_LEN;
335 #ifdef CUDA_4_0
336  err = (*cuptiEventGetAttributePtr)( dev,
337  device[deviceId].domain[domainId].
338  event[id].eventId, CUPTI_EVENT_ATTR_NAME,
339  &size,
340  ( uint8_t * ) device[deviceId].
341  domain[domainId].event[id].name );
342 #else
343  err = (*cuptiEventGetAttributePtr)( device[deviceId].domain[domainId].
344  event[id].eventId, CUPTI_EVENT_ATTR_NAME,
345  &size,
346  ( uint8_t * ) device[deviceId].
347  domain[domainId].event[id].name );
348 #endif
349  CHECK_CUPTI_ERROR( err, "cuptiEventGetAttribute" );
350 
351  /* query event description */
352  size = PAPI_2MAX_STR_LEN;
353 #ifdef CUDA_4_0
354  err = (*cuptiEventGetAttributePtr)( dev,
355  device[deviceId].domain[domainId].
356  event[id].eventId,
357  CUPTI_EVENT_ATTR_SHORT_DESCRIPTION, &size,
358  ( uint8_t * ) device[deviceId].
359  domain[domainId].event[id].desc );
360 #else
361  err = (*cuptiEventGetAttributePtr)( device[deviceId].domain[domainId].
362  event[id].eventId,
363  CUPTI_EVENT_ATTR_SHORT_DESCRIPTION, &size,
364  ( uint8_t * ) device[deviceId].
365  domain[domainId].event[id].desc );
366 #endif
367  CHECK_CUPTI_ERROR( err, "cuptiEventGetAttribute" );
368  }
369 
370  totalEventCount += device[deviceId].domain[domainId].eventCount;
371  free( eventId );
372  return 0;
373 }
374 
375 
376 /*
377  * Create the native events for specified domain and device
 *
 * Walks every device, every domain of the device and every event of the
 * domain, and fills one cuda_native_table entry per event: the entry name is
 * "<device>:<domain>:<event>" (blanks in the device name are first replaced
 * by underscores, in place) and the description is the event's short
 * description.  Returns the number of table entries written.
 *
 * NOTE(review): in this listing the line carrying the function name and
 * parameter list, and a few statement fragments inside the innermost loop,
 * are absent (the embedded line numbers skip 380, 422, 425 and 428) -
 * restore them from the upstream file before building.
378  */
379 static int
381 {
382  int deviceId, id = 0;
383  uint32_t domainId, eventId;
384  int cuptiDomainId;
385  int i;
386  int devNameLen;
387 
388  /* create events for every GPU device and every domain per device */
389  for ( deviceId = 0; deviceId < deviceCount; deviceId++ ) {
390  /* for the event names, replace blanks in the device name with underscores */
391  devNameLen = strlen( device[deviceId].name );
392  for ( i = 0; i < devNameLen; i++ )
393  if ( device[deviceId].name[i] == ' ' )
394  device[deviceId].name[i] = '_';
395 
396  for ( domainId = 0; domainId < device[deviceId].domainCount;
397  domainId++ ) {
 /* cuptiDomainId is assigned here but not read again in the visible code */
398  cuptiDomainId = device[deviceId].domain[domainId].domainId;
399 
400  for ( eventId = 0;
401  eventId < device[deviceId].domain[domainId].eventCount;
402  eventId++ ) {
 /* the "+ 4" below presumably accounts for the two ':' separators, the
    terminating NUL and one spare byte - TODO confirm */
403  unsigned int evtNameLen = strlen(device[deviceId].name) + strlen(device[deviceId].domain[domainId].name) +
404  strlen(device[deviceId].domain[domainId].event[eventId].name);
405  if (evtNameLen + 4 > sizeof(cuda_native_table[id].name)) {
406  SUBDBG("Event name too long to fit in cuda_native_table.name, event omitted: available space: %lu, space needed: %d\n",
407  sizeof(cuda_native_table[id].name), evtNameLen+4);
408  SUBDBG("device: %s, domain: %s, event: %s\n", device[deviceId].name, device[deviceId].domain[domainId].name,
409  device[deviceId].domain[domainId].event[eventId].name);
 /* NOTE(review): an omitted event makes the returned count smaller than
    the number of detected events; the component init code compares the
    two and disables the component when they differ. */
410  continue;
411  }
412  /* Save native event data */
413  sprintf( cuda_native_table[id].name,
414  "%s:%s:%s",
415  device[deviceId].name,
416  device[deviceId].domain[domainId].name,
417  device[deviceId].domain[domainId].event[eventId].name );
418 
 /* copies at most PAPI_2MAX_STR_LEN-1 bytes; strncpy() does not add a
    terminator when the source is that long or longer */
419  strncpy( cuda_native_table[id].description,
420  device[deviceId].domain[domainId].event[eventId].desc,
421  PAPI_2MAX_STR_LEN-1 );
423 
424  /* The selector has to be !=0 . Starts with 1 */
426 
427  /* store event ID */
429  device[deviceId].domain[domainId].event[eventId].eventId;
430 
431  /* increment the table index counter */
432  id++;
433  }
434  }
435  }
436 
437  /* Return the number of events created */
438  return id;
439 }
440 
441 
442 /*
443  * Returns all event values from the CuPTI eventGroup
444  */
445 static int
446 getEventValue( long long *counts, CUpti_EventGroup eventGroup, AddedEvents_t addedEvents )
447 {
448  CUptiResult cuptiErr = CUPTI_SUCCESS;
449  size_t events_read, bufferSizeBytes, arraySizeBytes, i;
450  uint64_t *counterDataBuffer;
451  CUpti_EventID *eventIDArray;
452  int j;
453 
454  bufferSizeBytes = addedEvents.count * sizeof ( uint64_t );
455  counterDataBuffer = ( uint64_t * ) malloc( bufferSizeBytes );
456 
457  arraySizeBytes = addedEvents.count * sizeof ( CUpti_EventID );
458  eventIDArray = ( CUpti_EventID * ) malloc( arraySizeBytes );
459 
460  /* read counter data for the specified event from the CuPTI eventGroup */
461  cuptiErr = (*cuptiEventGroupReadAllEventsPtr)( eventGroup,
462  CUPTI_EVENT_READ_FLAG_NONE,
463  &bufferSizeBytes,
464  counterDataBuffer, &arraySizeBytes,
465  eventIDArray, &events_read );
466  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupReadAllEvents" );
467 
468  if ( events_read != ( size_t ) addedEvents.count )
469  return -1;
470 
471  /* Since there is no guarantee that returned counter values are in the same
472  order as the counters in the PAPI addedEvents.list, we need to map the
473  CUpti_EventID to PAPI event ID values.
474  According to CuPTI doc: counter return values of counterDataBuffer
475  correspond to the return event IDs in eventIDArray */
476  for ( i = 0; i < events_read; i++ )
477  for ( j = 0; j < addedEvents.count; j++ )
478  if ( cuda_native_table[addedEvents.list[j]].resources.eventId ==
479  eventIDArray[i] )
480  // since cuptiEventGroupReadAllEvents() resets counter values to 0;
481  // we have to accumulate ourselves
482  counts[addedEvents.list[j]] = counts[addedEvents.list[j]] + counterDataBuffer[i];
483 
484  free( counterDataBuffer );
485  free( eventIDArray );
486  return 0;
487 }
488 
489 
490 /*****************************************************************************
491  ******************* BEGIN PAPI's COMPONENT REQUIRED FUNCTIONS *************
492  *****************************************************************************/
493 
494 /*
495  * This is called whenever a thread is initialized
 *
 * The context argument is unused and the function always returns PAPI_OK.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (embedded line 498) - restore it from the
 * upstream file.
496  */
497 int
499 {
500  ( void ) ctx;
501 
502  return PAPI_OK;
503 }
504 
505 
505 /* Initialize hardware counters, setup the function vector table
506  * and get hardware information, this routine is called when the
507  * PAPI process is initialized (IE PAPI_library_init)
508  *
509  * NOTE: only called by main thread (not by every thread) !!!
510  *
511  * Starting in CUDA 4.0, multiple CPU threads can access the same CUDA context.
512  * This is a much easier programming model than pre-4.0 as threads - using the
513  * same context - can share memory, data, etc.
514  * It's possible to create a different context for each thread, but then we are
515  * likely running into a limitation that only one context can be profiled at a time.
516  * ==> and we don't want this. That's why CUDA context creation is done in
517  * CUDA_init_component() (called only by main thread) rather than CUDA_init()
518  * or CUDA_init_control_state() (both called by each thread).
 *
 * Steps performed: link the CUDA libraries, detect devices and count their
 * events, obtain (or create) a CUDA context, allocate the native event table
 * and fill it.  Returns PAPI_OK on success; every failure path visible here
 * returns PAPI_ENOSUPP, most of them after writing a disabled_reason.
 *
 * NOTE(review): the line carrying the function name and parameter list, and
 * the left-hand side of the event-table allocation, are absent from this
 * listing (embedded lines 522 and 577) - restore them from the upstream file.
519  */
520 int
522 {
523  SUBDBG ("Entry: cidx: %d\n", cidx);
524  CUresult cuErr = CUDA_SUCCESS;
525 
526  /* link in all the cuda libraries and resolve the symbols we need to use */
527  if (linkCudaLibraries() != PAPI_OK) {
528  SUBDBG ("Dynamic link of CUDA libraries failed, component will be disabled.\n");
529  SUBDBG ("See disable reason in papi_component_avail output for more details.\n");
530  return (PAPI_ENOSUPP);
531  }
532 
533  /* Create dynamic event table */
 /* detectDevice() returns the total event count, or PAPI_ENOSUPP on failure */
534  NUM_EVENTS = detectDevice( );
535  if (NUM_EVENTS < 0) {
536  strncpy(_cuda_vector.cmp_info.disabled_reason, "Call to detectDevice failed.",PAPI_MAX_STR_LEN);
537  return (PAPI_ENOSUPP);
538  }
539  /* TODO: works only for one device right now;
540  need to find out if user can use 2 or more devices at same time */
541 
542  /* want create a CUDA context for either the default device or
543  the device specified with cudaSetDevice() in user code */
544  if ( CUDA_SUCCESS != (*cudaGetDevicePtr)( &currentDeviceID ) ) {
545  strncpy(_cuda_vector.cmp_info.disabled_reason, "No NVIDIA GPU's found.",PAPI_MAX_STR_LEN);
546  return ( PAPI_ENOSUPP );
547  }
548 
 /* NOTE(review): currentDeviceID is used as an index into the device table
    here and below without a range check against deviceCount - confirm the
    two always agree. */
549  if ( getenv( "PAPI_VERBOSE" ) ) {
550  printf( "DEVICE USED: %s (%d)\n", device[currentDeviceID].name,
551  currentDeviceID );
552  }
553 
554  /* get the CUDA context from the calling CPU thread */
555  cuErr = (*cuCtxGetCurrentPtr)( &cuCtx );
556 
557  /* if no CUDA context is bound to the calling CPU thread yet, create one */
558  if ( cuErr != CUDA_SUCCESS || cuCtx == NULL ) {
559  cuErr = (*cuCtxCreatePtr)( &cuCtx, 0, device[currentDeviceID].dev );
560  CHECK_CU_ERROR( cuErr, "cuCtxCreate" );
561  }
562 
563  /* cuCtxGetCurrent() can return a non-null context that is not valid
564  because the context has not yet been initialized.
565  Here is a workaround:
566  cudaFree(NULL) forces the context to be initialized
567  if cudaFree(NULL) returns success then we are able to use the context in subsequent calls
568  if cudaFree(NULL) returns an error (or subsequent cupti* calls) then the context is not usable,
569  and will never be useable */
570  if ( CUDA_SUCCESS != (*cudaFreePtr)( NULL ) ) {
571  strncpy(_cuda_vector.cmp_info.disabled_reason, "Problem initializing CUDA context.",PAPI_MAX_STR_LEN);
572  return ( PAPI_ENOSUPP );
573  }
574 
575  /* Create dynamic event table */
 /* one CUDA_native_event_entry_t per detected event */
576  malloc( sizeof ( CUDA_native_event_entry_t ) * NUM_EVENTS );
577  if ( cuda_native_table == NULL ) {
578  perror( "malloc(): Failed to allocate memory to events table" );
579  strncpy(_cuda_vector.cmp_info.disabled_reason, "Failed to allocate memory to events table.",PAPI_MAX_STR_LEN);
580  return ( PAPI_ENOSUPP );
581  }
582 
 /* createNativeEvents() returns how many table entries it wrote; anything
    other than NUM_EVENTS is treated as an error */
583  if ( NUM_EVENTS != createNativeEvents( ) ) {
584  strncpy(_cuda_vector.cmp_info.disabled_reason, "Error creating CUDA event list.",PAPI_MAX_STR_LEN);
585  return ( PAPI_ENOSUPP );
586  }
587 
588  /* Export the component id */
589  _cuda_vector.cmp_info.CmpIdx = cidx;
590 
591  /* Number of events */
592  _cuda_vector.cmp_info.num_native_events = NUM_EVENTS;
593 
594  return ( PAPI_OK );
595 }
598 
599 
600 /*
601  * Link the necessary CUDA libraries to use the cuda component. If any of them can not be found, then
602  * the CUDA component will just be disabled. This is done at runtime so that a version of PAPI built
603  * with the CUDA component can be installed and used on systems which have the CUDA libraries installed
604  * and on systems where these libraries are not installed.
605  */
606 static int
608 {
609  /* Attempt to guess if we were statically linked to libc, if so bail */
610  if ( _dl_non_dynamic_init != NULL ) {
611  strncpy(_cuda_vector.cmp_info.disabled_reason, "The cuda component does not support statically linking to libc.",PAPI_MAX_STR_LEN);
612  return PAPI_ENOSUPP;
613  }
614  /* Need to link in the cuda libraries, if not found disable the component */
615  dl1 = dlopen("libcuda.so", RTLD_NOW | RTLD_GLOBAL);
616  if (!dl1)
617  {
618  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA library libcuda.so not found.",PAPI_MAX_STR_LEN);
619  return ( PAPI_ENOSUPP );
620  }
621  cuCtxCreatePtr = dlsym(dl1, "cuCtxCreate_v2");
622  if (dlerror() != NULL)
623  {
624  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuCtxCreate not found.",PAPI_MAX_STR_LEN);
625  return ( PAPI_ENOSUPP );
626  }
627  cuCtxDestroyPtr = dlsym(dl1, "cuCtxDestroy_v2");
628  if (dlerror() != NULL)
629  {
630  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuCtxDestroy not found.",PAPI_MAX_STR_LEN);
631  return ( PAPI_ENOSUPP );
632  }
633  cuCtxGetCurrentPtr = dlsym(dl1, "cuCtxGetCurrent");
634  if (dlerror() != NULL)
635  {
636  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuCtxGetCurrent not found.",PAPI_MAX_STR_LEN);
637  return ( PAPI_ENOSUPP );
638  }
639  cuDeviceGetPtr = dlsym(dl1, "cuDeviceGet");
640  if (dlerror() != NULL)
641  {
642  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuDeviceGet not found.",PAPI_MAX_STR_LEN);
643  return ( PAPI_ENOSUPP );
644  }
645  cuDeviceGetCountPtr = dlsym(dl1, "cuDeviceGetCount");
646  if (dlerror() != NULL)
647  {
648  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuDeviceGetCount not found.",PAPI_MAX_STR_LEN);
649  return ( PAPI_ENOSUPP );
650  }
651  cuDeviceGetNamePtr = dlsym(dl1, "cuDeviceGetName");
652  if (dlerror() != NULL)
653  {
654  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuDeviceGetName not found.",PAPI_MAX_STR_LEN);
655  return ( PAPI_ENOSUPP );
656  }
657  cuInitPtr = dlsym(dl1, "cuInit");
658  if (dlerror() != NULL)
659  {
660  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuInit not found.",PAPI_MAX_STR_LEN);
661  return ( PAPI_ENOSUPP );
662  }
663 
664  dl2 = dlopen("libcudart.so", RTLD_NOW | RTLD_GLOBAL);
665  if (!dl2)
666  {
667  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA runtime library libcudart.so not found.",PAPI_MAX_STR_LEN);
668  return ( PAPI_ENOSUPP );
669  }
670  cudaFreePtr = dlsym(dl2, "cudaFree");
671  if (dlerror() != NULL)
672  {
673  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDART function cudaFree not found.",PAPI_MAX_STR_LEN);
674  return ( PAPI_ENOSUPP );
675  }
676  cudaGetDevicePtr = dlsym(dl2, "cudaGetDevice");
677  if (dlerror() != NULL)
678  {
679  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDART function cudaGetDevice not found.",PAPI_MAX_STR_LEN);
680  return ( PAPI_ENOSUPP );
681  }
682  cudaRuntimeGetVersionPtr = dlsym(dl2, "cudaRuntimeGetVersion");
683  if (dlerror() != NULL)
684  {
685  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDART function cudaRuntimeGetVersion not found.",PAPI_MAX_STR_LEN);
686  return ( PAPI_ENOSUPP );
687  }
688  cudaDriverGetVersionPtr = dlsym(dl2, "cudaDriverGetVersion");
689  if (dlerror() != NULL)
690  {
691  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDART function cudaDriverGetVersion not found.",PAPI_MAX_STR_LEN);
692  return ( PAPI_ENOSUPP );
693  }
694 
695  dl3 = dlopen("libcupti.so", RTLD_NOW | RTLD_GLOBAL);
696  if (!dl3)
697  {
698  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA runtime library libcupti.so not found.",PAPI_MAX_STR_LEN);
699  return ( PAPI_ENOSUPP );
700  }
701  cuptiDeviceEnumEventDomainsPtr = dlsym(dl3, "cuptiDeviceEnumEventDomains");
702  if (dlerror() != NULL)
703  {
704  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiDeviceEnumEventDomains not found.",PAPI_MAX_STR_LEN);
705  return ( PAPI_ENOSUPP );
706  }
707  cuptiDeviceGetEventDomainAttributePtr = dlsym(dl3, "cuptiDeviceGetEventDomainAttribute");
708  if (dlerror() != NULL)
709  {
710  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiDeviceGetEventDomainAttribute not found.",PAPI_MAX_STR_LEN);
711  return ( PAPI_ENOSUPP );
712  }
713  cuptiDeviceGetNumEventDomainsPtr = dlsym(dl3, "cuptiDeviceGetNumEventDomains");
714  if (dlerror() != NULL)
715  {
716  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiDeviceGetNumEventDomains not found.",PAPI_MAX_STR_LEN);
717  return ( PAPI_ENOSUPP );
718  }
719  cuptiEventDomainEnumEventsPtr = dlsym(dl3, "cuptiEventDomainEnumEvents");
720  if (dlerror() != NULL)
721  {
722  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventDomainEnumEvents not found.",PAPI_MAX_STR_LEN);
723  return ( PAPI_ENOSUPP );
724  }
725  cuptiEventDomainGetNumEventsPtr = dlsym(dl3, "cuptiEventDomainGetNumEvents");
726  if (dlerror() != NULL)
727  {
728  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventDomainGetNumEvents not found.",PAPI_MAX_STR_LEN);
729  return ( PAPI_ENOSUPP );
730  }
731  cuptiEventGetAttributePtr = dlsym(dl3, "cuptiEventGetAttribute");
732  if (dlerror() != NULL)
733  {
734  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGetAttribute not found.",PAPI_MAX_STR_LEN);
735  return ( PAPI_ENOSUPP );
736  }
737  cuptiEventGroupAddEventPtr = dlsym(dl3, "cuptiEventGroupAddEvent");
738  if (dlerror() != NULL)
739  {
740  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupAddEvent not found.",PAPI_MAX_STR_LEN);
741  return ( PAPI_ENOSUPP );
742  }
743  cuptiEventGroupCreatePtr = dlsym(dl3, "cuptiEventGroupCreate");
744  if (dlerror() != NULL)
745  {
746  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupCreate not found.",PAPI_MAX_STR_LEN);
747  return ( PAPI_ENOSUPP );
748  }
749  cuptiEventGroupDestroyPtr = dlsym(dl3, "cuptiEventGroupDestroy");
750  if (dlerror() != NULL)
751  {
752  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupDestroy not found.",PAPI_MAX_STR_LEN);
753  return ( PAPI_ENOSUPP );
754  }
755  cuptiEventGroupDisablePtr = dlsym(dl3, "cuptiEventGroupDisable");
756  if (dlerror() != NULL)
757  {
758  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupDisable not found.",PAPI_MAX_STR_LEN);
759  return ( PAPI_ENOSUPP );
760  }
761  cuptiEventGroupEnablePtr = dlsym(dl3, "cuptiEventGroupEnable");
762  if (dlerror() != NULL)
763  {
764  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupEnable not found.",PAPI_MAX_STR_LEN);
765  return ( PAPI_ENOSUPP );
766  }
767  cuptiEventGroupReadAllEventsPtr = dlsym(dl3, "cuptiEventGroupReadAllEvents");
768  if (dlerror() != NULL)
769  {
770  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupReadAllEvents not found.",PAPI_MAX_STR_LEN);
771  return ( PAPI_ENOSUPP );
772  }
773  cuptiEventGroupRemoveAllEventsPtr = dlsym(dl3, "cuptiEventGroupRemoveAllEvents");
774  if (dlerror() != NULL)
775  {
776  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupRemoveAllEvents not found.",PAPI_MAX_STR_LEN);
777  return ( PAPI_ENOSUPP );
778  }
779  cuptiEventGroupResetAllEventsPtr = dlsym(dl3, "cuptiEventGroupResetAllEvents");
780  if (dlerror() != NULL)
781  {
782  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupResetAllEvents not found.",PAPI_MAX_STR_LEN);
783  return ( PAPI_ENOSUPP );
784  }
785 
786  return ( PAPI_OK );
787 }
788 
789 
790 /*
791  * Control of counters (Reading/Writing/Starting/Stopping/Setup)
792  * functions
 *
 * This routine prepares one control state: it allocates and zeroes the list
 * of added events (NUM_EVENTS ints) and creates a CuPTI event group on the
 * shared context cuCtx.  Returns PAPI_OK, or PAPI_ENOSUPP when the list
 * cannot be allocated; CuPTI failures are handled by CHECK_CUPTI_ERROR.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (embedded line 795) - restore it from the
 * upstream file.
793  */
794 int
796 {
797  CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
798  CUptiResult cuptiErr = CUPTI_SUCCESS;
799  int i;
800 
801  /* allocate memory for the list of events that are added to the CuPTI eventGroup */
802  CUDA_ctrl->addedEvents.list = malloc( sizeof ( int ) * NUM_EVENTS );
803  if ( CUDA_ctrl->addedEvents.list == NULL ) {
804  perror
805  ( "malloc(): Failed to allocate memory to table of events that are added to CuPTI eventGroup" );
806  return ( PAPI_ENOSUPP );
807  }
808 
809  /* initialize the event list */
810  for ( i = 0; i < NUM_EVENTS; i++ )
811  CUDA_ctrl->addedEvents.list[i] = 0;
812 
813 
814 
 /* NOTE(review): if CHECK_CUPTI_ERROR returns on failure, the list allocated
    above is not released on that path - confirm against the macro. */
815  cuptiErr = (*cuptiEventGroupCreatePtr)( cuCtx, &CUDA_ctrl->eventGroup, 0 );
816  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupCreate" );
817 
818  return PAPI_OK;
819 }
820 
821 
822 /*
823  *
824  */
825 int
827 {
828  ( void ) ctx;
829  int i;
830  CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
831  CUptiResult cuptiErr = CUPTI_SUCCESS;
832 
833  // reset all event values to 0
834  for ( i = 0; i < NUM_EVENTS; i++ )
835  CUDA_ctrl->counts[i] = 0;
836 
837  cuptiErr = (*cuptiEventGroupEnablePtr)( CUDA_ctrl->eventGroup );
838  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupEnable" );
839 
840  /* Resets all events in the CuPTI eventGroup to zero */
841  cuptiErr = (*cuptiEventGroupResetAllEventsPtr)( CUDA_ctrl->eventGroup );
842  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupResetAllEvents" );
843 
844  return ( PAPI_OK );
845 }
846 
847 
848 /*
849  *
850  */
851 int
853 {
854  ( void ) ctx;
855  ( void ) ctrl;
856 
857  return ( PAPI_OK );
858 }
859 
860 
861 /*
862  *
863  */
864 int
866  long_long ** events, int flags )
867 {
868  ( void ) ctx;
869  ( void ) flags;
870  CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
871 
872 
873  if ( 0 != getEventValue( CUDA_ctrl->counts, CUDA_ctrl->eventGroup, CUDA_ctrl->addedEvents ) )
874  return ( PAPI_ENOSUPP );
875 
876  *events = CUDA_ctrl->counts;
877 
878  return ( PAPI_OK );
879 }
880 
881 /*
882  *
883  */
884 int
886 {
887  CUDA_context_t *CUDA_ctx = (CUDA_context_t*)ctx;
888  free( CUDA_ctx->state.addedEvents.list );
889  return (PAPI_OK);
890 }
891 
892 /*
893  *
894  */
895 int
897 {
898  CUresult cuErr = CUDA_SUCCESS;
899 
900  /* if running a threaded application, we need to make sure that
901  a thread doesn't free the same memory location(s) more than once */
902  if ( CUDA_FREED == 0 ) {
903  uint32_t j;
904  int i;
905 
906  CUDA_FREED = 1;
907 
908  /* deallocate all the memory */
909  for ( i = 0; i < deviceCount; i++ ) {
910  for ( j = 0; j < device[i].domainCount; j++ )
911  free( device[i].domain[j].event );
912 
913  free( device[i].domain );
914  }
915 
916  free( device );
918 
919  /* destroy floating CUDA context */
920  cuErr = (*cuCtxDestroyPtr)( cuCtx );
921  if ( cuErr != CUDA_SUCCESS )
922  return ( PAPI_ENOSUPP ); // Not supported
923  }
924 
925  // close the dynamic libraries needed by this component (opened in the init substrate call)
926  dlclose(dl1);
927  dlclose(dl2);
928  dlclose(dl3);
929 
930  return ( PAPI_OK );
931 }
932 
933 
934 /* This function sets various options in the component
935  * The valid codes being passed in are PAPI_SET_DEFDOM,
936  * PAPI_SET_DOMAIN, PAPI_SETDEFGRN, PAPI_SET_GRANUL * and PAPI_SET_INHERIT
937  */
938 int
939 CUDA_ctl( hwd_context_t * ctx, int code, _papi_int_option_t * option )
940 {
941  ( void ) ctx;
942  ( void ) code;
943  ( void ) option;
944  return ( PAPI_OK );
945 }
946 
947 
948 //int CUDA_ntv_code_to_bits ( unsigned int EventCode, hwd_register_t * bits );
949 
950 
951 /*
952  *
953  */
954 int
956  NativeInfo_t * native, int count,
957  hwd_context_t * ctx )
958 {
959  ( void ) ctx;
960  CUDA_control_state_t * CUDA_ptr = ( CUDA_control_state_t * ) ptr;
961  int index, i;
962  CUptiResult cuptiErr = CUPTI_SUCCESS;
963 
964  /* Disable the CUDA eventGroup;
965  it also frees the perfmon hardware on the GPU */
966  cuptiErr = (*cuptiEventGroupDisablePtr)( CUDA_ptr->eventGroup );
967  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDisable" );
968 
969  cuptiErr = (*cuptiEventGroupRemoveAllEventsPtr)( CUDA_ptr->eventGroup );
970  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupRemoveAllEvents" );
971 
972  // otherwise, add the events to the eventset
973  for ( i = 0; i < count; i++ ) {
974 
975  index = native[i].ni_event;
976  native[i].ni_position = index;
977 
978  /* store events, that have been added to the CuPTI eveentGroup
979  in a seperate place (addedEvents).
980  Needed, so that we can read the values for the added events only */
981  CUDA_ptr->addedEvents.count = count;
982  CUDA_ptr->addedEvents.list[i] = index;
983 
984  /* if this device name is different from the actual device the code is running on, then exit */
985  if ( 0 != strncmp( device[currentDeviceID].name,
986  cuda_native_table[index].name,
987  strlen( device[currentDeviceID].name ) ) ) {
988  fprintf( stderr, "Device %s is used -- BUT event %s is collected. \n ---> ERROR: Specify events for the device that is used!\n\n",
989  device[currentDeviceID].name, cuda_native_table[index].name );
990 
991  return ( PAPI_ENOSUPP ); // Not supported
992  }
993 
994  /* Add events to the CuPTI eventGroup */
995  cuptiErr =
996  (*cuptiEventGroupAddEventPtr)( CUDA_ptr->eventGroup,
998  eventId );
999  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupAddEvent" );
1000  }
1001 
1002  return ( PAPI_OK );
1003 }
1004 
1005 
1006 /*
1007  * This function has to set the bits needed to count different domains
1008  * In particular: PAPI_DOM_USER, PAPI_DOM_KERNEL PAPI_DOM_OTHER
1009  * By default return PAPI_EINVAL if none of those are specified
1010  * and PAPI_OK with success
1011  * PAPI_DOM_USER is only user context is counted
1012  * PAPI_DOM_KERNEL is only the Kernel/OS context is counted
1013  * PAPI_DOM_OTHER is Exception/transient mode (like user TLB misses)
1014  * PAPI_DOM_ALL is all of the domains
1015  */
1016 int
1017 CUDA_set_domain( hwd_control_state_t * cntrl, int domain )
1018 {
1019  int found = 0;
1020  ( void ) cntrl;
1021 
1022  if ( PAPI_DOM_USER & domain )
1023  found = 1;
1024 
1025  if ( PAPI_DOM_KERNEL & domain )
1026  found = 1;
1027 
1028  if ( PAPI_DOM_OTHER & domain )
1029  found = 1;
1030 
1031  if ( !found )
1032  return ( PAPI_EINVAL );
1033 
1034  return ( PAPI_OK );
1035 }
1036 
1037 
1038 /*
1039  *
1040  */
1041 int
1043 {
1044  ( void ) ctx;
1045  CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
1046  CUptiResult cuptiErr = CUPTI_SUCCESS;
1047 
1048  /* Resets all events in the CuPTI eventGroup to zero */
1049  cuptiErr = (*cuptiEventGroupResetAllEventsPtr)( CUDA_ctrl->eventGroup );
1050  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupResetAllEvents" );
1051 
1052  return ( PAPI_OK );
1053 }
1054 
1055 
1056 /*
1057  * Disable and Destoy the CUDA eventGroup */
1058 int
1060 {
1061  ( void ) ctrl;
1062 
1063  // TODO: after cleanup_eventset() which destroys the eventset, update_control_state()
1064  // is called, which operates on the already destroyed eventset. Bad!
1065 #if 0
1066  CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
1067  CUptiResult cuptiErr = CUPTI_SUCCESS;
1068 
1069  /* Disable the CUDA eventGroup;
1070  it also frees the perfmon hardware on the GPU */
1071  cuptiErr = (*cuptiEventGroupDisablePtr)( CUDA_ctrl->eventGroup );
1072  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDisable" );
1073 
1074  /* Call the CuPTI cleaning function before leaving */
1075  cuptiErr = (*cuptiEventGroupDestroyPtr)( CUDA_ctrl->eventGroup );
1076  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDestroy" );
1077 #endif
1078  return ( PAPI_OK );
1079 }
1080 
1081 
1082 /*
1083  * Native Event functions
1084  */
1085 int
1086 CUDA_ntv_enum_events( unsigned int *EventCode, int modifier )
1087 {
1088 
1089  switch ( modifier ) {
1090  case PAPI_ENUM_FIRST:
1091  *EventCode = 0;
1092 
1093  return ( PAPI_OK );
1094  break;
1095 
1096  case PAPI_ENUM_EVENTS:
1097  {
1098  int index = *EventCode;
1099 
1100  if ( index < NUM_EVENTS - 1 ) {
1101  *EventCode = *EventCode + 1;
1102  return ( PAPI_OK );
1103  } else
1104  return ( PAPI_ENOEVNT );
1105 
1106  break;
1107  }
1108  default:
1109  return ( PAPI_EINVAL );
1110  }
1111  return ( PAPI_EINVAL );
1112 }
1113 
1114 
1115 /*
1116  *
1117  */
1118 int
1119 CUDA_ntv_code_to_name( unsigned int EventCode, char *name, int len )
1120 {
1121  int index = EventCode;
1122 
1123  strncpy( name, cuda_native_table[index].name, len );
1124  return ( PAPI_OK );
1125 }
1126 
1127 
1128 /*
1129  *
1130  */
1131 int
1132 CUDA_ntv_code_to_descr( unsigned int EventCode, char *name, int len )
1133 {
1134  int index = EventCode;
1135 
1136  strncpy( name, cuda_native_table[index].description, len );
1137  return ( PAPI_OK );
1138 }
1139 
1140 
1141 /*
1142  *
1143  */
1144 int
1145 CUDA_ntv_code_to_bits( unsigned int EventCode, hwd_register_t * bits )
1146 {
1147  int index = EventCode;
1148 
1149  memcpy( ( CUDA_register_t * ) bits,
1150  &( cuda_native_table[index].resources ),
1151  sizeof ( CUDA_register_t ) );
1152 
1153  return ( PAPI_OK );
1154 }
1155 
1156 
1157 /*
1158  *
1159  */
1160 papi_vector_t _cuda_vector = {
1161  .cmp_info = {
1162  /* default component information (unspecified values are initialized to 0) */
1163  .name = "cuda",
1164  .short_name = "cuda",
1165  .version = "5.0",
1166  .description = "CuPTI provides the API for monitoring NVIDIA GPU hardware events",
1167  .num_mpx_cntrs = CUDA_MAX_COUNTERS,
1168  .num_cntrs = CUDA_MAX_COUNTERS,
1169  .default_domain = PAPI_DOM_USER,
1170  .default_granularity = PAPI_GRN_THR,
1171  .available_granularities = PAPI_GRN_THR,
1172  .hardware_intr_sig = PAPI_INT_SIGNAL,
1173 
1174  /* component specific cmp_info initializations */
1175  .fast_real_timer = 0,
1176  .fast_virtual_timer = 0,
1177  .attach = 0,
1178  .attach_must_ptrace = 0,
1179  .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL,
1180  }
1181  ,
1182 
1183  /* sizes of framework-opaque component-private structures */
1184  .size = {
1185  .context = sizeof ( CUDA_context_t ),
1186  .control_state = sizeof ( CUDA_control_state_t ),
1187  .reg_value = sizeof ( CUDA_register_t ),
1188  .reg_alloc = sizeof ( CUDA_reg_alloc_t ),
1189  }
1190  ,
1191  /* function pointers in this component */
1192  .init_thread = CUDA_init_thread,
1193  .init_component = CUDA_init_component,
1194  .init_control_state = CUDA_init_control_state,
1195  .start = CUDA_start,
1196  .stop = CUDA_stop,
1197  .read = CUDA_read,
1198  .shutdown_component = CUDA_shutdown_component,
1199  .shutdown_thread = CUDA_shutdown_thread,
1200  .cleanup_eventset = CUDA_cleanup_eventset,
1201  .ctl = CUDA_ctl,
1202  .update_control_state = CUDA_update_control_state,
1203  .set_domain = CUDA_set_domain,
1204  .reset = CUDA_reset,
1205 
1206  .ntv_enum_events = CUDA_ntv_enum_events,
1207  .ntv_code_to_name = CUDA_ntv_code_to_name,
1208  .ntv_code_to_descr = CUDA_ntv_code_to_descr,
1209  .ntv_code_to_bits = CUDA_ntv_code_to_bits,
1210 };
char name[PAPI_MAX_STR_LEN]
Definition: papi.h:626
#define PAPI_ENOEVNT
Definition: papi.h:258
sprintf(splash[splash_line++],"\tIozone: Performance Test of File I/O\n")
memset(eventId, 0, size)
#define CUDAAPI
CUdevice dev
Definition: linux-cuda.h:68
CHECK_CUPTI_ERROR(err,"cuptiEventDomainEnumEvents")
CUDA_control_state_t state
Definition: linux-cuda.h:118
#define CUDA_MAX_COUNTERS
Definition: linux-cuda.h:47
int CUDA_ntv_code_to_bits(unsigned int EventCode, hwd_register_t *bits)
Definition: linux-cuda.c:1145
#define CHECK_CU_ERROR(err, cufunc)
Definition: linux-cuda.h:26
char * getenv()
char description[PAPI_2MAX_STR_LEN]
Definition: linux-cuda.h:97
long long flags
Definition: iozone.c:12330
int CUDA_ntv_code_to_name(unsigned int EventCode, char *name, int len)
Definition: linux-cuda.c:1119
AddedEvents_t addedEvents
Definition: linux-cuda.h:110
#define PAPI_ENOSUPP
Definition: papi.h:269
static int enumEvents(int domainId, int eventCount)
static int deviceCount
Definition: linux-cuda.h:136
This file has the source code for a component that enables PAPI-C to access hardware monitoring count...
int CUDA_set_domain(hwd_control_state_t *cntrl, int domain)
Definition: linux-cuda.c:1017
#define PAPI_DOM_KERNEL
Definition: papi.h:298
int CUDA_shutdown_thread(hwd_context_t *ctx)
Definition: linux-cuda.c:885
int CUDA_ntv_enum_events(unsigned int *EventCode, int modifier)
Definition: linux-cuda.c:1086
device[deviceId] domain[domainId] event
Definition: linux-cuda.c:306
totalEventCount
Definition: linux-cuda.c:370
return PAPI_OK
Definition: linux-nvml.c:458
int count
Definition: iozone.c:22422
int CUDA_init_thread(hwd_context_t *ctx)
Definition: linux-cuda.c:498
#define PAPI_DOM_USER
Definition: papi.h:296
void
Definition: iozone.c:18627
return PAPI_EINVAL
Definition: linux-nvml.c:408
PAPI_component_info_t cmp_info
Definition: papi_vector.h:20
int CUDA_ctl(hwd_context_t *ctx, int code, _papi_int_option_t *option)
Definition: linux-cuda.c:939
#define printf
Definition: papi_test.h:125
int CUDA_start(hwd_context_t *ctx, hwd_control_state_t *ctrl)
Definition: linux-cuda.c:826
CUpti_EventID eventId
Definition: linux-cuda.h:51
int CUDA_ntv_code_to_descr(unsigned int EventCode, char *name, int len)
Definition: linux-cuda.c:1132
Return codes and api definitions.
CUpti_EventID eventId
Definition: linux-cuda.h:88
static int createNativeEvents(void)
Definition: linux-cuda.c:380
err
Definition: linux-cuda.c:323
char events[MAX_EVENTS][BUFSIZ]
#define PAPI_2MAX_STR_LEN
Definition: papi.h:464
#define NUM_EVENTS
char disabled_reason[PAPI_MAX_STR_LEN]
Definition: papi.h:633
void(* _dl_non_dynamic_init)(void)
Definition: linux-cuda.c:41
int i
Definition: fileop.c:140
static CUcontext cuCtx
Definition: linux-cuda.h:156
static int currentDeviceID
Definition: linux-cuda.h:139
long long found
Definition: libasync.c:735
char *long long size
Definition: iozone.c:12023
free(dummyfile[xx])
static int cidx
Definition: event_info.c:40
int CUDA_reset(hwd_context_t *ctx, hwd_control_state_t *ctrl)
Definition: linux-cuda.c:1042
static int native
Definition: event_info.c:39
int CUDA_shutdown_component(void)
Definition: linux-cuda.c:896
uint32_t eventCount
Definition: linux-cuda.h:61
__attribute__((constructor))
Definition: init_fini.c:12
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
static CUDA_native_event_entry_t * cuda_native_table
Definition: linux-cuda.h:133
int CUDA_stop(hwd_context_t *ctx, hwd_control_state_t *ctrl)
Definition: linux-cuda.c:852
EventData_t * event
Definition: linux-cuda.h:62
uint32_t domainCount
Definition: linux-cuda.h:70
#define PAPI_INT_SIGNAL
Definition: papi_internal.h:53
int CUDA_cleanup_eventset(hwd_control_state_t *ctrl)
Definition: linux-cuda.c:1059
#define PAPI_GRN_THR
Definition: papi.h:360
uint32_t id
Definition: linux-cuda.c:293
DomainData_t * domain
Definition: linux-cuda.h:71
papi_vector_t _cuda_vector
Definition: linux-cuda.c:1160
#define CUPTIAPI
int CUDA_init_component(int cidx)
Definition: linux-cuda.c:522
static int enumEventDomains(CUdevice dev, int deviceId)
Definition: linux-cuda.c:185
strcpy(filename, default_filename)
static int totalDomainCount
Definition: linux-cuda.h:137
static DeviceData_t * device
Definition: linux-cuda.h:155
char * name
Definition: iozone.c:23648
int
Definition: iozone.c:18528
#define PAPI_MIN_STR_LEN
Definition: papi.h:462
#define long_long
Definition: papi.h:550
int CUDA_read(hwd_context_t *ctx, hwd_control_state_t *ctrl, long_long **events, int flags)
Definition: linux-cuda.c:865
char name[PAPI_MIN_STR_LEN]
Definition: linux-cuda.h:69
Definition: linux-cuda.h:93
static int linkCudaLibraries()
Definition: linux-cuda.c:607
#define PAPI_MAX_STR_LEN
Definition: papi.h:463
int CUDA_update_control_state(hwd_control_state_t *ptr, NativeInfo_t *native, int count, hwd_context_t *ctx)
Definition: linux-cuda.c:955
#define PAPI_DOM_OTHER
Definition: papi.h:299
CUpti_EventDomainID domainId
Definition: linux-cuda.h:59
int CUDA_init_control_state(hwd_control_state_t *ctrl)
Definition: linux-cuda.c:795
long j
Definition: iozone.c:19135
CUpti_EventID * eventId
Definition: linux-cuda.c:291
long long counts[CUDA_MAX_COUNTERS]
Definition: linux-cuda.h:111
CUDA_register_t resources
Definition: linux-cuda.h:95
CUpti_EventGroup eventGroup
Definition: linux-cuda.h:109
static int CUDA_FREED
Definition: linux-cuda.h:140
unsigned int selector
Definition: linux-cuda.h:86
#define CUDARTAPI
static int getEventValue(long long *counts, CUpti_EventGroup eventGroup, AddedEvents_t addedEvents)
Definition: linux-cuda.c:446
char * ptr
Definition: iozone.c:23586