PAPI  5.3.2.0
linux-cuda.c
1 /****************************/
2 /* THIS IS OPEN SOURCE CODE */
3 /****************************/
4 
17 #include <dlfcn.h>
18 
19 #include "papi.h"
20 #include "papi_internal.h"
21 #include "papi_vector.h"
22 #include "papi_memory.h"
23 #include "linux-cuda.h"
24 
25 
26 /******** CHANGE PROTOTYPES TO DECLARE CUDA LIBRARY SYMBOLS AS WEAK **********
27  * This is done so that a version of PAPI built with the cuda component can *
28  * be installed on a system which does not have the cuda libraries installed. *
29  * *
30  * If this is done without these prototypes, then all papi services on the *
31  * system without the cuda libraries installed will fail. The PAPI libraries *
32  * contain references to the cuda libraries which are not installed. The *
33  * load of PAPI commands fails because the cuda library references can not be *
34  * resolved. *
35  * *
36  * This also defines pointers to the cuda library functions that we call. *
37  * These function pointers will be resolved with dlopen/dlsym calls at *
38  * component initialization time. The component then calls the cuda library *
39  * functions through these function pointers. *
40  *******************************************************************************/
41 void (*_dl_non_dynamic_init)(void) __attribute__((weak));
42 #undef CUDAAPI
43 #define CUDAAPI __attribute__((weak))
44 CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
45 CUresult CUDAAPI cuCtxDestroy(CUcontext);
46 CUresult CUDAAPI cuCtxGetCurrent(CUcontext *);
47 CUresult CUDAAPI cuDeviceGet(CUdevice *, int);
48 CUresult CUDAAPI cuDeviceGetCount(int *);
49 CUresult CUDAAPI cuDeviceGetName(char *, int, CUdevice);
50 CUresult CUDAAPI cuInit(unsigned int);
51 
52 CUresult (*cuCtxCreatePtr)(CUcontext *pctx, unsigned int flags, CUdevice dev);
53 CUresult (*cuCtxDestroyPtr)(CUcontext);
54 CUresult (*cuCtxGetCurrentPtr)(CUcontext *);
55 CUresult (*cuDeviceGetPtr)(CUdevice *, int);
56 CUresult (*cuDeviceGetCountPtr)(int *);
57 CUresult (*cuDeviceGetNamePtr)(char *, int, CUdevice);
58 CUresult (*cuInitPtr)(unsigned int);
59 
60 #undef CUDARTAPI
61 #define CUDARTAPI __attribute__((weak))
62 cudaError_t CUDARTAPI cudaFree(void *);
63 cudaError_t CUDARTAPI cudaGetDevice(int *);
64 cudaError_t CUDARTAPI cudaRuntimeGetVersion( int *);
65 cudaError_t CUDARTAPI cudaDriverGetVersion( int *);
66 
67 cudaError_t (*cudaFreePtr)(void *);
68 cudaError_t (*cudaGetDevicePtr)(int *);
69 cudaError_t (*cudaRuntimeGetVersionPtr)(int *);
70 cudaError_t (*cudaDriverGetVersionPtr)(int *);
71 
72 #undef CUPTIAPI
73 #define CUPTIAPI __attribute__((weak))
74 CUptiResult CUPTIAPI cuptiDeviceEnumEventDomains(CUdevice, size_t *, CUpti_EventDomainID *);
75 CUptiResult CUPTIAPI cuptiDeviceGetEventDomainAttribute(CUdevice, CUpti_EventDomainID, CUpti_EventDomainAttribute, size_t *, void *);
76 CUptiResult CUPTIAPI cuptiDeviceGetNumEventDomains(CUdevice, uint32_t *);
77 CUptiResult CUPTIAPI cuptiEventDomainEnumEvents(CUpti_EventDomainID, size_t*, CUpti_EventID *);
78 CUptiResult CUPTIAPI cuptiEventDomainGetNumEvents(CUpti_EventDomainID, uint32_t *);
79 CUptiResult CUPTIAPI cuptiEventGetAttribute(CUpti_EventID, CUpti_EventAttribute, size_t *, void *);
80 CUptiResult CUPTIAPI cuptiEventGroupAddEvent(CUpti_EventGroup, CUpti_EventID);
81 CUptiResult CUPTIAPI cuptiEventGroupCreate(CUcontext, CUpti_EventGroup *, uint32_t);
82 CUptiResult CUPTIAPI cuptiEventGroupDestroy(CUpti_EventGroup);
83 CUptiResult CUPTIAPI cuptiEventGroupDisable(CUpti_EventGroup);
84 CUptiResult CUPTIAPI cuptiEventGroupEnable(CUpti_EventGroup);
85 CUptiResult CUPTIAPI cuptiEventGroupReadAllEvents(CUpti_EventGroup, CUpti_ReadEventFlags, size_t *, uint64_t *, size_t *, CUpti_EventID *, size_t *);
86 CUptiResult CUPTIAPI cuptiEventGroupRemoveAllEvents(CUpti_EventGroup);
87 CUptiResult CUPTIAPI cuptiEventGroupResetAllEvents(CUpti_EventGroup);
88 
89 CUptiResult (*cuptiDeviceEnumEventDomainsPtr)(CUdevice, size_t *, CUpti_EventDomainID *);
90 CUptiResult (*cuptiDeviceGetEventDomainAttributePtr)(CUdevice, CUpti_EventDomainID, CUpti_EventDomainAttribute, size_t *, void *);
91 CUptiResult (*cuptiDeviceGetNumEventDomainsPtr)(CUdevice, uint32_t *);
92 CUptiResult (*cuptiEventDomainEnumEventsPtr)(CUpti_EventDomainID, size_t*, CUpti_EventID *);
93 CUptiResult (*cuptiEventDomainGetNumEventsPtr)(CUpti_EventDomainID, uint32_t *);
94 CUptiResult (*cuptiEventGetAttributePtr)(CUpti_EventID, CUpti_EventAttribute, size_t *, void *);
95 CUptiResult (*cuptiEventGroupAddEventPtr)(CUpti_EventGroup, CUpti_EventID);
96 CUptiResult (*cuptiEventGroupCreatePtr)(CUcontext, CUpti_EventGroup *, uint32_t);
97 CUptiResult (*cuptiEventGroupDestroyPtr)(CUpti_EventGroup);
98 CUptiResult (*cuptiEventGroupDisablePtr)(CUpti_EventGroup);
99 CUptiResult (*cuptiEventGroupEnablePtr)(CUpti_EventGroup);
100 CUptiResult (*cuptiEventGroupReadAllEventsPtr)(CUpti_EventGroup, CUpti_ReadEventFlags, size_t *, uint64_t *, size_t *, CUpti_EventID *, size_t *);
101 CUptiResult (*cuptiEventGroupRemoveAllEventsPtr)(CUpti_EventGroup);
102 CUptiResult (*cuptiEventGroupResetAllEventsPtr)(CUpti_EventGroup);
103 
104 // file handles used to access cuda libraries with dlopen
105 static void* dl1 = NULL;
106 static void* dl2 = NULL;
107 static void* dl3 = NULL;
108 
109 static int linkCudaLibraries ();
110 
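/*
 * Illustrative aside (not part of the original source): a minimal, self-contained
 * sketch of the weak-prototype / dlopen / dlsym pattern described in the comment
 * above, written against a hypothetical library "libfoo.so" exporting one function
 * foo_init(). The component applies the same idea to the CUDA, CUDART and CUPTI
 * symbols declared above, resolving them in linkCudaLibraries() at init time.
 */
#include <dlfcn.h>
#include <stdio.h>

/* weak prototype: the binary links even when libfoo.so is absent */
int foo_init(int flags) __attribute__((weak));

/* function pointer resolved at runtime instead of at link time */
static int (*foo_initPtr)(int flags) = NULL;

static int link_foo_library(void)
{
    void *handle = dlopen("libfoo.so", RTLD_NOW | RTLD_GLOBAL);
    if (handle == NULL)
        return -1;                              /* library missing: disable the feature */
    foo_initPtr = (int (*)(int)) dlsym(handle, "foo_init");
    if (dlerror() != NULL)
        return -1;                              /* symbol missing: disable the feature */
    return (*foo_initPtr)(0);                   /* call through the resolved pointer */
}

int main(void)
{
    printf("link_foo_library() returned %d\n", link_foo_library());
    return 0;
}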
112 
113 
114 /******************************************************************************
115  ******** BEGIN FUNCTIONS USED INTERNALLY SPECIFIC TO THIS COMPONENT *********
116  *****************************************************************************/
117 /*
118  * Detect the GPU device(s) and count the number of CUDA events available on this system
119  */
120 static int
121 detectDevice( void )
122 {
123  CUresult err;
124  int skipDevice = 0;
125  int id;
126  char deviceName_tmp[PAPI_MIN_STR_LEN] = "init";
127 
128  totalEventCount = 0;
129 
130 /* CUDA initialization */
131  err = (*cuInitPtr)( 0 );
132  if ( err != CUDA_SUCCESS ) {
133  SUBDBG ("Info: Error from cuInit(): %d\n", err);
134  return ( PAPI_ENOSUPP );
135  }
136 
137  /* How many gpgpu devices do we have? */
138  err = (*cuDeviceGetCountPtr)( &deviceCount );
139  CHECK_CU_ERROR( err, "cuDeviceGetCount" );
140  if ( deviceCount == 0 )
141  return ( PAPI_ENOSUPP );
142 
143  /* allocate memory for device data table */
144  device = ( DeviceData_t * ) malloc( sizeof ( DeviceData_t ) * deviceCount );
145  if ( device == NULL ) {
146  perror( "malloc(): Failed to allocate memory to CUDA device table" );
147  return ( PAPI_ENOSUPP );
148  }
149 
150  /* What are the devices? Get Name and # of domains per device */
151  for ( id = 0; id < deviceCount; id++ ) {
152  err = (*cuDeviceGetPtr)( &device[id].dev, id );
153  CHECK_CU_ERROR( err, "cuDeviceGet" );
154 
155  err = (*cuDeviceGetNamePtr)( device[id].name, PAPI_MIN_STR_LEN, device[id].dev );
156  CHECK_CU_ERROR( err, "cuDeviceGetName" );
157 
158  SUBDBG ("Cuda deviceName: %s\n", device[id].name);
159 
160  /* Skip device if there are multiple of the same type
161  and it has already been added to the list */
162  if ( 0 == strcmp( deviceName_tmp, device[id].name ) ) {
163  skipDevice++;
164  continue;
165  }
166 
167  strcpy( deviceName_tmp, device[id].name );
168 
169  /* enumerate the domains on the device */
170  if ( 0 != enumEventDomains( device[id].dev, id ) )
171  return ( PAPI_ENOSUPP );
172  }
173 
174  deviceCount = deviceCount - skipDevice;
175 
176  /* return number of events provided via CuPTI */
177  return totalEventCount;
178 }
179 
180 
181 /*
182  * Detect supported domains for specified device
183  */
184 static int
185 enumEventDomains( CUdevice dev, int deviceId )
186 {
187  CUptiResult err = CUPTI_SUCCESS;
188  CUpti_EventDomainID *domainId = NULL;
189  uint32_t id = 0;
190  size_t size = 0;
191 
192  device[deviceId].domainCount = 0;
193 
194  /* get number of domains for device dev */
195  err = (*cuptiDeviceGetNumEventDomainsPtr)( dev, &device[deviceId].domainCount );
196  CHECK_CUPTI_ERROR( err, "cuptiDeviceGetNumEventDomains" );
197 
198  if ( device[deviceId].domainCount == 0 ) {
199  printf( "No domain is exposed by dev = %d\n", dev );
200  return -1;
201  }
202 
203  /* CuPTI domain struct */
204  size = sizeof ( CUpti_EventDomainID ) * device[deviceId].domainCount;
205  domainId = ( CUpti_EventDomainID * ) malloc( size );
206  if ( domainId == NULL ) {
207  perror( "malloc(): Failed to allocate memory to CuPTI domain ID" );
208  return -1;
209  }
210  memset( domainId, 0, size );
211 
212  /* PAPI domain struct */
213  device[deviceId].domain =
214  ( DomainData_t * ) malloc( sizeof ( DomainData_t ) *
215  device[deviceId].domainCount );
216  if ( device[deviceId].domain == NULL ) {
217  perror( "malloc(): Failed to allocate memory to PAPI domain struct" );
218  free(domainId);
219  return -1;
220  }
221 
222  /* Enumerates the event domains for a device dev */
223  err = (*cuptiDeviceEnumEventDomainsPtr)( dev, &size, domainId );
224  CHECK_CUPTI_ERROR( err, "cuptiDeviceEnumEventDomains" );
225 
226  /* enum domains */
227  for ( id = 0; id < device[deviceId].domainCount; id++ ) {
228  device[deviceId].domain[id].domainId = domainId[id];
229 
230  /* query domain name */
231  size = PAPI_MIN_STR_LEN;
232 #ifdef CUDA_4_0
233  err = cuptiEventDomainGetAttribute( dev,
234  device[deviceId].domain[id].
235  domainId,
236  CUPTI_EVENT_DOMAIN_ATTR_NAME, &size,
237  ( void * ) device[deviceId].
238  domain[id].name );
239  CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetAttribute" );
240 
241  /* query number of events available in the domain */
242  size = sizeof ( device[deviceId].domain[id].eventCount );
243  err = cuptiEventDomainGetAttribute( dev,
244  device[deviceId].domain[id].
245  domainId,
246  CUPTI_EVENT_DOMAIN_MAX_EVENTS,
247  &size,
248  ( void * ) &device[deviceId].
249  domain[id].eventCount );
250  CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetAttribute" );
251 
252  /* enumerate the events for the domain[id] on the device dev */
253  if ( 0 != enumEvents( dev, deviceId, id ) )
254  return -1;
255 #else
256  err = (*cuptiDeviceGetEventDomainAttributePtr)( dev,
257  device[deviceId].domain[id].domainId,
258  CUPTI_EVENT_DOMAIN_ATTR_NAME, &size,
259  ( void * ) device[deviceId].domain[id].name );
260  CHECK_CUPTI_ERROR( err, "cuptiDeviceGetEventDomainAttribute" );
261 
262  /* query number of events available in the domain */
263  err = (*cuptiEventDomainGetNumEventsPtr)( device[deviceId].domain[id].domainId,
264  &device[deviceId].domain[id].eventCount );
265  CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetNumEvents" );
266 
267  /* enumerate the events for the domain[id] on the device deviceId */
268  if ( 0 != enumEvents( deviceId, id ) )
269  return -1;
270 #endif
271  }
272 
273  totalDomainCount += device[deviceId].domainCount;
274  free( domainId );
275  return 0;
276 }
277 
278 
279 /*
280  * Detect supported events for specified device domain
281  */
282 #ifdef CUDA_4_0
283 static int
284 enumEvents( CUdevice dev, int deviceId, int domainId )
285 #else
286 static int
287 enumEvents( int deviceId, int domainId )
288 #endif
289 {
290  CUptiResult err = CUPTI_SUCCESS;
291  CUpti_EventID *eventId = NULL;
292  size_t size = 0;
293  uint32_t id = 0;
294 
295  /* CuPTI event struct */
296  size =
297  sizeof ( CUpti_EventID ) * device[deviceId].domain[domainId].eventCount;
298  eventId = ( CUpti_EventID * ) malloc( size );
299  if ( eventId == NULL ) {
300  perror( "malloc(): Failed to allocate memory to CuPTI event ID" );
301  return -1;
302  }
303  memset( eventId, 0, size );
304 
305  /* PAPI event struct */
306  device[deviceId].domain[domainId].event =
307  ( EventData_t * ) malloc( sizeof ( EventData_t ) *
308  device[deviceId].domain[domainId].
309  eventCount );
310  if ( device[deviceId].domain[domainId].event == NULL ) {
311  perror( "malloc(): Failed to allocate memory to PAPI event struct" );
312  free(eventId);
313  return -1;
314  }
315 
316  /* enumerate the events for the domain[domainId] on the device[deviceId] */
317 #ifdef CUDA_4_0
318  err =
319  (*cuptiEventDomainEnumEventsPtr)( dev,
320  ( CUpti_EventDomainID ) device[deviceId].
321  domain[domainId].domainId, &size, eventId );
322 #else
323  err =
324  (*cuptiEventDomainEnumEventsPtr)( ( CUpti_EventDomainID ) device[deviceId].
325  domain[domainId].domainId, &size, eventId );
326 #endif
327  CHECK_CUPTI_ERROR( err, "cuptiEventDomainEnumEvents" );
328 
329  /* query event info */
330  for ( id = 0; id < device[deviceId].domain[domainId].eventCount; id++ ) {
331  device[deviceId].domain[domainId].event[id].eventId = eventId[id];
332 
333  /* query event name */
334  size = PAPI_MIN_STR_LEN;
335 #ifdef CUDA_4_0
336  err = (*cuptiEventGetAttributePtr)( dev,
337  device[deviceId].domain[domainId].
338  event[id].eventId, CUPTI_EVENT_ATTR_NAME,
339  &size,
340  ( uint8_t * ) device[deviceId].
341  domain[domainId].event[id].name );
342 #else
343  err = (*cuptiEventGetAttributePtr)( device[deviceId].domain[domainId].
344  event[id].eventId, CUPTI_EVENT_ATTR_NAME,
345  &size,
346  ( uint8_t * ) device[deviceId].
347  domain[domainId].event[id].name );
348 #endif
349  CHECK_CUPTI_ERROR( err, "cuptiEventGetAttribute" );
350 
351  /* query event description */
352  size = PAPI_2MAX_STR_LEN;
353 #ifdef CUDA_4_0
354  err = (*cuptiEventGetAttributePtr)( dev,
355  device[deviceId].domain[domainId].
356  event[id].eventId,
357  CUPTI_EVENT_ATTR_SHORT_DESCRIPTION, &size,
358  ( uint8_t * ) device[deviceId].
359  domain[domainId].event[id].desc );
360 #else
361  err = (*cuptiEventGetAttributePtr)( device[deviceId].domain[domainId].
362  event[id].eventId,
363  CUPTI_EVENT_ATTR_SHORT_DESCRIPTION, &size,
364  ( uint8_t * ) device[deviceId].
365  domain[domainId].event[id].desc );
366 #endif
367  CHECK_CUPTI_ERROR( err, "cuptiEventGetAttribute" );
368  }
369 
370  totalEventCount += device[deviceId].domain[domainId].eventCount;
371  free( eventId );
372  return 0;
373 }
374 
375 
376 /*
377  * Create the native events for specified domain and device
378  */
379 static int
380 createNativeEvents( void )
381 {
382  int deviceId, id = 0;
383  uint32_t domainId, eventId;
384  int cuptiDomainId;
385  int i;
386  int devNameLen;
387 
388  /* create events for every GPU device and every domain per device */
389  for ( deviceId = 0; deviceId < deviceCount; deviceId++ ) {
390  /* for the event names, replace blanks in the device name with underscores */
391  devNameLen = strlen( device[deviceId].name );
392  for ( i = 0; i < devNameLen; i++ )
393  if ( device[deviceId].name[i] == ' ' )
394  device[deviceId].name[i] = '_';
395 
396  for ( domainId = 0; domainId < device[deviceId].domainCount;
397  domainId++ ) {
398  cuptiDomainId = device[deviceId].domain[domainId].domainId;
399 
400  for ( eventId = 0;
401  eventId < device[deviceId].domain[domainId].eventCount;
402  eventId++ ) {
403  /* Save native event data */
404  sprintf( cuda_native_table[id].name,
405  "%s:%s:%s",
406  device[deviceId].name,
407  device[deviceId].domain[domainId].name,
408  device[deviceId].domain[domainId].event[eventId].
409  name );
410 
411  strncpy( cuda_native_table[id].description,
412  device[deviceId].domain[domainId].event[eventId].desc,
413  PAPI_2MAX_STR_LEN );
414 
415  /* The selector has to be != 0. Starts with 1 */
416  cuda_native_table[id].resources.selector = id + 1;
417 
418  /* store event ID */
419  cuda_native_table[id].resources.eventId =
420  device[deviceId].domain[domainId].event[eventId].eventId;
421 
422  /* increment the table index counter */
423  id++;
424  }
425  }
426  }
427 
428  /* Return the number of events created */
429  return id;
430 }
431 
432 
433 /*
434  * Returns all event values from the CuPTI eventGroup
435  */
436 static int
437 getEventValue( long long *counts, CUpti_EventGroup eventGroup, AddedEvents_t addedEvents )
438 {
439  CUptiResult cuptiErr = CUPTI_SUCCESS;
440  size_t events_read, bufferSizeBytes, arraySizeBytes, i;
441  uint64_t *counterDataBuffer;
442  CUpti_EventID *eventIDArray;
443  int j;
444 
445  bufferSizeBytes = addedEvents.count * sizeof ( uint64_t );
446  counterDataBuffer = ( uint64_t * ) malloc( bufferSizeBytes );
447 
448  arraySizeBytes = addedEvents.count * sizeof ( CUpti_EventID );
449  eventIDArray = ( CUpti_EventID * ) malloc( arraySizeBytes );
450 
451  /* read counter data for the specified event from the CuPTI eventGroup */
452  cuptiErr = (*cuptiEventGroupReadAllEventsPtr)( eventGroup,
453  CUPTI_EVENT_READ_FLAG_NONE,
454  &bufferSizeBytes,
455  counterDataBuffer, &arraySizeBytes,
456  eventIDArray, &events_read );
457  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupReadAllEvents" );
458 
459  if ( events_read != ( size_t ) addedEvents.count )
460  return -1;
461 
462  /* Since there is no guarantee that returned counter values are in the same
463  order as the counters in the PAPI addedEvents.list, we need to map the
464  CUpti_EventID to PAPI event ID values.
465  According to CuPTI doc: counter return values of counterDataBuffer
466  correspond to the returned event IDs in eventIDArray */
467  for ( i = 0; i < events_read; i++ )
468  for ( j = 0; j < addedEvents.count; j++ )
469  if ( cuda_native_table[addedEvents.list[j]].resources.eventId ==
470  eventIDArray[i] )
471  // since cuptiEventGroupReadAllEvents() resets counter values to 0;
472  // we have to accumulate ourselves
473  counts[addedEvents.list[j]] = counts[addedEvents.list[j]] + counterDataBuffer[i];
474 
475  free( counterDataBuffer );
476  free( eventIDArray );
477  return 0;
478 }
479 
480 
481 /*****************************************************************************
482  ******************* BEGIN PAPI's COMPONENT REQUIRED FUNCTIONS *************
483  *****************************************************************************/
484 
485 /*
486  * This is called whenever a thread is initialized
487  */
488 int
489 CUDA_init_thread( hwd_context_t * ctx )
490 {
491  ( void ) ctx;
492 
493  return PAPI_OK;
494 }
495 
496 
497 /* Initialize hardware counters, set up the function vector table
498  * and get hardware information. This routine is called when the
499  * PAPI process is initialized (i.e., PAPI_library_init)
500  *
501  * NOTE: only called by main thread (not by every thread) !!!
502  *
503  * Starting in CUDA 4.0, multiple CPU threads can access the same CUDA context.
504  * This is a much easier programming model than pre-4.0, as threads using the
505  * same context can share memory, data, etc.
506  * It is possible to create a different context for each thread, but then we
507  * would likely run into the limitation that only one context can be profiled at a time.
508  * ==> and we don't want this. That's why CUDA context creation is done in
509  * CUDA_init_component() (called only by main thread) rather than CUDA_init()
510  * or CUDA_init_control_state() (both called by each thread).
511  */
512 int
513 CUDA_init_component( int cidx )
514 {
515  SUBDBG ("Entry: cidx: %d\n", cidx);
516  CUresult cuErr = CUDA_SUCCESS;
517 
518  /* link in all the cuda libraries and resolve the symbols we need to use */
519  if (linkCudaLibraries() != PAPI_OK) {
520  SUBDBG ("Dynamic link of CUDA libraries failed, component will be disabled.\n");
521  SUBDBG ("See disable reason in papi_component_avail output for more details.\n");
522  return (PAPI_ENOSUPP);
523  }
524 
525  /* Create dynamic event table */
526  NUM_EVENTS = detectDevice( );
527  if (NUM_EVENTS < 0) {
528  strncpy(_cuda_vector.cmp_info.disabled_reason, "Call to detectDevice failed.",PAPI_MAX_STR_LEN);
529  return (PAPI_ENOSUPP);
530  }
531  /* TODO: works only for one device right now;
532  need to find out if user can use 2 or more devices at same time */
533 
534  /* we want to create a CUDA context for either the default device or
535  the device specified with cudaSetDevice() in user code */
536  if ( CUDA_SUCCESS != (*cudaGetDevicePtr)( &currentDeviceID ) ) {
537  strncpy(_cuda_vector.cmp_info.disabled_reason, "No NVIDIA GPUs found.",PAPI_MAX_STR_LEN);
538  return ( PAPI_ENOSUPP );
539  }
540 
541  if ( getenv( "PAPI_VERBOSE" ) ) {
542  printf( "DEVICE USED: %s (%d)\n", device[currentDeviceID].name,
543  currentDeviceID );
544  }
545 
546  /* get the CUDA context from the calling CPU thread */
547  cuErr = (*cuCtxGetCurrentPtr)( &cuCtx );
548 
549  /* if no CUDA context is bound to the calling CPU thread yet, create one */
550  if ( cuErr != CUDA_SUCCESS || cuCtx == NULL ) {
551  cuErr = (*cuCtxCreatePtr)( &cuCtx, 0, device[currentDeviceID].dev );
552  CHECK_CU_ERROR( cuErr, "cuCtxCreate" );
553  }
554 
555  /* cuCtxGetCurrent() can return a non-null context that is not valid
556  because the context has not yet been initialized.
557  Here is a workaround:
558  cudaFree(NULL) forces the context to be initialized
559  if cudaFree(NULL) returns success then we are able to use the context in subsequent calls
560  if cudaFree(NULL) (or a subsequent cupti* call) returns an error, then the context is not usable
561  and never will be */
562  if ( CUDA_SUCCESS != (*cudaFreePtr)( NULL ) ) {
563  strncpy(_cuda_vector.cmp_info.disabled_reason, "Problem initializing CUDA context.",PAPI_MAX_STR_LEN);
564  return ( PAPI_ENOSUPP );
565  }
566 
567  /* Create dynamic event table */
568  cuda_native_table = ( CUDA_native_event_entry_t * )
569  malloc( sizeof ( CUDA_native_event_entry_t ) * NUM_EVENTS );
570  if ( cuda_native_table == NULL ) {
571  perror( "malloc(): Failed to allocate memory to events table" );
572  strncpy(_cuda_vector.cmp_info.disabled_reason, "Failed to allocate memory to events table.",PAPI_MAX_STR_LEN);
573  return ( PAPI_ENOSUPP );
574  }
575 
576  if ( NUM_EVENTS != createNativeEvents( ) ) {
577  strncpy(_cuda_vector.cmp_info.disabled_reason, "Error creating CUDA event list.",PAPI_MAX_STR_LEN);
578  return ( PAPI_ENOSUPP );
579  }
580 
581  /* Export the component id */
582  _cuda_vector.cmp_info.CmpIdx = cidx;
583 
584  /* Number of events */
585  _cuda_vector.cmp_info.num_native_events = NUM_EVENTS;
586 
587  return ( PAPI_OK );
588 }
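/*
 * Illustrative aside (not part of the original source): a minimal sketch of the
 * CUDA >= 4.0 threading model described in the comment above CUDA_init_component():
 * one context, created once by the main thread, bound to and reused by a worker
 * thread. Error handling is omitted for brevity.
 */
#include <pthread.h>
#include <cuda.h>

static CUcontext shared_ctx;

static void *worker(void *arg)
{
    (void) arg;
    cuCtxSetCurrent(shared_ctx);   /* bind the shared context to this thread */
    /* ... launch work / collect counters against the same context ... */
    return NULL;
}

int main(void)
{
    pthread_t t;
    cuInit(0);
    cuCtxCreate(&shared_ctx, 0, 0);          /* created once, in the main thread */
    pthread_create(&t, NULL, worker, NULL);
    pthread_join(t, NULL);
    cuCtxDestroy(shared_ctx);
    return 0;
}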
589 
590 
591 /*
592  * Link the necessary CUDA libraries to use the cuda component. If any of them can not be found, then
593  * the CUDA component will just be disabled. This is done at runtime so that a version of PAPI built
594  * with the CUDA component can be installed and used on systems which have the CUDA libraries installed
595  * and on systems where these libraries are not installed.
596  */
597 static int
598 linkCudaLibraries( void )
599 {
600  /* Attempt to guess if we were statically linked to libc, if so bail */
601  if ( _dl_non_dynamic_init != NULL ) {
602  strncpy(_cuda_vector.cmp_info.disabled_reason, "The cuda component does not support statically linking to libc.",PAPI_MAX_STR_LEN);
603  return PAPI_ENOSUPP;
604  }
605  /* Need to link in the cuda libraries, if not found disable the component */
606  dl1 = dlopen("libcuda.so", RTLD_NOW | RTLD_GLOBAL);
607  if (!dl1)
608  {
609  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA library libcuda.so not found.",PAPI_MAX_STR_LEN);
610  return ( PAPI_ENOSUPP );
611  }
612  cuCtxCreatePtr = dlsym(dl1, "cuCtxCreate_v2");
613  if (dlerror() != NULL)
614  {
615  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuCtxCreate not found.",PAPI_MAX_STR_LEN);
616  return ( PAPI_ENOSUPP );
617  }
618  cuCtxDestroyPtr = dlsym(dl1, "cuCtxDestroy_v2");
619  if (dlerror() != NULL)
620  {
621  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuCtxDestroy not found.",PAPI_MAX_STR_LEN);
622  return ( PAPI_ENOSUPP );
623  }
624  cuCtxGetCurrentPtr = dlsym(dl1, "cuCtxGetCurrent");
625  if (dlerror() != NULL)
626  {
627  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuCtxGetCurrent not found.",PAPI_MAX_STR_LEN);
628  return ( PAPI_ENOSUPP );
629  }
630  cuDeviceGetPtr = dlsym(dl1, "cuDeviceGet");
631  if (dlerror() != NULL)
632  {
633  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuDeviceGet not found.",PAPI_MAX_STR_LEN);
634  return ( PAPI_ENOSUPP );
635  }
636  cuDeviceGetCountPtr = dlsym(dl1, "cuDeviceGetCount");
637  if (dlerror() != NULL)
638  {
639  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuDeviceGetCount not found.",PAPI_MAX_STR_LEN);
640  return ( PAPI_ENOSUPP );
641  }
642  cuDeviceGetNamePtr = dlsym(dl1, "cuDeviceGetName");
643  if (dlerror() != NULL)
644  {
645  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuDeviceGetName not found.",PAPI_MAX_STR_LEN);
646  return ( PAPI_ENOSUPP );
647  }
648  cuInitPtr = dlsym(dl1, "cuInit");
649  if (dlerror() != NULL)
650  {
651  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuInit not found.",PAPI_MAX_STR_LEN);
652  return ( PAPI_ENOSUPP );
653  }
654 
655  dl2 = dlopen("libcudart.so", RTLD_NOW | RTLD_GLOBAL);
656  if (!dl2)
657  {
658  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA runtime library libcudart.so not found.",PAPI_MAX_STR_LEN);
659  return ( PAPI_ENOSUPP );
660  }
661  cudaFreePtr = dlsym(dl2, "cudaFree");
662  if (dlerror() != NULL)
663  {
664  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDART function cudaFree not found.",PAPI_MAX_STR_LEN);
665  return ( PAPI_ENOSUPP );
666  }
667  cudaGetDevicePtr = dlsym(dl2, "cudaGetDevice");
668  if (dlerror() != NULL)
669  {
670  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDART function cudaGetDevice not found.",PAPI_MAX_STR_LEN);
671  return ( PAPI_ENOSUPP );
672  }
673  cudaRuntimeGetVersionPtr = dlsym(dl2, "cudaRuntimeGetVersion");
674  if (dlerror() != NULL)
675  {
676  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDART function cudaRuntimeGetVersion not found.",PAPI_MAX_STR_LEN);
677  return ( PAPI_ENOSUPP );
678  }
679  cudaDriverGetVersionPtr = dlsym(dl2, "cudaDriverGetVersion");
680  if (dlerror() != NULL)
681  {
682  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDART function cudaDriverGetVersion not found.",PAPI_MAX_STR_LEN);
683  return ( PAPI_ENOSUPP );
684  }
685 
686  dl3 = dlopen("libcupti.so", RTLD_NOW | RTLD_GLOBAL);
687  if (!dl3)
688  {
689  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA runtime library libcupti.so not found.",PAPI_MAX_STR_LEN);
690  return ( PAPI_ENOSUPP );
691  }
692  cuptiDeviceEnumEventDomainsPtr = dlsym(dl3, "cuptiDeviceEnumEventDomains");
693  if (dlerror() != NULL)
694  {
695  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiDeviceEnumEventDomains not found.",PAPI_MAX_STR_LEN);
696  return ( PAPI_ENOSUPP );
697  }
698  cuptiDeviceGetEventDomainAttributePtr = dlsym(dl3, "cuptiDeviceGetEventDomainAttribute");
699  if (dlerror() != NULL)
700  {
701  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiDeviceGetEventDomainAttribute not found.",PAPI_MAX_STR_LEN);
702  return ( PAPI_ENOSUPP );
703  }
704  cuptiDeviceGetNumEventDomainsPtr = dlsym(dl3, "cuptiDeviceGetNumEventDomains");
705  if (dlerror() != NULL)
706  {
707  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiDeviceGetNumEventDomains not found.",PAPI_MAX_STR_LEN);
708  return ( PAPI_ENOSUPP );
709  }
710  cuptiEventDomainEnumEventsPtr = dlsym(dl3, "cuptiEventDomainEnumEvents");
711  if (dlerror() != NULL)
712  {
713  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventDomainEnumEvents not found.",PAPI_MAX_STR_LEN);
714  return ( PAPI_ENOSUPP );
715  }
716  cuptiEventDomainGetNumEventsPtr = dlsym(dl3, "cuptiEventDomainGetNumEvents");
717  if (dlerror() != NULL)
718  {
719  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventDomainGetNumEvents not found.",PAPI_MAX_STR_LEN);
720  return ( PAPI_ENOSUPP );
721  }
722  cuptiEventGetAttributePtr = dlsym(dl3, "cuptiEventGetAttribute");
723  if (dlerror() != NULL)
724  {
725  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGetAttribute not found.",PAPI_MAX_STR_LEN);
726  return ( PAPI_ENOSUPP );
727  }
728  cuptiEventGroupAddEventPtr = dlsym(dl3, "cuptiEventGroupAddEvent");
729  if (dlerror() != NULL)
730  {
731  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupAddEvent not found.",PAPI_MAX_STR_LEN);
732  return ( PAPI_ENOSUPP );
733  }
734  cuptiEventGroupCreatePtr = dlsym(dl3, "cuptiEventGroupCreate");
735  if (dlerror() != NULL)
736  {
737  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupCreate not found.",PAPI_MAX_STR_LEN);
738  return ( PAPI_ENOSUPP );
739  }
740  cuptiEventGroupDestroyPtr = dlsym(dl3, "cuptiEventGroupDestroy");
741  if (dlerror() != NULL)
742  {
743  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupDestroy not found.",PAPI_MAX_STR_LEN);
744  return ( PAPI_ENOSUPP );
745  }
746  cuptiEventGroupDisablePtr = dlsym(dl3, "cuptiEventGroupDisable");
747  if (dlerror() != NULL)
748  {
749  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupDisable not found.",PAPI_MAX_STR_LEN);
750  return ( PAPI_ENOSUPP );
751  }
752  cuptiEventGroupEnablePtr = dlsym(dl3, "cuptiEventGroupEnable");
753  if (dlerror() != NULL)
754  {
755  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupEnable not found.",PAPI_MAX_STR_LEN);
756  return ( PAPI_ENOSUPP );
757  }
758  cuptiEventGroupReadAllEventsPtr = dlsym(dl3, "cuptiEventGroupReadAllEvents");
759  if (dlerror() != NULL)
760  {
761  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupReadAllEvents not found.",PAPI_MAX_STR_LEN);
762  return ( PAPI_ENOSUPP );
763  }
764  cuptiEventGroupRemoveAllEventsPtr = dlsym(dl3, "cuptiEventGroupRemoveAllEvents");
765  if (dlerror() != NULL)
766  {
767  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupRemoveAllEvents not found.",PAPI_MAX_STR_LEN);
768  return ( PAPI_ENOSUPP );
769  }
770  cuptiEventGroupResetAllEventsPtr = dlsym(dl3, "cuptiEventGroupResetAllEvents");
771  if (dlerror() != NULL)
772  {
773  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupResetAllEvents not found.",PAPI_MAX_STR_LEN);
774  return ( PAPI_ENOSUPP );
775  }
776 
777  return ( PAPI_OK );
778 }
779 
780 
781 /*
782  * Control of counters (Reading/Writing/Starting/Stopping/Setup)
783  * functions
784  */
785 int
786 CUDA_init_control_state( hwd_control_state_t * ctrl )
787 {
788  CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
789  CUptiResult cuptiErr = CUPTI_SUCCESS;
790  int i;
791 
792  /* allocate memory for the list of events that are added to the CuPTI eventGroup */
793  CUDA_ctrl->addedEvents.list = malloc( sizeof ( int ) * NUM_EVENTS );
794  if ( CUDA_ctrl->addedEvents.list == NULL ) {
795  perror
796  ( "malloc(): Failed to allocate memory to table of events that are added to CuPTI eventGroup" );
797  return ( PAPI_ENOSUPP );
798  }
799 
800  /* initialize the event list */
801  for ( i = 0; i < NUM_EVENTS; i++ )
802  CUDA_ctrl->addedEvents.list[i] = 0;
803 
804 
805 
806  cuptiErr = (*cuptiEventGroupCreatePtr)( cuCtx, &CUDA_ctrl->eventGroup, 0 );
807  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupCreate" );
808 
809  return PAPI_OK;
810 }
811 
812 
813 /*
814  *
815  */
816 int
817 CUDA_start( hwd_context_t * ctx, hwd_control_state_t * ctrl )
818 {
819  ( void ) ctx;
820  int i;
821  CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
822  CUptiResult cuptiErr = CUPTI_SUCCESS;
823 
824  // reset all event values to 0
825  for ( i = 0; i < NUM_EVENTS; i++ )
826  CUDA_ctrl->counts[i] = 0;
827 
828  cuptiErr = (*cuptiEventGroupEnablePtr)( CUDA_ctrl->eventGroup );
829  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupEnable" );
830 
831  /* Resets all events in the CuPTI eventGroup to zero */
832  cuptiErr = (*cuptiEventGroupResetAllEventsPtr)( CUDA_ctrl->eventGroup );
833  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupResetAllEvents" );
834 
835  return ( PAPI_OK );
836 }
837 
838 
839 /*
840  *
841  */
842 int
843 CUDA_stop( hwd_context_t * ctx, hwd_control_state_t * ctrl )
844 {
845  ( void ) ctx;
846  ( void ) ctrl;
847 
848  return ( PAPI_OK );
849 }
850 
851 
852 /*
853  *
854  */
855 int
856 CUDA_read( hwd_context_t * ctx, hwd_control_state_t * ctrl,
857  long_long ** events, int flags )
858 {
859  ( void ) ctx;
860  ( void ) flags;
861  CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
862 
863 
864  if ( 0 != getEventValue( CUDA_ctrl->counts, CUDA_ctrl->eventGroup, CUDA_ctrl->addedEvents ) )
865  return ( PAPI_ENOSUPP );
866 
867  *events = CUDA_ctrl->counts;
868 
869  return ( PAPI_OK );
870 }
871 
872 /*
873  *
874  */
875 int
876 CUDA_shutdown_thread( hwd_context_t * ctx )
877 {
878  CUDA_context_t *CUDA_ctx = (CUDA_context_t*)ctx;
879  free( CUDA_ctx->state.addedEvents.list );
880  return (PAPI_OK);
881 }
882 
883 /*
884  *
885  */
886 int
887 CUDA_shutdown_component( void )
888 {
889  CUresult cuErr = CUDA_SUCCESS;
890 
891  /* if running a threaded application, we need to make sure that
892  a thread doesn't free the same memory location(s) more than once */
893  if ( CUDA_FREED == 0 ) {
894  uint32_t j;
895  int i;
896 
897  CUDA_FREED = 1;
898 
899  /* deallocate all the memory */
900  for ( i = 0; i < deviceCount; i++ ) {
901  for ( j = 0; j < device[i].domainCount; j++ )
902  free( device[i].domain[j].event );
903 
904  free( device[i].domain );
905  }
906 
907  free( device );
908  free( cuda_native_table );
909 
910  /* destroy floating CUDA context */
911  cuErr = (*cuCtxDestroyPtr)( cuCtx );
912  if ( cuErr != CUDA_SUCCESS )
913  return ( PAPI_ENOSUPP ); // Not supported
914  }
915 
916  // close the dynamic libraries needed by this component (opened in the init substrate call)
917  dlclose(dl1);
918  dlclose(dl2);
919  dlclose(dl3);
920 
921  return ( PAPI_OK );
922 }
923 
924 
925 /* This function sets various options in the component
926  * The valid codes being passed in are PAPI_SET_DEFDOM,
927  * PAPI_SET_DOMAIN, PAPI_SETDEFGRN, PAPI_SET_GRANUL * and PAPI_SET_INHERIT
928  */
929 int
930 CUDA_ctl( hwd_context_t * ctx, int code, _papi_int_option_t * option )
931 {
932  ( void ) ctx;
933  ( void ) code;
934  ( void ) option;
935  return ( PAPI_OK );
936 }
937 
938 
939 //int CUDA_ntv_code_to_bits ( unsigned int EventCode, hwd_register_t * bits );
940 
941 
942 /*
943  *
944  */
945 int
946 CUDA_update_control_state( hwd_control_state_t * ptr,
947  NativeInfo_t * native, int count,
948  hwd_context_t * ctx )
949 {
950  ( void ) ctx;
951  CUDA_control_state_t * CUDA_ptr = ( CUDA_control_state_t * ) ptr;
952  int index, i;
953  CUptiResult cuptiErr = CUPTI_SUCCESS;
954 
955  /* Disable the CUDA eventGroup;
956  it also frees the perfmon hardware on the GPU */
957  cuptiErr = (*cuptiEventGroupDisablePtr)( CUDA_ptr->eventGroup );
958  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDisable" );
959 
960  cuptiErr = (*cuptiEventGroupRemoveAllEventsPtr)( CUDA_ptr->eventGroup );
961  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupRemoveAllEvents" );
962 
963  // otherwise, add the events to the eventset
964  for ( i = 0; i < count; i++ ) {
965 
966  index = native[i].ni_event;
967  native[i].ni_position = index;
968 
969  /* store events that have been added to the CuPTI eventGroup
970  in a separate place (addedEvents).
971  Needed, so that we can read the values for the added events only */
972  CUDA_ptr->addedEvents.count = count;
973  CUDA_ptr->addedEvents.list[i] = index;
974 
975  /* if this device name is different from the actual device the code is running on, then exit */
976  if ( 0 != strncmp( device[currentDeviceID].name,
977  cuda_native_table[index].name,
978  strlen( device[currentDeviceID].name ) ) ) {
979  fprintf( stderr, "Device %s is used -- BUT event %s is collected. \n ---> ERROR: Specify events for the device that is used!\n\n",
980  device[currentDeviceID].name, cuda_native_table[index].name );
981 
982  return ( PAPI_ENOSUPP ); // Not supported
983  }
984 
985  /* Add events to the CuPTI eventGroup */
986  cuptiErr =
987  (*cuptiEventGroupAddEventPtr)( CUDA_ptr->eventGroup,
988  cuda_native_table[index].resources.
989  eventId );
990  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupAddEvent" );
991  }
992 
993  return ( PAPI_OK );
994 }
995 
996 
997 /*
998  * This function has to set the bits needed to count different domains
999  * In particular: PAPI_DOM_USER, PAPI_DOM_KERNEL PAPI_DOM_OTHER
1000  * By default return PAPI_EINVAL if none of those are specified
1001  * and PAPI_OK with success
1002  * PAPI_DOM_USER: only the user context is counted
1003  * PAPI_DOM_KERNEL: only the Kernel/OS context is counted
1004  * PAPI_DOM_OTHER: Exception/transient mode (like user TLB misses)
1005  * PAPI_DOM_ALL is all of the domains
1006  */
1007 int
1008 CUDA_set_domain( hwd_control_state_t * cntrl, int domain )
1009 {
1010  int found = 0;
1011  ( void ) cntrl;
1012 
1013  if ( PAPI_DOM_USER & domain )
1014  found = 1;
1015 
1016  if ( PAPI_DOM_KERNEL & domain )
1017  found = 1;
1018 
1019  if ( PAPI_DOM_OTHER & domain )
1020  found = 1;
1021 
1022  if ( !found )
1023  return ( PAPI_EINVAL );
1024 
1025  return ( PAPI_OK );
1026 }
1027 
1028 
1029 /*
1030  *
1031  */
1032 int
1033 CUDA_reset( hwd_context_t * ctx, hwd_control_state_t * ctrl )
1034 {
1035  ( void ) ctx;
1036  CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
1037  CUptiResult cuptiErr = CUPTI_SUCCESS;
1038 
1039  /* Resets all events in the CuPTI eventGroup to zero */
1040  cuptiErr = (*cuptiEventGroupResetAllEventsPtr)( CUDA_ctrl->eventGroup );
1041  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupResetAllEvents" );
1042 
1043  return ( PAPI_OK );
1044 }
1045 
1046 
1047 /*
1048  * Disable and Destroy the CUDA eventGroup */
1049 int
1050 CUDA_cleanup_eventset( hwd_control_state_t * ctrl )
1051 {
1052  ( void ) ctrl;
1053 
1054  // TODO: after cleanup_eventset() which destroys the eventset, update_control_state()
1055  // is called, which operates on the already destroyed eventset. Bad!
1056 #if 0
1057  CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
1058  CUptiResult cuptiErr = CUPTI_SUCCESS;
1059 
1060  /* Disable the CUDA eventGroup;
1061  it also frees the perfmon hardware on the GPU */
1062  cuptiErr = (*cuptiEventGroupDisablePtr)( CUDA_ctrl->eventGroup );
1063  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDisable" );
1064 
1065  /* Call the CuPTI cleaning function before leaving */
1066  cuptiErr = (*cuptiEventGroupDestroyPtr)( CUDA_ctrl->eventGroup );
1067  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDestroy" );
1068 #endif
1069  return ( PAPI_OK );
1070 }
1071 
1072 
1073 /*
1074  * Native Event functions
1075  */
1076 int
1077 CUDA_ntv_enum_events( unsigned int *EventCode, int modifier )
1078 {
1079 
1080  switch ( modifier ) {
1081  case PAPI_ENUM_FIRST:
1082  *EventCode = 0;
1083 
1084  return ( PAPI_OK );
1085  break;
1086 
1087  case PAPI_ENUM_EVENTS:
1088  {
1089  int index = *EventCode;
1090 
1091  if ( index < NUM_EVENTS - 1 ) {
1092  *EventCode = *EventCode + 1;
1093  return ( PAPI_OK );
1094  } else
1095  return ( PAPI_ENOEVNT );
1096 
1097  break;
1098  }
1099  default:
1100  return ( PAPI_EINVAL );
1101  }
1102  return ( PAPI_EINVAL );
1103 }
1104 
1105 
1106 /*
1107  *
1108  */
1109 int
1110 CUDA_ntv_code_to_name( unsigned int EventCode, char *name, int len )
1111 {
1112  int index = EventCode;
1113 
1114  strncpy( name, cuda_native_table[index].name, len );
1115  return ( PAPI_OK );
1116 }
1117 
1118 
1119 /*
1120  *
1121  */
1122 int
1123 CUDA_ntv_code_to_descr( unsigned int EventCode, char *name, int len )
1124 {
1125  int index = EventCode;
1126 
1127  strncpy( name, cuda_native_table[index].description, len );
1128  return ( PAPI_OK );
1129 }
1130 
1131 
1132 /*
1133  *
1134  */
1135 int
1136 CUDA_ntv_code_to_bits( unsigned int EventCode, hwd_register_t * bits )
1137 {
1138  int index = EventCode;
1139 
1140  memcpy( ( CUDA_register_t * ) bits,
1141  &( cuda_native_table[index].resources ),
1142  sizeof ( CUDA_register_t ) );
1143 
1144  return ( PAPI_OK );
1145 }
1146 
1147 
1148 /*
1149  *
1150  */
1151 papi_vector_t _cuda_vector = {
1152  .cmp_info = {
1153  /* default component information (unspecified values are initialized to 0) */
1154  .name = "cuda",
1155  .short_name = "cuda",
1156  .version = "5.0",
1157  .description = "CuPTI provides the API for monitoring NVIDIA GPU hardware events",
1158  .num_mpx_cntrs = CUDA_MAX_COUNTERS,
1159  .num_cntrs = CUDA_MAX_COUNTERS,
1160  .default_domain = PAPI_DOM_USER,
1161  .default_granularity = PAPI_GRN_THR,
1162  .available_granularities = PAPI_GRN_THR,
1163  .hardware_intr_sig = PAPI_INT_SIGNAL,
1164 
1165  /* component specific cmp_info initializations */
1166  .fast_real_timer = 0,
1167  .fast_virtual_timer = 0,
1168  .attach = 0,
1169  .attach_must_ptrace = 0,
1170  .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL,
1171  }
1172  ,
1173 
1174  /* sizes of framework-opaque component-private structures */
1175  .size = {
1176  .context = sizeof ( CUDA_context_t ),
1177  .control_state = sizeof ( CUDA_control_state_t ),
1178  .reg_value = sizeof ( CUDA_register_t ),
1179  .reg_alloc = sizeof ( CUDA_reg_alloc_t ),
1180  }
1181  ,
1182  /* function pointers in this component */
1183  .init_thread = CUDA_init_thread,
1184  .init_component = CUDA_init_component,
1185  .init_control_state = CUDA_init_control_state,
1186  .start = CUDA_start,
1187  .stop = CUDA_stop,
1188  .read = CUDA_read,
1189  .shutdown_component = CUDA_shutdown_component,
1190  .shutdown_thread = CUDA_shutdown_thread,
1191  .cleanup_eventset = CUDA_cleanup_eventset,
1192  .ctl = CUDA_ctl,
1193  .update_control_state = CUDA_update_control_state,
1194  .set_domain = CUDA_set_domain,
1195  .reset = CUDA_reset,
1196 
1197  .ntv_enum_events = CUDA_ntv_enum_events,
1198  .ntv_code_to_name = CUDA_ntv_code_to_name,
1199  .ntv_code_to_descr = CUDA_ntv_code_to_descr,
1200  .ntv_code_to_bits = CUDA_ntv_code_to_bits,
1201 };
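Usage sketch (not part of the original file): the native events built by createNativeEvents() above are named "DeviceName:DomainName:EventName" (blanks in the device name replaced by underscores) and are added to a PAPI event set like any other native event. The event name below is hypothetical; on a real system the available names can be listed with papi_native_avail.

#include <stdio.h>
#include <stdlib.h>
#include "papi.h"

int main(void)
{
    /* hypothetical CUDA event name; run papi_native_avail for the real ones */
    char event[] = "cuda:::Tesla_K20c:domain_a:inst_executed";
    int EventSet = PAPI_NULL;
    long long value = 0;

    if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT)
        exit(1);
    if (PAPI_create_eventset(&EventSet) != PAPI_OK)
        exit(1);
    if (PAPI_add_named_event(EventSet, event) != PAPI_OK)
        exit(1);                 /* event not available on this system */

    PAPI_start(EventSet);
    /* ... launch and synchronize CUDA kernels here ... */
    PAPI_stop(EventSet, &value);

    printf("%s: %lld\n", event, value);
    return 0;
}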