PAPI 5.4.1.0
linux-cuda.c
1 /****************************/
2 /* THIS IS OPEN SOURCE CODE */
3 /****************************/
4 
17 #include <dlfcn.h>
18 #include <cupti.h>
19 #include <cuda_runtime_api.h>
20 
21 #include "papi.h"
22 #include "papi_memory.h"
23 #include "papi_internal.h"
24 #include "papi_vector.h"
25 
26 /* this number assumes that there will never be more events than indicated */
27 #define PAPICUDA_MAX_COUNTERS 512
28 
29 /* Contains device list, pointer to device description, and the list of available events */
30 typedef struct papicuda_context {
31  int deviceCount;
32  struct papicuda_device_desc *deviceArray;
33  uint32_t availEventSize;
34  CUpti_EventID* availEventIDArray;
35  int* availEventDeviceNum;
36  struct papicuda_name_desc* availEventDesc;
37 } papicuda_context_t;
38 
39 /* Store the name and description for an event */
40 typedef struct papicuda_name_desc {
41  char name[PAPI_MIN_STR_LEN];
42  char description[PAPI_2MAX_STR_LEN];
43 } papicuda_name_desc_t;
44 
45 /* For a device, store device description */
46 typedef struct papicuda_device_desc {
47  CUdevice cuDev; /* CUDA device */
48  unsigned int deviceNum;
49  char deviceName[PAPI_MIN_STR_LEN];
50  uint32_t maxDomains; /* number of domains per device */
51  CUpti_EventDomainID *domainIDArray; /* Array[maxDomains] of domain IDs */
52  uint32_t *domainIDNumEvents; /* Array[maxDomains] of num of events in that domain */
53 } papicuda_device_desc_t;
54 
55 /* Control structure tracks array of active contexts, records active events and their values */
56 typedef struct papicuda_control {
57  int countOfActiveCUContexts;
58  struct papicuda_active_cucontext_s *arrayOfActiveCUContexts[PAPICUDA_MAX_COUNTERS];
59  int activeEventCount;
60  int activeEventIndex[PAPICUDA_MAX_COUNTERS];
61  int activeEventContextIdx[PAPICUDA_MAX_COUNTERS];
62  long long activeEventValues[PAPICUDA_MAX_COUNTERS];
63 } papicuda_control_t;
64 
65 /* For each active context: the CUDA events being measured and the eventgroups that contain them */
66 typedef struct papicuda_active_cucontext_s {
67  CUcontext context;
68  int deviceNum;
69  int numEventGroups;
70  CUpti_EventGroup eventGroup[PAPICUDA_MAX_COUNTERS];
71 } papicuda_active_cucontext_t;
72 
73 // file handles used to access cuda libraries with dlopen
74 static void *dl1 = NULL;
75 static void *dl2 = NULL;
76 static void *dl3 = NULL;
77 
78 /* The PAPI side (external) variable as a global */
79 papi_vector_t _cuda_vector;
80 
81 /* Global variable for hardware description, event and metric lists */
82 static papicuda_context_t *global_papicuda_context = NULL;
83 
84 /* This global variable points to the head of the control state list */
85 static papicuda_control_t *global_papicuda_control = NULL;
86 
87 /* Macros for error checking... each arg is only referenced/evaluated once */
88 #define CHECK_CU_ERROR(err, cufunc) \
89  if( (err) != CUDA_SUCCESS ) { PAPIERROR( "CUDA Driver API function failed '%s'", cufunc ); return -1; }
90 
91 #define CHECK_CUPTI_ERROR(err, cuptifunc) \
92  if( (err) != CUPTI_SUCCESS ) { PAPIERROR( "CUPTI API function failed '%s'", cuptifunc ); return -1; }
93 
94 #define CHECK_PRINT_EVAL( err, str, eval ) \
95  if( (err) ) { PAPIERROR( "%s", str ); eval; }
96 
97 /******** CHANGE PROTOTYPES TO DECLARE CUDA LIBRARY SYMBOLS AS WEAK **********
98  * This is done so that a version of PAPI built with the cuda component can *
99  * be installed on a system which does not have the cuda libraries installed. *
100  * *
101  * If this is done without these prototypes, then all papi services on the *
102  * system without the cuda libraries installed will fail. The PAPI libraries *
103  * contain references to the cuda libraries which are not installed. The *
104  * load of PAPI commands fails because the cuda library references can not be *
105  * resolved. *
106  * *
107  * This also defines pointers to the cuda library functions that we call. *
108  * These function pointers will be resolved with dlopen/dlsym calls at *
109  * component initialization time. The component then calls the cuda library *
110  * functions through these function pointers. *
111  *******************************************************************************/
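/* A minimal sketch of the pattern described above, for illustration only
 * (the names demo_handle, demoCuInitPtr and demo_resolve_cuInit are
 * hypothetical and not part of this component):
 *
 *   CUresult CUDAAPI cuInit( unsigned int );        // weak prototype
 *   CUresult ( *demoCuInitPtr )( unsigned int );    // resolved at runtime
 *
 *   static int demo_resolve_cuInit( void )
 *   {
 *       void *demo_handle = dlopen( "libcuda.so", RTLD_NOW | RTLD_GLOBAL );
 *       if ( !demo_handle ) return PAPI_ENOSUPP;    // no CUDA library: disable component
 *       demoCuInitPtr = dlsym( demo_handle, "cuInit" );
 *       if ( dlerror() != NULL ) return PAPI_ENOSUPP;
 *       return PAPI_OK;                             // now safe to call (*demoCuInitPtr)(0)
 *   }
 */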
112 void ( *_dl_non_dynamic_init )( void ) __attribute__(( weak ));
113 #undef CUDAAPI
114 #define CUDAAPI __attribute__((weak))
115 CUresult CUDAAPI cuCtxGetCurrent( CUcontext * );
116 CUresult CUDAAPI cuDeviceGet( CUdevice *, int );
117 CUresult CUDAAPI cuDeviceGetCount( int * );
118 CUresult CUDAAPI cuDeviceGetName( char *, int, CUdevice );
120 CUresult CUDAAPI cuInit( unsigned int );
121 CUresult CUDAAPI cuCtxPopCurrent( CUcontext * pctx );
122 CUresult CUDAAPI cuCtxPushCurrent( CUcontext pctx );
123 
124 CUresult( *cuCtxCreatePtr )( CUcontext * pctx, unsigned int flags, CUdevice dev );
125 CUresult( *cuCtxDestroyPtr )( CUcontext );
126 CUresult( *cuCtxGetCurrentPtr )( CUcontext * );
127 CUresult( *cuDeviceGetPtr )( CUdevice *, int );
128 CUresult( *cuDeviceGetCountPtr )( int * );
129 CUresult( *cuDeviceGetNamePtr )( char *, int, CUdevice );
130 CUresult( *cuInitPtr )( unsigned int );
131 CUresult( *cuCtxPopCurrentPtr )( CUcontext * pctx );
132 CUresult( *cuCtxPushCurrentPtr )( CUcontext pctx );
133 
134 #undef CUDARTAPI
135 #define CUDARTAPI __attribute__((weak))
136 cudaError_t CUDARTAPI cudaGetDevice( int * );
137 cudaError_t CUDARTAPI cudaSetDevice( int );
138 cudaError_t CUDARTAPI cudaFree( void * );
139 
140 cudaError_t ( *cudaGetDevicePtr )( int * );
141 cudaError_t ( *cudaSetDevicePtr )( int );
142 cudaError_t (*cudaFreePtr)(void *);
143 
144 #undef CUPTIAPI
145 #define CUPTIAPI __attribute__((weak))
146 CUptiResult CUPTIAPI cuptiDeviceEnumEventDomains( CUdevice, size_t *, CUpti_EventDomainID * );
147 CUptiResult CUPTIAPI cuptiDeviceGetNumEventDomains( CUdevice, uint32_t * );
148 CUptiResult CUPTIAPI cuptiEventDomainEnumEvents( CUpti_EventDomainID, size_t *, CUpti_EventID * );
149 CUptiResult CUPTIAPI cuptiEventDomainGetNumEvents( CUpti_EventDomainID, uint32_t * );
150 CUptiResult CUPTIAPI cuptiEventDomainGetAttribute ( CUpti_EventDomainID eventDomain, CUpti_EventDomainAttribute attrib, size_t* valueSize, void* value );
151 CUptiResult CUPTIAPI cuptiEventGroupAddEvent( CUpti_EventGroup, CUpti_EventID );
152 CUptiResult CUPTIAPI cuptiEventGroupCreate( CUcontext, CUpti_EventGroup *, uint32_t );
153 CUptiResult CUPTIAPI cuptiEventGroupDestroy( CUpti_EventGroup );
154 CUptiResult CUPTIAPI cuptiEventGroupDisable( CUpti_EventGroup );
155 CUptiResult CUPTIAPI cuptiEventGroupEnable( CUpti_EventGroup );
156 CUptiResult CUPTIAPI cuptiEventGroupReadAllEvents( CUpti_EventGroup, CUpti_ReadEventFlags, size_t *, uint64_t *, size_t *, CUpti_EventID *, size_t * );
157 CUptiResult CUPTIAPI cuptiEventGroupResetAllEvents( CUpti_EventGroup );
158 CUptiResult CUPTIAPI cuptiEventGetAttribute( CUpti_EventID, CUpti_EventAttribute, size_t *, void * );
159 
160 CUptiResult( *cuptiDeviceEnumEventDomainsPtr )( CUdevice, size_t *, CUpti_EventDomainID * );
161 CUptiResult( *cuptiDeviceGetNumEventDomainsPtr )( CUdevice, uint32_t * );
162 CUptiResult( *cuptiEventDomainEnumEventsPtr )( CUpti_EventDomainID, size_t *, CUpti_EventID * );
163 CUptiResult( *cuptiEventDomainGetNumEventsPtr )( CUpti_EventDomainID, uint32_t * );
164 CUptiResult( *cuptiEventDomainGetAttributePtr ) ( CUpti_EventDomainID eventDomain, CUpti_EventDomainAttribute attrib, size_t* valueSize, void* value );
165 CUptiResult( *cuptiEventGroupAddEventPtr )( CUpti_EventGroup, CUpti_EventID );
166 CUptiResult( *cuptiEventGroupCreatePtr )( CUcontext, CUpti_EventGroup *, uint32_t );
167 CUptiResult( *cuptiEventGroupDestroyPtr )( CUpti_EventGroup );
168 CUptiResult( *cuptiEventGroupDisablePtr )( CUpti_EventGroup );
169 CUptiResult( *cuptiEventGroupEnablePtr )( CUpti_EventGroup );
170 CUptiResult( *cuptiEventGroupReadAllEventsPtr )( CUpti_EventGroup, CUpti_ReadEventFlags, size_t *, uint64_t *, size_t *, CUpti_EventID *, size_t * );
171 CUptiResult( *cuptiEventGroupResetAllEventsPtr )( CUpti_EventGroup );
172 CUptiResult( *cuptiEventGetAttributePtr )( CUpti_EventID, CUpti_EventAttribute, size_t *, void * );
173 
174 /******************************************************************************
175  ******** BEGIN FUNCTIONS USED INTERNALLY SPECIFIC TO THIS COMPONENT *********
176  *****************************************************************************/
177 
178 /*
179  * Link the necessary CUDA libraries to use the cuda component. If any of them can not be found, then
180  * the CUDA component will just be disabled. This is done at runtime so that a version of PAPI built
181  * with the CUDA component can be installed and used on systems which have the CUDA libraries installed
182  * and on systems where these libraries are not installed.
183  */
184 #define CHECK_DL_STATUS( err, str ) if( err ) { strncpy( _cuda_vector.cmp_info.disabled_reason, str, PAPI_MAX_STR_LEN ); return ( PAPI_ENOSUPP ); }
185 
186 static int papicuda_linkCudaLibraries()
187 {
188  /* Attempt to guess if we were statically linked to libc, if so bail */
189  if( _dl_non_dynamic_init != NULL ) {
190  strncpy( _cuda_vector.cmp_info.disabled_reason, "The cuda component does not support statically linking to libc.", PAPI_MAX_STR_LEN );
191  return PAPI_ENOSUPP;
192  }
193  /* Need to link in the cuda libraries, if not found disable the component */
194  dl1 = dlopen( "libcuda.so", RTLD_NOW | RTLD_GLOBAL );
195  CHECK_DL_STATUS( !dl1 , "CUDA library libcuda.so not found." );
196  cuCtxGetCurrentPtr = dlsym( dl1, "cuCtxGetCurrent" );
197  CHECK_DL_STATUS( dlerror()!=NULL , "CUDA function cuCtxGetCurrent not found." );
198  cuDeviceGetPtr = dlsym( dl1, "cuDeviceGet" );
199  CHECK_DL_STATUS( dlerror()!=NULL, "CUDA function cuDeviceGet not found." );
200  cuDeviceGetCountPtr = dlsym( dl1, "cuDeviceGetCount" );
201  CHECK_DL_STATUS( dlerror()!=NULL, "CUDA function cuDeviceGetCount not found." );
202  cuDeviceGetNamePtr = dlsym( dl1, "cuDeviceGetName" );
203  CHECK_DL_STATUS( dlerror()!=NULL, "CUDA function cuDeviceGetName not found." );
204  cuInitPtr = dlsym( dl1, "cuInit" );
205  CHECK_DL_STATUS( dlerror()!=NULL, "CUDA function cuInit not found." );
206  cuCtxPopCurrentPtr = dlsym( dl1, "cuCtxPopCurrent" );
207  CHECK_DL_STATUS( dlerror()!=NULL, "CUDA function cuCtxPopCurrent not found." );
208  cuCtxPushCurrentPtr = dlsym( dl1, "cuCtxPushCurrent" );
209  CHECK_DL_STATUS( dlerror()!=NULL, "CUDA function cuCtxPushCurrent not found." );
210 
211  dl2 = dlopen( "libcudart.so", RTLD_NOW | RTLD_GLOBAL );
212  CHECK_DL_STATUS( !dl2, "CUDA runtime library libcudart.so not found." );
213  cudaGetDevicePtr = dlsym( dl2, "cudaGetDevice" );
214  CHECK_DL_STATUS( dlerror()!=NULL, "CUDART function cudaGetDevice not found." );
215  cudaSetDevicePtr = dlsym( dl2, "cudaSetDevice" );
216  CHECK_DL_STATUS( dlerror()!=NULL, "CUDART function cudaSetDevice not found." );
217  cudaFreePtr = dlsym( dl2, "cudaFree" );
218  CHECK_DL_STATUS( dlerror()!=NULL, "CUDART function cudaFree not found." );
219 
220  dl3 = dlopen( "libcupti.so", RTLD_NOW | RTLD_GLOBAL );
221  CHECK_DL_STATUS( !dl3, "CUDA runtime library libcupti.so not found." );
222  cuptiDeviceEnumEventDomainsPtr = dlsym( dl3, "cuptiDeviceEnumEventDomains" );
223  CHECK_DL_STATUS( dlerror()!=NULL, "CUPTI function cuptiDeviceEnumEventDomains not found." );
224  cuptiDeviceGetNumEventDomainsPtr = dlsym( dl3, "cuptiDeviceGetNumEventDomains" );
225  CHECK_DL_STATUS( dlerror()!=NULL, "CUPTI function cuptiDeviceGetNumEventDomains not found." );
226  cuptiEventDomainEnumEventsPtr = dlsym( dl3, "cuptiEventDomainEnumEvents" );
227  CHECK_DL_STATUS( dlerror()!=NULL, "CUPTI function cuptiEventDomainEnumEvents not found." );
228  cuptiEventDomainGetNumEventsPtr = dlsym( dl3, "cuptiEventDomainGetNumEvents" );
229  CHECK_DL_STATUS( dlerror()!=NULL, "CUPTI function cuptiEventDomainGetNumEvents not found." );
230  cuptiEventGetAttributePtr = dlsym( dl3, "cuptiEventGetAttribute" );
231  CHECK_DL_STATUS( dlerror()!=NULL, "CUPTI function cuptiEventGetAttribute not found." );
232  cuptiEventGroupAddEventPtr = dlsym( dl3, "cuptiEventGroupAddEvent" );
233  CHECK_DL_STATUS( dlerror()!=NULL, "CUPTI function cuptiEventGroupAddEvent not found." );
234  cuptiEventGroupCreatePtr = dlsym( dl3, "cuptiEventGroupCreate" );
235  CHECK_DL_STATUS( dlerror()!=NULL, "CUPTI function cuptiEventGroupCreate not found." );
236  cuptiEventGroupDestroyPtr = dlsym( dl3, "cuptiEventGroupDestroy" );
237  CHECK_DL_STATUS( dlerror()!=NULL, "CUPTI function cuptiEventGroupDestroy not found." );
238  cuptiEventGroupDisablePtr = dlsym( dl3, "cuptiEventGroupDisable" );
239  CHECK_DL_STATUS( dlerror()!=NULL, "CUPTI function cuptiEventGroupDisable not found." );
240  cuptiEventGroupEnablePtr = dlsym( dl3, "cuptiEventGroupEnable" );
241  CHECK_DL_STATUS( dlerror()!=NULL, "CUPTI function cuptiEventGroupEnable not found." );
242  cuptiEventGroupReadAllEventsPtr = dlsym( dl3, "cuptiEventGroupReadAllEvents" );
243  CHECK_DL_STATUS( dlerror()!=NULL, "CUPTI function cuptiEventGroupReadAllEvents not found." );
244  cuptiEventGroupResetAllEventsPtr = dlsym( dl3, "cuptiEventGroupResetAllEvents" );
245  CHECK_DL_STATUS( dlerror()!=NULL, "CUPTI function cuptiEventGroupResetAllEvents not found." );
246  return ( PAPI_OK );
247 }
248 
249 /* Called during component initialization to get a list of all available events */
250 static int papicuda_list_all_events( papicuda_context_t *gctxt )
251 {
252  SUBDBG( "Entering\n" );
253  CUptiResult cuptiErr;
254  CUresult cuErr;
255  unsigned int deviceNum;
256  uint32_t domainNum, eventNum;
257  papicuda_device_desc_t *mydevice;
258  char tmpStr[PAPI_MIN_STR_LEN];
259  tmpStr[PAPI_MIN_STR_LEN-1]='\0';
260  size_t tmpSizeBytes;
261  int ii;
262 
263  /* How many gpgpu devices do we have? */
264  cuErr = ( *cuDeviceGetCountPtr )( &gctxt->deviceCount );
265  if ( cuErr==CUDA_ERROR_NOT_INITIALIZED ) {
266  /* If CUDA is not initialized, initialize CUDA and retry the device list */
267  /* This is required for some of the PAPI tools that do not call the init functions */
268  CHECK_CU_ERROR( ( *cuInitPtr )( 0 ), "cuInit" );
269  cuErr = ( *cuDeviceGetCountPtr )( &gctxt->deviceCount );
270  }
271  CHECK_CU_ERROR( cuErr, "cuDeviceGetCount" );
272  CHECK_PRINT_EVAL( gctxt->deviceCount==0, "ERROR CUDA: Could not find any CUDA devices", return( PAPI_ENOSUPP ) );
273  SUBDBG( "Found %d devices\n", gctxt->deviceCount );
274 
275  /* allocate memory for device information */
276  gctxt->deviceArray = ( papicuda_device_desc_t * ) papi_calloc( gctxt->deviceCount, sizeof( papicuda_device_desc_t ) );
277  CHECK_PRINT_EVAL( !gctxt->deviceArray, "ERROR CUDA: Could not allocate memory for CUDA device structure", return( PAPI_ENOSUPP ) );
278 
279  /* For each device, get domains and domain-events counts */
280  gctxt->availEventSize = 0;
281  for( deviceNum = 0; deviceNum < ( uint )gctxt->deviceCount; deviceNum++ ) {
282  mydevice = &gctxt->deviceArray[deviceNum];
283  /* Get device id for each device */
284  CHECK_CU_ERROR( ( *cuDeviceGetPtr )( &mydevice->cuDev, deviceNum ), "cuDeviceGet" );
285  /* Get device name */
286  CHECK_CU_ERROR( ( *cuDeviceGetNamePtr )( mydevice->deviceName, PAPI_MIN_STR_LEN-1, mydevice->cuDev ), "cuDeviceGetName" );
287  mydevice->deviceName[PAPI_MIN_STR_LEN-1]='\0';
288  /* Get max num domains for each device */
289  CHECK_CUPTI_ERROR( ( *cuptiDeviceGetNumEventDomainsPtr )( mydevice->cuDev, &mydevice->maxDomains ), "cuptiDeviceGetNumEventDomains" );
290  /* Allocate space to hold domain IDs */
291  mydevice->domainIDArray = ( CUpti_EventDomainID * ) papi_calloc( mydevice->maxDomains, sizeof( CUpti_EventDomainID ) );
292  CHECK_PRINT_EVAL( !mydevice->domainIDArray, "ERROR CUDA: Could not allocate memory for CUDA device domains", return( PAPI_ENOMEM ) );
293  /* Put domain ids into allocated space */
294  size_t domainarraysize = mydevice->maxDomains * sizeof( CUpti_EventDomainID );
295  CHECK_CUPTI_ERROR( ( *cuptiDeviceEnumEventDomainsPtr )( mydevice->cuDev, &domainarraysize, mydevice->domainIDArray ), "cuptiDeviceEnumEventDomains" );
296  /* Allocate space to hold domain event counts */
297  mydevice->domainIDNumEvents = ( uint32_t * ) papi_calloc( mydevice->maxDomains, sizeof( uint32_t ) );
298  CHECK_PRINT_EVAL( !mydevice->domainIDNumEvents, "ERROR CUDA: Could not allocate memory for domain event counts", return( PAPI_ENOMEM ) );
299  /* For each domain, get event counts in domainNumEvents[] */
300  for ( domainNum=0; domainNum < mydevice->maxDomains; domainNum++ ) {
301  CUpti_EventDomainID domainID = mydevice->domainIDArray[domainNum];
302  /* Get num events in domain */
303  //SUBDBG( "Device %d:%d calling cuptiEventDomainGetNumEventsPtr with domainID %d \n", deviceNum, mydevice->cuDev, domainID );
304  CHECK_CUPTI_ERROR( ( *cuptiEventDomainGetNumEventsPtr ) ( domainID, &mydevice->domainIDNumEvents[domainNum] ), "cuptiEventDomainGetNumEvents" );
305  /* Keep track of overall number of events */
306  gctxt->availEventSize += mydevice->domainIDNumEvents[domainNum];
307  }
308  }
309 
310  /* Allocate space for all events and descriptors */
311  gctxt->availEventIDArray = ( CUpti_EventID * ) papi_calloc( gctxt->availEventSize, sizeof( CUpti_EventID ) );
312  CHECK_PRINT_EVAL( !gctxt->availEventIDArray, "ERROR CUDA: Could not allocate memory for events", return( PAPI_ENOMEM ) );
313  gctxt->availEventDeviceNum = ( int * ) papi_calloc( gctxt->availEventSize, sizeof( int ) );
314  CHECK_PRINT_EVAL( !gctxt->availEventDeviceNum, "ERROR CUDA: Could not allocate memory", return( PAPI_ENOMEM ) );
315  gctxt->availEventDesc = ( papicuda_name_desc_t * ) papi_calloc( gctxt->availEventSize, sizeof( papicuda_name_desc_t ) );
316  CHECK_PRINT_EVAL( !gctxt->availEventDesc, "ERROR CUDA: Could not allocate memory for events", return( PAPI_ENOMEM ) );
317  /* Record the events and descriptions */
318  int idxEventArray = 0;
319  for( deviceNum = 0; deviceNum < ( uint )gctxt->deviceCount; deviceNum++ ) {
320  mydevice = &gctxt->deviceArray[deviceNum];
321  //SUBDBG( "For device %d %d maxdomains %d \n", deviceNum, mydevice->cuDev, mydevice->maxDomains );
322  /* Get and store event IDs, names, descriptions into the large arrays allocated */
323  for ( domainNum=0; domainNum < mydevice->maxDomains; domainNum++ ) {
324  /* Get domain id */
325  CUpti_EventDomainID domainID = mydevice->domainIDArray[domainNum];
326  uint32_t domainNumEvents = mydevice->domainIDNumEvents[domainNum];
327  SUBDBG( "For device %d domain %d %d numEvents %d\n", mydevice->cuDev, domainNum, domainID, domainNumEvents );
328  /* Allocate temp space for eventIDs for this domain */
329  CUpti_EventID *domainEventIDArray = ( CUpti_EventID * ) papi_calloc( domainNumEvents, sizeof( CUpti_EventID ) );
330  CHECK_PRINT_EVAL( !domainEventIDArray, "ERROR CUDA: Could not allocate memory for events", return( PAPI_ENOMEM ) );
331  /* Load the domain eventIDs in temp space */
332  size_t domainEventArraySize = domainNumEvents * sizeof( CUpti_EventID );
333  cuptiErr = ( *cuptiEventDomainEnumEventsPtr ) ( domainID, &domainEventArraySize, domainEventIDArray );
334  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventDomainEnumEvents" );
335  /* For each event, get and store name and description */
336  for ( eventNum=0; eventNum<domainNumEvents; eventNum++ ) {
337  /* Record the event IDs in native event array */
338  CUpti_EventID myeventID = domainEventIDArray[eventNum];
339  gctxt->availEventIDArray[idxEventArray] = myeventID;
340  gctxt->availEventDeviceNum[idxEventArray] = deviceNum;
341  /* Get event name */
342  tmpSizeBytes = ( PAPI_MIN_STR_LEN - 1 ) * sizeof( char );
343  cuptiErr = ( *cuptiEventGetAttributePtr ) ( myeventID, CUPTI_EVENT_ATTR_NAME, &tmpSizeBytes, tmpStr ) ;
344  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGetAttribute" );
345  /* Save a full path for the event, filling spaces with underscores */
346  //snprintf( gctxt->availEventDesc[idxEventArray].name, PAPI_MIN_STR_LEN, "%s:%d:%s", mydevice->deviceName, deviceNum, tmpStr );
347  snprintf( gctxt->availEventDesc[idxEventArray].name, PAPI_MIN_STR_LEN, "device:%d:%s", deviceNum, tmpStr );
348  gctxt->availEventDesc[idxEventArray].name[PAPI_MIN_STR_LEN-1] = '\0';
349  char *nameTmpPtr = gctxt->availEventDesc[idxEventArray].name;
350  for ( ii = 0; ii < ( int )strlen( nameTmpPtr ); ii++ ) if ( nameTmpPtr[ii] == ' ' ) nameTmpPtr[ii] = '_';
351  /* Save description in the native event array */
352  tmpSizeBytes = ( PAPI_2MAX_STR_LEN - 1 ) * sizeof( char );
353  cuptiErr = ( *cuptiEventGetAttributePtr ) ( myeventID, CUPTI_EVENT_ATTR_SHORT_DESCRIPTION, &tmpSizeBytes, gctxt->availEventDesc[idxEventArray].description );
354  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGetAttribute" );
355  gctxt->availEventDesc[idxEventArray].description[PAPI_2MAX_STR_LEN-1] = '\0';
356  // SUBDBG( "Event ID:%d Name:%s Desc:%s\n", gctxt->availEventIDArray[idxEventArray], gctxt->availEventDesc[idxEventArray].name, gctxt->availEventDesc[idxEventArray].description );
357  /* Advance to the next slot in the native event array */
358  idxEventArray++;
359  }
360  papi_free ( domainEventIDArray );
361  }
362  }
363  /* return 0 if everything went OK */
364  return 0;
365 }
366 
367 
368 /*****************************************************************************
369  ******************* BEGIN PAPI's COMPONENT REQUIRED FUNCTIONS *************
370  *****************************************************************************/
371 
372 /*
373  * This is called whenever a thread is initialized.
374  */
375 static int papicuda_init_thread( hwd_context_t * ctx )
376 {
377  ( void ) ctx;
378  SUBDBG( "Entering\n" );
379 
380  return PAPI_OK;
381 }
382 
383 
388 /* NOTE: only called by main thread (not by every thread) !!! Starting
389  in CUDA 4.0, multiple CPU threads can access the same CUDA
390  context. This is a much easier programming model than pre-4.0 as
391  threads - using the same context - can share memory, data,
392  etc. It's possible to create a different context for each thread.
393  That's why CUDA context creation is done in CUDA_init_component()
394  (called only by main thread) rather than CUDA_init() or
395  CUDA_init_control_state() (both called by each thread). */
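/* Illustrative sketch of the shared-context model described above (the
 * helper name demo_current_context is hypothetical): with CUDA 4.0 and
 * later, several CPU threads may work against the same CUDA context, and
 * each thread can simply query whichever context is current to it.
 *
 *   static CUcontext demo_current_context( void )
 *   {
 *       CUcontext ctx = NULL;
 *       ( *cuCtxGetCurrentPtr )( &ctx );   // context current to the calling thread
 *       return ctx;
 *   }
 */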
396 static int papicuda_init_component( int cidx )
397 {
398  SUBDBG( "Entering with cidx: %d\n", cidx );
399  int err;
400 
401  /* link in all the cuda libraries and resolve the symbols we need to use */
402  if( papicuda_linkCudaLibraries() != PAPI_OK ) {
403  PAPIERROR( "Dynamic link of CUDA libraries failed, component will be disabled.\n" );
404  return ( PAPI_ENOSUPP );
405  }
406 
407  /* Create the structure */
408  if ( !global_papicuda_context )
409  global_papicuda_context = ( papicuda_context_t* ) papi_calloc( 1, sizeof( papicuda_context_t ) );
410 
411  /* Get list of all native CUDA events supported */
412  err = papicuda_list_all_events( global_papicuda_context );
413  CHECK_PRINT_EVAL( err!=0, "ERROR: Could not get a list of CUDA/CUPTI events", return( PAPI_ENOSUPP ) );
414 
415  /* Export some information */
416  _cuda_vector.cmp_info.CmpIdx = cidx;
417  _cuda_vector.cmp_info.num_native_events = global_papicuda_context->availEventSize;
418  _cuda_vector.cmp_info.num_cntrs = _cuda_vector.cmp_info.num_native_events;
419  _cuda_vector.cmp_info.num_mpx_cntrs = _cuda_vector.cmp_info.num_native_events;
420 
421  //SUBDBG( "Exiting PAPI_OK\n" );
422  return ( PAPI_OK );
423 }
424 
425 
430 static int papicuda_init_control_state( hwd_control_state_t * ctrl )
431 {
432  SUBDBG( "Entering\n" );
433  ( void ) ctrl;
434  papicuda_context_t *gctxt = global_papicuda_context;
435 
436  CHECK_PRINT_EVAL( !gctxt, "Error: The PAPI CUDA component needs to be initialized first", return( PAPI_ENOINIT ) );
437  /* If no events were found during the initial component initialization, return error */
438  if( global_papicuda_context->availEventSize <= 0 ) {
439  strncpy( _cuda_vector.cmp_info.disabled_reason, "ERROR CUDA: No events exist", PAPI_MAX_STR_LEN );
440  return ( PAPI_EMISC );
441  }
442  /* If it does not exist, create the global structure to hold CUDA contexts and active events */
443  if ( !global_papicuda_control ) {
444  global_papicuda_control = ( papicuda_control_t* ) papi_calloc( 1, sizeof( papicuda_control_t ) );
445  global_papicuda_control->countOfActiveCUContexts = 0;
446  global_papicuda_control->activeEventCount = 0;
447  }
448  return PAPI_OK;
449 }
450 
456 static int papicuda_update_control_state( hwd_control_state_t *ctrl, NativeInfo_t *nativeInfo, int nativeCount, hwd_context_t *ctx )
457 {
458  /* Note: NativeInfo_t is defined in papi_internal.h */
459  SUBDBG( "Entering with nativeCount %d\n", nativeCount );
460  ( void ) ctx;
461  ( void ) ctrl;
462  papicuda_control_t *gctrl = global_papicuda_control;
463  papicuda_context_t *gctxt = global_papicuda_context;
464  papicuda_active_cucontext_t *currctrl;
465  int currDeviceNum, currContextIdx, cuContextIdx;
466  CUcontext currCuCtx;
467  int index, ii, jj;
468 
469  if ( nativeCount == 0 ) {
470  /* Does nativeCount==0 imply that the component is being reset? */
471  /* gctrl->activeEventCount = 0; */
472  } else {
473  /* nativecount>0 so we need to process the events */
474  // SUBDBG( "There are currently %d contexts\n", gctrl->countOfActiveCUContexts );
475 
476  /* Get/query some device and context specific information */
477  CHECK_PRINT_EVAL( ( *cudaGetDevicePtr )( &currDeviceNum )!=CUDA_SUCCESS, "cudaGetDevice: CUDA device MUST be set before adding events", return( PAPI_EMISC ) );
478  CHECK_PRINT_EVAL( ( *cudaFreePtr )( NULL )!=CUDA_SUCCESS, "cudaFree: Failed to free in this CUDA context", return( PAPI_EMISC ) );
479  CHECK_PRINT_EVAL( ( *cuCtxGetCurrentPtr )( &currCuCtx )!=CUDA_SUCCESS, "cuCtxGetCurrent: CUDA context MUST be initialized before adding events", return ( PAPI_EMISC ) );
480 
481  /* Find current context/control, creating it if does not exist */
482  for ( cuContextIdx=0; cuContextIdx<gctrl->countOfActiveCUContexts; cuContextIdx++ )
483  if ( gctrl->arrayOfActiveCUContexts[cuContextIdx]->context == currCuCtx ) break;
484  CHECK_PRINT_EVAL( cuContextIdx==PAPICUDA_MAX_COUNTERS, "Exceeded hardcoded maximum number of contexts (PAPICUDA_MAX_COUNTERS)", return( PAPI_EMISC ) );
485  if ( cuContextIdx==gctrl->countOfActiveCUContexts ) {
486  gctrl->arrayOfActiveCUContexts[cuContextIdx] = papi_calloc( 1, sizeof( papicuda_active_cucontext_t ) );
487  CHECK_PRINT_EVAL( ( gctrl->arrayOfActiveCUContexts[cuContextIdx]==NULL ), "Memory allocation for new active context failed", return( PAPI_ENOMEM ) ) ;
488  gctrl->arrayOfActiveCUContexts[cuContextIdx]->context = currCuCtx;
489  gctrl->arrayOfActiveCUContexts[cuContextIdx]->deviceNum = currDeviceNum;
490  gctrl->countOfActiveCUContexts++;
491  SUBDBG( "Added a new context ... now %d\n", gctrl->countOfActiveCUContexts );
492  }
493  currContextIdx = cuContextIdx;
494  currctrl = gctrl->arrayOfActiveCUContexts[currContextIdx];
495  /* At this point, currCuCtx is at index cuContextIdx in the arrayOfActiveCUContexts array */
496 
497  /* For each event, check if it is already added. If not, try to add it to the current context.
498  Try each existing eventgroup. If no existing group accepts the event, create a new event group; if the new group will not accept it either, fail */
499  /* For each event */
500  for( ii = 0; ii < nativeCount; ii++ ) {
501  index = nativeInfo[ii].ni_event; /* Get the PAPI event index from the user */
502  /* Check to see if event is already in some context */
503  SUBDBG( "Searching %d active events to see if event %d %s is already in some context\n", gctrl->activeEventCount, index, gctxt->availEventDesc[index].name );
504  int eventAlreadyAdded=0;
505  for( jj = 0; jj < gctrl->activeEventCount; jj++ ) {
506  if ( gctrl->activeEventIndex[jj] == index ) {
507  eventAlreadyAdded=1;
508  break;
509  }
510  }
511 
512  /* If event was not found in any context.. try to insert it into current context */
513  if ( !eventAlreadyAdded ) {
514  SUBDBG( "Need to add event %d %s to the current context\n", index, gctxt->availEventDesc[index].name );
515  /* Make sure that the device number for the event matches the device for this context */
516  CHECK_PRINT_EVAL( (currDeviceNum!=gctxt->availEventDeviceNum[index]), "Current CUDA device cannot use this event", return( PAPI_EINVAL ) );
517  /* if this event index corresponds to something from availEventIDArray */
518  if ( index < ( int )gctxt->availEventSize ) {
519  /* lookup cuptieventid for this event index */
520  CUpti_EventID cuptieventid = gctxt->availEventIDArray[index];
521  CUpti_EventGroup cuptieventgroup;
522  int addstatus=!CUPTI_SUCCESS, gg;
523  SUBDBG( "Event %s is going to be added to current context %d having %d eventgroups\n", gctxt->availEventDesc[index].name, currContextIdx, currctrl->numEventGroups );
524  /* For each existing eventgroup, try to insert this event */
525  for ( gg=0; gg<currctrl->numEventGroups; gg++ ) {
526  cuptieventgroup = currctrl->eventGroup[gg];
527  addstatus = ( *cuptiEventGroupAddEventPtr )( cuptieventgroup, cuptieventid );
528  if ( addstatus==CUPTI_SUCCESS ) {
529  SUBDBG( "Event %s successfully added to current eventgroup %d:%d\n", gctxt->availEventDesc[index].name, currContextIdx, gg );
530  break;
531  }
532  }
533  /* If the event could not be added to any earlier eventgroup, create a new one and try again. Fail if this does not succeed */
534  if ( addstatus!=CUPTI_SUCCESS ) {
535  //SUBDBG( "Event %s needs a new eventgroup\n", gctxt->availEventDesc[index].name );
536  CHECK_PRINT_EVAL( ( gg>PAPICUDA_MAX_COUNTERS-1 ), "For current CUDA device, could not add event (no more eventgroups can be added)", return( PAPI_EMISC ) );
537  //SUBDBG( "gg %d context %d %p\n", gg, currctrl->context, currctrl->context );
538  CHECK_CUPTI_ERROR( ( *cuptiEventGroupCreatePtr )( currctrl->context, &currctrl->eventGroup[gg], 0 ), "cuptiEventGroupCreate" );
539  cuptieventgroup = currctrl->eventGroup[gg];
540  currctrl->numEventGroups++;
541  addstatus = ( *cuptiEventGroupAddEventPtr )( cuptieventgroup, cuptieventid );
542  CHECK_PRINT_EVAL( ( addstatus!=CUPTI_SUCCESS ), "cuptiEventGroupAddEvent: Could not add event (event may not match CUDA context)", return( PAPI_EMISC ) );
543  SUBDBG( "Event %s successfully added to new eventgroup %d:%d\n", gctxt->availEventDesc[index].name, currContextIdx, gg );
544  }
545  }
546 
547  /* Record index of this active event back into the nativeInfo structure */
548  nativeInfo[ii].ni_position = gctrl->activeEventCount;
549  /* record added event at the higher level */
550  CHECK_PRINT_EVAL( ( gctrl->activeEventCount==PAPICUDA_MAX_COUNTERS-1 ), "Exceeded maximum num of events (PAPICUDA_MAX_COUNTERS)", return( PAPI_EMISC ) );
551  gctrl->activeEventIndex[gctrl->activeEventCount] = index;
552  gctrl->activeEventContextIdx[gctrl->activeEventCount] = currContextIdx;
553  gctrl->activeEventValues[gctrl->activeEventCount] = 0;
554  gctrl->activeEventCount++;
555 
556  }
557  }
558  }
559  return ( PAPI_OK );
560 }
561 
565 static int papicuda_start( hwd_context_t * ctx, hwd_control_state_t * ctrl )
566 {
567  SUBDBG( "Entering\n" );
568  ( void ) ctx;
569  ( void ) ctrl;
570  papicuda_control_t *gctrl = global_papicuda_control;
571  //papicuda_context_t *gctxt = global_papicuda_context;
572  papicuda_active_cucontext_t *currctrl;
573  int cuContextIdx, gg, ii;
574  CUptiResult cuptiErr;
575  CUcontext saveCtx, tmpCtx;
576 
577  //SUBDBG( "Reset all active event values\n" );
578  for ( ii=0; ii<gctrl->activeEventCount; ii++ )
579  gctrl->activeEventValues[ii] = 0;
580 
581  // SUBDBG( "Switch to each context and enable CUDA eventgroups associated with that context\n" );
582  /* Save current cuda context */
583  CHECK_CU_ERROR( ( *cuCtxPopCurrentPtr ) ( &saveCtx ), "cuCtxPopCurrent" );
584  /* Switch to each context and enable CUDA eventgroups */
585  for ( cuContextIdx=0; cuContextIdx<gctrl->countOfActiveCUContexts; cuContextIdx++ ) {
586  currctrl = gctrl->arrayOfActiveCUContexts[cuContextIdx];
587  //SUBDBG( "Try to switch to context %d associated with device %d\n", cuContextIdx, currctrl->deviceNum );
588  CHECK_CU_ERROR( ( *cuCtxPushCurrentPtr ) ( currctrl->context ), "cuCtxPushCurrent" );
589  for ( gg=0; gg<currctrl->numEventGroups; gg++ ) {
590  // SUBDBG( "Enable event group\n" );
591  cuptiErr = ( *cuptiEventGroupEnablePtr )( currctrl->eventGroup[gg] );
592  CHECK_PRINT_EVAL( ( cuptiErr!=CUPTI_SUCCESS ), "cuptiEventGroupEnable: Could not enable one of the event groups", return( PAPI_EMISC ) );
593  // SUBDBG( "Reset events in eventgroup\n" );
594  cuptiErr = ( *cuptiEventGroupResetAllEventsPtr )( currctrl->eventGroup[gg] );
595  CHECK_PRINT_EVAL( ( cuptiErr!=CUPTI_SUCCESS ), "cuptiEventGroupResetAllEvents: Could not reset the event groups", return( PAPI_EMISC ) );
596  SUBDBG( "For papicuda context %d on device %d event group %d was enabled and reset\n", cuContextIdx, currctrl->deviceNum, gg );
597  }
598  // SUBDBG( "Pop temp context\n" );
599  CHECK_CU_ERROR( ( *cuCtxPopCurrentPtr ) ( &tmpCtx ), "cuCtxPopCurrent" );
600  }
601  //SUBDBG( "Restore original context\n" );
602  CHECK_CU_ERROR( ( *cuCtxPushCurrentPtr ) ( saveCtx ), "cuCtxPushCurrent" );
603  return ( PAPI_OK );
604 }
605 
607 static int papicuda_stop( hwd_context_t * ctx, hwd_control_state_t * ctrl )
608 {
609  SUBDBG( "Entering to disable all CUPTI eventgroups\n" );
610  ( void ) ctx;
611  ( void ) ctrl;
612  papicuda_control_t *gctrl = global_papicuda_control;
613  papicuda_active_cucontext_t *currctrl;
614  int cuContextIdx, gg;
615  CUptiResult cuptiErr;
616  CUcontext saveCtx, tmpCtx;
617 
618  // SUBDBG( "Save initial CUDA context\n" );
619  CHECK_CU_ERROR( ( *cuCtxPopCurrentPtr ) ( &saveCtx ), "cuCtxPopCurrent" );
620  // SUBDBG( "Switch to each context and disable CUDA eventgroups\n" );
621  for ( cuContextIdx=0; cuContextIdx<gctrl->countOfActiveCUContexts; cuContextIdx++ ) {
622  currctrl = gctrl->arrayOfActiveCUContexts[cuContextIdx];
623  //SUBDBG( "Try to switch to context %d associated with device %d\n", cuContextIdx, currctrl->deviceNum );
624  CHECK_CU_ERROR( ( *cuCtxPushCurrentPtr ) ( currctrl->context ), "cuCtxPushCurrent" );
625  for ( gg=0; gg<currctrl->numEventGroups; gg++ ) {
626  // SUBDBG( "Disable events in eventgroup\n" );
627  cuptiErr = ( *cuptiEventGroupDisablePtr )( currctrl->eventGroup[gg] );
628  CHECK_PRINT_EVAL( ( cuptiErr!=CUPTI_SUCCESS ), "cuptiEventGroupDisable: Could not disable the event groups", return( PAPI_EMISC ) );
629  SUBDBG( "For papicuda context %d on device %d event group %d was disabled\n", cuContextIdx, currctrl->deviceNum, gg );
630  }
631  CHECK_CU_ERROR( ( *cuCtxPopCurrentPtr ) ( &tmpCtx ), "cuCtxPopCurrent" );
632  }
633  //SUBDBG( "Restore original context\n" );
634  CHECK_CU_ERROR( ( *cuCtxPushCurrentPtr ) ( saveCtx ), "cuCtxPushCurrent" );
635  return ( PAPI_OK );
636 }
637 
638 
642 static int papicuda_read( hwd_context_t * ctx, hwd_control_state_t * ctrl, long long ** events, int flags )
643 {
644  SUBDBG( "Entering\n" );
645  ( void ) ctx;
646  ( void ) ctrl;
647  ( void ) flags;
648  papicuda_control_t *gctrl = global_papicuda_control;
649  papicuda_context_t *gctxt = global_papicuda_context;
650  papicuda_active_cucontext_t *currctrl;
651  int cuContextIdx, gg, ii, jj;
652  CUcontext saveCtx, tmpCtx;
653  CUptiResult cuptiErr;
654  size_t readEventValueBufferSize = sizeof( uint64_t )*PAPICUDA_MAX_COUNTERS;
655  uint64_t readEventValueBuffer[PAPICUDA_MAX_COUNTERS];
656  size_t readEventIDArraySize = sizeof( CUpti_EventID )*PAPICUDA_MAX_COUNTERS;
657  CUpti_EventID readEventIDArray[PAPICUDA_MAX_COUNTERS];
658  size_t numEventIDsRead;
659 
660  SUBDBG( "Switch to each context and read CUDA eventgroups\n" );
661  // SUBDBG( "Save initial CUDA context\n" );
662  CHECK_CU_ERROR( ( *cuCtxPopCurrentPtr ) ( &saveCtx ), "cuCtxPopCurrent" );
663  /* Switch to each context and read its CUDA eventgroups */
664  for ( cuContextIdx=0; cuContextIdx<gctrl->countOfActiveCUContexts; cuContextIdx++ ) {
665  currctrl = gctrl->arrayOfActiveCUContexts[cuContextIdx];
666  // SUBDBG( "Switch to context %d associated with device %d\n", cuContextIdx, currctrl->deviceNum );
667  CHECK_CU_ERROR( ( *cuCtxPushCurrentPtr ) ( currctrl->context ), "cuCtxPushCurrent" );
668  for ( gg=0; gg<currctrl->numEventGroups; gg++ ) {
669  // SUBDBG( "Read from context %d eventgroup %d\n", cuContextIdx, gg );
670  cuptiErr = ( *cuptiEventGroupReadAllEventsPtr )( currctrl->eventGroup[gg], CUPTI_EVENT_READ_FLAG_NONE, &readEventValueBufferSize, readEventValueBuffer, &readEventIDArraySize, readEventIDArray, &numEventIDsRead );
671  CHECK_PRINT_EVAL( ( cuptiErr!=CUPTI_SUCCESS ), "cuptiEventGroupReadAllEvents: Could not read from CUPTI eventgroup", return( PAPI_EMISC ) );
672  /* Match read values against active events by scanning activeEvents array and matching associated availEventIDs */
673  for( ii = 0; ii < ( int )numEventIDsRead; ii++ ) {
674  for( jj = 0; jj < gctrl->activeEventCount; jj++ ) {
675  int eventIndex = gctrl->activeEventIndex[jj];
676  if ( gctrl->activeEventContextIdx[jj]==cuContextIdx && gctxt->availEventIDArray[eventIndex]==readEventIDArray[ii] ) {
677  gctrl->activeEventValues[jj] += ( long long )readEventValueBuffer[ii];
678  SUBDBG( "Matched read-eventID %d:%d value %ld activeEvent %d value %lld \n", jj, (int)readEventIDArray[ii], readEventValueBuffer[ii], eventIndex, gctrl->activeEventValues[jj] );
679  break;
680  }
681  }
682  }
683  }
684  CUresult cuErr = ( *cuCtxPopCurrentPtr ) ( &tmpCtx );
685  if ( cuErr != CUDA_SUCCESS ) PAPIERROR ( "Error popping context %d\n", cuErr );
686  CHECK_CU_ERROR( cuErr, "cuCtxPopCurrent" );
687  }
688  //SUBDBG( "Restore original context\n" );
689  CHECK_CU_ERROR( ( *cuCtxPushCurrentPtr ) ( saveCtx ), "cuCtxPushCurrent" );
690  *events = gctrl->activeEventValues;
691  return ( PAPI_OK );
692 }
693 
695 int papicuda_shutdown_thread( hwd_context_t * ctx )
696 {
697  SUBDBG( "Entering\n" );
698  ( void ) ctx;
699 
700  return ( PAPI_OK );
701 }
702 
704 static int papicuda_shutdown_component( void )
705 {
706  SUBDBG( "Entering\n" );
707  papicuda_context_t *gctxt = global_papicuda_context;
708  papicuda_control_t *gctrl = global_papicuda_control;
709  int deviceNum, cuContextIdx;
710  /* Free context */
711  if ( gctxt ) {
712  for( deviceNum = 0; deviceNum < gctxt->deviceCount; deviceNum++ ) {
713  papicuda_device_desc_t *mydevice = &gctxt->deviceArray[deviceNum];
714  papi_free( mydevice->domainIDArray );
715  papi_free( mydevice->domainIDNumEvents );
716  }
717  papi_free( gctxt->availEventIDArray );
718  papi_free( gctxt->availEventDeviceNum );
719  papi_free( gctxt->availEventDesc );
720  papi_free( gctxt->deviceArray );
721  papi_free( gctxt );
722  global_papicuda_context = gctxt = NULL;
723  }
724  /* Free control */
725  if ( gctrl ) {
726  for ( cuContextIdx=0; cuContextIdx<gctrl->countOfActiveCUContexts; cuContextIdx++ )
727  if ( gctrl->arrayOfActiveCUContexts[cuContextIdx]!=NULL )
728  papi_free( gctrl->arrayOfActiveCUContexts[cuContextIdx] );
729  papi_free( gctrl );
730  global_papicuda_control = gctrl = NULL;
731  }
732  // close the dynamic libraries needed by this component (opened during component initialization)
733  dlclose( dl1 );
734  dlclose( dl2 );
735  dlclose( dl3 );
736  return ( PAPI_OK );
737 }
738 
739 
745 static int papicuda_ctrl( hwd_context_t * ctx, int code, _papi_int_option_t * option )
746 {
747  SUBDBG( "Entering\n" );
748  ( void ) ctx;
749  ( void ) code;
750  ( void ) option;
751  return ( PAPI_OK );
752 }
753 
754 
755 /*
756  * This function has to set the bits needed to count different domains.
757  * In particular: PAPI_DOM_USER, PAPI_DOM_KERNEL, PAPI_DOM_OTHER.
758  * Return PAPI_EINVAL if none of those are specified,
759  * and PAPI_OK on success.
760  * PAPI_DOM_USER: only the user context is counted
761  * PAPI_DOM_KERNEL: only the kernel/OS context is counted
762  * PAPI_DOM_OTHER: exception/transient mode (like user TLB misses)
763  * PAPI_DOM_ALL: all of the domains
764  */
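/* For reference, a minimal sketch of how a PAPI user would request counting
 * domains before starting an event set (PAPI_set_domain is the standard PAPI
 * API call; the handle_error() helper is hypothetical):
 *
 *   int retval = PAPI_set_domain( PAPI_DOM_USER | PAPI_DOM_KERNEL );
 *   if ( retval != PAPI_OK )
 *       handle_error( retval );
 */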
765 static int papicuda_set_domain( hwd_control_state_t * ctrl, int domain )
766 {
767  SUBDBG( "Entering\n" );
768  ( void ) ctrl;
769  if ( ( PAPI_DOM_USER & domain ) ||
770  ( PAPI_DOM_KERNEL & domain ) ||
771  ( PAPI_DOM_OTHER & domain ) )
772  return ( PAPI_OK );
773  else
774  return ( PAPI_EINVAL );
776 }
777 
778 
783 static int papicuda_reset( hwd_context_t * ctx, hwd_control_state_t * ctrl )
784 {
785  SUBDBG( "Entering\n" );
786  ( void ) ctx;
787  ( void ) ctrl;
788  papicuda_control_t *gctrl = global_papicuda_control;
789  papicuda_active_cucontext_t *currctrl;
790  int cuContextIdx, gg, ii;
791  CUptiResult cuptiErr;
792  CUcontext saveCtx, tmpCtx;
793 
794  //SUBDBG( "Reset all active event values\n" );
795  for ( ii=0; ii<gctrl->activeEventCount; ii++ )
796  gctrl->activeEventValues[ii] = 0;
797  // SUBDBG( "Save initial CUDA context and restore later\n" );
798  CHECK_CU_ERROR( ( *cuCtxPopCurrentPtr ) ( &saveCtx ), "cuCtxPopCurrent" );
799  // SUBDBG( "Switch to each context and reset CUDA eventgroups\n" );
800  for ( cuContextIdx=0; cuContextIdx<gctrl->countOfActiveCUContexts; cuContextIdx++ ) {
801  currctrl = gctrl->arrayOfActiveCUContexts[cuContextIdx];
802  //SUBDBG( "Try to switch to context %d associated with device %d\n", cuContextIdx, currctrl->deviceNum );
803  CHECK_CU_ERROR( ( *cuCtxPushCurrentPtr ) ( currctrl->context ), "cuCtxPushCurrent" );
804  for ( gg=0; gg<currctrl->numEventGroups; gg++ ) {
805  // SUBDBG( "Reset events in eventgroup\n" );
806  cuptiErr = ( *cuptiEventGroupResetAllEventsPtr )( currctrl->eventGroup[gg] );
807  CHECK_PRINT_EVAL( ( cuptiErr!=CUPTI_SUCCESS ), "cuptiEventGroupResetAllEvents: Could not reset the event groups", return( PAPI_EMISC ) );
808  SUBDBG( "For papicuda context %d on device %d event group %d was enabled and reset\n", cuContextIdx, currctrl->deviceNum, gg );
809  }
810  CHECK_CU_ERROR( ( *cuCtxPopCurrentPtr ) ( &tmpCtx ), "cuCtxPopCurrent" );
811  }
812  // SUBDBG( "Restore original context\n" );
813  CHECK_CU_ERROR( ( *cuCtxPushCurrentPtr ) ( saveCtx ), "cuCtxPushCurrent" );
814  return ( PAPI_OK );
815 }
816 
817 
818 /*
819  * Disable and destroy the CUDA eventGroup
820 */
821 static int papicuda_cleanup_eventset( hwd_control_state_t * ctrl )
822 {
823  SUBDBG( "Entering\n" );
824  ( void ) ctrl;
825  papicuda_control_t *gctrl = global_papicuda_control;
826  papicuda_active_cucontext_t *currctrl;
827  int cuContextIdx, gg;
828  CUptiResult cuptiErr;
829  CUcontext saveCtx, tmpCtx;
830 
831  SUBDBG( "Switch to each context and disable CUDA eventgroups\n" );
832  /* Save current cuda context and restore later */
833  CHECK_CU_ERROR( ( *cuCtxPopCurrentPtr ) ( &saveCtx ), "cuCtxPopCurrent" );
834  /* Switch to each context and destroy its CUDA eventgroups */
835  for ( cuContextIdx=0; cuContextIdx<gctrl->countOfActiveCUContexts; cuContextIdx++ ) {
836  currctrl = gctrl->arrayOfActiveCUContexts[cuContextIdx];
837  /* Switch to this device / cuda context */
838  CHECK_CU_ERROR( ( *cuCtxPushCurrentPtr ) ( currctrl->context ), "cuCtxPushCurrent" );
839  for ( gg=0; gg<currctrl->numEventGroups; gg++ ) {
840  /* Destroy the eventGroups; it also frees the perfmon hardware on the GPU */
841  cuptiErr = ( *cuptiEventGroupDestroyPtr )( currctrl->eventGroup[gg] );
842  CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDestroy" );
843  }
844  currctrl->numEventGroups = 0;
845  CHECK_CU_ERROR( ( *cuCtxPopCurrentPtr ) ( &tmpCtx ), "cuCtxPopCurrent" );
846  }
847  CHECK_CU_ERROR( ( *cuCtxPushCurrentPtr ) ( saveCtx ), "cuCtxPushCurrent" );
848  /* Record that there are no active contexts or events */
849  gctrl->activeEventCount = 0;
850  return ( PAPI_OK );
851 }
852 
853 
858 static int papicuda_ntv_enum_events( unsigned int *EventCode, int modifier )
859 {
860  //SUBDBG( "Entering\n" );
861  switch( modifier ) {
862  case PAPI_ENUM_FIRST:
863  *EventCode = 0;
864  return ( PAPI_OK );
865  break;
866  case PAPI_ENUM_EVENTS:
867  if( *EventCode < global_papicuda_context->availEventSize - 1 ) {
868  *EventCode = *EventCode + 1;
869  return ( PAPI_OK );
870  } else
871  return ( PAPI_ENOEVNT );
872  break;
873  default:
874  return ( PAPI_EINVAL );
875  }
876  return ( PAPI_OK );
877 }
878 
879 
885 static int papicuda_ntv_code_to_name( unsigned int EventCode, char *name, int len )
886 {
887  //SUBDBG( "Entering EventCode %d\n", EventCode );
888  unsigned int index = EventCode;
889  papicuda_context_t *gctxt = global_papicuda_context;
890  if ( index < gctxt->availEventSize ) {
891  strncpy( name, gctxt->availEventDesc[index].name, len );
892  } else {
893  return ( PAPI_EINVAL );
894  }
895  //SUBDBG( "EventCode %d: Exit %s\n", EventCode, name );
896  return ( PAPI_OK );
897 }
898 
899 
905 static int papicuda_ntv_code_to_descr( unsigned int EventCode, char *name, int len )
906 {
907  //SUBDBG( "Entering\n" );
908  unsigned int index = EventCode;
909  papicuda_context_t *gctxt = global_papicuda_context;
910  if ( index < gctxt->availEventSize ) {
911  strncpy( name, gctxt->availEventDesc[index].description, len );
912  } else {
913  return ( PAPI_EINVAL );
914  }
915  return ( PAPI_OK );
916 }
917 
918 
920 papi_vector_t _cuda_vector = {
921  .cmp_info = {
922  /* default component information (unspecified values are initialized to 0) */
923  .name = "cuda",
924  .short_name = "cuda",
925  .version = "5.1",
926  .description = "The CUDA component uses CuPTI for NVIDIA GPU hardware events",
927  .num_mpx_cntrs = PAPICUDA_MAX_COUNTERS,
928  .num_cntrs = PAPICUDA_MAX_COUNTERS,
929  .default_domain = PAPI_DOM_USER,
930  .default_granularity = PAPI_GRN_THR,
931  .available_granularities = PAPI_GRN_THR,
932  .hardware_intr_sig = PAPI_INT_SIGNAL,
933  /* component specific cmp_info initializations */
934  .fast_real_timer = 0,
935  .fast_virtual_timer = 0,
936  .attach = 0,
937  .attach_must_ptrace = 0,
938  .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL,
939  },
940  /* sizes of framework-opaque component-private structures... these are all unused in this component */
941  .size = {
942  .context = 1, /* sizeof( papicuda_context_t ), */
943  .control_state = 1, /*sizeof( papicuda_control_t ), */
944  .reg_value = 1, /*sizeof( papicuda_register_t ), */
945  .reg_alloc = 1, /*sizeof( papicuda_reg_alloc_t ), */
946  },
947  /* function pointers in this component */
948  .init_thread = papicuda_init_thread, /* ( hwd_context_t * ctx ) */
949  .init_component = papicuda_init_component, /* ( int cidx ) */
950  .init_control_state = papicuda_init_control_state, /* ( hwd_control_state_t * ctrl ) */
951  .start = papicuda_start, /* ( hwd_context_t * ctx, hwd_control_state_t * ctrl ) */
952  .stop = papicuda_stop, /* ( hwd_context_t * ctx, hwd_control_state_t * ctrl ) */
953  .read = papicuda_read, /* ( hwd_context_t * ctx, hwd_control_state_t * ctrl, long_long ** events, int flags ) */
954  .shutdown_component = papicuda_shutdown_component, /* ( void ) */
955  .shutdown_thread = papicuda_shutdown_thread, /* ( hwd_context_t * ctx ) */
956  .cleanup_eventset = papicuda_cleanup_eventset, /* ( hwd_control_state_t * ctrl ) */
957  .ctl = papicuda_ctrl, /* ( hwd_context_t * ctx, int code, _papi_int_option_t * option ) */
958  .update_control_state = papicuda_update_control_state, /* ( hwd_control_state_t * ptr, NativeInfo_t * native, int count, hwd_context_t * ctx ) */
959  .set_domain = papicuda_set_domain, /* ( hwd_control_state_t * cntrl, int domain ) */
960  .reset = papicuda_reset, /* ( hwd_context_t * ctx, hwd_control_state_t * ctrl ) */
961  .ntv_enum_events = papicuda_ntv_enum_events, /* ( unsigned int *EventCode, int modifier ) */
962  .ntv_code_to_name = papicuda_ntv_code_to_name, /* ( unsigned int EventCode, char *name, int len ) */
963  .ntv_code_to_descr = papicuda_ntv_code_to_descr, /* ( unsigned int EventCode, char *name, int len ) */
964  //.ntv_code_to_bits = papicuda_ntv_code_to_bits, /* ( unsigned int EventCode, hwd_register_t * bits ) */
965 
966 };
967 