PAPI  5.6.0.0
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
linux-cuda.c
Go to the documentation of this file.
1 
17 #include <dlfcn.h>
18 #include <cupti.h>
19 #include <cuda_runtime_api.h>
20 
21 #include "papi.h"
22 #include "papi_memory.h"
23 #include "papi_internal.h"
24 #include "papi_vector.h"
25 
26 /* this number assumes that there will never be more events than indicated */
27 #define PAPICUDA_MAX_COUNTERS 512
28 
29 // #define PAPICUDA_KERNEL_REPLAY_MODE
30 
31 /* Contains device list, pointer to device description, and the list of available events */
32 typedef struct papicuda_context {
34  struct papicuda_device_desc *deviceArray;
35  uint32_t availEventSize;
36  CUpti_ActivityKind *availEventKind;
38  uint32_t *availEventIDArray;
40  struct papicuda_name_desc *availEventDesc;
42 
43 /* Store the name and description for an event */
44 typedef struct papicuda_name_desc {
46  char description[PAPI_2MAX_STR_LEN];
48 
49 /* For a device, store device description */
50 typedef struct papicuda_device_desc {
51  CUdevice cuDev;
52  int deviceNum;
53  char deviceName[PAPI_MIN_STR_LEN];
54  uint32_t maxDomains; /* number of domains per device */
55  CUpti_EventDomainID *domainIDArray; /* Array[maxDomains] of domain IDs */
56  uint32_t *domainIDNumEvents; /* Array[maxDomains] of num of events in that domain */
58 
59 /* Control structure tracks array of active contexts, records active events and their values */
60 typedef struct papicuda_control {
62  struct papicuda_active_cucontext_s *arrayOfActiveCUContexts[PAPICUDA_MAX_COUNTERS];
63  uint32_t activeEventCount;
64  int activeEventIndex[PAPICUDA_MAX_COUNTERS];
65  long long activeEventValues[PAPICUDA_MAX_COUNTERS];
69 
70 /* For each active context, which CUDA events are being measured, context eventgroups containing events */
71 typedef struct papicuda_active_cucontext_s {
72  CUcontext cuCtx;
73  int deviceNum;
74  uint32_t conMetricsCount;
75  CUpti_EventID conMetrics[PAPICUDA_MAX_COUNTERS];
76  CUpti_MetricValue conMetricValues[PAPICUDA_MAX_COUNTERS];
77  uint32_t conEventsCount;
78  CUpti_EventID conEvents[PAPICUDA_MAX_COUNTERS];
79  uint64_t conEventValues[PAPICUDA_MAX_COUNTERS];
80  CUpti_EventGroupSets *eventGroupPasses;
82 
83 // file handles used to access cuda libraries with dlopen
84 static void *dl1 = NULL;
85 static void *dl2 = NULL;
86 static void *dl3 = NULL;
87 
88 /* The PAPI side (external) variable as a global */
90 
91 /* Global variable for hardware description, event and metric lists */
93 
94 /* This global variable points to the head of the control state list */
96 
97 /* Macros for error checking... each arg is only referenced/evaluated once */
/*
 * CHECK_PRINT_EVAL(checkcond, str, evalthis)
 *   checkcond : expression evaluated exactly once; a non-zero result is
 *               treated as the error case.
 *   str       : text appended to the debug message.
 *   evalthis  : statement executed after the message is logged (callers in
 *               this file pass a `return (...)`).
 * The local `_cond` stays in scope while `evalthis` runs.
 */
#define CHECK_PRINT_EVAL( checkcond, str, evalthis )                        \
    do {                                                                    \
        int _cond = (checkcond);                                            \
        if (!_cond)                                                         \
            break;                                                          \
        SUBDBG("error: condition %s failed: %s.\n", #checkcond, str);       \
        evalthis;                                                           \
    } while (0)
106 
/*
 * CUDA_CALL(call, handleerror)
 *   call        : expression yielding a cudaError_t, evaluated exactly once.
 *   handleerror : statement executed when the result is not cudaSuccess,
 *                 after the failure has been logged through SUBDBG.
 * The local `_status` stays in scope while `handleerror` runs.
 */
#define CUDA_CALL( call, handleerror )                                          \
    do {                                                                        \
        cudaError_t _status = (call);                                           \
        if (_status == cudaSuccess)                                             \
            break;                                                              \
        SUBDBG("error: function %s failed with error %d.\n", #call, _status);   \
        handleerror;                                                            \
    } while (0)
115 
/*
 * CU_CALL(call, handleerror)
 *   call        : expression yielding a CUresult, evaluated exactly once.
 *   handleerror : statement executed when the result is not CUDA_SUCCESS,
 *                 after the failure has been logged through SUBDBG.
 * The local `_status` stays in scope while `handleerror` runs.
 */
#define CU_CALL( call, handleerror )                                            \
    do {                                                                        \
        CUresult _status = (call);                                              \
        if (_status == CUDA_SUCCESS)                                            \
            break;                                                              \
        SUBDBG("error: function %s failed with error %d.\n", #call, _status);   \
        handleerror;                                                            \
    } while (0)
124 
125 
/*
 * CUPTI_CALL(call, handleerror)
 *   call        : expression yielding a CUptiResult, evaluated exactly once.
 *   handleerror : statement executed when the result is not CUPTI_SUCCESS,
 *                 after the failure has been logged through SUBDBG.
 *
 * The error text is looked up through cuptiGetResultStringPtr. If that lookup
 * itself fails, or leaves the string pointer unset, a fixed fallback text is
 * logged instead of handing an indeterminate pointer to the "%s" conversion.
 * The local `_status` stays in scope while `handleerror` runs.
 */
#define CUPTI_CALL(call, handleerror)                                           \
    do {                                                                        \
        CUptiResult _status = (call);                                           \
        if (_status != CUPTI_SUCCESS) {                                         \
            const char *errstr = NULL;                                          \
            if ((*cuptiGetResultStringPtr)(_status, &errstr) != CUPTI_SUCCESS   \
                || errstr == NULL) {                                            \
                errstr = "unknown CUPTI error";                                 \
            }                                                                   \
            SUBDBG("error: function %s failed with error %s.\n", #call, errstr); \
            handleerror;                                                        \
        }                                                                       \
    } while (0)
136 
/* BUF_SIZE is 32 * 1024; ALIGN_SIZE is 8. Neither is used in the visible part of this file. */
137 #define BUF_SIZE (32 * 1024)
138 #define ALIGN_SIZE (8)
/*
 * ALIGN_BUFFER(buffer, align): yields `buffer` unchanged when its address has
 * none of the low bits selected by (align)-1 set, otherwise advances it to
 * clear those bits.
 *  - The (align)-1 mask only describes an alignment boundary when `align` is a
 *    power of two.
 *  - The adjustment is added with ordinary pointer arithmetic on `buffer`, so
 *    it is scaled by the pointed-to type; the result is only byte-accurate for
 *    byte-sized element types.
 *  - Both arguments are expanded more than once; do not pass expressions with
 *    side effects.
 */
139 #define ALIGN_BUFFER(buffer, align) \
140  (((uintptr_t) (buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t) (buffer) & ((align)-1))) : (buffer))
141 
142 /* Function prototypes */
144 
145 /* ****** CHANGE PROTOTYPES TO DECLARE CUDA LIBRARY SYMBOLS AS WEAK **********
146  * This is done so that a version of PAPI built with the cuda component can *
147  * be installed on a system which does not have the cuda libraries installed. *
148  * *
149  * If this is done without these prototypes, then all papi services on the *
150  * system without the cuda libraries installed will fail. The PAPI libraries *
151  * contain references to the cuda libraries which are not installed. The *
152  * load of PAPI commands fails because the cuda library references can not be *
153  * resolved. *
154  * *
155  * This also defines pointers to the cuda library functions that we call. *
156  * These function pointers will be resolved with dlopen/dlsym calls at *
157  * component initialization time. The component then calls the cuda library *
158  * functions through these function pointers. *
159  *******************************************************************************/
161 
162 #define CUAPIWEAK __attribute__( ( weak ) )
163 #define DECLARECUFUNC(funcname, funcsig) CUresult CUAPIWEAK funcname funcsig; CUresult( *funcname##Ptr ) funcsig;
164 DECLARECUFUNC(cuCtxGetCurrent, (CUcontext *));
165 DECLARECUFUNC(cuCtxSetCurrent, (CUcontext));
166 DECLARECUFUNC(cuDeviceGet, (CUdevice *, int));
167 DECLARECUFUNC(cuDeviceGetCount, (int *));
168 DECLARECUFUNC(cuDeviceGetName, (char *, int, CUdevice));
169 DECLARECUFUNC(cuInit, (unsigned int));
170 DECLARECUFUNC(cuCtxPopCurrent, (CUcontext * pctx));
171 DECLARECUFUNC(cuCtxPushCurrent, (CUcontext pctx));
172 DECLARECUFUNC(cuCtxSynchronize, ());
173 
174 #define CUDAAPIWEAK __attribute__( ( weak ) )
175 #define DECLARECUDAFUNC(funcname, funcsig) cudaError_t CUDAAPIWEAK funcname funcsig; cudaError_t( *funcname##Ptr ) funcsig;
176 DECLARECUDAFUNC(cudaGetDevice, (int *));
178 DECLARECUDAFUNC(cudaFree, (void *));
179 
180 #define CUPTIAPIWEAK __attribute__( ( weak ) )
181 #define DECLARECUPTIFUNC(funcname, funcsig) CUptiResult CUPTIAPIWEAK funcname funcsig; CUptiResult( *funcname##Ptr ) funcsig;
182 /* CUptiResult CUPTIAPIWEAK cuptiDeviceEnumEventDomains( CUdevice, size_t *, CUpti_EventDomainID * ); */
183 /* CUptiResult( *cuptiDeviceEnumEventDomainsPtr )( CUdevice, size_t *, CUpti_EventDomainID * ); */
184 DECLARECUPTIFUNC(cuptiDeviceEnumMetrics, (CUdevice device, size_t * arraySizeBytes, CUpti_MetricID * metricArray));
185 DECLARECUPTIFUNC(cuptiDeviceGetEventDomainAttribute, (CUdevice device, CUpti_EventDomainID eventDomain, CUpti_EventDomainAttribute attrib, size_t * valueSize, void *value));
186 DECLARECUPTIFUNC(cuptiDeviceGetNumMetrics, (CUdevice device, uint32_t * numMetrics));
187 DECLARECUPTIFUNC(cuptiEventGroupGetAttribute, (CUpti_EventGroup eventGroup, CUpti_EventGroupAttribute attrib, size_t * valueSize, void *value));
188 DECLARECUPTIFUNC(cuptiEventGroupReadEvent, (CUpti_EventGroup eventGroup, CUpti_ReadEventFlags flags, CUpti_EventID event, size_t * eventValueBufferSizeBytes, uint64_t * eventValueBuffer));
189 DECLARECUPTIFUNC(cuptiEventGroupSetAttribute, (CUpti_EventGroup eventGroup, CUpti_EventGroupAttribute attrib, size_t valueSize, void *value));
190 DECLARECUPTIFUNC(cuptiEventGroupSetDisable, (CUpti_EventGroupSet * eventGroupSet));
191 DECLARECUPTIFUNC(cuptiEventGroupSetEnable, (CUpti_EventGroupSet * eventGroupSet));
192 DECLARECUPTIFUNC(cuptiEventGroupSetsCreate, (CUcontext context, size_t eventIdArraySizeBytes, CUpti_EventID * eventIdArray, CUpti_EventGroupSets ** eventGroupPasses));
193 DECLARECUPTIFUNC(cuptiEventGroupSetsDestroy, (CUpti_EventGroupSets * eventGroupSets));
194 DECLARECUPTIFUNC(cuptiGetTimestamp, (uint64_t * timestamp));
195 DECLARECUPTIFUNC(cuptiMetricEnumEvents, (CUpti_MetricID metric, size_t * eventIdArraySizeBytes, CUpti_EventID * eventIdArray));
196 DECLARECUPTIFUNC(cuptiMetricGetAttribute, (CUpti_MetricID metric, CUpti_MetricAttribute attrib, size_t * valueSize, void *value));
197 DECLARECUPTIFUNC(cuptiMetricGetNumEvents, (CUpti_MetricID metric, uint32_t * numEvents));
198 DECLARECUPTIFUNC(cuptiMetricGetValue, (CUdevice device, CUpti_MetricID metric, size_t eventIdArraySizeBytes, CUpti_EventID * eventIdArray, size_t eventValueArraySizeBytes, uint64_t * eventValueArray, uint64_t timeDuration, CUpti_MetricValue * metricValue));
199 DECLARECUPTIFUNC(cuptiSetEventCollectionMode, (CUcontext context, CUpti_EventCollectionMode mode));
200 DECLARECUPTIFUNC(cuptiDeviceEnumEventDomains, (CUdevice, size_t *, CUpti_EventDomainID *));
201 DECLARECUPTIFUNC(cuptiDeviceGetNumEventDomains, (CUdevice, uint32_t *));
202 DECLARECUPTIFUNC(cuptiEventDomainEnumEvents, (CUpti_EventDomainID, size_t *, CUpti_EventID *));
203 DECLARECUPTIFUNC(cuptiEventDomainGetAttribute, (CUpti_EventDomainID eventDomain, CUpti_EventDomainAttribute attrib, size_t * valueSize, void *value));
204 DECLARECUPTIFUNC(cuptiEventDomainGetNumEvents, (CUpti_EventDomainID, uint32_t *));
205 DECLARECUPTIFUNC(cuptiEventGetAttribute, (CUpti_EventID, CUpti_EventAttribute, size_t *, void *));
206 DECLARECUPTIFUNC(cuptiEventGroupAddEvent, (CUpti_EventGroup, CUpti_EventID));
207 DECLARECUPTIFUNC(cuptiEventGroupCreate, (CUcontext, CUpti_EventGroup *, uint32_t));
208 DECLARECUPTIFUNC(cuptiEventGroupDestroy, (CUpti_EventGroup));
209 DECLARECUPTIFUNC(cuptiEventGroupDisable, (CUpti_EventGroup));
210 DECLARECUPTIFUNC(cuptiEventGroupEnable, (CUpti_EventGroup));
211 DECLARECUPTIFUNC(cuptiEventGroupReadAllEvents, (CUpti_EventGroup, CUpti_ReadEventFlags, size_t *, uint64_t *, size_t *, CUpti_EventID *, size_t *));
212 DECLARECUPTIFUNC(cuptiEventGroupResetAllEvents, (CUpti_EventGroup));
213 DECLARECUPTIFUNC(cuptiGetResultString, (CUptiResult result, const char **str));
214 DECLARECUPTIFUNC(cuptiEnableKernelReplayMode, ( CUcontext context ));
215 DECLARECUPTIFUNC(cuptiDisableKernelReplayMode, ( CUcontext context ));
216 
217 
218 /*****************************************************************************
219  ******** BEGIN FUNCTIONS USED INTERNALLY SPECIFIC TO THIS COMPONENT *********
220  *****************************************************************************/
221 
222 /*
223  * Link the necessary CUDA libraries to use the cuda component. If any of them can not be found, then
224  * the CUDA component will just be disabled. This is done at runtime so that a version of PAPI built
225  * with the CUDA component can be installed and used on systems which have the CUDA libraries installed
226  * and on systems where these libraries are not installed.
227  */
228 static int papicuda_linkCudaLibraries()
229 {
230 #define DLSYM_AND_CHECK( dllib, name ) dlsym( dllib, name ); if ( dlerror()!=NULL ) { strncpy( _cuda_vector.cmp_info.disabled_reason, "A CUDA required function was not found in dynamic libs", PAPI_MAX_STR_LEN ); return ( PAPI_ENOSUPP ); }
231 
232  /* Attempt to guess if we were statically linked to libc, if so bail */
233  if(_dl_non_dynamic_init != NULL) {
234  strncpy(_cuda_vector.cmp_info.disabled_reason, "The CUDA component does not support statically linking to libc.", PAPI_MAX_STR_LEN);
235  return PAPI_ENOSUPP;
236  }
237  /* Need to link in the cuda libraries, if not found disable the component */
238  dl1 = dlopen("libcuda.so", RTLD_NOW | RTLD_GLOBAL);
239  CHECK_PRINT_EVAL(!dl1, "CUDA library libcuda.so not found.", return (PAPI_ENOSUPP));
240  cuCtxGetCurrentPtr = DLSYM_AND_CHECK(dl1, "cuCtxGetCurrent");
241  cuCtxSetCurrentPtr = DLSYM_AND_CHECK(dl1, "cuCtxSetCurrent");
242  cuDeviceGetPtr = DLSYM_AND_CHECK(dl1, "cuDeviceGet");
243  cuDeviceGetCountPtr = DLSYM_AND_CHECK(dl1, "cuDeviceGetCount");
244  cuDeviceGetNamePtr = DLSYM_AND_CHECK(dl1, "cuDeviceGetName");
245  cuInitPtr = DLSYM_AND_CHECK(dl1, "cuInit");
246  cuCtxPopCurrentPtr = DLSYM_AND_CHECK(dl1, "cuCtxPopCurrent");
247  cuCtxPushCurrentPtr = DLSYM_AND_CHECK(dl1, "cuCtxPushCurrent");
248  cuCtxSynchronizePtr = DLSYM_AND_CHECK(dl1, "cuCtxSynchronize");
249 
250  dl2 = dlopen("libcudart.so", RTLD_NOW | RTLD_GLOBAL | RTLD_NODELETE);
251  CHECK_PRINT_EVAL(!dl2, "CUDA runtime library libcudart.so not found.", return (PAPI_ENOSUPP));
252  cudaGetDevicePtr = DLSYM_AND_CHECK(dl2, "cudaGetDevice");
253  cudaSetDevicePtr = DLSYM_AND_CHECK(dl2, "cudaSetDevice");
254  cudaFreePtr = DLSYM_AND_CHECK(dl2, "cudaFree");
255 
256  dl3 = dlopen("libcupti.so", RTLD_NOW | RTLD_GLOBAL);
257  CHECK_PRINT_EVAL(!dl3, "CUDA runtime library libcudart.so not found.", return (PAPI_ENOSUPP));
258  /* The macro DLSYM_AND_CHECK results in the expansion example below */
259  /* cuptiDeviceEnumEventDomainsPtr = dlsym( dl3, "cuptiDeviceEnumEventDomains" ); */
260  /* if ( dlerror()!=NULL ) { strncpy( _cuda_vector.cmp_info.disabled_reason, "A CUDA required function was not found in dynamic libs", PAPI_MAX_STR_LEN ); return ( PAPI_ENOSUPP ); } */
261  cuptiDeviceEnumMetricsPtr = DLSYM_AND_CHECK(dl3, "cuptiDeviceEnumMetrics");
262  cuptiDeviceGetEventDomainAttributePtr = DLSYM_AND_CHECK(dl3, "cuptiDeviceGetEventDomainAttribute");
263  cuptiDeviceGetNumMetricsPtr = DLSYM_AND_CHECK(dl3, "cuptiDeviceGetNumMetrics");
264  cuptiEventGroupGetAttributePtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupGetAttribute");
265  cuptiEventGroupReadEventPtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupReadEvent");
266  cuptiEventGroupSetAttributePtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupSetAttribute");
267  cuptiEventGroupSetDisablePtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupSetDisable");
268  cuptiEventGroupSetEnablePtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupSetEnable");
269  cuptiEventGroupSetsCreatePtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupSetsCreate");
270  cuptiEventGroupSetsDestroyPtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupSetsDestroy");
271  cuptiGetTimestampPtr = DLSYM_AND_CHECK(dl3, "cuptiGetTimestamp");
272  cuptiMetricEnumEventsPtr = DLSYM_AND_CHECK(dl3, "cuptiMetricEnumEvents");
273  cuptiMetricGetAttributePtr = DLSYM_AND_CHECK(dl3, "cuptiMetricGetAttribute");
274  cuptiMetricGetNumEventsPtr = DLSYM_AND_CHECK(dl3, "cuptiMetricGetNumEvents");
275  cuptiMetricGetValuePtr = DLSYM_AND_CHECK(dl3, "cuptiMetricGetValue");
276  cuptiSetEventCollectionModePtr = DLSYM_AND_CHECK(dl3, "cuptiSetEventCollectionMode");
277  cuptiDeviceEnumEventDomainsPtr = DLSYM_AND_CHECK(dl3, "cuptiDeviceEnumEventDomains");
278  cuptiDeviceGetNumEventDomainsPtr = DLSYM_AND_CHECK(dl3, "cuptiDeviceGetNumEventDomains");
279  cuptiEventDomainEnumEventsPtr = DLSYM_AND_CHECK(dl3, "cuptiEventDomainEnumEvents");
280  cuptiEventDomainGetAttributePtr = DLSYM_AND_CHECK(dl3, "cuptiEventDomainGetAttribute");
281  cuptiEventDomainGetNumEventsPtr = DLSYM_AND_CHECK(dl3, "cuptiEventDomainGetNumEvents");
282  cuptiEventGetAttributePtr = DLSYM_AND_CHECK(dl3, "cuptiEventGetAttribute");
283  cuptiEventGroupAddEventPtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupAddEvent");
284  cuptiEventGroupCreatePtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupCreate");
285  cuptiEventGroupDestroyPtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupDestroy");
286  cuptiEventGroupDisablePtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupDisable");
287  cuptiEventGroupEnablePtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupEnable");
288  cuptiEventGroupReadAllEventsPtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupReadAllEvents");
289  cuptiEventGroupResetAllEventsPtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupResetAllEvents");
290  cuptiGetResultStringPtr = DLSYM_AND_CHECK(dl3, "cuptiGetResultString");
291  cuptiEnableKernelReplayModePtr = DLSYM_AND_CHECK(dl3, "cuptiEnableKernelReplayMode");
292  cuptiDisableKernelReplayModePtr = DLSYM_AND_CHECK(dl3, "cuptiEnableKernelReplayMode");
293  return (PAPI_OK);
294 }
295 
296 
298 {
299  SUBDBG("Entering\n");
300  CUresult cuErr;
301  int deviceNum;
302  uint32_t domainNum, eventNum;
303  papicuda_device_desc_t *mydevice;
304  char tmpStr[PAPI_MIN_STR_LEN];
305  tmpStr[PAPI_MIN_STR_LEN - 1] = '\0';
306  size_t tmpSizeBytes;
307  int ii;
308  uint32_t maxEventSize;
309 
310  /* How many CUDA devices do we have? */
311  cuErr = (*cuDeviceGetCountPtr) (&gctxt->deviceCount);
312  if(cuErr == CUDA_ERROR_NOT_INITIALIZED) {
313  /* If CUDA is not initialized, initialize it and retry the device count query */
314  /* This is required for some of the PAPI tools, that do not call the init functions */
315  if(((*cuInitPtr) (0)) != CUDA_SUCCESS) {
316  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA cannot be found and initialized (cuInit failed).", PAPI_MAX_STR_LEN);
317  return PAPI_ENOSUPP;
318  }
319  CU_CALL((*cuDeviceGetCountPtr) (&gctxt->deviceCount), return (PAPI_EMISC));
320  }
321 
322  if(gctxt->deviceCount == 0) {
323  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA initialized but no CUDA devices found.", PAPI_MAX_STR_LEN);
324  return PAPI_ENOSUPP;
325  }
326  SUBDBG("Found %d devices\n", gctxt->deviceCount);
327 
328  /* allocate memory for device information */
330  CHECK_PRINT_EVAL(!gctxt->deviceArray, "ERROR CUDA: Could not allocate memory for CUDA device structure", return (PAPI_ENOMEM));
331 
332  /* For each device, get domains and domain-events counts */
333  maxEventSize = 0;
334  for(deviceNum = 0; deviceNum < gctxt->deviceCount; deviceNum++) {
335  mydevice = &gctxt->deviceArray[deviceNum];
336  /* Get device id, name, numeventdomains for each device */
337  CU_CALL((*cuDeviceGetPtr) (&mydevice->cuDev, deviceNum), return (PAPI_EMISC));
338  CU_CALL((*cuDeviceGetNamePtr) (mydevice->deviceName, PAPI_MIN_STR_LEN - 1, mydevice->cuDev), return (PAPI_EMISC));
339  mydevice->deviceName[PAPI_MIN_STR_LEN - 1] = '\0';
340  CUPTI_CALL((*cuptiDeviceGetNumEventDomainsPtr) (mydevice->cuDev, &mydevice->maxDomains), return (PAPI_EMISC));
341  /* Allocate space to hold domain IDs */
342  mydevice->domainIDArray = (CUpti_EventDomainID *) papi_calloc(mydevice->maxDomains, sizeof(CUpti_EventDomainID));
343  CHECK_PRINT_EVAL(!mydevice->domainIDArray, "ERROR CUDA: Could not allocate memory for CUDA device domains", return (PAPI_ENOMEM));
344  /* Put domain ids into allocated space */
345  size_t domainarraysize = mydevice->maxDomains * sizeof(CUpti_EventDomainID);
346  CUPTI_CALL((*cuptiDeviceEnumEventDomainsPtr) (mydevice->cuDev, &domainarraysize, mydevice->domainIDArray), return (PAPI_EMISC));
347  /* Allocate space to hold domain event counts */
348  mydevice->domainIDNumEvents = (uint32_t *) papi_calloc(mydevice->maxDomains, sizeof(uint32_t));
349  CHECK_PRINT_EVAL(!mydevice->domainIDNumEvents, "ERROR CUDA: Could not allocate memory for domain event counts", return (PAPI_ENOMEM));
350  /* For each domain, get event counts in domainNumEvents[] */
351  for(domainNum = 0; domainNum < mydevice->maxDomains; domainNum++) {
352  CUpti_EventDomainID domainID = mydevice->domainIDArray[domainNum];
353  /* Get num events in domain */
354  // SUBDBG( "Device %d:%d calling cuptiEventDomainGetNumEventsPtr with domainID %d \n", deviceNum, mydevice->cuDev, domainID );
355  CUPTI_CALL((*cuptiEventDomainGetNumEventsPtr) (domainID, &mydevice->domainIDNumEvents[domainNum]), return (PAPI_EMISC));
356  /* Keep track of overall number of events */
357  maxEventSize += mydevice->domainIDNumEvents[domainNum];
358  }
359  }
360 
361  /* Create space for metrics */
362  for(deviceNum = 0; deviceNum < gctxt->deviceCount; deviceNum++) {
363  uint32_t maxMetrics;
364  mydevice = &gctxt->deviceArray[deviceNum];
365  // CUPTI_CALL((*cuptiDeviceGetNumMetricsPtr) (mydevice->cuDev, &maxMetrics), return (PAPI_EMISC));
366  if ( (*cuptiDeviceGetNumMetricsPtr) (mydevice->cuDev, &maxMetrics) != CUPTI_SUCCESS )
367  maxMetrics = 0;
368  maxEventSize += maxMetrics;
369  }
370 
371  /* Allocate space for all events and descriptors */
372  gctxt->availEventKind = (CUpti_ActivityKind *) papi_calloc(maxEventSize, sizeof(CUpti_ActivityKind));
373  CHECK_PRINT_EVAL(!gctxt->availEventKind, "ERROR CUDA: Could not allocate memory", return (PAPI_ENOMEM));
374  gctxt->availEventDeviceNum = (int *) papi_calloc(maxEventSize, sizeof(int));
375  CHECK_PRINT_EVAL(!gctxt->availEventDeviceNum, "ERROR CUDA: Could not allocate memory", return (PAPI_ENOMEM));
376  gctxt->availEventIDArray = (CUpti_EventID *) papi_calloc(maxEventSize, sizeof(CUpti_EventID));
377  CHECK_PRINT_EVAL(!gctxt->availEventIDArray, "ERROR CUDA: Could not allocate memory", return (PAPI_ENOMEM));
378  gctxt->availEventIsBeingMeasuredInEventset = (uint32_t *) papi_calloc(maxEventSize, sizeof(uint32_t));
379  CHECK_PRINT_EVAL(!gctxt->availEventIsBeingMeasuredInEventset, "ERROR CUDA: Could not allocate memory", return (PAPI_ENOMEM));
380  gctxt->availEventDesc = (papicuda_name_desc_t *) papi_calloc(maxEventSize, sizeof(papicuda_name_desc_t));
381  CHECK_PRINT_EVAL(!gctxt->availEventDesc, "ERROR CUDA: Could not allocate memory", return (PAPI_ENOMEM));
382 
383  /* Record the events and descriptions */
384  uint32_t idxEventArray = 0;
385  for(deviceNum = 0; deviceNum < gctxt->deviceCount; deviceNum++) {
386  mydevice = &gctxt->deviceArray[deviceNum];
387  // SUBDBG( "For device %d %d maxdomains %d \n", deviceNum, mydevice->cuDev, mydevice->maxDomains );
388  /* Get and store event IDs, names, descriptions into the large arrays allocated */
389  for(domainNum = 0; domainNum < mydevice->maxDomains; domainNum++) {
390  /* Get domain id */
391  CUpti_EventDomainID domainID = mydevice->domainIDArray[domainNum];
392  uint32_t domainNumEvents = mydevice->domainIDNumEvents[domainNum];
393  // SUBDBG( "For device %d domain %d domainID %d numEvents %d\n", mydevice->cuDev, domainNum, domainID, domainNumEvents );
394  /* Allocate temp space for eventIDs for this domain */
395  CUpti_EventID *domainEventIDArray = (CUpti_EventID *) papi_calloc(domainNumEvents, sizeof(CUpti_EventID));
396  CHECK_PRINT_EVAL(!domainEventIDArray, "ERROR CUDA: Could not allocate memory for events", return (PAPI_ENOMEM));
397  /* Load the domain eventIDs in temp space */
398  size_t domainEventArraySize = domainNumEvents * sizeof(CUpti_EventID);
399  CUPTI_CALL((*cuptiEventDomainEnumEventsPtr) (domainID, &domainEventArraySize, domainEventIDArray), return (PAPI_EMISC));
400  /* For each event, get and store name and description */
401  for(eventNum = 0; eventNum < domainNumEvents; eventNum++) {
402  /* Record the event IDs in native event array */
403  CUpti_EventID myeventCuptiEventId = domainEventIDArray[eventNum];
404  gctxt->availEventKind[idxEventArray] = CUPTI_ACTIVITY_KIND_EVENT;
405  gctxt->availEventIDArray[idxEventArray] = myeventCuptiEventId;
406  gctxt->availEventDeviceNum[idxEventArray] = deviceNum;
407  /* Get event name */
408  tmpSizeBytes = PAPI_MIN_STR_LEN - 1 * sizeof(char);
409  CUPTI_CALL((*cuptiEventGetAttributePtr) (myeventCuptiEventId, CUPTI_EVENT_ATTR_NAME, &tmpSizeBytes, tmpStr), return (PAPI_EMISC));
410  /* Save a full path for the event, filling spaces with underscores */
411  // snprintf( gctxt->availEventDesc[idxEventArray].name, PAPI_MIN_STR_LEN, "%s:%d:%s", mydevice->deviceName, deviceNum, tmpStr );
412  snprintf(gctxt->availEventDesc[idxEventArray].name, PAPI_MIN_STR_LEN, "event:%s:device=%d", tmpStr, deviceNum);
413  gctxt->availEventDesc[idxEventArray].name[PAPI_MIN_STR_LEN - 1] = '\0';
414  char *nameTmpPtr = gctxt->availEventDesc[idxEventArray].name;
415  for(ii = 0; ii < (int) strlen(nameTmpPtr); ii++)
416  if(nameTmpPtr[ii] == ' ')
417  nameTmpPtr[ii] = '_';
418  /* Save description in the native event array */
419  tmpSizeBytes = PAPI_2MAX_STR_LEN - 1 * sizeof(char);
420  CUPTI_CALL((*cuptiEventGetAttributePtr) (myeventCuptiEventId, CUPTI_EVENT_ATTR_SHORT_DESCRIPTION, &tmpSizeBytes, gctxt->availEventDesc[idxEventArray].description), return (PAPI_EMISC));
421  gctxt->availEventDesc[idxEventArray].description[PAPI_2MAX_STR_LEN - 1] = '\0';
422  // SUBDBG( "Event ID:%d Name:%s Desc:%s\n", gctxt->availEventIDArray[idxEventArray], gctxt->availEventDesc[idxEventArray].name, gctxt->availEventDesc[idxEventArray].description );
423  /* Increment index past events in this domain to start of next domain */
424  idxEventArray++;
425  }
426  papi_free(domainEventIDArray);
427  }
428  }
429 
430  /* Retrieve and store metric information for each device */
431  SUBDBG("Checking for metrics\n");
432  for(deviceNum = 0; deviceNum < gctxt->deviceCount; deviceNum++) {
433  uint32_t maxMetrics, i;
434  CUpti_MetricID *metricIdList = NULL;
435  mydevice = &gctxt->deviceArray[deviceNum];
436  // CUPTI_CALL((*cuptiDeviceGetNumMetricsPtr) (mydevice->cuDev, &maxMetrics), return (PAPI_EMISC));
437  if ( (*cuptiDeviceGetNumMetricsPtr) (mydevice->cuDev, &maxMetrics) != CUPTI_SUCCESS ) {
438  maxMetrics = 0;
439  continue;
440  }
441  SUBDBG("Device %d: Checking each of the (maxMetrics) %d metrics\n", deviceNum, maxMetrics);
442  size_t size = maxMetrics * sizeof(CUpti_EventID);
443  metricIdList = (CUpti_MetricID *) papi_calloc(maxMetrics, sizeof(CUpti_EventID));
444  CHECK_PRINT_EVAL(metricIdList == NULL, "Out of memory", return (PAPI_ENOMEM));
445  CUPTI_CALL((*cuptiDeviceEnumMetricsPtr) (mydevice->cuDev, &size, metricIdList), return (PAPI_EMISC));
446  for(i = 0; i < maxMetrics; i++) {
447  gctxt->availEventIDArray[idxEventArray] = metricIdList[i];
448  gctxt->availEventKind[idxEventArray] = CUPTI_ACTIVITY_KIND_METRIC;
449  gctxt->availEventDeviceNum[idxEventArray] = deviceNum;
450  size = PAPI_MIN_STR_LEN;
451  CUPTI_CALL((*cuptiMetricGetAttributePtr) (metricIdList[i], CUPTI_METRIC_ATTR_NAME, &size, (uint8_t *) tmpStr), return (PAPI_EMISC));
452  // FIXME SOMEDAY: For this release the nvlink metrics are not functioning so skip them
453  if(strstr(tmpStr, "nvlink")!=NULL) continue;
454  // FIXME SOMEDAY: For this release the nvlink metrics are not functioning so skip them
455  if(size >= PAPI_MIN_STR_LEN)
456  gctxt->availEventDesc[idxEventArray].name[PAPI_MIN_STR_LEN - 1] = '\0';
457  snprintf(gctxt->availEventDesc[idxEventArray].name, PAPI_MIN_STR_LEN, "metric:%s:device=%d", tmpStr, deviceNum);
458  size = PAPI_2MAX_STR_LEN;
459  CUPTI_CALL((*cuptiMetricGetAttributePtr) (metricIdList[i], CUPTI_METRIC_ATTR_LONG_DESCRIPTION, &size, (uint8_t *) gctxt->availEventDesc[idxEventArray].description), return (PAPI_EMISC));
460  if(size >= PAPI_2MAX_STR_LEN)
461  gctxt->availEventDesc[idxEventArray].description[PAPI_2MAX_STR_LEN - 1] = '\0';
462  // SUBDBG( "For device %d availEvent[%d] %s\n", mydevice->cuDev, idxEventArray, gctxt->availEventDesc[idxEventArray].name);
463  idxEventArray++;
464  }
465  papi_free(metricIdList);
466  }
467  gctxt->availEventSize = idxEventArray;
468 
469  /* return 0 if everything went OK */
470  return 0;
471 }
472 
473 
474 /*
475  This routine tries to convert all CUPTI values to long long values.
476  If the CUPTI value is an integer type, it is cast to long long. If
477  the CUPTI value is a percent, it is multiplied by 100 to return the
478  integer percentage. If the CUPTI value is a double, the value
479  is cast to long long... this can be a severe truncation.
480  */
481 static int papicuda_convert_metric_value_to_long_long(CUpti_MetricValue metricValue, CUpti_MetricValueKind valueKind, long long int *papiValue)
482 {
483  union {
484  long long ll;
485  double fp;
486  } tmpValue;
487 
488  SUBDBG("Try to convert the CUPTI metric value kind (index %d) to PAPI value (long long or double)\n", valueKind);
489  switch (valueKind) {
490  case CUPTI_METRIC_VALUE_KIND_DOUBLE:
491  SUBDBG("Metric double %f\n", metricValue.metricValueDouble);
492  tmpValue.ll = (long long)(metricValue.metricValueDouble);
493  //CHECK_PRINT_EVAL(tmpValue.fp - metricValue.metricValueDouble > 1e-6, "Error converting metric\n", return (PAPI_EMISC));
494  break;
495  case CUPTI_METRIC_VALUE_KIND_UINT64:
496  SUBDBG("Metric uint64 = %llu\n", (unsigned long long) metricValue.metricValueUint64);
497  tmpValue.ll = (long long) (metricValue.metricValueUint64);
498  CHECK_PRINT_EVAL(tmpValue.ll - metricValue.metricValueUint64 > 1e-6, "Error converting metric\n", return (PAPI_EMISC));
499  break;
500  case CUPTI_METRIC_VALUE_KIND_INT64:
501  SUBDBG("Metric int64 = %lld\n", (long long) metricValue.metricValueInt64);
502  tmpValue.ll = (long long) (metricValue.metricValueInt64);
503  CHECK_PRINT_EVAL(tmpValue.ll - metricValue.metricValueInt64 > 1e-6, "Error converting metric\n", return (PAPI_EMISC));
504  break;
505  case CUPTI_METRIC_VALUE_KIND_PERCENT:
506  SUBDBG("Metric percent = %f%%\n", metricValue.metricValuePercent);
507  tmpValue.ll = (long long)(metricValue.metricValuePercent*100);
508  //CHECK_PRINT_EVAL(tmpValue.ll - metricValue.metricValuePercent > 1e-6, "Error converting metric\n", return (PAPI_EMISC));
509  break;
510  case CUPTI_METRIC_VALUE_KIND_THROUGHPUT:
511  SUBDBG("Metric throughput %llu bytes/sec\n", (unsigned long long) metricValue.metricValueThroughput);
512  tmpValue.ll = (long long) (metricValue.metricValueThroughput);
513  CHECK_PRINT_EVAL(tmpValue.ll - metricValue.metricValueThroughput > 1e-6, "Error converting metric\n", return (PAPI_EMISC));
514  break;
515  case CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL:
516  SUBDBG("Metric utilization level %u\n", (unsigned int) metricValue.metricValueUtilizationLevel);
517  tmpValue.ll = (long long) (metricValue.metricValueUtilizationLevel);
518  CHECK_PRINT_EVAL(tmpValue.ll - metricValue.metricValueUtilizationLevel > 1e-6, "Error converting metric\n", return (PAPI_EMISC));
519  break;
520  default:
521  CHECK_PRINT_EVAL(1, "ERROR: unsupported metric value kind", return (PAPI_EINVAL));
522  exit(-1);
523  }
524  *papiValue = tmpValue.ll;
525  return (PAPI_OK);
526 }
527 
528 
529 /* ****************************************************************************
530  ******************* BEGIN PAPI's COMPONENT REQUIRED FUNCTIONS *************
531  **************************************************************************** */
532 
533 /*
534  * This is called whenever a thread is initialized.
535  */
537 {
538  (void) ctx;
539  SUBDBG("Entering\n");
540  return PAPI_OK;
541 }
542 
543 
544 /* Initialize hardware counters, setup the function vector table
545  * and get hardware information, this routine is called when the
546  * PAPI process is initialized (IE PAPI_library_init)
547  */
548 /* NOTE: only called by main thread (not by every thread) !!! Starting
549  in CUDA 4.0, multiple CPU threads can access the same CUDA
550  context. This is a much easier programming model than pre-4.0 as
551  threads - using the same context - can share memory, data,
552  etc. It's possible to create a different context for each
553  thread. That's why CUDA context creation is done in
554  CUDA_init_component() (called only by main thread) rather than
555  CUDA_init() or CUDA_init_control_state() (both called by each
556  thread). */
558 {
559  SUBDBG("Entering with cidx: %d\n", cidx);
560  int rv;
561 
562  /* link in all the cuda libraries and resolve the symbols we need to use */
563  if(papicuda_linkCudaLibraries() != PAPI_OK) {
564  SUBDBG("Dynamic link of CUDA libraries failed, component will be disabled.\n");
565  SUBDBG("See disable reason in papi_component_avail output for more details.\n");
566  return (PAPI_ENOSUPP);
567  }
568 
569  /* Create the structure */
570  if(!global_papicuda_context)
571  global_papicuda_context = (papicuda_context_t *) papi_calloc(1, sizeof(papicuda_context_t));
572 
573  /* Get list of all native CUDA events supported */
574  rv = papicuda_add_native_events(global_papicuda_context);
575  if(rv != 0)
576  return (rv);
577 
578  /* Export some information */
579  _cuda_vector.cmp_info.CmpIdx = cidx;
580  _cuda_vector.cmp_info.num_native_events = global_papicuda_context->availEventSize;
581  _cuda_vector.cmp_info.num_cntrs = _cuda_vector.cmp_info.num_native_events;
582  _cuda_vector.cmp_info.num_mpx_cntrs = _cuda_vector.cmp_info.num_native_events;
583 
584  return (PAPI_OK);
585 }
586 
587 
588 /* Setup a counter control state.
589  * In general a control state holds the hardware info for an
590  * EventSet.
591  */
593 {
594  SUBDBG("Entering\n");
595  (void) ctrl;
597 
598  CHECK_PRINT_EVAL(!gctxt, "Error: The PAPI CUDA component needs to be initialized first", return (PAPI_ENOINIT));
599  /* If no events were found during the initial component initialization, return error */
600  if(global_papicuda_context->availEventSize <= 0) {
601  strncpy(_cuda_vector.cmp_info.disabled_reason, "ERROR CUDA: No events exist", PAPI_MAX_STR_LEN);
602  return (PAPI_EMISC);
603  }
604  /* If it does not exist, create the global structure to hold CUDA contexts and active events */
605  if(!global_papicuda_control) {
606  global_papicuda_control = (papicuda_control_t *) papi_calloc(1, sizeof(papicuda_control_t));
607  global_papicuda_control->countOfActiveCUContexts = 0;
608  global_papicuda_control->activeEventCount = 0;
609  }
610  return PAPI_OK;
611 }
612 
613 /* Triggered by eventset operations like add or remove. For CUDA,
614  * needs to be called multiple times from each separate CUDA context
615  * with the events to be measured from that context. For each
616  * context, create eventgroups for the events.
617  */
618 /* Note: NativeInfo_t is defined in papi_internal.h */
/* Walks the nativeCount entries of nativeInfo. For each event not yet marked
 * as measured it: finds (or creates) the active-context record for the
 * event's device, appends the CUPTI event id -- or, for a metric, the ids of
 * all events the metric needs -- to that record, registers the event in the
 * global active-event arrays, stores its slot in nativeInfo[ii].ni_position,
 * and rebuilds the record's eventGroupPasses from the accumulated event list.
 * Returns PAPI_OK, or a PAPI error code from one of the early returns below.
 * NOTE(review): gctrl and gctxt are used throughout but their declarations
 * are not present in this listing (lines 624-625 are absent); presumably they
 * alias global_papicuda_control / global_papicuda_context -- confirm. */
619 static int papicuda_update_control_state(hwd_control_state_t * ctrl, NativeInfo_t * nativeInfo, int nativeCount, hwd_context_t * ctx)
620 {
621  SUBDBG("Entering with nativeCount %d\n", nativeCount);
622  (void) ctx;
623  // (void) ctrl;
626  int currDeviceNum;
627  CUcontext currCuCtx;
628  int eventContextIdx;
629  CUcontext eventCuCtx;
630  int index, ii;
631  uint32_t numEvents, ee, cc;
632 
633  /* Return if no events */
634  if(nativeCount == 0)
635  return (PAPI_OK);
636 
637  /* Get deviceNum, initialize context if needed via free, get context */
638  // CU_CALL( (*cuCtxGetCurrentPtr)(&currCuCtx), return(PAPI_EMISC));
639  CUDA_CALL((*cudaGetDevicePtr) (&currDeviceNum), return (PAPI_EMISC));
640  SUBDBG("currDeviceNum %d \n", currDeviceNum);
641  CUDA_CALL((*cudaFreePtr) (NULL), return (PAPI_EMISC));
642  CU_CALL((*cuCtxGetCurrentPtr) (&currCuCtx), return (PAPI_EMISC));
643  SUBDBG("currDeviceNum %d cuCtx %p \n", currDeviceNum, currCuCtx);
644 
645  /* Handle user request of events to be monitored */
646  for(ii = 0; ii < nativeCount; ii++) {
647  /* Get the PAPI event index from the user */
648  index = nativeInfo[ii].ni_event;
 /* eventName only exists in DEBUG builds; it is referenced solely inside SUBDBG calls below */
649 #ifdef DEBUG
650  char *eventName = gctxt->availEventDesc[index].name;
651 #endif
652  int eventDeviceNum = gctxt->availEventDeviceNum[index];
653 
654  /* if this event is already added continue to next ii, if not, mark it as being added */
655  if(gctxt->availEventIsBeingMeasuredInEventset[index] == 1) {
656  SUBDBG("Skipping event %s which is already added\n", eventName);
657  continue;
658  } else
659  gctxt->availEventIsBeingMeasuredInEventset[index] = 1;
660 
661  /* Find context/control in papicuda, creating it if does not exist */
 /* Contexts are matched by device number; a matching context that is not the current one is pushed */
662  for(cc = 0; cc < gctrl->countOfActiveCUContexts; cc++) {
663  CHECK_PRINT_EVAL(cc >= PAPICUDA_MAX_COUNTERS, "Exceeded hardcoded maximum number of contexts (PAPICUDA_MAX_COUNTERS)", return (PAPI_EMISC));
664  if(gctrl->arrayOfActiveCUContexts[cc]->deviceNum == eventDeviceNum) {
665  eventCuCtx = gctrl->arrayOfActiveCUContexts[cc]->cuCtx;
666  SUBDBG("Event %s device %d already has a cuCtx %p registered\n", eventName, eventDeviceNum, eventCuCtx);
667  if(eventCuCtx != currCuCtx)
668  CU_CALL((*cuCtxPushCurrentPtr) (eventCuCtx), return (PAPI_EMISC));
669  break;
670  }
671  }
672  // Create context if it does not exist
673  if(cc == gctrl->countOfActiveCUContexts) {
674  SUBDBG("Event %s device %d does not have a cuCtx registered yet...\n", eventName, eventDeviceNum);
675  if(currDeviceNum != eventDeviceNum) {
676  CUDA_CALL((*cudaSetDevicePtr) (eventDeviceNum), return (PAPI_EMISC));
677  CUDA_CALL((*cudaFreePtr) (NULL), return (PAPI_EMISC));
678  CU_CALL((*cuCtxGetCurrentPtr) (&eventCuCtx), return (PAPI_EMISC));
679  } else {
680  eventCuCtx = currCuCtx;
681  }
 /* NOTE(review): the statement that fills arrayOfActiveCUContexts[cc] (listing line 682) is not
  * visible here; the check below implies it is an allocation. No visible code verifies that
  * cc < PAPICUDA_MAX_COUNTERS before this slot is written -- confirm. */
683  CHECK_PRINT_EVAL(gctrl->arrayOfActiveCUContexts[cc] == NULL, "Memory allocation for new active context failed", return (PAPI_ENOMEM));
684  gctrl->arrayOfActiveCUContexts[cc]->deviceNum = eventDeviceNum;
685  gctrl->arrayOfActiveCUContexts[cc]->cuCtx = eventCuCtx;
686  gctrl->arrayOfActiveCUContexts[cc]->eventGroupPasses = NULL;
687  gctrl->arrayOfActiveCUContexts[cc]->conMetricsCount = 0;
688  gctrl->arrayOfActiveCUContexts[cc]->conEventsCount = 0;
689  gctrl->countOfActiveCUContexts++;
690  SUBDBG("Added a new context deviceNum %d cuCtx %p ... now countOfActiveCUContexts is %d\n", eventDeviceNum, eventCuCtx, gctrl->countOfActiveCUContexts);
691  }
692  eventContextIdx = cc;
693 
 /* NOTE(review): every error return from here to the end of the loop body leaves a context
  * pushed above still pushed; the matching pop is only reached on the success path. */
694  papicuda_active_cucontext_t *eventctrl = gctrl->arrayOfActiveCUContexts[eventContextIdx];
695  switch (gctxt->availEventKind[index]) {
696  case CUPTI_ACTIVITY_KIND_METRIC:
697  SUBDBG("Need to add metric %d %s \n", index, eventName);
698  /* For the metric, find list of events required */
699  CUpti_MetricID metricId = gctxt->availEventIDArray[index];
700  CUPTI_CALL((*cuptiMetricGetNumEventsPtr) (metricId, &numEvents), return (PAPI_EINVAL));
701  size_t sizeBytes = numEvents * sizeof(CUpti_EventID);
 /* NOTE(review): eventIdArray is never released in the visible code, neither on the
  * error returns below nor after the copy loop. */
702  CUpti_EventID *eventIdArray = papi_malloc(sizeBytes);
703  CHECK_PRINT_EVAL(eventIdArray == NULL, "Malloc failed", return (PAPI_ENOMEM));
704  CUPTI_CALL((*cuptiMetricEnumEventsPtr) (metricId, &sizeBytes, eventIdArray), return (PAPI_EINVAL));
705  SUBDBG("For metric %s, append the list of %d required events\n", eventName, numEvents);
 /* NOTE(review): the capacity check runs after the store and the increment, and the
  * debug print reads conEvents[conEventsCount], i.e. the slot after the one just written. */
706  for(ee = 0; ee < numEvents; ee++) {
707  eventctrl->conEvents[eventctrl->conEventsCount] = eventIdArray[ee];
708  eventctrl->conEventsCount++;
709  SUBDBG("For metric %s, appended event %d - %d %d to this context (conEventsCount %d)\n", eventName, ee, eventIdArray[ee], eventctrl->conEvents[eventctrl->conEventsCount], eventctrl->conEventsCount);
710  if (eventctrl->conEventsCount >= PAPICUDA_MAX_COUNTERS) {
711  SUBDBG("Num events (generated by metric) exceeded PAPICUDA_MAX_COUNTERS\n");
712  return(PAPI_EINVAL);
713  }
714  }
715  eventctrl->conMetrics[eventctrl->conMetricsCount] = metricId;
716  eventctrl->conMetricsCount++;
717  if (eventctrl->conMetricsCount >= PAPICUDA_MAX_COUNTERS) {
718  SUBDBG("Num metrics exceeded PAPICUDA_MAX_COUNTERS\n");
719  return(PAPI_EINVAL);
720  }
721  break;
722 
723  case CUPTI_ACTIVITY_KIND_EVENT:
724  SUBDBG("Need to add event %d %s to the context\n", index, eventName);
725  /* lookup cuptieventid for this event index */
726  CUpti_EventID eventId = gctxt->availEventIDArray[index];
727  eventctrl->conEvents[eventctrl->conEventsCount] = eventId;
728  eventctrl->conEventsCount++;
729  break;
730 
731  default:
732  CHECK_PRINT_EVAL(1, "Unknown CUPTI measure", return (PAPI_EMISC));
733  break;
734  }
735 
 /* Reaching the capacity exactly is treated as an error as well (>=) */
736  if (eventctrl->conEventsCount >= PAPICUDA_MAX_COUNTERS) {
737  SUBDBG("Num events exceeded PAPICUDA_MAX_COUNTERS\n");
738  return(PAPI_EINVAL);
739  }
740 
741  /* Record index of this active event back into the nativeInfo structure */
742  nativeInfo[ii].ni_position = gctrl->activeEventCount;
743  /* record added event at the higher level */
744  CHECK_PRINT_EVAL(gctrl->activeEventCount == PAPICUDA_MAX_COUNTERS - 1, "Exceeded maximum num of events (PAPI_MAX_COUNTERS)", return (PAPI_EMISC));
745  gctrl->activeEventIndex[gctrl->activeEventCount] = index;
746  // gctrl->activeEventContextIdx[gctrl->activeEventCount] = eventContextIdx;
747  gctrl->activeEventValues[gctrl->activeEventCount] = 0;
748  gctrl->activeEventCount++;
749 
750  /* Create/recreate eventgrouppass structures for the added event and context */
751  SUBDBG("Create eventGroupPasses for context (destroy pre-existing) (nativeCount %d, conEventsCount %d) \n", gctrl->activeEventCount, eventctrl->conEventsCount);
752  if(eventctrl->conEventsCount > 0) {
753  // SUBDBG("Destroy prevous eventGroupPasses for the context \n");
754  if(eventctrl->eventGroupPasses != NULL)
755  CUPTI_CALL((*cuptiEventGroupSetsDestroyPtr) (eventctrl->eventGroupPasses), return (PAPI_EMISC));
756  eventctrl->eventGroupPasses = NULL;
757  size_t sizeBytes = (eventctrl->conEventsCount) * sizeof(CUpti_EventID);
758  // SUBDBG("About to create eventGroupPasses for the context (sizeBytes %zu) \n", sizeBytes);
 /* Without kernel replay mode, an event combination that needs more than one set is rejected with PAPI_ECOMBO */
759 #ifdef PAPICUDA_KERNEL_REPLAY_MODE
760  CUPTI_CALL((*cuptiEnableKernelReplayModePtr) (eventCuCtx), return (PAPI_ECMP));
761  CUPTI_CALL((*cuptiEventGroupSetsCreatePtr) (eventCuCtx, sizeBytes, eventctrl->conEvents, &eventctrl->eventGroupPasses), return (PAPI_ECMP));
762 #else
763  CUPTI_CALL((*cuptiSetEventCollectionModePtr)(eventCuCtx,CUPTI_EVENT_COLLECTION_MODE_KERNEL), return(PAPI_ECMP));
764  CUPTI_CALL((*cuptiEventGroupSetsCreatePtr) (eventCuCtx, sizeBytes, eventctrl->conEvents, &eventctrl->eventGroupPasses), return (PAPI_EMISC));
765  if (eventctrl->eventGroupPasses->numSets > 1) {
766  SUBDBG("Error occured: The combined CUPTI events require more than 1 pass... try different events\n");
 /* NOTE(review): listing line 767 is absent here; whatever statement it held is not visible */
768  return(PAPI_ECOMBO);
769  } else {
770  SUBDBG("Created eventGroupPasses for context total-events %d in-this-context %d passes-requied %d) \n", gctrl->activeEventCount, eventctrl->conEventsCount, eventctrl->eventGroupPasses->numSets);
771  }
772 
773 #endif
774  }
775 
 /* Undo the push (or device switch) performed for a context other than the caller's */
776  if(eventCuCtx != currCuCtx)
777  CU_CALL((*cuCtxPopCurrentPtr) (&eventCuCtx), return (PAPI_EMISC));
778 
779  }
780  return (PAPI_OK);
781 }
782 
783 /* Triggered by PAPI_start().
784  * For CUDA component, switch to each context and start all eventgroups.
785 */
787 {
788  SUBDBG("Entering\n");
789  (void) ctx;
790  (void) ctrl;
792  // papicuda_context_t *gctxt = global_papicuda_context;
793  uint32_t ii, gg, cc, ss;
794  int saveDeviceNum = -1;
795 
796  SUBDBG("Reset all active event values\n");
797  for(ii = 0; ii < gctrl->activeEventCount; ii++)
798  gctrl->activeEventValues[ii] = 0;
799 
800  SUBDBG("Save current context, then switch to each active device/context and enable eventgroups\n");
801  CUDA_CALL((*cudaGetDevicePtr) (&saveDeviceNum), return (PAPI_EMISC));
802  CUPTI_CALL((*cuptiGetTimestampPtr) (&gctrl->cuptiStartTimestampNs), return (PAPI_EMISC));
803  for(cc = 0; cc < gctrl->countOfActiveCUContexts; cc++) {
804  int eventDeviceNum = gctrl->arrayOfActiveCUContexts[cc]->deviceNum;
805  CUcontext eventCuCtx = gctrl->arrayOfActiveCUContexts[cc]->cuCtx;
806  SUBDBG("Set to device %d cuCtx %p \n", eventDeviceNum, eventCuCtx);
807  // CUDA_CALL( (*cudaSetDevicePtr)(eventDeviceNum), return(PAPI_EMISC));
808  if(eventDeviceNum != saveDeviceNum)
809  CU_CALL((*cuCtxPushCurrentPtr) (eventCuCtx), return (PAPI_EMISC));
810  CUpti_EventGroupSets *eventEventGroupPasses = gctrl->arrayOfActiveCUContexts[cc]->eventGroupPasses;
811  for (ss=0; ss<eventEventGroupPasses->numSets; ss++) {
812  CUpti_EventGroupSet groupset = eventEventGroupPasses->sets[ss];
813  for(gg = 0; gg < groupset.numEventGroups; gg++) {
814  CUpti_EventGroup group = groupset.eventGroups[gg];
815  uint32_t one = 1;
816  CUPTI_CALL((*cuptiEventGroupSetAttributePtr) (group, CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES, sizeof(uint32_t), &one), return (PAPI_EMISC));
817  }
818  CUPTI_CALL((*cuptiEventGroupSetEnablePtr) (&groupset), return (PAPI_EMISC));
819  }
820  if(eventDeviceNum != saveDeviceNum)
821  CU_CALL((*cuCtxPopCurrentPtr) (&eventCuCtx), return (PAPI_EMISC));
822  }
823 
824  return (PAPI_OK);
825 }
826 
827 
828 /* Triggered by PAPI_read(). For CUDA component, switch to each
829  * context, read all the eventgroups, and put the values in the
830  * correct places. */
/* Reads the counters of every active context and hands back a pointer to the
 * global activeEventValues array through *values.
 * Per context: make it current, synchronize it, read every event of every
 * event group (summing over the group's instances) into
 * readEventIDArray/readEventValueBuffer, then map those readings onto the
 * active PAPI events of that device. Plain events are added (+=) to
 * activeEventValues; metrics are computed by CUPTI from their constituent
 * event values and the elapsed time, then converted by
 * papicuda_convert_metric_value_to_long_long().
 * Returns PAPI_OK or a PAPI error code from one of the early returns.
 * NOTE(review): gctrl and gctxt are used but their declarations are not in
 * this listing (lines 837-838 are absent); presumably they alias
 * global_papicuda_control / global_papicuda_context -- confirm.
 * NOTE(review): the early returns inside the loops release neither the
 * malloc'ed buffers nor a pushed context. */
831 static int papicuda_read(hwd_context_t * ctx, hwd_control_state_t * ctrl, long long **values, int flags)
832 {
833  SUBDBG("Entering\n");
834  (void) ctx;
835  (void) ctrl;
836  (void) flags;
839  uint32_t gg, ii, jj, ee, instanceK, cc, rr, ss;
840  int saveDeviceNum;
841  size_t eventIdsSize = PAPICUDA_MAX_COUNTERS * sizeof(CUpti_EventID);
842  uint64_t readEventValueBuffer[PAPICUDA_MAX_COUNTERS];
843  CUpti_EventID readEventIDArray[PAPICUDA_MAX_COUNTERS];
844 
845  // Get read time stamp
846  CUPTI_CALL((*cuptiGetTimestampPtr) (&gctrl->cuptiReadTimestampNs), return (PAPI_EMISC));
 /* Time since the recorded start timestamp; handed to cuptiMetricGetValue below */
847  uint64_t durationNs = gctrl->cuptiReadTimestampNs - gctrl->cuptiStartTimestampNs;
 /* NOTE(review): listing line 848 is absent here; its statement is not visible */
849 
850  SUBDBG("Save current context, then switch to each active device/context and enable eventgroups\n");
851  CUDA_CALL((*cudaGetDevicePtr) (&saveDeviceNum), return (PAPI_EMISC));
852  for(cc = 0; cc < gctrl->countOfActiveCUContexts; cc++) {
853  int currDeviceNum = gctrl->arrayOfActiveCUContexts[cc]->deviceNum;
854  CUcontext currCuCtx = gctrl->arrayOfActiveCUContexts[cc]->cuCtx;
855  SUBDBG("Set to device %d cuCtx %p \n", currDeviceNum, currCuCtx);
 /* Push contexts of other devices (popped at the end of the iteration); set the caller's own device context as current */
856  if(currDeviceNum != saveDeviceNum)
857  CU_CALL((*cuCtxPushCurrentPtr) (currCuCtx), return (PAPI_EMISC));
858  else
859  CU_CALL((*cuCtxSetCurrentPtr) (currCuCtx), return (PAPI_EMISC));
860 
861  size_t numEventIDsRead = 0;
862  CU_CALL((*cuCtxSynchronizePtr) (), return (PAPI_EMISC));
863  CUpti_EventGroupSets *currEventGroupPasses = gctrl->arrayOfActiveCUContexts[cc]->eventGroupPasses;
 /* NOTE(review): numInstances and numTotalInstances have no initializer; they are only
  * assigned inside the group loop, yet are read by the normalization loop after it. */
864  uint32_t numEvents, numInstances, numTotalInstances;
865  size_t sizeofuint32num = sizeof(uint32_t);
866  CUpti_EventDomainID groupDomainID;
867  size_t groupDomainIDSize = sizeof(groupDomainID);
868  CUdevice cudevice = gctxt->deviceArray[currDeviceNum].cuDev;
869 
870  /* Since we accumulate the eventValues in a buffer, it needs to be cleared for each context */
871  for(ee = 0; ee < PAPICUDA_MAX_COUNTERS; ee++)
872  readEventValueBuffer[ee] = 0;
873 
874  for (ss=0; ss<currEventGroupPasses->numSets; ss++) {
875  CUpti_EventGroupSet groupset = currEventGroupPasses->sets[ss];
876  SUBDBG("Read events in this context\n");
877  for(gg = 0; gg < groupset.numEventGroups; gg++) {
878  CUpti_EventGroup group = groupset.eventGroups[gg];
879  CUPTI_CALL((*cuptiEventGroupGetAttributePtr) (group, CUPTI_EVENT_GROUP_ATTR_EVENT_DOMAIN_ID, &groupDomainIDSize, &groupDomainID), return (PAPI_EMISC));
880  CUPTI_CALL((*cuptiDeviceGetEventDomainAttributePtr) (cudevice, groupDomainID, CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT, &sizeofuint32num, &numTotalInstances), return (PAPI_EMISC));
881  CUPTI_CALL((*cuptiEventGroupGetAttributePtr) (group, CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT, &sizeofuint32num, &numInstances), return (PAPI_EMISC));
882  CUPTI_CALL((*cuptiEventGroupGetAttributePtr) (group, CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS, &sizeofuint32num, &numEvents), return (PAPI_EMISC));
 /* eventIdsSize is reset for every group before being passed by address to the attribute query */
883  eventIdsSize = PAPICUDA_MAX_COUNTERS * sizeof(CUpti_EventID);
884  CUpti_EventID eventIds[PAPICUDA_MAX_COUNTERS];
885  CUPTI_CALL((*cuptiEventGroupGetAttributePtr) (group, CUPTI_EVENT_GROUP_ATTR_EVENTS, &eventIdsSize, eventIds), return (PAPI_EMISC));
886  SUBDBG("Context %d eventgroup %d domain numTotalInstaces %u numInstances %u numEvents %u\n", cc, gg, numTotalInstances, numInstances, numEvents);
 /* This local 'values' (one slot per instance) shadows the function parameter of the same name inside the group loop */
887  size_t valuesSize = sizeof(uint64_t) * numInstances;
888  uint64_t *values = (uint64_t *) papi_malloc(valuesSize);
889  CHECK_PRINT_EVAL(values == NULL, "Out of memory", return (PAPI_ENOMEM));
890  /* For each event, read all values and normalize */
891  for(ee = 0; ee < numEvents; ee++) {
892  CUPTI_CALL((*cuptiEventGroupReadEventPtr) (group, CUPTI_EVENT_READ_FLAG_NONE, eventIds[ee], &valuesSize, values), return (PAPI_EMISC));
893  // sum collect event values from all instances
894  uint64_t valuesum = 0;
895  for(instanceK = 0; instanceK < numInstances; instanceK++)
896  valuesum += values[instanceK];
897  // It seems that the same event can occur multiple times in eventIds, so we need to accumulate values in older valueBuffers if needed
898  // Scan thru readEvents looking for a match, break if found, if not found, increment numEventIDsRead
899  for(rr = 0; rr < numEventIDsRead; rr++)
900  if(readEventIDArray[rr] == eventIds[ee])
901  break;
902  /* If the event was not found, increment the numEventIDsRead */
903  if(rr == numEventIDsRead)
904  numEventIDsRead++;
905  readEventIDArray[rr] = eventIds[ee];
906  readEventValueBuffer[rr] += valuesum;
 /* By operator precedence this is PAPI_MIN_STR_LEN - (1 * sizeof(char)) */
907  size_t tmpStrSize = PAPI_MIN_STR_LEN - 1 * sizeof(char);
908  char tmpStr[PAPI_MIN_STR_LEN];
909  CUPTI_CALL((*cuptiEventGetAttributePtr) (eventIds[ee], CUPTI_EVENT_ATTR_NAME, &tmpStrSize, tmpStr), return (PAPI_EMISC));
 /* NOTE(review): the "id" printed for rr is eventIds[rr], although rr indexes readEventIDArray */
910  SUBDBG("Read context %d eventgroup %d numEventIDsRead %lu device %d event %d/%d %d name %s value %lu (rr %d id %d val %lu) \n", cc, gg, numEventIDsRead, currDeviceNum, ee, numEvents, eventIds[ee], tmpStr, valuesum, rr,
911  eventIds[rr], readEventValueBuffer[rr]);
912  }
913  papi_free(values);
914  }
915  }
916 
917  // normalize the event values to represent the total number of domain instances on the device
 /* NOTE(review): the loop body indexes readEventValueBuffer with numEventIDsRead, not with the
  * loop variable ii, so the same slot (one past the last value read) is rescaled on every
  * iteration and the values actually read are left unscaled -- looks like [ii] was intended.
  * The instance counts used are those of the last group queried above. */
918  for(ii = 0; ii < numEventIDsRead; ii++)
919  readEventValueBuffer[numEventIDsRead] = (readEventValueBuffer[numEventIDsRead] * numTotalInstances) / numInstances;
920 
921  /* For this pushed device and context, figure out the event and metric values and record them into the arrays */
922  SUBDBG("For this device and context, match read values against active events by scanning activeEvents array and matching associated availEventIDs\n");
923  for(jj = 0; jj < gctrl->activeEventCount; jj++) {
924  int index = gctrl->activeEventIndex[jj];
925  /* If the device/context does not match the current context, move to next */
926  if(gctxt->availEventDeviceNum[index] != currDeviceNum)
927  continue;
928  uint32_t eventId = gctxt->availEventIDArray[index];
929  switch (gctxt->availEventKind[index]) {
930  case CUPTI_ACTIVITY_KIND_EVENT:
931  SUBDBG("Searching for activeEvent %s eventId %u\n", gctxt->availEventDesc[index].name, eventId);
 /* Plain event: add the value read for the first matching id to the running total */
932  for(ii = 0; ii < numEventIDsRead; ii++) {
933  SUBDBG("Look at readEventIDArray[%u/%zu] with id %u\n", ii, numEventIDsRead, readEventIDArray[ii]);
934  if(readEventIDArray[ii] == eventId) {
935  gctrl->activeEventValues[jj] += (long long) readEventValueBuffer[ii];
936  SUBDBG("Matched read-eventID %d:%d eventName %s value %ld activeEvent %d value %lld \n", jj, (int) eventId, gctxt->availEventDesc[index].name, readEventValueBuffer[ii], index, gctrl->activeEventValues[jj]);
937  break;
938  }
939  }
940  break;
941 
942  case CUPTI_ACTIVITY_KIND_METRIC:
943  SUBDBG("For the metric, find list of events required to calculate this metric value\n");
 /* The locals declared in this case (cudevice, numEvents, ee) shadow the outer ones of the same name */
944  CUpti_MetricID metricId = gctxt->availEventIDArray[index];
945  int metricDeviceNum = gctxt->availEventDeviceNum[index];
946  CUdevice cudevice = gctxt->deviceArray[metricDeviceNum].cuDev;
947  uint32_t numEvents, ee;
948  CUPTI_CALL((*cuptiMetricGetNumEventsPtr) (metricId, &numEvents), return (PAPI_EINVAL));
949  SUBDBG("Metric %s needs %d events\n", gctxt->availEventDesc[index].name, numEvents);
950  size_t eventIdArraySizeBytes = numEvents * sizeof(CUpti_EventID);
951  CUpti_EventID *eventIdArray = papi_malloc(eventIdArraySizeBytes);
952  CHECK_PRINT_EVAL(eventIdArray == NULL, "Malloc failed", return (PAPI_ENOMEM));
953  size_t eventValueArraySizeBytes = numEvents * sizeof(uint64_t);
954  uint64_t *eventValueArray = papi_malloc(eventValueArraySizeBytes);
955  CHECK_PRINT_EVAL(eventValueArray == NULL, "Malloc failed", return (PAPI_ENOMEM));
956  CUPTI_CALL((*cuptiMetricEnumEventsPtr) (metricId, &eventIdArraySizeBytes, eventIdArray), return (PAPI_EINVAL));
957  // Match metrics for the users events
 /* Every event the metric needs must have been read in this context; otherwise PAPI_EINVAL */
958  for(ee = 0; ee < numEvents; ee++) {
959  for(ii = 0; ii < numEventIDsRead; ii++) {
960  if(eventIdArray[ee] == readEventIDArray[ii]) {
961  SUBDBG("Matched metric %s, found %d/%d events with eventId %d\n", gctxt->availEventDesc[index].name, ee, numEvents, readEventIDArray[ii]);
962  eventValueArray[ee] = readEventValueBuffer[ii];
963  break;
964  }
965  }
966  CHECK_PRINT_EVAL(ii == numEventIDsRead, "Could not find required event for metric", return (PAPI_EINVAL));
967  }
968 
969  // Use CUPTI to calculate a metric. Return all metric values mapped into long long values.
970  CUpti_MetricValue metricValue;
971  CUpti_MetricValueKind valueKind;
972  size_t valueKindSize = sizeof(valueKind);
973  CUPTI_CALL((*cuptiMetricGetAttributePtr) (metricId, CUPTI_METRIC_ATTR_VALUE_KIND, &valueKindSize, &valueKind), return (PAPI_EMISC));
974  CUPTI_CALL((*cuptiMetricGetValuePtr) (cudevice, metricId, eventIdArraySizeBytes, eventIdArray, eventValueArraySizeBytes, eventValueArray, durationNs, &metricValue), return (PAPI_EMISC));
975  int retval = papicuda_convert_metric_value_to_long_long(metricValue, valueKind, &(gctrl->activeEventValues[jj]));
 /* NOTE(review): this return skips the two papi_free calls below */
976  if(retval != PAPI_OK)
977  return (retval);
978  papi_free(eventIdArray);
979  papi_free(eventValueArray);
980  break;
981 
982  default:
983  SUBDBG("Not handled");
984  break;
985  }
986  }
987 
988  /* Pop the pushed context */
989  if(currDeviceNum != saveDeviceNum)
990  CU_CALL((*cuCtxPopCurrentPtr) (&currCuCtx), return (PAPI_EMISC));
991  }
 /* The caller receives a pointer into the global control structure, not a copy */
992  *values = gctrl->activeEventValues;
993  return (PAPI_OK);
994 }
995 
996 /* Triggered by PAPI_stop() */
998 {
999  SUBDBG("Entering\n");
1000  (void) ctx;
1001  (void) ctrl;
1003  uint32_t cc, ss;
1004  int saveDeviceNum;
1005 
1006  SUBDBG("Save current context, then switch to each active device/context and enable eventgroups\n");
1007  CUDA_CALL((*cudaGetDevicePtr) (&saveDeviceNum), return (PAPI_EMISC));
1008  for(cc = 0; cc < gctrl->countOfActiveCUContexts; cc++) {
1009  int currDeviceNum = gctrl->arrayOfActiveCUContexts[cc]->deviceNum;
1010  CUcontext currCuCtx = gctrl->arrayOfActiveCUContexts[cc]->cuCtx;
1011  SUBDBG("Set to device %d cuCtx %p \n", currDeviceNum, currCuCtx);
1012  if(currDeviceNum != saveDeviceNum)
1013  CU_CALL((*cuCtxPushCurrentPtr) (currCuCtx), return (PAPI_EMISC));
1014  else
1015  CU_CALL((*cuCtxSetCurrentPtr) (currCuCtx), return (PAPI_EMISC));
1016  CUpti_EventGroupSets *currEventGroupPasses = gctrl->arrayOfActiveCUContexts[cc]->eventGroupPasses;
1017  for (ss=0; ss<currEventGroupPasses->numSets; ss++) {
1018  CUpti_EventGroupSet groupset = currEventGroupPasses->sets[ss];
1019  CUPTI_CALL((*cuptiEventGroupSetDisablePtr) (&groupset), return (PAPI_EMISC));
1020  }
1021  /* Pop the pushed context */
1022  if(currDeviceNum != saveDeviceNum)
1023  CU_CALL((*cuCtxPopCurrentPtr) (&currCuCtx), return (PAPI_EMISC));
1024 
1025  }
1026  return (PAPI_OK);
1027 }
1028 
1029 
1030 /*
1031  * Disable and destroy the CUDA eventGroup
1032  */
1034 {
1035  SUBDBG("Entering\n");
1036  (void) ctrl;
1038  // papicuda_active_cucontext_t *currctrl;
1039  uint32_t cc;
1040  int saveDeviceNum;
1041 
1042  SUBDBG("Save current context, then switch to each active device/context and enable eventgroups\n");
1043  CUDA_CALL((*cudaGetDevicePtr) (&saveDeviceNum), return (PAPI_EMISC));
1044  for(cc = 0; cc < gctrl->countOfActiveCUContexts; cc++) {
1045  CUcontext currCuCtx = gctrl->arrayOfActiveCUContexts[cc]->cuCtx;
1046  int currDeviceNum = gctrl->arrayOfActiveCUContexts[cc]->deviceNum;
1047  CUpti_EventGroupSets *currEventGroupPasses = gctrl->arrayOfActiveCUContexts[cc]->eventGroupPasses;
1048  if(currDeviceNum != saveDeviceNum)
1049  CU_CALL((*cuCtxPushCurrentPtr) (currCuCtx), return (PAPI_EMISC));
1050  else
1051  CU_CALL((*cuCtxSetCurrentPtr) (currCuCtx), return (PAPI_EMISC));
1052  //CUPTI_CALL((*cuptiEventGroupSetsDestroyPtr) (currEventGroupPasses), return (PAPI_EMISC));
1053  (*cuptiEventGroupSetsDestroyPtr) (currEventGroupPasses);
1054  gctrl->arrayOfActiveCUContexts[cc]->eventGroupPasses = NULL;
1055  papi_free( gctrl->arrayOfActiveCUContexts[cc] );
1056  /* Pop the pushed context */
1057  if(currDeviceNum != saveDeviceNum)
1058  CU_CALL((*cuCtxPopCurrentPtr) (&currCuCtx), return (PAPI_EMISC));
1059  }
1060  /* Record that there are no active contexts or events */
1061  gctrl->countOfActiveCUContexts = 0;
1062  gctrl->activeEventCount = 0;
1063  return (PAPI_OK);
1064 }
1065 
1066 
1067 /* Called at thread shutdown. Does nothing in the CUDA component. */
1069 {
1070  SUBDBG("Entering\n");
1071  (void) ctx;
1072 
1073  return (PAPI_OK);
1074 }
1075 
1076 /* Triggered by PAPI_shutdown() and frees memory allocated in the CUDA component. */
/* Body of the component shutdown routine: releases the global context
 * (per-device domain arrays, the event tables, the device array), releases
 * the global control structure together with any remaining active-context
 * records, and closes the three dynamically loaded library handles.
 * Returns PAPI_OK (PAPI_EMISC only from the kernel-replay-mode call).
 * NOTE(review): gctxt and gctrl are used but their declarations are not in
 * this listing (lines 1080-1081 are absent); presumably they alias
 * global_papicuda_context / global_papicuda_control -- confirm. */
1078 {
1079  SUBDBG("Entering\n");
1082  int deviceNum;
1083  uint32_t cc;
1084  /* Free context */
1085  if(gctxt) {
1086  for(deviceNum = 0; deviceNum < gctxt->deviceCount; deviceNum++) {
1087  papicuda_device_desc_t *mydevice = &gctxt->deviceArray[deviceNum];
1088  papi_free(mydevice->domainIDArray);
1089  papi_free(mydevice->domainIDNumEvents);
1090  }
 /* NOTE(review): listing lines 1092 and 1094 are absent below; whatever they release is not visible here */
1091  papi_free(gctxt->availEventIDArray);
1093  papi_free(gctxt->availEventKind);
1095  papi_free(gctxt->availEventDesc);
1096  papi_free(gctxt->deviceArray);
1097  papi_free(gctxt);
1098  global_papicuda_context = gctxt = NULL;
1099  }
1100  /* Free control */
1101  if(gctrl) {
1102  for(cc = 0; cc < gctrl->countOfActiveCUContexts; cc++) {
 /* NOTE(review): in kernel replay mode the context record is dereferenced here, before the
  * NULL test below, and a failing call returns without releasing the remaining memory. */
1103 #ifdef PAPICUDA_KERNEL_REPLAY_MODE
1104  CUcontext currCuCtx = gctrl->arrayOfActiveCUContexts[cc]->cuCtx;
1105  CUPTI_CALL((*cuptiDisableKernelReplayModePtr) (currCuCtx), return (PAPI_EMISC));
1106 #endif
1107  if(gctrl->arrayOfActiveCUContexts[cc] != NULL)
1108  papi_free(gctrl->arrayOfActiveCUContexts[cc]);
1109  }
1110  papi_free(gctrl);
1111  global_papicuda_control = gctrl = NULL;
1112  }
1113  // close the dynamic libraries needed by this component (opened in the init substrate call)
 /* NOTE(review): the handles are closed unconditionally; nothing here checks that they were opened */
1114  dlclose(dl1);
1115  dlclose(dl2);
1116  dlclose(dl3);
1117  return (PAPI_OK);
1118 }
1119 
1120 
1121 /* Triggered by PAPI_reset() but only if the EventSet is currently
1122  * running. If the eventset is not currently running, then the saved
1123  * value in the EventSet is set to zero without calling this
1124  * routine. */
1126 {
1127  (void) ctx;
1128  (void) ctrl;
1130  uint32_t gg, ii, cc, ss;
1131  int saveDeviceNum;
1132 
1133  SUBDBG("Reset all active event values\n");
1134  for(ii = 0; ii < gctrl->activeEventCount; ii++)
1135  gctrl->activeEventValues[ii] = 0;
1136 
1137  SUBDBG("Save current context, then switch to each active device/context and reset\n");
1138  CUDA_CALL((*cudaGetDevicePtr) (&saveDeviceNum), return (PAPI_EMISC));
1139  for(cc = 0; cc < gctrl->countOfActiveCUContexts; cc++) {
1140  CUcontext currCuCtx = gctrl->arrayOfActiveCUContexts[cc]->cuCtx;
1141  int currDeviceNum = gctrl->arrayOfActiveCUContexts[cc]->deviceNum;
1142  if(currDeviceNum != saveDeviceNum)
1143  CU_CALL((*cuCtxPushCurrentPtr) (currCuCtx), return (PAPI_EMISC));
1144  else
1145  CU_CALL((*cuCtxSetCurrentPtr) (currCuCtx), return (PAPI_EMISC));
1146  CUpti_EventGroupSets *currEventGroupPasses = gctrl->arrayOfActiveCUContexts[cc]->eventGroupPasses;
1147  for (ss=0; ss<currEventGroupPasses->numSets; ss++) {
1148  CUpti_EventGroupSet groupset = currEventGroupPasses->sets[ss];
1149  for(gg = 0; gg < groupset.numEventGroups; gg++) {
1150  CUpti_EventGroup group = groupset.eventGroups[gg];
1151  CUPTI_CALL((*cuptiEventGroupResetAllEventsPtr) (group), return (PAPI_EMISC));
1152  }
1153  CUPTI_CALL((*cuptiEventGroupSetEnablePtr) (&groupset), return (PAPI_EMISC));
1154  }
1155  if(currDeviceNum != saveDeviceNum)
1156  CU_CALL((*cuCtxPopCurrentPtr) (&currCuCtx), return (PAPI_EMISC));
1157  }
1158  return (PAPI_OK);
1159 }
1160 
1161 
1162 /* This function sets various options in the component - Does nothing in the CUDA component.
1163  @param[in] ctx -- hardware context
1164  @param[in] code valid are PAPI_SET_DEFDOM, PAPI_SET_DOMAIN, PAPI_SETDEFGRN, PAPI_SET_GRANUL and PAPI_SET_INHERIT
1165  @param[in] option -- options to be set
1166 */
1167 static int papicuda_ctrl(hwd_context_t * ctx, int code, _papi_int_option_t * option)
1168 {
1169  SUBDBG("Entering\n");
1170  (void) ctx;
1171  (void) code;
1172  (void) option;
1173  return (PAPI_OK);
1174 }
1175 
1176 /*
1177  * This function has to set the bits needed to count different domains
1178  * In particular: PAPI_DOM_USER, PAPI_DOM_KERNEL PAPI_DOM_OTHER
1179  * By default return PAPI_EINVAL if none of those are specified
1180  * and PAPI_OK with success
1181  * PAPI_DOM_USER is only user context is counted
1182  * PAPI_DOM_KERNEL is only the Kernel/OS context is counted
1183  * PAPI_DOM_OTHER is Exception/transient mode (like user TLB misses)
1184  * PAPI_DOM_ALL is all of the domains
1185  */
1186 static int papicuda_set_domain(hwd_control_state_t * ctrl, int domain)
1187 {
1188  SUBDBG("Entering\n");
1189  (void) ctrl;
1190  if((PAPI_DOM_USER & domain) || (PAPI_DOM_KERNEL & domain) || (PAPI_DOM_OTHER & domain) || (PAPI_DOM_ALL & domain))
1191  return (PAPI_OK);
1192  else
1193  return (PAPI_EINVAL);
1194  return (PAPI_OK);
1195 }
1196 
1197 
1198 /* Enumerate Native Events.
1199  * @param EventCode is the event of interest
1200  * @param modifier is one of PAPI_ENUM_FIRST, PAPI_ENUM_EVENTS
1201  */
1202 static int papicuda_ntv_enum_events(unsigned int *EventCode, int modifier)
1203 {
1204  // SUBDBG( "Entering (get next event after %u)\n", *EventCode );
1205  switch (modifier) {
1206  case PAPI_ENUM_FIRST:
1207  *EventCode = 0;
1208  return (PAPI_OK);
1209  break;
1210  case PAPI_ENUM_EVENTS:
1211  if(*EventCode < global_papicuda_context->availEventSize - 1) {
1212  *EventCode = *EventCode + 1;
1213  return (PAPI_OK);
1214  } else
1215  return (PAPI_ENOEVNT);
1216  break;
1217  default:
1218  return (PAPI_EINVAL);
1219  }
1220  return (PAPI_OK);
1221 }
1222 
1223 
1224 /* Takes a native event code and passes back the name
1225  * @param EventCode is the native event code
1226  * @param name is a pointer for the name to be copied to
1227  * @param len is the size of the name string
1228  */
1229 static int papicuda_ntv_code_to_name(unsigned int EventCode, char *name, int len)
1230 {
1231  // SUBDBG( "Entering EventCode %d\n", EventCode );
1232  unsigned int index = EventCode;
1234  if(index < gctxt->availEventSize) {
1235  strncpy(name, gctxt->availEventDesc[index].name, len);
1236  } else {
1237  return (PAPI_EINVAL);
1238  }
1239  // SUBDBG( "Exit: EventCode %d: Name %s\n", EventCode, name );
1240  return (PAPI_OK);
1241 }
1242 
1243 
1244 /* Takes a native event code and passes back the event description
1245  * @param EventCode is the native event code
1246  * @param descr is a pointer for the description to be copied to
1247  * @param len is the size of the descr string
1248  */
1249 static int papicuda_ntv_code_to_descr(unsigned int EventCode, char *name, int len)
1250 {
1251  // SUBDBG( "Entering\n" );
1252  unsigned int index = EventCode;
1254  if(index < gctxt->availEventSize) {
1255  strncpy(name, gctxt->availEventDesc[index].description, len);
1256  } else {
1257  return (PAPI_EINVAL);
1258  }
1259  return (PAPI_OK);
1260 }
1261 
1262 
1263 /* Vector that points to entry points for the component */
1264 papi_vector_t _cuda_vector = {
1265  .cmp_info = {
1266  /* default component information (unspecified values are initialized to 0) */
1267  .name = "cuda",
1268  .short_name = "cuda",
1269  .version = "5.1",
1270  .description = "CUDA events and metrics via NVIDIA CuPTI interfaces",
1271  .num_mpx_cntrs = PAPICUDA_MAX_COUNTERS,
1272  .num_cntrs = PAPICUDA_MAX_COUNTERS,
1273  .default_domain = PAPI_DOM_USER,
1274  .default_granularity = PAPI_GRN_THR,
1275  .available_granularities = PAPI_GRN_THR,
1276  .hardware_intr_sig = PAPI_INT_SIGNAL,
1277  /* component specific cmp_info initializations */
1278  .fast_real_timer = 0,
1279  .fast_virtual_timer = 0,
1280  .attach = 0,
1281  .attach_must_ptrace = 0,
1282  .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL,
1283  }
1284  ,
1285  /* sizes of framework-opaque component-private structures... these are all unused in this component */
1286  .size = {
1287  .context = 1, /* sizeof( papicuda_context_t ), */
1288  .control_state = 1, /* sizeof( papicuda_control_t ), */
1289  .reg_value = 1, /* sizeof( papicuda_register_t ), */
1290  .reg_alloc = 1, /* sizeof( papicuda_reg_alloc_t ), */
1291  }
1292  ,
1293  /* function pointers in this component */
1294  .start = papicuda_start, /* ( hwd_context_t * ctx, hwd_control_state_t * ctrl ) */
1295  .stop = papicuda_stop, /* ( hwd_context_t * ctx, hwd_control_state_t * ctrl ) */
1296  .read = papicuda_read, /* ( hwd_context_t * ctx, hwd_control_state_t * ctrl, long_long ** events, int flags ) */
1297  .reset = papicuda_reset, /* ( hwd_context_t * ctx, hwd_control_state_t * ctrl ) */
1298  .cleanup_eventset = papicuda_cleanup_eventset, /* ( hwd_control_state_t * ctrl ) */
1299 
1300  .init_component = papicuda_init_component, /* ( int cidx ) */
1301  .init_thread = papicuda_init_thread, /* ( hwd_context_t * ctx ) */
1302  .init_control_state = papicuda_init_control_state, /* ( hwd_control_state_t * ctrl ) */
1303  .update_control_state = papicuda_update_control_state, /* ( hwd_control_state_t * ptr, NativeInfo_t * native, int count, hwd_context_t * ctx ) */
1304 
1305  .ctl = papicuda_ctrl, /* ( hwd_context_t * ctx, int code, _papi_int_option_t * option ) */
1306  .set_domain = papicuda_set_domain, /* ( hwd_control_state_t * cntrl, int domain ) */
1307  .ntv_enum_events = papicuda_ntv_enum_events, /* ( unsigned int *EventCode, int modifier ) */
1308  .ntv_code_to_name = papicuda_ntv_code_to_name, /* ( unsigned int EventCode, char *name, int len ) */
1309  .ntv_code_to_descr = papicuda_ntv_code_to_descr, /* ( unsigned int EventCode, char *name, int len ) */
1310  .shutdown_thread = papicuda_shutdown_thread, /* ( hwd_context_t * ctx ) */
1311  .shutdown_component = papicuda_shutdown_component, /* ( void ) */
1312 };
char name[PAPI_MAX_STR_LEN]
Definition: papi.h:629
#define PAPI_ENOEVNT
Definition: papi.h:260
static papicuda_control_t * global_papicuda_control
Definition: linux-cuda.c:95
static int papicuda_start(hwd_context_t *ctx, hwd_control_state_t *ctrl)
Definition: linux-cuda.c:786
static int papicuda_init_thread(hwd_context_t *ctx)
Definition: linux-cuda.c:536
CUpti_EventID conEvents[PAPICUDA_MAX_COUNTERS]
Definition: linux-cuda.c:78
uint64_t cuptiStartTimestampNs
Definition: linux-cuda.c:66
static papicuda_context_t * global_papicuda_context
Definition: linux-cuda.c:92
static int papicuda_update_control_state(hwd_control_state_t *ctrl, NativeInfo_t *nativeInfo, int nativeCount, hwd_context_t *ctx)
Definition: linux-cuda.c:619
long long flags
Definition: iozone.c:12330
#define PAPICUDA_MAX_COUNTERS
Definition: linux-cuda.c:27
uint32_t * domainIDNumEvents
Definition: linux-cuda.c:56
#define papi_free(a)
Definition: papi_memory.h:35
uint32_t * availEventIDArray
Definition: linux-cuda.c:38
#define papi_malloc(a)
Definition: papi_memory.h:34
#define PAPI_ENOSUPP
Definition: papi.h:271
CUpti_EventGroupSets * eventGroupPasses
Definition: linux-cuda.c:80
#define PAPI_DOM_KERNEL
Definition: papi.h:300
int * availEventDeviceNum
Definition: linux-cuda.c:37
#define PAPI_DOM_ALL
Definition: papi.h:303
int cudaSetDevice(int devnum, int n1, int n2, int n3, void *ptr1)
#define CU_CALL(call, handleerror)
Definition: linux-cuda.c:116
static int papicuda_set_domain(hwd_control_state_t *ctrl, int domain)
Definition: linux-cuda.c:1186
return PAPI_OK
Definition: linux-nvml.c:497
static int papicuda_ntv_code_to_name(unsigned int EventCode, char *name, int len)
Definition: linux-cuda.c:1229
#define DLSYM_AND_CHECK(dllib, name)
#define PAPI_DOM_USER
Definition: papi.h:298
static int papicuda_read(hwd_context_t *ctx, hwd_control_state_t *ctrl, long long **values, int flags)
Definition: linux-cuda.c:831
void
Definition: iozone.c:18627
return PAPI_EINVAL
Definition: linux-nvml.c:436
PAPI_component_info_t cmp_info
Definition: papi_vector.h:20
static int papicuda_convert_metric_value_to_long_long(CUpti_MetricValue metricValue, CUpti_MetricValueKind valueKind, long long int *papiValue)
Definition: linux-cuda.c:481
static FILE * fp
void double value
Definition: iozone.c:18781
Return codes and api definitions.
static void * dl1
Definition: linux-cuda.c:84
#define CHECK_PRINT_EVAL(checkcond, str, evalthis)
Definition: linux-cuda.c:98
struct client_command cc
Definition: iozone.c:21326
#define PAPI_EMISC
Definition: papi.h:267
struct papicuda_name_desc * availEventDesc
Definition: linux-cuda.c:40
#define PAPI_2MAX_STR_LEN
Definition: papi.h:466
static int cidx
char disabled_reason[PAPI_MAX_STR_LEN]
Definition: papi.h:636
static void * dl2
Definition: linux-cuda.c:85
CUpti_EventDomainID * domainIDArray
Definition: linux-cuda.c:55
void(* _dl_non_dynamic_init)(void)
Definition: linux-cuda.c:160
int i
Definition: fileop.c:140
static int papicuda_stop(hwd_context_t *ctx, hwd_control_state_t *ctrl)
Definition: linux-cuda.c:997
int one
char *long long size
Definition: iozone.c:12023
static int papicuda_reset(hwd_context_t *ctx, hwd_control_state_t *ctrl)
Definition: linux-cuda.c:1125
static int papicuda_cleanup_eventset(hwd_control_state_t *ctrl)
Definition: linux-cuda.c:1033
__attribute__((constructor))
Definition: init_fini.c:12
uint32_t availEventSize
Definition: linux-cuda.c:35
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
long long
Definition: iozone.c:19827
#define CUDA_CALL(call, handleerror)
Definition: linux-cuda.c:107
static int papicuda_ntv_enum_events(unsigned int *EventCode, int modifier)
Definition: linux-cuda.c:1202
#define DECLARECUFUNC(funcname, funcsig)
static void * dl3
Definition: linux-cuda.c:86
#define PAPI_ECMP
Definition: papi.h:256
int papicuda_shutdown_thread(hwd_context_t *ctx)
Definition: linux-cuda.c:1068
#define PAPI_INT_SIGNAL
Definition: papi_internal.h:53
#define PAPI_GRN_THR
Definition: papi.h:362
papi_vector_t _cuda_vector
Definition: linux-cuda.c:89
#define PAPI_ENOMEM
Definition: papi.h:254
static int papicuda_shutdown_component(void)
Definition: linux-cuda.c:1077
static int papicuda_ntv_code_to_descr(unsigned int EventCode, char *name, int len)
Definition: linux-cuda.c:1249
struct papicuda_device_desc * deviceArray
Definition: linux-cuda.c:34
static int papicuda_add_native_events(papicuda_context_t *gctxt)
Definition: linux-cuda.c:297
uint32_t countOfActiveCUContexts
Definition: linux-cuda.c:61
static int papicuda_init_control_state(hwd_control_state_t *ctrl)
Definition: linux-cuda.c:592
static int papicuda_ctrl(hwd_context_t *ctx, int code, _papi_int_option_t *option)
Definition: linux-cuda.c:1167
char * name
Definition: iozone.c:23648
long long activeEventValues[PAPICUDA_MAX_COUNTERS]
Definition: linux-cuda.c:65
int
Definition: iozone.c:18528
#define PAPI_MIN_STR_LEN
Definition: papi.h:464
#define PAPI_ENOINIT
Definition: papi.h:269
#define DECLARECUPTIFUNC(funcname, funcsig)
#define PAPI_MAX_STR_LEN
Definition: papi.h:465
char deviceName[PAPI_MIN_STR_LEN]
Definition: linux-cuda.c:53
static int papicuda_init_component(int cidx)
Definition: linux-cuda.c:557
#define PAPI_ECOMBO
Definition: papi.h:277
#define PAPI_DOM_OTHER
Definition: papi.h:301
return
Definition: iozone.c:22170
CUpti_EventID conMetrics[PAPICUDA_MAX_COUNTERS]
Definition: linux-cuda.c:75
CUpti_ActivityKind * availEventKind
Definition: linux-cuda.c:36
static long long values[NUM_EVENTS]
Definition: init_fini.c:10
ssize_t retval
Definition: libasync.c:338
struct papicuda_active_cucontext_s * arrayOfActiveCUContexts[PAPICUDA_MAX_COUNTERS]
Definition: linux-cuda.c:62
int activeEventIndex[PAPICUDA_MAX_COUNTERS]
Definition: linux-cuda.c:64
uint32_t activeEventCount
Definition: linux-cuda.c:63
void exit()
uint32_t * availEventIsBeingMeasuredInEventset
Definition: linux-cuda.c:39
uint64_t cuptiReadTimestampNs
Definition: linux-cuda.c:67
#define papi_calloc(a, b)
Definition: papi_memory.h:37
#define CUPTI_CALL(call, handleerror)
Definition: linux-cuda.c:126
#define DECLARECUDAFUNC(funcname, funcsig)