linux-nvml.c File Reference

This is an NVML component; it demonstrates the component interface and implements two counters, nvmlDeviceGetPowerUsage and nvmlDeviceGetTemperature, from the NVIDIA Management Library. Please refer to the NVML documentation for details about nvmlDeviceGetPowerUsage and nvmlDeviceGetTemperature. Power is reported in mW and temperature in Celsius. More...

Include dependency graph for linux-nvml.c:

Go to the source code of this file.

Data Structures

struct  nvml_context_t

Defines

#define CUDAAPI   __attribute__((weak))
#define CUDARTAPI   __attribute__((weak))
#define DECLDIR   __attribute__((weak))
#define NVML_MAX_COUNTERS   100

Functions

unsigned long long getClockSpeed (nvmlDevice_t dev, nvmlClockType_t which_one)
unsigned long long getEccLocalErrors (nvmlDevice_t dev, nvmlEccBitType_t bits, int which_one)
unsigned long long getFanSpeed (nvmlDevice_t dev)
unsigned long long getMaxClockSpeed (nvmlDevice_t dev, nvmlClockType_t which_one)
unsigned long long getMemoryInfo (nvmlDevice_t dev, int which_one)
unsigned long long getPState (nvmlDevice_t dev)
unsigned long long getPowerUsage (nvmlDevice_t dev)
unsigned long long getTemperature (nvmlDevice_t dev)
unsigned long long getTotalEccErrors (nvmlDevice_t dev, nvmlEccBitType_t bits)
unsigned long long getUtilization (nvmlDevice_t dev, int which_one)
static void nvml_hardware_reset ()
static int nvml_hardware_read (long long *value, int which_one)
int _papi_nvml_init_thread (hwd_context_t *ctx)
static int detectDevices ()
static void createNativeEvents ()
int _papi_nvml_init_component (int cidx)
static int linkCudaLibraries ()
int _papi_nvml_init_control_state (hwd_control_state_t *ctl)
int _papi_nvml_update_control_state (hwd_control_state_t *ctl, NativeInfo_t *native, int count, hwd_context_t *ctx)
int _papi_nvml_start (hwd_context_t *ctx, hwd_control_state_t *ctl)
int _papi_nvml_stop (hwd_context_t *ctx, hwd_control_state_t *ctl)
int _papi_nvml_read (hwd_context_t *ctx, hwd_control_state_t *ctl, long long **events, int flags)
int _papi_nvml_write (hwd_context_t *ctx, hwd_control_state_t *ctl, long long *events)
int _papi_nvml_reset (hwd_context_t *ctx, hwd_control_state_t *ctl)
int _papi_nvml_shutdown_component ()
int _papi_nvml_shutdown_thread (hwd_context_t *ctx)
int _papi_nvml_ctl (hwd_context_t *ctx, int code, _papi_int_option_t *option)
int _papi_nvml_set_domain (hwd_control_state_t *cntrl, int domain)
int _papi_nvml_ntv_enum_events (unsigned int *EventCode, int modifier)
int _papi_nvml_ntv_code_to_name (unsigned int EventCode, char *name, int len)
int _papi_nvml_ntv_code_to_descr (unsigned int EventCode, char *descr, int len)
int _papi_nvml_ntv_code_to_info (unsigned int EventCode, PAPI_event_info_t *info)

Variables

void(* _dl_non_dynamic_init )(void)
 nvml_control_state_t
static nvml_native_event_entry_t * nvml_native_table = NULL
static int device_count = 0
static int num_events = 0
static nvmlDevice_t * devices = NULL
static int * features = NULL
papi_vector_t _nvml_vector

Detailed Description

Author:
Kiran Kumar Kasichayanula kkasicha@utk.edu
James Ralph ralph@eecs.utk.edu

Definition in file linux-nvml.c.


Define Documentation

#define CUDAAPI   __attribute__((weak))
#define CUDARTAPI   __attribute__((weak))
#define DECLDIR   __attribute__((weak))
#define NVML_MAX_COUNTERS   100

Function Documentation

int _papi_nvml_ctl ( hwd_context_t *  ctx,
int  code,
_papi_int_option_t *  option 
)

This function sets various options in the component

Parameters:
code the option to set; valid values are PAPI_SET_DEFDOM, PAPI_SET_DOMAIN, PAPI_SETDEFGRN, PAPI_SET_GRANUL and PAPI_SET_INHERIT

Definition at line 1358 of file linux-nvml.c.

01359 {
01360         SUBDBG( "Enter: ctx: %p, code: %d\n", ctx, code );
01361 
01362         (void) ctx;
01363         (void) code;
01364         (void) option;
01365 
01366 
01367         /* FIXME.  This should maybe set up more state, such as which counters are active and */
01368         /*         counter mappings. */
01369 
01370         return PAPI_OK;
01371 }

int _papi_nvml_init_component ( int  cidx  ) 

Initialize hardware counters, set up the function vector table, and get hardware information. This routine is called when the PAPI process is initialized (i.e., PAPI_library_init).

Definition at line 909 of file linux-nvml.c.

00910 {
00911         SUBDBG ("Entry: cidx: %d\n", cidx);
00912         nvmlReturn_t ret;
00913         cudaError_t cuerr;
00914         int papi_errorcode;
00915 
00916         int cuda_count = 0;
00917         unsigned int nvml_count = 0;
00918 
00919         /* link in the cuda and nvml libraries and resolve the symbols we need to use */
00920         if (linkCudaLibraries() != PAPI_OK) {
00921             SUBDBG ("Dynamic link of CUDA libraries failed, component will be disabled.\n");
00922             SUBDBG ("See disable reason in papi_component_avail output for more details.\n");
00923             return (PAPI_ENOSUPP);
00924         }
00925 
00926         ret = (*nvmlInitPtr)();
00927         if ( NVML_SUCCESS != ret ) {
00928                 strcpy(_nvml_vector.cmp_info.disabled_reason, "The NVIDIA managament library failed to initialize.");
00929                 return PAPI_ENOSUPP;
00930         }
00931 
00932         cuerr = (*cuInitPtr)( 0 );
00933         if ( CUDA_SUCCESS != cuerr ) {
00934                 strcpy(_nvml_vector.cmp_info.disabled_reason, "The CUDA library failed to initialize.");
00935                 return PAPI_ENOSUPP;
00936         }
00937 
00938         /* Figure out the number of CUDA devices in the system */
00939         ret = (*nvmlDeviceGetCountPtr)( &nvml_count );
00940         if ( NVML_SUCCESS != ret ) {
00941                 strcpy(_nvml_vector.cmp_info.disabled_reason, "Unable to get a count of devices from the NVIDIA managament library.");
00942                 return PAPI_ENOSUPP;
00943         }
00944 
00945         cuerr = (*cudaGetDeviceCountPtr)( &cuda_count );
00946         if ( CUDA_SUCCESS != cuerr ) {
00947                 strcpy(_nvml_vector.cmp_info.disabled_reason, "Unable to get a device count from CUDA.");
00948                 return PAPI_ENOSUPP;
00949         }
00950 
00951         /* We can probably recover from this, when we're clever */
00952         if ( (cuda_count > 0) && (nvml_count != (unsigned int)cuda_count ) ) {
00953                 strcpy(_nvml_vector.cmp_info.disabled_reason, "Cuda and the NVIDIA managament library have different device counts.");
00954                 return PAPI_ENOSUPP;
00955         }
00956 
00957         device_count = cuda_count;
00958 
00959         /* A per device representation of what events are present */
00960         features = (int*)papi_malloc(sizeof(int) * device_count );
00961 
00962         /* Handles to each device */
00963         devices = (nvmlDevice_t*)papi_malloc(sizeof(nvmlDevice_t) * device_count);
00964 
00965         /* Figure out what events are supported on each card. */
00966         if ( (papi_errorcode = detectDevices( ) ) != PAPI_OK ) {
00967             papi_free(features);
00968             papi_free(devices);
00969             sprintf(_nvml_vector.cmp_info.disabled_reason, "An error occured in device feature detection, please check your NVIDIA Management Library and CUDA install." );
00970             return PAPI_ENOSUPP;
00971         }
00972 
00973         /* The assumption is that if everything went swimmingly in detectDevices, 
00974             all nvml calls here should be fine. */
00975         createNativeEvents( );
00976 
00977         /* Export the total number of events available */
00978         _nvml_vector.cmp_info.num_native_events = num_events;
00979 
00980         /* Export the component id */
00981         _nvml_vector.cmp_info.CmpIdx = cidx;
00982 
00983         /* Export the number of 'counters' */
00984         _nvml_vector.cmp_info.num_cntrs = num_events;
00985         _nvml_vector.cmp_info.num_mpx_cntrs = num_events;
00986 
00987         return PAPI_OK;
00988 }

Here is the call graph for this function:

int _papi_nvml_init_control_state ( hwd_control_state_t *  ctl  ) 

Set up a counter control state. In general, a control state holds the hardware info for an EventSet.

Definition at line 1170 of file linux-nvml.c.

01171 {
01172         SUBDBG( "nvml_init_control_state... %p\n", ctl );
01173         nvml_control_state_t *nvml_ctl = ( nvml_control_state_t * ) ctl;
01174         memset( nvml_ctl, 0, sizeof ( nvml_control_state_t ) );
01175 
01176         return PAPI_OK;
01177 }

int _papi_nvml_init_thread ( hwd_context_t *  ctx  ) 

This is called whenever a thread is initialized

Definition at line 469 of file linux-nvml.c.

00470 {
00471         (void) ctx;
00472 
00473         SUBDBG( "Enter: ctx: %p\n", ctx );
00474 
00475         return PAPI_OK;
00476 }

int _papi_nvml_ntv_code_to_descr ( unsigned int  EventCode,
char *  descr,
int  len 
)

Takes a native event code and passes back the event description

Parameters:
EventCode is the native event code
descr is a pointer for the description to be copied to
len is the size of the descr string

Definition at line 1488 of file linux-nvml.c.

01489 {
01490         int index;
01491         index = EventCode;
01492 
01493         if (index >= num_events) return PAPI_ENOEVNT;
01494 
01495         strncpy( descr, nvml_native_table[index].description, len );
01496 
01497         return PAPI_OK;
01498 }

int _papi_nvml_ntv_code_to_info ( unsigned int  EventCode,
PAPI_event_info_t *  info 
)

Takes a native event code and passes back the event info

Parameters:
EventCode is the native event code
info is a pointer for the info to be copied to

Definition at line 1505 of file linux-nvml.c.

01506 {
01507 
01508   int index = EventCode;
01509 
01510   if ( ( index < 0) || (index >= num_events )) return PAPI_ENOEVNT;
01511 
01512   strncpy( info->symbol, nvml_native_table[index].name, sizeof(info->symbol)-1);
01513   info->symbol[sizeof(info->symbol)-1] = '\0';
01514 
01515   strncpy( info->units, nvml_native_table[index].units, sizeof(info->units)-1);
01516   info->units[sizeof(info->units)-1] = '\0';
01517 
01518   strncpy( info->long_descr, nvml_native_table[index].description, sizeof(info->long_descr)-1);
01519   info->long_descr[sizeof(info->long_descr)-1] = '\0';
01520 
01521 //  info->data_type = nvml_native_table[index].return_type;
01522 
01523   return PAPI_OK;
01524 }

int _papi_nvml_ntv_code_to_name ( unsigned int  EventCode,
char *  name,
int  len 
)

Takes a native event code and passes back the name

Parameters:
EventCode is the native event code
name is a pointer for the name to be copied to
len is the size of the name string

Definition at line 1467 of file linux-nvml.c.

01468 {
01469         SUBDBG("Entry: EventCode: %#x, name: %s, len: %d\n", EventCode, name, len);
01470         int index;
01471 
01472         index = EventCode;
01473 
01474         /* Make sure we are in range */
01475         if (index >= num_events) return PAPI_ENOEVNT;
01476 
01477         strncpy( name, nvml_native_table[index].name, len );
01478 
01479         return PAPI_OK;
01480 }

int _papi_nvml_ntv_enum_events ( unsigned int *  EventCode,
int  modifier 
)

Enumerate Native Events

Parameters:
EventCode is the event of interest
modifier is one of PAPI_ENUM_FIRST, PAPI_ENUM_EVENTS. If your component has attribute masks, then these need to be handled here as well.

Definition at line 1426 of file linux-nvml.c.

01427 {
01428         int index;
01429 
01430         switch ( modifier ) {
01431 
01432                 /* return EventCode of first event */
01433                 case PAPI_ENUM_FIRST:
01434                         /* return the first event that we support */
01435 
01436                         *EventCode = 0;
01437                         return PAPI_OK;
01438 
01439                         /* return EventCode of next available event */
01440                 case PAPI_ENUM_EVENTS:
01441                         index = *EventCode;
01442 
01443                         /* Make sure we are in range */
01444                         if ( index < num_events - 1 ) {
01445 
01446                                 /* This assumes a non-sparse mapping of the events */
01447                                 *EventCode = *EventCode + 1;
01448                                 return PAPI_OK;
01449                         } else {
01450                                 return PAPI_ENOEVNT;
01451                         }
01452                         break;
01453 
01454                 default:
01455                         return PAPI_EINVAL;
01456         }
01457 
01458         return PAPI_EINVAL;
01459 }

int _papi_nvml_read ( hwd_context_t *  ctx,
hwd_control_state_t *  ctl,
long long **  events,
int  flags 
)

Triggered by PAPI_read()

Definition at line 1252 of file linux-nvml.c.

01254 {
01255         SUBDBG( "Enter: ctx: %p, flags: %d\n", ctx, flags );
01256 
01257         (void) ctx;
01258         (void) flags;
01259         int i;
01260         int ret;
01261         nvml_control_state_t* nvml_ctl = ( nvml_control_state_t*) ctl;   
01262 
01263 
01264         for (i=0;i<nvml_ctl->num_events;i++) {
01265                 if ( PAPI_OK != 
01266                                 ( ret = nvml_hardware_read( &nvml_ctl->counter[i], 
01267                                                             nvml_ctl->which_counter[i]) ))
01268                         return ret;
01269 
01270         }
01271         /* return pointer to the values we read */
01272         *events = nvml_ctl->counter;    
01273         return PAPI_OK;
01274 }

Here is the call graph for this function:

int _papi_nvml_reset ( hwd_context_t *  ctx,
hwd_control_state_t *  ctl 
)

Triggered by PAPI_reset() but only if the EventSet is currently running

Definition at line 1300 of file linux-nvml.c.

01301 {
01302         SUBDBG( "Enter: ctx: %p, ctl: %p\n", ctx, ctl );
01303         
01304         (void) ctx;
01305         (void) ctl;
01306 
01307         /* Reset the hardware */
01308         nvml_hardware_reset(  );
01309 
01310         return PAPI_OK;
01311 }

Here is the call graph for this function:

int _papi_nvml_set_domain ( hwd_control_state_t *  cntrl,
int  domain 
)

This function has to set the bits needed to count different domains, in particular PAPI_DOM_USER, PAPI_DOM_KERNEL, and PAPI_DOM_OTHER. By default, return PAPI_EINVAL if none of those are specified and PAPI_OK on success. PAPI_DOM_USER: only the user context is counted. PAPI_DOM_KERNEL: only the kernel/OS context is counted. PAPI_DOM_OTHER: exception/transient mode (like user TLB misses). PAPI_DOM_ALL: all of the domains.

Definition at line 1383 of file linux-nvml.c.

01384 {
01385         SUBDBG( "Enter: cntrl: %p, domain: %d\n", cntrl, domain );
01386 
01387         (void) cntrl;
01388 
01389         int found = 0;
01390 
01391         if ( PAPI_DOM_USER & domain ) {
01392                 SUBDBG( " PAPI_DOM_USER \n" );
01393                 found = 1;
01394         }
01395         if ( PAPI_DOM_KERNEL & domain ) {
01396                 SUBDBG( " PAPI_DOM_KERNEL \n" );
01397                 found = 1;
01398         }
01399         if ( PAPI_DOM_OTHER & domain ) {
01400                 SUBDBG( " PAPI_DOM_OTHER \n" );
01401                 found = 1;
01402         }
01403         if ( PAPI_DOM_ALL & domain ) {
01404                 SUBDBG( " PAPI_DOM_ALL \n" );
01405                 found = 1;
01406         }
01407         if ( !found )
01408                 return ( PAPI_EINVAL );
01409 
01410         return PAPI_OK;
01411 }

int _papi_nvml_shutdown_component (  ) 

Triggered by PAPI_shutdown()

Definition at line 1315 of file linux-nvml.c.

01316 {
01317         SUBDBG( "Enter:\n" );
01318 
01319     if (nvml_native_table != NULL)
01320         papi_free(nvml_native_table);
01321     if (devices != NULL)
01322         papi_free(devices);
01323     if (features != NULL)
01324         papi_free(features);
01325 
01326         (*nvmlShutdownPtr)();
01327 
01328         device_count = 0;
01329         num_events = 0;
01330 
01331         // close the dynamic libraries needed by this component (opened in the init component call)
01332         dlclose(dl1);
01333         dlclose(dl2);
01334         dlclose(dl3);
01335 
01336         return PAPI_OK;
01337 }

int _papi_nvml_shutdown_thread ( hwd_context_t *  ctx  ) 

Called at thread shutdown

Definition at line 1341 of file linux-nvml.c.

01342 {
01343         SUBDBG( "Enter: ctx: %p\n", ctx );
01344 
01345         (void) ctx;
01346 
01347         /* Last chance to clean up thread */
01348 
01349         return PAPI_OK;
01350 }

int _papi_nvml_start ( hwd_context_t *  ctx,
hwd_control_state_t *  ctl 
)

Triggered by PAPI_start()

Definition at line 1209 of file linux-nvml.c.

01210 {
01211         SUBDBG( "Enter: ctx: %p, ctl: %p\n", ctx, ctl );
01212 
01213         (void) ctx;
01214         (void) ctl;
01215 
01216         /* anything that would need to be set at counter start time */
01217 
01218         /* reset */
01219         /* start the counting */
01220 
01221         return PAPI_OK;
01222 }

int _papi_nvml_stop ( hwd_context_t *  ctx,
hwd_control_state_t *  ctl 
)

Triggered by PAPI_stop()

Definition at line 1227 of file linux-nvml.c.

01228 {
01229         SUBDBG( "Enter: ctx: %p, ctl: %p\n", ctx, ctl );
01230 
01231         int i;
01232         (void) ctx;
01233         (void) ctl;
01234         int ret;
01235 
01236         nvml_control_state_t* nvml_ctl = ( nvml_control_state_t*) ctl;
01237 
01238         for (i=0;i<nvml_ctl->num_events;i++) {
01239                 if ( PAPI_OK != 
01240                                 ( ret = nvml_hardware_read( &nvml_ctl->counter[i], 
01241                                                             nvml_ctl->which_counter[i]) ))
01242                         return ret;
01243 
01244         }
01245 
01246         return PAPI_OK;
01247 }

Here is the call graph for this function:

int _papi_nvml_update_control_state ( hwd_control_state_t *  ctl,
NativeInfo_t *  native,
int  count,
hwd_context_t *  ctx 
)

Triggered by eventset operations like add or remove

Definition at line 1182 of file linux-nvml.c.

01186 {
01187         SUBDBG( "Enter: ctl: %p, ctx: %p\n", ctl, ctx );
01188         int i, index;
01189 
01190         nvml_control_state_t *nvml_ctl = ( nvml_control_state_t * ) ctl;   
01191         (void) ctx;
01192 
01193 
01194         /* if no events, return */
01195         if (count==0) return PAPI_OK;
01196 
01197         for( i = 0; i < count; i++ ) {
01198                 index = native[i].ni_event;
01199                 nvml_ctl->which_counter[i]=index;
01200                 /* We have no constraints on event position, so any event */
01201                 /* can be in any slot.                                    */
01202                 native[i].ni_position = i;
01203         }
01204         nvml_ctl->num_events=count;
01205         return PAPI_OK;
01206 }

int _papi_nvml_write ( hwd_context_t *  ctx,
hwd_control_state_t *  ctl,
long long *  events 
)

Triggered by PAPI_write(), but only if the counters are running

Definition at line 1279 of file linux-nvml.c.

01281 {
01282         SUBDBG( "Enter: ctx: %p, ctl: %p\n", ctx, ctl );
01283 
01284         (void) ctx;
01285         (void) ctl;
01286         (void) events;
01287 
01288 
01289         /* You can change ECC mode and compute exclusivity modes on the cards */
01290         /* But I don't see this as a function of a PAPI component at this time */
01291         /* All implementation issues aside. */
01292         return PAPI_OK;
01293 }

static void createNativeEvents ( void   )  [static]

Definition at line 665 of file linux-nvml.c.

00666 {
00667         char name[64];
00668         char sanitized_name[PAPI_MAX_STR_LEN];
00669         char names[device_count][64];
00670 
00671         int i, nameLen = 0, j;
00672         int isUnique = 1;
00673 
00674         nvml_native_event_entry_t* entry;
00675         nvmlReturn_t ret;
00676 
00677         nvml_native_table = (nvml_native_event_entry_t*) papi_malloc( 
00678                         sizeof(nvml_native_event_entry_t) * num_events );   
00679         memset( nvml_native_table, 0x0, sizeof(nvml_native_event_entry_t) * num_events );
00680         entry = &nvml_native_table[0];
00681 
00682         for (i=0; i < device_count; i++ ) {
00683                 memset( names[i], 0x0, 64 );
00684                 isUnique = 1;
00685                 ret = (*nvmlDeviceGetNamePtr)( devices[i], name, sizeof(name)-1 );
00686                 name[sizeof(name)-1] = '\0';    // to safely use strlen operation below, the variable 'name' must be null terminated
00687 
00688                 for (j=0; j < i; j++ ) 
00689                 {
00690                         if ( 0 == strncmp( name, names[j], 64 ) )
00691                                 isUnique = 0;
00692                 }
00693 
00694                 if ( isUnique ) {
00695                         nameLen = strlen(name);
00696                         strncpy(sanitized_name, name, PAPI_MAX_STR_LEN );
00697                         for (j=0; j < nameLen; j++)
00698                                 if ( ' ' == sanitized_name[j] )
00699                                         sanitized_name[j] = '_';
00700 
00701 
00702 
00703                         if ( HAS_FEATURE( features[i], FEATURE_CLOCK_INFO ) ) {
00704                                 sprintf( entry->name, "%s:graphics_clock", sanitized_name );
00705                                 strncpy(entry->description,"Graphics clock domain (MHz).", PAPI_MAX_STR_LEN );
00706                                 entry->options.clock = NVML_CLOCK_GRAPHICS;
00707                                 entry->type = FEATURE_CLOCK_INFO;
00708                                 entry++;
00709 
00710                                 sprintf( entry->name, "%s:sm_clock", sanitized_name);
00711                                 strncpy(entry->description,"SM clock domain (MHz).", PAPI_MAX_STR_LEN);
00712                                 entry->options.clock = NVML_CLOCK_SM;
00713                                 entry->type = FEATURE_CLOCK_INFO;
00714                                 entry++;
00715 
00716                                 sprintf( entry->name, "%s:memory_clock", sanitized_name);
00717                                 strncpy(entry->description,"Memory clock domain (MHz).", PAPI_MAX_STR_LEN);
00718                                 entry->options.clock = NVML_CLOCK_MEM;
00719                                 entry->type = FEATURE_CLOCK_INFO;
00720                                 entry++;
00721                         }   
00722 
00723                         if ( HAS_FEATURE( features[i], FEATURE_ECC_LOCAL_ERRORS ) ) { 
00724                                 sprintf(entry->name, "%s:l1_single_ecc_errors", sanitized_name);
00725                                 strncpy(entry->description,"L1 cache single bit ECC", PAPI_MAX_STR_LEN);
00726                                 entry->options.ecc_opts = (struct local_ecc){
00727                                         .bits = NVML_SINGLE_BIT_ECC,
00728                                                 .which_one = LOCAL_ECC_L1,
00729                                 };
00730                                 entry->type = FEATURE_ECC_LOCAL_ERRORS;
00731                                 entry++;
00732 
00733                                 sprintf(entry->name, "%s:l2_single_ecc_errors", sanitized_name);
00734                                 strncpy(entry->description,"L2 cache single bit ECC", PAPI_MAX_STR_LEN);
00735                                 entry->options.ecc_opts = (struct local_ecc){
00736                                         .bits = NVML_SINGLE_BIT_ECC,
00737                                                 .which_one = LOCAL_ECC_L2,
00738                                 };
00739                                 entry->type = FEATURE_ECC_LOCAL_ERRORS;
00740                                 entry++;
00741 
00742                                 sprintf(entry->name, "%s:memory_single_ecc_errors", sanitized_name);
00743                                 strncpy(entry->description,"Device memory single bit ECC", PAPI_MAX_STR_LEN);
00744                                 entry->options.ecc_opts = (struct local_ecc){
00745                                         .bits = NVML_SINGLE_BIT_ECC,
00746                                                 .which_one = LOCAL_ECC_MEM,
00747                                 };
00748                                 entry->type = FEATURE_ECC_LOCAL_ERRORS;
00749                                 entry++;
00750 
00751                                 sprintf(entry->name, "%s:regfile_single_ecc_errors", sanitized_name);
00752                                 strncpy(entry->description,"Register file single bit ECC", PAPI_MAX_STR_LEN);
00753                                 entry->options.ecc_opts = (struct local_ecc){
00754                                         .bits = NVML_SINGLE_BIT_ECC,
00755                                                 .which_one = LOCAL_ECC_REGFILE,
00756                                 };
00757                                 entry->type = FEATURE_ECC_LOCAL_ERRORS;
00758                                 entry++;
00759 
00760                                 sprintf(entry->name, "%s:1l_double_ecc_errors", sanitized_name);
00761                                 strncpy(entry->description,"L1 cache double bit ECC", PAPI_MAX_STR_LEN);
00762                                 entry->options.ecc_opts = (struct local_ecc){
00763                                         .bits = NVML_DOUBLE_BIT_ECC,
00764                                                 .which_one = LOCAL_ECC_L1,
00765                                 };
00766                                 entry->type = FEATURE_ECC_LOCAL_ERRORS;
00767                                 entry++;
00768 
00769                                 sprintf(entry->name, "%s:l2_double_ecc_errors", sanitized_name);
00770                                 strncpy(entry->description,"L2 cache double bit ECC", PAPI_MAX_STR_LEN);
00771                                 entry->options.ecc_opts = (struct local_ecc){
00772                                         .bits = NVML_DOUBLE_BIT_ECC,
00773                                                 .which_one = LOCAL_ECC_L2,
00774                                 };
00775                                 entry->type = FEATURE_ECC_LOCAL_ERRORS;
00776                                 entry++;
00777 
00778                                 sprintf(entry->name, "%s:memory_double_ecc_errors", sanitized_name);
00779                                 strncpy(entry->description,"Device memory double bit ECC", PAPI_MAX_STR_LEN);
00780                                 entry->options.ecc_opts = (struct local_ecc){
00781                                         .bits = NVML_DOUBLE_BIT_ECC,
00782                                                 .which_one = LOCAL_ECC_MEM,
00783                                 };
00784                                 entry->type = FEATURE_ECC_LOCAL_ERRORS;
00785                                 entry++;
00786 
00787                                 sprintf(entry->name, "%s:regfile_double_ecc_errors", sanitized_name);
00788                                 strncpy(entry->description,"Register file double bit ECC", PAPI_MAX_STR_LEN);
00789                                 entry->options.ecc_opts = (struct local_ecc){
00790                                         .bits = NVML_DOUBLE_BIT_ECC,
00791                                                 .which_one = LOCAL_ECC_REGFILE,
00792                                 };
00793                                 entry->type = FEATURE_ECC_LOCAL_ERRORS;
00794                                 entry++;
00795                         }
00796 
00797                         if ( HAS_FEATURE( features[i], FEATURE_FAN_SPEED ) ) {
00798                                 sprintf( entry->name, "%s:fan_speed", sanitized_name);
00799                                 strncpy(entry->description,"The fan speed expressed as a percent of the maximum, i.e. full speed is 100%", PAPI_MAX_STR_LEN);
00800                                 entry->type = FEATURE_FAN_SPEED;
00801                                 entry++;
00802                         }
00803 
00804                         if ( HAS_FEATURE( features[i], FEATURE_MAX_CLOCK ) ) {
00805                                 sprintf( entry->name, "%s:graphics_max_clock", sanitized_name);
00806                                 strncpy(entry->description,"Maximal Graphics clock domain (MHz).", PAPI_MAX_STR_LEN);
00807                                 entry->options.clock = NVML_CLOCK_GRAPHICS;
00808                                 entry->type = FEATURE_MAX_CLOCK;
00809                                 entry++;
00810 
00811                                 sprintf( entry->name, "%s:sm_max_clock", sanitized_name);
00812                                 strncpy(entry->description,"Maximal SM clock domain (MHz).", PAPI_MAX_STR_LEN);
00813                                 entry->options.clock = NVML_CLOCK_SM;
00814                                 entry->type = FEATURE_MAX_CLOCK;
00815                                 entry++;
00816 
00817                                 sprintf( entry->name, "%s:memory_max_clock", sanitized_name);
00818                                 strncpy(entry->description,"Maximal Memory clock domain (MHz).", PAPI_MAX_STR_LEN);
00819                                 entry->options.clock = NVML_CLOCK_MEM;
00820                                 entry->type = FEATURE_MAX_CLOCK;
00821                                 entry++;
00822                         }
00823 
00824                         if ( HAS_FEATURE( features[i], FEATURE_MEMORY_INFO ) ) {
00825                                 sprintf( entry->name, "%s:total_memory", sanitized_name);
00826                                 strncpy(entry->description,"Total installed FB memory (in bytes).", PAPI_MAX_STR_LEN);
00827                                 entry->options.which_one = MEMINFO_TOTAL_MEMORY;
00828                                 entry->type = FEATURE_MEMORY_INFO;
00829                                 entry++;
00830 
00831                                 sprintf( entry->name, "%s:unallocated_memory", sanitized_name);
00832                                 strncpy(entry->description,"Uncallocated FB memory (in bytes).", PAPI_MAX_STR_LEN);
00833                                 entry->options.which_one = MEMINFO_UNALLOCED;
00834                                 entry->type = FEATURE_MEMORY_INFO;
00835                                 entry++;
00836 
00837                                 sprintf( entry->name, "%s:allocated_memory", sanitized_name);
00838                                 strncpy(entry->description, "Allocated FB memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping.", PAPI_MAX_STR_LEN);
00839                                 entry->options.which_one = MEMINFO_ALLOCED;
00840                                 entry->type = FEATURE_MEMORY_INFO;
00841                                 entry++;
00842                         }
00843 
00844                         if ( HAS_FEATURE( features[i], FEATURE_PERF_STATES ) ) {
00845                                 sprintf( entry->name, "%s:pstate", sanitized_name);
00846                                 strncpy(entry->description,"The performance state of the device.", PAPI_MAX_STR_LEN);
00847                                 entry->type = FEATURE_PERF_STATES;
00848                                 entry++;
00849                         }
00850 
00851                         if ( HAS_FEATURE( features[i], FEATURE_POWER ) ) {
00852                                 sprintf( entry->name, "%s:power", sanitized_name);
00853                                 // set the power event units value to "mW" for miliwatts
00854                                 strncpy( entry->units, "mW",PAPI_MIN_STR_LEN);
00855                                 strncpy(entry->description,"Power usage reading for the device, in miliwatts. This is the power draw (+/-5 watts) for the entire board: GPU, memory, etc.", PAPI_MAX_STR_LEN);
00856                                 entry->type = FEATURE_POWER;
00857                                 entry++;
00858                         }
00859 
00860                         if ( HAS_FEATURE( features[i], FEATURE_TEMP ) ) {
00861                                 sprintf( entry->name, "%s:temperature", sanitized_name);
00862                                 strncpy(entry->description,"Current temperature readings for the device, in degrees C.", PAPI_MAX_STR_LEN);
00863                                 entry->type = FEATURE_TEMP;
00864                                 entry++;
00865                         }
00866 
00867                         if ( HAS_FEATURE( features[i], FEATURE_ECC_TOTAL_ERRORS ) ) {
00868                                 sprintf( entry->name, "%s:total_ecc_errors", sanitized_name);
00869                                 strncpy(entry->description,"Total single bit errors.", PAPI_MAX_STR_LEN);
00870                                 entry->options.ecc_opts = (struct local_ecc){ 
00871                                         .bits = NVML_SINGLE_BIT_ECC, 
00872                                 };
00873                                 entry->type = FEATURE_ECC_TOTAL_ERRORS;
00874                                 entry++;
00875 
00876                                 sprintf( entry->name, "%s:total_ecc_errors", sanitized_name);
00877                                 strncpy(entry->description,"Total double bit errors.", PAPI_MAX_STR_LEN);
00878                                 entry->options.ecc_opts = (struct local_ecc){ 
00879                                         .bits = NVML_DOUBLE_BIT_ECC, 
00880                                 };
00881                                 entry->type = FEATURE_ECC_TOTAL_ERRORS;
00882                                 entry++;
00883                         }
00884 
00885                         if ( HAS_FEATURE( features[i], FEATURE_UTILIZATION ) ) {
00886                                 sprintf( entry->name, "%s:gpu_utilization", sanitized_name);
00887                                 strncpy(entry->description,"Percent of time over the past second during which one or more kernels was executing on the GPU.", PAPI_MAX_STR_LEN);
00888                                 entry->options.which_one = GPU_UTILIZATION;
00889                                 entry->type = FEATURE_UTILIZATION;
00890                                 entry++;
00891 
00892                                 sprintf( entry->name, "%s:memory_utilization", sanitized_name);
00893                                 strncpy(entry->description,"Percent of time over the past second during which global (device) memory was being read or written.", PAPI_MAX_STR_LEN);
00894                                 entry->options.which_one = MEMORY_UTILIZATION;
00895                                 entry->type = FEATURE_UTILIZATION;
00896                                 entry++;
00897                         }
00898                         strncpy( names[i], name, sizeof(names[0])-1);
00899                         names[i][sizeof(names[0])-1] = '\0';
00900                 }
00901         }
00902 }

Here is the caller graph for this function:

static int detectDevices (  )  [static]

Definition at line 479 of file linux-nvml.c.

00480 {
00481         nvmlReturn_t ret;
00482         nvmlEnableState_t mode = NVML_FEATURE_DISABLED;
00483         nvmlDevice_t handle;
00484         nvmlPciInfo_t info;
00485 
00486         cudaError_t cuerr;
00487 
00488         char busId[16];
00489         char name[64];
00490         char inforomECC[16];
00491         char inforomPower[16];
00492         char names[device_count][64];
00493         char nvml_busIds[device_count][16];
00494 
00495         float ecc_version = 0.0, power_version = 0.0;
00496 
00497         int i = 0,
00498             j = 0;
00499         int isTesla = 0;
00500         int isFermi = 0;
00501         int isUnique = 1;
00502 
00503         unsigned int temp = 0;
00504 
00505 
00506         /* list of nvml pci_busids */
00507     for (i=0; i < device_count; i++) {
00508         ret = (*nvmlDeviceGetHandleByIndexPtr)( i, &handle );
00509         if ( NVML_SUCCESS != ret ) {
00510             SUBDBG("nvmlDeviceGetHandleByIndex(%d) failed\n", i);
00511             return PAPI_ESYS;
00512         }
00513 
00514         ret = (*nvmlDeviceGetPciInfoPtr)( handle, &info );
00515         if ( NVML_SUCCESS != ret ) {
00516             SUBDBG("nvmlDeviceGetPciInfo() failed %s\n", (*nvmlErrorStringPtr)(ret) );
00517             return PAPI_ESYS;
00518         }
00519         strncpy(nvml_busIds[i], info.busId, sizeof(nvml_busIds[i])-1);
00520         nvml_busIds[i][sizeof(nvml_busIds[i])-1] = '\0';
00521     }
00522 
00523     /* We want to key our list of nvmlDevice_ts by each device's cuda index */
00524     for (i=0; i < device_count; i++) {
00525             cuerr = (*cudaDeviceGetPCIBusIdPtr)( busId, 16, i );
00526             if ( CUDA_SUCCESS != cuerr ) {
00527                 SUBDBG("cudaDeviceGetPCIBusId failed.\n");
00528                 return PAPI_ESYS;
00529             }
00530             for (j=0; j < device_count; j++ ) {
00531                     if ( !strncmp( busId, nvml_busIds[j], 16) ) {
00532                             ret = (*nvmlDeviceGetHandleByIndexPtr)(j, &devices[i] );
00533                             if ( NVML_SUCCESS != ret ) {
00534                                 SUBDBG("nvmlDeviceGetHandleByIndex(%d, &devices[%d]) failed.\n", j, i);
00535                                 return PAPI_ESYS;
00536                             }
00537                             break;
00538                     }
00539             }   
00540     }
00541 
00542         memset(names, 0x0, device_count*64);
00543         /* So for each card, check whats querable */
00544         for (i=0; i < device_count; i++ ) {
00545                 isTesla=0;
00546                 isFermi=1;
00547                 isUnique = 1;
00548                 features[i] = 0;
00549 
00550                 ret = (*nvmlDeviceGetNamePtr)( devices[i], name, sizeof(name)-1 );
00551                 if ( NVML_SUCCESS != ret) {
00552                     SUBDBG("nvmlDeviceGetName failed \n");
00553                     return PAPI_ESYS;
00554                 }
00555 
00556                 name[sizeof(name)-1] = '\0';    // to safely use strstr operation below, the variable 'name' must be null terminated
00557 
00558                 for (j=0; j < i; j++ ) 
00559                         if ( 0 == strncmp( name, names[j], 64 ) ) {
00560                                 /* if we have a match, and IF everything is sane, 
00561                                  * devices with the same name eg Tesla C2075 share features */
00562                                 isUnique = 0;
00563                                 features[i] = features[j];
00564 
00565                         }
00566 
00567                 if ( isUnique ) {
00568                         ret = (*nvmlDeviceGetInforomVersionPtr)( devices[i], NVML_INFOROM_ECC, inforomECC, 16);
00569                         if ( NVML_SUCCESS != ret ) {
00570                                 SUBDBG("nvmlGetInforomVersion carps %s\n", (*nvmlErrorStringPtr)(ret ) );
00571                                 isFermi = 0;
00572                         }
00573                         ret = (*nvmlDeviceGetInforomVersionPtr)( devices[i], NVML_INFOROM_POWER, inforomPower, 16);
00574                         if ( NVML_SUCCESS != ret ) {
00575                                 /* This implies the card is older then Fermi */
00576                                 SUBDBG("nvmlGetInforomVersion carps %s\n", (*nvmlErrorStringPtr)(ret ) );
00577                                 SUBDBG("Based upon the return to nvmlGetInforomVersion, we conclude this card is older then Fermi.\n");
00578                                 isFermi = 0;
00579                         } 
00580 
00581                         ecc_version = strtof(inforomECC, NULL );
00582                         power_version = strtof( inforomPower, NULL);
00583 
00584                         isTesla = ( NULL == strstr(name, "Tesla") ) ? 0:1;
00585 
00586                         /* For Tesla and Quadro products from Fermi and Kepler families. */
00587                         if ( isFermi ) {
00588                                 features[i] |= FEATURE_CLOCK_INFO;
00589                                 num_events += 3;
00590                         }
00591 
00592                         /*  For Tesla and Quadro products from Fermi and Kepler families. 
00593                             requires NVML_INFOROM_ECC 2.0 or higher for location-based counts
00594                             requires NVML_INFOROM_ECC 1.0 or higher for all other ECC counts
00595                             requires ECC mode to be enabled. */
00596                         ret = (*nvmlDeviceGetEccModePtr)( devices[i], &mode, NULL );
00597                         if ( NVML_SUCCESS == ret ) {
00598                             if ( NVML_FEATURE_ENABLED == mode) {
00599                             if ( ecc_version >= 2.0 ) {
00600                                 features[i] |= FEATURE_ECC_LOCAL_ERRORS;
00601                                 num_events += 8; /* {single bit, two bit errors} x { reg, l1, l2, memory } */
00602                             }
00603                             if ( ecc_version >= 1.0 ) {
00604                                 features[i] |= FEATURE_ECC_TOTAL_ERRORS;
00605                                 num_events += 2; /* single bit errors, double bit errors */
00606                             }
00607                             }
00608                         } else {
00609                             SUBDBG("nvmlDeviceGetEccMode does not appear to be supported. (nvml\
00610 return code %d)\n", ret);
00611                         }
00612 
00613                         /* For all discrete products with dedicated fans */
00614                         features[i] |= FEATURE_FAN_SPEED;
00615                         num_events++;
00616 
00617                         /* For Tesla and Quadro products from Fermi and Kepler families. */
00618                         if ( isFermi ) {
00619                                 features[i] |= FEATURE_MAX_CLOCK;
00620                                 num_events += 3;
00621                         }
00622 
00623                         /* For all products */
00624                         features[i] |= FEATURE_MEMORY_INFO;
00625                         num_events += 3; /* total, free, used */
00626 
00627                         /* For Tesla and Quadro products from the Fermi and Kepler families. */
00628                         if ( isFermi ) {
00629                                 features[i] |= FEATURE_PERF_STATES;
00630                                 num_events++;
00631                         }
00632 
00633                         /*  For "GF11x" Tesla and Quadro products from the Fermi family
00634                             requires NVML_INFOROM_POWER 3.0 or higher
00635                             For Tesla and Quadro products from the Kepler family
00636                             does not require NVML_INFOROM_POWER */
00637                         /* Just try reading power, if it works, enable it*/
00638                         ret = (*nvmlDeviceGetPowerUsagePtr)( devices[i], &temp);
00639                         if ( NVML_SUCCESS == ret ) {
00640                             features[i] |= FEATURE_POWER;
00641                             num_events++;
00642                         } else {
00643                             SUBDBG("nvmlDeviceGetPowerUsage does not appear to be supported on\
00644 this card. (nvml return code %d)\n", ret );
00645                         }
00646 
00647                         /* For all discrete and S-class products. */
00648                         features[i] |= FEATURE_TEMP;
00649                         num_events++;
00650 
00651                         /* For Tesla and Quadro products from the Fermi and Kepler families */
00652                         if (isFermi) {
00653                                 features[i] |= FEATURE_UTILIZATION;
00654                                 num_events += 2;
00655                         }
00656 
00657                         strncpy( names[i], name, sizeof(names[0])-1);
00658                         names[i][sizeof(names[0])-1] = '\0';
00659                 }
00660         }
00661         return PAPI_OK;
00662 }

Here is the caller graph for this function:

/**
 * Read one clock of a device through nvmlDeviceGetClockInfo.
 *
 * (Definition at line 156 of file linux-nvml.c.)
 *
 * @param dev        NVML handle of the device to query.
 * @param which_one  Clock type forwarded unchanged to NVML.
 * @return The value NVML stored, or 0 when the query fails (the failure is
 *         only logged through SUBDBG).
 */
unsigned long long getClockSpeed( nvmlDevice_t dev, nvmlClockType_t which_one )
{
    unsigned int speed = 0;
    nvmlReturn_t status = (*nvmlDeviceGetClockInfoPtr)( dev, which_one, &speed );

    if ( status != NVML_SUCCESS ) {
        SUBDBG( "something went wrong %s\n", (*nvmlErrorStringPtr)(status));
    }

    return (unsigned long long) speed;
}

Here is the caller graph for this function:

/**
 * Read the volatile ECC error count of one memory location of a device.
 *
 * (Definition at line 170 of file linux-nvml.c.)
 *
 * @param dev        NVML handle of the device to query.
 * @param bits       ECC bit type forwarded unchanged to NVML.
 * @param which_one  One of LOCAL_ECC_REGFILE, LOCAL_ECC_L1, LOCAL_ECC_L2 or
 *                   LOCAL_ECC_MEM, selecting the field to return.
 * @return The selected count; 0 when the NVML query fails (the failure is
 *         logged through SUBDBG); (unsigned long long)-1 when which_one is
 *         not one of the selectors above.
 */
unsigned long long getEccLocalErrors( nvmlDevice_t dev, nvmlEccBitType_t bits, int which_one )
{
    /* Zero-initialised so a failed query yields 0, like the scalar getters
     * in this file, instead of reading indeterminate memory. */
    nvmlEccErrorCounts_t counts = { 0 };

    nvmlReturn_t bad;
    bad = (*nvmlDeviceGetDetailedEccErrorsPtr)( dev, bits, NVML_VOLATILE_ECC , &counts);

    if ( NVML_SUCCESS != bad ) {
        SUBDBG( "something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
    }

    switch ( which_one ) {
        case LOCAL_ECC_REGFILE:
            return counts.registerFile;
        case LOCAL_ECC_L1:
            return counts.l1Cache;
        case LOCAL_ECC_L2:
            return counts.l2Cache;
        case LOCAL_ECC_MEM:
            return counts.deviceMemory;
        default:
            ;
    }
    return (unsigned long long)-1;
}

Here is the caller graph for this function:

/**
 * Read the fan speed of a device through nvmlDeviceGetFanSpeed.
 *
 * (Definition at line 198 of file linux-nvml.c.)
 *
 * @param dev  NVML handle of the device to query.
 * @return The value NVML stored, or 0 when the query fails (the failure is
 *         only logged through SUBDBG).
 */
unsigned long long getFanSpeed( nvmlDevice_t dev )
{
    unsigned int speed = 0;
    nvmlReturn_t status = (*nvmlDeviceGetFanSpeedPtr)( dev, &speed );

    if ( status != NVML_SUCCESS ) {
        SUBDBG( "something went wrong %s\n", (*nvmlErrorStringPtr)(status));
    }

    return (unsigned long long) speed;
}

Here is the caller graph for this function:

/**
 * Read one clock of a device for the "max clock" events.
 *
 * (Definition at line 213 of file linux-nvml.c.)
 *
 * NOTE(review): the query goes through nvmlDeviceGetClockInfoPtr, the same
 * entry point getClockSpeed() uses, so this presumably reports the current
 * clock rather than the maximum one.  NVML offers a separate
 * nvmlDeviceGetMaxClockInfo call; binding it needs a new function pointer
 * and dlsym lookup at file level - confirm and switch over.
 *
 * @param dev        NVML handle of the device to query.
 * @param which_one  Clock type forwarded unchanged to NVML.
 * @return The value NVML stored, or 0 when the query fails (the failure is
 *         only logged through SUBDBG).
 */
unsigned long long getMaxClockSpeed( nvmlDevice_t dev, nvmlClockType_t which_one )
{
    unsigned int speed = 0;
    nvmlReturn_t status = (*nvmlDeviceGetClockInfoPtr)( dev, which_one, &speed );

    if ( status != NVML_SUCCESS ) {
        SUBDBG( "something went wrong %s\n", (*nvmlErrorStringPtr)(status));
    }

    return (unsigned long long) speed;
}

Here is the caller graph for this function:

/**
 * Read one field of a device's memory information.
 *
 * (Definition at line 228 of file linux-nvml.c.)
 *
 * @param dev        NVML handle of the device to query.
 * @param which_one  MEMINFO_TOTAL_MEMORY, MEMINFO_UNALLOCED or
 *                   MEMINFO_ALLOCED, selecting total, free or used.
 * @return The selected field; 0 when the NVML query fails (the failure is
 *         logged through SUBDBG); (unsigned long long)-1 when which_one is
 *         not one of the selectors above.
 */
unsigned long long getMemoryInfo( nvmlDevice_t dev, int which_one )
{
    /* Zero-initialised so a failed query yields 0, like the scalar getters
     * in this file, instead of reading indeterminate memory. */
    nvmlMemory_t meminfo = { 0 };
    nvmlReturn_t bad;
    bad = (*nvmlDeviceGetMemoryInfoPtr)( dev, &meminfo );

    if ( NVML_SUCCESS != bad ) {
        SUBDBG( "something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
    }

    switch (which_one) {
        case MEMINFO_TOTAL_MEMORY:
            return meminfo.total;
        case MEMINFO_UNALLOCED:
            return meminfo.free;
        case MEMINFO_ALLOCED:
            return meminfo.used;
        default:
            ;
    }
    return (unsigned long long)-1;
}

Here is the caller graph for this function:

/**
 * Read the power draw of a device through nvmlDeviceGetPowerUsage.
 *
 * (Definition at line 308 of file linux-nvml.c.)
 *
 * @param dev  NVML handle of the device to query.
 * @return The value NVML stored, or 0 when the query fails (the failure is
 *         only logged through SUBDBG).
 */
unsigned long long getPowerUsage( nvmlDevice_t dev )
{
    /* Initialised so a failed query returns 0, as the other scalar getters
     * do, rather than an indeterminate value. */
    unsigned int power = 0;
    nvmlReturn_t bad;
    bad = (*nvmlDeviceGetPowerUsagePtr)( dev, &power );

    if ( NVML_SUCCESS != bad ) {
        SUBDBG( "something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
    }

    return (unsigned long long) power;
}

Here is the caller graph for this function:

/**
 * Report the performance state of a device as a number from 0 to 15.
 *
 * (Definition at line 252 of file linux-nvml.c.)
 *
 * @param dev  NVML handle of the device to query.
 * @return k when NVML reports NVML_PSTATE_k (0 <= k <= 15);
 *         (unsigned long long)-1 for NVML_PSTATE_UNKNOWN or any other
 *         value.  The state is preset to NVML_PSTATE_15, so 15 is returned
 *         if a failed query leaves it untouched (the failure is only logged
 *         through SUBDBG).
 */
unsigned long long getPState( nvmlDevice_t dev )
{
    /* Position in this table is the number reported for that state. */
    static const nvmlPstates_t ladder[] = {
        NVML_PSTATE_0,  NVML_PSTATE_1,  NVML_PSTATE_2,  NVML_PSTATE_3,
        NVML_PSTATE_4,  NVML_PSTATE_5,  NVML_PSTATE_6,  NVML_PSTATE_7,
        NVML_PSTATE_8,  NVML_PSTATE_9,  NVML_PSTATE_10, NVML_PSTATE_11,
        NVML_PSTATE_12, NVML_PSTATE_13, NVML_PSTATE_14, NVML_PSTATE_15
    };
    nvmlPstates_t current = NVML_PSTATE_15;
    nvmlReturn_t status;
    unsigned int level;

    status = (*nvmlDeviceGetPerformanceStatePtr)( dev, &current );
    if ( status != NVML_SUCCESS ) {
        SUBDBG( "something went wrong %s\n", (*nvmlErrorStringPtr)(status));
    }

    for ( level = 0; level < sizeof(ladder) / sizeof(ladder[0]); level++ ) {
        if ( ladder[level] == current ) {
            return (unsigned long long) level;
        }
    }

    /* NVML_PSTATE_UNKNOWN or a value outside the table.
     * The API docs just state "Unknown performance state". */
    return (unsigned long long) -1;
}

Here is the caller graph for this function:

/**
 * Read the GPU temperature sensor (NVML_TEMPERATURE_GPU) of a device.
 *
 * (Definition at line 323 of file linux-nvml.c.)
 *
 * @param dev  NVML handle of the device to query.
 * @return The value NVML stored, or 0 when the query fails (the failure is
 *         only logged through SUBDBG).
 */
unsigned long long getTemperature( nvmlDevice_t dev )
{
    unsigned int reading = 0;
    nvmlReturn_t status =
        (*nvmlDeviceGetTemperaturePtr)( dev, NVML_TEMPERATURE_GPU, &reading );

    if ( status != NVML_SUCCESS ) {
        SUBDBG( "something went wrong %s\n", (*nvmlErrorStringPtr)(status));
    }

    return (unsigned long long) reading;
}

Here is the caller graph for this function:

/**
 * Read the total volatile ECC error count of a device.
 *
 * (Definition at line 338 of file linux-nvml.c.)
 *
 * @param dev   NVML handle of the device to query.
 * @param bits  ECC bit type forwarded unchanged to NVML.
 * @return The count NVML stored, or 0 when the query fails (the failure is
 *         only logged through SUBDBG).
 */
unsigned long long getTotalEccErrors( nvmlDevice_t dev, nvmlEccBitType_t bits )
{
    unsigned long long total = 0;
    nvmlReturn_t status =
        (*nvmlDeviceGetTotalEccErrorsPtr)( dev, bits, NVML_VOLATILE_ECC , &total);

    if ( status != NVML_SUCCESS ) {
        SUBDBG( "something went wrong %s\n", (*nvmlErrorStringPtr)(status));
    }

    return total;
}

Here is the caller graph for this function:

/**
 * Read one utilization rate of a device.
 *
 * (Definition at line 356 of file linux-nvml.c.)
 *
 * @param dev        NVML handle of the device to query.
 * @param which_one  GPU_UTILIZATION or MEMORY_UTILIZATION, selecting the
 *                   field to return.
 * @return The selected rate; 0 when the NVML query fails (the failure is
 *         logged through SUBDBG); (unsigned long long)-1 when which_one is
 *         neither selector.
 */
unsigned long long getUtilization( nvmlDevice_t dev, int which_one )
{
    /* Zero-initialised so a failed query yields 0, like the scalar getters
     * in this file, instead of reading indeterminate memory. */
    nvmlUtilization_t util = { 0 };
    nvmlReturn_t bad;
    bad = (*nvmlDeviceGetUtilizationRatesPtr)( dev, &util );

    if ( NVML_SUCCESS != bad ) {
        SUBDBG( "something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
    }

    switch (which_one) {
        case GPU_UTILIZATION:
            return (unsigned long long) util.gpu;
        case MEMORY_UTILIZATION:
            return (unsigned long long) util.memory;
        default:
            ;
    }

    return (unsigned long long) -1;
}

Here is the caller graph for this function:

static int linkCudaLibraries (  )  [static]

Definition at line 998 of file linux-nvml.c.

00999 {
01000     /* Attempt to guess if we were statically linked to libc, if so bail */
01001     if ( _dl_non_dynamic_init != NULL ) {
01002         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML component does not support statically linking of libc.", PAPI_MAX_STR_LEN);
01003         return PAPI_ENOSUPP;
01004     }
01005 
01006     /* Need to link in the cuda libraries, if not found disable the component */
01007     dl1 = dlopen("libcuda.so", RTLD_NOW | RTLD_GLOBAL);
01008     if (!dl1)
01009     {
01010         strncpy(_nvml_vector.cmp_info.disabled_reason, "CUDA library libcuda.so not found.",PAPI_MAX_STR_LEN);
01011         return ( PAPI_ENOSUPP );
01012     }
01013     cuInitPtr = dlsym(dl1, "cuInit");
01014     if (dlerror() != NULL)
01015     {
01016         strncpy(_nvml_vector.cmp_info.disabled_reason, "CUDA function cuInit not found.",PAPI_MAX_STR_LEN);
01017         return ( PAPI_ENOSUPP );
01018     }
01019 
01020     dl2 = dlopen("libcudart.so", RTLD_NOW | RTLD_GLOBAL);
01021     if (!dl2)
01022     {
01023         strncpy(_nvml_vector.cmp_info.disabled_reason, "CUDA runtime library libcudart.so not found.",PAPI_MAX_STR_LEN);
01024         return ( PAPI_ENOSUPP );
01025     }
01026     cudaGetDevicePtr = dlsym(dl2, "cudaGetDevice");
01027     if (dlerror() != NULL)
01028     {
01029         strncpy(_nvml_vector.cmp_info.disabled_reason, "CUDART function cudaGetDevice not found.",PAPI_MAX_STR_LEN);
01030         return ( PAPI_ENOSUPP );
01031     }
01032     cudaGetDeviceCountPtr = dlsym(dl2, "cudaGetDeviceCount");
01033     if (dlerror() != NULL)
01034     {
01035         strncpy(_nvml_vector.cmp_info.disabled_reason, "CUDART function cudaGetDeviceCount not found.",PAPI_MAX_STR_LEN);
01036         return ( PAPI_ENOSUPP );
01037     }
01038     cudaDeviceGetPCIBusIdPtr = dlsym(dl2, "cudaDeviceGetPCIBusId");
01039     if (dlerror() != NULL)
01040     {
01041         strncpy(_nvml_vector.cmp_info.disabled_reason, "CUDART function cudaDeviceGetPCIBusId not found.",PAPI_MAX_STR_LEN);
01042         return ( PAPI_ENOSUPP );
01043     }
01044 
01045     dl3 = dlopen("libnvidia-ml.so", RTLD_NOW | RTLD_GLOBAL);
01046     if (!dl3)
01047     {
01048         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML runtime library libnvidia-ml.so not found.",PAPI_MAX_STR_LEN);
01049         return ( PAPI_ENOSUPP );
01050     }
01051     nvmlDeviceGetClockInfoPtr = dlsym(dl3, "nvmlDeviceGetClockInfo");
01052     if (dlerror() != NULL)
01053     {
01054         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetClockInfo not found.",PAPI_MAX_STR_LEN);
01055         return ( PAPI_ENOSUPP );
01056     }
01057     nvmlErrorStringPtr = dlsym(dl3, "nvmlErrorString");
01058     if (dlerror() != NULL)
01059     {
01060         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlErrorString not found.",PAPI_MAX_STR_LEN);
01061         return ( PAPI_ENOSUPP );
01062     }
01063     nvmlDeviceGetDetailedEccErrorsPtr = dlsym(dl3, "nvmlDeviceGetDetailedEccErrors");
01064     if (dlerror() != NULL)
01065     {
01066         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetDetailedEccErrors not found.",PAPI_MAX_STR_LEN);
01067         return ( PAPI_ENOSUPP );
01068     }
01069     nvmlDeviceGetFanSpeedPtr = dlsym(dl3, "nvmlDeviceGetFanSpeed");
01070     if (dlerror() != NULL)
01071     {
01072         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetFanSpeed not found.",PAPI_MAX_STR_LEN);
01073         return ( PAPI_ENOSUPP );
01074     }
01075     nvmlDeviceGetMemoryInfoPtr = dlsym(dl3, "nvmlDeviceGetMemoryInfo");
01076     if (dlerror() != NULL)
01077     {
01078         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetMemoryInfo not found.",PAPI_MAX_STR_LEN);
01079         return ( PAPI_ENOSUPP );
01080     }
01081     nvmlDeviceGetPerformanceStatePtr = dlsym(dl3, "nvmlDeviceGetPerformanceState");
01082     if (dlerror() != NULL)
01083     {
01084         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetPerformanceState not found.",PAPI_MAX_STR_LEN);
01085         return ( PAPI_ENOSUPP );
01086     }
01087     nvmlDeviceGetPowerUsagePtr = dlsym(dl3, "nvmlDeviceGetPowerUsage");
01088     if (dlerror() != NULL)
01089     {
01090         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetPowerUsage not found.",PAPI_MAX_STR_LEN);
01091         return ( PAPI_ENOSUPP );
01092     }
01093     nvmlDeviceGetTemperaturePtr = dlsym(dl3, "nvmlDeviceGetTemperature");
01094     if (dlerror() != NULL)
01095     {
01096         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetTemperature not found.",PAPI_MAX_STR_LEN);
01097         return ( PAPI_ENOSUPP );
01098     }
01099     nvmlDeviceGetTotalEccErrorsPtr = dlsym(dl3, "nvmlDeviceGetTotalEccErrors");
01100     if (dlerror() != NULL)
01101     {
01102         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetTotalEccErrors not found.",PAPI_MAX_STR_LEN);
01103         return ( PAPI_ENOSUPP );
01104     }
01105     nvmlDeviceGetUtilizationRatesPtr = dlsym(dl3, "nvmlDeviceGetUtilizationRates");
01106     if (dlerror() != NULL)
01107     {
01108         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetUtilizationRates not found.",PAPI_MAX_STR_LEN);
01109         return ( PAPI_ENOSUPP );
01110     }
01111     nvmlDeviceGetHandleByIndexPtr = dlsym(dl3, "nvmlDeviceGetHandleByIndex");
01112     if (dlerror() != NULL)
01113     {
01114         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetHandleByIndex not found.",PAPI_MAX_STR_LEN);
01115         return ( PAPI_ENOSUPP );
01116     }
01117     nvmlDeviceGetPciInfoPtr = dlsym(dl3, "nvmlDeviceGetPciInfo");
01118     if (dlerror() != NULL)
01119     {
01120         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetPciInfo not found.",PAPI_MAX_STR_LEN);
01121         return ( PAPI_ENOSUPP );
01122     }
01123     nvmlDeviceGetNamePtr = dlsym(dl3, "nvmlDeviceGetName");
01124     if (dlerror() != NULL)
01125     {
01126         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetName not found.",PAPI_MAX_STR_LEN);
01127         return ( PAPI_ENOSUPP );
01128     }
01129     nvmlDeviceGetInforomVersionPtr = dlsym(dl3, "nvmlDeviceGetInforomVersion");
01130     if (dlerror() != NULL)
01131     {
01132         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetInforomVersion not found.",PAPI_MAX_STR_LEN);
01133         return ( PAPI_ENOSUPP );
01134     }
01135     nvmlDeviceGetEccModePtr = dlsym(dl3, "nvmlDeviceGetEccMode");
01136     if (dlerror() != NULL)
01137     {
01138         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetEccMode not found.",PAPI_MAX_STR_LEN);
01139         return ( PAPI_ENOSUPP );
01140     }
01141     nvmlInitPtr = dlsym(dl3, "nvmlInit");
01142     if (dlerror() != NULL)
01143     {
01144         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlInit not found.",PAPI_MAX_STR_LEN);
01145         return ( PAPI_ENOSUPP );
01146     }
01147     nvmlDeviceGetCountPtr = dlsym(dl3, "nvmlDeviceGetCount");
01148     if (dlerror() != NULL)
01149     {
01150         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetCount not found.",PAPI_MAX_STR_LEN);
01151         return ( PAPI_ENOSUPP );
01152     }
01153     nvmlShutdownPtr = dlsym(dl3, "nvmlShutdown");
01154     if (dlerror() != NULL)
01155     {
01156         strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlShutdown not found.",PAPI_MAX_STR_LEN);
01157         return ( PAPI_ENOSUPP );
01158     }
01159 
01160     return ( PAPI_OK );
01161 }

Here is the caller graph for this function:

static int nvml_hardware_read ( long long *  value,
int  which_one 
) [static]

Code that reads event values.

Definition at line 395 of file linux-nvml.c.

00397 {
00398         nvml_native_event_entry_t *entry;
00399         nvmlDevice_t handle;
00400         int cudaIdx = -1;
00401 
00402         entry = &nvml_native_table[which_one];
00403         *value = (long long) -1;
00404         /* replace entry->resources with the current cuda_device->nvml device */
00405         (*cudaGetDevicePtr)( &cudaIdx );
00406 
00407         if ( cudaIdx < 0 || cudaIdx > device_count )
00408             return PAPI_EINVAL;
00409 
00410         /* Make sure the device we are running on has the requested event */
00411         if ( !HAS_FEATURE( features[cudaIdx] , entry->type) ) 
00412                 return PAPI_EINVAL;
00413 
00414         handle = devices[cudaIdx];
00415 
00416         switch (entry->type) {
00417                 case FEATURE_CLOCK_INFO:
00418                         *value =  getClockSpeed(    handle, 
00419                                         (nvmlClockType_t)entry->options.clock );
00420                         break;
00421                 case FEATURE_ECC_LOCAL_ERRORS:
00422                         *value = getEccLocalErrors(     handle, 
00423                                         (nvmlEccBitType_t)entry->options.ecc_opts.bits, 
00424                                         (int)entry->options.ecc_opts.which_one);
00425                         break;
00426                 case FEATURE_FAN_SPEED:
00427                         *value = getFanSpeed( handle );
00428                         break;
00429                 case FEATURE_MAX_CLOCK:
00430                         *value = getMaxClockSpeed(  handle, 
00431                                         (nvmlClockType_t)entry->options.clock );
00432                         break;
00433                 case FEATURE_MEMORY_INFO:
00434                         *value = getMemoryInfo(     handle, 
00435                                         (int)entry->options.which_one );
00436                         break;
00437                 case FEATURE_PERF_STATES:
00438                         *value = getPState( handle );
00439                         break;
00440                 case FEATURE_POWER:
00441                         *value = getPowerUsage( handle );
00442                         break;
00443                 case FEATURE_TEMP:
00444                         *value = getTemperature( handle );
00445                         break;
00446                 case FEATURE_ECC_TOTAL_ERRORS:
00447                         *value = getTotalEccErrors(     handle, 
00448                                         (nvmlEccBitType_t)entry->options.ecc_opts.bits );
00449                         break;
00450                 case FEATURE_UTILIZATION:
00451                         *value = getUtilization(    handle, 
00452                                         (int)entry->options.which_one );
00453                         break;
00454                 default:
00455                         return PAPI_EINVAL;
00456         }
00457 
00458         return PAPI_OK;
00459 
00460 
00461 }

Here is the call graph for this function:

Here is the caller graph for this function:

/**
 * Reset hook for the component's counters; intentionally does nothing.
 *
 * (Definition at line 380 of file linux-nvml.c.)
 *
 * The nvmlDeviceSet* and nvmlDeviceClear* calls require root/admin access.
 * A reset of the ECC counters would be possible by calling
 * nvmlDeviceClearEccErrorCounts with NVML_VOLATILE_ECC on each of the
 * device_count devices, but because of that privilege requirement it is
 * not attempted.
 */
static void nvml_hardware_reset( )
{
    /* Nothing to do: see the note above on root/admin access. */
}

Here is the caller graph for this function:


Variable Documentation

void(* _dl_non_dynamic_init)(void)

Holds control flags. Usually there is one of these per event set, and it usually carries out-of-band configuration of the hardware.

Copy of counts; holds results when stopped.

Definition at line 39 of file linux-nvml.c.

00131 {
00132         int num_events;
00133         int which_counter[NVML_MAX_COUNTERS];
00134         long long counter[NVML_MAX_COUNTERS];   
00135 } nvml_control_state_t;

Vector that points to entry points for our component

Definition at line 1527 of file linux-nvml.c.

int device_count = 0 [static]

Number of devices detected at component_init time

Definition at line 147 of file linux-nvml.c.

nvmlDevice_t* devices = NULL [static]

Definition at line 152 of file linux-nvml.c.

int* features = NULL [static]

Definition at line 153 of file linux-nvml.c.

int num_events = 0 [static]

number of events in the table

Definition at line 150 of file linux-nvml.c.

Definition at line 135 of file linux-nvml.c.

This table contains the native events

Definition at line 144 of file linux-nvml.c.


Generated on 17 Nov 2016 for PAPI by  doxygen 1.6.1