PAPI 7.1.0.0
Loading...
Searching...
No Matches
linux-nvml.c File Reference

This is an NVML component; it demonstrates the component interface and implements a number of counters from the NVIDIA Management Library. Please refer to the NVML documentation for details about nvmlDeviceGetPowerUsage and nvmlDeviceGetTemperature. Power is reported in mW and temperature in degrees Celsius. The counter descriptions should contain the units that the measurement returns. More...

Include dependency graph for linux-nvml.c:

Go to the source code of this file.

Data Structures

struct  nvml_context_t
 

Macros

#define DECLDIR   __attribute__((weak))
 
#define NVML_MAX_COUNTERS   100
 
#define DO_SOME_CHECKING(vectorp)
 

Functions

unsigned long long getClockSpeed (nvmlDevice_t dev, nvmlClockType_t which_one)
 
unsigned long long getEccLocalErrors (nvmlDevice_t dev, nvmlEccBitType_t bits, int which_one)
 
unsigned long long getFanSpeed (nvmlDevice_t dev)
 
unsigned long long getMaxClockSpeed (nvmlDevice_t dev, nvmlClockType_t which_one)
 
unsigned long long getMemoryInfo (nvmlDevice_t dev, int which_one)
 
unsigned long long getPState (nvmlDevice_t dev)
 
unsigned long long getPowerUsage (nvmlDevice_t dev)
 
unsigned long long getTemperature (nvmlDevice_t dev)
 
unsigned long long getTotalEccErrors (nvmlDevice_t dev, nvmlEccBitType_t bits)
 
unsigned long long getUtilization (nvmlDevice_t dev, int which_one)
 
unsigned long long getPowerManagementLimit (nvmlDevice_t dev)
 
static int _papi_nvml_init_private (void)
 
static int _nvml_check_n_initialize (papi_vector_t *vector)
 
static void nvml_hardware_reset ()
 
static int nvml_hardware_read (long long *value, int which_one)
 
static int nvml_hardware_write (long long *value, int which_one)
 
int _papi_nvml_init_thread (hwd_context_t *ctx)
 
static int detectDevices ()
 
static void createNativeEvents ()
 
int _papi_nvml_shutdown_component ()
 
static int _papi_nvml_init_component (int cidx)
 
static int linkCudaLibraries ()
 
int _papi_nvml_init_control_state (hwd_control_state_t *ctl)
 
int _papi_nvml_update_control_state (hwd_control_state_t *ctl, NativeInfo_t *native, int count, hwd_context_t *ctx)
 
int _papi_nvml_start (hwd_context_t *ctx, hwd_control_state_t *ctl)
 
int _papi_nvml_stop (hwd_context_t *ctx, hwd_control_state_t *ctl)
 
int _papi_nvml_read (hwd_context_t *ctx, hwd_control_state_t *ctl, long long **events, int flags)
 
int _papi_nvml_write (hwd_context_t *ctx, hwd_control_state_t *ctl, long long *events)
 
int _papi_nvml_reset (hwd_context_t *ctx, hwd_control_state_t *ctl)
 
int _papi_nvml_shutdown_thread (hwd_context_t *ctx)
 
int _papi_nvml_ctl (hwd_context_t *ctx, int code, _papi_int_option_t *option)
 
int _papi_nvml_set_domain (hwd_control_state_t *cntrl, int domain)
 
int _papi_nvml_ntv_enum_events (unsigned int *EventCode, int modifier)
 
int _papi_nvml_ntv_code_to_name (unsigned int EventCode, char *name, int len)
 
int _papi_nvml_ntv_code_to_descr (unsigned int EventCode, char *descr, int len)
 
int _papi_nvml_ntv_code_to_info (unsigned int EventCode, PAPI_event_info_t *info)
 

Variables

void(* _dl_non_dynamic_init )(void)
 
 nvml_control_state_t
 
static nvml_native_event_entry_t * nvml_native_table = NULL
 
static int * nvml_dev_id_table = NULL
 
static int device_count = 0
 
static int num_events = 0
 
static nvmlDevice_t * devices = NULL
 
static int * features = NULL
 
static unsigned int * power_management_initial_limit = NULL
 
static unsigned int * power_management_limit_constraint_min = NULL
 
static unsigned int * power_management_limit_constraint_max = NULL
 
papi_vector_t _nvml_vector
 

Detailed Description

Author
Kiran Kumar Kasichayanula kkasi.nosp@m.cha@.nosp@m.utk.e.nosp@m.du
James Ralph ralph.nosp@m.@eec.nosp@m.s.utk.nosp@m..edu

Definition in file linux-nvml.c.

Macro Definition Documentation

◆ DECLDIR

#define DECLDIR   __attribute__((weak))

◆ DO_SOME_CHECKING

#define DO_SOME_CHECKING (   vectorp)
Value:
do { \
int err = _nvml_check_n_initialize(vectorp); \
if (PAPI_OK != err) return err; \
} while(0)
#define PAPI_OK
Definition: f90papi.h:73
static int _nvml_check_n_initialize(papi_vector_t *vector)
Definition: linux-nvml.c:404

Definition at line 411 of file linux-nvml.c.

◆ NVML_MAX_COUNTERS

#define NVML_MAX_COUNTERS   100

Function Documentation

◆ _nvml_check_n_initialize()

static int _nvml_check_n_initialize ( papi_vector_t * vector)
static

Definition at line 404 of file linux-nvml.c.

405{
406 if (!vector->cmp_info.initialized)
408 return PAPI_OK;
409}
static int _papi_nvml_init_private(void)
Definition: linux-nvml.c:1125
PAPI_component_info_t cmp_info
Definition: papi_vector.h:20
Here is the call graph for this function:

◆ _papi_nvml_ctl()

int _papi_nvml_ctl ( hwd_context_t * ctx,
int  code,
_papi_int_option_t * option
)

This function sets various options in the component

Parameters
code: valid values are PAPI_SET_DEFDOM, PAPI_SET_DOMAIN, PAPI_SETDEFGRN, PAPI_SET_GRANUL and PAPI_SET_INHERIT

Definition at line 1570 of file linux-nvml.c.

1571{
1572 SUBDBG("Enter: ctx: %p, code: %d\n", ctx, code);
1573
1574 (void) ctx;
1575 (void) code;
1576 (void) option;
1577
1578 /* FIXME. This should maybe set up more state, such as which counters are active and */
1579 /* counter mappings. */
1580
1581 return PAPI_OK;
1582}
#define SUBDBG(format, args...)
Definition: papi_debug.h:64

◆ _papi_nvml_init_component()

static int _papi_nvml_init_component ( int  cidx)
static

Initialize hardware counters, set up the function vector table and get hardware information. This routine is called when the PAPI process is initialized (i.e., PAPI_library_init).

Definition at line 1105 of file linux-nvml.c.

1106{
1107 SUBDBG("Entry: cidx: %d\n", cidx);
1108 /* Export the total number of events available */
1110
1111 /* Export the component id */
1113
1114 /* Export the number of 'counters' */
1117
1119 "Not initialized. Access component events to initialize it.");
1121
1122 return PAPI_EDELAY_INIT;
1123}
#define PAPI_EDELAY_INIT
Definition: f90papi.h:271
papi_vector_t _nvml_vector
Definition: linux-nvml.c:1740
static int cidx
char disabled_reason[PAPI_HUGE_STR_LEN]
Definition: papi.h:634

◆ _papi_nvml_init_control_state()

int _papi_nvml_init_control_state ( hwd_control_state_t * ctl)

Setup a counter control state. In general a control state holds the hardware info for an EventSet.

Definition at line 1408 of file linux-nvml.c.

1409{
1410 SUBDBG("nvml_init_control_state... %p\n", ctl);
1412 nvml_control_state_t *nvml_ctl = (nvml_control_state_t *) ctl;
1413 memset(nvml_ctl, 0, sizeof(nvml_control_state_t));
1414
1415 return PAPI_OK;
1416}
#define DO_SOME_CHECKING(vectorp)
Definition: linux-nvml.c:411
nvml_control_state_t
Definition: linux-nvml.c:129

◆ _papi_nvml_init_private()

int _papi_nvml_init_private ( void  )
static

Definition at line 1125 of file linux-nvml.c.

1126{
1127 nvmlReturn_t ret;
1128 int err = PAPI_OK;
1129 unsigned int nvml_count = 0;
1130
1132 if (_nvml_vector.cmp_info.initialized) goto nvml_init_private_exit;
1133
1134 SUBDBG("Private init with component idx: %d\n", _nvml_vector.cmp_info.CmpIdx);
1135 /* link in the NVML libraries and resolve the symbols we need to use */
1136 if (linkCudaLibraries() != PAPI_OK) {
1137 SUBDBG("Dynamic link of CUDA libraries failed, component will be disabled.\n");
1138 SUBDBG("See disable reason in papi_component_avail output for more details.\n");
1139 _papi_nvml_shutdown_component(); // clean up any open dynLibs, mallocs, etc.
1140 err = (PAPI_ENOSUPP);
1141 goto nvml_init_private_exit;
1142 }
1143
1144 ret = (*nvmlInitPtr)();
1145 if (NVML_SUCCESS != ret) {
1146 strcpy(_nvml_vector.cmp_info.disabled_reason, "The NVIDIA management library failed to initialize.");
1147 _papi_nvml_shutdown_component(); // clean up any open dynLibs, mallocs, etc.
1148 err = PAPI_ENOSUPP;
1149 goto nvml_init_private_exit;
1150 }
1151
1152 /* Figure out the number of CUDA devices in the system */
1153 ret = (*nvmlDeviceGetCountPtr)(&nvml_count);
1154 if (NVML_SUCCESS != ret) {
1155 strcpy(_nvml_vector.cmp_info.disabled_reason, "Unable to get a count of devices from the NVIDIA management library.");
1156 _papi_nvml_shutdown_component(); // clean up any open dynLibs, mallocs, etc.
1157 err = PAPI_ENOSUPP;
1158 goto nvml_init_private_exit;
1159 }
1160
1161 device_count = nvml_count;
1162 SUBDBG("Need to setup NVML with %d devices\n", device_count);
1163
1164 /* A per device representation of what events are present */
1165 features = (int*)papi_malloc(sizeof(int) * device_count);
1166 if (features == NULL) {
1168 "%s failed to alloc %lu bytes for features.", __func__, sizeof(int)*device_count);
1169 _nvml_vector.cmp_info.disabled_reason[PAPI_MAX_STR_LEN-1]=0; // force null termination.
1170 _papi_nvml_shutdown_component(); // clean up any open dynLibs, mallocs, etc.
1171 err = PAPI_ENOMEM;
1172 goto nvml_init_private_exit;
1173 }
1174
1175 /* Handles to each device */
1176 devices = (nvmlDevice_t*)papi_malloc(sizeof(nvmlDevice_t) * device_count);
1177 if (devices == NULL) {
1179 "%s failed to alloc %lu bytes for features.", __func__, (sizeof(nvmlDevice_t) * device_count));
1180 _nvml_vector.cmp_info.disabled_reason[PAPI_MAX_STR_LEN-1]=0; // force null termination.
1181 _papi_nvml_shutdown_component(); // clean up any open dynLibs, mallocs, etc.
1182 err = PAPI_ENOMEM;
1183 goto nvml_init_private_exit;
1184 }
1185
1186 /* For each device, store the intial power value to enable reset if power is altered */
1187 power_management_initial_limit = (unsigned int*)papi_malloc(sizeof(unsigned int) * device_count);
1188 if (power_management_initial_limit == NULL) {
1190 "%s failed to alloc %lu bytes for power_management_initial_limit.", __func__, (sizeof(unsigned int) * device_count));
1191 _nvml_vector.cmp_info.disabled_reason[PAPI_MAX_STR_LEN-1]=0; // force null termination.
1192 _papi_nvml_shutdown_component(); // clean up any open dynLibs, mallocs, etc.
1193 err = PAPI_ENOMEM;
1194 goto nvml_init_private_exit;
1195 }
1196 power_management_limit_constraint_min = (unsigned int*)papi_malloc(sizeof(unsigned int) * device_count);
1199 "%s failed to alloc %lu bytes for power_management_limit_constraint_min.", __func__, (sizeof(unsigned int) * device_count));
1200 _nvml_vector.cmp_info.disabled_reason[PAPI_MAX_STR_LEN-1]=0; // force null termination.
1201 _papi_nvml_shutdown_component(); // clean up any open dynLibs, mallocs, etc.
1202 err = PAPI_ENOMEM;
1203 goto nvml_init_private_exit;
1204 }
1205 power_management_limit_constraint_max = (unsigned int*)papi_malloc(sizeof(unsigned int) * device_count);
1208 "%s failed to alloc %lu bytes for power_management_limit_constraint_max.", __func__, (sizeof(unsigned int) * device_count));
1209 _nvml_vector.cmp_info.disabled_reason[PAPI_MAX_STR_LEN-1]=0; // force null termination.
1210 _papi_nvml_shutdown_component(); // clean up any open dynLibs, mallocs, etc.
1211 err = PAPI_ENOMEM;
1212 goto nvml_init_private_exit;
1213 }
1214
1215 /* Figure out what events are supported on each card. */
1216 if (detectDevices() != PAPI_OK) {
1217 sprintf(_nvml_vector.cmp_info.disabled_reason, "An error occured in device feature detection, please check your NVIDIA Management Library and CUDA install.");
1218 _papi_nvml_shutdown_component(); // clean up any open dynLibs, mallocs, etc.
1219 err = PAPI_ENOSUPP;
1220 goto nvml_init_private_exit;
1221 }
1222
1223 /* The assumption is that if everything went swimmingly in detectDevices,
1224 all nvml calls here should be fine. */
1226
1227 /* Export the total number of events available */
1229
1230 /* Export the number of 'counters' */
1233
1234nvml_init_private_exit:
1237
1239
1240 return err;
1241}
Lock one of two mutex variables defined in papi.h.
Unlock one of the mutex variables defined in papi.h.
#define PAPI_ENOSUPP
Definition: f90papi.h:244
#define PAPI_MAX_STR_LEN
Definition: f90papi.h:77
#define PAPI_ENOMEM
Definition: f90papi.h:16
static int detectDevices()
Definition: linux-nvml.c:593
static int num_events
Definition: linux-nvml.c:144
static int * features
Definition: linux-nvml.c:147
int _papi_nvml_shutdown_component()
Definition: linux-nvml.c:1076
static unsigned int * power_management_initial_limit
Definition: linux-nvml.c:148
static int device_count
Definition: linux-nvml.c:141
static nvmlDevice_t * devices
Definition: linux-nvml.c:146
static int linkCudaLibraries()
Definition: linux-nvml.c:1250
static unsigned int * power_management_limit_constraint_min
Definition: linux-nvml.c:149
static void createNativeEvents()
Definition: linux-nvml.c:751
static unsigned int * power_management_limit_constraint_max
Definition: linux-nvml.c:150
#define COMPONENT_LOCK
Definition: papi_internal.h:90
#define papi_malloc(a)
Definition: papi_memory.h:34
Here is the call graph for this function:
Here is the caller graph for this function:

◆ _papi_nvml_init_thread()

int _papi_nvml_init_thread ( hwd_context_t * ctx)

This is called whenever a thread is initialized

Definition at line 583 of file linux-nvml.c.

584{
585 (void) ctx;
586
587 SUBDBG("Enter: ctx: %p\n", ctx);
588
589 return PAPI_OK;
590}

◆ _papi_nvml_ntv_code_to_descr()

int _papi_nvml_ntv_code_to_descr ( unsigned int  EventCode,
char *  descr,
int  len 
)

Takes a native event code and passes back the event description

Parameters
EventCode is the native event code
descr is a pointer for the description to be copied to
len is the size of the descr string

Definition at line 1701 of file linux-nvml.c.

1702{
1703 int index;
1704 index = EventCode;
1705
1706 if (index >= num_events) return PAPI_ENOEVNT;
1707
1708 strncpy(descr, nvml_native_table[index].description, len);
1709
1710 return PAPI_OK;
1711}
#define PAPI_ENOEVNT
Definition: f90papi.h:139
static nvml_native_event_entry_t * nvml_native_table
Definition: linux-nvml.c:137
char * descr

◆ _papi_nvml_ntv_code_to_info()

int _papi_nvml_ntv_code_to_info ( unsigned int  EventCode,
PAPI_event_info_t * info
)

Takes a native event code and passes back the event info

Parameters
EventCode is the native event code
info is a pointer for the info to be copied to

Definition at line 1718 of file linux-nvml.c.

1719{
1720
1721 int index = EventCode;
1722
1723 if ((index < 0) || (index >= num_events)) return PAPI_ENOEVNT;
1724
1725 strncpy(info->symbol, nvml_native_table[index].name, sizeof(info->symbol) - 1);
1726 info->symbol[sizeof(info->symbol) - 1] = '\0';
1727
1728 strncpy(info->units, nvml_native_table[index].units, sizeof(info->units) - 1);
1729 info->units[sizeof(info->units) - 1] = '\0';
1730
1731 strncpy(info->long_descr, nvml_native_table[index].description, sizeof(info->long_descr) - 1);
1732 info->long_descr[sizeof(info->long_descr) - 1] = '\0';
1733
1734// info->data_type = nvml_native_table[index].return_type;
1735
1736 return PAPI_OK;
1737}
char units[PAPI_MIN_STR_LEN]
Definition: papi.h:969
char symbol[PAPI_HUGE_STR_LEN]
Definition: papi.h:960
char long_descr[PAPI_HUGE_STR_LEN]
Definition: papi.h:963
char units[PAPI_MIN_STR_LEN]
Definition: linux-nvml.h:51
char description[PAPI_MAX_STR_LEN]
Definition: linux-nvml.h:52
char name[PAPI_MAX_STR_LEN]
Definition: linux-nvml.h:50

◆ _papi_nvml_ntv_code_to_name()

int _papi_nvml_ntv_code_to_name ( unsigned int  EventCode,
char *  name,
int  len 
)

Takes a native event code and passes back the name

Parameters
EventCode is the native event code
name is a pointer for the name to be copied to
len is the size of the name string

Definition at line 1678 of file linux-nvml.c.

1679{
1680 SUBDBG("Entry: EventCode: %#x, name: %s, len: %d\n", EventCode, name, len);
1681 int index;
1682
1684
1685 index = EventCode;
1686
1687 /* Make sure we are in range */
1688 if (index >= num_events) return PAPI_ENOEVNT;
1689
1690 strncpy(name, nvml_native_table[index].name, len);
1691
1692 return PAPI_OK;
1693}
const char * name
Definition: rocs.c:225

◆ _papi_nvml_ntv_enum_events()

int _papi_nvml_ntv_enum_events ( unsigned int * EventCode,
int  modifier 
)

Enumerate Native Events

Parameters
EventCode is the event of interest
modifier is one of PAPI_ENUM_FIRST or PAPI_ENUM_EVENTS. If your component has attribute masks, then these need to be handled here as well.

Definition at line 1635 of file linux-nvml.c.

1636{
1637 int index;
1638
1640
1641 switch (modifier) {
1642
1643 /* return EventCode of first event */
1644 case PAPI_ENUM_FIRST:
1645 /* return the first event that we support */
1646
1647 *EventCode = 0;
1648 return PAPI_OK;
1649
1650 /* return EventCode of next available event */
1651 case PAPI_ENUM_EVENTS:
1652 index = *EventCode;
1653
1654 /* Make sure we are in range */
1655 if (index < num_events - 1) {
1656
1657 /* This assumes a non-sparse mapping of the events */
1658 *EventCode = *EventCode + 1;
1659 return PAPI_OK;
1660 } else {
1661 return PAPI_ENOEVNT;
1662 }
1663 break;
1664
1665 default:
1666 return PAPI_EINVAL;
1667 }
1668
1669 return PAPI_EINVAL;
1670}
#define PAPI_ENUM_EVENTS
Definition: f90papi.h:224
#define PAPI_ENUM_FIRST
Definition: f90papi.h:85
#define PAPI_EINVAL
Definition: f90papi.h:115

◆ _papi_nvml_read()

int _papi_nvml_read ( hwd_context_t * ctx,
hwd_control_state_t * ctl,
long long **  events,
int  flags 
)

Triggered by PAPI_read()

Definition at line 1488 of file linux-nvml.c.

1490{
1491 SUBDBG("Enter: ctx: %p, flags: %d\n", ctx, flags);
1492
1493 (void) ctx;
1494 (void) flags;
1495 int i;
1496 int ret;
1497 nvml_control_state_t* nvml_ctl = (nvml_control_state_t*) ctl;
1498
1499 for (i = 0; i < nvml_ctl->num_events; i++) {
1500 if (PAPI_OK !=
1501 (ret = nvml_hardware_read(&nvml_ctl->counter[i],
1502 nvml_ctl->which_counter[i])))
1503 return ret;
1504
1505 }
1506 /* return pointer to the values we read */
1507 *events = nvml_ctl->counter;
1508 return PAPI_OK;
1509}
int i
char events[MAX_EVENTS][BUFSIZ]
static int nvml_hardware_read(long long *value, int which_one)
Definition: linux-nvml.c:450
Here is the call graph for this function:

◆ _papi_nvml_reset()

int _papi_nvml_reset ( hwd_context_t * ctx,
hwd_control_state_t * ctl
)

Triggered by PAPI_reset() but only if the EventSet is currently running

Definition at line 1540 of file linux-nvml.c.

1541{
1542 SUBDBG("Enter: ctx: %p, ctl: %p\n", ctx, ctl);
1543
1544 (void) ctx;
1545 (void) ctl;
1546
1547 /* Reset the hardware */
1549
1550 return PAPI_OK;
1551}
static void nvml_hardware_reset()
Definition: linux-nvml.c:417
Here is the call graph for this function:

◆ _papi_nvml_set_domain()

int _papi_nvml_set_domain ( hwd_control_state_t * cntrl,
int  domain 
)

This function has to set the bits needed to count different domains. In particular: PAPI_DOM_USER, PAPI_DOM_KERNEL and PAPI_DOM_OTHER. By default, return PAPI_EINVAL if none of those are specified and PAPI_OK on success. PAPI_DOM_USER: only user context is counted. PAPI_DOM_KERNEL: only the kernel/OS context is counted. PAPI_DOM_OTHER: exception/transient mode (like user TLB misses). PAPI_DOM_ALL: all of the domains.

Definition at line 1594 of file linux-nvml.c.

1595{
1596 SUBDBG("Enter: cntrl: %p, domain: %d\n", cntrl, domain);
1597
1598 (void) cntrl;
1599
1600 int found = 0;
1601
1602 if (PAPI_DOM_USER & domain) {
1603 SUBDBG(" PAPI_DOM_USER \n");
1604 found = 1;
1605 }
1606 if (PAPI_DOM_KERNEL & domain) {
1607 SUBDBG(" PAPI_DOM_KERNEL \n");
1608 found = 1;
1609 }
1610 if (PAPI_DOM_OTHER & domain) {
1611 SUBDBG(" PAPI_DOM_OTHER \n");
1612 found = 1;
1613 }
1614 if (PAPI_DOM_ALL & domain) {
1615 SUBDBG(" PAPI_DOM_ALL \n");
1616 found = 1;
1617 }
1618 if (!found)
1619 return (PAPI_EINVAL);
1620
1621 return PAPI_OK;
1622}
#define PAPI_DOM_USER
Definition: f90papi.h:174
#define PAPI_DOM_OTHER
Definition: f90papi.h:21
#define PAPI_DOM_KERNEL
Definition: f90papi.h:254
#define PAPI_DOM_ALL
Definition: f90papi.h:261

◆ _papi_nvml_shutdown_component()

int _papi_nvml_shutdown_component ( )

Definition at line 1076 of file linux-nvml.c.

1077{
1078 SUBDBG("Enter:\n");
1082 if (devices != NULL) papi_free(devices);
1083 if (features != NULL) papi_free(features);
1087 if (nvmlShutdownPtr) (*nvmlShutdownPtr)(); // Call nvml shutdown if we got that far.
1088
1089 device_count = 0;
1090 num_events = 0;
1091
1092 // close the dynamic libraries needed by this component (opened in the init component call)
1093 if (dl3) {dlclose(dl3); dl3=NULL;}
1094
1095 return PAPI_OK;
1096}
nvmlReturn_t(* nvmlShutdownPtr)(void)
Definition: benchSANVML.c:84
static void * dl3
Definition: benchSANVML.c:115
static int * nvml_dev_id_table
Definition: linux-nvml.c:138
#define papi_free(a)
Definition: papi_memory.h:35
Here is the call graph for this function:
Here is the caller graph for this function:

◆ _papi_nvml_shutdown_thread()

int _papi_nvml_shutdown_thread ( hwd_context_t * ctx)

Called at thread shutdown

Definition at line 1555 of file linux-nvml.c.

1556{
1557 SUBDBG("Enter: ctx: %p\n", ctx);
1558
1559 (void) ctx;
1560
1561 /* Last chance to clean up thread */
1562
1563 return PAPI_OK;
1564}

◆ _papi_nvml_start()

int _papi_nvml_start ( hwd_context_t * ctx,
hwd_control_state_t * ctl
)

Triggered by PAPI_start()

Definition at line 1447 of file linux-nvml.c.

1448{
1449 SUBDBG("Enter: ctx: %p, ctl: %p\n", ctx, ctl);
1450
1451 (void) ctx;
1452 (void) ctl;
1453
1454 /* anything that would need to be set at counter start time */
1455
1456 /* reset */
1457 /* start the counting */
1458
1459 return PAPI_OK;
1460}

◆ _papi_nvml_stop()

int _papi_nvml_stop ( hwd_context_t * ctx,
hwd_control_state_t * ctl
)

Triggered by PAPI_stop()

Definition at line 1464 of file linux-nvml.c.

1465{
1466 SUBDBG("Enter: ctx: %p, ctl: %p\n", ctx, ctl);
1467
1468 int i;
1469 (void) ctx;
1470 (void) ctl;
1471 int ret;
1472
1473 nvml_control_state_t* nvml_ctl = (nvml_control_state_t*) ctl;
1474
1475 for (i = 0; i < nvml_ctl->num_events; i++) {
1476 if (PAPI_OK !=
1477 (ret = nvml_hardware_read(&nvml_ctl->counter[i],
1478 nvml_ctl->which_counter[i])))
1479 return ret;
1480
1481 }
1482
1483 return PAPI_OK;
1484}
Here is the call graph for this function:

◆ _papi_nvml_update_control_state()

int _papi_nvml_update_control_state ( hwd_control_state_t * ctl,
NativeInfo_t * native,
int  count,
hwd_context_t * ctx
)

Triggered by eventset operations like add or remove

Definition at line 1420 of file linux-nvml.c.

1424{
1425 SUBDBG("Enter: ctl: %p, ctx: %p\n", ctl, ctx);
1426 int i, index;
1427
1428 nvml_control_state_t *nvml_ctl = (nvml_control_state_t *) ctl;
1429 (void) ctx;
1430
1432 /* if no events, return */
1433 if (count == 0) return PAPI_OK;
1434
1435 for (i = 0; i < count; i++) {
1436 index = native[i].ni_event;
1437 nvml_ctl->which_counter[i] = index;
1438 /* We have no constraints on event position, so any event */
1439 /* can be in any slot. */
1440 native[i].ni_position = i;
1441 }
1442 nvml_ctl->num_events = count;
1443 return PAPI_OK;
1444}
static long count
static int native

◆ _papi_nvml_write()

int _papi_nvml_write ( hwd_context_t * ctx,
hwd_control_state_t * ctl,
long long * events
)

Triggered by PAPI_write(), but only if the counters are running

Definition at line 1514 of file linux-nvml.c.

1515{
1516 SUBDBG("Enter: ctx: %p, ctl: %p\n", ctx, ctl);
1517 (void) ctx;
1518 nvml_control_state_t* nvml_ctl = (nvml_control_state_t*) ctl;
1519 int i;
1520 int ret;
1521
1522 /* You can change ECC mode and compute exclusivity modes on the cards */
1523 /* But I don't see this as a function of a PAPI component at this time */
1524 /* All implementation issues aside. */
1525
1526 // Currently POWER_MANAGEMENT can be written
1527 for (i = 0; i < nvml_ctl->num_events; i++) {
1528 if (PAPI_OK != (ret = nvml_hardware_write(&events[i], nvml_ctl->which_counter[i])))
1529 return ret;
1530 }
1531
1532 /* return pointer to the values we read */
1533 return PAPI_OK;
1534}
static int nvml_hardware_write(long long *value, int which_one)
Definition: linux-nvml.c:532
Here is the call graph for this function:

◆ createNativeEvents()

static void createNativeEvents ( void  )
static

Definition at line 751 of file linux-nvml.c.

752{
754 char sanitized_name[PAPI_MIN_STR_LEN];
756
757 int i, nameLen = 0, j, devTableIdx = 0;
758
760 nvmlReturn_t ret;
761
765 entry = &nvml_native_table[0];
766 nvml_dev_id_table = (int*) papi_malloc(num_events*sizeof(int));
767
768 for (i = 0; i < device_count; i++) {
769 memset(names[i], 0x0, PAPI_MAX_STR_LEN);
770 ret = (*nvmlDeviceGetNamePtr)(devices[i], name, sizeof(name) - 1);
771 if (NVML_SUCCESS != ret) {
772 SUBDBG("nvmlDeviceGetName failed \n");
773 const char *name_unknown = "deviceNameUnknown";
774 strncpy(name, name_unknown, strlen(name_unknown) + 1);
775 }
776
777 nameLen = strlen(name);
778 strncpy(sanitized_name, name, PAPI_MIN_STR_LEN);
779
780 int retval = snprintf(sanitized_name, sizeof(name), "%s:device_%d", name, i);
781 if (retval > (int)sizeof(name)) {
782 SUBDBG("Device name is too long %s:device%d", name, i);
783 return;
784 }
785 sanitized_name[sizeof(name) - 1] = '\0';
786
787 for (j = 0; j < nameLen; j++)
788 if (' ' == sanitized_name[j])
789 sanitized_name[j] = '_';
790
792 sprintf(entry->name, "%s:graphics_clock", sanitized_name);
793 strncpy(entry->description, "Graphics clock domain (MHz).", PAPI_MAX_STR_LEN);
794 entry->options.clock = NVML_CLOCK_GRAPHICS;
795 entry->type = FEATURE_CLOCK_INFO;
796 entry++;
797 nvml_dev_id_table[devTableIdx] = i;
798 devTableIdx++;
799
800 sprintf(entry->name, "%s:sm_clock", sanitized_name);
801 strncpy(entry->description, "SM clock domain (MHz).", PAPI_MAX_STR_LEN);
802 entry->options.clock = NVML_CLOCK_SM;
803 entry->type = FEATURE_CLOCK_INFO;
804 entry++;
805 nvml_dev_id_table[devTableIdx] = i;
806 devTableIdx++;
807
808 sprintf(entry->name, "%s:memory_clock", sanitized_name);
809 strncpy(entry->description, "Memory clock domain (MHz).", PAPI_MAX_STR_LEN);
810 entry->options.clock = NVML_CLOCK_MEM;
811 entry->type = FEATURE_CLOCK_INFO;
812 entry++;
813 nvml_dev_id_table[devTableIdx] = i;
814 devTableIdx++;
815 }
816
818 sprintf(entry->name, "%s:l1_single_ecc_errors", sanitized_name);
819 strncpy(entry->description, "L1 cache single bit ECC", PAPI_MAX_STR_LEN);
820 entry->options.ecc_opts = (struct local_ecc) {
821 .bits = NVML_SINGLE_BIT_ECC,
822 .which_one = LOCAL_ECC_L1,
823 };
825 entry++;
826 nvml_dev_id_table[devTableIdx] = i;
827 devTableIdx++;
828
829 sprintf(entry->name, "%s:l2_single_ecc_errors", sanitized_name);
830 strncpy(entry->description, "L2 cache single bit ECC", PAPI_MAX_STR_LEN);
831 entry->options.ecc_opts = (struct local_ecc) {
832 .bits = NVML_SINGLE_BIT_ECC,
833 .which_one = LOCAL_ECC_L2,
834 };
836 entry++;
837 nvml_dev_id_table[devTableIdx] = i;
838 devTableIdx++;
839
840 sprintf(entry->name, "%s:memory_single_ecc_errors", sanitized_name);
841 strncpy(entry->description, "Device memory single bit ECC", PAPI_MAX_STR_LEN);
842 entry->options.ecc_opts = (struct local_ecc) {
843 .bits = NVML_SINGLE_BIT_ECC,
844 .which_one = LOCAL_ECC_MEM,
845 };
847 entry++;
848 nvml_dev_id_table[devTableIdx] = i;
849 devTableIdx++;
850
851 sprintf(entry->name, "%s:regfile_single_ecc_errors", sanitized_name);
852 strncpy(entry->description, "Register file single bit ECC", PAPI_MAX_STR_LEN);
853 entry->options.ecc_opts = (struct local_ecc) {
854 .bits = NVML_SINGLE_BIT_ECC,
855 .which_one = LOCAL_ECC_REGFILE,
856 };
858 entry++;
859 nvml_dev_id_table[devTableIdx] = i;
860 devTableIdx++;
861
862 sprintf(entry->name, "%s:1l_double_ecc_errors", sanitized_name);
863 strncpy(entry->description, "L1 cache double bit ECC", PAPI_MAX_STR_LEN);
864 entry->options.ecc_opts = (struct local_ecc) {
865 .bits = NVML_DOUBLE_BIT_ECC,
866 .which_one = LOCAL_ECC_L1,
867 };
869 entry++;
870 nvml_dev_id_table[devTableIdx] = i;
871 devTableIdx++;
872
873 sprintf(entry->name, "%s:l2_double_ecc_errors", sanitized_name);
874 strncpy(entry->description, "L2 cache double bit ECC", PAPI_MAX_STR_LEN);
875 entry->options.ecc_opts = (struct local_ecc) {
876 .bits = NVML_DOUBLE_BIT_ECC,
877 .which_one = LOCAL_ECC_L2,
878 };
880 entry++;
881 nvml_dev_id_table[devTableIdx] = i;
882 devTableIdx++;
883
884 sprintf(entry->name, "%s:memory_double_ecc_errors", sanitized_name);
885 strncpy(entry->description, "Device memory double bit ECC", PAPI_MAX_STR_LEN);
886 entry->options.ecc_opts = (struct local_ecc) {
887 .bits = NVML_DOUBLE_BIT_ECC,
888 .which_one = LOCAL_ECC_MEM,
889 };
891 entry++;
892 nvml_dev_id_table[devTableIdx] = i;
893 devTableIdx++;
894
895 sprintf(entry->name, "%s:regfile_double_ecc_errors", sanitized_name);
896 strncpy(entry->description, "Register file double bit ECC", PAPI_MAX_STR_LEN);
897 entry->options.ecc_opts = (struct local_ecc) {
898 .bits = NVML_DOUBLE_BIT_ECC,
899 .which_one = LOCAL_ECC_REGFILE,
900 };
902 entry++;
903 nvml_dev_id_table[devTableIdx] = i;
904 devTableIdx++;
905 }
906
908 sprintf(entry->name, "%s:fan_speed", sanitized_name);
909 strncpy(entry->description, "The fan speed expressed as a percent of the maximum, i.e. full speed is 100%", PAPI_MAX_STR_LEN);
910 entry->type = FEATURE_FAN_SPEED;
911 entry++;
912 nvml_dev_id_table[devTableIdx] = i;
913 devTableIdx++;
914 }
915
917 sprintf(entry->name, "%s:graphics_max_clock", sanitized_name);
918 strncpy(entry->description, "Maximal Graphics clock domain (MHz).", PAPI_MAX_STR_LEN);
919 entry->options.clock = NVML_CLOCK_GRAPHICS;
920 entry->type = FEATURE_MAX_CLOCK;
921 entry++;
922 nvml_dev_id_table[devTableIdx] = i;
923 devTableIdx++;
924
925 sprintf(entry->name, "%s:sm_max_clock", sanitized_name);
926 strncpy(entry->description, "Maximal SM clock domain (MHz).", PAPI_MAX_STR_LEN);
927 entry->options.clock = NVML_CLOCK_SM;
928 entry->type = FEATURE_MAX_CLOCK;
929 entry++;
930 nvml_dev_id_table[devTableIdx] = i;
931 devTableIdx++;
932
933 sprintf(entry->name, "%s:memory_max_clock", sanitized_name);
934 strncpy(entry->description, "Maximal Memory clock domain (MHz).", PAPI_MAX_STR_LEN);
935 entry->options.clock = NVML_CLOCK_MEM;
936 entry->type = FEATURE_MAX_CLOCK;
937 entry++;
938 nvml_dev_id_table[devTableIdx] = i;
939 devTableIdx++;
940 }
941
943 sprintf(entry->name, "%s:total_memory", sanitized_name);
944 strncpy(entry->description, "Total installed FB memory (in bytes).", PAPI_MAX_STR_LEN);
946 entry->type = FEATURE_MEMORY_INFO;
947 entry++;
948 nvml_dev_id_table[devTableIdx] = i;
949 devTableIdx++;
950
951 sprintf(entry->name, "%s:unallocated_memory", sanitized_name);
952 strncpy(entry->description, "Uncallocated FB memory (in bytes).", PAPI_MAX_STR_LEN);
954 entry->type = FEATURE_MEMORY_INFO;
955 entry++;
956 nvml_dev_id_table[devTableIdx] = i;
957 devTableIdx++;
958
959 sprintf(entry->name, "%s:allocated_memory", sanitized_name);
960 strncpy(entry->description, "Allocated FB memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping.", PAPI_MAX_STR_LEN);
962 entry->type = FEATURE_MEMORY_INFO;
963 entry++;
964 nvml_dev_id_table[devTableIdx] = i;
965 devTableIdx++;
966 }
967
969 sprintf(entry->name, "%s:pstate", sanitized_name);
970 strncpy(entry->description, "The performance state of the device.", PAPI_MAX_STR_LEN);
971 entry->type = FEATURE_PERF_STATES;
972 entry++;
973 nvml_dev_id_table[devTableIdx] = i;
974 devTableIdx++;
975 }
976
978 sprintf(entry->name, "%s:power", sanitized_name);
979 // set the power event units value to "mW" for miliwatts
980 strncpy(entry->units, "mW", PAPI_MIN_STR_LEN);
981 strncpy(entry->description, "Power usage reading for the device, in miliwatts. This is the power draw (+/-5 watts) for the entire board: GPU, memory, etc.", PAPI_MAX_STR_LEN);
982 entry->type = FEATURE_POWER;
983 entry++;
984 nvml_dev_id_table[devTableIdx] = i;
985 devTableIdx++;
986 }
987
989 sprintf(entry->name, "%s:temperature", sanitized_name);
990 strncpy(entry->description, "Current temperature readings for the device, in degrees C.", PAPI_MAX_STR_LEN);
991 entry->type = FEATURE_TEMP;
992 entry++;
993 nvml_dev_id_table[devTableIdx] = i;
994 devTableIdx++;
995 }
996
998 sprintf(entry->name, "%s:total_ecc_errors", sanitized_name);
999 strncpy(entry->description, "Total single bit errors.", PAPI_MAX_STR_LEN);
1000 entry->options.ecc_opts = (struct local_ecc) {
1001 .bits = NVML_SINGLE_BIT_ECC,
1002 };
1004 entry++;
1005
1006 sprintf(entry->name, "%s:total_ecc_errors", sanitized_name);
1007 strncpy(entry->description, "Total double bit errors.", PAPI_MAX_STR_LEN);
1008 entry->options.ecc_opts = (struct local_ecc) {
1009 .bits = NVML_DOUBLE_BIT_ECC,
1010 };
1012 entry++;
1013 nvml_dev_id_table[devTableIdx] = i;
1014 devTableIdx++;
1015 }
1016
1018 sprintf(entry->name, "%s:gpu_utilization", sanitized_name);
1019 strncpy(entry->description, "Percent of time over the past second during which one or more kernels was executing on the GPU.", PAPI_MAX_STR_LEN);
1021 entry->type = FEATURE_UTILIZATION;
1022 entry++;
1023 nvml_dev_id_table[devTableIdx] = i;
1024 devTableIdx++;
1025
1026 sprintf(entry->name, "%s:memory_utilization", sanitized_name);
1027 strncpy(entry->description, "Percent of time over the past second during which global (device) memory was being read or written.", PAPI_MAX_STR_LEN);
1029 entry->type = FEATURE_UTILIZATION;
1030 entry++;
1031 nvml_dev_id_table[devTableIdx] = i;
1032 devTableIdx++;
1033 }
1034
1036 sprintf(entry->name, "%s:power_management_limit", sanitized_name);
1037 // set the power event units value to "mW" for milliwatts
1038 strncpy(entry->units, "mW", PAPI_MIN_STR_LEN);
1039 strncpy(entry->description, "Power draw upper bound limit (in mW) for the device. Writable (with appropriate privileges) on supported Kepler or later.", PAPI_MAX_STR_LEN - 1);
1040 entry->description[PAPI_MAX_STR_LEN - 1] = '\0';
1042 entry++;
1043 nvml_dev_id_table[devTableIdx] = i;
1044 devTableIdx++;
1045 }
1047 sprintf(entry->name, "%s:power_management_limit_constraint_min", sanitized_name);
1048 strncpy(entry->units, "mW", PAPI_MIN_STR_LEN);
1049 strncpy(entry->description, "The minimum power management limit in milliwatts.", PAPI_MAX_STR_LEN);
1051 entry++;
1052 nvml_dev_id_table[devTableIdx] = i;
1053 devTableIdx++;
1054 }
1055
1057 sprintf(entry->name, "%s:power_management_limit_constraint_max", sanitized_name);
1058 strncpy(entry->units, "mW", PAPI_MIN_STR_LEN);
1059 strncpy(entry->description, "The maximum power management limit in milliwatts.", PAPI_MAX_STR_LEN);
1061 entry++;
1062 nvml_dev_id_table[devTableIdx] = i;
1063 devTableIdx++;
1064 }
1065
1066 strncpy(names[i], name, sizeof(names[0]) - 1);
1067 names[i][sizeof(names[0]) - 1] = '\0';
1068 }
1069} // create native events.
const char * names[NUM_EVENTS]
#define PAPI_MIN_STR_LEN
Definition: f90papi.h:208
#define MEMINFO_TOTAL_MEMORY
Definition: linux-nvml.h:22
#define FEATURE_CLOCK_INFO
Definition: linux-nvml.h:6
#define HAS_FEATURE(features, query)
Definition: linux-nvml.h:20
#define MEMINFO_UNALLOCED
Definition: linux-nvml.h:23
#define FEATURE_UTILIZATION
Definition: linux-nvml.h:15
#define FEATURE_NVML_POWER_MANAGEMENT_LIMIT_CONSTRAINT_MAX
Definition: linux-nvml.h:18
#define GPU_UTILIZATION
Definition: linux-nvml.h:31
#define FEATURE_ECC_TOTAL_ERRORS
Definition: linux-nvml.h:14
#define FEATURE_FAN_SPEED
Definition: linux-nvml.h:8
#define FEATURE_POWER
Definition: linux-nvml.h:12
#define FEATURE_MAX_CLOCK
Definition: linux-nvml.h:9
#define MEMINFO_ALLOCED
Definition: linux-nvml.h:24
#define LOCAL_ECC_L2
Definition: linux-nvml.h:28
#define FEATURE_TEMP
Definition: linux-nvml.h:13
#define FEATURE_NVML_POWER_MANAGEMENT_LIMIT_CONSTRAINT_MIN
Definition: linux-nvml.h:17
#define LOCAL_ECC_MEM
Definition: linux-nvml.h:29
#define MEMORY_UTILIZATION
Definition: linux-nvml.h:32
#define FEATURE_ECC_LOCAL_ERRORS
Definition: linux-nvml.h:7
#define LOCAL_ECC_L1
Definition: linux-nvml.h:27
#define FEATURE_MEMORY_INFO
Definition: linux-nvml.h:10
#define LOCAL_ECC_REGFILE
Definition: linux-nvml.h:26
#define FEATURE_PERF_STATES
Definition: linux-nvml.h:11
#define FEATURE_POWER_MANAGEMENT
Definition: linux-nvml.h:16
nvmlEccBitType_t bits
Definition: linux-nvml.h:38
Definition: linux-nvml.h:48
int type
Definition: linux-nvml.h:53
nvml_resource_options_t options
Definition: linux-nvml.h:49
nvmlClockType_t clock
Definition: linux-nvml.h:43
struct local_ecc ecc_opts
Definition: linux-nvml.h:44
int retval
Definition: zero_fork.c:53
Here is the caller graph for this function:

◆ detectDevices()

static int detectDevices ( )
static

Definition at line 593 of file linux-nvml.c.

594{
595 nvmlReturn_t ret;
596 nvmlEnableState_t mode = NVML_FEATURE_DISABLED;
597 nvmlEnableState_t pendingmode = NVML_FEATURE_DISABLED;
598
599 char name[64];
600 char inforomECC[16];
601 char names[device_count][64];
602
603 float ecc_version = 0.0;
604
605 int i = 0;
606
607 unsigned int temp = 0;
608
609 memset(names, 0x0, device_count * 64);
610
611 /* For each card, check what is queryable */
612 for (i = 0; i < device_count; i++) {
613 features[i] = 0;
614
615 ret = (*nvmlDeviceGetHandleByIndexPtr)(i, &devices[i]);
616 if (NVML_SUCCESS != ret) {
617 SUBDBG("nvmlDeviceGetHandleByIndex(%d, &devices[%d]) failed.\n", i, i);
618 return PAPI_ESYS;
619 }
620
621 ret = (*nvmlDeviceGetNamePtr)(devices[i], name, sizeof(name) - 1);
622 if (NVML_SUCCESS != ret) {
623 SUBDBG("nvmlDeviceGetName failed \n");
624 const char *name_unknown = "deviceNameUnknown";
625 strncpy(name, name_unknown, strlen(name_unknown) + 1);
626 }
627
628 ret = (*nvmlDeviceGetInforomVersionPtr)(devices[i], NVML_INFOROM_ECC, inforomECC, 16);
629 if (NVML_SUCCESS != ret) {
630 SUBDBG("nvmlGetInforomVersion fails %s\n", (*nvmlErrorStringPtr)(ret));
631 } else {
632 ecc_version = strtof(inforomECC, NULL);
633 }
634
635 if (getClockSpeed(devices[i], NVML_CLOCK_GRAPHICS) != (unsigned long long) - 1) {
637 num_events += 3;
638 }
639
640 /* For Tesla and Quadro products from Fermi and Kepler families.
641 requires NVML_INFOROM_ECC 2.0 or higher for location-based counts
642 requires NVML_INFOROM_ECC 1.0 or higher for all other ECC counts
643 requires ECC mode to be enabled. */
644 ret = (*nvmlDeviceGetEccModePtr)(devices[i], &mode, &pendingmode);
645 if (NVML_SUCCESS == ret) {
646 if (NVML_FEATURE_ENABLED == mode) {
647 if (ecc_version >= 2.0) {
649 num_events += 8; /* {single bit, two bit errors} x { reg, l1, l2, memory } */
650 }
651 if (ecc_version >= 1.0) {
653 num_events += 2; /* single bit errors, double bit errors */
654 }
655 }
656 } else {
657 SUBDBG("nvmlDeviceGetEccMode does not appear to be supported. (nvml return code %d)\n", ret);
658 }
659
660 /* Check if fan speed is available */
661 if (getFanSpeed(devices[i]) != (unsigned long long) - 1) {
663 num_events++;
664 }
665
666 /* Check if clock data are available */
667 if (getMaxClockSpeed(devices[i], NVML_CLOCK_GRAPHICS) != (unsigned long long) - 1) {
669 num_events += 3;
670 }
671
672 /* For all products */
674 num_events += 3; /* total, free, used */
675
676 /* Check if performance state is available */
677 if (getPState(devices[i]) != (unsigned long long) - 1) {
679 num_events++;
680 }
681
682 /* For "GF11x" Tesla and Quadro products from the Fermi family
683 requires NVML_INFOROM_POWER 3.0 or higher
684 For Tesla and Quadro products from the Kepler family
685 does not require NVML_INFOROM_POWER */
686 /* Just try reading power, if it works, enable it*/
687 ret = (*nvmlDeviceGetPowerUsagePtr)(devices[i], &temp);
688 if (NVML_SUCCESS == ret) {
690 num_events++;
691 } else {
692 SUBDBG("nvmlDeviceGetPowerUsage does not appear to be supported on this card. (nvml return code %d)\n", ret);
693 }
694
695 /* Check if temperature data are available */
696 if (getTemperature(devices[i]) != (unsigned long long) - 1) {
698 num_events++;
699 }
700
701 // For power_management_limit
702 {
703 // Just try the call to see if it works
704 unsigned int templimit = 0;
705 ret = (*nvmlDeviceGetPowerManagementLimitPtr)(devices[i], &templimit);
706 if (ret == NVML_SUCCESS && templimit > 0) {
709 num_events += 1;
710 } else {
712 SUBDBG("nvmlDeviceGetPowerManagementLimit not appear to be supported on this card. (NVML code %d)\n", ret);
713 }
714 }
715
716 // For power_management_limit_constraints, minimum and maximum
717 {
718 unsigned int minLimit = 0, maxLimit = 0;
719 ret = (*nvmlDeviceGetPowerManagementLimitConstraintsPtr)(devices[i], &minLimit, &maxLimit);
720 if (ret == NVML_SUCCESS) {
723 num_events += 1;
726 num_events += 1;
727 } else {
730 }
731 SUBDBG("Done nvmlDeviceGetPowerManagementLimitConstraintsPtr\n");
732 }
733
734 /* Check if utilization data are available */
735 if (getUtilization(devices[i], GPU_UTILIZATION) != (unsigned long long) - 1) {
737 num_events += 2;
738 }
739
740 int retval = snprintf(names[i], sizeof(name), "%s:device:%d", name, i);
741 if (retval > (int)sizeof(name)) {
742 SUBDBG("Device name is too long %s:device%d", name, i);
743 return (PAPI_EINVAL);
744 }
745 names[i][sizeof(name) - 1] = '\0';
746 }
747 return PAPI_OK;
748}
char *(* nvmlErrorStringPtr)(nvmlReturn_t)
Definition: benchSANVML.c:64
#define PAPI_ESYS
Definition: f90papi.h:136
unsigned long long getFanSpeed(nvmlDevice_t dev)
Definition: linux-nvml.c:195
unsigned long long getUtilization(nvmlDevice_t dev, int which_one)
Definition: linux-nvml.c:363
unsigned long long getClockSpeed(nvmlDevice_t dev, nvmlClockType_t which_one)
Definition: linux-nvml.c:153
unsigned long long getTemperature(nvmlDevice_t dev)
Definition: linux-nvml.c:332
unsigned long long getMaxClockSpeed(nvmlDevice_t dev, nvmlClockType_t which_one)
Definition: linux-nvml.c:209
unsigned long long getPState(nvmlDevice_t dev)
Definition: linux-nvml.c:248
Here is the call graph for this function:
Here is the caller graph for this function:

◆ getClockSpeed()

unsigned long long getClockSpeed ( nvmlDevice_t  dev,
nvmlClockType_t  which_one 
)

Definition at line 153 of file linux-nvml.c.

154{
155 unsigned int ret = 0;
156 nvmlReturn_t bad;
157 bad = (*nvmlDeviceGetClockInfoPtr)(dev, which_one, &ret);
158
159 if (NVML_SUCCESS != bad) {
160 SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
161 return (unsigned long long) - 1;
162 }
163
164 return (unsigned long long)ret;
165}
Here is the caller graph for this function:

◆ getEccLocalErrors()

unsigned long long getEccLocalErrors ( nvmlDevice_t  dev,
nvmlEccBitType_t  bits,
int  which_one 
)

Definition at line 168 of file linux-nvml.c.

169{
170 nvmlEccErrorCounts_t counts;
171
172 nvmlReturn_t bad;
173 bad = (*nvmlDeviceGetDetailedEccErrorsPtr)(dev, bits, NVML_VOLATILE_ECC , &counts);
174
175 if (NVML_SUCCESS != bad) {
176 SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
177 return (unsigned long long) - 1;
178 }
179 switch (which_one) {
181 return counts.registerFile;
182 case LOCAL_ECC_L1:
183 return counts.l1Cache;
184 case LOCAL_ECC_L2:
185 return counts.l2Cache;
186 case LOCAL_ECC_MEM:
187 return counts.deviceMemory;
188 default:
189 ;
190 }
191 return (unsigned long long) - 1;
192}
Here is the caller graph for this function:

◆ getFanSpeed()

unsigned long long getFanSpeed ( nvmlDevice_t  dev)

Definition at line 195 of file linux-nvml.c.

196{
197 unsigned int ret = 0;
198 nvmlReturn_t bad;
199 bad = (*nvmlDeviceGetFanSpeedPtr)(dev, &ret);
200
201 if (NVML_SUCCESS != bad) {
202 SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
203 return (unsigned long long) - 1;
204 }
205 return (unsigned long long)ret;
206}
Here is the caller graph for this function:

◆ getMaxClockSpeed()

unsigned long long getMaxClockSpeed ( nvmlDevice_t  dev,
nvmlClockType_t  which_one 
)

Definition at line 209 of file linux-nvml.c.

210{
211 unsigned int ret = 0;
212 nvmlReturn_t bad;
213 bad = (*nvmlDeviceGetClockInfoPtr)(dev, which_one, &ret);
214
215 if (NVML_SUCCESS != bad) {
216 SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
217 return (unsigned long long) - 1;
218 }
219 return (unsigned long long) ret;
220}
Here is the caller graph for this function:

◆ getMemoryInfo()

unsigned long long getMemoryInfo ( nvmlDevice_t  dev,
int  which_one 
)

Definition at line 223 of file linux-nvml.c.

224{
225 nvmlMemory_t meminfo;
226 nvmlReturn_t bad;
227 bad = (*nvmlDeviceGetMemoryInfoPtr)(dev, &meminfo);
228
229 if (NVML_SUCCESS != bad) {
230 SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
231 return (unsigned long long) - 1;
232 }
233
234 switch (which_one) {
236 return meminfo.total;
238 return meminfo.free;
239 case MEMINFO_ALLOCED:
240 return meminfo.used;
241 default:
242 ;
243 }
244 return (unsigned long long) - 1;
245}
Here is the caller graph for this function:

◆ getPowerManagementLimit()

unsigned long long getPowerManagementLimit ( nvmlDevice_t  dev)

Definition at line 386 of file linux-nvml.c.

387{
388 unsigned int limit;
389 nvmlReturn_t rv;
390 rv = (*nvmlDeviceGetPowerManagementLimitPtr)(dev, &limit);
391 if (NVML_SUCCESS != rv) {
392 SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(rv));
393 return (unsigned long long) 0;
394 }
395 return (unsigned long long) limit;
396}
Here is the caller graph for this function:

◆ getPowerUsage()

unsigned long long getPowerUsage ( nvmlDevice_t  dev)

Definition at line 318 of file linux-nvml.c.

319{
320 unsigned int power;
321 nvmlReturn_t bad;
322 bad = (*nvmlDeviceGetPowerUsagePtr)(dev, &power);
323
324 if (NVML_SUCCESS != bad) {
325 SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
326 return (unsigned long long) - 1;
327 }
328 return (unsigned long long) power;
329}
Here is the caller graph for this function:

◆ getPState()

unsigned long long getPState ( nvmlDevice_t  dev)

Definition at line 248 of file linux-nvml.c.

249{
250 unsigned int ret = 0;
251 nvmlPstates_t state = NVML_PSTATE_15;
252 nvmlReturn_t bad;
253 bad = (*nvmlDeviceGetPerformanceStatePtr)(dev, &state);
254
255 if (NVML_SUCCESS != bad) {
256 SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
257 return (unsigned long long) - 1;
258 }
259 switch (state) {
260 case NVML_PSTATE_15:
261 ret++;
262 // fall through
263 case NVML_PSTATE_14:
264 ret++;
265 // fall through
266 case NVML_PSTATE_13:
267 ret++;
268 // fall through
269 case NVML_PSTATE_12:
270 ret++;
271 // fall through
272 case NVML_PSTATE_11:
273 ret++;
274 // fall through
275 case NVML_PSTATE_10:
276 ret++;
277 // fall through
278 case NVML_PSTATE_9:
279 ret++;
280 // fall through
281 case NVML_PSTATE_8:
282 ret++;
283 // fall through
284 case NVML_PSTATE_7:
285 ret++;
286 // fall through
287 case NVML_PSTATE_6:
288 ret++;
289 // fall through
290 case NVML_PSTATE_5:
291 ret++;
292 // fall through
293 case NVML_PSTATE_4:
294 ret++;
295 // fall through
296 case NVML_PSTATE_3:
297 ret++;
298 // fall through
299 case NVML_PSTATE_2:
300 ret++;
301 // fall through
302 case NVML_PSTATE_1:
303 ret++;
304 // fall through
305 case NVML_PSTATE_0:
306 break;
307 // fall through
308 case NVML_PSTATE_UNKNOWN:
309 default:
310 /* This should never happen?
311 * The API docs just state Unknown performance state... */
312 return (unsigned long long) - 1;
313 }
314 return (unsigned long long)ret;
315}
bool state
Definition: papi_hl.c:155
Here is the caller graph for this function:

◆ getTemperature()

unsigned long long getTemperature ( nvmlDevice_t  dev)

Definition at line 332 of file linux-nvml.c.

333{
334 unsigned int ret = 0;
335 nvmlReturn_t bad;
336 bad = (*nvmlDeviceGetTemperaturePtr)(dev, NVML_TEMPERATURE_GPU, &ret);
337
338 if (NVML_SUCCESS != bad) {
339 SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
340 return (unsigned long long) - 1;
341 }
342 return (unsigned long long)ret;
343}
Here is the caller graph for this function:

◆ getTotalEccErrors()

unsigned long long getTotalEccErrors ( nvmlDevice_t  dev,
nvmlEccBitType_t  bits 
)

Definition at line 346 of file linux-nvml.c.

347{
348 unsigned long long counts = 0;
349 nvmlReturn_t bad;
350 bad = (*nvmlDeviceGetTotalEccErrorsPtr)(dev, bits, NVML_VOLATILE_ECC , &counts);
351
352 if (NVML_SUCCESS != bad) {
353 SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
354 return (unsigned long long) - 1;
355 }
356 return counts;
357}
Here is the caller graph for this function:

◆ getUtilization()

unsigned long long getUtilization ( nvmlDevice_t  dev,
int  which_one 
)

Definition at line 363 of file linux-nvml.c.

364{
365 nvmlUtilization_t util;
366 nvmlReturn_t bad;
367 bad = (*nvmlDeviceGetUtilizationRatesPtr)(dev, &util);
368
369 if (NVML_SUCCESS != bad) {
370 SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
371 return (unsigned long long) - 1;
372 }
373
374 switch (which_one) {
375 case GPU_UTILIZATION:
376 return (unsigned long long) util.gpu;
378 return (unsigned long long) util.memory;
379 default:
380 ;
381 }
382
383 return (unsigned long long) - 1;
384}
Here is the caller graph for this function:

◆ linkCudaLibraries()

static int linkCudaLibraries ( )
static

Definition at line 1250 of file linux-nvml.c.

1251{
1252 char path_lib[1024];
1253 /* Attempt to guess if we were statically linked to libc, if so bail */
1254 if (_dl_non_dynamic_init != NULL) {
1255 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML component does not support statically linking of libc.", PAPI_MAX_STR_LEN);
1256 return PAPI_ENOSUPP;
1257 }
1258
1259 // Need to link in the NVML libraries, if any not found disable the component.
1260 // getenv returns NULL if environment variable is not found.
1261 char *cuda_root = getenv("PAPI_CUDA_ROOT");
1262
1263 // We need the NVML main library, normally libnvidia-ml.so.
1264 dl3 = NULL; // Ensure reset to NULL.
1265
1266 // Step 1: Process override if given.
1267 if (strlen(nvml_main) > 0) { // If override given, it MUST work.
1268 dl3 = dlopen(nvml_main, RTLD_NOW | RTLD_GLOBAL); // Try to open that path.
1269 if (dl3 == NULL) {
1270 snprintf(_nvml_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "PAPI_NVML_MAIN override '%s' given in Rules.nvml not found.", nvml_main);
1271 return(PAPI_ENOSUPP); // Override given but not found.
1272 }
1273 }
1274
1275 // Step 2: Try system paths, will work with Spack, LD_LIBRARY_PATH, default paths.
1276 if (dl3 == NULL) { // If no override,
1277 dl3 = dlopen("libnvidia-ml.so", RTLD_NOW | RTLD_GLOBAL); // Try system paths.
1278 }
1279
1280 // Step 3: Try the explicit install default.
1281 if (dl3 == NULL && cuda_root != NULL) { // If ROOT given, it doesn't HAVE to work.
1282 snprintf(path_lib, 1024, "%s/lib64/libnvidia-ml.so", cuda_root); // PAPI Root check.
1283 dl3 = dlopen(path_lib, RTLD_NOW | RTLD_GLOBAL); // Try to open that path.
1284 }
1285
1286 // Check for failure.
1287 if (dl3 == NULL) {
1288 snprintf(_nvml_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "libnvidia-ml.so not found.");
1289 return(PAPI_ENOSUPP); // Not found on default paths.
1290 }
1291
1292 // We have a dl3. (libnvidia-ml.so).
1293
1294 nvmlDeviceGetClockInfoPtr = dlsym(dl3, "nvmlDeviceGetClockInfo");
1295 if (dlerror() != NULL) {
1296 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetClockInfo not found.", PAPI_MAX_STR_LEN);
1297 return (PAPI_ENOSUPP);
1298 }
1299 nvmlErrorStringPtr = dlsym(dl3, "nvmlErrorString");
1300 if (dlerror() != NULL) {
1301 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlErrorString not found.", PAPI_MAX_STR_LEN);
1302 return (PAPI_ENOSUPP);
1303 }
1304 nvmlDeviceGetDetailedEccErrorsPtr = dlsym(dl3, "nvmlDeviceGetDetailedEccErrors");
1305 if (dlerror() != NULL) {
1306 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetDetailedEccErrors not found.", PAPI_MAX_STR_LEN);
1307 return (PAPI_ENOSUPP);
1308 }
1309 nvmlDeviceGetFanSpeedPtr = dlsym(dl3, "nvmlDeviceGetFanSpeed");
1310 if (dlerror() != NULL) {
1311 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetFanSpeed not found.", PAPI_MAX_STR_LEN);
1312 return (PAPI_ENOSUPP);
1313 }
1314 nvmlDeviceGetMemoryInfoPtr = dlsym(dl3, "nvmlDeviceGetMemoryInfo");
1315 if (dlerror() != NULL) {
1316 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetMemoryInfo not found.", PAPI_MAX_STR_LEN);
1317 return (PAPI_ENOSUPP);
1318 }
1319 nvmlDeviceGetPerformanceStatePtr = dlsym(dl3, "nvmlDeviceGetPerformanceState");
1320 if (dlerror() != NULL) {
1321 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetPerformanceState not found.", PAPI_MAX_STR_LEN);
1322 return (PAPI_ENOSUPP);
1323 }
1324 nvmlDeviceGetPowerUsagePtr = dlsym(dl3, "nvmlDeviceGetPowerUsage");
1325 if (dlerror() != NULL) {
1326 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetPowerUsage not found.", PAPI_MAX_STR_LEN);
1327 return (PAPI_ENOSUPP);
1328 }
1329 nvmlDeviceGetTemperaturePtr = dlsym(dl3, "nvmlDeviceGetTemperature");
1330 if (dlerror() != NULL) {
1331 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetTemperature not found.", PAPI_MAX_STR_LEN);
1332 return (PAPI_ENOSUPP);
1333 }
1334 nvmlDeviceGetTotalEccErrorsPtr = dlsym(dl3, "nvmlDeviceGetTotalEccErrors");
1335 if (dlerror() != NULL) {
1336 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetTotalEccErrors not found.", PAPI_MAX_STR_LEN);
1337 return (PAPI_ENOSUPP);
1338 }
1339 nvmlDeviceGetUtilizationRatesPtr = dlsym(dl3, "nvmlDeviceGetUtilizationRates");
1340 if (dlerror() != NULL) {
1341 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetUtilizationRates not found.", PAPI_MAX_STR_LEN);
1342 return (PAPI_ENOSUPP);
1343 }
1344 nvmlDeviceGetHandleByIndexPtr = dlsym(dl3, "nvmlDeviceGetHandleByIndex");
1345 if (dlerror() != NULL) {
1346 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetHandleByIndex not found.", PAPI_MAX_STR_LEN);
1347 return (PAPI_ENOSUPP);
1348 }
1349 nvmlDeviceGetPciInfoPtr = dlsym(dl3, "nvmlDeviceGetPciInfo");
1350 if (dlerror() != NULL) {
1351 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetPciInfo not found.", PAPI_MAX_STR_LEN);
1352 return (PAPI_ENOSUPP);
1353 }
1354 nvmlDeviceGetNamePtr = dlsym(dl3, "nvmlDeviceGetName");
1355 if (dlerror() != NULL) {
1356 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetName not found.", PAPI_MAX_STR_LEN);
1357 return (PAPI_ENOSUPP);
1358 }
1359 nvmlDeviceGetInforomVersionPtr = dlsym(dl3, "nvmlDeviceGetInforomVersion");
1360 if (dlerror() != NULL) {
1361 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetInforomVersion not found.", PAPI_MAX_STR_LEN);
1362 return (PAPI_ENOSUPP);
1363 }
1364 nvmlDeviceGetEccModePtr = dlsym(dl3, "nvmlDeviceGetEccMode");
1365 if (dlerror() != NULL) {
1366 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetEccMode not found.", PAPI_MAX_STR_LEN);
1367 return (PAPI_ENOSUPP);
1368 }
1369 nvmlInitPtr = dlsym(dl3, "nvmlInit");
1370 if (dlerror() != NULL) {
1371 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlInit not found.", PAPI_MAX_STR_LEN);
1372 return (PAPI_ENOSUPP);
1373 }
1374 nvmlDeviceGetCountPtr = dlsym(dl3, "nvmlDeviceGetCount");
1375 if (dlerror() != NULL) {
1376 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetCount not found.", PAPI_MAX_STR_LEN);
1377 return (PAPI_ENOSUPP);
1378 }
1379 nvmlShutdownPtr = dlsym(dl3, "nvmlShutdown");
1380 if (dlerror() != NULL) {
1381 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlShutdown not found.", PAPI_MAX_STR_LEN);
1382 return (PAPI_ENOSUPP);
1383 }
1384 nvmlDeviceGetPowerManagementLimitPtr = dlsym(dl3, "nvmlDeviceGetPowerManagementLimit");
1385 if (dlerror() != NULL) {
1386 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetPowerManagementLimit not found.", PAPI_MAX_STR_LEN);
1387 return (PAPI_ENOSUPP);
1388 }
1389 nvmlDeviceSetPowerManagementLimitPtr = dlsym(dl3, "nvmlDeviceSetPowerManagementLimit");
1390 if (dlerror() != NULL) {
1391 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceSetPowerManagementLimit not found.", PAPI_MAX_STR_LEN);
1392 return (PAPI_ENOSUPP);
1393 }
1394 nvmlDeviceGetPowerManagementLimitConstraintsPtr = dlsym(dl3, "nvmlDeviceGetPowerManagementLimitConstraints");
1395 if (dlerror() != NULL) {
1396 strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetPowerManagementLimitConstraints not found.", PAPI_MAX_STR_LEN);
1397 return (PAPI_ENOSUPP);
1398 }
1399 return (PAPI_OK);
1400}
nvmlReturn_t(* nvmlDeviceGetCountPtr)(unsigned int *dest)
Definition: benchSANVML.c:66
nvmlReturn_t(* nvmlDeviceGetMemoryInfoPtr)(nvmlDevice_t, nvmlMemory_t *)
Definition: benchSANVML.c:72
nvmlReturn_t(* nvmlDeviceGetEccModePtr)(nvmlDevice_t, nvmlEnableState_t *, nvmlEnableState_t *)
Definition: benchSANVML.c:68
nvmlReturn_t(* nvmlInitPtr)(void)
Definition: benchSANVML.c:83
nvmlReturn_t(* nvmlDeviceGetTotalEccErrorsPtr)(nvmlDevice_t, nvmlEccBitType_t, nvmlEccCounterType_t, unsigned long long *)
Definition: benchSANVML.c:80
nvmlReturn_t(* nvmlDeviceGetNamePtr)(nvmlDevice_t, char *, unsigned int)
Definition: benchSANVML.c:73
nvmlReturn_t(* nvmlDeviceGetUtilizationRatesPtr)(nvmlDevice_t, nvmlUtilization_t *)
Definition: benchSANVML.c:81
nvmlReturn_t(* nvmlDeviceGetFanSpeedPtr)(nvmlDevice_t, unsigned int *)
Definition: benchSANVML.c:69
nvmlReturn_t(* nvmlDeviceGetClockInfoPtr)(nvmlDevice_t, nvmlClockType_t, unsigned int *)
Definition: benchSANVML.c:65
nvmlReturn_t(* nvmlDeviceGetPowerUsagePtr)(nvmlDevice_t, unsigned int *)
Definition: benchSANVML.c:78
nvmlReturn_t(* nvmlDeviceGetPciInfoPtr)(nvmlDevice_t, nvmlPciInfo_t *)
Definition: benchSANVML.c:74
nvmlReturn_t(* nvmlDeviceGetHandleByIndexPtr)(unsigned int, nvmlDevice_t *)
Definition: benchSANVML.c:70
nvmlReturn_t(* nvmlDeviceSetPowerManagementLimitPtr)(nvmlDevice_t device, unsigned int limit)
Definition: benchSANVML.c:82
nvmlReturn_t(* nvmlDeviceGetInforomVersionPtr)(nvmlDevice_t, nvmlInforomObject_t, char *, unsigned int)
Definition: benchSANVML.c:71
nvmlReturn_t(* nvmlDeviceGetPerformanceStatePtr)(nvmlDevice_t, nvmlPstates_t *)
Definition: benchSANVML.c:75
nvmlReturn_t(* nvmlDeviceGetPowerManagementLimitPtr)(nvmlDevice_t device, unsigned int *limit)
Definition: benchSANVML.c:77
nvmlReturn_t(* nvmlDeviceGetDetailedEccErrorsPtr)(nvmlDevice_t, nvmlEccBitType_t, nvmlEccCounterType_t, nvmlEccErrorCounts_t *)
Definition: benchSANVML.c:67
nvmlReturn_t(* nvmlDeviceGetTemperaturePtr)(nvmlDevice_t, nvmlTemperatureSensors_t, unsigned int *)
Definition: benchSANVML.c:79
nvmlReturn_t(* nvmlDeviceGetPowerManagementLimitConstraintsPtr)(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit)
Definition: benchSANVML.c:76
void(* _dl_non_dynamic_init)(void)
Definition: linux-nvml.c:46
Here is the caller graph for this function:

◆ nvml_hardware_read()

static int nvml_hardware_read ( long long value,
int  which_one 
)
static

Code that reads event values.

Definition at line 450 of file linux-nvml.c.

452{
454 nvmlDevice_t handle;
455 int cudaIdx = -1;
456
457 entry = &nvml_native_table[which_one];
458 *value = (long long) - 1;
459 /* replace entry->resources with the current cuda_device->nvml device */
460 cudaIdx = nvml_dev_id_table[which_one];
461
462 if (cudaIdx < 0 || cudaIdx > device_count)
463 return PAPI_EINVAL;
464
465 /* Make sure the device we are running on has the requested event */
466 if (!HAS_FEATURE(features[cudaIdx] , entry->type))
467 return PAPI_EINVAL;
468
469 handle = devices[cudaIdx];
470
471 switch (entry->type) {
473 *value = getClockSpeed(handle, (nvmlClockType_t)entry->options.clock);
474 break;
476 *value = getEccLocalErrors(handle,
477 (nvmlEccBitType_t)entry->options.ecc_opts.bits,
478 (int)entry->options.ecc_opts.which_one);
479 break;
481 *value = getFanSpeed(handle);
482 break;
484 *value = getMaxClockSpeed(handle,
485 (nvmlClockType_t)entry->options.clock);
486 break;
488 *value = getMemoryInfo(handle,
489 (int)entry->options.which_one);
490 break;
492 *value = getPState(handle);
493 break;
494 case FEATURE_POWER:
495 *value = getPowerUsage(handle);
496 break;
497 case FEATURE_TEMP:
498 *value = getTemperature(handle);
499 break;
501 *value = getTotalEccErrors(handle,
502 (nvmlEccBitType_t)entry->options.ecc_opts.bits);
503 break;
505 *value = getUtilization(handle,
506 (int)entry->options.which_one);
507 break;
510 break;
511
514 break;
515
518 break;
519
520 default:
521 return PAPI_EINVAL;
522 }
523 if (*value == (long long)(unsigned long long) - 1)
524 return PAPI_EINVAL;
525
526 return PAPI_OK;
527}
static papi_handle_t handle
Definition: Gamum.c:21
unsigned long long getMemoryInfo(nvmlDevice_t dev, int which_one)
Definition: linux-nvml.c:223
unsigned long long getPowerManagementLimit(nvmlDevice_t dev)
Definition: linux-nvml.c:386
unsigned long long getEccLocalErrors(nvmlDevice_t dev, nvmlEccBitType_t bits, int which_one)
Definition: linux-nvml.c:168
unsigned long long getPowerUsage(nvmlDevice_t dev)
Definition: linux-nvml.c:318
unsigned long long getTotalEccErrors(nvmlDevice_t dev, nvmlEccBitType_t bits)
Definition: linux-nvml.c:346
long long int long long
Definition: sde_internal.h:85
int which_one
Definition: linux-nvml.h:39
Here is the call graph for this function:
Here is the caller graph for this function:

◆ nvml_hardware_reset()

static void nvml_hardware_reset ( )
static

Definition at line 417 of file linux-nvml.c.

418{
419 /* nvmlDeviceSet* and nvmlDeviceClear* calls require root/admin access, so while
420 * possible to implement a reset on the ECC counters, we pass */
421 /*
422 for ( i=0; i < device_count; i++ )
423 nvmlDeviceClearEccErrorCounts( device[i], NVML_VOLATILE_ECC );
424 */
425 int i;
426 nvmlReturn_t ret;
427 unsigned int templimit = 0;
428 for (i = 0; i < device_count; i++) {
430 // if power management is available
432 ret = (*nvmlDeviceGetPowerManagementLimitPtr)(devices[i], &templimit);
433 if ((ret == NVML_SUCCESS) && (templimit != power_management_initial_limit[i])) {
434 SUBDBG("Reset power_management_limit on device %d to initial value of %d \n", i, power_management_initial_limit[i]);
435 // if power is not at its initial value
436 // reset to initial value
437 ret = (*nvmlDeviceSetPowerManagementLimitPtr)(devices[i], power_management_initial_limit[i]);
438 if (ret != NVML_SUCCESS)
439 SUBDBG("Unable to reset the NVML power management limit on device %i to %ull (return code %d) \n", i, power_management_initial_limit[i] , ret);
440 }
441 }
442 }
443 }
444}
Here is the caller graph for this function:

◆ nvml_hardware_write()

static int nvml_hardware_write ( long long value,
int  which_one 
)
static

Code that writes event values.

Definition at line 532 of file linux-nvml.c.

533{
535 nvmlDevice_t handle;
536 int cudaIdx = -1;
537 nvmlReturn_t nvret;
538
539 entry = &nvml_native_table[which_one];
540 /* replace entry->resources with the current cuda_device->nvml device */
541 cudaIdx = nvml_dev_id_table[which_one];
542
543 if (cudaIdx < 0 || cudaIdx > device_count)
544 return PAPI_EINVAL;
545
546 /* Make sure the device we are running on has the requested event */
547 if (!HAS_FEATURE(features[cudaIdx] , entry->type))
548 return PAPI_EINVAL;
549
550 handle = devices[cudaIdx];
551
552 switch (entry->type) {
554 unsigned int setToPower = (unsigned int) * value;
555 if (setToPower < power_management_limit_constraint_min[cudaIdx]) {
556 SUBDBG("Error: Desired power %u mW < minimum %u mW on device %d\n", setToPower, power_management_limit_constraint_min[cudaIdx], cudaIdx);
557 return PAPI_EINVAL;
558 }
559 if (setToPower > power_management_limit_constraint_max[cudaIdx]) {
560 SUBDBG("Error: Desired power %u mW > maximum %u mW on device %d\n", setToPower, power_management_limit_constraint_max[cudaIdx], cudaIdx);
561 return PAPI_EINVAL;
562 }
563 if ((nvret = (*nvmlDeviceSetPowerManagementLimitPtr)(handle, setToPower)) != NVML_SUCCESS) {
564 SUBDBG("Error: %s\n", (*nvmlErrorStringPtr)(nvret));
565 return PAPI_EINVAL;
566 }
567 }
568 break;
569
570 default:
571 return PAPI_EINVAL;
572 }
573
574 return PAPI_OK;
575}
int
Definition: sde_internal.h:89
Here is the caller graph for this function:

Variable Documentation

◆ _dl_non_dynamic_init

void(* _dl_non_dynamic_init) (void) ( void  )

Holds control flags. Usually there's one of these per event-set. Usually this is out-of-band configuration of the hardware.

Copy of counts; holds results when stopped.

Definition at line 46 of file linux-nvml.c.

125 {
126 int num_events;
127 int which_counter[NVML_MAX_COUNTERS];
128 long long counter[NVML_MAX_COUNTERS];
#define NVML_MAX_COUNTERS

◆ _nvml_vector

papi_vector_t _nvml_vector

Vector that points to entry points for our component

Definition at line 1740 of file linux-nvml.c.

◆ device_count

int device_count = 0
static

Number of devices detected at component_init time

Definition at line 141 of file linux-nvml.c.

◆ devices

nvmlDevice_t* devices = NULL
static

Definition at line 146 of file linux-nvml.c.

◆ features

int* features = NULL
static

Definition at line 147 of file linux-nvml.c.

◆ num_events

int num_events = 0
static

number of events in the table

Definition at line 144 of file linux-nvml.c.

◆ nvml_control_state_t

nvml_control_state_t

Definition at line 129 of file linux-nvml.c.

◆ nvml_dev_id_table

int* nvml_dev_id_table = NULL
static

Definition at line 138 of file linux-nvml.c.

◆ nvml_native_table

nvml_native_event_entry_t* nvml_native_table = NULL
static

This table contains the native events

Definition at line 137 of file linux-nvml.c.

◆ power_management_initial_limit

unsigned int* power_management_initial_limit = NULL
static

Definition at line 148 of file linux-nvml.c.

◆ power_management_limit_constraint_max

unsigned int* power_management_limit_constraint_max = NULL
static

Definition at line 150 of file linux-nvml.c.

◆ power_management_limit_constraint_min

unsigned int* power_management_limit_constraint_min = NULL
static

Definition at line 149 of file linux-nvml.c.