11#include <cupti_target.h>
12#include <cupti_profiler_target.h>
13#include <nvperf_host.h>
14#include <nvperf_cuda_host.h>
15#include <nvperf_target.h>
25typedef NVPW_CUDA_MetricsContext_Create_Params
MCCP_t;
46 char *description,
int *numDep, NVPA_RawMetricRequest **pRMR);
47static int check_num_passes(
struct NVPA_RawMetricsConfig *pRawMetricsConfig,
int rmr_count,
48 NVPA_RawMetricRequest *rmr,
int *num_pass);
118#define NVPW_CALL( call, handleerror ) \
120 NVPA_Status _status = (call); \
121 LOGCUPTICALL("\t" #call "\n"); \
122 if (_status != NVPA_STATUS_SUCCESS) { \
123 ERRDBG("NVPA Error %d: Error in call to " #call "\n", _status); \
134 ERRDBG(
"libcupti.so should already be loaded.\n");
198 char dlname[] =
"libnvperf_host.so";
199 char lookup_path[PATH_MAX];
201 char *papi_cuda_perfworks = getenv(
"PAPI_CUDA_PERFWORKS");
202 if (papi_cuda_perfworks) {
203 sprintf(lookup_path,
"%s/%s", papi_cuda_perfworks, dlname);
204 dl_nvpw = dlopen(lookup_path, RTLD_NOW | RTLD_GLOBAL);
207 const char *standard_paths[] = {
208 "%s/extras/CUPTI/lib64/%s",
217 char *papi_cuda_root = getenv(
"PAPI_CUDA_ROOT");
218 if (papi_cuda_root && !
dl_nvpw) {
223 dl_nvpw = dlopen(dlname, RTLD_NOW | RTLD_GLOBAL);
225 ERRDBG(
"Loading libnvperf_host.so failed.\n");
261 LOGDBG(
"NVPW library loaded from %s\n", info.dli_fname);
308 CUpti_Profiler_Initialize_Params profilerInitializeParams = { CUpti_Profiler_Initialize_Params_STRUCT_SIZE, NULL };
310 if (papi_errno != CUPTI_SUCCESS) {
311 ERRDBG(
"CUPTI error %d: cuptiProfilerInitialize failed.\n", papi_errno);
321 CUpti_Profiler_DeInitialize_Params profilerDeInitializeParams = { CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE, NULL };
323 if (papi_errno != CUPTI_SUCCESS) {
324 ERRDBG(
"CUPTI Error %d: cuptiProfilerDeInitialize failed.\n", papi_errno);
334 NVPW_InitializeHost_Params perfInitHostParams = { NVPW_InitializeHost_Params_STRUCT_SIZE, NULL };
336 if (papi_errno != NVPA_STATUS_SUCCESS) {
337 ERRDBG(
"NVPW Error %d: NVPW_InitializeHostPtr failed.\n", papi_errno);
346 CUpti_Device_GetChipName_Params getChipName = {
347 .structSize = CUpti_Device_GetChipName_Params_STRUCT_SIZE,
351 getChipName.deviceIndex = dev_num;
353 if (papi_errno != CUPTI_SUCCESS) {
354 ERRDBG(
"CUPTI error %d: Failed to get chip name for device %d\n", papi_errno, dev_num);
357 strcpy(chipName, getChipName.pChipName);
370 NVPA_RawMetricRequest *
rmr;
398 if (nv_name == NULL) {
403 const char token[] =
":device=";
404 const int tok_len = 8;
407 char *getdevstr = strstr(
name, token);
408 if (getdevstr == NULL) {
409 ERRDBG(
"Event name does not contain device number.\n");
412 getdevstr += tok_len;
413 *gpuid = strtol(getdevstr, &rest, 10);
414 numchars = strlen(
name) - strlen(getdevstr) - tok_len;
415 memcpy(nv_name,
name, numchars);
416 nv_name[numchars] =
'\0';
424 int i, gpu_id, papi_errno =
PAPI_OK;
442 if (gpu_id < 0 || gpu_id >
num_gpus) {
447 LOGDBG(
"Adding event gpu %d name %s with code %d at pos %d\n", gpu_id, evt_rec->
name, evt_rec->
evt_code,
i);
454 char *description,
int *numDep, NVPA_RawMetricRequest **pRMR)
458 NVPA_Status nvpa_err;
460 if (nv_name == NULL || description == NULL) {
464 NVPW_MetricsContext_GetMetricProperties_Begin_Params getMetricPropertiesBeginParams = {
465 .structSize = NVPW_MetricsContext_GetMetricProperties_Begin_Params_STRUCT_SIZE,
467 .pMetricsContext = pMetricsContext,
468 .pMetricName = nv_name,
472 if (nvpa_err != NVPA_STATUS_SUCCESS || getMetricPropertiesBeginParams.ppRawMetricDependencies == NULL) {
473 strcpy(description,
"Could not get description.");
477 for (num_dep = 0; getMetricPropertiesBeginParams.ppRawMetricDependencies[num_dep] != NULL; num_dep++) {;}
479 NVPA_RawMetricRequest *rmr = (NVPA_RawMetricRequest *)
papi_calloc(num_dep,
sizeof(NVPA_RawMetricRequest));
484 for (
i = 0;
i < num_dep;
i++) {
485 rmr[
i].pMetricName = strdup(getMetricPropertiesBeginParams.ppRawMetricDependencies[
i]);
487 rmr[
i].keepInstances = 1;
488 rmr[
i].structSize = NVPW_MetricsContext_GetMetricProperties_End_Params_STRUCT_SIZE;
492 getMetricPropertiesBeginParams.pDescription,
493 getMetricPropertiesBeginParams.pDimUnits);
495 ERRDBG(
"String formatting exceeded max string length.\n");
500 NVPW_MetricsContext_GetMetricProperties_End_Params getMetricPropertiesEndParams = {
501 .structSize = NVPW_MetricsContext_GetMetricProperties_End_Params_STRUCT_SIZE,
503 .pMetricsContext = pMetricsContext,
513 NVPA_RawMetricRequest *all_rmr=NULL;
514 int count_raw_metrics = 0;
517 NVPA_RawMetricRequest *temp;
521 for (
i = 0;
i < gpu_ctl->event_names->count;
i++) {
528 papi_errno =
retrieve_metric_details(gpu_ctl->pmetricsContextCreateParams->pMetricsContext, nv_name, evt_rec->
desc, &num_dep, &temp);
534 all_rmr = (NVPA_RawMetricRequest *)
papi_realloc(all_rmr, (count_raw_metrics + num_dep) *
sizeof(NVPA_RawMetricRequest));
535 if (all_rmr == NULL) {
539 for (j = 0; j < num_dep; j++) {
540 k = j + count_raw_metrics;
541 all_rmr[k].structSize = temp[j].structSize;
542 all_rmr[k].pPriv = NULL;
543 all_rmr[k].pMetricName = strdup(temp[j].pMetricName);
544 all_rmr[k].keepInstances = 1;
545 all_rmr[k].isolated = 1;
548 count_raw_metrics += num_dep;
551 gpu_ctl->rmr = all_rmr;
552 gpu_ctl->rmr_count = count_raw_metrics;
557static int check_num_passes(
struct NVPA_RawMetricsConfig *pRawMetricsConfig,
int rmr_count, NVPA_RawMetricRequest *rmr,
int *num_pass)
560 NVPW_RawMetricsConfig_BeginPassGroup_Params beginPassGroupParams = {
561 .structSize = NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE,
563 .pRawMetricsConfig = pRawMetricsConfig,
568 NVPW_RawMetricsConfig_AddMetrics_Params addMetricsParams = {
569 .structSize = NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE,
571 .pRawMetricsConfig = pRawMetricsConfig,
572 .pRawMetricRequests = rmr,
573 .numMetricRequests = rmr_count,
577 NVPW_RawMetricsConfig_EndPassGroup_Params endPassGroupParams = {
578 .structSize = NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE,
580 .pRawMetricsConfig = pRawMetricsConfig,
584 NVPW_RawMetricsConfig_GetNumPasses_Params rawMetricsConfigGetNumPassesParams = {
585 .structSize = NVPW_RawMetricsConfig_GetNumPasses_Params_STRUCT_SIZE,
587 .pRawMetricsConfig = pRawMetricsConfig,
591 int numNestingLevels = 1, numIsolatedPasses, numPipelinedPasses;
592 numIsolatedPasses = rawMetricsConfigGetNumPassesParams.numIsolatedPasses;
593 numPipelinedPasses = rawMetricsConfigGetNumPassesParams.numPipelinedPasses;
595 *num_pass = numPipelinedPasses + numIsolatedPasses * numNestingLevels;
598 ERRDBG(
"Metrics requested requires multiple passes to profile.\n");
609 int gpu_id, found, papi_errno =
PAPI_OK;
610 cuptip_gpu_state_t *gpu_ctl;
612 for (gpu_id = 0; gpu_id <
num_gpus; gpu_id++) {
613 gpu_ctl = &(
state->gpu_ctl[gpu_id]);
616 gpu_ctl->pmetricsContextCreateParams =
state->gpu_ctl[found].pmetricsContextCreateParams;
624 pMCCP->structSize = NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE;
627 gpu_ctl->pmetricsContextCreateParams = pMCCP;
638 int gpu_id, found, papi_errno =
PAPI_OK;
639 cuptip_gpu_state_t *gpu_ctl;
641 for (gpu_id = 0; gpu_id <
num_gpus; gpu_id++) {
642 gpu_ctl = &(
state->gpu_ctl[gpu_id]);
645 gpu_ctl->pmetricsContextCreateParams = NULL;
648 if (gpu_ctl->pmetricsContextCreateParams->pMetricsContext) {
649 NVPW_MetricsContext_Destroy_Params mCDP = {
650 .structSize = NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE,
652 .pMetricsContext = gpu_ctl->pmetricsContextCreateParams->pMetricsContext,
655 papi_free(gpu_ctl->pmetricsContextCreateParams);
656 gpu_ctl->pmetricsContextCreateParams = NULL;
669 int gpu_id, papi_errno =
PAPI_OK, passes;
670 cuptip_gpu_state_t *gpu_ctl;
672 for (gpu_id = 0; gpu_id <
num_gpus; gpu_id++) {
673 gpu_ctl = &(
state->gpu_ctl[gpu_id]);
674 if (gpu_ctl->event_names->count == 0) {
683 NVPW_CUDA_RawMetricsConfig_Create_Params nvpw_metricsConfigCreateParams = {
684 .structSize = NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE,
686 .activityKind = NVPA_ACTIVITY_KIND_PROFILER,
691 papi_errno =
check_num_passes(nvpw_metricsConfigCreateParams.pRawMetricsConfig,
692 gpu_ctl->rmr_count, gpu_ctl->rmr, &passes);
694 NVPW_RawMetricsConfig_Destroy_Params rawMetricsConfigDestroyParams = {
695 .structSize = NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE,
697 .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig,
715 CUpti_Profiler_GetCounterAvailability_Params getCounterAvailabilityParams = {
716 .structSize = CUpti_Profiler_GetCounterAvailability_Params_STRUCT_SIZE,
719 .pCounterAvailabilityImage = NULL,
722 if (papi_errno != CUPTI_SUCCESS) {
723 ERRDBG(
"CUPTI error %d: Failed to get size.\n", papi_errno);
727 gpu_ctl->counterAvailabilityImage.size = getCounterAvailabilityParams.counterAvailabilityImageSize;
728 gpu_ctl->counterAvailabilityImage.data = (uint8_t *)
papi_malloc(gpu_ctl->counterAvailabilityImage.size);
729 if (gpu_ctl->counterAvailabilityImage.data == NULL) {
733 getCounterAvailabilityParams.pCounterAvailabilityImage = gpu_ctl->counterAvailabilityImage.data;
735 if (papi_errno != CUPTI_SUCCESS) {
736 ERRDBG(
"CUPTI error %d: Failed to get bytes.\n", papi_errno);
745 NVPW_CUDA_RawMetricsConfig_Create_Params nvpw_metricsConfigCreateParams = {
746 .structSize = NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE,
748 .activityKind = NVPA_ACTIVITY_KIND_PROFILER,
753 if( gpu_ctl->counterAvailabilityImage.data != NULL) {
754 NVPW_RawMetricsConfig_SetCounterAvailability_Params setCounterAvailabilityParams = {
755 .structSize = NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE,
757 .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig,
758 .pCounterAvailabilityImage = gpu_ctl->counterAvailabilityImage.data,
763 NVPW_RawMetricsConfig_BeginPassGroup_Params beginPassGroupParams = {
764 .structSize = NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE,
766 .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig,
771 NVPW_RawMetricsConfig_AddMetrics_Params addMetricsParams = {
772 .structSize = NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE,
774 .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig,
775 .pRawMetricRequests = gpu_ctl->rmr,
776 .numMetricRequests = gpu_ctl->rmr_count,
780 NVPW_RawMetricsConfig_EndPassGroup_Params endPassGroupParams = {
781 .structSize = NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE,
783 .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig,
787 NVPW_RawMetricsConfig_GenerateConfigImage_Params generateConfigImageParams = {
788 .structSize = NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE,
790 .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig,
794 NVPW_RawMetricsConfig_GetConfigImage_Params getConfigImageParams = {
795 .structSize = NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE,
797 .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig,
803 gpu_ctl->configImage.size = getConfigImageParams.bytesCopied;
804 gpu_ctl->configImage.data = (uint8_t *)
papi_calloc(gpu_ctl->configImage.size,
sizeof(uint8_t));
805 if (gpu_ctl->configImage.data == NULL) {
806 ERRDBG(
"calloc gpu_ctl->configImage.data failed!");
810 getConfigImageParams.bytesAllocated = gpu_ctl->configImage.size;
811 getConfigImageParams.pBuffer = gpu_ctl->configImage.data;
814 NVPW_RawMetricsConfig_Destroy_Params rawMetricsConfigDestroyParams = {
815 .structSize = NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE,
817 .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig,
829 NVPW_CounterDataBuilder_Create_Params counterDataBuilderCreateParams = {
830 .structSize = NVPW_CounterDataBuilder_Create_Params_STRUCT_SIZE,
836 NVPW_CounterDataBuilder_AddMetrics_Params addMetricsParams = {
837 .structSize = NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE,
839 .pCounterDataBuilder = counterDataBuilderCreateParams.pCounterDataBuilder,
840 .pRawMetricRequests = gpu_ctl->rmr,
841 .numMetricRequests = gpu_ctl->rmr_count,
845 NVPW_CounterDataBuilder_GetCounterDataPrefix_Params getCounterDataPrefixParams = {
846 .structSize = NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE,
848 .pCounterDataBuilder = counterDataBuilderCreateParams.pCounterDataBuilder,
854 gpu_ctl->counterDataImagePrefix.size = getCounterDataPrefixParams.bytesCopied;
855 gpu_ctl->counterDataImagePrefix.data = (uint8_t *)
papi_calloc(gpu_ctl->counterDataImagePrefix.size,
sizeof(uint8_t));
856 if (gpu_ctl->counterDataImagePrefix.data == NULL) {
857 ERRDBG(
"calloc gpu_ctl->counterDataImagePrefix.data failed!");
861 getCounterDataPrefixParams.bytesAllocated = gpu_ctl->counterDataImagePrefix.size;
862 getCounterDataPrefixParams.pBuffer = gpu_ctl->counterDataImagePrefix.data;
865 NVPW_CounterDataBuilder_Destroy_Params counterDataBuilderDestroyParams = {
866 .structSize = NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE,
868 .pCounterDataBuilder = counterDataBuilderCreateParams.pCounterDataBuilder,
880 gpu_ctl->counterDataImageOptions = (CUpti_Profiler_CounterDataImageOptions) {
881 .structSize = CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE,
883 .pCounterDataPrefix = gpu_ctl->counterDataImagePrefix.data,
884 .counterDataPrefixSize = gpu_ctl->counterDataImagePrefix.size,
886 .maxNumRangeTreeNodes = 1,
887 .maxRangeNameLength = 64,
890 CUpti_Profiler_CounterDataImage_CalculateSize_Params calculateSizeParams = {
891 .structSize = CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE,
893 .sizeofCounterDataImageOptions = CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE,
894 .pOptions = &gpu_ctl->counterDataImageOptions,
898 gpu_ctl->initializeParams = (CUpti_Profiler_CounterDataImage_Initialize_Params) {
899 .structSize = CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE,
901 .sizeofCounterDataImageOptions = CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE,
902 .pOptions = &gpu_ctl->counterDataImageOptions,
903 .counterDataImageSize = calculateSizeParams.counterDataImageSize,
906 gpu_ctl->counterDataImage.size = calculateSizeParams.counterDataImageSize;
907 gpu_ctl->counterDataImage.data = (uint8_t *)
papi_calloc(gpu_ctl->counterDataImage.size,
sizeof(uint8_t));
908 if (gpu_ctl->counterDataImage.data == NULL) {
909 ERRDBG(
"calloc gpu_ctl->counterDataImage.data failed!\n");
913 gpu_ctl->initializeParams.pCounterDataImage = gpu_ctl->counterDataImage.data;
916 CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params scratchBufferSizeParams = {
917 .structSize = CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE,
919 .counterDataImageSize = calculateSizeParams.counterDataImageSize,
920 .pCounterDataImage = gpu_ctl->initializeParams.pCounterDataImage,
924 gpu_ctl->counterDataScratchBuffer.size = scratchBufferSizeParams.counterDataScratchBufferSize;
925 gpu_ctl->counterDataScratchBuffer.data = (uint8_t *)
papi_calloc(gpu_ctl->counterDataScratchBuffer.size,
sizeof(uint8_t));
926 if (gpu_ctl->counterDataScratchBuffer.data == NULL) {
927 ERRDBG(
"calloc gpu_ctl->counterDataScratchBuffer.data failed!\n");
931 gpu_ctl->initScratchBufferParams = (CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params) {
932 .structSize = CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE,
934 .counterDataImageSize = calculateSizeParams.counterDataImageSize,
935 .pCounterDataImage = gpu_ctl->initializeParams.pCounterDataImage,
936 .counterDataScratchBufferSize = gpu_ctl->counterDataScratchBuffer.size,
937 .pCounterDataScratchBuffer = gpu_ctl->counterDataScratchBuffer.data,
949 papi_free(gpu_ctl->counterDataImagePrefix.data);
951 papi_free(gpu_ctl->counterDataImage.data);
952 papi_free(gpu_ctl->counterDataScratchBuffer.data);
953 papi_free(gpu_ctl->counterAvailabilityImage.data);
954 gpu_ctl->counterDataImagePrefix.data = NULL;
955 gpu_ctl->configImage.data = NULL;
956 gpu_ctl->counterDataImage.data = NULL;
957 gpu_ctl->counterDataScratchBuffer.data = NULL;
958 gpu_ctl->counterAvailabilityImage.data = NULL;
959 gpu_ctl->counterDataImagePrefix.size = 0;
960 gpu_ctl->configImage.size = 0;
961 gpu_ctl->counterDataImage.size = 0;
962 gpu_ctl->counterDataScratchBuffer.size = 0;
963 gpu_ctl->counterAvailabilityImage.size = 0;
970 byte_array_t *configImage = &(gpu_ctl->configImage);
971 byte_array_t *counterDataScratchBuffer = &(gpu_ctl->counterDataScratchBuffer);
972 byte_array_t *counterDataImage = &(gpu_ctl->counterDataImage);
974 CUpti_Profiler_BeginSession_Params beginSessionParams = {
975 .structSize = CUpti_Profiler_BeginSession_Params_STRUCT_SIZE,
978 .counterDataImageSize = counterDataImage->size,
979 .pCounterDataImage = counterDataImage->data,
980 .counterDataScratchBufferSize = counterDataScratchBuffer->size,
981 .pCounterDataScratchBuffer = counterDataScratchBuffer->data,
982 .range = CUPTI_UserRange,
983 .replayMode = CUPTI_UserReplay,
984 .maxRangesPerPass = 1,
985 .maxLaunchesPerPass = 1,
989 CUpti_Profiler_SetConfig_Params setConfigParams = {
990 .structSize = CUpti_Profiler_SetConfig_Params_STRUCT_SIZE,
993 .pConfig = configImage->data,
994 .configSize = configImage->size,
995 .minNestingLevel = 1,
996 .numNestingLevels = 1,
998 .targetNestingLevel = 1,
1002 CUpti_Profiler_BeginPass_Params beginPassParams = {
1003 .structSize = CUpti_Profiler_BeginPass_Params_STRUCT_SIZE,
1009 CUpti_Profiler_EnableProfiling_Params enableProfilingParams = {
1010 .structSize = CUpti_Profiler_EnableProfiling_Params_STRUCT_SIZE,
1017 sprintf(rangeName,
"PAPI_Range_%d", gpu_ctl->gpu_id);
1018 CUpti_Profiler_PushRange_Params pushRangeParams = {
1019 .structSize = CUpti_Profiler_PushRange_Params_STRUCT_SIZE,
1022 .pRangeName = (
const char*) &rangeName,
1023 .rangeNameLength = 100,
1035 COMPDBG(
"EndProfiling. dev = %d\n", gpu_ctl->gpu_id);
1038 CUpti_Profiler_DisableProfiling_Params disableProfilingParams = {
1039 .structSize = CUpti_Profiler_DisableProfiling_Params_STRUCT_SIZE,
1045 CUpti_Profiler_PopRange_Params popRangeParams = {
1046 .structSize = CUpti_Profiler_PopRange_Params_STRUCT_SIZE,
1052 CUpti_Profiler_EndPass_Params endPassParams = {
1053 .structSize = CUpti_Profiler_EndPass_Params_STRUCT_SIZE,
1059 CUpti_Profiler_FlushCounterData_Params flushCounterDataParams = {
1060 .structSize = CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE,
1066 CUpti_Profiler_UnsetConfig_Params unsetConfigParams = {
1067 .structSize = CUpti_Profiler_UnsetConfig_Params_STRUCT_SIZE,
1073 CUpti_Profiler_EndSession_Params endSessionParams = {
1074 .structSize = CUpti_Profiler_EndSession_Params_STRUCT_SIZE,
1087 COMPDBG(
"eval_metric_values. dev = %d\n", gpu_ctl->gpu_id);
1088 if (!gpu_ctl->counterDataImage.size) {
1089 ERRDBG(
"Counter Data Image is empty!\n");
1093 int numMetrics = gpu_ctl->event_names->count;
1096 char **metricNames = (
char**)
papi_calloc(numMetrics,
sizeof(
char *));
1097 if (metricNames == NULL) {
1098 ERRDBG(
"calloc metricNames failed.\n");
1102 for (
i = 0;
i < numMetrics;
i++) {
1111 metricNames[
i] = (
char *) &(evt_rec->
desc);
1112 LOGDBG(
"Setting metric name %s\n", metricNames[
i]);
1115 double *gpuValues = (
double*)
papi_malloc(numMetrics *
sizeof(
double));
1116 if (gpuValues == NULL) {
1117 ERRDBG(
"malloc gpuValues failed.\n");
1121 NVPW_MetricsContext_SetCounterData_Params setCounterDataParams = {
1122 .structSize = NVPW_MetricsContext_SetCounterData_Params_STRUCT_SIZE,
1124 .pMetricsContext = gpu_ctl->pmetricsContextCreateParams->pMetricsContext,
1125 .pCounterDataImage = gpu_ctl->counterDataImage.data,
1130 NVPW_MetricsContext_EvaluateToGpuValues_Params evalToGpuParams = {
1131 .structSize = NVPW_MetricsContext_EvaluateToGpuValues_Params_STRUCT_SIZE,
1133 .pMetricsContext = gpu_ctl->pmetricsContextCreateParams->pMetricsContext,
1134 .numMetrics = numMetrics,
1135 .ppMetricNames = (
const char*
const*) metricNames,
1136 .pMetricValues = gpuValues,
1140 for (
i = 0;
i < (
int) gpu_ctl->event_names->count;
i++) {
1146 evt_rec->
value = gpuValues[
i];
1159 for (
i = 0;
i < gpu_id;
i++) {
1169 int gpu_id, papi_errno =
PAPI_OK;
1175 for (gpu_id = 0; gpu_id <
num_gpus; gpu_id++) {
1182 for (gpu_id = 0; gpu_id <
num_gpus; gpu_id++) {
1189 if (pMCCP == NULL) {
1193 pMCCP->structSize = NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE;
1197 avail_events[gpu_id].pmetricsContextCreateParams = pMCCP;
1208 int gpu_id,
i, found, listsubmetrics = 1, papi_errno =
PAPI_OK;
1213 for (gpu_id = 0; gpu_id <
num_gpus; gpu_id++) {
1214 LOGDBG(
"Getting metric names for gpu %d\n", gpu_id);
1223 NVPW_MetricsContext_GetMetricNames_Begin_Params getMetricNameBeginParams = {
1224 .structSize = NVPW_MetricsContext_GetMetricNames_Begin_Params_STRUCT_SIZE,
1226 .pMetricsContext =
avail_events[gpu_id].pmetricsContextCreateParams->pMetricsContext,
1227 .hidePeakSubMetrics = !listsubmetrics,
1228 .hidePerCycleSubMetrics = !listsubmetrics,
1229 .hidePctOfPeakSubMetrics = !listsubmetrics,
1233 avail_events[gpu_id].num_metrics = getMetricNameBeginParams.numMetrics;
1241 getMetricNameBeginParams.ppMetricNames[
i],
1248 NVPW_MetricsContext_GetMetricNames_End_Params getMetricNameEndParams = {
1249 .structSize = NVPW_MetricsContext_GetMetricNames_End_Params_STRUCT_SIZE,
1251 .pMetricsContext =
avail_events[gpu_id].pmetricsContextCreateParams->pMetricsContext,
1260 unsigned int curr = all_evt_names->
count;
1261 for (gpu_id = 0; gpu_id <
num_gpus; gpu_id++) {
1269 ERRDBG(
"String formatting exceeded maximum length.\n");
1291 int papi_errno, numdep, gpu_id, passes;
1294 NVPA_RawMetricRequest *temp;
1301 ERRDBG(
"Event name not found in avail_events array.\n");
1304 char *desc = evt_rec->
desc;
1305 if (desc[0] ==
'\0') {
1307 nv_name, desc, &numdep, &temp);
1309 NVPW_CUDA_RawMetricsConfig_Create_Params nvpw_metricsConfigCreateParams = {
1310 .structSize = NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE,
1312 .activityKind = NVPA_ACTIVITY_KIND_PROFILER,
1317 papi_errno =
check_num_passes(nvpw_metricsConfigCreateParams.pRawMetricsConfig,
1318 numdep, temp, &passes);
1320 NVPW_RawMetricsConfig_Destroy_Params rawMetricsConfigDestroyParams = {
1321 .structSize = NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE,
1323 .pRawMetricsConfig = nvpw_metricsConfigCreateParams.pRawMetricsConfig,
1327 snprintf(desc + strlen(desc),
PAPI_2MAX_STR_LEN - strlen(desc),
" Numpass=%d", passes);
1329 snprintf(desc + strlen(desc),
PAPI_2MAX_STR_LEN - strlen(desc),
" (multi-pass not supported)");
1332 const char *token_sw_evt =
"sass";
1333 if (strstr(nv_name, token_sw_evt) != NULL) {
1339 strcpy(description, desc);
1351 NVPW_MetricsContext_Destroy_Params metricsContextDestroyParams;
1355 for (gpu_id = 0; gpu_id <
num_gpus; gpu_id++) {
1360 avail_events[gpu_id].pmetricsContextCreateParams = NULL;
1363 if (
avail_events[gpu_id].pmetricsContextCreateParams->pMetricsContext) {
1364 metricsContextDestroyParams = (NVPW_MetricsContext_Destroy_Params) {
1365 .structSize = NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE,
1367 .pMetricsContext =
avail_events[gpu_id].pmetricsContextCreateParams->pMetricsContext,
1372 avail_events[gpu_id].pmetricsContextCreateParams = NULL;
1416 if (papi_errno != CUDA_SUCCESS) {
1428 int papi_errno =
PAPI_OK, gpu_id;
1430 if (
state == NULL) {
1434 if (
state->gpu_ctl == NULL) {
1437 for (gpu_id = 0; gpu_id <
num_gpus; gpu_id++) {
1438 state->gpu_ctl[gpu_id].gpu_id = gpu_id;
1455 state->info = thr_info;
1465 cuptip_control_t
state = *pstate;
1471 for (j = 0; j <
state->gpu_ctl[
i].rmr_count; j++) {
1485 cuptip_gpu_state_t *gpu_ctl;
1486 CUcontext userCtx, ctx;
1488 if (userCtx == NULL) {
1494 for (gpu_id = 0; gpu_id <
num_gpus; gpu_id++) {
1495 gpu_ctl = &(
state->gpu_ctl[gpu_id]);
1496 if (gpu_ctl->event_names->count == 0) {
1499 LOGDBG(
"Device num %d: event_count %d, rmr count %d\n", gpu_id, gpu_ctl->event_names->count, gpu_ctl->rmr_count);
1502 ERRDBG(
"Profiling same gpu from multiple event sets not allowed.\n");
1509 ERRDBG(
"Error getting counter availability image.\n");
1517 ERRDBG(
"Failed to create CUPTI profiler state for gpu %d\n", gpu_id);
1522 ERRDBG(
"Failed to start profiling for gpu %d\n", gpu_id);
1541 cuptip_gpu_state_t *gpu_ctl;
1542 CUcontext userCtx = NULL, ctx = NULL;
1544 if (userCtx == NULL) {
1551 ERRDBG(
"Profiler is already stopped.\n");
1555 for (gpu_id=0; gpu_id<
num_gpus; gpu_id++) {
1556 gpu_ctl = &(
state->gpu_ctl[gpu_id]);
1557 if (gpu_ctl->event_names->count == 0) {
1564 ERRDBG(
"Failed to stop profiling on gpu %d\n", gpu_id);
1585 if (strstr(
evt_name,
".sum") != NULL) {
1588 else if (strstr(
evt_name,
".min") != NULL) {
1591 else if (strstr(
evt_name,
".max") != NULL) {
1602 int papi_errno, gpu_id,
i;
1603 cuptip_gpu_state_t *gpu_ctl = NULL;
1604 CUcontext userCtx = NULL, ctx = NULL;
1606 if (userCtx == NULL) {
1610 unsigned int evt_pos;
1613 for (gpu_id = 0; gpu_id <
num_gpus; gpu_id++) {
1614 gpu_ctl = &(
state->gpu_ctl[gpu_id]);
1615 if (gpu_ctl->event_names->count == 0) {
1622 CUpti_Profiler_PopRange_Params popRangeParams = {
1623 .structSize = CUpti_Profiler_PopRange_Params_STRUCT_SIZE,
1629 CUpti_Profiler_EndPass_Params endPassParams = {
1630 .structSize = CUpti_Profiler_EndPass_Params_STRUCT_SIZE,
1636 CUpti_Profiler_FlushCounterData_Params flushCounterDataParams = {
1637 .structSize = CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE,
1647 for (
i = 0;
i < (
int) gpu_ctl->event_names->count;
i++) {
1679 CUpti_Profiler_BeginPass_Params beginPassParams = {
1680 .structSize = CUpti_Profiler_BeginPass_Params_STRUCT_SIZE,
1687 sprintf(rangeName,
"PAPI_Range_%d", gpu_ctl->gpu_id);
1688 CUpti_Profiler_PushRange_Params pushRangeParams = {
1689 .structSize = CUpti_Profiler_PushRange_Params_STRUCT_SIZE,
1692 .pRangeName = (
const char*) &rangeName,
1693 .rangeNameLength = 100,
1698 state->read_count++;
1710 state->read_count = 0;
static const char * event_names[2]
void * cuptic_load_dynamic_syms(const char *parent_path, const char *dlname, const char *search_subpaths[])
void cuptic_disabled_reason_set(const char *msg)
cudaError_t(* cudaFreePtr)(void *)
CUresult(* cuCtxSetCurrentPtr)(CUcontext)
int cuptic_device_release(cuptiu_event_table_t *evt_table)
int cuptic_ctxarr_get_ctx(cuptic_info_t info, int gpu_idx, CUcontext *ctx)
int cuptic_ctxarr_update_current(cuptic_info_t info)
int cuptic_device_get_count(int *num_gpus)
const char * linked_cudart_path
CUresult(* cuCtxGetCurrentPtr)(CUcontext *)
CUresult(* cuInitPtr)(unsigned int)
int cuptic_device_acquire(cuptiu_event_table_t *evt_table)
#define CUDA_CALL(call, handleerror)
#define CUDART_CALL(call, handleerror)
#define CUPTI_CALL(call, handleerror)
#define DLSYM_AND_CHECK(dllib, name)
static int nvpw_cuda_metricscontext_create(cuptip_control_t state)
static void free_all_enumerated_metrics(void)
NVPA_Status(* NVPW_RawMetricsConfig_EndPassGroupPtr)(NVPW_RawMetricsConfig_EndPassGroup_Params *params)
static int load_nvpw_sym(void)
NVPA_Status(* NVPW_RawMetricsConfig_SetCounterAvailabilityPtr)(NVPW_RawMetricsConfig_SetCounterAvailability_Params *params)
NVPA_Status(* NVPW_MetricsContext_GetCounterNames_EndPtr)(NVPW_MetricsContext_GetCounterNames_End_Params *pParams)
static int init_all_metrics(void)
static int get_measured_values(cuptip_gpu_state_t *gpu_ctl)
NVPA_Status(* NVPW_RawMetricsConfig_AddMetricsPtr)(NVPW_RawMetricsConfig_AddMetrics_Params *params)
static int initialize_perfworks_api(void)
NVPA_Status(* NVPW_MetricsContext_GetMetricProperties_EndPtr)(NVPW_MetricsContext_GetMetricProperties_End_Params *p)
CUptiResult(* cuptiProfilerCounterDataImageCalculateScratchBufferSizePtr)(CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params *params)
static int get_event_names_rmr(cuptip_gpu_state_t *gpu_ctl)
static int reset_cupti_prof_config_images(cuptip_gpu_state_t *gpu_ctl)
NVPA_Status(* NVPW_MetricsContext_EvaluateToGpuValuesPtr)(NVPW_MetricsContext_EvaluateToGpuValues_Params *params)
NVPA_Status(* NVPW_RawMetricsConfig_DestroyPtr)(NVPW_RawMetricsConfig_Destroy_Params *params)
int cuptip_control_reset(cuptip_control_t state)
CUptiResult(* cuptiProfilerPopRangePtr)(CUpti_Profiler_PopRange_Params *params)
NVPA_Status(* NVPW_CUDA_RawMetricsConfig_CreatePtr)(NVPW_CUDA_RawMetricsConfig_Create_Params *)
NVPA_Status(* NVPW_GetSupportedChipNamesPtr)(NVPW_GetSupportedChipNames_Params *params)
CUptiResult(* cuptiProfilerDisableProfilingPtr)(CUpti_Profiler_DisableProfiling_Params *params)
CUptiResult(* cuptiDeviceGetChipNamePtr)(CUpti_Device_GetChipName_Params *params)
int cuptip_control_read(cuptip_control_t state, long long *values)
NVPA_Status(* NVPW_MetricsContext_DestroyPtr)(NVPW_MetricsContext_Destroy_Params *params)
NVPA_Status(* NVPW_RawMetricsConfig_GetConfigImagePtr)(NVPW_RawMetricsConfig_GetConfigImage_Params *params)
static list_metrics_t * avail_events
static int nvpw_cuda_metricscontext_destroy(cuptip_control_t state)
static int end_profiling(cuptip_gpu_state_t *gpu_ctl)
int cuptip_event_enum(cuptiu_event_table_t *all_evt_names)
NVPA_Status(* NVPW_MetricsContext_GetMetricNames_EndPtr)(NVPW_MetricsContext_GetMetricNames_End_Params *params)
NVPA_Status(* NVPW_RawMetricsConfig_IsAddMetricsPossiblePtr)(NVPW_RawMetricsConfig_IsAddMetricsPossible_Params *params)
CUptiResult(* cuptiProfilerDeInitializePtr)(CUpti_Profiler_DeInitialize_Params *params)
int cuptip_control_create(cuptiu_event_table_t *event_names, cuptic_info_t thr_info, cuptip_control_t *pstate)
static int initialize_cupti_profiler_api(void)
CUptiResult(* cuptiFinalizePtr)(void)
static enum collection_method_e get_event_collection_method(const char *evt_name)
int cuptip_control_stop(cuptip_control_t state)
NVPA_Status(* NVPW_MetricsContext_GetMetricNames_BeginPtr)(NVPW_MetricsContext_GetMetricNames_Begin_Params *params)
CUptiResult(* cuptiProfilerBeginSessionPtr)(CUpti_Profiler_BeginSession_Params *params)
struct NVPA_MetricsContext NVPA_MetricsContext
CUptiResult(* cuptiProfilerInitializePtr)(CUpti_Profiler_Initialize_Params *params)
static int create_counter_data_image(cuptip_gpu_state_t *gpu_ctl)
CUptiResult(* cuptiProfilerEndPassPtr)(CUpti_Profiler_EndPass_Params *params)
int cuptip_control_start(cuptip_control_t state)
NVPA_Status(* NVPW_CounterDataBuilder_DestroyPtr)(NVPW_CounterDataBuilder_Destroy_Params *params)
CUptiResult(* cuptiProfilerPushRangePtr)(CUpti_Profiler_PushRange_Params *params)
NVPA_Status(* NVPW_InitializeHostPtr)(NVPW_InitializeHost_Params *params)
CUptiResult(* cuptiProfilerCounterDataImageCalculateSizePtr)(CUpti_Profiler_CounterDataImage_CalculateSize_Params *params)
int cuptip_event_name_to_descr(const char *evt_name, char *description)
static int begin_profiling(cuptip_gpu_state_t *gpu_ctl)
static int unload_nvpw_sym(void)
static int add_events_per_gpu(cuptip_control_t state, cuptiu_event_table_t *event_names)
static int get_counter_availability(cuptip_gpu_state_t *gpu_ctl)
NVPA_Status(* NVPW_RawMetricsConfig_GetNumPassesPtr)(NVPW_RawMetricsConfig_GetNumPasses_Params *params)
NVPA_Status(* NVPW_CUDA_MetricsContext_CreatePtr)(NVPW_CUDA_MetricsContext_Create_Params *params)
static int get_chip_name(int dev_num, char *chipName)
NVPA_Status(* NVPW_MetricsContext_GetCounterNames_BeginPtr)(NVPW_MetricsContext_GetCounterNames_Begin_Params *pParams)
static int check_num_passes(struct NVPA_RawMetricsConfig *pRawMetricsConfig, int rmr_count, NVPA_RawMetricRequest *rmr, int *num_pass)
CUptiResult(* cuptiProfilerFlushCounterDataPtr)(CUpti_Profiler_FlushCounterData_Params *params)
static int load_cupti_perf_sym(void)
NVPW_CUDA_MetricsContext_Create_Params MCCP_t
NVPA_Status(* NVPW_RawMetricsConfig_BeginPassGroupPtr)(NVPW_RawMetricsConfig_BeginPassGroup_Params *params)
static int event_name_tokenize(const char *name, char *nv_name, int *gpuid)
CUptiResult(* cuptiProfilerCounterDataImageInitializePtr)(CUpti_Profiler_CounterDataImage_Initialize_Params *params)
CUptiResult(* cuptiProfilerEndSessionPtr)(CUpti_Profiler_EndSession_Params *params)
static int retrieve_metric_details(NVPA_MetricsContext *pMetricsContext, const char *nv_name, char *description, int *numDep, NVPA_RawMetricRequest **pRMR)
CUptiResult(* cuptiProfilerSetConfigPtr)(CUpti_Profiler_SetConfig_Params *params)
static int finalize_cupti_profiler_api(void)
NVPA_Status(* NVPW_CounterDataBuilder_GetCounterDataPrefixPtr)(NVPW_CounterDataBuilder_GetCounterDataPrefix_Params *params)
#define NVPW_CALL(call, handleerror)
CUptiResult(* cuptiProfilerUnsetConfigPtr)(CUpti_Profiler_UnsetConfig_Params *params)
NVPA_Status(* NVPW_RawMetricsConfig_GenerateConfigImagePtr)(NVPW_RawMetricsConfig_GenerateConfigImage_Params *params)
NVPA_Status(* NVPW_MetricsContext_SetCounterDataPtr)(NVPW_MetricsContext_SetCounterData_Params *params)
static int metric_get_config_image(cuptip_gpu_state_t *gpu_ctl)
CUptiResult(* cuptiProfilerEnableProfilingPtr)(CUpti_Profiler_EnableProfiling_Params *params)
CUptiResult(* cuptiProfilerCounterDataImageInitializeScratchBufferPtr)(CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params *params)
NVPA_Status(* NVPW_MetricsContext_GetMetricProperties_BeginPtr)(NVPW_MetricsContext_GetMetricProperties_Begin_Params *p)
NVPA_Status(* NVPW_CounterData_GetNumRangesPtr)(NVPW_CounterData_GetNumRanges_Params *params)
static int metric_get_counter_data_prefix_image(cuptip_gpu_state_t *gpu_ctl)
static int control_state_validate(cuptip_control_t state)
CUptiResult(* cuptiProfilerBeginPassPtr)(CUpti_Profiler_BeginPass_Params *params)
static int find_same_chipname(int gpu_id)
NVPA_Status(* NVPW_Profiler_CounterData_GetRangeDescriptionsPtr)(NVPW_Profiler_CounterData_GetRangeDescriptions_Params *params)
int cuptip_control_destroy(cuptip_control_t *pstate)
NVPA_Status(* NVPW_CounterDataBuilder_AddMetricsPtr)(NVPW_CounterDataBuilder_AddMetrics_Params *params)
static int unload_cupti_perf_sym(void)
NVPA_Status(* NVPW_CounterDataBuilder_CreatePtr)(NVPW_CounterDataBuilder_Create_Params *params)
CUptiResult(* cuptiProfilerGetCounterAvailabilityPtr)(CUpti_Profiler_GetCounterAvailability_Params *params)
int cuptip_shutdown(void)
void cuptiu_event_table_destroy(cuptiu_event_table_t **pevt_table)
int cuptiu_event_table_get_item(cuptiu_event_table_t *evt_table, int evt_idx, cuptiu_event_t **record)
int cuptiu_event_table_create_init_capacity(int capacity, int sizeof_rec, cuptiu_event_table_t **pevt_table)
int cuptiu_event_table_insert_record(cuptiu_event_table_t *evt_table, const char *evt_name, unsigned int evt_code, int evt_pos)
int cuptiu_event_table_find_name(cuptiu_event_table_t *evt_table, const char *evt_name, cuptiu_event_t **found_rec)
int cuptiu_event_table_create(int sizeof_rec, cuptiu_event_table_t **pevt_table)
char * evt_name(evstock *stock, int index)
#define PAPI_2MAX_STR_LEN
static long long values[NUM_EVENTS]
#define ERRDBG(format, args...)
#define LOGDBG(format, args...)
#define COMPDBG(format, args...)
Return codes and api definitions.
#define papi_calloc(a, b)
#define papi_realloc(a, b)
cuptip_gpu_state_t * gpu_ctl
CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params initScratchBufferParams
byte_array_t counterDataScratchBuffer
NVPA_RawMetricRequest * rmr
byte_array_t counterDataImagePrefix
CUpti_Profiler_CounterDataImage_Initialize_Params initializeParams
cuptiu_event_table_t * event_names
byte_array_t counterAvailabilityImage
MCCP_t * pmetricsContextCreateParams
byte_array_t counterDataImage
CUpti_Profiler_CounterDataImageOptions counterDataImageOptions
char name[PAPI_2MAX_STR_LEN]
char desc[PAPI_2MAX_STR_LEN]
cuptiu_event_table_t * nv_metrics
MCCP_t * pmetricsContextCreateParams