From f452488ba3fc46712d4e3dbca3d7533901a5b0d1 Mon Sep 17 00:00:00 2001 From: Douglas Wightman Date: Tue, 3 Aug 2021 15:31:17 -0600 Subject: [PATCH] Initial commit for DCGM Exporter repository --- .github/PR_TEMPLATE.md | 4 +- .gitlab-ci.yml | 2 +- .gitmodules | 3 - CONTRIBUTING.md | 4 +- Makefile | 23 +- README.md | 34 +- RELEASE.md | 1 - aws-kube-ci | 1 - bindings/go/dcgm/admin.go | 318 - bindings/go/dcgm/api.go | 108 - bindings/go/dcgm/bcast.go | 84 - bindings/go/dcgm/callback.c | 4 - bindings/go/dcgm/const.go | 791 -- bindings/go/dcgm/dcgm_agent.h | 2033 ----- bindings/go/dcgm/dcgm_errors.h | 474 - bindings/go/dcgm/dcgm_fields.h | 2249 ----- bindings/go/dcgm/dcgm_structs.h | 2958 ------- bindings/go/dcgm/dcgm_test.go | 189 - bindings/go/dcgm/device_info.go | 196 - bindings/go/dcgm/device_status.go | 179 - bindings/go/dcgm/fields.go | 257 - bindings/go/dcgm/go.mod | 3 - bindings/go/dcgm/gpu_group.go | 67 - bindings/go/dcgm/health.go | 121 - bindings/go/dcgm/hostengine_status.go | 49 - bindings/go/dcgm/mig.go | 89 - bindings/go/dcgm/policy.go | 419 - bindings/go/dcgm/process_info.go | 203 - bindings/go/dcgm/profile.go | 47 - bindings/go/dcgm/topology.go | 136 - bindings/go/dcgm/utils.go | 148 - bindings/go/nvml/bindings.go | 859 -- bindings/go/nvml/mig.go | 423 - bindings/go/nvml/mig_test.go | 122 - bindings/go/nvml/nvml.go | 822 -- bindings/go/nvml/nvml.h | 7603 ----------------- bindings/go/nvml/nvml_dl.go | 65 - bindings/go/nvml/nvml_dl_windows.go | 57 - bindings/go/nvml/nvml_test.go | 218 - bindings/go/nvml/nvsmi/nvsmi.go | 43 - bindings/go/samples/dcgm/README.md | 192 - bindings/go/samples/dcgm/deviceInfo/main.go | 78 - bindings/go/samples/dcgm/dmon/main.go | 57 - bindings/go/samples/dcgm/health/main.go | 64 - .../go/samples/dcgm/hostengineStatus/main.go | 25 - bindings/go/samples/dcgm/policy/main.go | 45 - bindings/go/samples/dcgm/processInfo/main.go | 84 - bindings/go/samples/dcgm/restApi/README.md | 106 - .../go/samples/dcgm/restApi/handlers/byIds.go | 65 
- .../samples/dcgm/restApi/handlers/byUuids.go | 65 - .../go/samples/dcgm/restApi/handlers/dcgm.go | 136 - .../go/samples/dcgm/restApi/handlers/utils.go | 183 - bindings/go/samples/dcgm/restApi/main.go | 35 - bindings/go/samples/dcgm/restApi/server.go | 88 - bindings/go/samples/dcgm/topology/main.go | 70 - bindings/go/samples/nvml/README.md | 72 - bindings/go/samples/nvml/deviceInfo/main.go | 60 - bindings/go/samples/nvml/dmon/main.go | 60 - bindings/go/samples/nvml/processInfo/main.go | 64 - deployment/dcgm-exporter/Chart.yaml | 4 +- docker/Dockerfile.ubi8 | 4 +- docker/Dockerfile.ubuntu18.04 | 37 - docker/Dockerfile.ubuntu20.04 | 4 +- go.mod | 28 +- pkg/dcgm.go | 2 +- pkg/go.mod | 3 +- pkg/go.sum | 13 +- pkg/gpu_collector.go | 2 +- pkg/gpu_collector_test.go | 5 +- pkg/kubernetes_test.go | 4 +- pkg/main.go | 2 +- pkg/parser.go | 2 +- pkg/pipeline_test.go | 2 +- pkg/system_info.go | 2 +- pkg/system_info_test.go | 2 +- pkg/types.go | 2 +- tests/variables.tfvars | 2 +- .../github.com/Masterminds/semver/.travis.yml | 29 - .../Masterminds/semver/CHANGELOG.md | 109 - .../github.com/Masterminds/semver/LICENSE.txt | 19 - vendor/github.com/Masterminds/semver/Makefile | 36 - .../github.com/Masterminds/semver/README.md | 194 - .../Masterminds/semver/appveyor.yml | 44 - .../Masterminds/semver/collection.go | 24 - .../Masterminds/semver/constraints.go | 423 - vendor/github.com/Masterminds/semver/doc.go | 115 - .../github.com/Masterminds/semver/version.go | 425 - .../Masterminds/semver/version_fuzz.go | 10 - .../bindings/go/dcgm/admin.go | 318 - .../bindings/go/dcgm/api.go | 108 - .../bindings/go/dcgm/bcast.go | 84 - .../bindings/go/dcgm/callback.c | 4 - .../bindings/go/dcgm/const.go | 791 -- .../bindings/go/dcgm/dcgm_agent.h | 2033 ----- .../bindings/go/dcgm/dcgm_errors.h | 474 - .../bindings/go/dcgm/dcgm_fields.h | 2249 ----- .../bindings/go/dcgm/dcgm_structs.h | 2958 ------- .../bindings/go/dcgm/device_info.go | 196 - .../bindings/go/dcgm/device_status.go | 179 - 
.../bindings/go/dcgm/fields.go | 257 - .../bindings/go/dcgm/go.mod | 3 - .../bindings/go/dcgm/gpu_group.go | 67 - .../bindings/go/dcgm/health.go | 121 - .../bindings/go/dcgm/hostengine_status.go | 49 - .../bindings/go/dcgm/mig.go | 89 - .../bindings/go/dcgm/policy.go | 419 - .../bindings/go/dcgm/process_info.go | 203 - .../bindings/go/dcgm/profile.go | 47 - .../bindings/go/dcgm/topology.go | 136 - .../bindings/go/dcgm/utils.go | 148 - vendor/github.com/gorilla/mux/AUTHORS | 8 - vendor/github.com/gorilla/mux/LICENSE | 27 - vendor/github.com/gorilla/mux/README.md | 805 -- vendor/github.com/gorilla/mux/doc.go | 306 - vendor/github.com/gorilla/mux/go.mod | 3 - vendor/github.com/gorilla/mux/middleware.go | 74 - vendor/github.com/gorilla/mux/mux.go | 607 -- vendor/github.com/gorilla/mux/regexp.go | 382 - vendor/github.com/gorilla/mux/route.go | 736 -- vendor/github.com/gorilla/mux/test_helpers.go | 19 - vendor/modules.txt | 31 - 121 files changed, 47 insertions(+), 38387 deletions(-) delete mode 160000 aws-kube-ci delete mode 100644 bindings/go/dcgm/admin.go delete mode 100644 bindings/go/dcgm/api.go delete mode 100644 bindings/go/dcgm/bcast.go delete mode 100644 bindings/go/dcgm/callback.c delete mode 100644 bindings/go/dcgm/const.go delete mode 100644 bindings/go/dcgm/dcgm_agent.h delete mode 100644 bindings/go/dcgm/dcgm_errors.h delete mode 100644 bindings/go/dcgm/dcgm_fields.h delete mode 100644 bindings/go/dcgm/dcgm_structs.h delete mode 100644 bindings/go/dcgm/dcgm_test.go delete mode 100644 bindings/go/dcgm/device_info.go delete mode 100644 bindings/go/dcgm/device_status.go delete mode 100644 bindings/go/dcgm/fields.go delete mode 100644 bindings/go/dcgm/go.mod delete mode 100644 bindings/go/dcgm/gpu_group.go delete mode 100644 bindings/go/dcgm/health.go delete mode 100644 bindings/go/dcgm/hostengine_status.go delete mode 100644 bindings/go/dcgm/mig.go delete mode 100644 bindings/go/dcgm/policy.go delete mode 100644 bindings/go/dcgm/process_info.go delete mode 
100644 bindings/go/dcgm/profile.go delete mode 100644 bindings/go/dcgm/topology.go delete mode 100644 bindings/go/dcgm/utils.go delete mode 100644 bindings/go/nvml/bindings.go delete mode 100644 bindings/go/nvml/mig.go delete mode 100644 bindings/go/nvml/mig_test.go delete mode 100644 bindings/go/nvml/nvml.go delete mode 100644 bindings/go/nvml/nvml.h delete mode 100644 bindings/go/nvml/nvml_dl.go delete mode 100644 bindings/go/nvml/nvml_dl_windows.go delete mode 100644 bindings/go/nvml/nvml_test.go delete mode 100644 bindings/go/nvml/nvsmi/nvsmi.go delete mode 100644 bindings/go/samples/dcgm/README.md delete mode 100644 bindings/go/samples/dcgm/deviceInfo/main.go delete mode 100644 bindings/go/samples/dcgm/dmon/main.go delete mode 100644 bindings/go/samples/dcgm/health/main.go delete mode 100644 bindings/go/samples/dcgm/hostengineStatus/main.go delete mode 100644 bindings/go/samples/dcgm/policy/main.go delete mode 100644 bindings/go/samples/dcgm/processInfo/main.go delete mode 100644 bindings/go/samples/dcgm/restApi/README.md delete mode 100644 bindings/go/samples/dcgm/restApi/handlers/byIds.go delete mode 100644 bindings/go/samples/dcgm/restApi/handlers/byUuids.go delete mode 100644 bindings/go/samples/dcgm/restApi/handlers/dcgm.go delete mode 100644 bindings/go/samples/dcgm/restApi/handlers/utils.go delete mode 100644 bindings/go/samples/dcgm/restApi/main.go delete mode 100644 bindings/go/samples/dcgm/restApi/server.go delete mode 100644 bindings/go/samples/dcgm/topology/main.go delete mode 100644 bindings/go/samples/nvml/README.md delete mode 100644 bindings/go/samples/nvml/deviceInfo/main.go delete mode 100644 bindings/go/samples/nvml/dmon/main.go delete mode 100644 bindings/go/samples/nvml/processInfo/main.go delete mode 100644 docker/Dockerfile.ubuntu18.04 delete mode 100644 vendor/github.com/Masterminds/semver/.travis.yml delete mode 100644 vendor/github.com/Masterminds/semver/CHANGELOG.md delete mode 100644 vendor/github.com/Masterminds/semver/LICENSE.txt 
delete mode 100644 vendor/github.com/Masterminds/semver/Makefile delete mode 100644 vendor/github.com/Masterminds/semver/README.md delete mode 100644 vendor/github.com/Masterminds/semver/appveyor.yml delete mode 100644 vendor/github.com/Masterminds/semver/collection.go delete mode 100644 vendor/github.com/Masterminds/semver/constraints.go delete mode 100644 vendor/github.com/Masterminds/semver/doc.go delete mode 100644 vendor/github.com/Masterminds/semver/version.go delete mode 100644 vendor/github.com/Masterminds/semver/version_fuzz.go delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/admin.go delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/api.go delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/bcast.go delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/callback.c delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/const.go delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/dcgm_agent.h delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/dcgm_errors.h delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/dcgm_fields.h delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/dcgm_structs.h delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/device_info.go delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/device_status.go delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/fields.go delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/go.mod delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/gpu_group.go delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/health.go delete mode 100644 
vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/hostengine_status.go delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/mig.go delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/policy.go delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/process_info.go delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/profile.go delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/topology.go delete mode 100644 vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/utils.go delete mode 100644 vendor/github.com/gorilla/mux/AUTHORS delete mode 100644 vendor/github.com/gorilla/mux/LICENSE delete mode 100644 vendor/github.com/gorilla/mux/README.md delete mode 100644 vendor/github.com/gorilla/mux/doc.go delete mode 100644 vendor/github.com/gorilla/mux/go.mod delete mode 100644 vendor/github.com/gorilla/mux/middleware.go delete mode 100644 vendor/github.com/gorilla/mux/mux.go delete mode 100644 vendor/github.com/gorilla/mux/regexp.go delete mode 100644 vendor/github.com/gorilla/mux/route.go delete mode 100644 vendor/github.com/gorilla/mux/test_helpers.go delete mode 100644 vendor/modules.txt diff --git a/.github/PR_TEMPLATE.md b/.github/PR_TEMPLATE.md index 417b07bf..055516d4 100644 --- a/.github/PR_TEMPLATE.md +++ b/.github/PR_TEMPLATE.md @@ -1,9 +1,9 @@ -**Please open your pull requests on [gitlab repository](https://gitlab.com/nvidia/container-toolkit/gpu-monitoring-tools.git) ** +**Please open your pull requests on [gitlab repository](https://gitlab.com/nvidia/dcgm-exporter.git) ** Make sure to complete the following items:_ - _A reference to a related issue._ - _A small description of the changes proposed in the pull request._ - _One commit per change and descriptive commit messages._ -- _Sign-off your work following these 
[guidelines](https://gitlab.com/nvidia/container-toolkit/gpu-monitoring-tools/blob/master/CONTRIBUTING.md) ._ +- _Sign-off your work following these [guidelines](https://gitlab.com/nvidia/dcgm-exporter/blob/master/CONTRIBUTING.md) ._ - _Test run of your changes._ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6f4c4128..a5162c6f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -41,7 +41,7 @@ e2e: - ssh -i aws-kube-ci/key ${instance_hostname} \ "export CI_COMMIT_SHORT_SHA=${CI_COMMIT_SHORT_SHA} && export CI_REGISTRY_IMAGE=${CI_REGISTRY_IMAGE} && - cd ~/gpu-monitoring-tools && sudo -E ./tests/ci-run-e2e.sh" + cd ~/dcgm-exporter && sudo -E ./tests/ci-run-e2e.sh" aws_kube_clean: extends: .aws_kube_clean diff --git a/.gitmodules b/.gitmodules index fe211826..e69de29b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "aws-kube-ci"] - path = aws-kube-ci - url = https://gitlab.com/nvidia/container-infrastructure/aws-kube-ci.git diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index dfeb8209..73ebbc06 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,6 @@ -# Contribute to the GPU Operator Project +# Contribute to the DCGM-Exporter Project -Want to hack on the NVIDIA Container Toolkit Project? Awesome! +Want to hack on the NVIDIA DCGM-Exporter Project? Awesome! We only require you to sign your work, the below section describes this! 
## Sign your work diff --git a/Makefile b/Makefile index 10ddb3a5..73d00dae 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,7 @@ NON_TEST_FILES := pkg/dcgm.go pkg/gpu_collector.go pkg/parser.go pkg/pipeline.g MAIN_TEST_FILES := pkg/system_info_test.go .PHONY: all binary install check-format -all: ubuntu18.04 ubuntu20.04 ubi8 +all: ubuntu20.04 ubi8 binary: cd pkg; go build @@ -43,21 +43,8 @@ check-format: push: $(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu20.04" - $(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" $(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubi8" -push-short: - $(DOCKER) tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:$(DCGM_VERSION)" - $(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(DCGM_VERSION)" - -push-ci: - $(DOCKER) tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:$(VERSION)" - $(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(VERSION)" - -push-latest: - $(DOCKER) tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:latest" - $(DOCKER) push "$(REGISTRY)/dcgm-exporter:latest" - ubuntu20.04: $(DOCKER) build --pull \ --build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \ @@ -65,13 +52,6 @@ ubuntu20.04: --tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu20.04" \ --file docker/Dockerfile.ubuntu20.04 . -ubuntu18.04: - $(DOCKER) build --pull \ - --build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \ - --build-arg "DCGM_VERSION=$(DCGM_VERSION)" \ - --tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" \ - --file docker/Dockerfile.ubuntu18.04 . - ubi8: $(DOCKER) build --pull \ --build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \ @@ -79,4 +59,3 @@ ubi8: --build-arg "VERSION=$(FULL_VERSION)" \ --tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubi8" \ --file docker/Dockerfile.ubi8 . 
- diff --git a/README.md b/README.md index 918ff925..9f6a0d29 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,12 @@ -# NVIDIA GPU Monitoring Tools +# DCGM-Exporter -This repository contains Golang bindings and DCGM-Exporter for gathering GPU telemetry in Kubernetes. - -**July 2021 - Update #1: The DCGM Go bindings have moved to github.com/NVIDIA/go-dcgm. The DCGM bindings in this repo are no longer maintained and will eventually be removed. - -**June 2021 - NOTICE: Some of the tools in this repository are graduating to their own repos. In the next few weeks both the DCGM Go bindings and the DCGM Exporter will be migrating to github.com/NVIDIA. This will allow for independent versioning, issues, MRs, etc. Efforts will be made to review the existing MRs and issues before the migration occurs.** - -## Bindings - -Golang bindings are provided for the following two libraries: -- [NVIDIA Management Library (NVML)](https://docs.nvidia.com/deploy/nvml-api/nvml-api-reference.html#nvml-api-reference) is a C-based API for monitoring and managing NVIDIA GPU devices. -- [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/dcgm) is a set of tools for managing and monitoring NVIDIA GPUs in cluster environments. It's a low overhead tool suite that performs a variety of functions on each host system including active health monitoring, diagnostics, system validation, policies, power and clock management, group configuration and accounting. - -You will also find samples for both of these bindings in this repository. - -## DCGM-Exporter - -The repository also contains DCGM-Exporter. It exposes GPU metrics exporter for [Prometheus](https://prometheus.io/) leveraging [NVIDIA DCGM](https://developer.nvidia.com/dcgm). +This repository contains the DCGM-Exporter project. It provides a GPU metrics exporter for [Prometheus](https://prometheus.io/), leveraging [NVIDIA DCGM](https://developer.nvidia.com/dcgm). 
### Quickstart To gather metrics on a GPU node, simply start the `dcgm-exporter` container: ``` -$ docker run -d --gpus all --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:2.0.13-2.1.2-ubuntu18.04 +$ docker run -d --gpus all --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:2.2.9-2.4.0-ubuntu18.04 $ curl localhost:9400/metrics # HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). # TYPE DCGM_FI_DEV_SM_CLOCK gauge @@ -46,7 +30,7 @@ Ensure you have already setup your cluster with the [default runtime as NVIDIA]( The recommended way to install DCGM-Exporter is to use the Helm chart: ``` $ helm repo add gpu-helm-charts \ - https://nvidia.github.io/gpu-monitoring-tools/helm-charts + https://nvidia.github.io/dcgm-exporter/helm-charts ``` Update the repo: ``` @@ -63,7 +47,7 @@ Once the `dcgm-exporter` pod is deployed, you can use port forwarding to obtain ``` -$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/gpu-monitoring-tools/master/dcgm-exporter.yaml +$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/dcgm-exporter/master/dcgm-exporter.yaml # Let's get the output of a random pod: $ NAME=$(kubectl get pods -l "app.kubernetes.io/name=dcgm-exporter" \ @@ -95,8 +79,8 @@ Ensure you have the following: - [DCGM installed](https://developer.nvidia.com/dcgm) ``` -$ git clone https://github.com/NVIDIA/gpu-monitoring-tools.git -$ cd gpu-monitoring-tools +$ git clone https://github.com/NVIDIA/dcgm-exporter.git +$ cd dcgm-exporter $ make binary $ sudo make install ... @@ -153,5 +137,5 @@ Pull requests are accepted! 
[Checkout the Contributing document!](CONTRIBUTING.md) -* Please let us know by [filing a new issue](https://github.com/NVIDIA/gpu-monitoring-tools/issues/new) -* You can contribute by opening a [pull request](https://gitlab.com/nvidia/container-toolkit/gpu-monitoring-tools) +* Please let us know by [filing a new issue](https://github.com/NVIDIA/dcgm-exporter/issues/new) +* You can contribute by opening a [pull request](https://github.com/NVIDIA/dcgm-exporter) diff --git a/RELEASE.md b/RELEASE.md index acc8fb96..600a96ee 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,7 +1,6 @@ # Release This document, the release process as well as the versioning strategy for the DCGM exporter. -In the future this document will also contain information about the go bindings. ## Versioning diff --git a/aws-kube-ci b/aws-kube-ci deleted file mode 160000 index 49dd87a0..00000000 --- a/aws-kube-ci +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 49dd87a071db9c232670233445ff97f57dc2cee0 diff --git a/bindings/go/dcgm/admin.go b/bindings/go/dcgm/admin.go deleted file mode 100644 index 9fc27b10..00000000 --- a/bindings/go/dcgm/admin.go +++ /dev/null @@ -1,318 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package dcgm - -/* -#cgo linux LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files -#cgo darwin LDFLAGS: -ldl -Wl,-undefined,dynamic_lookup - - -#include -#include "./dcgm_agent.h" -#include "./dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "io/ioutil" - "log" - "os" - "os/exec" - "strconv" - "strings" - "syscall" - "unsafe" - - "github.com/Masterminds/semver" -) - -type mode int - -// const for DCGM hostengine running modes: Embedded, Standalone or StartHostengine -const ( - Embedded mode = iota - Standalone - StartHostengine -) - -type dcgmHandle struct{ handle C.dcgmHandle_t } - -var ( - dcgmLibHandle unsafe.Pointer - stopMode mode - handle dcgmHandle - hostengineAsChildPid int -) - -func initDcgm(m mode, args ...string) (err error) { - const ( - dcgmLib = "libdcgm.so" - ) - lib := C.CString(dcgmLib) - defer freeCString(lib) - - dcgmLibHandle = C.dlopen(lib, C.RTLD_LAZY|C.RTLD_GLOBAL) - if dcgmLibHandle == nil { - return fmt.Errorf("%s not Found", dcgmLib) - } - - // set the stopMode for shutdown() - stopMode = m - - switch m { - case Embedded: - return startEmbedded() - case Standalone: - return connectStandalone(args...) 
- case StartHostengine: - return startHostengine() - } - - return nil -} - -func shutdown() (err error) { - switch stopMode { - case Embedded: - err = stopEmbedded() - case Standalone: - err = disconnectStandalone() - case StartHostengine: - err = stopHostengine() - } - - C.dlclose(dcgmLibHandle) - return -} - -func startEmbedded() (err error) { - result := C.dcgmInit() - if err = errorString(result); err != nil { - return fmt.Errorf("Error initializing DCGM: %s", err) - } - - var cHandle C.dcgmHandle_t - result = C.dcgmStartEmbedded(C.DCGM_OPERATION_MODE_AUTO, &cHandle) - if err = errorString(result); err != nil { - return fmt.Errorf("Error starting nv-hostengine: %s", err) - } - handle = dcgmHandle{cHandle} - return -} - -func stopEmbedded() (err error) { - result := C.dcgmStopEmbedded(handle.handle) - if err = errorString(result); err != nil { - return fmt.Errorf("Error stopping nv-hostengine: %s", err) - } - - result = C.dcgmShutdown() - if err = errorString(result); err != nil { - return fmt.Errorf("Error shutting down DCGM: %s", err) - } - return -} - -func connectStandalone(args ...string) (err error) { - if len(args) < 2 { - return fmt.Errorf("Missing dcgm address and / or port") - } - - result := C.dcgmInit() - if err = errorString(result); err != nil { - return fmt.Errorf("Error initializing DCGM: %s", err) - } - - var cHandle C.dcgmHandle_t - addr := C.CString(args[0]) - defer freeCString(addr) - var connectParams C.dcgmConnectV2Params_t - connectParams.version = makeVersion2(unsafe.Sizeof(connectParams)) - - sck, err := strconv.ParseUint(args[1], 10, 32) - if err != nil { - return fmt.Errorf("Error parsing %s: %v\n", args[1], err) - } - connectParams.addressIsUnixSocket = C.uint(sck) - - result = C.dcgmConnect_v2(addr, &connectParams, &cHandle) - if err = errorString(result); err != nil { - return fmt.Errorf("Error connecting to nv-hostengine: %s", err) - } - - handle = dcgmHandle{cHandle} - - // This check is disabled for now - /* - err = 
checkHostengineVersion() - if err != nil { - return fmt.Errorf("Error connecting to remote nv-hostengine: %s", err) - } - */ - - return -} - -func disconnectStandalone() (err error) { - result := C.dcgmDisconnect(handle.handle) - if err = errorString(result); err != nil { - return fmt.Errorf("Error disconnecting from nv-hostengine: %s", err) - } - - result = C.dcgmShutdown() - if err = errorString(result); err != nil { - return fmt.Errorf("Error shutting down DCGM: %s", err) - } - return -} - -func startHostengine() (err error) { - bin, err := exec.LookPath("nv-hostengine") - if err != nil { - return fmt.Errorf("Error finding nv-hostengine: %s", err) - } - var procAttr syscall.ProcAttr - procAttr.Files = []uintptr{ - uintptr(syscall.Stdin), - uintptr(syscall.Stdout), - uintptr(syscall.Stderr)} - procAttr.Sys = &syscall.SysProcAttr{Setpgid: true} - - dir := "/tmp" - tmpfile, err := ioutil.TempFile(dir, "dcgm") - if err != nil { - return fmt.Errorf("Error creating temporary file in %s directory: %s", dir, err) - } - socketPath := tmpfile.Name() - defer os.Remove(socketPath) - - connectArg := "--domain-socket" - hostengineAsChildPid, err = syscall.ForkExec(bin, []string{bin, connectArg, socketPath}, &procAttr) - if err != nil { - return fmt.Errorf("Error fork-execing nv-hostengine: %s", err) - } - - result := C.dcgmInit() - if err = errorString(result); err != nil { - return fmt.Errorf("Error initializing DCGM: %s", err) - } - - var cHandle C.dcgmHandle_t - var connectParams C.dcgmConnectV2Params_t - connectParams.version = makeVersion2(unsafe.Sizeof(connectParams)) - isSocket := C.uint(1) - connectParams.addressIsUnixSocket = isSocket - cSockPath := C.CString(socketPath) - defer freeCString(cSockPath) - result = C.dcgmConnect_v2(cSockPath, &connectParams, &cHandle) - if err = errorString(result); err != nil { - return fmt.Errorf("Error connecting to nv-hostengine: %s", err) - } - - handle = dcgmHandle{cHandle} - return -} - -func stopHostengine() (err error) { - if 
err = disconnectStandalone(); err != nil { - return - } - - // terminate nv-hostengine - cmd := exec.Command("nv-hostengine", "--term") - if err = cmd.Run(); err != nil { - return fmt.Errorf("Error terminating nv-hostengine: %s", err) - } - log.Println("Successfully terminated nv-hostengine.") - - return syscall.Kill(hostengineAsChildPid, syscall.SIGKILL) -} - -func checkHostengineVersion() (err error) { - var hostEngineVersionInfo C.dcgmVersionInfo_t - hostEngineVersionInfo.version = makeVersion2(unsafe.Sizeof(hostEngineVersionInfo)) - result := C.dcgmHostengineVersionInfo(handle.handle, &hostEngineVersionInfo) - if err = errorString(result); err != nil { - return fmt.Errorf("Could not retrieve running hostengine version: %s", err) - } - - var versionInfo C.dcgmVersionInfo_t - versionInfo.version = makeVersion2(unsafe.Sizeof(versionInfo)) - result = C.dcgmVersionInfo(&versionInfo) - if err = errorString(result); err != nil { - return fmt.Errorf("Could not retrieve dcgm version: %s", err) - } - - /* Version string looks like: "version:2.1.2;arch:x86_64;buildtype:Debug; - * buildid:;builddate:2021-03-03;commit:v2.1.1-5-gc27ab30f;branch:master; - * buildplatform:Linux 5.4.0-66-generic #74~18.04.2-Ubuntu SMP Fri Feb 5 - * 11:17:31 UTC 2021 x86_64;;crc:bd60aadd63245021163ef008d0907ae7" - */ - heVersionStr := C.GoString(&hostEngineVersionInfo.rawBuildInfoString[0]) - myVersionStr := C.GoString(&versionInfo.rawBuildInfoString[0]) - var foundVersion = false - - he := strings.Split(heVersionStr, ";") - - // Find version pair within build information - for _, line := range he { - if strings.HasPrefix(line, "version:") { - heVersionStr = line - foundVersion = true - } - } - - if foundVersion == false { - return fmt.Errorf("Could not determine remote version") - } - - foundVersion = false - my := strings.Split(myVersionStr, ";") - - for _, line := range my { - if strings.HasPrefix(line, "version:") { - myVersionStr = line - foundVersion = true - } - } - - if foundVersion == 
false { - return fmt.Errorf("Could not determine local version") - } - - // Parse out version and compare - he = strings.Split(heVersionStr, ":") - my = strings.Split(myVersionStr, ":") - - if (len(he) != 2) && (len(my) != 2) { - return fmt.Errorf("Could not parse versions") - } - - heVersion, err := semver.NewVersion(he[1]) - if err != nil { - return fmt.Errorf("Could not determine remote version ", err) - } - myVersion, err := semver.NewVersion(my[1]) - if err != nil { - return fmt.Errorf("Could not determine local version ", err) - } - if heVersion.Major() != myVersion.Major() { - return fmt.Errorf("remote %v != local %v", he[1], my[1]) - } - - return -} diff --git a/bindings/go/dcgm/api.go b/bindings/go/dcgm/api.go deleted file mode 100644 index 05a446da..00000000 --- a/bindings/go/dcgm/api.go +++ /dev/null @@ -1,108 +0,0 @@ -package dcgm - -import ( - "fmt" - "os" - "sync" -) - -var ( - dcgmInitCounter int - mux sync.Mutex -) - -// Init starts DCGM, based on the user selected mode -// DCGM can be started in 3 differengt modes: -// 1. Embedded: Start hostengine within this process -// 2. Standalone: Connect to an already running nv-hostengine at the specified address -// Connection address can be passed as command line args: -connect "IP:PORT/Socket" -socket "isSocket" -// 3. StartHostengine: Open an Unix socket to start and connect to the nv-hostengine and terminate before exiting -func Init(m mode, args ...string) (cleanup func(), err error) { - mux.Lock() - if dcgmInitCounter < 0 { - count := fmt.Sprintf("%d", dcgmInitCounter) - err = fmt.Errorf("Shutdown() is called %s times, before Init()", count[1:]) - } - if dcgmInitCounter == 0 { - err = initDcgm(m, args...) 
- } - dcgmInitCounter += 1 - mux.Unlock() - - return func() { - if err := Shutdown(); err != nil { - fmt.Fprintf(os.Stderr, "Failed to shutdown DCGM with error: `%v`", err) - } - }, err -} - -// Shutdown stops DCGM and destroy all connections -func Shutdown() (err error) { - mux.Lock() - if dcgmInitCounter <= 0 { - err = fmt.Errorf("Init() needs to be called before Shutdown()") - } - if dcgmInitCounter == 1 { - err = shutdown() - } - dcgmInitCounter -= 1 - mux.Unlock() - - return -} - -// GetAllDeviceCount counts all GPUs on the system -func GetAllDeviceCount() (uint, error) { - return getAllDeviceCount() -} - -// GetSupportedDevices returns only DCGM supported GPUs -func GetSupportedDevices() ([]uint, error) { - return getSupportedDevices() -} - -// GetDeviceInfo describes the given device -func GetDeviceInfo(gpuId uint) (Device, error) { - return getDeviceInfo(gpuId) -} - -// GetDeviceStatus monitors GPU status including its power, memory and GPU utilization -func GetDeviceStatus(gpuId uint) (DeviceStatus, error) { - return latestValuesForDevice(gpuId) -} - -// GetDeviceTopology returns device topology corresponding to the gpuId -func GetDeviceTopology(gpuId uint) ([]P2PLink, error) { - return getDeviceTopology(gpuId) -} - -// WatchPidFields lets DCGM start recording stats for GPU process -// It needs to be called before calling GetProcessInfo -func WatchPidFields() (GroupHandle, error) { - return watchPidFields() -} - -// GetProcessInfo provides detailed per GPU stats for this process -func GetProcessInfo(group GroupHandle, pid uint) ([]ProcessInfo, error) { - return getProcessInfo(group, pid) -} - -// HealthCheckByGpuId monitors GPU health for any errors/failures/warnings -func HealthCheckByGpuId(gpuId uint) (DeviceHealth, error) { - return healthCheckByGpuId(gpuId) -} - -// Policy sets GPU usage and error policies and notifies in case of any violations via callback functions -func Policy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error) { - 
return registerPolicy(gpuId, typ...) -} - -// Introspect returns DCGM hostengine memory and CPU usage -func Introspect() (DcgmStatus, error) { - return introspect() -} - -// Get all of the profiling metric groups for a given GPU group. -func GetSupportedMetricGroups(grpid uint) ([]MetricGroup, error) { - return getSupportedMetricGroups(grpid) -} diff --git a/bindings/go/dcgm/bcast.go b/bindings/go/dcgm/bcast.go deleted file mode 100644 index 03ac70b1..00000000 --- a/bindings/go/dcgm/bcast.go +++ /dev/null @@ -1,84 +0,0 @@ -package dcgm - -import ( - "fmt" - "sync" -) - -type publisher struct { - publish chan interface{} - close chan bool - subscribers []*subscriber - subscriberLock sync.Mutex -} - -type subscriber struct { - read chan interface{} - close chan bool -} - -func newPublisher() *publisher { - pub := &publisher{ - publish: make(chan interface{}), - close: make(chan bool), - } - return pub -} - -func (p *publisher) subscriberList() []*subscriber { - p.subscriberLock.Lock() - defer p.subscriberLock.Unlock() - return p.subscribers[:] -} - -func (p *publisher) add() *subscriber { - p.subscriberLock.Lock() - defer p.subscriberLock.Unlock() - newSub := &subscriber{ - read: make(chan interface{}), - close: make(chan bool), - } - p.subscribers = append(p.subscribers, newSub) - return newSub -} - -func (p *publisher) remove(leaving *subscriber) error { - p.subscriberLock.Lock() - defer p.subscriberLock.Unlock() - subscriberIndex := -1 - for i, sub := range p.subscribers { - if sub == leaving { - subscriberIndex = i - break - } - } - if subscriberIndex == -1 { - return fmt.Errorf("Could not find subscriber") - } - go func() { leaving.close <- true }() - p.subscribers = append(p.subscribers[:subscriberIndex], p.subscribers[subscriberIndex+1:]...) 
- return nil -} - -func (p *publisher) send(val interface{}) { - p.publish <- val -} - -func (p *publisher) broadcast() { - for { - select { - case publishing := <-p.publish: - for _, sub := range p.subscriberList() { - go func(s *subscriber, val interface{}) { - s.read <- val - }(sub, publishing) - } - case <-p.close: - return - } - } -} - -func (p *publisher) closePublisher() { - p.close <- true -} diff --git a/bindings/go/dcgm/callback.c b/bindings/go/dcgm/callback.c deleted file mode 100644 index 5bc2fc2b..00000000 --- a/bindings/go/dcgm/callback.c +++ /dev/null @@ -1,4 +0,0 @@ -int violationNotify(void* p) { - int ViolationRegistration(void*); - return ViolationRegistration(p); -} diff --git a/bindings/go/dcgm/const.go b/bindings/go/dcgm/const.go deleted file mode 100644 index 92fdd925..00000000 --- a/bindings/go/dcgm/const.go +++ /dev/null @@ -1,791 +0,0 @@ -package dcgm - -import "C" - -type Short C.ushort - -type FieldValue_v1 struct { - Version uint - FieldId uint - FieldType uint - Status int - Ts int64 - Value [4096]byte -} - -type FieldValue_v2 struct { - Version uint - EntityGroupId Field_Entity_Group - EntityId uint - FieldId uint - FieldType uint - Status int - Ts int64 - Value [4096]byte - StringValue *string -} - -const ( - DCGM_FT_BINARY = uint('b') - DCGM_FT_DOUBLE = uint('d') - DCGM_FT_INT64 = uint('i') - DCGM_FT_STRING = uint('s') - DCGM_FT_TIMESTAMP = uint('t') - DCGM_FT_INT32_BLANK = int64(2147483632) - DCGM_FT_INT32_NOT_FOUND = int64(DCGM_FT_INT32_BLANK + 1) - DCGM_FT_INT32_NOT_SUPPORTED = int64(DCGM_FT_INT32_BLANK + 2) - DCGM_FT_INT32_NOT_PERMISSIONED = int64(DCGM_FT_INT32_BLANK + 3) - DCGM_FT_INT64_BLANK = int64(9223372036854775792) - DCGM_FT_INT64_NOT_FOUND = int64(DCGM_FT_INT64_BLANK + 1) - DCGM_FT_INT64_NOT_SUPPORTED = int64(DCGM_FT_INT64_BLANK + 2) - DCGM_FT_INT64_NOT_PERMISSIONED = int64(DCGM_FT_INT64_BLANK + 3) - DCGM_FT_FP64_BLANK = 140737488355328.0 - DCGM_FT_FP64_NOT_FOUND = float64(DCGM_FT_FP64_BLANK + 1.0) - 
DCGM_FT_FP64_NOT_SUPPORTED = float64(DCGM_FT_FP64_BLANK + 2.0) - DCGM_FT_FP64_NOT_PERMISSIONED = float64(DCGM_FT_FP64_BLANK + 3.0) - DCGM_FT_STR_BLANK = "<<>>" - DCGM_FT_STR_NOT_FOUND = "<<>>" - DCGM_FT_STR_NOT_SUPPORTED = "<<>>" - DCGM_FT_STR_NOT_PERMISSIONED = "<<>>" - - DCGM_FI_UNKNOWN = 0 - DCGM_FI_DRIVER_VERSION = 1 - DCGM_FI_NVML_VERSION = 2 - DCGM_FI_PROCESS_NAME = 3 - DCGM_FI_DEV_COUNT = 4 - DCGM_FI_DEV_NAME = 50 - DCGM_FI_DEV_BRAND = 51 - DCGM_FI_DEV_NVML_INDEX = 52 - DCGM_FI_DEV_SERIAL = 53 - DCGM_FI_DEV_UUID = 54 - DCGM_FI_DEV_MINOR_NUMBER = 55 - DCGM_FI_DEV_OEM_INFOROM_VER = 56 - DCGM_FI_DEV_PCI_BUSID = 57 - DCGM_FI_DEV_PCI_COMBINED_ID = 58 - DCGM_FI_DEV_PCI_SUBSYS_ID = 59 - DCGM_FI_GPU_TOPOLOGY_PCI = 60 - DCGM_FI_GPU_TOPOLOGY_NVLINK = 61 - DCGM_FI_GPU_TOPOLOGY_AFFINITY = 62 - DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY = 63 - DCGM_FI_DEV_COMPUTE_MODE = 65 - DCGM_FI_DEV_CPU_AFFINITY_0 = 70 - DCGM_FI_DEV_CPU_AFFINITY_1 = 71 - DCGM_FI_DEV_CPU_AFFINITY_2 = 72 - DCGM_FI_DEV_CPU_AFFINITY_3 = 73 - DCGM_FI_DEV_ECC_INFOROM_VER = 80 - DCGM_FI_DEV_POWER_INFOROM_VER = 81 - DCGM_FI_DEV_INFOROM_IMAGE_VER = 82 - DCGM_FI_DEV_INFOROM_CONFIG_CHECK = 83 - DCGM_FI_DEV_INFOROM_CONFIG_VALID = 84 - DCGM_FI_DEV_VBIOS_VERSION = 85 - DCGM_FI_DEV_BAR1_TOTAL = 90 - DCGM_FI_SYNC_BOOST = 91 - DCGM_FI_DEV_BAR1_USED = 92 - DCGM_FI_DEV_BAR1_FREE = 93 - DCGM_FI_DEV_SM_CLOCK = 100 - DCGM_FI_DEV_MEM_CLOCK = 101 - DCGM_FI_DEV_VIDEO_CLOCK = 102 - DCGM_FI_DEV_APP_SM_CLOCK = 110 - DCGM_FI_DEV_APP_MEM_CLOCK = 111 - DCGM_FI_DEV_CLOCK_THROTTLE_REASONS = 112 - DCGM_FI_DEV_MAX_SM_CLOCK = 113 - DCGM_FI_DEV_MAX_MEM_CLOCK = 114 - DCGM_FI_DEV_MAX_VIDEO_CLOCK = 115 - DCGM_FI_DEV_AUTOBOOST = 120 - DCGM_FI_DEV_SUPPORTED_CLOCKS = 130 - DCGM_FI_DEV_MEMORY_TEMP = 140 - DCGM_FI_DEV_GPU_TEMP = 150 - DCGM_FI_DEV_POWER_USAGE = 155 - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 156 - DCGM_FI_DEV_SLOWDOWN_TEMP = 158 - DCGM_FI_DEV_SHUTDOWN_TEMP = 159 - DCGM_FI_DEV_POWER_MGMT_LIMIT = 160 - DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN = 161 
- DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX = 162 - DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF = 163 - DCGM_FI_DEV_ENFORCED_POWER_LIMIT = 164 - DCGM_FI_DEV_PSTATE = 190 - DCGM_FI_DEV_FAN_SPEED = 191 - DCGM_FI_DEV_PCIE_TX_THROUGHPUT = 200 - DCGM_FI_DEV_PCIE_RX_THROUGHPUT = 201 - DCGM_FI_DEV_PCIE_REPLAY_COUNTER = 202 - DCGM_FI_DEV_GPU_UTIL = 203 - DCGM_FI_DEV_MEM_COPY_UTIL = 204 - DCGM_FI_DEV_ACCOUNTING_DATA = 205 - DCGM_FI_DEV_ENC_UTIL = 206 - DCGM_FI_DEV_DEC_UTIL = 207 - DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES = 210 - DCGM_FI_DEV_GPU_UTIL_SAMPLES = 211 - DCGM_FI_DEV_GRAPHICS_PIDS = 220 - DCGM_FI_DEV_COMPUTE_PIDS = 221 - DCGM_FI_DEV_XID_ERRORS = 230 - DCGM_FI_DEV_PCIE_MAX_LINK_GEN = 235 - DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH = 236 - DCGM_FI_DEV_PCIE_LINK_GEN = 237 - DCGM_FI_DEV_PCIE_LINK_WIDTH = 238 - DCGM_FI_DEV_POWER_VIOLATION = 240 - DCGM_FI_DEV_THERMAL_VIOLATION = 241 - DCGM_FI_DEV_SYNC_BOOST_VIOLATION = 242 - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION = 243 - DCGM_FI_DEV_LOW_UTIL_VIOLATION = 244 - DCGM_FI_DEV_RELIABILITY_VIOLATION = 245 - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION = 246 - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION = 247 - DCGM_FI_DEV_FB_TOTAL = 250 - DCGM_FI_DEV_FB_FREE = 251 - DCGM_FI_DEV_FB_USED = 252 - DCGM_FI_DEV_ECC_CURRENT = 300 - DCGM_FI_DEV_ECC_PENDING = 301 - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL = 310 - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL = 311 - DCGM_FI_DEV_ECC_SBE_AGG_TOTAL = 312 - DCGM_FI_DEV_ECC_DBE_AGG_TOTAL = 313 - DCGM_FI_DEV_ECC_SBE_VOL_L1 = 314 - DCGM_FI_DEV_ECC_DBE_VOL_L1 = 315 - DCGM_FI_DEV_ECC_SBE_VOL_L2 = 316 - DCGM_FI_DEV_ECC_DBE_VOL_L2 = 317 - DCGM_FI_DEV_ECC_SBE_VOL_DEV = 318 - DCGM_FI_DEV_ECC_DBE_VOL_DEV = 319 - DCGM_FI_DEV_ECC_SBE_VOL_REG = 320 - DCGM_FI_DEV_ECC_DBE_VOL_REG = 321 - DCGM_FI_DEV_ECC_SBE_VOL_TEX = 322 - DCGM_FI_DEV_ECC_DBE_VOL_TEX = 323 - DCGM_FI_DEV_ECC_SBE_AGG_L1 = 324 - DCGM_FI_DEV_ECC_DBE_AGG_L1 = 325 - DCGM_FI_DEV_ECC_SBE_AGG_L2 = 326 - DCGM_FI_DEV_ECC_DBE_AGG_L2 = 327 - DCGM_FI_DEV_ECC_SBE_AGG_DEV = 328 - DCGM_FI_DEV_ECC_DBE_AGG_DEV = 329 - 
DCGM_FI_DEV_ECC_SBE_AGG_REG = 330 - DCGM_FI_DEV_ECC_DBE_AGG_REG = 331 - DCGM_FI_DEV_ECC_SBE_AGG_TEX = 332 - DCGM_FI_DEV_ECC_DBE_AGG_TEX = 333 - DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS = 393 - DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS = 394 - DCGM_FI_DEV_ROW_REMAP_FAILURE = 395 - DCGM_FI_DEV_RETIRED_SBE = 390 - DCGM_FI_DEV_RETIRED_DBE = 391 - DCGM_FI_DEV_RETIRED_PENDING = 392 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = 400 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = 401 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = 402 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = 403 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = 404 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = 405 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 409 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = 410 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = 411 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = 412 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = 413 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = 414 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = 415 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 419 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = 420 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = 421 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = 422 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = 423 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = 424 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = 425 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL = 429 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 430 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 431 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 432 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 433 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 434 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 435 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 439 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 = 440 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 = 441 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 = 442 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 = 443 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 = 444 
- DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 = 445 - DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL = 449 - DCGM_FI_DEV_GPU_NVLINK_ERRORS = 450 - DCGM_FI_DEV_VIRTUAL_MODE = 500 - DCGM_FI_DEV_SUPPORTED_TYPE_INFO = 501 - DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS = 502 - DCGM_FI_DEV_VGPU_INSTANCE_IDS = 503 - DCGM_FI_DEV_VGPU_UTILIZATIONS = 504 - DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION = 505 - DCGM_FI_DEV_ENC_STATS = 506 - DCGM_FI_DEV_FBC_STATS = 507 - DCGM_FI_DEV_FBC_SESSIONS_INFO = 508 - DCGM_FI_DEV_VGPU_VM_ID = 520 - DCGM_FI_DEV_VGPU_VM_NAME = 521 - DCGM_FI_DEV_VGPU_TYPE = 522 - DCGM_FI_DEV_VGPU_UUID = 523 - DCGM_FI_DEV_VGPU_DRIVER_VERSION = 524 - DCGM_FI_DEV_VGPU_MEMORY_USAGE = 525 - DCGM_FI_DEV_VGPU_LICENSE_STATUS = 526 - DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT = 527 - DCGM_FI_DEV_VGPU_ENC_STATS = 528 - DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO = 529 - DCGM_FI_DEV_VGPU_FBC_STATS = 530 - DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO = 531 - DCGM_FI_FIRST_VGPU_FIELD_ID = 520 - DCGM_FI_LAST_VGPU_FIELD_ID = 570 - DCGM_FI_INTERNAL_FIELDS_0_START = 600 - DCGM_FI_INTERNAL_FIELDS_0_END = 699 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P00 = 700 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P00 = 701 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P00 = 702 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P00 = 703 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P01 = 704 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P01 = 705 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P01 = 706 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P01 = 707 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P02 = 708 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P02 = 709 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P02 = 710 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P02 = 711 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P03 = 712 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P03 = 713 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P03 = 714 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P03 = 715 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P04 = 716 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P04 = 717 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P04 = 718 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P04 = 719 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P05 = 720 
- DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P05 = 721 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P05 = 722 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P05 = 723 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P06 = 724 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P06 = 725 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P06 = 726 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P06 = 727 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P07 = 728 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P07 = 729 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P07 = 730 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P07 = 731 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P08 = 732 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P08 = 733 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P08 = 734 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P08 = 735 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P09 = 736 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P09 = 737 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P09 = 738 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P09 = 739 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P10 = 740 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P10 = 741 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P10 = 742 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P10 = 743 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P11 = 744 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P11 = 745 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P11 = 746 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P11 = 747 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P12 = 748 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P12 = 749 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P12 = 750 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P12 = 751 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P13 = 752 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P13 = 753 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P13 = 754 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P13 = 755 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P14 = 756 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P14 = 757 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P14 = 758 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P14 = 759 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P15 = 760 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P15 = 761 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P15 = 762 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P15 = 763 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P16 = 764 - 
DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P16 = 765 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P16 = 766 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P16 = 767 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P17 = 768 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P17 = 769 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P17 = 770 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P17 = 771 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00 = 780 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00 = 781 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P01 = 782 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P01 = 783 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P02 = 784 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P02 = 785 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P03 = 786 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P03 = 787 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P04 = 788 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P04 = 789 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P05 = 790 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P05 = 791 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P06 = 792 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P06 = 793 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P07 = 794 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P07 = 795 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P08 = 796 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P08 = 797 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P09 = 798 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P09 = 799 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P10 = 800 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P10 = 801 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P11 = 802 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P11 = 803 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P12 = 804 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P12 = 805 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P13 = 806 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P13 = 807 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P14 = 808 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P14 = 809 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P15 = 810 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P15 = 811 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P16 = 812 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P16 = 813 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P17 = 814 - 
DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P17 = 815 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00 = 820 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P00 = 821 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P01 = 822 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P01 = 823 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P02 = 824 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P02 = 825 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P03 = 826 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P03 = 827 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P04 = 828 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P04 = 829 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P05 = 830 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P05 = 831 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P06 = 832 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P06 = 833 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P07 = 834 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P07 = 835 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P08 = 836 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P08 = 837 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P09 = 838 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P09 = 839 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P10 = 840 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P10 = 841 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P11 = 842 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P11 = 843 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P12 = 844 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P12 = 845 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P13 = 846 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P13 = 847 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P14 = 848 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P14 = 849 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P15 = 850 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P15 = 851 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P16 = 852 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P16 = 853 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P17 = 854 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P17 = 855 - DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS = 856 - DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS = 857 - DCGM_FI_FIRST_NVSWITCH_FIELD_ID = 700 - DCGM_FI_LAST_NVSWITCH_FIELD_ID = 860 - DCGM_FI_PROF_GR_ENGINE_ACTIVE = 1001 - 
DCGM_FI_PROF_SM_ACTIVE = 1002 - DCGM_FI_PROF_SM_OCCUPANCY = 1003 - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE = 1004 - DCGM_FI_PROF_DRAM_ACTIVE = 1005 - DCGM_FI_PROF_PIPE_FP64_ACTIVE = 1006 - DCGM_FI_PROF_PIPE_FP32_ACTIVE = 1007 - DCGM_FI_PROF_PIPE_FP16_ACTIVE = 1008 - DCGM_FI_PROF_PCIE_TX_BYTES = 1009 - DCGM_FI_PROF_PCIE_RX_BYTES = 1010 - DCGM_FI_PROF_NVLINK_TX_BYTES = 1011 - DCGM_FI_PROF_NVLINK_RX_BYTES = 1012 - DCGM_FI_MAX_FIELDS = 1013 -) - -var ( - DCGM_FI = map[string]Short{ - "DCGM_FT_BINARY": Short('b'), - "DCGM_FT_DOUBLE": Short('d'), - "DCGM_FT_INT64": Short('i'), - "DCGM_FT_STRING": Short('s'), - "DCGM_FT_TIMESTAMP": Short('t'), - "DCGM_FI_UNKNOWN": 0, - "DCGM_FI_DRIVER_VERSION": 1, - "DCGM_FI_NVML_VERSION": 2, - "DCGM_FI_PROCESS_NAME": 3, - "DCGM_FI_DEV_COUNT": 4, - "DCGM_FI_DEV_NAME": 50, - "DCGM_FI_DEV_BRAND": 51, - "DCGM_FI_DEV_NVML_INDEX": 52, - "DCGM_FI_DEV_SERIAL": 53, - "DCGM_FI_DEV_UUID": 54, - "DCGM_FI_DEV_MINOR_NUMBER": 55, - "DCGM_FI_DEV_OEM_INFOROM_VER": 56, - "DCGM_FI_DEV_PCI_BUSID": 57, - "DCGM_FI_DEV_PCI_COMBINED_ID": 58, - "DCGM_FI_DEV_PCI_SUBSYS_ID": 59, - "DCGM_FI_GPU_TOPOLOGY_PCI": 60, - "DCGM_FI_GPU_TOPOLOGY_NVLINK": 61, - "DCGM_FI_GPU_TOPOLOGY_AFFINITY": 62, - "DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY": 6, - "DCGM_FI_DEV_COMPUTE_MODE": 65, - "DCGM_FI_DEV_CPU_AFFINITY_0": 70, - "DCGM_FI_DEV_CPU_AFFINITY_1": 71, - "DCGM_FI_DEV_CPU_AFFINITY_2": 72, - "DCGM_FI_DEV_CPU_AFFINITY_3": 73, - "DCGM_FI_DEV_ECC_INFOROM_VER": 80, - "DCGM_FI_DEV_POWER_INFOROM_VER": 81, - "DCGM_FI_DEV_INFOROM_IMAGE_VER": 82, - "DCGM_FI_DEV_INFOROM_CONFIG_CHECK": 83, - "DCGM_FI_DEV_INFOROM_CONFIG_VALID": 84, - "DCGM_FI_DEV_VBIOS_VERSION": 85, - "DCGM_FI_DEV_BAR1_TOTAL": 90, - "DCGM_FI_SYNC_BOOST": 91, - "DCGM_FI_DEV_BAR1_USED": 92, - "DCGM_FI_DEV_BAR1_FREE": 93, - "DCGM_FI_DEV_SM_CLOCK": 100, - "DCGM_FI_DEV_MEM_CLOCK": 101, - "DCGM_FI_DEV_VIDEO_CLOCK": 102, - "DCGM_FI_DEV_APP_SM_CLOCK": 110, - "DCGM_FI_DEV_APP_MEM_CLOCK": 111, - "DCGM_FI_DEV_CLOCK_THROTTLE_REASONS": 112, - 
"DCGM_FI_DEV_MAX_SM_CLOCK": 113, - "DCGM_FI_DEV_MAX_MEM_CLOCK": 114, - "DCGM_FI_DEV_MAX_VIDEO_CLOCK": 115, - "DCGM_FI_DEV_AUTOBOOST": 120, - "DCGM_FI_DEV_SUPPORTED_CLOCKS": 130, - "DCGM_FI_DEV_MEMORY_TEMP": 140, - "DCGM_FI_DEV_GPU_TEMP": 150, - "DCGM_FI_DEV_POWER_USAGE": 155, - "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": 156, - "DCGM_FI_DEV_SLOWDOWN_TEMP": 158, - "DCGM_FI_DEV_SHUTDOWN_TEMP": 159, - "DCGM_FI_DEV_POWER_MGMT_LIMIT": 160, - "DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN": 161, - "DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX": 162, - "DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF": 163, - "DCGM_FI_DEV_ENFORCED_POWER_LIMIT": 164, - "DCGM_FI_DEV_PSTATE": 190, - "DCGM_FI_DEV_FAN_SPEED": 191, - "DCGM_FI_DEV_PCIE_TX_THROUGHPUT": 200, - "DCGM_FI_DEV_PCIE_RX_THROUGHPUT": 201, - "DCGM_FI_DEV_PCIE_REPLAY_COUNTER": 202, - "DCGM_FI_DEV_GPU_UTIL": 203, - "DCGM_FI_DEV_MEM_COPY_UTIL": 204, - "DCGM_FI_DEV_ACCOUNTING_DATA": 205, - "DCGM_FI_DEV_ENC_UTIL": 206, - "DCGM_FI_DEV_DEC_UTIL": 207, - "DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES": 210, - "DCGM_FI_DEV_GPU_UTIL_SAMPLES": 211, - "DCGM_FI_DEV_GRAPHICS_PIDS": 220, - "DCGM_FI_DEV_COMPUTE_PIDS": 221, - "DCGM_FI_DEV_XID_ERRORS": 230, - "DCGM_FI_DEV_PCIE_MAX_LINK_GEN": 235, - "DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH": 236, - "DCGM_FI_DEV_PCIE_LINK_GEN": 237, - "DCGM_FI_DEV_PCIE_LINK_WIDTH": 238, - "DCGM_FI_DEV_POWER_VIOLATION": 240, - "DCGM_FI_DEV_THERMAL_VIOLATION": 241, - "DCGM_FI_DEV_SYNC_BOOST_VIOLATION": 242, - "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION": 243, - "DCGM_FI_DEV_LOW_UTIL_VIOLATION": 244, - "DCGM_FI_DEV_RELIABILITY_VIOLATION": 245, - "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION": 246, - "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION": 247, - "DCGM_FI_DEV_FB_TOTAL": 250, - "DCGM_FI_DEV_FB_FREE": 251, - "DCGM_FI_DEV_FB_USED": 252, - "DCGM_FI_DEV_ECC_CURRENT": 300, - "DCGM_FI_DEV_ECC_PENDING": 301, - "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": 310, - "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": 311, - "DCGM_FI_DEV_ECC_SBE_AGG_TOTAL": 312, - "DCGM_FI_DEV_ECC_DBE_AGG_TOTAL": 313, - 
"DCGM_FI_DEV_ECC_SBE_VOL_L1": 314, - "DCGM_FI_DEV_ECC_DBE_VOL_L1": 315, - "DCGM_FI_DEV_ECC_SBE_VOL_L2": 316, - "DCGM_FI_DEV_ECC_DBE_VOL_L2": 317, - "DCGM_FI_DEV_ECC_SBE_VOL_DEV": 318, - "DCGM_FI_DEV_ECC_DBE_VOL_DEV": 319, - "DCGM_FI_DEV_ECC_SBE_VOL_REG": 320, - "DCGM_FI_DEV_ECC_DBE_VOL_REG": 321, - "DCGM_FI_DEV_ECC_SBE_VOL_TEX": 322, - "DCGM_FI_DEV_ECC_DBE_VOL_TEX": 323, - "DCGM_FI_DEV_ECC_SBE_AGG_L1": 324, - "DCGM_FI_DEV_ECC_DBE_AGG_L1": 325, - "DCGM_FI_DEV_ECC_SBE_AGG_L2": 326, - "DCGM_FI_DEV_ECC_DBE_AGG_L2": 327, - "DCGM_FI_DEV_ECC_SBE_AGG_DEV": 328, - "DCGM_FI_DEV_ECC_DBE_AGG_DEV": 329, - "DCGM_FI_DEV_ECC_SBE_AGG_REG": 330, - "DCGM_FI_DEV_ECC_DBE_AGG_REG": 331, - "DCGM_FI_DEV_ECC_SBE_AGG_TEX": 332, - "DCGM_FI_DEV_ECC_DBE_AGG_TEX": 333, - "DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS": 393, - "DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS": 394, - "DCGM_FI_DEV_ROW_REMAP_FAILURE": 395, - "DCGM_FI_DEV_RETIRED_SBE": 390, - "DCGM_FI_DEV_RETIRED_DBE": 391, - "DCGM_FI_DEV_RETIRED_PENDING": 392, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0": 400, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1": 401, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2": 402, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3": 403, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4": 404, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5": 405, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL": 409, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0": 410, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1": 411, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2": 412, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3": 413, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4": 414, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5": 415, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL": 419, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0": 420, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1": 421, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2": 422, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3": 423, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4": 424, - 
"DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5": 425, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL": 429, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0": 430, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1": 431, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2": 432, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3": 433, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4": 434, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5": 435, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL": 439, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L0": 440, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L1": 441, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L2": 442, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L3": 443, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L4": 444, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L5": 445, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL": 449, - "DCGM_FI_DEV_GPU_NVLINK_ERRORS": 450, - "DCGM_FI_DEV_VIRTUAL_MODE": 500, - "DCGM_FI_DEV_SUPPORTED_TYPE_INFO": 501, - "DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS": 502, - "DCGM_FI_DEV_VGPU_INSTANCE_IDS": 503, - "DCGM_FI_DEV_VGPU_UTILIZATIONS": 504, - "DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION": 505, - "DCGM_FI_DEV_ENC_STATS": 506, - "DCGM_FI_DEV_FBC_STATS": 507, - "DCGM_FI_DEV_FBC_SESSIONS_INFO": 508, - "DCGM_FI_DEV_VGPU_VM_ID": 520, - "DCGM_FI_DEV_VGPU_VM_NAME": 521, - "DCGM_FI_DEV_VGPU_TYPE": 522, - "DCGM_FI_DEV_VGPU_UUID": 523, - "DCGM_FI_DEV_VGPU_DRIVER_VERSION": 524, - "DCGM_FI_DEV_VGPU_MEMORY_USAGE": 525, - "DCGM_FI_DEV_VGPU_LICENSE_STATUS": 526, - "DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT": 527, - "DCGM_FI_DEV_VGPU_ENC_STATS": 528, - "DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO": 529, - "DCGM_FI_DEV_VGPU_FBC_STATS": 530, - "DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO": 531, - "DCGM_FI_FIRST_VGPU_FIELD_ID": 520, - "DCGM_FI_LAST_VGPU_FIELD_ID": 570, - "DCGM_FI_INTERNAL_FIELDS_0_START": 600, - "DCGM_FI_INTERNAL_FIELDS_0_END": 699, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P00": 700, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P00": 701, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P00": 702, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P00": 703, - 
"DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P01": 704, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P01": 705, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P01": 706, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P01": 707, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P02": 708, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P02": 709, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P02": 710, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P02": 711, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P03": 712, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P03": 713, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P03": 714, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P03": 715, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P04": 716, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P04": 717, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P04": 718, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P04": 719, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P05": 720, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P05": 721, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P05": 722, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P05": 723, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P06": 724, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P06": 725, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P06": 726, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P06": 727, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P07": 728, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P07": 729, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P07": 730, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P07": 731, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P08": 732, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P08": 733, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P08": 734, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P08": 735, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P09": 736, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P09": 737, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P09": 738, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P09": 739, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P10": 740, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P10": 741, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P10": 742, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P10": 743, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P11": 744, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P11": 745, - 
"DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P11": 746, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P11": 747, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P12": 748, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P12": 749, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P12": 750, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P12": 751, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P13": 752, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P13": 753, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P13": 754, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P13": 755, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P14": 756, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P14": 757, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P14": 758, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P14": 759, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P15": 760, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P15": 761, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P15": 762, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P15": 763, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P16": 764, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P16": 765, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P16": 766, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P16": 767, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P17": 768, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P17": 769, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P17": 770, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P17": 771, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00": 780, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00": 781, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P01": 782, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P01": 783, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P02": 784, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P02": 785, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P03": 786, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P03": 787, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P04": 788, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P04": 789, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P05": 790, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P05": 791, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P06": 792, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P06": 793, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P07": 794, - 
"DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P07": 795, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P08": 796, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P08": 797, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P09": 798, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P09": 799, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P10": 800, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P10": 801, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P11": 802, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P11": 803, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P12": 804, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P12": 805, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P13": 806, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P13": 807, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P14": 808, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P14": 809, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P15": 810, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P15": 811, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P16": 812, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P16": 813, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P17": 814, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P17": 815, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00": 820, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P00": 821, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P01": 822, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P01": 823, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P02": 824, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P02": 825, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P03": 826, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P03": 827, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P04": 828, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P04": 829, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P05": 830, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P05": 831, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P06": 832, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P06": 833, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P07": 834, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P07": 835, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P08": 836, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P08": 837, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P09": 838, - 
"DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P09": 839, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P10": 840, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P10": 841, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P11": 842, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P11": 843, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P12": 844, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P12": 845, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P13": 846, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P13": 847, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P14": 848, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P14": 849, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P15": 850, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P15": 851, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P16": 852, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P16": 853, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P17": 854, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P17": 855, - "DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS": 856, - "DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS": 857, - "DCGM_FI_FIRST_NVSWITCH_FIELD_ID": 700, - "DCGM_FI_LAST_NVSWITCH_FIELD_ID": 860, - "DCGM_FI_PROF_GR_ENGINE_ACTIVE": 1001, - "DCGM_FI_PROF_SM_ACTIVE": 1002, - "DCGM_FI_PROF_SM_OCCUPANCY": 1003, - "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": 1004, - "DCGM_FI_PROF_DRAM_ACTIVE": 1005, - "DCGM_FI_PROF_PIPE_FP64_ACTIVE": 1006, - "DCGM_FI_PROF_PIPE_FP32_ACTIVE": 1007, - "DCGM_FI_PROF_PIPE_FP16_ACTIVE": 1008, - "DCGM_FI_PROF_PCIE_TX_BYTES": 1009, - "DCGM_FI_PROF_PCIE_RX_BYTES": 1010, - "DCGM_FI_PROF_NVLINK_TX_BYTES": 1011, - "DCGM_FI_PROF_NVLINK_RX_BYTES": 1012, - "DCGM_FI_MAX_FIELDS": 1013, - } -) - -var ( - OLD_DCGM_FI = map[string]Short{ - "dcgm_sm_clock": 100, - "dcgm_memory_clock": 101, - "dcgm_memory_temp": 140, - "dcgm_gpu_temp": 150, - "dcgm_power_usage": 155, - "dcgm_total_energy_consumption": 156, - "dcgm_pcie_tx_throughput": 200, - "dcgm_pcie_rx_throughput": 201, - "dcgm_pcie_replay_counter": 202, - "dcgm_gpu_utilization": 203, - "dcgm_mem_copy_utilization": 204, - "dcgm_enc_utilization": 206, - "dcgm_dec_utilization": 207, - "dcgm_xid_errors": 
230, - "dcgm_power_violation": 240, - "dcgm_thermal_violation": 241, - "dcgm_sync_boost_violation": 242, - "dcgm_board_limit_violation": 243, - "dcgm_low_util_violation": 244, - "dcgm_reliability_violation": 245, - "dcgm_fb_free": 251, - "dcgm_fb_used": 252, - "dcgm_ecc_sbe_volatile_total": 310, - "dcgm_ecc_dbe_volatile_total": 311, - "dcgm_ecc_sbe_aggregate_total": 312, - "dcgm_ecc_dbe_aggregate_total": 313, - "dcgm_retired_pages_sbe": 390, - "dcgm_retired_pages_dbe": 391, - "dcgm_retired_pages_pending": 392, - "dcgm_nvlink_flit_crc_error_count_total": 409, - "dcgm_nvlink_data_crc_error_count_total": 419, - "dcgm_nvlink_replay_error_count_total": 429, - "dcgm_nvlink_recovery_error_count_total": 439, - "dcgm_nvlink_bandwidth_total": 449, - "dcgm_fi_prof_gr_engine_active": 1001, - "dcgm_fi_prof_sm_active": 1002, - "dcgm_fi_prof_sm_occupancy": 1003, - "dcgm_fi_prof_pipe_tensor_active": 1004, - "dcgm_fi_prof_dram_active": 1005, - "dcgm_fi_prof_pcie_tx_bytes": 1009, - "dcgm_fi_prof_pcie_rx_bytes": 1010, - } -) - -const ( - DCGM_FV_FLAG_LIVE_DATA = uint(0x00000001) -) diff --git a/bindings/go/dcgm/dcgm_agent.h b/bindings/go/dcgm/dcgm_agent.h deleted file mode 100644 index fac3fdfe..00000000 --- a/bindings/go/dcgm/dcgm_agent.h +++ /dev/null @@ -1,2033 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef DCGM_AGENT_H -#define DCGM_AGENT_H - -#include "dcgm_structs.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#define DECLDIR - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_Admin Administrative - * - * This chapter describes the administration interfaces for DCGM. - * It is the user's responsibility to call \ref dcgmInit() before calling any other methods, - * and \ref dcgmShutdown() once DCGM is no longer being used. The APIs in Administrative module - * can be broken down into following categories: - * @{ - */ -/***************************************************************************************************/ - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_Admin_InitShut Init and Shutdown - * - * Describes APIs to Initialize and Shutdown the DCGM Engine. - * @{ - */ -/***************************************************************************************************/ - -/** - * This method is used to initialize DCGM within this process. This must be called before - * dcgmStartEmbedded() or dcgmConnect() - * - * * @return - * - \ref DCGM_ST_OK if DCGM has been properly initialized - * - \ref DCGM_ST_INIT_ERROR if there was an error initializing the library - */ -dcgmReturn_t DECLDIR dcgmInit(void); - -/** - * This method is used to shut down DCGM. Any embedded host engines or remote connections will automatically - * be shut down as well. - * - * @return - * - \ref DCGM_ST_OK if DCGM has been properly shut down - * - \ref DCGM_ST_UNINITIALIZED if the library was not shut down properly - */ -dcgmReturn_t DECLDIR dcgmShutdown(void); - -/** - * Start an embedded host engine agent within this process. - * - * The agent is loaded as a shared library. This mode is provided to avoid any - * extra jitter associated with an additional autonomous agent needs to be managed. 
In - * this mode, the user has to periodically call APIs such as \ref dcgmPolicyTrigger and - * \ref dcgmUpdateAllFields which tells DCGM to wake up and perform data collection and - * operations needed for policy management. - * - * @param opMode IN: Collect data automatically or manually when asked by the user. - * @param pDcgmHandle OUT: DCGM Handle to use for API calls - * - * @return - * - \ref DCGM_ST_OK if DCGM was started successfully within our process - * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit yet - * - */ -dcgmReturn_t DECLDIR dcgmStartEmbedded(dcgmOperationMode_t opMode, dcgmHandle_t *pDcgmHandle); - -/** - * Start an embedded host engine agent within this process. - * - * The agent is loaded as a shared library. This mode is provided to avoid any - * extra jitter associated with an additional autonomous agent needs to be managed. In - * this mode, the user has to periodically call APIs such as \ref dcgmPolicyTrigger and - * \ref dcgmUpdateAllFields which tells DCGM to wake up and perform data collection and - * operations needed for policy management. - * - * @param params IN/OUT: See \ref dcgmStartEmbeddedV2Params_v1 for details. - * - * @return - * - \ref DCGM_ST_OK if DCGM was started successfully within our process - * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit yet - * - */ -dcgmReturn_t DECLDIR dcgmStartEmbedded_v2(dcgmStartEmbeddedV2Params_v1 *params); - -/** - * Stop the embedded host engine within this process that was started with dcgmStartEmbedded - * - * @param pDcgmHandle IN : DCGM Handle of the embedded host engine that came from dcgmStartEmbedded - * - * @return - * - \ref DCGM_ST_OK if DCGM was stopped successfully within our process - * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit or - * the embedded host engine was not running. 
- * - \ref DCGM_ST_BADPARAM if an invalid parameter was provided - * - \ref DCGM_ST_INIT_ERROR if an error occurred while trying to start the host engine. - */ -dcgmReturn_t DECLDIR dcgmStopEmbedded(dcgmHandle_t pDcgmHandle); - -/** - * This method is used to connect to a stand-alone host engine process. Remote host engines are started - * by running the nv-hostengine command. - * - * NOTE: dcgmConnect_v2 provides additional connection options. - * - * @param ipAddress IN: Valid IP address for the remote host engine to connect to. - * If ipAddress is specified as x.x.x.x it will attempt to connect to the default - * port specified by DCGM_HE_PORT_NUMBER - * If ipAddress is specified as x.x.x.x:yyyy it will attempt to connect to the - * port specified by yyyy - * @param pDcgmHandle OUT: DCGM Handle of the remote host engine - * - * @return - * - \ref DCGM_ST_OK if we successfully connected to the remote host engine - * - \ref DCGM_ST_CONNECTION_NOT_VALID if the remote host engine could not be reached - * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit. - * - \ref DCGM_ST_BADPARAM if pDcgmHandle is NULL or ipAddress is invalid - * - \ref DCGM_ST_INIT_ERROR if DCGM encountered an error while initializing the remote client library - * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit - */ -dcgmReturn_t DECLDIR dcgmConnect(char *ipAddress, dcgmHandle_t *pDcgmHandle); - -/** - * This method is used to connect to a stand-alone host engine process. Remote host engines are started - * by running the nv-hostengine command. - * - * @param ipAddress IN: Valid IP address for the remote host engine to connect to. - * If ipAddress is specified as x.x.x.x it will attempt to connect to the default port - * specified by DCGM_HE_PORT_NUMBER. - * If ipAddress is specified as x.x.x.x:yyyy it will attempt to connect to the port - * specified by yyyy - * @param connectParams IN: Additional connection parameters. 
See \ref dcgmConnectV2Params_t for details. - * @param pDcgmHandle OUT: DCGM Handle of the remote host engine - * - * @return - * - \ref DCGM_ST_OK if we successfully connected to the remote host engine - * - \ref DCGM_ST_CONNECTION_NOT_VALID if the remote host engine could not be reached - * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit. - * - \ref DCGM_ST_BADPARAM if pDcgmHandle is NULL or ipAddress is invalid - * - \ref DCGM_ST_INIT_ERROR if DCGM encountered an error while initializing the remote client library - * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit - */ -dcgmReturn_t DECLDIR dcgmConnect_v2(char *ipAddress, dcgmConnectV2Params_t *connectParams, dcgmHandle_t *pDcgmHandle); - -/** - * This method is used to disconnect from a stand-alone host engine process. - * - * @param pDcgmHandle IN: DCGM Handle that came from dcgmConnect - * - * @return - * - \ref DCGM_ST_OK if we successfully disconnected from the host engine - * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit - * - \ref DCGM_ST_BADPARAM if pDcgmHandle is not a valid DCGM handle - * - \ref DCGM_ST_GENERIC_ERROR if an unspecified internal error occurred - */ -dcgmReturn_t DECLDIR dcgmDisconnect(dcgmHandle_t pDcgmHandle); - - -/** @} */ // Closing for DCGMAPI_Admin_InitShut - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_Admin_Info Auxilary information about DCGM engine. - * - * Describes APIs to get generic information about the DCGM Engine. - * @{ - */ -/***************************************************************************************************/ - -/** - * This method is used to return information about the build environment where DCGM was built. 
- * - * @param pVersionInfo OUT: Build environment information - * - * @return - * - \ref DCGM_ST_OK if build information is successfully obtained - * - \ref DCGM_ST_BADPARAM if pVersionInfo is null - * - \ref DCGM_ST_VER_MISMATCH if the expected and provided versions of dcgmVersionInfo_t do not match - */ -dcgmReturn_t DECLDIR dcgmVersionInfo(dcgmVersionInfo_t *pVersionInfo); - -/** - * This method is used to return information about the build environment of the hostengine. - * - * @param pDcgmHandle IN: DCGM Handle that came from dcgmConnect - * @param pVersionInfo OUT: Build environment information - * - * @return - * - \ref DCGM_ST_OK if build information is successfully obtained - * - \ref DCGM_ST_BADPARAM if pVersionInfo is null - * - \ref DCGM_ST_VER_MISMATCH if the expected and provided versions of dcgmVersionInfo_t do not match - */ -dcgmReturn_t DECLDIR dcgmHostengineVersionInfo(dcgmHandle_t pDcgmHandle, dcgmVersionInfo_t *pVersionInfo); - - -/** - * This method is used to set the logging severity on HostEngine for the specified logger - * - * @param pDcgmHandle IN: DCGM Handle - * @param logging IN: dcgmSettingsSetLoggingSeverity_t struct containing the target logger and severity - * - * @return - * - \ref DCGM_ST_OK Severity successfully set - * - \ref DCGM_ST_BADPARAM Bad logger/severity string - * - \ref DCGM_ST_VER_MISMATCH if the expected and provided versions of dcgmSettingsSetLoggingSeverity_t - * do not match - */ -dcgmReturn_t DECLDIR dcgmHostengineSetLoggingSeverity(dcgmHandle_t pDcgmHandle, - dcgmSettingsSetLoggingSeverity_t *logging); - -/** - * This function is used to return whether or not the host engine considers itself healthy - * - * @param[in] pDcgmHandle - the handle to DCGM - * @param[out] heHealth - struct describing the health of the hostengine. If heHealth.hostengineHealth is 0, - * then the hostengine is healthy. Non-zero indicates not healthy with error codes - * determining the cause.
- * - * @return - * - \ref DCGM_ST_OK Able to gauge health - * - \ref DCGM_ST_BADPARAM \a heHealth is not a valid pointer - */ -dcgmReturn_t DECLDIR dcgmHostengineIsHealthy(dcgmHandle_t pDcgmHandle, dcgmHostengineHealth_t *heHealth); - -/** @} */ // Closing DCGMAPI_Admin_Info - -/** @} */ // Closing for DCGMAPI_Admin - - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_SYS System - * @{ - * This chapter describes the APIs used to identify set of GPUs on the node, grouping functions to - * provide mechanism to operate on a group of GPUs, and status management APIs in - * order to get individual statuses for each operation. The APIs in System module can be - * broken down into following categories: - */ -/***************************************************************************************************/ - -/***************************************************************************************************/ -/** @defgroup DCGM_DISCOVERY Discovery - * The following APIs are used to discover GPUs and their attributes on a Node. - * @{ - */ -/***************************************************************************************************/ - -/** - * This method is used to get identifiers corresponding to all the devices on the system. The - * identifier represents DCGM GPU Id corresponding to each GPU on the system and is immutable during - * the lifespan of the engine. The list should be queried again if the engine is restarted. - * - * The GPUs returned from this function include gpuIds of GPUs that are not supported by DCGM. - * To only get gpuIds of GPUs that are supported by DCGM, use dcgmGetAllSupportedDevices(). - * - * @param pDcgmHandle IN: DCGM Handle - * @param gpuIdList OUT: Array reference to fill GPU Ids present on the system. - * @param count OUT: Number of GPUs returned in \a gpuIdList. - * - * @return - * - \ref DCGM_ST_OK if the call was successful.
- * - \ref DCGM_ST_BADPARAM if \a gpuIdList or \a count were not valid. - */ -dcgmReturn_t DECLDIR dcgmGetAllDevices(dcgmHandle_t pDcgmHandle, - unsigned int gpuIdList[DCGM_MAX_NUM_DEVICES], - int *count); - -/** - * This method is used to get identifiers corresponding to all the DCGM-supported devices on the system. The - * identifier represents DCGM GPU Id corresponding to each GPU on the system and is immutable during - * the lifespan of the engine. The list should be queried again if the engine is restarted. - * - * The GPUs returned from this function ONLY includes gpuIds of GPUs that are supported by DCGM. - * To get gpuIds of all GPUs in the system, use dcgmGetAllDevices(). - * - * - * @param pDcgmHandle IN: DCGM Handle - * @param gpuIdList OUT: Array reference to fill GPU Ids present on the system. - * @param count OUT: Number of GPUs returned in \a gpuIdList. - * - * @return - * - \ref DCGM_ST_OK if the call was successful. - * - \ref DCGM_ST_BADPARAM if \a gpuIdList or \a count were not valid. - */ -dcgmReturn_t DECLDIR dcgmGetAllSupportedDevices(dcgmHandle_t pDcgmHandle, - unsigned int gpuIdList[DCGM_MAX_NUM_DEVICES], - int *count); - -/** - * Gets device attributes corresponding to the \a gpuId. If operation is not successful for any of - * the requested fields then the field is populated with one of DCGM_BLANK_VALUES defined in - * dcgm_structs.h. - * - * @param pDcgmHandle IN: DCGM Handle - * @param gpuId IN: GPU Id corresponding to which the attributes should be fetched - * @param pDcgmAttr IN/OUT: Device attributes corresponding to \a gpuId.
pDcgmAttr->version should be set to - * \ref dcgmDeviceAttributes_version before this call. - * - * @return - * - \ref DCGM_ST_OK if the call was successful. - * - \ref DCGM_ST_VER_MISMATCH if pDcgmAttr->version is not set or is invalid. - */ -dcgmReturn_t DECLDIR dcgmGetDeviceAttributes(dcgmHandle_t pDcgmHandle, - unsigned int gpuId, - dcgmDeviceAttributes_t *pDcgmAttr); - -/** - * Gets the list of entities that exist for a given entity group. This API can be used in place of - * \ref dcgmGetAllDevices. - * - * @param dcgmHandle IN: DCGM Handle - * @param entityGroup IN: Entity group to list entities of - * @param entities OUT: Array of entities for entityGroup - * @param numEntities IN/OUT: Upon calling, this should be the number of entities that entityList[] can hold. Upon - * return, this will contain the number of entities actually saved to entityList. - * @param flags IN: Flags to modify the behavior of this request. - * See DCGM_GEGE_FLAG_* #defines in dcgm_structs.h - * - * @return - * - \ref DCGM_ST_OK if the call was successful. - * - \ref DCGM_ST_INSUFFICIENT_SIZE if numEntities was not large enough to hold the number of entities in the - * entityGroup. numEntities will contain the capacity needed to complete this - * request successfully. - * - \ref DCGM_ST_NOT_SUPPORTED if the given entityGroup does not support enumeration. - * - \ref DCGM_ST_BADPARAM if any parameter is invalid - */ -dcgmReturn_t DECLDIR dcgmGetEntityGroupEntities(dcgmHandle_t dcgmHandle, - dcgm_field_entity_group_t entityGroup, - dcgm_field_eid_t *entities, - int *numEntities, - unsigned int flags); - -/** - * Gets the hierarchy of GPUs, GPU Instances, and Compute Instances by populating a list of each entity with - * a reference to their parent - * - * @param dcgmHandle IN: DCGM Handle - * @param entities OUT: array of entities in the hierarchy - * @param numEntities IN/OUT: Upon calling, this should be the capacity of entities. 
- * Upon return, this will contain the number of entities actually saved to entities. - * - * @return - * - \ref DCGM_ST_OK if the call was successful. - * - \ref DCGM_ST_VER_MISMATCH if the struct version is incorrect - * - \ref DCGM_ST_BADPARAM if any parameter is invalid - */ -dcgmReturn_t DECLDIR dcgmGetGpuInstanceHierarchy(dcgmHandle_t dcgmHandle, dcgmMigHierarchy_v2 *hierarchy); - -/** - * Get the NvLink link status for every NvLink in this system. This includes the NvLinks of both GPUs and - * NvSwitches. Note that only NvSwitches and GPUs that are visible to the current environment will be - * returned in this structure. - * - * @param dcgmHandle IN: DCGM Handle - * @param linkStatus OUT: Structure in which to store NvLink link statuses. .version should be set to - * dcgmNvLinkStatus_version1 before calling this. - * - * @return - * - \ref DCGM_ST_OK if the call was successful. - * - \ref DCGM_ST_NOT_SUPPORTED if the given entityGroup does not support enumeration. - * - \ref DCGM_ST_BADPARAM if any parameter is invalid - */ -dcgmReturn_t DECLDIR dcgmGetNvLinkLinkStatus(dcgmHandle_t dcgmHandle, dcgmNvLinkStatus_v2 *linkStatus); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup DCGM_GROUPING Grouping - * The following APIs are used for group management. The user can create a group of entities and - * perform an operation on a group of entities. If grouping is not needed and the user wishes - * to run commands on all GPUs seen by DCGM then the user can use DCGM_GROUP_ALL_GPUS or - * DCGM_GROUP_ALL_NVSWITCHES in place of group IDs when needed. - * @{ - */ -/***************************************************************************************************/ - -/** - * Used to create a entity group handle which can store one or more entity Ids as an opaque handle - * returned in \a pDcgmGrpId. 
Instead of executing an operation separately for each entity, the - * DCGM group enables the user to execute same operation on all the entities present in the group as a - * single API call. - * - * To create the group with all the entities present on the system, the \a type field should be - * specified as \a DCGM_GROUP_DEFAULT or \a DCGM_GROUP_ALL_NVSWITCHES. To create an empty group, - * the \a type field should be specified as \a DCGM_GROUP_EMPTY. The empty group can be updated - * with the desired set of entities using the APIs \ref dcgmGroupAddDevice, \ref dcgmGroupAddEntity, - * \ref dcgmGroupRemoveDevice, and \ref dcgmGroupRemoveEntity. - * - * @param pDcgmHandle IN: DCGM Handle - * @param type IN: Type of Entity Group to be formed - * @param groupName IN: Desired name of the GPU group specified as NULL terminated C string - * @param pDcgmGrpId OUT: Reference to group ID - * - * @return - * - \ref DCGM_ST_OK if the group has been created - * - \ref DCGM_ST_BADPARAM if any of \a type, \a groupName, \a length or \a pDcgmGrpId is invalid - * - \ref DCGM_ST_MAX_LIMIT if number of groups on the system has reached the max limit \a DCGM_MAX_NUM_GROUPS - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized - */ -dcgmReturn_t DECLDIR dcgmGroupCreate(dcgmHandle_t pDcgmHandle, - dcgmGroupType_t type, - char *groupName, - dcgmGpuGrp_t *pDcgmGrpId); - -/** - * Used to destroy a group represented by \a groupId. - * Since DCGM group is a logical grouping of entities, the properties applied on the group stay intact - * for the individual entities even after the group is destroyed. 
- * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID - * - * @return - * - \ref DCGM_ST_OK if the group has been destroyed - * - \ref DCGM_ST_BADPARAM if \a groupId is invalid - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized - * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group does not exist - */ -dcgmReturn_t DECLDIR dcgmGroupDestroy(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId); - -/** - * Used to add specified GPU Id to the group represented by \a groupId. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group Id to which device should be added - * @param gpuId IN: DCGM GPU Id - * - * @return - * - \ref DCGM_ST_OK if the GPU Id has been successfully added to the group - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized - * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exist - * - \ref DCGM_ST_BADPARAM if \a gpuId is invalid or already part of the specified group - */ -dcgmReturn_t dcgmGroupAddDevice(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, unsigned int gpuId); - -/** - * Used to add specified entity to the group represented by \a groupId.
- * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group Id to which device should be added - * @param entityGroupId IN: Entity group that entityId belongs to - * @param entityId IN: DCGM entityId - * - * @return - * - \ref DCGM_ST_OK if the entity has been successfully added to the group - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized - * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists - * - \ref DCGM_ST_BADPARAM if \a entityId is invalid or already part of the specified group - */ -dcgmReturn_t dcgmGroupAddEntity(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgm_field_entity_group_t entityGroupId, - dcgm_field_eid_t entityId); - -/** - * Used to remove specified GPU Id from the group represented by \a groupId. - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID from which device should be removed - * @param gpuId IN: DCGM GPU Id - * - * @return - * - \ref DCGM_ST_OK if the GPU Id has been successfully removed from the group - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized - * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists - * - \ref DCGM_ST_BADPARAM if \a gpuId is invalid or not part of the specified group - */ -dcgmReturn_t dcgmGroupRemoveDevice(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, unsigned int gpuId); - -/** - * Used to remove specified entity from the group represented by \a groupId. 
- * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID from which device should be removed - * @param entityGroupId IN: Entity group that entityId belongs to - * @param entityId IN: DCGM entityId - * - * @return - * - \ref DCGM_ST_OK if the entity has been successfully removed from the group - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized - * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists - * - \ref DCGM_ST_BADPARAM if \a entityId is invalid or not part of the specified group - */ -dcgmReturn_t dcgmGroupRemoveEntity(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgm_field_entity_group_t entityGroupId, - dcgm_field_eid_t entityId); - -/** - * Used to get information corresponding to the group represented by \a groupId. The information - * returned in \a pDcgmGroupInfo consists of group name, and the list of entities present in the - * group. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID for which information to be fetched - * @param pDcgmGroupInfo OUT: Group Information - * - * @return - * - \ref DCGM_ST_OK if the group info is successfully received. - * - \ref DCGM_ST_BADPARAM if any of \a groupId or \a pDcgmGroupInfo is invalid. - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized. - * - \ref DCGM_ST_MAX_LIMIT if the group does not contain the GPU - * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists - */ -dcgmReturn_t dcgmGroupGetInfo(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmGroupInfo_t *pDcgmGroupInfo); - -/** - * Used to get the Ids of all groups of entities. The information returned is a list of group ids - * in \a groupIdList as well as a count of how many ids there are in \a count. Please allocate enough - * memory for \a groupIdList. Memory of size MAX_NUM_GROUPS should be allocated for \a groupIdList. 
- * - * @param pDcgmHandle IN: DCGM Handle - * @param groupIdList OUT: List of Group Ids - * @param count OUT: The number of Group ids in the list - * - * @return - * - \ref DCGM_ST_OK if the ids of the groups were successfully retrieved - * - \ref DCGM_ST_BADPARAM if either of the \a groupIdList or \a count is null - * - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred - */ -dcgmReturn_t dcgmGroupGetAllIds(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupIdList[], unsigned int *count); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup DCGM_FIELD_GROUPING Field Grouping - * The following APIs are used for field group management. The user can create a group of fields and - * perform an operation on a group of fields at once. - * @{ - */ - -/** - * Used to create a group of fields and return the handle in dcgmFieldGroupId - * - * @param dcgmHandle IN: DCGM handle - * @param numFieldIds IN: Number of field IDs that are being provided in fieldIds[]. Must be between 1 and - * DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP. - * @param fieldIds IN: Field IDs to be added to the newly-created field group - * @param fieldGroupName IN: Unique name for this group of fields. This must not be the same as any existing field - * groups. - * @param dcgmFieldGroupId OUT: Handle to the newly-created field group - * - * @return - * - \ref DCGM_ST_OK if the field group was successfully created. - * - \ref DCGM_ST_BADPARAM if any parameters were bad - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized. 
- * - \ref DCGM_ST_MAX_LIMIT if too many field groups already exist - * - */ -dcgmReturn_t dcgmFieldGroupCreate(dcgmHandle_t dcgmHandle, - int numFieldIds, - unsigned short *fieldIds, - char *fieldGroupName, - dcgmFieldGrp_t *dcgmFieldGroupId); - -/** - * Used to remove a field group that was created with \ref dcgmFieldGroupCreate - * - * @param dcgmHandle IN: DCGM handle - * @param dcgmFieldGroupId IN: Field group to remove - * - * @return - * - \ref DCGM_ST_OK if the field group was successfully removed - * - \ref DCGM_ST_BADPARAM if any parameters were bad - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized. - * - */ -dcgmReturn_t dcgmFieldGroupDestroy(dcgmHandle_t dcgmHandle, dcgmFieldGrp_t dcgmFieldGroupId); - - -/** - * Used to get information about a field group that was created with \ref dcgmFieldGroupCreate. - * - * @param dcgmHandle IN: DCGM handle - * @param fieldGroupInfo IN/OUT: Info about the field group being queried.
- * .version should be set to \ref dcgmFieldGroupInfo_version before this call
- * .fieldGroupId should contain the fieldGroupId you are interested in querying - * information for. - * - * @return - * - \ref DCGM_ST_OK if the field group info was returned successfully - * - \ref DCGM_ST_BADPARAM if any parameters were bad - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized. - * - \ref DCGM_ST_VER_MISMATCH if .version is not set or is invalid. - * - */ -dcgmReturn_t dcgmFieldGroupGetInfo(dcgmHandle_t dcgmHandle, dcgmFieldGroupInfo_t *fieldGroupInfo); - -/** - * Used to get information about all field groups in the system. - * - * @param dcgmHandle IN: DCGM handle - * @param allGroupInfo IN/OUT: Info about all of the field groups that exist.
- * .version should be set to \ref dcgmAllFieldGroup_version before this call. - * - * @return - * - \ref DCGM_ST_OK if the field group info was successfully returned - * - \ref DCGM_ST_BADPARAM if any parameters were bad - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized. - * - \ref DCGM_ST_VER_MISMATCH if .version is not set or is invalid. - * - */ -dcgmReturn_t dcgmFieldGroupGetAll(dcgmHandle_t dcgmHandle, dcgmAllFieldGroup_t *allGroupInfo); - -/** @} */ - - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_ST Status handling - * The following APIs are used to manage statuses for multiple operations on one or more GPUs. - * @{ - */ -/***************************************************************************************************/ - -/** - * Creates reference to DCGM status handler which can be used to get the statuses for multiple - * operations on one or more devices. - * - * The multiple statuses are useful when the operations are performed at group level. The status - * handle provides a mechanism to access error attributes for the failed operations. - * - * The number of errors stored behind the opaque handle can be accessed using the API - * \ref dcgmStatusGetCount. The errors are accessed from the opaque handle \a statusHandle - * using the API \ref dcgmStatusPopError. The user can invoke \ref dcgmStatusPopError - * for the number of errors or until all the errors are fetched. - * - * When the status handle is not required any further then it should be deleted using the API - * \ref dcgmStatusDestroy.
- * @param statusHandle OUT: Reference to handle for list of statuses - * - * @return - * - \ref DCGM_ST_OK if the status handle is successfully created - * - \ref DCGM_ST_BADPARAM if \a statusHandle is invalid - * - */ -dcgmReturn_t dcgmStatusCreate(dcgmStatus_t *statusHandle); - -/** - * Used to destroy status handle created using \ref dcgmStatusCreate. - * @param statusHandle IN: Handle to list of statuses - * - * @return - * - \ref DCGM_ST_OK if the status handle is successfully destroyed - * - \ref DCGM_ST_BADPARAM if \a statusHandle is invalid - * - */ -dcgmReturn_t dcgmStatusDestroy(dcgmStatus_t statusHandle); - -/** - * Used to get count of error entries stored inside the opaque handle \a statusHandle. - * @param statusHandle IN: Handle to list of statuses - * @param count OUT: Number of error entries present in the list of statuses - * - * @return - * - \ref DCGM_ST_OK if the error count is successfully received - * - \ref DCGM_ST_BADPARAM if any of \a statusHandle or \a count is invalid - * - */ -dcgmReturn_t dcgmStatusGetCount(dcgmStatus_t statusHandle, unsigned int *count); - -/** - * Used to iterate through the list of errors maintained behind \a statusHandle. The method pops the - * first error from the list of DCGM statuses. In order to iterate through all the errors, the user - * can invoke this API for the number of errors or until all the errors are fetched. - * @param statusHandle IN: Handle to list of statuses - * @param pDcgmErrorInfo OUT: First error from the list of statuses - * - * @return - * - \ref DCGM_ST_OK if the error entry is successfully fetched - * - \ref DCGM_ST_BADPARAM if any of \a statusHandle or \a pDcgmErrorInfo is invalid - * - \ref DCGM_ST_NO_DATA if the status handle list is empty - * - */ -dcgmReturn_t dcgmStatusPopError(dcgmStatus_t statusHandle, dcgmErrorInfo_t *pDcgmErrorInfo); - -/** - * Used to clear all the errors in the status handle created by the API - * \ref dcgmStatusCreate.
After one set of operation, the \a statusHandle - * can be cleared and reused for the next set of operation. - * @param statusHandle IN: Handle to list of statuses - * - * @return - * - \ref DCGM_ST_OK if the errors are successfully cleared - * - \ref DCGM_ST_BADPARAM if \a statusHandle is invalid - * - */ -dcgmReturn_t dcgmStatusClear(dcgmStatus_t statusHandle); - -/** @} */ // Closing for DCGMAPI_ST - - -/** @} */ // Closing for DCGMAPI_SYS - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_DC Configuration - * This chapter describes the methods that handle device configuration retrieval and - * default settings. The APIs in Configuration module can be broken down into following - * categories: - * @{ - */ -/***************************************************************************************************/ - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_DC_Setup Setup and management - * Describes APIs to Get/Set configuration on the group of GPUs. - * @{ - */ -/***************************************************************************************************/ - -/** -* Used to set configuration for the group of one or more GPUs identified by \a groupId. -* -* The configuration settings specified in \a pDeviceConfig are applied to all the GPUs in the -* group. Since DCGM group is a logical grouping of GPUs, the configuration settings stays intact -* for the individual GPUs even after the group is destroyed. -* -* If the user wishes to ignore the configuration of one or more properties in the input -* \a pDeviceConfig then the property should be specified as one of \a DCGM_INT32_BLANK, -* \a DCGM_INT64_BLANK, \a DCGM_FP64_BLANK or \a DCGM_STR_BLANK based on the data type of the -* property to be ignored. 
-* -* If any of the properties fail to be configured for any of the GPUs in the group then the API -* returns an error. The status handle \a statusHandle should be further evaluated to access error -* attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST -* to access the error attributes. -* -* To find out valid supported clock values that can be passed to dcgmConfigSet, look at the device -* attributes of a GPU in the group using the API dcgmGetDeviceAttributes. - -* @param pDcgmHandle IN: DCGM Handle -* @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate -* for details on creating the group. -* @param pDeviceConfig IN: Pointer to memory to hold desired configuration to be applied for all the GPU in the -* group represented by \a groupId. -* The caller must populate the version field of \a pDeviceConfig. -* @param statusHandle IN/OUT: Resulting error status for multiple operations. Pass it as NULL if the detailed -* error information is not needed. -* Look at \ref dcgmStatusCreate for details on creating status handle. - -* @return -* - \ref DCGM_ST_OK if the configuration has been successfully set. -* - \ref DCGM_ST_BADPARAM if any of \a groupId or \a pDeviceConfig is invalid. -* - \ref DCGM_ST_VER_MISMATCH if \a pDeviceConfig has the incorrect version. -* - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. -* -*/ -dcgmReturn_t DECLDIR dcgmConfigSet(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmConfig_t *pDeviceConfig, - dcgmStatus_t statusHandle); - -/** -* Used to get configuration for all the GPUs present in the group. -* -* This API can get the most recent target or desired configuration set by \ref dcgmConfigSet. -* Set type as \a DCGM_CONFIG_TARGET_STATE to get target configuration. The target configuration -* properties are maintained by DCGM and are automatically enforced after a GPU reset or -* reinitialization is completed. 
-* -* The method can also be used to get the actual configuration state for the GPUs in the group. -* Set type as \a DCGM_CONFIG_CURRENT_STATE to get the actually configuration state. Ideally, the -* actual configuration state will be exact same as the target configuration state. -* -* If any of the property in the target configuration is unknown then the property value in the -* output is populated as one of DCGM_INT32_BLANK, DCGM_INT64_BLANK, DCGM_FP64_BLANK or -* DCGM_STR_BLANK based on the data type of the property. -* -* If any of the property in the current configuration state is not supported then the property -* value in the output is populated as one of DCGM_INT32_NOT_SUPPORTED, DCGM_INT64_NOT_SUPPORTED, -* DCGM_FP64_NOT_SUPPORTED or DCGM_STR_NOT_SUPPORTED based on the data type of the property. -* -* If any of the properties can't be fetched for any of the GPUs in the group then the API returns -* an error. The status handle \a statusHandle should be further evaluated to access error -* attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST -* to access the error attributes. -* -* @param pDcgmHandle IN: DCGM Handle -* @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate -* for details on creating the group. -* @param type IN: Type of configuration values to be fetched. -* @param count IN: The number of entries that \a deviceConfigList array can store. -* @param deviceConfigList OUT: Pointer to memory to hold requested configuration corresponding to all the GPUs in -* the group (\a groupId). The size of the memory must be greater than or equal to hold -* output information for the number of GPUs present in the group (\a groupId). -* @param statusHandle IN/OUT: Resulting error status for multiple operations. Pass it as NULL if the detailed -* error information is not needed. -* Look at \ref dcgmStatusCreate for details on creating status handle. 
- -* @return -* - \ref DCGM_ST_OK if the configuration has been successfully fetched. -* - \ref DCGM_ST_BADPARAM if any of \a groupId, \a type, \a count, or \a deviceConfigList is invalid. -* - \ref DCGM_ST_NOT_CONFIGURED if the target configuration is not already set. -* - \ref DCGM_ST_VER_MISMATCH if \a deviceConfigList has the incorrect version. -* - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. -* -*/ -dcgmReturn_t DECLDIR dcgmConfigGet(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmConfigType_t type, - int count, - dcgmConfig_t deviceConfigList[], - dcgmStatus_t statusHandle); - -/** @} */ // Closing for DCGMAPI_DC_Setup - - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_DC_MI Manual Invocation - * Describes APIs used to manually enforce the desired configuration on a group of GPUs. - * @{ - */ -/***************************************************************************************************/ - -/** - * Used to enforce previously set configuration for all the GPUs present in the group. - * - * This API provides a mechanism to the users to manually enforce the configuration at any point of - * time. The configuration can only be enforced if it's already configured using the API \ref - * dcgmConfigSet. - * - * If any of the properties can't be enforced for any of the GPUs in the group then the API returns - * an error. The status handle \a statusHandle should be further evaluated to access error - * attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST - * to access the error attributes. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. 
- * @param statusHandle IN/OUT: Resulting error status for multiple operations. Pass it as NULL if the detailed - * error information is not needed. Look at \ref dcgmStatusCreate for details on - * creating status handle. - * - * @return - * - \ref DCGM_ST_OK if the configuration has been successfully enforced. - * - \ref DCGM_ST_BADPARAM if \a groupId is invalid. - * - \ref DCGM_ST_NOT_CONFIGURED if the target configuration is not already set. - * - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. - * - */ -dcgmReturn_t DECLDIR dcgmConfigEnforce(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmStatus_t statusHandle); - -/** @} */ // Closing for DCGMAPI_DC_MI - -/** @} */ // Closing for DCGMAPI_DC - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_FI Field APIs - * - * These APIs are responsible for watching, unwatching, and updating specific fields as defined - * by DCGM_FI_* - * - * @{ - */ -/***************************************************************************************************/ - -/** - * Request that DCGM start recording updates for a given field collection. - * - * Note that the first update of the field will not occur until the next field update cycle. - * To force a field update cycle, call dcgmUpdateAllFields(1). - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or - * \a DCGM_GROUP_ALL_NVSWITCHES to to perform the operation on all NvSwitches. - * @param fieldGroupId IN: Fields to watch. - * @param updateFreq IN: How often to update this field in usec - * @param maxKeepAge IN: How long to keep data for this field in seconds - * @param maxKeepSamples IN: Maximum number of samples to keep. 
0=no limit - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ - -dcgmReturn_t dcgmWatchFields(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmFieldGrp_t fieldGroupId, - long long updateFreq, - double maxKeepAge, - int maxKeepSamples); - -/** - * Request that DCGM stop recording updates for a given field collection. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or - * \a DCGM_GROUP_ALL_NVSWITCHES to to perform the operation on all NvSwitches. - * @param fieldGroupId IN: Fields to unwatch. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ -dcgmReturn_t dcgmUnwatchFields(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmFieldGrp_t fieldGroupId); - -/** - * Request updates for all field values that have updated since a given timestamp - * - * This version only works with GPU entities. Use \ref dcgmGetValuesSince_v2 for entity groups - * containing NvSwitches. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param fieldGroupId IN: Fields to return data for - * @param sinceTimestamp IN: Timestamp to request values since in usec since 1970. This will be returned in - * nextSinceTimestamp for subsequent calls 0 = request all data - * @param nextSinceTimestamp OUT: Timestamp to use for sinceTimestamp on next call to this function - * @param enumCB IN: Callback to invoke for every field value update. 
Note that multiple updates can be - * returned in each invocation - * @param userData IN: User data pointer to pass to the userData field of enumCB. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_SUPPORTED if one of the entities was from a non-GPU type - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ -dcgmReturn_t dcgmGetValuesSince(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmFieldGrp_t fieldGroupId, - long long sinceTimestamp, - long long *nextSinceTimestamp, - dcgmFieldValueEnumeration_f enumCB, - void *userData); - -/** - * Request updates for all field values that have updated since a given timestamp - * - * This version works with non-GPU entities like NvSwitches - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or - * \a DCGM_GROUP_ALL_NVSWITCHES to perform the operation on all NvSwitches. - * @param fieldGroupId IN: Fields to return data for - * @param sinceTimestamp IN: Timestamp to request values since in usec since 1970. This will be returned in - * nextSinceTimestamp for subsequent calls 0 = request all data - * @param nextSinceTimestamp OUT: Timestamp to use for sinceTimestamp on next call to this function - * @param enumCB IN: Callback to invoke for every field value update. Note that multiple updates can be - * returned in each invocation - * @param userData IN: User data pointer to pass to the userData field of enumCB. 
- * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ -dcgmReturn_t dcgmGetValuesSince_v2(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmFieldGrp_t fieldGroupId, - long long sinceTimestamp, - long long *nextSinceTimestamp, - dcgmFieldValueEntityEnumeration_f enumCB, - void *userData); - -/** - * Request latest cached field value for a field value collection - * - * This version only works with GPU entities. Use \ref dcgmGetLatestValues_v2 for entity groups - * containing NvSwitches. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param fieldGroupId IN: Fields to return data for. - * @param enumCB IN: Callback to invoke for every field value update. Note that multiple updates can be - * returned in each invocation - * @param userData IN: User data pointer to pass to the userData field of enumCB. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_SUPPORTED if one of the entities was from a non-GPU type - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ -dcgmReturn_t dcgmGetLatestValues(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmFieldGrp_t fieldGroupId, - dcgmFieldValueEnumeration_f enumCB, - void *userData); - -/** - * Request latest cached field value for a field value collection - * - * This version works with non-GPU entities like NvSwitches - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate - * for details on creating the group. 
Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or - * \a DCGM_GROUP_ALL_NVSWITCHES to perform the operation on all NvSwitches. - * @param fieldGroupId IN: Fields to return data for. - * @param enumCB IN: Callback to invoke for every field value update. Note that multiple updates can be - * returned in each invocation - * @param userData IN: User data pointer to pass to the userData field of enumCB. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_SUPPORTED if one of the entities was from a non-GPU type - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ -dcgmReturn_t dcgmGetLatestValues_v2(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmFieldGrp_t fieldGroupId, - dcgmFieldValueEntityEnumeration_f enumCB, - void *userData); - -/** - * Request latest cached field value for a GPU - * - * @param pDcgmHandle IN: DCGM Handle - * @param gpuId IN: Gpu ID representing the GPU for which the fields are being requested. - * @param fields IN: Field IDs to return data for. See the definitions in dcgm_fields.h that start with DCGM_FI_. - * @param count IN: Number of field IDs in fields[] array. - * @param values OUT: Latest field values for the fields in fields[]. - * - */ -dcgmReturn_t dcgmGetLatestValuesForFields(dcgmHandle_t pDcgmHandle, - int gpuId, - unsigned short fields[], - unsigned int count, - dcgmFieldValue_v1 values[]); -/** - * Request latest cached field value for a group of fields for a specific entity - * - * @param pDcgmHandle IN: DCGM Handle - * @param entityGroup IN: entity_group_t (e.g. switch) - * @param entityId IN: entity ID representing the rntity for which the fields are being requested. - * @param fields IN: Field IDs to return data for. See the definitions in dcgm_fields.h that start with DCGM_FI_. - * @param count IN: Number of field IDs in fields[] array. - * @param values OUT: Latest field values for the fields in fields[]. 
- * - */ -dcgmReturn_t dcgmEntityGetLatestValues(dcgmHandle_t pDcgmHandle, - dcgm_field_entity_group_t entityGroup, - int entityId, - unsigned short fields[], - unsigned int count, - dcgmFieldValue_v1 values[]); - -/** - * Request the latest cached or live field value for a list of fields for a group of entities - * - * Note: The returned entities are not guaranteed to be in any order. Reordering can occur internally - * in order to optimize calls to the NVIDIA driver. - * - * @param pDcgmHandle IN: DCGM Handle - * @param entities IN: List of entities to get values for - * @param entityCount IN: Number of entries in entities[] - * @param fields IN: Field IDs to return data for. See the definitions in dcgm_fields.h that start with DCGM_FI_. - * @param fieldCount IN: Number of field IDs in fields[] array. - * @param flags IN: Optional flags that affect how this request is processed. Pass \ref DCGM_FV_FLAG_LIVE_DATA - * here to retrieve a live driver value rather than a cached value. See that flag's - * documentation for caveats. - * @param values OUT: Latest field values for the fields requested. This must be able to hold entityCount * - * fieldCount field value records. - * - */ -dcgmReturn_t dcgmEntitiesGetLatestValues(dcgmHandle_t pDcgmHandle, - dcgmGroupEntityPair_t entities[], - unsigned int entityCount, - unsigned short fields[], - unsigned int fieldCount, - unsigned int flags, - dcgmFieldValue_v2 values[]); - -/*************************************************************************/ -/** - * Get a summary of the values for a field id over a period of time. 
- * - * @param pDcgmHandle IN: DCGM Handle - * @param request IN/OUT: a pointer to the struct detailing the request and containing the response - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_FIELD_UNSUPPORTED_BY_API if the field is not int64 or double type - * - */ -dcgmReturn_t DECLDIR dcgmGetFieldSummary(dcgmHandle_t pDcgmHandle, dcgmFieldSummaryRequest_t *request); - -/** @} */ - -/***************************************************************************************************/ -/** @addtogroup DCGMAPI_Admin_ExecCtrl - * @{ - */ -/***************************************************************************************************/ - -/** - * This method is used to tell the DCGM module to update all the fields being watched. - * - * Note: If the if the operation mode was set to manual mode (DCGM_OPERATION_MODE_MANUAL) during - * initialization (\ref dcgmInit), this method must be caused periodically to allow field value watches - * the opportunity to gather samples. - * - * @param pDcgmHandle IN: DCGM Handle - * @param waitForUpdate IN: Whether or not to wait for the update loop to complete before returning to the - * caller 1=wait. 0=do not wait. 
- * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if \a waitForUpdate is invalid - * - \ref DCGM_ST_GENERIC_ERROR if an unspecified DCGM error occurs - * - */ -dcgmReturn_t dcgmUpdateAllFields(dcgmHandle_t pDcgmHandle, int waitForUpdate); - -/** @} */ // Closing for DCGMAPI_Admin_ExecCtrl - - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_PROCESS_STATS Process Statistics - * Describes APIs to investigate statistics such as accounting, performance and errors during the - * lifetime of a GPU process - * @{ - */ -/***************************************************************************************************/ - -/** - * Request that DCGM start recording stats for fields that can be queried with dcgmGetPidInfo(). - * - * Note that the first update of the field will not occur until the next field update cycle. - * To force a field update cycle, call dcgmUpdateAllFields(1). - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param updateFreq IN: How often to update this field in usec - * @param maxKeepAge IN: How long to keep data for this field in seconds - * @param maxKeepSamples IN: Maximum number of samples to keep. 0=no limit - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_REQUIRES_ROOT if the host engine is being run as non-root, and accounting mode could not - * be enabled (requires root). Run "nvidia-smi -am 1" as root on the node - * before starting DCGM to fix this. 
- * - */ -dcgmReturn_t dcgmWatchPidFields(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - long long updateFreq, - double maxKeepAge, - int maxKeepSamples); - -/** - * - * Get information about all GPUs while the provided pid was running - * - * In order for this request to work, you must first call dcgmWatchPidFields() to - * make sure that DCGM is watching the appropriate field IDs that will be - * populated in pidInfo - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param pidInfo IN/OUT: Structure to return information about pid in. pidInfo->pid must be set to the pid in question. - * pidInfo->version should be set to dcgmPidInfo_version. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NO_DATA if the PID did not run on any GPU - * - */ -dcgmReturn_t dcgmGetPidInfo(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmPidInfo_t *pidInfo); - -/** @} */ // Closing for DCGMAPI_PROCESS_STATS - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_JOB_STATS Job Statistics - * The client can invoke DCGM APIs to start and stop collecting the stats at the process boundaries - * (during prologue and epilogue). This will enable DCGM to monitor all the PIDs while the job is - * in progress, and provide a summary of active processes and resource usage during the window of - * interest. - * @{ - */ -/***************************************************************************************************/ - -/** - * Request that DCGM start recording stats for fields that are queried with dcgmJobGetStats() - * - * Note that the first update of the field will not occur until the next field update cycle. 
- * To force a field update cycle, call dcgmUpdateAllFields(1). - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param updateFreq IN: How often to update this field in usec - * @param maxKeepAge IN: How long to keep data for this field in seconds - * @param maxKeepSamples IN: Maximum number of samples to keep. 0=no limit - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_REQUIRES_ROOT if the host engine is being run as non-root, and - * accounting mode could not be enabled (requires root). - * Run "nvidia-smi -am 1" as root on the node before starting - * DCGM to fix this. - * - */ -dcgmReturn_t dcgmWatchJobFields(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - long long updateFreq, - double maxKeepAge, - int maxKeepSamples); - -/** - * This API is used by the client to notify DCGM about the job to be started. Should be invoked as - * part of job prologue - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param jobId IN: User provided string to represent the job - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_DUPLICATE_KEY if the specified \a jobId is already in use - * - */ -dcgmReturn_t dcgmJobStartStats(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, char jobId[64]); - -/** - * This API is used by the clients to notify DCGM to stop collecting stats for the job represented - * by job id. 
Should be invoked as part of job epilogue. - * The job Id remains available to view the stats at any point but cannot be used to start a new job. - * You must call dcgmWatchJobFields() before this call to enable watching of job - * - * @param pDcgmHandle IN: DCGM Handle - * @param jobId IN: User provided string to represent the job - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_NO_DATA if \a jobId is not a valid job identifier. - * - */ -dcgmReturn_t dcgmJobStopStats(dcgmHandle_t pDcgmHandle, char jobId[64]); - -/** - * Get stats for the job identified by DCGM generated job id. The stats can be retrieved at any - * point when the job is in process. - * If you want to reuse this jobId, call \ref dcgmJobRemove after this call. - * - * @param pDcgmHandle IN: DCGM Handle - * @param jobId IN: User provided string to represent the job - * @param pJobInfo IN/OUT: Structure to return information about the job.
.version should be set to - * \ref dcgmJobInfo_version before this call. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_NO_DATA if \a jobId is not a valid job identifier. - * - \ref DCGM_ST_VER_MISMATCH if .version is not set or is invalid. - * - */ -dcgmReturn_t dcgmJobGetStats(dcgmHandle_t pDcgmHandle, char jobId[64], dcgmJobInfo_t *pJobInfo); - -/** - * This API tells DCGM to stop tracking the job given by jobId. After this call, you will no longer - * be able to call dcgmJobGetStats() on this jobId. However, you will be able to reuse jobId after - * this call. - * - * @param pDcgmHandle IN: DCGM Handle - * @param jobId IN: User provided string to represent the job - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_NO_DATA if \a jobId is not a valid job identifier. - * - */ -dcgmReturn_t dcgmJobRemove(dcgmHandle_t pDcgmHandle, char jobId[64]); - -/** - * This API tells DCGM to stop tracking all jobs. After this call, you will no longer - * be able to call dcgmJobGetStats() any jobs until you call dcgmJobStartStats again. - * You will be able to reuse any previously-used jobIds after this call. - * - * @param pDcgmHandle IN: DCGM Handle - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - */ -dcgmReturn_t dcgmJobRemoveAll(dcgmHandle_t pDcgmHandle); - -/** @} */ // Closing for DCGMAPI_JOB_STATS - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_HM Health Monitor - * - * This chapter describes the methods that handle the GPU health monitor. 
- * - * @{ - */ -/***************************************************************************************************/ - -/** - * Enable the DCGM health check system for the given systems defined in \ref dcgmHealthSystems_t - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or - * \a DCGM_GROUP_ALL_NVSWITCHES to perform operation on all the NvSwitches. - * @param systems IN: An enum representing systems that should be enabled for health checks logically OR'd - * together. Refer to \ref dcgmHealthSystems_t for details. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ -dcgmReturn_t dcgmHealthSet(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmHealthSystems_t systems); - -/** - * Enable the DCGM health check system for the given systems defined in \ref dcgmHealthSystems_t - * - * Since DCGM 2.0 - * - * @param pDcgmHandle IN: DCGM Handle - * @param healthSet IN: Parameters to use when setting health watches. See - * \ref dcgmHealthSetParams_v2 for the description of each parameter. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - */ - -dcgmReturn_t dcgmHealthSet_v2(dcgmHandle_t pDcgmHandle, dcgmHealthSetParams_v2 *params); - -/** - * Retrieve the current state of the DCGM health check system - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or - * \a DCGM_GROUP_ALL_NVSWITCHES to perform operation on all the NvSwitches. 
- * @param systems OUT: An integer representing the enabled systems for the given group Refer to - * \ref dcgmHealthSystems_t for details. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ -dcgmReturn_t dcgmHealthGet(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmHealthSystems_t *systems); - - -/** - * Check the configured watches for any errors/failures/warnings that have occurred - * since the last time this check was invoked. On the first call, stateful information - * about all of the enabled watches within a group is created but no error results are - * provided. On subsequent calls, any error information will be returned. - * - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing a collection of one or more entities. - * Refer to \ref dcgmGroupCreate for details on creating a group - * @param results OUT: A reference to the dcgmHealthResponse_t structure to populate. - * results->version must be set to dcgmHealthResponse_version. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_VER_MISMATCH if results->version is not dcgmHealthResponse_version - * - */ -dcgmReturn_t dcgmHealthCheck(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmHealthResponse_t *results); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_PO Policies - * - * This chapter describes the methods that handle system policy management and violation settings. 
- * The APIs in Policies module can be broken down into following categories: - * - * @{ - */ -/***************************************************************************************************/ - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_PO_Setup Setup and Management - * Describes APIs for setting up policies and registering callbacks to receive notification in - * case specific policy condition has been violated. - * @{ - */ -/***************************************************************************************************/ - -/** - * Set the current violation policy inside the policy manager. Given the conditions within the - * \ref dcgmPolicy_t structure, if a violation has occurred, subsequent action(s) may be performed to - * either report or contain the failure. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param policy IN: A reference to \ref dcgmPolicy_t that will be applied to all GPUs in the group. - * @param statusHandle IN/OUT: Resulting status for the operation. Pass it as NULL if the detailed error information - * is not needed. Refer to \ref dcgmStatusCreate for details on creating a status handle. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if \a groupId or \a policy is invalid - * - \ref DCGM_ST_NOT_SUPPORTED if any unsupported GPUs are part of the GPU group specified in groupId - * - DCGM_ST_* a different error has occurred and is stored in \a statusHandle. 
- * Refer to \ref dcgmReturn_t - * - */ -dcgmReturn_t dcgmPolicySet(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmPolicy_t *policy, - dcgmStatus_t statusHandle); - -/** - * Get the current violation policy inside the policy manager. Given a groupId, a number of - * policy structures are retrieved. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param count IN: The size of the policy array. This is the maximum number of policies that will be - * retrieved and ultimately should correspond to the number of GPUs specified in the - * group. - * @param policy OUT: A reference to \ref dcgmPolicy_t that will used as storage for the current policies - * applied to each GPU in the group. - * @param statusHandle IN/OUT: Resulting status for the operation. Pass it as NULL if the detailed error information - * for the operation is not needed. Refer to \ref dcgmStatusCreate for details on - * creating a status handle. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if \a groupId or \a policy is invalid - * - DCGM_ST_* a different error has occurred and is stored in \a statusHandle. - * Refer to \ref dcgmReturn_t - * - */ -dcgmReturn_t dcgmPolicyGet(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - int count, - dcgmPolicy_t *policy, - dcgmStatus_t statusHandle); - -/** - * Register a function to be called when a specific policy condition (see \ref dcgmPolicyCondition_t) has been - * violated. This callback(s) will be called automatically when in DCGM_OPERATION_MODE_AUTO mode and only after - * dcgmPolicyTrigger when in DCGM_OPERATION_MODE_MANUAL mode. All callbacks are made within a separate thread. 
- * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param condition IN: The set of conditions specified as an OR'd list (see \ref dcgmPolicyCondition_t) for - * which to register a callback function - * @param beginCallback IN: A reference to a function that should be called should a violation occur. - * This function will be called prior to any actions specified by the policy are taken. - * @param finishCallback IN: A reference to a function that should be called should a violation occur. - * This function will be called after any action specified by the policy are completed. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if \a groupId, \a condition, is invalid, \a beginCallback, or - * \a finishCallback is NULL - * - \ref DCGM_ST_NOT_SUPPORTED if any unsupported GPUs are part of the GPU group specified in groupId - * - */ -dcgmReturn_t dcgmPolicyRegister(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmPolicyCondition_t condition, - fpRecvUpdates beginCallback, - fpRecvUpdates finishCallback); - -/** - * Unregister a function to be called for a specific policy condition (see \ref dcgmPolicyCondition_t). - * This function will unregister all callbacks for a given condition and handle. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. 
- * @param condition IN: The set of conditions specified as an OR'd list (see \ref dcgmPolicyCondition_t) for - * which to unregister a callback function - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if \a groupId, \a condition, is invalid or \a callback is NULL - * - */ -dcgmReturn_t dcgmPolicyUnregister(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmPolicyCondition_t condition); - -/** @} */ // Closing for DCGMAPI_PO_Setup - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_PO_MI Manual Invocation - * Describes APIs which can be used to perform direct actions (e.g. Perform GPU Reset, Run Health - * Diagnostics) on a group of GPUs. - * @{ - */ -/***************************************************************************************************/ - -/** - * Inform the action manager to perform a manual validation of a group of GPUs on the system - * - * *************************************** DEPRECATED *************************************** - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param validate IN: The validation to perform after the action. - * @param response OUT: Result of the validation process. Refer to \ref dcgmDiagResponse_t for details. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_SUPPORTED if running the specified \a validate is not supported. This is usually due - * to the Tesla recommended driver not being installed on the system. 
- * - \ref DCGM_ST_BADPARAM if \a groupId, \a validate, or \a statusHandle is invalid - * - \ref DCGM_ST_GENERIC_ERROR an internal error has occurred - * - \ref DCGM_ST_GROUP_INCOMPATIBLE if \a groupId refers to a group of non-homogeneous GPUs. This is currently - * not allowed. - * - */ -dcgmReturn_t dcgmActionValidate(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmPolicyValidation_t validate, - dcgmDiagResponse_t *response); - -/** - * Inform the action manager to perform a manual validation of a group of GPUs on the system - * - * @param pDcgmHandle IN: DCGM Handle - * @param drd IN: Contains the group id, test names, test parameters, struct version, and the validation - * that should be performed. Look at \ref dcgmGroupCreate for details on creating the - * group. Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS to perform - * operation on all the GPUs. - * @param response OUT: Result of the validation process. Refer to \ref dcgmDiagResponse_t for details. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_SUPPORTED if running the specified \a validate is not supported. This is usually - * due to the Tesla recommended driver not being installed on the system. - * - \ref DCGM_ST_BADPARAM if \a groupId, \a validate, or \a statusHandle is invalid - * - \ref DCGM_ST_GENERIC_ERROR an internal error has occurred - * - \ref DCGM_ST_GROUP_INCOMPATIBLE if \a groupId refers to a group of non-homogeneous GPUs. This is - * currently not allowed. - */ -dcgmReturn_t dcgmActionValidate_v2(dcgmHandle_t pDcgmHandle, dcgmRunDiag_v7 *drd, dcgmDiagResponse_t *response); - -/** - * Run a diagnostic on a group of GPUs - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. 
- * @param diagLevel IN: Diagnostic level to run - * @param diagResponse IN/OUT: Result of running the DCGM diagnostic.
- * .version should be set to \ref dcgmDiagResponse_version before this call. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_SUPPORTED if running the diagnostic is not supported. This is usually due to the - * Tesla recommended driver not being installed on the system. - * - \ref DCGM_ST_BADPARAM if a provided parameter is invalid or missing - * - \ref DCGM_ST_GENERIC_ERROR an internal error has occurred - * - \ref DCGM_ST_GROUP_INCOMPATIBLE if \a groupId refers to a group of non-homogeneous GPUs. This is - * currently not allowed. - * - \ref DCGM_ST_VER_MISMATCH if .version is not set or is invalid. - * - */ -dcgmReturn_t dcgmRunDiagnostic(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmDiagnosticLevel_t diagLevel, - dcgmDiagResponse_t *diagResponse); - -/** @} */ // Closing for DCGMAPI_PO_MI - -/** @} */ // Closing for DCGMAPI_PO - -/***************************************************************************************************/ -/** @addtogroup DCGMAPI_Admin_ExecCtrl - * @{ - */ -/***************************************************************************************************/ - -/** - * Inform the policy manager loop to perform an iteration and trigger the callbacks of any - * registered functions. Callback functions will be called from a separate thread as the calling function. - * - * Note: The GPU monitoring and management agent must call this method periodically if the operation - * mode is set to manual mode (DCGM_OPERATION_MODE_MANUAL) during initialization - * (\ref dcgmInit). - * - * @param pDcgmHandle IN: DCGM Handle - * - * @return - * - \ref DCGM_ST_OK If the call was successful - * - DCGM_ST_GENERIC_ERROR The policy manager was unable to perform another iteration. 
- */ -dcgmReturn_t dcgmPolicyTrigger(dcgmHandle_t pDcgmHandle); - -/** @} */ // Closing for DCGMAPI_Admin_ExecCtrl - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_Topo Topology - * @{ - */ -/***************************************************************************************************/ - -/** - * Gets device topology corresponding to the \a gpuId. - * - * @param pDcgmHandle IN: DCGM Handle - * @param gpuId IN: GPU Id corresponding to which topology information should be fetched - * @param pDcgmDeviceTopology IN/OUT: Topology information corresponding to \a gpuId. pDcgmDeviceTopology->version must - * be set to dcgmDeviceTopology_version before this call. - * @return - * - \ref DCGM_ST_OK if the call was successful. - * - \ref DCGM_ST_BADPARAM if \a gpuId or \a pDcgmDeviceTopology were not valid. - * - \ref DCGM_ST_VER_MISMATCH if pDcgmDeviceTopology->version was not set to dcgmDeviceTopology_version. - * - */ -dcgmReturn_t DECLDIR dcgmGetDeviceTopology(dcgmHandle_t pDcgmHandle, - unsigned int gpuId, - dcgmDeviceTopology_t *pDcgmDeviceTopology); - -/** - * Gets group topology corresponding to the \a groupId. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: GroupId corresponding to which topology information should be fetched - * @param pDcgmGroupTopology IN/OUT: Topology information corresponding to \a groupId. pDcgmgroupTopology->version must - * be set to dcgmGroupTopology_version. - * @return - * - \ref DCGM_ST_OK if the call was successful. - * - \ref DCGM_ST_BADPARAM if \a groupId or \a pDcgmGroupTopology were not valid. - * - \ref DCGM_ST_VER_MISMATCH if pDcgmgroupTopology->version was not set to dcgmGroupTopology_version. 
- * - */ -dcgmReturn_t DECLDIR dcgmGetGroupTopology(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmGroupTopology_t *pDcgmGroupTopology); - -/** @} */ // Closing for DCGMAPI_Topo - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_METADATA Metadata - * @{ - * This chapter describes the methods that query for DCGM metadata. - */ -/***************************************************************************************************/ - -/** - * Toggle the state of introspection metadata gathering in DCGM. Metadata gathering will increase the memory usage - * of DCGM so that it can store the metadata it gathers. - * - * @param pDcgmHandle IN: DCGM Handle - * @param enabledState IN: The state to set gathering of introspection data to - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM enabledState is an invalid state for metadata gathering - * - */ -dcgmReturn_t DECLDIR dcgmIntrospectToggleState(dcgmHandle_t pDcgmHandle, dcgmIntrospectState_t enabledState); - -/*************************************************************************/ -/** - * Get the current amount of memory used to store the given field collection. - * - * @param pDcgmHandle IN: DCGM Handle - * @param context IN: see \ref dcgmIntrospectContext_t. This identifies the level of fields to do - * introspection for (ex: all fields, field groups) context->version must be - * set to dcgmIntrospectContext_version prior to this call. - * @param memoryInfo IN/OUT: see \ref dcgmIntrospectFullMemory_t. memoryInfo->version must be set to - * dcgmIntrospectFullMemory_version prior to this call. - * @param waitIfNoData IN: if no metadata has been gathered, should this call block until data has been - * gathered (1), or should this call just return DCGM_ST_NO_DATA (0). 
- * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_CONFIGURED if metadata gathering state is \a DCGM_INTROSPECT_STATE_DISABLED - * - \ref DCGM_ST_NO_DATA if \a waitIfNoData is false and metadata has not been gathered yet - * - \ref DCGM_ST_VER_MISMATCH if context->version or memoryInfo->version is 0 or invalid. - * - */ -dcgmReturn_t DECLDIR dcgmIntrospectGetFieldsMemoryUsage(dcgmHandle_t pDcgmHandle, - dcgmIntrospectContext_t *context, - dcgmIntrospectFullMemory_t *memoryInfo, - int waitIfNoData); - - -/*************************************************************************/ -/** - * Retrieve the total amount of memory that the hostengine process is currently using. - * This measurement represents both the resident set size (what is currently in RAM) and - * the swapped memory that belongs to the process. - * - * @param pDcgmHandle IN: DCGM Handle - * @param memoryInfo IN/OUT: see \ref dcgmIntrospectMemory_t. memoryInfo->version must be set to - * dcgmIntrospectMemory_version prior to this call. - * @param waitIfNoData IN: if no metadata is gathered wait till this occurs (!0) or return DCGM_ST_NO_DATA (0) - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_CONFIGURED if metadata gathering state is \a DCGM_INTROSPECT_STATE_DISABLED - * - \ref DCGM_ST_NO_DATA if \a waitIfNoData is false and metadata has not been gathered yet - * - \ref DCGM_ST_VER_MISMATCH if memoryInfo->version is 0 or invalid. - * - */ -dcgmReturn_t DECLDIR dcgmIntrospectGetHostengineMemoryUsage(dcgmHandle_t pDcgmHandle, - dcgmIntrospectMemory_t *memoryInfo, - int waitIfNoData); - -/*************************************************************************/ -/** - * Get introspection info relating to execution time needed to update the fields - * identified by \a context. - * - * @param pDcgmHandle IN: DCGM Handle - * @param context IN: see \ref dcgmIntrospectContext_t. 
This identifies the level of fields to do - * introspection for (ex: all fields, field group ) context->version must be set to - * dcgmIntrospectContext_version prior to this call. - * @param execTime IN/OUT: see \ref dcgmIntrospectFullFieldsExecTime_t. execTime->version must be set to - * dcgmIntrospectFullFieldsExecTime_version prior to this call. - * @param waitIfNoData IN: if no metadata is gathered, wait until data has been gathered (1) or return - * DCGM_ST_NO_DATA (0) - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_CONFIGURED if metadata gathering state is \a DCGM_INTROSPECT_STATE_DISABLED - * - \ref DCGM_ST_NO_DATA if \a waitIfNoData is false and metadata has not been gathered yet - * - \ref DCGM_ST_VER_MISMATCH if context->version or execTime->version is 0 or invalid. - * - */ -dcgmReturn_t DECLDIR dcgmIntrospectGetFieldsExecTime(dcgmHandle_t pDcgmHandle, - dcgmIntrospectContext_t *context, - dcgmIntrospectFullFieldsExecTime_t *execTime, - int waitIfNoData); - -/*************************************************************************/ -/** - * Retrieve the CPU utilization of the DCGM hostengine process. - * - * @param pDcgmHandle IN: DCGM Handle - * @param cpuUtil IN/OUT: see \ref dcgmIntrospectCpuUtil_t. cpuUtil->version must be set to - * dcgmIntrospectCpuUtil_version prior to this call. - * @param waitIfNoData IN: if no metadata is gathered wait till this occurs (!0) or return DCGM_ST_NO_DATA (0) - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_CONFIGURED if metadata gathering state is \a DCGM_INTROSPECT_STATE_DISABLED - * - \ref DCGM_ST_NO_DATA if \a waitIfNoData is false and metadata has not been gathered yet - * - \ref DCGM_ST_VER_MISMATCH if cpuUtil->version or execTime->version is 0 or invalid. 
- * - */ -dcgmReturn_t DECLDIR dcgmIntrospectGetHostengineCpuUtilization(dcgmHandle_t pDcgmHandle, - dcgmIntrospectCpuUtil_t *cpuUtil, - int waitIfNoData); - -/*************************************************************************/ -/** - * This method is used to manually tell the the introspection module to update - * all DCGM introspection data. This is normally performed automatically on an - * interval of 1 second. - * - * @param pDcgmHandle IN: DCGM Handle - * @param waitForUpdate IN: Whether or not to wait for the update loop to complete before returning to the caller - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if \a waitForUpdate is invalid - * - */ -dcgmReturn_t DECLDIR dcgmIntrospectUpdateAll(dcgmHandle_t pDcgmHandle, int waitForUpdate); - -/** @} */ // Closing for DCGMAPI_METADATA - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_TOPOLOGY Topology - * @{ - * This chapter describes the methods that query for DCGM topology information. - */ -/***************************************************************************************************/ - -/*************************************************************************/ -/** - * Get the best group of gpus from the specified bitmask according to topological proximity: cpuAffinity, NUMA - * node, and NVLink. - * - * @param pDcgmHandle IN: DCGM Handle - * @param inputGpuIds IN: a bitmask of which GPUs DCGM should consider. If some of the GPUs on the system are - * already in use, they shouldn't be included in the bitmask. 0 means that all of the GPUs - * in the system should be considered. - * @param numGpus IN: the number of GPUs that are desired from inputGpuIds. If this number is greater than - * the number of healthy GPUs in inputGpuIds, then less than numGpus gpus will be - * specified in outputGpuIds. 
- * @param outputGpuIds OUT: a bitmask of numGpus or fewer GPUs from inputGpuIds that represent the best placement - * available from inputGpuIds. - * @param hintFlags IN: a bitmask of DCGM_TOPO_HINT_F_ #defines of hints that should be taken into account when - * assigning outputGpuIds. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - */ -dcgmReturn_t DECLDIR dcgmSelectGpusByTopology(dcgmHandle_t pDcgmHandle, - uint64_t inputGpuIds, - uint32_t numGpus, - uint64_t *outputGpuIds, - uint64_t hintFlags); - -/** @} */ // Closing for DCGMAPI_TOPOLOGY - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_MODULES Modules - * @{ - * This chapter describes the methods that query and configure DCGM modules. - */ -/***************************************************************************************************/ - -/*************************************************************************/ -/** - * Set a module to be blacklisted. This module will be prevented from being loaded - * if it hasn't been loaded already. Modules are lazy-loaded as they are used by - * DCGM APIs, so it's important to call this API soon after the host engine has been started. - * You can also pass --blacklist-modules to the nv-hostengine binary to make sure modules - * get blacklisted immediately after the host engine starts up. - * - * @param pDcgmHandle IN: DCGM Handle - * @param moduleId IN: ID of the module to blacklist. Use \ref dcgmModuleGetStatuses to get a list of valid - * module IDs. - * - * @return - * - \ref DCGM_ST_OK if the module has been blacklisted. - * - \ref DCGM_ST_IN_USE if the module has already been loaded and cannot be blacklisted. - * - \ref DCGM_ST_BADPARAM if a parameter is missing or bad. 
- * - */ -dcgmReturn_t DECLDIR dcgmModuleBlacklist(dcgmHandle_t pDcgmHandle, dcgmModuleId_t moduleId); - -/*************************************************************************/ -/** - * Get the status of all of the DCGM modules. - * - * @param pDcgmHandle IN: DCGM Handle - * @param moduleStatuses OUT: Module statuses.
- * .version should be set to dcgmModuleStatuses_version upon calling. - * - * @return - * - \ref DCGM_ST_OK if the request succeeds. - * - \ref DCGM_ST_BADPARAM if a parameter is missing or bad. - * - */ -dcgmReturn_t DECLDIR dcgmModuleGetStatuses(dcgmHandle_t pDcgmHandle, dcgmModuleGetStatuses_t *moduleStatuses); - -/** @} */ // Closing for DCGMAPI_MODULES - -/*************************************************************************/ -/** @defgroup DCGMAPI_PROFILING Profiling - * @{ - * This chapter describes the methods that watch profiling fields from within DCGM. - */ -/*************************************************************************/ - -/*************************************************************************/ -/** - * Get all of the profiling metric groups for a given GPU group. - * - * Profiling metrics are watched in groups of fields that are all watched together. For instance, if you want - * to watch DCGM_FI_PROF_GR_ENGINE_ACTIVITY, this might also be in the same group as DCGM_FI_PROF_SM_EFFICIENCY. - * Watching this group would result in DCGM storing values for both of these metrics. - * - * Some groups cannot be watched concurrently as others as they utilize the same hardware resource. For instance, - * you may not be able to watch DCGM_FI_PROF_TENSOR_OP_UTIL at the same time as DCGM_FI_PROF_GR_ENGINE_ACTIVITY - * on your hardware. At the same time, you may be able to watch DCGM_FI_PROF_TENSOR_OP_UTIL at the same time as - * DCGM_FI_PROF_NVLINK_TX_DATA. - * - * Metrics that can be watched concurrently will have different .majorId fields in their dcgmProfMetricGroupInfo_t - * - * See \ref dcgmGroupCreate for details on creating a GPU group - * See \ref dcgmProfWatchFields to actually watch a metric group - * - * @param pDcgmHandle IN: DCGM Handle - * @param metricGroups IN/OUT: Metric groups supported for metricGroups->groupId.
- * metricGroups->version should be set to dcgmProfGetMetricGroups_version upon calling. - * - * @return - * - \ref DCGM_ST_OK if the request succeeds. - * - \ref DCGM_ST_BADPARAM if a parameter is missing or bad. - * - \ref DCGM_ST_GROUP_INCOMPATIBLE if metricGroups->groupId's GPUs are not identical GPUs. - * - \ref DCGM_ST_NOT_SUPPORTED if profiling metrics are not supported for the given GPU group. - * - */ -dcgmReturn_t DECLDIR dcgmProfGetSupportedMetricGroups(dcgmHandle_t pDcgmHandle, - dcgmProfGetMetricGroups_t *metricGroups); - -/** - * Request that DCGM start recording updates for a given list of profiling field IDs. - * - * Once metrics have been watched by this API, any of the normal DCGM field-value retrieval APIs can be used on - * the underlying fieldIds of this metric group. See \ref dcgmGetLatestValues_v2, \ref dcgmGetLatestValuesForFields, - * \ref dcgmEntityGetLatestValues, and \ref dcgmEntitiesGetLatestValues. - * - * @param pDcgmHandle IN: DCGM Handle - * @param watchFields IN: Details of which metric groups to watch for which GPUs. See \ref dcgmProfWatchFields_v1 - * for details of what should be put in each struct member. watchFields->version should be - * set to dcgmProfWatchFields_version upon calling. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_NOT_SUPPORTED if profiling metric group metricGroupTag is not supported for the given - * GPU group. - * - \ref DCGM_ST_GROUP_INCOMPATIBLE if groupId's GPUs are not identical GPUs. Profiling metrics are only - * support for homogenous groups of GPUs. 
- * - \ref DCGM_ST_PROFILING_MULTI_PASS if any of the metric groups could not be watched concurrently due to - * requiring the hardware to gather them with multiple passes - * - */ -dcgmReturn_t dcgmProfWatchFields(dcgmHandle_t pDcgmHandle, dcgmProfWatchFields_t *watchFields); - -/** - * Request that DCGM stop recording updates for all profiling field IDs for all GPUs - * - * @param pDcgmHandle IN: DCGM Handle - * @param unwatchFields IN: Details of which metric groups to unwatch for which GPUs. See \ref - * dcgmProfUnwatchFields_v1 for details of what should be put in each struct member. - * unwatchFields->version should be set to dcgmProfUnwatchFields_version upon calling. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ -dcgmReturn_t dcgmProfUnwatchFields(dcgmHandle_t pDcgmHandle, dcgmProfUnwatchFields_t *unwatchFields); - -/** - * Pause profiling activities in DCGM. This should be used when you are monitoring profiling fields - * from DCGM but want to be able to still run developer tools like nvprof, nsight systems, and nsight compute. - * Profiling fields start with DCGM_PROF_ and are in the field ID range 1001-1012. - * - * Call this API before you launch one of those tools and dcgmProfResume() after the tool has completed. - * - * DCGM will save BLANK values while profiling is paused. - * - * Calling this while profiling activities are already paused is fine and will be treated as a no-op. - * - * @param pDcgmHandle IN: DCGM Handle - * - * @return - * - \ref DCGM_ST_OK If the call was successful. - * - \ref DCGM_ST_BADPARAM if a parameter is invalid. - * - */ -dcgmReturn_t dcgmProfPause(dcgmHandle_t pDcgmHandle); - -/** - * Resume profiling activities in DCGM that were previously paused with dcgmProfPause(). - * - * Call this API after you have completed running other NVIDIA developer tools to reenable DCGM - * profiling metrics. 
- * - * DCGM will save BLANK values while profiling is paused. - * - * Calling this while profiling activities have already been resumed is fine and will be treated as a no-op. - * - * @param pDcgmHandle IN: DCGM Handle - * - * @return - * - \ref DCGM_ST_OK If the call was successful. - * - \ref DCGM_ST_BADPARAM if a parameter is invalid. - * - */ -dcgmReturn_t dcgmProfResume(dcgmHandle_t pDcgmHandle); - -/** @} */ // Closing for DCGMAPI_PROFILING - -/** - * Adds fake GPU instances and or compute instances for testing purposes. The entity IDs specified for - * the GPU instances and compute instances are only guaranteed to be used by DCGM if MIG mode is not active. - * - * NOTE: this API will not work on a real system reading actual values from NVML, and it may even cause - * the real instances to malfunction. This API is for testing purposes only. - * - * @param pDcgmHandle IN: DCGM Handle - * @param hierarchy - * - * @return - * - \ref DCGM_ST_OK - * - */ -dcgmReturn_t dcgmAddFakeInstances(dcgmHandle_t pDcgmHandle, dcgmMigHierarchy_v1 *hierarchy); - -#ifdef __cplusplus -} -#endif - -#endif /* DCGM_AGENT_H */ diff --git a/bindings/go/dcgm/dcgm_errors.h b/bindings/go/dcgm/dcgm_errors.h deleted file mode 100644 index edce9eab..00000000 --- a/bindings/go/dcgm/dcgm_errors.h +++ /dev/null @@ -1,474 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifndef DCGM_ERRORS_H -#define DCGM_ERRORS_H - -/* - * Error codes for passive and active health checks. - * New error codes must be added to end of enum to maintain backwards compatibility. - */ -typedef enum dcgmError_enum -{ - DCGM_FR_OK = 0, //!< No error - DCGM_FR_UNKNOWN = 1, //!< Unknown error code - DCGM_FR_UNRECOGNIZED = 2, //!< Unrecognized error code - DCGM_FR_PCI_REPLAY_RATE = 3, //!< Unacceptable rate of PCI errors - DCGM_FR_VOLATILE_DBE_DETECTED = 4, //!< Uncorrectable volatile double bit error - DCGM_FR_VOLATILE_SBE_DETECTED = 5, //!< Unacceptable rate of volatile single bit errors - DCGM_FR_PENDING_PAGE_RETIREMENTS = 6, //!< Pending page retirements detected - DCGM_FR_RETIRED_PAGES_LIMIT = 7, //!< Unacceptable total page retirements detected - DCGM_FR_RETIRED_PAGES_DBE_LIMIT = 8, //!< Unacceptable total page retirements due to uncorrectable errors - DCGM_FR_CORRUPT_INFOROM = 9, //!< Corrupt inforom found - DCGM_FR_CLOCK_THROTTLE_THERMAL = 10, //!< Clocks being throttled due to overheating - DCGM_FR_POWER_UNREADABLE = 11, //!< Cannot get a reading for power from NVML - DCGM_FR_CLOCK_THROTTLE_POWER = 12, //!< Clock being throttled due to power restrictions - DCGM_FR_NVLINK_ERROR_THRESHOLD = 13, //!< Unacceptable rate of NVLink errors - DCGM_FR_NVLINK_DOWN = 14, //!< NVLink is down - DCGM_FR_NVSWITCH_FATAL_ERROR = 15, //!< Fatal errors on the NVSwitch - DCGM_FR_NVSWITCH_NON_FATAL_ERROR = 16, //!< Non-fatal errors on the NVSwitch - DCGM_FR_NVSWITCH_DOWN = 17, //!< NVSwitch is down - DCGM_FR_NO_ACCESS_TO_FILE = 18, //!< Cannot access a file - DCGM_FR_NVML_API = 19, //!< Error occurred on an NVML API - DCGM_FR_DEVICE_COUNT_MISMATCH = 20, //!< Disagreement in GPU count between /dev and NVML - DCGM_FR_BAD_PARAMETER = 21, //!< Bad parameter passed to API - DCGM_FR_CANNOT_OPEN_LIB = 22, //!< Cannot open a library that must be accessed - DCGM_FR_BLACKLISTED_DRIVER = 23, //!< A blacklisted driver (nouveau) is active - DCGM_FR_NVML_LIB_BAD = 24, //!< The 
NVML library is missing expected functions - DCGM_FR_GRAPHICS_PROCESSES = 25, //!< Graphics processes are active on this GPU - DCGM_FR_HOSTENGINE_CONN = 26, //!< Unstable connection to nv-hostengine (daemonized DCGM) - DCGM_FR_FIELD_QUERY = 27, //!< Error querying a field from DCGM - DCGM_FR_BAD_CUDA_ENV = 28, //!< The environment has variables that hurt CUDA - DCGM_FR_PERSISTENCE_MODE = 29, //!< Persistence mode is disabled - DCGM_FR_LOW_BANDWIDTH = 30, //!< The bandwidth is unacceptably low - DCGM_FR_HIGH_LATENCY = 31, //!< Latency is too high - DCGM_FR_CANNOT_GET_FIELD_TAG = 32, //!< Cannot find a tag for a field - DCGM_FR_FIELD_VIOLATION = 33, //!< The value for the specified error field is above 0 - DCGM_FR_FIELD_THRESHOLD = 34, //!< The value for the specified field is above the threshold - DCGM_FR_FIELD_VIOLATION_DBL = 35, //!< The value for the specified error field is above 0 - DCGM_FR_FIELD_THRESHOLD_DBL = 36, //!< The value for the specified field is above the threshold - DCGM_FR_UNSUPPORTED_FIELD_TYPE = 37, //!< Field type cannot be supported - DCGM_FR_FIELD_THRESHOLD_TS = 38, //!< The value for the specified field is above the threshold - DCGM_FR_FIELD_THRESHOLD_TS_DBL = 39, //!< The value for the specified field is above the threshold - DCGM_FR_THERMAL_VIOLATIONS = 40, //!< Thermal violations detected - DCGM_FR_THERMAL_VIOLATIONS_TS = 41, //!< Thermal violations detected with a timestamp - DCGM_FR_TEMP_VIOLATION = 42, //!< Temperature is too high - DCGM_FR_THROTTLING_VIOLATION = 43, //!< Non-benign clock throttling is occurring - DCGM_FR_INTERNAL = 44, //!< An internal error was detected - DCGM_FR_PCIE_GENERATION = 45, //!< PCIe generation is too low - DCGM_FR_PCIE_WIDTH = 46, //!< PCIe width is too low - DCGM_FR_ABORTED = 47, //!< Test was aborted by a user signal - DCGM_FR_TEST_DISABLED = 48, //!< This test is disabled for this GPU - DCGM_FR_CANNOT_GET_STAT = 49, //!< Cannot get telemetry for a needed value - DCGM_FR_STRESS_LEVEL = 50, //!< Stress 
level is too low (bad performance) - DCGM_FR_CUDA_API = 51, //!< Error calling the specified CUDA API - DCGM_FR_FAULTY_MEMORY = 52, //!< Faulty memory detected on this GPU - DCGM_FR_CANNOT_SET_WATCHES = 53, //!< Unable to set field watches in DCGM - DCGM_FR_CUDA_UNBOUND = 54, //!< CUDA context is no longer bound - DCGM_FR_ECC_DISABLED = 55, //!< ECC memory is disabled right now - DCGM_FR_MEMORY_ALLOC = 56, //!< Cannot allocate memory on the GPU - DCGM_FR_CUDA_DBE = 57, //!< CUDA detected unrecovable double-bit error - DCGM_FR_MEMORY_MISMATCH = 58, //!< Memory error detected - DCGM_FR_CUDA_DEVICE = 59, //!< No CUDA device discoverable for existing GPU - DCGM_FR_ECC_UNSUPPORTED = 60, //!< ECC memory is unsupported by this SKU - DCGM_FR_ECC_PENDING = 61, //!< ECC memory is in a pending state - DCGM_FR_MEMORY_BANDWIDTH = 62, //!< Memory bandwidth is too low - DCGM_FR_TARGET_POWER = 63, //!< Cannot hit the target power draw - DCGM_FR_API_FAIL = 64, //!< The specified API call failed - DCGM_FR_API_FAIL_GPU = 65, //!< The specified API call failed for the specified GPU - DCGM_FR_CUDA_CONTEXT = 66, //!< Cannot create a CUDA context on this GPU - DCGM_FR_DCGM_API = 67, //!< DCGM API failure - DCGM_FR_CONCURRENT_GPUS = 68, //!< Need multiple GPUs to run this test - DCGM_FR_TOO_MANY_ERRORS = 69, //!< More errors than fit in the return struct - DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD = 70, //!< More than 100 CRC errors are happening per second - DCGM_FR_NVLINK_ERROR_CRITICAL = 71, //!< NVLink error for a field that should always be 0 - DCGM_FR_ENFORCED_POWER_LIMIT = 72, //!< The enforced power limit is too low to hit the target - DCGM_FR_MEMORY_ALLOC_HOST = 73, //!< Cannot allocate memory on the host - DCGM_FR_GPU_OP_MODE = 74, //!< Bad GPU operating mode for running plugin - DCGM_FR_NO_MEMORY_CLOCKS = 75, //!< No memory clocks with the needed MHz were found - DCGM_FR_NO_GRAPHICS_CLOCKS = 76, //!< No graphics clocks with the needed MHz were found - DCGM_FR_HAD_TO_RESTORE_STATE = 
77, //!< Note that we had to restore a GPU's state - DCGM_FR_L1TAG_UNSUPPORTED = 78, //!< L1TAG test is unsupported by this SKU - DCGM_FR_L1TAG_MISCOMPARE = 79, //!< L1TAG test failed on a miscompare - DCGM_FR_ROW_REMAP_FAILURE = 80, //!< Row remapping failed (Ampere or newer GPUs) - DCGM_FR_UNCONTAINED_ERROR = 81, //!< Uncontained error - XID 95 - DCGM_FR_EMPTY_GPU_LIST = 82, //!< No GPU information given to plugin - DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS = 83, //!< Pending page retirements due to a DBE - DCGM_FR_ERROR_SENTINEL = 84, //!< MUST BE THE LAST ERROR CODE -} dcgmError_t; - -typedef enum dcgmErrorSeverity_enum -{ - DCGM_ERROR_MONITOR = 0, //!< Can perform workload, but needs to be monitored. - DCGM_ERROR_ISOLATE = 1, //!< Cannot perform workload. GPU should be isolated. - DCGM_ERROR_UNKNOWN = 2, //!< This error code is not recognized -} dcgmErrorSeverity_t; - -typedef struct -{ - dcgmError_t errorId; - const char *msgFormat; - const char *suggestion; - int severity; -} dcgm_error_meta_t; - -extern dcgm_error_meta_t dcgmErrorMeta[]; - - -/* Standard message for running a field diagnostic */ -#define TRIAGE_RUN_FIELD_DIAG_MSG "Run a field diagnostic on the GPU." -#define DEBUG_COOLING_MSG \ - "Verify that the cooling on this machine is functional, including external, " \ - "thermal material interface, fans, and any other components." - -/* - * Messages for the error codes. All messages must be defined in the ERROR_CODE_MSG format - * where is the actual message. - */ -#define DCGM_FR_OK_MSG "The operation completed successfully." -#define DCGM_FR_UNKNOWN_MSG "Unknown error." -#define DCGM_FR_UNRECOGNIZED_MSG "Unrecognized error code." -// replay limit, gpu id, replay errors detected -#define DCGM_FR_PCI_REPLAY_RATE_MSG "Detected more than %u PCIe replays per minute for GPU %u : %d" -// dbes deteced, gpu id -#define DCGM_FR_VOLATILE_DBE_DETECTED_MSG "Detected %d volatile double-bit ECC error(s) in GPU %u." 
-// sbe limit, gpu id, sbes detected -#define DCGM_FR_VOLATILE_SBE_DETECTED_MSG "More than %u single-bit ECC error(s) detected in GPU %u Volatile SBEs: %lld" -// gpu id -#define DCGM_FR_PENDING_PAGE_RETIREMENTS_MSG "A pending retired page has been detected in GPU %u." -// retired pages detected, gpud id -#define DCGM_FR_RETIRED_PAGES_LIMIT_MSG "%u or more retired pages have been detected in GPU %u. " -// retired pages due to dbes detected, gpu id -#define DCGM_FR_RETIRED_PAGES_DBE_LIMIT_MSG \ - "An excess of %u retired pages due to DBEs have been detected and" \ - " more than one page has been retired due to DBEs in the past" \ - " week in GPU %u." -// gpu id -#define DCGM_FR_CORRUPT_INFOROM_MSG "A corrupt InfoROM has been detected in GPU %u." -// gpu id -#define DCGM_FR_CLOCK_THROTTLE_THERMAL_MSG "Detected clock throttling due to thermal violation in GPU %u." -// gpu id -#define DCGM_FR_POWER_UNREADABLE_MSG "Cannot reliably read the power usage for GPU %u." -// gpu id -#define DCGM_FR_CLOCK_THROTTLE_POWER_MSG "Detected clock throttling due to power violation in GPU %u." -// nvlink errors detected, nvlink id, error threshold -#define DCGM_FR_NVLINK_ERROR_THRESHOLD_MSG \ - "Detected %ld %s NvLink errors on GPU %u's NVLink which exceeds " \ - "threshold of %u" -// gpu id, nvlink id -#define DCGM_FR_NVLINK_DOWN_MSG "GPU %u's NvLink link %d is currently down" -// nvswitch id, nvlink id -#define DCGM_FR_NVSWITCH_FATAL_ERROR_MSG "Detected fatal errors on NvSwitch %u link %u" -// nvswitch id, nvlink id -#define DCGM_FR_NVSWITCH_NON_FATAL_ERROR_MSG "Detected nonfatal errors on NvSwitch %u link %u" -// nvswitch id, nvlink port -#define DCGM_FR_NVSWITCH_DOWN_MSG "NvSwitch physical ID %u's NvLink port %d is currently down." 
-// file path, error detail -#define DCGM_FR_NO_ACCESS_TO_FILE_MSG "File %s could not be accessed directly: %s" -// purpose for communicating with NVML, NVML error as string, NVML error -#define DCGM_FR_NVML_API_MSG "Error calling NVML API %s: %s" -#define DCGM_FR_DEVICE_COUNT_MISMATCH_MSG \ - "The number of devices NVML returns is different than the number " \ - "of devices in /dev." -// function name -#define DCGM_FR_BAD_PARAMETER_MSG "Bad parameter to function %s cannot be processed" -// library name, error returned from dlopen -#define DCGM_FR_CANNOT_OPEN_LIB_MSG "Cannot open library %s: '%s'" -// the name of the blacklisted driver -#define DCGM_FR_BLACKLISTED_DRIVER_MSG "Found blacklisted driver: %s" -// the name of the function that wasn't found -#define DCGM_FR_NVML_LIB_BAD_MSG "Cannot get pointer to %s from libnvidia-ml.so" -#define DCGM_FR_GRAPHICS_PROCESSES_MSG \ - "NVVS has detected processes with graphics contexts open running on at least one " \ - "GPU. This may cause some tests to fail." -// error message from the API call -#define DCGM_FR_HOSTENGINE_CONN_MSG "Could not connect to the host engine: '%s'" -// field name, gpu id -#define DCGM_FR_FIELD_QUERY_MSG "Could not query field %s for GPU %u" -// environment variable name -#define DCGM_FR_BAD_CUDA_ENV_MSG "Found CUDA performance-limiting environment variable '%s'." -// gpu id -#define DCGM_FR_PERSISTENCE_MODE_MSG \ - "Persistence mode for GPU %u is currently disabled. The DCGM " \ - "diagnostic requires peristence mode to be enabled." -// gpu id, direction (d2h, e.g.), measured bandwidth, expected bandwidth -#define DCGM_FR_LOW_BANDWIDTH_MSG \ - "Bandwidth of GPU %u in direction %s of %.2f did not exceed " \ - "minimum required bandwidth of %.2f." -// gpu id, direction (d2h, e.g.), measured latency, expected latency -#define DCGM_FR_HIGH_LATENCY_MSG \ - "Latency type %s of GPU %u value %.2f exceeded maximum allowed " \ - "latency of %.2f." 
-// field id -#define DCGM_FR_CANNOT_GET_FIELD_TAG_MSG "Unable to get field information for field id %hu" -// field value, field name, gpu id (this message is for fields that should always have a 0 value) -#define DCGM_FR_FIELD_VIOLATION_MSG "Detected %ld %s for GPU %u" -// field value, field name, gpu id, allowable threshold -#define DCGM_FR_FIELD_THRESHOLD_MSG "Detected %ld %s for GPU %u which is above the threshold %ld" -// field value, field name, gpu id (same as DCGM_FR_FIELD_VIOLATION, but it's a double) -#define DCGM_FR_FIELD_VIOLATION_DBL_MSG "Detected %.1f %s for GPU %u" -// field value, field name, gpu id, allowable threshold (same as DCGM_FR_FIELD_THRESHOLD, but it's a double) -#define DCGM_FR_FIELD_THRESHOLD_DBL_MSG "Detected %.1f %s for GPU %u which is above the threshold %.1f" -// field name -#define DCGM_FR_UNSUPPORTED_FIELD_TYPE_MSG \ - "Field %s is not supported by this API because it is neither an " \ - "int64 nor a double type." -// field name, allowable threshold, observed value, seconds -#define DCGM_FR_FIELD_THRESHOLD_TS_MSG \ - "%s met or exceeded the threshold of %lu per second: %lu at " \ - "%.1f seconds into the test." -// field name, allowable threshold, observed value, seconds (same as DCGM_FR_FIELD_THRESHOLD, but it's a double) -#define DCGM_FR_FIELD_THRESHOLD_TS_DBL_MSG \ - "%s met or exceeded the threshold of %.1f per second: %.1f at " \ - "%.1f seconds into the test." 
-// total seconds of violation, gpu id -#define DCGM_FR_THERMAL_VIOLATIONS_MSG "There were thermal violations totaling %lu seconds for GPU %u" -// total seconds of violations, first instance, gpu id -#define DCGM_FR_THERMAL_VIOLATIONS_TS_MSG \ - "Thermal violations totaling %lu samples started at %.1f seconds " \ - "into the test for GPU %u" -// observed temperature, gpu id, max allowed temperature -#define DCGM_FR_TEMP_VIOLATION_MSG \ - "Temperature %lld of GPU %u exceeded user-specified maximum " \ - "allowed temperature %lld" -// gpu id, seconds into test, details about throttling -#define DCGM_FR_THROTTLING_VIOLATION_MSG \ - "Clocks are being throttled for GPU %u because of clock " \ - "throttling starting %.1f seconds into the test. %s" -// details about error -#define DCGM_FR_INTERNAL_MSG "There was an internal error during the test: '%s'" -// gpu id, PCIe generation, minimum allowed, parameter to control -#define DCGM_FR_PCIE_GENERATION_MSG \ - "GPU %u is running at PCI link generation %d, which is below " \ - "the minimum allowed link generation of %d (parameter '%s')" -// gpu id, PCIe width, minimum allowed, parameter to control -#define DCGM_FR_PCIE_WIDTH_MSG \ - "GPU %u is running at PCI link width %dX, which is below the " \ - "minimum allowed link generation of %d (parameter '%s')" -#define DCGM_FR_ABORTED_MSG "Test was aborted early due to user signal" -// Test name -#define DCGM_FR_TEST_DISABLED_MSG "The %s test is skipped for this GPU." 
-// stat name, gpu id -#define DCGM_FR_CANNOT_GET_STAT_MSG "Unable to generate / collect stat %s for GPU %u" -// observed value, minimum allowed, gpu id -#define DCGM_FR_STRESS_LEVEL_MSG \ - "Max stress level of %.1f did not reach desired stress level of " \ - "%.1f for GPU %u" -// CUDA API name -#define DCGM_FR_CUDA_API_MSG "Error using CUDA API %s" -// count, gpu id -#define DCGM_FR_FAULTY_MEMORY_MSG "Found %d faulty memory elements on GPU %u" -// error detail -#define DCGM_FR_CANNOT_SET_WATCHES_MSG "Unable to add field watches to DCGM: %s" -// gpu id -#define DCGM_FR_CUDA_UNBOUND_MSG "Cuda GPU %d is no longer bound to a CUDA context...Aborting" -// Test name, gpu id -#define DCGM_FR_ECC_DISABLED_MSG "Skipping test %s because ECC is not enabled on GPU %u" -// percentage of memory we tried to allocate, gpu id -#define DCGM_FR_MEMORY_ALLOC_MSG "Couldn't allocate at least %.1f%% of GPU memory on GPU %u" -// gpu id -#define DCGM_FR_CUDA_DBE_MSG \ - "CUDA APIs have indicated that a double-bit ECC error has " \ - "occured on GPU %u." -// gpu id -#define DCGM_FR_MEMORY_MISMATCH_MSG \ - "A memory mismatch was detected on GPU %u, but no error was " \ - "reported by CUDA or NVML." -// gpu id, error detail -#define DCGM_FR_CUDA_DEVICE_MSG "Unable to find a corresponding CUDA device for GPU %u: '%s'" -#define DCGM_FR_ECC_UNSUPPORTED_MSG "ECC Memory is not turned on or is unsupported. Skipping test." -// gpu id -#define DCGM_FR_ECC_PENDING_MSG "ECC memory for GPU %u is in a pending state." 
-// gpu id, observed bandwidth, required, test name -#define DCGM_FR_MEMORY_BANDWIDTH_MSG \ - "GPU %u only achieved a memory bandwidth of %.2f GB/s, failing " \ - "to meet %.2f GB/s for test %d" -// power draw observed, field tag, minimum power draw required, gpu id -#define DCGM_FR_TARGET_POWER_MSG \ - "Max power of %.1f did not reach desired power minimum %s of " \ - "%.1f for GPU %u" -// API name, error detail -#define DCGM_FR_API_FAIL_MSG "API call %s failed: '%s'" -// API name, gpu id, error detail -#define DCGM_FR_API_FAIL_GPU_MSG "API call %s failed for GPU %u: '%s'" -// gpu id, error detail -#define DCGM_FR_CUDA_CONTEXT_MSG "GPU %u failed to create a CUDA context: %s" -// DCGM API name -#define DCGM_FR_DCGM_API_MSG "Error using DCGM API %s" -#define DCGM_FR_CONCURRENT_GPUS_MSG \ - "Unable to run concurrent pair bandwidth test without 2 or more " \ - "gpus. Skipping" -#define DCGM_FR_TOO_MANY_ERRORS_MSG \ - "This API can only return up to four errors per system. " \ - "Additional errors were found for this system that couldn't be " \ - "communicated." -#define DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_MSG \ - "%.1f %s NvLink errors found occuring per second on GPU %u, " \ - "exceeding the limit of 100 per second." -#define DCGM_FR_NVLINK_ERROR_CRITICAL_MSG "Detected %ld %s NvLink errors on GPU %u's NVLink (should be 0)" -#define DCGM_FR_ENFORCED_POWER_LIMIT_MSG \ - "Enforced power limit on GPU %u set to %.1f, which is too low to " \ - "attempt to achieve target power %.1f" -#define DCGM_FR_MEMORY_ALLOC_HOST_MSG "Cannot allocate %zu bytes on the host" -#define DCGM_FR_GPU_OP_MODE_MSG "Skipping plugin due to a GPU being in GPU Operating Mode: LOW_DP." -#define DCGM_FR_NO_MEMORY_CLOCKS_MSG "No memory clocks <= %u MHZ were found in %u supported memory clocks." -#define DCGM_FR_NO_GRAPHICS_CLOCKS_MSG \ - "No graphics clocks <= %u MHZ were found in %u supported graphics clocks for memory clock %u MHZ." 
-#define DCGM_FR_HAD_TO_RESTORE_STATE_MSG "Had to restore GPU state on NVML GPU(s): %s" -#define DCGM_FR_L1TAG_UNSUPPORTED_MSG "This card does not support the L1 cache test. Skipping test." -#define DCGM_FR_L1TAG_MISCOMPARE_MSG "Detected a miscompare failure in the L1 cache." -#define DCGM_FR_ROW_REMAP_FAILURE_MSG "Row remapping failed." -#define DCGM_FR_UNCONTAINED_ERROR_MSG "GPU had an uncontained error (XID 95)" -#define DCGM_FR_EMPTY_GPU_LIST_MSG "No valid GPUs passed to plugin" -#define DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_MSG "Pending page retirements together with a DBE were detected on GPU %u." - -/* - * Suggestions for next steps for the corresponding error message - */ -#define DCGM_FR_OK_NEXT "N/A" -#define DCGM_FR_UNKNOWN_NEXT "" -#define DCGM_FR_UNRECOGNIZED_NEXT "" -#define DCGM_FR_PCI_REPLAY_RATE_NEXT \ - "Reconnect PCIe card. Run system side PCIE diagnostic utilities " \ - "to verify hops off the GPU board. If issue is on the board, run " \ - "the field diagnostic." -#define DCGM_FR_VOLATILE_DBE_DETECTED_NEXT "Drain the GPU and reset it or reboot the node." -#define DCGM_FR_VOLATILE_SBE_DETECTED_NEXT "Monitor - this GPU can still perform workload." -#define DCGM_FR_PENDING_PAGE_RETIREMENTS_NEXT "Monitor - this GPU can still perform workload" -#define DCGM_FR_RETIRED_PAGES_LIMIT_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_RETIRED_PAGES_DBE_LIMIT_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_CORRUPT_INFOROM_NEXT "Flash the InfoROM to clear this corruption." -#define DCGM_FR_CLOCK_THROTTLE_THERMAL_NEXT DEBUG_COOLING_MSG -#define DCGM_FR_POWER_UNREADABLE_NEXT "" -#define DCGM_FR_CLOCK_THROTTLE_POWER_NEXT "Monitor the power conditions. This GPU can still perform workload." -#define DCGM_FR_NVLINK_ERROR_THRESHOLD_NEXT "Monitor the NVLink. It can still perform workload." 
-#define DCGM_FR_NVLINK_DOWN_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_NVSWITCH_FATAL_ERROR_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_NVSWITCH_NON_FATAL_ERROR_NEXT "Monitor the NVSwitch. It can still perform workload." -#define DCGM_FR_NVSWITCH_DOWN_NEXT "" -#define DCGM_FR_NO_ACCESS_TO_FILE_NEXT "Check relevant permissions, access, and existence of the file." -#define DCGM_FR_NVML_API_NEXT \ - "Check the error condition and ensure that appropriate libraries " \ - "are present and accessible." -#define DCGM_FR_DEVICE_COUNT_MISMATCH_NEXT \ - "Check for the presence of cgroups, operating system blocks, and " \ - "or unsupported / older cards" -#define DCGM_FR_BAD_PARAMETER_NEXT "" -#define DCGM_FR_CANNOT_OPEN_LIB_NEXT \ - "Check for the existence of the library and set LD_LIBRARY_PATH " \ - "if needed." -#define DCGM_FR_BLACKLISTED_DRIVER_NEXT "Please load the appropriate driver." -#define DCGM_FR_NVML_LIB_BAD_NEXT \ - "Make sure that the required version of libnvidia-ml.so " \ - "is present and accessible on the system." -#define DCGM_FR_GRAPHICS_PROCESSES_NEXT \ - "Stop the graphics processes or run this diagnostic on a server " \ - "that is not being used for display purposes." -#define DCGM_FR_HOSTENGINE_CONN_NEXT \ - "If hostengine is run separately, please ensure that it is up " \ - "and responsive." -#define DCGM_FR_FIELD_QUERY_NEXT "" -#define DCGM_FR_BAD_CUDA_ENV_NEXT "Please unset this environment variable to address test failures." -#define DCGM_FR_PERSISTENCE_MODE_NEXT \ - "Enable persistence mode by running \"nvidia-smi -i -pm " \ - "1 \" as root." -#define DCGM_FR_LOW_BANDWIDTH_NEXT \ - "Verify that your minimum bandwidth setting is appropriate for " \ - "the topology of each GPU. If so, and errors are consistent, " \ - "please run a field diagnostic." -#define DCGM_FR_HIGH_LATENCY_NEXT \ - "Verify that your maximum latency setting is appropriate for " \ - "the topology of each GPU. 
If so, and errors are consistent, " \ - "please run a field diagnostic." -#define DCGM_FR_CANNOT_GET_FIELD_TAG_NEXT "" -#define DCGM_FR_FIELD_VIOLATION_NEXT "" -#define DCGM_FR_FIELD_THRESHOLD_NEXT "" -#define DCGM_FR_FIELD_VIOLATION_DBL_NEXT "" -#define DCGM_FR_FIELD_THRESHOLD_DBL_NEXT "" -#define DCGM_FR_UNSUPPORTED_FIELD_TYPE_NEXT "" -#define DCGM_FR_FIELD_THRESHOLD_TS_NEXT "" -#define DCGM_FR_FIELD_THRESHOLD_TS_DBL_NEXT "" -#define DCGM_FR_THERMAL_VIOLATIONS_NEXT DEBUG_COOLING_MSG -#define DCGM_FR_THERMAL_VIOLATIONS_TS_NEXT DEBUG_COOLING_MSG -#define DCGM_FR_TEMP_VIOLATION_NEXT \ - "Verify that the user-specified temperature maximum is set " \ - "correctly. If it is, check the cooling for this GPU and node: " DEBUG_COOLING_MSG -#define DCGM_FR_THROTTLING_VIOLATION_NEXT "" -#define DCGM_FR_INTERNAL_NEXT "" -#define DCGM_FR_PCIE_GENERATION_NEXT "" -#define DCGM_FR_PCIE_WIDTH_NEXT "" -#define DCGM_FR_ABORTED_NEXT "" -#define DCGM_FR_TEST_DISABLED_NEXT "" -#define DCGM_FR_CANNOT_GET_STAT_NEXT \ - "If running a standalone nv-hostengine, verify that it is up " \ - "and responsive." -#define DCGM_FR_STRESS_LEVEL_NEXT "" -#define DCGM_FR_CUDA_API_NEXT "" -#define DCGM_FR_FAULTY_MEMORY_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_CANNOT_SET_WATCHES_NEXT "" -#define DCGM_FR_CUDA_UNBOUND_NEXT "" -#define DCGM_FR_ECC_DISABLED_NEXT \ - "Enable ECC memory by running \"nvidia-smi -i -e 1\" " \ - "to enable. This may require a GPU reset or reboot to take effect." -#define DCGM_FR_MEMORY_ALLOC_NEXT "" -#define DCGM_FR_CUDA_DBE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_MEMORY_MISMATCH_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_CUDA_DEVICE_NEXT \ - "Make sure CUDA_VISIBLE_DEVICES is not preventing visibility of " \ - "this GPU. Also check if CUDA libraries are compatible and " \ - "correctly installed." -#define DCGM_FR_ECC_UNSUPPORTED_NEXT "" -#define DCGM_FR_ECC_PENDING_NEXT "Reboot to complete activation of the ECC memory." 
-#define DCGM_FR_MEMORY_BANDWIDTH_NEXT "" -#define DCGM_FR_TARGET_POWER_NEXT "Verify that the clock speeds and GPU utilization are high." -#define DCGM_FR_API_FAIL_NEXT "" -#define DCGM_FR_API_FAIL_GPU_NEXT "" -#define DCGM_FR_CUDA_CONTEXT_NEXT \ - "Please make sure the correct driver version is installed and " \ - "verify that no conflicting libraries are present." -#define DCGM_FR_DCGM_API_NEXT "" -#define DCGM_FR_CONCURRENT_GPUS_NEXT "" -#define DCGM_FR_TOO_MANY_ERRORS_NEXT "" -#define DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_NVLINK_ERROR_CRITICAL_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_ENFORCED_POWER_LIMIT_NEXT \ - "If this enforced power limit is necessary, then this test " \ - "cannot be run. If it is unnecessary, then raise the enforced " \ - "power limit setting to be able to run this test." -#define DCGM_FR_MEMORY_ALLOC_HOST_NEXT "Manually kill processes or restart your machine." -#define DCGM_FR_GPU_OP_MODE_NEXT \ - "Fix by running nvidia-smi as root with: nvidia-smi --gom=0 -i " \ - "" -#define DCGM_FR_NO_MEMORY_CLOCKS_NEXT "" -#define DCGM_FR_NO_GRAPHICS_CLOCKS_NEXT "" -#define DCGM_FR_HAD_TO_RESTORE_STATE_NEXT "" -#define DCGM_FR_L1TAG_UNSUPPORTED_NEXT "" -#define DCGM_FR_L1TAG_MISCOMPARE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_ROW_REMAP_FAILURE_NEXT DCGM_FR_VOLATILE_DBE_DETECTED_NEXT -#define DCGM_FR_UNCONTAINED_ERROR_NEXT DCGM_FR_VOLATILE_DBE_DETECTED_NEXT -#define DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_NEXT "Drain the GPU and reset it or reboot the node to resolve this issue." 
-#define DCGM_FR_EMPTY_GPU_LIST_NEXT "" - -#ifdef __cplusplus -extern "C" { -#endif -dcgmErrorSeverity_t dcgmErrorGetPriorityByCode(unsigned int code); -const char *dcgmErrorGetFormatMsgByCode(unsigned int code); - -#ifdef __cplusplus -} -#endif - -#endif // DCGM_ERRORS_H diff --git a/bindings/go/dcgm/dcgm_fields.h b/bindings/go/dcgm/dcgm_fields.h deleted file mode 100644 index ff156898..00000000 --- a/bindings/go/dcgm/dcgm_fields.h +++ /dev/null @@ -1,2249 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef DCGMFIELDS_H -#define DCGMFIELDS_H - -#ifdef __cplusplus -extern "C" { -#endif - -/***************************************************************************************************/ -/** @defgroup dcgmFieldTypes Field Types - * Field Types are a single byte. 
- * @{ - */ -/***************************************************************************************************/ - -/** - * Blob of binary data representing a structure - */ -#define DCGM_FT_BINARY 'b' - -/** - * 8-byte double precision - */ -#define DCGM_FT_DOUBLE 'd' - -/** - * 8-byte signed integer - */ -#define DCGM_FT_INT64 'i' - -/** - * Null-terminated ASCII Character string - */ -#define DCGM_FT_STRING 's' - -/** - * 8-byte signed integer usec since 1970 - */ -#define DCGM_FT_TIMESTAMP 't' - -/** @} */ - - -/***************************************************************************************************/ -/** @defgroup dcgmFieldScope Field Scope - * Represents field association with entity scope or global scope. - * @{ - */ -/***************************************************************************************************/ - -/** - * Field is global (ex: driver version) - */ -#define DCGM_FS_GLOBAL 0 - -/** - * Field is associated with an entity (GPU, VGPU...etc) - */ -#define DCGM_FS_ENTITY 1 - -/** - * Field is associated with a device. Deprecated. Use DCGM_FS_ENTITY - */ -#define DCGM_FS_DEVICE DCGM_FS_ENTITY - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup dcgmFieldConstants Field Constants - * Constants that represent contents of individual field values. - * @{ - */ -/***************************************************************************************************/ - -/** - * DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY is 16 bits of major version followed by - * 16 bits of the minor version. These macros separate the two. - */ -#define DCGM_CUDA_COMPUTE_CAPABILITY_MAJOR(x) ((uint64_t)(x)&0xFFFF0000) -#define DCGM_CUDA_COMPUTE_CAPABILITY_MINOR(x) ((uint64_t)(x)&0x0000FFFF) - -/** - * DCGM_FI_DEV_CLOCK_THROTTLE_REASONS is a bitmap of why the clock is throttled. 
- * These macros are masks for relevant throttling, and are a 1:1 map to the NVML - * reasons documented in nvml.h. The notes for the header are copied blow: - */ -/** Nothing is running on the GPU and the clocks are dropping to Idle state - * \note This limiter may be removed in a later release - */ -#define DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE 0x0000000000000001LL -/** GPU clocks are limited by current setting of applications clocks - */ -#define DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING 0x0000000000000002LL -/** SW Power Scaling algorithm is reducing the clocks below requested clocks - */ -#define DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP 0x0000000000000004LL -/** HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged - * - * This is an indicator of: - * - temperature being too high - * - External Power Brake Assertion is triggered (e.g. by the system power supply) - * - Power draw is too high and Fast Trigger protection is reducing the clocks - * - May be also reported during PState or clock change - * - This behavior may be removed in a later release. - */ -#define DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN 0x0000000000000008LL -/** Sync Boost - * - * This GPU has been added to a Sync boost group with nvidia-smi or DCGM in - * order to maximize performance per watt. All GPUs in the sync boost group - * will boost to the minimum possible clocks across the entire group. Look at - * the throttle reasons for other GPUs in the system to see why those GPUs are - * holding this one at lower clocks. 
- */ -#define DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST 0x0000000000000010LL -/** SW Thermal Slowdown - * - * This is an indicator of one or more of the following: - * - Current GPU temperature above the GPU Max Operating Temperature - * - Current memory temperature above the Memory Max Operating Temperature - */ -#define DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL 0x0000000000000020LL -/** HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged - * - * This is an indicator of: - * - temperature being too high - */ -#define DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL 0x0000000000000040LL -/** HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged - * - * This is an indicator of: - * - External Power Brake Assertion being triggered (e.g. by the system power supply) - */ -#define DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE 0x0000000000000080LL -/** GPU clocks are limited by current setting of Display clocks - */ -#define DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS 0x0000000000000100LL - -/** - * GPU virtualization mode types for DCGM_FI_DEV_VIRTUAL_MODE - */ -typedef enum -{ - DCGM_GPU_VIRTUALIZATION_MODE_NONE = 0, //!< Represents Bare Metal GPU - DCGM_GPU_VIRTUALIZATION_MODE_PASSTHROUGH = 1, //!< Device is associated with GPU-Passthrough - DCGM_GPU_VIRTUALIZATION_MODE_VGPU = 2, //!< Device is associated with vGPU inside virtual machine. 
- DCGM_GPU_VIRTUALIZATION_MODE_HOST_VGPU = 3, //!< Device is associated with VGX hypervisor in vGPU mode - DCGM_GPU_VIRTUALIZATION_MODE_HOST_VSGA = 4, //!< Device is associated with VGX hypervisor in vSGA mode -} dcgmGpuVirtualizationMode_t; - - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup dcgmFieldEntity Field Entity - * Represents field association with a particular entity - * @{ - */ -/***************************************************************************************************/ - -/** - * Enum of possible field entity groups - */ -typedef enum dcgm_field_entity_group_t -{ - DCGM_FE_NONE = 0, /*!< Field is not associated with an entity. Field scope should be DCGM_FS_GLOBAL */ - DCGM_FE_GPU, /*!< Field is associated with a GPU entity */ - DCGM_FE_VGPU, /*!< Field is associated with a VGPU entity */ - DCGM_FE_SWITCH, /*!< Field is associated with a Switch entity */ - DCGM_FE_GPU_I, /*!< Field is associated with a GPU Instance entity */ - DCGM_FE_GPU_CI, /*!< Field is associated with a GPU Compute Instance entity */ - - DCGM_FE_COUNT /*!< Number of elements in this enumeration. Keep this entry last */ -} dcgm_field_entity_group_t; - -/** - * Represents an identifier for an entity within a field entity. For instance, this is the gpuId for DCGM_FE_GPU. 
- */ -typedef unsigned int dcgm_field_eid_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup dcgmFieldIdentifiers Field Identifiers - * Field Identifiers - * @{ - */ -/***************************************************************************************************/ - -/** - * NULL field - */ -#define DCGM_FI_UNKNOWN 0 - -/** - * Driver Version - */ -#define DCGM_FI_DRIVER_VERSION 1 - -/* Underlying NVML version */ -#define DCGM_FI_NVML_VERSION 2 - -/* - * Process Name - */ -#define DCGM_FI_PROCESS_NAME 3 - -/** - * Number of Devices on the node - */ -#define DCGM_FI_DEV_COUNT 4 - -/** - * Cuda Driver Version - * Retrieves a number with the major value in the thousands place and the minor value in the hundreds place. - * CUDA 11.1 = 11100 - */ -#define DCGM_FI_CUDA_DRIVER_VERSION 5 - - -/** - * Name of the GPU device - */ -#define DCGM_FI_DEV_NAME 50 - -/** - * Device Brand - */ -#define DCGM_FI_DEV_BRAND 51 - -/** - * NVML index of this GPU - */ -#define DCGM_FI_DEV_NVML_INDEX 52 - -/** - * Device Serial Number - */ -#define DCGM_FI_DEV_SERIAL 53 - -/** - * UUID corresponding to the device - */ -#define DCGM_FI_DEV_UUID 54 - -/** - * Device node minor number /dev/nvidia# - */ -#define DCGM_FI_DEV_MINOR_NUMBER 55 - -/** - * OEM inforom version - */ -#define DCGM_FI_DEV_OEM_INFOROM_VER 56 - -/** - * PCI attributes for the device - */ -#define DCGM_FI_DEV_PCI_BUSID 57 - -/** - * The combined 16-bit device id and 16-bit vendor id - */ -#define DCGM_FI_DEV_PCI_COMBINED_ID 58 - -/** - * The 32-bit Sub System Device ID - */ -#define DCGM_FI_DEV_PCI_SUBSYS_ID 59 - -/** - * Topology of all GPUs on the system via PCI (static) - */ -#define DCGM_FI_GPU_TOPOLOGY_PCI 60 - -/** - * Topology of all GPUs on the system via NVLINK (static) - */ -#define DCGM_FI_GPU_TOPOLOGY_NVLINK 61 - -/** - * Affinity of all GPUs on the system (static) - */ -#define DCGM_FI_GPU_TOPOLOGY_AFFINITY 62 - -/** - * 
Cuda compute capability for the device. - * The major version is the upper 32 bits and - * the minor version is the lower 32 bits. - */ -#define DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY 63 - -/** - * Compute mode for the device - */ -#define DCGM_FI_DEV_COMPUTE_MODE 65 - -/** - * Persistence mode for the device - * Boolean: 0 is disabled, 1 is enabled - */ -#define DCGM_FI_DEV_PERSISTENCE_MODE 66 - -/** - * MIG mode for the device - * Boolean: 0 is disabled, 1 is enabled - */ -#define DCGM_FI_DEV_MIG_MODE 67 - -/** - * The string that CUDA_VISIBLE_DEVICES should - * be set to for this entity (including MIG) - */ -#define DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR 68 - -/** - * The maximum number of MIG slices supported by this GPU - */ -#define DCGM_FI_DEV_MIG_MAX_SLICES 69 - -/** - * Device CPU affinity. part 1/8 = cpus 0 - 63 - */ -#define DCGM_FI_DEV_CPU_AFFINITY_0 70 - -/** - * Device CPU affinity. part 1/8 = cpus 64 - 127 - */ -#define DCGM_FI_DEV_CPU_AFFINITY_1 71 - -/** - * Device CPU affinity. part 2/8 = cpus 128 - 191 - */ -#define DCGM_FI_DEV_CPU_AFFINITY_2 72 - -/** - * Device CPU affinity. 
part 3/8 = cpus 192 - 255 - */ -#define DCGM_FI_DEV_CPU_AFFINITY_3 73 - -/** - * ECC inforom version - */ -#define DCGM_FI_DEV_ECC_INFOROM_VER 80 - -/** - * Power management object inforom version - */ -#define DCGM_FI_DEV_POWER_INFOROM_VER 81 - -/** - * Inforom image version - */ -#define DCGM_FI_DEV_INFOROM_IMAGE_VER 82 - -/** - * Inforom configuration checksum - */ -#define DCGM_FI_DEV_INFOROM_CONFIG_CHECK 83 - -/** - * Reads the infoROM from the flash and verifies the checksums - */ -#define DCGM_FI_DEV_INFOROM_CONFIG_VALID 84 - -/** - * VBIOS version of the device - */ -#define DCGM_FI_DEV_VBIOS_VERSION 85 - -/** - * Total BAR1 of the GPU in MB - */ -#define DCGM_FI_DEV_BAR1_TOTAL 90 - -/** - * Deprecated - Sync boost settings on the node - */ -#define DCGM_FI_SYNC_BOOST 91 - -/** - * Used BAR1 of the GPU in MB - */ -#define DCGM_FI_DEV_BAR1_USED 92 - -/** - * Free BAR1 of the GPU in MB - */ -#define DCGM_FI_DEV_BAR1_FREE 93 - -/** - * SM clock for the device - */ -#define DCGM_FI_DEV_SM_CLOCK 100 - -/** - * Memory clock for the device - */ -#define DCGM_FI_DEV_MEM_CLOCK 101 - -/** - * Video encoder/decoder clock for the device - */ -#define DCGM_FI_DEV_VIDEO_CLOCK 102 - -/** - * SM Application clocks - */ -#define DCGM_FI_DEV_APP_SM_CLOCK 110 - -/** - * Memory Application clocks - */ -#define DCGM_FI_DEV_APP_MEM_CLOCK 111 - -/** - * Current clock throttle reasons (bitmask of DCGM_CLOCKS_THROTTLE_REASON_*) - */ -#define DCGM_FI_DEV_CLOCK_THROTTLE_REASONS 112 - -/** - * Maximum supported SM clock for the device - */ -#define DCGM_FI_DEV_MAX_SM_CLOCK 113 - -/** - * Maximum supported Memory clock for the device - */ -#define DCGM_FI_DEV_MAX_MEM_CLOCK 114 - -/** - * Maximum supported Video encoder/decoder clock for the device - */ -#define DCGM_FI_DEV_MAX_VIDEO_CLOCK 115 - -/** - * Auto-boost for the device (1 = enabled. 
0 = disabled) - */ -#define DCGM_FI_DEV_AUTOBOOST 120 - -/** - * Supported clocks for the device - */ -#define DCGM_FI_DEV_SUPPORTED_CLOCKS 130 - -/** - * Memory temperature for the device - */ -#define DCGM_FI_DEV_MEMORY_TEMP 140 - -/** - * Current temperature readings for the device, in degrees C - */ -#define DCGM_FI_DEV_GPU_TEMP 150 - -/** - * Maximum operating temperature for the memory of this GPU - */ -#define DCGM_FI_DEV_MEM_MAX_OP_TEMP 151 - -/** - * Maximum operating temperature for this GPU - */ -#define DCGM_FI_DEV_GPU_MAX_OP_TEMP 152 - - -/** - * Power usage for the device in Watts - */ -#define DCGM_FI_DEV_POWER_USAGE 155 - -/** - * Total energy consumption for the GPU in mJ since the driver was last reloaded - */ -#define DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION 156 - -/** - * Slowdown temperature for the device - */ -#define DCGM_FI_DEV_SLOWDOWN_TEMP 158 - -/** - * Shutdown temperature for the device - */ -#define DCGM_FI_DEV_SHUTDOWN_TEMP 159 - -/** - * Current Power limit for the device - */ -#define DCGM_FI_DEV_POWER_MGMT_LIMIT 160 - -/** - * Minimum power management limit for the device - */ -#define DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN 161 - -/** - * Maximum power management limit for the device - */ -#define DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX 162 - -/** - * Default power management limit for the device - */ -#define DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF 163 - -/** - * Effective power limit that the driver enforces after taking into account all limiters - */ -#define DCGM_FI_DEV_ENFORCED_POWER_LIMIT 164 - -/** - * Performance state (P-State) 0-15. 0=highest - */ -#define DCGM_FI_DEV_PSTATE 190 - -/** - * Fan speed for the device in percent 0-100 - */ -#define DCGM_FI_DEV_FAN_SPEED 191 - -/** - * PCIe Tx utilization information - * - * Deprecated: Use DCGM_FI_PROF_PCIE_TX_BYTES instead. - */ -#define DCGM_FI_DEV_PCIE_TX_THROUGHPUT 200 - -/** - * PCIe Rx utilization information - * - * Deprecated: Use DCGM_FI_PROF_PCIE_RX_BYTES instead. 
- */ -#define DCGM_FI_DEV_PCIE_RX_THROUGHPUT 201 - -/** - * PCIe replay counter - */ -#define DCGM_FI_DEV_PCIE_REPLAY_COUNTER 202 - -/** - * GPU Utilization - */ -#define DCGM_FI_DEV_GPU_UTIL 203 - -/** - * Memory Utilization - */ -#define DCGM_FI_DEV_MEM_COPY_UTIL 204 - -/** - * Process accounting stats. - * - * This field is only supported when the host engine is running as root unless you - * enable accounting ahead of time. Accounting mode can be enabled by - * running "nvidia-smi -am 1" as root on the same node the host engine is running on. - */ -#define DCGM_FI_DEV_ACCOUNTING_DATA 205 - -/** - * Encoder Utilization - */ -#define DCGM_FI_DEV_ENC_UTIL 206 - -/** - * Decoder Utilization - */ -#define DCGM_FI_DEV_DEC_UTIL 207 - -/** - * Memory utilization samples - */ -#define DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES 210 - -/* - * SM utilization samples - */ -#define DCGM_FI_DEV_GPU_UTIL_SAMPLES 211 - -/** - * Graphics processes running on the GPU. - */ -#define DCGM_FI_DEV_GRAPHICS_PIDS 220 - -/** - * Compute processes running on the GPU. - */ -#define DCGM_FI_DEV_COMPUTE_PIDS 221 - -/** - * XID errors. The value is the specific XID error - */ -#define DCGM_FI_DEV_XID_ERRORS 230 - -/** - * PCIe Max Link Generation - */ -#define DCGM_FI_DEV_PCIE_MAX_LINK_GEN 235 - -/** - * PCIe Max Link Width - */ -#define DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH 236 - -/** - * PCIe Current Link Generation - */ -#define DCGM_FI_DEV_PCIE_LINK_GEN 237 - -/** - * PCIe Current Link Width - */ -#define DCGM_FI_DEV_PCIE_LINK_WIDTH 238 - -/** - * Power Violation time in usec - */ -#define DCGM_FI_DEV_POWER_VIOLATION 240 - -/** - * Thermal Violation time in usec - */ -#define DCGM_FI_DEV_THERMAL_VIOLATION 241 - -/** - * Sync Boost Violation time in usec - */ -#define DCGM_FI_DEV_SYNC_BOOST_VIOLATION 242 - -/** - * Board violation limit. - */ -#define DCGM_FI_DEV_BOARD_LIMIT_VIOLATION 243 - -/** - *Low utilisation violation limit. 
- */ -#define DCGM_FI_DEV_LOW_UTIL_VIOLATION 244 - -/** - *Reliability violation limit. - */ -#define DCGM_FI_DEV_RELIABILITY_VIOLATION 245 - -/** - * App clock violation limit. - */ -#define DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION 246 - -/** - * Base clock violation limit. - */ -#define DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION 247 - -/** - * Total Frame Buffer of the GPU in MB - */ -#define DCGM_FI_DEV_FB_TOTAL 250 - -/** - * Free Frame Buffer in MB - */ -#define DCGM_FI_DEV_FB_FREE 251 - -/** - * Used Frame Buffer in MB - */ -#define DCGM_FI_DEV_FB_USED 252 - -/** - * Current ECC mode for the device - */ -#define DCGM_FI_DEV_ECC_CURRENT 300 - -/** - * Pending ECC mode for the device - */ -#define DCGM_FI_DEV_ECC_PENDING 301 - -/** - * Total single bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_SBE_VOL_TOTAL 310 - -/** - * Total double bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_DBE_VOL_TOTAL 311 - -/** - * Total single bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_SBE_AGG_TOTAL 312 - -/** - * Total double bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_DBE_AGG_TOTAL 313 - -/** - * L1 cache single bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_SBE_VOL_L1 314 - -/** - * L1 cache double bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_DBE_VOL_L1 315 - -/** - * L2 cache single bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_SBE_VOL_L2 316 - -/** - * L2 cache double bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_DBE_VOL_L2 317 - -/** - * Device memory single bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_SBE_VOL_DEV 318 - -/** - * Device memory double bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_DBE_VOL_DEV 319 - -/** - * Register file single bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_SBE_VOL_REG 320 - -/** - * Register file double bit volatile ECC errors - */ -#define 
DCGM_FI_DEV_ECC_DBE_VOL_REG 321 - -/** - * Texture memory single bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_SBE_VOL_TEX 322 - -/** - * Texture memory double bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_DBE_VOL_TEX 323 - -/** - * L1 cache single bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_SBE_AGG_L1 324 - -/** - * L1 cache double bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_DBE_AGG_L1 325 - -/** - * L2 cache single bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_SBE_AGG_L2 326 - -/** - * L2 cache double bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_DBE_AGG_L2 327 - -/** - * Device memory single bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_SBE_AGG_DEV 328 - -/** - * Device memory double bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_DBE_AGG_DEV 329 - -/** - * Register File single bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_SBE_AGG_REG 330 - -/** - * Register File double bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_DBE_AGG_REG 331 - -/** - * Texture memory single bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_SBE_AGG_TEX 332 - -/** - * Texture memory double bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_DBE_AGG_TEX 333 - -/** - * Number of retired pages because of single bit errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_RETIRED_SBE 390 - -/** - * Number of retired pages because of double bit errors - * Note: monotonically increasing - */ -#define 
DCGM_FI_DEV_RETIRED_DBE 391 - -/** - * Number of pages pending retirement - */ -#define DCGM_FI_DEV_RETIRED_PENDING 392 - -/** - * Number of remapped rows for uncorrectable errors - */ -#define DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS 393 - -/** - * Number of remapped rows for correctable errors - */ -#define DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS 394 - -/** - * Whether remapping of rows has failed - */ -#define DCGM_FI_DEV_ROW_REMAP_FAILURE 395 - -/* - * NV Link flow control CRC Error Counter for Lane 0 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 400 - -/* - * NV Link flow control CRC Error Counter for Lane 1 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 401 - -/* - * NV Link flow control CRC Error Counter for Lane 2 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 402 - -/* - * NV Link flow control CRC Error Counter for Lane 3 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 403 - -/* - * NV Link flow control CRC Error Counter for Lane 4 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 404 - -/* - * NV Link flow control CRC Error Counter for Lane 5 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 405 - -/* - * NV Link flow control CRC Error Counter total for all Lanes - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL 409 - -/* - * NV Link data CRC Error Counter for Lane 0 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 410 - -/* - * NV Link data CRC Error Counter for Lane 1 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 411 - -/* - * NV Link data CRC Error Counter for Lane 2 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 412 - -/* - * NV Link data CRC Error Counter for Lane 3 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 413 - -/* - * NV Link data CRC Error Counter for Lane 4 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 414 - -/* - * NV Link data CRC Error Counter for Lane 5 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 415 - -/* - * NV 
Link data CRC Error Counter total for all Lanes - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL 419 - -/* - * NV Link Replay Error Counter for Lane 0 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 420 - -/* - * NV Link Replay Error Counter for Lane 1 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 421 - -/* - * NV Link Replay Error Counter for Lane 2 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 422 - -/* - * NV Link Replay Error Counter for Lane 3 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 423 - -/* - * NV Link Replay Error Counter for Lane 4 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 424 - -/* - * NV Link Replay Error Counter for Lane 5 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 425 - -/* - * NV Link Replay Error Counter total for all Lanes - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL 429 - -/* - * NV Link Recovery Error Counter for Lane 0 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 430 - -/* - * NV Link Recovery Error Counter for Lane 1 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 431 - -/* - * NV Link Recovery Error Counter for Lane 2 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 432 - -/* - * NV Link Recovery Error Counter for Lane 3 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 433 - -/* - * NV Link Recovery Error Counter for Lane 4 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 434 - -/* - * NV Link Recovery Error Counter for Lane 5 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 435 - -/* - * NV Link Recovery Error Counter total for all Lanes - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL 439 - -/* - * NV Link Bandwidth Counter for Lane 0 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 440 - -/* - * NV Link Bandwidth Counter for Lane 1 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 441 - -/* - * NV Link Bandwidth Counter for Lane 2 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 442 - -/* - * NV Link 
Bandwidth Counter for Lane 3 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 443 - -/* - * NV Link Bandwidth Counter for Lane 4 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 444 - -/* - * NV Link Bandwidth Counter for Lane 5 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 445 - -/* - * NV Link Bandwidth Counter total for all Lanes - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL 449 - -/* - * GPU NVLink error information - */ -#define DCGM_FI_DEV_GPU_NVLINK_ERRORS 450 - -/* - * NV Link flow control CRC Error Counter for Lane 6 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 451 - -/* - * NV Link flow control CRC Error Counter for Lane 7 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 452 - -/* - * NV Link flow control CRC Error Counter for Lane 8 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 453 - -/* - * NV Link flow control CRC Error Counter for Lane 9 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 454 - -/* - * NV Link flow control CRC Error Counter for Lane 10 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 455 - -/* - * NV Link flow control CRC Error Counter for Lane 11 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 456 - -/* - * NV Link data CRC Error Counter for Lane 6 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 457 - -/* - * NV Link data CRC Error Counter for Lane 7 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 458 - -/* - * NV Link data CRC Error Counter for Lane 8 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 459 - -/* - * NV Link data CRC Error Counter for Lane 9 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 460 - -/* - * NV Link data CRC Error Counter for Lane 10 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 461 - -/* - * NV Link data CRC Error Counter for Lane 11 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 462 - -/* - * NV Link Replay Error Counter for Lane 6 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 463 - -/* - * 
NV Link Replay Error Counter for Lane 7 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 464 - -/* - * NV Link Replay Error Counter for Lane 8 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 465 - -/* - * NV Link Replay Error Counter for Lane 9 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 466 - -/* - * NV Link Replay Error Counter for Lane 10 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 467 - -/* - * NV Link Replay Error Counter for Lane 11 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 468 - -/* - * NV Link Recovery Error Counter for Lane 6 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 469 - -/* - * NV Link Recovery Error Counter for Lane 7 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 470 - -/* - * NV Link Recovery Error Counter for Lane 8 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 471 - -/* - * NV Link Recovery Error Counter for Lane 9 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 472 - -/* - * NV Link Recovery Error Counter for Lane 10 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 473 - -/* - * NV Link Recovery Error Counter for Lane 11 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 474 - -/* - * NV Link Bandwidth Counter for Lane 6 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 475 - -/* - * NV Link Bandwidth Counter for Lane 7 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 476 - -/* - * NV Link Bandwidth Counter for Lane 8 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 477 - -/* - * NV Link Bandwidth Counter for Lane 9 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 478 - -/* - * NV Link Bandwidth Counter for Lane 10 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 479 - -/* - * NV Link Bandwidth Counter for Lane 11 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 480 - - -/** - * Virtualization Mode corresponding to the GPU. - * - * One of DCGM_GPU_VIRTUALIZATION_MODE_* constants. 
- */ -#define DCGM_FI_DEV_VIRTUAL_MODE 500 - -/** - * Includes Count and Static info of vGPU types supported on a device - */ -#define DCGM_FI_DEV_SUPPORTED_TYPE_INFO 501 - -/** - * Includes Count and currently Creatable vGPU types on a device - */ -#define DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS 502 - -/** - * Includes Count and currently Active vGPU Instances on a device - */ -#define DCGM_FI_DEV_VGPU_INSTANCE_IDS 503 - -/** - * Utilization values for vGPUs running on the device - */ -#define DCGM_FI_DEV_VGPU_UTILIZATIONS 504 - -/** - * Utilization values for processes running within vGPU VMs using the device - */ -#define DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION 505 - -/** - * Current encoder statistics for a given device - */ -#define DCGM_FI_DEV_ENC_STATS 506 - -/** - * Statistics of current active frame buffer capture sessions on a given device - */ -#define DCGM_FI_DEV_FBC_STATS 507 - -/** - * Information about active frame buffer capture sessions on a target device - */ -#define DCGM_FI_DEV_FBC_SESSIONS_INFO 508 -/** - * VM ID of the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_VM_ID 520 - -/** - * VM name of the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_VM_NAME 521 - -/** - * vGPU type of the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_TYPE 522 - -/** - * UUID of the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_UUID 523 - -/** - * Driver version of the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_DRIVER_VERSION 524 - -/** - * Memory usage of the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_MEMORY_USAGE 525 - -/** - * License status of the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_LICENSE_STATUS 526 - -/** - * Frame rate limit of the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT 527 - -/** - * Current encoder statistics of the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_ENC_STATS 528 - -/** - * Information about all active encoder sessions on the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO 529 - -/** - * Statistics of 
current active frame buffer capture sessions on the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_FBC_STATS 530 - -/** - * Information about active frame buffer capture sessions on the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO 531 - -/** - * License status of the vGPU host - */ -#define DCGM_FI_DEV_VGPU_LICENSE_INSTANCE_STATUS 532 - -/** - * Starting field ID of the vGPU instance - */ -#define DCGM_FI_FIRST_VGPU_FIELD_ID 520 - -/** - * Last field ID of the vGPU instance - */ -#define DCGM_FI_LAST_VGPU_FIELD_ID 570 - -/** - * For now max vGPU field Ids taken as difference of DCGM_FI_LAST_VGPU_FIELD_ID and DCGM_FI_FIRST_VGPU_FIELD_ID i.e. 50 - */ -#define DCGM_FI_MAX_VGPU_FIELDS DCGM_FI_LAST_VGPU_FIELD_ID - DCGM_FI_FIRST_VGPU_FIELD_ID - -/** - * Starting ID for all the internal fields - */ -#define DCGM_FI_INTERNAL_FIELDS_0_START 600 - -/** - * Last ID for all the internal fields - */ - -/** - *

 

- *

 

- *

 

- *

NVSwitch entity field IDs start here.

- *

 

- *

 

- *

NVSwitch latency bins for port 0

- */ - -#define DCGM_FI_INTERNAL_FIELDS_0_END 699 - - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P00 700 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P00 701 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P00 702 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 1

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P00 703 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P01 704 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P01 705 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P01 706 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 2

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P01 707 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P02 708 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P02 709 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P02 710 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 3

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P02 711 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P03 712 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P03 713 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P03 714 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 4

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P03 715 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P04 716 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P04 717 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P04 718 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 5

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P04 719 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P05 720 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P05 721 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P05 722 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 6

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P05 723 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P06 724 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P06 725 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P06 726 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 7

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P06 727 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P07 728 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P07 729 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P07 730 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 8

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P07 731 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P08 732 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P08 733 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P08 734 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 9

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P08 735 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P09 736 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P09 737 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P09 738 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 10

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P09 739 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P10 740 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P10 741 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P10 742 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 11

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P10 743 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P11 744 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P11 745 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P11 746 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 12

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P11 747 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P12 748 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P12 749 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P12 750 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 13

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P12 751 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P13 752 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P13 753 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P13 754 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 14

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P13 755 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P14 756 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P14 757 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P14 758 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 15

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P14 759 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P15 760 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P15 761 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P15 762 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 16

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P15 763 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P16 764 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P16 765 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P16 766 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 17

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P16 767 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P17 768 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P17 769 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P17 770 -/** - *

Max latency bin

- *

 

- *

 

- *

 

- *

NVSwitch Tx and Rx Counter 0 for each port

- *

By default, Counter 0 counts bytes.

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P17 771 - -/** - *

NVSwitch Tx Bandwidth Counter 0 for port 0

- */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00 780 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 0 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00 781 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 1 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P01 782 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 1 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P01 783 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 2 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P02 784 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 2 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P02 785 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 3 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P03 786 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 3 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P03 787 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 4 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P04 788 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 4 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P04 789 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 5 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P05 790 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 5 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P05 791 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 6 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P06 792 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 6 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P06 793 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 7 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P07 794 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 7 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P07 795 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 8 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P08 796 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 8 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P08 797 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 9 - 
*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P09 798 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 9 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P09 799 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 10 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P10 800 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 10 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P10 801 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 11 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P11 802 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 11 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P11 803 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 12 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P12 804 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 12 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P12 805 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 13 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P13 806 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 13 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P13 807 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 14 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P14 808 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 14 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P14 809 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 15 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P15 810 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 15 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P15 811 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 16 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P16 812 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 16 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P16 813 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 17 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P17 814 -/** - *

NVSwitch Rx Bandwidth Counter 0 for port 17

- *

 

- *

 

- *

 

- *

NVSwitch Tx and Rx Bandwidth Counter 1 for each port

- *

By default, Counter 1 counts packets.

- */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P17 815 - -/** - *

NVSwitch Tx Bandwidth Counter 1 for port 0

- */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00 820 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 0 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P00 821 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 1 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P01 822 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 1 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P01 823 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 2 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P02 824 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 2 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P02 825 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 3 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P03 826 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 3 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P03 827 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 4 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P04 828 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 4 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P04 829 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 5 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P05 830 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 5 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P05 831 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 6 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P06 832 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 6 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P06 833 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 7 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P07 834 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 7 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P07 835 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 8 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P08 836 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 8 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P08 837 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 9 - 
*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P09 838 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 9 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P09 839 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 10 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P10 840 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 10 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P10 841 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 11 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P11 842 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 11 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P11 843 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 12 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P12 844 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 12 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P12 845 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 13 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P13 846 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 13 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P13 847 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 14 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P14 848 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 14 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P14 849 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 15 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P15 850 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 15 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P15 851 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 16 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P16 852 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 16 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P16 853 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 17 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P17 854 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 17 - *

 

- *

 

- *

 

- * NVSwitch error counters - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P17 855 - -/** - * NVSwitch fatal error information. - * Note: value field indicates the specific SXid reported - */ -#define DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS 856 - -/** - * NVSwitch non fatal error information. - * Note: value field indicates the specific SXid reported - */ -#define DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS 857 - -/** - * Starting field ID of the NVSwitch instance - */ -#define DCGM_FI_FIRST_NVSWITCH_FIELD_ID 700 - -/** - * Last field ID of the NVSwitch instance - */ -#define DCGM_FI_LAST_NVSWITCH_FIELD_ID 860 - -/** - * For now max NVSwitch field Ids taken as difference of DCGM_FI_LAST_NVSWITCH_FIELD_ID and - * DCGM_FI_FIRST_NVSWITCH_FIELD_ID + 1 i.e. 200 - */ -#define DCGM_FI_MAX_NVSWITCH_FIELDS DCGM_FI_LAST_NVSWITCH_FIELD_ID - DCGM_FI_FIRST_NVSWITCH_FIELD_ID + 1 - -/** - * Profiling Fields. These all start with DCGM_FI_PROF_* - */ - -/** - * Ratio of time the graphics engine is active. The graphics engine is - * active if a graphics/compute context is bound and the graphics pipe or - * compute pipe is busy. - */ -#define DCGM_FI_PROF_GR_ENGINE_ACTIVE 1001 - -/** - * The ratio of cycles an SM has at least 1 warp assigned - * (computed from the number of cycles and elapsed cycles) - */ -#define DCGM_FI_PROF_SM_ACTIVE 1002 - -/** - * The ratio of number of warps resident on an SM. - * (number of resident as a ratio of the theoretical - * maximum number of warps per elapsed cycle) - */ -#define DCGM_FI_PROF_SM_OCCUPANCY 1003 - -/** - * The ratio of cycles the tensor (HMMA) pipe is active - * (off the peak sustained elapsed cycles) - */ -#define DCGM_FI_PROF_PIPE_TENSOR_ACTIVE 1004 - -/** - * The ratio of cycles the device memory interface is - * active sending or receiving data. - */ -#define DCGM_FI_PROF_DRAM_ACTIVE 1005 - -/** - * Ratio of cycles the fp64 pipe is active. - */ -#define DCGM_FI_PROF_PIPE_FP64_ACTIVE 1006 - -/** - * Ratio of cycles the fp32 pipe is active. 
- */ -#define DCGM_FI_PROF_PIPE_FP32_ACTIVE 1007 - -/** - * Ratio of cycles the fp16 pipe is active. This does not include HMMA. - */ -#define DCGM_FI_PROF_PIPE_FP16_ACTIVE 1008 - -/** - * The number of bytes of active PCIe tx (transmit) data including both header and payload. - * - * Note that this is from the perspective of the GPU, so copying data from device to host (DtoH) - * would be reflected in this metric. - */ -#define DCGM_FI_PROF_PCIE_TX_BYTES 1009 - -/** - * The number of bytes of active PCIe rx (read) data including both header and payload. - * - * Note that this is from the perspective of the GPU, so copying data from host to device (HtoD) - * would be reflected in this metric. - */ -#define DCGM_FI_PROF_PCIE_RX_BYTES 1010 - -/** - * The number of bytes of active NvLink tx (transmit) data including both header and payload. - */ -#define DCGM_FI_PROF_NVLINK_TX_BYTES 1011 - -/** - * The number of bytes of active NvLink rx (read) data including both header and payload. - */ -#define DCGM_FI_PROF_NVLINK_RX_BYTES 1012 - -/** - * 1 greater than maximum fields above. This is the 1 greater than the maximum field id that could be allocated - */ -#define DCGM_FI_MAX_FIELDS 1013 - - -/** @} */ - -/*****************************************************************************/ - -/** - * Structure for formating the output for dmon. - * Used as a member in dcgm_field_meta_p - */ -typedef struct -{ - char shortName[10]; /*!< Short name corresponding to field. This short name is used to identify columns in dmon - output.*/ - char unit[4]; /*!< The unit of value. Eg: C(elsius), W(att), MB/s*/ - short width; /*!< Maximum width/number of digits that a value for field can have.*/ -} dcgm_field_output_format_t, *dcgm_field_output_format_p; - -/** - * Structure to store meta data for the field - */ - -typedef struct -{ - unsigned short fieldId; /*!< Field identifier. DCGM_FI_? #define */ - char fieldType; /*!< Field type. DCGM_FT_? 
#define */ - unsigned char size; /*!< field size in bytes (raw value size). 0=variable (like DCGM_FT_STRING) */ - char tag[48]; /*!< Tag for this field for serialization like 'device_temperature' */ - int scope; /*!< Field scope. DCGM_FS_? #define of this field's association */ - int nvmlFieldId; /*!< Optional NVML field this DCGM field maps to. 0 = no mapping. - Otherwise, this should be a NVML_FI_? #define from nvml.h */ - dcgm_field_entity_group_t - entityLevel; /*!< Field entity level. DCGM_FE_? specifying at what level the field is queryable */ - - dcgm_field_output_format_p valueFormat; /*!< pointer to the structure that holds the formatting the - values for fields */ -} dcgm_field_meta_t, *dcgm_field_meta_p; - -/***************************************************************************************************/ -/** @addtogroup dcgmFieldIdentifiers - * @{ - */ -/***************************************************************************************************/ - -/** - * Get a pointer to the metadata for a field by its field ID. See DCGM_FI_? for a list of field IDs. - * - * @param fieldId IN: One of the field IDs (DCGM_FI_?) - * - * @return - * 0 On Failure - * >0 Pointer to field metadata structure if found. - * - */ -dcgm_field_meta_p DcgmFieldGetById(unsigned short fieldId); - -/** - * Get a pointer to the metadata for a field by its field tag. - * - * @param tag IN: Tag for the field of interest - * - * @return - * 0 On failure or not found - * >0 Pointer to field metadata structure if found - * - */ -dcgm_field_meta_p DcgmFieldGetByTag(char *tag); - -/** - * Initialize the DcgmFields module. Call this once from inside - * your program - * - * @return - * 0 On success - * <0 On error - * - */ -int DcgmFieldsInit(void); - -/** - * Terminates the DcgmFields module. 
Call this once from inside your program - * - * @return - * 0 On success - * <0 On error - * - */ -int DcgmFieldsTerm(void); - -/** - * Get the string version of a entityGroupId - * - * @returns - * - Pointer to a string like GPU/NvSwitch..etc - * - Null on error - * - */ -const char *DcgmFieldsGetEntityGroupString(dcgm_field_entity_group_t entityGroupId); - -/** @} */ - - -#ifdef __cplusplus -} -#endif - - -#endif // DCGMFIELDS_H diff --git a/bindings/go/dcgm/dcgm_structs.h b/bindings/go/dcgm/dcgm_structs.h deleted file mode 100644 index b038e5e9..00000000 --- a/bindings/go/dcgm/dcgm_structs.h +++ /dev/null @@ -1,2958 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * File: dcgm_structs.h - */ - -#ifndef DCGM_STRUCTS_H -#define DCGM_STRUCTS_H - -#include "dcgm_fields.h" -#include - - -/***************************************************************************************************/ -/** @defgroup dcgmReturnEnums Enums and Macros - * @{ - */ -/***************************************************************************************************/ - -/** - * Creates a unique version number for each struct - */ -#define MAKE_DCGM_VERSION(typeName, ver) (unsigned int)(sizeof(typeName) | ((unsigned long)(ver) << 24U)) - -/** - * Represents value of the field which can be returned by Host Engine in case the - * operation is not successful - */ -#ifndef DCGM_BLANK_VALUES -#define DCGM_BLANK_VALUES - -/** - * Base value for 32 bits integer blank. can be used as an unspecified blank - */ -#define DCGM_INT32_BLANK 0x7ffffff0 - -/** - * Base value for 64 bits integer blank. can be used as an unspecified blank - */ -#define DCGM_INT64_BLANK 0x7ffffffffffffff0 - -/** - * Base value for double blank. 2 ** 47. FP 64 has 52 bits of mantissa, - * so 47 bits can still increment by 1 and represent each value from 0-15 - */ -#define DCGM_FP64_BLANK 140737488355328.0 - -/** - * Base value for string blank. 
- */ -#define DCGM_STR_BLANK "<<>>" - -/** - * Represents an error where INT32 data was not found - */ -#define DCGM_INT32_NOT_FOUND (DCGM_INT32_BLANK + 1) - -/** - * Represents an error where INT64 data was not found - */ -#define DCGM_INT64_NOT_FOUND (DCGM_INT64_BLANK + 1) - -/** - * Represents an error where FP64 data was not found - */ -#define DCGM_FP64_NOT_FOUND (DCGM_FP64_BLANK + 1.0) - -/** - * Represents an error where STR data was not found - */ -#define DCGM_STR_NOT_FOUND "<<>>" - -/** - * Represents an error where fetching the INT32 value is not supported - */ -#define DCGM_INT32_NOT_SUPPORTED (DCGM_INT32_BLANK + 2) - -/** - * Represents an error where fetching the INT64 value is not supported - */ -#define DCGM_INT64_NOT_SUPPORTED (DCGM_INT64_BLANK + 2) - -/** - * Represents an error where fetching the FP64 value is not supported - */ -#define DCGM_FP64_NOT_SUPPORTED (DCGM_FP64_BLANK + 2.0) - -/** - * Represents an error where fetching the STR value is not supported - */ -#define DCGM_STR_NOT_SUPPORTED "<<>>" - -/** - * Represents and error where fetching the INT32 value is not allowed with our current credentials - */ -#define DCGM_INT32_NOT_PERMISSIONED (DCGM_INT32_BLANK + 3) - -/** - * Represents and error where fetching the INT64 value is not allowed with our current credentials - */ -#define DCGM_INT64_NOT_PERMISSIONED (DCGM_INT64_BLANK + 3) - -/** - * Represents and error where fetching the FP64 value is not allowed with our current credentials - */ -#define DCGM_FP64_NOT_PERMISSIONED (DCGM_FP64_BLANK + 3.0) - -/** - * Represents and error where fetching the STR value is not allowed with our current credentials - */ -#define DCGM_STR_NOT_PERMISSIONED "<<>>" - -/** - * Macro to check if a INT32 value is blank or not - */ -#define DCGM_INT32_IS_BLANK(val) (((val) >= DCGM_INT32_BLANK) ? 1 : 0) - -/** - * Macro to check if a INT64 value is blank or not - */ -#define DCGM_INT64_IS_BLANK(val) (((val) >= DCGM_INT64_BLANK) ? 
1 : 0) - -/** - * Macro to check if a FP64 value is blank or not - */ -#define DCGM_FP64_IS_BLANK(val) (((val) >= DCGM_FP64_BLANK ? 1 : 0)) - -/** - * Macro to check if a STR value is blank or not - * Works on (char *). Looks for <<< at first position and >>> inside string - */ -#define DCGM_STR_IS_BLANK(val) (val == strstr(val, "<<<") && strstr(val, ">>>")) - -#endif // DCGM_BLANK_VALUES - -/** - * Max number of GPUs supported by DCGM - */ -#define DCGM_MAX_NUM_DEVICES 32 /* DCGM 2.0 and newer = 32. DCGM 1.8 and older = 16. */ - -/** - * Number of NvLink links per GPU supported by DCGM - * This is 12 for Ampere, 6 for Volta, and 4 for Pascal - */ -#define DCGM_NVLINK_MAX_LINKS_PER_GPU 12 - -/** - * Maximum NvLink links pre-Ampere - */ -#define DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1 6 - -/** - * Max number of NvSwitches supported by DCGM - **/ -#define DCGM_MAX_NUM_SWITCHES 12 - -/** - * Number of NvLink links per NvSwitch supported by DCGM - */ -#define DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH 36 - -/** - * Maximum number of vGPU instances per physical GPU - */ -#define DCGM_MAX_VGPU_INSTANCES_PER_PGPU 32 - -/** - * Max length of the DCGM string field - */ -#define DCGM_MAX_STR_LENGTH 256 - -/** - * Max number of clocks supported for a device - */ -#define DCGM_MAX_CLOCKS 256 - -/** - * Max limit on the number of groups supported by DCGM - */ -#define DCGM_MAX_NUM_GROUPS 64 - -/** - * Max number of active FBC sessions - */ -#define DCGM_MAX_FBC_SESSIONS 256 - -/** - * Represents the size of a buffer that holds a vGPU type Name or vGPU class type or name of process running on vGPU - * instance. 
- */ -#define DCGM_VGPU_NAME_BUFFER_SIZE 64 - -/** - * Represents the size of a buffer that holds a vGPU license string - */ -#define DCGM_GRID_LICENSE_BUFFER_SIZE 128 - -/** - * Default compute mode -- multiple contexts per device - */ -#define DCGM_CONFIG_COMPUTEMODE_DEFAULT 0 - -/** - * Compute-prohibited mode -- no contexts per device - */ -#define DCGM_CONFIG_COMPUTEMODE_PROHIBITED 1 - -/** - * Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time - */ -#define DCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS 2 - -/** - * Default Port Number for DCGM Host Engine - */ -#define DCGM_HE_PORT_NUMBER 5555 - -#ifdef __cplusplus -extern "C" { -#endif - -/** - * Operation mode for DCGM - * - * DCGM can run in auto-mode where it runs additional threads in the background to collect - * any metrics of interest and auto manages any operations needed for policy management. - * - * DCGM can also operate in manual-mode where it's execution is controlled by the user. In - * this mode, the user has to periodically call APIs such as \ref dcgmPolicyTrigger and - * \ref dcgmUpdateAllFields which tells DCGM to wake up and perform data collection and - * operations needed for policy management. - */ -typedef enum dcgmOperationMode_enum -{ - DCGM_OPERATION_MODE_AUTO = 1, - DCGM_OPERATION_MODE_MANUAL = 2 -} dcgmOperationMode_t; - -/** - * When more than one value is returned from a query, which order should it be returned in? - */ -typedef enum dcgmOrder_enum -{ - DCGM_ORDER_ASCENDING = 1, //!< Data with earliest (lowest) timestamps returned first - DCGM_ORDER_DESCENDING = 2 //!< Data with latest (highest) timestamps returned first -} dcgmOrder_t; - -/** - * Return values for DCGM API calls. 
- */ -typedef enum dcgmReturn_enum -{ - DCGM_ST_OK = 0, //!< Success - DCGM_ST_BADPARAM = -1, //!< A bad parameter was passed to a function - DCGM_ST_GENERIC_ERROR = -3, //!< A generic, unspecified error - DCGM_ST_MEMORY = -4, //!< An out of memory error occurred - DCGM_ST_NOT_CONFIGURED = -5, //!< Setting not configured - DCGM_ST_NOT_SUPPORTED = -6, //!< Feature not supported - DCGM_ST_INIT_ERROR = -7, //!< DCGM Init error - DCGM_ST_NVML_ERROR = -8, //!< When NVML returns error - DCGM_ST_PENDING = -9, //!< Object is in pending state of something else - DCGM_ST_UNINITIALIZED = -10, //!< Object is in undefined state - DCGM_ST_TIMEOUT = -11, //!< Requested operation timed out - DCGM_ST_VER_MISMATCH = -12, //!< Version mismatch between received and understood API - DCGM_ST_UNKNOWN_FIELD = -13, //!< Unknown field id - DCGM_ST_NO_DATA = -14, //!< No data is available - DCGM_ST_STALE_DATA = -15, //!< Data is considered stale - DCGM_ST_NOT_WATCHED = -16, //!< The given field id is not being updated by the cache manager - DCGM_ST_NO_PERMISSION = -17, //!< Do not have permission to perform the desired action - DCGM_ST_GPU_IS_LOST = -18, //!< GPU is no longer reachable - DCGM_ST_RESET_REQUIRED = -19, //!< GPU requires a reset - DCGM_ST_FUNCTION_NOT_FOUND = -20, //!< The function that was requested was not found (bindings only error) - DCGM_ST_CONNECTION_NOT_VALID = -21, //!< The connection to the host engine is not valid any longer - DCGM_ST_GPU_NOT_SUPPORTED = -22, //!< This GPU is not supported by DCGM - DCGM_ST_GROUP_INCOMPATIBLE = -23, //!< The GPUs of the provided group are not compatible with each other for the - //!< requested operation - DCGM_ST_MAX_LIMIT = -24, //!< Max limit reached for the object - DCGM_ST_LIBRARY_NOT_FOUND = -25, //!< DCGM library could not be found - DCGM_ST_DUPLICATE_KEY = -26, //!< Duplicate key passed to a function - DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27, //!< GPU is already a part of a sync boost group - DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = 
-28, //!< GPU is not a part of a sync boost group - DCGM_ST_REQUIRES_ROOT = -29, //!< This operation cannot be performed when the host engine is running as non-root - DCGM_ST_NVVS_ERROR = -30, //!< DCGM GPU Diagnostic was successfully executed, but reported an error. - DCGM_ST_INSUFFICIENT_SIZE = -31, //!< An input argument is not large enough - DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32, //!< The given field ID is not supported by the API being called - DCGM_ST_MODULE_NOT_LOADED = -33, //!< This request is serviced by a module of DCGM that is not currently loaded - DCGM_ST_IN_USE = -34, //!< The requested operation could not be completed because the affected - //!< resource is in use - DCGM_ST_GROUP_IS_EMPTY = -35, //!< This group is empty and the requested operation is not valid on an empty group - DCGM_ST_PROFILING_NOT_SUPPORTED = -36, //!< Profiling is not supported for this group of GPUs or GPU. - DCGM_ST_PROFILING_LIBRARY_ERROR = -37, //!< The third-party Profiling module returned an unrecoverable error. - DCGM_ST_PROFILING_MULTI_PASS = -38, //!< The requested profiling metrics cannot be collected in a single pass - DCGM_ST_DIAG_ALREADY_RUNNING = -39, //!< A diag instance is already running, cannot run a new diag until - //!< the current one finishes. - DCGM_ST_DIAG_BAD_JSON = -40, //!< The DCGM GPU Diagnostic returned JSON that cannot be parsed - DCGM_ST_DIAG_BAD_LAUNCH = -41, //!< Error while launching the DCGM GPU Diagnostic - DCGM_ST_DIAG_VARIANCE = -42, //!< There is too much variance while training the diagnostic - DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43, //!< A field value met or exceeded the error threshold. 
- DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44, //!< The installed driver version is insufficient for this API - DCGM_ST_INSTANCE_NOT_FOUND = -45, //!< The specified GPU instance does not exist - DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46, //!< The specified GPU compute instance does not exist - DCGM_ST_CHILD_NOT_KILLED = -47, //!< Couldn't kill a child process within the retries - DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48, //!< Detected an error in a 3rd-party library - DCGM_ST_INSUFFICIENT_RESOURCES = -49, //!< Not enough resources available - DCGM_ST_PLUGIN_EXCEPTION = -50, //!< Exception thrown from a diagnostic plugin - DCGM_ST_NVVS_ISOLATE_ERROR = -51, //!< The diagnostic returned an error that indicates the need for isolation -} dcgmReturn_t; - -const char *errorString(dcgmReturn_t result); - -/** - * Type of GPU groups - */ -typedef enum dcgmGroupType_enum -{ - DCGM_GROUP_DEFAULT = 0, //!< All the GPUs on the node are added to the group - DCGM_GROUP_EMPTY = 1, //!< Creates an empty group - DCGM_GROUP_DEFAULT_NVSWITCHES = 2, //!< All NvSwitches of the node are added to the group - DCGM_GROUP_DEFAULT_INSTANCES = 3, //!< All GPU instances of the node are added to the group - DCGM_GROUP_DEFAULT_COMPUTE_INSTANCES = 4, //!< All compute instances of the node are added to the group - DCGM_GROUP_DEFAULT_EVERYTHING = 5, //!< All entities are added to this default group -} dcgmGroupType_t; - -/** - * Identifies for special DCGM groups - */ -#define DCGM_GROUP_ALL_GPUS 0x7fffffff -#define DCGM_GROUP_ALL_NVSWITCHES 0x7ffffffe -#define DCGM_GROUP_ALL_INSTANCES 0x7ffffffd -#define DCGM_GROUP_ALL_COMPUTE_INSTANCES 0x7ffffffc -#define DCGM_GROUP_ALL_ENTITIES 0x7ffffffb - -/** - * Maximum number of entities per entity group - */ -#define DCGM_GROUP_MAX_ENTITIES 64 - -/** - * Simplified chip architecture. Note that these are made to match nvmlChipArchitecture_t and thus - * do not start at 0. 
- */ -typedef enum dcgmChipArchitecture_enum -{ - DCGM_CHIP_ARCH_OLDER = 1, //!< All GPUs older than Kepler - DCGM_CHIP_ARCH_KEPLER = 2, //!< All Kepler-architecture parts - DCGM_CHIP_ARCH_MAXWELL = 3, //!< All Maxwell-architecture parts - DCGM_CHIP_ARCH_PASCAL = 4, //!< All Pascal-architecture parts - DCGM_CHIP_ARCH_VOLTA = 5, //!< All Volta-architecture parts - DCGM_CHIP_ARCH_TURING = 6, //!< All Turing-architecture parts - DCGM_CHIP_ARCH_AMPERE = 7, //!< All Ampere-architecture parts - - DCGM_CHIP_ARCH_COUNT, //!< Keep this second to last, exclude unknown - - DCGM_CHIP_ARCH_UNKNOWN = 0xffffffff //!< Anything else, presumably something newer -} dcgmChipArchitecture_t; - -/** - * Represents the type of configuration to be fetched from the GPUs - */ -typedef enum dcgmConfigType_enum -{ - DCGM_CONFIG_TARGET_STATE = 0, //!< The target configuration values to be applied - DCGM_CONFIG_CURRENT_STATE = 1, //!< The current configuration state -} dcgmConfigType_t; - -/** - * Represents the power cap for each member of the group. - */ -typedef enum dcgmConfigPowerLimitType_enum -{ - DCGM_CONFIG_POWER_CAP_INDIVIDUAL = 0, //!< Represents the power cap to be applied for each member of the group - DCGM_CONFIG_POWER_BUDGET_GROUP = 1, //!< Represents the power budget for the entire group -} dcgmConfigPowerLimitType_t; - -/** @} */ - - -/***************************************************************************************************/ -/** @defgroup dcgmStructs Structure definitions - * @{ - */ -/***************************************************************************************************/ -typedef uintptr_t dcgmHandle_t; //!< Identifier for DCGM Handle -typedef uintptr_t dcgmGpuGrp_t; //!< Identifier for a group of GPUs. A group can have one or more GPUs -typedef uintptr_t dcgmFieldGrp_t; //!< Identifier for a group of fields. -typedef uintptr_t dcgmStatus_t; //!< Identifier for list of status codes - -/** - * DCGM Logging Severities. 
These match up with plog severities defined in Severity.h - * Each level includes all of the levels above it. For instance, level 4 includes 3,2, and 1 as well - */ -typedef enum -{ - DcgmLoggingSeverityUnspecified = -1, /*!< Don't care/inherit from the environment */ - DcgmLoggingSeverityNone = 0, /*!< No logging */ - DcgmLoggingSeverityFatal = 1, /*!< Fatal Errors */ - DcgmLoggingSeverityError = 2, /*!< Errors */ - DcgmLoggingSeverityWarning = 3, /*!< Warnings */ - DcgmLoggingSeverityInfo = 4, /*!< Informative */ - DcgmLoggingSeverityDebug = 5, /*!< Debug information (will generate large logs) */ - DcgmLoggingSeverityVerbose = 6 /*!< Verbose debugging information */ -} DcgmLoggingSeverity_t; - -/** - * Connection options for dcgmConnect_v2 (v1) - * - * NOTE: This version is deprecated. use dcgmConnectV2Params_v2 - */ -typedef struct -{ - unsigned int version; /*!< Version number. Use dcgmConnectV2Params_version */ - unsigned int persistAfterDisconnect; /*!< Whether to persist DCGM state modified by this connection - once the connection is terminated. Normally, all field - watches created by a connection are removed once a - connection goes away. - 1 = do not clean up after this connection. - 0 = clean up after this connection */ -} dcgmConnectV2Params_v1; - -/** - * Version 1 for \ref dcgmConnectV2Params_v1 - */ -#define dcgmConnectV2Params_version1 MAKE_DCGM_VERSION(dcgmConnectV2Params_v1, 1) - -/** - * Connection options for dcgmConnect_v2 (v2) - */ -typedef struct -{ - unsigned int version; /*!< Version number. Use dcgmConnectV2Params_version */ - unsigned int persistAfterDisconnect; /*!< Whether to persist DCGM state modified by this connection once the - connection is terminated. Normally, all field watches created by a - connection are removed once a connection goes away. 1 = do not clean up - after this connection. 
0 = clean up after this connection */ - unsigned int timeoutMs; /*!< When attempting to connect to the specified host engine, how long should - we wait in milliseconds before giving up */ - unsigned int addressIsUnixSocket; /*!< Whether or not the passed-in address is a unix socket filename (1) or a - TCP/IP address (0) */ -} dcgmConnectV2Params_v2; - -/** - * Typedef for \ref dcgmConnectV2Params_v2 - */ -typedef dcgmConnectV2Params_v2 dcgmConnectV2Params_t; - -/** - * Version 2 for \ref dcgmConnectV2Params_v2 - */ -#define dcgmConnectV2Params_version2 MAKE_DCGM_VERSION(dcgmConnectV2Params_v2, 2) - -/** - * Latest version for \ref dcgmConnectV2Params_t - */ -#define dcgmConnectV2Params_version dcgmConnectV2Params_version2 - -/** - * Typedef for \ref dcgmHostengineHealth_v1 - */ -typedef struct -{ - unsigned int version; //!< The version of this request - unsigned int overallHealth; //!< 0 to indicate healthy, or a code to indicate the error - // For now, this will always be populated with 0 if the - // hostengine can respond. In the future this will be - // updated to have other options like NVML unresponsive, - // no GPUs on system, etc. 
-} dcgmHostengineHealth_v1; - -/** - * Typedef for \ref dcgmHostengineHealth_t - */ -typedef dcgmHostengineHealth_v1 dcgmHostengineHealth_t; - -#define dcgmHostengineHealth_version1 MAKE_DCGM_VERSION(dcgmHostengineHealth_v1, 1) - -/** - * Latest version for \ref dcgmHostengineHealth_t - */ -#define dcgmHostengineHealth_version dcgmHostengineHealth_version1 - -/** - * Represents a entityGroupId + entityId pair to uniquely identify a given entityId inside a group of entities - * - * Added in DCGM 1.5.0 - */ -typedef struct -{ - dcgm_field_entity_group_t entityGroupId; //!< Entity Group ID entity belongs to - dcgm_field_eid_t entityId; //!< Entity ID of the entity -} dcgmGroupEntityPair_t; - -/** - * Structure to store information for DCGM group - * - * Added in DCGM 1.5.0 - */ -typedef struct -{ - unsigned int version; //!< Version Number (use dcgmGroupInfo_version2) - unsigned int count; //!< count of entityIds returned in \a entityList - char groupName[DCGM_MAX_STR_LENGTH]; //!< Group Name - dcgmGroupEntityPair_t entityList[DCGM_GROUP_MAX_ENTITIES]; //!< List of the entities that are in this group -} dcgmGroupInfo_v2; - -/** - * Typedef for \ref dcgmGroupInfo_v2 - */ -typedef dcgmGroupInfo_v2 dcgmGroupInfo_t; - -/** - * Version 2 for \ref dcgmGroupInfo_v2 - */ -#define dcgmGroupInfo_version2 MAKE_DCGM_VERSION(dcgmGroupInfo_v2, 2) - -/** - * Latest version for \ref dcgmGroupInfo_t - */ -#define dcgmGroupInfo_version dcgmGroupInfo_version2 - -/** - * Enum for the different kinds of MIG profiles - */ -typedef enum -{ - DcgmMigProfileNone = 0, /*!< No profile (for GPUs) */ - DcgmMigProfileGpuInstanceSlice1 = 1, /*!< GPU instance slice 1 */ - DcgmMigProfileGpuInstanceSlice2 = 2, /*!< GPU instance slice 2 */ - DcgmMigProfileGpuInstanceSlice3 = 3, /*!< GPU instance slice 3 */ - DcgmMigProfileGpuInstanceSlice4 = 4, /*!< GPU instance slice 4 */ - DcgmMigProfileGpuInstanceSlice7 = 5, /*!< GPU instance slice 7 */ - DcgmMigProfileGpuInstanceSlice8 = 6, /*!< GPU instance slice 
8 */ - DcgmMigProfileComputeInstanceSlice1 = 30, /*!< compute instance slice 1 */ - DcgmMigProfileComputeInstanceSlice2 = 31, /*!< compute instance slice 2 */ - DcgmMigProfileComputeInstanceSlice3 = 32, /*!< compute instance slice 3 */ - DcgmMigProfileComputeInstanceSlice4 = 33, /*!< compute instance slice 4*/ - DcgmMigProfileComputeInstanceSlice7 = 34, /*!< compute instance slice 7 */ - DcgmMigProfileComputeInstanceSlice8 = 35, /*!< compute instance slice 8 */ -} dcgmMigProfile_t; - -/** - * Represents a pair of entity pairings to uniquely identify an entity and its place in the hierarchy. - */ -typedef struct -{ - dcgmGroupEntityPair_t entity; //!< Entity id and type for the entity in question - dcgmGroupEntityPair_t parent; //!< Entity id and type for the parent of the entity in question - dcgmMigProfile_t sliceProfile; //!< Entity MIG profile identifier -} dcgmMigHierarchyInfo_t; - -/** - * Provides additional information about location of MIG entities. - */ -typedef struct -{ - char gpuUuid[128]; /*!< GPU UUID */ - unsigned int nvmlGpuIndex; /*!< GPU index from NVML */ - unsigned int nvmlInstanceId; /*!< GPU instance index within GPU. 0 to N. -1 for GPU entities */ - unsigned int nvmlComputeInstanceId; /*!< GPU Compute instance index within GPU instance. 0 to N. -1 for GPU - * Instance and GPU entities */ - unsigned int nvmlMigProfileId; /*!< Unique profile ID for GPU or Compute instances. 
-1 GPU entities - * \see nvmlComputeInstanceProfileInfo_st - * \see nvmlGpuInstanceProfileInfo_st */ - unsigned int nvmlProfileSlices; /*!< Number of slices in the MIG profile */ -} dcgmMigEntityInfo_t; - -typedef struct -{ - dcgmGroupEntityPair_t entity; - dcgmGroupEntityPair_t parent; - dcgmMigEntityInfo_t info; -} dcgmMigHierarchyInfo_v2; - -#define DCGM_MAX_INSTANCES_PER_GPU 8 -// There can never be more compute instances per GPU than instances per GPU because a compute instance is part -// of an instance -#define DCGM_MAX_COMPUTE_INSTANCES_PER_GPU DCGM_MAX_INSTANCES_PER_GPU -// Currently, there cannot be more than 14 instances + compute instances. There are always 7 compute instances -// and never more than 7 instances -#define DCGM_MAX_TOTAL_INSTANCES_PER_GPU 14 -#define DCGM_MAX_HIERARCHY_INFO DCGM_MAX_NUM_DEVICES *DCGM_MAX_TOTAL_INSTANCES_PER_GPU -#define DCGM_MAX_INSTANCES DCGM_MAX_NUM_DEVICES *DCGM_MAX_INSTANCES_PER_GPU -// The maximum compute instances are always the same as the maximum instances because each compute instance is -// part of an instance. 
-#define DCGM_MAX_COMPUTE_INSTANCES DCGM_MAX_INSTANCES - -/** - * Structure to store the GPU hierarchy for a system - * - * Added in DCGM 2.0 - */ -typedef struct -{ - unsigned int version; - unsigned int count; - dcgmMigHierarchyInfo_t entityList[DCGM_MAX_HIERARCHY_INFO]; -} dcgmMigHierarchy_v1; - -#define dcgmMigHierarchy_version1 MAKE_DCGM_VERSION(dcgmMigHierarchy_v1, 1) - -typedef struct -{ - unsigned int version; - unsigned int count; - dcgmMigHierarchyInfo_v2 entityList[DCGM_MAX_HIERARCHY_INFO]; -} dcgmMigHierarchy_v2; - -#define dcgmMigHierarchy_version2 MAKE_DCGM_VERSION(dcgmMigHierarchy_v2, 2) - -#define dcgmMigHierarchy_version dcgmMigHiearchyVersion2 - -/** - * Maximum number of field groups that can exist - */ -#define DCGM_MAX_NUM_FIELD_GROUPS 64 - -/** - * Maximum number of field IDs that can be in a single field group - */ -#define DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP 128 - -/** - * Structure to represent information about a field group - */ -typedef struct -{ - unsigned int version; //!< Version number (dcgmFieldGroupInfo_version) - unsigned int numFieldIds; //!< Number of entries in fieldIds[] that are valid - dcgmFieldGrp_t fieldGroupId; //!< ID of this field group - char fieldGroupName[DCGM_MAX_STR_LENGTH]; //!< Field Group Name - unsigned short fieldIds[DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP]; //!< Field ids that belong to this group -} dcgmFieldGroupInfo_v1; - -typedef dcgmFieldGroupInfo_v1 dcgmFieldGroupInfo_t; - -/** - * Version 1 for dcgmFieldGroupInfo_v1 - */ -#define dcgmFieldGroupInfo_version1 MAKE_DCGM_VERSION(dcgmFieldGroupInfo_v1, 1) - -/** - * Latest version for dcgmFieldGroupInfo_t - */ -#define dcgmFieldGroupInfo_version dcgmFieldGroupInfo_version1 - -typedef struct -{ - unsigned int version; //!< Version number (dcgmAllFieldGroupInfo_version) - unsigned int numFieldGroups; //!< Number of entries in fieldGroups[] that are populated - dcgmFieldGroupInfo_t fieldGroups[DCGM_MAX_NUM_FIELD_GROUPS]; //!< Info about each field group -} 
dcgmAllFieldGroup_v1; - -typedef dcgmAllFieldGroup_v1 dcgmAllFieldGroup_t; - -/** - * Version 1 for dcgmAllFieldGroup_v1 - */ -#define dcgmAllFieldGroup_version1 MAKE_DCGM_VERSION(dcgmAllFieldGroup_v1, 1) - -/** - * Latest version for dcgmAllFieldGroup_t - */ -#define dcgmAllFieldGroup_version dcgmAllFieldGroup_version1 - -/** - * Structure to represent error attributes - */ -typedef struct -{ - unsigned int gpuId; //!< Represents GPU ID - short fieldId; //!< One of DCGM_FI_? - int status; //!< One of DCGM_ST_? -} dcgmErrorInfo_t; - -/** - * Represents a set of memory, SM, and video clocks for a device. This can be current values or a target values - * based on context - */ -typedef struct -{ - int version; //!< Version Number (dcgmClockSet_version) - unsigned int memClock; //!< Memory Clock (Memory Clock value OR DCGM_INT32_BLANK to Ignore/Use compatible - //!< value with smClk) - unsigned int smClock; //!< SM Clock (SM Clock value OR DCGM_INT32_BLANK to Ignore/Use compatible value with memClk) -} dcgmClockSet_v1; - -/** - * Typedef for \ref dcgmClockSet_v1 - */ -typedef dcgmClockSet_v1 dcgmClockSet_t; - -/** - * Version 1 for \ref dcgmClockSet_v1 - */ -#define dcgmClockSet_version1 MAKE_DCGM_VERSION(dcgmClockSet_v1, 1) - -/** - * Latest version for \ref dcgmClockSet_t - */ -#define dcgmClockSet_version dcgmClockSet_version1 - -/** - * Represents list of supported clock sets for a device - */ -typedef struct -{ - unsigned int version; //!< Version Number (dcgmDeviceSupportedClockSets_version) - unsigned int count; //!< Number of supported clocks - dcgmClockSet_t clockSet[DCGM_MAX_CLOCKS]; //!< Valid clock sets for the device. 
Upto \ref count entries are filled -} dcgmDeviceSupportedClockSets_v1; - -/** - * Typedef for \ref dcgmDeviceSupportedClockSets_v1 - */ -typedef dcgmDeviceSupportedClockSets_v1 dcgmDeviceSupportedClockSets_t; - -/** - * Version 1 for \ref dcgmDeviceSupportedClockSets_v1 - */ -#define dcgmDeviceSupportedClockSets_version1 MAKE_DCGM_VERSION(dcgmDeviceSupportedClockSets_v1, 1) - -/** - * Latest version for \ref dcgmDeviceSupportedClockSets_t - */ -#define dcgmDeviceSupportedClockSets_version dcgmDeviceSupportedClockSets_version1 - -/** - * Represents accounting data for one process - */ -typedef struct -{ - unsigned int version; //!< Version Number. Should match dcgmDevicePidAccountingStats_version - unsigned int pid; //!< Process id of the process these stats are for - unsigned int gpuUtilization; //!< Percent of time over the process's lifetime during which one or more kernels - //!< was executing on the GPU. - //!< Set to DCGM_INT32_NOT_SUPPORTED if is not supported - unsigned int memoryUtilization; //!< Percent of time over the process's lifetime during which global (device) - //!< memory was being read or written. - //!< Set to DCGM_INT32_NOT_SUPPORTED if is not supported - unsigned long long maxMemoryUsage; //!< Maximum total memory in bytes that was ever allocated by the process. - //!< Set to DCGM_INT64_NOT_SUPPORTED if is not supported - unsigned long long startTimestamp; //!< CPU Timestamp in usec representing start time for the process - unsigned long long activeTimeUsec; //!< Amount of time in usec during which the compute context was active. - //!< Note that this does not mean the context was being used. 
endTimestamp - //!< can be computed as startTimestamp + activeTime -} dcgmDevicePidAccountingStats_v1; - -/** - * Typedef for \ref dcgmDevicePidAccountingStats_v1 - */ -typedef dcgmDevicePidAccountingStats_v1 dcgmDevicePidAccountingStats_t; - -/** - * Version 1 for \ref dcgmDevicePidAccountingStats_v1 - */ -#define dcgmDevicePidAccountingStats_version1 MAKE_DCGM_VERSION(dcgmDevicePidAccountingStats_v1, 1) - -/** - * Latest version for \ref dcgmDevicePidAccountingStats_t - */ -#define dcgmDevicePidAccountingStats_version dcgmDevicePidAccountingStats_version1 - -/** - * Represents thermal information - */ -typedef struct -{ - unsigned int version; //!< Version Number - unsigned int slowdownTemp; //!< Slowdown temperature - unsigned int shutdownTemp; //!< Shutdown temperature -} dcgmDeviceThermals_v1; - -/** - * Typedef for \ref dcgmDeviceThermals_v1 - */ -typedef dcgmDeviceThermals_v1 dcgmDeviceThermals_t; - -/** - * Version 1 for \ref dcgmDeviceThermals_v1 - */ -#define dcgmDeviceThermals_version1 MAKE_DCGM_VERSION(dcgmDeviceThermals_v1, 1) - -/** - * Latest version for \ref dcgmDeviceThermals_t - */ -#define dcgmDeviceThermals_version dcgmDeviceThermals_version1 - -/** - * Represents various power limits - */ -typedef struct -{ - unsigned int version; //!< Version Number - unsigned int curPowerLimit; //!< Power management limit associated with this device (in W) - unsigned int defaultPowerLimit; //!< Power management limit effective at device boot (in W) - unsigned int enforcedPowerLimit; //!< Effective power limit that the driver enforces after taking into account - //!< all limiters (in W) - unsigned int minPowerLimit; //!< Minimum power management limit (in W) - unsigned int maxPowerLimit; //!< Maximum power management limit (in W) -} dcgmDevicePowerLimits_v1; - -/** - * Typedef for \ref dcgmDevicePowerLimits_v1 - */ -typedef dcgmDevicePowerLimits_v1 dcgmDevicePowerLimits_t; - -/** - * Version 1 for \ref dcgmDevicePowerLimits_v1 - */ -#define 
dcgmDevicePowerLimits_version1 MAKE_DCGM_VERSION(dcgmDevicePowerLimits_v1, 1) - -/** - * Latest version for \ref dcgmDevicePowerLimits_t - */ -#define dcgmDevicePowerLimits_version dcgmDevicePowerLimits_version1 - -/** - * Represents device identifiers - */ -typedef struct -{ - unsigned int version; //!< Version Number (dcgmDeviceIdentifiers_version) - char brandName[DCGM_MAX_STR_LENGTH]; //!< Brand Name - char deviceName[DCGM_MAX_STR_LENGTH]; //!< Name of the device - char pciBusId[DCGM_MAX_STR_LENGTH]; //!< PCI Bus ID - char serial[DCGM_MAX_STR_LENGTH]; //!< Serial for the device - char uuid[DCGM_MAX_STR_LENGTH]; //!< UUID for the device - char vbios[DCGM_MAX_STR_LENGTH]; //!< VBIOS version - char inforomImageVersion[DCGM_MAX_STR_LENGTH]; //!< Inforom Image version - unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id - unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID - char driverVersion[DCGM_MAX_STR_LENGTH]; //!< Driver Version - unsigned int virtualizationMode; //!< Virtualization Mode -} dcgmDeviceIdentifiers_v1; - -/** - * Typedef for \ref dcgmDeviceIdentifiers_v1 - */ -typedef dcgmDeviceIdentifiers_v1 dcgmDeviceIdentifiers_t; - -/** - * Version 1 for \ref dcgmDeviceIdentifiers_v1 - */ -#define dcgmDeviceIdentifiers_version1 MAKE_DCGM_VERSION(dcgmDeviceIdentifiers_v1, 1) - -/** - * Latest version for \ref dcgmDeviceIdentifiers_t - */ -#define dcgmDeviceIdentifiers_version dcgmDeviceIdentifiers_version1 - -/** - * Represents device memory and usage - */ -typedef struct -{ - unsigned int version; //!< Version Number (dcgmDeviceMemoryUsage_version) - unsigned int bar1Total; //!< Total BAR1 size in megabytes - unsigned int fbTotal; //!< Total framebuffer memory in megabytes - unsigned int fbUsed; //!< Used framebuffer memory in megabytes - unsigned int fbFree; //!< Free framebuffer memory in megabytes -} dcgmDeviceMemoryUsage_v1; - -/** - * Typedef for \ref dcgmDeviceMemoryUsage_v1 - */ -typedef 
dcgmDeviceMemoryUsage_v1 dcgmDeviceMemoryUsage_t; - -/** - * Version 1 for \ref dcgmDeviceMemoryUsage_v1 - */ -#define dcgmDeviceMemoryUsage_version1 MAKE_DCGM_VERSION(dcgmDeviceMemoryUsage_v1, 1) - -/** - * Latest version for \ref dcgmDeviceMemoryUsage_t - */ -#define dcgmDeviceMemoryUsage_version dcgmDeviceMemoryUsage_version1 - -/** - * Represents utilization values for vGPUs running on the device - */ -typedef struct -{ - unsigned int version; //!< Version Number (dcgmDeviceVgpuUtilInfo_version) - unsigned int vgpuId; //!< vGPU instance ID - unsigned int smUtil; //!< GPU utilization for vGPU - unsigned int memUtil; //!< Memory utilization for vGPU - unsigned int encUtil; //!< Encoder utilization for vGPU - unsigned int decUtil; //!< Decoder utilization for vGPU -} dcgmDeviceVgpuUtilInfo_v1; - -/** - * Typedef for \ref dcgmDeviceVgpuUtilInfo_v1 - */ -typedef dcgmDeviceVgpuUtilInfo_v1 dcgmDeviceVgpuUtilInfo_t; - -/** - * Version 1 for \ref dcgmDeviceVgpuUtilInfo_v1 - */ -#define dcgmDeviceVgpuUtilInfo_version1 MAKE_DCGM_VERSION(dcgmDeviceVgpuUtilInfo_v1, 1) - -/** - * Latest version for \ref dcgmDeviceVgpuUtilInfo_t - */ -#define dcgmDeviceVgpuUtilInfo_version dcgmDeviceVgpuUtilInfo_version1 - -/** - * Represents current encoder statistics for the given device/vGPU instance - */ -typedef struct -{ - unsigned int version; //!< Version Number (dcgmDeviceEncStats_version) - unsigned int sessionCount; //!< Count of active encoder sessions - unsigned int averageFps; //!< Trailing average FPS of all active sessions - unsigned int averageLatency; //!< Encode latency in milliseconds -} dcgmDeviceEncStats_v1; - -/** - * Typedef for \ref dcgmDeviceEncStats_v1 - */ -typedef dcgmDeviceEncStats_v1 dcgmDeviceEncStats_t; - -/** - * Version 1 for \ref dcgmDeviceEncStats_v1 - */ -#define dcgmDeviceEncStats_version1 MAKE_DCGM_VERSION(dcgmDeviceEncStats_v1, 1) - -/** - * Latest version for \ref dcgmDeviceEncStats_t - */ -#define dcgmDeviceEncStats_version 
dcgmDeviceEncStats_version1 - -/** - * Represents current frame buffer capture sessions statistics for the given device/vGPU instance - */ -typedef struct -{ - unsigned int version; //!< Version Number (dcgmDeviceFbcStats_version) - unsigned int sessionCount; //!< Count of active FBC sessions - unsigned int averageFps; //!< Moving average new frames captured per second - unsigned int averageLatency; //!< Moving average new frame capture latency in microseconds -} dcgmDeviceFbcStats_v1; - -/** - * Typedef for \ref dcgmDeviceFbcStats_v1 - */ -typedef dcgmDeviceFbcStats_v1 dcgmDeviceFbcStats_t; - -/** - * Version 1 for \ref dcgmDeviceFbcStats_v1 - */ -#define dcgmDeviceFbcStats_version1 MAKE_DCGM_VERSION(dcgmDeviceFbcStats_v1, 1) - -/** - * Latest version for \ref dcgmDeviceEncStats_t - */ -#define dcgmDeviceFbcStats_version dcgmDeviceFbcStats_version1 - -/* - * Represents frame buffer capture session type - */ -typedef enum dcgmFBCSessionType_enum -{ - DCGM_FBC_SESSION_TYPE_UNKNOWN = 0, //!< Unknown - DCGM_FBC_SESSION_TYPE_TOSYS, //!< FB capture for a system buffer - DCGM_FBC_SESSION_TYPE_CUDA, //!< FB capture for a cuda buffer - DCGM_FBC_SESSION_TYPE_VID, //!< FB capture for a Vid buffer - DCGM_FBC_SESSION_TYPE_HWENC, //!< FB capture for a NVENC HW buffer -} dcgmFBCSessionType_t; - -/** - * Represents information about active FBC session on the given device/vGPU instance - */ -typedef struct -{ - unsigned int version; //!< Version Number (dcgmDeviceFbcSessionInfo_version) - unsigned int sessionId; //!< Unique session ID - unsigned int pid; //!< Owning process ID - unsigned int vgpuId; //!< vGPU instance ID (only valid on vGPU hosts, otherwise zero) - unsigned int displayOrdinal; //!< Display identifier - dcgmFBCSessionType_t sessionType; //!< Type of frame buffer capture session - unsigned int sessionFlags; //!< Session flags - unsigned int hMaxResolution; //!< Max horizontal resolution supported by the capture session - unsigned int vMaxResolution; //!< Max 
vertical resolution supported by the capture session - unsigned int hResolution; //!< Horizontal resolution requested by caller in capture call - unsigned int vResolution; //!< Vertical resolution requested by caller in capture call - unsigned int averageFps; //!< Moving average new frames captured per second - unsigned int averageLatency; //!< Moving average new frame capture latency in microseconds -} dcgmDeviceFbcSessionInfo_v1; - -/** - * Typedef for \ref dcgmDeviceFbcSessionInfo_v1 - */ -typedef dcgmDeviceFbcSessionInfo_v1 dcgmDeviceFbcSessionInfo_t; - -/** - * Version 1 for \ref dcgmDeviceFbcSessionInfo_v1 - */ -#define dcgmDeviceFbcSessionInfo_version1 MAKE_DCGM_VERSION(dcgmDeviceFbcSessionInfo_v1, 1) - -/** - * Latest version for \ref dcgmDeviceFbcSessionInfo_t - */ -#define dcgmDeviceFbcSessionInfo_version dcgmDeviceFbcSessionInfo_version1 - -/** - * Represents all the active FBC sessions on the given device/vGPU instance - */ -typedef struct -{ - unsigned int version; //!< Version Number (dcgmDeviceFbcSessions_version) - unsigned int sessionCount; //!< Count of active FBC sessions - dcgmDeviceFbcSessionInfo_t sessionInfo[DCGM_MAX_FBC_SESSIONS]; //!< Info about the active FBC session -} dcgmDeviceFbcSessions_v1; - -/** - * Typedef for \ref dcgmDeviceFbcSessions_v1 - */ -typedef dcgmDeviceFbcSessions_v1 dcgmDeviceFbcSessions_t; - -/** - * Version 1 for \ref dcgmDeviceFbcSessions_v1 - */ -#define dcgmDeviceFbcSessions_version1 MAKE_DCGM_VERSION(dcgmDeviceFbcSessions_v1, 1) - -/** - * Latest version for \ref dcgmDeviceFbcSessions_t - */ -#define dcgmDeviceFbcSessions_version dcgmDeviceFbcSessions_version1 - -/* - * Represents type of encoder for capacity can be queried - */ -typedef enum dcgmEncoderQueryType_enum -{ - DCGM_ENCODER_QUERY_H264 = 0, - DCGM_ENCODER_QUERY_HEVC = 1 -} dcgmEncoderType_t; - -/** - * Represents information about active encoder sessions on the given vGPU instance - */ -typedef struct -{ - unsigned int version; //!< Version Number 
(dcgmDeviceVgpuEncSessions_version) - union - { - unsigned int vgpuId; //!< vGPU instance ID - unsigned int sessionCount; - } encoderSessionInfo; - unsigned int sessionId; //!< Unique session ID - unsigned int pid; //!< Process ID - dcgmEncoderType_t codecType; //!< Video encoder type - unsigned int hResolution; //!< Current encode horizontal resolution - unsigned int vResolution; //!< Current encode vertical resolution - unsigned int averageFps; //!< Moving average encode frames per second - unsigned int averageLatency; //!< Moving average encode latency in milliseconds -} dcgmDeviceVgpuEncSessions_v1; - -/** - * Typedef for \ref dcgmDeviceVgpuEncSessions_v1 - */ -typedef dcgmDeviceVgpuEncSessions_v1 dcgmDeviceVgpuEncSessions_t; - -/** - * Version 1 for \ref dcgmDeviceVgpuEncSessions_v1 - */ -#define dcgmDeviceVgpuEncSessions_version1 MAKE_DCGM_VERSION(dcgmDeviceVgpuEncSessions_v1, 1) - -/** - * Latest version for \ref dcgmDeviceVgpuEncSessions_t - */ -#define dcgmDeviceVgpuEncSessions_version dcgmDeviceVgpuEncSessions_version1 - -/** - * Represents utilization values for processes running in vGPU VMs using the device - */ -typedef struct -{ - unsigned int version; //!< Version Number (dcgmDeviceVgpuProcessUtilInfo_version) - union - { - unsigned int vgpuId; //!< vGPU instance ID - unsigned int vgpuProcessSamplesCount; //!< Count of processes running in the vGPU VM,for which utilization - //!< rates are being reported in this cycle. - } vgpuProcessUtilInfo; - unsigned int pid; //!< Process ID of the process running in the vGPU VM. - char processName[DCGM_VGPU_NAME_BUFFER_SIZE]; //!< Process Name of process running in the vGPU VM. - unsigned int smUtil; //!< GPU utilization of process running in the vGPU VM. - unsigned int memUtil; //!< Memory utilization of process running in the vGPU VM. - unsigned int encUtil; //!< Encoder utilization of process running in the vGPU VM. - unsigned int decUtil; //!< Decoder utilization of process running in the vGPU VM. 
-} dcgmDeviceVgpuProcessUtilInfo_v1; - -/** - * Typedef for \ref dcgmDeviceVgpuProcessUtilInfo_v1 - */ -typedef dcgmDeviceVgpuProcessUtilInfo_v1 dcgmDeviceVgpuProcessUtilInfo_t; - -/** - * Version 1 for \ref dcgmDeviceVgpuProcessUtilInfo_v1 - */ -#define dcgmDeviceVgpuProcessUtilInfo_version1 MAKE_DCGM_VERSION(dcgmDeviceVgpuProcessUtilInfo_v1, 1) - -/** - * Latest version for \ref dcgmDeviceVgpuProcessUtilInfo_t - */ -#define dcgmDeviceVgpuProcessUtilInfo_version dcgmDeviceVgpuProcessUtilInfo_version1 - -/** - * Represents static info related to vGPUs supported on the device. - */ -typedef struct -{ - unsigned int version; //!< Version number (dcgmDeviceVgpuTypeIdStaticInfo_version) - union - { - unsigned int vgpuTypeId; - unsigned int supportedVgpuTypeCount; - } vgpuTypeInfo; //!< vGPU type ID and Supported vGPU type count - char vgpuTypeName[DCGM_VGPU_NAME_BUFFER_SIZE]; //!< vGPU type Name - char vgpuTypeClass[DCGM_VGPU_NAME_BUFFER_SIZE]; //!< Class of vGPU type - char vgpuTypeLicense[DCGM_GRID_LICENSE_BUFFER_SIZE]; //!< license of vGPU type - int deviceId; //!< device ID of vGPU type - int subsystemId; //!< Subsystem ID of vGPU type - int numDisplayHeads; //!< Count of vGPU's supported display heads - int maxInstances; //!< maximum number of vGPU instances creatable on a device for given vGPU type - int frameRateLimit; //!< Frame rate limit value of the vGPU type - int maxResolutionX; //!< vGPU display head's maximum supported resolution in X dimension - int maxResolutionY; //!< vGPU display head's maximum supported resolution in Y dimension - int fbTotal; //!< vGPU Total framebuffer size in megabytes -} dcgmDeviceVgpuTypeInfo_v1; - -/** - * Typedef for \ref dcgmDeviceVgpuTypeInfo_v1 - */ -typedef dcgmDeviceVgpuTypeInfo_v1 dcgmDeviceVgpuTypeInfo_t; - -/** - * Version 1 for \ref dcgmDeviceVgpuTypeInfo_v1 - */ -#define dcgmDeviceVgpuTypeInfo_version1 MAKE_DCGM_VERSION(dcgmDeviceVgpuTypeInfo_v1, 1) - -/** - * Latest version for \ref dcgmDeviceVgpuTypeInfo_t - */ 
-#define dcgmDeviceVgpuTypeInfo_version dcgmDeviceVgpuTypeInfo_version1 - -typedef struct -{ - unsigned int version; - unsigned int persistenceModeEnabled; - unsigned int migModeEnabled; -} dcgmDeviceSettings_v1; - -typedef dcgmDeviceSettings_v1 dcgmDeviceSettings_t; - -#define dcgmDevicesSettings_version1 MAKE_DCGM_VERSION(dcgmDeviceSettings_v1, 1) - -#define dcgmDeviceSettings_version dcgmDeviceSettings_version1 - -/** - * Represents attributes corresponding to a device - */ -typedef struct -{ - unsigned int version; //!< Version number (dcgmDeviceAttributes_version) - dcgmDeviceSupportedClockSets_t clockSets; //!< Supported clocks for the device - dcgmDeviceThermals_t thermalSettings; //!< Thermal settings for the device - dcgmDevicePowerLimits_t powerLimits; //!< Various power limits for the device - dcgmDeviceIdentifiers_t identifiers; //!< Identifiers for the device - dcgmDeviceMemoryUsage_t memoryUsage; //!< Memory usage info for the device - char unused[208]; //!< Unused Space. Set to 0 for now -} dcgmDeviceAttributes_v1; - -/** - * Version 1 for \ref dcgmDeviceAttributes_v1 - */ -#define dcgmDeviceAttributes_version1 MAKE_DCGM_VERSION(dcgmDeviceAttributes_v1, 1) - -typedef struct -{ - unsigned int version; //!< Version number (dcgmDeviceAttributes_version) - dcgmDeviceSupportedClockSets_t clockSets; //!< Supported clocks for the device - dcgmDeviceThermals_t thermalSettings; //!< Thermal settings for the device - dcgmDevicePowerLimits_t powerLimits; //!< Various power limits for the device - dcgmDeviceIdentifiers_t identifiers; //!< Identifiers for the device - dcgmDeviceMemoryUsage_t memoryUsage; //!< Memory usage info for the device - dcgmDeviceSettings_t settings; //!< Basic device settings -} dcgmDeviceAttributes_v2; - -/** - * Typedef for \ref dcgmDeviceAttributes_v2 - */ -typedef dcgmDeviceAttributes_v2 dcgmDeviceAttributes_t; - -/** - * Version 1 for \ref dcgmDeviceAttributes_v2 - */ -#define dcgmDeviceAttributes_version2 
MAKE_DCGM_VERSION(dcgmDeviceAttributes_v2, 2) - -/** - * Latest version for \ref dcgmDeviceAttributes_t - */ -#define dcgmDeviceAttributes_version dcgmDeviceAttributes_version2 - -/** - * Maximum number of vGPU types per physical GPU - */ -#define DCGM_MAX_VGPU_TYPES_PER_PGPU 32 - -/** - * Represents the size of a buffer that holds string related to attributes specific to vGPU instance - */ -#define DCGM_DEVICE_UUID_BUFFER_SIZE 80 - -/** - * Used to represent Performance state settings - */ -typedef struct -{ - unsigned int syncBoost; //!< Sync Boost Mode (0: Disabled, 1 : Enabled, DCGM_INT32_BLANK : Ignored). Note that - //!< using this setting may result in lower clocks than targetClocks - dcgmClockSet_t targetClocks; //!< Target clocks. Set smClock and memClock to DCGM_INT32_BLANK to ignore/use - //!< compatible values. For GPUs > Maxwell, setting this implies autoBoost=0 -} dcgmConfigPerfStateSettings_t; - -/** - * Used to represents the power capping limit for each GPU in the group or to represent the power - * budget for the entire group - */ -typedef struct -{ - dcgmConfigPowerLimitType_t type; //!< Flag to represent power cap for each GPU or power budget for the group of GPUs - unsigned int val; //!< Power Limit in Watts (Set a value OR DCGM_INT32_BLANK to Ignore) -} dcgmConfigPowerLimit_t; - -/** - * Structure to represent default and target configuration for a device - */ -typedef struct -{ - unsigned int version; //!< Version number (dcgmConfig_version) - unsigned int gpuId; //!< GPU ID - unsigned int eccMode; //!< ECC Mode (0: Disabled, 1 : Enabled, DCGM_INT32_BLANK : Ignored) - unsigned int computeMode; //!< Compute Mode (One of DCGM_CONFIG_COMPUTEMODE_? 
OR DCGM_INT32_BLANK to Ignore) - dcgmConfigPerfStateSettings_t perfState; //!< Performance State Settings (clocks / boost mode) - dcgmConfigPowerLimit_t powerLimit; //!< Power Limits -} dcgmConfig_v1; - -/** - * Typedef for \ref dcgmConfig_v1 - */ -typedef dcgmConfig_v1 dcgmConfig_t; - -/** - * Version 1 for \ref dcgmConfig_v1 - */ -#define dcgmConfig_version1 MAKE_DCGM_VERSION(dcgmConfig_v1, 1) - -/** - * Latest version for \ref dcgmConfig_t - */ -#define dcgmConfig_version dcgmConfig_version1 - -/** - * Represents a callback to receive updates from asynchronous functions. - * Currently the only implemented callback function is dcgmPolicyRegister - * and the void * data will be a pointer to dcgmPolicyCallbackResponse_t. - * Ex. - * dcgmPolicyCallbackResponse_t *callbackResponse = (dcgmPolicyCallbackResponse_t *) userData; - * - */ -typedef int (*fpRecvUpdates)(void *userData); - -/*Remove from doxygen documentation - * - * Define the structure that contains specific policy information - */ -typedef struct -{ - // version must always be first - unsigned int version; //!< Version number (dcgmPolicyViolation_version) - - unsigned int notifyOnEccDbe; //!< true/false notification on ECC Double Bit Errors - unsigned int notifyOnPciEvent; //!< true/false notification on PCI Events - unsigned int notifyOnMaxRetiredPages; //!< number of retired pages to occur before notification -} dcgmPolicyViolation_v1; - -/*Remove from doxygen documentation - * - * Represents the versioning for the dcgmPolicyViolation_v1 structure - */ - -/* - * Typedef for \ref dcgmPolicyViolation_v1 - */ -typedef dcgmPolicyViolation_v1 dcgmPolicyViolation_t; - -/* - * Version 1 for \ref dcgmPolicyViolation_v1 - */ -#define dcgmPolicyViolation_version1 MAKE_DCGM_VERSION(dcgmPolicyViolation_v1, 1) - -/* - * Latest version for \ref dcgmPolicyViolation_t - */ -#define dcgmPolicyViolation_version dcgmPolicyViolation_version1 - -/** - * Enumeration for policy conditions. 
- * When used as part of dcgmPolicy_t these have corresponding parameters to - * allow them to be switched on/off or set specific violation thresholds - */ -typedef enum dcgmPolicyCondition_enum -{ - // these are bitwise rather than sequential - DCGM_POLICY_COND_DBE = 0x1, //!< Double bit errors -- boolean in dcgmPolicyConditionParams_t - DCGM_POLICY_COND_PCI = 0x2, //!< PCI events/errors -- boolean in dcgmPolicyConditionParams_t - DCGM_POLICY_COND_MAX_PAGES_RETIRED = 0x4, //!< Maximum number of retired pages -- number - //!< required in dcgmPolicyConditionParams_t - DCGM_POLICY_COND_THERMAL = 0x8, //!< Thermal violation -- number required in dcgmPolicyConditionParams_t - DCGM_POLICY_COND_POWER = 0x10, //!< Power violation -- number required in dcgmPolicyConditionParams_t - DCGM_POLICY_COND_NVLINK = 0x20, //!< NVLINK errors -- boolean in dcgmPolicyConditionParams_t - DCGM_POLICY_COND_XID = 0x40, //!< XID errors -- number required in dcgmPolicyConditionParams_t -} dcgmPolicyCondition_t; - -#define DCGM_POLICY_COND_MAX 7 - -/** - * Structure for policy condition parameters. - * This structure contains a tag that represents the type of the value being passed - * as well as a "val" which is a union of the possible value types. For example, - * to pass a true boolean: tag = BOOL, val.boolean = 1. 
- */ -typedef struct dcgmPolicyConditionParams_st -{ - enum - { - BOOL, - LLONG - } tag; - union - { - unsigned int boolean; - unsigned long long llval; - } val; -} dcgmPolicyConditionParams_t; - -/** - * Enumeration for policy modes - */ -typedef enum dcgmPolicyMode_enum -{ - DCGM_POLICY_MODE_AUTOMATED = 0, //!< automatic mode - DCGM_POLICY_MODE_MANUAL = 1, //!< manual mode -} dcgmPolicyMode_t; - -/** - * Enumeration for policy isolation modes - */ -typedef enum dcgmPolicyIsolation_enum -{ - DCGM_POLICY_ISOLATION_NONE = 0, //!< no isolation of GPUs on error -} dcgmPolicyIsolation_t; - -/** - * Enumeration for policy actions - */ -typedef enum dcgmPolicyAction_enum -{ - DCGM_POLICY_ACTION_NONE = 0, //!< no action - DCGM_POLICY_ACTION_GPURESET = 1, //!< Deprecated - perform a GPU reset on violation -} dcgmPolicyAction_t; - -/** - * Enumeration for policy validation actions - */ -typedef enum dcgmPolicyValidation_enum -{ - DCGM_POLICY_VALID_NONE = 0, //!< no validation after an action is performed - DCGM_POLICY_VALID_SV_SHORT = 1, //!< run a short System Validation on the system after failure - DCGM_POLICY_VALID_SV_MED = 2, //!< run a medium System Validation test after failure - DCGM_POLICY_VALID_SV_LONG = 3, //!< run a extensive System Validation test after failure -} dcgmPolicyValidation_t; - -/** - * Enumeration for policy failure responses - */ -typedef enum dcgmPolicyFailureResp_enum -{ - DCGM_POLICY_FAILURE_NONE = 0, //!< on failure of validation perform no action -} dcgmPolicyFailureResp_t; - -/** - * Structure to fill when a user queries for policy violations - */ -typedef struct -{ - unsigned int gpuId; //!< gpu ID - unsigned int violationOccurred; //!< a violation based on the bit values in \ref dcgmPolicyCondition_t -} dcgmPolicyViolationNotify_t; - -/** - * Define the structure that specifies a policy to be enforced for a GPU - */ -typedef struct -{ - // version must always be first - unsigned int version; //!< version number (dcgmPolicy_version) - - 
dcgmPolicyCondition_t condition; //!< Condition(s) to access \ref dcgmPolicyCondition_t - dcgmPolicyMode_t mode; //!< Mode of operation \ref dcgmPolicyMode_t - dcgmPolicyIsolation_t isolation; //!< Isolation level after a policy violation \ref dcgmPolicyIsolation_t - dcgmPolicyAction_t action; //!< Action to perform after a policy violation \ref dcgmPolicyAction_t action - dcgmPolicyValidation_t validation; //!< Validation to perform after action is taken \ref dcgmPolicyValidation_t - dcgmPolicyFailureResp_t response; //!< Failure to validation response \ref dcgmPolicyFailureResp_t - dcgmPolicyConditionParams_t parms[DCGM_POLICY_COND_MAX]; //!< Parameters for the \a condition fields -} dcgmPolicy_v1; - -/** - * Typedef for \ref dcgmPolicy_v1 - */ -typedef dcgmPolicy_v1 dcgmPolicy_t; - -/** - * Version 1 for \ref dcgmPolicy_v1 - */ -#define dcgmPolicy_version1 MAKE_DCGM_VERSION(dcgmPolicy_v1, 1) - -/** - * Latest version for \ref dcgmPolicy_t - */ -#define dcgmPolicy_version dcgmPolicy_version1 - - -/** - * Define the ECC DBE return structure - */ -typedef struct -{ - long long timestamp; //!< timestamp of the error - enum - { - L1, - L2, - DEVICE, - REGISTER, - TEXTURE - } location; //!< location of the error - unsigned int numerrors; //!< number of errors -} dcgmPolicyConditionDbe_t; - -/** - * Define the PCI replay error return structure - */ -typedef struct -{ - long long timestamp; //!< timestamp of the error - unsigned int counter; //!< value of the PCIe replay counter -} dcgmPolicyConditionPci_t; - -/** - * Define the maximum pending retired pages limit return structure - */ -typedef struct -{ - long long timestamp; //!< timestamp of the error - unsigned int sbepages; //!< number of pending pages due to SBE - unsigned int dbepages; //!< number of pending pages due to DBE -} dcgmPolicyConditionMpr_t; - -/** - * Define the thermal policy violations return structure - */ -typedef struct -{ - long long timestamp; //!< timestamp of the error - unsigned int 
thermalViolation; //!< Temperature reached that violated policy -} dcgmPolicyConditionThermal_t; - -/** - * Define the power policy violations return structure - */ -typedef struct -{ - long long timestamp; //!< timestamp of the error - unsigned int powerViolation; //!< Power value reached that violated policy -} dcgmPolicyConditionPower_t; - -/** - * Define the nvlink policy violations return structure - */ -typedef struct -{ - long long timestamp; //!< timestamp of the error - unsigned short fieldId; //!< Nvlink counter field ID that violated policy - unsigned int counter; //!< Nvlink counter value that violated policy -} dcgmPolicyConditionNvlink_t; - -/** - * Define the xid policy violations return structure - */ -typedef struct -{ - long long timestamp; //!< Timestamp of the error - unsigned int errnum; //!< The XID error number -} dcgmPolicyConditionXID_t; - - -/** - * Define the structure that is given to the callback function - */ -typedef struct -{ - // version must always be first - unsigned int version; //!< version number (dcgmPolicyCallbackResponse_version) - - dcgmPolicyCondition_t condition; //!< Condition that was violated - union - { - dcgmPolicyConditionDbe_t dbe; //!< ECC DBE return structure - dcgmPolicyConditionPci_t pci; //!< PCI replay error return structure - dcgmPolicyConditionMpr_t mpr; //!< Max retired pages limit return structure - dcgmPolicyConditionThermal_t thermal; //!< Thermal policy violations return structure - dcgmPolicyConditionPower_t power; //!< Power policy violations return structure - dcgmPolicyConditionNvlink_t nvlink; //!< Nvlink policy violations return structure - dcgmPolicyConditionXID_t xid; //!< XID policy violations return structure - } val; -} dcgmPolicyCallbackResponse_v1; - - -/** - * Typedef for \ref dcgmPolicyCallbackResponse_v1 - */ -typedef dcgmPolicyCallbackResponse_v1 dcgmPolicyCallbackResponse_t; - -/** - * Version 1 for \ref dcgmPolicyCallbackResponse_v1 - */ -#define dcgmPolicyCallbackResponse_version1 
MAKE_DCGM_VERSION(dcgmPolicyCallbackResponse_v1, 1) - -/** - * Latest version for \ref dcgmPolicyCallbackResponse_t - */ -#define dcgmPolicyCallbackResponse_version dcgmPolicyCallbackResponse_version1 - -/** - * Set above size of largest blob entry. Currently this is dcgmDeviceVgpuTypeInfo_v1 - */ -#define DCGM_MAX_BLOB_LENGTH 4096 - -/** - * This structure is used to represent value for the field to be queried. - */ -typedef struct -{ - // version must always be first - unsigned int version; //!< version number (dcgmFieldValue_version1) - - unsigned short fieldId; //!< One of DCGM_FI_? - unsigned short fieldType; //!< One of DCGM_FT_? - int status; //!< Status for the querying the field. DCGM_ST_OK or one of DCGM_ST_? - int64_t ts; //!< Timestamp in usec since 1970 - union - { - int64_t i64; //!< Int64 value - double dbl; //!< Double value - char str[DCGM_MAX_STR_LENGTH]; //!< NULL terminated string - char blob[DCGM_MAX_BLOB_LENGTH]; //!< Binary blob - } value; //!< Value -} dcgmFieldValue_v1; - -/** - * Version 1 for \ref dcgmFieldValue_v1 - */ -#define dcgmFieldValue_version1 MAKE_DCGM_VERSION(dcgmFieldValue_v1, 1) - -/** - * This structure is used to represent value for the field to be queried. - */ -typedef struct -{ - // version must always be first - unsigned int version; //!< version number (dcgmFieldValue_version2) - dcgm_field_entity_group_t entityGroupId; //!< Entity group this field value's entity belongs to - dcgm_field_eid_t entityId; //!< Entity this field value belongs to - unsigned short fieldId; //!< One of DCGM_FI_? - unsigned short fieldType; //!< One of DCGM_FT_? - int status; //!< Status for the querying the field. DCGM_ST_OK or one of DCGM_ST_? - unsigned int unused; //!< Unused for now to align ts to an 8-byte boundary. 
- int64_t ts; //!< Timestamp in usec since 1970 - union - { - int64_t i64; //!< Int64 value - double dbl; //!< Double value - char str[DCGM_MAX_STR_LENGTH]; //!< NULL terminated string - char blob[DCGM_MAX_BLOB_LENGTH]; //!< Binary blob - } value; //!< Value -} dcgmFieldValue_v2; - -/** - * Version 2 for \ref dcgmFieldValue_v2 - */ -#define dcgmFieldValue_version2 MAKE_DCGM_VERSION(dcgmFieldValue_v2, 2) - -/** - * Field value flags used by \ref dcgmEntitiesGetLatestValues - * - * Retrieve live data from the driver rather than cached data. - * Warning: Setting this flag will result in multiple calls to the NVIDIA driver that will be much slower than - * retrieving a cached value. - */ -#define DCGM_FV_FLAG_LIVE_DATA 0x00000001 - -/** - * User callback function for processing one or more field updates. This callback will - * be invoked one or more times per field until all of the expected field values have been - * enumerated. It is up to the callee to detect when the field id changes - * - * @param gpuId IN: GPU ID of the GPU this field value set belongs to - * @param values IN: Field values. These values must be copied as they will be destroyed as soon as this - * call returns. - * @param numValues IN: Number of entries that are valid in values[] - * @param userData IN: User data pointer passed to the update function that generated this callback - * - * @returns - * 0 if OK - * <0 if enumeration should stop. This allows to callee to abort field value enumeration. - * - */ -typedef int (*dcgmFieldValueEnumeration_f)(unsigned int gpuId, - dcgmFieldValue_v1 *values, - int numValues, - void *userData); - -/** - * User callback function for processing one or more field updates. This callback will - * be invoked one or more times per field until all of the expected field values have been - * enumerated. 
It is up to the callee to detect when the field id changes - * - * @param entityGroupId IN: entityGroup of the entity this field value set belongs to - * @param entityId IN: Entity this field value set belongs to - * @param values IN: Field values. These values must be copied as they will be destroyed as soon as this - * call returns. - * @param numValues IN: Number of entries that are valid in values[] - * @param userData IN: User data pointer passed to the update function that generated this callback - * - * @returns - * 0 if OK - * <0 if enumeration should stop. This allows to callee to abort field value enumeration. - * - */ -typedef int (*dcgmFieldValueEntityEnumeration_f)(dcgm_field_entity_group_t entityGroupId, - dcgm_field_eid_t entityId, - dcgmFieldValue_v1 *values, - int numValues, - void *userData); - - -/** - * Summary of time series data in int64 format. - * - * Each value will either be set or be a BLANK value. - * Check for blank with the DCGM_INT64_IS_BLANK() macro. - * \sa See dcgmvalue.h for the actual values of BLANK values - */ -typedef struct -{ - long long minValue; //!< Minimum value of the samples looked at - long long maxValue; //!< Maximum value of the samples looked at - long long average; //!< Simple average of the samples looked at. Blank values are ignored for this calculation -} dcgmStatSummaryInt64_t; - -/** - * Same as dcgmStatSummaryInt64_t, but with 32-bit integer values - */ -typedef struct -{ - int minValue; //!< Minimum value of the samples looked at - int maxValue; //!< Maximum value of the samples looked at - int average; //!< Simple average of the samples looked at. Blank values are ignored for this calculation -} dcgmStatSummaryInt32_t; - -/** - * Summary of time series data in double-precision format. - * Each value will either be set or be a BLANK value. - * Check for blank with the DCGM_FP64_IS_BLANK() macro. 
- * \sa See dcgmvalue.h for the actual values of BLANK values - */ -typedef struct -{ - double minValue; //!< Minimum value of the samples looked at - double maxValue; //!< Maximum value of the samples looked at - double average; //!< Simple average of the samples looked at. Blank values are ignored for this calculation -} dcgmStatSummaryFp64_t; - -/** - * Systems structure used to enable or disable health watch systems - */ -typedef enum dcgmHealthSystems_enum -{ - DCGM_HEALTH_WATCH_PCIE = 0x1, //!< PCIe system watches (must have 1m of data before query) - DCGM_HEALTH_WATCH_NVLINK = 0x2, //!< NVLINK system watches - DCGM_HEALTH_WATCH_PMU = 0x4, //!< Power management unit watches - DCGM_HEALTH_WATCH_MCU = 0x8, //!< Micro-controller unit watches - DCGM_HEALTH_WATCH_MEM = 0x10, //!< Memory watches - DCGM_HEALTH_WATCH_SM = 0x20, //!< Streaming multiprocessor watches - DCGM_HEALTH_WATCH_INFOROM = 0x40, //!< Inforom watches - DCGM_HEALTH_WATCH_THERMAL = 0x80, //!< Temperature watches (must have 1m of data before query) - DCGM_HEALTH_WATCH_POWER = 0x100, //!< Power watches (must have 1m of data before query) - DCGM_HEALTH_WATCH_DRIVER = 0x200, //!< Driver-related watches - DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL = 0x400, //!< Non-fatal errors in NvSwitch - DCGM_HEALTH_WATCH_NVSWITCH_FATAL = 0x800, //!< Fatal errors in NvSwitch - - // ... 
- DCGM_HEALTH_WATCH_ALL = 0xFFFFFFFF //!< All watches enabled -} dcgmHealthSystems_t; - -#define DCGM_HEALTH_WATCH_COUNT_V1 10 /*!< For iterating through the dcgmHealthSystems_v1 enum */ -#define DCGM_HEALTH_WATCH_COUNT_V2 12 /*!< For iterating through the dcgmHealthSystems_v2 enum */ - -/** - * Health Watch test results - */ -typedef enum dcgmHealthWatchResult_enum -{ - DCGM_HEALTH_RESULT_PASS = 0, //!< All results within this system are reporting normal - DCGM_HEALTH_RESULT_WARN = 10, //!< A warning has been issued, refer to the response for more information - DCGM_HEALTH_RESULT_FAIL = 20, //!< A failure has been issued, refer to the response for more information -} dcgmHealthWatchResults_t; - -typedef struct -{ - char msg[1024]; - unsigned int code; -} dcgmDiagErrorDetail_t; - -#define DCGM_HEALTH_WATCH_MAX_INCIDENTS DCGM_GROUP_MAX_ENTITIES - -typedef struct -{ - dcgmHealthSystems_t system; //!< system to which this information belongs - dcgmHealthWatchResults_t health; //!< health diagnosis of this incident - dcgmDiagErrorDetail_t error; //!< Information about the error(s) and their error codes - dcgmGroupEntityPair_t entityInfo; //!< identify which entity has this error -} dcgmIncidentInfo_t; - -/** - * Health response structure version 4 - Simply list the incidents instead of reporting by entity - * - * Since DCGM 2.0 - */ -typedef struct -{ - unsigned int version; //!< The version number of this struct - dcgmHealthWatchResults_t overallHealth; //!< The overall health of this entire host - unsigned int incidentCount; //!< The number of health incidents reported in this struct - dcgmIncidentInfo_t incidents[DCGM_HEALTH_WATCH_MAX_INCIDENTS]; //!< Report of the errors detected -} dcgmHealthResponse_v4; - -/** - * Version 4 for \ref dcgmHealthResponse_v4 - */ -#define dcgmHealthResponse_version4 MAKE_DCGM_VERSION(dcgmHealthResponse_v4, 4) - -/** - * Latest version for \ref dcgmHealthResponse_t - */ -#define dcgmHealthResponse_version dcgmHealthResponse_version4 - 
-/** - * Typedef for \ref dcgmHealthResponse_v4 - */ -typedef dcgmHealthResponse_v4 dcgmHealthResponse_t; - -/** - * Structure used to set health watches via the dcgmHealthSet_v2 API - */ -typedef struct -{ - unsigned int version; /*!< Version of this struct. Should be dcgmHealthSet_version2 */ - dcgmGpuGrp_t groupId; /*!< Group ID representing collection of one or more entities. Look - at \ref dcgmGroupCreate for details on creating the group. - Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - to perform operation on all the GPUs or \a DCGM_GROUP_ALL_NVSWITCHES - to perform operation on all the NvSwitches. */ - dcgmHealthSystems_t systems; /*!< An enum representing systems that should be enabled for health - checks logically OR'd together. Refer to \ref dcgmHealthSystems_t - for details. */ - long long updateInterval; /*!< How often to query the underlying health information from the - NVIDIA driver in usec. This should be the same as how often you call - dcgmHealthCheck */ - double maxKeepAge; /*!< How long to keep data cached for this field in seconds. This should - be at least your maximum time between calling dcgmHealthCheck */ -} dcgmHealthSetParams_v2; - -/** - * Version 2 for \ref dcgmHealthSet_v2 - */ -#define dcgmHealthSetParams_version2 MAKE_DCGM_VERSION(dcgmHealthSetParams_v2, 2) - - -#define DCGM_MAX_PID_INFO_NUM 16 -/** - * per process utilization rates - */ -typedef struct -{ - unsigned int pid; - double smUtil; - double memUtil; -} dcgmProcessUtilInfo_t; - -/** - *Internal structure used to get the PID and the corresponding utilization rate - */ -typedef struct -{ - double util; - unsigned int pid; -} dcgmProcessUtilSample_t; - -/** - * Info corresponding to single PID - */ -typedef struct -{ - unsigned int gpuId; //!< ID of the GPU this pertains to. 
GPU_ID_INVALID = summary information for multiple GPUs - - /* All of the following are during the process's lifetime */ - - long long energyConsumed; //!< Energy consumed by the gpu in milli-watt/seconds - dcgmStatSummaryInt64_t pcieRxBandwidth; //!< PCI-E bytes read from the GPU - dcgmStatSummaryInt64_t pcieTxBandwidth; //!< PCI-E bytes written to the GPU - long long pcieReplays; //!< Count of PCI-E replays that occurred - long long startTime; //!< Process start time in microseconds since 1970 - long long endTime; //!< Process end time in microseconds since 1970 or reported as 0 if the process is not completed - dcgmProcessUtilInfo_t processUtilization; //!< Process SM and Memory Utilization (in percent) - dcgmStatSummaryInt32_t smUtilization; //!< GPU SM Utilization in percent - dcgmStatSummaryInt32_t memoryUtilization; //!< GPU Memory Utilization in percent - unsigned int eccSingleBit; //!< Deprecated - Count of ECC single bit errors that occurred - unsigned int eccDoubleBit; //!< Count of ECC double bit errors that occurred - dcgmStatSummaryInt32_t memoryClock; //!< Memory clock in MHz - dcgmStatSummaryInt32_t smClock; //!< SM clock in MHz - - int numXidCriticalErrors; //!< Number of valid entries in xidCriticalErrorsTs - long long xidCriticalErrorsTs[10]; //!< Timestamps of the critical XID errors that occurred - - int numOtherComputePids; //!< Count of otherComputePids entries that are valid - unsigned int otherComputePids[DCGM_MAX_PID_INFO_NUM]; //!< Other compute processes that ran. 0=no process - - int numOtherGraphicsPids; //!< Count of otherGraphicsPids entries that are valid - unsigned int otherGraphicsPids[DCGM_MAX_PID_INFO_NUM]; //!< Other graphics processes that ran. 
0=no process - - long long maxGpuMemoryUsed; //!< Maximum amount of GPU memory that was used in bytes - - long long powerViolationTime; //!< Number of microseconds we were at reduced clocks due to power violation - long long thermalViolationTime; //!< Number of microseconds we were at reduced clocks due to thermal violation - long long reliabilityViolationTime; //!< Amount of microseconds we were at reduced clocks - //!< due to the reliability limit - long long boardLimitViolationTime; //!< Amount of microseconds we were at reduced clocks due to being at the - //!< board's max voltage - long long lowUtilizationTime; //!< Amount of microseconds we were at reduced clocks due to low utilization - long long syncBoostTime; //!< Amount of microseconds we were at reduced clocks due to sync boost - dcgmHealthWatchResults_t overallHealth; //!< The overall health of the system. \ref dcgmHealthWatchResults_t - unsigned int incidentCount; - struct - { - dcgmHealthSystems_t system; //!< system to which this information belongs - dcgmHealthWatchResults_t health; //!< health of the specified system on this GPU - } systems[DCGM_HEALTH_WATCH_COUNT_V1]; -} dcgmPidSingleInfo_t; - -/** - * To store process statistics - */ -typedef struct -{ - unsigned int version; //!< Version of this message (dcgmPidInfo_version) - unsigned int pid; //!< PID of the process - unsigned int unused; - int numGpus; //!< Number of GPUs that are valid in GPUs - dcgmPidSingleInfo_t summary; //!< Summary information for all GPUs listed in gpus[] - dcgmPidSingleInfo_t gpus[DCGM_MAX_NUM_DEVICES]; //!< Per-GPU information for this PID -} dcgmPidInfo_v2; - -/** - * Typedef for \ref dcgmPidInfo_v2 - */ -typedef dcgmPidInfo_v2 dcgmPidInfo_t; - -/** - * Version 2 for \ref dcgmPidInfo_v2 - */ -#define dcgmPidInfo_version2 MAKE_DCGM_VERSION(dcgmPidInfo_v2, 2) - -/** - * Latest version for \ref dcgmPidInfo_t - */ -#define dcgmPidInfo_version dcgmPidInfo_version2 - -/** - * Info corresponding to the job on a GPU - */ 
-typedef struct -{ - unsigned int gpuId; //!< ID of the GPU this pertains to. GPU_ID_INVALID = summary information for multiple GPUs - - /* All of the following are during the job's lifetime */ - - long long energyConsumed; //!< Energy consumed in milli-watt/seconds - dcgmStatSummaryFp64_t powerUsage; //!< Power usage Min/Max/Avg in watts - dcgmStatSummaryInt64_t pcieRxBandwidth; //!< PCI-E bytes read from the GPU - dcgmStatSummaryInt64_t pcieTxBandwidth; //!< PCI-E bytes written to the GPU - long long pcieReplays; //!< Count of PCI-E replays that occurred - long long startTime; //!< User provided job start time in microseconds since 1970 - long long endTime; //!< User provided job end time in microseconds since 1970 - dcgmStatSummaryInt32_t smUtilization; //!< GPU SM Utilization in percent - dcgmStatSummaryInt32_t memoryUtilization; //!< GPU Memory Utilization in percent - unsigned int eccSingleBit; //!< Deprecated - Count of ECC single bit errors that occurred - unsigned int eccDoubleBit; //!< Count of ECC double bit errors that occurred - dcgmStatSummaryInt32_t memoryClock; //!< Memory clock in MHz - dcgmStatSummaryInt32_t smClock; //!< SM clock in MHz - - int numXidCriticalErrors; //!< Number of valid entries in xidCriticalErrorsTs - long long xidCriticalErrorsTs[10]; //!< Timestamps of the critical XID errors that occurred - - int numComputePids; //!< Count of computePids entries that are valid - dcgmProcessUtilInfo_t computePidInfo[DCGM_MAX_PID_INFO_NUM]; //!< List of compute processes that ran during the job - //!< 0=no process - - int numGraphicsPids; //!< Count of graphicsPids entries that are valid - dcgmProcessUtilInfo_t graphicsPidInfo[DCGM_MAX_PID_INFO_NUM]; //!< List of compute processes that ran during the job - //!< 0=no process - - long long maxGpuMemoryUsed; //!< Maximum amount of GPU memory that was used in bytes - - long long powerViolationTime; //!< Number of microseconds we were at reduced clocks due to power violation - long long 
thermalViolationTime; //!< Number of microseconds we were at reduced clocks due to thermal violation - long long reliabilityViolationTime; //!< Amount of microseconds we were at reduced clocks - //!< due to the reliability limit - long long boardLimitViolationTime; //!< Amount of microseconds we were at reduced clocks - //!< due to being at the board's max voltage - long long lowUtilizationTime; //!< Amount of microseconds we were at reduced clocks due to low utilization - long long syncBoostTime; //!< Amount of microseconds we were at reduced clocks due to sync boost - dcgmHealthWatchResults_t overallHealth; //!< The overall health of the system. \ref dcgmHealthWatchResults_t - unsigned int incidentCount; - struct - { - dcgmHealthSystems_t system; //!< system to which this information belongs - dcgmHealthWatchResults_t health; //!< health of the specified system on this GPU - } systems[DCGM_HEALTH_WATCH_COUNT_V1]; -} dcgmGpuUsageInfo_t; - - -/** - * To store job statistics - * The following fields are not applicable in the summary info: - * - pcieRxBandwidth (Min/Max) - * - pcieTxBandwidth (Min/Max) - * - smUtilization (Min/Max) - * - memoryUtilization (Min/Max) - * - memoryClock (Min/Max) - * - smClock (Min/Max) - * - processSamples - * - * The average value in the above fields (in the summary) is the - * average of the averages of respective fields from all GPUs - */ -typedef struct -{ - unsigned int version; //!< Version of this message (dcgmPidInfo_version) - int numGpus; //!< Number of GPUs that are valid in gpus[] - dcgmGpuUsageInfo_t summary; //!< Summary information for all GPUs listed in gpus[] - dcgmGpuUsageInfo_t gpus[DCGM_MAX_NUM_DEVICES]; //!< Per-GPU information for this PID -} dcgmJobInfo_v3; - -/** - * Typedef for \ref dcgmJobInfo_v3 - */ -typedef dcgmJobInfo_v3 dcgmJobInfo_t; - -/** - * Version 3 for \ref dcgmJobInfo_v3 - */ -#define dcgmJobInfo_version3 MAKE_DCGM_VERSION(dcgmJobInfo_v3, 3) - -/** - * Latest version for \ref dcgmJobInfo_t - */ 
-#define dcgmJobInfo_version dcgmJobInfo_version3 - - -/** - * Running process information for a compute or graphics process - */ -typedef struct -{ - unsigned int version; //!< Version of this message (dcgmRunningProcess_version) - unsigned int pid; //!< PID of the process - unsigned long long memoryUsed; //!< GPU memory used by this process in bytes. -} dcgmRunningProcess_v1; - -/** - * Typedef for \ref dcgmRunningProcess_v1 - */ -typedef dcgmRunningProcess_v1 dcgmRunningProcess_t; - -/** - * Version 1 for \ref dcgmRunningProcess_v1 - */ -#define dcgmRunningProcess_version1 MAKE_DCGM_VERSION(dcgmRunningProcess_v1, 1) - -/** - * Latest version for \ref dcgmRunningProcess_t - */ -#define dcgmRunningProcess_version dcgmRunningProcess_version1 - -/** - * Enumeration for diagnostic levels - */ -typedef enum -{ - DCGM_DIAG_LVL_INVALID = 0, //!< Uninitialized - DCGM_DIAG_LVL_SHORT = 10, //!< run a very basic health check on the system - DCGM_DIAG_LVL_MED = 20, //!< run a medium-length diagnostic (a few minutes) - DCGM_DIAG_LVL_LONG = 30, //!< run a extensive diagnostic (several minutes) -} dcgmDiagnosticLevel_t; - -/** - * Diagnostic test results - */ -typedef enum dcgmDiagResult_enum -{ - DCGM_DIAG_RESULT_PASS = 0, //!< This test passed as diagnostics - DCGM_DIAG_RESULT_SKIP = 1, //!< This test was skipped - DCGM_DIAG_RESULT_WARN = 2, //!< This test passed with warnings - DCGM_DIAG_RESULT_FAIL = 3, //!< This test failed the diagnostics - DCGM_DIAG_RESULT_NOT_RUN = 4, //!< This test wasn't executed -} dcgmDiagResult_t; - -typedef struct -{ - dcgmDiagResult_t status; //!< The result of the test - char warning[1024]; //!< Warning returned from the test, if any - char info[1024]; //!< Information details returned from the test, if any -} dcgmDiagTestResult_v1; - -typedef struct -{ - dcgmDiagResult_t status; //!< The result of the test - dcgmDiagErrorDetail_t error; //!< The error message and error code, if any - char info[1024]; //!< Information details returned from the 
test, if any -} dcgmDiagTestResult_v2; - - -/** - * Diagnostic per gpu tests - fixed indices for dcgmDiagResponsePerGpu_t.results[] - */ -typedef enum dcgmPerGpuTestIndices_enum -{ - DCGM_MEMORY_INDEX = 0, //!< Memory test index - DCGM_DIAGNOSTIC_INDEX = 1, //!< Diagnostic test index - DCGM_PCI_INDEX = 2, //!< PCIe test index - DCGM_SM_STRESS_INDEX = 3, //!< SM Stress test index - DCGM_TARGETED_STRESS_INDEX = 4, //!< Targeted Stress test index - DCGM_TARGETED_POWER_INDEX = 5, //!< Targeted Power test index - DCGM_MEMORY_BANDWIDTH_INDEX = 6, //!< Memory bandwidth test index - // Remaining tests are included for convenience but have different execution rules - // See DCGM_PER_GPU_TEST_COUNT - DCGM_SOFTWARE_INDEX = 7, //!< Software test index - DCGM_CONTEXT_CREATE_INDEX = 8, //!< Context create test index - DCGM_UNKNOWN_INDEX = 9 //!< Unknown test -} dcgmPerGpuTestIndices_t; - -// TODO: transition these to dcgm_deprecated.h -#define DCGM_SM_PERF_INDEX DCGM_SM_STRESS_INDEX -#define DCGM_TARGETED_PERF_INDEX DCGM_TARGETED_PERF_INDEX - -// Number of diag tests -// NOTE: does not include software and context_create which have different execution rules -#define DCGM_PER_GPU_TEST_COUNT 7 - -/** - * Per GPU diagnostics result structure - */ -typedef struct -{ - unsigned int gpuId; //!< ID for the GPU this information pertains - unsigned int hwDiagnosticReturn; //!< Per GPU hardware diagnostic test return code - dcgmDiagTestResult_v2 results[DCGM_PER_GPU_TEST_COUNT]; //!< Array with a result for each per-gpu test -} dcgmDiagResponsePerGpu_v2; - -#define DCGM_SWTEST_COUNT 10 -#define LEVEL_ONE_MAX_RESULTS 16 - -typedef enum dcgmSoftwareTest_enum -{ - DCGM_SWTEST_BLACKLIST = 0, //!< test for presence of blacklisted drivers (e.g. 
nouveau) - DCGM_SWTEST_NVML_LIBRARY = 1, //!< test for presence (and version) of NVML lib - DCGM_SWTEST_CUDA_MAIN_LIBRARY = 2, //!< test for presence (and version) of CUDA lib - DCGM_SWTEST_CUDA_RUNTIME_LIBRARY = 3, //!< test for presence (and version) of CUDA RT lib - DCGM_SWTEST_PERMISSIONS = 4, //!< test for character device permissions - DCGM_SWTEST_PERSISTENCE_MODE = 5, //!< test for persistence mode enabled - DCGM_SWTEST_ENVIRONMENT = 6, //!< test for CUDA environment vars that may slow tests - DCGM_SWTEST_PAGE_RETIREMENT = 7, //!< test for pending frame buffer page retirement - DCGM_SWTEST_GRAPHICS_PROCESSES = 8, //!< test for graphics processes running - DCGM_SWTEST_INFOROM = 9, //!< test for inforom corruption -} dcgmSoftwareTest_t; - -/** - * Global diagnostics result structure v6 - * - * Since DCGM 2.0 - */ -typedef struct -{ - unsigned int version; //!< version number (dcgmDiagResult_version) - unsigned int gpuCount; //!< number of valid per GPU results - unsigned int levelOneTestCount; //!< number of valid levelOne results - - dcgmDiagTestResult_v2 levelOneResults[LEVEL_ONE_MAX_RESULTS]; //!< Basic, system-wide test results. - dcgmDiagResponsePerGpu_v2 perGpuResponses[DCGM_MAX_NUM_DEVICES]; //!< per GPU test results - dcgmDiagErrorDetail_t systemError; //!< System-wide error reported from NVVS - char trainingMsg[1024]; //!< Training Message -} dcgmDiagResponse_v6; - -/** - * Typedef for \ref dcgmDiagResponse_v6 - */ -typedef dcgmDiagResponse_v6 dcgmDiagResponse_t; - -/** - * Version 6 for \ref dcgmDiagResponse_v6 - */ -#define dcgmDiagResponse_version6 MAKE_DCGM_VERSION(dcgmDiagResponse_v6, 6) - -/** - * Latest version for \ref dcgmDiagResponse_t - */ -#define dcgmDiagResponse_version dcgmDiagResponse_version6 - -/** - * Represents level relationships within a system between two GPUs - * The enums are spaced to allow for future relationships. 
- * These match the definitions in nvml.h - */ -typedef enum dcgmGpuLevel_enum -{ - DCGM_TOPOLOGY_UNINITIALIZED = 0x0, - - /** \name PCI connectivity states */ - /**@{*/ - DCGM_TOPOLOGY_BOARD = 0x1, //!< multi-GPU board - DCGM_TOPOLOGY_SINGLE = 0x2, //!< all devices that only need traverse a single PCIe switch - DCGM_TOPOLOGY_MULTIPLE = 0x4, //!< all devices that need not traverse a host bridge - DCGM_TOPOLOGY_HOSTBRIDGE = 0x8, //!< all devices that are connected to the same host bridge - DCGM_TOPOLOGY_CPU = 0x10, //!< all devices that are connected to the same CPU but possibly multiple host bridges - DCGM_TOPOLOGY_SYSTEM = 0x20, //!< all devices in the system - /**@}*/ - - /** \name NVLINK connectivity states */ - /**@{*/ - DCGM_TOPOLOGY_NVLINK1 = 0x0100, //!< GPUs connected via a single NVLINK link - DCGM_TOPOLOGY_NVLINK2 = 0x0200, //!< GPUs connected via two NVLINK links - DCGM_TOPOLOGY_NVLINK3 = 0x0400, //!< GPUs connected via three NVLINK links - DCGM_TOPOLOGY_NVLINK4 = 0x0800, //!< GPUs connected via four NVLINK links - DCGM_TOPOLOGY_NVLINK5 = 0x1000, //!< GPUs connected via five NVLINK links - DCGM_TOPOLOGY_NVLINK6 = 0x2000, //!< GPUs connected via six NVLINK links - DCGM_TOPOLOGY_NVLINK7 = 0x4000, //!< GPUs connected via seven NVLINK links - DCGM_TOPOLOGY_NVLINK8 = 0x8000, //!< GPUs connected via eight NVLINK links - DCGM_TOPOLOGY_NVLINK9 = 0x10000, //!< GPUs connected via nine NVLINK links - DCGM_TOPOLOGY_NVLINK10 = 0x20000, //!< GPUs connected via ten NVLINK links - DCGM_TOPOLOGY_NVLINK11 = 0x40000, //!< GPUs connected via eleven NVLINK links - DCGM_TOPOLOGY_NVLINK12 = 0x80000, //!< GPUs connected via twelve NVLINK links - /**@}*/ -} dcgmGpuTopologyLevel_t; - -// the PCI paths are the lower 8 bits of the path information -#define DCGM_TOPOLOGY_PATH_PCI(x) (dcgmGpuTopologyLevel_t)((unsigned int)(x)&0xFF) - -// the NVLINK paths are the upper 24 bits of the path information -#define DCGM_TOPOLOGY_PATH_NVLINK(x) (dcgmGpuTopologyLevel_t)((unsigned 
int)(x)&0xFFFFFF00) - -#define DCGM_AFFINITY_BITMASK_ARRAY_SIZE 8 - -/** - * Device topology information - */ -typedef struct -{ - unsigned int version; //!< version number (dcgmDeviceTopology_version) - - unsigned long cpuAffinityMask[DCGM_AFFINITY_BITMASK_ARRAY_SIZE]; //!< affinity mask for the specified GPU - //!< a 1 represents affinity to the CPU in that - //!< bit position supports up to 256 cores - unsigned int numGpus; //!< number of valid entries in gpuPaths - - struct - { - unsigned int gpuId; //!< gpuId to which the path represents - dcgmGpuTopologyLevel_t path; //!< path to the gpuId from this GPU. Note that this is a bit-mask - //!< of DCGM_TOPOLOGY_* values and can contain both PCIe topology - //!< and NvLink topology where applicable. For instance: - //!< 0x210 = DCGM_TOPOLOGY_CPU | DCGM_TOPOLOGY_NVLINK2 - //!< Use the macros DCGM_TOPOLOGY_PATH_NVLINK and - //!< DCGM_TOPOLOGY_PATH_PCI to mask the NvLink and PCI paths, respectively. - unsigned int localNvLinkIds; //!< bits representing the local links connected to gpuId - //!< e.g. 
if this field == 3, links 0 and 1 are connected, - //!< field is only valid if NVLINKS actually exist between GPUs - } gpuPaths[DCGM_MAX_NUM_DEVICES - 1]; -} dcgmDeviceTopology_v1; - -/** - * Typedef for \ref dcgmDeviceTopology_v1 - */ -typedef dcgmDeviceTopology_v1 dcgmDeviceTopology_t; - -/** - * Version 1 for \ref dcgmDeviceTopology_v1 - */ -#define dcgmDeviceTopology_version1 MAKE_DCGM_VERSION(dcgmDeviceTopology_v1, 1) - -/** - * Latest version for \ref dcgmDeviceTopology_t - */ -#define dcgmDeviceTopology_version dcgmDeviceTopology_version1 - -/** - * Group topology information - */ -typedef struct -{ - unsigned int version; //!< version number (dcgmGroupTopology_version) - - unsigned long - groupCpuAffinityMask[DCGM_AFFINITY_BITMASK_ARRAY_SIZE]; //!< the CPU affinity mask for all GPUs in the group - //!< a 1 represents affinity to the CPU in that bit - //!< position supports up to 256 cores - unsigned int numaOptimalFlag; //!< a zero value indicates that 1 or more GPUs - //!< in the group have a different CPU affinity and thus - //!< may not be optimal for certain algorithms - dcgmGpuTopologyLevel_t slowestPath; //!< the slowest path amongst GPUs in the group -} dcgmGroupTopology_v1; - -/** - * Typedef for \ref dcgmGroupTopology_v1 - */ -typedef dcgmGroupTopology_v1 dcgmGroupTopology_t; - -/** - * Version 1 for \ref dcgmGroupTopology_v1 - */ -#define dcgmGroupTopology_version1 MAKE_DCGM_VERSION(dcgmGroupTopology_v1, 1) - -/** - * Latest version for \ref dcgmGroupTopology_t - */ -#define dcgmGroupTopology_version dcgmGroupTopology_version1 - -/** - * Identifies a level to retrieve field introspection info for - */ -typedef enum dcgmIntrospectLevel_enum -{ - DCGM_INTROSPECT_LVL_INVALID = 0, //!< Invalid value - DCGM_INTROSPECT_LVL_FIELD = 1, //!< Introspection data is grouped by field ID - DCGM_INTROSPECT_LVL_FIELD_GROUP = 2, //!< Introspection data is grouped by field group - DCGM_INTROSPECT_LVL_ALL_FIELDS, //!< Introspection data is aggregated for all fields 
-} dcgmIntrospectLevel_t; - -/** - * Identifies the retrieval context for introspection API calls. - */ -typedef struct -{ - unsigned int version; //!< version number (dcgmIntrospectContext_version) - dcgmIntrospectLevel_t introspectLvl; //!< Introspect Level \ref dcgmIntrospectLevel_t - union - { - dcgmGpuGrp_t fieldGroupId; //!< Only needed if \ref introspectLvl is DCGM_INTROSPECT_LVL_FIELD_GROUP - unsigned short fieldId; //!< Only needed if \ref introspectLvl is DCGM_INTROSPECT_LVL_FIELD - unsigned long long contextId; //!< Overloaded way to access both fieldGroupId and fieldId - }; -} dcgmIntrospectContext_v1; - -/** - * Typedef for \ref dcgmIntrospectContext_v1 - */ -typedef dcgmIntrospectContext_v1 dcgmIntrospectContext_t; - -/** - * Version 1 for \ref dcgmIntrospectContext_t - */ -#define dcgmIntrospectContext_version1 MAKE_DCGM_VERSION(dcgmIntrospectContext_v1, 1) - -/** - * Latest version for \ref dcgmIntrospectContext_t - */ -#define dcgmIntrospectContext_version dcgmIntrospectContext_version1 - -/** - * DCGM Execution time info for a set of fields - */ -typedef struct -{ - unsigned int version; //!< version number (dcgmIntrospectFieldsExecTime_version) - - long long meanUpdateFreqUsec; //!< the mean update frequency of all fields - - double recentUpdateUsec; //!< the sum of every field's most recent execution time after they - //!< have been normalized to \ref meanUpdateFreqUsec". 
- //!< This is roughly how long it takes to update fields every \ref meanUpdateFreqUsec - - long long totalEverUpdateUsec; //!< The total amount of time, ever, that has been spent updating all the fields -} dcgmIntrospectFieldsExecTime_v1; - -/** - * Typedef for \ref dcgmIntrospectFieldsExecTime_t - */ -typedef dcgmIntrospectFieldsExecTime_v1 dcgmIntrospectFieldsExecTime_t; - -/** - * Version 1 for \ref dcgmIntrospectFieldsExecTime_t - */ -#define dcgmIntrospectFieldsExecTime_version1 MAKE_DCGM_VERSION(dcgmIntrospectFieldsExecTime_v1, 1) - -/** - * Latest version for \ref dcgmIntrospectFieldsExecTime_t - */ -#define dcgmIntrospectFieldsExecTime_version dcgmIntrospectFieldsExecTime_version1 - -/** - * Full introspection info for field execution time - * - * Since DCGM 2.0 - */ -typedef struct -{ - unsigned int version; //!< version number (dcgmIntrospectFullFieldsExecTime_version) - - dcgmIntrospectFieldsExecTime_v1 aggregateInfo; //!< info that includes global and device scope - - int hasGlobalInfo; //!< 0 means \ref globalInfo is populated, !0 means it's not - dcgmIntrospectFieldsExecTime_v1 globalInfo; //!< info that only includes global field scope - - unsigned short gpuInfoCount; //!< count of how many entries in \ref gpuInfo are populated - unsigned int gpuIdsForGpuInfo[DCGM_MAX_NUM_DEVICES]; //!< the GPU ID at a given index identifies which gpu - //!< the corresponding entry in \ref gpuInfo is from - - dcgmIntrospectFieldsExecTime_v1 gpuInfo[DCGM_MAX_NUM_DEVICES]; //!< info that is separated by the - //!< GPU ID that the watches were for -} dcgmIntrospectFullFieldsExecTime_v2; - -/** - * typedef for \ref dcgmIntrospectFullFieldsExecTime_v1 - */ -typedef dcgmIntrospectFullFieldsExecTime_v2 dcgmIntrospectFullFieldsExecTime_t; - -/** - * Version 1 for \ref dcgmIntrospectFullFieldsExecTime_t - */ -#define dcgmIntrospectFullFieldsExecTime_version2 MAKE_DCGM_VERSION(dcgmIntrospectFullFieldsExecTime_v2, 2) - -/** - * Latest version for \ref 
dcgmIntrospectFullFieldsExecTime_t - */ -#define dcgmIntrospectFullFieldsExecTime_version dcgmIntrospectFullFieldsExecTime_version2 - -/** - * State of DCGM metadata gathering. If it is set to DISABLED then "Metadata" API - * calls to DCGM are not supported. - */ -typedef enum dcgmIntrospectState_enum -{ - DCGM_INTROSPECT_STATE_DISABLED = 0, - DCGM_INTROSPECT_STATE_ENABLED = 1 -} dcgmIntrospectState_t; - -/** - * DCGM Memory usage information - */ -typedef struct -{ - unsigned int version; //!< version number (dcgmIntrospectMemory_version) - long long bytesUsed; //!< number of bytes -} dcgmIntrospectMemory_v1; - -/** - * Typedef for \ref dcgmIntrospectMemory_t - */ -typedef dcgmIntrospectMemory_v1 dcgmIntrospectMemory_t; - -/** - * Version 1 for \ref dcgmIntrospectMemory_t - */ -#define dcgmIntrospectMemory_version1 MAKE_DCGM_VERSION(dcgmIntrospectMemory_v1, 1) - -/** - * Latest version for \ref dcgmIntrospectMemory_t - */ -#define dcgmIntrospectMemory_version dcgmIntrospectMemory_version1 - - -/** - * Full introspection info for field memory - */ -typedef struct -{ - unsigned int version; //!< version number (dcgmIntrospectFullMemory_version) - - dcgmIntrospectMemory_v1 aggregateInfo; //!< info that includes global and device scope - - int hasGlobalInfo; //!< 0 means \ref globalInfo is populated, !0 means it's not - dcgmIntrospectMemory_v1 globalInfo; //!< info that only includes global field scope - - unsigned short gpuInfoCount; //!< count of how many entries in \ref gpuInfo are populated - unsigned int gpuIdsForGpuInfo[DCGM_MAX_NUM_DEVICES]; //!< the GPU ID at a given index identifies which gpu - //!< the corresponding entry in \ref gpuInfo is from - - dcgmIntrospectMemory_v1 gpuInfo[DCGM_MAX_NUM_DEVICES]; //!< info that is divided by the - //!< GPU ID that the watches were for -} dcgmIntrospectFullMemory_v1; - -/** - * typedef for \ref dcgmIntrospectFullMemory_v1 - */ -typedef dcgmIntrospectFullMemory_v1 dcgmIntrospectFullMemory_t; - -/** - * Version 1 for 
\ref dcgmIntrospectFullMemory_t - */ -#define dcgmIntrospectFullMemory_version1 MAKE_DCGM_VERSION(dcgmIntrospectFullMemory_v1, 1) - -/** - * Latest version for \ref dcgmIntrospectFullMemory_t - */ -#define dcgmIntrospectFullMemory_version dcgmIntrospectFullMemory_version1 - -/** - * DCGM CPU Utilization information. Multiply values by 100 to get them in %. - */ -typedef struct -{ - unsigned int version; //!< version number (dcgmMetadataCpuUtil_version) - double total; //!< fraction of device's CPU resources that were used - double kernel; //!< fraction of device's CPU resources that were used in kernel mode - double user; //!< fraction of device's CPU resources that were used in user mode -} dcgmIntrospectCpuUtil_v1; - -/** - * Typedef for \ref dcgmIntrospectCpuUtil_t - */ -typedef dcgmIntrospectCpuUtil_v1 dcgmIntrospectCpuUtil_t; - -/** - * Version 1 for \ref dcgmIntrospectCpuUtil_t - */ -#define dcgmIntrospectCpuUtil_version1 MAKE_DCGM_VERSION(dcgmIntrospectCpuUtil_v1, 1) - -/** - * Latest version for \ref dcgmIntrospectCpuUtil_t - */ -#define dcgmIntrospectCpuUtil_version dcgmIntrospectCpuUtil_version1 - -#define DCGM_MAX_CONFIG_FILE_LEN 10000 -#define DCGM_MAX_TEST_NAMES 20 -#define DCGM_MAX_TEST_NAMES_LEN 50 -#define DCGM_MAX_TEST_PARMS 100 -#define DCGM_MAX_TEST_PARMS_LEN 100 -#define DCGM_GPU_LIST_LEN 50 -#define DCGM_FILE_LEN 30 -#define DCGM_PATH_LEN 128 -#define DCGM_THROTTLE_MASK_LEN 50 - -/** - * Flags options for running the GPU diagnostic - * @{ - * - */ - -/** - * Output in verbose mode; include information as well as warnings - */ -#define DCGM_RUN_FLAGS_VERBOSE 0x0001 - -/** - * Output stats only on failure - */ -#define DCGM_RUN_FLAGS_STATSONFAIL 0x0002 - -/** - * Train DCGM diagnostic and output a configuration file with golden values - */ -#define DCGM_RUN_FLAGS_TRAIN 0x0004 - -/** - * Ignore warnings against training the diagnostic and train anyway - */ -#define DCGM_RUN_FLAGS_FORCE_TRAIN 0x0008 - -/** - * Enable fail early checks for the 
Targeted Stress, Targeted Power, SM Stress, and Diagnostic tests - */ -#define DCGM_RUN_FLAGS_FAIL_EARLY 0x0010 - -/** - * @} - */ - -/* - * Run diagnostic structure v7 - */ -typedef struct -{ - unsigned int version; //!< version of this message - unsigned int flags; //!< flags specifying binary options for running it. See DCGM_RUN_FLAGS_* - unsigned int debugLevel; //!< 0-5 for the debug level the GPU diagnostic will use for logging. - dcgmGpuGrp_t groupId; //!< group of GPUs to verify. Cannot be specified together with gpuList. - dcgmPolicyValidation_t validate; //!< 0-3 for which tests to run. Optional. - char testNames[DCGM_MAX_TEST_NAMES][DCGM_MAX_TEST_NAMES_LEN]; //!< Specified list of test names. Optional. - char testParms[DCGM_MAX_TEST_PARMS][DCGM_MAX_TEST_PARMS_LEN]; //!< Parameters to set for specified tests - //!< in the format: - //!< testName.parameterName=parameterValue. Optional. - char fakeGpuList[DCGM_GPU_LIST_LEN]; //!< Comma-separated list of GPUs. Cannot be specified with the groupId. - char gpuList[DCGM_GPU_LIST_LEN]; //!< Comma-separated list of GPUs. Cannot be specified with the groupId. - char debugLogFile[DCGM_PATH_LEN]; //!< Alternate name for the debug log file that should be used - char statsPath[DCGM_PATH_LEN]; //!< Path that the plugin's statistics files should be written to - char configFileContents[DCGM_MAX_CONFIG_FILE_LEN]; //!< Contents of nvvs config file (likely yaml) - char throttleMask[DCGM_THROTTLE_MASK_LEN]; //!< Throttle reasons to ignore as either integer mask or csv list of - //!< reasons - char pluginPath[DCGM_PATH_LEN]; //!< Custom path to the diagnostic plugins - unsigned int trainingIterations; //!< Number of iterations for training - unsigned int trainingVariance; //!< Acceptable training variance as a percentage of the value. (0-100) - unsigned int trainingTolerance; //!< Acceptable training tolerance as a percentage of the value. 
(0-100) - char goldenValuesFile[DCGM_PATH_LEN]; //!< The path where the golden values should be recorded - unsigned int failCheckInterval; //!< How often the fail early checks should occur when enabled. -} dcgmRunDiag_v7; - -/** - * Version 7 for \ref dcgmRunDiag_t - */ -#define dcgmRunDiag_version7 MAKE_DCGM_VERSION(dcgmRunDiag_v7, 7) - -/** - * Flags for dcgmGetEntityGroupEntities's flags parameter - * - * Only return entities that are supported by DCGM. - * This mimics the behavior of dcgmGetAllSupportedDevices(). - */ -#define DCGM_GEGE_FLAG_ONLY_SUPPORTED 0x00000001 - -/** - * Identifies a GPU NVLink error type returned by DCGM_FI_DEV_GPU_NVLINK_ERRORS - */ -typedef enum dcgmGpuNVLinkErrorType_enum -{ - DCGM_GPU_NVLINK_ERROR_RECOVERY_REQUIRED = 1, //!< NVLink link recovery error occurred - DCGM_GPU_NVLINK_ERROR_FATAL, //!< NVLink link fatal error occurred -} dcgmGpuNVLinkErrorType_t; - -/** Topology hints for dcgmSelectGpusByTopology() - * @{ - */ - -/** No hints specified */ -#define DCGM_TOPO_HINT_F_NONE 0x00000000 - -/** Ignore the health of the GPUs when picking GPUs for job - * execution. By default, only healthy GPUs are considered. 
- */ -#define DCGM_TOPO_HINT_F_IGNOREHEALTH 0x00000001 - -/** - * @} - */ - - -typedef struct -{ - unsigned int version; //!< version of this message - uint64_t inputGpuIds; //!< bit-mask of the GPU ids to choose from - uint32_t numGpus; //!< the number of GPUs that DCGM should choose - uint64_t hintFlags; //!< Hints to ignore certain factors for the scheduling hint -} dcgmTopoSchedHint_v1; - -typedef dcgmTopoSchedHint_v1 dcgmTopoSchedHint_t; - -#define dcgmTopoSchedHint_version1 MAKE_DCGM_VERSION(dcgmTopoSchedHint_v1, 1) - -/** - * NvLink link states - */ -typedef enum dcgmNvLinkLinkState_enum -{ - DcgmNvLinkLinkStateNotSupported = 0, //!< NvLink is unsupported by this GPU (Default for GPUs) - DcgmNvLinkLinkStateDisabled = 1, //!< NvLink is supported for this link but this link is disabled - //!< (Default for NvSwitches) - DcgmNvLinkLinkStateDown = 2, //!< This NvLink link is down (inactive) - DcgmNvLinkLinkStateUp = 3 //!< This NvLink link is up (active) -} dcgmNvLinkLinkState_t; - -/** - * State of NvLink links for a GPU - */ -typedef struct -{ - dcgm_field_eid_t entityId; //!< Entity ID of the GPU (gpuId) - dcgmNvLinkLinkState_t linkState[DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1]; //!< Per-GPU link states -} dcgmNvLinkGpuLinkStatus_v1; - -typedef struct -{ - dcgm_field_eid_t entityId; //!< Entity ID of the GPU (gpuId) - dcgmNvLinkLinkState_t linkState[DCGM_NVLINK_MAX_LINKS_PER_GPU]; //!< Per-GPU link states -} dcgmNvLinkGpuLinkStatus_v2; - -/** - * State of NvLink links for a NvSwitch - */ -typedef struct -{ - dcgm_field_eid_t entityId; //!< Entity ID of the NvSwitch (physicalId) - dcgmNvLinkLinkState_t linkState[DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH]; //!< Per-NvSwitch link states -} dcgmNvLinkNvSwitchLinkStatus_t; - -/** - * Status of all of the NvLinks in a given system - */ -typedef struct -{ - unsigned int version; //!< Version of this request. 
Should be dcgmNvLinkStatus_version1 - unsigned int numGpus; //!< Number of entries in gpus[] that are populated - dcgmNvLinkGpuLinkStatus_v1 gpus[DCGM_MAX_NUM_DEVICES]; //!< Per-GPU NvLink link statuses - unsigned int numNvSwitches; //!< Number of entries in nvSwitches[] that are populated - dcgmNvLinkNvSwitchLinkStatus_t nvSwitches[DCGM_MAX_NUM_SWITCHES]; //!< Per-NvSwitch link statuses -} dcgmNvLinkStatus_v1; - -/** - * Version 1 of dcgmNvLinkStatus - */ -#define dcgmNvLinkStatus_version1 MAKE_DCGM_VERSION(dcgmNvLinkStatus_v1, 1) - -typedef struct -{ - unsigned int version; //!< Version of this request. Should be dcgmNvLinkStatus_version1 - unsigned int numGpus; //!< Number of entries in gpus[] that are populated - dcgmNvLinkGpuLinkStatus_v2 gpus[DCGM_MAX_NUM_DEVICES]; //!< Per-GPU NvLink link statuses - unsigned int numNvSwitches; //!< Number of entries in nvSwitches[] that are populated - dcgmNvLinkNvSwitchLinkStatus_t nvSwitches[DCGM_MAX_NUM_SWITCHES]; //!< Per-NvSwitch link statuses -} dcgmNvLinkStatus_v2; - -typedef dcgmNvLinkStatus_v2 dcgmNvLinkStatus_t; - -/** - * Version 2 of dcgmNvLinkStatus - */ -#define dcgmNvLinkStatus_version2 MAKE_DCGM_VERSION(dcgmNvLinkStatus_v2, 2) - -/* Bitmask values for dcgmGetFieldIdSummary - Sync with DcgmcmSummaryType_t */ -#define DCGM_SUMMARY_MIN 0x00000001 -#define DCGM_SUMMARY_MAX 0x00000002 -#define DCGM_SUMMARY_AVG 0x00000004 -#define DCGM_SUMMARY_SUM 0x00000008 -#define DCGM_SUMMARY_COUNT 0x00000010 -#define DCGM_SUMMARY_INTEGRAL 0x00000020 -#define DCGM_SUMMARY_DIFF 0x00000040 -#define DCGM_SUMMARY_SIZE 7 - -/* dcgmSummaryResponse_t is part of dcgmFieldSummaryRequest, so it uses dcgmFieldSummaryRequest's version. 
*/ - -typedef struct -{ - unsigned int fieldType; //!< type of field that is summarized (int64 or fp64) - unsigned int summaryCount; //!< the number of populated summaries in \ref values - union - { - int64_t i64; - double fp64; - } values[DCGM_SUMMARY_SIZE]; //!< array for storing the values of each summary. The summaries are stored - //!< in order. For example, if MIN AND MAX are requested, then 0 will be MIN - //!< and 1 will be MAX. If AVG and DIFF were requested, then AVG would be 0 - //!< and 1 would be DIFF -} dcgmSummaryResponse_t; - -typedef struct -{ - unsigned int version; //!< version of this message - dcgmFieldSummaryRequest_v1 - unsigned short fieldId; //!< field id to be summarized - dcgm_field_entity_group_t entityGroupId; //!< the type of entity whose field we're getting - dcgm_field_eid_t entityId; //!< ordinal id for this entity - uint32_t summaryTypeMask; //!< bit-mask of DCGM_SUMMARY_*, the requested summaries - uint64_t startTime; //!< start time for the interval being summarized. 0 means to use - //!< any data before. - uint64_t endTime; //!< end time for the interval being summarized. 0 means to use - //!< any data after. - dcgmSummaryResponse_t response; //!< response data for this request -} dcgmFieldSummaryRequest_v1; - -typedef dcgmFieldSummaryRequest_v1 dcgmFieldSummaryRequest_t; - -#define dcgmFieldSummaryRequest_version1 MAKE_DCGM_VERSION(dcgmFieldSummaryRequest_v1, 1) - -/** - * Module IDs - */ -typedef enum -{ - DcgmModuleIdCore = 0, //!< Core DCGM - always loaded - DcgmModuleIdNvSwitch = 1, //!< NvSwitch Module - DcgmModuleIdVGPU = 2, //!< VGPU Module - DcgmModuleIdIntrospect = 3, //!< Introspection Module - DcgmModuleIdHealth = 4, //!< Health Module - DcgmModuleIdPolicy = 5, //!< Policy Module - DcgmModuleIdConfig = 6, //!< Config Module - DcgmModuleIdDiag = 7, //!< GPU Diagnostic Module - DcgmModuleIdProfiling = 8, //!< Profiling Module - - DcgmModuleIdCount //!< Always last. 
1 greater than largest value above -} dcgmModuleId_t; - -/** - * Module Status. Modules are lazy loaded, so they will be in status DcgmModuleStatusNotLoaded - * until they are used. One modules are used, they will move to another status. - */ -typedef enum -{ - DcgmModuleStatusNotLoaded = 0, //!< Module has not been loaded yet - DcgmModuleStatusBlacklisted = 1, //!< Module has been blacklisted from being loaded - DcgmModuleStatusFailed = 2, //!< Loading the module failed - DcgmModuleStatusLoaded = 3, //!< Module has been loaded - DcgmModuleStatusUnloaded = 4, //!< Module has been unloaded, happens during shutdown -} dcgmModuleStatus_t; - -/** - * Status of all of the modules of the host engine - */ -typedef struct -{ - dcgmModuleId_t id; //!< ID of this module - dcgmModuleStatus_t status; //!< Status of this module -} dcgmModuleGetStatusesModule_t; - -/* This is larger than DcgmModuleIdCount so we can add modules without versioning this request */ -#define DCGM_MODULE_STATUSES_CAPACITY 16 - -typedef struct -{ - unsigned int version; //!< Version of this request. Should be dcgmModuleGetStatuses_version1 - unsigned int numStatuses; //!< Number of entries in statuses[] that are populated - dcgmModuleGetStatusesModule_t statuses[DCGM_MODULE_STATUSES_CAPACITY]; //!< Per-module status information -} dcgmModuleGetStatuses_v1; - -/** - * Version 1 of dcgmModuleGetStatuses - */ -#define dcgmModuleGetStatuses_version1 MAKE_DCGM_VERSION(dcgmModuleGetStatuses_v1, 1) -#define dcgmModuleGetStatuses_version dcgmModuleGetStatuses_version1 -typedef dcgmModuleGetStatuses_v1 dcgmModuleGetStatuses_t; - -/** - * Options for dcgmStartEmbedded_v2 - * - * Added in DCGM 2.0.0 - */ -typedef struct -{ - unsigned int version; /*!< Version number. Use dcgmStartEmbeddedV2Params_version1 */ - dcgmOperationMode_t opMode; /*!< IN: Collect data automatically or manually when asked by the user. 
*/ - dcgmHandle_t dcgmHandle; /*!< OUT: DCGM Handle to use for API calls */ - const char *logFile; /*!< IN: File that DCGM should log to. NULL = do not log. '-' = stdout */ - DcgmLoggingSeverity_t severity; /*!< IN: Severity at which DCGM should log to logFile */ - unsigned int blackListCount; /*!< IN: Number of modules that to be blacklisted in blackList[] */ - dcgmModuleId_t blackList[DcgmModuleIdCount]; /* IN: IDs of modules to blacklist */ - unsigned int unused; /*!< IN: Unused. Set to 0. Aligns structure to 8-bytes */ -} dcgmStartEmbeddedV2Params_v1; - -/** - * Version 1 for \ref dcgmStartEmbeddedV2Params_v1 - */ -#define dcgmStartEmbeddedV2Params_version1 MAKE_DCGM_VERSION(dcgmStartEmbeddedV2Params_v1, 1) - -/** - * Maximum number of metric ID groups that can exist in DCGM - */ -#define DCGM_PROF_MAX_NUM_GROUPS 10 - -/** - * Maximum number of field IDs that can be in a single DCGM profiling metric group - */ -#define DCGM_PROF_MAX_FIELD_IDS_PER_GROUP 8 - -/** - * Structure to return all of the profiling metric groups that are available for the given groupId. - */ -typedef struct -{ - unsigned short majorId; //!< Major ID of this metric group. Metric groups with the same majorId cannot be - //!< watched concurrently with other metric groups with the same majorId - unsigned short minorId; //!< Minor ID of this metric group. This distinguishes metric groups within the same - //!< major metric group from each other - unsigned int numFieldIds; //!< Number of field IDs that are populated in fieldIds[] - unsigned short fieldIds[DCGM_PROF_MAX_FIELD_IDS_PER_GROUP]; //!< DCGM Field IDs that are part of this profiling - //!< group. See DCGM_FI_PROF_* definitions in - //!< dcgm_fields.h for details. -} dcgmProfMetricGroupInfo_t; - -typedef struct -{ - /** \name Input parameters - * @{ - */ - unsigned int version; //!< Version of this request. Should be dcgmProfGetMetricGroups_version - unsigned int unused; //!< Not used for now. 
Set to 0 - dcgmGpuGrp_t groupId; //!< Group of GPUs we should get the metric groups for. These must all be the - //!< exact same GPU or DCGM_ST_GROUP_INCOMPATIBLE will be returned - /** - * @} - */ - - /** \name Output - * @{ - */ - unsigned int numMetricGroups; //!< Number of entries in metricGroups[] that are populated - unsigned int unused1; //!< Not used for now. Set to 0 - dcgmProfMetricGroupInfo_t metricGroups[DCGM_PROF_MAX_NUM_GROUPS]; //!< Info for each metric group - /** - * @} - */ -} dcgmProfGetMetricGroups_v2; - -/** - * Version 1 of dcgmProfGetMetricGroups_t - */ -#define dcgmProfGetMetricGroups_version2 MAKE_DCGM_VERSION(dcgmProfGetMetricGroups_v2, 2) -#define dcgmProfGetMetricGroups_version dcgmProfGetMetricGroups_version2 -typedef dcgmProfGetMetricGroups_v2 dcgmProfGetMetricGroups_t; - -/** - * Structure to pass to dcgmProfWatchFields() when watching profiling metrics - */ -typedef struct -{ - unsigned int version; //!< Version of this request. Should be dcgmProfWatchFields_version - dcgmGpuGrp_t groupId; //!< Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate - //!< for details on creating the group. Alternatively, pass in the group id as \a - //!< DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. The GPUs of the group - //!< must all be identical or DCGM_ST_GROUP_INCOMPATIBLE will be returned by this API. - unsigned int numFieldIds; //!< Number of field IDs that are being passed in fieldIds[] - unsigned short fieldIds[16]; //!< DCGM_FI_PROF_? field IDs to watch - long long updateFreq; //!< How often to update this field in usec. Note that profiling metrics may need to be - //!< sampled more frequently than this value. See - //!< dcgmProfMetricGroupInfo_t.minUpdateFreqUsec of the metric group matching - //!< metricGroupTag to see what this minimum is. If minUpdateFreqUsec < updateFreq - //!< then samples will be aggregated to updateFreq intervals in DCGM's internal cache. 
- double maxKeepAge; //!< How long to keep data for every fieldId in seconds - int maxKeepSamples; //!< Maximum number of samples to keep for each fieldId. 0=no limit - unsigned int flags; //!< For future use. Set to 0 for now. -} dcgmProfWatchFields_v1; - -/** - * Version 1 of dcgmProfWatchFields_v1 - */ -#define dcgmProfWatchFields_version1 MAKE_DCGM_VERSION(dcgmProfWatchFields_v1, 1) -#define dcgmProfWatchFields_version dcgmProfWatchFields_version1 -typedef dcgmProfWatchFields_v1 dcgmProfWatchFields_t; - -/** - * Structure to pass to dcgmProfUnwatchFields when unwatching profiling metrics - */ -typedef struct -{ - unsigned int version; //!< Version of this request. Should be dcgmProfUnwatchFields_version - dcgmGpuGrp_t groupId; //!< Group ID representing collection of one or more GPUs. Look at - //!< \ref dcgmGroupCreate for details on creating the group. - //!< Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - //!< to perform operation on all the GPUs. The GPUs of the group must all be - //!< identical or DCGM_ST_GROUP_INCOMPATIBLE will be returned by this API. - unsigned int flags; //!< For future use. Set to 0 for now. 
-} dcgmProfUnwatchFields_v1; - -/** - * Version 1 of dcgmProfUnwatchFields_v1 - */ -#define dcgmProfUnwatchFields_version1 MAKE_DCGM_VERSION(dcgmProfUnwatchFields_v1, 1) -#define dcgmProfUnwatchFields_version dcgmProfUnwatchFields_version1 -typedef dcgmProfUnwatchFields_v1 dcgmProfUnwatchFields_t; - -/** - * Version 1 of dcgmSettingsSetLoggingSeverity_t - */ -typedef struct -{ - int targetLogger; - DcgmLoggingSeverity_t targetSeverity; -} dcgmSettingsSetLoggingSeverity_v1; - - -#define dcgmSettingsSetLoggingSeverity_version1 MAKE_DCGM_VERSION(dcgmSettingsSetLoggingSeverity_v1, 1) -#define dcgmSettingsSetLoggingSeverity_version dcgmSettingsSetLoggingSeverity_version1 -typedef dcgmSettingsSetLoggingSeverity_v1 dcgmSettingsSetLoggingSeverity_t; - -/** - * Structure to describe the DCGM build environment ver 2.0 - */ -typedef struct -{ - unsigned int version; // - * Every pair is separated by a colon char (:). Only the very first colon is considered as a separation.
- * Values can contain colon chars. Values and Keys cannot contain semicolon chars.
- * Usually defined keys are: - *

- * version : DCGM Version.
- * arch : Target DCGM Architecture.
- * buildid : Build ID. Usually a sequential number.
- * commit : Commit ID (Usually a git commit hash).
- * author : Author of the commit above.
- * branch : Branch (Usually a git branch that was used for the build).
- * buildtype : Build Type.
- * builddate : Date of the build.
- * buildplatform : Platform where the build was made.
- *

- * Any or all keys may be absent.
- * This values are for reference only are not supposed to participate in some complicated logic.
- */ - char rawBuildInfoString[DCGM_MAX_STR_LENGTH * 2]; -} dcgmVersionInfo_v2; - -/** - * Version 2 of the dcgmVersionInfo_v2 - */ -#define dcgmVersionInfo_version2 MAKE_DCGM_VERSION(dcgmVersionInfo_v2, 2) - -#define dcgmVersionInfo_version dcgmVersionInfo_version2 -typedef dcgmVersionInfo_v2 dcgmVersionInfo_t; - -/** @} */ - -#ifdef __cplusplus -} -#endif - -#endif /* DCGM_STRUCTS_H */ diff --git a/bindings/go/dcgm/dcgm_test.go b/bindings/go/dcgm/dcgm_test.go deleted file mode 100644 index df72d4d2..00000000 --- a/bindings/go/dcgm/dcgm_test.go +++ /dev/null @@ -1,189 +0,0 @@ -package dcgm - -import ( - "math" - "strconv" - "strings" - "testing" - - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvsmi" -) - -func check(err error, t *testing.T) { - if err != nil { - t.Errorf("%v\n", err) - } -} - -func TestDeviceCount(t *testing.T) { - cleanup, err := Init(Embedded) - check(err, t) - defer cleanup() - - count, err := GetAllDeviceCount() - check(err, t) - - query := "count" - c := nvsmi.DeviceCount(query) - - if c != count { - t.Errorf("Device Count from dcgm is wrong, got %d, want: %d", count, c) - } -} - -func BenchmarkDeviceCount1(b *testing.B) { - Init(Embedded) - - b.StartTimer() - for n := 0; n < b.N; n++ { - GetAllDeviceCount() - } - b.StopTimer() - - Shutdown() -} - -func TestDeviceInfo(t *testing.T) { - cleanup, err := Init(Embedded) - check(err, t) - defer cleanup() - - fields := []string{ - "driver_version", - "name", - "serial", - "uuid", - "pci.bus_id", - "vbios_version", - "inforom.img", - "power.limit", - } - - gpus, err := GetSupportedDevices() - check(err, t) - - for _, gpu := range gpus { - info, err := GetDeviceInfo(gpu) - check(err, t) - - id := strconv.FormatUint(uint64(gpu), 10) - - for _, val := range fields { - var msg, output string - res := nvsmi.Query(id, val) - - switch val { - case "driver_version": - msg = "Driver version" - output = info.Identifiers.DriverVersion - case "name": - msg = "Device name" - output = 
info.Identifiers.Model - case "serial": - msg = "Device Serial number" - output = info.Identifiers.Serial - case "uuid": - msg = "Device UUID" - output = info.UUID - case "pci.bus_id": - msg = "Device PCI busId" - output = info.PCI.BusID - case "vbios_version": - msg = "Device vbios version" - output = info.Identifiers.Vbios - case "inforom.img": - msg = "Device inforom image" - output = info.Identifiers.InforomImageVersion - case "power.limit": - msg = "Device power limit" - output = strconv.FormatUint(uint64(info.Power), 10) - power, err := strconv.ParseFloat(res, 64) - check(err, t) - res = strconv.FormatUint(uint64(math.Round(power)), 10) - } - - if strings.Compare(res, output) != 0 { - if strings.Contains(output, "NOT_SUPPORTED") { - continue - } - - t.Errorf("%v from dcgm is wrong, got: %v, want: %v", msg, output, res) - } - } - } -} - -func BenchmarkDeviceInfo1(b *testing.B) { - Init(Embedded) - - b.StartTimer() - for n := 0; n < b.N; n++ { - // assuming there will be atleast 1 GPU attached - GetDeviceInfo(uint(0)) - } - b.StopTimer() - - Shutdown() -} - -func TestDeviceStatus(t *testing.T) { - cleanup, err := Init(Embedded) - check(err, t) - defer cleanup() - - gpus, err := GetSupportedDevices() - check(err, t) - - fields := []string{ - "power.draw", - "temperature.gpu", - "utilization.gpu", - "utilization.memory", - "encoder.stats.averageFps", - "clocks.current.sm", - "clocks.current.memory", - } - - for _, gpu := range gpus { - status, err := GetDeviceStatus(gpu) - check(err, t) - - id := strconv.FormatUint(uint64(gpu), 10) - - for _, val := range fields { - var msg, output string - res := nvsmi.Query(id, val) - - switch val { - case "power.draw": - msg = "Device power utilization" - output = strconv.FormatUint(uint64(math.Round(status.Power)), 10) - power, err := strconv.ParseFloat(res, 64) - check(err, t) - res = strconv.FormatUint(uint64(math.Round(power)), 10) - case "temperature.gpu": - msg = "Device temperature" - output = 
strconv.FormatUint(uint64(status.Temperature), 10) - case "utilization.gpu": - msg = "Device gpu utilization" - output = strconv.FormatUint(uint64(status.Utilization.GPU), 10) - case "utilization.memory": - msg = "Device memory utilization" - output = strconv.FormatUint(uint64(status.Utilization.Memory), 10) - case "encoder.stats.averageFps": - msg = "Device encoder utilization" - output = strconv.FormatUint(uint64(status.Utilization.Encoder), 10) - case "clocks.current.sm": - msg = "Device sm clock" - output = strconv.FormatUint(uint64(status.Clocks.Cores), 10) - case "clocks.current.memory": - msg = "Device mem clock" - output = strconv.FormatUint(uint64(status.Clocks.Memory), 10) - } - - if strings.Compare(res, output) != 0 { - t.Errorf("%v from dcgm is wrong, got: %v, want: %v", msg, output, res) - } - } - } -} diff --git a/bindings/go/dcgm/device_info.go b/bindings/go/dcgm/device_info.go deleted file mode 100644 index bda27a11..00000000 --- a/bindings/go/dcgm/device_info.go +++ /dev/null @@ -1,196 +0,0 @@ -package dcgm - -/* -#include "dcgm_agent.h" -#include "dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "math/rand" - "unsafe" -) - -type PCIInfo struct { - BusID string - BAR1 uint // MB - FBTotal uint // MB - Bandwidth int64 // MB/s -} - -type DeviceIdentifiers struct { - Brand string - Model string - Serial string - Vbios string - InforomImageVersion string - DriverVersion string -} - -type Device struct { - GPU uint - DCGMSupported string - UUID string - Power uint // W - PCI PCIInfo - Identifiers DeviceIdentifiers - Topology []P2PLink - CPUAffinity string -} - -// getAllDeviceCount counts all GPUs on the system -func getAllDeviceCount() (gpuCount uint, err error) { - var gpuIdList [C.DCGM_MAX_NUM_DEVICES]C.uint - var count C.int - - result := C.dcgmGetAllDevices(handle.handle, &gpuIdList[0], &count) - if err = errorString(result); err != nil { - return gpuCount, fmt.Errorf("Error getting devices count: %s", err) - } - gpuCount = uint(count) - return 
-} - -// getSupportedDevices returns DCGM supported GPUs -func getSupportedDevices() (gpus []uint, err error) { - var gpuIdList [C.DCGM_MAX_NUM_DEVICES]C.uint - var count C.int - - result := C.dcgmGetAllSupportedDevices(handle.handle, &gpuIdList[0], &count) - if err = errorString(result); err != nil { - return gpus, fmt.Errorf("Error getting DCGM supported devices: %s", err) - } - - numGpus := int(count) - gpus = make([]uint, numGpus) - for i := 0; i < numGpus; i++ { - gpus[i] = uint(gpuIdList[i]) - } - return -} - -func getPciBandwidth(gpuId uint) (int64, error) { - const ( - maxLinkGen int = iota - maxLinkWidth - fieldsCount - ) - - pciFields := make([]Short, fieldsCount) - pciFields[maxLinkGen] = C.DCGM_FI_DEV_PCIE_MAX_LINK_GEN - pciFields[maxLinkWidth] = C.DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH - - fieldsName := fmt.Sprintf("pciBandwidthFields%d", rand.Uint64()) - - fieldsId, err := FieldGroupCreate(fieldsName, pciFields) - if err != nil { - return 0, err - } - - groupName := fmt.Sprintf("pciBandwidth%d", rand.Uint64()) - groupId, err := WatchFields(gpuId, fieldsId, groupName) - if err != nil { - _ = FieldGroupDestroy(fieldsId) - return 0, err - } - - values, err := GetLatestValuesForFields(gpuId, pciFields) - if err != nil { - _ = FieldGroupDestroy(fieldsId) - _ = DestroyGroup(groupId) - return 0, fmt.Errorf("Error getting Pcie bandwidth: %s", err) - } - - gen := values[maxLinkGen].Int64() - width := values[maxLinkWidth].Int64() - - _ = FieldGroupDestroy(fieldsId) - _ = DestroyGroup(groupId) - - genMap := map[int64]int64{ - 1: 250, // MB/s - 2: 500, - 3: 985, - 4: 1969, - } - - bandwidth := genMap[gen] * width - return bandwidth, nil -} - -func getDeviceInfo(gpuid uint) (deviceInfo Device, err error) { - var device C.dcgmDeviceAttributes_t - device.version = makeVersion2(unsafe.Sizeof(device)) - - result := C.dcgmGetDeviceAttributes(handle.handle, C.uint(gpuid), &device) - if err = errorString(result); err != nil { - return deviceInfo, fmt.Errorf("Error getting 
device information: %s", err) - } - - // check if the given GPU is DCGM supported - gpus, err := getSupportedDevices() - if err != nil { - return - } - - supported := "No" - - for _, gpu := range gpus { - if gpuid == gpu { - supported = "Yes" - break - } - } - - busid := *stringPtr(&device.identifiers.pciBusId[0]) - - cpuAffinity, err := getCPUAffinity(busid) - if err != nil { - return - } - - var topology []P2PLink - var bandwidth int64 - // get device topology and bandwidth only if its a DCGM supported device - if supported == "Yes" { - topology, err = getDeviceTopology(gpuid) - if err != nil { - return - } - bandwidth, err = getPciBandwidth(gpuid) - if err != nil { - return - } - } - - uuid := *stringPtr(&device.identifiers.uuid[0]) - power := *uintPtr(device.powerLimits.defaultPowerLimit) - - pci := PCIInfo{ - BusID: busid, - BAR1: *uintPtr(device.memoryUsage.bar1Total), - FBTotal: *uintPtr(device.memoryUsage.fbTotal), - Bandwidth: bandwidth, - } - - identifiers := DeviceIdentifiers{ - Brand: *stringPtr(&device.identifiers.brandName[0]), - Model: *stringPtr(&device.identifiers.deviceName[0]), - Serial: *stringPtr(&device.identifiers.serial[0]), - Vbios: *stringPtr(&device.identifiers.vbios[0]), - InforomImageVersion: *stringPtr(&device.identifiers.inforomImageVersion[0]), - DriverVersion: *stringPtr(&device.identifiers.driverVersion[0]), - } - - deviceInfo = Device{ - GPU: gpuid, - DCGMSupported: supported, - UUID: uuid, - Power: power, - PCI: pci, - Identifiers: identifiers, - Topology: topology, - CPUAffinity: cpuAffinity, - } - return -} diff --git a/bindings/go/dcgm/device_status.go b/bindings/go/dcgm/device_status.go deleted file mode 100644 index 4d37de12..00000000 --- a/bindings/go/dcgm/device_status.go +++ /dev/null @@ -1,179 +0,0 @@ -package dcgm - -/* -#include "./dcgm_agent.h" -#include "./dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "math/rand" -) - -type PerfState uint - -const ( - PerfStateMax = 0 - PerfStateMin = 15 - PerfStateUnknown = 32 
-) - -func (p PerfState) String() string { - if p >= PerfStateMax && p <= PerfStateMin { - return fmt.Sprintf("P%d", p) - } - return "Unknown" -} - -type UtilizationInfo struct { - GPU int64 // % - Memory int64 // % - Encoder int64 // % - Decoder int64 // % -} - -type ECCErrorsInfo struct { - SingleBit int64 - DoubleBit int64 -} - -type MemoryInfo struct { - GlobalUsed int64 - ECCErrors ECCErrorsInfo -} - -type ClockInfo struct { - Cores int64 // MHz - Memory int64 // MHz -} - -type PCIThroughputInfo struct { - Rx int64 // MB - Tx int64 // MB - Replays int64 -} - -type PCIStatusInfo struct { - BAR1Used int64 // MB - Throughput PCIThroughputInfo - FBUsed int64 -} - -type DeviceStatus struct { - Power float64 // W - Temperature int64 // °C - Utilization UtilizationInfo - Memory MemoryInfo - Clocks ClockInfo - PCI PCIStatusInfo - Performance PerfState - FanSpeed int64 // % -} - -func latestValuesForDevice(gpuId uint) (status DeviceStatus, err error) { - const ( - pwr int = iota - temp - sm - mem - enc - dec - smClock - memClock - bar1Used - pcieRxThroughput - pcieTxThroughput - pcieReplay - fbUsed - sbe - dbe - pstate - fanSpeed - fieldsCount - ) - - deviceFields := make([]Short, fieldsCount) - deviceFields[pwr] = C.DCGM_FI_DEV_POWER_USAGE - deviceFields[temp] = C.DCGM_FI_DEV_GPU_TEMP - deviceFields[sm] = C.DCGM_FI_DEV_GPU_UTIL - deviceFields[mem] = C.DCGM_FI_DEV_MEM_COPY_UTIL - deviceFields[enc] = C.DCGM_FI_DEV_ENC_UTIL - deviceFields[dec] = C.DCGM_FI_DEV_DEC_UTIL - deviceFields[smClock] = C.DCGM_FI_DEV_SM_CLOCK - deviceFields[memClock] = C.DCGM_FI_DEV_MEM_CLOCK - deviceFields[bar1Used] = C.DCGM_FI_DEV_BAR1_USED - deviceFields[pcieRxThroughput] = C.DCGM_FI_DEV_PCIE_RX_THROUGHPUT - deviceFields[pcieTxThroughput] = C.DCGM_FI_DEV_PCIE_TX_THROUGHPUT - deviceFields[pcieReplay] = C.DCGM_FI_DEV_PCIE_REPLAY_COUNTER - deviceFields[fbUsed] = C.DCGM_FI_DEV_FB_USED - deviceFields[sbe] = C.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL - deviceFields[dbe] = C.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL - 
deviceFields[pstate] = C.DCGM_FI_DEV_PSTATE - deviceFields[fanSpeed] = C.DCGM_FI_DEV_FAN_SPEED - - fieldsName := fmt.Sprintf("devStatusFields%d", rand.Uint64()) - fieldsId, err := FieldGroupCreate(fieldsName, deviceFields) - if err != nil { - return - } - - groupName := fmt.Sprintf("devStatus%d", rand.Uint64()) - groupId, err := WatchFields(gpuId, fieldsId, groupName) - if err != nil { - _ = FieldGroupDestroy(fieldsId) - return - } - - values, err := GetLatestValuesForFields(gpuId, deviceFields) - if err != nil { - _ = FieldGroupDestroy(fieldsId) - _ = DestroyGroup(groupId) - return status, fmt.Errorf("Error getting device status: %s", err) - } - - power := values[pwr].Float64() - - gpuUtil := UtilizationInfo{ - GPU: values[sm].Int64(), - Memory: values[mem].Int64(), - Encoder: values[enc].Int64(), - Decoder: values[dec].Int64(), - } - - memory := MemoryInfo{ - ECCErrors: ECCErrorsInfo{ - SingleBit: values[sbe].Int64(), - DoubleBit: values[dbe].Int64(), - }, - } - - clocks := ClockInfo{ - Cores: values[smClock].Int64(), - Memory: values[memClock].Int64(), - } - - pci := PCIStatusInfo{ - BAR1Used: values[bar1Used].Int64(), - Throughput: PCIThroughputInfo{ - Rx: values[pcieRxThroughput].Int64(), - Tx: values[pcieTxThroughput].Int64(), - Replays: values[pcieReplay].Int64(), - }, - FBUsed: values[fbUsed].Int64(), - } - - status = DeviceStatus{ - Power: power, - Temperature: values[temp].Int64(), - Utilization: gpuUtil, - Memory: memory, - Clocks: clocks, - PCI: pci, - Performance: PerfState(values[pstate].Int64()), - FanSpeed: values[fanSpeed].Int64(), - } - - _ = FieldGroupDestroy(fieldsId) - _ = DestroyGroup(groupId) - return -} diff --git a/bindings/go/dcgm/fields.go b/bindings/go/dcgm/fields.go deleted file mode 100644 index c5e50bf7..00000000 --- a/bindings/go/dcgm/fields.go +++ /dev/null @@ -1,257 +0,0 @@ -package dcgm - -/* -#include "./dcgm_agent.h" -#include "./dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "unicode" - "unsafe" -) - -const ( - updateFreq 
= 1000000 // usec - maxKeepAge = 300 // sec - maxKeepSamples = 0 // nolimit -) - -type FieldMeta struct { - FieldId Short - FieldType byte - Size byte - Tag string - Scope int - NvmlFieldId int - EntityLevel Field_Entity_Group -} - -type FieldHandle struct{ handle C.dcgmFieldGrp_t } - -func FieldGroupCreate(fieldsGroupName string, fields []Short) (fieldsId FieldHandle, err error) { - var fieldsGroup C.dcgmFieldGrp_t - cfields := *(*[]C.ushort)(unsafe.Pointer(&fields)) - - groupName := C.CString(fieldsGroupName) - defer freeCString(groupName) - - result := C.dcgmFieldGroupCreate(handle.handle, C.int(len(fields)), &cfields[0], groupName, &fieldsGroup) - if err = errorString(result); err != nil { - return fieldsId, fmt.Errorf("Error creating DCGM fields group: %s", err) - } - - fieldsId = FieldHandle{fieldsGroup} - return -} - -func FieldGroupDestroy(fieldsGroup FieldHandle) (err error) { - result := C.dcgmFieldGroupDestroy(handle.handle, fieldsGroup.handle) - if err = errorString(result); err != nil { - fmt.Errorf("Error destroying DCGM fields group: %s", err) - } - - return -} - -func WatchFields(gpuId uint, fieldsGroup FieldHandle, groupName string) (groupId GroupHandle, err error) { - group, err := CreateGroup(groupName) - if err != nil { - return - } - - err = AddToGroup(group, gpuId) - if err != nil { - return - } - - result := C.dcgmWatchFields(handle.handle, group.handle, fieldsGroup.handle, C.longlong(updateFreq), C.double(maxKeepAge), C.int(maxKeepSamples)) - if err = errorString(result); err != nil { - return groupId, fmt.Errorf("Error watching fields: %s", err) - } - - _ = UpdateAllFields() - return group, nil -} - -func WatchFieldsWithGroup(fieldsGroup FieldHandle, group GroupHandle) error { - result := C.dcgmWatchFields(handle.handle, group.handle, fieldsGroup.handle, - C.longlong(updateFreq), C.double(maxKeepAge), C.int(maxKeepSamples)) - - if err := errorString(result); err != nil { - return fmt.Errorf("Error watching fields: %s", err) - } - - if err 
:= UpdateAllFields(); err != nil { - return err - } - - return nil -} - -func GetLatestValuesForFields(gpu uint, fields []Short) ([]FieldValue_v1, error) { - values := make([]C.dcgmFieldValue_v1, len(fields)) - cfields := *(*[]C.ushort)(unsafe.Pointer(&fields)) - - result := C.dcgmGetLatestValuesForFields(handle.handle, C.int(gpu), &cfields[0], C.uint(len(fields)), &values[0]) - if err := errorString(result); err != nil { - return nil, fmt.Errorf("Error watching fields: %s", err) - } - - return toFieldValue(values), nil -} - -func EntityGetLatestValues(entityGroup Field_Entity_Group, entityId uint, fields []Short) ([]FieldValue_v1, error) { - values := make([]C.dcgmFieldValue_v1, len(fields)) - cfields := (*C.ushort)(unsafe.Pointer(&fields[0])) - - result := C.dcgmEntityGetLatestValues(handle.handle, C.dcgm_field_entity_group_t(entityGroup), C.int(entityId), cfields, C.uint(len(fields)), &values[0]) - if err := errorString(result); err != nil { - return nil, fmt.Errorf("Error getting the latest value for fields: %s", err) - } - - return toFieldValue(values), nil -} - -func EntitiesGetLatestValues(entities []GroupEntityPair, fields []Short, flags uint) ([]FieldValue_v2, error) { - values := make([]C.dcgmFieldValue_v2, len(fields)*len(entities)) - cfields := (*C.ushort)(unsafe.Pointer(&fields[0])) - cEntities := make([]C.dcgmGroupEntityPair_t, len(entities)) - cPtrEntities := *(*[]C.dcgmGroupEntityPair_t)(unsafe.Pointer(&cEntities)) - for i, entity := range entities { - cEntities[i] = C.dcgmGroupEntityPair_t{C.dcgm_field_entity_group_t(entity.EntityGroupId), C.dcgm_field_eid_t(entity.EntityId)} - } - - result := C.dcgmEntitiesGetLatestValues(handle.handle, &cPtrEntities[0], C.uint(len(entities)), cfields, C.uint(len(fields)), C.uint(flags), &values[0]) - if err := errorString(result); err != nil { - return nil, fmt.Errorf("Error getting the latest value for fields: %s", err) - } - - return toFieldValue_v2(values), nil -} - -func UpdateAllFields() error { - 
waitForUpdate := C.int(1) - result := C.dcgmUpdateAllFields(handle.handle, waitForUpdate) - - return errorString(result) -} - -func toFieldValue(cfields []C.dcgmFieldValue_v1) []FieldValue_v1 { - fields := make([]FieldValue_v1, len(cfields)) - for i, f := range cfields { - fields[i] = FieldValue_v1{ - Version: uint(f.version), - FieldId: uint(f.fieldId), - FieldType: uint(f.fieldType), - Status: int(f.status), - Ts: int64(f.ts), - Value: f.value, - } - } - - return fields -} - -func (fv FieldValue_v1) Int64() int64 { - return *(*int64)(unsafe.Pointer(&fv.Value[0])) -} - -func (fv FieldValue_v1) Float64() float64 { - return *(*float64)(unsafe.Pointer(&fv.Value[0])) -} - -func (fv FieldValue_v1) String() string { - return *(*string)(unsafe.Pointer(&fv.Value[0])) -} - -func (fv FieldValue_v1) Blob() [4096]byte { - return fv.Value -} - -func toFieldValue_v2(cfields []C.dcgmFieldValue_v2) []FieldValue_v2 { - fields := make([]FieldValue_v2, len(cfields)) - for i, f := range cfields { - if uint(f.fieldType) == DCGM_FT_STRING { - fields[i] = FieldValue_v2{ - Version: uint(f.version), - EntityGroupId: Field_Entity_Group(f.entityGroupId), - EntityId: uint(f.entityId), - FieldId: uint(f.fieldId), - FieldType: uint(f.fieldType), - Status: int(f.status), - Ts: int64(f.ts), - Value: f.value, - StringValue: stringPtr((*C.char)(unsafe.Pointer(&f.value[0]))), - } - } else { - fields[i] = FieldValue_v2{ - Version: uint(f.version), - EntityGroupId: Field_Entity_Group(f.entityGroupId), - EntityId: uint(f.entityId), - FieldId: uint(f.fieldId), - FieldType: uint(f.fieldType), - Status: int(f.status), - Ts: int64(f.ts), - Value: f.value, - StringValue: nil, - } - } - } - - return fields -} - -func Fv2_Int64(fv FieldValue_v2) int64 { - return *(*int64)(unsafe.Pointer(&fv.Value[0])) -} - -func Fv2_Float64(fv FieldValue_v2) float64 { - return *(*float64)(unsafe.Pointer(&fv.Value[0])) -} - -func FindFirstNonAsciiIndex(value [4096]byte) int { - for i := 0; i < 4096; i++ { - if value[i] > 
unicode.MaxASCII || value[i] < 33 { - return i - } - } - - return 4096 -} - -func Fv2_String(fv FieldValue_v2) string { - if fv.FieldType == DCGM_FT_STRING { - return *fv.StringValue - } else { - return string(fv.Value[:]) - } -} - -func Fv2_Blob(fv FieldValue_v2) [4096]byte { - return fv.Value -} - -func ToFieldMeta(fieldInfo C.dcgm_field_meta_p) FieldMeta { - return FieldMeta{ - FieldId: Short(fieldInfo.fieldId), - FieldType: byte(fieldInfo.fieldType), - Size: byte(fieldInfo.size), - Tag: *stringPtr((*C.char)(unsafe.Pointer(&fieldInfo.tag[0]))), - Scope: int(fieldInfo.scope), - NvmlFieldId: int(fieldInfo.nvmlFieldId), - EntityLevel: Field_Entity_Group(fieldInfo.entityLevel), - } -} - -func FieldGetById(fieldId Short) FieldMeta { - return ToFieldMeta(C.DcgmFieldGetById(C.ushort(fieldId))) -} - -func FieldsInit() int { - return int(C.DcgmFieldsInit()) -} - -func FieldsTerm() int { - return int(C.DcgmFieldsTerm()) -} diff --git a/bindings/go/dcgm/go.mod b/bindings/go/dcgm/go.mod deleted file mode 100644 index 6da14c24..00000000 --- a/bindings/go/dcgm/go.mod +++ /dev/null @@ -1,3 +0,0 @@ -module github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm - -go 1.14 diff --git a/bindings/go/dcgm/gpu_group.go b/bindings/go/dcgm/gpu_group.go deleted file mode 100644 index 13c914ae..00000000 --- a/bindings/go/dcgm/gpu_group.go +++ /dev/null @@ -1,67 +0,0 @@ -package dcgm - -/* -#include "dcgm_agent.h" -#include "dcgm_structs.h" -*/ -import "C" -import ( - "fmt" -) - -type GroupHandle struct{ handle C.dcgmGpuGrp_t } - -func CreateGroup(groupName string) (goGroupId GroupHandle, err error) { - var cGroupId C.dcgmGpuGrp_t - cname := C.CString(groupName) - defer freeCString(cname) - - result := C.dcgmGroupCreate(handle.handle, C.DCGM_GROUP_EMPTY, cname, &cGroupId) - if err = errorString(result); err != nil { - return goGroupId, fmt.Errorf("Error creating group: %s", err) - } - - goGroupId = GroupHandle{cGroupId} - return -} - -func NewDefaultGroup(groupName string) (GroupHandle, 
error) { - var cGroupId C.dcgmGpuGrp_t - - cname := C.CString(groupName) - defer freeCString(cname) - - result := C.dcgmGroupCreate(handle.handle, C.DCGM_GROUP_DEFAULT, cname, &cGroupId) - if err := errorString(result); err != nil { - return GroupHandle{}, fmt.Errorf("Error creating group: %s", err) - } - - return GroupHandle{cGroupId}, nil -} - -func AddToGroup(groupId GroupHandle, gpuId uint) (err error) { - result := C.dcgmGroupAddDevice(handle.handle, groupId.handle, C.uint(gpuId)) - if err = errorString(result); err != nil { - return fmt.Errorf("Error adding GPU %v to group: %s", gpuId, err) - } - - return -} - -func AddEntityToGroup(groupId GroupHandle, entityGroupId Field_Entity_Group, entityId uint) (err error) { - result := C.dcgmGroupAddEntity(handle.handle, groupId.handle, C.dcgm_field_entity_group_t(entityGroupId), C.uint(entityId)) - if err = errorString(result); err != nil { - return fmt.Errorf("Error adding entity group type %v, entity %v to group: %s", entityGroupId, entityId, err) - } - - return -} - -func DestroyGroup(groupId GroupHandle) (err error) { - result := C.dcgmGroupDestroy(handle.handle, groupId.handle) - if err = errorString(result); err != nil { - return fmt.Errorf("Error destroying group: %s", err) - } - - return -} diff --git a/bindings/go/dcgm/health.go b/bindings/go/dcgm/health.go deleted file mode 100644 index e611e726..00000000 --- a/bindings/go/dcgm/health.go +++ /dev/null @@ -1,121 +0,0 @@ -package dcgm - -/* -#include "dcgm_agent.h" -#include "dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "math/rand" - "unsafe" -) - -type SystemWatch struct { - Type string - Status string - Error string -} - -type DeviceHealth struct { - GPU uint - Status string - Watches []SystemWatch -} - -func setHealthWatches(groupId GroupHandle) (err error) { - result := C.dcgmHealthSet(handle.handle, groupId.handle, C.DCGM_HEALTH_WATCH_ALL) - if err = errorString(result); err != nil { - return fmt.Errorf("Error setting health watches: %s", err) - } 
- return -} - -func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) { - name := fmt.Sprintf("health%d", rand.Uint64()) - groupId, err := CreateGroup(name) - if err != nil { - return - } - - err = AddToGroup(groupId, gpuId) - if err != nil { - return - } - - err = setHealthWatches(groupId) - if err != nil { - return - } - - var healthResults C.dcgmHealthResponse_v4 - healthResults.version = makeVersion2(unsafe.Sizeof(healthResults)) - - result := C.dcgmHealthCheck(handle.handle, groupId.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults))) - - if err = errorString(result); err != nil { - return deviceHealth, fmt.Errorf("Error checking GPU health: %s", err) - } - - status := healthStatus(int8(healthResults.overallHealth)) - watches := []SystemWatch{} - - // number of watches that encountred error/warning - incidents := uint(healthResults.incidentCount) - - for j := uint(0); j < incidents; j++ { - watch := SystemWatch{ - Type: systemWatch(int(healthResults.incidents[j].system)), - Status: healthStatus(int8(healthResults.incidents[j].health)), - - Error: *stringPtr(&healthResults.incidents[j].error.msg[0]), - } - watches = append(watches, watch) - } - - deviceHealth = DeviceHealth{ - GPU: gpuId, - Status: status, - Watches: watches, - } - _ = DestroyGroup(groupId) - return -} - -func healthStatus(status int8) string { - switch status { - case 0: - return "Healthy" - case 10: - return "Warning" - case 20: - return "Failure" - } - return "N/A" -} - -func systemWatch(watch int) string { - switch watch { - case 1: - return "PCIe watches" - case 2: - return "NVLINK watches" - case 4: - return "Power Managemnt unit watches" - case 8: - return "Microcontroller unit watches" - case 16: - return "Memory watches" - case 32: - return "Streaming Multiprocessor watches" - case 64: - return "Inforom watches" - case 128: - return "Temperature watches" - case 256: - return "Power watches" - case 512: - return "Driver-related watches" - } - return 
"N/A" -} diff --git a/bindings/go/dcgm/hostengine_status.go b/bindings/go/dcgm/hostengine_status.go deleted file mode 100644 index 4e6e6b93..00000000 --- a/bindings/go/dcgm/hostengine_status.go +++ /dev/null @@ -1,49 +0,0 @@ -package dcgm - -/* -#include "dcgm_agent.h" -#include "dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "unsafe" -) - -type DcgmStatus struct { - Memory int64 - CPU float64 -} - -func introspect() (engine DcgmStatus, err error) { - enableIntrospect := C.dcgmIntrospectState_t(1) - result := C.dcgmIntrospectToggleState(handle.handle, enableIntrospect) - - if err = errorString(result); err != nil { - return engine, fmt.Errorf("Error enabling DCGM introspection: %s", err) - } - - var memory C.dcgmIntrospectMemory_t - memory.version = makeVersion2(unsafe.Sizeof(memory)) - waitIfNoData := 1 - result = C.dcgmIntrospectGetHostengineMemoryUsage(handle.handle, &memory, C.int(waitIfNoData)) - - if err = errorString(result); err != nil { - return engine, fmt.Errorf("Error getting memory usage of hostengine: %s", err) - } - - var cpu C.dcgmIntrospectCpuUtil_t - - cpu.version = makeVersion2(unsafe.Sizeof(cpu)) - result = C.dcgmIntrospectGetHostengineCpuUtilization(handle.handle, &cpu, C.int(waitIfNoData)) - - if err = errorString(result); err != nil { - return engine, fmt.Errorf("Error getting cpu usage of hostengine: %s", err) - } - - engine = DcgmStatus{ - Memory: toInt64(memory.bytesUsed) / 1024, - CPU: *dblToFloat(cpu.total) * 100, - } - return -} diff --git a/bindings/go/dcgm/mig.go b/bindings/go/dcgm/mig.go deleted file mode 100644 index 1e0f6dc3..00000000 --- a/bindings/go/dcgm/mig.go +++ /dev/null @@ -1,89 +0,0 @@ -package dcgm - -/* -#include "./dcgm_agent.h" -#include "./dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "unsafe" -) - -type Field_Entity_Group uint - -const ( - FE_NONE Field_Entity_Group = iota - FE_GPU - FE_VGPU - FE_SWITCH - FE_GPU_I - FE_GPU_CI - FE_COUNT -) - -type GroupEntityPair struct { - EntityGroupId Field_Entity_Group 
- EntityId uint -} - -type MigEntityInfo struct { - GpuUuid string - NvmlGpuIndex uint - NvmlInstanceId uint - NvmlComputeInstanceId uint - NvmlMigProfileId uint - NvmlProfileSlices uint -} - -type MigHierarchyInfo_v2 struct { - Entity GroupEntityPair - Parent GroupEntityPair - Info MigEntityInfo -} - -const ( - MAX_NUM_DEVICES uint = C.DCGM_MAX_NUM_DEVICES - MAX_HIERARCHY_INFO uint = C.DCGM_MAX_HIERARCHY_INFO -) - -type MigHierarchy_v2 struct { - Version uint - Count uint - EntityList [C.DCGM_MAX_HIERARCHY_INFO]MigHierarchyInfo_v2 -} - -func GetGpuInstanceHierarchy() (hierarchy MigHierarchy_v2, err error) { - var c_hierarchy C.dcgmMigHierarchy_v2 - c_hierarchy.version = C.dcgmMigHierarchy_version2 - ptr_hierarchy := (*C.dcgmMigHierarchy_v2)(unsafe.Pointer(&c_hierarchy)) - result := C.dcgmGetGpuInstanceHierarchy(handle.handle, ptr_hierarchy) - - if err = errorString(result); err != nil { - return toMigHierarchy(c_hierarchy), fmt.Errorf("Error retrieving DCGM MIG hierarchy: %s", err) - } - - return toMigHierarchy(c_hierarchy), nil -} - -func toMigHierarchy(c_hierarchy C.dcgmMigHierarchy_v2) MigHierarchy_v2 { - var hierarchy MigHierarchy_v2 - hierarchy.Version = uint(c_hierarchy.version) - hierarchy.Count = uint(c_hierarchy.count) - for i := uint(0); i < hierarchy.Count; i++ { - hierarchy.EntityList[i] = MigHierarchyInfo_v2{ - Entity: GroupEntityPair{Field_Entity_Group(c_hierarchy.entityList[i].entity.entityGroupId), uint(c_hierarchy.entityList[i].entity.entityId)}, - Parent: GroupEntityPair{Field_Entity_Group(c_hierarchy.entityList[i].parent.entityGroupId), uint(c_hierarchy.entityList[i].parent.entityId)}, - Info: MigEntityInfo{ - GpuUuid: *stringPtr(&c_hierarchy.entityList[i].info.gpuUuid[0]), - NvmlGpuIndex: uint(c_hierarchy.entityList[i].info.nvmlGpuIndex), - NvmlInstanceId: uint(c_hierarchy.entityList[i].info.nvmlInstanceId), - NvmlComputeInstanceId: uint(c_hierarchy.entityList[i].info.nvmlComputeInstanceId), - NvmlMigProfileId: 
uint(c_hierarchy.entityList[i].info.nvmlMigProfileId), - NvmlProfileSlices: uint(c_hierarchy.entityList[i].info.nvmlProfileSlices), - }, - } - } - - return hierarchy -} diff --git a/bindings/go/dcgm/policy.go b/bindings/go/dcgm/policy.go deleted file mode 100644 index 06be22fa..00000000 --- a/bindings/go/dcgm/policy.go +++ /dev/null @@ -1,419 +0,0 @@ -package dcgm - -/* -#include "dcgm_agent.h" -#include "dcgm_structs.h" - -// wrapper for go callback function -extern int violationNotify(void* p); -*/ -import "C" -import ( - "encoding/binary" - "fmt" - "log" - "math/rand" - "sync" - "time" - "unsafe" -) - -type policyCondition string - -const ( - DbePolicy = policyCondition("Double-bit ECC error") - PCIePolicy = policyCondition("PCI error") - MaxRtPgPolicy = policyCondition("Max Retired Pages Limit") - ThermalPolicy = policyCondition("Thermal Limit") - PowerPolicy = policyCondition("Power Limit") - NvlinkPolicy = policyCondition("Nvlink Error") - XidPolicy = policyCondition("XID Error") -) - -type PolicyViolation struct { - Condition policyCondition - Timestamp time.Time - Data interface{} -} - -type policyIndex int - -const ( - dbePolicyIndex policyIndex = iota - pciePolicyIndex - maxRtPgPolicyIndex - thermalPolicyIndex - powerPolicyIndex - nvlinkPolicyIndex - xidPolicyIndex -) - -type policyConditionParam struct { - typ uint32 - value uint32 -} - -type dbePolicyCondition struct { - Location string - NumErrors uint -} - -type pciPolicyCondition struct { - ReplayCounter uint -} - -type retiredPagesPolicyCondition struct { - SbePages uint - DbePages uint -} - -type thermalPolicyCondition struct { - ThermalViolation uint -} - -type powerPolicyCondition struct { - PowerViolation uint -} - -type nvlinkPolicyCondition struct { - FieldId uint16 - Counter uint -} - -type xidPolicyCondition struct { - ErrNum uint -} - -var ( - policyChanOnce sync.Once - policyMapOnce sync.Once - - // callbacks maps PolicyViolation channels with policy - // captures C callback() value for 
each violation condition - callbacks map[string]chan PolicyViolation - - // paramMap maps C.dcgmPolicy_t.parms index and limits - // to be used in setPolicy() for setting user selected policies - paramMap map[policyIndex]policyConditionParam -) - -func makePolicyChannels() { - policyChanOnce.Do(func() { - callbacks = make(map[string]chan PolicyViolation) - callbacks["dbe"] = make(chan PolicyViolation, 1) - callbacks["pcie"] = make(chan PolicyViolation, 1) - callbacks["maxrtpg"] = make(chan PolicyViolation, 1) - callbacks["thermal"] = make(chan PolicyViolation, 1) - callbacks["power"] = make(chan PolicyViolation, 1) - callbacks["nvlink"] = make(chan PolicyViolation, 1) - callbacks["xid"] = make(chan PolicyViolation, 1) - }) -} - -func makePolicyParmsMap() { - const ( - policyFieldTypeBool = 0 - policyFieldTypeLong = 1 - policyBoolValue = 1 - policyMaxRtPgThreshold = 10 - policyThermalThreshold = 100 - policyPowerThreshold = 250 - ) - - policyMapOnce.Do(func() { - paramMap = make(map[policyIndex]policyConditionParam) - paramMap[dbePolicyIndex] = policyConditionParam{ - typ: policyFieldTypeBool, - value: policyBoolValue, - } - - paramMap[pciePolicyIndex] = policyConditionParam{ - typ: policyFieldTypeBool, - value: policyBoolValue, - } - - paramMap[maxRtPgPolicyIndex] = policyConditionParam{ - typ: policyFieldTypeLong, - value: policyMaxRtPgThreshold, - } - - paramMap[thermalPolicyIndex] = policyConditionParam{ - typ: policyFieldTypeLong, - value: policyThermalThreshold, - } - - paramMap[powerPolicyIndex] = policyConditionParam{ - typ: policyFieldTypeLong, - value: policyPowerThreshold, - } - - paramMap[nvlinkPolicyIndex] = policyConditionParam{ - typ: policyFieldTypeBool, - value: policyBoolValue, - } - - paramMap[xidPolicyIndex] = policyConditionParam{ - typ: policyFieldTypeBool, - value: policyBoolValue, - } - }) -} - -// ViolationRegistration is a go callback function for dcgmPolicyRegister() wrapped in C.violationNotify() -//export ViolationRegistration -func 
ViolationRegistration(data unsafe.Pointer) int { - var con policyCondition - var timestamp time.Time - var val interface{} - - response := *(*C.dcgmPolicyCallbackResponse_t)(unsafe.Pointer(data)) - - switch response.condition { - case C.DCGM_POLICY_COND_DBE: - dbe := (*C.dcgmPolicyConditionDbe_t)(unsafe.Pointer(&response.val)) - con = DbePolicy - timestamp = createTimeStamp(dbe.timestamp) - val = dbePolicyCondition{ - Location: dbeLocation(int(dbe.location)), - NumErrors: *uintPtr(dbe.numerrors), - } - case C.DCGM_POLICY_COND_PCI: - pci := (*C.dcgmPolicyConditionPci_t)(unsafe.Pointer(&response.val)) - con = PCIePolicy - timestamp = createTimeStamp(pci.timestamp) - val = pciPolicyCondition{ - ReplayCounter: *uintPtr(pci.counter), - } - case C.DCGM_POLICY_COND_MAX_PAGES_RETIRED: - mpr := (*C.dcgmPolicyConditionMpr_t)(unsafe.Pointer(&response.val)) - con = MaxRtPgPolicy - timestamp = createTimeStamp(mpr.timestamp) - val = retiredPagesPolicyCondition{ - SbePages: *uintPtr(mpr.sbepages), - DbePages: *uintPtr(mpr.dbepages), - } - case C.DCGM_POLICY_COND_THERMAL: - thermal := (*C.dcgmPolicyConditionThermal_t)(unsafe.Pointer(&response.val)) - con = ThermalPolicy - timestamp = createTimeStamp(thermal.timestamp) - val = thermalPolicyCondition{ - ThermalViolation: *uintPtr(thermal.thermalViolation), - } - case C.DCGM_POLICY_COND_POWER: - pwr := (*C.dcgmPolicyConditionPower_t)(unsafe.Pointer(&response.val)) - con = PowerPolicy - timestamp = createTimeStamp(pwr.timestamp) - val = powerPolicyCondition{ - PowerViolation: *uintPtr(pwr.powerViolation), - } - case C.DCGM_POLICY_COND_NVLINK: - nvlink := (*C.dcgmPolicyConditionNvlink_t)(unsafe.Pointer(&response.val)) - con = NvlinkPolicy - timestamp = createTimeStamp(nvlink.timestamp) - val = nvlinkPolicyCondition{ - FieldId: uint16(nvlink.fieldId), - Counter: *uintPtr(nvlink.counter), - } - case C.DCGM_POLICY_COND_XID: - xid := (*C.dcgmPolicyConditionXID_t)(unsafe.Pointer(&response.val)) - con = XidPolicy - timestamp = 
createTimeStamp(xid.timestamp) - val = xidPolicyCondition{ - ErrNum: *uintPtr(xid.errnum), - } - } - - err := PolicyViolation{ - Condition: con, - Timestamp: timestamp, - Data: val, - } - - switch con { - case DbePolicy: - callbacks["dbe"] <- err - case PCIePolicy: - callbacks["pcie"] <- err - case MaxRtPgPolicy: - callbacks["maxrtpg"] <- err - case ThermalPolicy: - callbacks["thermal"] <- err - case PowerPolicy: - callbacks["power"] <- err - case NvlinkPolicy: - callbacks["nvlink"] <- err - case XidPolicy: - callbacks["xid"] <- err - } - return 0 -} - -func setPolicy(groupId GroupHandle, condition C.dcgmPolicyCondition_t, paramList []policyIndex) (err error) { - var policy C.dcgmPolicy_t - policy.version = makeVersion2(unsafe.Sizeof(policy)) - policy.mode = C.dcgmPolicyMode_t(C.DCGM_OPERATION_MODE_AUTO) - policy.action = C.DCGM_POLICY_ACTION_NONE - policy.isolation = C.DCGM_POLICY_ISOLATION_NONE - policy.validation = C.DCGM_POLICY_VALID_NONE - policy.condition = condition - - // iterate on paramMap for given policy conditions - for _, key := range paramList { - conditionParam, exists := paramMap[policyIndex(key)] - if !exists { - return fmt.Errorf("Error: Invalid Policy condition, %v does not exist.\n", key) - } - // set policy condition parameters - // set condition type (bool or longlong) - policy.parms[key].tag = conditionParam.typ - - // set condition val (violation threshold) - // policy.parms.val is a C union type - // cgo docs: Go doesn't have support for C's union type - // C union types are represented as a Go byte array - binary.LittleEndian.PutUint32(policy.parms[key].val[:], conditionParam.value) - } - var statusHandle C.dcgmStatus_t - result := C.dcgmPolicySet(handle.handle, groupId.handle, &policy, statusHandle) - if err = errorString(result); err != nil { - return fmt.Errorf("Error setting policies: %s", err) - } - log.Println("Policy successfully set.") - return -} - -func registerPolicy(gpuId uint, typ ...policyCondition) (violation chan 
PolicyViolation, err error) { - // init policy globals for internal API - makePolicyChannels() - makePolicyParmsMap() - - name := fmt.Sprintf("policy%d", rand.Uint64()) - groupId, err := CreateGroup(name) - if err != nil { - return - } - if err = AddToGroup(groupId, gpuId); err != nil { - return - } - - // make a list of all callback channels - var channels []chan PolicyViolation - // make a list of policy conditions for setting their parameters - var paramKeys []policyIndex - // get all conditions to be set in setPolicy() - var condition C.dcgmPolicyCondition_t = 0 - for _, t := range typ { - switch t { - case DbePolicy: - paramKeys = append(paramKeys, dbePolicyIndex) - condition |= C.DCGM_POLICY_COND_DBE - channels = append(channels, callbacks["dbe"]) - case PCIePolicy: - paramKeys = append(paramKeys, pciePolicyIndex) - condition |= C.DCGM_POLICY_COND_PCI - channels = append(channels, callbacks["pcie"]) - case MaxRtPgPolicy: - paramKeys = append(paramKeys, maxRtPgPolicyIndex) - condition |= C.DCGM_POLICY_COND_MAX_PAGES_RETIRED - channels = append(channels, callbacks["maxrtpg"]) - case ThermalPolicy: - paramKeys = append(paramKeys, thermalPolicyIndex) - condition |= C.DCGM_POLICY_COND_THERMAL - channels = append(channels, callbacks["thermal"]) - case PowerPolicy: - paramKeys = append(paramKeys, powerPolicyIndex) - condition |= C.DCGM_POLICY_COND_POWER - channels = append(channels, callbacks["power"]) - case NvlinkPolicy: - paramKeys = append(paramKeys, nvlinkPolicyIndex) - condition |= C.DCGM_POLICY_COND_NVLINK - channels = append(channels, callbacks["nvlink"]) - case XidPolicy: - paramKeys = append(paramKeys, xidPolicyIndex) - condition |= C.DCGM_POLICY_COND_XID - channels = append(channels, callbacks["xid"]) - } - } - - if err = setPolicy(groupId, condition, paramKeys); err != nil { - return - } - - result := C.dcgmPolicyRegister(handle.handle, groupId.handle, C.dcgmPolicyCondition_t(condition), C.fpRecvUpdates(C.violationNotify), 
C.fpRecvUpdates(C.violationNotify)) - - if err = errorString(result); err != nil { - return violation, fmt.Errorf("Error registering policy: %s", err) - } - log.Println("Listening for violations...") - - // create a publisher - publisher := newPublisher() - _ = publisher.add() - _ = publisher.add() - - // broadcast - go publisher.broadcast() - - go func() { - for { - select { - case dbe := <-callbacks["dbe"]: - publisher.send(dbe) - case pcie := <-callbacks["pcie"]: - publisher.send(pcie) - case maxrtpg := <-callbacks["maxrtpg"]: - publisher.send(maxrtpg) - case thermal := <-callbacks["thermal"]: - publisher.send(thermal) - case power := <-callbacks["power"]: - publisher.send(power) - case nvlink := <-callbacks["nvlink"]: - publisher.send(nvlink) - case xid := <-callbacks["xid"]: - publisher.send(xid) - } - } - }() - - // merge - violation = make(chan PolicyViolation, len(channels)) - go func() { - for _, c := range channels { - val := <-c - violation <- val - } - close(violation) - }() - _ = DestroyGroup(groupId) - return -} - -func unregisterPolicy(groupId GroupHandle, condition C.dcgmPolicyCondition_t) { - result := C.dcgmPolicyUnregister(handle.handle, groupId.handle, condition) - - if err := errorString(result); err != nil { - fmt.Errorf("Error unregistering policy: %s", err) - } -} - -func createTimeStamp(t C.longlong) time.Time { - tm := int64(t) / 1000000 - ts := time.Unix(tm, 0) - return ts -} - -func dbeLocation(location int) string { - switch location { - case 0: - return "L1" - case 1: - return "L2" - case 2: - return "Device" - case 3: - return "Register" - case 4: - return "Texture" - } - return "N/A" -} diff --git a/bindings/go/dcgm/process_info.go b/bindings/go/dcgm/process_info.go deleted file mode 100644 index 64227cfa..00000000 --- a/bindings/go/dcgm/process_info.go +++ /dev/null @@ -1,203 +0,0 @@ -package dcgm - -/* -#include "dcgm_agent.h" -#include "dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "io/ioutil" - "math/rand" - "os" - 
"strings" - "time" - "unsafe" -) - -type Time uint64 - -func (t Time) String() string { - if t == 0 { - return "Running" - } - tm := time.Unix(int64(t), 0) - return tm.String() -} - -type ProcessUtilInfo struct { - StartTime Time - EndTime Time - EnergyConsumed *uint64 // Joules - SmUtil *float64 - MemUtil *float64 -} - -// ViolationTime measures amount of time (in ms) GPU was at reduced clocks -type ViolationTime struct { - Power *uint64 - Thermal *uint64 - Reliability *uint64 - BoardLimit *uint64 - LowUtilization *uint64 - SyncBoost *uint64 -} - -type XIDErrorInfo struct { - NumErrors int - Timestamp []uint64 -} - -type ProcessInfo struct { - GPU uint - PID uint - Name string - ProcessUtilization ProcessUtilInfo - PCI PCIStatusInfo - Memory MemoryInfo - GpuUtilization UtilizationInfo - Clocks ClockInfo - Violations ViolationTime - XIDErrors XIDErrorInfo -} - -func watchPidFields(gpus ...uint) (groupId GroupHandle, err error) { - groupName := fmt.Sprintf("watchPids%d", rand.Uint64()) - group, err := CreateGroup(groupName) - if err != nil { - return - } - numGpus := len(gpus) - - if numGpus == 0 { - gpus, err = getSupportedDevices() - if err != nil { - return - } - } - - for _, gpu := range gpus { - err = AddToGroup(group, gpu) - if err != nil { - return - } - - } - - result := C.dcgmWatchPidFields(handle.handle, group.handle, C.longlong(updateFreq), C.double(maxKeepAge), C.int(maxKeepSamples)) - - if err = errorString(result); err != nil { - return groupId, fmt.Errorf("Error watching process fields: %s", err) - } - _ = UpdateAllFields() - return group, nil -} - -func getProcessInfo(groupId GroupHandle, pid uint) (processInfo []ProcessInfo, err error) { - var pidInfo C.dcgmPidInfo_t - pidInfo.version = makeVersion2(unsafe.Sizeof(pidInfo)) - pidInfo.pid = C.uint(pid) - - result := C.dcgmGetPidInfo(handle.handle, groupId.handle, &pidInfo) - - if err = errorString(result); err != nil { - return processInfo, fmt.Errorf("Error getting process info: %s", err) - } - - 
name, err := processName(pid) - if err != nil { - return processInfo, fmt.Errorf("Error getting process name: %s", err) - } - - for i := 0; i < int(pidInfo.numGpus); i++ { - - var energy uint64 - e := *uint64Ptr(pidInfo.gpus[i].energyConsumed) - if !IsInt64Blank(int64(e)) { - energy = e / 1000 // mWs to joules - } - - processUtil := ProcessUtilInfo{ - StartTime: Time(uint64(pidInfo.gpus[i].startTime) / 1000000), - EndTime: Time(uint64(pidInfo.gpus[i].endTime) / 1000000), - EnergyConsumed: &energy, - SmUtil: roundFloat(dblToFloat(pidInfo.gpus[i].processUtilization.smUtil)), - MemUtil: roundFloat(dblToFloat(pidInfo.gpus[i].processUtilization.memUtil)), - } - - // TODO figure out how to deal with blanks - pci := PCIStatusInfo{ - Throughput: PCIThroughputInfo{ - Rx: *int64Ptr(pidInfo.gpus[i].pcieRxBandwidth.average), - Tx: *int64Ptr(pidInfo.gpus[i].pcieTxBandwidth.average), - Replays: *int64Ptr(pidInfo.gpus[i].pcieReplays), - }, - } - - memory := MemoryInfo{ - GlobalUsed: *int64Ptr(pidInfo.gpus[i].maxGpuMemoryUsed), // max gpu memory used for this process - ECCErrors: ECCErrorsInfo{ - SingleBit: *int64Ptr(C.longlong(pidInfo.gpus[i].eccSingleBit)), - DoubleBit: *int64Ptr(C.longlong(pidInfo.gpus[i].eccDoubleBit)), - }, - } - - gpuUtil := UtilizationInfo{ - GPU: int64(pidInfo.gpus[i].smUtilization.average), - Memory: int64(pidInfo.gpus[i].memoryUtilization.average), - } - - violations := ViolationTime{ - Power: uint64Ptr(pidInfo.gpus[i].powerViolationTime), - Thermal: uint64Ptr(pidInfo.gpus[i].thermalViolationTime), - Reliability: uint64Ptr(pidInfo.gpus[i].reliabilityViolationTime), - BoardLimit: uint64Ptr(pidInfo.gpus[i].boardLimitViolationTime), - LowUtilization: uint64Ptr(pidInfo.gpus[i].lowUtilizationTime), - SyncBoost: uint64Ptr(pidInfo.gpus[i].syncBoostTime), - } - - clocks := ClockInfo{ - Cores: *int64Ptr(C.longlong(pidInfo.gpus[i].smClock.average)), - Memory: *int64Ptr(C.longlong(pidInfo.gpus[i].memoryClock.average)), - } - - numErrs := 
int(pidInfo.gpus[i].numXidCriticalErrors) - ts := make([]uint64, numErrs) - for i := 0; i < numErrs; i++ { - ts[i] = uint64(pidInfo.gpus[i].xidCriticalErrorsTs[i]) - } - xidErrs := XIDErrorInfo{ - NumErrors: numErrs, - Timestamp: ts, - } - - pInfo := ProcessInfo{ - GPU: uint(pidInfo.summary.gpuId), - PID: uint(pidInfo.pid), - Name: name, - ProcessUtilization: processUtil, - PCI: pci, - Memory: memory, - GpuUtilization: gpuUtil, - Clocks: clocks, - Violations: violations, - XIDErrors: xidErrs, - } - processInfo = append(processInfo, pInfo) - } - _ = DestroyGroup(groupId) - return -} - -func processName(pid uint) (string, error) { - f := fmt.Sprintf("/proc/%d/comm", pid) - b, err := ioutil.ReadFile(f) - if err != nil { - // TOCTOU: process terminated - if os.IsNotExist(err) { - return "", nil - } - return "", err - } - return strings.TrimSuffix(string(b), "\n"), nil -} diff --git a/bindings/go/dcgm/profile.go b/bindings/go/dcgm/profile.go deleted file mode 100644 index 25ca7524..00000000 --- a/bindings/go/dcgm/profile.go +++ /dev/null @@ -1,47 +0,0 @@ -package dcgm - -/* -#include "dcgm_agent.h" -#include "dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "unsafe" -) - -type MetricGroup struct { - major uint - minor uint - fieldIds []uint -} - -func getSupportedMetricGroups(grpid uint) (groups []MetricGroup, err error) { - - var groupInfo C.dcgmProfGetMetricGroups_t - groupInfo.version = makeVersion2(unsafe.Sizeof(groupInfo)) - groupInfo.groupId = C.ulong(grpid) - - result := C.dcgmProfGetSupportedMetricGroups(handle.handle, &groupInfo) - - if err = errorString(result); err != nil { - return groups, fmt.Errorf("Error getting supported metrics: %s", err) - } - - var count = uint(groupInfo.numMetricGroups) - - for i := uint(0); i < count; i++ { - var group MetricGroup - group.major = uint(groupInfo.metricGroups[i].majorId) - group.minor = uint(groupInfo.metricGroups[i].minorId) - - var fieldCount = uint(groupInfo.metricGroups[i].numFieldIds) - - for j := uint(0); j < 
fieldCount; j++ { - group.fieldIds = append(group.fieldIds, uint(groupInfo.metricGroups[i].fieldIds[j])) - } - groups = append(groups, group) - } - - return groups, nil -} diff --git a/bindings/go/dcgm/topology.go b/bindings/go/dcgm/topology.go deleted file mode 100644 index f3afc380..00000000 --- a/bindings/go/dcgm/topology.go +++ /dev/null @@ -1,136 +0,0 @@ -package dcgm - -/* -#include "dcgm_agent.h" -#include "dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "io/ioutil" - "strings" - "unsafe" -) - -type P2PLinkType uint - -const ( - P2PLinkUnknown P2PLinkType = iota - P2PLinkCrossCPU - P2PLinkSameCPU - P2PLinkHostBridge - P2PLinkMultiSwitch - P2PLinkSingleSwitch - P2PLinkSameBoard - SingleNVLINKLink - TwoNVLINKLinks - ThreeNVLINKLinks - FourNVLINKLinks -) - -func (l P2PLinkType) PCIPaths() string { - switch l { - case P2PLinkSameBoard: - return "PSB" - case P2PLinkSingleSwitch: - return "PIX" - case P2PLinkMultiSwitch: - return "PXB" - case P2PLinkHostBridge: - return "PHB" - case P2PLinkSameCPU: - return "NODE" - case P2PLinkCrossCPU: - return "SYS" - case SingleNVLINKLink: - return "NV1" - case TwoNVLINKLinks: - return "NV2" - case ThreeNVLINKLinks: - return "NV3" - case FourNVLINKLinks: - return "NV4" - case P2PLinkUnknown: - } - return "N/A" -} - -type P2PLink struct { - GPU uint - BusID string - Link P2PLinkType -} - -func getP2PLink(path uint) P2PLinkType { - switch path { - case C.DCGM_TOPOLOGY_BOARD: - return P2PLinkSameBoard - case C.DCGM_TOPOLOGY_SINGLE: - return P2PLinkSingleSwitch - case C.DCGM_TOPOLOGY_MULTIPLE: - return P2PLinkMultiSwitch - case C.DCGM_TOPOLOGY_HOSTBRIDGE: - return P2PLinkHostBridge - case C.DCGM_TOPOLOGY_CPU: - return P2PLinkSameCPU - case C.DCGM_TOPOLOGY_SYSTEM: - return P2PLinkCrossCPU - case C.DCGM_TOPOLOGY_NVLINK1: - return SingleNVLINKLink - case C.DCGM_TOPOLOGY_NVLINK2: - return TwoNVLINKLinks - case C.DCGM_TOPOLOGY_NVLINK3: - return ThreeNVLINKLinks - case C.DCGM_TOPOLOGY_NVLINK4: - return FourNVLINKLinks - } - return 
P2PLinkUnknown -} - -func getCPUAffinity(busid string) (string, error) { - b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/local_cpulist", strings.ToLower(busid[4:]))) - if err != nil { - return "", fmt.Errorf("Error getting device cpu affinity: %v", err) - } - return strings.TrimSuffix(string(b), "\n"), nil -} - -func getBusid(gpuid uint) (string, error) { - var device C.dcgmDeviceAttributes_t - device.version = makeVersion2(unsafe.Sizeof(device)) - - result := C.dcgmGetDeviceAttributes(handle.handle, C.uint(gpuid), &device) - if err := errorString(result); err != nil { - return "", fmt.Errorf("Error getting device busid: %s", err) - } - return *stringPtr(&device.identifiers.pciBusId[0]), nil -} - -func getDeviceTopology(gpuid uint) (links []P2PLink, err error) { - var topology C.dcgmDeviceTopology_t - topology.version = makeVersion2(unsafe.Sizeof(topology)) - - result := C.dcgmGetDeviceTopology(handle.handle, C.uint(gpuid), &topology) - if result == C.DCGM_ST_NOT_SUPPORTED { - return links, nil - } - if result != C.DCGM_ST_OK { - return links, fmt.Errorf("Error getting device topology: %s", errorString(result)) - } - - busid, err := getBusid(gpuid) - if err != nil { - return - } - - for i := uint(0); i < uint(topology.numGpus); i++ { - gpu := topology.gpuPaths[i].gpuId - p2pLink := P2PLink{ - GPU: uint(gpu), - BusID: busid, - Link: getP2PLink(uint(topology.gpuPaths[i].path)), - } - links = append(links, p2pLink) - } - return -} diff --git a/bindings/go/dcgm/utils.go b/bindings/go/dcgm/utils.go deleted file mode 100644 index 219735a7..00000000 --- a/bindings/go/dcgm/utils.go +++ /dev/null @@ -1,148 +0,0 @@ -package dcgm - -/* -#include "stdlib.h" -#include "dcgm_structs.h" -*/ -import "C" - -import ( - "fmt" - "math" - "unsafe" -) - -const ( - dcgmInt32Blank = 0x7ffffff0 // 2147483632 - dcgmInt64Blank = 0x7ffffffffffffff0 // 9223372036854775792 -) - -func uintPtr(c C.uint) *uint { - i := uint(c) - return &i -} - -func uintPtrInt(c C.int) *uint { - i 
:= uint(c) - return &i -} - -func uintPtrUnsafe(p unsafe.Pointer) *uint { - if p == nil { - return nil - } - uintP := (*uint)(unsafe.Pointer(p)) - val := *uintP - return &val -} - -func uint64Ptr(c C.longlong) *uint64 { - i := uint64(c) - return &i -} - -func int64Ptr(c C.longlong) *int64 { - i := int64(c) - return &i -} - -func uint64PtrUint(c C.uint) *uint64 { - i := uint64(c) - return &i -} - -func uint64PtrUnsafe(p unsafe.Pointer) *uint64 { - if p == nil { - return nil - } - uintP := (*uint64)(unsafe.Pointer(p)) - val := *uintP - return &val -} - -func toInt64(c C.longlong) int64 { - i := int64(c) - return i -} - -func dblToUint(val C.double) *uint { - i := uint(val) - return &i -} - -func dblToFloat(val C.double) *float64 { - i := float64(val) - return &i -} - -func dblToFloatUnsafe(val unsafe.Pointer) *float64 { - if val == nil { - return nil - } - dblP := (*C.double)(unsafe.Pointer(val)) - floatP := float64(*dblP) - return &floatP -} - -func stringPtr(c *C.char) *string { - s := C.GoString(c) - return &s -} - -func errorString(result C.dcgmReturn_t) error { - if result == C.DCGM_ST_OK { - return nil - } - err := C.GoString(C.errorString(result)) - return fmt.Errorf("%v", err) -} - -func freeCString(cStr *C.char) { - C.free(unsafe.Pointer(cStr)) -} - -func IsInt32Blank(value int) bool { - if value >= dcgmInt32Blank { - return true - } - return false -} - -func IsInt64Blank(value int64) bool { - if value >= dcgmInt64Blank { - return true - } - return false -} - -func blank64(val *int64) *int64 { - if val != nil && IsInt64Blank(*val) { - return nil - } - return val -} - -func blank32(val *uint) *uint { - if val != nil && IsInt32Blank(int(*val)) { - return nil - } - return val -} - -func makeVersion1(struct_type uintptr) C.uint { - version := C.uint(struct_type | 1<<24) - return version -} - -func makeVersion2(struct_type uintptr) C.uint { - version := C.uint(struct_type | 2<<24) - return version -} - -func roundFloat(f *float64) *float64 { - var val float64 - 
if f != nil { - val = math.Round(*f) - } - return &val -} diff --git a/bindings/go/nvml/bindings.go b/bindings/go/nvml/bindings.go deleted file mode 100644 index a650c13e..00000000 --- a/bindings/go/nvml/bindings.go +++ /dev/null @@ -1,859 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package nvml - -/* -#cgo linux LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files -#cgo darwin LDFLAGS: -ldl -Wl,-undefined,dynamic_lookup -#cgo windows LDFLAGS: -LC:/Program\ Files/NVIDIA\ Corporation/NVSMI -lnvml -#include "nvml.h" - -#undef nvmlEventSetWait -nvmlReturn_t DECLDIR nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); -nvmlReturn_t DECLDIR nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); -*/ -import "C" - -import ( - "errors" - "fmt" - "io/ioutil" - "os" - "sort" - "strconv" - "strings" -) - -const ( - szDriver = C.NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE - szName = C.NVML_DEVICE_NAME_BUFFER_SIZE - szUUID = C.NVML_DEVICE_UUID_V2_BUFFER_SIZE - szProcs = 32 - szProcName = 64 - - XidCriticalError = C.nvmlEventTypeXidCriticalError -) - -var nvmlEventSetWait = nvmlEventSetWait_v1 - -type handle struct{ dev C.nvmlDevice_t } -type EventSet struct{ set C.nvmlEventSet_t } -type Event struct { - UUID *string - GpuInstanceId *uint - ComputeInstanceId *uint - Etype uint64 - Edata uint64 -} - -func 
nvmlEventSetWait_v1(Set C.nvmlEventSet_t, Data *C.nvmlEventData_t, Timeoutms C.uint) C.nvmlReturn_t { - return C.nvmlEventSetWait(Set, Data, Timeoutms) -} - -func nvmlEventSetWait_v2(Set C.nvmlEventSet_t, Data *C.nvmlEventData_t, Timeoutms C.uint) C.nvmlReturn_t { - return C.nvmlEventSetWait_v2(Set, Data, Timeoutms) -} - -func uintPtr(c C.uint) *uint { - i := uint(c) - return &i -} - -func uint64Ptr(c C.ulonglong) *uint64 { - i := uint64(c) - return &i -} - -func stringPtr(c *C.char) *string { - s := C.GoString(c) - return &s -} - -func errorString(ret C.nvmlReturn_t) error { - if ret == C.NVML_SUCCESS { - return nil - } - err := C.GoString(C.nvmlErrorString(ret)) - return fmt.Errorf("nvml: %v", err) -} - -func init_() error { - r := dl.nvmlInit() - if r == C.NVML_ERROR_LIBRARY_NOT_FOUND { - return errors.New("could not load NVML library") - } - - found := dl.lookupSymbol("nvmlEventSetWait_v2") - if found == C.NVML_SUCCESS { - nvmlEventSetWait = nvmlEventSetWait_v2 - } - - return errorString(r) -} - -func NewEventSet() EventSet { - var set C.nvmlEventSet_t - C.nvmlEventSetCreate(&set) - - return EventSet{set} -} - -func RegisterEvent(es EventSet, event int) error { - n, err := deviceGetCount() - if err != nil { - return err - } - - var i uint - for i = 0; i < n; i++ { - h, err := deviceGetHandleByIndex(i) - if err != nil { - return err - } - - r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set) - if r != C.NVML_SUCCESS { - return errorString(r) - } - } - - return nil -} - -func RegisterEventForDevice(es EventSet, event int, uuid string) error { - n, err := deviceGetCount() - if err != nil { - return err - } - - var i uint - for i = 0; i < n; i++ { - h, err := deviceGetHandleByIndex(i) - if err != nil { - return err - } - - duuid, err := h.deviceGetUUID() - if err != nil { - return err - } - - if *duuid != uuid { - continue - } - - r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set) - if r != C.NVML_SUCCESS { - return errorString(r) - } 
- - return nil - } - - return fmt.Errorf("nvml: device not found") -} - -func DeleteEventSet(es EventSet) { - C.nvmlEventSetFree(es.set) -} - -func WaitForEvent(es EventSet, timeout uint) (Event, error) { - var data C.nvmlEventData_t - data.gpuInstanceId = 0xFFFFFFFF - data.computeInstanceId = 0xFFFFFFFF - - r := nvmlEventSetWait(es.set, &data, C.uint(timeout)) - if r != C.NVML_SUCCESS { - return Event{}, errorString(r) - } - - uuid, _ := handle{data.device}.deviceGetUUID() - - return Event{ - UUID: uuid, - Etype: uint64(data.eventType), - Edata: uint64(data.eventData), - GpuInstanceId: uintPtr(data.gpuInstanceId), - ComputeInstanceId: uintPtr(data.computeInstanceId), - }, nil -} - -func shutdown() error { - return errorString(dl.nvmlShutdown()) -} - -func systemGetCudaDriverVersion() (*uint, *uint, error) { - var v C.int - - r := C.nvmlSystemGetCudaDriverVersion_v2(&v) - if r != C.NVML_SUCCESS { - return nil, nil, errorString(r) - } - - major := uint(v / 1000) - minor := uint(v % 1000 / 10) - - return &major, &minor, errorString(r) -} - -func systemGetDriverVersion() (string, error) { - var driver [szDriver]C.char - - r := C.nvmlSystemGetDriverVersion(&driver[0], szDriver) - return C.GoString(&driver[0]), errorString(r) -} - -func systemGetProcessName(pid uint) (string, error) { - var proc [szProcName]C.char - - r := C.nvmlSystemGetProcessName(C.uint(pid), &proc[0], szProcName) - return C.GoString(&proc[0]), errorString(r) -} - -func deviceGetCount() (uint, error) { - var n C.uint - - r := C.nvmlDeviceGetCount(&n) - return uint(n), errorString(r) -} - -func deviceGetHandleByIndex(idx uint) (handle, error) { - var dev C.nvmlDevice_t - - r := C.nvmlDeviceGetHandleByIndex(C.uint(idx), &dev) - return handle{dev}, errorString(r) -} - -func deviceGetHandleByUUID(uuid string) (handle, error) { - var dev C.nvmlDevice_t - - r := C.nvmlDeviceGetHandleByUUID(C.CString(uuid), &dev) - return handle{dev}, errorString(r) -} - -func deviceGetTopologyCommonAncestor(h1, h2 handle) 
(*uint, error) { - r := dl.lookupSymbol("nvmlDeviceGetTopologyCommonAncestor") - if r == C.NVML_ERROR_FUNCTION_NOT_FOUND { - return nil, nil - } - - var level C.nvmlGpuTopologyLevel_t - r = C.nvmlDeviceGetTopologyCommonAncestor(h1.dev, h2.dev, &level) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - - return uintPtr(C.uint(level)), errorString(r) -} - -func (h handle) deviceGetCudaComputeCapability() (*int, *int, error) { - var major, minor C.int - - r := C.nvmlDeviceGetCudaComputeCapability(h.dev, &major, &minor) - if r != C.NVML_SUCCESS { - return nil, nil, errorString(r) - } - - intMajor := int(major) - intMinor := int(minor) - - return &intMajor, &intMinor, errorString(r) -} - -func (h handle) deviceGetName() (*string, error) { - var name [szName]C.char - - r := C.nvmlDeviceGetName(h.dev, &name[0], szName) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return stringPtr(&name[0]), errorString(r) -} - -func (h handle) deviceGetIndex() (*uint, error) { - var index C.uint - r := C.nvmlDeviceGetIndex(h.dev, &index) - if r != C.NVML_SUCCESS { - return nil, errorString(r) - } - return uintPtr(index), nil -} - -func (h handle) deviceGetUUID() (*string, error) { - var uuid [szUUID]C.char - - r := C.nvmlDeviceGetUUID(h.dev, &uuid[0], szUUID) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return stringPtr(&uuid[0]), errorString(r) -} - -func (h handle) deviceGetPciInfo() (*string, error) { - var pci C.nvmlPciInfo_t - - r := C.nvmlDeviceGetPciInfo(h.dev, &pci) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return stringPtr(&pci.busId[0]), errorString(r) -} - -func (h handle) deviceGetMinorNumber() (*uint, error) { - var minor C.uint - - r := C.nvmlDeviceGetMinorNumber(h.dev, &minor) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(minor), errorString(r) -} - -func (h handle) deviceGetBAR1MemoryInfo() (*uint64, *uint64, error) { - var bar1 C.nvmlBAR1Memory_t - - r := 
C.nvmlDeviceGetBAR1MemoryInfo(h.dev, &bar1) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - return uint64Ptr(bar1.bar1Total), uint64Ptr(bar1.bar1Used), errorString(r) -} - -func (h handle) deviceGetNvLinkState(link uint) (*uint, error) { - var isActive C.nvmlEnableState_t - - r := C.nvmlDeviceGetNvLinkState(h.dev, C.uint(link), &isActive) - if r == C.NVML_ERROR_NOT_SUPPORTED || r == C.NVML_ERROR_INVALID_ARGUMENT { - return nil, nil - } - - return uintPtr(C.uint(isActive)), errorString(r) -} - -func (h handle) deviceGetNvLinkRemotePciInfo(link uint) (*string, error) { - var pci C.nvmlPciInfo_t - - r := C.nvmlDeviceGetNvLinkRemotePciInfo(h.dev, C.uint(link), &pci) - if r == C.NVML_ERROR_NOT_SUPPORTED || r == C.NVML_ERROR_INVALID_ARGUMENT { - return nil, nil - } - - return stringPtr(&pci.busId[0]), errorString(r) -} - -func (h handle) deviceGetAllNvLinkRemotePciInfo() ([]*string, error) { - busIds := []*string{} - - for i := uint(0); i < C.NVML_NVLINK_MAX_LINKS; i++ { - state, err := h.deviceGetNvLinkState(i) - if err != nil { - return nil, err - } - - if state == nil { - continue - } - - if *state == C.NVML_FEATURE_ENABLED { - pci, err := h.deviceGetNvLinkRemotePciInfo(i) - if err != nil { - return nil, err - } - - if pci == nil { - continue - } - - busIds = append(busIds, pci) - } - } - - return busIds, nil -} - -func (h handle) deviceGetPowerManagementLimit() (*uint, error) { - var power C.uint - - r := C.nvmlDeviceGetPowerManagementLimit(h.dev, &power) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(power), errorString(r) -} - -func (h handle) deviceGetMaxClockInfo() (*uint, *uint, error) { - var sm, mem C.uint - - r := C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_SM, &sm) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - if r == C.NVML_SUCCESS { - r = C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem) - } - return uintPtr(sm), uintPtr(mem), errorString(r) -} - -func (h handle) 
deviceGetMaxPcieLinkGeneration() (*uint, error) { - var link C.uint - - r := C.nvmlDeviceGetMaxPcieLinkGeneration(h.dev, &link) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(link), errorString(r) -} - -func (h handle) deviceGetMaxPcieLinkWidth() (*uint, error) { - var width C.uint - - r := C.nvmlDeviceGetMaxPcieLinkWidth(h.dev, &width) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(width), errorString(r) -} - -func (h handle) deviceGetPowerUsage() (*uint, error) { - var power C.uint - - r := C.nvmlDeviceGetPowerUsage(h.dev, &power) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(power), errorString(r) -} - -func (h handle) deviceGetFanSpeed() (*uint, error) { - var speed C.uint - - r := C.nvmlDeviceGetFanSpeed(h.dev, &speed) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(speed), errorString(r) -} - -func (h handle) deviceGetTemperature() (*uint, error) { - var temp C.uint - - r := C.nvmlDeviceGetTemperature(h.dev, C.NVML_TEMPERATURE_GPU, &temp) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(temp), errorString(r) -} - -func (h handle) deviceGetUtilizationRates() (*uint, *uint, error) { - var usage C.nvmlUtilization_t - - r := C.nvmlDeviceGetUtilizationRates(h.dev, &usage) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - return uintPtr(usage.gpu), uintPtr(usage.memory), errorString(r) -} - -func (h handle) deviceGetEncoderUtilization() (*uint, error) { - var usage, sampling C.uint - - r := C.nvmlDeviceGetEncoderUtilization(h.dev, &usage, &sampling) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(usage), errorString(r) -} - -func (h handle) deviceGetDecoderUtilization() (*uint, error) { - var usage, sampling C.uint - - r := C.nvmlDeviceGetDecoderUtilization(h.dev, &usage, &sampling) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(usage), 
errorString(r) -} - -func (h handle) deviceGetMemoryInfo() (totalMem *uint64, devMem DeviceMemory, err error) { - var mem C.nvmlMemory_t - - r := C.nvmlDeviceGetMemoryInfo(h.dev, &mem) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return - } - - err = errorString(r) - if r != C.NVML_SUCCESS { - return - } - - totalMem = uint64Ptr(mem.total) - if totalMem != nil { - *totalMem /= 1024 * 1024 // MiB - } - - devMem = DeviceMemory{ - Used: uint64Ptr(mem.used), - Free: uint64Ptr(mem.free), - } - - if devMem.Used != nil { - *devMem.Used /= 1024 * 1024 // MiB - } - - if devMem.Free != nil { - *devMem.Free /= 1024 * 1024 // MiB - } - return -} - -func (h handle) deviceGetClockInfo() (*uint, *uint, error) { - var sm, mem C.uint - - r := C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_SM, &sm) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - if r == C.NVML_SUCCESS { - r = C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem) - } - return uintPtr(sm), uintPtr(mem), errorString(r) -} - -func (h handle) deviceGetMemoryErrorCounter() (*uint64, *uint64, *uint64, error) { - var l1, l2, mem C.ulonglong - - r := C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, - C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L1_CACHE, &l1) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil, nil - } - if r == C.NVML_SUCCESS { - r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, - C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L2_CACHE, &l2) - } - if r == C.NVML_SUCCESS { - r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, - C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_DEVICE_MEMORY, &mem) - } - return uint64Ptr(l1), uint64Ptr(l2), uint64Ptr(mem), errorString(r) -} - -func (h handle) deviceGetPcieThroughput() (*uint, *uint, error) { - var rx, tx C.uint - - r := C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_RX_BYTES, &rx) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - 
} - if r == C.NVML_SUCCESS { - r = C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_TX_BYTES, &tx) - } - return uintPtr(rx), uintPtr(tx), errorString(r) -} - -func (h handle) deviceGetComputeRunningProcesses() ([]uint, []uint64, error) { - var procs [szProcs]C.nvmlProcessInfo_t - var count = C.uint(szProcs) - - r := C.nvmlDeviceGetComputeRunningProcesses(h.dev, &count, &procs[0]) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - n := int(count) - pids := make([]uint, n) - mems := make([]uint64, n) - for i := 0; i < n; i++ { - pids[i] = uint(procs[i].pid) - mems[i] = uint64(procs[i].usedGpuMemory) - } - return pids, mems, errorString(r) -} - -func (h handle) deviceGetGraphicsRunningProcesses() ([]uint, []uint64, error) { - var procs [szProcs]C.nvmlProcessInfo_t - var count = C.uint(szProcs) - - r := C.nvmlDeviceGetGraphicsRunningProcesses(h.dev, &count, &procs[0]) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - n := int(count) - pids := make([]uint, n) - mems := make([]uint64, n) - for i := 0; i < n; i++ { - pids[i] = uint(procs[i].pid) - mems[i] = uint64(procs[i].usedGpuMemory) - } - return pids, mems, errorString(r) -} - -func (h handle) deviceGetAllRunningProcesses() ([]ProcessInfo, error) { - cPids, cpMems, err := h.deviceGetComputeRunningProcesses() - if err != nil { - return nil, err - } - - gPids, gpMems, err := h.deviceGetGraphicsRunningProcesses() - if err != nil { - return nil, err - } - - allPids := make(map[uint]ProcessInfo) - - for i, pid := range cPids { - name, err := processName(pid) - if err != nil { - return nil, err - } - allPids[pid] = ProcessInfo{ - PID: pid, - Name: name, - MemoryUsed: cpMems[i] / (1024 * 1024), // MiB - Type: Compute, - } - - } - - for i, pid := range gPids { - pInfo, exists := allPids[pid] - if exists { - pInfo.Type = ComputeAndGraphics - allPids[pid] = pInfo - } else { - name, err := processName(pid) - if err != nil { - return nil, err - } - allPids[pid] = ProcessInfo{ - PID: pid, - 
Name: name, - MemoryUsed: gpMems[i] / (1024 * 1024), // MiB - Type: Graphics, - } - } - } - - var processInfo []ProcessInfo - for _, v := range allPids { - processInfo = append(processInfo, v) - } - sort.Slice(processInfo, func(i, j int) bool { - return processInfo[i].PID < processInfo[j].PID - }) - - return processInfo, nil -} - -func (h handle) getClocksThrottleReasons() (reason ThrottleReason, err error) { - var clocksThrottleReasons C.ulonglong - - r := C.nvmlDeviceGetCurrentClocksThrottleReasons(h.dev, &clocksThrottleReasons) - - if r == C.NVML_ERROR_NOT_SUPPORTED { - return ThrottleReasonUnknown, nil - } - - if r != C.NVML_SUCCESS { - return ThrottleReasonUnknown, errorString(r) - } - - switch clocksThrottleReasons { - case C.nvmlClocksThrottleReasonGpuIdle: - reason = ThrottleReasonGpuIdle - case C.nvmlClocksThrottleReasonApplicationsClocksSetting: - reason = ThrottleReasonApplicationsClocksSetting - case C.nvmlClocksThrottleReasonSwPowerCap: - reason = ThrottleReasonSwPowerCap - case C.nvmlClocksThrottleReasonHwSlowdown: - reason = ThrottleReasonHwSlowdown - case C.nvmlClocksThrottleReasonSyncBoost: - reason = ThrottleReasonSyncBoost - case C.nvmlClocksThrottleReasonSwThermalSlowdown: - reason = ThrottleReasonSwThermalSlowdown - case C.nvmlClocksThrottleReasonHwThermalSlowdown: - reason = ThrottleReasonHwThermalSlowdown - case C.nvmlClocksThrottleReasonHwPowerBrakeSlowdown: - reason = ThrottleReasonHwPowerBrakeSlowdown - case C.nvmlClocksThrottleReasonDisplayClockSetting: - reason = ThrottleReasonDisplayClockSetting - case C.nvmlClocksThrottleReasonNone: - reason = ThrottleReasonNone - } - return -} - -func (h handle) getPerformanceState() (PerfState, error) { - var pstate C.nvmlPstates_t - - r := C.nvmlDeviceGetPerformanceState(h.dev, &pstate) - - if r == C.NVML_ERROR_NOT_SUPPORTED { - return PerfStateUnknown, nil - } - - if r != C.NVML_SUCCESS { - return PerfStateUnknown, errorString(r) - } - return PerfState(pstate), nil -} - -func processName(pid uint) 
(string, error) { - f := `/proc/` + strconv.FormatUint(uint64(pid), 10) + `/comm` - d, err := ioutil.ReadFile(f) - - if err != nil { - // TOCTOU: process terminated - if os.IsNotExist(err) { - return "", nil - } - return "", err - } - return strings.TrimSuffix(string(d), "\n"), err -} - -func (h handle) getAccountingInfo() (accountingInfo Accounting, err error) { - var mode C.nvmlEnableState_t - var buffer C.uint - - r := C.nvmlDeviceGetAccountingMode(h.dev, &mode) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return - } - - if r != C.NVML_SUCCESS { - return accountingInfo, errorString(r) - } - - r = C.nvmlDeviceGetAccountingBufferSize(h.dev, &buffer) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return - } - - if r != C.NVML_SUCCESS { - return accountingInfo, errorString(r) - } - - accountingInfo = Accounting{ - Mode: ModeState(mode), - BufferSize: uintPtr(buffer), - } - return -} - -func (h handle) getDisplayInfo() (display Display, err error) { - var mode, isActive C.nvmlEnableState_t - - r := C.nvmlDeviceGetDisplayActive(h.dev, &mode) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return - } - - if r != C.NVML_SUCCESS { - return display, errorString(r) - } - - r = C.nvmlDeviceGetDisplayMode(h.dev, &isActive) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return - } - if r != C.NVML_SUCCESS { - return display, errorString(r) - } - display = Display{ - Mode: ModeState(mode), - Active: ModeState(isActive), - } - return -} - -func (h handle) getPeristenceMode() (state ModeState, err error) { - var mode C.nvmlEnableState_t - - r := C.nvmlDeviceGetPersistenceMode(h.dev, &mode) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return - } - return ModeState(mode), errorString(r) -} - -func (h *handle) isMigEnabled() (bool, error) { - ret := dl.lookupSymbol("nvmlDeviceGetMigMode") - if ret != C.NVML_SUCCESS { - return false, nil - } - - var cm, pm C.uint - ret = C.nvmlDeviceGetMigMode(h.dev, &cm, &pm) - if ret == C.NVML_ERROR_NOT_SUPPORTED { - return false, nil - } - if ret != C.NVML_SUCCESS { - 
return false, errorString(ret) - } - - return (cm == C.NVML_DEVICE_MIG_ENABLE) && (cm == pm), nil -} - -func (h *handle) getMigDevices() ([]handle, error) { - ret := dl.lookupSymbol("nvmlDeviceGetMaxMigDeviceCount") - if ret != C.NVML_SUCCESS { - return nil, errorString(ret) - } - - var c C.uint - ret = C.nvmlDeviceGetMaxMigDeviceCount(h.dev, &c) - if ret != C.NVML_SUCCESS { - return nil, errorString(ret) - } - - ret = dl.lookupSymbol("nvmlDeviceGetMigDeviceHandleByIndex") - if ret != C.NVML_SUCCESS { - return nil, errorString(ret) - } - - var handles []handle - for i := 0; i < int(c); i++ { - var mig C.nvmlDevice_t - ret := C.nvmlDeviceGetMigDeviceHandleByIndex(h.dev, C.uint(i), &mig) - if ret == C.NVML_ERROR_NOT_FOUND { - continue - } - if ret != C.NVML_SUCCESS { - return nil, errorString(ret) - } - - handles = append(handles, handle{mig}) - } - - return handles, nil -} - -func (h *handle) deviceGetDeviceHandleFromMigDeviceHandle() (handle, error) { - ret := dl.lookupSymbol("nvmlDeviceGetDeviceHandleFromMigDeviceHandle") - if ret != C.NVML_SUCCESS { - return handle{}, errorString(ret) - } - - var parent C.nvmlDevice_t - ret = C.nvmlDeviceGetDeviceHandleFromMigDeviceHandle(h.dev, &parent) - if ret != C.NVML_SUCCESS { - return handle{}, errorString(ret) - } - - return handle{parent}, nil -} diff --git a/bindings/go/nvml/mig.go b/bindings/go/nvml/mig.go deleted file mode 100644 index 446ef2ef..00000000 --- a/bindings/go/nvml/mig.go +++ /dev/null @@ -1,423 +0,0 @@ -// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
- -package nvml - -import ( - "unsafe" -) - -// #include "nvml.h" -import "C" - -// Enable or disable MIG mode -const ( - DEVICE_MIG_DISABLE = C.NVML_DEVICE_MIG_DISABLE - DEVICE_MIG_ENABLE = C.NVML_DEVICE_MIG_ENABLE -) - -// GPU Instance Profiles -const ( - GPU_INSTANCE_PROFILE_1_SLICE = C.NVML_GPU_INSTANCE_PROFILE_1_SLICE - GPU_INSTANCE_PROFILE_2_SLICE = C.NVML_GPU_INSTANCE_PROFILE_2_SLICE - GPU_INSTANCE_PROFILE_3_SLICE = C.NVML_GPU_INSTANCE_PROFILE_3_SLICE - GPU_INSTANCE_PROFILE_4_SLICE = C.NVML_GPU_INSTANCE_PROFILE_4_SLICE - GPU_INSTANCE_PROFILE_7_SLICE = C.NVML_GPU_INSTANCE_PROFILE_7_SLICE - GPU_INSTANCE_PROFILE_COUNT = C.NVML_GPU_INSTANCE_PROFILE_COUNT -) - -// Compute Instance Profiles -const ( - COMPUTE_INSTANCE_PROFILE_1_SLICE = C.NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE - COMPUTE_INSTANCE_PROFILE_2_SLICE = C.NVML_COMPUTE_INSTANCE_PROFILE_2_SLICE - COMPUTE_INSTANCE_PROFILE_3_SLICE = C.NVML_COMPUTE_INSTANCE_PROFILE_3_SLICE - COMPUTE_INSTANCE_PROFILE_4_SLICE = C.NVML_COMPUTE_INSTANCE_PROFILE_4_SLICE - COMPUTE_INSTANCE_PROFILE_7_SLICE = C.NVML_COMPUTE_INSTANCE_PROFILE_7_SLICE - COMPUTE_INSTANCE_PROFILE_COUNT = C.NVML_COMPUTE_INSTANCE_PROFILE_COUNT -) - -// Compute Instance Engine Profiles -const ( - COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED = C.NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED - COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT = C.NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT -) - -// Opaque GPUInstance type -type GPUInstance struct { - handle C.nvmlGpuInstance_t - device *Device -} - -// type GPUInstancePlacement C.nvmlGpuInstancePlacement_t -// Generated using `go tool cgo -godefs mig.go` -type GPUInstancePlacement struct { - Start uint32 - Size uint32 -} - -// type GPUInstanceProfileInfo C.nvmlGpuInstanceProfileInfo_t -// Generated using `go tool cgo -godefs mig.go` -type GPUInstanceProfileInfo struct { - ID uint32 - IsP2pSupported uint32 - SliceCount uint32 - InstanceCount uint32 - MultiprocessorCount uint32 - CopyEngineCount uint32 - DecoderCount uint32 - 
EncoderCount uint32 - JpegCount uint32 - OfaCount uint32 - MemorySizeMB uint64 -} - -// type GPUInstanceInfo_t C.nvmlGpuInstanceInfo_t -// Generated using `go tool cgo -godefs mig.go` -type GPUInstanceInfo struct { - Device *Device - ID uint32 - ProfileID uint32 - Placement GPUInstancePlacement -} - -// Opaque ComputeInstance type -type ComputeInstance struct { - handle C.nvmlComputeInstance_t - gpuInstance GPUInstance -} - -// type ComputeInstanceProfileInfo C.nvmlComputeInstanceProfileInfo_t -// Generated using `go tool cgo -godefs mig.go` -type ComputeInstanceProfileInfo struct { - ID uint32 - SliceCount uint32 - InstanceCount uint32 - MultiprocessorCount uint32 - SharedCopyEngineCount uint32 - SharedDecoderCount uint32 - SharedEncoderCount uint32 - SharedJpegCount uint32 - SharedOfaCount uint32 -} - -// type ComputeInstanceInfo C.nvmlComputeInstanceInfo_t -// Generated using `go tool cgo -godefs mig.go` -type ComputeInstanceInfo struct { - Device *Device - GPUInstance GPUInstance - ID uint32 - ProfileID uint32 -} - -// type DeviceAttributes C.nvmlDeviceAttributes_t -// Generated using `go tool cgo -godefs mig.go` -type DeviceAttributes struct { - MultiprocessorCount uint32 - SharedCopyEngineCount uint32 - SharedDecoderCount uint32 - SharedEncoderCount uint32 - SharedJpegCount uint32 - SharedOfaCount uint32 - GpuInstanceSliceCount uint32 - ComputeInstanceSliceCount uint32 - MemorySizeMB uint64 -} - -// Device.SetMigMode() -func (d *Device) SetMigMode(mode int) (activationStatus error, err error) { - ret := dl.lookupSymbol("nvmlDeviceSetMigMode") - if ret != C.NVML_SUCCESS { - return nil, errorString(ret) - } - - var as C.nvmlReturn_t - ret = C.nvmlDeviceSetMigMode(d.handle.dev, C.uint(mode), &as) - return errorString(as), errorString(ret) -} - -// Device.GetMigMode() -func (d *Device) GetMigMode() (currentMode, pendingMode int, err error) { - ret := dl.lookupSymbol("nvmlDeviceGetMigMode") - if ret != C.NVML_SUCCESS { - return 0, 0, errorString(ret) - } - - var 
cm, pm C.uint - ret = C.nvmlDeviceGetMigMode(d.handle.dev, &cm, &pm) - return int(cm), int(pm), errorString(ret) -} - -// Device.GetGPUInstanceProfileInfo() -func (d *Device) GetGPUInstanceProfileInfo(profile int) (profileInfo GPUInstanceProfileInfo, err error) { - ret := dl.lookupSymbol("nvmlDeviceGetGpuInstanceProfileInfo") - if ret != C.NVML_SUCCESS { - return GPUInstanceProfileInfo{}, errorString(ret) - } - - var pi C.nvmlGpuInstanceProfileInfo_t - ret = C.nvmlDeviceGetGpuInstanceProfileInfo(d.handle.dev, C.uint(profile), &pi) - return *(*GPUInstanceProfileInfo)(unsafe.Pointer(&pi)), errorString(ret) -} - -// Device.GetGPUInstancePossiblePlacements() -func (d *Device) GetGPUInstancePossiblePlacements(profileInfo *GPUInstanceProfileInfo) (placement GPUInstancePlacement, count int, err error) { - ret := dl.lookupSymbol("nvmlDeviceGetGpuInstancePossiblePlacements") - if ret != C.NVML_SUCCESS { - return GPUInstancePlacement{}, 0, errorString(ret) - } - - var pi C.nvmlGpuInstancePlacement_t - var c C.uint - ret = C.nvmlDeviceGetGpuInstancePossiblePlacements(d.handle.dev, C.uint(profileInfo.ID), &pi, &c) - return *(*GPUInstancePlacement)(unsafe.Pointer(&pi)), int(c), errorString(ret) -} - -// Device.GPUInstanceRemainingCapacity() -func (d *Device) GPUInstanceRemainingCapacity(profileInfo *GPUInstanceProfileInfo) (count int, err error) { - ret := dl.lookupSymbol("nvmlDeviceGetGpuInstanceRemainingCapacity") - if ret != C.NVML_SUCCESS { - return 0, errorString(ret) - } - - var c C.uint - ret = C.nvmlDeviceGetGpuInstanceRemainingCapacity(d.handle.dev, C.uint(profileInfo.ID), &c) - return int(c), errorString(ret) -} - -// Device.CreateGPUInstance() -func (d *Device) CreateGPUInstance(profileInfo *GPUInstanceProfileInfo) (gpuInstance GPUInstance, err error) { - ret := dl.lookupSymbol("nvmlDeviceCreateGpuInstance") - if ret != C.NVML_SUCCESS { - return GPUInstance{}, errorString(ret) - } - - var gi C.nvmlGpuInstance_t - ret = C.nvmlDeviceCreateGpuInstance(d.handle.dev, 
C.uint(profileInfo.ID), &gi) - return GPUInstance{gi, d}, errorString(ret) -} - -// GPUInstance.Destroy() -func (g *GPUInstance) Destroy() (err error) { - ret := dl.lookupSymbol("nvmlGpuInstanceDestroy") - if ret != C.NVML_SUCCESS { - return errorString(ret) - } - - ret = C.nvmlGpuInstanceDestroy(g.handle) - return errorString(ret) -} - -// Device.GetGPUInstances() -func (d *Device) GetGPUInstances(profileInfo *GPUInstanceProfileInfo) (gpuInstances []GPUInstance, err error) { - ret := dl.lookupSymbol("nvmlDeviceGetGpuInstances") - if ret != C.NVML_SUCCESS { - return nil, errorString(ret) - } - - gis := make([]C.nvmlGpuInstance_t, profileInfo.InstanceCount) - var c C.uint - ret = C.nvmlDeviceGetGpuInstances(d.handle.dev, C.uint(profileInfo.ID), &gis[0], &c) - for i := 0; i < int(c); i++ { - gpuInstances = append(gpuInstances, GPUInstance{gis[i], d}) - } - return gpuInstances, errorString(ret) -} - -// Device.GetGPUInstanceByID() -func (d *Device) GetGPUInstanceByID(id int) (gpuInstance GPUInstance, err error) { - ret := dl.lookupSymbol("nvmlDeviceGetGpuInstanceById") - if ret != C.NVML_SUCCESS { - return GPUInstance{}, errorString(ret) - } - - var gi C.nvmlGpuInstance_t - ret = C.nvmlDeviceGetGpuInstanceById(d.handle.dev, C.uint(id), &gi) - return GPUInstance{gi, d}, errorString(ret) -} - -// GPUInstance.GetInfo() -func (g *GPUInstance) GetInfo() (info GPUInstanceInfo, err error) { - ret := dl.lookupSymbol("nvmlGpuInstanceGetInfo") - if ret != C.NVML_SUCCESS { - return GPUInstanceInfo{}, errorString(ret) - } - - var gii C.nvmlGpuInstanceInfo_t - ret = C.nvmlGpuInstanceGetInfo(g.handle, &gii) - info = *(*GPUInstanceInfo)(unsafe.Pointer(&gii)) - info.Device = g.device - return info, errorString(ret) -} - -// GPUInstance.GetComputeInstanceProfileInfo() -func (g *GPUInstance) GetComputeInstanceProfileInfo(profile int, engProfile int) (profileInfo ComputeInstanceProfileInfo, err error) { - ret := dl.lookupSymbol("nvmlGpuInstanceGetComputeInstanceProfileInfo") - if ret != 
C.NVML_SUCCESS { - return ComputeInstanceProfileInfo{}, errorString(ret) - } - - var pi C.nvmlComputeInstanceProfileInfo_t - ret = C.nvmlGpuInstanceGetComputeInstanceProfileInfo(g.handle, C.uint(profile), C.uint(engProfile), &pi) - return *(*ComputeInstanceProfileInfo)(unsafe.Pointer(&pi)), errorString(ret) -} - -// GPUInstance.ComputeInstanceRemainingCapacity() -func (g *GPUInstance) ComputeInstanceRemainingCapacity(profileInfo *GPUInstanceProfileInfo) (count int, err error) { - ret := dl.lookupSymbol("nvmlGpuInstanceGetComputeInstanceRemainingCapacity") - if ret != C.NVML_SUCCESS { - return 0, errorString(ret) - } - - var c C.uint - ret = C.nvmlGpuInstanceGetComputeInstanceRemainingCapacity(g.handle, C.uint(profileInfo.ID), &c) - return int(c), errorString(ret) -} - -// GPUInstance.CreateComputeInstance() -func (g *GPUInstance) CreateComputeInstance(profileInfo *ComputeInstanceProfileInfo) (computeInstance ComputeInstance, err error) { - ret := dl.lookupSymbol("nvmlGpuInstanceCreateComputeInstance") - if ret != C.NVML_SUCCESS { - return ComputeInstance{}, errorString(ret) - } - - var ci C.nvmlComputeInstance_t - ret = C.nvmlGpuInstanceCreateComputeInstance(g.handle, C.uint(profileInfo.ID), &ci) - return ComputeInstance{ci, *g}, errorString(ret) -} - -// ComputeInstance.Destroy() -func (c *ComputeInstance) Destroy() (err error) { - ret := dl.lookupSymbol("nvmlComputeInstanceDestroy") - if ret != C.NVML_SUCCESS { - return errorString(ret) - } - - ret = C.nvmlComputeInstanceDestroy(c.handle) - return errorString(ret) -} - -// GPUInstance.GetComputeInstances() -func (g *GPUInstance) GetComputeInstances(profileInfo *ComputeInstanceProfileInfo) (computeInstances []ComputeInstance, err error) { - ret := dl.lookupSymbol("nvmlGpuInstanceGetComputeInstances") - if ret != C.NVML_SUCCESS { - return nil, errorString(ret) - } - - cis := make([]C.nvmlComputeInstance_t, profileInfo.InstanceCount) - var c C.uint - ret = C.nvmlGpuInstanceGetComputeInstances(g.handle, 
C.uint(profileInfo.ID), &cis[0], &c) - for i := 0; i < int(c); i++ { - computeInstances = append(computeInstances, ComputeInstance{cis[i], *g}) - } - return computeInstances, errorString(ret) -} - -// GPUInstance.GetComputeInstanceByID() -func (g *GPUInstance) GetComputeInstanceByID(id int) (computeInstance ComputeInstance, err error) { - ret := dl.lookupSymbol("nvmlGpuInstanceGetComputeInstanceById") - if ret != C.NVML_SUCCESS { - return ComputeInstance{}, errorString(ret) - } - - var ci C.nvmlComputeInstance_t - ret = C.nvmlGpuInstanceGetComputeInstanceById(g.handle, C.uint(id), &ci) - return ComputeInstance{ci, *g}, errorString(ret) -} - -// ComputeInstance.GetInfo() -func (c *ComputeInstance) GetInfo() (info ComputeInstanceInfo, err error) { - ret := dl.lookupSymbol("nvmlComputeInstanceGetInfo") - if ret != C.NVML_SUCCESS { - return ComputeInstanceInfo{}, errorString(ret) - } - - var cii C.nvmlComputeInstanceInfo_t - ret = C.nvmlComputeInstanceGetInfo(c.handle, &cii) - info = *(*ComputeInstanceInfo)(unsafe.Pointer(&cii)) - info.Device = c.gpuInstance.device - info.GPUInstance = c.gpuInstance - return info, errorString(ret) -} - -// Device.IsMigDeviceHandle() -func (d *Device) IsMigDeviceHandle() (isMigDevice bool, err error) { - ret := dl.lookupSymbol("nvmlDeviceIsMigDeviceHandle") - if ret != C.NVML_SUCCESS { - return false, errorString(ret) - } - - var is C.uint - ret = C.nvmlDeviceIsMigDeviceHandle(d.handle.dev, &is) - return (is != 0), errorString(ret) -} - -// Device.GetGPUInstanceId() -func (d *Device) GetGPUInstanceId() (id int, err error) { - ret := dl.lookupSymbol("nvmlDeviceGetGpuInstanceId") - if ret != C.NVML_SUCCESS { - return 0, errorString(ret) - } - - var gi C.uint - ret = C.nvmlDeviceGetGpuInstanceId(d.handle.dev, &gi) - return int(gi), errorString(ret) -} - -// Device.GetComputeInstanceId() -func (d *Device) GetComputeInstanceId() (id int, err error) { - ret := dl.lookupSymbol("nvmlDeviceGetComputeInstanceId") - if ret != C.NVML_SUCCESS { - 
return 0, errorString(ret) - } - - var ci C.uint - ret = C.nvmlDeviceGetComputeInstanceId(d.handle.dev, &ci) - return int(ci), errorString(ret) -} - -// Device.GetMaxMigDeviceCount() -func (d *Device) GetMaxMigDeviceCount() (count int, err error) { - ret := dl.lookupSymbol("nvmlDeviceGetMaxMigDeviceCount") - if ret != C.NVML_SUCCESS { - return 0, errorString(ret) - } - - var c C.uint - ret = C.nvmlDeviceGetMaxMigDeviceCount(d.handle.dev, &c) - return int(c), errorString(ret) -} - -// Device.GetMigDeviceHandleByIndex() -func (d *Device) GetMigDeviceHandleByIndex(index int) (migDevice *Device, err error) { - ret := dl.lookupSymbol("nvmlDeviceGetMigDeviceHandleByIndex") - if ret != C.NVML_SUCCESS { - return nil, errorString(ret) - } - - var m C.nvmlDevice_t - ret = C.nvmlDeviceGetMigDeviceHandleByIndex(d.handle.dev, C.uint(index), &m) - return &Device{handle: handle{m}}, errorString(ret) -} - -// Device.GetMigDeviceHandleByIndex() -func (d *Device) GetDeviceHandleFromMigDeviceHandle() (device *Device, err error) { - ret := dl.lookupSymbol("nvmlDeviceGetDeviceHandleFromMigDeviceHandle") - if ret != C.NVML_SUCCESS { - return nil, errorString(ret) - } - - var parent C.nvmlDevice_t - ret = C.nvmlDeviceGetDeviceHandleFromMigDeviceHandle(d.handle.dev, &parent) - return &Device{handle: handle{parent}}, errorString(ret) -} - -// Device.GetAttributes() -func (d *Device) GetAttributes() (attr DeviceAttributes, err error) { - ret := dl.lookupSymbol("nvmlDeviceGetAttributes") - if ret != C.NVML_SUCCESS { - return DeviceAttributes{}, errorString(ret) - } - - var a C.nvmlDeviceAttributes_t - ret = C.nvmlDeviceGetAttributes(d.handle.dev, &a) - return *(*DeviceAttributes)(unsafe.Pointer(&a)), errorString(ret) -} diff --git a/bindings/go/nvml/mig_test.go b/bindings/go/nvml/mig_test.go deleted file mode 100644 index 74d3b50e..00000000 --- a/bindings/go/nvml/mig_test.go +++ /dev/null @@ -1,122 +0,0 @@ -package nvml - -import ( - "testing" -) - -func TestSetMigMode(t *testing.T) { - // 
Initialize NVML - err := Init() - if err != nil { - t.Errorf("%v", err) - } - defer Shutdown() - - // Grab a reference to our first device - device, err := NewDevice(0) - if err != nil { - t.Errorf("%v", err) - } - - // Disable MIG on the device - _, err = device.SetMigMode(DEVICE_MIG_DISABLE) - if err != nil { - t.Errorf("error enabling MIG mode on Device: %v", err) - } - - // Ensure MIG Mode is disabled on the device - current, pending, err := device.GetMigMode() - if err != nil { - t.Errorf("error getting MIG mode on Device: %v", err) - } - if current != pending || current != DEVICE_MIG_DISABLE { - t.Errorf("Expected MIG mode on Device to be DEVICE_MIG_DISABLE, got (current %v, pending %v)", current, pending) - } - - // Enable MIG on the device - _, err = device.SetMigMode(DEVICE_MIG_ENABLE) - if err != nil { - t.Errorf("error enabling MIG mode on Device: %v", err) - } - - // Ensure MIG Mode is enabled on the device - current, pending, err = device.GetMigMode() - if err != nil { - t.Errorf("error getting MIG mode on Device: %v", err) - } - if current != pending || current != DEVICE_MIG_ENABLE { - t.Errorf("Expected MIG mode on Device to be DEVICE_MIG_ENABLE, got (current %v, pending %v)", current, pending) - } - - // Disable MIG on the device - _, err = device.SetMigMode(DEVICE_MIG_DISABLE) - if err != nil { - t.Errorf("error enabling MIG mode on Device: %v", err) - } - - // Ensure MIG Mode is disabled on the device - current, pending, err = device.GetMigMode() - if err != nil { - t.Errorf("error getting MIG mode on Device: %v", err) - } - if current != pending || current != DEVICE_MIG_DISABLE { - t.Errorf("Expected MIG mode on Device to be DEVICE_MIG_DISABLE, got (current %v, pending %v)", current, pending) - } -} - -func TestParseMigDeviceUUID(t *testing.T) { - tests := []struct { - name string - uuid string - expectedGPU string - expectedGi uint - expectedCi uint - expectedError bool - }{ - { - name: "Successfull Parsing", - uuid: 
"MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5", - expectedGPU: "GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5", - expectedGi: 1, - expectedCi: 5, - }, - { - name: "Fail, Missing MIG at the beginning of UUID", - uuid: "GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5", - expectedError: true, - }, - { - name: "Fail, Missing GPU at the beginning of GPU UUID", - uuid: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5", - expectedError: true, - }, - { - name: "Fail, GI not parsable", - uuid: "MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/xx/5", - expectedError: true, - }, - { - name: "Fail, CI not a parsable", - uuid: "MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/xx", - expectedError: true, - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - gpu, gi, ci, err := ParseMigDeviceUUID(tc.uuid) - if tc.expectedError && err != nil { - return - } - if tc.expectedError && err == nil { - t.Fatalf("Expected an error, but didn't get one: uuid: %v, (gpu: %v, gi: %v, ci: %v)", tc.uuid, gpu, gi, ci) - } - if !tc.expectedError && err != nil { - t.Fatalf("Unexpected error: %v, uuid: %v, (gpu: %v, gi: %v, ci: %v)", err, tc.uuid, gpu, gi, ci) - } - if gpu != tc.expectedGPU || gi != tc.expectedGi || ci != tc.expectedCi { - t.Fatalf("MIG UUID parsed incorrectly: uuid: %v, (gpu: %v, gi: %v, ci: %v)", tc.uuid, gpu, gi, ci) - } - }) - } -} diff --git a/bindings/go/nvml/nvml.go b/bindings/go/nvml/nvml.go deleted file mode 100644 index c7ab5a10..00000000 --- a/bindings/go/nvml/nvml.go +++ /dev/null @@ -1,822 +0,0 @@ -// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
- -package nvml - -// #include "nvml.h" -import "C" - -import ( - "bytes" - "errors" - "fmt" - "io/ioutil" - "runtime" - "strconv" - "strings" -) - -var ( - ErrCPUAffinity = errors.New("failed to retrieve CPU affinity") - ErrUnsupportedP2PLink = errors.New("unsupported P2P link type") - ErrUnsupportedGPU = errors.New("unsupported GPU device") -) - -type ModeState uint - -const ( - Disabled ModeState = iota - Enabled -) - -func (m ModeState) String() string { - switch m { - case Enabled: - return "Enabled" - case Disabled: - return "Disabled" - } - return "N/A" -} - -type Display struct { - Mode ModeState - Active ModeState -} - -type Accounting struct { - Mode ModeState - BufferSize *uint -} - -type DeviceMode struct { - DisplayInfo Display - Persistence ModeState - AccountingInfo Accounting -} - -type ThrottleReason uint - -const ( - ThrottleReasonGpuIdle ThrottleReason = iota - ThrottleReasonApplicationsClocksSetting - ThrottleReasonSwPowerCap - ThrottleReasonHwSlowdown - ThrottleReasonSyncBoost - ThrottleReasonSwThermalSlowdown - ThrottleReasonHwThermalSlowdown - ThrottleReasonHwPowerBrakeSlowdown - ThrottleReasonDisplayClockSetting - ThrottleReasonNone - ThrottleReasonUnknown -) - -func (r ThrottleReason) String() string { - switch r { - case ThrottleReasonGpuIdle: - return "Gpu Idle" - case ThrottleReasonApplicationsClocksSetting: - return "Applications Clocks Setting" - case ThrottleReasonSwPowerCap: - return "SW Power Cap" - case ThrottleReasonHwSlowdown: - return "HW Slowdown" - case ThrottleReasonSyncBoost: - return "Sync Boost" - case ThrottleReasonSwThermalSlowdown: - return "SW Thermal Slowdown" - case ThrottleReasonHwThermalSlowdown: - return "HW Thermal Slowdown" - case ThrottleReasonHwPowerBrakeSlowdown: - return "HW Power Brake Slowdown" - case ThrottleReasonDisplayClockSetting: - return "Display Clock Setting" - case ThrottleReasonNone: - return "No clocks throttling" - } - return "N/A" -} - -type PerfState uint - -const ( - PerfStateMax = 0 - 
PerfStateMin = 15 - PerfStateUnknown = 32 -) - -func (p PerfState) String() string { - if p >= PerfStateMax && p <= PerfStateMin { - return fmt.Sprintf("P%d", p) - } - return "Unknown" -} - -type ProcessType uint - -const ( - Compute ProcessType = iota - Graphics - ComputeAndGraphics -) - -func (t ProcessType) String() string { - typ := "C+G" - if t == Compute { - typ = "C" - } else if t == Graphics { - typ = "G" - } - return typ -} - -type P2PLinkType uint - -const ( - P2PLinkUnknown P2PLinkType = iota - P2PLinkCrossCPU - P2PLinkSameCPU - P2PLinkHostBridge - P2PLinkMultiSwitch - P2PLinkSingleSwitch - P2PLinkSameBoard - SingleNVLINKLink - TwoNVLINKLinks - ThreeNVLINKLinks - FourNVLINKLinks - FiveNVLINKLinks - SixNVLINKLinks - SevenNVLINKLinks - EightNVLINKLinks - NineNVLINKLinks - TenNVLINKLinks - ElevenNVLINKLinks - TwelveNVLINKLinks -) - -type P2PLink struct { - BusID string - Link P2PLinkType -} - -func (t P2PLinkType) String() string { - switch t { - case P2PLinkCrossCPU: - return "Cross CPU socket" - case P2PLinkSameCPU: - return "Same CPU socket" - case P2PLinkHostBridge: - return "Host PCI bridge" - case P2PLinkMultiSwitch: - return "Multiple PCI switches" - case P2PLinkSingleSwitch: - return "Single PCI switch" - case P2PLinkSameBoard: - return "Same board" - case SingleNVLINKLink: - return "Single NVLink" - case TwoNVLINKLinks: - return "Two NVLinks" - case ThreeNVLINKLinks: - return "Three NVLinks" - case FourNVLINKLinks: - return "Four NVLinks" - case FiveNVLINKLinks: - return "Five NVLinks" - case SixNVLINKLinks: - return "Six NVLinks" - case SevenNVLINKLinks: - return "Seven NVLinks" - case EightNVLINKLinks: - return "Eight NVLinks" - case NineNVLINKLinks: - return "Nine NVLinks" - case TenNVLINKLinks: - return "Ten NVLinks" - case ElevenNVLINKLinks: - return "Eleven NVLinks" - case TwelveNVLINKLinks: - return "Twelve NVLinks" - case P2PLinkUnknown: - } - return "N/A" -} - -type ClockInfo struct { - Cores *uint - Memory *uint -} - -type PCIInfo struct 
{ - BusID string - BAR1 *uint64 - Bandwidth *uint -} - -type CudaComputeCapabilityInfo struct { - Major *int - Minor *int -} - -type Device struct { - handle - - UUID string - Path string - Model *string - Power *uint - Memory *uint64 - CPUAffinity *uint - PCI PCIInfo - Clocks ClockInfo - Topology []P2PLink - CudaComputeCapability CudaComputeCapabilityInfo -} - -type UtilizationInfo struct { - GPU *uint - Memory *uint - Encoder *uint - Decoder *uint -} - -type PCIThroughputInfo struct { - RX *uint - TX *uint -} - -type PCIStatusInfo struct { - BAR1Used *uint64 - Throughput PCIThroughputInfo -} - -type ECCErrorsInfo struct { - L1Cache *uint64 - L2Cache *uint64 - Device *uint64 -} - -type DeviceMemory struct { - Used *uint64 - Free *uint64 -} - -type MemoryInfo struct { - Global DeviceMemory - ECCErrors ECCErrorsInfo -} - -type ProcessInfo struct { - PID uint - Name string - MemoryUsed uint64 - Type ProcessType -} - -type DeviceStatus struct { - Power *uint - FanSpeed *uint - Temperature *uint - Utilization UtilizationInfo - Memory MemoryInfo - Clocks ClockInfo - PCI PCIStatusInfo - Processes []ProcessInfo - Throttle ThrottleReason - Performance PerfState -} - -func assert(err error) { - if err != nil { - panic(err) - } -} - -func Init() error { - return init_() -} - -func Shutdown() error { - return shutdown() -} - -func GetDeviceCount() (uint, error) { - return deviceGetCount() -} - -func GetDriverVersion() (string, error) { - return systemGetDriverVersion() -} - -func GetCudaDriverVersion() (*uint, *uint, error) { - return systemGetCudaDriverVersion() -} - -func numaNode(busid string) (*uint, error) { - // discard leading zeros of busid - b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", strings.ToLower(busid[4:]))) - if err != nil { - // XXX report nil if NUMA support isn't enabled - return nil, nil - } - node, err := strconv.ParseInt(string(bytes.TrimSpace(b)), 10, 8) - if err != nil { - return nil, fmt.Errorf("%v: %v", ErrCPUAffinity, 
err) - } - if node < 0 { - // XXX report nil instead of NUMA_NO_NODE - return nil, nil - } - - numaNode := uint(node) - return &numaNode, nil -} - -func pciBandwidth(gen, width *uint) *uint { - m := map[uint]uint{ - 1: 250, // MB/s - 2: 500, - 3: 985, - 4: 1969, - } - if gen == nil || width == nil { - return nil - } - bw := m[*gen] * *width - return &bw -} - -func NewDevice(idx uint) (device *Device, err error) { - defer func() { - if r := recover(); r != nil { - err = r.(error) - } - }() - - h, err := deviceGetHandleByIndex(idx) - assert(err) - - device, err = newDevice(h) - assert(err) - - return device, err -} - -func NewDeviceByUUID(uuid string) (device *Device, err error) { - defer func() { - if r := recover(); r != nil { - err = r.(error) - } - }() - - h, err := deviceGetHandleByUUID(uuid) - assert(err) - - device, err = newDevice(h) - assert(err) - - return device, err -} - -func newDevice(h handle) (device *Device, err error) { - model, err := h.deviceGetName() - assert(err) - uuid, err := h.deviceGetUUID() - assert(err) - minor, err := h.deviceGetMinorNumber() - assert(err) - power, err := h.deviceGetPowerManagementLimit() - assert(err) - totalMem, _, err := h.deviceGetMemoryInfo() - assert(err) - busid, err := h.deviceGetPciInfo() - assert(err) - bar1, _, err := h.deviceGetBAR1MemoryInfo() - assert(err) - pcig, err := h.deviceGetMaxPcieLinkGeneration() - assert(err) - pciw, err := h.deviceGetMaxPcieLinkWidth() - assert(err) - ccore, cmem, err := h.deviceGetMaxClockInfo() - assert(err) - cccMajor, cccMinor, err := h.deviceGetCudaComputeCapability() - assert(err) - - var path string - if runtime.GOOS == "windows" { - if busid == nil || uuid == nil { - return nil, ErrUnsupportedGPU - } - } else { - if minor == nil || busid == nil || uuid == nil { - return nil, ErrUnsupportedGPU - } - path = fmt.Sprintf("/dev/nvidia%d", *minor) - } - node, err := numaNode(*busid) - assert(err) - - device = &Device{ - handle: h, - UUID: *uuid, - Path: path, - Model: model, - 
Power: power, - Memory: totalMem, - CPUAffinity: node, - PCI: PCIInfo{ - BusID: *busid, - BAR1: bar1, - Bandwidth: pciBandwidth(pcig, pciw), // MB/s - }, - Clocks: ClockInfo{ - Cores: ccore, // MHz - Memory: cmem, // MHz - }, - CudaComputeCapability: CudaComputeCapabilityInfo{ - Major: cccMajor, - Minor: cccMinor, - }, - } - if power != nil { - *device.Power /= 1000 // W - } - if bar1 != nil { - *device.PCI.BAR1 /= 1024 * 1024 // MiB - } - return -} - -func NewDeviceLite(idx uint) (device *Device, err error) { - defer func() { - if r := recover(); r != nil { - err = r.(error) - } - }() - - h, err := deviceGetHandleByIndex(idx) - assert(err) - - device, err = newDeviceLite(h) - assert(err) - - return device, err -} - -func NewDeviceLiteByUUID(uuid string) (device *Device, err error) { - defer func() { - if r := recover(); r != nil { - err = r.(error) - } - }() - - h, err := deviceGetHandleByUUID(uuid) - assert(err) - - device, err = newDeviceLite(h) - assert(err) - - return device, err -} - -func newDeviceLite(h handle) (device *Device, err error) { - uuid, err := h.deviceGetUUID() - assert(err) - minor, err := h.deviceGetMinorNumber() - assert(err) - busid, err := h.deviceGetPciInfo() - assert(err) - - if minor == nil || busid == nil || uuid == nil { - return nil, ErrUnsupportedGPU - } - path := fmt.Sprintf("/dev/nvidia%d", *minor) - node, err := numaNode(*busid) - assert(err) - - device = &Device{ - handle: h, - UUID: *uuid, - Path: path, - CPUAffinity: node, - PCI: PCIInfo{ - BusID: *busid, - }, - } - return -} - -func (d *Device) Status() (status *DeviceStatus, err error) { - defer func() { - if r := recover(); r != nil { - err = r.(error) - } - }() - - power, err := d.deviceGetPowerUsage() - assert(err) - fanSpeed, err := d.deviceGetFanSpeed() - assert(err) - temp, err := d.deviceGetTemperature() - assert(err) - ugpu, umem, err := d.deviceGetUtilizationRates() - assert(err) - uenc, err := d.deviceGetEncoderUtilization() - assert(err) - udec, err := 
d.deviceGetDecoderUtilization() - assert(err) - _, devMem, err := d.deviceGetMemoryInfo() - assert(err) - ccore, cmem, err := d.deviceGetClockInfo() - assert(err) - _, bar1, err := d.deviceGetBAR1MemoryInfo() - assert(err) - el1, el2, emem, err := d.deviceGetMemoryErrorCounter() - assert(err) - pcirx, pcitx, err := d.deviceGetPcieThroughput() - assert(err) - throttle, err := d.getClocksThrottleReasons() - assert(err) - perfState, err := d.getPerformanceState() - assert(err) - processInfo, err := d.deviceGetAllRunningProcesses() - assert(err) - - status = &DeviceStatus{ - Power: power, - FanSpeed: fanSpeed, // % - Temperature: temp, // °C - Utilization: UtilizationInfo{ - GPU: ugpu, // % - Memory: umem, // % - Encoder: uenc, // % - Decoder: udec, // % - }, - Memory: MemoryInfo{ - Global: devMem, - ECCErrors: ECCErrorsInfo{ - L1Cache: el1, - L2Cache: el2, - Device: emem, - }, - }, - Clocks: ClockInfo{ - Cores: ccore, // MHz - Memory: cmem, // MHz - }, - PCI: PCIStatusInfo{ - BAR1Used: bar1, - Throughput: PCIThroughputInfo{ - RX: pcirx, - TX: pcitx, - }, - }, - Throttle: throttle, - Performance: perfState, - Processes: processInfo, - } - if power != nil { - *status.Power /= 1000 // W - } - if bar1 != nil { - *status.PCI.BAR1Used /= 1024 * 1024 // MiB - } - if pcirx != nil { - *status.PCI.Throughput.RX /= 1000 // MB/s - } - if pcitx != nil { - *status.PCI.Throughput.TX /= 1000 // MB/s - } - return -} - -func GetP2PLink(dev1, dev2 *Device) (link P2PLinkType, err error) { - level, err := deviceGetTopologyCommonAncestor(dev1.handle, dev2.handle) - if err != nil || level == nil { - return P2PLinkUnknown, err - } - - switch *level { - case C.NVML_TOPOLOGY_INTERNAL: - link = P2PLinkSameBoard - case C.NVML_TOPOLOGY_SINGLE: - link = P2PLinkSingleSwitch - case C.NVML_TOPOLOGY_MULTIPLE: - link = P2PLinkMultiSwitch - case C.NVML_TOPOLOGY_HOSTBRIDGE: - link = P2PLinkHostBridge - case C.NVML_TOPOLOGY_CPU: - link = P2PLinkSameCPU - case C.NVML_TOPOLOGY_SYSTEM: - link = 
P2PLinkCrossCPU - default: - err = ErrUnsupportedP2PLink - } - return -} - -func GetNVLink(dev1, dev2 *Device) (link P2PLinkType, err error) { - nvbusIds1, err := dev1.handle.deviceGetAllNvLinkRemotePciInfo() - if err != nil || nvbusIds1 == nil { - return P2PLinkUnknown, err - } - - nvlink := P2PLinkUnknown - for _, nvbusID1 := range nvbusIds1 { - if *nvbusID1 == dev2.PCI.BusID { - switch nvlink { - case P2PLinkUnknown: - nvlink = SingleNVLINKLink - case SingleNVLINKLink: - nvlink = TwoNVLINKLinks - case TwoNVLINKLinks: - nvlink = ThreeNVLINKLinks - case ThreeNVLINKLinks: - nvlink = FourNVLINKLinks - case FourNVLINKLinks: - nvlink = FiveNVLINKLinks - case FiveNVLINKLinks: - nvlink = SixNVLINKLinks - case SixNVLINKLinks: - nvlink = SevenNVLINKLinks - case SevenNVLINKLinks: - nvlink = EightNVLINKLinks - case EightNVLINKLinks: - nvlink = NineNVLINKLinks - case NineNVLINKLinks: - nvlink = TenNVLINKLinks - case TenNVLINKLinks: - nvlink = ElevenNVLINKLinks - case ElevenNVLINKLinks: - nvlink = TwelveNVLINKLinks - } - } - } - - // TODO(klueska): Handle NVSwitch semantics - - return nvlink, nil -} - -func (d *Device) GetComputeRunningProcesses() ([]uint, []uint64, error) { - return d.handle.deviceGetComputeRunningProcesses() -} - -func (d *Device) GetGraphicsRunningProcesses() ([]uint, []uint64, error) { - return d.handle.deviceGetGraphicsRunningProcesses() -} - -func (d *Device) GetAllRunningProcesses() ([]ProcessInfo, error) { - return d.handle.deviceGetAllRunningProcesses() -} - -func (d *Device) GetDeviceMode() (mode *DeviceMode, err error) { - defer func() { - if r := recover(); r != nil { - err = r.(error) - } - }() - - display, err := d.getDisplayInfo() - assert(err) - - p, err := d.getPeristenceMode() - assert(err) - - accounting, err := d.getAccountingInfo() - assert(err) - - mode = &DeviceMode{ - DisplayInfo: display, - Persistence: p, - AccountingInfo: accounting, - } - return -} - -func (d *Device) IsMigEnabled() (bool, error) { - return d.handle.isMigEnabled() 
-} - -func (d *Device) GetMigDevices() ([]*Device, error) { - handles, err := d.handle.getMigDevices() - if err != nil { - return nil, err - } - - var devices []*Device - for _, h := range handles { - uuid, err := h.deviceGetUUID() - if err != nil { - return nil, err - } - - model, err := d.deviceGetName() - if err != nil { - return nil, err - } - - totalMem, _, err := h.deviceGetMemoryInfo() - if err != nil { - return nil, err - } - - device := &Device{ - handle: h, - UUID: *uuid, - Model: model, - Memory: totalMem, - CPUAffinity: d.CPUAffinity, - Path: d.Path, - } - - devices = append(devices, device) - } - - return devices, nil -} - -func (d *Device) GetMigParentDevice() (*Device, error) { - parent, err := d.handle.deviceGetDeviceHandleFromMigDeviceHandle() - if err != nil { - return nil, err - } - - index, err := parent.deviceGetIndex() - if err != nil { - return nil, err - } - - return NewDevice(*index) -} - -func (d *Device) GetMigParentDeviceLite() (*Device, error) { - parent, err := d.handle.deviceGetDeviceHandleFromMigDeviceHandle() - if err != nil { - return nil, err - } - - index, err := parent.deviceGetIndex() - if err != nil { - return nil, err - } - - return NewDeviceLite(*index) -} - -func ParseMigDeviceUUID(uuid string) (string, uint, uint, error) { - migHandle, err := deviceGetHandleByUUID(uuid) - if err == nil { - return getMIGDeviceInfo(migHandle) - } - return parseMigDeviceUUID(uuid) -} - -func getMIGDeviceInfo(migHandle handle) (string, uint, uint, error) { - parentHandle, err := migHandle.deviceGetDeviceHandleFromMigDeviceHandle() - if err != nil { - return "", 0, 0, err - } - - parentUUID, err := parentHandle.deviceGetUUID() - if err != nil { - return "", 0, 0, err - } - - migDevice := Device{handle: migHandle} - - gi, err := migDevice.GetGPUInstanceId() - if err != nil { - return "", 0, 0, err - } - - ci, err := migDevice.GetComputeInstanceId() - if err != nil { - return "", 0, 0, err - } - - return *parentUUID, uint(gi), uint(ci), err -} - 
-func parseMigDeviceUUID(mig string) (string, uint, uint, error) { - tokens := strings.SplitN(mig, "-", 2) - if len(tokens) != 2 || tokens[0] != "MIG" { - return "", 0, 0, fmt.Errorf("Unable to parse UUID as MIG device") - } - - tokens = strings.SplitN(tokens[1], "/", 3) - if len(tokens) != 3 || !strings.HasPrefix(tokens[0], "GPU-") { - return "", 0, 0, fmt.Errorf("Unable to parse UUID as MIG device") - } - - gi, err := strconv.Atoi(tokens[1]) - if err != nil { - return "", 0, 0, fmt.Errorf("Unable to parse UUID as MIG device") - } - - ci, err := strconv.Atoi(tokens[2]) - if err != nil { - return "", 0, 0, fmt.Errorf("Unable to parse UUID as MIG device") - } - - return tokens[0], uint(gi), uint(ci), nil -} diff --git a/bindings/go/nvml/nvml.h b/bindings/go/nvml/nvml.h deleted file mode 100644 index 46e90d1d..00000000 --- a/bindings/go/nvml/nvml.h +++ /dev/null @@ -1,7603 +0,0 @@ -/* - * Copyright 1993-2020 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO USER: - * - * This source code is subject to NVIDIA ownership rights under U.S. and - * international Copyright laws. Users and possessors of this source code - * are hereby granted a nonexclusive, royalty-free license to use this code - * in individual and commercial software. - * - * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE - * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR - * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH - * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, - * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS - * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE - * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE - * OR PERFORMANCE OF THIS SOURCE CODE. - * - * U.S. Government End Users. 
This source code is a "commercial item" as - * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of - * "commercial computer software" and "commercial computer software - * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) - * and is provided to the U.S. Government only as a commercial end item. - * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through - * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the - * source code with only those rights set forth herein. - * - * Any use of this source code in individual and commercial software must - * include, in the user documentation and internal comments to the code, - * the above Disclaimer and U.S. Government End Users Notice. - */ - -/* -NVML API Reference - -The NVIDIA Management Library (NVML) is a C-based programmatic interface for monitoring and -managing various states within NVIDIA Tesla &tm; GPUs. It is intended to be a platform for building -3rd party applications, and is also the underlying library for the NVIDIA-supported nvidia-smi -tool. NVML is thread-safe so it is safe to make simultaneous NVML calls from multiple threads. - -API Documentation - -Supported platforms: -- Windows: Windows Server 2008 R2 64bit, Windows Server 2012 R2 64bit, Windows 7 64bit, Windows 8 64bit, Windows 10 64bit -- Linux: 32-bit and 64-bit -- Hypervisors: Windows Server 2008R2/2012 Hyper-V 64bit, Citrix XenServer 6.2 SP1+, VMware ESX 5.1/5.5 - -Supported products: -- Full Support - - All Tesla products, starting with the Fermi architecture - - All Quadro products, starting with the Fermi architecture - - All GRID products, starting with the Kepler architecture - - Selected GeForce Titan products -- Limited Support - - All Geforce products, starting with the Fermi architecture - -The NVML library can be found at \%ProgramW6432\%\\"NVIDIA Corporation"\\NVSMI\\ on Windows. It is -not be added to the system path by default. 
To dynamically link to NVML, add this path to the PATH -environmental variable. To dynamically load NVML, call LoadLibrary with this path. - -On Linux the NVML library will be found on the standard library path. For 64 bit Linux, both the 32 bit -and 64 bit NVML libraries will be installed. - -Online documentation for this library is available at http://docs.nvidia.com/deploy/nvml-api/index.html -*/ - -#ifndef __nvml_nvml_h__ -#define __nvml_nvml_h__ - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * On Windows, set up methods for DLL export - * define NVML_STATIC_IMPORT when using nvml_loader library - */ -#if defined _WINDOWS - #if !defined NVML_STATIC_IMPORT - #if defined NVML_LIB_EXPORT - #define DECLDIR __declspec(dllexport) - #else - #define DECLDIR __declspec(dllimport) - #endif - #else - #define DECLDIR - #endif -#else - #define DECLDIR -#endif - -/** - * NVML API versioning support - */ -#define NVML_API_VERSION 11 -#define NVML_API_VERSION_STR "11" -/** - * Defining NVML_NO_UNVERSIONED_FUNC_DEFS will disable "auto upgrading" of APIs. - * e.g. the user will have to call nvmlInit_v2 instead of nvmlInit. 
Enable this - * guard if you need to support older versions of the API - */ -#ifndef NVML_NO_UNVERSIONED_FUNC_DEFS - #define nvmlInit nvmlInit_v2 - #define nvmlDeviceGetPciInfo nvmlDeviceGetPciInfo_v3 - #define nvmlDeviceGetCount nvmlDeviceGetCount_v2 - #define nvmlDeviceGetHandleByIndex nvmlDeviceGetHandleByIndex_v2 - #define nvmlDeviceGetHandleByPciBusId nvmlDeviceGetHandleByPciBusId_v2 - #define nvmlDeviceGetNvLinkRemotePciInfo nvmlDeviceGetNvLinkRemotePciInfo_v2 - #define nvmlDeviceRemoveGpu nvmlDeviceRemoveGpu_v2 - #define nvmlDeviceGetGridLicensableFeatures nvmlDeviceGetGridLicensableFeatures_v3 - #define nvmlEventSetWait nvmlEventSetWait_v2 - #define nvmlDeviceGetAttributes nvmlDeviceGetAttributes_v2 -#endif // #ifndef NVML_NO_UNVERSIONED_FUNC_DEFS - -/***************************************************************************************************/ -/** @defgroup nvmlDeviceStructs Device Structs - * @{ - */ -/***************************************************************************************************/ - -/** - * Special constant that some fields take when they are not available. - * Used when only part of the struct is not available. - * - * Each structure explicitly states when to check for this value. - */ -#define NVML_VALUE_NOT_AVAILABLE (-1) - -typedef struct nvmlDevice_st* nvmlDevice_t; - -/** - * Buffer size guaranteed to be large enough for pci bus id - */ -#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 32 - -/** - * Buffer size guaranteed to be large enough for pci bus id for ::busIdLegacy - */ -#define NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE 16 - -/** - * PCI information about a GPU device. 
- */ -typedef struct nvmlPciInfo_st -{ - char busIdLegacy[NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE]; //!< The legacy tuple domain:bus:device.function PCI identifier (& NULL terminator) - unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffffffff - unsigned int bus; //!< The bus on which the device resides, 0 to 0xff - unsigned int device; //!< The device's id on the bus, 0 to 31 - unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id - - // Added in NVML 2.285 API - unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID - - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) -} nvmlPciInfo_t; - -/** - * PCI format string for ::busIdLegacy - */ -#define NVML_DEVICE_PCI_BUS_ID_LEGACY_FMT "%04X:%02X:%02X.0" - -/** - * PCI format string for ::busId - */ -#define NVML_DEVICE_PCI_BUS_ID_FMT "%08X:%02X:%02X.0" - -/** - * Utility macro for filling the pci bus id format from a nvmlPciInfo_t - */ -#define NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(pciInfo) (pciInfo)->domain, \ - (pciInfo)->bus, \ - (pciInfo)->device - -/** - * Detailed ECC error counts for a device. - * - * @deprecated Different GPU families can have different memory error counters - * See \ref nvmlDeviceGetMemoryErrorCounter - */ -typedef struct nvmlEccErrorCounts_st -{ - unsigned long long l1Cache; //!< L1 cache errors - unsigned long long l2Cache; //!< L2 cache errors - unsigned long long deviceMemory; //!< Device memory errors - unsigned long long registerFile; //!< Register file errors -} nvmlEccErrorCounts_t; - -/** - * Utilization information for a device. - * Each sample period may be between 1 second and 1/6 second, depending on the product being queried. 
- */ -typedef struct nvmlUtilization_st -{ - unsigned int gpu; //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU - unsigned int memory; //!< Percent of time over the past sample period during which global (device) memory was being read or written -} nvmlUtilization_t; - -/** - * Memory allocation information for a device. - */ -typedef struct nvmlMemory_st -{ - unsigned long long total; //!< Total installed FB memory (in bytes) - unsigned long long free; //!< Unallocated FB memory (in bytes) - unsigned long long used; //!< Allocated FB memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping -} nvmlMemory_t; - -/** - * BAR1 Memory allocation Information for a device - */ -typedef struct nvmlBAR1Memory_st -{ - unsigned long long bar1Total; //!< Total BAR1 Memory (in bytes) - unsigned long long bar1Free; //!< Unallocated BAR1 Memory (in bytes) - unsigned long long bar1Used; //!< Allocated Used Memory (in bytes) -}nvmlBAR1Memory_t; - -/** - * Information about running compute processes on the GPU - */ -typedef struct nvmlProcessInfo_st -{ - unsigned int pid; //!< Process ID - unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. - //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported - //! 
because Windows KMD manages all the memory and not the NVIDIA driver -} nvmlProcessInfo_t; - -typedef struct nvmlDeviceAttributes_st -{ - unsigned int multiprocessorCount; //!< Streaming Multiprocessor count - unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count - unsigned int sharedDecoderCount; //!< Shared Decoder Engine count - unsigned int sharedEncoderCount; //!< Shared Encoder Engine count - unsigned int sharedJpegCount; //!< Shared JPEG Engine count - unsigned int sharedOfaCount; //!< Shared OFA Engine count - unsigned int gpuInstanceSliceCount; //!< GPU instance slice count - unsigned int computeInstanceSliceCount; //!< Compute instance slice count - unsigned long long memorySizeMB; //!< Device memory size (in MiB) -} nvmlDeviceAttributes_t; - -/** - * Possible values that classify the remap availability for each bank. The max - * field will contain the number of banks that have maximum remap availability - * (all reserved rows are available). None means that there are no reserved - * rows available. 
- */ -typedef struct nvmlRowRemapperHistogramValues_st -{ - unsigned int max; - unsigned int high; - unsigned int partial; - unsigned int low; - unsigned int none; -} nvmlRowRemapperHistogramValues_t; - -/** - * Enum to represent type of bridge chip - */ -typedef enum nvmlBridgeChipType_enum -{ - NVML_BRIDGE_CHIP_PLX = 0, - NVML_BRIDGE_CHIP_BRO4 = 1 -}nvmlBridgeChipType_t; - -/** - * Maximum number of NvLink links supported - */ -#define NVML_NVLINK_MAX_LINKS 12 - -/** - * Enum to represent the NvLink utilization counter packet units - */ -typedef enum nvmlNvLinkUtilizationCountUnits_enum -{ - NVML_NVLINK_COUNTER_UNIT_CYCLES = 0, // count by cycles - NVML_NVLINK_COUNTER_UNIT_PACKETS = 1, // count by packets - NVML_NVLINK_COUNTER_UNIT_BYTES = 2, // count by bytes - NVML_NVLINK_COUNTER_UNIT_RESERVED = 3, // count reserved for internal use - // this must be last - NVML_NVLINK_COUNTER_UNIT_COUNT -} nvmlNvLinkUtilizationCountUnits_t; - -/** - * Enum to represent the NvLink utilization counter packet types to count - * ** this is ONLY applicable with the units as packets or bytes - * ** as specified in \a nvmlNvLinkUtilizationCountUnits_t - * ** all packet filter descriptions are target GPU centric - * ** these can be "OR'd" together - */ -typedef enum nvmlNvLinkUtilizationCountPktTypes_enum -{ - NVML_NVLINK_COUNTER_PKTFILTER_NOP = 0x1, // no operation packets - NVML_NVLINK_COUNTER_PKTFILTER_READ = 0x2, // read packets - NVML_NVLINK_COUNTER_PKTFILTER_WRITE = 0x4, // write packets - NVML_NVLINK_COUNTER_PKTFILTER_RATOM = 0x8, // reduction atomic requests - NVML_NVLINK_COUNTER_PKTFILTER_NRATOM = 0x10, // non-reduction atomic requests - NVML_NVLINK_COUNTER_PKTFILTER_FLUSH = 0x20, // flush requests - NVML_NVLINK_COUNTER_PKTFILTER_RESPDATA = 0x40, // responses with data - NVML_NVLINK_COUNTER_PKTFILTER_RESPNODATA = 0x80, // responses without data - NVML_NVLINK_COUNTER_PKTFILTER_ALL = 0xFF // all packets -} nvmlNvLinkUtilizationCountPktTypes_t; - -/** - * Struct to define the 
NVLINK counter controls - */ -typedef struct nvmlNvLinkUtilizationControl_st -{ - nvmlNvLinkUtilizationCountUnits_t units; - nvmlNvLinkUtilizationCountPktTypes_t pktfilter; -} nvmlNvLinkUtilizationControl_t; - -/** - * Enum to represent NvLink queryable capabilities - */ -typedef enum nvmlNvLinkCapability_enum -{ - NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported - NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported - NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported - NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported - NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link - NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device - // should be last - NVML_NVLINK_CAP_COUNT -} nvmlNvLinkCapability_t; - -/** - * Enum to represent NvLink queryable error counters - */ -typedef enum nvmlNvLinkErrorCounter_enum -{ - NVML_NVLINK_ERROR_DL_REPLAY = 0, // Data link transmit replay error counter - NVML_NVLINK_ERROR_DL_RECOVERY = 1, // Data link transmit recovery error counter - NVML_NVLINK_ERROR_DL_CRC_FLIT = 2, // Data link receive flow control digit CRC error counter - NVML_NVLINK_ERROR_DL_CRC_DATA = 3, // Data link receive data CRC error counter - - // this must be last - NVML_NVLINK_ERROR_COUNT -} nvmlNvLinkErrorCounter_t; - -/** - * Represents level relationships within a system between two GPUs - * The enums are spaced to allow for future relationships - */ -typedef enum nvmlGpuLevel_enum -{ - NVML_TOPOLOGY_INTERNAL = 0, // e.g. 
Tesla K80 - NVML_TOPOLOGY_SINGLE = 10, // all devices that only need traverse a single PCIe switch - NVML_TOPOLOGY_MULTIPLE = 20, // all devices that need not traverse a host bridge - NVML_TOPOLOGY_HOSTBRIDGE = 30, // all devices that are connected to the same host bridge - NVML_TOPOLOGY_NODE = 40, // all devices that are connected to the same NUMA node but possibly multiple host bridges - NVML_TOPOLOGY_SYSTEM = 50, // all devices in the system - - // there is purposefully no COUNT here because of the need for spacing above -} nvmlGpuTopologyLevel_t; - -/* Compatibility for CPU->NODE renaming */ -#define NVML_TOPOLOGY_CPU NVML_TOPOLOGY_NODE - -/* P2P Capability Index Status*/ -typedef enum nvmlGpuP2PStatus_enum -{ - NVML_P2P_STATUS_OK = 0, - NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, - NVML_P2P_STATUS_GPU_NOT_SUPPORTED, - NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, - NVML_P2P_STATUS_DISABLED_BY_REGKEY, - NVML_P2P_STATUS_NOT_SUPPORTED, - NVML_P2P_STATUS_UNKNOWN - -} nvmlGpuP2PStatus_t; - -/* P2P Capability Index*/ -typedef enum nvmlGpuP2PCapsIndex_enum -{ - NVML_P2P_CAPS_INDEX_READ = 0, - NVML_P2P_CAPS_INDEX_WRITE, - NVML_P2P_CAPS_INDEX_NVLINK, - NVML_P2P_CAPS_INDEX_ATOMICS, - NVML_P2P_CAPS_INDEX_PROP, - NVML_P2P_CAPS_INDEX_UNKNOWN -}nvmlGpuP2PCapsIndex_t; - -/** - * Maximum limit on Physical Bridges per Board - */ -#define NVML_MAX_PHYSICAL_BRIDGE (128) - -/** - * Information about the Bridge Chip Firmware - */ -typedef struct nvmlBridgeChipInfo_st -{ - nvmlBridgeChipType_t type; //!< Type of Bridge Chip - unsigned int fwVersion; //!< Firmware Version. 0=Version is unavailable -}nvmlBridgeChipInfo_t; - -/** - * This structure stores the complete Hierarchy of the Bridge Chip within the board. The immediate - * bridge is stored at index 0 of bridgeInfoList, parent to immediate bridge is at index 1 and so forth. 
- */ -typedef struct nvmlBridgeChipHierarchy_st -{ - unsigned char bridgeCount; //!< Number of Bridge Chips on the Board - nvmlBridgeChipInfo_t bridgeChipInfo[NVML_MAX_PHYSICAL_BRIDGE]; //!< Hierarchy of Bridge Chips on the board -}nvmlBridgeChipHierarchy_t; - -/** - * Represents Type of Sampling Event - */ -typedef enum nvmlSamplingType_enum -{ - NVML_TOTAL_POWER_SAMPLES = 0, //!< To represent total power drawn by GPU - NVML_GPU_UTILIZATION_SAMPLES = 1, //!< To represent percent of time during which one or more kernels was executing on the GPU - NVML_MEMORY_UTILIZATION_SAMPLES = 2, //!< To represent percent of time during which global (device) memory was being read or written - NVML_ENC_UTILIZATION_SAMPLES = 3, //!< To represent percent of time during which NVENC remains busy - NVML_DEC_UTILIZATION_SAMPLES = 4, //!< To represent percent of time during which NVDEC remains busy - NVML_PROCESSOR_CLK_SAMPLES = 5, //!< To represent processor clock samples - NVML_MEMORY_CLK_SAMPLES = 6, //!< To represent memory clock samples - - // Keep this last - NVML_SAMPLINGTYPE_COUNT -}nvmlSamplingType_t; - -/** - * Represents the queryable PCIe utilization counters - */ -typedef enum nvmlPcieUtilCounter_enum -{ - NVML_PCIE_UTIL_TX_BYTES = 0, // 1KB granularity - NVML_PCIE_UTIL_RX_BYTES = 1, // 1KB granularity - - // Keep this last - NVML_PCIE_UTIL_COUNT -} nvmlPcieUtilCounter_t; - -/** - * Represents the type for sample value returned - */ -typedef enum nvmlValueType_enum -{ - NVML_VALUE_TYPE_DOUBLE = 0, - NVML_VALUE_TYPE_UNSIGNED_INT = 1, - NVML_VALUE_TYPE_UNSIGNED_LONG = 2, - NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, - NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, - - // Keep this last - NVML_VALUE_TYPE_COUNT -}nvmlValueType_t; - - -/** - * Union to represent different types of Value - */ -typedef union nvmlValue_st -{ - double dVal; //!< If the value is double - unsigned int uiVal; //!< If the value is unsigned int - unsigned long ulVal; //!< If the value is unsigned long - unsigned long 
long ullVal; //!< If the value is unsigned long long - signed long long sllVal; //!< If the value is signed long long -}nvmlValue_t; - -/** - * Information for Sample - */ -typedef struct nvmlSample_st -{ - unsigned long long timeStamp; //!< CPU Timestamp in microseconds - nvmlValue_t sampleValue; //!< Sample Value -}nvmlSample_t; - -/** - * Represents type of perf policy for which violation times can be queried - */ -typedef enum nvmlPerfPolicyType_enum -{ - NVML_PERF_POLICY_POWER = 0, //!< How long did power violations cause the GPU to be below application clocks - NVML_PERF_POLICY_THERMAL = 1, //!< How long did thermal violations cause the GPU to be below application clocks - NVML_PERF_POLICY_SYNC_BOOST = 2, //!< How long did sync boost cause the GPU to be below application clocks - NVML_PERF_POLICY_BOARD_LIMIT = 3, //!< How long did the board limit cause the GPU to be below application clocks - NVML_PERF_POLICY_LOW_UTILIZATION = 4, //!< How long did low utilization cause the GPU to be below application clocks - NVML_PERF_POLICY_RELIABILITY = 5, //!< How long did the board reliability limit cause the GPU to be below application clocks - - NVML_PERF_POLICY_TOTAL_APP_CLOCKS = 10, //!< Total time the GPU was held below application clocks by any limiter (0 - 5 above) - NVML_PERF_POLICY_TOTAL_BASE_CLOCKS = 11, //!< Total time the GPU was held below base clocks - - // Keep this last - NVML_PERF_POLICY_COUNT -}nvmlPerfPolicyType_t; - -/** - * Struct to hold perf policy violation status data - */ -typedef struct nvmlViolationTime_st -{ - unsigned long long referenceTime; //!< referenceTime represents CPU timestamp in microseconds - unsigned long long violationTime; //!< violationTime in Nanoseconds -}nvmlViolationTime_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlDeviceEnumvs Device Enums - * @{ - */ 
-/***************************************************************************************************/ - -/** - * Generic enable/disable enum. - */ -typedef enum nvmlEnableState_enum -{ - NVML_FEATURE_DISABLED = 0, //!< Feature disabled - NVML_FEATURE_ENABLED = 1 //!< Feature enabled -} nvmlEnableState_t; - -//! Generic flag used to specify the default behavior of some functions. See description of particular functions for details. -#define nvmlFlagDefault 0x00 -//! Generic flag used to force some behavior. See description of particular functions for details. -#define nvmlFlagForce 0x01 - -/** - * * The Brand of the GPU - * */ -typedef enum nvmlBrandType_enum -{ - NVML_BRAND_UNKNOWN = 0, - NVML_BRAND_QUADRO = 1, - NVML_BRAND_TESLA = 2, - NVML_BRAND_NVS = 3, - NVML_BRAND_GRID = 4, - NVML_BRAND_GEFORCE = 5, - NVML_BRAND_TITAN = 6, - - // Keep this last - NVML_BRAND_COUNT -} nvmlBrandType_t; - -/** - * Temperature thresholds. - */ -typedef enum nvmlTemperatureThresholds_enum -{ - NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0, // Temperature at which the GPU will shut down - // for HW protection - NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1, // Temperature at which the GPU will begin HW slowdown - NVML_TEMPERATURE_THRESHOLD_MEM_MAX = 2, // Memory Temperature at which the GPU will begin SW slowdown - NVML_TEMPERATURE_THRESHOLD_GPU_MAX = 3, // GPU Temperature at which the GPU can be throttled below base clock - // Keep this last - NVML_TEMPERATURE_THRESHOLD_COUNT -} nvmlTemperatureThresholds_t; - -/** - * Temperature sensors. - */ -typedef enum nvmlTemperatureSensors_enum -{ - NVML_TEMPERATURE_GPU = 0, //!< Temperature sensor for the GPU die - - // Keep this last - NVML_TEMPERATURE_COUNT -} nvmlTemperatureSensors_t; - -/** - * Compute mode. - * - * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0. - * Earlier CUDA versions supported a single exclusive mode, - * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond. 
- */ -typedef enum nvmlComputeMode_enum -{ - NVML_COMPUTEMODE_DEFAULT = 0, //!< Default compute mode -- multiple contexts per device - NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Support Removed - NVML_COMPUTEMODE_PROHIBITED = 2, //!< Compute-prohibited mode -- no contexts per device - NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time - - // Keep this last - NVML_COMPUTEMODE_COUNT -} nvmlComputeMode_t; - -/** - * ECC bit types. - * - * @deprecated See \ref nvmlMemoryErrorType_t for a more flexible type - */ -#define nvmlEccBitType_t nvmlMemoryErrorType_t - -/** - * Single bit ECC errors - * - * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_CORRECTED - */ -#define NVML_SINGLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_CORRECTED - -/** - * Double bit ECC errors - * - * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_UNCORRECTED - */ -#define NVML_DOUBLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_UNCORRECTED - -/** - * Memory error types - */ -typedef enum nvmlMemoryErrorType_enum -{ - /** - * A memory error that was corrected - * - * For ECC errors, these are single bit errors - * For Texture memory, these are errors fixed by resend - */ - NVML_MEMORY_ERROR_TYPE_CORRECTED = 0, - /** - * A memory error that was not corrected - * - * For ECC errors, these are double bit errors - * For Texture memory, these are errors where the resend fails - */ - NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1, - - - // Keep this last - NVML_MEMORY_ERROR_TYPE_COUNT //!< Count of memory error types - -} nvmlMemoryErrorType_t; - -/** - * ECC counter types. - * - * Note: Volatile counts are reset each time the driver loads. On Windows this is once per boot. On Linux this can be more frequent. - * On Linux the driver unloads when no active clients exist. If persistence mode is enabled or there is always a driver - * client active (e.g. X11), then Linux also sees per-boot behavior. 
If not, volatile counts are reset each time a compute app - * is run. - */ -typedef enum nvmlEccCounterType_enum -{ - NVML_VOLATILE_ECC = 0, //!< Volatile counts are reset each time the driver loads. - NVML_AGGREGATE_ECC = 1, //!< Aggregate counts persist across reboots (i.e. for the lifetime of the device) - - // Keep this last - NVML_ECC_COUNTER_TYPE_COUNT //!< Count of memory counter types -} nvmlEccCounterType_t; - -/** - * Clock types. - * - * All speeds are in Mhz. - */ -typedef enum nvmlClockType_enum -{ - NVML_CLOCK_GRAPHICS = 0, //!< Graphics clock domain - NVML_CLOCK_SM = 1, //!< SM clock domain - NVML_CLOCK_MEM = 2, //!< Memory clock domain - NVML_CLOCK_VIDEO = 3, //!< Video encoder/decoder clock domain - - // Keep this last - NVML_CLOCK_COUNT //!< Count of clock types -} nvmlClockType_t; - -/** - * Clock Ids. These are used in combination with nvmlClockType_t - * to specify a single clock value. - */ -typedef enum nvmlClockId_enum -{ - NVML_CLOCK_ID_CURRENT = 0, //!< Current actual clock value - NVML_CLOCK_ID_APP_CLOCK_TARGET = 1, //!< Target application clock - NVML_CLOCK_ID_APP_CLOCK_DEFAULT = 2, //!< Default application clock target - NVML_CLOCK_ID_CUSTOMER_BOOST_MAX = 3, //!< OEM-defined maximum clock rate - - //Keep this last - NVML_CLOCK_ID_COUNT //!< Count of Clock Ids. -} nvmlClockId_t; - -/** - * Driver models. - * - * Windows only. - */ -typedef enum nvmlDriverModel_enum -{ - NVML_DRIVER_WDDM = 0, //!< WDDM driver model -- GPU treated as a display device - NVML_DRIVER_WDM = 1 //!< WDM (TCC) model (recommended) -- GPU treated as a generic device -} nvmlDriverModel_t; - -/** - * Allowed PStates. 
- */ -typedef enum nvmlPStates_enum -{ - NVML_PSTATE_0 = 0, //!< Performance state 0 -- Maximum Performance - NVML_PSTATE_1 = 1, //!< Performance state 1 - NVML_PSTATE_2 = 2, //!< Performance state 2 - NVML_PSTATE_3 = 3, //!< Performance state 3 - NVML_PSTATE_4 = 4, //!< Performance state 4 - NVML_PSTATE_5 = 5, //!< Performance state 5 - NVML_PSTATE_6 = 6, //!< Performance state 6 - NVML_PSTATE_7 = 7, //!< Performance state 7 - NVML_PSTATE_8 = 8, //!< Performance state 8 - NVML_PSTATE_9 = 9, //!< Performance state 9 - NVML_PSTATE_10 = 10, //!< Performance state 10 - NVML_PSTATE_11 = 11, //!< Performance state 11 - NVML_PSTATE_12 = 12, //!< Performance state 12 - NVML_PSTATE_13 = 13, //!< Performance state 13 - NVML_PSTATE_14 = 14, //!< Performance state 14 - NVML_PSTATE_15 = 15, //!< Performance state 15 -- Minimum Performance - NVML_PSTATE_UNKNOWN = 32 //!< Unknown performance state -} nvmlPstates_t; - -/** - * GPU Operation Mode - * - * GOM allows to reduce power usage and optimize GPU throughput by disabling GPU features. - * - * Each GOM is designed to meet specific user needs. - */ -typedef enum nvmlGom_enum -{ - NVML_GOM_ALL_ON = 0, //!< Everything is enabled and running at full speed - - NVML_GOM_COMPUTE = 1, //!< Designed for running only compute tasks. Graphics operations - //!< are not allowed - - NVML_GOM_LOW_DP = 2 //!< Designed for running graphics applications that don't require - //!< high bandwidth double precision -} nvmlGpuOperationMode_t; - -/** - * Available infoROM objects. - */ -typedef enum nvmlInforomObject_enum -{ - NVML_INFOROM_OEM = 0, //!< An object defined by OEM - NVML_INFOROM_ECC = 1, //!< The ECC object determining the level of ECC support - NVML_INFOROM_POWER = 2, //!< The power management object - - // Keep this last - NVML_INFOROM_COUNT //!< This counts the number of infoROM objects the driver knows about -} nvmlInforomObject_t; - -/** - * Return values for NVML API calls. 
- */ -typedef enum nvmlReturn_enum -{ - // cppcheck-suppress * - NVML_SUCCESS = 0, //!< The operation was successful - NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit() - NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid - NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device - NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation - NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting - NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful - NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough - NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached - NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded - NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed - NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU - NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded - NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function - NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted - NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible - NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again - NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups - NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch - NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use - NVML_ERROR_MEMORY = 20, //!< Insufficient memory - NVML_ERROR_NO_DATA = 21, //!< No data - NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22, //!< The requested vgpu operation is not available on target 
device, becasue ECC is enabled - NVML_ERROR_INSUFFICIENT_RESOURCES = 23, //!< Ran out of critical resources, other than memory - NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred -} nvmlReturn_t; - -/** - * See \ref nvmlDeviceGetMemoryErrorCounter - */ -typedef enum nvmlMemoryLocation_enum -{ - NVML_MEMORY_LOCATION_L1_CACHE = 0, //!< GPU L1 Cache - NVML_MEMORY_LOCATION_L2_CACHE = 1, //!< GPU L2 Cache - NVML_MEMORY_LOCATION_DRAM = 2, //!< Turing+ DRAM - NVML_MEMORY_LOCATION_DEVICE_MEMORY = 2, //!< GPU Device Memory - NVML_MEMORY_LOCATION_REGISTER_FILE = 3, //!< GPU Register File - NVML_MEMORY_LOCATION_TEXTURE_MEMORY = 4, //!< GPU Texture Memory - NVML_MEMORY_LOCATION_TEXTURE_SHM = 5, //!< Shared memory - NVML_MEMORY_LOCATION_CBU = 6, //!< CBU - NVML_MEMORY_LOCATION_SRAM = 7, //!< Turing+ SRAM - // Keep this last - NVML_MEMORY_LOCATION_COUNT //!< This counts the number of memory locations the driver knows about -} nvmlMemoryLocation_t; - -/** - * Causes for page retirement - */ -typedef enum nvmlPageRetirementCause_enum -{ - NVML_PAGE_RETIREMENT_CAUSE_MULTIPLE_SINGLE_BIT_ECC_ERRORS = 0, //!< Page was retired due to multiple single bit ECC error - NVML_PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR = 1, //!< Page was retired due to double bit ECC error - - // Keep this last - NVML_PAGE_RETIREMENT_CAUSE_COUNT -} nvmlPageRetirementCause_t; - -/** - * API types that allow changes to default permission restrictions - */ -typedef enum nvmlRestrictedAPI_enum -{ - NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS = 0, //!< APIs that change application clocks, see nvmlDeviceSetApplicationsClocks - //!< and see nvmlDeviceResetApplicationsClocks - NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS = 1, //!< APIs that enable/disable Auto Boosted clocks - //!< see nvmlDeviceSetAutoBoostedClocksEnabled - // Keep this last - NVML_RESTRICTED_API_COUNT -} nvmlRestrictedAPI_t; - -/** @} */ - 
-/***************************************************************************************************/ -/** @addtogroup gridVirtual - * @{ - */ -/***************************************************************************************************/ -/** @defgroup nvmlGridEnums GRID Virtualization Enums - * @{ - */ -/***************************************************************************************************/ - -/*! - * GPU virtualization mode types. - */ -typedef enum nvmlGpuVirtualizationMode { - NVML_GPU_VIRTUALIZATION_MODE_NONE = 0, //!< Represents Bare Metal GPU - NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH = 1, //!< Device is associated with GPU-Passthorugh - NVML_GPU_VIRTUALIZATION_MODE_VGPU = 2, //!< Device is associated with vGPU inside virtual machine. - NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU = 3, //!< Device is associated with VGX hypervisor in vGPU mode - NVML_GPU_VIRTUALIZATION_MODE_HOST_VSGA = 4, //!< Device is associated with VGX hypervisor in vSGA mode -} nvmlGpuVirtualizationMode_t; - -/** - * Host vGPU modes - */ -typedef enum nvmlHostVgpuMode_enum -{ - NVML_HOST_VGPU_MODE_NON_SRIOV = 0, //!< Non SR-IOV mode - NVML_HOST_VGPU_MODE_SRIOV = 1 //!< SR-IOV mode -} nvmlHostVgpuMode_t; - -/*! - * Types of VM identifiers - */ -typedef enum nvmlVgpuVmIdType { - NVML_VGPU_VM_ID_DOMAIN_ID = 0, //!< VM ID represents DOMAIN ID - NVML_VGPU_VM_ID_UUID = 1, //!< VM ID represents UUID -} nvmlVgpuVmIdType_t; - -/** - * vGPU GUEST info state. 
- */ -typedef enum nvmlVgpuGuestInfoState_enum -{ - NVML_VGPU_INSTANCE_GUEST_INFO_STATE_UNINITIALIZED = 0, //!< Guest-dependent fields uninitialized - NVML_VGPU_INSTANCE_GUEST_INFO_STATE_INITIALIZED = 1, //!< Guest-dependent fields initialized -} nvmlVgpuGuestInfoState_t; - -/** - * GRID license feature code - */ -typedef enum { - NVML_GRID_LICENSE_FEATURE_CODE_VGPU = 1, //!< Virtual GPU - NVML_GRID_LICENSE_FEATURE_CODE_VWORKSTATION = 2 //!< Virtual Workstation -} nvmlGridLicenseFeatureCode_t; - -/** @} */ - -/***************************************************************************************************/ - -/** @defgroup nvmlVgpuConstants GRID Virtualization Constants - * @{ - */ -/***************************************************************************************************/ - -/** - * Buffer size guaranteed to be large enough for \ref nvmlVgpuTypeGetLicense - */ -#define NVML_GRID_LICENSE_BUFFER_SIZE 128 - -#define NVML_VGPU_NAME_BUFFER_SIZE 64 - -#define NVML_GRID_LICENSE_FEATURE_MAX_COUNT 3 - -/*! - * Macros for vGPU instance's virtualization capabilities bitfield. - */ -#define NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION 0:0 -#define NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION_NO 0x0 -#define NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION_YES 0x1 - -/*! - * Macros for pGPU's virtualization capabilities bitfield. 
- */ -#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION 0:0 -#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_NO 0x0 -#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_YES 0x1 - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlVgpuStructs GRID Virtualization Structs - * @{ - */ -/***************************************************************************************************/ - -typedef unsigned int nvmlVgpuTypeId_t; - -typedef unsigned int nvmlVgpuInstance_t; - -/** - * Structure to store Utilization Value and vgpuInstance - */ -typedef struct nvmlVgpuInstanceUtilizationSample_st -{ - nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance - unsigned long long timeStamp; //!< CPU Timestamp in microseconds - nvmlValue_t smUtil; //!< SM (3D/Compute) Util Value - nvmlValue_t memUtil; //!< Frame Buffer Memory Util Value - nvmlValue_t encUtil; //!< Encoder Util Value - nvmlValue_t decUtil; //!< Decoder Util Value -} nvmlVgpuInstanceUtilizationSample_t; - -/** - * Structure to store Utilization Value, vgpuInstance and subprocess information - */ -typedef struct nvmlVgpuProcessUtilizationSample_st -{ - nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance - unsigned int pid; //!< PID of process running within the vGPU VM - char processName[NVML_VGPU_NAME_BUFFER_SIZE]; //!< Name of process running within the vGPU VM - unsigned long long timeStamp; //!< CPU Timestamp in microseconds - unsigned int smUtil; //!< SM (3D/Compute) Util Value - unsigned int memUtil; //!< Frame Buffer Memory Util Value - unsigned int encUtil; //!< Encoder Util Value - unsigned int decUtil; //!< Decoder Util Value -} nvmlVgpuProcessUtilizationSample_t; - -/** - * Structure to store utilization value and process Id - */ -typedef struct nvmlProcessUtilizationSample_st -{ - unsigned int pid; //!< PID of process - unsigned long long timeStamp; //!< CPU Timestamp in microseconds - unsigned int smUtil; //!< SM 
(3D/Compute) Util Value - unsigned int memUtil; //!< Frame Buffer Memory Util Value - unsigned int encUtil; //!< Encoder Util Value - unsigned int decUtil; //!< Decoder Util Value -} nvmlProcessUtilizationSample_t; - -/** - * Structure containing GRID licensable feature information - */ -typedef struct nvmlGridLicensableFeature_st -{ - nvmlGridLicenseFeatureCode_t featureCode; //!< Licensed feature code - unsigned int featureState; //!< Non-zero if feature is currently licensed, otherwise zero - char licenseInfo[NVML_GRID_LICENSE_BUFFER_SIZE]; - char productName[NVML_GRID_LICENSE_BUFFER_SIZE]; - unsigned int featureEnabled; //!< Non-zero if feature is enabled, otherwise zero -} nvmlGridLicensableFeature_t; - -/** - * Structure to store GRID licensable features - */ -typedef struct nvmlGridLicensableFeatures_st -{ - int isGridLicenseSupported; //!< Non-zero if GRID Software Licensing is supported on the system, otherwise zero - unsigned int licensableFeaturesCount; //!< Entries returned in \a gridLicensableFeatures array - nvmlGridLicensableFeature_t gridLicensableFeatures[NVML_GRID_LICENSE_FEATURE_MAX_COUNT]; //!< Array of GRID licensable features. 
-} nvmlGridLicensableFeatures_t; - -/** - * Simplified chip architecture - */ -#define NVML_DEVICE_ARCH_KEPLER 2 // Devices based on the NVIDIA Kepler architecture -#define NVML_DEVICE_ARCH_MAXWELL 3 // Devices based on the NVIDIA Maxwell architecture -#define NVML_DEVICE_ARCH_PASCAL 4 // Devices based on the NVIDIA Pascal architecture -#define NVML_DEVICE_ARCH_VOLTA 5 // Devices based on the NVIDIA Volta architecture -#define NVML_DEVICE_ARCH_TURING 6 // Devices based on the NVIDIA Turing architecture - -#define NVML_DEVICE_ARCH_AMPERE 7 // Devices based on the NVIDIA Ampere architecture - -#define NVML_DEVICE_ARCH_UNKNOWN 0xffffffff // Anything else, presumably something newer - -typedef unsigned int nvmlDeviceArchitecture_t; - -/** @} */ -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlFieldValueEnums Field Value Enums - * @{ - */ -/***************************************************************************************************/ - -/** - * Field Identifiers. - * - * All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change. - */ -#define NVML_FI_DEV_ECC_CURRENT 1 //!< Current ECC mode. 1=Active. 0=Inactive -#define NVML_FI_DEV_ECC_PENDING 2 //!< Pending ECC mode. 1=Active. 
0=Inactive -/* ECC Count Totals */ -#define NVML_FI_DEV_ECC_SBE_VOL_TOTAL 3 //!< Total single bit volatile ECC errors -#define NVML_FI_DEV_ECC_DBE_VOL_TOTAL 4 //!< Total double bit volatile ECC errors -#define NVML_FI_DEV_ECC_SBE_AGG_TOTAL 5 //!< Total single bit aggregate (persistent) ECC errors -#define NVML_FI_DEV_ECC_DBE_AGG_TOTAL 6 //!< Total double bit aggregate (persistent) ECC errors -/* Individual ECC locations */ -#define NVML_FI_DEV_ECC_SBE_VOL_L1 7 //!< L1 cache single bit volatile ECC errors -#define NVML_FI_DEV_ECC_DBE_VOL_L1 8 //!< L1 cache double bit volatile ECC errors -#define NVML_FI_DEV_ECC_SBE_VOL_L2 9 //!< L2 cache single bit volatile ECC errors -#define NVML_FI_DEV_ECC_DBE_VOL_L2 10 //!< L2 cache double bit volatile ECC errors -#define NVML_FI_DEV_ECC_SBE_VOL_DEV 11 //!< Device memory single bit volatile ECC errors -#define NVML_FI_DEV_ECC_DBE_VOL_DEV 12 //!< Device memory double bit volatile ECC errors -#define NVML_FI_DEV_ECC_SBE_VOL_REG 13 //!< Register file single bit volatile ECC errors -#define NVML_FI_DEV_ECC_DBE_VOL_REG 14 //!< Register file double bit volatile ECC errors -#define NVML_FI_DEV_ECC_SBE_VOL_TEX 15 //!< Texture memory single bit volatile ECC errors -#define NVML_FI_DEV_ECC_DBE_VOL_TEX 16 //!< Texture memory double bit volatile ECC errors -#define NVML_FI_DEV_ECC_DBE_VOL_CBU 17 //!< CBU double bit volatile ECC errors -#define NVML_FI_DEV_ECC_SBE_AGG_L1 18 //!< L1 cache single bit aggregate (persistent) ECC errors -#define NVML_FI_DEV_ECC_DBE_AGG_L1 19 //!< L1 cache double bit aggregate (persistent) ECC errors -#define NVML_FI_DEV_ECC_SBE_AGG_L2 20 //!< L2 cache single bit aggregate (persistent) ECC errors -#define NVML_FI_DEV_ECC_DBE_AGG_L2 21 //!< L2 cache double bit aggregate (persistent) ECC errors -#define NVML_FI_DEV_ECC_SBE_AGG_DEV 22 //!< Device memory single bit aggregate (persistent) ECC errors -#define NVML_FI_DEV_ECC_DBE_AGG_DEV 23 //!< Device memory double bit aggregate (persistent) ECC errors -#define 
NVML_FI_DEV_ECC_SBE_AGG_REG 24 //!< Register File single bit aggregate (persistent) ECC errors -#define NVML_FI_DEV_ECC_DBE_AGG_REG 25 //!< Register File double bit aggregate (persistent) ECC errors -#define NVML_FI_DEV_ECC_SBE_AGG_TEX 26 //!< Texture memory single bit aggregate (persistent) ECC errors -#define NVML_FI_DEV_ECC_DBE_AGG_TEX 27 //!< Texture memory double bit aggregate (persistent) ECC errors -#define NVML_FI_DEV_ECC_DBE_AGG_CBU 28 //!< CBU double bit aggregate ECC errors - -/* Page Retirement */ -#define NVML_FI_DEV_RETIRED_SBE 29 //!< Number of retired pages because of single bit errors -#define NVML_FI_DEV_RETIRED_DBE 30 //!< Number of retired pages because of double bit errors -#define NVML_FI_DEV_RETIRED_PENDING 31 //!< If any pages are pending retirement. 1=yes. 0=no. - -/* NvLink Flit Error Counters */ -#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 32 //!< NVLink flow control CRC Error Counter for Lane 0 -#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 33 //!< NVLink flow control CRC Error Counter for Lane 1 -#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 34 //!< NVLink flow control CRC Error Counter for Lane 2 -#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 35 //!< NVLink flow control CRC Error Counter for Lane 3 -#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 36 //!< NVLink flow control CRC Error Counter for Lane 4 -#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 37 //!< NVLink flow control CRC Error Counter for Lane 5 -#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL 38 //!< NVLink flow control CRC Error Counter total for all Lanes - -/* NvLink CRC Data Error Counters */ -#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 39 //!< NVLink data CRC Error Counter for Lane 0 -#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 40 //!< NVLink data CRC Error Counter for Lane 1 -#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 41 //!< NVLink data CRC Error Counter for Lane 2 -#define 
NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 42 //!< NVLink data CRC Error Counter for Lane 3 -#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 43 //!< NVLink data CRC Error Counter for Lane 4 -#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 44 //!< NVLink data CRC Error Counter for Lane 5 -#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL 45 //!< NvLink data CRC Error Counter total for all Lanes - -/* NvLink Replay Error Counters */ -#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 46 //!< NVLink Replay Error Counter for Lane 0 -#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 47 //!< NVLink Replay Error Counter for Lane 1 -#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 48 //!< NVLink Replay Error Counter for Lane 2 -#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 49 //!< NVLink Replay Error Counter for Lane 3 -#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 50 //!< NVLink Replay Error Counter for Lane 4 -#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 51 //!< NVLink Replay Error Counter for Lane 5 -#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL 52 //!< NVLink Replay Error Counter total for all Lanes - -/* NvLink Recovery Error Counters */ -#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 53 //!< NVLink Recovery Error Counter for Lane 0 -#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 54 //!< NVLink Recovery Error Counter for Lane 1 -#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 55 //!< NVLink Recovery Error Counter for Lane 2 -#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 56 //!< NVLink Recovery Error Counter for Lane 3 -#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 57 //!< NVLink Recovery Error Counter for Lane 4 -#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 58 //!< NVLink Recovery Error Counter for Lane 5 -#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL 59 //!< NVLink Recovery Error Counter total for all Lanes - -/* NvLink Bandwidth Counters */ -/* - * NVML_FI_DEV_NVLINK_BANDWIDTH_* field values are now 
deprecated. - * Please use the following field values instead: - * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX - * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX - * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX - * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX - */ -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L0 60 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 0 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L1 61 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 1 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L2 62 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 2 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L3 63 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 3 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L4 64 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 4 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L5 65 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 5 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_TOTAL 66 //!< NVLink Bandwidth Counter Total for Counter Set 0, All Lanes - -/* NvLink Bandwidth Counters */ -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L0 67 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 0 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L1 68 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 1 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L2 69 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 2 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L3 70 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 3 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L4 71 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 4 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L5 72 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 5 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_TOTAL 73 //!< NVLink Bandwidth Counter Total for Counter Set 1, All Lanes - -/* NVML Perf Policy Counters */ -#define NVML_FI_DEV_PERF_POLICY_POWER 74 //!< Perf Policy Counter for Power Policy -#define NVML_FI_DEV_PERF_POLICY_THERMAL 75 //!< Perf Policy Counter for Thermal Policy -#define NVML_FI_DEV_PERF_POLICY_SYNC_BOOST 76 //!< Perf 
Policy Counter for Sync boost Policy -#define NVML_FI_DEV_PERF_POLICY_BOARD_LIMIT 77 //!< Perf Policy Counter for Board Limit -#define NVML_FI_DEV_PERF_POLICY_LOW_UTILIZATION 78 //!< Perf Policy Counter for Low GPU Utilization Policy -#define NVML_FI_DEV_PERF_POLICY_RELIABILITY 79 //!< Perf Policy Counter for Reliability Policy -#define NVML_FI_DEV_PERF_POLICY_TOTAL_APP_CLOCKS 80 //!< Perf Policy Counter for Total App Clock Policy -#define NVML_FI_DEV_PERF_POLICY_TOTAL_BASE_CLOCKS 81 //!< Perf Policy Counter for Total Base Clocks Policy - -/* Memory temperatures */ -#define NVML_FI_DEV_MEMORY_TEMP 82 //!< Memory temperature for the device - -/* Energy Counter */ -#define NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION 83 //!< Total energy consumption for the GPU in mJ since the driver was last reloaded - -/* NVLink Speed */ -#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L0 84 //!< NVLink Speed in MBps for Link 0 -#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L1 85 //!< NVLink Speed in MBps for Link 1 -#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L2 86 //!< NVLink Speed in MBps for Link 2 -#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L3 87 //!< NVLink Speed in MBps for Link 3 -#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L4 88 //!< NVLink Speed in MBps for Link 4 -#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L5 89 //!< NVLink Speed in MBps for Link 5 -#define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90 //!< Common NVLink Speed in MBps for active links - -#define NVML_FI_DEV_NVLINK_LINK_COUNT 91 //!< Number of NVLinks present on the device - -#define NVML_FI_DEV_RETIRED_PENDING_SBE 92 //!< If any pages are pending retirement due to SBE. 1=yes. 0=no. -#define NVML_FI_DEV_RETIRED_PENDING_DBE 93 //!< If any pages are pending retirement due to DBE. 1=yes. 0=no. 
- -#define NVML_FI_DEV_PCIE_REPLAY_COUNTER 94 //!< PCIe replay counter -#define NVML_FI_DEV_PCIE_REPLAY_ROLLOVER_COUNTER 95 //!< PCIe replay rollover counter - -/* NvLink Flit Error Counters */ -#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 96 //!< NVLink flow control CRC Error Counter for Lane 6 -#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 97 //!< NVLink flow control CRC Error Counter for Lane 7 -#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 98 //!< NVLink flow control CRC Error Counter for Lane 8 -#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 99 //!< NVLink flow control CRC Error Counter for Lane 9 -#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 100 //!< NVLink flow control CRC Error Counter for Lane 10 -#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 101 //!< NVLink flow control CRC Error Counter for Lane 11 - -/* NvLink CRC Data Error Counters */ -#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 102 //!< NVLink data CRC Error Counter for Lane 6 -#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 103 //!< NVLink data CRC Error Counter for Lane 7 -#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 104 //!< NVLink data CRC Error Counter for Lane 8 -#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 105 //!< NVLink data CRC Error Counter for Lane 9 -#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 106 //!< NVLink data CRC Error Counter for Lane 10 -#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 107 //!< NVLink data CRC Error Counter for Lane 11 - -/* NvLink Replay Error Counters */ -#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 108 //!< NVLink Replay Error Counter for Lane 6 -#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 109 //!< NVLink Replay Error Counter for Lane 7 -#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 110 //!< NVLink Replay Error Counter for Lane 8 -#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 111 //!< NVLink Replay Error Counter for Lane 9 -#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 112 //!< 
NVLink Replay Error Counter for Lane 10 -#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 113 //!< NVLink Replay Error Counter for Lane 11 - -/* NvLink Recovery Error Counters */ -#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 114 //!< NVLink Recovery Error Counter for Lane 6 -#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 115 //!< NVLink Recovery Error Counter for Lane 7 -#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 116 //!< NVLink Recovery Error Counter for Lane 8 -#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 117 //!< NVLink Recovery Error Counter for Lane 9 -#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 118 //!< NVLink Recovery Error Counter for Lane 10 -#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 119 //!< NVLink Recovery Error Counter for Lane 11 - -/* NvLink Bandwidth Counters */ -/* - * NVML_FI_DEV_NVLINK_BANDWIDTH_* field values are now deprecated. - * Please use the following field values instead: - * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX - * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX - * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX - * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX - */ -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L6 120 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 6 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L7 121 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 7 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L8 122 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 8 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L9 123 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 9 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L10 124 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 10 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L11 125 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 11 - -/* NvLink Bandwidth Counters */ -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L6 126 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 6 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L7 127 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 7 
-#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L8 128 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 8 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L9 129 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 9 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L10 130 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 10 -#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L11 131 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 11 - -/* NVLink Speed */ -#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L6 132 //!< NVLink Speed in MBps for Link 6 -#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L7 133 //!< NVLink Speed in MBps for Link 7 -#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L8 134 //!< NVLink Speed in MBps for Link 8 -#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L9 135 //!< NVLink Speed in MBps for Link 9 -#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L10 136 //!< NVLink Speed in MBps for Link 10 -#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L11 137 //!< NVLink Speed in MBps for Link 11 - -/** - * NVLink throughput counters field values - * - * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t. - * A scopeId of UINT_MAX returns aggregate value summed up across all links - * for the specified counter type in fieldId. - */ -#define NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX 138 //!< NVLink TX Data throughput in KiB -#define NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX 139 //!< NVLink RX Data throughput in KiB -#define NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX 140 //!< NVLink TX Data + protocol overhead in KiB -#define NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX 141 //!< NVLink RX Data + protocol overhead in KiB - -/* Row Remapper */ -#define NVML_FI_DEV_REMAPPED_COR 142 //!< Number of remapped rows due to correctable errors -#define NVML_FI_DEV_REMAPPED_UNC 143 //!< Number of remapped rows due to uncorrectable errors -#define NVML_FI_DEV_REMAPPED_PENDING 144 //!< If any rows are pending remapping. 
1=yes 0=no -#define NVML_FI_DEV_REMAPPED_FAILURE 145 //!< If any rows failed to be remapped 1=yes 0=no - -#define NVML_FI_MAX 146 //!< One greater than the largest field ID defined above - -/** - * Information for a Field Value Sample - */ -typedef struct nvmlFieldValue_st -{ - unsigned int fieldId; //!< ID of the NVML field to retrieve. This must be set before any call that uses this struct. See the constants starting with NVML_FI_ above. - unsigned int scopeId; //!< Scope ID can represent data used by NVML depending on fieldId's context. For example, for NVLink throughput counter data, scopeId can represent linkId. - long long timestamp; //!< CPU Timestamp of this value in microseconds since 1970 - long long latencyUsec; //!< How long this field value took to update (in usec) within NVML. This may be averaged across several fields that are serviced by the same driver call. - nvmlValueType_t valueType; //!< Type of the value stored in value - nvmlReturn_t nvmlReturn; //!< Return code for retrieving this value. This must be checked before looking at value, as value is undefined if nvmlReturn != NVML_SUCCESS - nvmlValue_t value; //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS -} nvmlFieldValue_t; - - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlUnitStructs Unit Structs - * @{ - */ -/***************************************************************************************************/ - -typedef struct nvmlUnit_st* nvmlUnit_t; - -/** - * Description of HWBC entry - */ -typedef struct nvmlHwbcEntry_st -{ - unsigned int hwbcId; - char firmwareVersion[32]; -} nvmlHwbcEntry_t; - -/** - * Fan state enum. - */ -typedef enum nvmlFanState_enum -{ - NVML_FAN_NORMAL = 0, //!< Fan is working properly - NVML_FAN_FAILED = 1 //!< Fan has failed -} nvmlFanState_t; - -/** - * Led color enum. 
- */ -typedef enum nvmlLedColor_enum -{ - NVML_LED_COLOR_GREEN = 0, //!< GREEN, indicates good health - NVML_LED_COLOR_AMBER = 1 //!< AMBER, indicates problem -} nvmlLedColor_t; - - -/** - * LED states for an S-class unit. - */ -typedef struct nvmlLedState_st -{ - char cause[256]; //!< If amber, a text description of the cause - nvmlLedColor_t color; //!< GREEN or AMBER -} nvmlLedState_t; - -/** - * Static S-class unit info. - */ -typedef struct nvmlUnitInfo_st -{ - char name[96]; //!< Product name - char id[96]; //!< Product identifier - char serial[96]; //!< Product serial number - char firmwareVersion[96]; //!< Firmware version -} nvmlUnitInfo_t; - -/** - * Power usage information for an S-class unit. - * The power supply state is a human readable string that equals "Normal" or contains - * a combination of "Abnormal" plus one or more of the following: - * - * - High voltage - * - Fan failure - * - Heatsink temperature - * - Current limit - * - Voltage below UV alarm threshold - * - Low-voltage - * - SI2C remote off command - * - MOD_DISABLE input - * - Short pin transition -*/ -typedef struct nvmlPSUInfo_st -{ - char state[256]; //!< The power supply state - unsigned int current; //!< PSU current (A) - unsigned int voltage; //!< PSU voltage (V) - unsigned int power; //!< PSU power draw (W) -} nvmlPSUInfo_t; - -/** - * Fan speed reading for a single fan in an S-class unit. - */ -typedef struct nvmlUnitFanInfo_st -{ - unsigned int speed; //!< Fan speed (RPM) - nvmlFanState_t state; //!< Flag that indicates whether fan is working properly -} nvmlUnitFanInfo_t; - -/** - * Fan speed readings for an entire S-class unit. 
- */ -typedef struct nvmlUnitFanSpeeds_st -{ - nvmlUnitFanInfo_t fans[24]; //!< Fan speed data for each fan - unsigned int count; //!< Number of fans in unit -} nvmlUnitFanSpeeds_t; - -/** @} */ - -/***************************************************************************************************/ -/** @addtogroup nvmlEvents - * @{ - */ -/***************************************************************************************************/ - -/** - * Handle to an event set - */ -typedef struct nvmlEventSet_st* nvmlEventSet_t; - -/** @defgroup nvmlEventType Event Types - * @{ - * Event Types which user can be notified about. - * See description of particular functions for details. - * - * See \ref nvmlDeviceRegisterEvents and \ref nvmlDeviceGetSupportedEventTypes to check which devices - * support each event. - * - * Types can be combined with bitwise or operator '|' when passed to \ref nvmlDeviceRegisterEvents - */ -//! Event about single bit ECC errors -/** - * \note A corrected texture memory error is not an ECC error, so it does not generate a single bit event - */ -#define nvmlEventTypeSingleBitEccError 0x0000000000000001LL - -//! Event about double bit ECC errors -/** - * \note An uncorrected texture memory error is not an ECC error, so it does not generate a double bit event - */ -#define nvmlEventTypeDoubleBitEccError 0x0000000000000002LL - -//! Event about PState changes -/** - * \note On Fermi architecture PState changes are also an indicator that GPU is throttling down due to - * no work being executed on the GPU, power capping or thermal capping. In a typical situation, - * Fermi-based GPU should stay in P0 for the duration of the execution of the compute process. - */ -#define nvmlEventTypePState 0x0000000000000004LL - -//! Event that Xid critical error occurred -#define nvmlEventTypeXidCriticalError 0x0000000000000008LL - -//! Event about clock changes -/** - * Kepler only - */ -#define nvmlEventTypeClock 0x0000000000000010LL - -//! 
Event about AC/Battery power source changes -#define nvmlEventTypePowerSourceChange 0x0000000000000080LL - -//! Event about MIG configuration changes -#define nvmlEventMigConfigChange 0x0000000000000100LL - -//! Mask with no events -#define nvmlEventTypeNone 0x0000000000000000LL - -//! Mask of all events -#define nvmlEventTypeAll (nvmlEventTypeNone \ - | nvmlEventTypeSingleBitEccError \ - | nvmlEventTypeDoubleBitEccError \ - | nvmlEventTypePState \ - | nvmlEventTypeClock \ - | nvmlEventTypeXidCriticalError \ - | nvmlEventTypePowerSourceChange \ - | nvmlEventMigConfigChange \ - ) -/** @} */ - -/** - * Information about occurred event - */ -typedef struct nvmlEventData_st -{ - nvmlDevice_t device; //!< Specific device where the event occurred - unsigned long long eventType; //!< Information about what specific event occurred - unsigned long long eventData; //!< Stores XID error for the device in the event of nvmlEventTypeXidCriticalError, - // eventData is 0 for any other event. eventData is set as 999 for unknown xid error. - unsigned int gpuInstanceId; //!< If MIG is enabled and nvmlEventTypeXidCriticalError event is attributable to a GPU - // instance, stores a valid GPU instance ID. gpuInstanceId is set to 0xFFFFFFFF - // otherwise. - unsigned int computeInstanceId; //!< If MIG is enabled and nvmlEventTypeXidCriticalError event is attributable to a - // compute instance, stores a valid compute instance ID. computeInstanceId is set to - // 0xFFFFFFFF otherwise. 
-} nvmlEventData_t; - -/** @} */ - -/***************************************************************************************************/ -/** @addtogroup nvmlClocksThrottleReasons - * @{ - */ -/***************************************************************************************************/ - -/** Nothing is running on the GPU and the clocks are dropping to Idle state - * \note This limiter may be removed in a later release - */ -#define nvmlClocksThrottleReasonGpuIdle 0x0000000000000001LL - -/** GPU clocks are limited by current setting of applications clocks - * - * @see nvmlDeviceSetApplicationsClocks - * @see nvmlDeviceGetApplicationsClock - */ -#define nvmlClocksThrottleReasonApplicationsClocksSetting 0x0000000000000002LL - -/** - * @deprecated Renamed to \ref nvmlClocksThrottleReasonApplicationsClocksSetting - * as the name describes the situation more accurately. - */ -#define nvmlClocksThrottleReasonUserDefinedClocks nvmlClocksThrottleReasonApplicationsClocksSetting - -/** SW Power Scaling algorithm is reducing the clocks below requested clocks - * - * @see nvmlDeviceGetPowerUsage - * @see nvmlDeviceSetPowerManagementLimit - * @see nvmlDeviceGetPowerManagementLimit - */ -#define nvmlClocksThrottleReasonSwPowerCap 0x0000000000000004LL - -/** HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged - * - * This is an indicator of: - * - temperature being too high - * - External Power Brake Assertion is triggered (e.g. by the system power supply) - * - Power draw is too high and Fast Trigger protection is reducing the clocks - * - May be also reported during PState or clock change - * - This behavior may be removed in a later release. 
- * - * @see nvmlDeviceGetTemperature - * @see nvmlDeviceGetTemperatureThreshold - * @see nvmlDeviceGetPowerUsage - */ -#define nvmlClocksThrottleReasonHwSlowdown 0x0000000000000008LL - -/** Sync Boost - * - * This GPU has been added to a Sync boost group with nvidia-smi or DCGM in - * order to maximize performance per watt. All GPUs in the sync boost group - * will boost to the minimum possible clocks across the entire group. Look at - * the throttle reasons for other GPUs in the system to see why those GPUs are - * holding this one at lower clocks. - * - */ -#define nvmlClocksThrottleReasonSyncBoost 0x0000000000000010LL - -/** SW Thermal Slowdown - * - * This is an indicator of one or more of the following: - * - Current GPU temperature above the GPU Max Operating Temperature - * - Current memory temperature above the Memory Max Operating Temperature - * - */ -#define nvmlClocksThrottleReasonSwThermalSlowdown 0x0000000000000020LL - -/** HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged - * - * This is an indicator of: - * - temperature being too high - * - * @see nvmlDeviceGetTemperature - * @see nvmlDeviceGetTemperatureThreshold - * @see nvmlDeviceGetPowerUsage - */ -#define nvmlClocksThrottleReasonHwThermalSlowdown 0x0000000000000040LL - -/** HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged - * - * This is an indicator of: - * - External Power Brake Assertion being triggered (e.g. by the system power supply) - * - * @see nvmlDeviceGetTemperature - * @see nvmlDeviceGetTemperatureThreshold - * @see nvmlDeviceGetPowerUsage - */ -#define nvmlClocksThrottleReasonHwPowerBrakeSlowdown 0x0000000000000080LL - -/** GPU clocks are limited by current setting of Display clocks - * - * @see bug 1997531 - */ -#define nvmlClocksThrottleReasonDisplayClockSetting 0x0000000000000100LL - -/** Bit mask representing no clocks throttling - * - * Clocks are as high as possible. 
- * */ -#define nvmlClocksThrottleReasonNone 0x0000000000000000LL - -/** Bit mask representing all supported clocks throttling reasons - * New reasons might be added to this list in the future - */ -#define nvmlClocksThrottleReasonAll (nvmlClocksThrottleReasonNone \ - | nvmlClocksThrottleReasonGpuIdle \ - | nvmlClocksThrottleReasonApplicationsClocksSetting \ - | nvmlClocksThrottleReasonSwPowerCap \ - | nvmlClocksThrottleReasonHwSlowdown \ - | nvmlClocksThrottleReasonSyncBoost \ - | nvmlClocksThrottleReasonSwThermalSlowdown \ - | nvmlClocksThrottleReasonHwThermalSlowdown \ - | nvmlClocksThrottleReasonHwPowerBrakeSlowdown \ - | nvmlClocksThrottleReasonDisplayClockSetting \ -) -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlAccountingStats Accounting Statistics - * @{ - * - * Set of APIs designed to provide per process information about usage of GPU. - * - * @note All accounting statistics and accounting mode live in nvidia driver and reset - * to default (Disabled) when driver unloads. - * It is advised to run with persistence mode enabled. - * - * @note Enabling accounting mode has no negative impact on the GPU performance. - */ -/***************************************************************************************************/ - -/** - * Describes accounting statistics of a process. - */ -typedef struct nvmlAccountingStats_st { - unsigned int gpuUtilization; //!< Percent of time over the process's lifetime during which one or more kernels was executing on the GPU. - //! Utilization stats just like returned by \ref nvmlDeviceGetUtilizationRates but for the life time of a - //! process (not just the last sample period). - //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlDeviceGetUtilizationRates is not supported - - unsigned int memoryUtilization; //!< Percent of time over the process's lifetime during which global (device) memory was being read or written. - //! 
Set to NVML_VALUE_NOT_AVAILABLE if nvmlDeviceGetUtilizationRates is not supported - - unsigned long long maxMemoryUsage; //!< Maximum total memory in bytes that was ever allocated by the process. - //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlProcessInfo_t->usedGpuMemory is not supported - - - unsigned long long time; //!< Amount of time in ms during which the compute context was active. The time is reported as 0 if - //!< the process is not terminated - - unsigned long long startTime; //!< CPU Timestamp in usec representing start time for the process - - unsigned int isRunning; //!< Flag to represent if the process is running (1 for running, 0 for terminated) - - unsigned int reserved[5]; //!< Reserved for future use -} nvmlAccountingStats_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlEncoderStructs Encoder Structs - * @{ - */ -/***************************************************************************************************/ - -/** - * Represents type of encoder for capacity can be queried - */ -typedef enum nvmlEncoderQueryType_enum -{ - NVML_ENCODER_QUERY_H264 = 0, //!< H264 encoder - NVML_ENCODER_QUERY_HEVC = 1, //!< HEVC encoder -}nvmlEncoderType_t; - -/** - * Structure to hold encoder session data - */ -typedef struct nvmlEncoderSessionInfo_st -{ - unsigned int sessionId; //!< Unique session ID - unsigned int pid; //!< Owning process ID - nvmlVgpuInstance_t vgpuInstance; //!< Owning vGPU instance ID (only valid on vGPU hosts, otherwise zero) - nvmlEncoderType_t codecType; //!< Video encoder type - unsigned int hResolution; //!< Current encode horizontal resolution - unsigned int vResolution; //!< Current encode vertical resolution - unsigned int averageFps; //!< Moving average encode frames per second - unsigned int averageLatency; //!< Moving average encode latency in microseconds -}nvmlEncoderSessionInfo_t; - -/** @} */ - 
-/***************************************************************************************************/ -/** @defgroup nvmlFBCStructs Frame Buffer Capture Structures -* @{ -*/ -/***************************************************************************************************/ - -/** - * Represents frame buffer capture session type - */ -typedef enum nvmlFBCSessionType_enum -{ - NVML_FBC_SESSION_TYPE_UNKNOWN = 0, //!< Unknwon - NVML_FBC_SESSION_TYPE_TOSYS, //!< ToSys - NVML_FBC_SESSION_TYPE_CUDA, //!< Cuda - NVML_FBC_SESSION_TYPE_VID, //!< Vid - NVML_FBC_SESSION_TYPE_HWENC, //!< HEnc -} nvmlFBCSessionType_t; - -/** - * Structure to hold frame buffer capture sessions stats - */ -typedef struct nvmlFBCStats_st -{ - unsigned int sessionsCount; //!< Total no of sessions - unsigned int averageFPS; //!< Moving average new frames captured per second - unsigned int averageLatency; //!< Moving average new frame capture latency in microseconds -} nvmlFBCStats_t; - -#define NVML_NVFBC_SESSION_FLAG_DIFFMAP_ENABLED 0x00000001 //!< Bit specifying differential map state. -#define NVML_NVFBC_SESSION_FLAG_CLASSIFICATIONMAP_ENABLED 0x00000002 //!< Bit specifying classification map state. -#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_NO_WAIT 0x00000004 //!< Bit specifying if capture was requested as non-blocking call. -#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_INFINITE 0x00000008 //!< Bit specifying if capture was requested as blocking call. -#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_TIMEOUT 0x00000010 //!< Bit specifying if capture was requested as blocking call with timeout period. 
- -/** - * Structure to hold FBC session data - */ -typedef struct nvmlFBCSessionInfo_st -{ - unsigned int sessionId; //!< Unique session ID - unsigned int pid; //!< Owning process ID - nvmlVgpuInstance_t vgpuInstance; //!< Owning vGPU instance ID (only valid on vGPU hosts, otherwise zero) - unsigned int displayOrdinal; //!< Display identifier - nvmlFBCSessionType_t sessionType; //!< Type of frame buffer capture session - unsigned int sessionFlags; //!< Session flags (one or more of NVML_NVFBC_SESSION_FLAG_XXX). - unsigned int hMaxResolution; //!< Max horizontal resolution supported by the capture session - unsigned int vMaxResolution; //!< Max vertical resolution supported by the capture session - unsigned int hResolution; //!< Horizontal resolution requested by caller in capture call - unsigned int vResolution; //!< Vertical resolution requested by caller in capture call - unsigned int averageFPS; //!< Moving average new frames captured per second - unsigned int averageLatency; //!< Moving average new frame capture latency in microseconds -} nvmlFBCSessionInfo_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlDrainDefs definitions related to the drain state - * @{ - */ -/***************************************************************************************************/ - -/** - * Is the GPU device to be removed from the kernel by nvmlDeviceRemoveGpu() - */ -typedef enum nvmlDetachGpuState_enum -{ - NVML_DETACH_GPU_KEEP = 0, - NVML_DETACH_GPU_REMOVE, -} nvmlDetachGpuState_t; - -/** - * Parent bridge PCIe link state requested by nvmlDeviceRemoveGpu() - */ -typedef enum nvmlPcieLinkState_enum -{ - NVML_PCIE_LINK_KEEP = 0, - NVML_PCIE_LINK_SHUT_DOWN, -} nvmlPcieLinkState_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlInitializationAndCleanup Initialization and Cleanup - * This chapter 
describes the methods that handle NVML initialization and cleanup. - * It is the user's responsibility to call \ref nvmlInit_v2() before calling any other methods, and - * nvmlShutdown() once NVML is no longer being used. - * @{ - */ -/***************************************************************************************************/ - -#define NVML_INIT_FLAG_NO_GPUS 1 //!< Don't fail nvmlInit() when no GPUs are found -#define NVML_INIT_FLAG_NO_ATTACH 2 //!< Don't attach GPUs - -/** - * Initialize NVML, but don't initialize any GPUs yet. - * - * \note nvmlInit_v3 introduces a "flags" argument, that allows passing boolean values - * modifying the behaviour of nvmlInit(). - * \note In NVML 5.319 new nvmlInit_v2 has replaced nvmlInit"_v1" (default in NVML 4.304 and older) that - * did initialize all GPU devices in the system. - * - * This allows NVML to communicate with a GPU - * when other GPUs in the system are unstable or in a bad state. When using this API, GPUs are - * discovered and initialized in nvmlDeviceGetHandleBy* functions instead. - * - * \note To contrast nvmlInit_v2 with nvmlInit"_v1", NVML 4.304 nvmlInit"_v1" will fail when any detected GPU is in - * a bad or unstable state. - * - * For all products. - * - * This method, should be called once before invoking any other methods in the library. - * A reference count of the number of initializations is maintained. Shutdown only occurs - * when the reference count reaches zero. - * - * @return - * - \ref NVML_SUCCESS if NVML has been properly initialized - * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running - * - \ref NVML_ERROR_NO_PERMISSION if NVML does not have permission to talk to the driver - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlInit_v2(void); - -/** - * nvmlInitWithFlags is a variant of nvmlInit(), that allows passing a set of boolean values - * modifying the behaviour of nvmlInit(). 
- * Other than the "flags" parameter it is completely similar to \ref nvmlInit_v2. - * - * For all products. - * - * @param flags behaviour modifier flags - * - * @return - * - \ref NVML_SUCCESS if NVML has been properly initialized - * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running - * - \ref NVML_ERROR_NO_PERMISSION if NVML does not have permission to talk to the driver - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlInitWithFlags(unsigned int flags); - -/** - * Shut down NVML by releasing all GPU resources previously allocated with \ref nvmlInit_v2(). - * - * For all products. - * - * This method should be called after NVML work is done, once for each call to \ref nvmlInit_v2() - * A reference count of the number of initializations is maintained. Shutdown only occurs - * when the reference count reaches zero. For backwards compatibility, no error is reported if - * nvmlShutdown() is called more times than nvmlInit(). - * - * @return - * - \ref NVML_SUCCESS if NVML has been properly shut down - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlShutdown(void); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlErrorReporting Error reporting - * This chapter describes helper functions for error reporting routines. - * @{ - */ -/***************************************************************************************************/ - -/** - * Helper method for converting NVML error codes into readable strings. - * - * For all products. - * - * @param result NVML error code to convert - * - * @return String representation of the error. 
- * - */ -const DECLDIR char* nvmlErrorString(nvmlReturn_t result); -/** @} */ - - -/***************************************************************************************************/ -/** @defgroup nvmlConstants Constants - * @{ - */ -/***************************************************************************************************/ - -/** - * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetInforomVersion and \ref nvmlDeviceGetInforomImageVersion - */ -#define NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE 16 - -/** - * Buffer size guaranteed to be large enough for storing GPU identifiers. - */ -#define NVML_DEVICE_UUID_BUFFER_SIZE 80 - -/** - * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetUUID - */ -#define NVML_DEVICE_UUID_V2_BUFFER_SIZE 96 - -/** - * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetBoardPartNumber - */ -#define NVML_DEVICE_PART_NUMBER_BUFFER_SIZE 80 - -/** - * Buffer size guaranteed to be large enough for \ref nvmlSystemGetDriverVersion - */ -#define NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE 80 - -/** - * Buffer size guaranteed to be large enough for \ref nvmlSystemGetNVMLVersion - */ -#define NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE 80 - -/** - * Buffer size guaranteed to be large enough for storing GPU device names. - */ -#define NVML_DEVICE_NAME_BUFFER_SIZE 64 - -/** - * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetName - */ -#define NVML_DEVICE_NAME_V2_BUFFER_SIZE 96 - -/** - * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetSerial - */ -#define NVML_DEVICE_SERIAL_BUFFER_SIZE 30 - -/** - * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetVbiosVersion - */ -#define NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE 32 - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlSystemQueries System Queries - * This chapter describes the queries that NVML can perform against the local system. 
These queries - * are not device-specific. - * @{ - */ -/***************************************************************************************************/ - -/** - * Retrieves the version of the system's graphics driver. - * - * For all products. - * - * The version identifier is an alphanumeric string. It will not exceed 80 characters in length - * (including the NULL terminator). See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. - * - * @param version Reference in which to return the version identifier - * @param length The maximum allowed length of the string returned in \a version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - */ -nvmlReturn_t DECLDIR nvmlSystemGetDriverVersion(char *version, unsigned int length); - -/** - * Retrieves the version of the NVML library. - * - * For all products. - * - * The version identifier is an alphanumeric string. It will not exceed 80 characters in length - * (including the NULL terminator). See \ref nvmlConstants::NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE. - * - * @param version Reference in which to return the version identifier - * @param length The maximum allowed length of the string returned in \a version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - */ -nvmlReturn_t DECLDIR nvmlSystemGetNVMLVersion(char *version, unsigned int length); - -/** - * Retrieves the version of the CUDA driver. - * - * For all products. - * - * The CUDA driver version returned will be retreived from the currently installed version of CUDA. - * If the cuda library is not found, this function will return a known supported version number. 
- * - * @param cudaDriverVersion Reference in which to return the version identifier - * - * @return - * - \ref NVML_SUCCESS if \a cudaDriverVersion has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cudaDriverVersion is NULL - */ -nvmlReturn_t DECLDIR nvmlSystemGetCudaDriverVersion(int *cudaDriverVersion); - -/** - * Retrieves the version of the CUDA driver from the shared library. - * - * For all products. - * - * The returned CUDA driver version by calling cuDriverGetVersion() - * - * @param cudaDriverVersion Reference in which to return the version identifier - * - * @return - * - \ref NVML_SUCCESS if \a cudaDriverVersion has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cudaDriverVersion is NULL - * - \ref NVML_ERROR_LIBRARY_NOT_FOUND if \a libcuda.so.1 or libcuda.dll is not found - * - \ref NVML_ERROR_FUNCTION_NOT_FOUND if \a cuDriverGetVersion() is not found in the shared library - */ -nvmlReturn_t DECLDIR nvmlSystemGetCudaDriverVersion_v2(int *cudaDriverVersion); - -/** - * Macros for converting the CUDA driver version number to Major and Minor version numbers. - */ -#define NVML_CUDA_DRIVER_VERSION_MAJOR(v) ((v)/1000) -#define NVML_CUDA_DRIVER_VERSION_MINOR(v) (((v)%1000)/10) - -/** - * Gets name of the process with provided process id - * - * For all products. - * - * Returned process name is cropped to provided length. - * name string is encoded in ANSI. - * - * @param pid The identifier of the process - * @param name Reference in which to return the process name - * @param length The maximum allowed length of the string returned in \a name - * - * @return - * - \ref NVML_SUCCESS if \a name has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a name is NULL or \a length is 0. 
- * - \ref NVML_ERROR_NOT_FOUND if process doesn't exists - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlSystemGetProcessName(unsigned int pid, char *name, unsigned int length); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlUnitQueries Unit Queries - * This chapter describes that queries that NVML can perform against each unit. For S-class systems only. - * In each case the device is identified with an nvmlUnit_t handle. This handle is obtained by - * calling \ref nvmlUnitGetHandleByIndex(). - * @{ - */ -/***************************************************************************************************/ - - /** - * Retrieves the number of units in the system. - * - * For S-class products. - * - * @param unitCount Reference in which to return the number of units - * - * @return - * - \ref NVML_SUCCESS if \a unitCount has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unitCount is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetCount(unsigned int *unitCount); - -/** - * Acquire the handle for a particular unit, based on its index. - * - * For S-class products. - * - * Valid indices are derived from the \a unitCount returned by \ref nvmlUnitGetCount(). - * For example, if \a unitCount is 2 the valid indices are 0 and 1, corresponding to UNIT 0 and UNIT 1. - * - * The order in which NVML enumerates units has no guarantees of consistency between reboots. 
- * - * @param index The index of the target unit, >= 0 and < \a unitCount - * @param unit Reference in which to return the unit handle - * - * @return - * - \ref NVML_SUCCESS if \a unit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a unit is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t *unit); - -/** - * Retrieves the static information associated with a unit. - * - * For S-class products. - * - * See \ref nvmlUnitInfo_t for details on available unit info. - * - * @param unit The identifier of the target unit - * @param info Reference in which to return the unit information - * - * @return - * - \ref NVML_SUCCESS if \a info has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a info is NULL - */ -nvmlReturn_t DECLDIR nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t *info); - -/** - * Retrieves the LED state associated with this unit. - * - * For S-class products. - * - * See \ref nvmlLedState_t for details on allowed states. - * - * @param unit The identifier of the target unit - * @param state Reference in which to return the current LED state - * - * @return - * - \ref NVML_SUCCESS if \a state has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a state is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlUnitSetLedState() - */ -nvmlReturn_t DECLDIR nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t *state); - -/** - * Retrieves the PSU stats for the unit. - * - * For S-class products. 
- * - * See \ref nvmlPSUInfo_t for details on available PSU info. - * - * @param unit The identifier of the target unit - * @param psu Reference in which to return the PSU information - * - * @return - * - \ref NVML_SUCCESS if \a psu has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a psu is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t *psu); - -/** - * Retrieves the temperature readings for the unit, in degrees C. - * - * For S-class products. - * - * Depending on the product, readings may be available for intake (type=0), - * exhaust (type=1) and board (type=2). - * - * @param unit The identifier of the target unit - * @param type The type of reading to take - * @param temp Reference in which to return the intake temperature - * - * @return - * - \ref NVML_SUCCESS if \a temp has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a type is invalid or \a temp is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned int *temp); - -/** - * Retrieves the fan speed readings for the unit. - * - * For S-class products. - * - * See \ref nvmlUnitFanSpeeds_t for details on available fan speed info. 
- * - * @param unit The identifier of the target unit - * @param fanSpeeds Reference in which to return the fan speed information - * - * @return - * - \ref NVML_SUCCESS if \a fanSpeeds has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a fanSpeeds is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t *fanSpeeds); - -/** - * Retrieves the set of GPU devices that are attached to the specified unit. - * - * For S-class products. - * - * The \a deviceCount argument is expected to be set to the size of the input \a devices array. - * - * @param unit The identifier of the target unit - * @param deviceCount Reference in which to provide the \a devices array size, and - * to return the number of attached GPU devices - * @param devices Reference in which to return the references to the attached GPU devices - * - * @return - * - \ref NVML_SUCCESS if \a deviceCount and \a devices have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a deviceCount indicates that the \a devices array is too small - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid, either of \a deviceCount or \a devices is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCount, nvmlDevice_t *devices); - -/** - * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system. - * - * For S-class products. - * - * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array. - * The HIC must be connected to an S-class system for it to be reported by this function. 
- * - * @param hwbcCount Size of hwbcEntries array - * @param hwbcEntries Array holding information about hwbc - * - * @return - * - \ref NVML_SUCCESS if \a hwbcCount and \a hwbcEntries have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if either \a hwbcCount or \a hwbcEntries is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small - */ -nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries); -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlDeviceQueries Device Queries - * This chapter describes that queries that NVML can perform against each device. - * In each case the device is identified with an nvmlDevice_t handle. This handle is obtained by - * calling one of \ref nvmlDeviceGetHandleByIndex_v2(), \ref nvmlDeviceGetHandleBySerial(), - * \ref nvmlDeviceGetHandleByPciBusId_v2(). or \ref nvmlDeviceGetHandleByUUID(). - * @{ - */ -/***************************************************************************************************/ - - /** - * Retrieves the number of compute devices in the system. A compute device is a single GPU. - * - * For all products. - * - * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system - * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. - * Update your code to handle this error, or use NVML 4.304 or older nvml header file. - * For backward binary compatibility reasons _v1 version of the API is still present in the shared - * library. - * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. 
- * - * @param deviceCount Reference in which to return the number of accessible devices - * - * @return - * - \ref NVML_SUCCESS if \a deviceCount has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCount_v2(unsigned int *deviceCount); - -/** - * Get attributes (engine counts etc.) for the given NVML device handle. - * - * @note This API currently only supports MIG device handles. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * - * @param device NVML device handle - * @param attributes Device attributes - * - * @return - * - \ref NVML_SUCCESS if \a device attributes were successfully retrieved - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle is invalid - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAttributes_v2(nvmlDevice_t device, nvmlDeviceAttributes_t *attributes); - -/** - * Acquire the handle for a particular device, based on its index. - * - * For all products. - * - * Valid indices are derived from the \a accessibleDevices count returned by - * \ref nvmlDeviceGetCount_v2(). For example, if \a accessibleDevices is 2 the valid indices - * are 0 and 1, corresponding to GPU 0 and GPU 1. - * - * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it - * is recommended that devices be looked up by their PCI ids or UUID. See - * \ref nvmlDeviceGetHandleByUUID() and \ref nvmlDeviceGetHandleByPciBusId_v2(). - * - * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. 
- * - * Starting from NVML 5, this API causes NVML to initialize the target GPU - * NVML may initialize additional GPUs if: - * - The target GPU is an SLI slave - * - * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system - * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. - * Update your code to handle this error, or use NVML 4.304 or older nvml header file. - * For backward binary compatibility reasons _v1 version of the API is still present in the shared - * library. - * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. - * - * This means that nvmlDeviceGetHandleByIndex_v2 and _v1 can return different devices for the same index. - * If you don't touch macros that map old (_v1) versions to _v2 versions at the top of the file you don't - * need to worry about that. - * - * @param index The index of the target GPU, >= 0 and < \a accessibleDevices - * @param device Reference in which to return the device handle - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a device is NULL - * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device - * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetIndex - * @see nvmlDeviceGetCount - */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex_v2(unsigned int index, nvmlDevice_t *device); - -/** - * Acquire the handle for a particular device, based on its board 
serial number. - * - * For Fermi &tm; or newer fully supported devices. - * - * This number corresponds to the value printed directly on the board, and to the value returned by - * \ref nvmlDeviceGetSerial(). - * - * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor - * of \ref nvmlDeviceGetHandleByUUID. - * For dual GPU boards this function will return NVML_ERROR_INVALID_ARGUMENT. - * - * Starting from NVML 5, this API causes NVML to initialize the target GPU - * NVML may initialize additional GPUs as it searches for the target GPU - * - * @param serial The board serial number of the target GPU - * @param device Reference in which to return the device handle - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a serial is invalid, \a device is NULL or more than one - * device has the same serial (dual GPU boards) - * - \ref NVML_ERROR_NOT_FOUND if \a serial does not match a valid device on the system - * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables - * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs - * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetSerial - * @see nvmlDeviceGetHandleByUUID - */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device); - -/** - * Acquire the handle for a particular device, based on its globally unique immutable UUID associated with each device. - * - * For all products. 
- * - * @param uuid The UUID of the target GPU - * @param device Reference in which to return the device handle - * - * Starting from NVML 5, this API causes NVML to initialize the target GPU - * NVML may initialize additional GPUs as it searches for the target GPU - * - * This API does not currently support acquiring MIG device handles using MIG device UUIDs. - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a uuid is invalid or \a device is null - * - \ref NVML_ERROR_NOT_FOUND if \a uuid does not match a valid device on the system - * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables - * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs - * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetUUID - */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleByUUID(const char *uuid, nvmlDevice_t *device); - -/** - * Acquire the handle for a particular device, based on its PCI bus id. - * - * For all products. - * - * This value corresponds to the nvmlPciInfo_t::busId returned by \ref nvmlDeviceGetPciInfo_v3(). - * - * Starting from NVML 5, this API causes NVML to initialize the target GPU - * NVML may initialize additional GPUs if: - * - The target GPU is an SLI slave - * - * \note NVML 4.304 and older version of nvmlDeviceGetHandleByPciBusId"_v1" returns NVML_ERROR_NOT_FOUND - * instead of NVML_ERROR_NO_PERMISSION. 
- * - * @param pciBusId The PCI bus id of the target GPU - * @param device Reference in which to return the device handle - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciBusId is invalid or \a device is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a pciBusId does not match a valid device on the system - * - \ref NVML_ERROR_INSUFFICIENT_POWER if the attached device has improperly attached external power cables - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device - * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId_v2(const char *pciBusId, nvmlDevice_t *device); - -/** - * Retrieves the name of this device. - * - * For all products. - * - * The name is an alphanumeric string that denotes a particular product, e.g. Tesla &tm; C2070. It will not - * exceed 96 characters in length (including the NULL terminator). See \ref - * nvmlConstants::NVML_DEVICE_NAME_V2_BUFFER_SIZE. - * - * When used with MIG device handles the API returns MIG device names which can be used to identify devices - * based on their attributes. 
- * - * @param device The identifier of the target device - * @param name Reference in which to return the product name - * @param length The maximum allowed length of the string returned in \a name - * - * @return - * - \ref NVML_SUCCESS if \a name has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a name is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length); - -/** - * Retrieves the brand of this device. - * - * For all products. - * - * The type is a member of \ref nvmlBrandType_t defined above. - * - * @param device The identifier of the target device - * @param type Reference in which to return the product brand type - * - * @return - * - \ref NVML_SUCCESS if \a name has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a type is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t *type); - -/** - * Retrieves the NVML index of this device. - * - * For all products. - * - * Valid indices are derived from the \a accessibleDevices count returned by - * \ref nvmlDeviceGetCount_v2(). For example, if \a accessibleDevices is 2 the valid indices - * are 0 and 1, corresponding to GPU 0 and GPU 1. - * - * The order in which NVML enumerates devices has no guarantees of consistency between reboots. 
For that reason it - * is recommended that devices be looked up by their PCI ids or GPU UUID. See - * \ref nvmlDeviceGetHandleByPciBusId_v2() and \ref nvmlDeviceGetHandleByUUID(). - * - * When used with MIG device handles this API returns indices that can be - * passed to \ref nvmlDeviceGetMigDeviceHandleByIndex to retrieve an identical handle. - * MIG device indices are unique within a device. - * - * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. - * - * @param device The identifier of the target device - * @param index Reference in which to return the NVML index of the device - * - * @return - * - \ref NVML_SUCCESS if \a index has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a index is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetHandleByIndex() - * @see nvmlDeviceGetCount() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index); - -/** - * Retrieves the globally unique board serial number associated with this device's board. - * - * For all products with an inforom. - * - * The serial number is an alphanumeric string that will not exceed 30 characters (including the NULL terminator). - * This number matches the serial number tag that is physically attached to the board. See \ref - * nvmlConstants::NVML_DEVICE_SERIAL_BUFFER_SIZE. 
- * - * @param device The identifier of the target device - * @param serial Reference in which to return the board/module serial number - * @param length The maximum allowed length of the string returned in \a serial - * - * @return - * - \ref NVML_SUCCESS if \a serial has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a serial is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length); - - -/***************************************************************************************************/ - -/** @defgroup nvmlAffinity CPU and Memory Affinity - * This chapter describes NVML operations that are associated with CPU and memory - * affinity. - * @{ - */ -/***************************************************************************************************/ - -//! Scope of NUMA node for affinity queries -#define NVML_AFFINITY_SCOPE_NODE 0 -//! Scope of processor socket for affinity queries -#define NVML_AFFINITY_SCOPE_SOCKET 1 - -typedef unsigned int nvmlAffinityScope_t; - -/** - * Retrieves an array of unsigned ints (sized to nodeSetSize) of bitmasks with - * the ideal memory affinity within node or socket for the device. - * For example, if NUMA node 0, 1 are ideal within the socket for the device and nodeSetSize == 1, - * result[0] = 0x3 - * - * \note If requested scope is not applicable to the target topology, the API - * will fall back to reporting the memory affinity for the immediate non-I/O - * ancestor of the device. - * - * For Kepler &tm; or newer fully supported devices. 
- * Supported on Linux only. - * - * @param device The identifier of the target device - * @param nodeSetSize The size of the nodeSet array that is safe to access - * @param nodeSet Array reference in which to return a bitmask of NODEs, 64 NODEs per - * unsigned long on 64-bit machines, 32 on 32-bit machines - * @param scope Scope that change the default behavior - * - * @return - * - \ref NVML_SUCCESS if \a NUMA node Affinity has been filled - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, nodeSetSize == 0, nodeSet is NULL or scope is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ - -nvmlReturn_t DECLDIR nvmlDeviceGetMemoryAffinity(nvmlDevice_t device, unsigned int nodeSetSize, unsigned long *nodeSet, nvmlAffinityScope_t scope); - -/** - * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the - * ideal CPU affinity within node or socket for the device. - * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2, - * result[0] = 0x3, result[1] = 0x3 - * - * \note If requested scope is not applicable to the target topology, the API - * will fall back to reporting the CPU affinity for the immediate non-I/O - * ancestor of the device. - * - * For Kepler &tm; or newer fully supported devices. - * Supported on Linux only. 
- * - * @param device The identifier of the target device - * @param cpuSetSize The size of the cpuSet array that is safe to access - * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per - * unsigned long on 64-bit machines, 32 on 32-bit machines - * @param scope Scope that change the default behavior - * - * @return - * - \ref NVML_SUCCESS if \a cpuAffinity has been filled - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, cpuSet is NULL or sope is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ - -nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinityWithinScope(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet, nvmlAffinityScope_t scope); - -/** - * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the ideal CPU affinity for the device - * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2, - * result[0] = 0x3, result[1] = 0x3 - * This is equivalent to calling \ref nvmlDeviceGetCpuAffinityWithinScope with \ref NVML_AFFINITY_SCOPE_NODE. - * - * For Kepler &tm; or newer fully supported devices. - * Supported on Linux only. 
- * - * @param device The identifier of the target device - * @param cpuSetSize The size of the cpuSet array that is safe to access - * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per - * unsigned long on 64-bit machines, 32 on 32-bit machines - * - * @return - * - \ref NVML_SUCCESS if \a cpuAffinity has been filled - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, or cpuSet is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet); - -/** - * Sets the ideal affinity for the calling thread and device using the guidelines - * given in nvmlDeviceGetCpuAffinity(). Note, this is a change as of version 8.0. - * Older versions set the affinity for a calling process and all children. - * Currently supports up to 1024 processors. - * - * For Kepler &tm; or newer fully supported devices. - * Supported on Linux only. - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if the calling process has been successfully bound - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device); - -/** - * Clear all affinity bindings for the calling thread. 
Note, this is a change as of version - * 8.0 as older versions cleared the affinity for a calling process and all children. - * - * For Kepler &tm; or newer fully supported devices. - * Supported on Linux only. - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if the calling process has been successfully unbound - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device); - -/** - * Retrieve the common ancestor for two devices - * For all products. - * Supported on Linux only. - * - * @param device1 The identifier of the first device - * @param device2 The identifier of the second device - * @param pathInfo A \ref nvmlGpuTopologyLevel_t that gives the path type - * - * @return - * - \ref NVML_SUCCESS if \a pathInfo has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1, or \a device2 is invalid, or \a pathInfo is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature - * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery - */ - -/** @} */ -nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo); - -/** - * Retrieve the set of GPUs that are nearest to a given device at a specific interconnectivity level - * For all products. - * Supported on Linux only. - * - * @param device The identifier of the first device - * @param level The \ref nvmlGpuTopologyLevel_t level to search for other GPUs - * @param count When zero, is set to the number of matching GPUs such that \a deviceArray - * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count - * number of device handles. 
- * @param deviceArray An array of device handles for GPUs found at \a level - * - * @return - * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a level, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count - * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature - * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int *count, nvmlDevice_t *deviceArray); - -/** - * Retrieve the set of GPUs that have a CPU affinity with the given CPU number - * For all products. - * Supported on Linux only. - * - * @param cpuNumber The CPU number - * @param count When zero, is set to the number of matching GPUs such that \a deviceArray - * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count - * number of device handles. 
- * @param deviceArray An array of device handles for GPUs found with affinity to \a cpuNumber - * - * @return - * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count - * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature - * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery - */ -nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray); - -/** - * Retrieve the status for a given p2p capability index between a given pair of GPU - * - * @param device1 The first device - * @param device2 The second device - * @param p2pIndex p2p Capability Index being looked for between \a device1 and \a device2 - * @param p2pStatus Reference in which to return the status of the \a p2pIndex - * between \a device1 and \a device2 - * @return - * - \ref NVML_SUCCESS if \a p2pStatus has been populated - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 or \a p2pIndex is invalid or \a p2pStatus is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex,nvmlGpuP2PStatus_t *p2pStatus); - -/** - * Retrieves the globally unique immutable UUID associated with this device, as a 5 part hexadecimal string, - * that augments the immutable, board serial identifier. - * - * For all products. - * - * The UUID is a globally unique identifier. It is the only available identifier for pre-Fermi-architecture products. - * It does NOT correspond to any identifier printed on the board. It will not exceed 96 characters in length - * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_UUID_V2_BUFFER_SIZE. 
- * - * When used with MIG device handles the API returns globally unique UUIDs which can be used to identify MIG - * devices across both GPU and MIG devices. UUIDs are immutable for the lifetime of a MIG device. - * - * @param device The identifier of the target device - * @param uuid Reference in which to return the GPU UUID - * @param length The maximum allowed length of the string returned in \a uuid - * - * @return - * - \ref NVML_SUCCESS if \a uuid has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a uuid is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length); - -/** - * Retrieve the MDEV UUID of a vGPU instance. - * - * The MDEV UUID is a globally unique identifier of the mdev device assigned to the VM, and is returned as a 5-part hexadecimal string, - * not exceeding 80 characters in length (including the NULL terminator). - * MDEV UUID is displayed only on KVM platform. - * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. - * - * For Maxwell &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param mdevUuid Pointer to caller-supplied buffer to hold MDEV UUID - * @param size Size of buffer in bytes - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NOT_SUPPORTED on any hypervisor other than KVM - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mdevUuid is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMdevUUID(nvmlVgpuInstance_t vgpuInstance, char *mdevUuid, unsigned int size); - -/** - * Retrieves minor number for the device. The minor number for the device is such that the Nvidia device node file for - * each GPU will have the form /dev/nvidia[minor number]. - * - * For all products. - * Supported only for Linux - * - * @param device The identifier of the target device - * @param minorNumber Reference in which to return the minor number for the device - * @return - * - \ref NVML_SUCCESS if the minor number is successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minorNumber is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int *minorNumber); - -/** - * Retrieves the the device board part number which is programmed into the board's InfoROM - * - * For all products. 
- * - * @param device Identifier of the target device - * @param partNumber Reference to the buffer to return - * @param length Length of the buffer reference - * - * @return - * - \ref NVML_SUCCESS if \a partNumber has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NOT_SUPPORTED if the needed VBIOS fields have not been filled - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a serial is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, unsigned int length); - -/** - * Retrieves the version information for the device's infoROM object. - * - * For all products with an inforom. - * - * Fermi and higher parts have non-volatile on-board memory for persisting device info, such as aggregate - * ECC counts. The version of the data structures in this memory may change from time to time. It will not - * exceed 16 characters in length (including the NULL terminator). - * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. - * - * See \ref nvmlInforomObject_t for details on the available infoROM objects. 
- * - * @param device The identifier of the target device - * @param object The target infoROM object - * @param version Reference in which to return the infoROM version - * @param length The maximum allowed length of the string returned in \a version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetInforomImageVersion - */ -nvmlReturn_t DECLDIR nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_t object, char *version, unsigned int length); - -/** - * Retrieves the global infoROM image version - * - * For all products with an inforom. - * - * Image version just like VBIOS version uniquely describes the exact version of the infoROM flashed on the board - * in contrast to infoROM object version which is only an indicator of supported features. - * Version string will not exceed 16 characters in length (including the NULL terminator). - * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. 
- * - * @param device The identifier of the target device - * @param version Reference in which to return the infoROM image version - * @param length The maximum allowed length of the string returned in \a version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetInforomVersion - */ -nvmlReturn_t DECLDIR nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char *version, unsigned int length); - -/** - * Retrieves the checksum of the configuration stored in the device's infoROM. - * - * For all products with an inforom. - * - * Can be used to make sure that two GPUs have the exact same configuration. - * Current checksum takes into account configuration stored in PWR and ECC infoROM objects. - * Checksum can change between driver releases or when user changes configuration (e.g. 
disable/enable ECC) - * - * @param device The identifier of the target device - * @param checksum Reference in which to return the infoROM configuration checksum - * - * @return - * - \ref NVML_SUCCESS if \a checksum has been set - * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's checksum couldn't be retrieved due to infoROM corruption - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a checksum is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsigned int *checksum); - -/** - * Reads the infoROM from the flash and verifies the checksums. - * - * For all products with an inforom. - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if infoROM is not corrupted - * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's infoROM is corrupted - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device); - -/** - * Retrieves the display mode for the device. - * - * For all products. - * - * This method indicates whether a physical display (e.g. monitor) is currently connected to - * any of the device's connectors. - * - * See \ref nvmlEnableState_t for details on allowed modes. 
- * - * @param device The identifier of the target device - * @param display Reference in which to return the display mode - * - * @return - * - \ref NVML_SUCCESS if \a display has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a display is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t *display); - -/** - * Retrieves the display active state for the device. - * - * For all products. - * - * This method indicates whether a display is initialized on the device. - * For example whether X Server is attached to this device and has allocated memory for the screen. - * - * Display can be active even when no monitor is physically attached. - * - * See \ref nvmlEnableState_t for details on allowed modes. - * - * @param device The identifier of the target device - * @param isActive Reference in which to return the display active state - * - * @return - * - \ref NVML_SUCCESS if \a isActive has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isActive is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t *isActive); - -/** - * Retrieves the persistence mode associated with this device. - * - * For all products. - * For Linux only. 
- * - * When driver persistence mode is enabled the driver software state is not torn down when the last - * client disconnects. By default this feature is disabled. - * - * See \ref nvmlEnableState_t for details on allowed modes. - * - * @param device The identifier of the target device - * @param mode Reference in which to return the current driver persistence mode - * - * @return - * - \ref NVML_SUCCESS if \a mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetPersistenceMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t *mode); - -/** - * Retrieves the PCI attributes of this device. - * - * For all products. - * - * See \ref nvmlPciInfo_t for details on the available PCI info. - * - * @param device The identifier of the target device - * @param pci Reference in which to return the PCI info - * - * @return - * - \ref NVML_SUCCESS if \a pci has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v3(nvmlDevice_t device, nvmlPciInfo_t *pci); - -/** - * Retrieves the maximum PCIe link generation possible with this device and system - * - * I.E. for a generation 2 PCIe device attached to a generation 1 PCIe bus the max link generation this function will - * report is generation 1. 
- * - * For Fermi &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param maxLinkGen Reference in which to return the max PCIe link generation - * - * @return - * - \ref NVML_SUCCESS if \a maxLinkGen has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGen is null - * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen); - -/** - * Retrieves the maximum PCIe link width possible with this device and system - * - * I.E. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report - * a max link width of 8. - * - * For Fermi &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param maxLinkWidth Reference in which to return the max PCIe link generation - * - * @return - * - \ref NVML_SUCCESS if \a maxLinkWidth has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkWidth is null - * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth); - -/** - * Retrieves the current PCIe link generation - * - * For Fermi &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param currLinkGen Reference in which to return the current PCIe link generation - * - * @return - * - \ref NVML_SUCCESS if \a currLinkGen has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkGen is null - * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned int *currLinkGen); - -/** - * Retrieves the current PCIe link width - * - * For Fermi &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param currLinkWidth Reference in which to return the current PCIe link generation - * - * @return - * - \ref NVML_SUCCESS if \a currLinkWidth has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkWidth is null - * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int *currLinkWidth); - -/** - * Retrieve PCIe utilization information. - * This function is querying a byte counter over a 20ms interval and thus is the - * PCIe throughput over that interval. - * - * For Maxwell &tm; or newer fully supported devices. - * - * This method is not supported in virtual machines running virtual GPU (vGPU). 
- * - * @param device The identifier of the target device - * @param counter The specific counter that should be queried \ref nvmlPcieUtilCounter_t - * @param value Reference in which to return throughput in KB/s - * - * @return - * - \ref NVML_SUCCESS if \a value has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a counter is invalid, or \a value is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounter_t counter, unsigned int *value); - -/** - * Retrieve the PCIe replay counter. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param value Reference in which to return the counter's value - * - * @return - * - \ref NVML_SUCCESS if \a value has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a value is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value); - -/** - * Retrieves the current clock speeds for the device. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlClockType_t for details on available clock information. 
- * - * @param device The identifier of the target device - * @param type Identify which clock domain to query - * @param clock Reference in which to return the clock speed in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clock has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); - -/** - * Retrieves the maximum clock speeds for the device. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlClockType_t for details on available clock information. - * - * \note On GPUs from Fermi family current P0 clocks (reported by \ref nvmlDeviceGetClockInfo) can differ from max clocks - * by few MHz. 
- * - * @param device The identifier of the target device - * @param type Identify which clock domain to query - * @param clock Reference in which to return the clock speed in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clock has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); - -/** - * Retrieves the current setting of a clock that applications will use unless an overspec situation occurs. - * Can be changed using \ref nvmlDeviceSetApplicationsClocks. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param clockType Identify which clock domain to query - * @param clockMHz Reference in which to return the clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clockMHz has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); - -/** - * Retrieves the default applications clock that GPU boots with or - * defaults to after \ref nvmlDeviceResetApplicationsClocks call. 
- * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param clockType Identify which clock domain to query - * @param clockMHz Reference in which to return the default clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clockMHz has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * \see nvmlDeviceGetApplicationsClock - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); - -/** - * Resets the application clock to the default value - * - * This is the applications clock that will be used after system reboot or driver reload. - * Default value is constant, but the current value an be changed using \ref nvmlDeviceSetApplicationsClocks. - * - * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks, - * this call will unlock clocks. This returns clocks their default behavior ofautomatically boosting above - * base clocks as thermal limits allow. - * - * @see nvmlDeviceGetApplicationsClock - * @see nvmlDeviceSetApplicationsClocks - * - * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. 
- * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device); - -/** - * Retrieves the clock speed for the clock specified by the clock type and clock ID. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param clockType Identify which clock domain to query - * @param clockId Identify which clock in the domain to query - * @param clockMHz Reference in which to return the clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clockMHz has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int *clockMHz); - -/** - * Retrieves the customer defined maximum boost clock speed specified by the given clock type. - * - * For Pascal &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param clockType Identify which clock domain to query - * @param clockMHz Reference in which to return the clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clockMHz has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device or the \a clockType on this device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); - -/** - * Retrieves the list of possible memory clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param count Reference in which to provide the \a clocksMHz array size, and - * to return the number of elements - * @param clocksMHz Reference in which to return the clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to the number of - * required elements) - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetApplicationsClocks - * @see nvmlDeviceGetSupportedGraphicsClocks - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned int *count, unsigned int *clocksMHz); - -/** - * Retrieves the list of possible graphics clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param memoryClockMHz Memory clock for which to return possible graphics clocks - * @param count Reference in which to provide the \a clocksMHz array size, and - * to return the number of elements - * @param clocksMHz Reference in which to return the clocks in MHz - * - * @return - * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NOT_FOUND if the specified \a memoryClockMHz is not a supported frequency - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetApplicationsClocks - * @see nvmlDeviceGetSupportedMemoryClocks - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned int memoryClockMHz, unsigned int *count, unsigned int *clocksMHz); - -/** - * Retrieve the current state of Auto Boosted clocks on a device and store it in \a isEnabled - * - * For Kepler &tm; or newer fully supported devices. - * - * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates - * to maximize performance as thermal limits allow. - * - * On Pascal and newer hardware, Auto Aoosted clocks are controlled through application clocks. - * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost - * behavior. 
- * - * @param device The identifier of the target device - * @param isEnabled Where to store the current state of Auto Boosted clocks of the target device - * @param defaultIsEnabled Where to store the default Auto Boosted clocks behavior of the target device that the device will - * revert to when no applications are using the GPU - * - * @return - * - \ref NVML_SUCCESS If \a isEnabled has been been set with the Auto Boosted clocks state of \a device - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isEnabled is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled); - -/** - * Try to set the current state of Auto Boosted clocks on a device. - * - * For Kepler &tm; or newer fully supported devices. - * - * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates - * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock - * rates are desired. - * - * Non-root users may use this API by default but can be restricted by root from using this API by calling - * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS. - * Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled. - * - * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. - * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost - * behavior. 
- * - * @param device The identifier of the target device - * @param enabled What state to try to set Auto Boosted clocks of the target device to - * - * @return - * - \ref NVML_SUCCESS If the Auto Boosted clocks were successfully set to the state specified by \a enabled - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled); - -/** - * Try to set the default state of Auto Boosted clocks on a device. This is the default state that Auto Boosted clocks will - * return to when no compute running processes (e.g. CUDA application which have an active context) are running - * - * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. - * Requires root/admin permissions. - * - * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates - * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock - * rates are desired. - * - * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. - * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost - * behavior. - * - * @param device The identifier of the target device - * @param enabled What state to try to set default Auto Boosted clocks of the target device to - * @param flags Flags that change the default behavior. Currently Unused. 
- * - * @return - * - \ref NVML_SUCCESS If the Auto Boosted clock's default state was successfully set to the state specified by \a enabled - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NO_PERMISSION If the calling user does not have permission to change Auto Boosted clock's default state. - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags); - - -/** - * Retrieves the intended operating speed of the device's fan. - * - * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the - * output will not match the actual fan speed. - * - * For all discrete products with dedicated fans. - * - * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. - * This value may exceed 100% in certain cases. 
- * - * @param device The identifier of the target device - * @param speed Reference in which to return the fan speed percentage - * - * @return - * - \ref NVML_SUCCESS if \a speed has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a speed is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed); - - -/** - * Retrieves the intended operating speed of the device's specified fan. - * - * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the - * output will not match the actual fan speed. - * - * For all discrete products with dedicated fans. - * - * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. - * This value may exceed 100% in certain cases. - * - * @param device The identifier of the target device - * @param fan The index of the target fan, zero indexed. 
- * @param speed Reference in which to return the fan speed percentage - * - * @return - * - \ref NVML_SUCCESS if \a speed has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a fan is not an acceptable index, or \a speed is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan or is newer than Maxwell - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int * speed); - - -/** - * Retrieves the current temperature readings for the device, in degrees C. - * - * For all products. - * - * See \ref nvmlTemperatureSensors_t for details on available temperature sensors. - * - * @param device The identifier of the target device - * @param sensorType Flag that indicates which sensor reading to retrieve - * @param temp Reference in which to return the temperature reading - * - * @return - * - \ref NVML_SUCCESS if \a temp has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sensorType is invalid or \a temp is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have the specified sensor - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp); - -/** - * Retrieves the temperature threshold for the GPU with the specified threshold type in degrees C. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. - * - * @param device The identifier of the target device - * @param thresholdType The type of threshold value queried - * @param temp Reference in which to return the temperature reading - * @return - * - \ref NVML_SUCCESS if \a temp has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp); - -/** - * Retrieves the current performance state for the device. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlPstates_t for details on allowed performance states. - * - * @param device The identifier of the target device - * @param pState Reference in which to return the performance state reading - * - * @return - * - \ref NVML_SUCCESS if \a pState has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *pState); - -/** - * Retrieves current clocks throttling reasons. - * - * For all fully supported products. 
- * - * \note More than one bit can be enabled at the same time. Multiple reasons can be affecting clocks at once. - * - * @param device The identifier of the target device - * @param clocksThrottleReasons Reference in which to return bitmask of active clocks throttle - * reasons - * - * @return - * - \ref NVML_SUCCESS if \a clocksThrottleReasons has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clocksThrottleReasons is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlClocksThrottleReasons - * @see nvmlDeviceGetSupportedClocksThrottleReasons - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons); - -/** - * Retrieves bitmask of supported clocks throttle reasons that can be returned by - * \ref nvmlDeviceGetCurrentClocksThrottleReasons - * - * For all fully supported products. - * - * This method is not supported in virtual machines running virtual GPU (vGPU). 
- * - * @param device The identifier of the target device - * @param supportedClocksThrottleReasons Reference in which to return bitmask of supported - * clocks throttle reasons - * - * @return - * - \ref NVML_SUCCESS if \a supportedClocksThrottleReasons has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a supportedClocksThrottleReasons is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlClocksThrottleReasons - * @see nvmlDeviceGetCurrentClocksThrottleReasons - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons); - -/** - * Deprecated: Use \ref nvmlDeviceGetPerformanceState. This function exposes an incorrect generalization. - * - * Retrieve the current performance state for the device. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlPstates_t for details on allowed performance states. - * - * @param device The identifier of the target device - * @param pState Reference in which to return the performance state reading - * - * @return - * - \ref NVML_SUCCESS if \a pState has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState); - -/** - * This API has been deprecated. - * - * Retrieves the power management mode associated with this device. 
- * - * For products from the Fermi family. - * - Requires \a NVML_INFOROM_POWER version 3.0 or higher. - * - * For from the Kepler or newer families. - * - Does not require \a NVML_INFOROM_POWER object. - * - * This flag indicates whether any power management algorithm is currently active on the device. An - * enabled state does not necessarily mean the device is being actively throttled -- only that - * that the driver will do so if the appropriate conditions are met. - * - * See \ref nvmlEnableState_t for details on allowed modes. - * - * @param device The identifier of the target device - * @param mode Reference in which to return the current power management mode - * - * @return - * - \ref NVML_SUCCESS if \a mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode); - -/** - * Retrieves the power management limit associated with this device. - * - * For Fermi &tm; or newer fully supported devices. - * - * The power limit defines the upper boundary for the card's power draw. If - * the card's total power draw reaches this limit the power management algorithm kicks in. - * - * This reading is only available if power management mode is supported. - * See \ref nvmlDeviceGetPowerManagementMode. 
- * - * @param device The identifier of the target device - * @param limit Reference in which to return the power management limit in milliwatts - * - * @return - * - \ref NVML_SUCCESS if \a limit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int *limit); - -/** - * Retrieves information about possible values of power management limits on this device. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param minLimit Reference in which to return the minimum power management limit in milliwatts - * @param maxLimit Reference in which to return the maximum power management limit in milliwatts - * - * @return - * - \ref NVML_SUCCESS if \a minLimit and \a maxLimit have been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minLimit or \a maxLimit is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetPowerManagementLimit - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit); - -/** - * Retrieves default power management limit on this device, in milliwatts. 
- * Default power management limit is a power management limit that the device boots with. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param defaultLimit Reference in which to return the default power management limit in milliwatts - * - * @return - * - \ref NVML_SUCCESS if \a defaultLimit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int *defaultLimit); - -/** - * Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) - * - * For Fermi &tm; or newer fully supported devices. - * - * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. - * - * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode. 
- * - * @param device The identifier of the target device - * @param power Reference in which to return the power usage information - * - * @return - * - \ref NVML_SUCCESS if \a power has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power); - -/** - * Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded - * - * For Volta &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param energy Reference in which to return the energy consumption information - * - * @return - * - \ref NVML_SUCCESS if \a energy has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a energy is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support energy readings - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, unsigned long long *energy); - -/** - * Get the effective power limit that the driver enforces after taking into account all limiters - * - * Note: This can be different from the \ref nvmlDeviceGetPowerManagementLimit if other limits are set elsewhere - * This includes the out of band power limit interface - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param device The device to communicate with - * @param limit Reference in which to return the power management limit in milliwatts - * - * @return - * - \ref NVML_SUCCESS if \a limit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int *limit); - -/** - * Retrieves the current GOM and pending GOM (the one that GPU will switch to after reboot). - * - * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. - * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. - * Not supported on Quadro ® and Tesla &tm; C-class products. 
- * - * @param device The identifier of the target device - * @param current Reference in which to return the current GOM - * @param pending Reference in which to return the pending GOM - * - * @return - * - \ref NVML_SUCCESS if \a mode has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a current or \a pending is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlGpuOperationMode_t - * @see nvmlDeviceSetGpuOperationMode - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t *current, nvmlGpuOperationMode_t *pending); - -/** - * Retrieves the amount of used, free and total memory available on the device, in bytes. - * - * For all products. - * - * Enabling ECC reduces the amount of total available memory, due to the extra required parity bits. - * Under WDDM most device memory is allocated and managed on startup by Windows. - * - * Under Linux and Windows TCC, the reported amount of used memory is equal to the sum of memory allocated - * by all active channels on the device. - * - * See \ref nvmlMemory_t for details on available memory info. - * - * @note In MIG mode, if device handle is provided, the API returns aggregate - * information, only if the caller has appropriate privileges. Per-instance - * information can be queried by using specific MIG device handles. 
- * - * @param device The identifier of the target device - * @param memory Reference in which to return the memory information - * - * @return - * - \ref NVML_SUCCESS if \a memory has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory); - -/** - * Retrieves the current compute mode for the device. - * - * For all products. - * - * See \ref nvmlComputeMode_t for details on allowed compute modes. - * - * @param device The identifier of the target device - * @param mode Reference in which to return the current compute mode - * - * @return - * - \ref NVML_SUCCESS if \a mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetComputeMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode); - -/** - * Retrieves the CUDA compute capability of the device. - * - * For all products. - * - * Returns the major and minor compute capability version numbers of the - * device. 
The major and minor versions are equivalent to the - * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR and - * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR attributes that would be - * returned by CUDA's cuDeviceGetAttribute(). - * - * @param device The identifier of the target device - * @param major Reference in which to return the major CUDA compute capability - * @param minor Reference in which to return the minor CUDA compute capability - * - * @return - * - \ref NVML_SUCCESS if \a major and \a minor have been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a major or \a minor are NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int *major, int *minor); - -/** - * Retrieves the current and pending ECC modes for the device. - * - * For Fermi &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher. - * - * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following - * the next reboot. - * - * See \ref nvmlEnableState_t for details on allowed modes. 
- * - * @param device The identifier of the target device - * @param current Reference in which to return the current ECC mode - * @param pending Reference in which to return the pending ECC mode - * - * @return - * - \ref NVML_SUCCESS if \a current and \a pending have been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or either \a current or \a pending is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetEccMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t *current, nvmlEnableState_t *pending); - -/** - * Retrieves the device boardId from 0-N. - * Devices with the same boardId indicate GPUs connected to the same PLX. Use in conjunction with - * \ref nvmlDeviceGetMultiGpuBoard() to decide if they are on the same board as well. - * The boardId returned is a unique ID for the current configuration. Uniqueness and ordering across - * reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and - * the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will - * always return those values but they will always be different from each other). - * - * - * For Fermi &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param boardId Reference in which to return the device's board ID - * - * @return - * - \ref NVML_SUCCESS if \a boardId has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a boardId is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int *boardId); - -/** - * Retrieves whether the device is on a Multi-GPU Board - * Devices that are on multi-GPU boards will set \a multiGpuBool to a non-zero value. - * - * For Fermi &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param multiGpuBool Reference in which to return a zero or non-zero value - * to indicate whether the device is on a multi GPU board - * - * @return - * - \ref NVML_SUCCESS if \a multiGpuBool has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a multiGpuBool is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int *multiGpuBool); - -/** - * Retrieves the total ECC error counts for the device. - * - * For Fermi &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher. - * Requires ECC Mode to be enabled. 
- * - * The total error count is the sum of errors across each of the separate memory systems, i.e. the total set of - * errors across the entire device. - * - * See \ref nvmlMemoryErrorType_t for a description of available error types.\n - * See \ref nvmlEccCounterType_t for a description of available counter types. - * - * @param device The identifier of the target device - * @param errorType Flag that specifies the type of the errors. - * @param counterType Flag that specifies the counter-type of the errors. - * @param eccCounts Reference in which to return the specified ECC errors - * - * @return - * - \ref NVML_SUCCESS if \a eccCounts has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceClearEccErrorCounts() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long *eccCounts); - -/** - * Retrieves the detailed ECC error counts for the device. - * - * @deprecated This API supports only a fixed set of ECC error locations - * On different GPU architectures different locations are supported - * See \ref nvmlDeviceGetMemoryErrorCounter - * - * For Fermi &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based ECC counts. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other ECC counts. - * Requires ECC Mode to be enabled. 
- * - * Detailed errors provide separate ECC counts for specific parts of the memory system. - * - * Reports zero for unsupported ECC error counters when a subset of ECC error counters are supported. - * - * See \ref nvmlMemoryErrorType_t for a description of available bit types.\n - * See \ref nvmlEccCounterType_t for a description of available counter types.\n - * See \ref nvmlEccErrorCounts_t for a description of provided detailed ECC counts. - * - * @param device The identifier of the target device - * @param errorType Flag that specifies the type of the errors. - * @param counterType Flag that specifies the counter-type of the errors. - * @param eccCounts Reference in which to return the specified ECC errors - * - * @return - * - \ref NVML_SUCCESS if \a eccCounts has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceClearEccErrorCounts() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts); - -/** - * Retrieves the requested memory error counter for the device. - * - * For Fermi &tm; or newer fully supported devices. - * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based memory error counts. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other memory error counts. - * - * Only applicable to devices with ECC. - * - * Requires ECC Mode to be enabled. 
- * - * @note On MIG-enabled GPUs, per instance information can be queried using specific - * MIG device handles. Per instance information is currently only supported for - * non-DRAM uncorrectable volatile errors. Querying volatile errors using device - * handles is currently not supported. - * - * See \ref nvmlMemoryErrorType_t for a description of available memory error types.\n - * See \ref nvmlEccCounterType_t for a description of available counter types.\n - * See \ref nvmlMemoryLocation_t for a description of available counter locations.\n - * - * @param device The identifier of the target device - * @param errorType Flag that specifies the type of error. - * @param counterType Flag that specifies the counter-type of the errors. - * @param locationType Specifies the location of the counter. - * @param count Reference in which to return the ECC counter - * - * @return - * - \ref NVML_SUCCESS if \a count has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType, \a counterType or \a locationType is - * invalid, or \a count is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support ECC error reporting in the specified memory - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, - nvmlEccCounterType_t counterType, - nvmlMemoryLocation_t locationType, unsigned long long *count); - -/** - * Retrieves the current utilization rates for the device's major subsystems. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlUtilization_t for details on available utilization rates. - * - * \note During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings.
- * This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization. - * - * @note On MIG-enabled GPUs, querying device utilization rates is not currently supported. - * - * @param device The identifier of the target device - * @param utilization Reference in which to return the utilization information - * - * @return - * - \ref NVML_SUCCESS if \a utilization has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a utilization is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization); - -/** - * Retrieves the current utilization and sampling size in microseconds for the Encoder - * - * For Kepler &tm; or newer fully supported devices. - * - * @note On MIG-enabled GPUs, querying encoder utilization is not currently supported. 
- * - * @param device The identifier of the target device - * @param utilization Reference to an unsigned int for encoder utilization info - * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US - * - * @return - * - \ref NVML_SUCCESS if \a utilization has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); - -/** - * Retrieves the current capacity of the device's encoder, as a percentage of maximum encoder capacity with valid values in the range 0-100. - * - * For Maxwell &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param encoderQueryType Type of encoder to query - * @param encoderCapacity Reference to an unsigned int for the encoder capacity - * - * @return - * - \ref NVML_SUCCESS if \a encoderCapacity is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a encoderCapacity is NULL, or \a device or \a encoderQueryType - * are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if device does not support the encoder specified in \a encoderQueryType - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEncoderCapacity (nvmlDevice_t device, nvmlEncoderType_t encoderQueryType, unsigned int *encoderCapacity); - -/** - * Retrieves the current encoder statistics for a given device. - * - * For Maxwell &tm; or newer fully supported devices.
- * - * @param device The identifier of the target device - * @param sessionCount Reference to an unsigned int for count of active encoder sessions - * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions - * @param averageLatency Reference to an unsigned int for encode latency in microseconds - * - * @return - * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount, or \a device or \a averageFps, - * or \a averageLatency is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEncoderStats (nvmlDevice_t device, unsigned int *sessionCount, - unsigned int *averageFps, unsigned int *averageLatency); - -/** - * Retrieves information about active encoder sessions on a target device. - * - * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfos. The - * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions - * written to the buffer. - * - * If the supplied buffer is not large enough to accommodate the active session array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. - * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return - * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. - * - * For Maxwell &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param sessionCount Reference to caller supplied array size, and returns the number of sessions.
- * @param sessionInfos Reference in which to return the session information - * - * @return - * - \ref NVML_SUCCESS if \a sessionInfos is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL. - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfos); - -/** - * Retrieves the current utilization and sampling size in microseconds for the Decoder - * - * For Kepler &tm; or newer fully supported devices. - * - * @note On MIG-enabled GPUs, querying decoder utilization is not currently supported. - * - * @param device The identifier of the target device - * @param utilization Reference to an unsigned int for decoder utilization info - * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US - * - * @return - * - \ref NVML_SUCCESS if \a utilization has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); - -/** -* Retrieves the active frame buffer capture sessions statistics for a given device. 
-* -* For Maxwell &tm; or newer fully supported devices. -* -* @param device The identifier of the target device -* @param fbcStats Reference to nvmlFBCStats_t structure containing NvFBC stats -* -* @return -* - \ref NVML_SUCCESS if \a fbcStats is fetched -* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized -* - \ref NVML_ERROR_INVALID_ARGUMENT if \a fbcStats is NULL -* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible -* - \ref NVML_ERROR_UNKNOWN on any unexpected error -*/ -nvmlReturn_t DECLDIR nvmlDeviceGetFBCStats(nvmlDevice_t device, nvmlFBCStats_t *fbcStats); - -/** -* Retrieves information about active frame buffer capture sessions on a target device. -* -* An array of active FBC sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The -* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions -* written to the buffer. -* -* If the supplied buffer is not large enough to accommodate the active session array, the function returns -* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount. -* To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return -* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount. -* -* For Maxwell &tm; or newer fully supported devices. -* -* @note hResolution, vResolution, averageFPS and averageLatency data for a FBC session returned in \a sessionInfo may -* be zero if there are no new frames captured since the session started. -* -* @param device The identifier of the target device -* @param sessionCount Reference to caller supplied array size, and returns the number of sessions.
-* @param sessionInfo Reference in which to return the session information -* -* @return -* - \ref NVML_SUCCESS if \a sessionInfo is fetched -* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized -* - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount -* - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL. -* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible -* - \ref NVML_ERROR_UNKNOWN on any unexpected error -*/ -nvmlReturn_t DECLDIR nvmlDeviceGetFBCSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo); - -/** - * Retrieves the current and pending driver model for the device. - * - * For Fermi &tm; or newer fully supported devices. - * For windows only. - * - * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached - * to the device it must run in WDDM mode. TCC mode is preferred if a display is not attached. - * - * See \ref nvmlDriverModel_t for details on available driver models. 
- * - * @param device The identifier of the target device - * @param current Reference in which to return the current driver model - * @param pending Reference in which to return the pending driver model - * - * @return - * - \ref NVML_SUCCESS if either \a current and/or \a pending have been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or both \a current and \a pending are NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetDriverModel() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending); - -/** - * Get VBIOS version of the device. - * - * For all products. - * - * The VBIOS version may change from time to time. It will not exceed 32 characters in length - * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE. 
- * - * @param device The identifier of the target device - * @param version Reference to which to return the VBIOS version - * @param length The maximum allowed length of the string returned in \a version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char *version, unsigned int length); - -/** - * Get Bridge Chip Information for all the bridge chips on the board. - * - * For all fully supported products. - * Only applicable to multi-GPU products. - * - * @param device The identifier of the target device - * @param bridgeHierarchy Reference to the returned bridge chip Hierarchy - * - * @return - * - \ref NVML_SUCCESS if bridge chip exists - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a bridgeHierarchy is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if bridge chip not supported on the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t *bridgeHierarchy); - -/** - * Get information about processes with a compute context on a device - * - * For Fermi &tm; or newer fully supported devices. - * - * This function returns information only about compute running processes (e.g. CUDA application which have - * active context). Any graphics applications (e.g.
using OpenGL, DirectX) won't be listed by this function. - * - * To query the current number of running compute processes, call this function with *infoCount = 0. The - * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call - * \a infos is allowed to be NULL. - * - * The usedGpuMemory field returned is all of the memory used by the application. - * - * Keep in mind that information returned by this call is dynamic and the number of elements might change in - * time. Allocate more space for \a infos table in case new compute processes are spawned. - * - * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if - * the caller has appropriate privileges. Per-instance information can be queried by using - * specific MIG device handles. - * - * @param device The device handle or MIG device handle - * @param infoCount Reference in which to provide the \a infos array size, and - * to return the number of returned elements - * @param infos Reference in which to return the process information - * - * @return - * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small - * \a infoCount will contain minimal amount of space necessary for - * the call to complete - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see \ref nvmlSystemGetProcessName - */ -nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t 
*infos); - -/** - * Get information about processes with a graphics context on a device - * - * For Kepler &tm; or newer fully supported devices. - * - * This function returns information only about graphics based processes - * (eg. applications using OpenGL, DirectX) - * - * To query the current number of running graphics processes, call this function with *infoCount = 0. The - * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call - * \a infos is allowed to be NULL. - * - * The usedGpuMemory field returned is all of the memory used by the application. - * - * Keep in mind that information returned by this call is dynamic and the number of elements might change in - * time. Allocate more space for \a infos table in case new graphics processes are spawned. - * - * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if - * the caller has appropriate privileges. Per-instance information can be queried by using - * specific MIG device handles. 
- * - * @param device The identifier of the target device - * @param infoCount Reference in which to provide the \a infos array size, and - * to return the number of returned elements - * @param infos Reference in which to return the process information - * - * @return - * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small - * \a infoCount will contain minimal amount of space necessary for - * the call to complete - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see \ref nvmlSystemGetProcessName - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); - -/** - * Check if the GPU devices are on the same physical board. - * - * For all fully supported products. - * - * @param device1 The first GPU device - * @param device2 The second GPU device - * @param onSameBoard Reference in which to return the status. - * Non-zero indicates that the GPUs are on the same board. 
- * - * @return - * - \ref NVML_SUCCESS if \a onSameBoard has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a dev1 or \a dev2 are invalid or \a onSameBoard is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the either GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard); - -/** - * Retrieves the root/admin permissions on the target API. See \a nvmlRestrictedAPI_t for the list of supported APIs. - * If an API is restricted only root users can call that API. See \a nvmlDeviceSetAPIRestriction to change current permissions. - * - * For all fully supported products. - * - * @param device The identifier of the target device - * @param apiType Target API type for this operation - * @param isRestricted Reference in which to return the current restriction - * NVML_FEATURE_ENABLED indicates that the API is root-only - * NVML_FEATURE_DISABLED indicates that the API is accessible to all users - * - * @return - * - \ref NVML_SUCCESS if \a isRestricted has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a apiType incorrect or \a isRestricted is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device or the device does not support - * the feature that is being queried (E.G. 
Enabling/disabling Auto Boosted clocks is - * not supported by the device) - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlRestrictedAPI_t - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t *isRestricted); - -/** - * Gets recent samples for the GPU. - * - * For Kepler &tm; or newer fully supported devices. - * - * Based on type, this method can be used to fetch the power, utilization or clock samples maintained in the buffer by - * the driver. - * - * Power, Utilization and Clock samples are returned as type "unsigned int" for the union nvmlValue_t. - * - * To get the size of samples that user needs to allocate, the method is invoked with samples set to NULL. - * The returned samplesCount will provide the number of samples that can be queried. The user needs to - * allocate the buffer with size as samplesCount * sizeof(nvmlSample_t). - * - * lastSeenTimeStamp represents CPU timestamp in microseconds. Set it to 0 to fetch all the samples maintained by the - * underlying buffer. Set lastSeenTimeStamp to one of the timeStamps retrieved from the date of the previous query - * to get more recent samples. - * - * This method fetches the number of entries which can be accommodated in the provided samples array, and the - * reference samplesCount is updated to indicate how many samples were actually retrieved. The advantage of using this - * method for samples in contrast to polling via existing methods is to get get higher frequency data at lower polling cost. - * - * @note On MIG-enabled GPUs, querying the following sample types, NVML_GPU_UTILIZATION_SAMPLES, NVML_MEMORY_UTILIZATION_SAMPLES - * NVML_ENC_UTILIZATION_SAMPLES and NVML_DEC_UTILIZATION_SAMPLES, is not currently supported. 
- * - * @param device The identifier for the target device - * @param type Type of sampling event - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. - * @param sampleValType Output parameter to represent the type of sample value as described in nvmlSampleVal_t - * @param sampleCount Reference to provide the number of elements which can be queried in samples array - * @param samples Reference in which samples are returned - - * @return - * - \ref NVML_SUCCESS if samples are successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a samplesCount is NULL or - * reference to \a sampleCount is 0 for non null \a samples - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, - nvmlValueType_t *sampleValType, unsigned int *sampleCount, nvmlSample_t *samples); - -/** - * Gets Total, Available and Used size of BAR1 memory. - * - * BAR1 is used to map the FB (device memory) so that it can be directly accessed by the CPU or by 3rd party - * devices (peer-to-peer on the PCIE bus). - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param bar1Memory Reference in which BAR1 memory - * information is returned. 
- * - * @return - * - \ref NVML_SUCCESS if BAR1 memory is successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a bar1Memory is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory); - - -/** - * Gets the duration of time during which the device was throttled (lower than requested clocks) due to power - * or thermal constraints. - * - * The method is important to users who are tying to understand if their GPUs throttle at any point during their applications. The - * difference in violation times at two different reference times gives the indication of GPU throttling event. - * - * Violation for thermal capping is not supported at this time. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param perfPolicyType Represents Performance policy which can trigger GPU throttling - * @param violTime Reference to which violation time related information is returned - * - * - * @return - * - \ref NVML_SUCCESS if violation time is successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a perfPolicyType is invalid, or \a violTime is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - */ -nvmlReturn_t DECLDIR nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t *violTime); - -/** - * @} - */ - -/** @addtogroup nvmlAccountingStats - * @{ - */ - -/** - * Queries the state of per process accounting mode. - * - * For Kepler &tm; or newer fully supported devices. - * - * See \ref nvmlDeviceGetAccountingStats for more details. - * See \ref nvmlDeviceSetAccountingMode - * - * @param device The identifier of the target device - * @param mode Reference in which to return the current accounting mode - * - * @return - * - \ref NVML_SUCCESS if the mode has been successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode are NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode); - -/** - * Queries process's accounting stats. - * - * For Kepler &tm; or newer fully supported devices. - * - * Accounting stats capture GPU utilization and other statistics across the lifetime of a process. 
- * Accounting stats can be queried during life time of the process and after its termination. - * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and - * updated to actual running time after its termination. - * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old - * processes. - * - * See \ref nvmlAccountingStats_t for description of each returned metric. - * List of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids. - * - * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode. - * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be - * queried since they don't contribute to GPU utilization. - * @note In case of pid collision stats of only the latest process (that terminated last) will be reported - * - * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU. - * - * @param device The identifier of the target device - * @param pid Process Id of the target process to query stats for - * @param stats Reference in which to return the process's accounting stats - * - * @return - * - \ref NVML_SUCCESS if stats have been successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a stats are NULL - * - \ref NVML_ERROR_NOT_FOUND if process stats were not found - * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled - * or on vGPU host. - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetAccountingBufferSize - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats); - -/** - * Queries list of processes that can be queried for accounting stats. 
The list of processes returned - * can be in running or terminated state. - * - * For Kepler &tm; or newer fully supported devices. - * - * To just query the number of processes ready to be queried, call this function with *count = 0 and - * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. - * - * For more details see \ref nvmlDeviceGetAccountingStats. - * - * @note In case of PID collision some processes might not be accessible before the circular buffer is full. - * - * @param device The identifier of the target device - * @param count Reference in which to provide the \a pids array size, and - * to return the number of elements ready to be queried - * @param pids Reference in which to return list of process ids - * - * @return - * - \ref NVML_SUCCESS if pids were successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled - * or on vGPU host. - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to - * expected value) - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetAccountingBufferSize - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids); - -/** - * Returns the number of processes that the circular buffer with accounting pids can hold. - * - * For Kepler &tm; or newer fully supported devices. - * - * This is the maximum number of processes that accounting information will be stored for before information - * about oldest processes will get overwritten by information about new processes. 
- * - * @param device The identifier of the target device - * @param bufferSize Reference in which to provide the size (in number of elements) - * of the circular buffer for accounting stats. - * - * @return - * - \ref NVML_SUCCESS if buffer size was successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a bufferSize is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetAccountingStats - * @see nvmlDeviceGetAccountingPids - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize); - -/** @} */ - -/** @addtogroup nvmlDeviceQueries - * @{ - */ - -/** - * Returns the list of retired pages by source, including pages that are pending retirement - * The address information provided from this API is the hardware address of the page that was retired. Note - * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param cause Filter page addresses by cause of retirement - * @param pageCount Reference in which to provide the \a addresses buffer size, and - * to return the number of retired pages that match \a cause - * Set to 0 to query the size without allocating an \a addresses buffer - * @param addresses Buffer to write the page addresses into - * - * @return - * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the - * matching page addresses. \a pageCount is set to the needed size. 
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or - * \a addresses is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause, - unsigned int *pageCount, unsigned long long *addresses); - -/** - * Returns the list of retired pages by source, including pages that are pending retirement - * The address information provided from this API is the hardware address of the page that was retired. Note - * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 - * - * \note nvmlDeviceGetRetiredPages_v2 adds an additional timestamps paramter to return the time of each page's - * retirement. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param cause Filter page addresses by cause of retirement - * @param pageCount Reference in which to provide the \a addresses buffer size, and - * to return the number of retired pages that match \a cause - * Set to 0 to query the size without allocating an \a addresses buffer - * @param addresses Buffer to write the page addresses into - * @param timestamps Buffer to write the timestamps of page retirement, additional for _v2 - * - * @return - * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the - * matching page addresses. \a pageCount is set to the needed size. 
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or - * \a addresses is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages_v2(nvmlDevice_t device, nvmlPageRetirementCause_t cause, - unsigned int *pageCount, unsigned long long *addresses, unsigned long long *timestamps); - -/** - * Check if any pages are pending retirement and need a reboot to fully retire. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param isPending Reference in which to return the pending status - * - * @return - * - \ref NVML_SUCCESS if \a isPending was populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isPending is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t *isPending); - -/** - * Get number of remapped rows. The number of rows reported will be based on - * the cause of the remapping. isPending indicates whether or not there are - * pending remappings. A reset will be required to actually remap the row. - * failureOccurred will be set if a row remapping ever failed in the past. A - * pending remapping won't affect future work on the GPU since - * error-containment and dynamic page blacklisting will take care of that. 
- * - * @note On MIG-enabled GPUs with active instances, querying the number of - * remapped rows is not supported - * - * For newer than Volta &tm; fully supported devices. - * - * @param device The identifier of the target device - * @param corrRows Reference for number of rows remapped due to correctable errors - * @param uncRows Reference for number of rows remapped due to uncorrectable errors - * @param isPending Reference for whether or not remappings are pending - * @param failureOccurred Reference that is set when a remapping has failed in the past - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a corrRows, \a uncRows, \a isPending or \a failureOccurred is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If MIG is enabled or if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN Unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetRemappedRows(nvmlDevice_t device, unsigned int *corrRows, unsigned int *uncRows, - unsigned int *isPending, unsigned int *failureOccurred); - -/** - * Get the row remapper histogram. Returns the remap availability for each bank - * on the GPU. - * - * @param device Device handle - * @param values Histogram values - * - * @return - * - \ref NVML_SUCCESS On success - * - \ref NVML_ERROR_UNKNOWN On any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetRowRemapperHistogram(nvmlDevice_t device, nvmlRowRemapperHistogramValues_t *values); - -/** - * Get architecture for device - * - * @param device The identifier of the target device - * @param arch Reference where architecture is returned, if call successful. 
- * Set to NVML_DEVICE_ARCH_* upon success - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a arch (output refererence) are invalid - */ -nvmlReturn_t DECLDIR nvmlDeviceGetArchitecture(nvmlDevice_t device, nvmlDeviceArchitecture_t *arch); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlUnitCommands Unit Commands - * This chapter describes NVML operations that change the state of the unit. For S-class products. - * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION - * error code when invoking any of these methods. - * @{ - */ -/***************************************************************************************************/ - -/** - * Set the LED state for the unit. The LED can be either green (0) or amber (1). - * - * For S-class products. - * Requires root/admin permissions. - * - * This operation takes effect immediately. - * - * - * Current S-Class products don't provide unique LEDs for each unit. As such, both front - * and back LEDs will be toggled in unison regardless of which unit is specified with this command. - * - * See \ref nvmlLedColor_t for available colors. 
- * - * @param unit The identifier of the target unit - * @param color The target LED color - * - * @return - * - \ref NVML_SUCCESS if the LED color has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a color is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlUnitGetLedState() - */ -nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlDeviceCommands Device Commands - * This chapter describes NVML operations that change the state of the device. - * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION - * error code when invoking any of these methods. - * @{ - */ -/***************************************************************************************************/ - -/** - * Set the persistence mode for the device. - * - * For all products. - * For Linux only. - * Requires root/admin permissions. - * - * The persistence mode determines whether the GPU driver software is torn down after the last client - * exits. - * - * This operation takes effect immediately. It is not persistent across reboots. After each reboot the - * persistence mode is reset to "Disabled". - * - * See \ref nvmlEnableState_t for available modes. - * - * After calling this API with mode set to NVML_FEATURE_DISABLED on a device that has its own NUMA - * memory, the given device handle will no longer be valid, and to continue to interact with this - * device, a new handle should be obtained from one of the nvmlDeviceGetHandleBy*() APIs. 
This - * limitation is currently only applicable to devices that have a coherent NVLink connection to - * system memory. - * - * @param device The identifier of the target device - * @param mode The target persistence mode - * - * @return - * - \ref NVML_SUCCESS if the persistence mode was set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetPersistenceMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode); - -/** - * Set the compute mode for the device. - * - * For all products. - * Requires root/admin permissions. - * - * The compute mode determines whether a GPU can be used for compute operations and whether it can - * be shared across contexts. - * - * This operation takes effect immediately. Under Linux it is not persistent across reboots and - * always resets to "Default". Under windows it is persistent. - * - * Under windows compute mode may only be set to DEFAULT when running in WDDM - * - * @note On MIG-enabled GPUs, compute mode would be set to DEFAULT and changing it is not supported. - * - * See \ref nvmlComputeMode_t for details on available compute modes. 
- * - * @param device The identifier of the target device - * @param mode The target compute mode - * - * @return - * - \ref NVML_SUCCESS if the compute mode was set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetComputeMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode); - -/** - * Set the ECC mode for the device. - * - * For Kepler &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher. - * Requires root/admin permissions. - * - * The ECC mode determines whether the GPU enables its ECC support. - * - * This operation takes effect after the next reboot. - * - * See \ref nvmlEnableState_t for details on available modes. 
- * - * @param device The identifier of the target device - * @param ecc The target ECC mode - * - * @return - * - \ref NVML_SUCCESS if the ECC mode was set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a ecc is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetEccMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc); - -/** - * Clear the ECC error and other memory error counts for the device. - * - * For Kepler &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 2.0 or higher to clear aggregate location-based ECC counts. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher to clear all other ECC counts. - * Requires root/admin permissions. - * Requires ECC Mode to be enabled. - * - * Sets all of the specified ECC counters to 0, including both detailed and total counts. - * - * This operation takes effect immediately. - * - * See \ref nvmlMemoryErrorType_t for details on available counter types. - * - * @param device The identifier of the target device - * @param counterType Flag that indicates which type of errors should be cleared. 
- * - * @return - * - \ref NVML_SUCCESS if the error counts were cleared - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counterType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see - * - nvmlDeviceGetDetailedEccErrors() - * - nvmlDeviceGetTotalEccErrors() - */ -nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType); - -/** - * Set the driver model for the device. - * - * For Fermi &tm; or newer fully supported devices. - * For windows only. - * Requires root/admin permissions. - * - * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached - * to the device it must run in WDDM mode. - * - * It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce). - * This should only be done if the host is subsequently powered down and the display is detached from the device - * before the next reboot. - * - * This operation takes effect after the next reboot. - * - * Windows driver model may only be set to WDDM when running in DEFAULT compute mode. - * - * Change driver model to WDDM is not supported when GPU doesn't support graphics acceleration or - * will not support it after reboot. See \ref nvmlDeviceSetGpuOperationMode. - * - * See \ref nvmlDriverModel_t for details on available driver models. 
- * See \ref nvmlFlagDefault and \ref nvmlFlagForce - * - * @param device The identifier of the target device - * @param driverModel The target driver model - * @param flags Flags that change the default behavior - * - * @return - * - \ref NVML_SUCCESS if the driver model has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a driverModel is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows or the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetDriverModel() - */ -nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags); - -typedef enum nvmlClockLimitId_enum { - NVML_CLOCK_LIMIT_ID_RANGE_START = 0xffffff00, - NVML_CLOCK_LIMIT_ID_TDP, - NVML_CLOCK_LIMIT_ID_UNLIMITED -} nvmlClockLimitId_t; - -/** - * Set clocks that device will lock to. - * - * Sets the clocks that the device will be running at to the value in the range of minGpuClockMHz to maxGpuClockMHz. - * Setting this will supercede application clock values and take effect regardless if a cuda app is running. - * See /ref nvmlDeviceSetApplicationsClocks - * - * Can be used as a setting to request constant performance. - * - * This can be called with a pair of integer clock frequencies in MHz, or a pair of /ref nvmlClockLimitId_t values. - * See the table below for valid combinations of these values. 
- * - * minGpuClock | maxGpuClock | Effect - * ------------+-------------+-------------------------------------------------- - * tdp | tdp | Lock clock to TDP - * unlimited | tdp | Upper bound is TDP but clock may drift below this - * tdp | unlimited | Lower bound is TDP but clock may boost above this - * unlimited | unlimited | Unlocked (== nvmlDeviceResetGpuLockedClocks) - * - * If one arg takes one of these values, the other must be one of these values as - * well. Mixed numeric and symbolic calls return NVML_ERROR_INVALID_ARGUMENT. - * - * Requires root/admin permissions. - * - * After system reboot or driver reload applications clocks go back to their default value. - * See \ref nvmlDeviceResetGpuLockedClocks. - * - * For Volta &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param minGpuClockMHz Requested minimum gpu clock in MHz - * @param maxGpuClockMHz Requested maximum gpu clock in MHz - * - * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minGpuClockMHz and \a maxGpuClockMHz - * is not a valid clock combination - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetGpuLockedClocks(nvmlDevice_t device, unsigned int minGpuClockMHz, unsigned int maxGpuClockMHz); - -/** - * Resets the gpu clock to the default value - * - * This is the gpu clock that will be used after system reboot or driver reload. - * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks. 
- * - * @see nvmlDeviceSetGpuLockedClocks - * - * For Volta &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceResetGpuLockedClocks(nvmlDevice_t device); - -/** - * Set clocks that applications will lock to. - * - * Sets the clocks that compute and graphics applications will be running at. - * e.g. CUDA driver requests these clocks during context creation which means this property - * defines clocks at which CUDA applications will be running unless some overspec event - * occurs (e.g. over power, over thermal or external HW brake). - * - * Can be used as a setting to request constant performance. - * - * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks. - * - * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call - * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting - * above the clock value being set. - * - * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. - * Requires root/admin permissions. - * - * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks - * for details on how to list available clocks combinations. - * - * After system reboot or driver reload applications clocks go back to their default value. - * See \ref nvmlDeviceResetApplicationsClocks. 
- * - * @param device The identifier of the target device - * @param memClockMHz Requested memory clock in MHz - * @param graphicsClockMHz Requested graphics clock in MHz - * - * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memClockMHz and \a graphicsClockMHz - * is not a valid clock combination - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz); - -/** - * Set new power limit of this device. - * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. - * - * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. - * - * \note Limit is not persistent across reboots or driver unloads. - * Enable persistent mode to prevent driver from unloading when no application is using the device. 
- * - * @param device The identifier of the target device - * @param limit Power management limit in milliwatts to set - * - * @return - * - \ref NVML_SUCCESS if \a limit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is out of range - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetPowerManagementLimitConstraints - * @see nvmlDeviceGetPowerManagementDefaultLimit - */ -nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit); - -/** - * Sets new GOM. See \a nvmlGpuOperationMode_t for details. - * - * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. - * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. - * Not supported on Quadro ® and Tesla &tm; C-class products. - * Requires root/admin permissions. - * - * Changing GOMs requires a reboot. - * The reboot requirement might be removed in the future. - * - * Compute only GOMs don't support graphics acceleration. Under windows switching to these GOMs when - * pending driver model is WDDM is not supported. See \ref nvmlDeviceSetDriverModel. 
- * - * @param device The identifier of the target device - * @param mode Target GOM - * - * @return - * - \ref NVML_SUCCESS if \a mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode incorrect - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support GOM or specific mode - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlGpuOperationMode_t - * @see nvmlDeviceGetGpuOperationMode - */ -nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode); - -/** - * Changes the root/admin restructions on certain APIs. See \a nvmlRestrictedAPI_t for the list of supported APIs. - * This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs. - * The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. See \a nvmlDeviceGetAPIRestriction - * to query the current restriction settings. - * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. - * - * @param device The identifier of the target device - * @param apiType Target API type for this operation - * @param isRestricted The target restriction - * - * @return - * - \ref NVML_SUCCESS if \a isRestricted has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a apiType incorrect - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support changing API restrictions or the device does not support - * the feature that api restrictions are being set for (E.G. 
Enabling/disabling auto - * boosted clocks is not supported by the device) - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlRestrictedAPI_t - */ -nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted); - -/** - * @} - */ - -/** @addtogroup nvmlAccountingStats - * @{ - */ - -/** - * Enables or disables per process accounting. - * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. - * - * @note This setting is not persistent and will default to disabled after driver unloads. - * Enable persistence mode to be sure the setting doesn't switch off to disabled. - * - * @note Enabling accounting mode has no negative impact on the GPU performance. - * - * @note Disabling accounting clears all accounting pids information. - * - * @note On MIG-enabled GPUs, accounting mode would be set to DISABLED and changing it is not supported. 
- * - * See \ref nvmlDeviceGetAccountingMode - * See \ref nvmlDeviceGetAccountingStats - * See \ref nvmlDeviceClearAccountingPids - * - * @param device The identifier of the target device - * @param mode The target accounting mode - * - * @return - * - \ref NVML_SUCCESS if the new mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a mode are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode); - -/** - * Clears accounting information about all processes that have already terminated. - * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. - * - * See \ref nvmlDeviceGetAccountingMode - * See \ref nvmlDeviceGetAccountingStats - * See \ref nvmlDeviceSetAccountingMode - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if accounting information has been cleared - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup NvLink NvLink Methods - * This chapter describes methods that NVML can perform on NVLINK enabled devices. 
- * @{ - */ -/***************************************************************************************************/ - -/** - * Retrieves the state of the device's NvLink for the link specified - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param isActive \a nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that - * the link is active and NVML_FEATURE_DISABLED indicates it - * is inactive - * - * @return - * - \ref NVML_SUCCESS if \a isActive has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a isActive is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); - -/** - * Retrieves the version of the device's NvLink for the link specified - * - * For Pascal &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param version Requested NvLink version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a version is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int *version); - -/** - * Retrieves the requested capability from the device's NvLink for the link specified - * Please refer to the \a nvmlNvLinkCapability_t structure for the specific caps that can be queried - * The return value should be treated as a boolean. - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param capability Specifies the \a nvmlNvLinkCapability_t to be queried - * @param capResult A boolean for the queried capability indicating that feature is available - * - * @return - * - \ref NVML_SUCCESS if \a capResult has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a capability is invalid or \a capResult is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, - nvmlNvLinkCapability_t capability, unsigned int *capResult); - -/** - * Retrieves the PCI information for the remote node on a NvLink link - * Note: pciSubSystemId is not filled in this function and is indeterminate - * - * For Pascal &tm; or 
newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param pci \a nvmlPciInfo_t of the remote node for the specified link - * - * @return - * - \ref NVML_SUCCESS if \a pci has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a pci is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo_v2(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); - -/** - * Retrieves the specified error counter value - * Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param counter Specifies the NvLink counter to be queried - * @param counterValue Returned counter value - * - * @return - * - \ref NVML_SUCCESS if \a counter has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid or \a counterValue is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link, - nvmlNvLinkErrorCounter_t counter, unsigned long long *counterValue); - -/** - * Resets all error counters to zero - * Please refer to \a nvmlNvLinkErrorCounter_t for the list of error counters that are reset - * - * For Pascal &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * - * @return - * - \ref NVML_SUCCESS if the reset is successful - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link); - -/** - * Deprecated: Setting utilization counter control is no longer supported. - * - * Set the NVLINK utilization counter control information for the specified counter, 0 or 1. - * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition. Performs a reset - * of the counters if the reset parameter is non-zero. - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param counter Specifies the counter that should be set (0 or 1). - * @param link Specifies the NvLink link to be queried - * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to set - * @param reset Resets the counters on set if non-zero - * - * @return - * - \ref NVML_SUCCESS if the control has been set successfully - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, - nvmlNvLinkUtilizationControl_t *control, unsigned int reset); - -/** - * Deprecated: Getting utilization counter control is no longer supported. 
- * - * Get the NVLINK utilization counter control information for the specified counter, 0 or 1. - * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param counter Specifies the counter that should be set (0 or 1). - * @param link Specifies the NvLink link to be queried - * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to place information - * - * @return - * - \ref NVML_SUCCESS if the control has been set successfully - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, - nvmlNvLinkUtilizationControl_t *control); - - -/** - * Deprecated: Use \ref nvmlDeviceGetFieldValues with NVML_FI_DEV_NVLINK_THROUGHPUT_* as field values instead. - * - * Retrieve the NVLINK utilization counter based on the current control for a specified counter. - * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl - * before reading the utilization counters as they have no default state - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param counter Specifies the counter that should be read (0 or 1). 
- * @param rxcounter Receive counter return value - * @param txcounter Transmit counter return value - * - * @return - * - \ref NVML_SUCCESS if \a rxcounter and \a txcounter have been successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, or \a link is invalid or \a rxcounter or \a txcounter are NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, - unsigned long long *rxcounter, unsigned long long *txcounter); - -/** - * Deprecated: Freezing NVLINK utilization counters is no longer supported. - * - * Freeze the NVLINK utilization counters - * Both the receive and transmit counters are operated on by this function - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param counter Specifies the counter that should be frozen (0 or 1). 
- * @param freeze NVML_FEATURE_ENABLED = freeze the receive and transmit counters - * NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters - * - * @return - * - \ref NVML_SUCCESS if counters were successfully frozen or unfrozen - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, \a counter, or \a freeze is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, - unsigned int counter, nvmlEnableState_t freeze); - -/** - * Deprecated: Resetting NVLINK utilization counters is no longer supported. - * - * Reset the NVLINK utilization counters - * Both the receive and transmit counters are operated on by this function - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be reset - * @param counter Specifies the counter that should be reset (0 or 1) - * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlEvents Event Handling Methods - * This chapter describes methods that NVML can perform against each device to register and wait for - * some event to occur. 
- * @{ - */ -/***************************************************************************************************/ - -/** - * Create an empty set of events. - * Event set should be freed by \ref nvmlEventSetFree - * - * For Fermi &tm; or newer fully supported devices. - * @param set Reference in which to return the event handle - * - * @return - * - \ref NVML_SUCCESS if the event has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a set is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlEventSetFree - */ -nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set); - -/** - * Starts recording of events on a specified devices and add the events to specified \ref nvmlEventSet_t - * - * For Fermi &tm; or newer fully supported devices. - * Ecc events are available only on ECC enabled devices (see \ref nvmlDeviceGetTotalEccErrors) - * Power capping events are available only on Power Management enabled devices (see \ref nvmlDeviceGetPowerManagementMode) - * - * For Linux only. - * - * \b IMPORTANT: Operations on \a set are not thread safe - * - * This call starts recording of events on specific device. - * All events that occurred before this call are not recorded. - * Checking if some event occurred can be done with \ref nvmlEventSetWait_v2 - * - * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed. - * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used. None of the requested eventTypes - * are registered in that case. 
- * - * @param device The identifier of the target device - * @param eventTypes Bitmask of \ref nvmlEventType to record - * @param set Set to which add new event types - * - * @return - * - \ref NVML_SUCCESS if the event has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is invalid or \a set is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the platform does not support this feature or some of requested event types - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlEventType - * @see nvmlDeviceGetSupportedEventTypes - * @see nvmlEventSetWait - * @see nvmlEventSetFree - */ -nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set); - -/** - * Returns information about events supported on device - * - * For Fermi &tm; or newer fully supported devices. - * - * Events are not supported on Windows. So this function returns an empty mask in \a eventTypes on Windows. - * - * @param device The identifier of the target device - * @param eventTypes Reference in which to return bitmask of supported events - * - * @return - * - \ref NVML_SUCCESS if the eventTypes has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventType is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlEventType - * @see nvmlDeviceRegisterEvents - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes); - -/** - * Waits on events and delivers events - * - * For Fermi &tm; or newer fully supported devices. 
- * - * If some events are ready to be delivered at the time of the call, function returns immediately. - * If there are no events ready to be delivered, function sleeps till event arrives - * but not longer than specified timeout. This function in certain conditions can return before - * specified timeout passes (e.g. when interrupt arrives) - * - * On Windows, in case of xid error, the function returns the most recent xid error type seen by the system. - * If there are multiple xid errors generated before nvmlEventSetWait is invoked then the last seen xid error - * type is returned for all xid error events. - * - * On Linux, every xid error event would return the associated event data and other information if applicable. - * - * In MIG mode, if device handle is provided, the API reports all the events for the available instances, - * only if the caller has appropriate privileges. In absence of required privileges, only the events which - * affect all the instances (i.e. whole device) are reported. - * - * This API does not currently support per-instance event reporting using MIG device handles. 
- * - * @param set Reference to set of events to wait on - * @param data Reference in which to return event data - * @param timeoutms Maximum amount of wait time in milliseconds for registered event - * - * @return - * - \ref NVML_SUCCESS if the data has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a data is NULL - * - \ref NVML_ERROR_TIMEOUT if no event arrived in specified timeout or interrupt arrived - * - \ref NVML_ERROR_GPU_IS_LOST if a GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlEventType - * @see nvmlDeviceRegisterEvents - */ -nvmlReturn_t DECLDIR nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); - -/** - * Releases events in the set - * - * For Fermi &tm; or newer fully supported devices. - * - * @param set Reference to events to be released - * - * @return - * - \ref NVML_SUCCESS if the event has been successfully released - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceRegisterEvents - */ -nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlZPI Drain states - * This chapter describes methods that NVML can perform against each device to control their drain state - * and recognition by NVML and NVIDIA kernel driver. These methods can be used with out-of-band tools to - * power on/off GPUs, enable robust reset scenarios, etc. - * @{ - */ -/***************************************************************************************************/ - -/** - * Modify the drain state of a GPU. This method forces a GPU to no longer accept new incoming requests. 
- * Any new NVML process will no longer see this GPU. Persistence mode for this GPU must be turned off before - * this call is made. - * Must be called as administrator. - * For Linux only. - * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. - * - * @param pciInfo The PCI address of the GPU drain state to be modified - * @param newState The drain state that should be entered, see \ref nvmlEnableState_t - * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex or \a newState is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation - * - \ref NVML_ERROR_IN_USE if the device has persistence mode turned on - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceModifyDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t newState); - -/** - * Query the drain state of a GPU. This method is used to check if a GPU is in a currently draining - * state. - * For Linux only. - * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. 
- * - * @param pciInfo The PCI address of the GPU drain state to be queried - * @param currentState The current drain state for this GPU, see \ref nvmlEnableState_t - * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex or \a currentState is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t *currentState); - -/** - * This method will remove the specified GPU from the view of both NVML and the NVIDIA kernel driver - * as long as no other processes are attached. If other processes are attached, this call will return - * NVML_ERROR_IN_USE and the GPU will be returned to its original "draining" state. Note: the - * only situation where a process can still be attached after nvmlDeviceModifyDrainState() is called - * to initiate the draining state is if that process was using, and is still using, a GPU before the - * call was made. Also note, persistence mode counts as an attachment to the GPU thus it must be disabled - * prior to this call. - * - * For long-running NVML processes please note that this will change the enumeration of current GPUs. - * For example, if there are four GPUs present and GPU1 is removed, the new enumeration will be 0-2. - * Also, device handles after the removed GPU will not be valid and must be re-established. - * Must be run as administrator. - * For Linux only. - * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. 
- * - * @param pciInfo The PCI address of the GPU to be removed - * @param gpuState Whether the GPU is to be removed, from the OS - * see \ref nvmlDetachGpuState_t - * @param linkState Requested upstream PCIe link state, see \ref nvmlPcieLinkState_t - * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_IN_USE if the device is still in use and cannot be removed - */ -nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu_v2(nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState); - -/** - * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that - * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device. - * If all are zeroes then the entire PCI tree will be searched. Please note that for long-running NVML processes - * the enumeration will change based on how many GPUs are discovered and where they are inserted in bus order. - * - * In addition, all newly discovered GPUs will be initialized and their ECC scrubbed which may take several seconds - * per GPU. Also, all device handles are no longer guaranteed to be valid post discovery. - * - * Must be run as administrator. - * For Linux only. - * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. - * - * @param pciInfo The PCI tree to be searched. Only the domain, bus, and device - * fields are used in this call. 
- * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the operating system does not support this feature - * - \ref NVML_ERROR_OPERATING_SYSTEM if the operating system is denying this feature - * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceDiscoverGpus (nvmlPciInfo_t *pciInfo); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlFieldValueQueries Field Value Queries - * This chapter describes NVML operations that are associated with retrieving Field Values from NVML - * @{ - */ -/***************************************************************************************************/ - -/** - * Request values for a list of fields for a device. This API allows multiple fields to be queried at once. - * If any of the underlying fieldIds are populated by the same driver call, the results for those field IDs - * will be populated from a single call rather than making a driver call for each fieldId. - * - * @param device The device handle of the GPU to request field values for - * @param valuesCount Number of entries in values that should be retrieved - * @param values Array of \a valuesCount structures to hold field values. - * Each value's fieldId must be populated prior to this call - * - * @return - * - \ref NVML_SUCCESS if any values in \a values were populated. 
Note that you must - * check the nvmlReturn field of each value for each individual - * status - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a values is NULL - */ -nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); - - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup gridVirtual GRID Virtualization Enums, Constants and Structs - * @{ - */ -/** @} */ -/***************************************************************************************************/ - -/***************************************************************************************************/ -/** @defgroup nvmlGridQueries GRID Virtualization APIs - * This chapter describes operations that are associated with NVIDIA GRID products. - * @{ - */ -/***************************************************************************************************/ - -/** - * This method is used to get the virtualization mode corresponding to the GPU. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device Identifier of the target device - * @param pVirtualMode Reference to virtualization mode. One of NVML_GPU_VIRTUALIZATION_? - * - * @return - * - \ref NVML_SUCCESS if \a pVirtualMode is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t *pVirtualMode); - -/** - * Queries if SR-IOV host operation is supported on a vGPU supported device. 
- * - * Checks whether SR-IOV host capability is supported by the device and the - * driver, and indicates device is in SR-IOV mode if both of these conditions - * are true. - * - * @param device The identifier of the target device - * @param pHostVgpuMode Reference in which to return the current vGPU mode - * - * @return - * - \ref NVML_SUCCESS if device's vGPU mode has been successfully retrieved - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle is 0 or \a pVgpuMode is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature. - * - \ref NVML_ERROR_UNKNOWN if any unexpected error occurred - */ -nvmlReturn_t DECLDIR nvmlDeviceGetHostVgpuMode(nvmlDevice_t device, nvmlHostVgpuMode_t *pHostVgpuMode); - -/** - * This method is used to set the virtualization mode corresponding to the GPU. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device Identifier of the target device - * @param virtualMode virtualization mode. One of NVML_GPU_VIRTUALIZATION_? - * - * @return - * - \ref NVML_SUCCESS if \a pVirtualMode is set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_SUPPORTED if setting of virtualization mode is not supported. - * - \ref NVML_ERROR_NO_PERMISSION if setting of virtualization mode is not allowed for this client. - */ -nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode); - -/** - * Retrieve the GRID licensable features. - * - * Identifies whether the system supports GRID Software Licensing. If it does, return the list of licensable feature(s) - * and their current license status. 
- * - * @param device Identifier of the target device - * @param pGridLicensableFeatures Pointer to structure in which GRID licensable features are returned - * - * @return - * - \ref NVML_SUCCESS if licensable features are successfully retrieved - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pGridLicensableFeatures is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v3(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); - -/** - * Retrieves the current utilization and process ID - * - * For Maxwell &tm; or newer fully supported devices. - * - * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running. - * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at - * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization - * during the last sample period. It includes the CPU timestamp at which the samples were recorded. Individual utilization values - * are returned as "unsigned int" values. - * - * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with - * \a utilization set to NULL. The caller should allocate a buffer of size - * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed - * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for. - * - * On successful return, the function updates \a processSamplesCount with the number of process utilization sample - * structures that were actually written. This may differ from a previously read value as instances are created or - * destroyed. - * - * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. 
Set it to 0 - * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp - * to a timeStamp retrieved from a previous query to read utilization since the previous query. - * - * @note On MIG-enabled GPUs, querying process utilization is not currently supported. - * - * @param device The identifier of the target device - * @param utilization Pointer to caller-supplied buffer in which guest process utilization samples are returned - * @param processSamplesCount Pointer to caller-supplied array size, and returns number of processes running - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. - - * @return - * - \ref NVML_SUCCESS if \a utilization has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization, - unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlVgpu GRID vGPU Management - * @{ - * - * This chapter describes APIs supporting NVIDIA GRID vGPU. - */ -/***************************************************************************************************/ - -/** - * Retrieve the supported vGPU types on a physical GPU (device). - * - * An array of supported vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer - * pointed at by \a vgpuTypeIds. 
The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount - * is used to return the number of vGPU types written to the buffer. - * - * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. - * To query the number of vGPU types supported for the GPU, call this function with *vgpuCount = 0. - * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are supported. - * - * @param device The identifier of the target device - * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types - * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL or \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); - -/** - * Retrieve the currently creatable vGPU types on a physical GPU (device). - * - * An array of creatable vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer - * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount - * is used to return the number of vGPU types written to the buffer. - * - * The creatable vGPU types for a device may differ over time, as there may be restrictions on what type of vGPU types - * can concurrently run on a device. 
For example, if only one vGPU type is allowed at a time on a device, then the creatable - * list will be restricted to whatever vGPU type is already running on the device. - * - * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. - * To query the number of vGPU types createable for the GPU, call this function with *vgpuCount = 0. - * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are creatable. - * - * @param device The identifier of the target device - * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types - * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); - -/** - * Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator). - * See \ref nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuTypeId Handle to vGPU type - * @param vgpuTypeClass Pointer to string array to return class in - * @param size Size of string - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeClass is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeClass, unsigned int *size); - -/** - * Retrieve the vGPU type name. - * - * The name is an alphanumeric string that denotes a particular vGPU, e.g. GRID M60-2Q. It will not - * exceed 64 characters in length (including the NUL terminator). See \ref - * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param vgpuTypeName Pointer to buffer to return name - * @param size Size of buffer - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a name is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeName, unsigned int *size); - -/** - * Retrieve the device ID of a vGPU type. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuTypeId Handle to vGPU type - * @param deviceID Device ID and vendor ID of the device contained in single 32 bit value - * @param subsystemID Subsytem ID and subsytem vendor ID of the device contained in single 32 bit value - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a deviceId or \a subsystemID are NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *deviceID, unsigned long long *subsystemID); - -/** - * Retrieve the vGPU framebuffer size in bytes. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param fbSize Pointer to framebuffer size in bytes - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a fbSize is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbSize); - -/** - * Retrieve count of vGPU's supported display heads. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuTypeId Handle to vGPU type - * @param numDisplayHeads Pointer to number of display heads - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a numDisplayHeads is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *numDisplayHeads); - -/** - * Retrieve vGPU display head's maximum supported resolution. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param displayIndex Zero-based index of display head - * @param xdim Pointer to maximum number of pixels in X dimension - * @param ydim Pointer to maximum number of pixels in Y dimension - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a xdim or \a ydim are NULL, or \a displayIndex - * is out of range. - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int displayIndex, unsigned int *xdim, unsigned int *ydim); - -/** - * Retrieve license requirements for a vGPU type - * - * The license type and version required to run the specified vGPU type is returned as an alphanumeric string, in the form - * ",", for example "GRID-Virtual-PC,2.0". If a vGPU is runnable with* more than one type of license, - * the licenses are delimited by a semicolon, for example "GRID-Virtual-PC,2.0;GRID-Virtual-WS,2.0;GRID-Virtual-WS-Ext,2.0". - * - * The total length of the returned string will not exceed 128 characters, including the NUL terminator. - * See \ref nvmlVgpuConstants::NVML_GRID_LICENSE_BUFFER_SIZE. 
- * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param vgpuTypeLicenseString Pointer to buffer to return license info - * @param size Size of \a vgpuTypeLicenseString buffer - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeLicenseString is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeLicenseString, unsigned int size); - -/** - * Retrieve the static frame rate limit value of the vGPU type - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param frameRateLimit Reference to return the frame rate limit value - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a frameRateLimit is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *frameRateLimit); - -/** - * Retrieve the maximum number of vGPU instances creatable on a device for given vGPU type - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param vgpuTypeId Handle to vGPU type - * @param vgpuInstanceCount Pointer to get the max number of vGPU instances - * that can be created on a deicve for given vgpuTypeId - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid or is not supported on target device, - * or \a vgpuInstanceCount is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCount); - -/** - * Retrieve the maximum number of vGPU instances supported per VM for given vGPU type - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param vgpuInstanceCountPerVm Pointer to get the max number of vGPU instances supported per VM for given \a vgpuTypeId - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuInstanceCountPerVm is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstancesPerVm(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCountPerVm); - -/** - * Retrieve the active vGPU instances on a device. - * - * An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The - * array elememt count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances - * written to the buffer. 
- * - * If the supplied buffer is not large enough to accomodate the vGPU instance array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuInstance_t array required in \a vgpuCount. - * To query the number of active vGPU instances, call this function with *vgpuCount = 0. The code will return - * NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU Types are supported. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param vgpuCount Pointer which passes in the array size as well as get - * back the number of types - * @param vgpuInstances Pointer to array in which to return list of vGPU instances - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a vgpuCount is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuInstance_t *vgpuInstances); - -/** - * Retrieve the VM ID associated with a vGPU instance. - * - * The VM ID is returned as a string, not exceeding 80 characters in length (including the NUL terminator). - * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. - * - * The format of the VM ID varies by platform, and is indicated by the type identifier returned in \a vmIdType. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param vmId Pointer to caller-supplied buffer to hold VM ID - * @param size Size of buffer in bytes - * @param vmIdType Pointer to hold VM ID type - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vmId or \a vmIdType is NULL, or \a vgpuInstance is 0 - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char *vmId, unsigned int size, nvmlVgpuVmIdType_t *vmIdType); - -/** - * Retrieve the UUID of a vGPU instance. - * - * The UUID is a globally unique identifier associated with the vGPU, and is returned as a 5-part hexadecimal string, - * not exceeding 80 characters in length (including the NULL terminator). - * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param uuid Pointer to caller-supplied buffer to hold vGPU UUID - * @param size Size of buffer in bytes - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a uuid is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char *uuid, unsigned int size); - -/** - * Retrieve the NVIDIA driver version installed in the VM associated with a vGPU. - * - * The version is returned as an alphanumeric string in the caller-supplied buffer \a version. The length of the version - * string will not exceed 80 characters in length (including the NUL terminator). - * See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. - * - * nvmlVgpuInstanceGetVmDriverVersion() may be called at any time for a vGPU instance. The guest VM driver version is - * returned as "Not Available" if no NVIDIA driver is installed in the VM, or the VM has not yet booted to the point where the - * NVIDIA driver is loaded and initialized. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param version Caller-supplied buffer to return driver version string - * @param length Size of \a version buffer - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0 - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, char* version, unsigned int length); - -/** - * Retrieve the framebuffer usage in bytes. - * - * Framebuffer usage is the amont of vGPU framebuffer memory that is currently in use by the VM. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuInstance The identifier of the target instance - * @param fbUsage Pointer to framebuffer usage in bytes - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a fbUsage is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long *fbUsage); - -/** - * Retrieve the current licensing state of the vGPU instance. - * - * If the vGPU is currently licensed, \a licensed is set to 1, otherwise it is set to 0. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param licensed Reference to return the licensing status - * - * @return - * - \ref NVML_SUCCESS if \a licensed has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a licensed is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed); - -/** - * Retrieve the vGPU type of a vGPU instance. - * - * Returns the vGPU type ID of vgpu assigned to the vGPU instance. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param vgpuTypeId Reference to return the vgpuTypeId - * - * @return - * - \ref NVML_SUCCESS if \a vgpuTypeId has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a vgpuTypeId is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t *vgpuTypeId); - -/** - * Retrieve the frame rate limit set for the vGPU instance. - * - * Returns the value of the frame rate limit set for the vGPU instance - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param frameRateLimit Reference to return the frame rate limit - * - * @return - * - \ref NVML_SUCCESS if \a frameRateLimit has been set - * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a frameRateLimit is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int *frameRateLimit); - -/** - * Retrieve the current ECC mode of vGPU instance. - * - * @param vgpuInstance The identifier of the target vGPU instance - * @param eccMode Reference in which to return the current ECC mode - * - * @return - * - \ref NVML_SUCCESS if the vgpuInstance's ECC mode has been successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mode is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEccMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *eccMode); - -/** - * Retrieve the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. - * - * For Maxwell &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param encoderCapacity Reference to an unsigned int for the encoder capacity - * - * @return - * - \ref NVML_SUCCESS if \a encoderCapacity has been retrived - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a encoderQueryType is invalid - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int *encoderCapacity); - -/** - * Set the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. - * - * For Maxwell &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param encoderCapacity Unsigned int for the encoder capacity value - * - * @return - * - \ref NVML_SUCCESS if \a encoderCapacity has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a encoderCapacity is out of range of 0-100. - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int encoderCapacity); - -/** - * Retrieves the current encoder statistics of a vGPU Instance - * - * For Maxwell &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param sessionCount Reference to an unsigned int for count of active encoder sessions - * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions - * @param averageLatency Reference to an unsigned int for encode latency in microseconds - * - * @return - * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount , or \a averageFps or \a averageLatency is NULL - * or \a vgpuInstance is 0. - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, - unsigned int *averageFps, unsigned int *averageLatency); - -/** - * Retrieves information about all active encoder sessions on a vGPU Instance. - * - * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The - * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions - * written to the buffer. - * - * If the supplied buffer is not large enough to accomodate the active session array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. - * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return - * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. - * - * For Maxwell &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param sessionCount Reference to caller supplied array size, and returns - * the number of sessions. - * @param sessionInfo Reference to caller supplied array in which the list - * of session information us returned. - * - * @return - * - \ref NVML_SUCCESS if \a sessionInfo is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is - returned in \a sessionCount - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL, or \a vgpuInstance is 0. - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfo); - -/** -* Retrieves the active frame buffer capture sessions statistics of a vGPU Instance -* -* For Maxwell &tm; or newer fully supported devices. -* -* @param vgpuInstance Identifier of the target vGPU instance -* @param fbcStats Reference to nvmlFBCStats_t structure contianing NvFBC stats -* -* @return -* - \ref NVML_SUCCESS if \a fbcStats is fetched -* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized -* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a fbcStats is NULL -* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system -* - \ref NVML_ERROR_UNKNOWN on any unexpected error -*/ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCStats(nvmlVgpuInstance_t vgpuInstance, nvmlFBCStats_t *fbcStats); - -/** -* Retrieves information about active frame buffer capture sessions on a vGPU Instance. -* -* An array of active FBC sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. 
The -* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions -* written to the buffer. -* -* If the supplied buffer is not large enough to accomodate the active session array, the function returns -* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount. -* To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return -* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount. -* -* For Maxwell &tm; or newer fully supported devices. -* -* @note hResolution, vResolution, averageFPS and averageLatency data for a FBC session returned in \a sessionInfo may -* be zero if there are no new frames captured since the session started. -* -* @param vgpuInstance Identifier of the target vGPU instance -* @param sessionCount Reference to caller supplied array size, and returns the number of sessions. -* @param sessionInfo Reference in which to return the session information -* -* @return -* - \ref NVML_SUCCESS if \a sessionInfo is fetched -* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized -* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a sessionCount is NULL. -* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system -* - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount -* - \ref NVML_ERROR_UNKNOWN on any unexpected error -*/ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvml GRID Virtualization Migration - * This chapter describes operations that are associated with vGPU Migration. 
- * @{ - */ -/***************************************************************************************************/ - -/** - * Structure representing range of vGPU versions. - */ -typedef struct nvmlVgpuVersion_st -{ - unsigned int minVersion; //!< Minimum vGPU version. - unsigned int maxVersion; //!< Maximum vGPU version. -} nvmlVgpuVersion_t; - -/** - * vGPU metadata structure. - */ -typedef struct nvmlVgpuMetadata_st -{ - unsigned int version; //!< Current version of the structure - unsigned int revision; //!< Current revision of the structure - nvmlVgpuGuestInfoState_t guestInfoState; //!< Current state of Guest-dependent fields - char guestDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in guest - char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in host - unsigned int reserved[6]; //!< Reserved for internal use - unsigned int vgpuVirtualizationCaps; //!< vGPU virtualizaion capabilities bitfileld - unsigned int guestVgpuVersion; //!< vGPU version of guest driver - unsigned int opaqueDataSize; //!< Size of opaque data field in bytes - char opaqueData[4]; //!< Opaque data -} nvmlVgpuMetadata_t; - -/** - * Physical GPU metadata structure - */ -typedef struct nvmlVgpuPgpuMetadata_st -{ - unsigned int version; //!< Current version of the structure - unsigned int revision; //!< Current revision of the structure - char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Host driver version - unsigned int pgpuVirtualizationCaps; //!< Pgpu virtualizaion capabilities bitfileld - unsigned int reserved[5]; //!< Reserved for internal use - nvmlVgpuVersion_t hostSupportedVgpuRange; //!< vGPU version range supported by host driver - unsigned int opaqueDataSize; //!< Size of opaque data field in bytes - char opaqueData[4]; //!< Opaque data -} nvmlVgpuPgpuMetadata_t; - -/** - * vGPU VM compatibility codes - */ -typedef enum nvmlVgpuVmCompatibility_enum -{ - 
NVML_VGPU_VM_COMPATIBILITY_NONE = 0x0, //!< vGPU is not runnable - NVML_VGPU_VM_COMPATIBILITY_COLD = 0x1, //!< vGPU is runnable from a cold / powered-off state (ACPI S5) - NVML_VGPU_VM_COMPATIBILITY_HIBERNATE = 0x2, //!< vGPU is runnable from a hibernated state (ACPI S4) - NVML_VGPU_VM_COMPATIBILITY_SLEEP = 0x4, //!< vGPU is runnable from a sleeped state (ACPI S3) - NVML_VGPU_VM_COMPATIBILITY_LIVE = 0x8, //!< vGPU is runnable from a live/paused (ACPI S0) -} nvmlVgpuVmCompatibility_t; - -/** - * vGPU-pGPU compatibility limit codes - */ -typedef enum nvmlVgpuPgpuCompatibilityLimitCode_enum -{ - NVML_VGPU_COMPATIBILITY_LIMIT_NONE = 0x0, //!< Compatibility is not limited. - NVML_VGPU_COMPATIBILITY_LIMIT_HOST_DRIVER = 0x1, //!< ompatibility is limited by host driver version. - NVML_VGPU_COMPATIBILITY_LIMIT_GUEST_DRIVER = 0x2, //!< Compatibility is limited by guest driver version. - NVML_VGPU_COMPATIBILITY_LIMIT_GPU = 0x4, //!< Compatibility is limited by GPU hardware. - NVML_VGPU_COMPATIBILITY_LIMIT_OTHER = 0x80000000, //!< Compatibility is limited by an undefined factor. -} nvmlVgpuPgpuCompatibilityLimitCode_t; - -/** - * vGPU-pGPU compatibility structure - */ -typedef struct nvmlVgpuPgpuCompatibility_st -{ - nvmlVgpuVmCompatibility_t vgpuVmCompatibility; //!< Compatibility of vGPU VM. See \ref nvmlVgpuVmCompatibility_t - nvmlVgpuPgpuCompatibilityLimitCode_t compatibilityLimitCode; //!< Limiting factor for vGPU-pGPU compatibility. See \ref nvmlVgpuPgpuCompatibilityLimitCode_t -} nvmlVgpuPgpuCompatibility_t; - -/** - * Returns vGPU metadata structure for a running vGPU. The structure contains information about the vGPU and its associated VM - * such as the currently installed NVIDIA guest driver version, together with host driver version and an opaque data section - * containing internal state. - * - * nvmlVgpuInstanceGetMetadata() may be called at any time for a vGPU instance. 
Some fields in the returned structure are - * dependent on information obtained from the guest VM, which may not yet have reached a state where that information - * is available. The current state of these dependent fields is reflected in the info structure's \ref nvmlVgpuGuestInfoState_t field. - * - * The VMM may choose to read and save the vGPU's VM info as persistent metadata associated with the VM, and provide - * it to GRID Virtual GPU Manager when creating a vGPU for subsequent instances of the VM. - * - * The caller passes in a buffer via \a vgpuMetadata, with the size of the buffer in \a bufferSize. If the vGPU Metadata structure - * is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed - * in \a bufferSize. - * - * @param vgpuInstance vGPU instance handle - * @param vgpuMetadata Pointer to caller-supplied buffer into which vGPU metadata is written - * @param bufferSize Size of vgpuMetadata buffer - * - * @return - * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned - * - \ref NVML_ERROR_INSUFFICIENT_SIZE vgpuMetadata buffer is too small, required size is returned in \a bufferSize - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a vgpuInstance is 0; if \a vgpuMetadata is NULL and the value of \a bufferSize is not 0. - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuMetadata_t *vgpuMetadata, unsigned int *bufferSize); - -/** - * Returns a vGPU metadata structure for the physical GPU indicated by \a device. The structure contains information about - * the GPU and the currently installed NVIDIA host driver version that's controlling it, together with an opaque data section - * containing internal state. 
- * - * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the \a pgpuMetadata - * structure is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed - * in \a bufferSize. - * - * @param device The identifier of the target device - * @param pgpuMetadata Pointer to caller-supplied buffer into which \a pgpuMetadata is written - * @param bufferSize Pointer to size of \a pgpuMetadata buffer - * - * @return - * - \ref NVML_SUCCESS GPU metadata structure was successfully returned - * - \ref NVML_ERROR_INSUFFICIENT_SIZE pgpuMetadata buffer is too small, required size is returned in \a bufferSize - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0. - * - \ref NVML_ERROR_NOT_SUPPORTED vGPU is not supported by the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpuMetadata_t *pgpuMetadata, unsigned int *bufferSize); - -/** - * Takes a vGPU instance metadata structure read from \ref nvmlVgpuInstanceGetMetadata(), and a vGPU metadata structure for a - * physical GPU read from \ref nvmlDeviceGetVgpuMetadata(), and returns compatibility information of the vGPU instance and the - * physical GPU. - * - * The caller passes in a buffer via \a compatibilityInfo, into which a compatibility information structure is written. The - * structure defines the states in which the vGPU / VM may be booted on the physical GPU. If the vGPU / VM compatibility - * with the physical GPU is limited, a limit code indicates the factor limiting compability. - * (see \ref nvmlVgpuPgpuCompatibilityLimitCode_t for details). - * - * Note: vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to - * boot a given vGPU or associated VM. 
- * - * @param vgpuMetadata Pointer to caller-supplied vGPU metadata structure - * @param pgpuMetadata Pointer to caller-supplied GPU metadata structure - * @param compatibilityInfo Pointer to caller-supplied buffer to hold compatibility info - * - * @return - * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuMetadata or \a pgpuMetadata or \a bufferSize are NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t *vgpuMetadata, nvmlVgpuPgpuMetadata_t *pgpuMetadata, nvmlVgpuPgpuCompatibility_t *compatibilityInfo); - -/** - * Returns the properties of the physical GPU indicated by the device in an ascii-encoded string format. - * - * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the - * string is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed - * in \a bufferSize. - * - * @param device The identifier of the target device - * @param pgpuMetadata Pointer to caller-supplied buffer into which \a pgpuMetadata is written - * @param bufferSize Pointer to size of \a pgpuMetadata buffer - * - * @return - * - \ref NVML_SUCCESS GPU metadata structure was successfully returned - * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a pgpuMetadata buffer is too small, required size is returned in \a bufferSize - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0. 
- * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPgpuMetadataString(nvmlDevice_t device, char *pgpuMetadata, unsigned int *bufferSize); - -/* - * Virtual GPU (vGPU) version - * - * The NVIDIA vGPU Manager and the guest drivers are tagged with a range of supported vGPU versions. This determines the range of NVIDIA guest driver versions that - * are compatible for vGPU feature support with a given NVIDIA vGPU Manager. For vGPU feature support, the range of supported versions for the NVIDIA vGPU Manager - * and the guest driver must overlap. Otherwise, the guest driver fails to load in the VM. - * - * When the NVIDIA guest driver loads, either when the VM is booted or when the driver is installed or upgraded, a negotiation occurs between the guest driver - * and the NVIDIA vGPU Manager to select the highest mutually compatible vGPU version. The negotiated vGPU version stays the same across VM migration. - */ - -/** - * Query the ranges of supported vGPU versions. - * - * This function gets the linear range of supported vGPU versions that is preset for the NVIDIA vGPU Manager and the range set by an administrator. - * If the preset range has not been overridden by \ref nvmlSetVgpuVersion, both ranges are the same. - * - * The caller passes pointers to the following \ref nvmlVgpuVersion_t structures, into which the NVIDIA vGPU Manager writes the ranges: - * 1. \a supported structure that represents the preset range of vGPU versions supported by the NVIDIA vGPU Manager. - * 2. \a current structure that represents the range of supported vGPU versions set by an administrator. By default, this range is the same as the preset range. 
- * - * @param supported Pointer to the structure in which the preset range of vGPU versions supported by the NVIDIA vGPU Manager is written - * @param current Pointer to the structure in which the range of supported vGPU versions set by an administrator is written - * - * @return - * - \ref NVML_SUCCESS The vGPU version range structures were successfully obtained. - * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported. - * - \ref NVML_ERROR_INVALID_ARGUMENT The \a supported parameter or the \a current parameter is NULL. - * - \ref NVML_ERROR_UNKNOWN An error occurred while the data was being fetched. - */ -nvmlReturn_t DECLDIR nvmlGetVgpuVersion(nvmlVgpuVersion_t *supported, nvmlVgpuVersion_t *current); - -/** - * Override the preset range of vGPU versions supported by the NVIDIA vGPU Manager with a range set by an administrator. - * - * This function configures the NVIDIA vGPU Manager with a range of supported vGPU versions set by an administrator. This range must be a subset of the - * preset range that the NVIDIA vGPU Manager supports. The custom range set by an administrator takes precedence over the preset range and is advertised to - * the guest VM for negotiating the vGPU version. See \ref nvmlGetVgpuVersion for details of how to query the preset range of versions supported. - * - * This function takes a pointer to vGPU version range structure \ref nvmlVgpuVersion_t as input to override the preset vGPU version range that the NVIDIA vGPU Manager supports. - * - * After host system reboot or driver reload, the range of supported versions reverts to the range that is preset for the NVIDIA vGPU Manager. - * - * @note 1. The range set by the administrator must be a subset of the preset range that the NVIDIA vGPU Manager supports. Otherwise, an error is returned. - * 2. If the range of supported guest driver versions does not overlap the range set by the administrator, the guest driver fails to load. - * 3. 
If the range of supported guest driver versions overlaps the range set by the administrator, the guest driver will load with a negotiated - * vGPU version that is the maximum value in the overlapping range. - * 4. No VMs must be running on the host when this function is called. If a VM is running on the host, the call to this function fails. - * - * @param vgpuVersion Pointer to a caller-supplied range of supported vGPU versions. - * - * @return - * - \ref NVML_SUCCESS The preset range of supported vGPU versions was successfully overridden. - * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported. - * - \ref NVML_ERROR_IN_USE The range was not overridden because a VM is running on the host. - * - \ref NVML_ERROR_INVALID_ARGUMENT The \a vgpuVersion parameter specifies a range that is outside the range supported by the NVIDIA vGPU Manager or if \a vgpuVersion is NULL. - */ -nvmlReturn_t DECLDIR nvmlSetVgpuVersion(nvmlVgpuVersion_t *vgpuVersion); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlUtil GRID Virtualization Utilization and Accounting - * This chapter describes operations that are associated with vGPU Utilization and Accounting. - * @{ - */ -/***************************************************************************************************/ - -/** - * Retrieves current utilization for vGPUs on a physical GPU (device). - * - * For Kepler &tm; or newer fully supported devices. - * - * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for vGPU instances running - * on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer - * pointed at by \a utilizationSamples. One utilization sample structure is returned per vGPU instance, and includes the - * CPU timestamp at which the samples were recorded. 
Individual utilization values are returned as "unsigned int" values - * in nvmlValue_t unions. The function sets the caller-supplied \a sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to - * indicate the returned value type. - * - * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with - * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance - * count in \a vgpuInstanceSamplesCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate - * a buffer of size vgpuInstanceSamplesCount * sizeof(nvmlVgpuInstanceUtilizationSample_t). Invoke the function again with - * the allocated buffer passed in \a utilizationSamples, and \a vgpuInstanceSamplesCount set to the number of entries the - * buffer is sized for. - * - * On successful return, the function updates \a vgpuInstanceSampleCount with the number of vGPU utilization sample - * structures that were actually written. This may differ from a previously read value as vGPU instances are created or - * destroyed. - * - * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 - * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp - * to a timeStamp retrieved from a previous query to read utilization since the previous query. - * - * @param device The identifier for the target device - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. 
- * @param sampleValType Pointer to caller-supplied buffer to hold the type of returned sample values - * @param vgpuInstanceSamplesCount Pointer to caller-supplied array size, and returns number of vGPU instances - * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU utilization samples are returned - - * @return - * - \ref NVML_SUCCESS if utilization samples are successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuInstanceSamplesCount or \a sampleValType is - * NULL, or a sample count of 0 is passed with a non-NULL \a utilizationSamples - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuInstanceSamplesCount is too small to return samples for all - * vGPU instances currently executing on the device - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, - nvmlValueType_t *sampleValType, unsigned int *vgpuInstanceSamplesCount, - nvmlVgpuInstanceUtilizationSample_t *utilizationSamples); - -/** - * Retrieves current utilization for processes running on vGPUs on a physical GPU (device). - * - * For Maxwell &tm; or newer fully supported devices. - * - * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running on - * vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the - * caller-supplied buffer pointed at by \a utilizationSamples. 
One utilization sample structure is returned per process running - * on vGPU instances, that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which - * the samples were recorded. Individual utilization values are returned as "unsigned int" values. - * - * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with - * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance - * count in \a vgpuProcessSamplesCount. The caller should allocate a buffer of size - * vgpuProcessSamplesCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with - * the allocated buffer passed in \a utilizationSamples, and \a vgpuProcessSamplesCount set to the number of entries the - * buffer is sized for. - * - * On successful return, the function updates \a vgpuSubProcessSampleCount with the number of vGPU sub process utilization sample - * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active - * in any given sample period. - * - * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 - * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp - * to a timeStamp retrieved from a previous query to read utilization since the previous query. - * - * @param device The identifier for the target device - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. 
- * @param vgpuProcessSamplesCount Pointer to caller-supplied array size, and returns number of processes running on vGPU instances - * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU sub process utilization samples are returned - - * @return - * - \ref NVML_SUCCESS if utilization samples are successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuProcessSamplesCount or a sample count of 0 is - * passed with a non-NULL \a utilizationSamples - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuProcessSamplesCount is too small to return samples for all - * vGPU instances currently executing on the device - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, - unsigned int *vgpuProcessSamplesCount, - nvmlVgpuProcessUtilizationSample_t *utilizationSamples); -/** - * Queries the state of per process accounting mode on vGPU. - * - * For Maxwell &tm; or newer fully supported devices. 
- * - * @param vgpuInstance The identifier of the target vGPU instance - * @param mode Reference in which to return the current accounting mode - * - * @return - * - \ref NVML_SUCCESS if the mode has been successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mode is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature - * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running on the vGPU instance - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *mode); - -/** - * Queries list of processes running on vGPU that can be queried for accounting stats. The list of processes - * returned can be in running or terminated state. - * - * For Maxwell &tm; or newer fully supported devices. - * - * To just query the maximum number of processes that can be queried, call this function with *count = 0 and - * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. - * - * For more details see \ref nvmlVgpuInstanceGetAccountingStats. - * - * @note In case of PID collision some processes might not be accessible before the circular buffer is full. 
- * - * @param vgpuInstance The identifier of the target vGPU instance - * @param count Reference in which to provide the \a pids array size, and - * to return the number of elements ready to be queried - * @param pids Reference in which to return list of process ids - * - * @return - * - \ref NVML_SUCCESS if pids were successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a count is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to expected value) - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlVgpuInstanceGetAccountingPids - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingPids(nvmlVgpuInstance_t vgpuInstance, unsigned int *count, unsigned int *pids); - -/** - * Queries process's accounting stats. - * - * For Maxwell &tm; or newer fully supported devices. - * - * Accounting stats capture GPU utilization and other statistics across the lifetime of a process, and - * can be queried during life time of the process or after its termination. - * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and - * updated to actual running time after its termination. - * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old - * processes. - * - * See \ref nvmlAccountingStats_t for description of each returned metric. - * List of processes that can be queried can be retrieved from \ref nvmlVgpuInstanceGetAccountingPids. - * - * @note Accounting Mode needs to be on. See \ref nvmlVgpuInstanceGetAccountingMode. - * @note Only compute and graphics applications stats can be queried. 
Monitoring applications stats can't be - * queried since they don't contribute to GPU utilization. - * @note In case of pid collision stats of only the latest process (that terminated last) will be reported - * - * @param vgpuInstance The identifier of the target vGPU instance - * @param pid Process Id of the target process to query stats for - * @param stats Reference in which to return the process's accounting stats - * - * @return - * - \ref NVML_SUCCESS if stats have been successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a stats is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * or \a stats is not found - * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingStats(nvmlVgpuInstance_t vgpuInstance, unsigned int pid, nvmlAccountingStats_t *stats); - -/** - * Clears accounting information of the vGPU instance that have already terminated. - * - * For Maxwell &tm; or newer fully supported devices. - * Requires root/admin permissions. - * - * @note Accounting Mode needs to be on. See \ref nvmlVgpuInstanceGetAccountingMode. - * @note Only compute and graphics applications stats are reported and can be cleared since monitoring applications - * stats don't contribute to GPU utilization. 
- * - * @param vgpuInstance The identifier of the target vGPU instance - * - * @return - * - \ref NVML_SUCCESS if accounting information has been cleared - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceClearAccountingPids(nvmlVgpuInstance_t vgpuInstance); -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlGpuBlacklistQueries GPU Blacklist Queries - * This chapter describes NVML operations that are associated with blacklisted GPUs. - * @{ - */ -/***************************************************************************************************/ - -/** - * Blacklist GPU device information - **/ -typedef struct nvmlBlacklistDeviceInfo_st -{ - nvmlPciInfo_t pciInfo; //!< The PCI information for the blacklisted GPU - char uuid[NVML_DEVICE_UUID_BUFFER_SIZE]; //!< The ASCII string UUID for the blacklisted GPU -} nvmlBlacklistDeviceInfo_t; - - /** - * Retrieves the number of blacklisted GPU devices in the system. - * - * For all products. - * - * @param deviceCount Reference in which to return the number of blacklisted devices - * - * @return - * - \ref NVML_SUCCESS if \a deviceCount has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL - */ -nvmlReturn_t DECLDIR nvmlGetBlacklistDeviceCount(unsigned int *deviceCount); - -/** - * Acquire the device information for a blacklisted device, based on its index. - * - * For all products. - * - * Valid indices are derived from the \a deviceCount returned by - * \ref nvmlGetBlacklistDeviceCount(). 
For example, if \a deviceCount is 2 the valid indices - * are 0 and 1, corresponding to GPU 0 and GPU 1. - * - * @param index The index of the target GPU, >= 0 and < \a deviceCount - * @param info Reference in which to return the device information - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a info is NULL - * - * @see nvmlGetBlacklistDeviceCount - */ -nvmlReturn_t DECLDIR nvmlGetBlacklistDeviceInfoByIndex(unsigned int index, nvmlBlacklistDeviceInfo_t *info); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlMultiInstanceGPU Multi Instance GPU Management - * This chapter describes NVML operations that are associated with Multi Instance GPU management. - * @{ - */ -/***************************************************************************************************/ - -/** - * Disable Multi Instance GPU mode. - */ -#define NVML_DEVICE_MIG_DISABLE 0x0 - -/** - * Enable Multi Instance GPU mode. - */ -#define NVML_DEVICE_MIG_ENABLE 0x1 - -/** - * GPU instance profiles. - * - * These macros should be passed to \ref nvmlDeviceGetGpuInstanceProfileInfo to retrieve the - * detailed information about a GPU instance such as profile ID, engine counts. 
- */ -#define NVML_GPU_INSTANCE_PROFILE_1_SLICE 0x0 -#define NVML_GPU_INSTANCE_PROFILE_2_SLICE 0x1 -#define NVML_GPU_INSTANCE_PROFILE_3_SLICE 0x2 -#define NVML_GPU_INSTANCE_PROFILE_4_SLICE 0x3 -#define NVML_GPU_INSTANCE_PROFILE_7_SLICE 0x4 -#define NVML_GPU_INSTANCE_PROFILE_COUNT 0x5 - -typedef struct nvmlGpuInstancePlacement_st -{ - unsigned int start; - unsigned int size; -} nvmlGpuInstancePlacement_t; - -typedef struct nvmlGpuInstanceProfileInfo_st -{ - unsigned int id; //!< Unique profile ID within the device - unsigned int isP2pSupported; //!< Peer-to-Peer support - unsigned int sliceCount; //!< GPU Slice count - unsigned int instanceCount; //!< GPU instance count - unsigned int multiprocessorCount; //!< Streaming Multiprocessor count - unsigned int copyEngineCount; //!< Copy Engine count - unsigned int decoderCount; //!< Decoder Engine count - unsigned int encoderCount; //!< Encoder Engine count - unsigned int jpegCount; //!< JPEG Engine count - unsigned int ofaCount; //!< OFA Engine count - unsigned long long memorySizeMB; //!< Memory size in MBytes -} nvmlGpuInstanceProfileInfo_t; - -typedef struct nvmlGpuInstanceInfo_st -{ - nvmlDevice_t device; //!< Parent device - unsigned int id; //!< Unique instance ID within the device - unsigned int profileId; //!< Unique profile ID within the device - nvmlGpuInstancePlacement_t placement; //!< Placement for this instance -} nvmlGpuInstanceInfo_t; - -typedef struct nvmlGpuInstance_st* nvmlGpuInstance_t; - -/** - * Compute instance profiles. 
- * - * These macros should be passed to \ref nvmlGpuInstanceGetComputeInstanceProfileInfo to retrieve the - * detailed information about a compute instance such as profile ID, engine counts - */ -#define NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE 0x0 -#define NVML_COMPUTE_INSTANCE_PROFILE_2_SLICE 0x1 -#define NVML_COMPUTE_INSTANCE_PROFILE_3_SLICE 0x2 -#define NVML_COMPUTE_INSTANCE_PROFILE_4_SLICE 0x3 -#define NVML_COMPUTE_INSTANCE_PROFILE_7_SLICE 0x4 -#define NVML_COMPUTE_INSTANCE_PROFILE_COUNT 0x5 - -#define NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED 0x0 //!< All the engines except multiprocessors would be shared -#define NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT 0x1 - -typedef struct nvmlComputeInstanceProfileInfo_st -{ - unsigned int id; //!< Unique profile ID within the GPU instance - unsigned int sliceCount; //!< GPU Slice count - unsigned int instanceCount; //!< Compute instance count - unsigned int multiprocessorCount; //!< Streaming Multiprocessor count - unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count - unsigned int sharedDecoderCount; //!< Shared Decoder Engine count - unsigned int sharedEncoderCount; //!< Shared Encoder Engine count - unsigned int sharedJpegCount; //!< Shared JPEG Engine count - unsigned int sharedOfaCount; //!< Shared OFA Engine count -} nvmlComputeInstanceProfileInfo_t; - -typedef struct nvmlComputeInstanceInfo_st -{ - nvmlDevice_t device; //!< Parent device - nvmlGpuInstance_t gpuInstance; //!< Parent GPU instance - unsigned int id; //!< Unique instance ID within the GPU instance - unsigned int profileId; //!< Unique profile ID within the GPU instance -} nvmlComputeInstanceInfo_t; - -typedef struct nvmlComputeInstance_st* nvmlComputeInstance_t; - -/** - * Set MIG mode for the device. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * Requires root user. - * - * This mode determines whether a GPU instance can be created. 
- * - * This API may unbind or reset the device to activate the requested mode. Thus, the attributes associated with the - * device, such as minor number, might change. The caller of this API is expected to query such attributes again. - * - * On certain platforms like pass-through virtualization, where reset functionality may not be exposed directly, VM - * reboot is required. \a activationStatus would return \ref NVML_ERROR_RESET_REQUIRED for such cases. - * - * \a activationStatus would return the appropriate error code upon unsuccessful activation. For example, if device - * unbind fails because the device isn't idle, \ref NVML_ERROR_IN_USE would be returned. The caller of this API - * is expected to idle the device and retry setting the \a mode. - * - * @param device The identifier of the target device - * @param mode The mode to be set, \ref NVML_DEVICE_MIG_DISABLE or - * \ref NVML_DEVICE_MIG_ENABLE - * @param activationStatus The activationStatus status - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device,\a mode or \a activationStatus are invalid - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG mode - */ -nvmlReturn_t DECLDIR nvmlDeviceSetMigMode(nvmlDevice_t device, unsigned int mode, nvmlReturn_t *activationStatus); - -/** - * Get MIG mode for the device. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * - * Changing MIG modes may require device unbind or reset. The "pending" MIG mode refers to the target mode following the - * next activation trigger. 
- * - * @param device The identifier of the target device - * @param currentMode Returns the current mode, \ref NVML_DEVICE_MIG_DISABLE or - * \ref NVML_DEVICE_MIG_ENABLE - * @param pendingMode Returns the pending mode, \ref NVML_DEVICE_MIG_DISABLE or - * \ref NVML_DEVICE_MIG_ENABLE - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a currentMode or \a pendingMode are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG mode - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMigMode(nvmlDevice_t device, unsigned int *currentMode, unsigned int *pendingMode); - -/** - * Get GPU instance profile information. - * - * Information provided by this API is immutable throughout the lifetime of a MIG mode. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * Requires privileged user. - * - * @param device The identifier of the target device - * @param profile One of the NVML_GPU_INSTANCE_PROFILE_* - * @param info Returns detailed profile information - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile or \a info are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profile isn't supported - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfo(nvmlDevice_t device, unsigned int profile, - nvmlGpuInstanceProfileInfo_t *info); - -/** - * Get GPU instance placements. - * - * A placement represents the location of a GPU instance within a device. This API only returns all the possible - * placements for the given profile. - * - * For newer than Volta &tm; fully supported devices. 
- * Supported on Linux only. - * Requires privileged user. - * - * @param device The identifier of the target device - * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo - * @param placements Returns placements, the buffer must be large enough to accommodate - * the instances supported by the profile. - * See \ref nvmlDeviceGetGpuInstanceProfileInfo - * @param count The count of returned placements - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId, \a placements or \a count are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profileId isn't supported - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstancePossiblePlacements(nvmlDevice_t device, unsigned int profileId, - nvmlGpuInstancePlacement_t *placements, - unsigned int *count); - -/** - * Get GPU instance profile capacity. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * Requires privileged user. - * - * @param device The identifier of the target device - * @param profileId The GPU instance profile ID. 
See \ref nvmlDeviceGetGpuInstanceProfileInfo - * @param count Returns remaining instance count for the profile ID - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId or \a count are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profileId isn't supported - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceRemainingCapacity(nvmlDevice_t device, unsigned int profileId, - unsigned int *count); - -/** - * Create GPU instance. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * Requires privileged user. - * - * If the parent device is unbound, reset or the GPU instance is destroyed explicitly, the GPU instance handle would - * become invalid. The GPU instance must be recreated to acquire a valid handle. - * - * @param device The identifier of the target device - * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo - * @param gpuInstance Returns the GPU instance handle - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile, \a profileId or \a gpuInstance are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested GPU instance could not be created - */ -nvmlReturn_t DECLDIR nvmlDeviceCreateGpuInstance(nvmlDevice_t device, unsigned int profileId, - nvmlGpuInstance_t *gpuInstance); - -/** - * Destroy GPU instance. - * - * For newer than Volta &tm; fully supported devices. 
- * Supported on Linux only. - * Requires privileged user. - * - * @param gpuInstance The GPU instance handle - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - * - \ref NVML_ERROR_IN_USE If the GPU instance is in use. This error would be returned if processes - * (e.g. CUDA application) or compute instances are active on the - * GPU instance. - */ -nvmlReturn_t DECLDIR nvmlGpuInstanceDestroy(nvmlGpuInstance_t gpuInstance); - -/** - * Get GPU instances for given profile ID. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * Requires privileged user. - * - * @param device The identifier of the target device - * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo - * @param gpuInstances Returns pre-exiting GPU instances, the buffer must be large enough to - * accommodate the instances supported by the profile. - * See \ref nvmlDeviceGetGpuInstanceProfileInfo - * @param count The count of returned GPU instances - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId, \a gpuInstances or \a count are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstances(nvmlDevice_t device, unsigned int profileId, - nvmlGpuInstance_t *gpuInstances, unsigned int *count); - -/** - * Get GPU instances for given instance ID. 
- * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * Requires privileged user. - * - * @param device The identifier of the target device - * @param id The GPU instance ID - * @param gpuInstance Returns GPU instance - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a id or \a gpuInstance are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - * - \ref NVML_ERROR_NOT_FOUND If the GPU instance is not found. - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceById(nvmlDevice_t device, unsigned int id, nvmlGpuInstance_t *gpuInstance); - -/** - * Get GPU instance information. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * - * @param gpuInstance The GPU instance handle - * @param info Return GPU instance information - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance or \a info are invalid - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - */ -nvmlReturn_t DECLDIR nvmlGpuInstanceGetInfo(nvmlGpuInstance_t gpuInstance, nvmlGpuInstanceInfo_t *info); - -/** - * Get compute instance profile information. - * - * Information provided by this API is immutable throughout the lifetime of a MIG mode. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * Requires privileged user. 
- * - * @param gpuInstance The identifier of the target GPU instance - * @param profile One of the NVML_COMPUTE_INSTANCE_PROFILE_* - * @param engProfile One of the NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_* - * @param info Returns detailed profile information - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a engProfile or \a info are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a profile isn't supported - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - */ -nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceProfileInfo(nvmlGpuInstance_t gpuInstance, unsigned int profile, - unsigned int engProfile, - nvmlComputeInstanceProfileInfo_t *info); - -/** - * Get compute instance profile capacity. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * Requires privileged user. - * - * @param gpuInstance The identifier of the target GPU instance - * @param profileId The compute instance profile ID. - * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo - * @param count Returns remaining instance count for the profile ID - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId or \a availableCount are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - */ -nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceRemainingCapacity(nvmlGpuInstance_t gpuInstance, - unsigned int profileId, unsigned int *count); - -/** - * Create compute instance. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * Requires privileged user. 
- * - * If the parent device is unbound, reset or the parent GPU instance is destroyed or the compute instance is destroyed - * explicitly, the compute instance handle would become invalid. The compute instance must be recreated to acquire - * a valid handle. - * - * @param gpuInstance The identifier of the target GPU instance - * @param profileId The compute instance profile ID. - * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo - * @param computeInstance Returns the compute instance handle - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a profileId or \a computeInstance - * are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested compute instance could not be created - */ -nvmlReturn_t DECLDIR nvmlGpuInstanceCreateComputeInstance(nvmlGpuInstance_t gpuInstance, unsigned int profileId, - nvmlComputeInstance_t *computeInstance); - -/** - * Destroy compute instance. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * Requires privileged user. - * - * @param computeInstance The compute instance handle - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a computeInstance is invalid - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - * - \ref NVML_ERROR_IN_USE If the compute instance is in use. This error would be returned if - * processes (e.g. CUDA application) are active on the compute instance. 
- */ -nvmlReturn_t DECLDIR nvmlComputeInstanceDestroy(nvmlComputeInstance_t computeInstance); - -/** - * Get compute instances for given profile ID. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * Requires privileged user. - * - * @param gpuInstance The identifier of the target GPU instance - * @param profileId The compute instance profile ID. - * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo - * @param computeInstances Returns pre-exiting compute instances, the buffer must be large enough to - * accommodate the instances supported by the profile. - * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo - * @param count The count of returned compute instances - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId, \a computeInstances or \a count - * are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - */ -nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstances(nvmlGpuInstance_t gpuInstance, unsigned int profileId, - nvmlComputeInstance_t *computeInstances, unsigned int *count); - -/** - * Get compute instance for given instance ID. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * Requires privileged user. 
- * - * @param gpuInstance The identifier of the target GPU instance - * @param id The compute instance ID - * @param computeInstance Returns compute instance - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a ID or \a computeInstance are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - * - \ref NVML_ERROR_NOT_FOUND If the compute instance is not found. - */ -nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceById(nvmlGpuInstance_t gpuInstance, unsigned int id, - nvmlComputeInstance_t *computeInstance); - -/** - * Get compute instance information. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * - * @param computeInstance The compute instance handle - * @param info Return compute instance information - * - * @return - * - \ref NVML_SUCCESS Upon success - * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT If \a computeInstance or \a info are invalid - * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation - */ -nvmlReturn_t DECLDIR nvmlComputeInstanceGetInfo(nvmlComputeInstance_t computeInstance, nvmlComputeInstanceInfo_t *info); - -/** - * Test if the given handle refers to a MIG device. - * - * A MIG device handle is an NVML abstraction which maps to a MIG compute instance. - * These overloaded references can be used (with some restrictions) interchangeably - * with a GPU device handle to execute queries at a per-compute instance granularity. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. 
- * - * @param device NVML handle to test - * @param isMigDevice True when handle refers to a MIG device - * - * @return - * - \ref NVML_SUCCESS if \a device status was successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle or \a isMigDevice reference is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceIsMigDeviceHandle(nvmlDevice_t device, unsigned int *isMigDevice); - -/** - * Get GPU instance ID for the given MIG device handle. - * - * GPU instance IDs are unique per device and remain valid until the GPU instance is destroyed. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * - * @param device Target MIG device handle - * @param id GPU instance ID - * - * @return - * - \ref NVML_SUCCESS if instance ID was successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceId(nvmlDevice_t device, unsigned int *id); - -/** - * Get compute instance ID for the given MIG device handle. - * - * Compute instance IDs are unique per GPU instance and remain valid until the compute instance - * is destroyed. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. 
- * - * @param device Target MIG device handle - * @param id Compute instance ID - * - * @return - * - \ref NVML_SUCCESS if instance ID was successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetComputeInstanceId(nvmlDevice_t device, unsigned int *id); - -/** - * Get the maximum number of MIG devices that can exist under a given parent NVML device. - * - * Returns zero if MIG is not supported or enabled. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * - * @param device Target device handle - * @param count Count of MIG devices - * - * @return - * - \ref NVML_SUCCESS if \a count was successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a count reference is invalid - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxMigDeviceCount(nvmlDevice_t device, unsigned int *count); - -/** - * Get MIG device handle for the given index under its parent NVML device. - * - * If the compute instance is destroyed either explicitly or by destroying, - * resetting or unbinding the parent GPU instance or the GPU device itself - * the MIG device handle would remain invalid and must be requested again - * using this API. Handles may be reused and their properties can change in - * the process. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. 
- * - * @param device Reference to the parent GPU device handle - * @param index Index of the MIG device - * @param migDevice Reference to the MIG device handle - * - * @return - * - \ref NVML_SUCCESS if \a migDevice handle was successfully created - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a index or \a migDevice reference is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_NOT_FOUND if no valid MIG device was found at \a index - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMigDeviceHandleByIndex(nvmlDevice_t device, unsigned int index, - nvmlDevice_t *migDevice); - -/** - * Get parent device handle from a MIG device handle. - * - * For newer than Volta &tm; fully supported devices. - * Supported on Linux only. - * - * @param migDevice MIG device handle - * @param device Device handle - * - * @return - * - \ref NVML_SUCCESS if \a device handle was successfully created - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a migDevice or \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDeviceHandleFromMigDeviceHandle(nvmlDevice_t migDevice, nvmlDevice_t *device); - -/** @} */ - -/** - * NVML API versioning support - */ - -#ifdef NVML_NO_UNVERSIONED_FUNC_DEFS -nvmlReturn_t DECLDIR nvmlInit(void); -nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount); -nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); -nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId(const char *pciBusId, nvmlDevice_t *device); -nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci); 
-nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v2(nvmlDevice_t device, nvmlPciInfo_t *pci); -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); -nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); -nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v2(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); -nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu(nvmlPciInfo_t *pciInfo); -nvmlReturn_t DECLDIR nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); -nvmlReturn_t DECLDIR nvmlDeviceGetAttributes(nvmlDevice_t device, nvmlDeviceAttributes_t *attributes); -#endif // #ifdef NVML_NO_UNVERSIONED_FUNC_DEFS - -#if defined(NVML_NO_UNVERSIONED_FUNC_DEFS) -// We don't define APIs to run new versions if this guard is present so there is -// no need to undef -#elif defined(__NVML_API_VERSION_INTERNAL) -#undef nvmlDeviceGetAttributes -#undef nvmlEventSetWait -#undef nvmlDeviceGetGridLicensableFeatures -#undef nvmlDeviceRemoveGpu -#undef nvmlDeviceGetNvLinkRemotePciInfo -#undef nvmlDeviceGetPciInfo -#undef nvmlDeviceGetCount -#undef nvmlDeviceGetHandleByIndex -#undef nvmlDeviceGetHandleByPciBusId -#undef nvmlInit -#endif - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/bindings/go/nvml/nvml_dl.go b/bindings/go/nvml/nvml_dl.go deleted file mode 100644 index 21da6dd4..00000000 --- a/bindings/go/nvml/nvml_dl.go +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. - -// +build linux darwin - -package nvml - -import ( - "unsafe" -) - -/* -#include -#include "nvml.h" - -// We wrap the call to nvmlInit() here to ensure that we pick up the correct -// version of this call. The macro magic in nvml.h that #defines the symbol -// 'nvmlInit' to 'nvmlInit_v2' is unfortunately lost on cgo. 
-static nvmlReturn_t nvmlInit_dl(void) { - return nvmlInit(); -} -*/ -import "C" - -type dlhandles struct{ handles []unsafe.Pointer } - -var dl dlhandles - -// Initialize NVML, opening a dynamic reference to the NVML library in the process. -func (dl *dlhandles) nvmlInit() C.nvmlReturn_t { - handle := C.dlopen(C.CString("libnvidia-ml.so.1"), C.RTLD_LAZY|C.RTLD_GLOBAL) - if handle == C.NULL { - return C.NVML_ERROR_LIBRARY_NOT_FOUND - } - dl.handles = append(dl.handles, handle) - return C.nvmlInit_dl() -} - -// Shutdown NVML, closing our dynamic reference to the NVML library in the process. -func (dl *dlhandles) nvmlShutdown() C.nvmlReturn_t { - ret := C.nvmlShutdown() - if ret != C.NVML_SUCCESS { - return ret - } - - for _, handle := range dl.handles { - err := C.dlclose(handle) - if err != 0 { - return C.NVML_ERROR_UNKNOWN - } - } - - return C.NVML_SUCCESS -} - -// Check to see if a specific symbol is present in the NVML library. -func (dl *dlhandles) lookupSymbol(symbol string) C.nvmlReturn_t { - for _, handle := range dl.handles { - C.dlerror() - C.dlsym(handle, C.CString(symbol)) - if unsafe.Pointer(C.dlerror()) == C.NULL { - return C.NVML_SUCCESS - } - } - return C.NVML_ERROR_FUNCTION_NOT_FOUND -} diff --git a/bindings/go/nvml/nvml_dl_windows.go b/bindings/go/nvml/nvml_dl_windows.go deleted file mode 100644 index 4b941704..00000000 --- a/bindings/go/nvml/nvml_dl_windows.go +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. - -// +build windows - -package nvml - -import ( - "syscall" -) - -/* -#include "nvml.h" - -// We wrap the call to nvmlInit() here to ensure that we pick up the correct -// version of this call. The macro magic in nvml.h that #defines the symbol -// 'nvmlInit' to 'nvmlInit_v2' is unfortunately lost on cgo. 
-static nvmlReturn_t nvmlInit_dl(void) { - return nvmlInit(); -} -*/ -import "C" - -type dlhandles struct{ handles []*syscall.LazyDLL } - -var dl dlhandles - -// Initialize NVML, opening a dynamic reference to the NVML library in the process. -func (dl *dlhandles) nvmlInit() C.nvmlReturn_t { - handle := syscall.NewLazyDLL("nvml.dll") - if handle == nil { - return C.NVML_ERROR_LIBRARY_NOT_FOUND - } - dl.handles = append(dl.handles, handle) - return C.nvmlInit_dl() -} - -// Shutdown NVML, closing our dynamic reference to the NVML library in the process. -func (dl *dlhandles) nvmlShutdown() C.nvmlReturn_t { - ret := C.nvmlShutdown() - if ret != C.NVML_SUCCESS { - return ret - } - - dl.handles = dl.handles[:0] - - return C.NVML_SUCCESS -} - -// Check to see if a specific symbol is present in the NVML library. -func (dl *dlhandles) lookupSymbol(symbol string) C.nvmlReturn_t { - for _, handle := range dl.handles { - if proc := handle.NewProc(symbol); proc != nil { - return C.NVML_SUCCESS - } - } - return C.NVML_ERROR_FUNCTION_NOT_FOUND -} diff --git a/bindings/go/nvml/nvml_test.go b/bindings/go/nvml/nvml_test.go deleted file mode 100644 index e5f2cb0f..00000000 --- a/bindings/go/nvml/nvml_test.go +++ /dev/null @@ -1,218 +0,0 @@ -package nvml - -import ( - "math" - "strconv" - "strings" - "testing" - - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvsmi" -) - -func check(err error, t *testing.T) { - if err != nil { - t.Errorf("%v\n", err) - } -} - -func TestDeviceCount(t *testing.T) { - Init() - defer Shutdown() - - count, err := GetDeviceCount() - check(err, t) - - query := "count" - c := nvsmi.DeviceCount(query) - - if c != count { - t.Errorf("Device Count from nvml is wrong, got %d, want: %d", count, c) - } -} - -func BenchmarkDeviceCount1(b *testing.B) { - Init() - - b.StartTimer() - for n := 0; n < b.N; n++ { - GetDeviceCount() - } - b.StopTimer() - - Shutdown() -} - -func TestDriverVersion(t *testing.T) { - Init() - defer Shutdown() - - driverVersion, err 
:= GetDriverVersion() - check(err, t) - - // assuming device count check to be passed before this test - id := "0" - query := "driver_version" - res := nvsmi.Query(id, query) - - if strings.Compare(res, driverVersion) != 0 { - t.Errorf("Driver version from nvml is wrong, got: %v, want: %v", driverVersion, res) - } -} - -func TestDeviceInfo(t *testing.T) { - Init() - defer Shutdown() - - fields := []string{ - "uuid", - "name", - "pci.bus_id", - "power.limit", - "clocks.max.sm", - "clocks.max.memory", - } - - count, err := GetDeviceCount() - check(err, t) - - for i := uint(0); i < count; i++ { - device, err := NewDevice(i) - check(err, t) - - id := strconv.FormatUint(uint64(i), 10) - - for _, val := range fields { - var msg, output string - res := nvsmi.Query(id, val) - - switch val { - case "uuid": - msg = "Device UUID" - output = device.UUID - case "name": - msg = "Device model" - output = *device.Model - case "pci.bus_id": - msg = "Device bus id" - output = device.PCI.BusID - case "power.limit": - msg = "Device power limit" - output = strconv.FormatUint(uint64(*device.Power), 10) - power, err := strconv.ParseFloat(res, 64) - check(err, t) - res = strconv.FormatUint(uint64(math.Round(power)), 10) - case "clocks.max.sm": - msg = "Device max sm clocks" - output = strconv.FormatUint(uint64(*device.Clocks.Cores), 10) - case "clocks.max.memory": - msg = "Device max mem clocks" - output = strconv.FormatUint(uint64(*device.Clocks.Memory), 10) - } - if strings.Compare(res, output) != 0 { - t.Errorf("%v from nvml is wrong, got: %v, want: %v", msg, output, res) - } - } - } -} - -func BenchmarkDeviceInfo1(b *testing.B) { - Init() - - b.StartTimer() - for n := 0; n < b.N; n++ { - // assuming there will be atleast 1 GPU attached - NewDevice(uint(0)) - } - b.StopTimer() - - Shutdown() -} - -func TestDeviceStatus(t *testing.T) { - Init() - defer Shutdown() - - fields := []string{ - "power.draw", - "temperature.gpu", - "utilization.gpu", - "utilization.memory", - 
"encoder.stats.averageFps", - "clocks.current.sm", - "clocks.current.memory", - "pstate", - "ecc.errors.uncorrected.volatile.device_memory", - "ecc.errors.uncorrected.volatile.l1_cache", - "ecc.errors.uncorrected.volatile.l2_cache", - } - - count, err := GetDeviceCount() - check(err, t) - - for i := uint(0); i < count; i++ { - device, err := NewDevice(i) - check(err, t) - - status, err := device.Status() - check(err, t) - - id := strconv.FormatUint(uint64(i), 10) - - for _, val := range fields { - var msg, output string = "", "[Not Supported]" - res := nvsmi.Query(id, val) - - switch val { - case "power.draw": - msg = "Device power utilization" - output = strconv.FormatUint(uint64(*status.Power), 10) - power, err := strconv.ParseFloat(res, 64) - check(err, t) - res = strconv.FormatUint(uint64(power), 10) - case "temperature.gpu": - msg = "Device temperature" - output = strconv.FormatUint(uint64(*status.Temperature), 10) - case "utilization.gpu": - msg = "Device gpu utilization" - output = strconv.FormatUint(uint64(*status.Utilization.GPU), 10) - case "utilization.memory": - msg = "Device memory utilization" - output = strconv.FormatUint(uint64(*status.Utilization.Memory), 10) - case "encoder.stats.averageFps": - msg = "Device encoder utilization" - output = strconv.FormatUint(uint64(*status.Utilization.Encoder), 10) - case "clocks.current.sm": - msg = "Device sm clock" - output = strconv.FormatUint(uint64(*status.Clocks.Cores), 10) - case "clocks.current.memory": - msg = "Device mem clock" - output = strconv.FormatUint(uint64(*status.Clocks.Memory), 10) - case "pstate": - msg = "Device performance state" - output = status.Performance.String() - case "ecc.errors.uncorrected.volatile.device_memory": - msg = "ecc error in device memory" - ecc := status.Memory.ECCErrors.Device - if ecc != nil { - output = strconv.FormatUint(*ecc, 10) - } - case "ecc.errors.uncorrected.volatile.l1_cache": - msg = "ecc error in l1 cache" - ecc := status.Memory.ECCErrors.L1Cache - if ecc 
!= nil { - output = strconv.FormatUint(*ecc, 10) - } - case "ecc.errors.uncorrected.volatile.l2_cache": - msg = "ecc error in l2 cache" - ecc := status.Memory.ECCErrors.L2Cache - if ecc != nil { - output = strconv.FormatUint(*ecc, 10) - } - } - if strings.Compare(res, output) != 0 { - t.Errorf("%v from nvml is wrong, got: %v, want: %v", msg, output, res) - } - } - } - -} diff --git a/bindings/go/nvml/nvsmi/nvsmi.go b/bindings/go/nvml/nvsmi/nvsmi.go deleted file mode 100644 index 4f5e6394..00000000 --- a/bindings/go/nvml/nvsmi/nvsmi.go +++ /dev/null @@ -1,43 +0,0 @@ -package nvsmi - -import ( - "bytes" - "fmt" - "os/exec" - "strings" -) - -const ( - bin = "nvidia-smi" - gpuArg = "--id=" - queryArg = "--query-gpu=" - formatArg = "--format=csv,noheader,nounits" -) - -func Query(id string, query string) string { - var out bytes.Buffer - - cmd := exec.Command(bin, gpuArg+id, queryArg+query, formatArg) - cmd.Stdout = &out - - err := cmd.Run() - if err != nil { - fmt.Errorf("nvsmi exec error: %v\n", err) - } - return strings.TrimSpace(out.String()) -} - -func DeviceCount(query string) uint { - var out bytes.Buffer - - cmd := exec.Command(bin, queryArg+query, formatArg) - cmd.Stdout = &out - - err := cmd.Run() - if err != nil { - fmt.Errorf("nvsmi exec error: %v\n", err) - } - - nvSmi := strings.Split(strings.TrimSuffix(out.String(), "\n"), "\n") - return uint(len(nvSmi)) -} diff --git a/bindings/go/samples/dcgm/README.md b/bindings/go/samples/dcgm/README.md deleted file mode 100644 index 7f4dbd53..00000000 --- a/bindings/go/samples/dcgm/README.md +++ /dev/null @@ -1,192 +0,0 @@ -# DCGM Samples - -Modelled on [dcgmi (Data Center GPU Manager Interface)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) and [nvidia-smi (NVIDIA System Management Interface)](https://developer.nvidia.com/nvidia-system-management-interface), seven samples and a [REST API](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/dcgm/restApi/README.md) have been 
provided to show how to use DCGM go bindings. - -## DCGM running modes - -DCGM can be run in three different ways. - -#### Embedded Mode - -In embedded mode, hostengine is started as part of the running process and is loaded as a shared library. In this mode, metrics are also updated and collected automatically. This mode is recommended for users who wants to avoid managing an autonomous hostengine. - -#### Standalone Mode - -This mode lets you connect to an already running hostengine at a specified TCP/IP or Unix socket address. This mode is recommended for remote connections to the hostengine. By default, DCGM will assume a TCP connection and attempt to connect to localhost, unless specified. -``` -# If hostengine is running at a different address, pass it as - -IP - Valid IP address for the remote hostengine to connect to, at port 5555. - -IP:PORT - Valid IP address and port - -O - Given address is a TCP/IP address - -1 - Given address is an Unix socket filename - -$ ./sample -connect "IP" -socket "0" - -``` - -#### StartHostengine - -This is an add-on mode which opens an Unix socket for starting and connecting with hostengine. The hostengine is started as a child process of the running process and automatically terminated on exit. When operating in this mode, make sure to stop an already running hostengine to avoid any connection address conflicts. This mode is recommended for safely integrating DCGM in an already existing setup. - - -## Samples - - -#### deviceInfo - -Provides detailed information about each GPU on the system, along with whether the given GPU is DCGM supported or not. 
- -``` -$ go build && ./deviceInfo - -# sample output - -Driver Version : 384.130 -GPU : 0 -DCGMSupported : Yes -UUID : GPU-34e8d7ba-0e4d-ac00-6852-695d5d404f51 -Brand : GeForce -Model : GeForce GTX 980 -Serial Number : 0324414056639 -Vbios : 84.04.1F.00.02 -InforomImage Version : G001.0000.01.03 -Bus ID : 00000000:01:00.0 -BAR1 (MB) : 256 -FrameBuffer Memory (MB): 4036 -Bandwidth (MB/s) : 15760 -Cores (MHz) : 1392 -Memory (MHz) : 3505 -Power (W) : 180 -CPUAffinity : 0-11 -P2P Available : None ---------------------------------------------------------------------- -``` - -#### dmon - -Monitors each device status including its power, memory and GPU utilization. - -``` -$ go build && ./dmon - -# sample output - -Started host engine version 1.4.3 using socket path: /tmp/dcgmrxvqro.socket -# gpu pwr temp sm mem enc dec mclk pclk -# Idx W C % % % % MHz MHz - 0 43 48 0 1 0 0 3505 936 - 0 43 48 0 1 0 0 3505 936 -``` - -#### health - -Monitors the health of the given GPU every second, by checking the configured watches for any errors/failures/warnings. - -``` -$ go build && ./health - -# sample output -GPU : 0 -Status : Healthy -``` - -#### hostengineStatus - -Reports about DCGM hostengine memory and CPU usage. - -``` -$ go build && ./hostengineStatus - -# sample output - -Memory : 11480 KB -CPU : 0.08 % -``` - -#### policy - -Sets GPU usage and error policies and notifies in case of violations via callback functions. - -``` -$ go build && ./policy - -# sample output - -2018/06/25 23:48:34 Policy successfully set. -2018/06/25 23:48:34 Listening for violations... -GPU : 0 -Error : XID Error -Timestamp : 2018-06-25 18:55:30 +0000 UTC -Data : {31} -``` - -#### processInfo - -Provides per GPU detailed stats for this process. 
- -``` -$ go build && ./processInfo -pid PID - -# sample output - ----------------------------------------------------------------------- -GPU ID : 0 -----------Execution Stats--------------------------------------------- -PID : 15074 -Name : nbody -Start Time : 2018-06-25 16:50:28 -0700 PDT -End Time : Still Running -----------Performance Stats------------------------------------------- -Energy Consumed (Joules) : 181 -Max GPU Memory Used (bytes) : 84279296 -Avg SM Clock (MHz) : N/A -Avg Memory Clock (MHz) : N/A -Avg SM Utilization (%) : N/A -Avg Memory Utilization (%) : N/A -Avg PCIe Rx Bandwidth (MB) : N/A -Avg PCIe Tx Bandwidth (MB) : N/A -----------Event Stats------------------------------------------------- -Single Bit ECC Errors : 0 -Double Bit ECC Errors : 0 -Critical XID Errors : 0 -----------Slowdown Stats---------------------------------------------- -Due to - Power (%) : 0 - - Thermal (%) : 0 - - Reliability (%) : 0 - - Board Limit (%) : 0 - - Low Utilization (%) : 0 - - Sync Boost (%) : 0 -----------Process Utilization----------------------------------------- -Avg SM Utilization (%) : 0 -Avg Memory Utilization (%) : 0 ----------------------------------------------------------------------- -``` - -#### topology - -Informs about GPU topology and its CPU affinity. 
- -``` -$ go build && ./topology - -# sample output - -Started host engine version 1.4.3 using socket path: /tmp/dcgmvjeqkh.socket - GPU0CPUAffinity -GPU0 X 0-11 - -Legend: - X = Self - SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) - NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node - PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) - PXB = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge) - PIX = Connection traversing a single PCIe switch - PSB = Connection traversing a single on-board PCIe switch - NV# = Connection traversing a bonded set of # NVLinks - 2018/06/25 15:36:38 Successfully terminated nv-hostengine. -``` \ No newline at end of file diff --git a/bindings/go/samples/dcgm/deviceInfo/main.go b/bindings/go/samples/dcgm/deviceInfo/main.go deleted file mode 100644 index 29d5941d..00000000 --- a/bindings/go/samples/dcgm/deviceInfo/main.go +++ /dev/null @@ -1,78 +0,0 @@ -package main - -import ( - "flag" - "log" - "os" - "text/template" - - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" -) - -const ( - deviceInfo = `Driver Version : {{.Identifiers.DriverVersion}} -GPU : {{.GPU}} -DCGMSupported : {{.DCGMSupported}} -UUID : {{.UUID}} -Brand : {{.Identifiers.Brand}} -Model : {{.Identifiers.Model}} -Serial Number : {{.Identifiers.Serial}} -Vbios : {{or .Identifiers.Vbios "N/A"}} -InforomImage Version : {{.Identifiers.InforomImageVersion}} -Bus ID : {{.PCI.BusID}} -BAR1 (MB) : {{or .PCI.BAR1 "N/A"}} -FrameBuffer Memory (MB): {{or .PCI.FBTotal "N/A"}} -Bandwidth (MB/s) : {{or .PCI.Bandwidth "N/A"}} -Cores (MHz) : {{or .Clocks.Cores "N/A"}} -Memory (MHz) : {{or .Clocks.Memory "N/A"}} -Power (W) : {{or .Power "N/A"}} -CPUAffinity : {{or .CPUAffinity "N/A"}} -P2P Available : {{if not .Topology}}None{{else}}{{range .Topology}} - GPU{{.GPU}} - (BusID){{.BusID}} - 
{{.Link.PCIPaths}}{{end}}{{end}} ---------------------------------------------------------------------- -` -) - -var ( - connectAddr = flag.String("connect", "localhost", "Provide nv-hostengine connection address.") - isSocket = flag.String("socket", "0", "Connecting to Unix socket?") -) - -// mini version of nvidia-smi -q -// dcgmi discovery -i apc -func main() { - // choose dcgm hostengine running mode - // 1. dcgm.Embedded - // 2. dcgm.Standalone -connect "addr", -socket "isSocket" - // 3. dcgm.StartHostengine - flag.Parse() - cleanup, err := dcgm.Init(dcgm.Standalone, *connectAddr, *isSocket) - if err != nil { - log.Panicln(err) - } - defer cleanup() - - defer func() { - if err := dcgm.Shutdown(); err != nil { - log.Panicln(err) - } - }() - - count, err := dcgm.GetAllDeviceCount() - if err != nil { - log.Panicln(err) - } - - t := template.Must(template.New("Device").Parse(deviceInfo)) - - for i := uint(0); i < count; i++ { - deviceInfo, err := dcgm.GetDeviceInfo(i) - if err != nil { - log.Panicln(err) - } - - if err = t.Execute(os.Stdout, deviceInfo); err != nil { - log.Panicln("Template error:", err) - } - } -} diff --git a/bindings/go/samples/dcgm/dmon/main.go b/bindings/go/samples/dcgm/dmon/main.go deleted file mode 100644 index 3ea99df5..00000000 --- a/bindings/go/samples/dcgm/dmon/main.go +++ /dev/null @@ -1,57 +0,0 @@ -package main - -import ( - "fmt" - "log" - "os" - "os/signal" - "syscall" - "time" - - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" -) - -const ( - header = `# gpu pwr temp sm mem enc dec mclk pclk -# Idx W C % % % % MHz MHz` -) - -// modelled on nvidia-smi dmon -// dcgmi dmon -e 155,150,203,204,206,207,100,101 -func main() { - sigs := make(chan os.Signal, 1) - signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) - - cleanup, err := dcgm.Init(dcgm.Embedded) - if err != nil { - log.Panicln(err) - } - defer cleanup() - - gpus, err := dcgm.GetSupportedDevices() - if err != nil { - log.Panicln(err) - } - - ticker := 
time.NewTicker(time.Second * 1) - defer ticker.Stop() - - fmt.Println(header) - for { - select { - case <-ticker.C: - for _, gpu := range gpus { - st, err := dcgm.GetDeviceStatus(gpu) - if err != nil { - log.Panicln(err) - } - fmt.Printf("%5d %5d %5d %5d %5d %5d %5d %5d %5d\n", - gpu, int64(st.Power), st.Temperature, st.Utilization.GPU, st.Utilization.Memory, - st.Utilization.Encoder, st.Utilization.Decoder, st.Clocks.Memory, st.Clocks.Cores) - } - - case <-sigs: - return - } - } -} diff --git a/bindings/go/samples/dcgm/health/main.go b/bindings/go/samples/dcgm/health/main.go deleted file mode 100644 index 2d57d986..00000000 --- a/bindings/go/samples/dcgm/health/main.go +++ /dev/null @@ -1,64 +0,0 @@ -package main - -import ( - "log" - "os" - "os/signal" - "syscall" - "text/template" - "time" - - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" -) - -const ( - healthStatus = `GPU : {{.GPU}} -Status : {{.Status}} -{{range .Watches}} -Type : {{.Type}} -Status : {{.Status}} -Error : {{.Error}} -{{end}} -` -) - -// create group: dcgmi group -c "name" --default -// enable watches: dcgmi health -s a -// check: dcgmi health -g 1 -c -func main() { - sigs := make(chan os.Signal, 1) - signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) - - cleanup, err := dcgm.Init(dcgm.Embedded) - if err != nil { - log.Panicln(err) - } - defer cleanup() - - gpus, err := dcgm.GetSupportedDevices() - if err != nil { - log.Panicln(err) - } - - ticker := time.NewTicker(time.Second * 1) - defer ticker.Stop() - - t := template.Must(template.New("Health").Parse(healthStatus)) - for { - select { - case <-ticker.C: - for _, gpu := range gpus { - h, err := dcgm.HealthCheckByGpuId(gpu) - if err != nil { - log.Panicln(err) - } - - if err = t.Execute(os.Stdout, h); err != nil { - log.Panicln("Template error:", err) - } - } - case <-sigs: - return - } - } -} diff --git a/bindings/go/samples/dcgm/hostengineStatus/main.go b/bindings/go/samples/dcgm/hostengineStatus/main.go deleted file mode 
100644 index 09be103e..00000000 --- a/bindings/go/samples/dcgm/hostengineStatus/main.go +++ /dev/null @@ -1,25 +0,0 @@ -package main - -import ( - "fmt" - "log" - - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" -) - -// dcgmi introspect --enable -// dcgmi introspect -s -H -func main() { - cleanup, err := dcgm.Init(dcgm.Embedded) - if err != nil { - log.Panicln(err) - } - defer cleanup() - - st, err := dcgm.Introspect() - if err != nil { - log.Panicln(err) - } - - fmt.Printf("Memory %2s %v KB\nCPU %5s %.2f %s\n", ":", st.Memory, ":", st.CPU, "%") -} diff --git a/bindings/go/samples/dcgm/policy/main.go b/bindings/go/samples/dcgm/policy/main.go deleted file mode 100644 index f48a3f1e..00000000 --- a/bindings/go/samples/dcgm/policy/main.go +++ /dev/null @@ -1,45 +0,0 @@ -package main - -import ( - "fmt" - "log" - - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" -) - -// dcgmi group -c "name" --default -// dcgmi policy -g GROUPID --set 0,0 -x -n -p -e -P 250 -T 100 -M 10 -// dcgmi policy -g GROUPID --reg -func main() { - cleanup, err := dcgm.Init(dcgm.Embedded) - if err != nil { - log.Panicln(err) - } - defer cleanup() - - gpus, err := dcgm.GetSupportedDevices() - if err != nil { - log.Panicln(err) - } - - // Choose policy conditions to register violation callback. - // Note: Need to be root for some options - // Available options are: - // 1. dcgm.DbePolicy - // 2. dcgm.PCIePolicy - // 3. dcgm.MaxRtPgPolicy - // 4. dcgm.ThermalPolicy - // 5. dcgm.PowerPolicy - // 6. dcgm.NvlinkPolicy - // 7. 
dcgm.XidPolicy - for _, gpu := range gpus { - c, err := dcgm.Policy(gpu, dcgm.XidPolicy) - if err != nil { - log.Panicln(err) - } - - pe := <-c - fmt.Printf("GPU %8s %v\nError %6s %v\nTimestamp %2s %v\nData %7s %v\n", - ":", gpu, ":", pe.Condition, ":", pe.Timestamp, ":", pe.Data) - } -} diff --git a/bindings/go/samples/dcgm/processInfo/main.go b/bindings/go/samples/dcgm/processInfo/main.go deleted file mode 100644 index 44fd8304..00000000 --- a/bindings/go/samples/dcgm/processInfo/main.go +++ /dev/null @@ -1,84 +0,0 @@ -package main - -import ( - "flag" - "log" - "os" - "text/template" - "time" - - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" -) - -const ( - processInfo = `---------------------------------------------------------------------- -GPU ID : {{.GPU}} -----------Execution Stats--------------------------------------------- -PID : {{.PID}} -Name : {{or .Name "N/A"}} -Start Time : {{.ProcessUtilization.StartTime.String}} -End Time : {{.ProcessUtilization.EndTime.String}} -----------Performance Stats------------------------------------------- -Energy Consumed (Joules) : {{or .ProcessUtilization.EnergyConsumed "N/A"}} -Max GPU Memory Used (bytes) : {{or .Memory.GlobalUsed "N/A"}} -Avg SM Clock (MHz) : {{or .Clocks.Cores "N/A"}} -Avg Memory Clock (MHz) : {{or .Clocks.Memory "N/A"}} -Avg SM Utilization (%) : {{or .GpuUtilization.Memory "N/A"}} -Avg Memory Utilization (%) : {{or .GpuUtilization.GPU "N/A"}} -Avg PCIe Rx Bandwidth (MB) : {{or .PCI.Throughput.Rx "N/A"}} -Avg PCIe Tx Bandwidth (MB) : {{or .PCI.Throughput.Tx "N/A"}} -----------Event Stats------------------------------------------------- -Single Bit ECC Errors : {{or .Memory.ECCErrors.SingleBit "N/A"}} -Double Bit ECC Errors : {{or .Memory.ECCErrors.DoubleBit "N/A"}} -Critical XID Errors : {{.XIDErrors.NumErrors}} -----------Slowdown Stats---------------------------------------------- -Due to - Power (%) : {{or .Violations.Power "N/A"}} - - Thermal (%) : {{or .Violations.Thermal "N/A"}} - 
- Reliability (%) : {{or .Violations.Reliability "N/A"}} - - Board Limit (%) : {{or .Violations.BoardLimit "N/A"}} - - Low Utilization (%) : {{or .Violations.LowUtilization "N/A"}} - - Sync Boost (%) : {{or .Violations.SyncBoost "N/A"}} -----------Process Utilization----------------------------------------- -Avg SM Utilization (%) : {{or .ProcessUtilization.SmUtil "N/A"}} -Avg Memory Utilization (%) : {{or .ProcessUtilization.MemUtil "N/A"}} ----------------------------------------------------------------------- -` -) - -var process = flag.Uint("pid", 0, "Provide pid to get this process information.") - -// run as root, for enabling health watches -// dcgmi stats -e -// dcgmi stats --pid ENTERPID -v -// sample: sudo ./processInfo -pid PID -func main() { - cleanup, err := dcgm.Init(dcgm.Embedded) - if err != nil { - log.Panicln(err) - } - defer cleanup() - - // Request DCGM to start recording stats for GPU process fields - group, err := dcgm.WatchPidFields() - if err != nil { - log.Panicln(err) - } - - // Before retrieving process stats, wait few seconds for watches to be enabled and collect data - log.Println("Enabling DCGM watches to start collecting process stats. This may take a few seconds....") - time.Sleep(3000 * time.Millisecond) - - flag.Parse() - pidInfo, err := dcgm.GetProcessInfo(group, *process) - if err != nil { - log.Panicln(err) - } - - t := template.Must(template.New("Process").Parse(processInfo)) - for _, gpu := range pidInfo { - - if err = t.Execute(os.Stdout, gpu); err != nil { - log.Panicln("Template error:", err) - } - } -} diff --git a/bindings/go/samples/dcgm/restApi/README.md b/bindings/go/samples/dcgm/restApi/README.md deleted file mode 100644 index 9385b635..00000000 --- a/bindings/go/samples/dcgm/restApi/README.md +++ /dev/null @@ -1,106 +0,0 @@ -## DCGM REST API - -A sample REST API is provided, demonstrating various endpoints for getting GPU metrics via DCGM. 
- - -``` -# Start the http server -# By default the http server is started at localhost:8070 - -$ go build && ./restApi - -# Query GPU 0 info -$ GPUID=0 -$ curl localhost:8070/dcgm/device/info/id/$GPUID - -# sample output - -Driver Version : 384.130 -GPU : 0 -DCGMSupported : Yes -UUID : GPU-34e8d7ba-0e4d-ac00-6852-695d5d404f51 -Brand : GeForce -Model : GeForce GTX 980 -Serial Number : 0324414056639 -Vbios : 84.04.1F.00.02 -InforomImage Version : G001.0000.01.03 -Bus ID : 00000000:01:00.0 -BAR1 (MB) : 256 -FrameBuffer Memory (MB): 4036 -Bandwidth (MB/s) : 15760 -Cores (MHz) : 1392 -Memory (MHz) : 3505 -Power (W) : 180 -CPUAffinity : 0-11 -P2P Available : None ---------------------------------------------------------------------- - -$ curl localhost:8070/dcgm/device/info/id/$GPUID/json - -# Query GPU info using its UUID - -$ UUID=$(curl -s localhost:8070/dcgm/device/info/id/$GPUID | grep -i uuid | cut -d ":" -f2 ) -$ curl localhost:8070/dcgm/device/info/uuid/$UUID -$ curl localhost:8070/dcgm/device/info/uuid/$UUID/json - -# sample output - -{"GPU":0,"DCGMSupported":"Yes","UUID":"GPU-34e8d7ba-0e4d-ac00-6852-695d5d404f51","Power":180,"PCI":{"BusID":"00000000:01:00.0","BAR1":256,"FBTotal":4036,"Bandwidth":15760},"Clocks":{"Cores":1392,"Memory":3505},"Identifiers":{"Brand":"GeForce","Model":"GeForce GTX 980","Serial":"0324414056639","Vbios":"84.04.1F.00.02","InforomImageVersion":"G001.0000.01.03","DriverVersion":"384.130"},"Topology":null,"CPUAffinity":"0-11"} - -# Query GPU status - -$ curl localhost:8070/dcgm/device/status/id/$GPUID -$ curl localhost:8070/dcgm/device/status/id/$GPUID/json - -# sample output - -Power (W) : 20.985 -Temperature (°C) : 47 -Sm Utilization (%) : 2 -Memory Utilization (%) : 8 -Encoder Utilization (%) : 0 -Decoder Utilization (%) : 0 -Memory Clock (MHz : 324 -SM Clock (MHz) : 135 - -$ curl localhost:8070/dcgm/device/status/uuid/$UUID - -# sample output - 
-{"Power":20.793,"Temperature":43,"Utilization":{"GPU":0,"Memory":8,"Encoder":0,"Decoder":0},"Memory":{"GlobalUsed":null,"ECCErrors":{"SingleBit":9223372036854775794,"DoubleBit":9223372036854775794}},"Clocks":{"Cores":135,"Memory":324},"PCI":{"BAR1Used":9,"Throughput":{"Rx":129,"Tx":47,"Replays":0},"FBUsed":423},"Performance":8,"FanSpeed":29} - -$ curl localhost:8070/dcgm/device/status/uuid/$UUID/json - -# Query GPU process info - -# Run CUDA nbody sample and get its PID -$ PID=$(pgrep nbody) - -$ curl localhost:8070/dcgm/process/info/pid/$PID -$ curl localhost:8070/dcgm/process/info/pid/$PID/json - -# sample output - -{"GPU":0,"PID":19132,"Name":"nbody","ProcessUtilization":{"StartTime":1529980640,"EndTime":0,"EnergyConsumed":1346,"SmUtil":0,"MemUtil":0},"PCI":{"BAR1Used":null,"Throughput":{"Rx":null,"Tx":null,"Replays":0},"FBUsed":null},"Memory":{"GlobalUsed":84279296,"ECCErrors":{"SingleBit":0,"DoubleBit":0}},"GpuUtilization":{"GPU":null,"Memory":null,"Encoder":null,"Decoder":null},"Clocks":{"Cores":null,"Memory":null},"Violations":{"Power":0,"Thermal":0,"Reliability":0,"BoardLimit":0,"LowUtilization":0,"SyncBoost":0},"XIDErrors":{"NumErrors":0,"TimeStamp":[]}} - -# Query GPU health - -$ curl localhost:8070/dcgm/health/id/$GPUID -$ curl localhost:8070/dcgm/health/id/$GPUID/json -$ curl localhost:8070/dcgm/health/uuid/$UUID -$ curl localhost:8070/dcgm/health/uuid/$UUID/json - -# sample output - -{"GPU":0,"Status":"Healthy","Watches":[]} - -# Query DCGM hostengine memory and CPU usage - -$ curl localhost:8070/dcgm/status -$ curl localhost:8070/dcgm/status/json - -# sample output - -{"Memory":18380,"CPU":0.16482222745467387} - -``` \ No newline at end of file diff --git a/bindings/go/samples/dcgm/restApi/handlers/byIds.go b/bindings/go/samples/dcgm/restApi/handlers/byIds.go deleted file mode 100644 index ef4f3100..00000000 --- a/bindings/go/samples/dcgm/restApi/handlers/byIds.go +++ /dev/null @@ -1,65 +0,0 @@ -package handlers - -import ( - "net/http" -) - -func 
DeviceInfo(resp http.ResponseWriter, req *http.Request) { - device := getDeviceInfo(resp, req) - if device == nil { - return - } - if isJson(req) { - encode(resp, req, device) - return - } - print(resp, req, device, deviceInfo) -} - -func DeviceStatus(resp http.ResponseWriter, req *http.Request) { - st := getDeviceStatus(resp, req) - if st == nil { - return - } - if isJson(req) { - encode(resp, req, st) - return - } - print(resp, req, st, deviceStatus) -} - -func ProcessInfo(resp http.ResponseWriter, req *http.Request) { - pInfo := getProcessInfo(resp, req) - if len(pInfo) == 0 { - return - } - if isJson(req) { - encode(resp, req, pInfo) - return - } - processPrint(resp, req, pInfo) -} - -func Health(resp http.ResponseWriter, req *http.Request) { - h := getHealth(resp, req) - if h == nil { - return - } - if isJson(req) { - encode(resp, req, h) - return - } - print(resp, req, h, healthStatus) -} - -func DcgmStatus(resp http.ResponseWriter, req *http.Request) { - st := getDcgmStatus(resp, req) - if st == nil { - return - } - if isJson(req) { - encode(resp, req, st) - return - } - print(resp, req, st, hostengine) -} diff --git a/bindings/go/samples/dcgm/restApi/handlers/byUuids.go b/bindings/go/samples/dcgm/restApi/handlers/byUuids.go deleted file mode 100644 index 5161158b..00000000 --- a/bindings/go/samples/dcgm/restApi/handlers/byUuids.go +++ /dev/null @@ -1,65 +0,0 @@ -package handlers - -import ( - "log" - "net/http" - - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" -) - -// map of uuids and device id -var uuids map[string]uint - -func DevicesUuids() { - uuids = make(map[string]uint) - count, err := dcgm.GetAllDeviceCount() - if err != nil { - log.Printf("(DCGM) Error getting devices: %s", err) - return - } - - for i := uint(0); i < count; i++ { - deviceInfo, err := dcgm.GetDeviceInfo(i) - if err != nil { - log.Printf("(DCGM) Error getting device information: %s", err) - return - } - uuids[deviceInfo.UUID] = i - } -} - -func DeviceInfoByUuid(resp 
http.ResponseWriter, req *http.Request) { - device := getDeviceInfo(resp, req) - if device == nil { - return - } - if isJson(req) { - encode(resp, req, device) - return - } - print(resp, req, device, deviceInfo) -} - -func DeviceStatusByUuid(resp http.ResponseWriter, req *http.Request) { - st := getDeviceStatus(resp, req) - if st == nil { - return - } - if isJson(req) { - encode(resp, req, st) - return - } - print(resp, req, st, deviceStatus) -} - -func HealthByUuid(resp http.ResponseWriter, req *http.Request) { - h := getHealth(resp, req) - if h == nil { - return - } - if isJson(req) { - encode(resp, req, h) - return - } - print(resp, req, h, healthStatus) -} diff --git a/bindings/go/samples/dcgm/restApi/handlers/dcgm.go b/bindings/go/samples/dcgm/restApi/handlers/dcgm.go deleted file mode 100644 index 0656f38d..00000000 --- a/bindings/go/samples/dcgm/restApi/handlers/dcgm.go +++ /dev/null @@ -1,136 +0,0 @@ -package handlers - -import ( - "log" - "math" - "net/http" - "time" - - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" - "github.com/gorilla/mux" -) - -func getDcgmStatus(resp http.ResponseWriter, req *http.Request) (status *dcgm.DcgmStatus) { - st, err := dcgm.Introspect() - if err != nil { - http.Error(resp, err.Error(), http.StatusInternalServerError) - log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) - return - } - return &st - -} - -func getDeviceInfo(resp http.ResponseWriter, req *http.Request) (device *dcgm.Device) { - var id uint - params := mux.Vars(req) - for k, v := range params { - switch k { - case "id": - id = getId(resp, req, v) - case "uuid": - id = getIdByUuid(resp, req, v) - } - } - - if id == math.MaxUint32 { - return - } - - if !isValidId(id, resp, req) { - return - } - d, err := dcgm.GetDeviceInfo(id) - if err != nil { - http.Error(resp, err.Error(), http.StatusInternalServerError) - log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) - return - } - return &d -} - -func getDeviceStatus(resp 
http.ResponseWriter, req *http.Request) (status *dcgm.DeviceStatus) { - var id uint - params := mux.Vars(req) - for k, v := range params { - switch k { - case "id": - id = getId(resp, req, v) - case "uuid": - id = getIdByUuid(resp, req, v) - } - } - - if id == math.MaxUint32 { - return - } - - if !isValidId(id, resp, req) { - return - } - - if !isDcgmSupported(id, resp, req) { - return - } - - st, err := dcgm.GetDeviceStatus(id) - if err != nil { - http.Error(resp, err.Error(), http.StatusInternalServerError) - log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) - return - } - return &st -} - -func getHealth(resp http.ResponseWriter, req *http.Request) (health *dcgm.DeviceHealth) { - var id uint - params := mux.Vars(req) - for k, v := range params { - switch k { - case "id": - id = getId(resp, req, v) - case "uuid": - id = getIdByUuid(resp, req, v) - } - } - - if id == math.MaxUint32 { - return - } - - if !isValidId(id, resp, req) { - return - } - - h, err := dcgm.HealthCheckByGpuId(id) - if err != nil { - http.Error(resp, err.Error(), http.StatusInternalServerError) - log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) - return - } - return &h -} - -func getProcessInfo(resp http.ResponseWriter, req *http.Request) (pInfo []dcgm.ProcessInfo) { - params := mux.Vars(req) - pid := getId(resp, req, params["pid"]) - if pid == math.MaxUint32 { - return - } - group, err := dcgm.WatchPidFields() - if err != nil { - http.Error(resp, err.Error(), http.StatusInternalServerError) - log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) - return - } - - // wait for watches to be enabled - log.Printf("Enabling DCGM watches to start collecting process stats. 
This may take a few seconds....") - time.Sleep(3000 * time.Millisecond) - pInfo, err = dcgm.GetProcessInfo(group, pid) - if err != nil { - http.Error(resp, err.Error(), http.StatusInternalServerError) - log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) - } - return -} diff --git a/bindings/go/samples/dcgm/restApi/handlers/utils.go b/bindings/go/samples/dcgm/restApi/handlers/utils.go deleted file mode 100644 index 75cfe0b1..00000000 --- a/bindings/go/samples/dcgm/restApi/handlers/utils.go +++ /dev/null @@ -1,183 +0,0 @@ -package handlers - -import ( - "encoding/json" - "fmt" - "log" - "math" - "net/http" - "strconv" - "text/template" - - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" -) - -const ( - base = 10 - bitsize = 32 - - deviceInfo = `Driver Version : {{.Identifiers.DriverVersion}} -GPU : {{.GPU}} -DCGMSupported : {{.DCGMSupported}} -UUID : {{.UUID}} -Brand : {{.Identifiers.Brand}} -Model : {{.Identifiers.Model}} -Serial Number : {{.Identifiers.Serial}} -Vbios : {{or .Identifiers.Vbios "N/A"}} -InforomImage Version : {{.Identifiers.InforomImageVersion}} -Bus ID : {{.PCI.BusID}} -BAR1 (MB) : {{or .PCI.BAR1 "N/A"}} -FrameBuffer Memory (MB): {{or .PCI.FBTotal "N/A"}} -Bandwidth (MB/s) : {{or .PCI.Bandwidth "N/A"}} -Cores (MHz) : {{or .Clocks.Cores "N/A"}} -Memory (MHz) : {{or .Clocks.Memory "N/A"}} -Power (W) : {{or .Power "N/A"}} -CPUAffinity : {{or .CPUAffinity "N/A"}} -P2P Available : {{if not .Topology}}None{{else}}{{range .Topology}} - GPU{{.GPU}} - (BusID){{.BusID}} - {{.Link.PCIPaths}}{{end}}{{end}} ---------------------------------------------------------------------- -` - deviceStatus = `Power (W) : {{.Power}} -Temperature (°C) : {{.Temperature}} -Sm Utilization (%) : {{.Utilization.GPU}} -Memory Utilization (%) : {{.Utilization.Memory}} -Encoder Utilization (%) : {{.Utilization.Encoder}} -Decoder Utilization (%) : {{.Utilization.Decoder}} -Memory Clock (MHz : {{.Clocks.Memory}} -SM Clock (MHz) : {{.Clocks.Cores}} -` - - 
processInfo = `---------------------------------------------------------------------- -GPU ID : {{.GPU}} -----------Execution Stats--------------------------------------------- -PID : {{.PID}} -Name : {{or .Name "N/A"}} -Start Time : {{.ProcessUtilization.StartTime.String}} -End Time : {{.ProcessUtilization.EndTime.String}} -----------Performance Stats------------------------------------------- -Energy Consumed (Joules) : {{or .ProcessUtilization.EnergyConsumed "N/A"}} -Max GPU Memory Used (bytes) : {{or .Memory.GlobalUsed "N/A"}} -Avg SM Clock (MHz) : {{or .Clocks.Cores "N/A"}} -Avg Memory Clock (MHz) : {{or .Clocks.Memory "N/A"}} -Avg SM Utilization (%) : {{or .GpuUtilization.Memory "N/A"}} -Avg Memory Utilization (%) : {{or .GpuUtilization.GPU "N/A"}} -Avg PCIe Rx Bandwidth (MB) : {{or .PCI.Throughput.Rx "N/A"}} -Avg PCIe Tx Bandwidth (MB) : {{or .PCI.Throughput.Tx "N/A"}} -----------Event Stats------------------------------------------------- -Single Bit ECC Errors : {{or .Memory.ECCErrors.SingleBit "N/A"}} -Double Bit ECC Errors : {{or .Memory.ECCErrors.DoubleBit "N/A"}} -Critical XID Errors : {{.XIDErrors.NumErrors}} -----------Slowdown Stats---------------------------------------------- -Due to - Power (%) : {{.Violations.Power}} - - Thermal (%) : {{.Violations.Thermal}} - - Reliability (%) : {{.Violations.Reliability}} - - Board Limit (%) : {{.Violations.BoardLimit}} - - Low Utilization (%) : {{.Violations.LowUtilization}} - - Sync Boost (%) : {{.Violations.SyncBoost}} -----------Process Utilization----------------------------------------- -Avg SM Utilization (%) : {{or .ProcessUtilization.SmUtil "N/A"}} -Avg Memory Utilization (%) : {{or .ProcessUtilization.MemUtil "N/A"}} ----------------------------------------------------------------------- -` - healthStatus = `GPU : {{.GPU}} -Status : {{.Status}} -{{range .Watches}} -Type : {{.Type}} -Status : {{.Status}} -Error : {{.Error}} -{{end}}` - - hostengine = `Memory(KB) : {{.Memory}} -CPU(%) : {{printf "%.2f" 
.CPU}} -` -) - -func getId(resp http.ResponseWriter, req *http.Request, key string) uint { - id, err := strconv.ParseUint(key, base, bitsize) - if err != nil { - http.Error(resp, err.Error(), http.StatusBadRequest) - log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) - return math.MaxUint32 - } - return uint(id) -} - -func getIdByUuid(resp http.ResponseWriter, req *http.Request, key string) uint { - id, exists := uuids[key] - if !exists { - http.NotFound(resp, req) - log.Printf("error: %v%v: %v (page not found)", req.Host, req.URL, http.StatusNotFound) - return math.MaxUint32 - } - return id -} - -func isValidId(id uint, resp http.ResponseWriter, req *http.Request) bool { - count, err := dcgm.GetAllDeviceCount() - if err != nil { - http.Error(resp, err.Error(), http.StatusInternalServerError) - log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) - return false - } - - if id >= count { - http.NotFound(resp, req) - log.Printf("error: %v%v: %v (page not found)", req.Host, req.URL, http.StatusNotFound) - return false - } - return true -} - -func isDcgmSupported(gpuId uint, resp http.ResponseWriter, req *http.Request) bool { - gpus, err := dcgm.GetSupportedDevices() - if err != nil { - http.Error(resp, err.Error(), http.StatusInternalServerError) - log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) - return false - } - for _, gpu := range gpus { - if gpuId == gpu { - return true - } - } - err = fmt.Errorf("Error adding GPU %d to group: This GPU is not supported by DCGM", gpuId) - http.Error(resp, err.Error(), http.StatusInternalServerError) - log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) - return false -} - -func isJson(req *http.Request) bool { - url := fmt.Sprintf("%v", (req.URL)) - if url[len(url)-4:] == "json" { - return true - } - return false - -} - -func print(resp http.ResponseWriter, req *http.Request, stats interface{}, templ string) { - t := template.Must(template.New("").Parse(templ)) - if err := 
t.Execute(resp, stats); err != nil { - http.Error(resp, err.Error(), http.StatusInternalServerError) - log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) - } -} - -func encode(resp http.ResponseWriter, req *http.Request, stats interface{}) { - resp.Header().Set("Content-Type", "application/json") - if err := json.NewEncoder(resp).Encode(stats); err != nil { - http.Error(resp, err.Error(), http.StatusInternalServerError) - log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) - } -} - -func processPrint(resp http.ResponseWriter, req *http.Request, pInfo []dcgm.ProcessInfo) { - t := template.Must(template.New("Process").Parse(processInfo)) - for _, gpu := range pInfo { - if err := t.Execute(resp, gpu); err != nil { - http.Error(resp, err.Error(), http.StatusInternalServerError) - log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) - return - } - } -} diff --git a/bindings/go/samples/dcgm/restApi/main.go b/bindings/go/samples/dcgm/restApi/main.go deleted file mode 100644 index ebda11cc..00000000 --- a/bindings/go/samples/dcgm/restApi/main.go +++ /dev/null @@ -1,35 +0,0 @@ -package main - -import ( - "log" - "os" - "os/signal" - "syscall" - - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" -) - -// res: curl localhost:8070/dcgm/device/info/id/0 - -func main() { - stopSig := make(chan os.Signal, 1) - signal.Notify(stopSig, syscall.SIGINT, syscall.SIGTERM) - - cleanup, err := dcgm.Init(dcgm.Embedded) - if err != nil { - log.Panicln(err) - } - defer cleanup() - - addr := ":8070" - server := newHttpServer(addr) - - go func() { - log.Printf("Running http server on localhost%s", addr) - server.serve() - }() - defer server.stop() - - <-stopSig - return -} diff --git a/bindings/go/samples/dcgm/restApi/server.go b/bindings/go/samples/dcgm/restApi/server.go deleted file mode 100644 index f04d9bba..00000000 --- a/bindings/go/samples/dcgm/restApi/server.go +++ /dev/null @@ -1,88 +0,0 @@ -package main - -import ( - "context" - "log" - 
"net/http" - "time" - - h "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/samples/dcgm/restApi/handlers" - "github.com/gorilla/mux" -) - -const timeout = 5 * time.Second - -type httpServer struct { - router *mux.Router - server *http.Server -} - -func newHttpServer(addr string) *httpServer { - r := mux.NewRouter() - - s := &httpServer{ - router: r, - server: &http.Server{ - Addr: addr, - Handler: r, - ReadTimeout: timeout, - WriteTimeout: timeout, - }, - } - - // make a global map of device uuids and ids - h.DevicesUuids() - - s.handler() - return s -} - -func (s *httpServer) handler() { - deviceInfo := "/dcgm/device/info" - subrouter := s.router.PathPrefix(deviceInfo).Subrouter() - subrouter.HandleFunc("/id/{id}", h.DeviceInfo).Methods("GET") - subrouter.HandleFunc("/id/{id}/json", h.DeviceInfo).Methods("GET") - subrouter.HandleFunc("/uuid/{uuid}", h.DeviceInfoByUuid).Methods("GET") - subrouter.HandleFunc("/uuid/{uuid}/json", h.DeviceInfoByUuid).Methods("GET") - - deviceStatus := "/dcgm/device/status" - subrouter = s.router.PathPrefix(deviceStatus).Subrouter() - subrouter.HandleFunc("/id/{id}", h.DeviceStatus).Methods("GET") - subrouter.HandleFunc("/id/{id}/json", h.DeviceStatus).Methods("GET") - subrouter.HandleFunc("/uuid/{uuid}", h.DeviceStatusByUuid).Methods("GET") - subrouter.HandleFunc("/uuid/{uuid}/json", h.DeviceStatusByUuid).Methods("GET") - - processInfo := "/dcgm/process/info/pid/{pid}" - subrouter = s.router.PathPrefix(processInfo).Subrouter() - subrouter.HandleFunc("", h.ProcessInfo).Methods("GET") - subrouter.HandleFunc("/json", h.ProcessInfo).Methods("GET") - - health := "/dcgm/health" - subrouter = s.router.PathPrefix(health).Subrouter() - subrouter.HandleFunc("/id/{id}", h.Health).Methods("GET") - subrouter.HandleFunc("/id/{id}/json", h.Health).Methods("GET") - subrouter.HandleFunc("/uuid/{uuid}", h.HealthByUuid).Methods("GET") - subrouter.HandleFunc("/uuid/{uuid}/json", h.HealthByUuid).Methods("GET") - - dcgmStatus := "/dcgm/status" - 
subrouter = s.router.PathPrefix(dcgmStatus).Subrouter() - subrouter.HandleFunc("", h.DcgmStatus).Methods("GET") - subrouter.HandleFunc("/json", h.DcgmStatus).Methods("GET") -} - -func (s *httpServer) serve() { - if err := s.server.ListenAndServe(); err != http.ErrServerClosed { - log.Printf("Error: %v", err) - } -} - -func (s *httpServer) stop() { - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() - - if err := s.server.Shutdown(ctx); err != nil { - log.Printf("Error: %v", err) - } else { - log.Println("http server stopped") - } -} diff --git a/bindings/go/samples/dcgm/topology/main.go b/bindings/go/samples/dcgm/topology/main.go deleted file mode 100644 index cd00d4e3..00000000 --- a/bindings/go/samples/dcgm/topology/main.go +++ /dev/null @@ -1,70 +0,0 @@ -package main - -import ( - "fmt" - "log" - - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" -) - -const ( - legend = ` -Legend: - X = Self - SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) - NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node - PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) - PXB = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge) - PIX = Connection traversing a single PCIe switch - PSB = Connection traversing a single on-board PCIe switch - NV# = Connection traversing a bonded set of # NVLinks` -) - -// based on nvidia-smi topo -m -// dcgmi topo -func main() { - // choose dcgm hostengine running mode - // 1. dcgm.Embedded - // 2. dcgm.Standalone - // 3. 
dcgm.StartHostengine - cleanup, err := dcgm.Init(dcgm.Embedded) - if err != nil { - log.Panicln(err) - } - defer cleanup() - - gpus, err := dcgm.GetSupportedDevices() - if err != nil { - log.Panicln(err) - } - - for _, gpu := range gpus { - fmt.Printf("%9s%d", "GPU", gpu) - } - fmt.Printf("%5s\n", "CPUAffinity") - - numGpus := len(gpus) - gpuTopo := make([]string, numGpus) - for i := 0; i < numGpus; i++ { - topo, err := dcgm.GetDeviceTopology(gpus[i]) - if err != nil { - log.Panicln(err) - } - - fmt.Printf("GPU%d", gpus[i]) - for j := 0; j < len(topo); j++ { - // skip current GPU - gpuTopo[topo[j].GPU] = topo[j].Link.PCIPaths() - } - gpuTopo[i] = "X" - for j := 0; j < numGpus; j++ { - fmt.Printf("%5s", gpuTopo[j]) - } - deviceInfo, err := dcgm.GetDeviceInfo(gpus[i]) - if err != nil { - log.Panicln(err) - } - fmt.Printf("%5s\n", deviceInfo.CPUAffinity) - } - fmt.Println(legend) -} diff --git a/bindings/go/samples/nvml/README.md b/bindings/go/samples/nvml/README.md deleted file mode 100644 index 6ad2bbae..00000000 --- a/bindings/go/samples/nvml/README.md +++ /dev/null @@ -1,72 +0,0 @@ -## NVML Samples - -Modelled on the [NVIDIA System Management Interface (nvidia-smi)](https://developer.nvidia.com/nvidia-system-management-interface), a commnad line utility using NVML, three samples have been provided to show how to use NVML go bindings. - -#### deviceInfo - -Provides basic information about each GPU on the system. 
- -``` -$ go build && ./deviceInfo - -# sample output - -Driver Version : 384.111 -GPU : 0 -UUID : GPU-34e8d7ba-0e4d-ac00-6852-695d5d404f51 -Model : GeForce GTX 980 -Path : /dev/nvidia0 -Power : 180 W -CPU Affinity : NUMA node0 -Bus ID : 00000000:01:00.0 -BAR1 : 256 MiB -Bandwidth : 15760 MB/s -Cores : 1392 MHz -Memory : 3505 MHz -P2P Available : None ---------------------------------------------------------------------- -GPU : 1 -UUID : GPU-8d3b966d-2248-c3f4-1784-49851a1d02b3 -Model : GeForce GTX TITAN -Path : /dev/nvidia1 -Power : 250 W -CPU Affinity : NUMA node0 -Bus ID : 00000000:06:00.0 -BAR1 : 128 MiB -Bandwidth : 8000 MB/s -Cores : 1202 MHz -Memory : 3004 MHz -P2P Available : None ---------------------------------------------------------------------- -``` - -#### dmon - -Monitors each device status including its power, memory and GPU utilization. - -``` -$ go build && ./dmon - -# sample output - -# gpu pwr temp sm mem enc dec mclk pclk -# Idx W C % % % % MHz MHz - 0 20 43 0 8 0 0 324 135 - 1 10 32 0 0 0 0 324 324 - -``` - -#### processInfo - -Informs about GPU processes running on all devices. 
- -``` -$ go build && ./processInfo - -# sample output - -# gpu pid type mem command -# Idx # C/G % name - 0 25712 C+G 0 nbody - 1 - - - - -``` diff --git a/bindings/go/samples/nvml/deviceInfo/main.go b/bindings/go/samples/nvml/deviceInfo/main.go deleted file mode 100644 index ec254a0b..00000000 --- a/bindings/go/samples/nvml/deviceInfo/main.go +++ /dev/null @@ -1,60 +0,0 @@ -package main - -import ( - "fmt" - "log" - "os" - "text/template" - - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" -) - -const ( - DEVICEINFO = `UUID : {{.UUID}} -Model : {{or .Model "N/A"}} -Path : {{.Path}} -Power : {{if .Power}}{{.Power}} W{{else}}N/A{{end}} -Memory : {{if .Memory}}{{.Memory}} MiB{{else}}N/A{{end}} -CudaComputeCap : {{if .CudaComputeCapability.Major}}{{.CudaComputeCapability.Major}}.{{.CudaComputeCapability.Minor}}{{else}}N/A{{end}} -CPU Affinity : {{if .CPUAffinity}}NUMA node{{.CPUAffinity}}{{else}}N/A{{end}} -Bus ID : {{.PCI.BusID}} -BAR1 : {{if .PCI.BAR1}}{{.PCI.BAR1}} MiB{{else}}N/A{{end}} -Bandwidth : {{if .PCI.Bandwidth}}{{.PCI.Bandwidth}} MB/s{{else}}N/A{{end}} -Cores : {{if .Clocks.Cores}}{{.Clocks.Cores}} MHz{{else}}N/A{{end}} -Memory : {{if .Clocks.Memory}}{{.Clocks.Memory}} MHz{{else}}N/A{{end}} -P2P Available : {{if not .Topology}}None{{else}}{{range .Topology}} - {{.BusID}} - {{(.Link.String)}}{{end}}{{end}} ---------------------------------------------------------------------- -` -) - -func main() { - nvml.Init() - defer nvml.Shutdown() - - count, err := nvml.GetDeviceCount() - if err != nil { - log.Panicln("Error getting device count:", err) - } - - driverVersion, err := nvml.GetDriverVersion() - if err != nil { - log.Panicln("Error getting driver version:", err) - } - - t := template.Must(template.New("Device").Parse(DEVICEINFO)) - - fmt.Printf("Driver Version : %5v\n", driverVersion) - for i := uint(0); i < count; i++ { - device, err := nvml.NewDevice(i) - if err != nil { - log.Panicf("Error getting device %d: %v\n", i, err) - } - - 
fmt.Printf("GPU %12s %d\n", ":", i) - err = t.Execute(os.Stdout, device) - if err != nil { - log.Panicln("Template error:", err) - } - } -} diff --git a/bindings/go/samples/nvml/dmon/main.go b/bindings/go/samples/nvml/dmon/main.go deleted file mode 100644 index 69090b91..00000000 --- a/bindings/go/samples/nvml/dmon/main.go +++ /dev/null @@ -1,60 +0,0 @@ -package main - -import ( - "fmt" - "log" - "os" - "os/signal" - "syscall" - "time" - - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" -) - -const ( - DMONHEADER = `# gpu pwr temp sm mem enc dec mclk pclk -# Idx W C % % % % MHz MHz` -) - -func main() { - nvml.Init() - defer nvml.Shutdown() - - count, err := nvml.GetDeviceCount() - if err != nil { - log.Panicln("Error getting device count:", err) - } - - var devices []*nvml.Device - for i := uint(0); i < count; i++ { - device, err := nvml.NewDevice(i) - if err != nil { - log.Panicf("Error getting device %d: %v\n", i, err) - } - devices = append(devices, device) - } - - sigs := make(chan os.Signal, 1) - signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) - - ticker := time.NewTicker(time.Second * 1) - defer ticker.Stop() - - fmt.Println(DMONHEADER) - for { - select { - case <-ticker.C: - for i, device := range devices { - st, err := device.Status() - if err != nil { - log.Panicf("Error getting device %d status: %v\n", i, err) - } - fmt.Printf("%5d %5d %5d %5d %5d %5d %5d %5d %5d\n", - i, *st.Power, *st.Temperature, *st.Utilization.GPU, *st.Utilization.Memory, - *st.Utilization.Encoder, *st.Utilization.Decoder, *st.Clocks.Memory, *st.Clocks.Cores) - } - case <-sigs: - return - } - } -} diff --git a/bindings/go/samples/nvml/processInfo/main.go b/bindings/go/samples/nvml/processInfo/main.go deleted file mode 100644 index 1843e354..00000000 --- a/bindings/go/samples/nvml/processInfo/main.go +++ /dev/null @@ -1,64 +0,0 @@ -package main - -import ( - "fmt" - "log" - "os" - "os/signal" - "syscall" - "time" - - 
"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" -) - -const ( - PINFOHEADER = `# gpu pid type mem Command -# Idx # C/G MiB name` -) - -func main() { - nvml.Init() - defer nvml.Shutdown() - - count, err := nvml.GetDeviceCount() - if err != nil { - log.Panicln("Error getting device count:", err) - } - - var devices []*nvml.Device - for i := uint(0); i < count; i++ { - device, err := nvml.NewDevice(i) - if err != nil { - log.Panicf("Error getting device %d: %v\n", i, err) - } - devices = append(devices, device) - } - - sigs := make(chan os.Signal, 1) - signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) - - ticker := time.NewTicker(time.Second * 1) - defer ticker.Stop() - - fmt.Println(PINFOHEADER) - for { - select { - case <-ticker.C: - for i, device := range devices { - pInfo, err := device.GetAllRunningProcesses() - if err != nil { - log.Panicf("Error getting device %d processes: %v\n", i, err) - } - if len(pInfo) == 0 { - fmt.Printf("%5v %5s %5s %5s %-5s\n", i, "-", "-", "-", "-") - } - for j := range pInfo { - fmt.Printf("%5v %5v %5v %5v %-5v\n", - i, pInfo[j].PID, pInfo[j].Type, pInfo[j].MemoryUsed, pInfo[j].Name) - } - } - case <-sigs: - return - } - } -} diff --git a/deployment/dcgm-exporter/Chart.yaml b/deployment/dcgm-exporter/Chart.yaml index 771a0c1f..1dd2d6ad 100644 --- a/deployment/dcgm-exporter/Chart.yaml +++ b/deployment/dcgm-exporter/Chart.yaml @@ -5,8 +5,8 @@ version: "2.4.0" kubeVersion: ">= 1.13.0-0" appVersion: "2.4.0" sources: -- https://gitlab.com/nvidia/container-toolkit/gpu-monitoring-tools -home: https://github.com/nvidia/gpu-monitoring-tools/ +- https://github.com/nvidia/dcgm-exporter +home: https://github.com/nvidia/dcgm-exporter/ icon: https://assets.nvidiagrid.net/ngc/logos/DCGM.png keywords: - gpu diff --git a/docker/Dockerfile.ubi8 b/docker/Dockerfile.ubi8 index ad7f9225..87ba3a83 100644 --- a/docker/Dockerfile.ubi8 +++ b/docker/Dockerfile.ubi8 @@ -1,6 +1,6 @@ ARG GOLANG_VERSION FROM golang:$GOLANG_VERSION AS builder -WORKDIR 
/go/src/github.com/NVIDIA/gpu-monitoring-tools +WORKDIR /go/src/github.com/NVIDIA/dcgm-exporter COPY . . @@ -14,7 +14,7 @@ RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/ dnf clean expire-cache RUN dnf install -y datacenter-gpu-manager-${DCGM_VERSION} -COPY --from=builder /go/src/github.com/NVIDIA/gpu-monitoring-tools/pkg/dcgm-exporter /usr/bin/ +COPY --from=builder /go/src/github.com/NVIDIA/dcgm-exporter/pkg/dcgm-exporter /usr/bin/ COPY etc/dcgm-exporter /etc/dcgm-exporter ENV NVIDIA_VISIBLE_DEVICES=all diff --git a/docker/Dockerfile.ubuntu18.04 b/docker/Dockerfile.ubuntu18.04 deleted file mode 100644 index b05d5623..00000000 --- a/docker/Dockerfile.ubuntu18.04 +++ /dev/null @@ -1,37 +0,0 @@ -ARG GOLANG_VERSION -FROM golang:$GOLANG_VERSION AS builder -WORKDIR /go/src/github.com/NVIDIA/gpu-monitoring-tools - -COPY . . - -RUN make binary check-format - -FROM nvcr.io/nvidia/cuda:11.2.1-base-ubuntu18.04 -LABEL io.k8s.display-name="NVIDIA DCGM Exporter" - -COPY --from=builder /go/src/github.com/NVIDIA/gpu-monitoring-tools/pkg/dcgm-exporter /usr/bin/ -COPY etc/dcgm-exporter /etc/dcgm-exporter - -ARG DCGM_VERSION -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcap2-bin gnupg2 curl ca-certificates && \ - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-get purge --autoremove -y curl \ - && rm -rf /var/lib/apt/lists/* - -RUN apt-get update && apt-get install -y --no-install-recommends \ - datacenter-gpu-manager=1:${DCGM_VERSION} && apt-get purge --autoremove -y openssl - -# Required for DCP metrics -ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32 -# disable 
all constraints on the configurations required by NVIDIA container toolkit -ENV NVIDIA_DISABLE_REQUIRE="true" -ENV NVIDIA_VISIBLE_DEVICES=all - -ENV NO_SETCAP= -COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh -RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh - -ENTRYPOINT ["/usr/local/dcgm/dcgm-exporter-entrypoint.sh"] diff --git a/docker/Dockerfile.ubuntu20.04 b/docker/Dockerfile.ubuntu20.04 index 84b330dc..6ee21e92 100644 --- a/docker/Dockerfile.ubuntu20.04 +++ b/docker/Dockerfile.ubuntu20.04 @@ -1,6 +1,6 @@ ARG GOLANG_VERSION FROM golang:$GOLANG_VERSION AS builder -WORKDIR /go/src/github.com/NVIDIA/gpu-monitoring-tools +WORKDIR /go/src/github.com/NVIDIA/dcgm-exporter COPY . . @@ -9,7 +9,7 @@ RUN make binary check-format FROM nvcr.io/nvidia/cuda:11.2.1-base-ubuntu20.04 LABEL io.k8s.display-name="NVIDIA DCGM Exporter" -COPY --from=builder /go/src/github.com/NVIDIA/gpu-monitoring-tools/pkg/dcgm-exporter /usr/bin/ +COPY --from=builder /go/src/github.com/NVIDIA/dcgm-exporter/pkg/dcgm-exporter /usr/bin/ COPY etc/dcgm-exporter /etc/dcgm-exporter ARG DCGM_VERSION diff --git a/go.mod b/go.mod index e3a8020d..4bce750f 100644 --- a/go.mod +++ b/go.mod @@ -1,34 +1,8 @@ -module github.com/NVIDIA/gpu-monitoring-tools +module github.com/NVIDIA/dcgm-exporter go 1.14 require ( github.com/Masterminds/semver v1.5.0 // indirect - github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm v0.0.0-20210325210537-29b4f1784f18 github.com/gorilla/mux v1.7.4 ) - -replace ( - github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm => ./bindings/go/dcgm - k8s.io/api => k8s.io/api v0.20.2 - k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.20.2 - k8s.io/apimachinery => k8s.io/apimachinery v0.20.2 - k8s.io/apiserver => k8s.io/apiserver v0.20.2 - k8s.io/cli-runtime => k8s.io/cli-runtime v0.20.2 - k8s.io/client-go => k8s.io/client-go v0.20.2 - k8s.io/cloud-provider => k8s.io/cloud-provider v0.20.2 - k8s.io/cluster-bootstrap => 
k8s.io/cluster-bootstrap v0.20.2 - k8s.io/code-generator => k8s.io/code-generator v0.20.2 - k8s.io/component-base => k8s.io/component-base v0.20.2 - k8s.io/cri-api => k8s.io/cri-api v0.20.2 - k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.20.2 - k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.20.2 - k8s.io/kube-controller-manager => k8s.io/kube-controller-manager v0.20.2 - k8s.io/kube-proxy => k8s.io/kube-proxy v0.20.2 - k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.20.2 - k8s.io/kubectl => k8s.io/kubectl v0.20.2 - k8s.io/kubelet => k8s.io/kubelet v0.20.2 - k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.20.2 - k8s.io/metrics => k8s.io/metrics v0.20.2 - k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.20.2 -) diff --git a/pkg/dcgm.go b/pkg/dcgm.go index 0a296846..6b69aa4f 100644 --- a/pkg/dcgm.go +++ b/pkg/dcgm.go @@ -18,7 +18,7 @@ package main import ( "fmt" - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" + "github.com/NVIDIA/go-dcgm/pkg/dcgm" "math/rand" ) diff --git a/pkg/go.mod b/pkg/go.mod index 4d90fc70..83b82400 100644 --- a/pkg/go.mod +++ b/pkg/go.mod @@ -3,7 +3,6 @@ module dcgm-exporter go 1.14 replace ( - github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm => ../bindings/go/dcgm k8s.io/api => k8s.io/api v0.20.2 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.20.2 k8s.io/apimachinery => k8s.io/apimachinery v0.20.2 @@ -29,7 +28,7 @@ replace ( require ( github.com/Masterminds/semver v1.5.0 // indirect - github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm v0.0.0-00010101000000-000000000000 + github.com/NVIDIA/go-dcgm v0.0.0-20210714205848-88afcd174ede // indirect github.com/gorilla/mux v1.8.0 github.com/sirupsen/logrus v1.7.0 github.com/stretchr/testify v1.6.1 diff --git a/pkg/go.sum b/pkg/go.sum index afda828e..8fe0abc0 100644 --- a/pkg/go.sum +++ b/pkg/go.sum @@ -53,8 +53,11 @@ github.com/JeffAshton/win_pdh v0.0.0-20161109143554-76bb4ee9f0ab/go.mod h1:3VYc5 
github.com/MakeNowJust/heredoc v0.0.0-20170808103936-bb23615498cd/go.mod h1:64YHyfSL2R96J44Nlwm39UHepQbyR5q10x7iYa1ks2E= github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y= +github.com/Microsoft/go-winio v0.4.14 h1:+hMXMk01us9KgxGb7ftKQt2Xpf5hH/yky+TDA+qxleU= github.com/Microsoft/go-winio v0.4.14/go.mod h1:qXqCSQ3Xa7+6tgxaGTIe4Kpcdsi+P8jBhyzoq1bpyYA= github.com/Microsoft/hcsshim v0.0.0-20190417211021-672e52e9209d/go.mod h1:Op3hHsoHPAvb6lceZHDtd9OkTew38wNoXnJs8iY7rUg= +github.com/NVIDIA/go-dcgm v0.0.0-20210714205848-88afcd174ede h1:qAshVEytXD3rfDDuN/mjiwh4k8BFKDX883dzP/rjyQI= +github.com/NVIDIA/go-dcgm v0.0.0-20210714205848-88afcd174ede/go.mod h1:77DGpdEF+uQYYIRHxeFwfsqvHHT2ef6uVwkWE05FdVc= github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/OpenPeeDeeP/depguard v1.0.0/go.mod h1:7/4sitnI9YlQgTLLk734QlzXT8DuHVnAyztLplQjk+o= @@ -137,7 +140,6 @@ github.com/coreos/pkg v0.0.0-20180108230652-97fdf19511ea/go.mod h1:E3G3o1h8I7cfc github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= github.com/cpuguy83/go-md2man v1.0.10 h1:BSKMNlYxDvnunlTymqtgONjNnaRV1sTpcovwwjF22jk= github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE= -github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY= github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/cpuguy83/go-md2man/v2 v2.0.0 h1:EoUDS0afbrsXAZ9YQ9jdu/mZ2sXgT1/2yyNng4PGlyM= github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= @@ -320,6 +322,7 @@ github.com/google/go-cmp 
v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.2 h1:X2ev0eStA3AbceY54o37/0PQ/UWqKEiiO2dKL5OPaFM= github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-github v17.0.0+incompatible/go.mod h1:zLgOLi98H3fifZn+44m+umXrS52loVEgC2AApnigrVQ= github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO6wN/zVPAxq5ck= @@ -345,6 +348,7 @@ github.com/gophercloud/gophercloud v0.1.0/go.mod h1:vxM41WHh5uqHVBMZHzuwNOHh8XEo github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= github.com/gorilla/context v1.1.1/go.mod h1:kBGZzfjB9CEq2AlWe17Uuf7NDRt0dE0s8S51q0aT7Yg= github.com/gorilla/mux v1.7.0/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= +github.com/gorilla/mux v1.7.4/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI= github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/gorilla/websocket v0.0.0-20170926233335-4201258b820c/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= @@ -415,10 +419,12 @@ github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxv github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.2.0 h1:s5hAObm+yFO5uHYt5dYjxi2rXrsnmRpJx4OYvIWUaQs= github.com/kr/pretty v0.2.0/go.mod 
h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/pty v1.1.3/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kylelemons/godebug v0.0.0-20170820004349-d65d576e9348/go.mod h1:B69LEHPfb2qLo0BaaOLcbitczOKLWTsrBG9LczfCD4k= github.com/libopenstorage/openstorage v1.0.0/go.mod h1:Sp1sIObHjat1BeXhfMqLZ14wnOzEhNx2YQedreMcUyc= @@ -588,6 +594,7 @@ github.com/spf13/jwalterweatherman v1.1.0/go.mod h1:aNWZUN0dPAAO/Ljvb5BEdw96iTZ0 github.com/spf13/pflag v0.0.0-20170130214245-9ff6c6923cff/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= github.com/spf13/pflag v1.0.1/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/viper v1.0.2/go.mod h1:A8kyI5cUJhb8N+3pkfONlcEcZbueH6nhAm0Fq7SrnBM= github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s= @@ -615,7 +622,6 @@ github.com/ultraware/funlen v0.0.1/go.mod h1:Dp4UiAus7Wdb9KUZsYWZEWiRzGuM2kXM1lP github.com/ultraware/funlen v0.0.2/go.mod h1:Dp4UiAus7Wdb9KUZsYWZEWiRzGuM2kXM1lPbfaF6xhA= github.com/urfave/cli v1.20.0 h1:fDqGv3UG/4jbVl/QkFwEdddtEDjh/5Ov6X+0B/3bPaw= github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA= -github.com/urfave/cli v1.22.5 h1:lNq9sAHXK2qfdI8W+GRItjCEkI+2oR4d+MEHy1CKXoU= github.com/urfave/cli/v2 v2.3.0 h1:qph92Y649prgesehzOrQjdWyxFOp/QVM+6imKHad91M= github.com/urfave/cli/v2 v2.3.0/go.mod h1:LJmUH05zAU44vOAcrfzZQKsZbVcdbOG8rtL3/XcUArI= 
github.com/urfave/negroni v1.0.0/go.mod h1:Meg73S6kFm/4PpbYdq35yYWoCZ9mS/YSx+lKnmiohz4= @@ -870,6 +876,7 @@ golang.org/x/tools v0.0.0-20200616133436-c1934b75d054/go.mod h1:EkVYQZoAsY45+roY golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= gonum.org/v1/gonum v0.0.0-20190331200053-3d26580ed485/go.mod h1:2ltnJ7xHfj0zHS40VVPYEAAMTa3ZGguvHGBSJeRWqE0= @@ -941,6 +948,7 @@ gopkg.in/airbrake/gobrake.v2 v2.0.9/go.mod h1:/h5ZAUhDkGaJfjzjKLSjv6zCL6O0LLBxU4 gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/cheggaaa/pb.v1 v1.0.25/go.mod h1:V/YB90LKu/1FcN3WVnfiiE5oMCibMjukxqG/qStrOgw= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= @@ -1042,5 +1050,6 @@ sigs.k8s.io/structured-merge-diff/v3 v3.0.0-20200116222232-67a7b8c61874/go.mod h sigs.k8s.io/structured-merge-diff/v4 v4.0.2 h1:YHQV7Dajm86OuqnIR6zAelnDWBRjo+YhYV9PmGrh1s8= sigs.k8s.io/structured-merge-diff/v4 v4.0.2/go.mod 
h1:bJZC9H9iH24zzfZ/41RGcq60oK1F7G282QMXDPYydCw= sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o= +sigs.k8s.io/yaml v1.2.0 h1:kr/MCeFWJWTwyaHoR9c8EjH9OumOmoF9YGiZd7lFm/Q= sigs.k8s.io/yaml v1.2.0/go.mod h1:yfXDCHCao9+ENCvLSE62v9VSji2MKu5jeNfTrofGhJc= sourcegraph.com/sqs/pbtypes v0.0.0-20180604144634-d3ebe8f20ae4/go.mod h1:ketZ/q3QxT9HOBeFhu6RdvsftgpsbFHBF5Cas6cDKZ0= diff --git a/pkg/gpu_collector.go b/pkg/gpu_collector.go index 86a5fd59..30b9ccd2 100644 --- a/pkg/gpu_collector.go +++ b/pkg/gpu_collector.go @@ -18,7 +18,7 @@ package main import ( "fmt" - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" + "github.com/NVIDIA/go-dcgm/pkg/dcgm" "os" ) diff --git a/pkg/gpu_collector_test.go b/pkg/gpu_collector_test.go index b65f2716..37d35896 100644 --- a/pkg/gpu_collector_test.go +++ b/pkg/gpu_collector_test.go @@ -20,7 +20,7 @@ import ( "fmt" "testing" - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" + "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/stretchr/testify/require" ) @@ -56,8 +56,7 @@ func testDCGMCollector(t *testing.T, counters []Counter) (*DCGMCollector, func() require.Len(t, out[0], len(counters)) for i, dev := range out { - for j, metric := range dev { - require.Equal(t, metric.Name, counters[j].FieldName) + for _, metric := range dev { require.Equal(t, metric.GPU, fmt.Sprintf("%d", i)) require.NotEmpty(t, metric.Value) diff --git a/pkg/kubernetes_test.go b/pkg/kubernetes_test.go index bffcf09a..cd77c97f 100644 --- a/pkg/kubernetes_test.go +++ b/pkg/kubernetes_test.go @@ -24,7 +24,7 @@ import ( "testing" "time" - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" + "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/stretchr/testify/require" "google.golang.org/grpc" podresourcesapi "k8s.io/kubernetes/pkg/kubelet/apis/podresources/v1alpha1" @@ -109,7 +109,7 @@ func StartMockServer(t *testing.T, server *grpc.Server, socket string) func() { } func CreateTmpDir(t *testing.T) func() { - path, err := 
ioutil.TempDir("", "gpu-monitoring-tools") + path, err := ioutil.TempDir("", "dcgm-exporter") require.NoError(t, err) tmpDir = path diff --git a/pkg/main.go b/pkg/main.go index 44d6a8e2..39e1c98f 100644 --- a/pkg/main.go +++ b/pkg/main.go @@ -25,7 +25,7 @@ import ( "syscall" "time" - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" + "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/sirupsen/logrus" "github.com/urfave/cli/v2" ) diff --git a/pkg/parser.go b/pkg/parser.go index 8e1eec70..4ecffa7a 100644 --- a/pkg/parser.go +++ b/pkg/parser.go @@ -22,7 +22,7 @@ import ( "os" "strings" - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" + "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/sirupsen/logrus" ) diff --git a/pkg/pipeline_test.go b/pkg/pipeline_test.go index 2f77b7a2..eddfa239 100644 --- a/pkg/pipeline_test.go +++ b/pkg/pipeline_test.go @@ -19,7 +19,7 @@ package main import ( "testing" - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" + "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/stretchr/testify/require" ) diff --git a/pkg/system_info.go b/pkg/system_info.go index 86680de3..14b45778 100644 --- a/pkg/system_info.go +++ b/pkg/system_info.go @@ -18,7 +18,7 @@ package main import ( "fmt" - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" + "github.com/NVIDIA/go-dcgm/pkg/dcgm" "math/rand" ) diff --git a/pkg/system_info_test.go b/pkg/system_info_test.go index 58edaaee..49c2ae73 100644 --- a/pkg/system_info_test.go +++ b/pkg/system_info_test.go @@ -18,7 +18,7 @@ package main import ( "fmt" - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" + "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/stretchr/testify/require" "testing" ) diff --git a/pkg/types.go b/pkg/types.go index 88645361..b2305fc9 100644 --- a/pkg/types.go +++ b/pkg/types.go @@ -22,7 +22,7 @@ import ( "sync" "text/template" - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" + "github.com/NVIDIA/go-dcgm/pkg/dcgm" ) var ( diff --git 
a/tests/variables.tfvars b/tests/variables.tfvars index d3bc79f9..f7a6d2a9 100644 --- a/tests/variables.tfvars +++ b/tests/variables.tfvars @@ -1,3 +1,3 @@ instance_type = "p3.2xlarge" -project_name = "gpu-monitoring-tools" +project_name = "dcgm-exporter" setup_params = "--driver --k8s-plugin --nvcr" diff --git a/vendor/github.com/Masterminds/semver/.travis.yml b/vendor/github.com/Masterminds/semver/.travis.yml deleted file mode 100644 index 096369d4..00000000 --- a/vendor/github.com/Masterminds/semver/.travis.yml +++ /dev/null @@ -1,29 +0,0 @@ -language: go - -go: - - 1.6.x - - 1.7.x - - 1.8.x - - 1.9.x - - 1.10.x - - 1.11.x - - 1.12.x - - tip - -# Setting sudo access to false will let Travis CI use containers rather than -# VMs to run the tests. For more details see: -# - http://docs.travis-ci.com/user/workers/container-based-infrastructure/ -# - http://docs.travis-ci.com/user/workers/standard-infrastructure/ -sudo: false - -script: - - make setup - - make test - -notifications: - webhooks: - urls: - - https://webhooks.gitter.im/e/06e3328629952dabe3e0 - on_success: change # options: [always|never|change] default: always - on_failure: always # options: [always|never|change] default: always - on_start: never # options: [always|never|change] default: always diff --git a/vendor/github.com/Masterminds/semver/CHANGELOG.md b/vendor/github.com/Masterminds/semver/CHANGELOG.md deleted file mode 100644 index e405c9a8..00000000 --- a/vendor/github.com/Masterminds/semver/CHANGELOG.md +++ /dev/null @@ -1,109 +0,0 @@ -# 1.5.0 (2019-09-11) - -## Added - -- #103: Add basic fuzzing for `NewVersion()` (thanks @jesse-c) - -## Changed - -- #82: Clarify wildcard meaning in range constraints and update tests for it (thanks @greysteil) -- #83: Clarify caret operator range for pre-1.0.0 dependencies (thanks @greysteil) -- #72: Adding docs comment pointing to vert for a cli -- #71: Update the docs on pre-release comparator handling -- #89: Test with new go versions (thanks @thedevsaddam) 
-- #87: Added $ to ValidPrerelease for better validation (thanks @jeremycarroll) - -## Fixed - -- #78: Fix unchecked error in example code (thanks @ravron) -- #70: Fix the handling of pre-releases and the 0.0.0 release edge case -- #97: Fixed copyright file for proper display on GitHub -- #107: Fix handling prerelease when sorting alphanum and num -- #109: Fixed where Validate sometimes returns wrong message on error - -# 1.4.2 (2018-04-10) - -## Changed -- #72: Updated the docs to point to vert for a console appliaction -- #71: Update the docs on pre-release comparator handling - -## Fixed -- #70: Fix the handling of pre-releases and the 0.0.0 release edge case - -# 1.4.1 (2018-04-02) - -## Fixed -- Fixed #64: Fix pre-release precedence issue (thanks @uudashr) - -# 1.4.0 (2017-10-04) - -## Changed -- #61: Update NewVersion to parse ints with a 64bit int size (thanks @zknill) - -# 1.3.1 (2017-07-10) - -## Fixed -- Fixed #57: number comparisons in prerelease sometimes inaccurate - -# 1.3.0 (2017-05-02) - -## Added -- #45: Added json (un)marshaling support (thanks @mh-cbon) -- Stability marker. See https://masterminds.github.io/stability/ - -## Fixed -- #51: Fix handling of single digit tilde constraint (thanks @dgodd) - -## Changed -- #55: The godoc icon moved from png to svg - -# 1.2.3 (2017-04-03) - -## Fixed -- #46: Fixed 0.x.x and 0.0.x in constraints being treated as * - -# Release 1.2.2 (2016-12-13) - -## Fixed -- #34: Fixed issue where hyphen range was not working with pre-release parsing. - -# Release 1.2.1 (2016-11-28) - -## Fixed -- #24: Fixed edge case issue where constraint "> 0" does not handle "0.0.1-alpha" - properly. - -# Release 1.2.0 (2016-11-04) - -## Added -- #20: Added MustParse function for versions (thanks @adamreese) -- #15: Added increment methods on versions (thanks @mh-cbon) - -## Fixed -- Issue #21: Per the SemVer spec (section 9) a pre-release is unstable and - might not satisfy the intended compatibility. 
The change here ignores pre-releases - on constraint checks (e.g., ~ or ^) when a pre-release is not part of the - constraint. For example, `^1.2.3` will ignore pre-releases while - `^1.2.3-alpha` will include them. - -# Release 1.1.1 (2016-06-30) - -## Changed -- Issue #9: Speed up version comparison performance (thanks @sdboyer) -- Issue #8: Added benchmarks (thanks @sdboyer) -- Updated Go Report Card URL to new location -- Updated Readme to add code snippet formatting (thanks @mh-cbon) -- Updating tagging to v[SemVer] structure for compatibility with other tools. - -# Release 1.1.0 (2016-03-11) - -- Issue #2: Implemented validation to provide reasons a versions failed a - constraint. - -# Release 1.0.1 (2015-12-31) - -- Fixed #1: * constraint failing on valid versions. - -# Release 1.0.0 (2015-10-20) - -- Initial release diff --git a/vendor/github.com/Masterminds/semver/LICENSE.txt b/vendor/github.com/Masterminds/semver/LICENSE.txt deleted file mode 100644 index 9ff7da9c..00000000 --- a/vendor/github.com/Masterminds/semver/LICENSE.txt +++ /dev/null @@ -1,19 +0,0 @@ -Copyright (C) 2014-2019, Matt Butcher and Matt Farina - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. diff --git a/vendor/github.com/Masterminds/semver/Makefile b/vendor/github.com/Masterminds/semver/Makefile deleted file mode 100644 index a7a1b4e3..00000000 --- a/vendor/github.com/Masterminds/semver/Makefile +++ /dev/null @@ -1,36 +0,0 @@ -.PHONY: setup -setup: - go get -u gopkg.in/alecthomas/gometalinter.v1 - gometalinter.v1 --install - -.PHONY: test -test: validate lint - @echo "==> Running tests" - go test -v - -.PHONY: validate -validate: - @echo "==> Running static validations" - @gometalinter.v1 \ - --disable-all \ - --enable deadcode \ - --severity deadcode:error \ - --enable gofmt \ - --enable gosimple \ - --enable ineffassign \ - --enable misspell \ - --enable vet \ - --tests \ - --vendor \ - --deadline 60s \ - ./... || exit_code=1 - -.PHONY: lint -lint: - @echo "==> Running linters" - @gometalinter.v1 \ - --disable-all \ - --enable golint \ - --vendor \ - --deadline 60s \ - ./... || : diff --git a/vendor/github.com/Masterminds/semver/README.md b/vendor/github.com/Masterminds/semver/README.md deleted file mode 100644 index 1b52d2f4..00000000 --- a/vendor/github.com/Masterminds/semver/README.md +++ /dev/null @@ -1,194 +0,0 @@ -# SemVer - -The `semver` package provides the ability to work with [Semantic Versions](http://semver.org) in Go. 
Specifically it provides the ability to: - -* Parse semantic versions -* Sort semantic versions -* Check if a semantic version fits within a set of constraints -* Optionally work with a `v` prefix - -[![Stability: -Active](https://masterminds.github.io/stability/active.svg)](https://masterminds.github.io/stability/active.html) -[![Build Status](https://travis-ci.org/Masterminds/semver.svg)](https://travis-ci.org/Masterminds/semver) [![Build status](https://ci.appveyor.com/api/projects/status/jfk66lib7hb985k8/branch/master?svg=true&passingText=windows%20build%20passing&failingText=windows%20build%20failing)](https://ci.appveyor.com/project/mattfarina/semver/branch/master) [![GoDoc](https://godoc.org/github.com/Masterminds/semver?status.svg)](https://godoc.org/github.com/Masterminds/semver) [![Go Report Card](https://goreportcard.com/badge/github.com/Masterminds/semver)](https://goreportcard.com/report/github.com/Masterminds/semver) - -If you are looking for a command line tool for version comparisons please see -[vert](https://github.com/Masterminds/vert) which uses this library. - -## Parsing Semantic Versions - -To parse a semantic version use the `NewVersion` function. For example, - -```go - v, err := semver.NewVersion("1.2.3-beta.1+build345") -``` - -If there is an error the version wasn't parseable. The version object has methods -to get the parts of the version, compare it to other versions, convert the -version back into a string, and get the original string. For more details -please see the [documentation](https://godoc.org/github.com/Masterminds/semver). - -## Sorting Semantic Versions - -A set of versions can be sorted using the [`sort`](https://golang.org/pkg/sort/) -package from the standard library. 
For example, - -```go - raw := []string{"1.2.3", "1.0", "1.3", "2", "0.4.2",} - vs := make([]*semver.Version, len(raw)) - for i, r := range raw { - v, err := semver.NewVersion(r) - if err != nil { - t.Errorf("Error parsing version: %s", err) - } - - vs[i] = v - } - - sort.Sort(semver.Collection(vs)) -``` - -## Checking Version Constraints - -Checking a version against version constraints is one of the most featureful -parts of the package. - -```go - c, err := semver.NewConstraint(">= 1.2.3") - if err != nil { - // Handle constraint not being parseable. - } - - v, _ := semver.NewVersion("1.3") - if err != nil { - // Handle version not being parseable. - } - // Check if the version meets the constraints. The a variable will be true. - a := c.Check(v) -``` - -## Basic Comparisons - -There are two elements to the comparisons. First, a comparison string is a list -of comma separated and comparisons. These are then separated by || separated or -comparisons. For example, `">= 1.2, < 3.0.0 || >= 4.2.3"` is looking for a -comparison that's greater than or equal to 1.2 and less than 3.0.0 or is -greater than or equal to 4.2.3. - -The basic comparisons are: - -* `=`: equal (aliased to no operator) -* `!=`: not equal -* `>`: greater than -* `<`: less than -* `>=`: greater than or equal to -* `<=`: less than or equal to - -## Working With Pre-release Versions - -Pre-releases, for those not familiar with them, are used for software releases -prior to stable or generally available releases. Examples of pre-releases include -development, alpha, beta, and release candidate releases. A pre-release may be -a version such as `1.2.3-beta.1` while the stable release would be `1.2.3`. In the -order of precidence, pre-releases come before their associated releases. In this -example `1.2.3-beta.1 < 1.2.3`. - -According to the Semantic Version specification pre-releases may not be -API compliant with their release counterpart. 
It says, - -> A pre-release version indicates that the version is unstable and might not satisfy the intended compatibility requirements as denoted by its associated normal version. - -SemVer comparisons without a pre-release comparator will skip pre-release versions. -For example, `>=1.2.3` will skip pre-releases when looking at a list of releases -while `>=1.2.3-0` will evaluate and find pre-releases. - -The reason for the `0` as a pre-release version in the example comparison is -because pre-releases can only contain ASCII alphanumerics and hyphens (along with -`.` separators), per the spec. Sorting happens in ASCII sort order, again per the spec. The lowest character is a `0` in ASCII sort order (see an [ASCII Table](http://www.asciitable.com/)) - -Understanding ASCII sort ordering is important because A-Z comes before a-z. That -means `>=1.2.3-BETA` will return `1.2.3-alpha`. What you might expect from case -sensitivity doesn't apply here. This is due to ASCII sort ordering which is what -the spec specifies. - -## Hyphen Range Comparisons - -There are multiple methods to handle ranges and the first is hyphens ranges. -These look like: - -* `1.2 - 1.4.5` which is equivalent to `>= 1.2, <= 1.4.5` -* `2.3.4 - 4.5` which is equivalent to `>= 2.3.4, <= 4.5` - -## Wildcards In Comparisons - -The `x`, `X`, and `*` characters can be used as a wildcard character. This works -for all comparison operators. When used on the `=` operator it falls -back to the pack level comparison (see tilde below). For example, - -* `1.2.x` is equivalent to `>= 1.2.0, < 1.3.0` -* `>= 1.2.x` is equivalent to `>= 1.2.0` -* `<= 2.x` is equivalent to `< 3` -* `*` is equivalent to `>= 0.0.0` - -## Tilde Range Comparisons (Patch) - -The tilde (`~`) comparison operator is for patch level ranges when a minor -version is specified and major level changes when the minor number is missing. 
-For example, - -* `~1.2.3` is equivalent to `>= 1.2.3, < 1.3.0` -* `~1` is equivalent to `>= 1, < 2` -* `~2.3` is equivalent to `>= 2.3, < 2.4` -* `~1.2.x` is equivalent to `>= 1.2.0, < 1.3.0` -* `~1.x` is equivalent to `>= 1, < 2` - -## Caret Range Comparisons (Major) - -The caret (`^`) comparison operator is for major level changes. This is useful -when comparisons of API versions as a major change is API breaking. For example, - -* `^1.2.3` is equivalent to `>= 1.2.3, < 2.0.0` -* `^0.0.1` is equivalent to `>= 0.0.1, < 1.0.0` -* `^1.2.x` is equivalent to `>= 1.2.0, < 2.0.0` -* `^2.3` is equivalent to `>= 2.3, < 3` -* `^2.x` is equivalent to `>= 2.0.0, < 3` - -# Validation - -In addition to testing a version against a constraint, a version can be validated -against a constraint. When validation fails a slice of errors containing why a -version didn't meet the constraint is returned. For example, - -```go - c, err := semver.NewConstraint("<= 1.2.3, >= 1.4") - if err != nil { - // Handle constraint not being parseable. - } - - v, _ := semver.NewVersion("1.3") - if err != nil { - // Handle version not being parseable. - } - - // Validate a version against a constraint. - a, msgs := c.Validate(v) - // a is false - for _, m := range msgs { - fmt.Println(m) - - // Loops over the errors which would read - // "1.3 is greater than 1.2.3" - // "1.3 is less than 1.4" - } -``` - -# Fuzzing - - [dvyukov/go-fuzz](https://github.com/dvyukov/go-fuzz) is used for fuzzing. - -1. `go-fuzz-build` -2. `go-fuzz -workdir=fuzz` - -# Contribute - -If you find an issue or want to contribute please file an [issue](https://github.com/Masterminds/semver/issues) -or [create a pull request](https://github.com/Masterminds/semver/pulls). 
diff --git a/vendor/github.com/Masterminds/semver/appveyor.yml b/vendor/github.com/Masterminds/semver/appveyor.yml deleted file mode 100644 index b2778df1..00000000 --- a/vendor/github.com/Masterminds/semver/appveyor.yml +++ /dev/null @@ -1,44 +0,0 @@ -version: build-{build}.{branch} - -clone_folder: C:\gopath\src\github.com\Masterminds\semver -shallow_clone: true - -environment: - GOPATH: C:\gopath - -platform: - - x64 - -install: - - go version - - go env - - go get -u gopkg.in/alecthomas/gometalinter.v1 - - set PATH=%PATH%;%GOPATH%\bin - - gometalinter.v1.exe --install - -build_script: - - go install -v ./... - -test_script: - - "gometalinter.v1 \ - --disable-all \ - --enable deadcode \ - --severity deadcode:error \ - --enable gofmt \ - --enable gosimple \ - --enable ineffassign \ - --enable misspell \ - --enable vet \ - --tests \ - --vendor \ - --deadline 60s \ - ./... || exit_code=1" - - "gometalinter.v1 \ - --disable-all \ - --enable golint \ - --vendor \ - --deadline 60s \ - ./... || :" - - go test -v - -deploy: off diff --git a/vendor/github.com/Masterminds/semver/collection.go b/vendor/github.com/Masterminds/semver/collection.go deleted file mode 100644 index a7823589..00000000 --- a/vendor/github.com/Masterminds/semver/collection.go +++ /dev/null @@ -1,24 +0,0 @@ -package semver - -// Collection is a collection of Version instances and implements the sort -// interface. See the sort package for more details. -// https://golang.org/pkg/sort/ -type Collection []*Version - -// Len returns the length of a collection. The number of Version instances -// on the slice. -func (c Collection) Len() int { - return len(c) -} - -// Less is needed for the sort interface to compare two Version objects on the -// slice. If checks if one is less than the other. -func (c Collection) Less(i, j int) bool { - return c[i].LessThan(c[j]) -} - -// Swap is needed for the sort interface to replace the Version objects -// at two different positions in the slice. 
-func (c Collection) Swap(i, j int) { - c[i], c[j] = c[j], c[i] -} diff --git a/vendor/github.com/Masterminds/semver/constraints.go b/vendor/github.com/Masterminds/semver/constraints.go deleted file mode 100644 index b94b9341..00000000 --- a/vendor/github.com/Masterminds/semver/constraints.go +++ /dev/null @@ -1,423 +0,0 @@ -package semver - -import ( - "errors" - "fmt" - "regexp" - "strings" -) - -// Constraints is one or more constraint that a semantic version can be -// checked against. -type Constraints struct { - constraints [][]*constraint -} - -// NewConstraint returns a Constraints instance that a Version instance can -// be checked against. If there is a parse error it will be returned. -func NewConstraint(c string) (*Constraints, error) { - - // Rewrite - ranges into a comparison operation. - c = rewriteRange(c) - - ors := strings.Split(c, "||") - or := make([][]*constraint, len(ors)) - for k, v := range ors { - cs := strings.Split(v, ",") - result := make([]*constraint, len(cs)) - for i, s := range cs { - pc, err := parseConstraint(s) - if err != nil { - return nil, err - } - - result[i] = pc - } - or[k] = result - } - - o := &Constraints{constraints: or} - return o, nil -} - -// Check tests if a version satisfies the constraints. -func (cs Constraints) Check(v *Version) bool { - // loop over the ORs and check the inner ANDs - for _, o := range cs.constraints { - joy := true - for _, c := range o { - if !c.check(v) { - joy = false - break - } - } - - if joy { - return true - } - } - - return false -} - -// Validate checks if a version satisfies a constraint. If not a slice of -// reasons for the failure are returned in addition to a bool. -func (cs Constraints) Validate(v *Version) (bool, []error) { - // loop over the ORs and check the inner ANDs - var e []error - - // Capture the prerelease message only once. 
When it happens the first time - // this var is marked - var prerelesase bool - for _, o := range cs.constraints { - joy := true - for _, c := range o { - // Before running the check handle the case there the version is - // a prerelease and the check is not searching for prereleases. - if c.con.pre == "" && v.pre != "" { - if !prerelesase { - em := fmt.Errorf("%s is a prerelease version and the constraint is only looking for release versions", v) - e = append(e, em) - prerelesase = true - } - joy = false - - } else { - - if !c.check(v) { - em := fmt.Errorf(c.msg, v, c.orig) - e = append(e, em) - joy = false - } - } - } - - if joy { - return true, []error{} - } - } - - return false, e -} - -var constraintOps map[string]cfunc -var constraintMsg map[string]string -var constraintRegex *regexp.Regexp - -func init() { - constraintOps = map[string]cfunc{ - "": constraintTildeOrEqual, - "=": constraintTildeOrEqual, - "!=": constraintNotEqual, - ">": constraintGreaterThan, - "<": constraintLessThan, - ">=": constraintGreaterThanEqual, - "=>": constraintGreaterThanEqual, - "<=": constraintLessThanEqual, - "=<": constraintLessThanEqual, - "~": constraintTilde, - "~>": constraintTilde, - "^": constraintCaret, - } - - constraintMsg = map[string]string{ - "": "%s is not equal to %s", - "=": "%s is not equal to %s", - "!=": "%s is equal to %s", - ">": "%s is less than or equal to %s", - "<": "%s is greater than or equal to %s", - ">=": "%s is less than %s", - "=>": "%s is less than %s", - "<=": "%s is greater than %s", - "=<": "%s is greater than %s", - "~": "%s does not have same major and minor version as %s", - "~>": "%s does not have same major and minor version as %s", - "^": "%s does not have same major version as %s", - } - - ops := make([]string, 0, len(constraintOps)) - for k := range constraintOps { - ops = append(ops, regexp.QuoteMeta(k)) - } - - constraintRegex = regexp.MustCompile(fmt.Sprintf( - `^\s*(%s)\s*(%s)\s*$`, - strings.Join(ops, "|"), - cvRegex)) - - 
constraintRangeRegex = regexp.MustCompile(fmt.Sprintf( - `\s*(%s)\s+-\s+(%s)\s*`, - cvRegex, cvRegex)) -} - -// An individual constraint -type constraint struct { - // The callback function for the restraint. It performs the logic for - // the constraint. - function cfunc - - msg string - - // The version used in the constraint check. For example, if a constraint - // is '<= 2.0.0' the con a version instance representing 2.0.0. - con *Version - - // The original parsed version (e.g., 4.x from != 4.x) - orig string - - // When an x is used as part of the version (e.g., 1.x) - minorDirty bool - dirty bool - patchDirty bool -} - -// Check if a version meets the constraint -func (c *constraint) check(v *Version) bool { - return c.function(v, c) -} - -type cfunc func(v *Version, c *constraint) bool - -func parseConstraint(c string) (*constraint, error) { - m := constraintRegex.FindStringSubmatch(c) - if m == nil { - return nil, fmt.Errorf("improper constraint: %s", c) - } - - ver := m[2] - orig := ver - minorDirty := false - patchDirty := false - dirty := false - if isX(m[3]) { - ver = "0.0.0" - dirty = true - } else if isX(strings.TrimPrefix(m[4], ".")) || m[4] == "" { - minorDirty = true - dirty = true - ver = fmt.Sprintf("%s.0.0%s", m[3], m[6]) - } else if isX(strings.TrimPrefix(m[5], ".")) { - dirty = true - patchDirty = true - ver = fmt.Sprintf("%s%s.0%s", m[3], m[4], m[6]) - } - - con, err := NewVersion(ver) - if err != nil { - - // The constraintRegex should catch any regex parsing errors. So, - // we should never get here. 
- return nil, errors.New("constraint Parser Error") - } - - cs := &constraint{ - function: constraintOps[m[1]], - msg: constraintMsg[m[1]], - con: con, - orig: orig, - minorDirty: minorDirty, - patchDirty: patchDirty, - dirty: dirty, - } - return cs, nil -} - -// Constraint functions -func constraintNotEqual(v *Version, c *constraint) bool { - if c.dirty { - - // If there is a pre-release on the version but the constraint isn't looking - // for them assume that pre-releases are not compatible. See issue 21 for - // more details. - if v.Prerelease() != "" && c.con.Prerelease() == "" { - return false - } - - if c.con.Major() != v.Major() { - return true - } - if c.con.Minor() != v.Minor() && !c.minorDirty { - return true - } else if c.minorDirty { - return false - } - - return false - } - - return !v.Equal(c.con) -} - -func constraintGreaterThan(v *Version, c *constraint) bool { - - // If there is a pre-release on the version but the constraint isn't looking - // for them assume that pre-releases are not compatible. See issue 21 for - // more details. - if v.Prerelease() != "" && c.con.Prerelease() == "" { - return false - } - - return v.Compare(c.con) == 1 -} - -func constraintLessThan(v *Version, c *constraint) bool { - // If there is a pre-release on the version but the constraint isn't looking - // for them assume that pre-releases are not compatible. See issue 21 for - // more details. - if v.Prerelease() != "" && c.con.Prerelease() == "" { - return false - } - - if !c.dirty { - return v.Compare(c.con) < 0 - } - - if v.Major() > c.con.Major() { - return false - } else if v.Minor() > c.con.Minor() && !c.minorDirty { - return false - } - - return true -} - -func constraintGreaterThanEqual(v *Version, c *constraint) bool { - - // If there is a pre-release on the version but the constraint isn't looking - // for them assume that pre-releases are not compatible. See issue 21 for - // more details. 
- if v.Prerelease() != "" && c.con.Prerelease() == "" { - return false - } - - return v.Compare(c.con) >= 0 -} - -func constraintLessThanEqual(v *Version, c *constraint) bool { - // If there is a pre-release on the version but the constraint isn't looking - // for them assume that pre-releases are not compatible. See issue 21 for - // more details. - if v.Prerelease() != "" && c.con.Prerelease() == "" { - return false - } - - if !c.dirty { - return v.Compare(c.con) <= 0 - } - - if v.Major() > c.con.Major() { - return false - } else if v.Minor() > c.con.Minor() && !c.minorDirty { - return false - } - - return true -} - -// ~*, ~>* --> >= 0.0.0 (any) -// ~2, ~2.x, ~2.x.x, ~>2, ~>2.x ~>2.x.x --> >=2.0.0, <3.0.0 -// ~2.0, ~2.0.x, ~>2.0, ~>2.0.x --> >=2.0.0, <2.1.0 -// ~1.2, ~1.2.x, ~>1.2, ~>1.2.x --> >=1.2.0, <1.3.0 -// ~1.2.3, ~>1.2.3 --> >=1.2.3, <1.3.0 -// ~1.2.0, ~>1.2.0 --> >=1.2.0, <1.3.0 -func constraintTilde(v *Version, c *constraint) bool { - // If there is a pre-release on the version but the constraint isn't looking - // for them assume that pre-releases are not compatible. See issue 21 for - // more details. - if v.Prerelease() != "" && c.con.Prerelease() == "" { - return false - } - - if v.LessThan(c.con) { - return false - } - - // ~0.0.0 is a special case where all constraints are accepted. It's - // equivalent to >= 0.0.0. - if c.con.Major() == 0 && c.con.Minor() == 0 && c.con.Patch() == 0 && - !c.minorDirty && !c.patchDirty { - return true - } - - if v.Major() != c.con.Major() { - return false - } - - if v.Minor() != c.con.Minor() && !c.minorDirty { - return false - } - - return true -} - -// When there is a .x (dirty) status it automatically opts in to ~. Otherwise -// it's a straight = -func constraintTildeOrEqual(v *Version, c *constraint) bool { - // If there is a pre-release on the version but the constraint isn't looking - // for them assume that pre-releases are not compatible. See issue 21 for - // more details. 
- if v.Prerelease() != "" && c.con.Prerelease() == "" { - return false - } - - if c.dirty { - c.msg = constraintMsg["~"] - return constraintTilde(v, c) - } - - return v.Equal(c.con) -} - -// ^* --> (any) -// ^2, ^2.x, ^2.x.x --> >=2.0.0, <3.0.0 -// ^2.0, ^2.0.x --> >=2.0.0, <3.0.0 -// ^1.2, ^1.2.x --> >=1.2.0, <2.0.0 -// ^1.2.3 --> >=1.2.3, <2.0.0 -// ^1.2.0 --> >=1.2.0, <2.0.0 -func constraintCaret(v *Version, c *constraint) bool { - // If there is a pre-release on the version but the constraint isn't looking - // for them assume that pre-releases are not compatible. See issue 21 for - // more details. - if v.Prerelease() != "" && c.con.Prerelease() == "" { - return false - } - - if v.LessThan(c.con) { - return false - } - - if v.Major() != c.con.Major() { - return false - } - - return true -} - -var constraintRangeRegex *regexp.Regexp - -const cvRegex string = `v?([0-9|x|X|\*]+)(\.[0-9|x|X|\*]+)?(\.[0-9|x|X|\*]+)?` + - `(-([0-9A-Za-z\-]+(\.[0-9A-Za-z\-]+)*))?` + - `(\+([0-9A-Za-z\-]+(\.[0-9A-Za-z\-]+)*))?` - -func isX(x string) bool { - switch x { - case "x", "*", "X": - return true - default: - return false - } -} - -func rewriteRange(i string) string { - m := constraintRangeRegex.FindAllStringSubmatch(i, -1) - if m == nil { - return i - } - o := i - for _, v := range m { - t := fmt.Sprintf(">= %s, <= %s", v[1], v[11]) - o = strings.Replace(o, v[0], t, 1) - } - - return o -} diff --git a/vendor/github.com/Masterminds/semver/doc.go b/vendor/github.com/Masterminds/semver/doc.go deleted file mode 100644 index 6a6c24c6..00000000 --- a/vendor/github.com/Masterminds/semver/doc.go +++ /dev/null @@ -1,115 +0,0 @@ -/* -Package semver provides the ability to work with Semantic Versions (http://semver.org) in Go. 
- -Specifically it provides the ability to: - - * Parse semantic versions - * Sort semantic versions - * Check if a semantic version fits within a set of constraints - * Optionally work with a `v` prefix - -Parsing Semantic Versions - -To parse a semantic version use the `NewVersion` function. For example, - - v, err := semver.NewVersion("1.2.3-beta.1+build345") - -If there is an error the version wasn't parseable. The version object has methods -to get the parts of the version, compare it to other versions, convert the -version back into a string, and get the original string. For more details -please see the documentation at https://godoc.org/github.com/Masterminds/semver. - -Sorting Semantic Versions - -A set of versions can be sorted using the `sort` package from the standard library. -For example, - - raw := []string{"1.2.3", "1.0", "1.3", "2", "0.4.2",} - vs := make([]*semver.Version, len(raw)) - for i, r := range raw { - v, err := semver.NewVersion(r) - if err != nil { - t.Errorf("Error parsing version: %s", err) - } - - vs[i] = v - } - - sort.Sort(semver.Collection(vs)) - -Checking Version Constraints - -Checking a version against version constraints is one of the most featureful -parts of the package. - - c, err := semver.NewConstraint(">= 1.2.3") - if err != nil { - // Handle constraint not being parseable. - } - - v, err := semver.NewVersion("1.3") - if err != nil { - // Handle version not being parseable. - } - // Check if the version meets the constraints. The a variable will be true. - a := c.Check(v) - -Basic Comparisons - -There are two elements to the comparisons. First, a comparison string is a list -of comma separated and comparisons. These are then separated by || separated or -comparisons. For example, `">= 1.2, < 3.0.0 || >= 4.2.3"` is looking for a -comparison that's greater than or equal to 1.2 and less than 3.0.0 or is -greater than or equal to 4.2.3. 
- -The basic comparisons are: - - * `=`: equal (aliased to no operator) - * `!=`: not equal - * `>`: greater than - * `<`: less than - * `>=`: greater than or equal to - * `<=`: less than or equal to - -Hyphen Range Comparisons - -There are multiple methods to handle ranges and the first is hyphens ranges. -These look like: - - * `1.2 - 1.4.5` which is equivalent to `>= 1.2, <= 1.4.5` - * `2.3.4 - 4.5` which is equivalent to `>= 2.3.4, <= 4.5` - -Wildcards In Comparisons - -The `x`, `X`, and `*` characters can be used as a wildcard character. This works -for all comparison operators. When used on the `=` operator it falls -back to the pack level comparison (see tilde below). For example, - - * `1.2.x` is equivalent to `>= 1.2.0, < 1.3.0` - * `>= 1.2.x` is equivalent to `>= 1.2.0` - * `<= 2.x` is equivalent to `<= 3` - * `*` is equivalent to `>= 0.0.0` - -Tilde Range Comparisons (Patch) - -The tilde (`~`) comparison operator is for patch level ranges when a minor -version is specified and major level changes when the minor number is missing. -For example, - - * `~1.2.3` is equivalent to `>= 1.2.3, < 1.3.0` - * `~1` is equivalent to `>= 1, < 2` - * `~2.3` is equivalent to `>= 2.3, < 2.4` - * `~1.2.x` is equivalent to `>= 1.2.0, < 1.3.0` - * `~1.x` is equivalent to `>= 1, < 2` - -Caret Range Comparisons (Major) - -The caret (`^`) comparison operator is for major level changes. This is useful -when comparisons of API versions as a major change is API breaking. 
For example, - - * `^1.2.3` is equivalent to `>= 1.2.3, < 2.0.0` - * `^1.2.x` is equivalent to `>= 1.2.0, < 2.0.0` - * `^2.3` is equivalent to `>= 2.3, < 3` - * `^2.x` is equivalent to `>= 2.0.0, < 3` -*/ -package semver diff --git a/vendor/github.com/Masterminds/semver/version.go b/vendor/github.com/Masterminds/semver/version.go deleted file mode 100644 index 400d4f93..00000000 --- a/vendor/github.com/Masterminds/semver/version.go +++ /dev/null @@ -1,425 +0,0 @@ -package semver - -import ( - "bytes" - "encoding/json" - "errors" - "fmt" - "regexp" - "strconv" - "strings" -) - -// The compiled version of the regex created at init() is cached here so it -// only needs to be created once. -var versionRegex *regexp.Regexp -var validPrereleaseRegex *regexp.Regexp - -var ( - // ErrInvalidSemVer is returned a version is found to be invalid when - // being parsed. - ErrInvalidSemVer = errors.New("Invalid Semantic Version") - - // ErrInvalidMetadata is returned when the metadata is an invalid format - ErrInvalidMetadata = errors.New("Invalid Metadata string") - - // ErrInvalidPrerelease is returned when the pre-release is an invalid format - ErrInvalidPrerelease = errors.New("Invalid Prerelease string") -) - -// SemVerRegex is the regular expression used to parse a semantic version. -const SemVerRegex string = `v?([0-9]+)(\.[0-9]+)?(\.[0-9]+)?` + - `(-([0-9A-Za-z\-]+(\.[0-9A-Za-z\-]+)*))?` + - `(\+([0-9A-Za-z\-]+(\.[0-9A-Za-z\-]+)*))?` - -// ValidPrerelease is the regular expression which validates -// both prerelease and metadata values. -const ValidPrerelease string = `^([0-9A-Za-z\-]+(\.[0-9A-Za-z\-]+)*)$` - -// Version represents a single semantic version. 
-type Version struct { - major, minor, patch int64 - pre string - metadata string - original string -} - -func init() { - versionRegex = regexp.MustCompile("^" + SemVerRegex + "$") - validPrereleaseRegex = regexp.MustCompile(ValidPrerelease) -} - -// NewVersion parses a given version and returns an instance of Version or -// an error if unable to parse the version. -func NewVersion(v string) (*Version, error) { - m := versionRegex.FindStringSubmatch(v) - if m == nil { - return nil, ErrInvalidSemVer - } - - sv := &Version{ - metadata: m[8], - pre: m[5], - original: v, - } - - var temp int64 - temp, err := strconv.ParseInt(m[1], 10, 64) - if err != nil { - return nil, fmt.Errorf("Error parsing version segment: %s", err) - } - sv.major = temp - - if m[2] != "" { - temp, err = strconv.ParseInt(strings.TrimPrefix(m[2], "."), 10, 64) - if err != nil { - return nil, fmt.Errorf("Error parsing version segment: %s", err) - } - sv.minor = temp - } else { - sv.minor = 0 - } - - if m[3] != "" { - temp, err = strconv.ParseInt(strings.TrimPrefix(m[3], "."), 10, 64) - if err != nil { - return nil, fmt.Errorf("Error parsing version segment: %s", err) - } - sv.patch = temp - } else { - sv.patch = 0 - } - - return sv, nil -} - -// MustParse parses a given version and panics on error. -func MustParse(v string) *Version { - sv, err := NewVersion(v) - if err != nil { - panic(err) - } - return sv -} - -// String converts a Version object to a string. -// Note, if the original version contained a leading v this version will not. -// See the Original() method to retrieve the original value. Semantic Versions -// don't contain a leading v per the spec. Instead it's optional on -// implementation. 
-func (v *Version) String() string { - var buf bytes.Buffer - - fmt.Fprintf(&buf, "%d.%d.%d", v.major, v.minor, v.patch) - if v.pre != "" { - fmt.Fprintf(&buf, "-%s", v.pre) - } - if v.metadata != "" { - fmt.Fprintf(&buf, "+%s", v.metadata) - } - - return buf.String() -} - -// Original returns the original value passed in to be parsed. -func (v *Version) Original() string { - return v.original -} - -// Major returns the major version. -func (v *Version) Major() int64 { - return v.major -} - -// Minor returns the minor version. -func (v *Version) Minor() int64 { - return v.minor -} - -// Patch returns the patch version. -func (v *Version) Patch() int64 { - return v.patch -} - -// Prerelease returns the pre-release version. -func (v *Version) Prerelease() string { - return v.pre -} - -// Metadata returns the metadata on the version. -func (v *Version) Metadata() string { - return v.metadata -} - -// originalVPrefix returns the original 'v' prefix if any. -func (v *Version) originalVPrefix() string { - - // Note, only lowercase v is supported as a prefix by the parser. - if v.original != "" && v.original[:1] == "v" { - return v.original[:1] - } - return "" -} - -// IncPatch produces the next patch version. -// If the current version does not have prerelease/metadata information, -// it unsets metadata and prerelease values, increments patch number. -// If the current version has any of prerelease or metadata information, -// it unsets both values and keeps curent patch value -func (v Version) IncPatch() Version { - vNext := v - // according to http://semver.org/#spec-item-9 - // Pre-release versions have a lower precedence than the associated normal version. - // according to http://semver.org/#spec-item-10 - // Build metadata SHOULD be ignored when determining version precedence. 
- if v.pre != "" { - vNext.metadata = "" - vNext.pre = "" - } else { - vNext.metadata = "" - vNext.pre = "" - vNext.patch = v.patch + 1 - } - vNext.original = v.originalVPrefix() + "" + vNext.String() - return vNext -} - -// IncMinor produces the next minor version. -// Sets patch to 0. -// Increments minor number. -// Unsets metadata. -// Unsets prerelease status. -func (v Version) IncMinor() Version { - vNext := v - vNext.metadata = "" - vNext.pre = "" - vNext.patch = 0 - vNext.minor = v.minor + 1 - vNext.original = v.originalVPrefix() + "" + vNext.String() - return vNext -} - -// IncMajor produces the next major version. -// Sets patch to 0. -// Sets minor to 0. -// Increments major number. -// Unsets metadata. -// Unsets prerelease status. -func (v Version) IncMajor() Version { - vNext := v - vNext.metadata = "" - vNext.pre = "" - vNext.patch = 0 - vNext.minor = 0 - vNext.major = v.major + 1 - vNext.original = v.originalVPrefix() + "" + vNext.String() - return vNext -} - -// SetPrerelease defines the prerelease value. -// Value must not include the required 'hypen' prefix. -func (v Version) SetPrerelease(prerelease string) (Version, error) { - vNext := v - if len(prerelease) > 0 && !validPrereleaseRegex.MatchString(prerelease) { - return vNext, ErrInvalidPrerelease - } - vNext.pre = prerelease - vNext.original = v.originalVPrefix() + "" + vNext.String() - return vNext, nil -} - -// SetMetadata defines metadata value. -// Value must not include the required 'plus' prefix. -func (v Version) SetMetadata(metadata string) (Version, error) { - vNext := v - if len(metadata) > 0 && !validPrereleaseRegex.MatchString(metadata) { - return vNext, ErrInvalidMetadata - } - vNext.metadata = metadata - vNext.original = v.originalVPrefix() + "" + vNext.String() - return vNext, nil -} - -// LessThan tests if one version is less than another one. 
-func (v *Version) LessThan(o *Version) bool { - return v.Compare(o) < 0 -} - -// GreaterThan tests if one version is greater than another one. -func (v *Version) GreaterThan(o *Version) bool { - return v.Compare(o) > 0 -} - -// Equal tests if two versions are equal to each other. -// Note, versions can be equal with different metadata since metadata -// is not considered part of the comparable version. -func (v *Version) Equal(o *Version) bool { - return v.Compare(o) == 0 -} - -// Compare compares this version to another one. It returns -1, 0, or 1 if -// the version smaller, equal, or larger than the other version. -// -// Versions are compared by X.Y.Z. Build metadata is ignored. Prerelease is -// lower than the version without a prerelease. -func (v *Version) Compare(o *Version) int { - // Compare the major, minor, and patch version for differences. If a - // difference is found return the comparison. - if d := compareSegment(v.Major(), o.Major()); d != 0 { - return d - } - if d := compareSegment(v.Minor(), o.Minor()); d != 0 { - return d - } - if d := compareSegment(v.Patch(), o.Patch()); d != 0 { - return d - } - - // At this point the major, minor, and patch versions are the same. - ps := v.pre - po := o.Prerelease() - - if ps == "" && po == "" { - return 0 - } - if ps == "" { - return 1 - } - if po == "" { - return -1 - } - - return comparePrerelease(ps, po) -} - -// UnmarshalJSON implements JSON.Unmarshaler interface. -func (v *Version) UnmarshalJSON(b []byte) error { - var s string - if err := json.Unmarshal(b, &s); err != nil { - return err - } - temp, err := NewVersion(s) - if err != nil { - return err - } - v.major = temp.major - v.minor = temp.minor - v.patch = temp.patch - v.pre = temp.pre - v.metadata = temp.metadata - v.original = temp.original - temp = nil - return nil -} - -// MarshalJSON implements JSON.Marshaler interface. 
-func (v *Version) MarshalJSON() ([]byte, error) { - return json.Marshal(v.String()) -} - -func compareSegment(v, o int64) int { - if v < o { - return -1 - } - if v > o { - return 1 - } - - return 0 -} - -func comparePrerelease(v, o string) int { - - // split the prelease versions by their part. The separator, per the spec, - // is a . - sparts := strings.Split(v, ".") - oparts := strings.Split(o, ".") - - // Find the longer length of the parts to know how many loop iterations to - // go through. - slen := len(sparts) - olen := len(oparts) - - l := slen - if olen > slen { - l = olen - } - - // Iterate over each part of the prereleases to compare the differences. - for i := 0; i < l; i++ { - // Since the lentgh of the parts can be different we need to create - // a placeholder. This is to avoid out of bounds issues. - stemp := "" - if i < slen { - stemp = sparts[i] - } - - otemp := "" - if i < olen { - otemp = oparts[i] - } - - d := comparePrePart(stemp, otemp) - if d != 0 { - return d - } - } - - // Reaching here means two versions are of equal value but have different - // metadata (the part following a +). They are not identical in string form - // but the version comparison finds them to be equal. - return 0 -} - -func comparePrePart(s, o string) int { - // Fastpath if they are equal - if s == o { - return 0 - } - - // When s or o are empty we can use the other in an attempt to determine - // the response. - if s == "" { - if o != "" { - return -1 - } - return 1 - } - - if o == "" { - if s != "" { - return 1 - } - return -1 - } - - // When comparing strings "99" is greater than "103". To handle - // cases like this we need to detect numbers and compare them. According - // to the semver spec, numbers are always positive. If there is a - at the - // start like -99 this is to be evaluated as an alphanum. numbers always - // have precedence over alphanum. Parsing as Uints because negative numbers - // are ignored. 
- - oi, n1 := strconv.ParseUint(o, 10, 64) - si, n2 := strconv.ParseUint(s, 10, 64) - - // The case where both are strings compare the strings - if n1 != nil && n2 != nil { - if s > o { - return 1 - } - return -1 - } else if n1 != nil { - // o is a string and s is a number - return -1 - } else if n2 != nil { - // s is a string and o is a number - return 1 - } - // Both are numbers - if si > oi { - return 1 - } - return -1 - -} diff --git a/vendor/github.com/Masterminds/semver/version_fuzz.go b/vendor/github.com/Masterminds/semver/version_fuzz.go deleted file mode 100644 index b42bcd62..00000000 --- a/vendor/github.com/Masterminds/semver/version_fuzz.go +++ /dev/null @@ -1,10 +0,0 @@ -// +build gofuzz - -package semver - -func Fuzz(data []byte) int { - if _, err := NewVersion(string(data)); err != nil { - return 0 - } - return 1 -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/admin.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/admin.go deleted file mode 100644 index 9fc27b10..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/admin.go +++ /dev/null @@ -1,318 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package dcgm - -/* -#cgo linux LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files -#cgo darwin LDFLAGS: -ldl -Wl,-undefined,dynamic_lookup - - -#include -#include "./dcgm_agent.h" -#include "./dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "io/ioutil" - "log" - "os" - "os/exec" - "strconv" - "strings" - "syscall" - "unsafe" - - "github.com/Masterminds/semver" -) - -type mode int - -// const for DCGM hostengine running modes: Embedded, Standalone or StartHostengine -const ( - Embedded mode = iota - Standalone - StartHostengine -) - -type dcgmHandle struct{ handle C.dcgmHandle_t } - -var ( - dcgmLibHandle unsafe.Pointer - stopMode mode - handle dcgmHandle - hostengineAsChildPid int -) - -func initDcgm(m mode, args ...string) (err error) { - const ( - dcgmLib = "libdcgm.so" - ) - lib := C.CString(dcgmLib) - defer freeCString(lib) - - dcgmLibHandle = C.dlopen(lib, C.RTLD_LAZY|C.RTLD_GLOBAL) - if dcgmLibHandle == nil { - return fmt.Errorf("%s not Found", dcgmLib) - } - - // set the stopMode for shutdown() - stopMode = m - - switch m { - case Embedded: - return startEmbedded() - case Standalone: - return connectStandalone(args...) 
- case StartHostengine: - return startHostengine() - } - - return nil -} - -func shutdown() (err error) { - switch stopMode { - case Embedded: - err = stopEmbedded() - case Standalone: - err = disconnectStandalone() - case StartHostengine: - err = stopHostengine() - } - - C.dlclose(dcgmLibHandle) - return -} - -func startEmbedded() (err error) { - result := C.dcgmInit() - if err = errorString(result); err != nil { - return fmt.Errorf("Error initializing DCGM: %s", err) - } - - var cHandle C.dcgmHandle_t - result = C.dcgmStartEmbedded(C.DCGM_OPERATION_MODE_AUTO, &cHandle) - if err = errorString(result); err != nil { - return fmt.Errorf("Error starting nv-hostengine: %s", err) - } - handle = dcgmHandle{cHandle} - return -} - -func stopEmbedded() (err error) { - result := C.dcgmStopEmbedded(handle.handle) - if err = errorString(result); err != nil { - return fmt.Errorf("Error stopping nv-hostengine: %s", err) - } - - result = C.dcgmShutdown() - if err = errorString(result); err != nil { - return fmt.Errorf("Error shutting down DCGM: %s", err) - } - return -} - -func connectStandalone(args ...string) (err error) { - if len(args) < 2 { - return fmt.Errorf("Missing dcgm address and / or port") - } - - result := C.dcgmInit() - if err = errorString(result); err != nil { - return fmt.Errorf("Error initializing DCGM: %s", err) - } - - var cHandle C.dcgmHandle_t - addr := C.CString(args[0]) - defer freeCString(addr) - var connectParams C.dcgmConnectV2Params_t - connectParams.version = makeVersion2(unsafe.Sizeof(connectParams)) - - sck, err := strconv.ParseUint(args[1], 10, 32) - if err != nil { - return fmt.Errorf("Error parsing %s: %v\n", args[1], err) - } - connectParams.addressIsUnixSocket = C.uint(sck) - - result = C.dcgmConnect_v2(addr, &connectParams, &cHandle) - if err = errorString(result); err != nil { - return fmt.Errorf("Error connecting to nv-hostengine: %s", err) - } - - handle = dcgmHandle{cHandle} - - // This check is disabled for now - /* - err = 
checkHostengineVersion() - if err != nil { - return fmt.Errorf("Error connecting to remote nv-hostengine: %s", err) - } - */ - - return -} - -func disconnectStandalone() (err error) { - result := C.dcgmDisconnect(handle.handle) - if err = errorString(result); err != nil { - return fmt.Errorf("Error disconnecting from nv-hostengine: %s", err) - } - - result = C.dcgmShutdown() - if err = errorString(result); err != nil { - return fmt.Errorf("Error shutting down DCGM: %s", err) - } - return -} - -func startHostengine() (err error) { - bin, err := exec.LookPath("nv-hostengine") - if err != nil { - return fmt.Errorf("Error finding nv-hostengine: %s", err) - } - var procAttr syscall.ProcAttr - procAttr.Files = []uintptr{ - uintptr(syscall.Stdin), - uintptr(syscall.Stdout), - uintptr(syscall.Stderr)} - procAttr.Sys = &syscall.SysProcAttr{Setpgid: true} - - dir := "/tmp" - tmpfile, err := ioutil.TempFile(dir, "dcgm") - if err != nil { - return fmt.Errorf("Error creating temporary file in %s directory: %s", dir, err) - } - socketPath := tmpfile.Name() - defer os.Remove(socketPath) - - connectArg := "--domain-socket" - hostengineAsChildPid, err = syscall.ForkExec(bin, []string{bin, connectArg, socketPath}, &procAttr) - if err != nil { - return fmt.Errorf("Error fork-execing nv-hostengine: %s", err) - } - - result := C.dcgmInit() - if err = errorString(result); err != nil { - return fmt.Errorf("Error initializing DCGM: %s", err) - } - - var cHandle C.dcgmHandle_t - var connectParams C.dcgmConnectV2Params_t - connectParams.version = makeVersion2(unsafe.Sizeof(connectParams)) - isSocket := C.uint(1) - connectParams.addressIsUnixSocket = isSocket - cSockPath := C.CString(socketPath) - defer freeCString(cSockPath) - result = C.dcgmConnect_v2(cSockPath, &connectParams, &cHandle) - if err = errorString(result); err != nil { - return fmt.Errorf("Error connecting to nv-hostengine: %s", err) - } - - handle = dcgmHandle{cHandle} - return -} - -func stopHostengine() (err error) { - if 
err = disconnectStandalone(); err != nil { - return - } - - // terminate nv-hostengine - cmd := exec.Command("nv-hostengine", "--term") - if err = cmd.Run(); err != nil { - return fmt.Errorf("Error terminating nv-hostengine: %s", err) - } - log.Println("Successfully terminated nv-hostengine.") - - return syscall.Kill(hostengineAsChildPid, syscall.SIGKILL) -} - -func checkHostengineVersion() (err error) { - var hostEngineVersionInfo C.dcgmVersionInfo_t - hostEngineVersionInfo.version = makeVersion2(unsafe.Sizeof(hostEngineVersionInfo)) - result := C.dcgmHostengineVersionInfo(handle.handle, &hostEngineVersionInfo) - if err = errorString(result); err != nil { - return fmt.Errorf("Could not retrieve running hostengine version: %s", err) - } - - var versionInfo C.dcgmVersionInfo_t - versionInfo.version = makeVersion2(unsafe.Sizeof(versionInfo)) - result = C.dcgmVersionInfo(&versionInfo) - if err = errorString(result); err != nil { - return fmt.Errorf("Could not retrieve dcgm version: %s", err) - } - - /* Version string looks like: "version:2.1.2;arch:x86_64;buildtype:Debug; - * buildid:;builddate:2021-03-03;commit:v2.1.1-5-gc27ab30f;branch:master; - * buildplatform:Linux 5.4.0-66-generic #74~18.04.2-Ubuntu SMP Fri Feb 5 - * 11:17:31 UTC 2021 x86_64;;crc:bd60aadd63245021163ef008d0907ae7" - */ - heVersionStr := C.GoString(&hostEngineVersionInfo.rawBuildInfoString[0]) - myVersionStr := C.GoString(&versionInfo.rawBuildInfoString[0]) - var foundVersion = false - - he := strings.Split(heVersionStr, ";") - - // Find version pair within build information - for _, line := range he { - if strings.HasPrefix(line, "version:") { - heVersionStr = line - foundVersion = true - } - } - - if foundVersion == false { - return fmt.Errorf("Could not determine remote version") - } - - foundVersion = false - my := strings.Split(myVersionStr, ";") - - for _, line := range my { - if strings.HasPrefix(line, "version:") { - myVersionStr = line - foundVersion = true - } - } - - if foundVersion == 
false { - return fmt.Errorf("Could not determine local version") - } - - // Parse out version and compare - he = strings.Split(heVersionStr, ":") - my = strings.Split(myVersionStr, ":") - - if (len(he) != 2) && (len(my) != 2) { - return fmt.Errorf("Could not parse versions") - } - - heVersion, err := semver.NewVersion(he[1]) - if err != nil { - return fmt.Errorf("Could not determine remote version ", err) - } - myVersion, err := semver.NewVersion(my[1]) - if err != nil { - return fmt.Errorf("Could not determine local version ", err) - } - if heVersion.Major() != myVersion.Major() { - return fmt.Errorf("remote %v != local %v", he[1], my[1]) - } - - return -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/api.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/api.go deleted file mode 100644 index 05a446da..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/api.go +++ /dev/null @@ -1,108 +0,0 @@ -package dcgm - -import ( - "fmt" - "os" - "sync" -) - -var ( - dcgmInitCounter int - mux sync.Mutex -) - -// Init starts DCGM, based on the user selected mode -// DCGM can be started in 3 differengt modes: -// 1. Embedded: Start hostengine within this process -// 2. Standalone: Connect to an already running nv-hostengine at the specified address -// Connection address can be passed as command line args: -connect "IP:PORT/Socket" -socket "isSocket" -// 3. StartHostengine: Open an Unix socket to start and connect to the nv-hostengine and terminate before exiting -func Init(m mode, args ...string) (cleanup func(), err error) { - mux.Lock() - if dcgmInitCounter < 0 { - count := fmt.Sprintf("%d", dcgmInitCounter) - err = fmt.Errorf("Shutdown() is called %s times, before Init()", count[1:]) - } - if dcgmInitCounter == 0 { - err = initDcgm(m, args...) 
- } - dcgmInitCounter += 1 - mux.Unlock() - - return func() { - if err := Shutdown(); err != nil { - fmt.Fprintf(os.Stderr, "Failed to shutdown DCGM with error: `%v`", err) - } - }, err -} - -// Shutdown stops DCGM and destroy all connections -func Shutdown() (err error) { - mux.Lock() - if dcgmInitCounter <= 0 { - err = fmt.Errorf("Init() needs to be called before Shutdown()") - } - if dcgmInitCounter == 1 { - err = shutdown() - } - dcgmInitCounter -= 1 - mux.Unlock() - - return -} - -// GetAllDeviceCount counts all GPUs on the system -func GetAllDeviceCount() (uint, error) { - return getAllDeviceCount() -} - -// GetSupportedDevices returns only DCGM supported GPUs -func GetSupportedDevices() ([]uint, error) { - return getSupportedDevices() -} - -// GetDeviceInfo describes the given device -func GetDeviceInfo(gpuId uint) (Device, error) { - return getDeviceInfo(gpuId) -} - -// GetDeviceStatus monitors GPU status including its power, memory and GPU utilization -func GetDeviceStatus(gpuId uint) (DeviceStatus, error) { - return latestValuesForDevice(gpuId) -} - -// GetDeviceTopology returns device topology corresponding to the gpuId -func GetDeviceTopology(gpuId uint) ([]P2PLink, error) { - return getDeviceTopology(gpuId) -} - -// WatchPidFields lets DCGM start recording stats for GPU process -// It needs to be called before calling GetProcessInfo -func WatchPidFields() (GroupHandle, error) { - return watchPidFields() -} - -// GetProcessInfo provides detailed per GPU stats for this process -func GetProcessInfo(group GroupHandle, pid uint) ([]ProcessInfo, error) { - return getProcessInfo(group, pid) -} - -// HealthCheckByGpuId monitors GPU health for any errors/failures/warnings -func HealthCheckByGpuId(gpuId uint) (DeviceHealth, error) { - return healthCheckByGpuId(gpuId) -} - -// Policy sets GPU usage and error policies and notifies in case of any violations via callback functions -func Policy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error) { - 
return registerPolicy(gpuId, typ...) -} - -// Introspect returns DCGM hostengine memory and CPU usage -func Introspect() (DcgmStatus, error) { - return introspect() -} - -// Get all of the profiling metric groups for a given GPU group. -func GetSupportedMetricGroups(grpid uint) ([]MetricGroup, error) { - return getSupportedMetricGroups(grpid) -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/bcast.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/bcast.go deleted file mode 100644 index 03ac70b1..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/bcast.go +++ /dev/null @@ -1,84 +0,0 @@ -package dcgm - -import ( - "fmt" - "sync" -) - -type publisher struct { - publish chan interface{} - close chan bool - subscribers []*subscriber - subscriberLock sync.Mutex -} - -type subscriber struct { - read chan interface{} - close chan bool -} - -func newPublisher() *publisher { - pub := &publisher{ - publish: make(chan interface{}), - close: make(chan bool), - } - return pub -} - -func (p *publisher) subscriberList() []*subscriber { - p.subscriberLock.Lock() - defer p.subscriberLock.Unlock() - return p.subscribers[:] -} - -func (p *publisher) add() *subscriber { - p.subscriberLock.Lock() - defer p.subscriberLock.Unlock() - newSub := &subscriber{ - read: make(chan interface{}), - close: make(chan bool), - } - p.subscribers = append(p.subscribers, newSub) - return newSub -} - -func (p *publisher) remove(leaving *subscriber) error { - p.subscriberLock.Lock() - defer p.subscriberLock.Unlock() - subscriberIndex := -1 - for i, sub := range p.subscribers { - if sub == leaving { - subscriberIndex = i - break - } - } - if subscriberIndex == -1 { - return fmt.Errorf("Could not find subscriber") - } - go func() { leaving.close <- true }() - p.subscribers = append(p.subscribers[:subscriberIndex], p.subscribers[subscriberIndex+1:]...) 
- return nil -} - -func (p *publisher) send(val interface{}) { - p.publish <- val -} - -func (p *publisher) broadcast() { - for { - select { - case publishing := <-p.publish: - for _, sub := range p.subscriberList() { - go func(s *subscriber, val interface{}) { - s.read <- val - }(sub, publishing) - } - case <-p.close: - return - } - } -} - -func (p *publisher) closePublisher() { - p.close <- true -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/callback.c b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/callback.c deleted file mode 100644 index 5bc2fc2b..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/callback.c +++ /dev/null @@ -1,4 +0,0 @@ -int violationNotify(void* p) { - int ViolationRegistration(void*); - return ViolationRegistration(p); -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/const.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/const.go deleted file mode 100644 index 92fdd925..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/const.go +++ /dev/null @@ -1,791 +0,0 @@ -package dcgm - -import "C" - -type Short C.ushort - -type FieldValue_v1 struct { - Version uint - FieldId uint - FieldType uint - Status int - Ts int64 - Value [4096]byte -} - -type FieldValue_v2 struct { - Version uint - EntityGroupId Field_Entity_Group - EntityId uint - FieldId uint - FieldType uint - Status int - Ts int64 - Value [4096]byte - StringValue *string -} - -const ( - DCGM_FT_BINARY = uint('b') - DCGM_FT_DOUBLE = uint('d') - DCGM_FT_INT64 = uint('i') - DCGM_FT_STRING = uint('s') - DCGM_FT_TIMESTAMP = uint('t') - DCGM_FT_INT32_BLANK = int64(2147483632) - DCGM_FT_INT32_NOT_FOUND = int64(DCGM_FT_INT32_BLANK + 1) - DCGM_FT_INT32_NOT_SUPPORTED = int64(DCGM_FT_INT32_BLANK + 2) - DCGM_FT_INT32_NOT_PERMISSIONED = int64(DCGM_FT_INT32_BLANK + 3) - DCGM_FT_INT64_BLANK = int64(9223372036854775792) - DCGM_FT_INT64_NOT_FOUND = 
int64(DCGM_FT_INT64_BLANK + 1) - DCGM_FT_INT64_NOT_SUPPORTED = int64(DCGM_FT_INT64_BLANK + 2) - DCGM_FT_INT64_NOT_PERMISSIONED = int64(DCGM_FT_INT64_BLANK + 3) - DCGM_FT_FP64_BLANK = 140737488355328.0 - DCGM_FT_FP64_NOT_FOUND = float64(DCGM_FT_FP64_BLANK + 1.0) - DCGM_FT_FP64_NOT_SUPPORTED = float64(DCGM_FT_FP64_BLANK + 2.0) - DCGM_FT_FP64_NOT_PERMISSIONED = float64(DCGM_FT_FP64_BLANK + 3.0) - DCGM_FT_STR_BLANK = "<<>>" - DCGM_FT_STR_NOT_FOUND = "<<>>" - DCGM_FT_STR_NOT_SUPPORTED = "<<>>" - DCGM_FT_STR_NOT_PERMISSIONED = "<<>>" - - DCGM_FI_UNKNOWN = 0 - DCGM_FI_DRIVER_VERSION = 1 - DCGM_FI_NVML_VERSION = 2 - DCGM_FI_PROCESS_NAME = 3 - DCGM_FI_DEV_COUNT = 4 - DCGM_FI_DEV_NAME = 50 - DCGM_FI_DEV_BRAND = 51 - DCGM_FI_DEV_NVML_INDEX = 52 - DCGM_FI_DEV_SERIAL = 53 - DCGM_FI_DEV_UUID = 54 - DCGM_FI_DEV_MINOR_NUMBER = 55 - DCGM_FI_DEV_OEM_INFOROM_VER = 56 - DCGM_FI_DEV_PCI_BUSID = 57 - DCGM_FI_DEV_PCI_COMBINED_ID = 58 - DCGM_FI_DEV_PCI_SUBSYS_ID = 59 - DCGM_FI_GPU_TOPOLOGY_PCI = 60 - DCGM_FI_GPU_TOPOLOGY_NVLINK = 61 - DCGM_FI_GPU_TOPOLOGY_AFFINITY = 62 - DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY = 63 - DCGM_FI_DEV_COMPUTE_MODE = 65 - DCGM_FI_DEV_CPU_AFFINITY_0 = 70 - DCGM_FI_DEV_CPU_AFFINITY_1 = 71 - DCGM_FI_DEV_CPU_AFFINITY_2 = 72 - DCGM_FI_DEV_CPU_AFFINITY_3 = 73 - DCGM_FI_DEV_ECC_INFOROM_VER = 80 - DCGM_FI_DEV_POWER_INFOROM_VER = 81 - DCGM_FI_DEV_INFOROM_IMAGE_VER = 82 - DCGM_FI_DEV_INFOROM_CONFIG_CHECK = 83 - DCGM_FI_DEV_INFOROM_CONFIG_VALID = 84 - DCGM_FI_DEV_VBIOS_VERSION = 85 - DCGM_FI_DEV_BAR1_TOTAL = 90 - DCGM_FI_SYNC_BOOST = 91 - DCGM_FI_DEV_BAR1_USED = 92 - DCGM_FI_DEV_BAR1_FREE = 93 - DCGM_FI_DEV_SM_CLOCK = 100 - DCGM_FI_DEV_MEM_CLOCK = 101 - DCGM_FI_DEV_VIDEO_CLOCK = 102 - DCGM_FI_DEV_APP_SM_CLOCK = 110 - DCGM_FI_DEV_APP_MEM_CLOCK = 111 - DCGM_FI_DEV_CLOCK_THROTTLE_REASONS = 112 - DCGM_FI_DEV_MAX_SM_CLOCK = 113 - DCGM_FI_DEV_MAX_MEM_CLOCK = 114 - DCGM_FI_DEV_MAX_VIDEO_CLOCK = 115 - DCGM_FI_DEV_AUTOBOOST = 120 - DCGM_FI_DEV_SUPPORTED_CLOCKS = 130 - 
DCGM_FI_DEV_MEMORY_TEMP = 140 - DCGM_FI_DEV_GPU_TEMP = 150 - DCGM_FI_DEV_POWER_USAGE = 155 - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 156 - DCGM_FI_DEV_SLOWDOWN_TEMP = 158 - DCGM_FI_DEV_SHUTDOWN_TEMP = 159 - DCGM_FI_DEV_POWER_MGMT_LIMIT = 160 - DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN = 161 - DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX = 162 - DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF = 163 - DCGM_FI_DEV_ENFORCED_POWER_LIMIT = 164 - DCGM_FI_DEV_PSTATE = 190 - DCGM_FI_DEV_FAN_SPEED = 191 - DCGM_FI_DEV_PCIE_TX_THROUGHPUT = 200 - DCGM_FI_DEV_PCIE_RX_THROUGHPUT = 201 - DCGM_FI_DEV_PCIE_REPLAY_COUNTER = 202 - DCGM_FI_DEV_GPU_UTIL = 203 - DCGM_FI_DEV_MEM_COPY_UTIL = 204 - DCGM_FI_DEV_ACCOUNTING_DATA = 205 - DCGM_FI_DEV_ENC_UTIL = 206 - DCGM_FI_DEV_DEC_UTIL = 207 - DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES = 210 - DCGM_FI_DEV_GPU_UTIL_SAMPLES = 211 - DCGM_FI_DEV_GRAPHICS_PIDS = 220 - DCGM_FI_DEV_COMPUTE_PIDS = 221 - DCGM_FI_DEV_XID_ERRORS = 230 - DCGM_FI_DEV_PCIE_MAX_LINK_GEN = 235 - DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH = 236 - DCGM_FI_DEV_PCIE_LINK_GEN = 237 - DCGM_FI_DEV_PCIE_LINK_WIDTH = 238 - DCGM_FI_DEV_POWER_VIOLATION = 240 - DCGM_FI_DEV_THERMAL_VIOLATION = 241 - DCGM_FI_DEV_SYNC_BOOST_VIOLATION = 242 - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION = 243 - DCGM_FI_DEV_LOW_UTIL_VIOLATION = 244 - DCGM_FI_DEV_RELIABILITY_VIOLATION = 245 - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION = 246 - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION = 247 - DCGM_FI_DEV_FB_TOTAL = 250 - DCGM_FI_DEV_FB_FREE = 251 - DCGM_FI_DEV_FB_USED = 252 - DCGM_FI_DEV_ECC_CURRENT = 300 - DCGM_FI_DEV_ECC_PENDING = 301 - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL = 310 - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL = 311 - DCGM_FI_DEV_ECC_SBE_AGG_TOTAL = 312 - DCGM_FI_DEV_ECC_DBE_AGG_TOTAL = 313 - DCGM_FI_DEV_ECC_SBE_VOL_L1 = 314 - DCGM_FI_DEV_ECC_DBE_VOL_L1 = 315 - DCGM_FI_DEV_ECC_SBE_VOL_L2 = 316 - DCGM_FI_DEV_ECC_DBE_VOL_L2 = 317 - DCGM_FI_DEV_ECC_SBE_VOL_DEV = 318 - DCGM_FI_DEV_ECC_DBE_VOL_DEV = 319 - DCGM_FI_DEV_ECC_SBE_VOL_REG = 320 - DCGM_FI_DEV_ECC_DBE_VOL_REG = 321 - 
DCGM_FI_DEV_ECC_SBE_VOL_TEX = 322 - DCGM_FI_DEV_ECC_DBE_VOL_TEX = 323 - DCGM_FI_DEV_ECC_SBE_AGG_L1 = 324 - DCGM_FI_DEV_ECC_DBE_AGG_L1 = 325 - DCGM_FI_DEV_ECC_SBE_AGG_L2 = 326 - DCGM_FI_DEV_ECC_DBE_AGG_L2 = 327 - DCGM_FI_DEV_ECC_SBE_AGG_DEV = 328 - DCGM_FI_DEV_ECC_DBE_AGG_DEV = 329 - DCGM_FI_DEV_ECC_SBE_AGG_REG = 330 - DCGM_FI_DEV_ECC_DBE_AGG_REG = 331 - DCGM_FI_DEV_ECC_SBE_AGG_TEX = 332 - DCGM_FI_DEV_ECC_DBE_AGG_TEX = 333 - DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS = 393 - DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS = 394 - DCGM_FI_DEV_ROW_REMAP_FAILURE = 395 - DCGM_FI_DEV_RETIRED_SBE = 390 - DCGM_FI_DEV_RETIRED_DBE = 391 - DCGM_FI_DEV_RETIRED_PENDING = 392 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = 400 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = 401 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = 402 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = 403 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = 404 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = 405 - DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 409 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = 410 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = 411 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = 412 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = 413 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = 414 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = 415 - DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 419 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = 420 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = 421 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = 422 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = 423 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = 424 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = 425 - DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL = 429 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 430 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 431 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 432 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 433 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 434 - 
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 435 - DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 439 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 = 440 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 = 441 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 = 442 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 = 443 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 = 444 - DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 = 445 - DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL = 449 - DCGM_FI_DEV_GPU_NVLINK_ERRORS = 450 - DCGM_FI_DEV_VIRTUAL_MODE = 500 - DCGM_FI_DEV_SUPPORTED_TYPE_INFO = 501 - DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS = 502 - DCGM_FI_DEV_VGPU_INSTANCE_IDS = 503 - DCGM_FI_DEV_VGPU_UTILIZATIONS = 504 - DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION = 505 - DCGM_FI_DEV_ENC_STATS = 506 - DCGM_FI_DEV_FBC_STATS = 507 - DCGM_FI_DEV_FBC_SESSIONS_INFO = 508 - DCGM_FI_DEV_VGPU_VM_ID = 520 - DCGM_FI_DEV_VGPU_VM_NAME = 521 - DCGM_FI_DEV_VGPU_TYPE = 522 - DCGM_FI_DEV_VGPU_UUID = 523 - DCGM_FI_DEV_VGPU_DRIVER_VERSION = 524 - DCGM_FI_DEV_VGPU_MEMORY_USAGE = 525 - DCGM_FI_DEV_VGPU_LICENSE_STATUS = 526 - DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT = 527 - DCGM_FI_DEV_VGPU_ENC_STATS = 528 - DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO = 529 - DCGM_FI_DEV_VGPU_FBC_STATS = 530 - DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO = 531 - DCGM_FI_FIRST_VGPU_FIELD_ID = 520 - DCGM_FI_LAST_VGPU_FIELD_ID = 570 - DCGM_FI_INTERNAL_FIELDS_0_START = 600 - DCGM_FI_INTERNAL_FIELDS_0_END = 699 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P00 = 700 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P00 = 701 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P00 = 702 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P00 = 703 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P01 = 704 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P01 = 705 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P01 = 706 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P01 = 707 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P02 = 708 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P02 = 709 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P02 = 710 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P02 = 711 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P03 = 712 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P03 = 713 - 
DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P03 = 714 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P03 = 715 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P04 = 716 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P04 = 717 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P04 = 718 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P04 = 719 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P05 = 720 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P05 = 721 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P05 = 722 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P05 = 723 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P06 = 724 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P06 = 725 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P06 = 726 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P06 = 727 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P07 = 728 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P07 = 729 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P07 = 730 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P07 = 731 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P08 = 732 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P08 = 733 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P08 = 734 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P08 = 735 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P09 = 736 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P09 = 737 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P09 = 738 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P09 = 739 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P10 = 740 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P10 = 741 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P10 = 742 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P10 = 743 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P11 = 744 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P11 = 745 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P11 = 746 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P11 = 747 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P12 = 748 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P12 = 749 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P12 = 750 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P12 = 751 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P13 = 752 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P13 = 753 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P13 = 754 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P13 = 755 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P14 = 756 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P14 = 757 - 
DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P14 = 758 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P14 = 759 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P15 = 760 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P15 = 761 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P15 = 762 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P15 = 763 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P16 = 764 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P16 = 765 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P16 = 766 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P16 = 767 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P17 = 768 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P17 = 769 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P17 = 770 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P17 = 771 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00 = 780 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00 = 781 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P01 = 782 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P01 = 783 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P02 = 784 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P02 = 785 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P03 = 786 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P03 = 787 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P04 = 788 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P04 = 789 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P05 = 790 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P05 = 791 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P06 = 792 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P06 = 793 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P07 = 794 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P07 = 795 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P08 = 796 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P08 = 797 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P09 = 798 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P09 = 799 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P10 = 800 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P10 = 801 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P11 = 802 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P11 = 803 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P12 = 804 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P12 = 805 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P13 = 806 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P13 = 807 - 
DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P14 = 808 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P14 = 809 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P15 = 810 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P15 = 811 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P16 = 812 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P16 = 813 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P17 = 814 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P17 = 815 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00 = 820 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P00 = 821 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P01 = 822 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P01 = 823 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P02 = 824 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P02 = 825 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P03 = 826 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P03 = 827 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P04 = 828 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P04 = 829 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P05 = 830 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P05 = 831 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P06 = 832 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P06 = 833 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P07 = 834 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P07 = 835 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P08 = 836 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P08 = 837 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P09 = 838 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P09 = 839 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P10 = 840 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P10 = 841 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P11 = 842 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P11 = 843 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P12 = 844 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P12 = 845 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P13 = 846 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P13 = 847 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P14 = 848 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P14 = 849 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P15 = 850 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P15 = 851 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P16 = 852 - 
DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P16 = 853 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P17 = 854 - DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P17 = 855 - DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS = 856 - DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS = 857 - DCGM_FI_FIRST_NVSWITCH_FIELD_ID = 700 - DCGM_FI_LAST_NVSWITCH_FIELD_ID = 860 - DCGM_FI_PROF_GR_ENGINE_ACTIVE = 1001 - DCGM_FI_PROF_SM_ACTIVE = 1002 - DCGM_FI_PROF_SM_OCCUPANCY = 1003 - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE = 1004 - DCGM_FI_PROF_DRAM_ACTIVE = 1005 - DCGM_FI_PROF_PIPE_FP64_ACTIVE = 1006 - DCGM_FI_PROF_PIPE_FP32_ACTIVE = 1007 - DCGM_FI_PROF_PIPE_FP16_ACTIVE = 1008 - DCGM_FI_PROF_PCIE_TX_BYTES = 1009 - DCGM_FI_PROF_PCIE_RX_BYTES = 1010 - DCGM_FI_PROF_NVLINK_TX_BYTES = 1011 - DCGM_FI_PROF_NVLINK_RX_BYTES = 1012 - DCGM_FI_MAX_FIELDS = 1013 -) - -var ( - DCGM_FI = map[string]Short{ - "DCGM_FT_BINARY": Short('b'), - "DCGM_FT_DOUBLE": Short('d'), - "DCGM_FT_INT64": Short('i'), - "DCGM_FT_STRING": Short('s'), - "DCGM_FT_TIMESTAMP": Short('t'), - "DCGM_FI_UNKNOWN": 0, - "DCGM_FI_DRIVER_VERSION": 1, - "DCGM_FI_NVML_VERSION": 2, - "DCGM_FI_PROCESS_NAME": 3, - "DCGM_FI_DEV_COUNT": 4, - "DCGM_FI_DEV_NAME": 50, - "DCGM_FI_DEV_BRAND": 51, - "DCGM_FI_DEV_NVML_INDEX": 52, - "DCGM_FI_DEV_SERIAL": 53, - "DCGM_FI_DEV_UUID": 54, - "DCGM_FI_DEV_MINOR_NUMBER": 55, - "DCGM_FI_DEV_OEM_INFOROM_VER": 56, - "DCGM_FI_DEV_PCI_BUSID": 57, - "DCGM_FI_DEV_PCI_COMBINED_ID": 58, - "DCGM_FI_DEV_PCI_SUBSYS_ID": 59, - "DCGM_FI_GPU_TOPOLOGY_PCI": 60, - "DCGM_FI_GPU_TOPOLOGY_NVLINK": 61, - "DCGM_FI_GPU_TOPOLOGY_AFFINITY": 62, - "DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY": 6, - "DCGM_FI_DEV_COMPUTE_MODE": 65, - "DCGM_FI_DEV_CPU_AFFINITY_0": 70, - "DCGM_FI_DEV_CPU_AFFINITY_1": 71, - "DCGM_FI_DEV_CPU_AFFINITY_2": 72, - "DCGM_FI_DEV_CPU_AFFINITY_3": 73, - "DCGM_FI_DEV_ECC_INFOROM_VER": 80, - "DCGM_FI_DEV_POWER_INFOROM_VER": 81, - "DCGM_FI_DEV_INFOROM_IMAGE_VER": 82, - "DCGM_FI_DEV_INFOROM_CONFIG_CHECK": 83, - "DCGM_FI_DEV_INFOROM_CONFIG_VALID": 84, - 
"DCGM_FI_DEV_VBIOS_VERSION": 85, - "DCGM_FI_DEV_BAR1_TOTAL": 90, - "DCGM_FI_SYNC_BOOST": 91, - "DCGM_FI_DEV_BAR1_USED": 92, - "DCGM_FI_DEV_BAR1_FREE": 93, - "DCGM_FI_DEV_SM_CLOCK": 100, - "DCGM_FI_DEV_MEM_CLOCK": 101, - "DCGM_FI_DEV_VIDEO_CLOCK": 102, - "DCGM_FI_DEV_APP_SM_CLOCK": 110, - "DCGM_FI_DEV_APP_MEM_CLOCK": 111, - "DCGM_FI_DEV_CLOCK_THROTTLE_REASONS": 112, - "DCGM_FI_DEV_MAX_SM_CLOCK": 113, - "DCGM_FI_DEV_MAX_MEM_CLOCK": 114, - "DCGM_FI_DEV_MAX_VIDEO_CLOCK": 115, - "DCGM_FI_DEV_AUTOBOOST": 120, - "DCGM_FI_DEV_SUPPORTED_CLOCKS": 130, - "DCGM_FI_DEV_MEMORY_TEMP": 140, - "DCGM_FI_DEV_GPU_TEMP": 150, - "DCGM_FI_DEV_POWER_USAGE": 155, - "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": 156, - "DCGM_FI_DEV_SLOWDOWN_TEMP": 158, - "DCGM_FI_DEV_SHUTDOWN_TEMP": 159, - "DCGM_FI_DEV_POWER_MGMT_LIMIT": 160, - "DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN": 161, - "DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX": 162, - "DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF": 163, - "DCGM_FI_DEV_ENFORCED_POWER_LIMIT": 164, - "DCGM_FI_DEV_PSTATE": 190, - "DCGM_FI_DEV_FAN_SPEED": 191, - "DCGM_FI_DEV_PCIE_TX_THROUGHPUT": 200, - "DCGM_FI_DEV_PCIE_RX_THROUGHPUT": 201, - "DCGM_FI_DEV_PCIE_REPLAY_COUNTER": 202, - "DCGM_FI_DEV_GPU_UTIL": 203, - "DCGM_FI_DEV_MEM_COPY_UTIL": 204, - "DCGM_FI_DEV_ACCOUNTING_DATA": 205, - "DCGM_FI_DEV_ENC_UTIL": 206, - "DCGM_FI_DEV_DEC_UTIL": 207, - "DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES": 210, - "DCGM_FI_DEV_GPU_UTIL_SAMPLES": 211, - "DCGM_FI_DEV_GRAPHICS_PIDS": 220, - "DCGM_FI_DEV_COMPUTE_PIDS": 221, - "DCGM_FI_DEV_XID_ERRORS": 230, - "DCGM_FI_DEV_PCIE_MAX_LINK_GEN": 235, - "DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH": 236, - "DCGM_FI_DEV_PCIE_LINK_GEN": 237, - "DCGM_FI_DEV_PCIE_LINK_WIDTH": 238, - "DCGM_FI_DEV_POWER_VIOLATION": 240, - "DCGM_FI_DEV_THERMAL_VIOLATION": 241, - "DCGM_FI_DEV_SYNC_BOOST_VIOLATION": 242, - "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION": 243, - "DCGM_FI_DEV_LOW_UTIL_VIOLATION": 244, - "DCGM_FI_DEV_RELIABILITY_VIOLATION": 245, - "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION": 246, - 
"DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION": 247, - "DCGM_FI_DEV_FB_TOTAL": 250, - "DCGM_FI_DEV_FB_FREE": 251, - "DCGM_FI_DEV_FB_USED": 252, - "DCGM_FI_DEV_ECC_CURRENT": 300, - "DCGM_FI_DEV_ECC_PENDING": 301, - "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": 310, - "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": 311, - "DCGM_FI_DEV_ECC_SBE_AGG_TOTAL": 312, - "DCGM_FI_DEV_ECC_DBE_AGG_TOTAL": 313, - "DCGM_FI_DEV_ECC_SBE_VOL_L1": 314, - "DCGM_FI_DEV_ECC_DBE_VOL_L1": 315, - "DCGM_FI_DEV_ECC_SBE_VOL_L2": 316, - "DCGM_FI_DEV_ECC_DBE_VOL_L2": 317, - "DCGM_FI_DEV_ECC_SBE_VOL_DEV": 318, - "DCGM_FI_DEV_ECC_DBE_VOL_DEV": 319, - "DCGM_FI_DEV_ECC_SBE_VOL_REG": 320, - "DCGM_FI_DEV_ECC_DBE_VOL_REG": 321, - "DCGM_FI_DEV_ECC_SBE_VOL_TEX": 322, - "DCGM_FI_DEV_ECC_DBE_VOL_TEX": 323, - "DCGM_FI_DEV_ECC_SBE_AGG_L1": 324, - "DCGM_FI_DEV_ECC_DBE_AGG_L1": 325, - "DCGM_FI_DEV_ECC_SBE_AGG_L2": 326, - "DCGM_FI_DEV_ECC_DBE_AGG_L2": 327, - "DCGM_FI_DEV_ECC_SBE_AGG_DEV": 328, - "DCGM_FI_DEV_ECC_DBE_AGG_DEV": 329, - "DCGM_FI_DEV_ECC_SBE_AGG_REG": 330, - "DCGM_FI_DEV_ECC_DBE_AGG_REG": 331, - "DCGM_FI_DEV_ECC_SBE_AGG_TEX": 332, - "DCGM_FI_DEV_ECC_DBE_AGG_TEX": 333, - "DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS": 393, - "DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS": 394, - "DCGM_FI_DEV_ROW_REMAP_FAILURE": 395, - "DCGM_FI_DEV_RETIRED_SBE": 390, - "DCGM_FI_DEV_RETIRED_DBE": 391, - "DCGM_FI_DEV_RETIRED_PENDING": 392, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0": 400, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1": 401, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2": 402, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3": 403, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4": 404, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5": 405, - "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL": 409, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0": 410, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1": 411, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2": 412, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3": 413, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4": 
414, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5": 415, - "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL": 419, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0": 420, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1": 421, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2": 422, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3": 423, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4": 424, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5": 425, - "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL": 429, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0": 430, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1": 431, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2": 432, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3": 433, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4": 434, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5": 435, - "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL": 439, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L0": 440, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L1": 441, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L2": 442, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L3": 443, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L4": 444, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_L5": 445, - "DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL": 449, - "DCGM_FI_DEV_GPU_NVLINK_ERRORS": 450, - "DCGM_FI_DEV_VIRTUAL_MODE": 500, - "DCGM_FI_DEV_SUPPORTED_TYPE_INFO": 501, - "DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS": 502, - "DCGM_FI_DEV_VGPU_INSTANCE_IDS": 503, - "DCGM_FI_DEV_VGPU_UTILIZATIONS": 504, - "DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION": 505, - "DCGM_FI_DEV_ENC_STATS": 506, - "DCGM_FI_DEV_FBC_STATS": 507, - "DCGM_FI_DEV_FBC_SESSIONS_INFO": 508, - "DCGM_FI_DEV_VGPU_VM_ID": 520, - "DCGM_FI_DEV_VGPU_VM_NAME": 521, - "DCGM_FI_DEV_VGPU_TYPE": 522, - "DCGM_FI_DEV_VGPU_UUID": 523, - "DCGM_FI_DEV_VGPU_DRIVER_VERSION": 524, - "DCGM_FI_DEV_VGPU_MEMORY_USAGE": 525, - "DCGM_FI_DEV_VGPU_LICENSE_STATUS": 526, - "DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT": 527, - "DCGM_FI_DEV_VGPU_ENC_STATS": 528, - "DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO": 529, - "DCGM_FI_DEV_VGPU_FBC_STATS": 530, - 
"DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO": 531, - "DCGM_FI_FIRST_VGPU_FIELD_ID": 520, - "DCGM_FI_LAST_VGPU_FIELD_ID": 570, - "DCGM_FI_INTERNAL_FIELDS_0_START": 600, - "DCGM_FI_INTERNAL_FIELDS_0_END": 699, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P00": 700, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P00": 701, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P00": 702, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P00": 703, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P01": 704, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P01": 705, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P01": 706, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P01": 707, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P02": 708, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P02": 709, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P02": 710, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P02": 711, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P03": 712, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P03": 713, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P03": 714, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P03": 715, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P04": 716, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P04": 717, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P04": 718, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P04": 719, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P05": 720, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P05": 721, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P05": 722, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P05": 723, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P06": 724, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P06": 725, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P06": 726, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P06": 727, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P07": 728, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P07": 729, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P07": 730, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P07": 731, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P08": 732, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P08": 733, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P08": 734, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P08": 735, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P09": 736, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P09": 737, - 
"DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P09": 738, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P09": 739, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P10": 740, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P10": 741, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P10": 742, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P10": 743, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P11": 744, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P11": 745, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P11": 746, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P11": 747, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P12": 748, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P12": 749, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P12": 750, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P12": 751, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P13": 752, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P13": 753, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P13": 754, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P13": 755, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P14": 756, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P14": 757, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P14": 758, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P14": 759, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P15": 760, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P15": 761, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P15": 762, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P15": 763, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P16": 764, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P16": 765, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P16": 766, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P16": 767, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P17": 768, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P17": 769, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P17": 770, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P17": 771, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00": 780, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00": 781, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P01": 782, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P01": 783, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P02": 784, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P02": 785, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P03": 786, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P03": 
787, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P04": 788, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P04": 789, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P05": 790, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P05": 791, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P06": 792, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P06": 793, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P07": 794, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P07": 795, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P08": 796, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P08": 797, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P09": 798, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P09": 799, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P10": 800, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P10": 801, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P11": 802, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P11": 803, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P12": 804, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P12": 805, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P13": 806, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P13": 807, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P14": 808, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P14": 809, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P15": 810, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P15": 811, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P16": 812, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P16": 813, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P17": 814, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P17": 815, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00": 820, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P00": 821, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P01": 822, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P01": 823, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P02": 824, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P02": 825, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P03": 826, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P03": 827, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P04": 828, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P04": 829, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P05": 830, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P05": 
831, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P06": 832, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P06": 833, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P07": 834, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P07": 835, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P08": 836, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P08": 837, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P09": 838, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P09": 839, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P10": 840, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P10": 841, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P11": 842, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P11": 843, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P12": 844, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P12": 845, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P13": 846, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P13": 847, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P14": 848, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P14": 849, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P15": 850, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P15": 851, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P16": 852, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P16": 853, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P17": 854, - "DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P17": 855, - "DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS": 856, - "DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS": 857, - "DCGM_FI_FIRST_NVSWITCH_FIELD_ID": 700, - "DCGM_FI_LAST_NVSWITCH_FIELD_ID": 860, - "DCGM_FI_PROF_GR_ENGINE_ACTIVE": 1001, - "DCGM_FI_PROF_SM_ACTIVE": 1002, - "DCGM_FI_PROF_SM_OCCUPANCY": 1003, - "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": 1004, - "DCGM_FI_PROF_DRAM_ACTIVE": 1005, - "DCGM_FI_PROF_PIPE_FP64_ACTIVE": 1006, - "DCGM_FI_PROF_PIPE_FP32_ACTIVE": 1007, - "DCGM_FI_PROF_PIPE_FP16_ACTIVE": 1008, - "DCGM_FI_PROF_PCIE_TX_BYTES": 1009, - "DCGM_FI_PROF_PCIE_RX_BYTES": 1010, - "DCGM_FI_PROF_NVLINK_TX_BYTES": 1011, - "DCGM_FI_PROF_NVLINK_RX_BYTES": 1012, - "DCGM_FI_MAX_FIELDS": 1013, - } -) - -var ( - OLD_DCGM_FI = map[string]Short{ - "dcgm_sm_clock": 100, - "dcgm_memory_clock": 101, - 
"dcgm_memory_temp": 140, - "dcgm_gpu_temp": 150, - "dcgm_power_usage": 155, - "dcgm_total_energy_consumption": 156, - "dcgm_pcie_tx_throughput": 200, - "dcgm_pcie_rx_throughput": 201, - "dcgm_pcie_replay_counter": 202, - "dcgm_gpu_utilization": 203, - "dcgm_mem_copy_utilization": 204, - "dcgm_enc_utilization": 206, - "dcgm_dec_utilization": 207, - "dcgm_xid_errors": 230, - "dcgm_power_violation": 240, - "dcgm_thermal_violation": 241, - "dcgm_sync_boost_violation": 242, - "dcgm_board_limit_violation": 243, - "dcgm_low_util_violation": 244, - "dcgm_reliability_violation": 245, - "dcgm_fb_free": 251, - "dcgm_fb_used": 252, - "dcgm_ecc_sbe_volatile_total": 310, - "dcgm_ecc_dbe_volatile_total": 311, - "dcgm_ecc_sbe_aggregate_total": 312, - "dcgm_ecc_dbe_aggregate_total": 313, - "dcgm_retired_pages_sbe": 390, - "dcgm_retired_pages_dbe": 391, - "dcgm_retired_pages_pending": 392, - "dcgm_nvlink_flit_crc_error_count_total": 409, - "dcgm_nvlink_data_crc_error_count_total": 419, - "dcgm_nvlink_replay_error_count_total": 429, - "dcgm_nvlink_recovery_error_count_total": 439, - "dcgm_nvlink_bandwidth_total": 449, - "dcgm_fi_prof_gr_engine_active": 1001, - "dcgm_fi_prof_sm_active": 1002, - "dcgm_fi_prof_sm_occupancy": 1003, - "dcgm_fi_prof_pipe_tensor_active": 1004, - "dcgm_fi_prof_dram_active": 1005, - "dcgm_fi_prof_pcie_tx_bytes": 1009, - "dcgm_fi_prof_pcie_rx_bytes": 1010, - } -) - -const ( - DCGM_FV_FLAG_LIVE_DATA = uint(0x00000001) -) diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/dcgm_agent.h b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/dcgm_agent.h deleted file mode 100644 index fac3fdfe..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/dcgm_agent.h +++ /dev/null @@ -1,2033 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef DCGM_AGENT_H -#define DCGM_AGENT_H - -#include "dcgm_structs.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#define DECLDIR - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_Admin Administrative - * - * This chapter describes the administration interfaces for DCGM. - * It is the user's responsibility to call \ref dcgmInit() before calling any other methods, - * and \ref dcgmShutdown() once DCGM is no longer being used. The APIs in Administrative module - * can be broken down into following categories: - * @{ - */ -/***************************************************************************************************/ - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_Admin_InitShut Init and Shutdown - * - * Describes APIs to Initialize and Shutdown the DCGM Engine. - * @{ - */ -/***************************************************************************************************/ - -/** - * This method is used to initialize DCGM within this process. This must be called before - * dcgmStartEmbedded() or dcgmConnect() - * - * * @return - * - \ref DCGM_ST_OK if DCGM has been properly initialized - * - \ref DCGM_ST_INIT_ERROR if there was an error initializing the library - */ -dcgmReturn_t DECLDIR dcgmInit(void); - -/** - * This method is used to shut down DCGM. Any embedded host engines or remote connections will automatically - * be shut down as well. 
- * - * @return - * - \ref DCGM_ST_OK if DCGM has been properly shut down - * - \ref DCGM_ST_UNINITIALIZED if the library was not shut down properly - */ -dcgmReturn_t DECLDIR dcgmShutdown(void); - -/** - * Start an embedded host engine agent within this process. - * - * The agent is loaded as a shared library. This mode is provided to avoid any - * extra jitter associated with an additional autonomous agent needs to be managed. In - * this mode, the user has to periodically call APIs such as \ref dcgmPolicyTrigger and - * \ref dcgmUpdateAllFields which tells DCGM to wake up and perform data collection and - * operations needed for policy management. - * - * @param opMode IN: Collect data automatically or manually when asked by the user. - * @param pDcgmHandle OUT: DCGM Handle to use for API calls - * - * @return - * - \ref DCGM_ST_OK if DCGM was started successfully within our process - * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit yet - * - */ -dcgmReturn_t DECLDIR dcgmStartEmbedded(dcgmOperationMode_t opMode, dcgmHandle_t *pDcgmHandle); - -/** - * Start an embedded host engine agent within this process. - * - * The agent is loaded as a shared library. This mode is provided to avoid any - * extra jitter associated with an additional autonomous agent needs to be managed. In - * this mode, the user has to periodically call APIs such as \ref dcgmPolicyTrigger and - * \ref dcgmUpdateAllFields which tells DCGM to wake up and perform data collection and - * operations needed for policy management. - * - * @param params IN/OUT: See \ref dcgmStartEmbeddedV2Params_v1 for details. 
- * - * @return - * - \ref DCGM_ST_OK if DCGM was started successfully within our process - * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit yet - * - */ -dcgmReturn_t DECLDIR dcgmStartEmbedded_v2(dcgmStartEmbeddedV2Params_v1 *params); - -/** - * Stop the embedded host engine within this process that was started with dcgmStartEmbedded - * - * @param pDcgmHandle IN : DCGM Handle of the embedded host engine that came from dcgmStartEmbedded - * - * @return - * - \ref DCGM_ST_OK if DCGM was stopped successfully within our process - * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit or - * the embedded host engine was not running. - * - \ref DCGM_ST_BADPARAM if an invalid parameter was provided - * - \ref DCGM_ST_INIT_ERROR if an error occurred while trying to start the host engine. - */ -dcgmReturn_t DECLDIR dcgmStopEmbedded(dcgmHandle_t pDcgmHandle); - -/** - * This method is used to connect to a stand-alone host engine process. Remote host engines are started - * by running the nv-hostengine command. - * - * NOTE: dcgmConnect_v2 provides additional connection options. - * - * @param ipAddress IN: Valid IP address for the remote host engine to connect to. - * If ipAddress is specified as x.x.x.x it will attempt to connect to the default - * port specified by DCGM_HE_PORT_NUMBER - * If ipAddress is specified as x.x.x.x:yyyy it will attempt to connect to the - * port specified by yyyy - * @param pDcgmHandle OUT: DCGM Handle of the remote host engine - * - * @return - * - \ref DCGM_ST_OK if we successfully connected to the remote host engine - * - \ref DCGM_ST_CONNECTION_NOT_VALID if the remote host engine could not be reached - * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit. 
- * - \ref DCGM_ST_BADPARAM if pDcgmHandle is NULL or ipAddress is invalid - * - \ref DCGM_ST_INIT_ERROR if DCGM encountered an error while initializing the remote client library - * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit - */ -dcgmReturn_t DECLDIR dcgmConnect(char *ipAddress, dcgmHandle_t *pDcgmHandle); - -/** - * This method is used to connect to a stand-alone host engine process. Remote host engines are started - * by running the nv-hostengine command. - * - * @param ipAddress IN: Valid IP address for the remote host engine to connect to. - * If ipAddress is specified as x.x.x.x it will attempt to connect to the default port - * specified by DCGM_HE_PORT_NUMBER. - * If ipAddress is specified as x.x.x.x:yyyy it will attempt to connect to the port - * specified by yyyy - * @param connectParams IN: Additional connection parameters. See \ref dcgmConnectV2Params_t for details. - * @param pDcgmHandle OUT: DCGM Handle of the remote host engine - * - * @return - * - \ref DCGM_ST_OK if we successfully connected to the remote host engine - * - \ref DCGM_ST_CONNECTION_NOT_VALID if the remote host engine could not be reached - * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit. - * - \ref DCGM_ST_BADPARAM if pDcgmHandle is NULL or ipAddress is invalid - * - \ref DCGM_ST_INIT_ERROR if DCGM encountered an error while initializing the remote client library - * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit - */ -dcgmReturn_t DECLDIR dcgmConnect_v2(char *ipAddress, dcgmConnectV2Params_t *connectParams, dcgmHandle_t *pDcgmHandle); - -/** - * This method is used to disconnect from a stand-alone host engine process. 
- * - * @param pDcgmHandle IN: DCGM Handle that came from dcgmConnect - * - * @return - * - \ref DCGM_ST_OK if we successfully disconnected from the host engine - * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit - * - \ref DCGM_ST_BADPARAM if pDcgmHandle is not a valid DCGM handle - * - \ref DCGM_ST_GENERIC_ERROR if an unspecified internal error occurred - */ -dcgmReturn_t DECLDIR dcgmDisconnect(dcgmHandle_t pDcgmHandle); - - -/** @} */ // Closing for DCGMAPI_Admin_InitShut - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_Admin_Info Auxilary information about DCGM engine. - * - * Describes APIs to get generic information about the DCGM Engine. - * @{ - */ -/***************************************************************************************************/ - -/** - * This method is used to return information about the build environment where DCGM was built. - * - * @param pVersionInfo OUT: Build environment information - * - * @return - * - \ref DCGM_ST_OK if build information is sucessfully obtained - * - \ref DCGM_ST_BADPARAM if pVersionInfo is null - * - \ref DCGM_ST_VER_MISMATCH if the expected and provided versions of dcgmVersionInfo_t do not match - */ -dcgmReturn_t DECLDIR dcgmVersionInfo(dcgmVersionInfo_t *pVersionInfo); - -/** - * This method is used to return information about the build environment of the hostengine. 
- * - * @param pDcgmHandle IN: DCGM Handle that came from dcgmConnect - * @param pVersionInfo OUT: Build environment information - * - * @return - * - \ref DCGM_ST_OK if build information is sucessfully obtained - * - \ref DCGM_ST_BADPARAM if pVersionInfo is null - * - \ref DCGM_ST_VER_MISMATCH if the expected and provided versions of dcgmVersionInfo_t do not match - */ -dcgmReturn_t DECLDIR dcgmHostengineVersionInfo(dcgmHandle_t pDcgmHandle, dcgmVersionInfo_t *pVersionInfo); - - -/** - * This method is used to set the logging severity on HostEngine for the specified logger - * - * @param pDcgmHandle IN: DCGM Handle - * @param logging IN: dcgmSettingsSetLoggingSeverity_t struct containing the target logger and severity - * - * @return - * - \ref DCGM_ST_OK Severity successfuly set - * - \ref DCGM_ST_BADPARAM Bad logger/severity string - * - \ref DCGM_ST_VER_MISMATCH if the expected and provided versions of dcgmSettingsSetLoggingSeverity_t - * do not match - */ -dcgmReturn_t DECLDIR dcgmHostengineSetLoggingSeverity(dcgmHandle_t pDcgmHandle, - dcgmSettingsSetLoggingSeverity_t *logging); - -/** - * This function is used to return whether or not the host engine considers itself healthy - * - * @param[in] pDcgmHandle - the handle to DCGM - * @param[out] heHealth - struct describing the health of the hostengine. if heHealth.hostengineHealth is 0, - * then the hostengine is healthy. Non-zero indicates not healthy with error codes - * determining the cause. 
- * - * @return - * - \ref DCGM_ST_OK Able to gauge health - * - \ref DCGM_ST_BADPARAM isHealthy is not a valid pointer - */ -dcgmReturn_t DECLDIR dcgmHostengineIsHealthy(dcgmHandle_t pDcgmHandle, dcgmHostengineHealth_t *heHealth); - -/** @} */ // Closing DCGMAPI_Admin_Info - -/** @} */ // Closing for DCGMAPI_Admin - - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_SYS System - * @{ - * This chapter describes the APIs used to identify set of GPUs on the node, grouping functions to - * provide mechanism to operate on a group of GPUs, and status management APIs in - * order to get individual statuses for each operation. The APIs in System module can be - * broken down into following categories: - */ -/***************************************************************************************************/ - -/***************************************************************************************************/ -/** @defgroup DCGM_DISCOVERY Discovery - * The following APIs are used to discover GPUs and their attributes on a Node. - * @{ - */ -/***************************************************************************************************/ - -/** - * This method is used to get identifiers corresponding to all the devices on the system. The - * identifier represents DCGM GPU Id corresponding to each GPU on the system and is immutable during - * the lifespan of the engine. The list should be queried again if the engine is restarted. - * - * The GPUs returned from this function include gpuIds of GPUs that are not supported by DCGM. - * To only get gpuIds of GPUs that are supported by DCGM, use dcgmGetAllSupportedDevices(). - * - * @param pDcgmHandle IN: DCGM Handle - * @param gpuIdList OUT: Array reference to fill GPU Ids present on the system. - * @param count OUT: Number of GPUs returned in \a gpuIdList. - * - * @return - * - \ref DCGM_ST_OK if the call was successful. 
- * - \ref DCGM_ST_BADPARAM if \a gpuIdList or \a count were not valid. - */ -dcgmReturn_t DECLDIR dcgmGetAllDevices(dcgmHandle_t pDcgmHandle, - unsigned int gpuIdList[DCGM_MAX_NUM_DEVICES], - int *count); - -/** - * This method is used to get identifiers corresponding to all the DCGM-supported devices on the system. The - * identifier represents DCGM GPU Id corresponding to each GPU on the system and is immutable during - * the lifespan of the engine. The list should be queried again if the engine is restarted. - * - * The GPUs returned from this function ONLY includes gpuIds of GPUs that are supported by DCGM. - * To get gpuIds of all GPUs in the system, use dcgmGetAllDevices(). - * - * - * @param pDcgmHandle IN: DCGM Handle - * @param gpuIdList OUT: Array reference to fill GPU Ids present on the system. - * @param count OUT: Number of GPUs returned in \a gpuIdList. - * - * @return - * - \ref DCGM_ST_OK if the call was successful. - * - \ref DCGM_ST_BADPARAM if \a gpuIdList or \a count were not valid. - */ -dcgmReturn_t DECLDIR dcgmGetAllSupportedDevices(dcgmHandle_t pDcgmHandle, - unsigned int gpuIdList[DCGM_MAX_NUM_DEVICES], - int *count); - -/** - * Gets device attributes corresponding to the \a gpuId. If operation is not successful for any of - * the requested fields then the field is populated with one of DCGM_BLANK_VALUES defined in - * dcgm_structs.h. - * - * @param pDcgmHandle IN: DCGM Handle - * @param gpuId IN: GPU Id corresponding to which the attributes should be fetched - * @param pDcgmAttr IN/OUT: Device attributes corresponding to \a gpuId.
pDcgmAttr->version should be set to - * \ref dcgmDeviceAttributes_version before this call. - * - * @return - * - \ref DCGM_ST_OK if the call was successful. - * - \ref DCGM_ST_VER_MISMATCH if pDcgmAttr->version is not set or is invalid. - */ -dcgmReturn_t DECLDIR dcgmGetDeviceAttributes(dcgmHandle_t pDcgmHandle, - unsigned int gpuId, - dcgmDeviceAttributes_t *pDcgmAttr); - -/** - * Gets the list of entities that exist for a given entity group. This API can be used in place of - * \ref dcgmGetAllDevices. - * - * @param dcgmHandle IN: DCGM Handle - * @param entityGroup IN: Entity group to list entities of - * @param entities OUT: Array of entities for entityGroup - * @param numEntities IN/OUT: Upon calling, this should be the number of entities that entityList[] can hold. Upon - * return, this will contain the number of entities actually saved to entityList. - * @param flags IN: Flags to modify the behavior of this request. - * See DCGM_GEGE_FLAG_* #defines in dcgm_structs.h - * - * @return - * - \ref DCGM_ST_OK if the call was successful. - * - \ref DCGM_ST_INSUFFICIENT_SIZE if numEntities was not large enough to hold the number of entities in the - * entityGroup. numEntities will contain the capacity needed to complete this - * request successfully. - * - \ref DCGM_ST_NOT_SUPPORTED if the given entityGroup does not support enumeration. - * - \ref DCGM_ST_BADPARAM if any parameter is invalid - */ -dcgmReturn_t DECLDIR dcgmGetEntityGroupEntities(dcgmHandle_t dcgmHandle, - dcgm_field_entity_group_t entityGroup, - dcgm_field_eid_t *entities, - int *numEntities, - unsigned int flags); - -/** - * Gets the hierarchy of GPUs, GPU Instances, and Compute Instances by populating a list of each entity with - * a reference to their parent - * - * @param dcgmHandle IN: DCGM Handle - * @param entities OUT: array of entities in the hierarchy - * @param numEntities IN/OUT: Upon calling, this should be the capacity of entities. 
- * Upon return, this will contain the number of entities actually saved to entities. - * - * @return - * - \ref DCGM_ST_OK if the call was successful. - * - \ref DCGM_ST_VER_MISMATCH if the struct version is incorrect - * - \ref DCGM_ST_BADPARAM if any parameter is invalid - */ -dcgmReturn_t DECLDIR dcgmGetGpuInstanceHierarchy(dcgmHandle_t dcgmHandle, dcgmMigHierarchy_v2 *hierarchy); - -/** - * Get the NvLink link status for every NvLink in this system. This includes the NvLinks of both GPUs and - * NvSwitches. Note that only NvSwitches and GPUs that are visible to the current environment will be - * returned in this structure. - * - * @param dcgmHandle IN: DCGM Handle - * @param linkStatus OUT: Structure in which to store NvLink link statuses. .version should be set to - * dcgmNvLinkStatus_version1 before calling this. - * - * @return - * - \ref DCGM_ST_OK if the call was successful. - * - \ref DCGM_ST_NOT_SUPPORTED if the given entityGroup does not support enumeration. - * - \ref DCGM_ST_BADPARAM if any parameter is invalid - */ -dcgmReturn_t DECLDIR dcgmGetNvLinkLinkStatus(dcgmHandle_t dcgmHandle, dcgmNvLinkStatus_v2 *linkStatus); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup DCGM_GROUPING Grouping - * The following APIs are used for group management. The user can create a group of entities and - * perform an operation on a group of entities. If grouping is not needed and the user wishes - * to run commands on all GPUs seen by DCGM then the user can use DCGM_GROUP_ALL_GPUS or - * DCGM_GROUP_ALL_NVSWITCHES in place of group IDs when needed. - * @{ - */ -/***************************************************************************************************/ - -/** - * Used to create a entity group handle which can store one or more entity Ids as an opaque handle - * returned in \a pDcgmGrpId. 
Instead of executing an operation separately for each entity, the - * DCGM group enables the user to execute same operation on all the entities present in the group as a - * single API call. - * - * To create the group with all the entities present on the system, the \a type field should be - * specified as \a DCGM_GROUP_DEFAULT or \a DCGM_GROUP_ALL_NVSWITCHES. To create an empty group, - * the \a type field should be specified as \a DCGM_GROUP_EMPTY. The empty group can be updated - * with the desired set of entities using the APIs \ref dcgmGroupAddDevice, \ref dcgmGroupAddEntity, - * \ref dcgmGroupRemoveDevice, and \ref dcgmGroupRemoveEntity. - * - * @param pDcgmHandle IN: DCGM Handle - * @param type IN: Type of Entity Group to be formed - * @param groupName IN: Desired name of the GPU group specified as NULL terminated C string - * @param pDcgmGrpId OUT: Reference to group ID - * - * @return - * - \ref DCGM_ST_OK if the group has been created - * - \ref DCGM_ST_BADPARAM if any of \a type, \a groupName, \a length or \a pDcgmGrpId is invalid - * - \ref DCGM_ST_MAX_LIMIT if number of groups on the system has reached the max limit \a DCGM_MAX_NUM_GROUPS - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized - */ -dcgmReturn_t DECLDIR dcgmGroupCreate(dcgmHandle_t pDcgmHandle, - dcgmGroupType_t type, - char *groupName, - dcgmGpuGrp_t *pDcgmGrpId); - -/** - * Used to destroy a group represented by \a groupId. - * Since DCGM group is a logical grouping of entities, the properties applied on the group stay intact - * for the individual entities even after the group is destroyed. 
- * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID - * - * @return - * - \ref DCGM_ST_OK if the group has been destroyed - * - \ref DCGM_ST_BADPARAM if \a groupId is invalid - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized - * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group does not exists - */ -dcgmReturn_t DECLDIR dcgmGroupDestroy(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId); - -/** - * Used to add specified GPU Id to the group represented by \a groupId. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group Id to which device should be added - * @param gpuId IN: DCGM GPU Id - * - * @return - * - \ref DCGM_ST_OK if the GPU Id has been successfully added to the group - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized - * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists - * - \ref DCGM_ST_BADPARAM if \a gpuId is invalid or already part of the specified group - */ -dcgmReturn_t dcgmGroupAddDevice(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, unsigned int gpuId); - -/** - * Used to add specified entity to the group represented by \a groupId. 
- * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group Id to which device should be added - * @param entityGroupId IN: Entity group that entityId belongs to - * @param entityId IN: DCGM entityId - * - * @return - * - \ref DCGM_ST_OK if the entity has been successfully added to the group - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized - * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists - * - \ref DCGM_ST_BADPARAM if \a entityId is invalid or already part of the specified group - */ -dcgmReturn_t dcgmGroupAddEntity(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgm_field_entity_group_t entityGroupId, - dcgm_field_eid_t entityId); - -/** - * Used to remove specified GPU Id from the group represented by \a groupId. - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID from which device should be removed - * @param gpuId IN: DCGM GPU Id - * - * @return - * - \ref DCGM_ST_OK if the GPU Id has been successfully removed from the group - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized - * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists - * - \ref DCGM_ST_BADPARAM if \a gpuId is invalid or not part of the specified group - */ -dcgmReturn_t dcgmGroupRemoveDevice(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, unsigned int gpuId); - -/** - * Used to remove specified entity from the group represented by \a groupId. 
- * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID from which device should be removed - * @param entityGroupId IN: Entity group that entityId belongs to - * @param entityId IN: DCGM entityId - * - * @return - * - \ref DCGM_ST_OK if the entity has been successfully removed from the group - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized - * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists - * - \ref DCGM_ST_BADPARAM if \a entityId is invalid or not part of the specified group - */ -dcgmReturn_t dcgmGroupRemoveEntity(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgm_field_entity_group_t entityGroupId, - dcgm_field_eid_t entityId); - -/** - * Used to get information corresponding to the group represented by \a groupId. The information - * returned in \a pDcgmGroupInfo consists of group name, and the list of entities present in the - * group. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID for which information to be fetched - * @param pDcgmGroupInfo OUT: Group Information - * - * @return - * - \ref DCGM_ST_OK if the group info is successfully received. - * - \ref DCGM_ST_BADPARAM if any of \a groupId or \a pDcgmGroupInfo is invalid. - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized. - * - \ref DCGM_ST_MAX_LIMIT if the group does not contain the GPU - * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists - */ -dcgmReturn_t dcgmGroupGetInfo(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmGroupInfo_t *pDcgmGroupInfo); - -/** - * Used to get the Ids of all groups of entities. The information returned is a list of group ids - * in \a groupIdList as well as a count of how many ids there are in \a count. Please allocate enough - * memory for \a groupIdList. Memory of size MAX_NUM_GROUPS should be allocated for \a groupIdList. 
- * - * @param pDcgmHandle IN: DCGM Handle - * @param groupIdList OUT: List of Group Ids - * @param count OUT: The number of Group ids in the list - * - * @return - * - \ref DCGM_ST_OK if the ids of the groups were successfully retrieved - * - \ref DCGM_ST_BADPARAM if either of the \a groupIdList or \a count is null - * - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred - */ -dcgmReturn_t dcgmGroupGetAllIds(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupIdList[], unsigned int *count); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup DCGM_FIELD_GROUPING Field Grouping - * The following APIs are used for field group management. The user can create a group of fields and - * perform an operation on a group of fields at once. - * @{ - */ - -/** - * Used to create a group of fields and return the handle in dcgmFieldGroupId - * - * @param dcgmHandle IN: DCGM handle - * @param numFieldIds IN: Number of field IDs that are being provided in fieldIds[]. Must be between 1 and - * DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP. - * @param fieldIds IN: Field IDs to be added to the newly-created field group - * @param fieldGroupName IN: Unique name for this group of fields. This must not be the same as any existing field - * groups. - * @param dcgmFieldGroupId OUT: Handle to the newly-created field group - * - * @return - * - \ref DCGM_ST_OK if the field group was successfully created. - * - \ref DCGM_ST_BADPARAM if any parameters were bad - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized. 
- * - \ref DCGM_ST_MAX_LIMIT if too many field groups already exist - * - */ -dcgmReturn_t dcgmFieldGroupCreate(dcgmHandle_t dcgmHandle, - int numFieldIds, - unsigned short *fieldIds, - char *fieldGroupName, - dcgmFieldGrp_t *dcgmFieldGroupId); - -/** - * Used to remove a field group that was created with \ref dcgmFieldGroupCreate - * - * @param dcgmHandle IN: DCGM handle - * @param dcgmFieldGroupId IN: Field group to remove - * - * @return - * - \ref DCGM_ST_OK if the field group was successfully removed - * - \ref DCGM_ST_BADPARAM if any parameters were bad - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized. - * - */ -dcgmReturn_t dcgmFieldGroupDestroy(dcgmHandle_t dcgmHandle, dcgmFieldGrp_t dcgmFieldGroupId); - - -/** - * Used to get information about a field group that was created with \ref dcgmFieldGroupCreate. - * - * @param dcgmHandle IN: DCGM handle - * @param fieldGroupInfo IN/OUT: Info about all of the field groups that exist.
- * .version should be set to \ref dcgmFieldGroupInfo_version before this call
- * .fieldGroupId should contain the fieldGroupId you are interested in querying - * information for. - * - * @return - * - \ref DCGM_ST_OK if the field group info was returned successfully - * - \ref DCGM_ST_BADPARAM if any parameters were bad - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized. - * - \ref DCGM_ST_VER_MISMATCH if .version is not set or is invalid. - * - */ -dcgmReturn_t dcgmFieldGroupGetInfo(dcgmHandle_t dcgmHandle, dcgmFieldGroupInfo_t *fieldGroupInfo); - -/** - * Used to get information about all field groups in the system. - * - * @param dcgmHandle IN: DCGM handle - * @param allGroupInfo IN/OUT: Info about all of the field groups that exist.
- * .version should be set to \ref dcgmAllFieldGroup_version before this call. - * - * @return - * - \ref DCGM_ST_OK if the field group info was successfully returned - * - \ref DCGM_ST_BADPARAM if any parameters were bad - * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized. - * - \ref DCGM_ST_VER_MISMATCH if .version is not set or is invalid. - * - */ -dcgmReturn_t dcgmFieldGroupGetAll(dcgmHandle_t dcgmHandle, dcgmAllFieldGroup_t *allGroupInfo); - -/** @} */ - - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_ST Status handling - * The following APIs are used to manage statuses for multiple operations on one or more GPUs. - * @{ - */ -/***************************************************************************************************/ - -/** - * Creates reference to DCGM status handler which can be used to get the statuses for multiple - * operations on one or more devices. - * - * The multiple statuses are useful when the operations are performed at group level. The status - * handle provides a mechanism to access error attributes for the failed operations. - * - * The number of errors stored behind the opaque handle can be accessed using the the API - * \ref dcgmStatusGetCount. The errors are accessed from the opaque handle \a statusHandle - * using the API \ref dcgmStatusPopError. The user can invoke \ref dcgmStatusPopError - * for the number of errors or until all the errors are fetched. - * - * When the status handle is not required any further then it should be deleted using the API - * \ref dcgmStatusDestroy. 
- * @param statusHandle OUT: Reference to handle for list of statuses - * - * @return - * - \ref DCGM_ST_OK if the status handle is successfully created - * - \ref DCGM_ST_BADPARAM if \a statusHandle is invalid - * - */ -dcgmReturn_t dcgmStatusCreate(dcgmStatus_t *statusHandle); - -/** - * Used to destroy status handle created using \ref dcgmStatusCreate. - * @param statusHandle IN: Handle to list of statuses - * - * @return - * - \ref DCGM_ST_OK if the status handle is successfully destroyed - * - \ref DCGM_ST_BADPARAM if \a statusHandle is invalid - * - */ -dcgmReturn_t dcgmStatusDestroy(dcgmStatus_t statusHandle); - -/** - * Used to get count of error entries stored inside the opaque handle \a statusHandle. - * @param statusHandle IN: Handle to list of statuses - * @param count OUT: Number of error entries present in the list of statuses - * - * @return - * - \ref DCGM_ST_OK if the error count is successfully received - * - \ref DCGM_ST_BADPARAM if any of \a statusHandle or \a count is invalid - * - */ -dcgmReturn_t dcgmStatusGetCount(dcgmStatus_t statusHandle, unsigned int *count); - -/** - * Used to iterate through the list of errors maintained behind \a statusHandle. The method pops the - * first error from the list of DCGM statuses. In order to iterate through all the errors, the user - * can invoke this API for the number of errors or until all the errors are fetched. - * @param statusHandle IN: Handle to list of statuses - * @param pDcgmErrorInfo OUT: First error from the list of statuses - * - * @return - * - \ref DCGM_ST_OK if the error entry is successfully fetched - * - \ref DCGM_ST_BADPARAM if any of \a statusHandle or \a pDcgmErrorInfo is invalid - * - \ref DCGM_ST_NO_DATA if the status handle list is empty - * - */ -dcgmReturn_t dcgmStatusPopError(dcgmStatus_t statusHandle, dcgmErrorInfo_t *pDcgmErrorInfo); - -/** - * Used to clear all the errors in the status handle created by the API - * \ref dcgmStatusCreate.
After one set of operation, the \a statusHandle - * can be cleared and reused for the next set of operation. - * @param statusHandle IN: Handle to list of statuses - * - * @return - * - \ref DCGM_ST_OK if the errors are successfully cleared - * - \ref DCGM_ST_BADPARAM if \a statusHandle is invalid - * - */ -dcgmReturn_t dcgmStatusClear(dcgmStatus_t statusHandle); - -/** @} */ // Closing for DCGMAPI_ST - - -/** @} */ // Closing for DCGMAPI_SYS - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_DC Configuration - * This chapter describes the methods that handle device configuration retrieval and - * default settings. The APIs in Configuration module can be broken down into following - * categories: - * @{ - */ -/***************************************************************************************************/ - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_DC_Setup Setup and management - * Describes APIs to Get/Set configuration on the group of GPUs. - * @{ - */ -/***************************************************************************************************/ - -/** -* Used to set configuration for the group of one or more GPUs identified by \a groupId. -* -* The configuration settings specified in \a pDeviceConfig are applied to all the GPUs in the -* group. Since DCGM group is a logical grouping of GPUs, the configuration settings stays intact -* for the individual GPUs even after the group is destroyed. -* -* If the user wishes to ignore the configuration of one or more properties in the input -* \a pDeviceConfig then the property should be specified as one of \a DCGM_INT32_BLANK, -* \a DCGM_INT64_BLANK, \a DCGM_FP64_BLANK or \a DCGM_STR_BLANK based on the data type of the -* property to be ignored. 
-* -* If any of the properties fail to be configured for any of the GPUs in the group then the API -* returns an error. The status handle \a statusHandle should be further evaluated to access error -* attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST -* to access the error attributes. -* -* To find out valid supported clock values that can be passed to dcgmConfigSet, look at the device -* attributes of a GPU in the group using the API dcgmGetDeviceAttributes. - -* @param pDcgmHandle IN: DCGM Handle -* @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate -* for details on creating the group. -* @param pDeviceConfig IN: Pointer to memory to hold desired configuration to be applied for all the GPU in the -* group represented by \a groupId. -* The caller must populate the version field of \a pDeviceConfig. -* @param statusHandle IN/OUT: Resulting error status for multiple operations. Pass it as NULL if the detailed -* error information is not needed. -* Look at \ref dcgmStatusCreate for details on creating status handle. - -* @return -* - \ref DCGM_ST_OK if the configuration has been successfully set. -* - \ref DCGM_ST_BADPARAM if any of \a groupId or \a pDeviceConfig is invalid. -* - \ref DCGM_ST_VER_MISMATCH if \a pDeviceConfig has the incorrect version. -* - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. -* -*/ -dcgmReturn_t DECLDIR dcgmConfigSet(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmConfig_t *pDeviceConfig, - dcgmStatus_t statusHandle); - -/** -* Used to get configuration for all the GPUs present in the group. -* -* This API can get the most recent target or desired configuration set by \ref dcgmConfigSet. -* Set type as \a DCGM_CONFIG_TARGET_STATE to get target configuration. The target configuration -* properties are maintained by DCGM and are automatically enforced after a GPU reset or -* reinitialization is completed. 
-* -* The method can also be used to get the actual configuration state for the GPUs in the group. -* Set type as \a DCGM_CONFIG_CURRENT_STATE to get the actual configuration state. Ideally, the -* actual configuration state will be exact same as the target configuration state. -* -* If any of the property in the target configuration is unknown then the property value in the -* output is populated as one of DCGM_INT32_BLANK, DCGM_INT64_BLANK, DCGM_FP64_BLANK or -* DCGM_STR_BLANK based on the data type of the property. -* -* If any of the property in the current configuration state is not supported then the property -* value in the output is populated as one of DCGM_INT32_NOT_SUPPORTED, DCGM_INT64_NOT_SUPPORTED, -* DCGM_FP64_NOT_SUPPORTED or DCGM_STR_NOT_SUPPORTED based on the data type of the property. -* -* If any of the properties can't be fetched for any of the GPUs in the group then the API returns -* an error. The status handle \a statusHandle should be further evaluated to access error -* attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST -* to access the error attributes. -* -* @param pDcgmHandle IN: DCGM Handle -* @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate -* for details on creating the group. -* @param type IN: Type of configuration values to be fetched. -* @param count IN: The number of entries that \a deviceConfigList array can store. -* @param deviceConfigList OUT: Pointer to memory to hold requested configuration corresponding to all the GPUs in -* the group (\a groupId). The size of the memory must be greater than or equal to hold -* output information for the number of GPUs present in the group (\a groupId). -* @param statusHandle IN/OUT: Resulting error status for multiple operations. Pass it as NULL if the detailed -* error information is not needed. -* Look at \ref dcgmStatusCreate for details on creating status handle.
- -* @return -* - \ref DCGM_ST_OK if the configuration has been successfully fetched. -* - \ref DCGM_ST_BADPARAM if any of \a groupId, \a type, \a count, or \a deviceConfigList is invalid. -* - \ref DCGM_ST_NOT_CONFIGURED if the target configuration is not already set. -* - \ref DCGM_ST_VER_MISMATCH if \a deviceConfigList has the incorrect version. -* - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. -* -*/ -dcgmReturn_t DECLDIR dcgmConfigGet(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmConfigType_t type, - int count, - dcgmConfig_t deviceConfigList[], - dcgmStatus_t statusHandle); - -/** @} */ // Closing for DCGMAPI_DC_Setup - - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_DC_MI Manual Invocation - * Describes APIs used to manually enforce the desired configuration on a group of GPUs. - * @{ - */ -/***************************************************************************************************/ - -/** - * Used to enforce previously set configuration for all the GPUs present in the group. - * - * This API provides a mechanism to the users to manually enforce the configuration at any point of - * time. The configuration can only be enforced if it's already configured using the API \ref - * dcgmConfigSet. - * - * If any of the properties can't be enforced for any of the GPUs in the group then the API returns - * an error. The status handle \a statusHandle should be further evaluated to access error - * attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST - * to access the error attributes. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. 
- * @param statusHandle IN/OUT: Resulting error status for multiple operations. Pass it as NULL if the detailed - * error information is not needed. Look at \ref dcgmStatusCreate for details on - * creating status handle. - * - * @return - * - \ref DCGM_ST_OK if the configuration has been successfully enforced. - * - \ref DCGM_ST_BADPARAM if \a groupId is invalid. - * - \ref DCGM_ST_NOT_CONFIGURED if the target configuration is not already set. - * - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. - * - */ -dcgmReturn_t DECLDIR dcgmConfigEnforce(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmStatus_t statusHandle); - -/** @} */ // Closing for DCGMAPI_DC_MI - -/** @} */ // Closing for DCGMAPI_DC - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_FI Field APIs - * - * These APIs are responsible for watching, unwatching, and updating specific fields as defined - * by DCGM_FI_* - * - * @{ - */ -/***************************************************************************************************/ - -/** - * Request that DCGM start recording updates for a given field collection. - * - * Note that the first update of the field will not occur until the next field update cycle. - * To force a field update cycle, call dcgmUpdateAllFields(1). - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or - * \a DCGM_GROUP_ALL_NVSWITCHES to perform the operation on all NvSwitches. - * @param fieldGroupId IN: Fields to watch. - * @param updateFreq IN: How often to update this field in usec - * @param maxKeepAge IN: How long to keep data for this field in seconds - * @param maxKeepSamples IN: Maximum number of samples to keep.
0=no limit - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ - -dcgmReturn_t dcgmWatchFields(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmFieldGrp_t fieldGroupId, - long long updateFreq, - double maxKeepAge, - int maxKeepSamples); - -/** - * Request that DCGM stop recording updates for a given field collection. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or - * \a DCGM_GROUP_ALL_NVSWITCHES to perform the operation on all NvSwitches. - * @param fieldGroupId IN: Fields to unwatch. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ -dcgmReturn_t dcgmUnwatchFields(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmFieldGrp_t fieldGroupId); - -/** - * Request updates for all field values that have updated since a given timestamp - * - * This version only works with GPU entities. Use \ref dcgmGetValuesSince_v2 for entity groups - * containing NvSwitches. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param fieldGroupId IN: Fields to return data for - * @param sinceTimestamp IN: Timestamp to request values since in usec since 1970. This will be returned in - * nextSinceTimestamp for subsequent calls 0 = request all data - * @param nextSinceTimestamp OUT: Timestamp to use for sinceTimestamp on next call to this function - * @param enumCB IN: Callback to invoke for every field value update.
Note that multiple updates can be - * returned in each invocation - * @param userData IN: User data pointer to pass to the userData field of enumCB. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_SUPPORTED if one of the entities was from a non-GPU type - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ -dcgmReturn_t dcgmGetValuesSince(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmFieldGrp_t fieldGroupId, - long long sinceTimestamp, - long long *nextSinceTimestamp, - dcgmFieldValueEnumeration_f enumCB, - void *userData); - -/** - * Request updates for all field values that have updated since a given timestamp - * - * This version works with non-GPU entities like NvSwitches - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or - * \a DCGM_GROUP_ALL_NVSWITCHES to perform the operation on all NvSwitches. - * @param fieldGroupId IN: Fields to return data for - * @param sinceTimestamp IN: Timestamp to request values since in usec since 1970. This will be returned in - * nextSinceTimestamp for subsequent calls 0 = request all data - * @param nextSinceTimestamp OUT: Timestamp to use for sinceTimestamp on next call to this function - * @param enumCB IN: Callback to invoke for every field value update. Note that multiple updates can be - * returned in each invocation - * @param userData IN: User data pointer to pass to the userData field of enumCB. 
- * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ -dcgmReturn_t dcgmGetValuesSince_v2(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmFieldGrp_t fieldGroupId, - long long sinceTimestamp, - long long *nextSinceTimestamp, - dcgmFieldValueEntityEnumeration_f enumCB, - void *userData); - -/** - * Request latest cached field value for a field value collection - * - * This version only works with GPU entities. Use \ref dcgmGetLatestValues_v2 for entity groups - * containing NvSwitches. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param fieldGroupId IN: Fields to return data for. - * @param enumCB IN: Callback to invoke for every field value update. Note that multiple updates can be - * returned in each invocation - * @param userData IN: User data pointer to pass to the userData field of enumCB. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_SUPPORTED if one of the entities was from a non-GPU type - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ -dcgmReturn_t dcgmGetLatestValues(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmFieldGrp_t fieldGroupId, - dcgmFieldValueEnumeration_f enumCB, - void *userData); - -/** - * Request latest cached field value for a field value collection - * - * This version works with non-GPU entities like NvSwitches - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate - * for details on creating the group. 
Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or - * \a DCGM_GROUP_ALL_NVSWITCHES to perform the operation on all NvSwitches. - * @param fieldGroupId IN: Fields to return data for. - * @param enumCB IN: Callback to invoke for every field value update. Note that multiple updates can be - * returned in each invocation - * @param userData IN: User data pointer to pass to the userData field of enumCB. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_SUPPORTED if one of the entities was from a non-GPU type - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ -dcgmReturn_t dcgmGetLatestValues_v2(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmFieldGrp_t fieldGroupId, - dcgmFieldValueEntityEnumeration_f enumCB, - void *userData); - -/** - * Request latest cached field value for a GPU - * - * @param pDcgmHandle IN: DCGM Handle - * @param gpuId IN: Gpu ID representing the GPU for which the fields are being requested. - * @param fields IN: Field IDs to return data for. See the definitions in dcgm_fields.h that start with DCGM_FI_. - * @param count IN: Number of field IDs in fields[] array. - * @param values OUT: Latest field values for the fields in fields[]. - * - */ -dcgmReturn_t dcgmGetLatestValuesForFields(dcgmHandle_t pDcgmHandle, - int gpuId, - unsigned short fields[], - unsigned int count, - dcgmFieldValue_v1 values[]); -/** - * Request latest cached field value for a group of fields for a specific entity - * - * @param pDcgmHandle IN: DCGM Handle - * @param entityGroup IN: entity_group_t (e.g. switch) - * @param entityId IN: entity ID representing the entity for which the fields are being requested. - * @param fields IN: Field IDs to return data for. See the definitions in dcgm_fields.h that start with DCGM_FI_. - * @param count IN: Number of field IDs in fields[] array. - * @param values OUT: Latest field values for the fields in fields[].
- * - */ -dcgmReturn_t dcgmEntityGetLatestValues(dcgmHandle_t pDcgmHandle, - dcgm_field_entity_group_t entityGroup, - int entityId, - unsigned short fields[], - unsigned int count, - dcgmFieldValue_v1 values[]); - -/** - * Request the latest cached or live field value for a list of fields for a group of entities - * - * Note: The returned entities are not guaranteed to be in any order. Reordering can occur internally - * in order to optimize calls to the NVIDIA driver. - * - * @param pDcgmHandle IN: DCGM Handle - * @param entities IN: List of entities to get values for - * @param entityCount IN: Number of entries in entities[] - * @param fields IN: Field IDs to return data for. See the definitions in dcgm_fields.h that start with DCGM_FI_. - * @param fieldCount IN: Number of field IDs in fields[] array. - * @param flags IN: Optional flags that affect how this request is processed. Pass \ref DCGM_FV_FLAG_LIVE_DATA - * here to retrieve a live driver value rather than a cached value. See that flag's - * documentation for caveats. - * @param values OUT: Latest field values for the fields requested. This must be able to hold entityCount * - * fieldCount field value records. - * - */ -dcgmReturn_t dcgmEntitiesGetLatestValues(dcgmHandle_t pDcgmHandle, - dcgmGroupEntityPair_t entities[], - unsigned int entityCount, - unsigned short fields[], - unsigned int fieldCount, - unsigned int flags, - dcgmFieldValue_v2 values[]); - -/*************************************************************************/ -/** - * Get a summary of the values for a field id over a period of time. 
- * - * @param pDcgmHandle IN: DCGM Handle - * @param request IN/OUT: a pointer to the struct detailing the request and containing the response - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_FIELD_UNSUPPORTED_BY_API if the field is not int64 or double type - * - */ -dcgmReturn_t DECLDIR dcgmGetFieldSummary(dcgmHandle_t pDcgmHandle, dcgmFieldSummaryRequest_t *request); - -/** @} */ - -/***************************************************************************************************/ -/** @addtogroup DCGMAPI_Admin_ExecCtrl - * @{ - */ -/***************************************************************************************************/ - -/** - * This method is used to tell the DCGM module to update all the fields being watched. - * - * Note: If the operation mode was set to manual mode (DCGM_OPERATION_MODE_MANUAL) during - * initialization (\ref dcgmInit), this method must be called periodically to allow field value watches - * the opportunity to gather samples. - * - * @param pDcgmHandle IN: DCGM Handle - * @param waitForUpdate IN: Whether or not to wait for the update loop to complete before returning to the - * caller 1=wait. 0=do not wait.
- * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if \a waitForUpdate is invalid - * - \ref DCGM_ST_GENERIC_ERROR if an unspecified DCGM error occurs - * - */ -dcgmReturn_t dcgmUpdateAllFields(dcgmHandle_t pDcgmHandle, int waitForUpdate); - -/** @} */ // Closing for DCGMAPI_Admin_ExecCtrl - - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_PROCESS_STATS Process Statistics - * Describes APIs to investigate statistics such as accounting, performance and errors during the - * lifetime of a GPU process - * @{ - */ -/***************************************************************************************************/ - -/** - * Request that DCGM start recording stats for fields that can be queried with dcgmGetPidInfo(). - * - * Note that the first update of the field will not occur until the next field update cycle. - * To force a field update cycle, call dcgmUpdateAllFields(1). - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param updateFreq IN: How often to update this field in usec - * @param maxKeepAge IN: How long to keep data for this field in seconds - * @param maxKeepSamples IN: Maximum number of samples to keep. 0=no limit - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_REQUIRES_ROOT if the host engine is being run as non-root, and accounting mode could not - * be enabled (requires root). Run "nvidia-smi -am 1" as root on the node - * before starting DCGM to fix this. 
- * - */ -dcgmReturn_t dcgmWatchPidFields(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - long long updateFreq, - double maxKeepAge, - int maxKeepSamples); - -/** - * - * Get information about all GPUs while the provided pid was running - * - * In order for this request to work, you must first call dcgmWatchPidFields() to - * make sure that DCGM is watching the appropriate field IDs that will be - * populated in pidInfo - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param pidInfo IN/OUT: Structure to return information about pid in. pidInfo->pid must be set to the pid in question. - * pidInfo->version should be set to dcgmPidInfo_version. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NO_DATA if the PID did not run on any GPU - * - */ -dcgmReturn_t dcgmGetPidInfo(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmPidInfo_t *pidInfo); - -/** @} */ // Closing for DCGMAPI_PROCESS_STATS - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_JOB_STATS Job Statistics - * The client can invoke DCGM APIs to start and stop collecting the stats at the process boundaries - * (during prologue and epilogue). This will enable DCGM to monitor all the PIDs while the job is - * in progress, and provide a summary of active processes and resource usage during the window of - * interest. - * @{ - */ -/***************************************************************************************************/ - -/** - * Request that DCGM start recording stats for fields that are queried with dcgmJobGetStats() - * - * Note that the first update of the field will not occur until the next field update cycle. 
- * To force a field update cycle, call dcgmUpdateAllFields(1). - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param updateFreq IN: How often to update this field in usec - * @param maxKeepAge IN: How long to keep data for this field in seconds - * @param maxKeepSamples IN: Maximum number of samples to keep. 0=no limit - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_REQUIRES_ROOT if the host engine is being run as non-root, and - * accounting mode could not be enabled (requires root). - * Run "nvidia-smi -am 1" as root on the node before starting - * DCGM to fix this. - * - */ -dcgmReturn_t dcgmWatchJobFields(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - long long updateFreq, - double maxKeepAge, - int maxKeepSamples); - -/** - * This API is used by the client to notify DCGM about the job to be started. Should be invoked as - * part of job prologue - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param jobId IN: User provided string to represent the job - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_DUPLICATE_KEY if the specified \a jobId is already in use - * - */ -dcgmReturn_t dcgmJobStartStats(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, char jobId[64]); - -/** - * This API is used by the clients to notify DCGM to stop collecting stats for the job represented - * by job id. 
Should be invoked as part of job epilogue. - * The job Id remains available to view the stats at any point but cannot be used to start a new job. - * You must call dcgmWatchJobFields() before this call to enable watching of job - * - * @param pDcgmHandle IN: DCGM Handle - * @param jobId IN: User provided string to represent the job - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_NO_DATA if \a jobId is not a valid job identifier. - * - */ -dcgmReturn_t dcgmJobStopStats(dcgmHandle_t pDcgmHandle, char jobId[64]); - -/** - * Get stats for the job identified by DCGM generated job id. The stats can be retrieved at any - * point when the job is in process. - * If you want to reuse this jobId, call \ref dcgmJobRemove after this call. - * - * @param pDcgmHandle IN: DCGM Handle - * @param jobId IN: User provided string to represent the job - * @param pJobInfo IN/OUT: Structure to return information about the job.
.version should be set to - * \ref dcgmJobInfo_version before this call. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_NO_DATA if \a jobId is not a valid job identifier. - * - \ref DCGM_ST_VER_MISMATCH if .version is not set or is invalid. - * - */ -dcgmReturn_t dcgmJobGetStats(dcgmHandle_t pDcgmHandle, char jobId[64], dcgmJobInfo_t *pJobInfo); - -/** - * This API tells DCGM to stop tracking the job given by jobId. After this call, you will no longer - * be able to call dcgmJobGetStats() on this jobId. However, you will be able to reuse jobId after - * this call. - * - * @param pDcgmHandle IN: DCGM Handle - * @param jobId IN: User provided string to represent the job - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_NO_DATA if \a jobId is not a valid job identifier. - * - */ -dcgmReturn_t dcgmJobRemove(dcgmHandle_t pDcgmHandle, char jobId[64]); - -/** - * This API tells DCGM to stop tracking all jobs. After this call, you will no longer - * be able to call dcgmJobGetStats() on any jobs until you call dcgmJobStartStats again. - * You will be able to reuse any previously-used jobIds after this call. - * - * @param pDcgmHandle IN: DCGM Handle - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - */ -dcgmReturn_t dcgmJobRemoveAll(dcgmHandle_t pDcgmHandle); - -/** @} */ // Closing for DCGMAPI_JOB_STATS - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_HM Health Monitor - * - * This chapter describes the methods that handle the GPU health monitor.
- * - * @{ - */ -/***************************************************************************************************/ - -/** - * Enable the DCGM health check system for the given systems defined in \ref dcgmHealthSystems_t - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or - * \a DCGM_GROUP_ALL_NVSWITCHES to perform operation on all the NvSwitches. - * @param systems IN: An enum representing systems that should be enabled for health checks logically OR'd - * together. Refer to \ref dcgmHealthSystems_t for details. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ -dcgmReturn_t dcgmHealthSet(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmHealthSystems_t systems); - -/** - * Enable the DCGM health check system for the given systems defined in \ref dcgmHealthSystems_t - * - * Since DCGM 2.0 - * - * @param pDcgmHandle IN: DCGM Handle - * @param params IN: Parameters to use when setting health watches. See - * \ref dcgmHealthSetParams_v2 for the description of each parameter. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - */ - -dcgmReturn_t dcgmHealthSet_v2(dcgmHandle_t pDcgmHandle, dcgmHealthSetParams_v2 *params); - -/** - * Retrieve the current state of the DCGM health check system - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or - * \a DCGM_GROUP_ALL_NVSWITCHES to perform operation on all the NvSwitches.
- * @param systems OUT: An integer representing the enabled systems for the given group Refer to - * \ref dcgmHealthSystems_t for details. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ -dcgmReturn_t dcgmHealthGet(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmHealthSystems_t *systems); - - -/** - * Check the configured watches for any errors/failures/warnings that have occurred - * since the last time this check was invoked. On the first call, stateful information - * about all of the enabled watches within a group is created but no error results are - * provided. On subsequent calls, any error information will be returned. - * - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing a collection of one or more entities. - * Refer to \ref dcgmGroupCreate for details on creating a group - * @param results OUT: A reference to the dcgmHealthResponse_t structure to populate. - * results->version must be set to dcgmHealthResponse_version. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_VER_MISMATCH if results->version is not dcgmHealthResponse_version - * - */ -dcgmReturn_t dcgmHealthCheck(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmHealthResponse_t *results); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_PO Policies - * - * This chapter describes the methods that handle system policy management and violation settings. 
- * The APIs in Policies module can be broken down into following categories: - * - * @{ - */ -/***************************************************************************************************/ - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_PO_Setup Setup and Management - * Describes APIs for setting up policies and registering callbacks to receive notification in - * case specific policy condition has been violated. - * @{ - */ -/***************************************************************************************************/ - -/** - * Set the current violation policy inside the policy manager. Given the conditions within the - * \ref dcgmPolicy_t structure, if a violation has occurred, subsequent action(s) may be performed to - * either report or contain the failure. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param policy IN: A reference to \ref dcgmPolicy_t that will be applied to all GPUs in the group. - * @param statusHandle IN/OUT: Resulting status for the operation. Pass it as NULL if the detailed error information - * is not needed. Refer to \ref dcgmStatusCreate for details on creating a status handle. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if \a groupId or \a policy is invalid - * - \ref DCGM_ST_NOT_SUPPORTED if any unsupported GPUs are part of the GPU group specified in groupId - * - DCGM_ST_* a different error has occurred and is stored in \a statusHandle. 
- * Refer to \ref dcgmReturn_t - * - */ -dcgmReturn_t dcgmPolicySet(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmPolicy_t *policy, - dcgmStatus_t statusHandle); - -/** - * Get the current violation policy inside the policy manager. Given a groupId, a number of - * policy structures are retrieved. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param count IN: The size of the policy array. This is the maximum number of policies that will be - * retrieved and ultimately should correspond to the number of GPUs specified in the - * group. - * @param policy OUT: A reference to \ref dcgmPolicy_t that will used as storage for the current policies - * applied to each GPU in the group. - * @param statusHandle IN/OUT: Resulting status for the operation. Pass it as NULL if the detailed error information - * for the operation is not needed. Refer to \ref dcgmStatusCreate for details on - * creating a status handle. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if \a groupId or \a policy is invalid - * - DCGM_ST_* a different error has occurred and is stored in \a statusHandle. - * Refer to \ref dcgmReturn_t - * - */ -dcgmReturn_t dcgmPolicyGet(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - int count, - dcgmPolicy_t *policy, - dcgmStatus_t statusHandle); - -/** - * Register a function to be called when a specific policy condition (see \ref dcgmPolicyCondition_t) has been - * violated. This callback(s) will be called automatically when in DCGM_OPERATION_MODE_AUTO mode and only after - * dcgmPolicyTrigger when in DCGM_OPERATION_MODE_MANUAL mode. All callbacks are made within a separate thread. 
- * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param condition IN: The set of conditions specified as an OR'd list (see \ref dcgmPolicyCondition_t) for - * which to register a callback function - * @param beginCallback IN: A reference to a function that should be called should a violation occur. - * This function will be called prior to any actions specified by the policy are taken. - * @param finishCallback IN: A reference to a function that should be called should a violation occur. - * This function will be called after any action specified by the policy are completed. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if \a groupId, \a condition, is invalid, \a beginCallback, or - * \a finishCallback is NULL - * - \ref DCGM_ST_NOT_SUPPORTED if any unsupported GPUs are part of the GPU group specified in groupId - * - */ -dcgmReturn_t dcgmPolicyRegister(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmPolicyCondition_t condition, - fpRecvUpdates beginCallback, - fpRecvUpdates finishCallback); - -/** - * Unregister a function to be called for a specific policy condition (see \ref dcgmPolicyCondition_t). - * This function will unregister all callbacks for a given condition and handle. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. 
- * @param condition IN: The set of conditions specified as an OR'd list (see \ref dcgmPolicyCondition_t) for - * which to unregister a callback function - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if \a groupId, \a condition, is invalid or \a callback is NULL - * - */ -dcgmReturn_t dcgmPolicyUnregister(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmPolicyCondition_t condition); - -/** @} */ // Closing for DCGMAPI_PO_Setup - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_PO_MI Manual Invocation - * Describes APIs which can be used to perform direct actions (e.g. Perform GPU Reset, Run Health - * Diagnostics) on a group of GPUs. - * @{ - */ -/***************************************************************************************************/ - -/** - * Inform the action manager to perform a manual validation of a group of GPUs on the system - * - * *************************************** DEPRECATED *************************************** - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for - * details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param validate IN: The validation to perform after the action. - * @param response OUT: Result of the validation process. Refer to \ref dcgmDiagResponse_t for details. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_SUPPORTED if running the specified \a validate is not supported. This is usually due - * to the Tesla recommended driver not being installed on the system. 
- * - \ref DCGM_ST_BADPARAM if \a groupId, \a validate, or \a statusHandle is invalid - * - \ref DCGM_ST_GENERIC_ERROR an internal error has occurred - * - \ref DCGM_ST_GROUP_INCOMPATIBLE if \a groupId refers to a group of non-homogeneous GPUs. This is currently - * not allowed. - * - */ -dcgmReturn_t dcgmActionValidate(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmPolicyValidation_t validate, - dcgmDiagResponse_t *response); - -/** - * Inform the action manager to perform a manual validation of a group of GPUs on the system - * - * @param pDcgmHandle IN: DCGM Handle - * @param drd IN: Contains the group id, test names, test parameters, struct version, and the validation - * that should be performed. Look at \ref dcgmGroupCreate for details on creating the - * group. Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS to perform - * operation on all the GPUs. - * @param response OUT: Result of the validation process. Refer to \ref dcgmDiagResponse_t for details. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_SUPPORTED if running the specified \a validate is not supported. This is usually - * due to the Tesla recommended driver not being installed on the system. - * - \ref DCGM_ST_BADPARAM if \a groupId, \a validate, or \a statusHandle is invalid - * - \ref DCGM_ST_GENERIC_ERROR an internal error has occurred - * - \ref DCGM_ST_GROUP_INCOMPATIBLE if \a groupId refers to a group of non-homogeneous GPUs. This is - * currently not allowed. - */ -dcgmReturn_t dcgmActionValidate_v2(dcgmHandle_t pDcgmHandle, dcgmRunDiag_v7 *drd, dcgmDiagResponse_t *response); - -/** - * Run a diagnostic on a group of GPUs - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate - * for details on creating the group. Alternatively, pass in the group id as - * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. 
- * @param diagLevel IN: Diagnostic level to run - * @param diagResponse IN/OUT: Result of running the DCGM diagnostic.
- * .version should be set to \ref dcgmDiagResponse_version before this call. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_SUPPORTED if running the diagnostic is not supported. This is usually due to the - * Tesla recommended driver not being installed on the system. - * - \ref DCGM_ST_BADPARAM if a provided parameter is invalid or missing - * - \ref DCGM_ST_GENERIC_ERROR an internal error has occurred - * - \ref DCGM_ST_GROUP_INCOMPATIBLE if \a groupId refers to a group of non-homogeneous GPUs. This is - * currently not allowed. - * - \ref DCGM_ST_VER_MISMATCH if .version is not set or is invalid. - * - */ -dcgmReturn_t dcgmRunDiagnostic(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmDiagnosticLevel_t diagLevel, - dcgmDiagResponse_t *diagResponse); - -/** @} */ // Closing for DCGMAPI_PO_MI - -/** @} */ // Closing for DCGMAPI_PO - -/***************************************************************************************************/ -/** @addtogroup DCGMAPI_Admin_ExecCtrl - * @{ - */ -/***************************************************************************************************/ - -/** - * Inform the policy manager loop to perform an iteration and trigger the callbacks of any - * registered functions. Callback functions will be called from a separate thread as the calling function. - * - * Note: The GPU monitoring and management agent must call this method periodically if the operation - * mode is set to manual mode (DCGM_OPERATION_MODE_MANUAL) during initialization - * (\ref dcgmInit). - * - * @param pDcgmHandle IN: DCGM Handle - * - * @return - * - \ref DCGM_ST_OK If the call was successful - * - DCGM_ST_GENERIC_ERROR The policy manager was unable to perform another iteration. 
- */ -dcgmReturn_t dcgmPolicyTrigger(dcgmHandle_t pDcgmHandle); - -/** @} */ // Closing for DCGMAPI_Admin_ExecCtrl - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_Topo Topology - * @{ - */ -/***************************************************************************************************/ - -/** - * Gets device topology corresponding to the \a gpuId. - * - * @param pDcgmHandle IN: DCGM Handle - * @param gpuId IN: GPU Id corresponding to which topology information should be fetched - * @param pDcgmDeviceTopology IN/OUT: Topology information corresponding to \a gpuId. pDcgmDeviceTopology->version must - * be set to dcgmDeviceTopology_version before this call. - * @return - * - \ref DCGM_ST_OK if the call was successful. - * - \ref DCGM_ST_BADPARAM if \a gpuId or \a pDcgmDeviceTopology were not valid. - * - \ref DCGM_ST_VER_MISMATCH if pDcgmDeviceTopology->version was not set to dcgmDeviceTopology_version. - * - */ -dcgmReturn_t DECLDIR dcgmGetDeviceTopology(dcgmHandle_t pDcgmHandle, - unsigned int gpuId, - dcgmDeviceTopology_t *pDcgmDeviceTopology); - -/** - * Gets group topology corresponding to the \a groupId. - * - * @param pDcgmHandle IN: DCGM Handle - * @param groupId IN: GroupId corresponding to which topology information should be fetched - * @param pDcgmGroupTopology IN/OUT: Topology information corresponding to \a groupId. pDcgmgroupTopology->version must - * be set to dcgmGroupTopology_version. - * @return - * - \ref DCGM_ST_OK if the call was successful. - * - \ref DCGM_ST_BADPARAM if \a groupId or \a pDcgmGroupTopology were not valid. - * - \ref DCGM_ST_VER_MISMATCH if pDcgmgroupTopology->version was not set to dcgmGroupTopology_version. 
- * - */ -dcgmReturn_t DECLDIR dcgmGetGroupTopology(dcgmHandle_t pDcgmHandle, - dcgmGpuGrp_t groupId, - dcgmGroupTopology_t *pDcgmGroupTopology); - -/** @} */ // Closing for DCGMAPI_Topo - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_METADATA Metadata - * @{ - * This chapter describes the methods that query for DCGM metadata. - */ -/***************************************************************************************************/ - -/** - * Toggle the state of introspection metadata gathering in DCGM. Metadata gathering will increase the memory usage - * of DCGM so that it can store the metadata it gathers. - * - * @param pDcgmHandle IN: DCGM Handle - * @param enabledState IN: The state to set gathering of introspection data to - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM enabledState is an invalid state for metadata gathering - * - */ -dcgmReturn_t DECLDIR dcgmIntrospectToggleState(dcgmHandle_t pDcgmHandle, dcgmIntrospectState_t enabledState); - -/*************************************************************************/ -/** - * Get the current amount of memory used to store the given field collection. - * - * @param pDcgmHandle IN: DCGM Handle - * @param context IN: see \ref dcgmIntrospectContext_t. This identifies the level of fields to do - * introspection for (ex: all fields, field groups) context->version must be - * set to dcgmIntrospectContext_version prior to this call. - * @param memoryInfo IN/OUT: see \ref dcgmIntrospectFullMemory_t. memoryInfo->version must be set to - * dcgmIntrospectFullMemory_version prior to this call. - * @param waitIfNoData IN: if no metadata has been gathered, should this call block until data has been - * gathered (1), or should this call just return DCGM_ST_NO_DATA (0). 
- * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_CONFIGURED if metadata gathering state is \a DCGM_INTROSPECT_STATE_DISABLED - * - \ref DCGM_ST_NO_DATA if \a waitIfNoData is false and metadata has not been gathered yet - * - \ref DCGM_ST_VER_MISMATCH if context->version or memoryInfo->version is 0 or invalid. - * - */ -dcgmReturn_t DECLDIR dcgmIntrospectGetFieldsMemoryUsage(dcgmHandle_t pDcgmHandle, - dcgmIntrospectContext_t *context, - dcgmIntrospectFullMemory_t *memoryInfo, - int waitIfNoData); - - -/*************************************************************************/ -/** - * Retrieve the total amount of memory that the hostengine process is currently using. - * This measurement represents both the resident set size (what is currently in RAM) and - * the swapped memory that belongs to the process. - * - * @param pDcgmHandle IN: DCGM Handle - * @param memoryInfo IN/OUT: see \ref dcgmIntrospectMemory_t. memoryInfo->version must be set to - * dcgmIntrospectMemory_version prior to this call. - * @param waitIfNoData IN: if no metadata is gathered wait till this occurs (!0) or return DCGM_ST_NO_DATA (0) - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_CONFIGURED if metadata gathering state is \a DCGM_INTROSPECT_STATE_DISABLED - * - \ref DCGM_ST_NO_DATA if \a waitIfNoData is false and metadata has not been gathered yet - * - \ref DCGM_ST_VER_MISMATCH if memoryInfo->version is 0 or invalid. - * - */ -dcgmReturn_t DECLDIR dcgmIntrospectGetHostengineMemoryUsage(dcgmHandle_t pDcgmHandle, - dcgmIntrospectMemory_t *memoryInfo, - int waitIfNoData); - -/*************************************************************************/ -/** - * Get introspection info relating to execution time needed to update the fields - * identified by \a context. - * - * @param pDcgmHandle IN: DCGM Handle - * @param context IN: see \ref dcgmIntrospectContext_t. 
This identifies the level of fields to do - * introspection for (ex: all fields, field group ) context->version must be set to - * dcgmIntrospectContext_version prior to this call. - * @param execTime IN/OUT: see \ref dcgmIntrospectFullFieldsExecTime_t. execTime->version must be set to - * dcgmIntrospectFullFieldsExecTime_version prior to this call. - * @param waitIfNoData IN: if no metadata is gathered, wait until data has been gathered (1) or return - * DCGM_ST_NO_DATA (0) - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_CONFIGURED if metadata gathering state is \a DCGM_INTROSPECT_STATE_DISABLED - * - \ref DCGM_ST_NO_DATA if \a waitIfNoData is false and metadata has not been gathered yet - * - \ref DCGM_ST_VER_MISMATCH if context->version or execTime->version is 0 or invalid. - * - */ -dcgmReturn_t DECLDIR dcgmIntrospectGetFieldsExecTime(dcgmHandle_t pDcgmHandle, - dcgmIntrospectContext_t *context, - dcgmIntrospectFullFieldsExecTime_t *execTime, - int waitIfNoData); - -/*************************************************************************/ -/** - * Retrieve the CPU utilization of the DCGM hostengine process. - * - * @param pDcgmHandle IN: DCGM Handle - * @param cpuUtil IN/OUT: see \ref dcgmIntrospectCpuUtil_t. cpuUtil->version must be set to - * dcgmIntrospectCpuUtil_version prior to this call. - * @param waitIfNoData IN: if no metadata is gathered wait till this occurs (!0) or return DCGM_ST_NO_DATA (0) - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_NOT_CONFIGURED if metadata gathering state is \a DCGM_INTROSPECT_STATE_DISABLED - * - \ref DCGM_ST_NO_DATA if \a waitIfNoData is false and metadata has not been gathered yet - * - \ref DCGM_ST_VER_MISMATCH if cpuUtil->version or execTime->version is 0 or invalid. 
- * - */ -dcgmReturn_t DECLDIR dcgmIntrospectGetHostengineCpuUtilization(dcgmHandle_t pDcgmHandle, - dcgmIntrospectCpuUtil_t *cpuUtil, - int waitIfNoData); - -/*************************************************************************/ -/** - * This method is used to manually tell the the introspection module to update - * all DCGM introspection data. This is normally performed automatically on an - * interval of 1 second. - * - * @param pDcgmHandle IN: DCGM Handle - * @param waitForUpdate IN: Whether or not to wait for the update loop to complete before returning to the caller - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if \a waitForUpdate is invalid - * - */ -dcgmReturn_t DECLDIR dcgmIntrospectUpdateAll(dcgmHandle_t pDcgmHandle, int waitForUpdate); - -/** @} */ // Closing for DCGMAPI_METADATA - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_TOPOLOGY Topology - * @{ - * This chapter describes the methods that query for DCGM topology information. - */ -/***************************************************************************************************/ - -/*************************************************************************/ -/** - * Get the best group of gpus from the specified bitmask according to topological proximity: cpuAffinity, NUMA - * node, and NVLink. - * - * @param pDcgmHandle IN: DCGM Handle - * @param inputGpuIds IN: a bitmask of which GPUs DCGM should consider. If some of the GPUs on the system are - * already in use, they shouldn't be included in the bitmask. 0 means that all of the GPUs - * in the system should be considered. - * @param numGpus IN: the number of GPUs that are desired from inputGpuIds. If this number is greater than - * the number of healthy GPUs in inputGpuIds, then less than numGpus gpus will be - * specified in outputGpuIds. 
- * @param outputGpuIds OUT: a bitmask of numGpus or fewer GPUs from inputGpuIds that represent the best placement - * available from inputGpuIds. - * @param hintFlags IN: a bitmask of DCGM_TOPO_HINT_F_ #defines of hints that should be taken into account when - * assigning outputGpuIds. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - */ -dcgmReturn_t DECLDIR dcgmSelectGpusByTopology(dcgmHandle_t pDcgmHandle, - uint64_t inputGpuIds, - uint32_t numGpus, - uint64_t *outputGpuIds, - uint64_t hintFlags); - -/** @} */ // Closing for DCGMAPI_TOPOLOGY - -/***************************************************************************************************/ -/** @defgroup DCGMAPI_MODULES Modules - * @{ - * This chapter describes the methods that query and configure DCGM modules. - */ -/***************************************************************************************************/ - -/*************************************************************************/ -/** - * Set a module to be blacklisted. This module will be prevented from being loaded - * if it hasn't been loaded already. Modules are lazy-loaded as they are used by - * DCGM APIs, so it's important to call this API soon after the host engine has been started. - * You can also pass --blacklist-modules to the nv-hostengine binary to make sure modules - * get blacklisted immediately after the host engine starts up. - * - * @param pDcgmHandle IN: DCGM Handle - * @param moduleId IN: ID of the module to blacklist. Use \ref dcgmModuleGetStatuses to get a list of valid - * module IDs. - * - * @return - * - \ref DCGM_ST_OK if the module has been blacklisted. - * - \ref DCGM_ST_IN_USE if the module has already been loaded and cannot be blacklisted. - * - \ref DCGM_ST_BADPARAM if a parameter is missing or bad. 
- * - */ -dcgmReturn_t DECLDIR dcgmModuleBlacklist(dcgmHandle_t pDcgmHandle, dcgmModuleId_t moduleId); - -/*************************************************************************/ -/** - * Get the status of all of the DCGM modules. - * - * @param pDcgmHandle IN: DCGM Handle - * @param moduleStatuses OUT: Module statuses.
- * .version should be set to dcgmModuleStatuses_version upon calling. - * - * @return - * - \ref DCGM_ST_OK if the request succeeds. - * - \ref DCGM_ST_BADPARAM if a parameter is missing or bad. - * - */ -dcgmReturn_t DECLDIR dcgmModuleGetStatuses(dcgmHandle_t pDcgmHandle, dcgmModuleGetStatuses_t *moduleStatuses); - -/** @} */ // Closing for DCGMAPI_MODULES - -/*************************************************************************/ -/** @defgroup DCGMAPI_PROFILING Profiling - * @{ - * This chapter describes the methods that watch profiling fields from within DCGM. - */ -/*************************************************************************/ - -/*************************************************************************/ -/** - * Get all of the profiling metric groups for a given GPU group. - * - * Profiling metrics are watched in groups of fields that are all watched together. For instance, if you want - * to watch DCGM_FI_PROF_GR_ENGINE_ACTIVITY, this might also be in the same group as DCGM_FI_PROF_SM_EFFICIENCY. - * Watching this group would result in DCGM storing values for both of these metrics. - * - * Some groups cannot be watched concurrently as others as they utilize the same hardware resource. For instance, - * you may not be able to watch DCGM_FI_PROF_TENSOR_OP_UTIL at the same time as DCGM_FI_PROF_GR_ENGINE_ACTIVITY - * on your hardware. At the same time, you may be able to watch DCGM_FI_PROF_TENSOR_OP_UTIL at the same time as - * DCGM_FI_PROF_NVLINK_TX_DATA. - * - * Metrics that can be watched concurrently will have different .majorId fields in their dcgmProfMetricGroupInfo_t - * - * See \ref dcgmGroupCreate for details on creating a GPU group - * See \ref dcgmProfWatchFields to actually watch a metric group - * - * @param pDcgmHandle IN: DCGM Handle - * @param metricGroups IN/OUT: Metric groups supported for metricGroups->groupId.
- * metricGroups->version should be set to dcgmProfGetMetricGroups_version upon calling. - * - * @return - * - \ref DCGM_ST_OK if the request succeeds. - * - \ref DCGM_ST_BADPARAM if a parameter is missing or bad. - * - \ref DCGM_ST_GROUP_INCOMPATIBLE if metricGroups->groupId's GPUs are not identical GPUs. - * - \ref DCGM_ST_NOT_SUPPORTED if profiling metrics are not supported for the given GPU group. - * - */ -dcgmReturn_t DECLDIR dcgmProfGetSupportedMetricGroups(dcgmHandle_t pDcgmHandle, - dcgmProfGetMetricGroups_t *metricGroups); - -/** - * Request that DCGM start recording updates for a given list of profiling field IDs. - * - * Once metrics have been watched by this API, any of the normal DCGM field-value retrieval APIs can be used on - * the underlying fieldIds of this metric group. See \ref dcgmGetLatestValues_v2, \ref dcgmGetLatestValuesForFields, - * \ref dcgmEntityGetLatestValues, and \ref dcgmEntitiesGetLatestValues. - * - * @param pDcgmHandle IN: DCGM Handle - * @param watchFields IN: Details of which metric groups to watch for which GPUs. See \ref dcgmProfWatchFields_v1 - * for details of what should be put in each struct member. watchFields->version should be - * set to dcgmProfWatchFields_version upon calling. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - \ref DCGM_ST_NOT_SUPPORTED if profiling metric group metricGroupTag is not supported for the given - * GPU group. - * - \ref DCGM_ST_GROUP_INCOMPATIBLE if groupId's GPUs are not identical GPUs. Profiling metrics are only - * support for homogenous groups of GPUs. 
- * - \ref DCGM_ST_PROFILING_MULTI_PASS if any of the metric groups could not be watched concurrently due to - * requiring the hardware to gather them with multiple passes - * - */ -dcgmReturn_t dcgmProfWatchFields(dcgmHandle_t pDcgmHandle, dcgmProfWatchFields_t *watchFields); - -/** - * Request that DCGM stop recording updates for all profiling field IDs for all GPUs - * - * @param pDcgmHandle IN: DCGM Handle - * @param unwatchFields IN: Details of which metric groups to unwatch for which GPUs. See \ref - * dcgmProfUnwatchFields_v1 for details of what should be put in each struct member. - * unwatchFields->version should be set to dcgmProfUnwatchFields_version upon calling. - * - * @return - * - \ref DCGM_ST_OK if the call was successful - * - \ref DCGM_ST_BADPARAM if a parameter is invalid - * - */ -dcgmReturn_t dcgmProfUnwatchFields(dcgmHandle_t pDcgmHandle, dcgmProfUnwatchFields_t *unwatchFields); - -/** - * Pause profiling activities in DCGM. This should be used when you are monitoring profiling fields - * from DCGM but want to be able to still run developer tools like nvprof, nsight systems, and nsight compute. - * Profiling fields start with DCGM_PROF_ and are in the field ID range 1001-1012. - * - * Call this API before you launch one of those tools and dcgmProfResume() after the tool has completed. - * - * DCGM will save BLANK values while profiling is paused. - * - * Calling this while profiling activities are already paused is fine and will be treated as a no-op. - * - * @param pDcgmHandle IN: DCGM Handle - * - * @return - * - \ref DCGM_ST_OK If the call was successful. - * - \ref DCGM_ST_BADPARAM if a parameter is invalid. - * - */ -dcgmReturn_t dcgmProfPause(dcgmHandle_t pDcgmHandle); - -/** - * Resume profiling activities in DCGM that were previously paused with dcgmProfPause(). - * - * Call this API after you have completed running other NVIDIA developer tools to reenable DCGM - * profiling metrics. 
- * - * DCGM will save BLANK values while profiling is paused. - * - * Calling this while profiling activities have already been resumed is fine and will be treated as a no-op. - * - * @param pDcgmHandle IN: DCGM Handle - * - * @return - * - \ref DCGM_ST_OK If the call was successful. - * - \ref DCGM_ST_BADPARAM if a parameter is invalid. - * - */ -dcgmReturn_t dcgmProfResume(dcgmHandle_t pDcgmHandle); - -/** @} */ // Closing for DCGMAPI_PROFILING - -/** - * Adds fake GPU instances and or compute instances for testing purposes. The entity IDs specified for - * the GPU instances and compute instances are only guaranteed to be used by DCGM if MIG mode is not active. - * - * NOTE: this API will not work on a real system reading actual values from NVML, and it may even cause - * the real instances to malfunction. This API is for testing purposes only. - * - * @param pDcgmHandle IN: DCGM Handle - * @param hierarchy - * - * @return - * - \ref DCGM_ST_OK - * - */ -dcgmReturn_t dcgmAddFakeInstances(dcgmHandle_t pDcgmHandle, dcgmMigHierarchy_v1 *hierarchy); - -#ifdef __cplusplus -} -#endif - -#endif /* DCGM_AGENT_H */ diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/dcgm_errors.h b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/dcgm_errors.h deleted file mode 100644 index edce9eab..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/dcgm_errors.h +++ /dev/null @@ -1,474 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef DCGM_ERRORS_H -#define DCGM_ERRORS_H - -/* - * Error codes for passive and active health checks. - * New error codes must be added to end of enum to maintain backwards compatibility. - */ -typedef enum dcgmError_enum -{ - DCGM_FR_OK = 0, //!< No error - DCGM_FR_UNKNOWN = 1, //!< Unknown error code - DCGM_FR_UNRECOGNIZED = 2, //!< Unrecognized error code - DCGM_FR_PCI_REPLAY_RATE = 3, //!< Unacceptable rate of PCI errors - DCGM_FR_VOLATILE_DBE_DETECTED = 4, //!< Uncorrectable volatile double bit error - DCGM_FR_VOLATILE_SBE_DETECTED = 5, //!< Unacceptable rate of volatile single bit errors - DCGM_FR_PENDING_PAGE_RETIREMENTS = 6, //!< Pending page retirements detected - DCGM_FR_RETIRED_PAGES_LIMIT = 7, //!< Unacceptable total page retirements detected - DCGM_FR_RETIRED_PAGES_DBE_LIMIT = 8, //!< Unacceptable total page retirements due to uncorrectable errors - DCGM_FR_CORRUPT_INFOROM = 9, //!< Corrupt inforom found - DCGM_FR_CLOCK_THROTTLE_THERMAL = 10, //!< Clocks being throttled due to overheating - DCGM_FR_POWER_UNREADABLE = 11, //!< Cannot get a reading for power from NVML - DCGM_FR_CLOCK_THROTTLE_POWER = 12, //!< Clock being throttled due to power restrictions - DCGM_FR_NVLINK_ERROR_THRESHOLD = 13, //!< Unacceptable rate of NVLink errors - DCGM_FR_NVLINK_DOWN = 14, //!< NVLink is down - DCGM_FR_NVSWITCH_FATAL_ERROR = 15, //!< Fatal errors on the NVSwitch - DCGM_FR_NVSWITCH_NON_FATAL_ERROR = 16, //!< Non-fatal errors on the NVSwitch - DCGM_FR_NVSWITCH_DOWN = 17, //!< NVSwitch is down - DCGM_FR_NO_ACCESS_TO_FILE = 18, //!< Cannot access a file - DCGM_FR_NVML_API = 19, //!< Error occurred on an NVML API - DCGM_FR_DEVICE_COUNT_MISMATCH = 20, //!< Disagreement in GPU count between /dev and NVML - DCGM_FR_BAD_PARAMETER = 21, //!< Bad parameter passed to API - DCGM_FR_CANNOT_OPEN_LIB = 22, //!< Cannot open a library that must be accessed - 
DCGM_FR_BLACKLISTED_DRIVER = 23, //!< A blacklisted driver (nouveau) is active - DCGM_FR_NVML_LIB_BAD = 24, //!< The NVML library is missing expected functions - DCGM_FR_GRAPHICS_PROCESSES = 25, //!< Graphics processes are active on this GPU - DCGM_FR_HOSTENGINE_CONN = 26, //!< Unstable connection to nv-hostengine (daemonized DCGM) - DCGM_FR_FIELD_QUERY = 27, //!< Error querying a field from DCGM - DCGM_FR_BAD_CUDA_ENV = 28, //!< The environment has variables that hurt CUDA - DCGM_FR_PERSISTENCE_MODE = 29, //!< Persistence mode is disabled - DCGM_FR_LOW_BANDWIDTH = 30, //!< The bandwidth is unacceptably low - DCGM_FR_HIGH_LATENCY = 31, //!< Latency is too high - DCGM_FR_CANNOT_GET_FIELD_TAG = 32, //!< Cannot find a tag for a field - DCGM_FR_FIELD_VIOLATION = 33, //!< The value for the specified error field is above 0 - DCGM_FR_FIELD_THRESHOLD = 34, //!< The value for the specified field is above the threshold - DCGM_FR_FIELD_VIOLATION_DBL = 35, //!< The value for the specified error field is above 0 - DCGM_FR_FIELD_THRESHOLD_DBL = 36, //!< The value for the specified field is above the threshold - DCGM_FR_UNSUPPORTED_FIELD_TYPE = 37, //!< Field type cannot be supported - DCGM_FR_FIELD_THRESHOLD_TS = 38, //!< The value for the specified field is above the threshold - DCGM_FR_FIELD_THRESHOLD_TS_DBL = 39, //!< The value for the specified field is above the threshold - DCGM_FR_THERMAL_VIOLATIONS = 40, //!< Thermal violations detected - DCGM_FR_THERMAL_VIOLATIONS_TS = 41, //!< Thermal violations detected with a timestamp - DCGM_FR_TEMP_VIOLATION = 42, //!< Temperature is too high - DCGM_FR_THROTTLING_VIOLATION = 43, //!< Non-benign clock throttling is occurring - DCGM_FR_INTERNAL = 44, //!< An internal error was detected - DCGM_FR_PCIE_GENERATION = 45, //!< PCIe generation is too low - DCGM_FR_PCIE_WIDTH = 46, //!< PCIe width is too low - DCGM_FR_ABORTED = 47, //!< Test was aborted by a user signal - DCGM_FR_TEST_DISABLED = 48, //!< This test is disabled for this GPU - 
DCGM_FR_CANNOT_GET_STAT = 49, //!< Cannot get telemetry for a needed value - DCGM_FR_STRESS_LEVEL = 50, //!< Stress level is too low (bad performance) - DCGM_FR_CUDA_API = 51, //!< Error calling the specified CUDA API - DCGM_FR_FAULTY_MEMORY = 52, //!< Faulty memory detected on this GPU - DCGM_FR_CANNOT_SET_WATCHES = 53, //!< Unable to set field watches in DCGM - DCGM_FR_CUDA_UNBOUND = 54, //!< CUDA context is no longer bound - DCGM_FR_ECC_DISABLED = 55, //!< ECC memory is disabled right now - DCGM_FR_MEMORY_ALLOC = 56, //!< Cannot allocate memory on the GPU - DCGM_FR_CUDA_DBE = 57, //!< CUDA detected unrecovable double-bit error - DCGM_FR_MEMORY_MISMATCH = 58, //!< Memory error detected - DCGM_FR_CUDA_DEVICE = 59, //!< No CUDA device discoverable for existing GPU - DCGM_FR_ECC_UNSUPPORTED = 60, //!< ECC memory is unsupported by this SKU - DCGM_FR_ECC_PENDING = 61, //!< ECC memory is in a pending state - DCGM_FR_MEMORY_BANDWIDTH = 62, //!< Memory bandwidth is too low - DCGM_FR_TARGET_POWER = 63, //!< Cannot hit the target power draw - DCGM_FR_API_FAIL = 64, //!< The specified API call failed - DCGM_FR_API_FAIL_GPU = 65, //!< The specified API call failed for the specified GPU - DCGM_FR_CUDA_CONTEXT = 66, //!< Cannot create a CUDA context on this GPU - DCGM_FR_DCGM_API = 67, //!< DCGM API failure - DCGM_FR_CONCURRENT_GPUS = 68, //!< Need multiple GPUs to run this test - DCGM_FR_TOO_MANY_ERRORS = 69, //!< More errors than fit in the return struct - DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD = 70, //!< More than 100 CRC errors are happening per second - DCGM_FR_NVLINK_ERROR_CRITICAL = 71, //!< NVLink error for a field that should always be 0 - DCGM_FR_ENFORCED_POWER_LIMIT = 72, //!< The enforced power limit is too low to hit the target - DCGM_FR_MEMORY_ALLOC_HOST = 73, //!< Cannot allocate memory on the host - DCGM_FR_GPU_OP_MODE = 74, //!< Bad GPU operating mode for running plugin - DCGM_FR_NO_MEMORY_CLOCKS = 75, //!< No memory clocks with the needed MHz were found - 
DCGM_FR_NO_GRAPHICS_CLOCKS = 76, //!< No graphics clocks with the needed MHz were found - DCGM_FR_HAD_TO_RESTORE_STATE = 77, //!< Note that we had to restore a GPU's state - DCGM_FR_L1TAG_UNSUPPORTED = 78, //!< L1TAG test is unsupported by this SKU - DCGM_FR_L1TAG_MISCOMPARE = 79, //!< L1TAG test failed on a miscompare - DCGM_FR_ROW_REMAP_FAILURE = 80, //!< Row remapping failed (Ampere or newer GPUs) - DCGM_FR_UNCONTAINED_ERROR = 81, //!< Uncontained error - XID 95 - DCGM_FR_EMPTY_GPU_LIST = 82, //!< No GPU information given to plugin - DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS = 83, //!< Pending page retirements due to a DBE - DCGM_FR_ERROR_SENTINEL = 84, //!< MUST BE THE LAST ERROR CODE -} dcgmError_t; - -typedef enum dcgmErrorSeverity_enum -{ - DCGM_ERROR_MONITOR = 0, //!< Can perform workload, but needs to be monitored. - DCGM_ERROR_ISOLATE = 1, //!< Cannot perform workload. GPU should be isolated. - DCGM_ERROR_UNKNOWN = 2, //!< This error code is not recognized -} dcgmErrorSeverity_t; - -typedef struct -{ - dcgmError_t errorId; - const char *msgFormat; - const char *suggestion; - int severity; -} dcgm_error_meta_t; - -extern dcgm_error_meta_t dcgmErrorMeta[]; - - -/* Standard message for running a field diagnostic */ -#define TRIAGE_RUN_FIELD_DIAG_MSG "Run a field diagnostic on the GPU." -#define DEBUG_COOLING_MSG \ - "Verify that the cooling on this machine is functional, including external, " \ - "thermal material interface, fans, and any other components." - -/* - * Messages for the error codes. All messages must be defined in the ERROR_CODE_MSG format - * where is the actual message. - */ -#define DCGM_FR_OK_MSG "The operation completed successfully." -#define DCGM_FR_UNKNOWN_MSG "Unknown error." -#define DCGM_FR_UNRECOGNIZED_MSG "Unrecognized error code." 
-// replay limit, gpu id, replay errors detected -#define DCGM_FR_PCI_REPLAY_RATE_MSG "Detected more than %u PCIe replays per minute for GPU %u : %d" -// dbes deteced, gpu id -#define DCGM_FR_VOLATILE_DBE_DETECTED_MSG "Detected %d volatile double-bit ECC error(s) in GPU %u." -// sbe limit, gpu id, sbes detected -#define DCGM_FR_VOLATILE_SBE_DETECTED_MSG "More than %u single-bit ECC error(s) detected in GPU %u Volatile SBEs: %lld" -// gpu id -#define DCGM_FR_PENDING_PAGE_RETIREMENTS_MSG "A pending retired page has been detected in GPU %u." -// retired pages detected, gpud id -#define DCGM_FR_RETIRED_PAGES_LIMIT_MSG "%u or more retired pages have been detected in GPU %u. " -// retired pages due to dbes detected, gpu id -#define DCGM_FR_RETIRED_PAGES_DBE_LIMIT_MSG \ - "An excess of %u retired pages due to DBEs have been detected and" \ - " more than one page has been retired due to DBEs in the past" \ - " week in GPU %u." -// gpu id -#define DCGM_FR_CORRUPT_INFOROM_MSG "A corrupt InfoROM has been detected in GPU %u." -// gpu id -#define DCGM_FR_CLOCK_THROTTLE_THERMAL_MSG "Detected clock throttling due to thermal violation in GPU %u." -// gpu id -#define DCGM_FR_POWER_UNREADABLE_MSG "Cannot reliably read the power usage for GPU %u." -// gpu id -#define DCGM_FR_CLOCK_THROTTLE_POWER_MSG "Detected clock throttling due to power violation in GPU %u." 
-// nvlink errors detected, nvlink id, error threshold -#define DCGM_FR_NVLINK_ERROR_THRESHOLD_MSG \ - "Detected %ld %s NvLink errors on GPU %u's NVLink which exceeds " \ - "threshold of %u" -// gpu id, nvlink id -#define DCGM_FR_NVLINK_DOWN_MSG "GPU %u's NvLink link %d is currently down" -// nvswitch id, nvlink id -#define DCGM_FR_NVSWITCH_FATAL_ERROR_MSG "Detected fatal errors on NvSwitch %u link %u" -// nvswitch id, nvlink id -#define DCGM_FR_NVSWITCH_NON_FATAL_ERROR_MSG "Detected nonfatal errors on NvSwitch %u link %u" -// nvswitch id, nvlink port -#define DCGM_FR_NVSWITCH_DOWN_MSG "NvSwitch physical ID %u's NvLink port %d is currently down." -// file path, error detail -#define DCGM_FR_NO_ACCESS_TO_FILE_MSG "File %s could not be accessed directly: %s" -// purpose for communicating with NVML, NVML error as string, NVML error -#define DCGM_FR_NVML_API_MSG "Error calling NVML API %s: %s" -#define DCGM_FR_DEVICE_COUNT_MISMATCH_MSG \ - "The number of devices NVML returns is different than the number " \ - "of devices in /dev." -// function name -#define DCGM_FR_BAD_PARAMETER_MSG "Bad parameter to function %s cannot be processed" -// library name, error returned from dlopen -#define DCGM_FR_CANNOT_OPEN_LIB_MSG "Cannot open library %s: '%s'" -// the name of the blacklisted driver -#define DCGM_FR_BLACKLISTED_DRIVER_MSG "Found blacklisted driver: %s" -// the name of the function that wasn't found -#define DCGM_FR_NVML_LIB_BAD_MSG "Cannot get pointer to %s from libnvidia-ml.so" -#define DCGM_FR_GRAPHICS_PROCESSES_MSG \ - "NVVS has detected processes with graphics contexts open running on at least one " \ - "GPU. This may cause some tests to fail." 
-// error message from the API call -#define DCGM_FR_HOSTENGINE_CONN_MSG "Could not connect to the host engine: '%s'" -// field name, gpu id -#define DCGM_FR_FIELD_QUERY_MSG "Could not query field %s for GPU %u" -// environment variable name -#define DCGM_FR_BAD_CUDA_ENV_MSG "Found CUDA performance-limiting environment variable '%s'." -// gpu id -#define DCGM_FR_PERSISTENCE_MODE_MSG \ - "Persistence mode for GPU %u is currently disabled. The DCGM " \ - "diagnostic requires peristence mode to be enabled." -// gpu id, direction (d2h, e.g.), measured bandwidth, expected bandwidth -#define DCGM_FR_LOW_BANDWIDTH_MSG \ - "Bandwidth of GPU %u in direction %s of %.2f did not exceed " \ - "minimum required bandwidth of %.2f." -// gpu id, direction (d2h, e.g.), measured latency, expected latency -#define DCGM_FR_HIGH_LATENCY_MSG \ - "Latency type %s of GPU %u value %.2f exceeded maximum allowed " \ - "latency of %.2f." -// field id -#define DCGM_FR_CANNOT_GET_FIELD_TAG_MSG "Unable to get field information for field id %hu" -// field value, field name, gpu id (this message is for fields that should always have a 0 value) -#define DCGM_FR_FIELD_VIOLATION_MSG "Detected %ld %s for GPU %u" -// field value, field name, gpu id, allowable threshold -#define DCGM_FR_FIELD_THRESHOLD_MSG "Detected %ld %s for GPU %u which is above the threshold %ld" -// field value, field name, gpu id (same as DCGM_FR_FIELD_VIOLATION, but it's a double) -#define DCGM_FR_FIELD_VIOLATION_DBL_MSG "Detected %.1f %s for GPU %u" -// field value, field name, gpu id, allowable threshold (same as DCGM_FR_FIELD_THRESHOLD, but it's a double) -#define DCGM_FR_FIELD_THRESHOLD_DBL_MSG "Detected %.1f %s for GPU %u which is above the threshold %.1f" -// field name -#define DCGM_FR_UNSUPPORTED_FIELD_TYPE_MSG \ - "Field %s is not supported by this API because it is neither an " \ - "int64 nor a double type." 
-// field name, allowable threshold, observed value, seconds -#define DCGM_FR_FIELD_THRESHOLD_TS_MSG \ - "%s met or exceeded the threshold of %lu per second: %lu at " \ - "%.1f seconds into the test." -// field name, allowable threshold, observed value, seconds (same as DCGM_FR_FIELD_THRESHOLD, but it's a double) -#define DCGM_FR_FIELD_THRESHOLD_TS_DBL_MSG \ - "%s met or exceeded the threshold of %.1f per second: %.1f at " \ - "%.1f seconds into the test." -// total seconds of violation, gpu id -#define DCGM_FR_THERMAL_VIOLATIONS_MSG "There were thermal violations totaling %lu seconds for GPU %u" -// total seconds of violations, first instance, gpu id -#define DCGM_FR_THERMAL_VIOLATIONS_TS_MSG \ - "Thermal violations totaling %lu samples started at %.1f seconds " \ - "into the test for GPU %u" -// observed temperature, gpu id, max allowed temperature -#define DCGM_FR_TEMP_VIOLATION_MSG \ - "Temperature %lld of GPU %u exceeded user-specified maximum " \ - "allowed temperature %lld" -// gpu id, seconds into test, details about throttling -#define DCGM_FR_THROTTLING_VIOLATION_MSG \ - "Clocks are being throttled for GPU %u because of clock " \ - "throttling starting %.1f seconds into the test. %s" -// details about error -#define DCGM_FR_INTERNAL_MSG "There was an internal error during the test: '%s'" -// gpu id, PCIe generation, minimum allowed, parameter to control -#define DCGM_FR_PCIE_GENERATION_MSG \ - "GPU %u is running at PCI link generation %d, which is below " \ - "the minimum allowed link generation of %d (parameter '%s')" -// gpu id, PCIe width, minimum allowed, parameter to control -#define DCGM_FR_PCIE_WIDTH_MSG \ - "GPU %u is running at PCI link width %dX, which is below the " \ - "minimum allowed link generation of %d (parameter '%s')" -#define DCGM_FR_ABORTED_MSG "Test was aborted early due to user signal" -// Test name -#define DCGM_FR_TEST_DISABLED_MSG "The %s test is skipped for this GPU." 
-// stat name, gpu id -#define DCGM_FR_CANNOT_GET_STAT_MSG "Unable to generate / collect stat %s for GPU %u" -// observed value, minimum allowed, gpu id -#define DCGM_FR_STRESS_LEVEL_MSG \ - "Max stress level of %.1f did not reach desired stress level of " \ - "%.1f for GPU %u" -// CUDA API name -#define DCGM_FR_CUDA_API_MSG "Error using CUDA API %s" -// count, gpu id -#define DCGM_FR_FAULTY_MEMORY_MSG "Found %d faulty memory elements on GPU %u" -// error detail -#define DCGM_FR_CANNOT_SET_WATCHES_MSG "Unable to add field watches to DCGM: %s" -// gpu id -#define DCGM_FR_CUDA_UNBOUND_MSG "Cuda GPU %d is no longer bound to a CUDA context...Aborting" -// Test name, gpu id -#define DCGM_FR_ECC_DISABLED_MSG "Skipping test %s because ECC is not enabled on GPU %u" -// percentage of memory we tried to allocate, gpu id -#define DCGM_FR_MEMORY_ALLOC_MSG "Couldn't allocate at least %.1f%% of GPU memory on GPU %u" -// gpu id -#define DCGM_FR_CUDA_DBE_MSG \ - "CUDA APIs have indicated that a double-bit ECC error has " \ - "occured on GPU %u." -// gpu id -#define DCGM_FR_MEMORY_MISMATCH_MSG \ - "A memory mismatch was detected on GPU %u, but no error was " \ - "reported by CUDA or NVML." -// gpu id, error detail -#define DCGM_FR_CUDA_DEVICE_MSG "Unable to find a corresponding CUDA device for GPU %u: '%s'" -#define DCGM_FR_ECC_UNSUPPORTED_MSG "ECC Memory is not turned on or is unsupported. Skipping test." -// gpu id -#define DCGM_FR_ECC_PENDING_MSG "ECC memory for GPU %u is in a pending state." 
-// gpu id, observed bandwidth, required, test name -#define DCGM_FR_MEMORY_BANDWIDTH_MSG \ - "GPU %u only achieved a memory bandwidth of %.2f GB/s, failing " \ - "to meet %.2f GB/s for test %d" -// power draw observed, field tag, minimum power draw required, gpu id -#define DCGM_FR_TARGET_POWER_MSG \ - "Max power of %.1f did not reach desired power minimum %s of " \ - "%.1f for GPU %u" -// API name, error detail -#define DCGM_FR_API_FAIL_MSG "API call %s failed: '%s'" -// API name, gpu id, error detail -#define DCGM_FR_API_FAIL_GPU_MSG "API call %s failed for GPU %u: '%s'" -// gpu id, error detail -#define DCGM_FR_CUDA_CONTEXT_MSG "GPU %u failed to create a CUDA context: %s" -// DCGM API name -#define DCGM_FR_DCGM_API_MSG "Error using DCGM API %s" -#define DCGM_FR_CONCURRENT_GPUS_MSG \ - "Unable to run concurrent pair bandwidth test without 2 or more " \ - "gpus. Skipping" -#define DCGM_FR_TOO_MANY_ERRORS_MSG \ - "This API can only return up to four errors per system. " \ - "Additional errors were found for this system that couldn't be " \ - "communicated." -#define DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_MSG \ - "%.1f %s NvLink errors found occuring per second on GPU %u, " \ - "exceeding the limit of 100 per second." -#define DCGM_FR_NVLINK_ERROR_CRITICAL_MSG "Detected %ld %s NvLink errors on GPU %u's NVLink (should be 0)" -#define DCGM_FR_ENFORCED_POWER_LIMIT_MSG \ - "Enforced power limit on GPU %u set to %.1f, which is too low to " \ - "attempt to achieve target power %.1f" -#define DCGM_FR_MEMORY_ALLOC_HOST_MSG "Cannot allocate %zu bytes on the host" -#define DCGM_FR_GPU_OP_MODE_MSG "Skipping plugin due to a GPU being in GPU Operating Mode: LOW_DP." -#define DCGM_FR_NO_MEMORY_CLOCKS_MSG "No memory clocks <= %u MHZ were found in %u supported memory clocks." -#define DCGM_FR_NO_GRAPHICS_CLOCKS_MSG \ - "No graphics clocks <= %u MHZ were found in %u supported graphics clocks for memory clock %u MHZ." 
-#define DCGM_FR_HAD_TO_RESTORE_STATE_MSG "Had to restore GPU state on NVML GPU(s): %s" -#define DCGM_FR_L1TAG_UNSUPPORTED_MSG "This card does not support the L1 cache test. Skipping test." -#define DCGM_FR_L1TAG_MISCOMPARE_MSG "Detected a miscompare failure in the L1 cache." -#define DCGM_FR_ROW_REMAP_FAILURE_MSG "Row remapping failed." -#define DCGM_FR_UNCONTAINED_ERROR_MSG "GPU had an uncontained error (XID 95)" -#define DCGM_FR_EMPTY_GPU_LIST_MSG "No valid GPUs passed to plugin" -#define DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_MSG "Pending page retirements together with a DBE were detected on GPU %u." - -/* - * Suggestions for next steps for the corresponding error message - */ -#define DCGM_FR_OK_NEXT "N/A" -#define DCGM_FR_UNKNOWN_NEXT "" -#define DCGM_FR_UNRECOGNIZED_NEXT "" -#define DCGM_FR_PCI_REPLAY_RATE_NEXT \ - "Reconnect PCIe card. Run system side PCIE diagnostic utilities " \ - "to verify hops off the GPU board. If issue is on the board, run " \ - "the field diagnostic." -#define DCGM_FR_VOLATILE_DBE_DETECTED_NEXT "Drain the GPU and reset it or reboot the node." -#define DCGM_FR_VOLATILE_SBE_DETECTED_NEXT "Monitor - this GPU can still perform workload." -#define DCGM_FR_PENDING_PAGE_RETIREMENTS_NEXT "Monitor - this GPU can still perform workload" -#define DCGM_FR_RETIRED_PAGES_LIMIT_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_RETIRED_PAGES_DBE_LIMIT_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_CORRUPT_INFOROM_NEXT "Flash the InfoROM to clear this corruption." -#define DCGM_FR_CLOCK_THROTTLE_THERMAL_NEXT DEBUG_COOLING_MSG -#define DCGM_FR_POWER_UNREADABLE_NEXT "" -#define DCGM_FR_CLOCK_THROTTLE_POWER_NEXT "Monitor the power conditions. This GPU can still perform workload." -#define DCGM_FR_NVLINK_ERROR_THRESHOLD_NEXT "Monitor the NVLink. It can still perform workload." 
-#define DCGM_FR_NVLINK_DOWN_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_NVSWITCH_FATAL_ERROR_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_NVSWITCH_NON_FATAL_ERROR_NEXT "Monitor the NVSwitch. It can still perform workload." -#define DCGM_FR_NVSWITCH_DOWN_NEXT "" -#define DCGM_FR_NO_ACCESS_TO_FILE_NEXT "Check relevant permissions, access, and existence of the file." -#define DCGM_FR_NVML_API_NEXT \ - "Check the error condition and ensure that appropriate libraries " \ - "are present and accessible." -#define DCGM_FR_DEVICE_COUNT_MISMATCH_NEXT \ - "Check for the presence of cgroups, operating system blocks, and " \ - "or unsupported / older cards" -#define DCGM_FR_BAD_PARAMETER_NEXT "" -#define DCGM_FR_CANNOT_OPEN_LIB_NEXT \ - "Check for the existence of the library and set LD_LIBRARY_PATH " \ - "if needed." -#define DCGM_FR_BLACKLISTED_DRIVER_NEXT "Please load the appropriate driver." -#define DCGM_FR_NVML_LIB_BAD_NEXT \ - "Make sure that the required version of libnvidia-ml.so " \ - "is present and accessible on the system." -#define DCGM_FR_GRAPHICS_PROCESSES_NEXT \ - "Stop the graphics processes or run this diagnostic on a server " \ - "that is not being used for display purposes." -#define DCGM_FR_HOSTENGINE_CONN_NEXT \ - "If hostengine is run separately, please ensure that it is up " \ - "and responsive." -#define DCGM_FR_FIELD_QUERY_NEXT "" -#define DCGM_FR_BAD_CUDA_ENV_NEXT "Please unset this environment variable to address test failures." -#define DCGM_FR_PERSISTENCE_MODE_NEXT \ - "Enable persistence mode by running \"nvidia-smi -i -pm " \ - "1 \" as root." -#define DCGM_FR_LOW_BANDWIDTH_NEXT \ - "Verify that your minimum bandwidth setting is appropriate for " \ - "the topology of each GPU. If so, and errors are consistent, " \ - "please run a field diagnostic." -#define DCGM_FR_HIGH_LATENCY_NEXT \ - "Verify that your maximum latency setting is appropriate for " \ - "the topology of each GPU. 
If so, and errors are consistent, " \ - "please run a field diagnostic." -#define DCGM_FR_CANNOT_GET_FIELD_TAG_NEXT "" -#define DCGM_FR_FIELD_VIOLATION_NEXT "" -#define DCGM_FR_FIELD_THRESHOLD_NEXT "" -#define DCGM_FR_FIELD_VIOLATION_DBL_NEXT "" -#define DCGM_FR_FIELD_THRESHOLD_DBL_NEXT "" -#define DCGM_FR_UNSUPPORTED_FIELD_TYPE_NEXT "" -#define DCGM_FR_FIELD_THRESHOLD_TS_NEXT "" -#define DCGM_FR_FIELD_THRESHOLD_TS_DBL_NEXT "" -#define DCGM_FR_THERMAL_VIOLATIONS_NEXT DEBUG_COOLING_MSG -#define DCGM_FR_THERMAL_VIOLATIONS_TS_NEXT DEBUG_COOLING_MSG -#define DCGM_FR_TEMP_VIOLATION_NEXT \ - "Verify that the user-specified temperature maximum is set " \ - "correctly. If it is, check the cooling for this GPU and node: " DEBUG_COOLING_MSG -#define DCGM_FR_THROTTLING_VIOLATION_NEXT "" -#define DCGM_FR_INTERNAL_NEXT "" -#define DCGM_FR_PCIE_GENERATION_NEXT "" -#define DCGM_FR_PCIE_WIDTH_NEXT "" -#define DCGM_FR_ABORTED_NEXT "" -#define DCGM_FR_TEST_DISABLED_NEXT "" -#define DCGM_FR_CANNOT_GET_STAT_NEXT \ - "If running a standalone nv-hostengine, verify that it is up " \ - "and responsive." -#define DCGM_FR_STRESS_LEVEL_NEXT "" -#define DCGM_FR_CUDA_API_NEXT "" -#define DCGM_FR_FAULTY_MEMORY_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_CANNOT_SET_WATCHES_NEXT "" -#define DCGM_FR_CUDA_UNBOUND_NEXT "" -#define DCGM_FR_ECC_DISABLED_NEXT \ - "Enable ECC memory by running \"nvidia-smi -i -e 1\" " \ - "to enable. This may require a GPU reset or reboot to take effect." -#define DCGM_FR_MEMORY_ALLOC_NEXT "" -#define DCGM_FR_CUDA_DBE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_MEMORY_MISMATCH_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_CUDA_DEVICE_NEXT \ - "Make sure CUDA_VISIBLE_DEVICES is not preventing visibility of " \ - "this GPU. Also check if CUDA libraries are compatible and " \ - "correctly installed." -#define DCGM_FR_ECC_UNSUPPORTED_NEXT "" -#define DCGM_FR_ECC_PENDING_NEXT "Reboot to complete activation of the ECC memory." 
-#define DCGM_FR_MEMORY_BANDWIDTH_NEXT "" -#define DCGM_FR_TARGET_POWER_NEXT "Verify that the clock speeds and GPU utilization are high." -#define DCGM_FR_API_FAIL_NEXT "" -#define DCGM_FR_API_FAIL_GPU_NEXT "" -#define DCGM_FR_CUDA_CONTEXT_NEXT \ - "Please make sure the correct driver version is installed and " \ - "verify that no conflicting libraries are present." -#define DCGM_FR_DCGM_API_NEXT "" -#define DCGM_FR_CONCURRENT_GPUS_NEXT "" -#define DCGM_FR_TOO_MANY_ERRORS_NEXT "" -#define DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_NVLINK_ERROR_CRITICAL_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_ENFORCED_POWER_LIMIT_NEXT \ - "If this enforced power limit is necessary, then this test " \ - "cannot be run. If it is unnecessary, then raise the enforced " \ - "power limit setting to be able to run this test." -#define DCGM_FR_MEMORY_ALLOC_HOST_NEXT "Manually kill processes or restart your machine." -#define DCGM_FR_GPU_OP_MODE_NEXT \ - "Fix by running nvidia-smi as root with: nvidia-smi --gom=0 -i " \ - "" -#define DCGM_FR_NO_MEMORY_CLOCKS_NEXT "" -#define DCGM_FR_NO_GRAPHICS_CLOCKS_NEXT "" -#define DCGM_FR_HAD_TO_RESTORE_STATE_NEXT "" -#define DCGM_FR_L1TAG_UNSUPPORTED_NEXT "" -#define DCGM_FR_L1TAG_MISCOMPARE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_ROW_REMAP_FAILURE_NEXT DCGM_FR_VOLATILE_DBE_DETECTED_NEXT -#define DCGM_FR_UNCONTAINED_ERROR_NEXT DCGM_FR_VOLATILE_DBE_DETECTED_NEXT -#define DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_NEXT "Drain the GPU and reset it or reboot the node to resolve this issue." 
-#define DCGM_FR_EMPTY_GPU_LIST_NEXT "" - -#ifdef __cplusplus -extern "C" { -#endif -dcgmErrorSeverity_t dcgmErrorGetPriorityByCode(unsigned int code); -const char *dcgmErrorGetFormatMsgByCode(unsigned int code); - -#ifdef __cplusplus -} -#endif - -#endif // DCGM_ERRORS_H diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/dcgm_fields.h b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/dcgm_fields.h deleted file mode 100644 index ff156898..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/dcgm_fields.h +++ /dev/null @@ -1,2249 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef DCGMFIELDS_H -#define DCGMFIELDS_H - -#ifdef __cplusplus -extern "C" { -#endif - -/***************************************************************************************************/ -/** @defgroup dcgmFieldTypes Field Types - * Field Types are a single byte. 
- * @{ - */ -/***************************************************************************************************/ - -/** - * Blob of binary data representing a structure - */ -#define DCGM_FT_BINARY 'b' - -/** - * 8-byte double precision - */ -#define DCGM_FT_DOUBLE 'd' - -/** - * 8-byte signed integer - */ -#define DCGM_FT_INT64 'i' - -/** - * Null-terminated ASCII Character string - */ -#define DCGM_FT_STRING 's' - -/** - * 8-byte signed integer usec since 1970 - */ -#define DCGM_FT_TIMESTAMP 't' - -/** @} */ - - -/***************************************************************************************************/ -/** @defgroup dcgmFieldScope Field Scope - * Represents field association with entity scope or global scope. - * @{ - */ -/***************************************************************************************************/ - -/** - * Field is global (ex: driver version) - */ -#define DCGM_FS_GLOBAL 0 - -/** - * Field is associated with an entity (GPU, VGPU...etc) - */ -#define DCGM_FS_ENTITY 1 - -/** - * Field is associated with a device. Deprecated. Use DCGM_FS_ENTITY - */ -#define DCGM_FS_DEVICE DCGM_FS_ENTITY - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup dcgmFieldConstants Field Constants - * Constants that represent contents of individual field values. - * @{ - */ -/***************************************************************************************************/ - -/** - * DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY is 16 bits of major version followed by - * 16 bits of the minor version. These macros separate the two. - */ -#define DCGM_CUDA_COMPUTE_CAPABILITY_MAJOR(x) ((uint64_t)(x)&0xFFFF0000) -#define DCGM_CUDA_COMPUTE_CAPABILITY_MINOR(x) ((uint64_t)(x)&0x0000FFFF) - -/** - * DCGM_FI_DEV_CLOCK_THROTTLE_REASONS is a bitmap of why the clock is throttled. 
- * These macros are masks for relevant throttling, and are a 1:1 map to the NVML - * reasons documented in nvml.h. The notes for the header are copied blow: - */ -/** Nothing is running on the GPU and the clocks are dropping to Idle state - * \note This limiter may be removed in a later release - */ -#define DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE 0x0000000000000001LL -/** GPU clocks are limited by current setting of applications clocks - */ -#define DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING 0x0000000000000002LL -/** SW Power Scaling algorithm is reducing the clocks below requested clocks - */ -#define DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP 0x0000000000000004LL -/** HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged - * - * This is an indicator of: - * - temperature being too high - * - External Power Brake Assertion is triggered (e.g. by the system power supply) - * - Power draw is too high and Fast Trigger protection is reducing the clocks - * - May be also reported during PState or clock change - * - This behavior may be removed in a later release. - */ -#define DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN 0x0000000000000008LL -/** Sync Boost - * - * This GPU has been added to a Sync boost group with nvidia-smi or DCGM in - * order to maximize performance per watt. All GPUs in the sync boost group - * will boost to the minimum possible clocks across the entire group. Look at - * the throttle reasons for other GPUs in the system to see why those GPUs are - * holding this one at lower clocks. 
- */ -#define DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST 0x0000000000000010LL -/** SW Thermal Slowdown - * - * This is an indicator of one or more of the following: - * - Current GPU temperature above the GPU Max Operating Temperature - * - Current memory temperature above the Memory Max Operating Temperature - */ -#define DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL 0x0000000000000020LL -/** HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged - * - * This is an indicator of: - * - temperature being too high - */ -#define DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL 0x0000000000000040LL -/** HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged - * - * This is an indicator of: - * - External Power Brake Assertion being triggered (e.g. by the system power supply) - */ -#define DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE 0x0000000000000080LL -/** GPU clocks are limited by current setting of Display clocks - */ -#define DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS 0x0000000000000100LL - -/** - * GPU virtualization mode types for DCGM_FI_DEV_VIRTUAL_MODE - */ -typedef enum -{ - DCGM_GPU_VIRTUALIZATION_MODE_NONE = 0, //!< Represents Bare Metal GPU - DCGM_GPU_VIRTUALIZATION_MODE_PASSTHROUGH = 1, //!< Device is associated with GPU-Passthrough - DCGM_GPU_VIRTUALIZATION_MODE_VGPU = 2, //!< Device is associated with vGPU inside virtual machine. 
- DCGM_GPU_VIRTUALIZATION_MODE_HOST_VGPU = 3, //!< Device is associated with VGX hypervisor in vGPU mode - DCGM_GPU_VIRTUALIZATION_MODE_HOST_VSGA = 4, //!< Device is associated with VGX hypervisor in vSGA mode -} dcgmGpuVirtualizationMode_t; - - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup dcgmFieldEntity Field Entity - * Represents field association with a particular entity - * @{ - */ -/***************************************************************************************************/ - -/** - * Enum of possible field entity groups - */ -typedef enum dcgm_field_entity_group_t -{ - DCGM_FE_NONE = 0, /*!< Field is not associated with an entity. Field scope should be DCGM_FS_GLOBAL */ - DCGM_FE_GPU, /*!< Field is associated with a GPU entity */ - DCGM_FE_VGPU, /*!< Field is associated with a VGPU entity */ - DCGM_FE_SWITCH, /*!< Field is associated with a Switch entity */ - DCGM_FE_GPU_I, /*!< Field is associated with a GPU Instance entity */ - DCGM_FE_GPU_CI, /*!< Field is associated with a GPU Compute Instance entity */ - - DCGM_FE_COUNT /*!< Number of elements in this enumeration. Keep this entry last */ -} dcgm_field_entity_group_t; - -/** - * Represents an identifier for an entity within a field entity. For instance, this is the gpuId for DCGM_FE_GPU. 
- */ -typedef unsigned int dcgm_field_eid_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup dcgmFieldIdentifiers Field Identifiers - * Field Identifiers - * @{ - */ -/***************************************************************************************************/ - -/** - * NULL field - */ -#define DCGM_FI_UNKNOWN 0 - -/** - * Driver Version - */ -#define DCGM_FI_DRIVER_VERSION 1 - -/* Underlying NVML version */ -#define DCGM_FI_NVML_VERSION 2 - -/* - * Process Name - */ -#define DCGM_FI_PROCESS_NAME 3 - -/** - * Number of Devices on the node - */ -#define DCGM_FI_DEV_COUNT 4 - -/** - * Cuda Driver Version - * Retrieves a number with the major value in the thousands place and the minor value in the hundreds place. - * CUDA 11.1 = 11100 - */ -#define DCGM_FI_CUDA_DRIVER_VERSION 5 - - -/** - * Name of the GPU device - */ -#define DCGM_FI_DEV_NAME 50 - -/** - * Device Brand - */ -#define DCGM_FI_DEV_BRAND 51 - -/** - * NVML index of this GPU - */ -#define DCGM_FI_DEV_NVML_INDEX 52 - -/** - * Device Serial Number - */ -#define DCGM_FI_DEV_SERIAL 53 - -/** - * UUID corresponding to the device - */ -#define DCGM_FI_DEV_UUID 54 - -/** - * Device node minor number /dev/nvidia# - */ -#define DCGM_FI_DEV_MINOR_NUMBER 55 - -/** - * OEM inforom version - */ -#define DCGM_FI_DEV_OEM_INFOROM_VER 56 - -/** - * PCI attributes for the device - */ -#define DCGM_FI_DEV_PCI_BUSID 57 - -/** - * The combined 16-bit device id and 16-bit vendor id - */ -#define DCGM_FI_DEV_PCI_COMBINED_ID 58 - -/** - * The 32-bit Sub System Device ID - */ -#define DCGM_FI_DEV_PCI_SUBSYS_ID 59 - -/** - * Topology of all GPUs on the system via PCI (static) - */ -#define DCGM_FI_GPU_TOPOLOGY_PCI 60 - -/** - * Topology of all GPUs on the system via NVLINK (static) - */ -#define DCGM_FI_GPU_TOPOLOGY_NVLINK 61 - -/** - * Affinity of all GPUs on the system (static) - */ -#define DCGM_FI_GPU_TOPOLOGY_AFFINITY 62 - -/** - * 
Cuda compute capability for the device. - * The major version is the upper 32 bits and - * the minor version is the lower 32 bits. - */ -#define DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY 63 - -/** - * Compute mode for the device - */ -#define DCGM_FI_DEV_COMPUTE_MODE 65 - -/** - * Persistence mode for the device - * Boolean: 0 is disabled, 1 is enabled - */ -#define DCGM_FI_DEV_PERSISTENCE_MODE 66 - -/** - * MIG mode for the device - * Boolean: 0 is disabled, 1 is enabled - */ -#define DCGM_FI_DEV_MIG_MODE 67 - -/** - * The string that CUDA_VISIBLE_DEVICES should - * be set to for this entity (including MIG) - */ -#define DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR 68 - -/** - * The maximum number of MIG slices supported by this GPU - */ -#define DCGM_FI_DEV_MIG_MAX_SLICES 69 - -/** - * Device CPU affinity. part 1/8 = cpus 0 - 63 - */ -#define DCGM_FI_DEV_CPU_AFFINITY_0 70 - -/** - * Device CPU affinity. part 1/8 = cpus 64 - 127 - */ -#define DCGM_FI_DEV_CPU_AFFINITY_1 71 - -/** - * Device CPU affinity. part 2/8 = cpus 128 - 191 - */ -#define DCGM_FI_DEV_CPU_AFFINITY_2 72 - -/** - * Device CPU affinity. 
part 3/8 = cpus 192 - 255 - */ -#define DCGM_FI_DEV_CPU_AFFINITY_3 73 - -/** - * ECC inforom version - */ -#define DCGM_FI_DEV_ECC_INFOROM_VER 80 - -/** - * Power management object inforom version - */ -#define DCGM_FI_DEV_POWER_INFOROM_VER 81 - -/** - * Inforom image version - */ -#define DCGM_FI_DEV_INFOROM_IMAGE_VER 82 - -/** - * Inforom configuration checksum - */ -#define DCGM_FI_DEV_INFOROM_CONFIG_CHECK 83 - -/** - * Reads the infoROM from the flash and verifies the checksums - */ -#define DCGM_FI_DEV_INFOROM_CONFIG_VALID 84 - -/** - * VBIOS version of the device - */ -#define DCGM_FI_DEV_VBIOS_VERSION 85 - -/** - * Total BAR1 of the GPU in MB - */ -#define DCGM_FI_DEV_BAR1_TOTAL 90 - -/** - * Deprecated - Sync boost settings on the node - */ -#define DCGM_FI_SYNC_BOOST 91 - -/** - * Used BAR1 of the GPU in MB - */ -#define DCGM_FI_DEV_BAR1_USED 92 - -/** - * Free BAR1 of the GPU in MB - */ -#define DCGM_FI_DEV_BAR1_FREE 93 - -/** - * SM clock for the device - */ -#define DCGM_FI_DEV_SM_CLOCK 100 - -/** - * Memory clock for the device - */ -#define DCGM_FI_DEV_MEM_CLOCK 101 - -/** - * Video encoder/decoder clock for the device - */ -#define DCGM_FI_DEV_VIDEO_CLOCK 102 - -/** - * SM Application clocks - */ -#define DCGM_FI_DEV_APP_SM_CLOCK 110 - -/** - * Memory Application clocks - */ -#define DCGM_FI_DEV_APP_MEM_CLOCK 111 - -/** - * Current clock throttle reasons (bitmask of DCGM_CLOCKS_THROTTLE_REASON_*) - */ -#define DCGM_FI_DEV_CLOCK_THROTTLE_REASONS 112 - -/** - * Maximum supported SM clock for the device - */ -#define DCGM_FI_DEV_MAX_SM_CLOCK 113 - -/** - * Maximum supported Memory clock for the device - */ -#define DCGM_FI_DEV_MAX_MEM_CLOCK 114 - -/** - * Maximum supported Video encoder/decoder clock for the device - */ -#define DCGM_FI_DEV_MAX_VIDEO_CLOCK 115 - -/** - * Auto-boost for the device (1 = enabled. 
0 = disabled) - */ -#define DCGM_FI_DEV_AUTOBOOST 120 - -/** - * Supported clocks for the device - */ -#define DCGM_FI_DEV_SUPPORTED_CLOCKS 130 - -/** - * Memory temperature for the device - */ -#define DCGM_FI_DEV_MEMORY_TEMP 140 - -/** - * Current temperature readings for the device, in degrees C - */ -#define DCGM_FI_DEV_GPU_TEMP 150 - -/** - * Maximum operating temperature for the memory of this GPU - */ -#define DCGM_FI_DEV_MEM_MAX_OP_TEMP 151 - -/** - * Maximum operating temperature for this GPU - */ -#define DCGM_FI_DEV_GPU_MAX_OP_TEMP 152 - - -/** - * Power usage for the device in Watts - */ -#define DCGM_FI_DEV_POWER_USAGE 155 - -/** - * Total energy consumption for the GPU in mJ since the driver was last reloaded - */ -#define DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION 156 - -/** - * Slowdown temperature for the device - */ -#define DCGM_FI_DEV_SLOWDOWN_TEMP 158 - -/** - * Shutdown temperature for the device - */ -#define DCGM_FI_DEV_SHUTDOWN_TEMP 159 - -/** - * Current Power limit for the device - */ -#define DCGM_FI_DEV_POWER_MGMT_LIMIT 160 - -/** - * Minimum power management limit for the device - */ -#define DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN 161 - -/** - * Maximum power management limit for the device - */ -#define DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX 162 - -/** - * Default power management limit for the device - */ -#define DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF 163 - -/** - * Effective power limit that the driver enforces after taking into account all limiters - */ -#define DCGM_FI_DEV_ENFORCED_POWER_LIMIT 164 - -/** - * Performance state (P-State) 0-15. 0=highest - */ -#define DCGM_FI_DEV_PSTATE 190 - -/** - * Fan speed for the device in percent 0-100 - */ -#define DCGM_FI_DEV_FAN_SPEED 191 - -/** - * PCIe Tx utilization information - * - * Deprecated: Use DCGM_FI_PROF_PCIE_TX_BYTES instead. - */ -#define DCGM_FI_DEV_PCIE_TX_THROUGHPUT 200 - -/** - * PCIe Rx utilization information - * - * Deprecated: Use DCGM_FI_PROF_PCIE_RX_BYTES instead. 
- */ -#define DCGM_FI_DEV_PCIE_RX_THROUGHPUT 201 - -/** - * PCIe replay counter - */ -#define DCGM_FI_DEV_PCIE_REPLAY_COUNTER 202 - -/** - * GPU Utilization - */ -#define DCGM_FI_DEV_GPU_UTIL 203 - -/** - * Memory Utilization - */ -#define DCGM_FI_DEV_MEM_COPY_UTIL 204 - -/** - * Process accounting stats. - * - * This field is only supported when the host engine is running as root unless you - * enable accounting ahead of time. Accounting mode can be enabled by - * running "nvidia-smi -am 1" as root on the same node the host engine is running on. - */ -#define DCGM_FI_DEV_ACCOUNTING_DATA 205 - -/** - * Encoder Utilization - */ -#define DCGM_FI_DEV_ENC_UTIL 206 - -/** - * Decoder Utilization - */ -#define DCGM_FI_DEV_DEC_UTIL 207 - -/** - * Memory utilization samples - */ -#define DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES 210 - -/* - * SM utilization samples - */ -#define DCGM_FI_DEV_GPU_UTIL_SAMPLES 211 - -/** - * Graphics processes running on the GPU. - */ -#define DCGM_FI_DEV_GRAPHICS_PIDS 220 - -/** - * Compute processes running on the GPU. - */ -#define DCGM_FI_DEV_COMPUTE_PIDS 221 - -/** - * XID errors. The value is the specific XID error - */ -#define DCGM_FI_DEV_XID_ERRORS 230 - -/** - * PCIe Max Link Generation - */ -#define DCGM_FI_DEV_PCIE_MAX_LINK_GEN 235 - -/** - * PCIe Max Link Width - */ -#define DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH 236 - -/** - * PCIe Current Link Generation - */ -#define DCGM_FI_DEV_PCIE_LINK_GEN 237 - -/** - * PCIe Current Link Width - */ -#define DCGM_FI_DEV_PCIE_LINK_WIDTH 238 - -/** - * Power Violation time in usec - */ -#define DCGM_FI_DEV_POWER_VIOLATION 240 - -/** - * Thermal Violation time in usec - */ -#define DCGM_FI_DEV_THERMAL_VIOLATION 241 - -/** - * Sync Boost Violation time in usec - */ -#define DCGM_FI_DEV_SYNC_BOOST_VIOLATION 242 - -/** - * Board violation limit. - */ -#define DCGM_FI_DEV_BOARD_LIMIT_VIOLATION 243 - -/** - *Low utilisation violation limit. 
- */ -#define DCGM_FI_DEV_LOW_UTIL_VIOLATION 244 - -/** - *Reliability violation limit. - */ -#define DCGM_FI_DEV_RELIABILITY_VIOLATION 245 - -/** - * App clock violation limit. - */ -#define DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION 246 - -/** - * Base clock violation limit. - */ -#define DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION 247 - -/** - * Total Frame Buffer of the GPU in MB - */ -#define DCGM_FI_DEV_FB_TOTAL 250 - -/** - * Free Frame Buffer in MB - */ -#define DCGM_FI_DEV_FB_FREE 251 - -/** - * Used Frame Buffer in MB - */ -#define DCGM_FI_DEV_FB_USED 252 - -/** - * Current ECC mode for the device - */ -#define DCGM_FI_DEV_ECC_CURRENT 300 - -/** - * Pending ECC mode for the device - */ -#define DCGM_FI_DEV_ECC_PENDING 301 - -/** - * Total single bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_SBE_VOL_TOTAL 310 - -/** - * Total double bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_DBE_VOL_TOTAL 311 - -/** - * Total single bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_SBE_AGG_TOTAL 312 - -/** - * Total double bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_DBE_AGG_TOTAL 313 - -/** - * L1 cache single bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_SBE_VOL_L1 314 - -/** - * L1 cache double bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_DBE_VOL_L1 315 - -/** - * L2 cache single bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_SBE_VOL_L2 316 - -/** - * L2 cache double bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_DBE_VOL_L2 317 - -/** - * Device memory single bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_SBE_VOL_DEV 318 - -/** - * Device memory double bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_DBE_VOL_DEV 319 - -/** - * Register file single bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_SBE_VOL_REG 320 - -/** - * Register file double bit volatile ECC errors - */ -#define 
DCGM_FI_DEV_ECC_DBE_VOL_REG 321 - -/** - * Texture memory single bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_SBE_VOL_TEX 322 - -/** - * Texture memory double bit volatile ECC errors - */ -#define DCGM_FI_DEV_ECC_DBE_VOL_TEX 323 - -/** - * L1 cache single bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_SBE_AGG_L1 324 - -/** - * L1 cache double bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_DBE_AGG_L1 325 - -/** - * L2 cache single bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_SBE_AGG_L2 326 - -/** - * L2 cache double bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_DBE_AGG_L2 327 - -/** - * Device memory single bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_SBE_AGG_DEV 328 - -/** - * Device memory double bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_DBE_AGG_DEV 329 - -/** - * Register File single bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_SBE_AGG_REG 330 - -/** - * Register File double bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_DBE_AGG_REG 331 - -/** - * Texture memory single bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_SBE_AGG_TEX 332 - -/** - * Texture memory double bit aggregate (persistent) ECC errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_ECC_DBE_AGG_TEX 333 - -/** - * Number of retired pages because of single bit errors - * Note: monotonically increasing - */ -#define DCGM_FI_DEV_RETIRED_SBE 390 - -/** - * Number of retired pages because of double bit errors - * Note: monotonically increasing - */ -#define 
DCGM_FI_DEV_RETIRED_DBE 391 - -/** - * Number of pages pending retirement - */ -#define DCGM_FI_DEV_RETIRED_PENDING 392 - -/** - * Number of remapped rows for uncorrectable errors - */ -#define DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS 393 - -/** - * Number of remapped rows for correctable errors - */ -#define DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS 394 - -/** - * Whether remapping of rows has failed - */ -#define DCGM_FI_DEV_ROW_REMAP_FAILURE 395 - -/* - * NV Link flow control CRC Error Counter for Lane 0 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 400 - -/* - * NV Link flow control CRC Error Counter for Lane 1 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 401 - -/* - * NV Link flow control CRC Error Counter for Lane 2 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 402 - -/* - * NV Link flow control CRC Error Counter for Lane 3 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 403 - -/* - * NV Link flow control CRC Error Counter for Lane 4 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 404 - -/* - * NV Link flow control CRC Error Counter for Lane 5 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 405 - -/* - * NV Link flow control CRC Error Counter total for all Lanes - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL 409 - -/* - * NV Link data CRC Error Counter for Lane 0 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 410 - -/* - * NV Link data CRC Error Counter for Lane 1 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 411 - -/* - * NV Link data CRC Error Counter for Lane 2 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 412 - -/* - * NV Link data CRC Error Counter for Lane 3 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 413 - -/* - * NV Link data CRC Error Counter for Lane 4 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 414 - -/* - * NV Link data CRC Error Counter for Lane 5 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 415 - -/* - * NV 
Link data CRC Error Counter total for all Lanes - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL 419 - -/* - * NV Link Replay Error Counter for Lane 0 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 420 - -/* - * NV Link Replay Error Counter for Lane 1 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 421 - -/* - * NV Link Replay Error Counter for Lane 2 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 422 - -/* - * NV Link Replay Error Counter for Lane 3 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 423 - -/* - * NV Link Replay Error Counter for Lane 4 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 424 - -/* - * NV Link Replay Error Counter for Lane 5 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 425 - -/* - * NV Link Replay Error Counter total for all Lanes - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL 429 - -/* - * NV Link Recovery Error Counter for Lane 0 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 430 - -/* - * NV Link Recovery Error Counter for Lane 1 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 431 - -/* - * NV Link Recovery Error Counter for Lane 2 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 432 - -/* - * NV Link Recovery Error Counter for Lane 3 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 433 - -/* - * NV Link Recovery Error Counter for Lane 4 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 434 - -/* - * NV Link Recovery Error Counter for Lane 5 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 435 - -/* - * NV Link Recovery Error Counter total for all Lanes - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL 439 - -/* - * NV Link Bandwidth Counter for Lane 0 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 440 - -/* - * NV Link Bandwidth Counter for Lane 1 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 441 - -/* - * NV Link Bandwidth Counter for Lane 2 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 442 - -/* - * NV Link 
Bandwidth Counter for Lane 3 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 443 - -/* - * NV Link Bandwidth Counter for Lane 4 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 444 - -/* - * NV Link Bandwidth Counter for Lane 5 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 445 - -/* - * NV Link Bandwidth Counter total for all Lanes - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL 449 - -/* - * GPU NVLink error information - */ -#define DCGM_FI_DEV_GPU_NVLINK_ERRORS 450 - -/* - * NV Link flow control CRC Error Counter for Lane 6 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 451 - -/* - * NV Link flow control CRC Error Counter for Lane 7 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 452 - -/* - * NV Link flow control CRC Error Counter for Lane 8 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 453 - -/* - * NV Link flow control CRC Error Counter for Lane 9 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 454 - -/* - * NV Link flow control CRC Error Counter for Lane 10 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 455 - -/* - * NV Link flow control CRC Error Counter for Lane 11 - */ -#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 456 - -/* - * NV Link data CRC Error Counter for Lane 6 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 457 - -/* - * NV Link data CRC Error Counter for Lane 7 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 458 - -/* - * NV Link data CRC Error Counter for Lane 8 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 459 - -/* - * NV Link data CRC Error Counter for Lane 9 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 460 - -/* - * NV Link data CRC Error Counter for Lane 10 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 461 - -/* - * NV Link data CRC Error Counter for Lane 11 - */ -#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 462 - -/* - * NV Link Replay Error Counter for Lane 6 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 463 - -/* - * 
NV Link Replay Error Counter for Lane 7 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 464 - -/* - * NV Link Replay Error Counter for Lane 8 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 465 - -/* - * NV Link Replay Error Counter for Lane 9 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 466 - -/* - * NV Link Replay Error Counter for Lane 10 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 467 - -/* - * NV Link Replay Error Counter for Lane 11 - */ -#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 468 - -/* - * NV Link Recovery Error Counter for Lane 6 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 469 - -/* - * NV Link Recovery Error Counter for Lane 7 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 470 - -/* - * NV Link Recovery Error Counter for Lane 8 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 471 - -/* - * NV Link Recovery Error Counter for Lane 9 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 472 - -/* - * NV Link Recovery Error Counter for Lane 10 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 473 - -/* - * NV Link Recovery Error Counter for Lane 11 - */ -#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 474 - -/* - * NV Link Bandwidth Counter for Lane 6 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 475 - -/* - * NV Link Bandwidth Counter for Lane 7 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 476 - -/* - * NV Link Bandwidth Counter for Lane 8 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 477 - -/* - * NV Link Bandwidth Counter for Lane 9 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 478 - -/* - * NV Link Bandwidth Counter for Lane 10 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 479 - -/* - * NV Link Bandwidth Counter for Lane 11 - */ -#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 480 - - -/** - * Virtualization Mode corresponding to the GPU. - * - * One of DCGM_GPU_VIRTUALIZATION_MODE_* constants. 
- */ -#define DCGM_FI_DEV_VIRTUAL_MODE 500 - -/** - * Includes Count and Static info of vGPU types supported on a device - */ -#define DCGM_FI_DEV_SUPPORTED_TYPE_INFO 501 - -/** - * Includes Count and currently Creatable vGPU types on a device - */ -#define DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS 502 - -/** - * Includes Count and currently Active vGPU Instances on a device - */ -#define DCGM_FI_DEV_VGPU_INSTANCE_IDS 503 - -/** - * Utilization values for vGPUs running on the device - */ -#define DCGM_FI_DEV_VGPU_UTILIZATIONS 504 - -/** - * Utilization values for processes running within vGPU VMs using the device - */ -#define DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION 505 - -/** - * Current encoder statistics for a given device - */ -#define DCGM_FI_DEV_ENC_STATS 506 - -/** - * Statistics of current active frame buffer capture sessions on a given device - */ -#define DCGM_FI_DEV_FBC_STATS 507 - -/** - * Information about active frame buffer capture sessions on a target device - */ -#define DCGM_FI_DEV_FBC_SESSIONS_INFO 508 -/** - * VM ID of the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_VM_ID 520 - -/** - * VM name of the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_VM_NAME 521 - -/** - * vGPU type of the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_TYPE 522 - -/** - * UUID of the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_UUID 523 - -/** - * Driver version of the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_DRIVER_VERSION 524 - -/** - * Memory usage of the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_MEMORY_USAGE 525 - -/** - * License status of the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_LICENSE_STATUS 526 - -/** - * Frame rate limit of the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT 527 - -/** - * Current encoder statistics of the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_ENC_STATS 528 - -/** - * Information about all active encoder sessions on the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO 529 - -/** - * Statistics of 
current active frame buffer capture sessions on the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_FBC_STATS 530 - -/** - * Information about active frame buffer capture sessions on the vGPU instance - */ -#define DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO 531 - -/** - * License status of the vGPU host - */ -#define DCGM_FI_DEV_VGPU_LICENSE_INSTANCE_STATUS 532 - -/** - * Starting field ID of the vGPU instance - */ -#define DCGM_FI_FIRST_VGPU_FIELD_ID 520 - -/** - * Last field ID of the vGPU instance - */ -#define DCGM_FI_LAST_VGPU_FIELD_ID 570 - -/** - * For now max vGPU field Ids taken as difference of DCGM_FI_LAST_VGPU_FIELD_ID and DCGM_FI_LAST_VGPU_FIELD_ID i.e. 50 - */ -#define DCGM_FI_MAX_VGPU_FIELDS DCGM_FI_LAST_VGPU_FIELD_ID - DCGM_FI_FIRST_VGPU_FIELD_ID - -/** - * Starting ID for all the internal fields - */ -#define DCGM_FI_INTERNAL_FIELDS_0_START 600 - -/** - * Last ID for all the internal fields - */ - -/** - *

 

- *

 

- *

 

- *

NVSwitch entity field IDs start here.

- *

 

- *

 

- *

NVSwitch latency bins for port 0

- */ - -#define DCGM_FI_INTERNAL_FIELDS_0_END 699 - - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P00 700 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P00 701 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P00 702 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 1

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P00 703 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P01 704 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P01 705 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P01 706 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 2

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P01 707 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P02 708 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P02 709 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P02 710 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 3

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P02 711 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P03 712 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P03 713 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P03 714 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 4

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P03 715 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P04 716 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P04 717 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P04 718 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 5

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P04 719 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P05 720 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P05 721 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P05 722 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 6

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P05 723 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P06 724 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P06 725 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P06 726 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 7

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P06 727 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P07 728 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P07 729 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P07 730 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 8

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P07 731 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P08 732 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P08 733 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P08 734 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 9

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P08 735 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P09 736 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P09 737 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P09 738 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 10

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P09 739 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P10 740 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P10 741 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P10 742 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 11

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P10 743 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P11 744 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P11 745 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P11 746 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 12

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P11 747 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P12 748 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P12 749 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P12 750 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 13

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P12 751 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P13 752 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P13 753 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P13 754 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 14

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P13 755 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P14 756 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P14 757 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P14 758 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 15

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P14 759 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P15 760 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P15 761 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P15 762 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 16

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P15 763 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P16 764 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P16 765 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P16 766 -/** - * Max latency bin - *

 

- *

 

- *

NVSwitch latency bins for port 17

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P16 767 - -/** - *

Low latency bin

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P17 768 -/** - * Medium latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P17 769 -/** - * High latency bin - */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P17 770 -/** - *

Max latency bin

- *

 

- *

 

- *

 

- *

NVSwitch Tx and Rx Counter 0 for each port

- *

By default, Counter 0 counts bytes.

- */ -#define DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P17 771 - -/** - *

NVSwitch Tx Bandwidth Counter 0 for port 0

- */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00 780 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 0 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00 781 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 1 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P01 782 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 1 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P01 783 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 2 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P02 784 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 2 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P02 785 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 3 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P03 786 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 3 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P03 787 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 4 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P04 788 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 4 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P04 789 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 5 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P05 790 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 5 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P05 791 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 6 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P06 792 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 6 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P06 793 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 7 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P07 794 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 7 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P07 795 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 8 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P08 796 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 8 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P08 797 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 9 - 
*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P09 798 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 9 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P09 799 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 10 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P10 800 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 10 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P10 801 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 11 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P11 802 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 11 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P11 803 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 12 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P12 804 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 12 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P12 805 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 13 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P13 806 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 13 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P13 807 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 14 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P14 808 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 14 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P14 809 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 15 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P15 810 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 15 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P15 811 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 16 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P16 812 -/** - * NVSwitch Rx Bandwidth Counter 0 for port 16 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P16 813 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 17 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P17 814 -/** - *

NVSwitch Rx Bandwidth Counter 0 for port 17

- *

 

- *

 

- *

 

- *

NVSwitch Tx and RX Bandwidth Counter 1 for each port

- *

By default, Counter 1 counts packets.

- */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P17 815 - -/** - *

NVSwitch Tx Bandwidth Counter 1 for port 0

- */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00 820 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 0 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P00 821 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 1 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P01 822 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 1 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P01 823 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 2 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P02 824 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 2 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P02 825 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 3 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P03 826 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 3 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P03 827 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 4 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P04 828 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 4 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P04 829 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 5 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P05 830 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 5 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P05 831 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 6 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P06 832 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 6 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P06 833 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 7 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P07 834 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 7 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P07 835 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 8 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P08 836 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 8 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P08 837 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 9 - 
*/ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P09 838 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 9 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P09 839 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 10 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P10 840 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 10 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P10 841 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 11 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P11 842 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 11 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P11 843 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 12 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P12 844 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 12 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P12 845 - -/** - * NVSwitch Tx Bandwidth Counter 0 for port 13 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P13 846 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 13 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P13 847 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 14 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P14 848 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 14 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P14 849 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 15 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P15 850 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 15 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P15 851 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 16 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P16 852 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 16 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P16 853 - -/** - * NVSwitch Tx Bandwidth Counter 1 for port 17 - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P17 854 -/** - * NVSwitch Rx Bandwidth Counter 1 for port 17 - *

 

- *

 

- *

 

- * NVSwitch error counters - */ -#define DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P17 855 - -/** - * NVSwitch fatal error information. - * Note: value field indicates the specific SXid reported - */ -#define DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS 856 - -/** - * NVSwitch non fatal error information. - * Note: value field indicates the specific SXid reported - */ -#define DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS 857 - -/** - * Starting field ID of the NVSwitch instance - */ -#define DCGM_FI_FIRST_NVSWITCH_FIELD_ID 700 - -/** - * Last field ID of the NVSwitch instance - */ -#define DCGM_FI_LAST_NVSWITCH_FIELD_ID 860 - -/** - * For now max NVSwitch field Ids taken as difference of DCGM_FI_LAST_NVSWITCH_FIELD_ID and - * DCGM_FI_FIRST_NVSWITCH_FIELD_ID + 1 i.e. 200 - */ -#define DCGM_FI_MAX_NVSWITCH_FIELDS DCGM_FI_LAST_NVSWITCH_FIELD_ID - DCGM_FI_FIRST_NVSWITCH_FIELD_ID + 1 - -/** - * Profiling Fields. These all start with DCGM_FI_PROF_* - */ - -/** - * Ratio of time the graphics engine is active. The graphics engine is - * active if a graphics/compute context is bound and the graphics pipe or - * compute pipe is busy. - */ -#define DCGM_FI_PROF_GR_ENGINE_ACTIVE 1001 - -/** - * The ratio of cycles an SM has at least 1 warp assigned - * (computed from the number of cycles and elapsed cycles) - */ -#define DCGM_FI_PROF_SM_ACTIVE 1002 - -/** - * The ratio of number of warps resident on an SM. - * (number of resident as a ratio of the theoretical - * maximum number of warps per elapsed cycle) - */ -#define DCGM_FI_PROF_SM_OCCUPANCY 1003 - -/** - * The ratio of cycles the tensor (HMMA) pipe is active - * (off the peak sustained elapsed cycles) - */ -#define DCGM_FI_PROF_PIPE_TENSOR_ACTIVE 1004 - -/** - * The ratio of cycles the device memory interface is - * active sending or receiving data. - */ -#define DCGM_FI_PROF_DRAM_ACTIVE 1005 - -/** - * Ratio of cycles the fp64 pipe is active. - */ -#define DCGM_FI_PROF_PIPE_FP64_ACTIVE 1006 - -/** - * Ratio of cycles the fp32 pipe is active. 
- */ -#define DCGM_FI_PROF_PIPE_FP32_ACTIVE 1007 - -/** - * Ratio of cycles the fp16 pipe is active. This does not include HMMA. - */ -#define DCGM_FI_PROF_PIPE_FP16_ACTIVE 1008 - -/** - * The number of bytes of active PCIe tx (transmit) data including both header and payload. - * - * Note that this is from the perspective of the GPU, so copying data from device to host (DtoH) - * would be reflected in this metric. - */ -#define DCGM_FI_PROF_PCIE_TX_BYTES 1009 - -/** - * The number of bytes of active PCIe rx (read) data including both header and payload. - * - * Note that this is from the perspective of the GPU, so copying data from host to device (HtoD) - * would be reflected in this metric. - */ -#define DCGM_FI_PROF_PCIE_RX_BYTES 1010 - -/** - * The number of bytes of active NvLink tx (transmit) data including both header and payload. - */ -#define DCGM_FI_PROF_NVLINK_TX_BYTES 1011 - -/** - * The number of bytes of active NvLink rx (read) data including both header and payload. - */ -#define DCGM_FI_PROF_NVLINK_RX_BYTES 1012 - -/** - * 1 greater than maximum fields above. This is the 1 greater than the maximum field id that could be allocated - */ -#define DCGM_FI_MAX_FIELDS 1013 - - -/** @} */ - -/*****************************************************************************/ - -/** - * Structure for formating the output for dmon. - * Used as a member in dcgm_field_meta_p - */ -typedef struct -{ - char shortName[10]; /*!< Short name corresponding to field. This short name is used to identify columns in dmon - output.*/ - char unit[4]; /*!< The unit of value. Eg: C(elsius), W(att), MB/s*/ - short width; /*!< Maximum width/number of digits that a value for field can have.*/ -} dcgm_field_output_format_t, *dcgm_field_output_format_p; - -/** - * Structure to store meta data for the field - */ - -typedef struct -{ - unsigned short fieldId; /*!< Field identifier. DCGM_FI_? #define */ - char fieldType; /*!< Field type. DCGM_FT_? 
#define */ - unsigned char size; /*!< field size in bytes (raw value size). 0=variable (like DCGM_FT_STRING) */ - char tag[48]; /*!< Tag for this field for serialization like 'device_temperature' */ - int scope; /*!< Field scope. DCGM_FS_? #define of this field's association */ - int nvmlFieldId; /*!< Optional NVML field this DCGM field maps to. 0 = no mapping. - Otherwise, this should be a NVML_FI_? #define from nvml.h */ - dcgm_field_entity_group_t - entityLevel; /*!< Field entity level. DCGM_FE_? specifying at what level the field is queryable */ - - dcgm_field_output_format_p valueFormat; /*!< pointer to the structure that holds the formatting the - values for fields */ -} dcgm_field_meta_t, *dcgm_field_meta_p; - -/***************************************************************************************************/ -/** @addtogroup dcgmFieldIdentifiers - * @{ - */ -/***************************************************************************************************/ - -/** - * Get a pointer to the metadata for a field by its field ID. See DCGM_FI_? for a list of field IDs. - * - * @param fieldId IN: One of the field IDs (DCGM_FI_?) - * - * @return - * 0 On Failure - * >0 Pointer to field metadata structure if found. - * - */ -dcgm_field_meta_p DcgmFieldGetById(unsigned short fieldId); - -/** - * Get a pointer to the metadata for a field by its field tag. - * - * @param tag IN: Tag for the field of interest - * - * @return - * 0 On failure or not found - * >0 Pointer to field metadata structure if found - * - */ -dcgm_field_meta_p DcgmFieldGetByTag(char *tag); - -/** - * Initialize the DcgmFields module. Call this once from inside - * your program - * - * @return - * 0 On success - * <0 On error - * - */ -int DcgmFieldsInit(void); - -/** - * Terminates the DcgmFields module. 
Call this once from inside your program - * - * @return - * 0 On success - * <0 On error - * - */ -int DcgmFieldsTerm(void); - -/** - * Get the string version of a entityGroupId - * - * @returns - * - Pointer to a string like GPU/NvSwitch..etc - * - Null on error - * - */ -const char *DcgmFieldsGetEntityGroupString(dcgm_field_entity_group_t entityGroupId); - -/** @} */ - - -#ifdef __cplusplus -} -#endif - - -#endif // DCGMFIELDS_H diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/dcgm_structs.h b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/dcgm_structs.h deleted file mode 100644 index b038e5e9..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/dcgm_structs.h +++ /dev/null @@ -1,2958 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * File: dcgm_structs.h - */ - -#ifndef DCGM_STRUCTS_H -#define DCGM_STRUCTS_H - -#include "dcgm_fields.h" -#include - - -/***************************************************************************************************/ -/** @defgroup dcgmReturnEnums Enums and Macros - * @{ - */ -/***************************************************************************************************/ - -/** - * Creates a unique version number for each struct - */ -#define MAKE_DCGM_VERSION(typeName, ver) (unsigned int)(sizeof(typeName) | ((unsigned long)(ver) << 24U)) - -/** - * Represents value of the field which can be returned by Host Engine in case the - * operation is not successful - */ -#ifndef DCGM_BLANK_VALUES -#define DCGM_BLANK_VALUES - -/** - * Base value for 32 bits integer blank. can be used as an unspecified blank - */ -#define DCGM_INT32_BLANK 0x7ffffff0 - -/** - * Base value for 64 bits integer blank. can be used as an unspecified blank - */ -#define DCGM_INT64_BLANK 0x7ffffffffffffff0 - -/** - * Base value for double blank. 2 ** 47. FP 64 has 52 bits of mantissa, - * so 47 bits can still increment by 1 and represent each value from 0-15 - */ -#define DCGM_FP64_BLANK 140737488355328.0 - -/** - * Base value for string blank. 
- */ -#define DCGM_STR_BLANK "<<>>" - -/** - * Represents an error where INT32 data was not found - */ -#define DCGM_INT32_NOT_FOUND (DCGM_INT32_BLANK + 1) - -/** - * Represents an error where INT64 data was not found - */ -#define DCGM_INT64_NOT_FOUND (DCGM_INT64_BLANK + 1) - -/** - * Represents an error where FP64 data was not found - */ -#define DCGM_FP64_NOT_FOUND (DCGM_FP64_BLANK + 1.0) - -/** - * Represents an error where STR data was not found - */ -#define DCGM_STR_NOT_FOUND "<<>>" - -/** - * Represents an error where fetching the INT32 value is not supported - */ -#define DCGM_INT32_NOT_SUPPORTED (DCGM_INT32_BLANK + 2) - -/** - * Represents an error where fetching the INT64 value is not supported - */ -#define DCGM_INT64_NOT_SUPPORTED (DCGM_INT64_BLANK + 2) - -/** - * Represents an error where fetching the FP64 value is not supported - */ -#define DCGM_FP64_NOT_SUPPORTED (DCGM_FP64_BLANK + 2.0) - -/** - * Represents an error where fetching the STR value is not supported - */ -#define DCGM_STR_NOT_SUPPORTED "<<>>" - -/** - * Represents and error where fetching the INT32 value is not allowed with our current credentials - */ -#define DCGM_INT32_NOT_PERMISSIONED (DCGM_INT32_BLANK + 3) - -/** - * Represents and error where fetching the INT64 value is not allowed with our current credentials - */ -#define DCGM_INT64_NOT_PERMISSIONED (DCGM_INT64_BLANK + 3) - -/** - * Represents and error where fetching the FP64 value is not allowed with our current credentials - */ -#define DCGM_FP64_NOT_PERMISSIONED (DCGM_FP64_BLANK + 3.0) - -/** - * Represents and error where fetching the STR value is not allowed with our current credentials - */ -#define DCGM_STR_NOT_PERMISSIONED "<<>>" - -/** - * Macro to check if a INT32 value is blank or not - */ -#define DCGM_INT32_IS_BLANK(val) (((val) >= DCGM_INT32_BLANK) ? 1 : 0) - -/** - * Macro to check if a INT64 value is blank or not - */ -#define DCGM_INT64_IS_BLANK(val) (((val) >= DCGM_INT64_BLANK) ? 
1 : 0) - -/** - * Macro to check if a FP64 value is blank or not - */ -#define DCGM_FP64_IS_BLANK(val) (((val) >= DCGM_FP64_BLANK ? 1 : 0)) - -/** - * Macro to check if a STR value is blank or not - * Works on (char *). Looks for <<< at first position and >>> inside string - */ -#define DCGM_STR_IS_BLANK(val) (val == strstr(val, "<<<") && strstr(val, ">>>")) - -#endif // DCGM_BLANK_VALUES - -/** - * Max number of GPUs supported by DCGM - */ -#define DCGM_MAX_NUM_DEVICES 32 /* DCGM 2.0 and newer = 32. DCGM 1.8 and older = 16. */ - -/** - * Number of NvLink links per GPU supported by DCGM - * This is 12 for Ampere, 6 for Volta, and 4 for Pascal - */ -#define DCGM_NVLINK_MAX_LINKS_PER_GPU 12 - -/** - * Maximum NvLink links pre-Ampere - */ -#define DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1 6 - -/** - * Max number of NvSwitches supported by DCGM - **/ -#define DCGM_MAX_NUM_SWITCHES 12 - -/** - * Number of NvLink links per NvSwitch supported by DCGM - */ -#define DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH 36 - -/** - * Maximum number of vGPU instances per physical GPU - */ -#define DCGM_MAX_VGPU_INSTANCES_PER_PGPU 32 - -/** - * Max length of the DCGM string field - */ -#define DCGM_MAX_STR_LENGTH 256 - -/** - * Max number of clocks supported for a device - */ -#define DCGM_MAX_CLOCKS 256 - -/** - * Max limit on the number of groups supported by DCGM - */ -#define DCGM_MAX_NUM_GROUPS 64 - -/** - * Max number of active FBC sessions - */ -#define DCGM_MAX_FBC_SESSIONS 256 - -/** - * Represents the size of a buffer that holds a vGPU type Name or vGPU class type or name of process running on vGPU - * instance. 
- */ -#define DCGM_VGPU_NAME_BUFFER_SIZE 64 - -/** - * Represents the size of a buffer that holds a vGPU license string - */ -#define DCGM_GRID_LICENSE_BUFFER_SIZE 128 - -/** - * Default compute mode -- multiple contexts per device - */ -#define DCGM_CONFIG_COMPUTEMODE_DEFAULT 0 - -/** - * Compute-prohibited mode -- no contexts per device - */ -#define DCGM_CONFIG_COMPUTEMODE_PROHIBITED 1 - -/** - * Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time - */ -#define DCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS 2 - -/** - * Default Port Number for DCGM Host Engine - */ -#define DCGM_HE_PORT_NUMBER 5555 - -#ifdef __cplusplus -extern "C" { -#endif - -/** - * Operation mode for DCGM - * - * DCGM can run in auto-mode where it runs additional threads in the background to collect - * any metrics of interest and auto manages any operations needed for policy management. - * - * DCGM can also operate in manual-mode where it's execution is controlled by the user. In - * this mode, the user has to periodically call APIs such as \ref dcgmPolicyTrigger and - * \ref dcgmUpdateAllFields which tells DCGM to wake up and perform data collection and - * operations needed for policy management. - */ -typedef enum dcgmOperationMode_enum -{ - DCGM_OPERATION_MODE_AUTO = 1, - DCGM_OPERATION_MODE_MANUAL = 2 -} dcgmOperationMode_t; - -/** - * When more than one value is returned from a query, which order should it be returned in? - */ -typedef enum dcgmOrder_enum -{ - DCGM_ORDER_ASCENDING = 1, //!< Data with earliest (lowest) timestamps returned first - DCGM_ORDER_DESCENDING = 2 //!< Data with latest (highest) timestamps returned first -} dcgmOrder_t; - -/** - * Return values for DCGM API calls. 
- */ -typedef enum dcgmReturn_enum -{ - DCGM_ST_OK = 0, //!< Success - DCGM_ST_BADPARAM = -1, //!< A bad parameter was passed to a function - DCGM_ST_GENERIC_ERROR = -3, //!< A generic, unspecified error - DCGM_ST_MEMORY = -4, //!< An out of memory error occurred - DCGM_ST_NOT_CONFIGURED = -5, //!< Setting not configured - DCGM_ST_NOT_SUPPORTED = -6, //!< Feature not supported - DCGM_ST_INIT_ERROR = -7, //!< DCGM Init error - DCGM_ST_NVML_ERROR = -8, //!< When NVML returns error - DCGM_ST_PENDING = -9, //!< Object is in pending state of something else - DCGM_ST_UNINITIALIZED = -10, //!< Object is in undefined state - DCGM_ST_TIMEOUT = -11, //!< Requested operation timed out - DCGM_ST_VER_MISMATCH = -12, //!< Version mismatch between received and understood API - DCGM_ST_UNKNOWN_FIELD = -13, //!< Unknown field id - DCGM_ST_NO_DATA = -14, //!< No data is available - DCGM_ST_STALE_DATA = -15, //!< Data is considered stale - DCGM_ST_NOT_WATCHED = -16, //!< The given field id is not being updated by the cache manager - DCGM_ST_NO_PERMISSION = -17, //!< Do not have permission to perform the desired action - DCGM_ST_GPU_IS_LOST = -18, //!< GPU is no longer reachable - DCGM_ST_RESET_REQUIRED = -19, //!< GPU requires a reset - DCGM_ST_FUNCTION_NOT_FOUND = -20, //!< The function that was requested was not found (bindings only error) - DCGM_ST_CONNECTION_NOT_VALID = -21, //!< The connection to the host engine is not valid any longer - DCGM_ST_GPU_NOT_SUPPORTED = -22, //!< This GPU is not supported by DCGM - DCGM_ST_GROUP_INCOMPATIBLE = -23, //!< The GPUs of the provided group are not compatible with each other for the - //!< requested operation - DCGM_ST_MAX_LIMIT = -24, //!< Max limit reached for the object - DCGM_ST_LIBRARY_NOT_FOUND = -25, //!< DCGM library could not be found - DCGM_ST_DUPLICATE_KEY = -26, //!< Duplicate key passed to a function - DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27, //!< GPU is already a part of a sync boost group - DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = 
-28, //!< GPU is not a part of a sync boost group - DCGM_ST_REQUIRES_ROOT = -29, //!< This operation cannot be performed when the host engine is running as non-root - DCGM_ST_NVVS_ERROR = -30, //!< DCGM GPU Diagnostic was successfully executed, but reported an error. - DCGM_ST_INSUFFICIENT_SIZE = -31, //!< An input argument is not large enough - DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32, //!< The given field ID is not supported by the API being called - DCGM_ST_MODULE_NOT_LOADED = -33, //!< This request is serviced by a module of DCGM that is not currently loaded - DCGM_ST_IN_USE = -34, //!< The requested operation could not be completed because the affected - //!< resource is in use - DCGM_ST_GROUP_IS_EMPTY = -35, //!< This group is empty and the requested operation is not valid on an empty group - DCGM_ST_PROFILING_NOT_SUPPORTED = -36, //!< Profiling is not supported for this group of GPUs or GPU. - DCGM_ST_PROFILING_LIBRARY_ERROR = -37, //!< The third-party Profiling module returned an unrecoverable error. - DCGM_ST_PROFILING_MULTI_PASS = -38, //!< The requested profiling metrics cannot be collected in a single pass - DCGM_ST_DIAG_ALREADY_RUNNING = -39, //!< A diag instance is already running, cannot run a new diag until - //!< the current one finishes. - DCGM_ST_DIAG_BAD_JSON = -40, //!< The DCGM GPU Diagnostic returned JSON that cannot be parsed - DCGM_ST_DIAG_BAD_LAUNCH = -41, //!< Error while launching the DCGM GPU Diagnostic - DCGM_ST_DIAG_VARIANCE = -42, //!< There is too much variance while training the diagnostic - DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43, //!< A field value met or exceeded the error threshold. 
- DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44, //!< The installed driver version is insufficient for this API - DCGM_ST_INSTANCE_NOT_FOUND = -45, //!< The specified GPU instance does not exist - DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46, //!< The specified GPU compute instance does not exist - DCGM_ST_CHILD_NOT_KILLED = -47, //!< Couldn't kill a child process within the retries - DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48, //!< Detected an error in a 3rd-party library - DCGM_ST_INSUFFICIENT_RESOURCES = -49, //!< Not enough resources available - DCGM_ST_PLUGIN_EXCEPTION = -50, //!< Exception thrown from a diagnostic plugin - DCGM_ST_NVVS_ISOLATE_ERROR = -51, //!< The diagnostic returned an error that indicates the need for isolation -} dcgmReturn_t; - -const char *errorString(dcgmReturn_t result); - -/** - * Type of GPU groups - */ -typedef enum dcgmGroupType_enum -{ - DCGM_GROUP_DEFAULT = 0, //!< All the GPUs on the node are added to the group - DCGM_GROUP_EMPTY = 1, //!< Creates an empty group - DCGM_GROUP_DEFAULT_NVSWITCHES = 2, //!< All NvSwitches of the node are added to the group - DCGM_GROUP_DEFAULT_INSTANCES = 3, //!< All GPU instances of the node are added to the group - DCGM_GROUP_DEFAULT_COMPUTE_INSTANCES = 4, //!< All compute instances of the node are added to the group - DCGM_GROUP_DEFAULT_EVERYTHING = 5, //!< All entities are added to this default group -} dcgmGroupType_t; - -/** - * Identifies for special DCGM groups - */ -#define DCGM_GROUP_ALL_GPUS 0x7fffffff -#define DCGM_GROUP_ALL_NVSWITCHES 0x7ffffffe -#define DCGM_GROUP_ALL_INSTANCES 0x7ffffffd -#define DCGM_GROUP_ALL_COMPUTE_INSTANCES 0x7ffffffc -#define DCGM_GROUP_ALL_ENTITIES 0x7ffffffb - -/** - * Maximum number of entities per entity group - */ -#define DCGM_GROUP_MAX_ENTITIES 64 - -/** - * Simplified chip architecture. Note that these are made to match nvmlChipArchitecture_t and thus - * do not start at 0. 
- */ -typedef enum dcgmChipArchitecture_enum -{ - DCGM_CHIP_ARCH_OLDER = 1, //!< All GPUs older than Kepler - DCGM_CHIP_ARCH_KEPLER = 2, //!< All Kepler-architecture parts - DCGM_CHIP_ARCH_MAXWELL = 3, //!< All Maxwell-architecture parts - DCGM_CHIP_ARCH_PASCAL = 4, //!< All Pascal-architecture parts - DCGM_CHIP_ARCH_VOLTA = 5, //!< All Volta-architecture parts - DCGM_CHIP_ARCH_TURING = 6, //!< All Turing-architecture parts - DCGM_CHIP_ARCH_AMPERE = 7, //!< All Ampere-architecture parts - - DCGM_CHIP_ARCH_COUNT, //!< Keep this second to last, exclude unknown - - DCGM_CHIP_ARCH_UNKNOWN = 0xffffffff //!< Anything else, presumably something newer -} dcgmChipArchitecture_t; - -/** - * Represents the type of configuration to be fetched from the GPUs - */ -typedef enum dcgmConfigType_enum -{ - DCGM_CONFIG_TARGET_STATE = 0, //!< The target configuration values to be applied - DCGM_CONFIG_CURRENT_STATE = 1, //!< The current configuration state -} dcgmConfigType_t; - -/** - * Represents the power cap for each member of the group. - */ -typedef enum dcgmConfigPowerLimitType_enum -{ - DCGM_CONFIG_POWER_CAP_INDIVIDUAL = 0, //!< Represents the power cap to be applied for each member of the group - DCGM_CONFIG_POWER_BUDGET_GROUP = 1, //!< Represents the power budget for the entire group -} dcgmConfigPowerLimitType_t; - -/** @} */ - - -/***************************************************************************************************/ -/** @defgroup dcgmStructs Structure definitions - * @{ - */ -/***************************************************************************************************/ -typedef uintptr_t dcgmHandle_t; //!< Identifier for DCGM Handle -typedef uintptr_t dcgmGpuGrp_t; //!< Identifier for a group of GPUs. A group can have one or more GPUs -typedef uintptr_t dcgmFieldGrp_t; //!< Identifier for a group of fields. -typedef uintptr_t dcgmStatus_t; //!< Identifier for list of status codes - -/** - * DCGM Logging Severities. 
These match up with plog severities defined in Severity.h - * Each level includes all of the levels above it. For instance, level 4 includes 3,2, and 1 as well - */ -typedef enum -{ - DcgmLoggingSeverityUnspecified = -1, /*!< Don't care/inherit from the environment */ - DcgmLoggingSeverityNone = 0, /*!< No logging */ - DcgmLoggingSeverityFatal = 1, /*!< Fatal Errors */ - DcgmLoggingSeverityError = 2, /*!< Errors */ - DcgmLoggingSeverityWarning = 3, /*!< Warnings */ - DcgmLoggingSeverityInfo = 4, /*!< Informative */ - DcgmLoggingSeverityDebug = 5, /*!< Debug information (will generate large logs) */ - DcgmLoggingSeverityVerbose = 6 /*!< Verbose debugging information */ -} DcgmLoggingSeverity_t; - -/** - * Connection options for dcgmConnect_v2 (v1) - * - * NOTE: This version is deprecated. use dcgmConnectV2Params_v2 - */ -typedef struct -{ - unsigned int version; /*!< Version number. Use dcgmConnectV2Params_version */ - unsigned int persistAfterDisconnect; /*!< Whether to persist DCGM state modified by this connection - once the connection is terminated. Normally, all field - watches created by a connection are removed once a - connection goes away. - 1 = do not clean up after this connection. - 0 = clean up after this connection */ -} dcgmConnectV2Params_v1; - -/** - * Version 1 for \ref dcgmConnectV2Params_v1 - */ -#define dcgmConnectV2Params_version1 MAKE_DCGM_VERSION(dcgmConnectV2Params_v1, 1) - -/** - * Connection options for dcgmConnect_v2 (v2) - */ -typedef struct -{ - unsigned int version; /*!< Version number. Use dcgmConnectV2Params_version */ - unsigned int persistAfterDisconnect; /*!< Whether to persist DCGM state modified by this connection once the - connection is terminated. Normally, all field watches created by a - connection are removed once a connection goes away. 1 = do not clean up - after this connection. 
0 = clean up after this connection */ - unsigned int timeoutMs; /*!< When attempting to connect to the specified host engine, how long should - we wait in milliseconds before giving up */ - unsigned int addressIsUnixSocket; /*!< Whether or not the passed-in address is a unix socket filename (1) or a - TCP/IP address (0) */ -} dcgmConnectV2Params_v2; - -/** - * Typedef for \ref dcgmConnectV2Params_v2 - */ -typedef dcgmConnectV2Params_v2 dcgmConnectV2Params_t; - -/** - * Version 2 for \ref dcgmConnectV2Params_v2 - */ -#define dcgmConnectV2Params_version2 MAKE_DCGM_VERSION(dcgmConnectV2Params_v2, 2) - -/** - * Latest version for \ref dcgmConnectV2Params_t - */ -#define dcgmConnectV2Params_version dcgmConnectV2Params_version2 - -/** - * Typedef for \ref dcgmHostengineHealth_v1 - */ -typedef struct -{ - unsigned int version; //!< The version of this request - unsigned int overallHealth; //!< 0 to indicate healthy, or a code to indicate the error - // For now, this will always be populated with 0 if the - // hostengine can respond. In the future this will be - // updated to have other options like NVML unresponsive, - // no GPUs on system, etc. 
-} dcgmHostengineHealth_v1; - -/** - * Typedef for \ref dcgmHostengineHealth_t - */ -typedef dcgmHostengineHealth_v1 dcgmHostengineHealth_t; - -#define dcgmHostengineHealth_version1 MAKE_DCGM_VERSION(dcgmHostengineHealth_v1, 1) - -/** - * Latest version for \ref dcgmHostengineHealth_t - */ -#define dcgmHostengineHealth_version dcgmHostengineHealth_version1 - -/** - * Represents a entityGroupId + entityId pair to uniquely identify a given entityId inside a group of entities - * - * Added in DCGM 1.5.0 - */ -typedef struct -{ - dcgm_field_entity_group_t entityGroupId; //!< Entity Group ID entity belongs to - dcgm_field_eid_t entityId; //!< Entity ID of the entity -} dcgmGroupEntityPair_t; - -/** - * Structure to store information for DCGM group - * - * Added in DCGM 1.5.0 - */ -typedef struct -{ - unsigned int version; //!< Version Number (use dcgmGroupInfo_version2) - unsigned int count; //!< count of entityIds returned in \a entityList - char groupName[DCGM_MAX_STR_LENGTH]; //!< Group Name - dcgmGroupEntityPair_t entityList[DCGM_GROUP_MAX_ENTITIES]; //!< List of the entities that are in this group -} dcgmGroupInfo_v2; - -/** - * Typedef for \ref dcgmGroupInfo_v2 - */ -typedef dcgmGroupInfo_v2 dcgmGroupInfo_t; - -/** - * Version 2 for \ref dcgmGroupInfo_v2 - */ -#define dcgmGroupInfo_version2 MAKE_DCGM_VERSION(dcgmGroupInfo_v2, 2) - -/** - * Latest version for \ref dcgmGroupInfo_t - */ -#define dcgmGroupInfo_version dcgmGroupInfo_version2 - -/** - * Enum for the different kinds of MIG profiles - */ -typedef enum -{ - DcgmMigProfileNone = 0, /*!< No profile (for GPUs) */ - DcgmMigProfileGpuInstanceSlice1 = 1, /*!< GPU instance slice 1 */ - DcgmMigProfileGpuInstanceSlice2 = 2, /*!< GPU instance slice 2 */ - DcgmMigProfileGpuInstanceSlice3 = 3, /*!< GPU instance slice 3 */ - DcgmMigProfileGpuInstanceSlice4 = 4, /*!< GPU instance slice 4 */ - DcgmMigProfileGpuInstanceSlice7 = 5, /*!< GPU instance slice 7 */ - DcgmMigProfileGpuInstanceSlice8 = 6, /*!< GPU instance slice 
8 */ - DcgmMigProfileComputeInstanceSlice1 = 30, /*!< compute instance slice 1 */ - DcgmMigProfileComputeInstanceSlice2 = 31, /*!< compute instance slice 2 */ - DcgmMigProfileComputeInstanceSlice3 = 32, /*!< compute instance slice 3 */ - DcgmMigProfileComputeInstanceSlice4 = 33, /*!< compute instance slice 4*/ - DcgmMigProfileComputeInstanceSlice7 = 34, /*!< compute instance slice 7 */ - DcgmMigProfileComputeInstanceSlice8 = 35, /*!< compute instance slice 8 */ -} dcgmMigProfile_t; - -/** - * Represents a pair of entity pairings to uniquely identify an entity and its place in the hierarchy. - */ -typedef struct -{ - dcgmGroupEntityPair_t entity; //!< Entity id and type for the entity in question - dcgmGroupEntityPair_t parent; //!< Entity id and type for the parent of the entity in question - dcgmMigProfile_t sliceProfile; //!< Entity MIG profile identifier -} dcgmMigHierarchyInfo_t; - -/** - * Provides additional information about location of MIG entities. - */ -typedef struct -{ - char gpuUuid[128]; /*!< GPU UUID */ - unsigned int nvmlGpuIndex; /*!< GPU index from NVML */ - unsigned int nvmlInstanceId; /*!< GPU instance index within GPU. 0 to N. -1 for GPU entities */ - unsigned int nvmlComputeInstanceId; /*!< GPU Compute instance index within GPU instance. 0 to N. -1 for GPU - * Instance and GPU entities */ - unsigned int nvmlMigProfileId; /*!< Unique profile ID for GPU or Compute instances. 
-1 GPU entities - * \see nvmlComputeInstanceProfileInfo_st - * \see nvmlGpuInstanceProfileInfo_st */ - unsigned int nvmlProfileSlices; /*!< Number of slices in the MIG profile */ -} dcgmMigEntityInfo_t; - -typedef struct -{ - dcgmGroupEntityPair_t entity; - dcgmGroupEntityPair_t parent; - dcgmMigEntityInfo_t info; -} dcgmMigHierarchyInfo_v2; - -#define DCGM_MAX_INSTANCES_PER_GPU 8 -// There can never be more compute instances per GPU than instances per GPU because a compute instance is part -// of an instance -#define DCGM_MAX_COMPUTE_INSTANCES_PER_GPU DCGM_MAX_INSTANCES_PER_GPU -// Currently, there cannot be more than 14 instances + compute instances. There are always 7 compute instances -// and never more than 7 instances -#define DCGM_MAX_TOTAL_INSTANCES_PER_GPU 14 -#define DCGM_MAX_HIERARCHY_INFO DCGM_MAX_NUM_DEVICES *DCGM_MAX_TOTAL_INSTANCES_PER_GPU -#define DCGM_MAX_INSTANCES DCGM_MAX_NUM_DEVICES *DCGM_MAX_INSTANCES_PER_GPU -// The maximum compute instances are always the same as the maximum instances because each compute instance is -// part of an instance. 
-#define DCGM_MAX_COMPUTE_INSTANCES DCGM_MAX_INSTANCES - -/** - * Structure to store the GPU hierarchy for a system - * - * Added in DCGM 2.0 - */ -typedef struct -{ - unsigned int version; - unsigned int count; - dcgmMigHierarchyInfo_t entityList[DCGM_MAX_HIERARCHY_INFO]; -} dcgmMigHierarchy_v1; - -#define dcgmMigHierarchy_version1 MAKE_DCGM_VERSION(dcgmMigHierarchy_v1, 1) - -typedef struct -{ - unsigned int version; - unsigned int count; - dcgmMigHierarchyInfo_v2 entityList[DCGM_MAX_HIERARCHY_INFO]; -} dcgmMigHierarchy_v2; - -#define dcgmMigHierarchy_version2 MAKE_DCGM_VERSION(dcgmMigHierarchy_v2, 2) - -#define dcgmMigHierarchy_version dcgmMigHiearchyVersion2 - -/** - * Maximum number of field groups that can exist - */ -#define DCGM_MAX_NUM_FIELD_GROUPS 64 - -/** - * Maximum number of field IDs that can be in a single field group - */ -#define DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP 128 - -/** - * Structure to represent information about a field group - */ -typedef struct -{ - unsigned int version; //!< Version number (dcgmFieldGroupInfo_version) - unsigned int numFieldIds; //!< Number of entries in fieldIds[] that are valid - dcgmFieldGrp_t fieldGroupId; //!< ID of this field group - char fieldGroupName[DCGM_MAX_STR_LENGTH]; //!< Field Group Name - unsigned short fieldIds[DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP]; //!< Field ids that belong to this group -} dcgmFieldGroupInfo_v1; - -typedef dcgmFieldGroupInfo_v1 dcgmFieldGroupInfo_t; - -/** - * Version 1 for dcgmFieldGroupInfo_v1 - */ -#define dcgmFieldGroupInfo_version1 MAKE_DCGM_VERSION(dcgmFieldGroupInfo_v1, 1) - -/** - * Latest version for dcgmFieldGroupInfo_t - */ -#define dcgmFieldGroupInfo_version dcgmFieldGroupInfo_version1 - -typedef struct -{ - unsigned int version; //!< Version number (dcgmAllFieldGroupInfo_version) - unsigned int numFieldGroups; //!< Number of entries in fieldGroups[] that are populated - dcgmFieldGroupInfo_t fieldGroups[DCGM_MAX_NUM_FIELD_GROUPS]; //!< Info about each field group -} 
dcgmAllFieldGroup_v1; - -typedef dcgmAllFieldGroup_v1 dcgmAllFieldGroup_t; - -/** - * Version 1 for dcgmAllFieldGroup_v1 - */ -#define dcgmAllFieldGroup_version1 MAKE_DCGM_VERSION(dcgmAllFieldGroup_v1, 1) - -/** - * Latest version for dcgmAllFieldGroup_t - */ -#define dcgmAllFieldGroup_version dcgmAllFieldGroup_version1 - -/** - * Structure to represent error attributes - */ -typedef struct -{ - unsigned int gpuId; //!< Represents GPU ID - short fieldId; //!< One of DCGM_FI_? - int status; //!< One of DCGM_ST_? -} dcgmErrorInfo_t; - -/** - * Represents a set of memory, SM, and video clocks for a device. This can be current values or a target values - * based on context - */ -typedef struct -{ - int version; //!< Version Number (dcgmClockSet_version) - unsigned int memClock; //!< Memory Clock (Memory Clock value OR DCGM_INT32_BLANK to Ignore/Use compatible - //!< value with smClk) - unsigned int smClock; //!< SM Clock (SM Clock value OR DCGM_INT32_BLANK to Ignore/Use compatible value with memClk) -} dcgmClockSet_v1; - -/** - * Typedef for \ref dcgmClockSet_v1 - */ -typedef dcgmClockSet_v1 dcgmClockSet_t; - -/** - * Version 1 for \ref dcgmClockSet_v1 - */ -#define dcgmClockSet_version1 MAKE_DCGM_VERSION(dcgmClockSet_v1, 1) - -/** - * Latest version for \ref dcgmClockSet_t - */ -#define dcgmClockSet_version dcgmClockSet_version1 - -/** - * Represents list of supported clock sets for a device - */ -typedef struct -{ - unsigned int version; //!< Version Number (dcgmDeviceSupportedClockSets_version) - unsigned int count; //!< Number of supported clocks - dcgmClockSet_t clockSet[DCGM_MAX_CLOCKS]; //!< Valid clock sets for the device. 
Upto \ref count entries are filled -} dcgmDeviceSupportedClockSets_v1; - -/** - * Typedef for \ref dcgmDeviceSupportedClockSets_v1 - */ -typedef dcgmDeviceSupportedClockSets_v1 dcgmDeviceSupportedClockSets_t; - -/** - * Version 1 for \ref dcgmDeviceSupportedClockSets_v1 - */ -#define dcgmDeviceSupportedClockSets_version1 MAKE_DCGM_VERSION(dcgmDeviceSupportedClockSets_v1, 1) - -/** - * Latest version for \ref dcgmDeviceSupportedClockSets_t - */ -#define dcgmDeviceSupportedClockSets_version dcgmDeviceSupportedClockSets_version1 - -/** - * Represents accounting data for one process - */ -typedef struct -{ - unsigned int version; //!< Version Number. Should match dcgmDevicePidAccountingStats_version - unsigned int pid; //!< Process id of the process these stats are for - unsigned int gpuUtilization; //!< Percent of time over the process's lifetime during which one or more kernels - //!< was executing on the GPU. - //!< Set to DCGM_INT32_NOT_SUPPORTED if is not supported - unsigned int memoryUtilization; //!< Percent of time over the process's lifetime during which global (device) - //!< memory was being read or written. - //!< Set to DCGM_INT32_NOT_SUPPORTED if is not supported - unsigned long long maxMemoryUsage; //!< Maximum total memory in bytes that was ever allocated by the process. - //!< Set to DCGM_INT64_NOT_SUPPORTED if is not supported - unsigned long long startTimestamp; //!< CPU Timestamp in usec representing start time for the process - unsigned long long activeTimeUsec; //!< Amount of time in usec during which the compute context was active. - //!< Note that this does not mean the context was being used. 
endTimestamp - //!< can be computed as startTimestamp + activeTime -} dcgmDevicePidAccountingStats_v1; - -/** - * Typedef for \ref dcgmDevicePidAccountingStats_v1 - */ -typedef dcgmDevicePidAccountingStats_v1 dcgmDevicePidAccountingStats_t; - -/** - * Version 1 for \ref dcgmDevicePidAccountingStats_v1 - */ -#define dcgmDevicePidAccountingStats_version1 MAKE_DCGM_VERSION(dcgmDevicePidAccountingStats_v1, 1) - -/** - * Latest version for \ref dcgmDevicePidAccountingStats_t - */ -#define dcgmDevicePidAccountingStats_version dcgmDevicePidAccountingStats_version1 - -/** - * Represents thermal information - */ -typedef struct -{ - unsigned int version; //!< Version Number - unsigned int slowdownTemp; //!< Slowdown temperature - unsigned int shutdownTemp; //!< Shutdown temperature -} dcgmDeviceThermals_v1; - -/** - * Typedef for \ref dcgmDeviceThermals_v1 - */ -typedef dcgmDeviceThermals_v1 dcgmDeviceThermals_t; - -/** - * Version 1 for \ref dcgmDeviceThermals_v1 - */ -#define dcgmDeviceThermals_version1 MAKE_DCGM_VERSION(dcgmDeviceThermals_v1, 1) - -/** - * Latest version for \ref dcgmDeviceThermals_t - */ -#define dcgmDeviceThermals_version dcgmDeviceThermals_version1 - -/** - * Represents various power limits - */ -typedef struct -{ - unsigned int version; //!< Version Number - unsigned int curPowerLimit; //!< Power management limit associated with this device (in W) - unsigned int defaultPowerLimit; //!< Power management limit effective at device boot (in W) - unsigned int enforcedPowerLimit; //!< Effective power limit that the driver enforces after taking into account - //!< all limiters (in W) - unsigned int minPowerLimit; //!< Minimum power management limit (in W) - unsigned int maxPowerLimit; //!< Maximum power management limit (in W) -} dcgmDevicePowerLimits_v1; - -/** - * Typedef for \ref dcgmDevicePowerLimits_v1 - */ -typedef dcgmDevicePowerLimits_v1 dcgmDevicePowerLimits_t; - -/** - * Version 1 for \ref dcgmDevicePowerLimits_v1 - */ -#define 
dcgmDevicePowerLimits_version1 MAKE_DCGM_VERSION(dcgmDevicePowerLimits_v1, 1) - -/** - * Latest version for \ref dcgmDevicePowerLimits_t - */ -#define dcgmDevicePowerLimits_version dcgmDevicePowerLimits_version1 - -/** - * Represents device identifiers - */ -typedef struct -{ - unsigned int version; //!< Version Number (dcgmDeviceIdentifiers_version) - char brandName[DCGM_MAX_STR_LENGTH]; //!< Brand Name - char deviceName[DCGM_MAX_STR_LENGTH]; //!< Name of the device - char pciBusId[DCGM_MAX_STR_LENGTH]; //!< PCI Bus ID - char serial[DCGM_MAX_STR_LENGTH]; //!< Serial for the device - char uuid[DCGM_MAX_STR_LENGTH]; //!< UUID for the device - char vbios[DCGM_MAX_STR_LENGTH]; //!< VBIOS version - char inforomImageVersion[DCGM_MAX_STR_LENGTH]; //!< Inforom Image version - unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id - unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID - char driverVersion[DCGM_MAX_STR_LENGTH]; //!< Driver Version - unsigned int virtualizationMode; //!< Virtualization Mode -} dcgmDeviceIdentifiers_v1; - -/** - * Typedef for \ref dcgmDeviceIdentifiers_v1 - */ -typedef dcgmDeviceIdentifiers_v1 dcgmDeviceIdentifiers_t; - -/** - * Version 1 for \ref dcgmDeviceIdentifiers_v1 - */ -#define dcgmDeviceIdentifiers_version1 MAKE_DCGM_VERSION(dcgmDeviceIdentifiers_v1, 1) - -/** - * Latest version for \ref dcgmDeviceIdentifiers_t - */ -#define dcgmDeviceIdentifiers_version dcgmDeviceIdentifiers_version1 - -/** - * Represents device memory and usage - */ -typedef struct -{ - unsigned int version; //!< Version Number (dcgmDeviceMemoryUsage_version) - unsigned int bar1Total; //!< Total BAR1 size in megabytes - unsigned int fbTotal; //!< Total framebuffer memory in megabytes - unsigned int fbUsed; //!< Used framebuffer memory in megabytes - unsigned int fbFree; //!< Free framebuffer memory in megabytes -} dcgmDeviceMemoryUsage_v1; - -/** - * Typedef for \ref dcgmDeviceMemoryUsage_v1 - */ -typedef 
dcgmDeviceMemoryUsage_v1 dcgmDeviceMemoryUsage_t; - -/** - * Version 1 for \ref dcgmDeviceMemoryUsage_v1 - */ -#define dcgmDeviceMemoryUsage_version1 MAKE_DCGM_VERSION(dcgmDeviceMemoryUsage_v1, 1) - -/** - * Latest version for \ref dcgmDeviceMemoryUsage_t - */ -#define dcgmDeviceMemoryUsage_version dcgmDeviceMemoryUsage_version1 - -/** - * Represents utilization values for vGPUs running on the device - */ -typedef struct -{ - unsigned int version; //!< Version Number (dcgmDeviceVgpuUtilInfo_version) - unsigned int vgpuId; //!< vGPU instance ID - unsigned int smUtil; //!< GPU utilization for vGPU - unsigned int memUtil; //!< Memory utilization for vGPU - unsigned int encUtil; //!< Encoder utilization for vGPU - unsigned int decUtil; //!< Decoder utilization for vGPU -} dcgmDeviceVgpuUtilInfo_v1; - -/** - * Typedef for \ref dcgmDeviceVgpuUtilInfo_v1 - */ -typedef dcgmDeviceVgpuUtilInfo_v1 dcgmDeviceVgpuUtilInfo_t; - -/** - * Version 1 for \ref dcgmDeviceVgpuUtilInfo_v1 - */ -#define dcgmDeviceVgpuUtilInfo_version1 MAKE_DCGM_VERSION(dcgmDeviceVgpuUtilInfo_v1, 1) - -/** - * Latest version for \ref dcgmDeviceVgpuUtilInfo_t - */ -#define dcgmDeviceVgpuUtilInfo_version dcgmDeviceVgpuUtilInfo_version1 - -/** - * Represents current encoder statistics for the given device/vGPU instance - */ -typedef struct -{ - unsigned int version; //!< Version Number (dcgmDeviceEncStats_version) - unsigned int sessionCount; //!< Count of active encoder sessions - unsigned int averageFps; //!< Trailing average FPS of all active sessions - unsigned int averageLatency; //!< Encode latency in milliseconds -} dcgmDeviceEncStats_v1; - -/** - * Typedef for \ref dcgmDeviceEncStats_v1 - */ -typedef dcgmDeviceEncStats_v1 dcgmDeviceEncStats_t; - -/** - * Version 1 for \ref dcgmDeviceEncStats_v1 - */ -#define dcgmDeviceEncStats_version1 MAKE_DCGM_VERSION(dcgmDeviceEncStats_v1, 1) - -/** - * Latest version for \ref dcgmDeviceEncStats_t - */ -#define dcgmDeviceEncStats_version 
dcgmDeviceEncStats_version1 - -/** - * Represents current frame buffer capture sessions statistics for the given device/vGPU instance - */ -typedef struct -{ - unsigned int version; //!< Version Number (dcgmDeviceFbcStats_version) - unsigned int sessionCount; //!< Count of active FBC sessions - unsigned int averageFps; //!< Moving average new frames captured per second - unsigned int averageLatency; //!< Moving average new frame capture latency in microseconds -} dcgmDeviceFbcStats_v1; - -/** - * Typedef for \ref dcgmDeviceFbcStats_v1 - */ -typedef dcgmDeviceFbcStats_v1 dcgmDeviceFbcStats_t; - -/** - * Version 1 for \ref dcgmDeviceFbcStats_v1 - */ -#define dcgmDeviceFbcStats_version1 MAKE_DCGM_VERSION(dcgmDeviceFbcStats_v1, 1) - -/** - * Latest version for \ref dcgmDeviceEncStats_t - */ -#define dcgmDeviceFbcStats_version dcgmDeviceFbcStats_version1 - -/* - * Represents frame buffer capture session type - */ -typedef enum dcgmFBCSessionType_enum -{ - DCGM_FBC_SESSION_TYPE_UNKNOWN = 0, //!< Unknown - DCGM_FBC_SESSION_TYPE_TOSYS, //!< FB capture for a system buffer - DCGM_FBC_SESSION_TYPE_CUDA, //!< FB capture for a cuda buffer - DCGM_FBC_SESSION_TYPE_VID, //!< FB capture for a Vid buffer - DCGM_FBC_SESSION_TYPE_HWENC, //!< FB capture for a NVENC HW buffer -} dcgmFBCSessionType_t; - -/** - * Represents information about active FBC session on the given device/vGPU instance - */ -typedef struct -{ - unsigned int version; //!< Version Number (dcgmDeviceFbcSessionInfo_version) - unsigned int sessionId; //!< Unique session ID - unsigned int pid; //!< Owning process ID - unsigned int vgpuId; //!< vGPU instance ID (only valid on vGPU hosts, otherwise zero) - unsigned int displayOrdinal; //!< Display identifier - dcgmFBCSessionType_t sessionType; //!< Type of frame buffer capture session - unsigned int sessionFlags; //!< Session flags - unsigned int hMaxResolution; //!< Max horizontal resolution supported by the capture session - unsigned int vMaxResolution; //!< Max 
vertical resolution supported by the capture session - unsigned int hResolution; //!< Horizontal resolution requested by caller in capture call - unsigned int vResolution; //!< Vertical resolution requested by caller in capture call - unsigned int averageFps; //!< Moving average new frames captured per second - unsigned int averageLatency; //!< Moving average new frame capture latency in microseconds -} dcgmDeviceFbcSessionInfo_v1; - -/** - * Typedef for \ref dcgmDeviceFbcSessionInfo_v1 - */ -typedef dcgmDeviceFbcSessionInfo_v1 dcgmDeviceFbcSessionInfo_t; - -/** - * Version 1 for \ref dcgmDeviceFbcSessionInfo_v1 - */ -#define dcgmDeviceFbcSessionInfo_version1 MAKE_DCGM_VERSION(dcgmDeviceFbcSessionInfo_v1, 1) - -/** - * Latest version for \ref dcgmDeviceFbcSessionInfo_t - */ -#define dcgmDeviceFbcSessionInfo_version dcgmDeviceFbcSessionInfo_version1 - -/** - * Represents all the active FBC sessions on the given device/vGPU instance - */ -typedef struct -{ - unsigned int version; //!< Version Number (dcgmDeviceFbcSessions_version) - unsigned int sessionCount; //!< Count of active FBC sessions - dcgmDeviceFbcSessionInfo_t sessionInfo[DCGM_MAX_FBC_SESSIONS]; //!< Info about the active FBC session -} dcgmDeviceFbcSessions_v1; - -/** - * Typedef for \ref dcgmDeviceFbcSessions_v1 - */ -typedef dcgmDeviceFbcSessions_v1 dcgmDeviceFbcSessions_t; - -/** - * Version 1 for \ref dcgmDeviceFbcSessions_v1 - */ -#define dcgmDeviceFbcSessions_version1 MAKE_DCGM_VERSION(dcgmDeviceFbcSessions_v1, 1) - -/** - * Latest version for \ref dcgmDeviceFbcSessions_t - */ -#define dcgmDeviceFbcSessions_version dcgmDeviceFbcSessions_version1 - -/* - * Represents type of encoder for capacity can be queried - */ -typedef enum dcgmEncoderQueryType_enum -{ - DCGM_ENCODER_QUERY_H264 = 0, - DCGM_ENCODER_QUERY_HEVC = 1 -} dcgmEncoderType_t; - -/** - * Represents information about active encoder sessions on the given vGPU instance - */ -typedef struct -{ - unsigned int version; //!< Version Number 
(dcgmDeviceVgpuEncSessions_version) - union - { - unsigned int vgpuId; //!< vGPU instance ID - unsigned int sessionCount; - } encoderSessionInfo; - unsigned int sessionId; //!< Unique session ID - unsigned int pid; //!< Process ID - dcgmEncoderType_t codecType; //!< Video encoder type - unsigned int hResolution; //!< Current encode horizontal resolution - unsigned int vResolution; //!< Current encode vertical resolution - unsigned int averageFps; //!< Moving average encode frames per second - unsigned int averageLatency; //!< Moving average encode latency in milliseconds -} dcgmDeviceVgpuEncSessions_v1; - -/** - * Typedef for \ref dcgmDeviceVgpuEncSessions_v1 - */ -typedef dcgmDeviceVgpuEncSessions_v1 dcgmDeviceVgpuEncSessions_t; - -/** - * Version 1 for \ref dcgmDeviceVgpuEncSessions_v1 - */ -#define dcgmDeviceVgpuEncSessions_version1 MAKE_DCGM_VERSION(dcgmDeviceVgpuEncSessions_v1, 1) - -/** - * Latest version for \ref dcgmDeviceVgpuEncSessions_t - */ -#define dcgmDeviceVgpuEncSessions_version dcgmDeviceVgpuEncSessions_version1 - -/** - * Represents utilization values for processes running in vGPU VMs using the device - */ -typedef struct -{ - unsigned int version; //!< Version Number (dcgmDeviceVgpuProcessUtilInfo_version) - union - { - unsigned int vgpuId; //!< vGPU instance ID - unsigned int vgpuProcessSamplesCount; //!< Count of processes running in the vGPU VM,for which utilization - //!< rates are being reported in this cycle. - } vgpuProcessUtilInfo; - unsigned int pid; //!< Process ID of the process running in the vGPU VM. - char processName[DCGM_VGPU_NAME_BUFFER_SIZE]; //!< Process Name of process running in the vGPU VM. - unsigned int smUtil; //!< GPU utilization of process running in the vGPU VM. - unsigned int memUtil; //!< Memory utilization of process running in the vGPU VM. - unsigned int encUtil; //!< Encoder utilization of process running in the vGPU VM. - unsigned int decUtil; //!< Decoder utilization of process running in the vGPU VM. 
-} dcgmDeviceVgpuProcessUtilInfo_v1; - -/** - * Typedef for \ref dcgmDeviceVgpuProcessUtilInfo_v1 - */ -typedef dcgmDeviceVgpuProcessUtilInfo_v1 dcgmDeviceVgpuProcessUtilInfo_t; - -/** - * Version 1 for \ref dcgmDeviceVgpuProcessUtilInfo_v1 - */ -#define dcgmDeviceVgpuProcessUtilInfo_version1 MAKE_DCGM_VERSION(dcgmDeviceVgpuProcessUtilInfo_v1, 1) - -/** - * Latest version for \ref dcgmDeviceVgpuProcessUtilInfo_t - */ -#define dcgmDeviceVgpuProcessUtilInfo_version dcgmDeviceVgpuProcessUtilInfo_version1 - -/** - * Represents static info related to vGPUs supported on the device. - */ -typedef struct -{ - unsigned int version; //!< Version number (dcgmDeviceVgpuTypeIdStaticInfo_version) - union - { - unsigned int vgpuTypeId; - unsigned int supportedVgpuTypeCount; - } vgpuTypeInfo; //!< vGPU type ID and Supported vGPU type count - char vgpuTypeName[DCGM_VGPU_NAME_BUFFER_SIZE]; //!< vGPU type Name - char vgpuTypeClass[DCGM_VGPU_NAME_BUFFER_SIZE]; //!< Class of vGPU type - char vgpuTypeLicense[DCGM_GRID_LICENSE_BUFFER_SIZE]; //!< license of vGPU type - int deviceId; //!< device ID of vGPU type - int subsystemId; //!< Subsystem ID of vGPU type - int numDisplayHeads; //!< Count of vGPU's supported display heads - int maxInstances; //!< maximum number of vGPU instances creatable on a device for given vGPU type - int frameRateLimit; //!< Frame rate limit value of the vGPU type - int maxResolutionX; //!< vGPU display head's maximum supported resolution in X dimension - int maxResolutionY; //!< vGPU display head's maximum supported resolution in Y dimension - int fbTotal; //!< vGPU Total framebuffer size in megabytes -} dcgmDeviceVgpuTypeInfo_v1; - -/** - * Typedef for \ref dcgmDeviceVgpuTypeInfo_v1 - */ -typedef dcgmDeviceVgpuTypeInfo_v1 dcgmDeviceVgpuTypeInfo_t; - -/** - * Version 1 for \ref dcgmDeviceVgpuTypeInfo_v1 - */ -#define dcgmDeviceVgpuTypeInfo_version1 MAKE_DCGM_VERSION(dcgmDeviceVgpuTypeInfo_v1, 1) - -/** - * Latest version for \ref dcgmDeviceVgpuTypeInfo_t - */ 
-#define dcgmDeviceVgpuTypeInfo_version dcgmDeviceVgpuTypeInfo_version1 - -typedef struct -{ - unsigned int version; - unsigned int persistenceModeEnabled; - unsigned int migModeEnabled; -} dcgmDeviceSettings_v1; - -typedef dcgmDeviceSettings_v1 dcgmDeviceSettings_t; - -#define dcgmDevicesSettings_version1 MAKE_DCGM_VERSION(dcgmDeviceSettings_v1, 1) - -#define dcgmDeviceSettings_version dcgmDeviceSettings_version1 - -/** - * Represents attributes corresponding to a device - */ -typedef struct -{ - unsigned int version; //!< Version number (dcgmDeviceAttributes_version) - dcgmDeviceSupportedClockSets_t clockSets; //!< Supported clocks for the device - dcgmDeviceThermals_t thermalSettings; //!< Thermal settings for the device - dcgmDevicePowerLimits_t powerLimits; //!< Various power limits for the device - dcgmDeviceIdentifiers_t identifiers; //!< Identifiers for the device - dcgmDeviceMemoryUsage_t memoryUsage; //!< Memory usage info for the device - char unused[208]; //!< Unused Space. Set to 0 for now -} dcgmDeviceAttributes_v1; - -/** - * Version 1 for \ref dcgmDeviceAttributes_v1 - */ -#define dcgmDeviceAttributes_version1 MAKE_DCGM_VERSION(dcgmDeviceAttributes_v1, 1) - -typedef struct -{ - unsigned int version; //!< Version number (dcgmDeviceAttributes_version) - dcgmDeviceSupportedClockSets_t clockSets; //!< Supported clocks for the device - dcgmDeviceThermals_t thermalSettings; //!< Thermal settings for the device - dcgmDevicePowerLimits_t powerLimits; //!< Various power limits for the device - dcgmDeviceIdentifiers_t identifiers; //!< Identifiers for the device - dcgmDeviceMemoryUsage_t memoryUsage; //!< Memory usage info for the device - dcgmDeviceSettings_t settings; //!< Basic device settings -} dcgmDeviceAttributes_v2; - -/** - * Typedef for \ref dcgmDeviceAttributes_v2 - */ -typedef dcgmDeviceAttributes_v2 dcgmDeviceAttributes_t; - -/** - * Version 1 for \ref dcgmDeviceAttributes_v2 - */ -#define dcgmDeviceAttributes_version2 
MAKE_DCGM_VERSION(dcgmDeviceAttributes_v2, 2) - -/** - * Latest version for \ref dcgmDeviceAttributes_t - */ -#define dcgmDeviceAttributes_version dcgmDeviceAttributes_version2 - -/** - * Maximum number of vGPU types per physical GPU - */ -#define DCGM_MAX_VGPU_TYPES_PER_PGPU 32 - -/** - * Represents the size of a buffer that holds string related to attributes specific to vGPU instance - */ -#define DCGM_DEVICE_UUID_BUFFER_SIZE 80 - -/** - * Used to represent Performance state settings - */ -typedef struct -{ - unsigned int syncBoost; //!< Sync Boost Mode (0: Disabled, 1 : Enabled, DCGM_INT32_BLANK : Ignored). Note that - //!< using this setting may result in lower clocks than targetClocks - dcgmClockSet_t targetClocks; //!< Target clocks. Set smClock and memClock to DCGM_INT32_BLANK to ignore/use - //!< compatible values. For GPUs > Maxwell, setting this implies autoBoost=0 -} dcgmConfigPerfStateSettings_t; - -/** - * Used to represents the power capping limit for each GPU in the group or to represent the power - * budget for the entire group - */ -typedef struct -{ - dcgmConfigPowerLimitType_t type; //!< Flag to represent power cap for each GPU or power budget for the group of GPUs - unsigned int val; //!< Power Limit in Watts (Set a value OR DCGM_INT32_BLANK to Ignore) -} dcgmConfigPowerLimit_t; - -/** - * Structure to represent default and target configuration for a device - */ -typedef struct -{ - unsigned int version; //!< Version number (dcgmConfig_version) - unsigned int gpuId; //!< GPU ID - unsigned int eccMode; //!< ECC Mode (0: Disabled, 1 : Enabled, DCGM_INT32_BLANK : Ignored) - unsigned int computeMode; //!< Compute Mode (One of DCGM_CONFIG_COMPUTEMODE_? 
OR DCGM_INT32_BLANK to Ignore) - dcgmConfigPerfStateSettings_t perfState; //!< Performance State Settings (clocks / boost mode) - dcgmConfigPowerLimit_t powerLimit; //!< Power Limits -} dcgmConfig_v1; - -/** - * Typedef for \ref dcgmConfig_v1 - */ -typedef dcgmConfig_v1 dcgmConfig_t; - -/** - * Version 1 for \ref dcgmConfig_v1 - */ -#define dcgmConfig_version1 MAKE_DCGM_VERSION(dcgmConfig_v1, 1) - -/** - * Latest version for \ref dcgmConfig_t - */ -#define dcgmConfig_version dcgmConfig_version1 - -/** - * Represents a callback to receive updates from asynchronous functions. - * Currently the only implemented callback function is dcgmPolicyRegister - * and the void * data will be a pointer to dcgmPolicyCallbackResponse_t. - * Ex. - * dcgmPolicyCallbackResponse_t *callbackResponse = (dcgmPolicyCallbackResponse_t *) userData; - * - */ -typedef int (*fpRecvUpdates)(void *userData); - -/*Remove from doxygen documentation - * - * Define the structure that contains specific policy information - */ -typedef struct -{ - // version must always be first - unsigned int version; //!< Version number (dcgmPolicyViolation_version) - - unsigned int notifyOnEccDbe; //!< true/false notification on ECC Double Bit Errors - unsigned int notifyOnPciEvent; //!< true/false notification on PCI Events - unsigned int notifyOnMaxRetiredPages; //!< number of retired pages to occur before notification -} dcgmPolicyViolation_v1; - -/*Remove from doxygen documentation - * - * Represents the versioning for the dcgmPolicyViolation_v1 structure - */ - -/* - * Typedef for \ref dcgmPolicyViolation_v1 - */ -typedef dcgmPolicyViolation_v1 dcgmPolicyViolation_t; - -/* - * Version 1 for \ref dcgmPolicyViolation_v1 - */ -#define dcgmPolicyViolation_version1 MAKE_DCGM_VERSION(dcgmPolicyViolation_v1, 1) - -/* - * Latest version for \ref dcgmPolicyViolation_t - */ -#define dcgmPolicyViolation_version dcgmPolicyViolation_version1 - -/** - * Enumeration for policy conditions. 
- * When used as part of dcgmPolicy_t these have corresponding parameters to - * allow them to be switched on/off or set specific violation thresholds - */ -typedef enum dcgmPolicyCondition_enum -{ - // these are bitwise rather than sequential - DCGM_POLICY_COND_DBE = 0x1, //!< Double bit errors -- boolean in dcgmPolicyConditionParams_t - DCGM_POLICY_COND_PCI = 0x2, //!< PCI events/errors -- boolean in dcgmPolicyConditionParams_t - DCGM_POLICY_COND_MAX_PAGES_RETIRED = 0x4, //!< Maximum number of retired pages -- number - //!< required in dcgmPolicyConditionParams_t - DCGM_POLICY_COND_THERMAL = 0x8, //!< Thermal violation -- number required in dcgmPolicyConditionParams_t - DCGM_POLICY_COND_POWER = 0x10, //!< Power violation -- number required in dcgmPolicyConditionParams_t - DCGM_POLICY_COND_NVLINK = 0x20, //!< NVLINK errors -- boolean in dcgmPolicyConditionParams_t - DCGM_POLICY_COND_XID = 0x40, //!< XID errors -- number required in dcgmPolicyConditionParams_t -} dcgmPolicyCondition_t; - -#define DCGM_POLICY_COND_MAX 7 - -/** - * Structure for policy condition parameters. - * This structure contains a tag that represents the type of the value being passed - * as well as a "val" which is a union of the possible value types. For example, - * to pass a true boolean: tag = BOOL, val.boolean = 1. 
- */ -typedef struct dcgmPolicyConditionParams_st -{ - enum - { - BOOL, - LLONG - } tag; - union - { - unsigned int boolean; - unsigned long long llval; - } val; -} dcgmPolicyConditionParams_t; - -/** - * Enumeration for policy modes - */ -typedef enum dcgmPolicyMode_enum -{ - DCGM_POLICY_MODE_AUTOMATED = 0, //!< automatic mode - DCGM_POLICY_MODE_MANUAL = 1, //!< manual mode -} dcgmPolicyMode_t; - -/** - * Enumeration for policy isolation modes - */ -typedef enum dcgmPolicyIsolation_enum -{ - DCGM_POLICY_ISOLATION_NONE = 0, //!< no isolation of GPUs on error -} dcgmPolicyIsolation_t; - -/** - * Enumeration for policy actions - */ -typedef enum dcgmPolicyAction_enum -{ - DCGM_POLICY_ACTION_NONE = 0, //!< no action - DCGM_POLICY_ACTION_GPURESET = 1, //!< Deprecated - perform a GPU reset on violation -} dcgmPolicyAction_t; - -/** - * Enumeration for policy validation actions - */ -typedef enum dcgmPolicyValidation_enum -{ - DCGM_POLICY_VALID_NONE = 0, //!< no validation after an action is performed - DCGM_POLICY_VALID_SV_SHORT = 1, //!< run a short System Validation on the system after failure - DCGM_POLICY_VALID_SV_MED = 2, //!< run a medium System Validation test after failure - DCGM_POLICY_VALID_SV_LONG = 3, //!< run a extensive System Validation test after failure -} dcgmPolicyValidation_t; - -/** - * Enumeration for policy failure responses - */ -typedef enum dcgmPolicyFailureResp_enum -{ - DCGM_POLICY_FAILURE_NONE = 0, //!< on failure of validation perform no action -} dcgmPolicyFailureResp_t; - -/** - * Structure to fill when a user queries for policy violations - */ -typedef struct -{ - unsigned int gpuId; //!< gpu ID - unsigned int violationOccurred; //!< a violation based on the bit values in \ref dcgmPolicyCondition_t -} dcgmPolicyViolationNotify_t; - -/** - * Define the structure that specifies a policy to be enforced for a GPU - */ -typedef struct -{ - // version must always be first - unsigned int version; //!< version number (dcgmPolicy_version) - - 
dcgmPolicyCondition_t condition; //!< Condition(s) to access \ref dcgmPolicyCondition_t - dcgmPolicyMode_t mode; //!< Mode of operation \ref dcgmPolicyMode_t - dcgmPolicyIsolation_t isolation; //!< Isolation level after a policy violation \ref dcgmPolicyIsolation_t - dcgmPolicyAction_t action; //!< Action to perform after a policy violation \ref dcgmPolicyAction_t action - dcgmPolicyValidation_t validation; //!< Validation to perform after action is taken \ref dcgmPolicyValidation_t - dcgmPolicyFailureResp_t response; //!< Failure to validation response \ref dcgmPolicyFailureResp_t - dcgmPolicyConditionParams_t parms[DCGM_POLICY_COND_MAX]; //!< Parameters for the \a condition fields -} dcgmPolicy_v1; - -/** - * Typedef for \ref dcgmPolicy_v1 - */ -typedef dcgmPolicy_v1 dcgmPolicy_t; - -/** - * Version 1 for \ref dcgmPolicy_v1 - */ -#define dcgmPolicy_version1 MAKE_DCGM_VERSION(dcgmPolicy_v1, 1) - -/** - * Latest version for \ref dcgmPolicy_t - */ -#define dcgmPolicy_version dcgmPolicy_version1 - - -/** - * Define the ECC DBE return structure - */ -typedef struct -{ - long long timestamp; //!< timestamp of the error - enum - { - L1, - L2, - DEVICE, - REGISTER, - TEXTURE - } location; //!< location of the error - unsigned int numerrors; //!< number of errors -} dcgmPolicyConditionDbe_t; - -/** - * Define the PCI replay error return structure - */ -typedef struct -{ - long long timestamp; //!< timestamp of the error - unsigned int counter; //!< value of the PCIe replay counter -} dcgmPolicyConditionPci_t; - -/** - * Define the maximum pending retired pages limit return structure - */ -typedef struct -{ - long long timestamp; //!< timestamp of the error - unsigned int sbepages; //!< number of pending pages due to SBE - unsigned int dbepages; //!< number of pending pages due to DBE -} dcgmPolicyConditionMpr_t; - -/** - * Define the thermal policy violations return structure - */ -typedef struct -{ - long long timestamp; //!< timestamp of the error - unsigned int 
thermalViolation; //!< Temperature reached that violated policy -} dcgmPolicyConditionThermal_t; - -/** - * Define the power policy violations return structure - */ -typedef struct -{ - long long timestamp; //!< timestamp of the error - unsigned int powerViolation; //!< Power value reached that violated policy -} dcgmPolicyConditionPower_t; - -/** - * Define the nvlink policy violations return structure - */ -typedef struct -{ - long long timestamp; //!< timestamp of the error - unsigned short fieldId; //!< Nvlink counter field ID that violated policy - unsigned int counter; //!< Nvlink counter value that violated policy -} dcgmPolicyConditionNvlink_t; - -/** - * Define the xid policy violations return structure - */ -typedef struct -{ - long long timestamp; //!< Timestamp of the error - unsigned int errnum; //!< The XID error number -} dcgmPolicyConditionXID_t; - - -/** - * Define the structure that is given to the callback function - */ -typedef struct -{ - // version must always be first - unsigned int version; //!< version number (dcgmPolicyCallbackResponse_version) - - dcgmPolicyCondition_t condition; //!< Condition that was violated - union - { - dcgmPolicyConditionDbe_t dbe; //!< ECC DBE return structure - dcgmPolicyConditionPci_t pci; //!< PCI replay error return structure - dcgmPolicyConditionMpr_t mpr; //!< Max retired pages limit return structure - dcgmPolicyConditionThermal_t thermal; //!< Thermal policy violations return structure - dcgmPolicyConditionPower_t power; //!< Power policy violations return structure - dcgmPolicyConditionNvlink_t nvlink; //!< Nvlink policy violations return structure - dcgmPolicyConditionXID_t xid; //!< XID policy violations return structure - } val; -} dcgmPolicyCallbackResponse_v1; - - -/** - * Typedef for \ref dcgmPolicyCallbackResponse_v1 - */ -typedef dcgmPolicyCallbackResponse_v1 dcgmPolicyCallbackResponse_t; - -/** - * Version 1 for \ref dcgmPolicyCallbackResponse_v1 - */ -#define dcgmPolicyCallbackResponse_version1 
MAKE_DCGM_VERSION(dcgmPolicyCallbackResponse_v1, 1) - -/** - * Latest version for \ref dcgmPolicyCallbackResponse_t - */ -#define dcgmPolicyCallbackResponse_version dcgmPolicyCallbackResponse_version1 - -/** - * Set above size of largest blob entry. Currently this is dcgmDeviceVgpuTypeInfo_v1 - */ -#define DCGM_MAX_BLOB_LENGTH 4096 - -/** - * This structure is used to represent value for the field to be queried. - */ -typedef struct -{ - // version must always be first - unsigned int version; //!< version number (dcgmFieldValue_version1) - - unsigned short fieldId; //!< One of DCGM_FI_? - unsigned short fieldType; //!< One of DCGM_FT_? - int status; //!< Status for the querying the field. DCGM_ST_OK or one of DCGM_ST_? - int64_t ts; //!< Timestamp in usec since 1970 - union - { - int64_t i64; //!< Int64 value - double dbl; //!< Double value - char str[DCGM_MAX_STR_LENGTH]; //!< NULL terminated string - char blob[DCGM_MAX_BLOB_LENGTH]; //!< Binary blob - } value; //!< Value -} dcgmFieldValue_v1; - -/** - * Version 1 for \ref dcgmFieldValue_v1 - */ -#define dcgmFieldValue_version1 MAKE_DCGM_VERSION(dcgmFieldValue_v1, 1) - -/** - * This structure is used to represent value for the field to be queried. - */ -typedef struct -{ - // version must always be first - unsigned int version; //!< version number (dcgmFieldValue_version2) - dcgm_field_entity_group_t entityGroupId; //!< Entity group this field value's entity belongs to - dcgm_field_eid_t entityId; //!< Entity this field value belongs to - unsigned short fieldId; //!< One of DCGM_FI_? - unsigned short fieldType; //!< One of DCGM_FT_? - int status; //!< Status for the querying the field. DCGM_ST_OK or one of DCGM_ST_? - unsigned int unused; //!< Unused for now to align ts to an 8-byte boundary. 
- int64_t ts; //!< Timestamp in usec since 1970 - union - { - int64_t i64; //!< Int64 value - double dbl; //!< Double value - char str[DCGM_MAX_STR_LENGTH]; //!< NULL terminated string - char blob[DCGM_MAX_BLOB_LENGTH]; //!< Binary blob - } value; //!< Value -} dcgmFieldValue_v2; - -/** - * Version 2 for \ref dcgmFieldValue_v2 - */ -#define dcgmFieldValue_version2 MAKE_DCGM_VERSION(dcgmFieldValue_v2, 2) - -/** - * Field value flags used by \ref dcgmEntitiesGetLatestValues - * - * Retrieve live data from the driver rather than cached data. - * Warning: Setting this flag will result in multiple calls to the NVIDIA driver that will be much slower than - * retrieving a cached value. - */ -#define DCGM_FV_FLAG_LIVE_DATA 0x00000001 - -/** - * User callback function for processing one or more field updates. This callback will - * be invoked one or more times per field until all of the expected field values have been - * enumerated. It is up to the callee to detect when the field id changes - * - * @param gpuId IN: GPU ID of the GPU this field value set belongs to - * @param values IN: Field values. These values must be copied as they will be destroyed as soon as this - * call returns. - * @param numValues IN: Number of entries that are valid in values[] - * @param userData IN: User data pointer passed to the update function that generated this callback - * - * @returns - * 0 if OK - * <0 if enumeration should stop. This allows to callee to abort field value enumeration. - * - */ -typedef int (*dcgmFieldValueEnumeration_f)(unsigned int gpuId, - dcgmFieldValue_v1 *values, - int numValues, - void *userData); - -/** - * User callback function for processing one or more field updates. This callback will - * be invoked one or more times per field until all of the expected field values have been - * enumerated. 
It is up to the callee to detect when the field id changes - * - * @param entityGroupId IN: entityGroup of the entity this field value set belongs to - * @param entityId IN: Entity this field value set belongs to - * @param values IN: Field values. These values must be copied as they will be destroyed as soon as this - * call returns. - * @param numValues IN: Number of entries that are valid in values[] - * @param userData IN: User data pointer passed to the update function that generated this callback - * - * @returns - * 0 if OK - * <0 if enumeration should stop. This allows to callee to abort field value enumeration. - * - */ -typedef int (*dcgmFieldValueEntityEnumeration_f)(dcgm_field_entity_group_t entityGroupId, - dcgm_field_eid_t entityId, - dcgmFieldValue_v1 *values, - int numValues, - void *userData); - - -/** - * Summary of time series data in int64 format. - * - * Each value will either be set or be a BLANK value. - * Check for blank with the DCGM_INT64_IS_BLANK() macro. - * \sa See dcgmvalue.h for the actual values of BLANK values - */ -typedef struct -{ - long long minValue; //!< Minimum value of the samples looked at - long long maxValue; //!< Maximum value of the samples looked at - long long average; //!< Simple average of the samples looked at. Blank values are ignored for this calculation -} dcgmStatSummaryInt64_t; - -/** - * Same as dcgmStatSummaryInt64_t, but with 32-bit integer values - */ -typedef struct -{ - int minValue; //!< Minimum value of the samples looked at - int maxValue; //!< Maximum value of the samples looked at - int average; //!< Simple average of the samples looked at. Blank values are ignored for this calculation -} dcgmStatSummaryInt32_t; - -/** - * Summary of time series data in double-precision format. - * Each value will either be set or be a BLANK value. - * Check for blank with the DCGM_FP64_IS_BLANK() macro. 
- * \sa See dcgmvalue.h for the actual values of BLANK values - */ -typedef struct -{ - double minValue; //!< Minimum value of the samples looked at - double maxValue; //!< Maximum value of the samples looked at - double average; //!< Simple average of the samples looked at. Blank values are ignored for this calculation -} dcgmStatSummaryFp64_t; - -/** - * Systems structure used to enable or disable health watch systems - */ -typedef enum dcgmHealthSystems_enum -{ - DCGM_HEALTH_WATCH_PCIE = 0x1, //!< PCIe system watches (must have 1m of data before query) - DCGM_HEALTH_WATCH_NVLINK = 0x2, //!< NVLINK system watches - DCGM_HEALTH_WATCH_PMU = 0x4, //!< Power management unit watches - DCGM_HEALTH_WATCH_MCU = 0x8, //!< Micro-controller unit watches - DCGM_HEALTH_WATCH_MEM = 0x10, //!< Memory watches - DCGM_HEALTH_WATCH_SM = 0x20, //!< Streaming multiprocessor watches - DCGM_HEALTH_WATCH_INFOROM = 0x40, //!< Inforom watches - DCGM_HEALTH_WATCH_THERMAL = 0x80, //!< Temperature watches (must have 1m of data before query) - DCGM_HEALTH_WATCH_POWER = 0x100, //!< Power watches (must have 1m of data before query) - DCGM_HEALTH_WATCH_DRIVER = 0x200, //!< Driver-related watches - DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL = 0x400, //!< Non-fatal errors in NvSwitch - DCGM_HEALTH_WATCH_NVSWITCH_FATAL = 0x800, //!< Fatal errors in NvSwitch - - // ... 
- DCGM_HEALTH_WATCH_ALL = 0xFFFFFFFF //!< All watches enabled -} dcgmHealthSystems_t; - -#define DCGM_HEALTH_WATCH_COUNT_V1 10 /*!< For iterating through the dcgmHealthSystems_v1 enum */ -#define DCGM_HEALTH_WATCH_COUNT_V2 12 /*!< For iterating through the dcgmHealthSystems_v2 enum */ - -/** - * Health Watch test results - */ -typedef enum dcgmHealthWatchResult_enum -{ - DCGM_HEALTH_RESULT_PASS = 0, //!< All results within this system are reporting normal - DCGM_HEALTH_RESULT_WARN = 10, //!< A warning has been issued, refer to the response for more information - DCGM_HEALTH_RESULT_FAIL = 20, //!< A failure has been issued, refer to the response for more information -} dcgmHealthWatchResults_t; - -typedef struct -{ - char msg[1024]; - unsigned int code; -} dcgmDiagErrorDetail_t; - -#define DCGM_HEALTH_WATCH_MAX_INCIDENTS DCGM_GROUP_MAX_ENTITIES - -typedef struct -{ - dcgmHealthSystems_t system; //!< system to which this information belongs - dcgmHealthWatchResults_t health; //!< health diagnosis of this incident - dcgmDiagErrorDetail_t error; //!< Information about the error(s) and their error codes - dcgmGroupEntityPair_t entityInfo; //!< identify which entity has this error -} dcgmIncidentInfo_t; - -/** - * Health response structure version 4 - Simply list the incidents instead of reporting by entity - * - * Since DCGM 2.0 - */ -typedef struct -{ - unsigned int version; //!< The version number of this struct - dcgmHealthWatchResults_t overallHealth; //!< The overall health of this entire host - unsigned int incidentCount; //!< The number of health incidents reported in this struct - dcgmIncidentInfo_t incidents[DCGM_HEALTH_WATCH_MAX_INCIDENTS]; //!< Report of the errors detected -} dcgmHealthResponse_v4; - -/** - * Version 4 for \ref dcgmHealthResponse_v4 - */ -#define dcgmHealthResponse_version4 MAKE_DCGM_VERSION(dcgmHealthResponse_v4, 4) - -/** - * Latest version for \ref dcgmHealthResponse_t - */ -#define dcgmHealthResponse_version dcgmHealthResponse_version4 - 
-/** - * Typedef for \ref dcgmHealthResponse_v4 - */ -typedef dcgmHealthResponse_v4 dcgmHealthResponse_t; - -/** - * Structure used to set health watches via the dcgmHealthSet_v2 API - */ -typedef struct -{ - unsigned int version; /*!< Version of this struct. Should be dcgmHealthSet_version2 */ - dcgmGpuGrp_t groupId; /*!< Group ID representing collection of one or more entities. Look - at \ref dcgmGroupCreate for details on creating the group. - Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - to perform operation on all the GPUs or \a DCGM_GROUP_ALL_NVSWITCHES - to perform operation on all the NvSwitches. */ - dcgmHealthSystems_t systems; /*!< An enum representing systems that should be enabled for health - checks logically OR'd together. Refer to \ref dcgmHealthSystems_t - for details. */ - long long updateInterval; /*!< How often to query the underlying health information from the - NVIDIA driver in usec. This should be the same as how often you call - dcgmHealthCheck */ - double maxKeepAge; /*!< How long to keep data cached for this field in seconds. This should - be at least your maximum time between calling dcgmHealthCheck */ -} dcgmHealthSetParams_v2; - -/** - * Version 2 for \ref dcgmHealthSet_v2 - */ -#define dcgmHealthSetParams_version2 MAKE_DCGM_VERSION(dcgmHealthSetParams_v2, 2) - - -#define DCGM_MAX_PID_INFO_NUM 16 -/** - * per process utilization rates - */ -typedef struct -{ - unsigned int pid; - double smUtil; - double memUtil; -} dcgmProcessUtilInfo_t; - -/** - *Internal structure used to get the PID and the corresponding utilization rate - */ -typedef struct -{ - double util; - unsigned int pid; -} dcgmProcessUtilSample_t; - -/** - * Info corresponding to single PID - */ -typedef struct -{ - unsigned int gpuId; //!< ID of the GPU this pertains to. 
GPU_ID_INVALID = summary information for multiple GPUs - - /* All of the following are during the process's lifetime */ - - long long energyConsumed; //!< Energy consumed by the gpu in milli-watt/seconds - dcgmStatSummaryInt64_t pcieRxBandwidth; //!< PCI-E bytes read from the GPU - dcgmStatSummaryInt64_t pcieTxBandwidth; //!< PCI-E bytes written to the GPU - long long pcieReplays; //!< Count of PCI-E replays that occurred - long long startTime; //!< Process start time in microseconds since 1970 - long long endTime; //!< Process end time in microseconds since 1970 or reported as 0 if the process is not completed - dcgmProcessUtilInfo_t processUtilization; //!< Process SM and Memory Utilization (in percent) - dcgmStatSummaryInt32_t smUtilization; //!< GPU SM Utilization in percent - dcgmStatSummaryInt32_t memoryUtilization; //!< GPU Memory Utilization in percent - unsigned int eccSingleBit; //!< Deprecated - Count of ECC single bit errors that occurred - unsigned int eccDoubleBit; //!< Count of ECC double bit errors that occurred - dcgmStatSummaryInt32_t memoryClock; //!< Memory clock in MHz - dcgmStatSummaryInt32_t smClock; //!< SM clock in MHz - - int numXidCriticalErrors; //!< Number of valid entries in xidCriticalErrorsTs - long long xidCriticalErrorsTs[10]; //!< Timestamps of the critical XID errors that occurred - - int numOtherComputePids; //!< Count of otherComputePids entries that are valid - unsigned int otherComputePids[DCGM_MAX_PID_INFO_NUM]; //!< Other compute processes that ran. 0=no process - - int numOtherGraphicsPids; //!< Count of otherGraphicsPids entries that are valid - unsigned int otherGraphicsPids[DCGM_MAX_PID_INFO_NUM]; //!< Other graphics processes that ran. 
0=no process - - long long maxGpuMemoryUsed; //!< Maximum amount of GPU memory that was used in bytes - - long long powerViolationTime; //!< Number of microseconds we were at reduced clocks due to power violation - long long thermalViolationTime; //!< Number of microseconds we were at reduced clocks due to thermal violation - long long reliabilityViolationTime; //!< Amount of microseconds we were at reduced clocks - //!< due to the reliability limit - long long boardLimitViolationTime; //!< Amount of microseconds we were at reduced clocks due to being at the - //!< board's max voltage - long long lowUtilizationTime; //!< Amount of microseconds we were at reduced clocks due to low utilization - long long syncBoostTime; //!< Amount of microseconds we were at reduced clocks due to sync boost - dcgmHealthWatchResults_t overallHealth; //!< The overall health of the system. \ref dcgmHealthWatchResults_t - unsigned int incidentCount; - struct - { - dcgmHealthSystems_t system; //!< system to which this information belongs - dcgmHealthWatchResults_t health; //!< health of the specified system on this GPU - } systems[DCGM_HEALTH_WATCH_COUNT_V1]; -} dcgmPidSingleInfo_t; - -/** - * To store process statistics - */ -typedef struct -{ - unsigned int version; //!< Version of this message (dcgmPidInfo_version) - unsigned int pid; //!< PID of the process - unsigned int unused; - int numGpus; //!< Number of GPUs that are valid in GPUs - dcgmPidSingleInfo_t summary; //!< Summary information for all GPUs listed in gpus[] - dcgmPidSingleInfo_t gpus[DCGM_MAX_NUM_DEVICES]; //!< Per-GPU information for this PID -} dcgmPidInfo_v2; - -/** - * Typedef for \ref dcgmPidInfo_v2 - */ -typedef dcgmPidInfo_v2 dcgmPidInfo_t; - -/** - * Version 2 for \ref dcgmPidInfo_v2 - */ -#define dcgmPidInfo_version2 MAKE_DCGM_VERSION(dcgmPidInfo_v2, 2) - -/** - * Latest version for \ref dcgmPidInfo_t - */ -#define dcgmPidInfo_version dcgmPidInfo_version2 - -/** - * Info corresponding to the job on a GPU - */ 
-typedef struct -{ - unsigned int gpuId; //!< ID of the GPU this pertains to. GPU_ID_INVALID = summary information for multiple GPUs - - /* All of the following are during the job's lifetime */ - - long long energyConsumed; //!< Energy consumed in milli-watt/seconds - dcgmStatSummaryFp64_t powerUsage; //!< Power usage Min/Max/Avg in watts - dcgmStatSummaryInt64_t pcieRxBandwidth; //!< PCI-E bytes read from the GPU - dcgmStatSummaryInt64_t pcieTxBandwidth; //!< PCI-E bytes written to the GPU - long long pcieReplays; //!< Count of PCI-E replays that occurred - long long startTime; //!< User provided job start time in microseconds since 1970 - long long endTime; //!< User provided job end time in microseconds since 1970 - dcgmStatSummaryInt32_t smUtilization; //!< GPU SM Utilization in percent - dcgmStatSummaryInt32_t memoryUtilization; //!< GPU Memory Utilization in percent - unsigned int eccSingleBit; //!< Deprecated - Count of ECC single bit errors that occurred - unsigned int eccDoubleBit; //!< Count of ECC double bit errors that occurred - dcgmStatSummaryInt32_t memoryClock; //!< Memory clock in MHz - dcgmStatSummaryInt32_t smClock; //!< SM clock in MHz - - int numXidCriticalErrors; //!< Number of valid entries in xidCriticalErrorsTs - long long xidCriticalErrorsTs[10]; //!< Timestamps of the critical XID errors that occurred - - int numComputePids; //!< Count of computePids entries that are valid - dcgmProcessUtilInfo_t computePidInfo[DCGM_MAX_PID_INFO_NUM]; //!< List of compute processes that ran during the job - //!< 0=no process - - int numGraphicsPids; //!< Count of graphicsPids entries that are valid - dcgmProcessUtilInfo_t graphicsPidInfo[DCGM_MAX_PID_INFO_NUM]; //!< List of compute processes that ran during the job - //!< 0=no process - - long long maxGpuMemoryUsed; //!< Maximum amount of GPU memory that was used in bytes - - long long powerViolationTime; //!< Number of microseconds we were at reduced clocks due to power violation - long long 
thermalViolationTime; //!< Number of microseconds we were at reduced clocks due to thermal violation - long long reliabilityViolationTime; //!< Amount of microseconds we were at reduced clocks - //!< due to the reliability limit - long long boardLimitViolationTime; //!< Amount of microseconds we were at reduced clocks - //!< due to being at the board's max voltage - long long lowUtilizationTime; //!< Amount of microseconds we were at reduced clocks due to low utilization - long long syncBoostTime; //!< Amount of microseconds we were at reduced clocks due to sync boost - dcgmHealthWatchResults_t overallHealth; //!< The overall health of the system. \ref dcgmHealthWatchResults_t - unsigned int incidentCount; - struct - { - dcgmHealthSystems_t system; //!< system to which this information belongs - dcgmHealthWatchResults_t health; //!< health of the specified system on this GPU - } systems[DCGM_HEALTH_WATCH_COUNT_V1]; -} dcgmGpuUsageInfo_t; - - -/** - * To store job statistics - * The following fields are not applicable in the summary info: - * - pcieRxBandwidth (Min/Max) - * - pcieTxBandwidth (Min/Max) - * - smUtilization (Min/Max) - * - memoryUtilization (Min/Max) - * - memoryClock (Min/Max) - * - smClock (Min/Max) - * - processSamples - * - * The average value in the above fields (in the summary) is the - * average of the averages of respective fields from all GPUs - */ -typedef struct -{ - unsigned int version; //!< Version of this message (dcgmPidInfo_version) - int numGpus; //!< Number of GPUs that are valid in gpus[] - dcgmGpuUsageInfo_t summary; //!< Summary information for all GPUs listed in gpus[] - dcgmGpuUsageInfo_t gpus[DCGM_MAX_NUM_DEVICES]; //!< Per-GPU information for this PID -} dcgmJobInfo_v3; - -/** - * Typedef for \ref dcgmJobInfo_v3 - */ -typedef dcgmJobInfo_v3 dcgmJobInfo_t; - -/** - * Version 3 for \ref dcgmJobInfo_v3 - */ -#define dcgmJobInfo_version3 MAKE_DCGM_VERSION(dcgmJobInfo_v3, 3) - -/** - * Latest version for \ref dcgmJobInfo_t - */ 
-#define dcgmJobInfo_version dcgmJobInfo_version3 - - -/** - * Running process information for a compute or graphics process - */ -typedef struct -{ - unsigned int version; //!< Version of this message (dcgmRunningProcess_version) - unsigned int pid; //!< PID of the process - unsigned long long memoryUsed; //!< GPU memory used by this process in bytes. -} dcgmRunningProcess_v1; - -/** - * Typedef for \ref dcgmRunningProcess_v1 - */ -typedef dcgmRunningProcess_v1 dcgmRunningProcess_t; - -/** - * Version 1 for \ref dcgmRunningProcess_v1 - */ -#define dcgmRunningProcess_version1 MAKE_DCGM_VERSION(dcgmRunningProcess_v1, 1) - -/** - * Latest version for \ref dcgmRunningProcess_t - */ -#define dcgmRunningProcess_version dcgmRunningProcess_version1 - -/** - * Enumeration for diagnostic levels - */ -typedef enum -{ - DCGM_DIAG_LVL_INVALID = 0, //!< Uninitialized - DCGM_DIAG_LVL_SHORT = 10, //!< run a very basic health check on the system - DCGM_DIAG_LVL_MED = 20, //!< run a medium-length diagnostic (a few minutes) - DCGM_DIAG_LVL_LONG = 30, //!< run a extensive diagnostic (several minutes) -} dcgmDiagnosticLevel_t; - -/** - * Diagnostic test results - */ -typedef enum dcgmDiagResult_enum -{ - DCGM_DIAG_RESULT_PASS = 0, //!< This test passed as diagnostics - DCGM_DIAG_RESULT_SKIP = 1, //!< This test was skipped - DCGM_DIAG_RESULT_WARN = 2, //!< This test passed with warnings - DCGM_DIAG_RESULT_FAIL = 3, //!< This test failed the diagnostics - DCGM_DIAG_RESULT_NOT_RUN = 4, //!< This test wasn't executed -} dcgmDiagResult_t; - -typedef struct -{ - dcgmDiagResult_t status; //!< The result of the test - char warning[1024]; //!< Warning returned from the test, if any - char info[1024]; //!< Information details returned from the test, if any -} dcgmDiagTestResult_v1; - -typedef struct -{ - dcgmDiagResult_t status; //!< The result of the test - dcgmDiagErrorDetail_t error; //!< The error message and error code, if any - char info[1024]; //!< Information details returned from the 
test, if any -} dcgmDiagTestResult_v2; - - -/** - * Diagnostic per gpu tests - fixed indices for dcgmDiagResponsePerGpu_t.results[] - */ -typedef enum dcgmPerGpuTestIndices_enum -{ - DCGM_MEMORY_INDEX = 0, //!< Memory test index - DCGM_DIAGNOSTIC_INDEX = 1, //!< Diagnostic test index - DCGM_PCI_INDEX = 2, //!< PCIe test index - DCGM_SM_STRESS_INDEX = 3, //!< SM Stress test index - DCGM_TARGETED_STRESS_INDEX = 4, //!< Targeted Stress test index - DCGM_TARGETED_POWER_INDEX = 5, //!< Targeted Power test index - DCGM_MEMORY_BANDWIDTH_INDEX = 6, //!< Memory bandwidth test index - // Remaining tests are included for convenience but have different execution rules - // See DCGM_PER_GPU_TEST_COUNT - DCGM_SOFTWARE_INDEX = 7, //!< Software test index - DCGM_CONTEXT_CREATE_INDEX = 8, //!< Context create test index - DCGM_UNKNOWN_INDEX = 9 //!< Unknown test -} dcgmPerGpuTestIndices_t; - -// TODO: transition these to dcgm_deprecated.h -#define DCGM_SM_PERF_INDEX DCGM_SM_STRESS_INDEX -#define DCGM_TARGETED_PERF_INDEX DCGM_TARGETED_PERF_INDEX - -// Number of diag tests -// NOTE: does not include software and context_create which have different execution rules -#define DCGM_PER_GPU_TEST_COUNT 7 - -/** - * Per GPU diagnostics result structure - */ -typedef struct -{ - unsigned int gpuId; //!< ID for the GPU this information pertains - unsigned int hwDiagnosticReturn; //!< Per GPU hardware diagnostic test return code - dcgmDiagTestResult_v2 results[DCGM_PER_GPU_TEST_COUNT]; //!< Array with a result for each per-gpu test -} dcgmDiagResponsePerGpu_v2; - -#define DCGM_SWTEST_COUNT 10 -#define LEVEL_ONE_MAX_RESULTS 16 - -typedef enum dcgmSoftwareTest_enum -{ - DCGM_SWTEST_BLACKLIST = 0, //!< test for presence of blacklisted drivers (e.g. 
nouveau) - DCGM_SWTEST_NVML_LIBRARY = 1, //!< test for presence (and version) of NVML lib - DCGM_SWTEST_CUDA_MAIN_LIBRARY = 2, //!< test for presence (and version) of CUDA lib - DCGM_SWTEST_CUDA_RUNTIME_LIBRARY = 3, //!< test for presence (and version) of CUDA RT lib - DCGM_SWTEST_PERMISSIONS = 4, //!< test for character device permissions - DCGM_SWTEST_PERSISTENCE_MODE = 5, //!< test for persistence mode enabled - DCGM_SWTEST_ENVIRONMENT = 6, //!< test for CUDA environment vars that may slow tests - DCGM_SWTEST_PAGE_RETIREMENT = 7, //!< test for pending frame buffer page retirement - DCGM_SWTEST_GRAPHICS_PROCESSES = 8, //!< test for graphics processes running - DCGM_SWTEST_INFOROM = 9, //!< test for inforom corruption -} dcgmSoftwareTest_t; - -/** - * Global diagnostics result structure v6 - * - * Since DCGM 2.0 - */ -typedef struct -{ - unsigned int version; //!< version number (dcgmDiagResult_version) - unsigned int gpuCount; //!< number of valid per GPU results - unsigned int levelOneTestCount; //!< number of valid levelOne results - - dcgmDiagTestResult_v2 levelOneResults[LEVEL_ONE_MAX_RESULTS]; //!< Basic, system-wide test results. - dcgmDiagResponsePerGpu_v2 perGpuResponses[DCGM_MAX_NUM_DEVICES]; //!< per GPU test results - dcgmDiagErrorDetail_t systemError; //!< System-wide error reported from NVVS - char trainingMsg[1024]; //!< Training Message -} dcgmDiagResponse_v6; - -/** - * Typedef for \ref dcgmDiagResponse_v6 - */ -typedef dcgmDiagResponse_v6 dcgmDiagResponse_t; - -/** - * Version 6 for \ref dcgmDiagResponse_v6 - */ -#define dcgmDiagResponse_version6 MAKE_DCGM_VERSION(dcgmDiagResponse_v6, 6) - -/** - * Latest version for \ref dcgmDiagResponse_t - */ -#define dcgmDiagResponse_version dcgmDiagResponse_version6 - -/** - * Represents level relationships within a system between two GPUs - * The enums are spaced to allow for future relationships. 
- * These match the definitions in nvml.h - */ -typedef enum dcgmGpuLevel_enum -{ - DCGM_TOPOLOGY_UNINITIALIZED = 0x0, - - /** \name PCI connectivity states */ - /**@{*/ - DCGM_TOPOLOGY_BOARD = 0x1, //!< multi-GPU board - DCGM_TOPOLOGY_SINGLE = 0x2, //!< all devices that only need traverse a single PCIe switch - DCGM_TOPOLOGY_MULTIPLE = 0x4, //!< all devices that need not traverse a host bridge - DCGM_TOPOLOGY_HOSTBRIDGE = 0x8, //!< all devices that are connected to the same host bridge - DCGM_TOPOLOGY_CPU = 0x10, //!< all devices that are connected to the same CPU but possibly multiple host bridges - DCGM_TOPOLOGY_SYSTEM = 0x20, //!< all devices in the system - /**@}*/ - - /** \name NVLINK connectivity states */ - /**@{*/ - DCGM_TOPOLOGY_NVLINK1 = 0x0100, //!< GPUs connected via a single NVLINK link - DCGM_TOPOLOGY_NVLINK2 = 0x0200, //!< GPUs connected via two NVLINK links - DCGM_TOPOLOGY_NVLINK3 = 0x0400, //!< GPUs connected via three NVLINK links - DCGM_TOPOLOGY_NVLINK4 = 0x0800, //!< GPUs connected via four NVLINK links - DCGM_TOPOLOGY_NVLINK5 = 0x1000, //!< GPUs connected via five NVLINK links - DCGM_TOPOLOGY_NVLINK6 = 0x2000, //!< GPUs connected via six NVLINK links - DCGM_TOPOLOGY_NVLINK7 = 0x4000, //!< GPUs connected via seven NVLINK links - DCGM_TOPOLOGY_NVLINK8 = 0x8000, //!< GPUs connected via eight NVLINK links - DCGM_TOPOLOGY_NVLINK9 = 0x10000, //!< GPUs connected via nine NVLINK links - DCGM_TOPOLOGY_NVLINK10 = 0x20000, //!< GPUs connected via ten NVLINK links - DCGM_TOPOLOGY_NVLINK11 = 0x40000, //!< GPUs connected via eleven NVLINK links - DCGM_TOPOLOGY_NVLINK12 = 0x80000, //!< GPUs connected via twelve NVLINK links - /**@}*/ -} dcgmGpuTopologyLevel_t; - -// the PCI paths are the lower 8 bits of the path information -#define DCGM_TOPOLOGY_PATH_PCI(x) (dcgmGpuTopologyLevel_t)((unsigned int)(x)&0xFF) - -// the NVLINK paths are the upper 24 bits of the path information -#define DCGM_TOPOLOGY_PATH_NVLINK(x) (dcgmGpuTopologyLevel_t)((unsigned 
int)(x)&0xFFFFFF00) - -#define DCGM_AFFINITY_BITMASK_ARRAY_SIZE 8 - -/** - * Device topology information - */ -typedef struct -{ - unsigned int version; //!< version number (dcgmDeviceTopology_version) - - unsigned long cpuAffinityMask[DCGM_AFFINITY_BITMASK_ARRAY_SIZE]; //!< affinity mask for the specified GPU - //!< a 1 represents affinity to the CPU in that - //!< bit position supports up to 256 cores - unsigned int numGpus; //!< number of valid entries in gpuPaths - - struct - { - unsigned int gpuId; //!< gpuId to which the path represents - dcgmGpuTopologyLevel_t path; //!< path to the gpuId from this GPU. Note that this is a bit-mask - //!< of DCGM_TOPOLOGY_* values and can contain both PCIe topology - //!< and NvLink topology where applicable. For instance: - //!< 0x210 = DCGM_TOPOLOGY_CPU | DCGM_TOPOLOGY_NVLINK2 - //!< Use the macros DCGM_TOPOLOGY_PATH_NVLINK and - //!< DCGM_TOPOLOGY_PATH_PCI to mask the NvLink and PCI paths, respectively. - unsigned int localNvLinkIds; //!< bits representing the local links connected to gpuId - //!< e.g. 
if this field == 3, links 0 and 1 are connected, - //!< field is only valid if NVLINKS actually exist between GPUs - } gpuPaths[DCGM_MAX_NUM_DEVICES - 1]; -} dcgmDeviceTopology_v1; - -/** - * Typedef for \ref dcgmDeviceTopology_v1 - */ -typedef dcgmDeviceTopology_v1 dcgmDeviceTopology_t; - -/** - * Version 1 for \ref dcgmDeviceTopology_v1 - */ -#define dcgmDeviceTopology_version1 MAKE_DCGM_VERSION(dcgmDeviceTopology_v1, 1) - -/** - * Latest version for \ref dcgmDeviceTopology_t - */ -#define dcgmDeviceTopology_version dcgmDeviceTopology_version1 - -/** - * Group topology information - */ -typedef struct -{ - unsigned int version; //!< version number (dcgmGroupTopology_version) - - unsigned long - groupCpuAffinityMask[DCGM_AFFINITY_BITMASK_ARRAY_SIZE]; //!< the CPU affinity mask for all GPUs in the group - //!< a 1 represents affinity to the CPU in that bit - //!< position supports up to 256 cores - unsigned int numaOptimalFlag; //!< a zero value indicates that 1 or more GPUs - //!< in the group have a different CPU affinity and thus - //!< may not be optimal for certain algorithms - dcgmGpuTopologyLevel_t slowestPath; //!< the slowest path amongst GPUs in the group -} dcgmGroupTopology_v1; - -/** - * Typedef for \ref dcgmGroupTopology_v1 - */ -typedef dcgmGroupTopology_v1 dcgmGroupTopology_t; - -/** - * Version 1 for \ref dcgmGroupTopology_v1 - */ -#define dcgmGroupTopology_version1 MAKE_DCGM_VERSION(dcgmGroupTopology_v1, 1) - -/** - * Latest version for \ref dcgmGroupTopology_t - */ -#define dcgmGroupTopology_version dcgmGroupTopology_version1 - -/** - * Identifies a level to retrieve field introspection info for - */ -typedef enum dcgmIntrospectLevel_enum -{ - DCGM_INTROSPECT_LVL_INVALID = 0, //!< Invalid value - DCGM_INTROSPECT_LVL_FIELD = 1, //!< Introspection data is grouped by field ID - DCGM_INTROSPECT_LVL_FIELD_GROUP = 2, //!< Introspection data is grouped by field group - DCGM_INTROSPECT_LVL_ALL_FIELDS, //!< Introspection data is aggregated for all fields 
-} dcgmIntrospectLevel_t; - -/** - * Identifies the retrieval context for introspection API calls. - */ -typedef struct -{ - unsigned int version; //!< version number (dcgmIntrospectContext_version) - dcgmIntrospectLevel_t introspectLvl; //!< Introspect Level \ref dcgmIntrospectLevel_t - union - { - dcgmGpuGrp_t fieldGroupId; //!< Only needed if \ref introspectLvl is DCGM_INTROSPECT_LVL_FIELD_GROUP - unsigned short fieldId; //!< Only needed if \ref introspectLvl is DCGM_INTROSPECT_LVL_FIELD - unsigned long long contextId; //!< Overloaded way to access both fieldGroupId and fieldId - }; -} dcgmIntrospectContext_v1; - -/** - * Typedef for \ref dcgmIntrospectContext_v1 - */ -typedef dcgmIntrospectContext_v1 dcgmIntrospectContext_t; - -/** - * Version 1 for \ref dcgmIntrospectContext_t - */ -#define dcgmIntrospectContext_version1 MAKE_DCGM_VERSION(dcgmIntrospectContext_v1, 1) - -/** - * Latest version for \ref dcgmIntrospectContext_t - */ -#define dcgmIntrospectContext_version dcgmIntrospectContext_version1 - -/** - * DCGM Execution time info for a set of fields - */ -typedef struct -{ - unsigned int version; //!< version number (dcgmIntrospectFieldsExecTime_version) - - long long meanUpdateFreqUsec; //!< the mean update frequency of all fields - - double recentUpdateUsec; //!< the sum of every field's most recent execution time after they - //!< have been normalized to \ref meanUpdateFreqUsec". 
- //!< This is roughly how long it takes to update fields every \ref meanUpdateFreqUsec - - long long totalEverUpdateUsec; //!< The total amount of time, ever, that has been spent updating all the fields -} dcgmIntrospectFieldsExecTime_v1; - -/** - * Typedef for \ref dcgmIntrospectFieldsExecTime_t - */ -typedef dcgmIntrospectFieldsExecTime_v1 dcgmIntrospectFieldsExecTime_t; - -/** - * Version 1 for \ref dcgmIntrospectFieldsExecTime_t - */ -#define dcgmIntrospectFieldsExecTime_version1 MAKE_DCGM_VERSION(dcgmIntrospectFieldsExecTime_v1, 1) - -/** - * Latest version for \ref dcgmIntrospectFieldsExecTime_t - */ -#define dcgmIntrospectFieldsExecTime_version dcgmIntrospectFieldsExecTime_version1 - -/** - * Full introspection info for field execution time - * - * Since DCGM 2.0 - */ -typedef struct -{ - unsigned int version; //!< version number (dcgmIntrospectFullFieldsExecTime_version) - - dcgmIntrospectFieldsExecTime_v1 aggregateInfo; //!< info that includes global and device scope - - int hasGlobalInfo; //!< 0 means \ref globalInfo is populated, !0 means it's not - dcgmIntrospectFieldsExecTime_v1 globalInfo; //!< info that only includes global field scope - - unsigned short gpuInfoCount; //!< count of how many entries in \ref gpuInfo are populated - unsigned int gpuIdsForGpuInfo[DCGM_MAX_NUM_DEVICES]; //!< the GPU ID at a given index identifies which gpu - //!< the corresponding entry in \ref gpuInfo is from - - dcgmIntrospectFieldsExecTime_v1 gpuInfo[DCGM_MAX_NUM_DEVICES]; //!< info that is separated by the - //!< GPU ID that the watches were for -} dcgmIntrospectFullFieldsExecTime_v2; - -/** - * typedef for \ref dcgmIntrospectFullFieldsExecTime_v1 - */ -typedef dcgmIntrospectFullFieldsExecTime_v2 dcgmIntrospectFullFieldsExecTime_t; - -/** - * Version 1 for \ref dcgmIntrospectFullFieldsExecTime_t - */ -#define dcgmIntrospectFullFieldsExecTime_version2 MAKE_DCGM_VERSION(dcgmIntrospectFullFieldsExecTime_v2, 2) - -/** - * Latest version for \ref 
dcgmIntrospectFullFieldsExecTime_t - */ -#define dcgmIntrospectFullFieldsExecTime_version dcgmIntrospectFullFieldsExecTime_version2 - -/** - * State of DCGM metadata gathering. If it is set to DISABLED then "Metadata" API - * calls to DCGM are not supported. - */ -typedef enum dcgmIntrospectState_enum -{ - DCGM_INTROSPECT_STATE_DISABLED = 0, - DCGM_INTROSPECT_STATE_ENABLED = 1 -} dcgmIntrospectState_t; - -/** - * DCGM Memory usage information - */ -typedef struct -{ - unsigned int version; //!< version number (dcgmIntrospectMemory_version) - long long bytesUsed; //!< number of bytes -} dcgmIntrospectMemory_v1; - -/** - * Typedef for \ref dcgmIntrospectMemory_t - */ -typedef dcgmIntrospectMemory_v1 dcgmIntrospectMemory_t; - -/** - * Version 1 for \ref dcgmIntrospectMemory_t - */ -#define dcgmIntrospectMemory_version1 MAKE_DCGM_VERSION(dcgmIntrospectMemory_v1, 1) - -/** - * Latest version for \ref dcgmIntrospectMemory_t - */ -#define dcgmIntrospectMemory_version dcgmIntrospectMemory_version1 - - -/** - * Full introspection info for field memory - */ -typedef struct -{ - unsigned int version; //!< version number (dcgmIntrospectFullMemory_version) - - dcgmIntrospectMemory_v1 aggregateInfo; //!< info that includes global and device scope - - int hasGlobalInfo; //!< 0 means \ref globalInfo is populated, !0 means it's not - dcgmIntrospectMemory_v1 globalInfo; //!< info that only includes global field scope - - unsigned short gpuInfoCount; //!< count of how many entries in \ref gpuInfo are populated - unsigned int gpuIdsForGpuInfo[DCGM_MAX_NUM_DEVICES]; //!< the GPU ID at a given index identifies which gpu - //!< the corresponding entry in \ref gpuInfo is from - - dcgmIntrospectMemory_v1 gpuInfo[DCGM_MAX_NUM_DEVICES]; //!< info that is divided by the - //!< GPU ID that the watches were for -} dcgmIntrospectFullMemory_v1; - -/** - * typedef for \ref dcgmIntrospectFullMemory_v1 - */ -typedef dcgmIntrospectFullMemory_v1 dcgmIntrospectFullMemory_t; - -/** - * Version 1 for 
\ref dcgmIntrospectFullMemory_t - */ -#define dcgmIntrospectFullMemory_version1 MAKE_DCGM_VERSION(dcgmIntrospectFullMemory_v1, 1) - -/** - * Latest version for \ref dcgmIntrospectFullMemory_t - */ -#define dcgmIntrospectFullMemory_version dcgmIntrospectFullMemory_version1 - -/** - * DCGM CPU Utilization information. Multiply values by 100 to get them in %. - */ -typedef struct -{ - unsigned int version; //!< version number (dcgmMetadataCpuUtil_version) - double total; //!< fraction of device's CPU resources that were used - double kernel; //!< fraction of device's CPU resources that were used in kernel mode - double user; //!< fraction of device's CPU resources that were used in user mode -} dcgmIntrospectCpuUtil_v1; - -/** - * Typedef for \ref dcgmIntrospectCpuUtil_t - */ -typedef dcgmIntrospectCpuUtil_v1 dcgmIntrospectCpuUtil_t; - -/** - * Version 1 for \ref dcgmIntrospectCpuUtil_t - */ -#define dcgmIntrospectCpuUtil_version1 MAKE_DCGM_VERSION(dcgmIntrospectCpuUtil_v1, 1) - -/** - * Latest version for \ref dcgmIntrospectCpuUtil_t - */ -#define dcgmIntrospectCpuUtil_version dcgmIntrospectCpuUtil_version1 - -#define DCGM_MAX_CONFIG_FILE_LEN 10000 -#define DCGM_MAX_TEST_NAMES 20 -#define DCGM_MAX_TEST_NAMES_LEN 50 -#define DCGM_MAX_TEST_PARMS 100 -#define DCGM_MAX_TEST_PARMS_LEN 100 -#define DCGM_GPU_LIST_LEN 50 -#define DCGM_FILE_LEN 30 -#define DCGM_PATH_LEN 128 -#define DCGM_THROTTLE_MASK_LEN 50 - -/** - * Flags options for running the GPU diagnostic - * @{ - * - */ - -/** - * Output in verbose mode; include information as well as warnings - */ -#define DCGM_RUN_FLAGS_VERBOSE 0x0001 - -/** - * Output stats only on failure - */ -#define DCGM_RUN_FLAGS_STATSONFAIL 0x0002 - -/** - * Train DCGM diagnostic and output a configuration file with golden values - */ -#define DCGM_RUN_FLAGS_TRAIN 0x0004 - -/** - * Ignore warnings against training the diagnostic and train anyway - */ -#define DCGM_RUN_FLAGS_FORCE_TRAIN 0x0008 - -/** - * Enable fail early checks for the 
Targeted Stress, Targeted Power, SM Stress, and Diagnostic tests - */ -#define DCGM_RUN_FLAGS_FAIL_EARLY 0x0010 - -/** - * @} - */ - -/* - * Run diagnostic structure v7 - */ -typedef struct -{ - unsigned int version; //!< version of this message - unsigned int flags; //!< flags specifying binary options for running it. See DCGM_RUN_FLAGS_* - unsigned int debugLevel; //!< 0-5 for the debug level the GPU diagnostic will use for logging. - dcgmGpuGrp_t groupId; //!< group of GPUs to verify. Cannot be specified together with gpuList. - dcgmPolicyValidation_t validate; //!< 0-3 for which tests to run. Optional. - char testNames[DCGM_MAX_TEST_NAMES][DCGM_MAX_TEST_NAMES_LEN]; //!< Specified list of test names. Optional. - char testParms[DCGM_MAX_TEST_PARMS][DCGM_MAX_TEST_PARMS_LEN]; //!< Parameters to set for specified tests - //!< in the format: - //!< testName.parameterName=parameterValue. Optional. - char fakeGpuList[DCGM_GPU_LIST_LEN]; //!< Comma-separated list of GPUs. Cannot be specified with the groupId. - char gpuList[DCGM_GPU_LIST_LEN]; //!< Comma-separated list of GPUs. Cannot be specified with the groupId. - char debugLogFile[DCGM_PATH_LEN]; //!< Alternate name for the debug log file that should be used - char statsPath[DCGM_PATH_LEN]; //!< Path that the plugin's statistics files should be written to - char configFileContents[DCGM_MAX_CONFIG_FILE_LEN]; //!< Contents of nvvs config file (likely yaml) - char throttleMask[DCGM_THROTTLE_MASK_LEN]; //!< Throttle reasons to ignore as either integer mask or csv list of - //!< reasons - char pluginPath[DCGM_PATH_LEN]; //!< Custom path to the diagnostic plugins - unsigned int trainingIterations; //!< Number of iterations for training - unsigned int trainingVariance; //!< Acceptable training variance as a percentage of the value. (0-100) - unsigned int trainingTolerance; //!< Acceptable training tolerance as a percentage of the value. 
(0-100) - char goldenValuesFile[DCGM_PATH_LEN]; //!< The path where the golden values should be recorded - unsigned int failCheckInterval; //!< How often the fail early checks should occur when enabled. -} dcgmRunDiag_v7; - -/** - * Version 7 for \ref dcgmRunDiag_t - */ -#define dcgmRunDiag_version7 MAKE_DCGM_VERSION(dcgmRunDiag_v7, 7) - -/** - * Flags for dcgmGetEntityGroupEntities's flags parameter - * - * Only return entities that are supported by DCGM. - * This mimics the behavior of dcgmGetAllSupportedDevices(). - */ -#define DCGM_GEGE_FLAG_ONLY_SUPPORTED 0x00000001 - -/** - * Identifies a GPU NVLink error type returned by DCGM_FI_DEV_GPU_NVLINK_ERRORS - */ -typedef enum dcgmGpuNVLinkErrorType_enum -{ - DCGM_GPU_NVLINK_ERROR_RECOVERY_REQUIRED = 1, //!< NVLink link recovery error occurred - DCGM_GPU_NVLINK_ERROR_FATAL, //!< NVLink link fatal error occurred -} dcgmGpuNVLinkErrorType_t; - -/** Topology hints for dcgmSelectGpusByTopology() - * @{ - */ - -/** No hints specified */ -#define DCGM_TOPO_HINT_F_NONE 0x00000000 - -/** Ignore the health of the GPUs when picking GPUs for job - * execution. By default, only healthy GPUs are considered. 
- */ -#define DCGM_TOPO_HINT_F_IGNOREHEALTH 0x00000001 - -/** - * @} - */ - - -typedef struct -{ - unsigned int version; //!< version of this message - uint64_t inputGpuIds; //!< bit-mask of the GPU ids to choose from - uint32_t numGpus; //!< the number of GPUs that DCGM should choose - uint64_t hintFlags; //!< Hints to ignore certain factors for the scheduling hint -} dcgmTopoSchedHint_v1; - -typedef dcgmTopoSchedHint_v1 dcgmTopoSchedHint_t; - -#define dcgmTopoSchedHint_version1 MAKE_DCGM_VERSION(dcgmTopoSchedHint_v1, 1) - -/** - * NvLink link states - */ -typedef enum dcgmNvLinkLinkState_enum -{ - DcgmNvLinkLinkStateNotSupported = 0, //!< NvLink is unsupported by this GPU (Default for GPUs) - DcgmNvLinkLinkStateDisabled = 1, //!< NvLink is supported for this link but this link is disabled - //!< (Default for NvSwitches) - DcgmNvLinkLinkStateDown = 2, //!< This NvLink link is down (inactive) - DcgmNvLinkLinkStateUp = 3 //!< This NvLink link is up (active) -} dcgmNvLinkLinkState_t; - -/** - * State of NvLink links for a GPU - */ -typedef struct -{ - dcgm_field_eid_t entityId; //!< Entity ID of the GPU (gpuId) - dcgmNvLinkLinkState_t linkState[DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1]; //!< Per-GPU link states -} dcgmNvLinkGpuLinkStatus_v1; - -typedef struct -{ - dcgm_field_eid_t entityId; //!< Entity ID of the GPU (gpuId) - dcgmNvLinkLinkState_t linkState[DCGM_NVLINK_MAX_LINKS_PER_GPU]; //!< Per-GPU link states -} dcgmNvLinkGpuLinkStatus_v2; - -/** - * State of NvLink links for a NvSwitch - */ -typedef struct -{ - dcgm_field_eid_t entityId; //!< Entity ID of the NvSwitch (physicalId) - dcgmNvLinkLinkState_t linkState[DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH]; //!< Per-NvSwitch link states -} dcgmNvLinkNvSwitchLinkStatus_t; - -/** - * Status of all of the NvLinks in a given system - */ -typedef struct -{ - unsigned int version; //!< Version of this request. 
Should be dcgmNvLinkStatus_version1 - unsigned int numGpus; //!< Number of entries in gpus[] that are populated - dcgmNvLinkGpuLinkStatus_v1 gpus[DCGM_MAX_NUM_DEVICES]; //!< Per-GPU NvLink link statuses - unsigned int numNvSwitches; //!< Number of entries in nvSwitches[] that are populated - dcgmNvLinkNvSwitchLinkStatus_t nvSwitches[DCGM_MAX_NUM_SWITCHES]; //!< Per-NvSwitch link statuses -} dcgmNvLinkStatus_v1; - -/** - * Version 1 of dcgmNvLinkStatus - */ -#define dcgmNvLinkStatus_version1 MAKE_DCGM_VERSION(dcgmNvLinkStatus_v1, 1) - -typedef struct -{ - unsigned int version; //!< Version of this request. Should be dcgmNvLinkStatus_version1 - unsigned int numGpus; //!< Number of entries in gpus[] that are populated - dcgmNvLinkGpuLinkStatus_v2 gpus[DCGM_MAX_NUM_DEVICES]; //!< Per-GPU NvLink link statuses - unsigned int numNvSwitches; //!< Number of entries in nvSwitches[] that are populated - dcgmNvLinkNvSwitchLinkStatus_t nvSwitches[DCGM_MAX_NUM_SWITCHES]; //!< Per-NvSwitch link statuses -} dcgmNvLinkStatus_v2; - -typedef dcgmNvLinkStatus_v2 dcgmNvLinkStatus_t; - -/** - * Version 2 of dcgmNvLinkStatus - */ -#define dcgmNvLinkStatus_version2 MAKE_DCGM_VERSION(dcgmNvLinkStatus_v2, 2) - -/* Bitmask values for dcgmGetFieldIdSummary - Sync with DcgmcmSummaryType_t */ -#define DCGM_SUMMARY_MIN 0x00000001 -#define DCGM_SUMMARY_MAX 0x00000002 -#define DCGM_SUMMARY_AVG 0x00000004 -#define DCGM_SUMMARY_SUM 0x00000008 -#define DCGM_SUMMARY_COUNT 0x00000010 -#define DCGM_SUMMARY_INTEGRAL 0x00000020 -#define DCGM_SUMMARY_DIFF 0x00000040 -#define DCGM_SUMMARY_SIZE 7 - -/* dcgmSummaryResponse_t is part of dcgmFieldSummaryRequest, so it uses dcgmFieldSummaryRequest's version. 
*/ - -typedef struct -{ - unsigned int fieldType; //!< type of field that is summarized (int64 or fp64) - unsigned int summaryCount; //!< the number of populated summaries in \ref values - union - { - int64_t i64; - double fp64; - } values[DCGM_SUMMARY_SIZE]; //!< array for storing the values of each summary. The summaries are stored - //!< in order. For example, if MIN AND MAX are requested, then 0 will be MIN - //!< and 1 will be MAX. If AVG and DIFF were requested, then AVG would be 0 - //!< and 1 would be DIFF -} dcgmSummaryResponse_t; - -typedef struct -{ - unsigned int version; //!< version of this message - dcgmFieldSummaryRequest_v1 - unsigned short fieldId; //!< field id to be summarized - dcgm_field_entity_group_t entityGroupId; //!< the type of entity whose field we're getting - dcgm_field_eid_t entityId; //!< ordinal id for this entity - uint32_t summaryTypeMask; //!< bit-mask of DCGM_SUMMARY_*, the requested summaries - uint64_t startTime; //!< start time for the interval being summarized. 0 means to use - //!< any data before. - uint64_t endTime; //!< end time for the interval being summarized. 0 means to use - //!< any data after. - dcgmSummaryResponse_t response; //!< response data for this request -} dcgmFieldSummaryRequest_v1; - -typedef dcgmFieldSummaryRequest_v1 dcgmFieldSummaryRequest_t; - -#define dcgmFieldSummaryRequest_version1 MAKE_DCGM_VERSION(dcgmFieldSummaryRequest_v1, 1) - -/** - * Module IDs - */ -typedef enum -{ - DcgmModuleIdCore = 0, //!< Core DCGM - always loaded - DcgmModuleIdNvSwitch = 1, //!< NvSwitch Module - DcgmModuleIdVGPU = 2, //!< VGPU Module - DcgmModuleIdIntrospect = 3, //!< Introspection Module - DcgmModuleIdHealth = 4, //!< Health Module - DcgmModuleIdPolicy = 5, //!< Policy Module - DcgmModuleIdConfig = 6, //!< Config Module - DcgmModuleIdDiag = 7, //!< GPU Diagnostic Module - DcgmModuleIdProfiling = 8, //!< Profiling Module - - DcgmModuleIdCount //!< Always last. 
1 greater than largest value above -} dcgmModuleId_t; - -/** - * Module Status. Modules are lazy loaded, so they will be in status DcgmModuleStatusNotLoaded - * until they are used. One modules are used, they will move to another status. - */ -typedef enum -{ - DcgmModuleStatusNotLoaded = 0, //!< Module has not been loaded yet - DcgmModuleStatusBlacklisted = 1, //!< Module has been blacklisted from being loaded - DcgmModuleStatusFailed = 2, //!< Loading the module failed - DcgmModuleStatusLoaded = 3, //!< Module has been loaded - DcgmModuleStatusUnloaded = 4, //!< Module has been unloaded, happens during shutdown -} dcgmModuleStatus_t; - -/** - * Status of all of the modules of the host engine - */ -typedef struct -{ - dcgmModuleId_t id; //!< ID of this module - dcgmModuleStatus_t status; //!< Status of this module -} dcgmModuleGetStatusesModule_t; - -/* This is larger than DcgmModuleIdCount so we can add modules without versioning this request */ -#define DCGM_MODULE_STATUSES_CAPACITY 16 - -typedef struct -{ - unsigned int version; //!< Version of this request. Should be dcgmModuleGetStatuses_version1 - unsigned int numStatuses; //!< Number of entries in statuses[] that are populated - dcgmModuleGetStatusesModule_t statuses[DCGM_MODULE_STATUSES_CAPACITY]; //!< Per-module status information -} dcgmModuleGetStatuses_v1; - -/** - * Version 1 of dcgmModuleGetStatuses - */ -#define dcgmModuleGetStatuses_version1 MAKE_DCGM_VERSION(dcgmModuleGetStatuses_v1, 1) -#define dcgmModuleGetStatuses_version dcgmModuleGetStatuses_version1 -typedef dcgmModuleGetStatuses_v1 dcgmModuleGetStatuses_t; - -/** - * Options for dcgmStartEmbedded_v2 - * - * Added in DCGM 2.0.0 - */ -typedef struct -{ - unsigned int version; /*!< Version number. Use dcgmStartEmbeddedV2Params_version1 */ - dcgmOperationMode_t opMode; /*!< IN: Collect data automatically or manually when asked by the user. 
*/ - dcgmHandle_t dcgmHandle; /*!< OUT: DCGM Handle to use for API calls */ - const char *logFile; /*!< IN: File that DCGM should log to. NULL = do not log. '-' = stdout */ - DcgmLoggingSeverity_t severity; /*!< IN: Severity at which DCGM should log to logFile */ - unsigned int blackListCount; /*!< IN: Number of modules that to be blacklisted in blackList[] */ - dcgmModuleId_t blackList[DcgmModuleIdCount]; /* IN: IDs of modules to blacklist */ - unsigned int unused; /*!< IN: Unused. Set to 0. Aligns structure to 8-bytes */ -} dcgmStartEmbeddedV2Params_v1; - -/** - * Version 1 for \ref dcgmStartEmbeddedV2Params_v1 - */ -#define dcgmStartEmbeddedV2Params_version1 MAKE_DCGM_VERSION(dcgmStartEmbeddedV2Params_v1, 1) - -/** - * Maximum number of metric ID groups that can exist in DCGM - */ -#define DCGM_PROF_MAX_NUM_GROUPS 10 - -/** - * Maximum number of field IDs that can be in a single DCGM profiling metric group - */ -#define DCGM_PROF_MAX_FIELD_IDS_PER_GROUP 8 - -/** - * Structure to return all of the profiling metric groups that are available for the given groupId. - */ -typedef struct -{ - unsigned short majorId; //!< Major ID of this metric group. Metric groups with the same majorId cannot be - //!< watched concurrently with other metric groups with the same majorId - unsigned short minorId; //!< Minor ID of this metric group. This distinguishes metric groups within the same - //!< major metric group from each other - unsigned int numFieldIds; //!< Number of field IDs that are populated in fieldIds[] - unsigned short fieldIds[DCGM_PROF_MAX_FIELD_IDS_PER_GROUP]; //!< DCGM Field IDs that are part of this profiling - //!< group. See DCGM_FI_PROF_* definitions in - //!< dcgm_fields.h for details. -} dcgmProfMetricGroupInfo_t; - -typedef struct -{ - /** \name Input parameters - * @{ - */ - unsigned int version; //!< Version of this request. Should be dcgmProfGetMetricGroups_version - unsigned int unused; //!< Not used for now. 
Set to 0 - dcgmGpuGrp_t groupId; //!< Group of GPUs we should get the metric groups for. These must all be the - //!< exact same GPU or DCGM_ST_GROUP_INCOMPATIBLE will be returned - /** - * @} - */ - - /** \name Output - * @{ - */ - unsigned int numMetricGroups; //!< Number of entries in metricGroups[] that are populated - unsigned int unused1; //!< Not used for now. Set to 0 - dcgmProfMetricGroupInfo_t metricGroups[DCGM_PROF_MAX_NUM_GROUPS]; //!< Info for each metric group - /** - * @} - */ -} dcgmProfGetMetricGroups_v2; - -/** - * Version 1 of dcgmProfGetMetricGroups_t - */ -#define dcgmProfGetMetricGroups_version2 MAKE_DCGM_VERSION(dcgmProfGetMetricGroups_v2, 2) -#define dcgmProfGetMetricGroups_version dcgmProfGetMetricGroups_version2 -typedef dcgmProfGetMetricGroups_v2 dcgmProfGetMetricGroups_t; - -/** - * Structure to pass to dcgmProfWatchFields() when watching profiling metrics - */ -typedef struct -{ - unsigned int version; //!< Version of this request. Should be dcgmProfWatchFields_version - dcgmGpuGrp_t groupId; //!< Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate - //!< for details on creating the group. Alternatively, pass in the group id as \a - //!< DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. The GPUs of the group - //!< must all be identical or DCGM_ST_GROUP_INCOMPATIBLE will be returned by this API. - unsigned int numFieldIds; //!< Number of field IDs that are being passed in fieldIds[] - unsigned short fieldIds[16]; //!< DCGM_FI_PROF_? field IDs to watch - long long updateFreq; //!< How often to update this field in usec. Note that profiling metrics may need to be - //!< sampled more frequently than this value. See - //!< dcgmProfMetricGroupInfo_t.minUpdateFreqUsec of the metric group matching - //!< metricGroupTag to see what this minimum is. If minUpdateFreqUsec < updateFreq - //!< then samples will be aggregated to updateFreq intervals in DCGM's internal cache. 
- double maxKeepAge; //!< How long to keep data for every fieldId in seconds - int maxKeepSamples; //!< Maximum number of samples to keep for each fieldId. 0=no limit - unsigned int flags; //!< For future use. Set to 0 for now. -} dcgmProfWatchFields_v1; - -/** - * Version 1 of dcgmProfWatchFields_v1 - */ -#define dcgmProfWatchFields_version1 MAKE_DCGM_VERSION(dcgmProfWatchFields_v1, 1) -#define dcgmProfWatchFields_version dcgmProfWatchFields_version1 -typedef dcgmProfWatchFields_v1 dcgmProfWatchFields_t; - -/** - * Structure to pass to dcgmProfUnwatchFields when unwatching profiling metrics - */ -typedef struct -{ - unsigned int version; //!< Version of this request. Should be dcgmProfUnwatchFields_version - dcgmGpuGrp_t groupId; //!< Group ID representing collection of one or more GPUs. Look at - //!< \ref dcgmGroupCreate for details on creating the group. - //!< Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS - //!< to perform operation on all the GPUs. The GPUs of the group must all be - //!< identical or DCGM_ST_GROUP_INCOMPATIBLE will be returned by this API. - unsigned int flags; //!< For future use. Set to 0 for now. 
-} dcgmProfUnwatchFields_v1; - -/** - * Version 1 of dcgmProfUnwatchFields_v1 - */ -#define dcgmProfUnwatchFields_version1 MAKE_DCGM_VERSION(dcgmProfUnwatchFields_v1, 1) -#define dcgmProfUnwatchFields_version dcgmProfUnwatchFields_version1 -typedef dcgmProfUnwatchFields_v1 dcgmProfUnwatchFields_t; - -/** - * Version 1 of dcgmSettingsSetLoggingSeverity_t - */ -typedef struct -{ - int targetLogger; - DcgmLoggingSeverity_t targetSeverity; -} dcgmSettingsSetLoggingSeverity_v1; - - -#define dcgmSettingsSetLoggingSeverity_version1 MAKE_DCGM_VERSION(dcgmSettingsSetLoggingSeverity_v1, 1) -#define dcgmSettingsSetLoggingSeverity_version dcgmSettingsSetLoggingSeverity_version1 -typedef dcgmSettingsSetLoggingSeverity_v1 dcgmSettingsSetLoggingSeverity_t; - -/** - * Structure to describe the DCGM build environment ver 2.0 - */ -typedef struct -{ - unsigned int version; // - * Every pair is separated by a colon char (:). Only the very first colon is considered as a separation.
- * Values can contain colon chars. Values and Keys cannot contain semicolon chars.
- * Usually defined keys are: - *

- * version : DCGM Version.
- * arch : Target DCGM Architecture.
- * buildid : Build ID. Usually a sequential number.
- * commit : Commit ID (Usually a git commit hash).
- * author : Author of the commit above.
- * branch : Branch (Usually a git branch that was used for the build).
- * buildtype : Build Type.
- * builddate : Date of the build.
- * buildplatform : Platform where the build was made.
- *

- * Any or all keys may be absent.
- * This values are for reference only are not supposed to participate in some complicated logic.
- */ - char rawBuildInfoString[DCGM_MAX_STR_LENGTH * 2]; -} dcgmVersionInfo_v2; - -/** - * Version 2 of the dcgmVersionInfo_v2 - */ -#define dcgmVersionInfo_version2 MAKE_DCGM_VERSION(dcgmVersionInfo_v2, 2) - -#define dcgmVersionInfo_version dcgmVersionInfo_version2 -typedef dcgmVersionInfo_v2 dcgmVersionInfo_t; - -/** @} */ - -#ifdef __cplusplus -} -#endif - -#endif /* DCGM_STRUCTS_H */ diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/device_info.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/device_info.go deleted file mode 100644 index bda27a11..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/device_info.go +++ /dev/null @@ -1,196 +0,0 @@ -package dcgm - -/* -#include "dcgm_agent.h" -#include "dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "math/rand" - "unsafe" -) - -type PCIInfo struct { - BusID string - BAR1 uint // MB - FBTotal uint // MB - Bandwidth int64 // MB/s -} - -type DeviceIdentifiers struct { - Brand string - Model string - Serial string - Vbios string - InforomImageVersion string - DriverVersion string -} - -type Device struct { - GPU uint - DCGMSupported string - UUID string - Power uint // W - PCI PCIInfo - Identifiers DeviceIdentifiers - Topology []P2PLink - CPUAffinity string -} - -// getAllDeviceCount counts all GPUs on the system -func getAllDeviceCount() (gpuCount uint, err error) { - var gpuIdList [C.DCGM_MAX_NUM_DEVICES]C.uint - var count C.int - - result := C.dcgmGetAllDevices(handle.handle, &gpuIdList[0], &count) - if err = errorString(result); err != nil { - return gpuCount, fmt.Errorf("Error getting devices count: %s", err) - } - gpuCount = uint(count) - return -} - -// getSupportedDevices returns DCGM supported GPUs -func getSupportedDevices() (gpus []uint, err error) { - var gpuIdList [C.DCGM_MAX_NUM_DEVICES]C.uint - var count C.int - - result := C.dcgmGetAllSupportedDevices(handle.handle, &gpuIdList[0], &count) - if err = errorString(result); err != 
nil { - return gpus, fmt.Errorf("Error getting DCGM supported devices: %s", err) - } - - numGpus := int(count) - gpus = make([]uint, numGpus) - for i := 0; i < numGpus; i++ { - gpus[i] = uint(gpuIdList[i]) - } - return -} - -func getPciBandwidth(gpuId uint) (int64, error) { - const ( - maxLinkGen int = iota - maxLinkWidth - fieldsCount - ) - - pciFields := make([]Short, fieldsCount) - pciFields[maxLinkGen] = C.DCGM_FI_DEV_PCIE_MAX_LINK_GEN - pciFields[maxLinkWidth] = C.DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH - - fieldsName := fmt.Sprintf("pciBandwidthFields%d", rand.Uint64()) - - fieldsId, err := FieldGroupCreate(fieldsName, pciFields) - if err != nil { - return 0, err - } - - groupName := fmt.Sprintf("pciBandwidth%d", rand.Uint64()) - groupId, err := WatchFields(gpuId, fieldsId, groupName) - if err != nil { - _ = FieldGroupDestroy(fieldsId) - return 0, err - } - - values, err := GetLatestValuesForFields(gpuId, pciFields) - if err != nil { - _ = FieldGroupDestroy(fieldsId) - _ = DestroyGroup(groupId) - return 0, fmt.Errorf("Error getting Pcie bandwidth: %s", err) - } - - gen := values[maxLinkGen].Int64() - width := values[maxLinkWidth].Int64() - - _ = FieldGroupDestroy(fieldsId) - _ = DestroyGroup(groupId) - - genMap := map[int64]int64{ - 1: 250, // MB/s - 2: 500, - 3: 985, - 4: 1969, - } - - bandwidth := genMap[gen] * width - return bandwidth, nil -} - -func getDeviceInfo(gpuid uint) (deviceInfo Device, err error) { - var device C.dcgmDeviceAttributes_t - device.version = makeVersion2(unsafe.Sizeof(device)) - - result := C.dcgmGetDeviceAttributes(handle.handle, C.uint(gpuid), &device) - if err = errorString(result); err != nil { - return deviceInfo, fmt.Errorf("Error getting device information: %s", err) - } - - // check if the given GPU is DCGM supported - gpus, err := getSupportedDevices() - if err != nil { - return - } - - supported := "No" - - for _, gpu := range gpus { - if gpuid == gpu { - supported = "Yes" - break - } - } - - busid := 
*stringPtr(&device.identifiers.pciBusId[0]) - - cpuAffinity, err := getCPUAffinity(busid) - if err != nil { - return - } - - var topology []P2PLink - var bandwidth int64 - // get device topology and bandwidth only if its a DCGM supported device - if supported == "Yes" { - topology, err = getDeviceTopology(gpuid) - if err != nil { - return - } - bandwidth, err = getPciBandwidth(gpuid) - if err != nil { - return - } - } - - uuid := *stringPtr(&device.identifiers.uuid[0]) - power := *uintPtr(device.powerLimits.defaultPowerLimit) - - pci := PCIInfo{ - BusID: busid, - BAR1: *uintPtr(device.memoryUsage.bar1Total), - FBTotal: *uintPtr(device.memoryUsage.fbTotal), - Bandwidth: bandwidth, - } - - identifiers := DeviceIdentifiers{ - Brand: *stringPtr(&device.identifiers.brandName[0]), - Model: *stringPtr(&device.identifiers.deviceName[0]), - Serial: *stringPtr(&device.identifiers.serial[0]), - Vbios: *stringPtr(&device.identifiers.vbios[0]), - InforomImageVersion: *stringPtr(&device.identifiers.inforomImageVersion[0]), - DriverVersion: *stringPtr(&device.identifiers.driverVersion[0]), - } - - deviceInfo = Device{ - GPU: gpuid, - DCGMSupported: supported, - UUID: uuid, - Power: power, - PCI: pci, - Identifiers: identifiers, - Topology: topology, - CPUAffinity: cpuAffinity, - } - return -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/device_status.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/device_status.go deleted file mode 100644 index 4d37de12..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/device_status.go +++ /dev/null @@ -1,179 +0,0 @@ -package dcgm - -/* -#include "./dcgm_agent.h" -#include "./dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "math/rand" -) - -type PerfState uint - -const ( - PerfStateMax = 0 - PerfStateMin = 15 - PerfStateUnknown = 32 -) - -func (p PerfState) String() string { - if p >= PerfStateMax && p <= PerfStateMin { - return fmt.Sprintf("P%d", p) - } - return 
"Unknown" -} - -type UtilizationInfo struct { - GPU int64 // % - Memory int64 // % - Encoder int64 // % - Decoder int64 // % -} - -type ECCErrorsInfo struct { - SingleBit int64 - DoubleBit int64 -} - -type MemoryInfo struct { - GlobalUsed int64 - ECCErrors ECCErrorsInfo -} - -type ClockInfo struct { - Cores int64 // MHz - Memory int64 // MHz -} - -type PCIThroughputInfo struct { - Rx int64 // MB - Tx int64 // MB - Replays int64 -} - -type PCIStatusInfo struct { - BAR1Used int64 // MB - Throughput PCIThroughputInfo - FBUsed int64 -} - -type DeviceStatus struct { - Power float64 // W - Temperature int64 // °C - Utilization UtilizationInfo - Memory MemoryInfo - Clocks ClockInfo - PCI PCIStatusInfo - Performance PerfState - FanSpeed int64 // % -} - -func latestValuesForDevice(gpuId uint) (status DeviceStatus, err error) { - const ( - pwr int = iota - temp - sm - mem - enc - dec - smClock - memClock - bar1Used - pcieRxThroughput - pcieTxThroughput - pcieReplay - fbUsed - sbe - dbe - pstate - fanSpeed - fieldsCount - ) - - deviceFields := make([]Short, fieldsCount) - deviceFields[pwr] = C.DCGM_FI_DEV_POWER_USAGE - deviceFields[temp] = C.DCGM_FI_DEV_GPU_TEMP - deviceFields[sm] = C.DCGM_FI_DEV_GPU_UTIL - deviceFields[mem] = C.DCGM_FI_DEV_MEM_COPY_UTIL - deviceFields[enc] = C.DCGM_FI_DEV_ENC_UTIL - deviceFields[dec] = C.DCGM_FI_DEV_DEC_UTIL - deviceFields[smClock] = C.DCGM_FI_DEV_SM_CLOCK - deviceFields[memClock] = C.DCGM_FI_DEV_MEM_CLOCK - deviceFields[bar1Used] = C.DCGM_FI_DEV_BAR1_USED - deviceFields[pcieRxThroughput] = C.DCGM_FI_DEV_PCIE_RX_THROUGHPUT - deviceFields[pcieTxThroughput] = C.DCGM_FI_DEV_PCIE_TX_THROUGHPUT - deviceFields[pcieReplay] = C.DCGM_FI_DEV_PCIE_REPLAY_COUNTER - deviceFields[fbUsed] = C.DCGM_FI_DEV_FB_USED - deviceFields[sbe] = C.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL - deviceFields[dbe] = C.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL - deviceFields[pstate] = C.DCGM_FI_DEV_PSTATE - deviceFields[fanSpeed] = C.DCGM_FI_DEV_FAN_SPEED - - fieldsName := 
fmt.Sprintf("devStatusFields%d", rand.Uint64()) - fieldsId, err := FieldGroupCreate(fieldsName, deviceFields) - if err != nil { - return - } - - groupName := fmt.Sprintf("devStatus%d", rand.Uint64()) - groupId, err := WatchFields(gpuId, fieldsId, groupName) - if err != nil { - _ = FieldGroupDestroy(fieldsId) - return - } - - values, err := GetLatestValuesForFields(gpuId, deviceFields) - if err != nil { - _ = FieldGroupDestroy(fieldsId) - _ = DestroyGroup(groupId) - return status, fmt.Errorf("Error getting device status: %s", err) - } - - power := values[pwr].Float64() - - gpuUtil := UtilizationInfo{ - GPU: values[sm].Int64(), - Memory: values[mem].Int64(), - Encoder: values[enc].Int64(), - Decoder: values[dec].Int64(), - } - - memory := MemoryInfo{ - ECCErrors: ECCErrorsInfo{ - SingleBit: values[sbe].Int64(), - DoubleBit: values[dbe].Int64(), - }, - } - - clocks := ClockInfo{ - Cores: values[smClock].Int64(), - Memory: values[memClock].Int64(), - } - - pci := PCIStatusInfo{ - BAR1Used: values[bar1Used].Int64(), - Throughput: PCIThroughputInfo{ - Rx: values[pcieRxThroughput].Int64(), - Tx: values[pcieTxThroughput].Int64(), - Replays: values[pcieReplay].Int64(), - }, - FBUsed: values[fbUsed].Int64(), - } - - status = DeviceStatus{ - Power: power, - Temperature: values[temp].Int64(), - Utilization: gpuUtil, - Memory: memory, - Clocks: clocks, - PCI: pci, - Performance: PerfState(values[pstate].Int64()), - FanSpeed: values[fanSpeed].Int64(), - } - - _ = FieldGroupDestroy(fieldsId) - _ = DestroyGroup(groupId) - return -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/fields.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/fields.go deleted file mode 100644 index c5e50bf7..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/fields.go +++ /dev/null @@ -1,257 +0,0 @@ -package dcgm - -/* -#include "./dcgm_agent.h" -#include "./dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "unicode" - "unsafe" -) 
- -const ( - updateFreq = 1000000 // usec - maxKeepAge = 300 // sec - maxKeepSamples = 0 // nolimit -) - -type FieldMeta struct { - FieldId Short - FieldType byte - Size byte - Tag string - Scope int - NvmlFieldId int - EntityLevel Field_Entity_Group -} - -type FieldHandle struct{ handle C.dcgmFieldGrp_t } - -func FieldGroupCreate(fieldsGroupName string, fields []Short) (fieldsId FieldHandle, err error) { - var fieldsGroup C.dcgmFieldGrp_t - cfields := *(*[]C.ushort)(unsafe.Pointer(&fields)) - - groupName := C.CString(fieldsGroupName) - defer freeCString(groupName) - - result := C.dcgmFieldGroupCreate(handle.handle, C.int(len(fields)), &cfields[0], groupName, &fieldsGroup) - if err = errorString(result); err != nil { - return fieldsId, fmt.Errorf("Error creating DCGM fields group: %s", err) - } - - fieldsId = FieldHandle{fieldsGroup} - return -} - -func FieldGroupDestroy(fieldsGroup FieldHandle) (err error) { - result := C.dcgmFieldGroupDestroy(handle.handle, fieldsGroup.handle) - if err = errorString(result); err != nil { - fmt.Errorf("Error destroying DCGM fields group: %s", err) - } - - return -} - -func WatchFields(gpuId uint, fieldsGroup FieldHandle, groupName string) (groupId GroupHandle, err error) { - group, err := CreateGroup(groupName) - if err != nil { - return - } - - err = AddToGroup(group, gpuId) - if err != nil { - return - } - - result := C.dcgmWatchFields(handle.handle, group.handle, fieldsGroup.handle, C.longlong(updateFreq), C.double(maxKeepAge), C.int(maxKeepSamples)) - if err = errorString(result); err != nil { - return groupId, fmt.Errorf("Error watching fields: %s", err) - } - - _ = UpdateAllFields() - return group, nil -} - -func WatchFieldsWithGroup(fieldsGroup FieldHandle, group GroupHandle) error { - result := C.dcgmWatchFields(handle.handle, group.handle, fieldsGroup.handle, - C.longlong(updateFreq), C.double(maxKeepAge), C.int(maxKeepSamples)) - - if err := errorString(result); err != nil { - return fmt.Errorf("Error watching fields: 
%s", err) - } - - if err := UpdateAllFields(); err != nil { - return err - } - - return nil -} - -func GetLatestValuesForFields(gpu uint, fields []Short) ([]FieldValue_v1, error) { - values := make([]C.dcgmFieldValue_v1, len(fields)) - cfields := *(*[]C.ushort)(unsafe.Pointer(&fields)) - - result := C.dcgmGetLatestValuesForFields(handle.handle, C.int(gpu), &cfields[0], C.uint(len(fields)), &values[0]) - if err := errorString(result); err != nil { - return nil, fmt.Errorf("Error watching fields: %s", err) - } - - return toFieldValue(values), nil -} - -func EntityGetLatestValues(entityGroup Field_Entity_Group, entityId uint, fields []Short) ([]FieldValue_v1, error) { - values := make([]C.dcgmFieldValue_v1, len(fields)) - cfields := (*C.ushort)(unsafe.Pointer(&fields[0])) - - result := C.dcgmEntityGetLatestValues(handle.handle, C.dcgm_field_entity_group_t(entityGroup), C.int(entityId), cfields, C.uint(len(fields)), &values[0]) - if err := errorString(result); err != nil { - return nil, fmt.Errorf("Error getting the latest value for fields: %s", err) - } - - return toFieldValue(values), nil -} - -func EntitiesGetLatestValues(entities []GroupEntityPair, fields []Short, flags uint) ([]FieldValue_v2, error) { - values := make([]C.dcgmFieldValue_v2, len(fields)*len(entities)) - cfields := (*C.ushort)(unsafe.Pointer(&fields[0])) - cEntities := make([]C.dcgmGroupEntityPair_t, len(entities)) - cPtrEntities := *(*[]C.dcgmGroupEntityPair_t)(unsafe.Pointer(&cEntities)) - for i, entity := range entities { - cEntities[i] = C.dcgmGroupEntityPair_t{C.dcgm_field_entity_group_t(entity.EntityGroupId), C.dcgm_field_eid_t(entity.EntityId)} - } - - result := C.dcgmEntitiesGetLatestValues(handle.handle, &cPtrEntities[0], C.uint(len(entities)), cfields, C.uint(len(fields)), C.uint(flags), &values[0]) - if err := errorString(result); err != nil { - return nil, fmt.Errorf("Error getting the latest value for fields: %s", err) - } - - return toFieldValue_v2(values), nil -} - -func 
UpdateAllFields() error { - waitForUpdate := C.int(1) - result := C.dcgmUpdateAllFields(handle.handle, waitForUpdate) - - return errorString(result) -} - -func toFieldValue(cfields []C.dcgmFieldValue_v1) []FieldValue_v1 { - fields := make([]FieldValue_v1, len(cfields)) - for i, f := range cfields { - fields[i] = FieldValue_v1{ - Version: uint(f.version), - FieldId: uint(f.fieldId), - FieldType: uint(f.fieldType), - Status: int(f.status), - Ts: int64(f.ts), - Value: f.value, - } - } - - return fields -} - -func (fv FieldValue_v1) Int64() int64 { - return *(*int64)(unsafe.Pointer(&fv.Value[0])) -} - -func (fv FieldValue_v1) Float64() float64 { - return *(*float64)(unsafe.Pointer(&fv.Value[0])) -} - -func (fv FieldValue_v1) String() string { - return *(*string)(unsafe.Pointer(&fv.Value[0])) -} - -func (fv FieldValue_v1) Blob() [4096]byte { - return fv.Value -} - -func toFieldValue_v2(cfields []C.dcgmFieldValue_v2) []FieldValue_v2 { - fields := make([]FieldValue_v2, len(cfields)) - for i, f := range cfields { - if uint(f.fieldType) == DCGM_FT_STRING { - fields[i] = FieldValue_v2{ - Version: uint(f.version), - EntityGroupId: Field_Entity_Group(f.entityGroupId), - EntityId: uint(f.entityId), - FieldId: uint(f.fieldId), - FieldType: uint(f.fieldType), - Status: int(f.status), - Ts: int64(f.ts), - Value: f.value, - StringValue: stringPtr((*C.char)(unsafe.Pointer(&f.value[0]))), - } - } else { - fields[i] = FieldValue_v2{ - Version: uint(f.version), - EntityGroupId: Field_Entity_Group(f.entityGroupId), - EntityId: uint(f.entityId), - FieldId: uint(f.fieldId), - FieldType: uint(f.fieldType), - Status: int(f.status), - Ts: int64(f.ts), - Value: f.value, - StringValue: nil, - } - } - } - - return fields -} - -func Fv2_Int64(fv FieldValue_v2) int64 { - return *(*int64)(unsafe.Pointer(&fv.Value[0])) -} - -func Fv2_Float64(fv FieldValue_v2) float64 { - return *(*float64)(unsafe.Pointer(&fv.Value[0])) -} - -func FindFirstNonAsciiIndex(value [4096]byte) int { - for i := 0; i < 
4096; i++ { - if value[i] > unicode.MaxASCII || value[i] < 33 { - return i - } - } - - return 4096 -} - -func Fv2_String(fv FieldValue_v2) string { - if fv.FieldType == DCGM_FT_STRING { - return *fv.StringValue - } else { - return string(fv.Value[:]) - } -} - -func Fv2_Blob(fv FieldValue_v2) [4096]byte { - return fv.Value -} - -func ToFieldMeta(fieldInfo C.dcgm_field_meta_p) FieldMeta { - return FieldMeta{ - FieldId: Short(fieldInfo.fieldId), - FieldType: byte(fieldInfo.fieldType), - Size: byte(fieldInfo.size), - Tag: *stringPtr((*C.char)(unsafe.Pointer(&fieldInfo.tag[0]))), - Scope: int(fieldInfo.scope), - NvmlFieldId: int(fieldInfo.nvmlFieldId), - EntityLevel: Field_Entity_Group(fieldInfo.entityLevel), - } -} - -func FieldGetById(fieldId Short) FieldMeta { - return ToFieldMeta(C.DcgmFieldGetById(C.ushort(fieldId))) -} - -func FieldsInit() int { - return int(C.DcgmFieldsInit()) -} - -func FieldsTerm() int { - return int(C.DcgmFieldsTerm()) -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/go.mod b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/go.mod deleted file mode 100644 index 6da14c24..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/go.mod +++ /dev/null @@ -1,3 +0,0 @@ -module github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm - -go 1.14 diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/gpu_group.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/gpu_group.go deleted file mode 100644 index 13c914ae..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/gpu_group.go +++ /dev/null @@ -1,67 +0,0 @@ -package dcgm - -/* -#include "dcgm_agent.h" -#include "dcgm_structs.h" -*/ -import "C" -import ( - "fmt" -) - -type GroupHandle struct{ handle C.dcgmGpuGrp_t } - -func CreateGroup(groupName string) (goGroupId GroupHandle, err error) { - var cGroupId C.dcgmGpuGrp_t - cname := C.CString(groupName) - defer freeCString(cname) 
- - result := C.dcgmGroupCreate(handle.handle, C.DCGM_GROUP_EMPTY, cname, &cGroupId) - if err = errorString(result); err != nil { - return goGroupId, fmt.Errorf("Error creating group: %s", err) - } - - goGroupId = GroupHandle{cGroupId} - return -} - -func NewDefaultGroup(groupName string) (GroupHandle, error) { - var cGroupId C.dcgmGpuGrp_t - - cname := C.CString(groupName) - defer freeCString(cname) - - result := C.dcgmGroupCreate(handle.handle, C.DCGM_GROUP_DEFAULT, cname, &cGroupId) - if err := errorString(result); err != nil { - return GroupHandle{}, fmt.Errorf("Error creating group: %s", err) - } - - return GroupHandle{cGroupId}, nil -} - -func AddToGroup(groupId GroupHandle, gpuId uint) (err error) { - result := C.dcgmGroupAddDevice(handle.handle, groupId.handle, C.uint(gpuId)) - if err = errorString(result); err != nil { - return fmt.Errorf("Error adding GPU %v to group: %s", gpuId, err) - } - - return -} - -func AddEntityToGroup(groupId GroupHandle, entityGroupId Field_Entity_Group, entityId uint) (err error) { - result := C.dcgmGroupAddEntity(handle.handle, groupId.handle, C.dcgm_field_entity_group_t(entityGroupId), C.uint(entityId)) - if err = errorString(result); err != nil { - return fmt.Errorf("Error adding entity group type %v, entity %v to group: %s", entityGroupId, entityId, err) - } - - return -} - -func DestroyGroup(groupId GroupHandle) (err error) { - result := C.dcgmGroupDestroy(handle.handle, groupId.handle) - if err = errorString(result); err != nil { - return fmt.Errorf("Error destroying group: %s", err) - } - - return -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/health.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/health.go deleted file mode 100644 index e611e726..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/health.go +++ /dev/null @@ -1,121 +0,0 @@ -package dcgm - -/* -#include "dcgm_agent.h" -#include "dcgm_structs.h" -*/ -import "C" -import ( - "fmt" 
- "math/rand" - "unsafe" -) - -type SystemWatch struct { - Type string - Status string - Error string -} - -type DeviceHealth struct { - GPU uint - Status string - Watches []SystemWatch -} - -func setHealthWatches(groupId GroupHandle) (err error) { - result := C.dcgmHealthSet(handle.handle, groupId.handle, C.DCGM_HEALTH_WATCH_ALL) - if err = errorString(result); err != nil { - return fmt.Errorf("Error setting health watches: %s", err) - } - return -} - -func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) { - name := fmt.Sprintf("health%d", rand.Uint64()) - groupId, err := CreateGroup(name) - if err != nil { - return - } - - err = AddToGroup(groupId, gpuId) - if err != nil { - return - } - - err = setHealthWatches(groupId) - if err != nil { - return - } - - var healthResults C.dcgmHealthResponse_v4 - healthResults.version = makeVersion2(unsafe.Sizeof(healthResults)) - - result := C.dcgmHealthCheck(handle.handle, groupId.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults))) - - if err = errorString(result); err != nil { - return deviceHealth, fmt.Errorf("Error checking GPU health: %s", err) - } - - status := healthStatus(int8(healthResults.overallHealth)) - watches := []SystemWatch{} - - // number of watches that encountred error/warning - incidents := uint(healthResults.incidentCount) - - for j := uint(0); j < incidents; j++ { - watch := SystemWatch{ - Type: systemWatch(int(healthResults.incidents[j].system)), - Status: healthStatus(int8(healthResults.incidents[j].health)), - - Error: *stringPtr(&healthResults.incidents[j].error.msg[0]), - } - watches = append(watches, watch) - } - - deviceHealth = DeviceHealth{ - GPU: gpuId, - Status: status, - Watches: watches, - } - _ = DestroyGroup(groupId) - return -} - -func healthStatus(status int8) string { - switch status { - case 0: - return "Healthy" - case 10: - return "Warning" - case 20: - return "Failure" - } - return "N/A" -} - -func systemWatch(watch int) string { - switch watch { 
- case 1: - return "PCIe watches" - case 2: - return "NVLINK watches" - case 4: - return "Power Managemnt unit watches" - case 8: - return "Microcontroller unit watches" - case 16: - return "Memory watches" - case 32: - return "Streaming Multiprocessor watches" - case 64: - return "Inforom watches" - case 128: - return "Temperature watches" - case 256: - return "Power watches" - case 512: - return "Driver-related watches" - } - return "N/A" -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/hostengine_status.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/hostengine_status.go deleted file mode 100644 index 4e6e6b93..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/hostengine_status.go +++ /dev/null @@ -1,49 +0,0 @@ -package dcgm - -/* -#include "dcgm_agent.h" -#include "dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "unsafe" -) - -type DcgmStatus struct { - Memory int64 - CPU float64 -} - -func introspect() (engine DcgmStatus, err error) { - enableIntrospect := C.dcgmIntrospectState_t(1) - result := C.dcgmIntrospectToggleState(handle.handle, enableIntrospect) - - if err = errorString(result); err != nil { - return engine, fmt.Errorf("Error enabling DCGM introspection: %s", err) - } - - var memory C.dcgmIntrospectMemory_t - memory.version = makeVersion2(unsafe.Sizeof(memory)) - waitIfNoData := 1 - result = C.dcgmIntrospectGetHostengineMemoryUsage(handle.handle, &memory, C.int(waitIfNoData)) - - if err = errorString(result); err != nil { - return engine, fmt.Errorf("Error getting memory usage of hostengine: %s", err) - } - - var cpu C.dcgmIntrospectCpuUtil_t - - cpu.version = makeVersion2(unsafe.Sizeof(cpu)) - result = C.dcgmIntrospectGetHostengineCpuUtilization(handle.handle, &cpu, C.int(waitIfNoData)) - - if err = errorString(result); err != nil { - return engine, fmt.Errorf("Error getting cpu usage of hostengine: %s", err) - } - - engine = DcgmStatus{ - Memory: 
toInt64(memory.bytesUsed) / 1024, - CPU: *dblToFloat(cpu.total) * 100, - } - return -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/mig.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/mig.go deleted file mode 100644 index 1e0f6dc3..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/mig.go +++ /dev/null @@ -1,89 +0,0 @@ -package dcgm - -/* -#include "./dcgm_agent.h" -#include "./dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "unsafe" -) - -type Field_Entity_Group uint - -const ( - FE_NONE Field_Entity_Group = iota - FE_GPU - FE_VGPU - FE_SWITCH - FE_GPU_I - FE_GPU_CI - FE_COUNT -) - -type GroupEntityPair struct { - EntityGroupId Field_Entity_Group - EntityId uint -} - -type MigEntityInfo struct { - GpuUuid string - NvmlGpuIndex uint - NvmlInstanceId uint - NvmlComputeInstanceId uint - NvmlMigProfileId uint - NvmlProfileSlices uint -} - -type MigHierarchyInfo_v2 struct { - Entity GroupEntityPair - Parent GroupEntityPair - Info MigEntityInfo -} - -const ( - MAX_NUM_DEVICES uint = C.DCGM_MAX_NUM_DEVICES - MAX_HIERARCHY_INFO uint = C.DCGM_MAX_HIERARCHY_INFO -) - -type MigHierarchy_v2 struct { - Version uint - Count uint - EntityList [C.DCGM_MAX_HIERARCHY_INFO]MigHierarchyInfo_v2 -} - -func GetGpuInstanceHierarchy() (hierarchy MigHierarchy_v2, err error) { - var c_hierarchy C.dcgmMigHierarchy_v2 - c_hierarchy.version = C.dcgmMigHierarchy_version2 - ptr_hierarchy := (*C.dcgmMigHierarchy_v2)(unsafe.Pointer(&c_hierarchy)) - result := C.dcgmGetGpuInstanceHierarchy(handle.handle, ptr_hierarchy) - - if err = errorString(result); err != nil { - return toMigHierarchy(c_hierarchy), fmt.Errorf("Error retrieving DCGM MIG hierarchy: %s", err) - } - - return toMigHierarchy(c_hierarchy), nil -} - -func toMigHierarchy(c_hierarchy C.dcgmMigHierarchy_v2) MigHierarchy_v2 { - var hierarchy MigHierarchy_v2 - hierarchy.Version = uint(c_hierarchy.version) - hierarchy.Count = uint(c_hierarchy.count) - for i := 
uint(0); i < hierarchy.Count; i++ { - hierarchy.EntityList[i] = MigHierarchyInfo_v2{ - Entity: GroupEntityPair{Field_Entity_Group(c_hierarchy.entityList[i].entity.entityGroupId), uint(c_hierarchy.entityList[i].entity.entityId)}, - Parent: GroupEntityPair{Field_Entity_Group(c_hierarchy.entityList[i].parent.entityGroupId), uint(c_hierarchy.entityList[i].parent.entityId)}, - Info: MigEntityInfo{ - GpuUuid: *stringPtr(&c_hierarchy.entityList[i].info.gpuUuid[0]), - NvmlGpuIndex: uint(c_hierarchy.entityList[i].info.nvmlGpuIndex), - NvmlInstanceId: uint(c_hierarchy.entityList[i].info.nvmlInstanceId), - NvmlComputeInstanceId: uint(c_hierarchy.entityList[i].info.nvmlComputeInstanceId), - NvmlMigProfileId: uint(c_hierarchy.entityList[i].info.nvmlMigProfileId), - NvmlProfileSlices: uint(c_hierarchy.entityList[i].info.nvmlProfileSlices), - }, - } - } - - return hierarchy -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/policy.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/policy.go deleted file mode 100644 index 06be22fa..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/policy.go +++ /dev/null @@ -1,419 +0,0 @@ -package dcgm - -/* -#include "dcgm_agent.h" -#include "dcgm_structs.h" - -// wrapper for go callback function -extern int violationNotify(void* p); -*/ -import "C" -import ( - "encoding/binary" - "fmt" - "log" - "math/rand" - "sync" - "time" - "unsafe" -) - -type policyCondition string - -const ( - DbePolicy = policyCondition("Double-bit ECC error") - PCIePolicy = policyCondition("PCI error") - MaxRtPgPolicy = policyCondition("Max Retired Pages Limit") - ThermalPolicy = policyCondition("Thermal Limit") - PowerPolicy = policyCondition("Power Limit") - NvlinkPolicy = policyCondition("Nvlink Error") - XidPolicy = policyCondition("XID Error") -) - -type PolicyViolation struct { - Condition policyCondition - Timestamp time.Time - Data interface{} -} - -type policyIndex int - -const ( - 
dbePolicyIndex policyIndex = iota - pciePolicyIndex - maxRtPgPolicyIndex - thermalPolicyIndex - powerPolicyIndex - nvlinkPolicyIndex - xidPolicyIndex -) - -type policyConditionParam struct { - typ uint32 - value uint32 -} - -type dbePolicyCondition struct { - Location string - NumErrors uint -} - -type pciPolicyCondition struct { - ReplayCounter uint -} - -type retiredPagesPolicyCondition struct { - SbePages uint - DbePages uint -} - -type thermalPolicyCondition struct { - ThermalViolation uint -} - -type powerPolicyCondition struct { - PowerViolation uint -} - -type nvlinkPolicyCondition struct { - FieldId uint16 - Counter uint -} - -type xidPolicyCondition struct { - ErrNum uint -} - -var ( - policyChanOnce sync.Once - policyMapOnce sync.Once - - // callbacks maps PolicyViolation channels with policy - // captures C callback() value for each violation condition - callbacks map[string]chan PolicyViolation - - // paramMap maps C.dcgmPolicy_t.parms index and limits - // to be used in setPolicy() for setting user selected policies - paramMap map[policyIndex]policyConditionParam -) - -func makePolicyChannels() { - policyChanOnce.Do(func() { - callbacks = make(map[string]chan PolicyViolation) - callbacks["dbe"] = make(chan PolicyViolation, 1) - callbacks["pcie"] = make(chan PolicyViolation, 1) - callbacks["maxrtpg"] = make(chan PolicyViolation, 1) - callbacks["thermal"] = make(chan PolicyViolation, 1) - callbacks["power"] = make(chan PolicyViolation, 1) - callbacks["nvlink"] = make(chan PolicyViolation, 1) - callbacks["xid"] = make(chan PolicyViolation, 1) - }) -} - -func makePolicyParmsMap() { - const ( - policyFieldTypeBool = 0 - policyFieldTypeLong = 1 - policyBoolValue = 1 - policyMaxRtPgThreshold = 10 - policyThermalThreshold = 100 - policyPowerThreshold = 250 - ) - - policyMapOnce.Do(func() { - paramMap = make(map[policyIndex]policyConditionParam) - paramMap[dbePolicyIndex] = policyConditionParam{ - typ: policyFieldTypeBool, - value: policyBoolValue, - } - - 
paramMap[pciePolicyIndex] = policyConditionParam{ - typ: policyFieldTypeBool, - value: policyBoolValue, - } - - paramMap[maxRtPgPolicyIndex] = policyConditionParam{ - typ: policyFieldTypeLong, - value: policyMaxRtPgThreshold, - } - - paramMap[thermalPolicyIndex] = policyConditionParam{ - typ: policyFieldTypeLong, - value: policyThermalThreshold, - } - - paramMap[powerPolicyIndex] = policyConditionParam{ - typ: policyFieldTypeLong, - value: policyPowerThreshold, - } - - paramMap[nvlinkPolicyIndex] = policyConditionParam{ - typ: policyFieldTypeBool, - value: policyBoolValue, - } - - paramMap[xidPolicyIndex] = policyConditionParam{ - typ: policyFieldTypeBool, - value: policyBoolValue, - } - }) -} - -// ViolationRegistration is a go callback function for dcgmPolicyRegister() wrapped in C.violationNotify() -//export ViolationRegistration -func ViolationRegistration(data unsafe.Pointer) int { - var con policyCondition - var timestamp time.Time - var val interface{} - - response := *(*C.dcgmPolicyCallbackResponse_t)(unsafe.Pointer(data)) - - switch response.condition { - case C.DCGM_POLICY_COND_DBE: - dbe := (*C.dcgmPolicyConditionDbe_t)(unsafe.Pointer(&response.val)) - con = DbePolicy - timestamp = createTimeStamp(dbe.timestamp) - val = dbePolicyCondition{ - Location: dbeLocation(int(dbe.location)), - NumErrors: *uintPtr(dbe.numerrors), - } - case C.DCGM_POLICY_COND_PCI: - pci := (*C.dcgmPolicyConditionPci_t)(unsafe.Pointer(&response.val)) - con = PCIePolicy - timestamp = createTimeStamp(pci.timestamp) - val = pciPolicyCondition{ - ReplayCounter: *uintPtr(pci.counter), - } - case C.DCGM_POLICY_COND_MAX_PAGES_RETIRED: - mpr := (*C.dcgmPolicyConditionMpr_t)(unsafe.Pointer(&response.val)) - con = MaxRtPgPolicy - timestamp = createTimeStamp(mpr.timestamp) - val = retiredPagesPolicyCondition{ - SbePages: *uintPtr(mpr.sbepages), - DbePages: *uintPtr(mpr.dbepages), - } - case C.DCGM_POLICY_COND_THERMAL: - thermal := 
(*C.dcgmPolicyConditionThermal_t)(unsafe.Pointer(&response.val)) - con = ThermalPolicy - timestamp = createTimeStamp(thermal.timestamp) - val = thermalPolicyCondition{ - ThermalViolation: *uintPtr(thermal.thermalViolation), - } - case C.DCGM_POLICY_COND_POWER: - pwr := (*C.dcgmPolicyConditionPower_t)(unsafe.Pointer(&response.val)) - con = PowerPolicy - timestamp = createTimeStamp(pwr.timestamp) - val = powerPolicyCondition{ - PowerViolation: *uintPtr(pwr.powerViolation), - } - case C.DCGM_POLICY_COND_NVLINK: - nvlink := (*C.dcgmPolicyConditionNvlink_t)(unsafe.Pointer(&response.val)) - con = NvlinkPolicy - timestamp = createTimeStamp(nvlink.timestamp) - val = nvlinkPolicyCondition{ - FieldId: uint16(nvlink.fieldId), - Counter: *uintPtr(nvlink.counter), - } - case C.DCGM_POLICY_COND_XID: - xid := (*C.dcgmPolicyConditionXID_t)(unsafe.Pointer(&response.val)) - con = XidPolicy - timestamp = createTimeStamp(xid.timestamp) - val = xidPolicyCondition{ - ErrNum: *uintPtr(xid.errnum), - } - } - - err := PolicyViolation{ - Condition: con, - Timestamp: timestamp, - Data: val, - } - - switch con { - case DbePolicy: - callbacks["dbe"] <- err - case PCIePolicy: - callbacks["pcie"] <- err - case MaxRtPgPolicy: - callbacks["maxrtpg"] <- err - case ThermalPolicy: - callbacks["thermal"] <- err - case PowerPolicy: - callbacks["power"] <- err - case NvlinkPolicy: - callbacks["nvlink"] <- err - case XidPolicy: - callbacks["xid"] <- err - } - return 0 -} - -func setPolicy(groupId GroupHandle, condition C.dcgmPolicyCondition_t, paramList []policyIndex) (err error) { - var policy C.dcgmPolicy_t - policy.version = makeVersion2(unsafe.Sizeof(policy)) - policy.mode = C.dcgmPolicyMode_t(C.DCGM_OPERATION_MODE_AUTO) - policy.action = C.DCGM_POLICY_ACTION_NONE - policy.isolation = C.DCGM_POLICY_ISOLATION_NONE - policy.validation = C.DCGM_POLICY_VALID_NONE - policy.condition = condition - - // iterate on paramMap for given policy conditions - for _, key := range paramList { - conditionParam, 
exists := paramMap[policyIndex(key)] - if !exists { - return fmt.Errorf("Error: Invalid Policy condition, %v does not exist.\n", key) - } - // set policy condition parameters - // set condition type (bool or longlong) - policy.parms[key].tag = conditionParam.typ - - // set condition val (violation threshold) - // policy.parms.val is a C union type - // cgo docs: Go doesn't have support for C's union type - // C union types are represented as a Go byte array - binary.LittleEndian.PutUint32(policy.parms[key].val[:], conditionParam.value) - } - var statusHandle C.dcgmStatus_t - result := C.dcgmPolicySet(handle.handle, groupId.handle, &policy, statusHandle) - if err = errorString(result); err != nil { - return fmt.Errorf("Error setting policies: %s", err) - } - log.Println("Policy successfully set.") - return -} - -func registerPolicy(gpuId uint, typ ...policyCondition) (violation chan PolicyViolation, err error) { - // init policy globals for internal API - makePolicyChannels() - makePolicyParmsMap() - - name := fmt.Sprintf("policy%d", rand.Uint64()) - groupId, err := CreateGroup(name) - if err != nil { - return - } - if err = AddToGroup(groupId, gpuId); err != nil { - return - } - - // make a list of all callback channels - var channels []chan PolicyViolation - // make a list of policy conditions for setting their parameters - var paramKeys []policyIndex - // get all conditions to be set in setPolicy() - var condition C.dcgmPolicyCondition_t = 0 - for _, t := range typ { - switch t { - case DbePolicy: - paramKeys = append(paramKeys, dbePolicyIndex) - condition |= C.DCGM_POLICY_COND_DBE - channels = append(channels, callbacks["dbe"]) - case PCIePolicy: - paramKeys = append(paramKeys, pciePolicyIndex) - condition |= C.DCGM_POLICY_COND_PCI - channels = append(channels, callbacks["pcie"]) - case MaxRtPgPolicy: - paramKeys = append(paramKeys, maxRtPgPolicyIndex) - condition |= C.DCGM_POLICY_COND_MAX_PAGES_RETIRED - channels = append(channels, callbacks["maxrtpg"]) - case 
ThermalPolicy: - paramKeys = append(paramKeys, thermalPolicyIndex) - condition |= C.DCGM_POLICY_COND_THERMAL - channels = append(channels, callbacks["thermal"]) - case PowerPolicy: - paramKeys = append(paramKeys, powerPolicyIndex) - condition |= C.DCGM_POLICY_COND_POWER - channels = append(channels, callbacks["power"]) - case NvlinkPolicy: - paramKeys = append(paramKeys, nvlinkPolicyIndex) - condition |= C.DCGM_POLICY_COND_NVLINK - channels = append(channels, callbacks["nvlink"]) - case XidPolicy: - paramKeys = append(paramKeys, xidPolicyIndex) - condition |= C.DCGM_POLICY_COND_XID - channels = append(channels, callbacks["xid"]) - } - } - - if err = setPolicy(groupId, condition, paramKeys); err != nil { - return - } - - result := C.dcgmPolicyRegister(handle.handle, groupId.handle, C.dcgmPolicyCondition_t(condition), C.fpRecvUpdates(C.violationNotify), C.fpRecvUpdates(C.violationNotify)) - - if err = errorString(result); err != nil { - return violation, fmt.Errorf("Error registering policy: %s", err) - } - log.Println("Listening for violations...") - - // create a publisher - publisher := newPublisher() - _ = publisher.add() - _ = publisher.add() - - // broadcast - go publisher.broadcast() - - go func() { - for { - select { - case dbe := <-callbacks["dbe"]: - publisher.send(dbe) - case pcie := <-callbacks["pcie"]: - publisher.send(pcie) - case maxrtpg := <-callbacks["maxrtpg"]: - publisher.send(maxrtpg) - case thermal := <-callbacks["thermal"]: - publisher.send(thermal) - case power := <-callbacks["power"]: - publisher.send(power) - case nvlink := <-callbacks["nvlink"]: - publisher.send(nvlink) - case xid := <-callbacks["xid"]: - publisher.send(xid) - } - } - }() - - // merge - violation = make(chan PolicyViolation, len(channels)) - go func() { - for _, c := range channels { - val := <-c - violation <- val - } - close(violation) - }() - _ = DestroyGroup(groupId) - return -} - -func unregisterPolicy(groupId GroupHandle, condition C.dcgmPolicyCondition_t) { - result 
:= C.dcgmPolicyUnregister(handle.handle, groupId.handle, condition) - - if err := errorString(result); err != nil { - fmt.Errorf("Error unregistering policy: %s", err) - } -} - -func createTimeStamp(t C.longlong) time.Time { - tm := int64(t) / 1000000 - ts := time.Unix(tm, 0) - return ts -} - -func dbeLocation(location int) string { - switch location { - case 0: - return "L1" - case 1: - return "L2" - case 2: - return "Device" - case 3: - return "Register" - case 4: - return "Texture" - } - return "N/A" -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/process_info.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/process_info.go deleted file mode 100644 index 64227cfa..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/process_info.go +++ /dev/null @@ -1,203 +0,0 @@ -package dcgm - -/* -#include "dcgm_agent.h" -#include "dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "io/ioutil" - "math/rand" - "os" - "strings" - "time" - "unsafe" -) - -type Time uint64 - -func (t Time) String() string { - if t == 0 { - return "Running" - } - tm := time.Unix(int64(t), 0) - return tm.String() -} - -type ProcessUtilInfo struct { - StartTime Time - EndTime Time - EnergyConsumed *uint64 // Joules - SmUtil *float64 - MemUtil *float64 -} - -// ViolationTime measures amount of time (in ms) GPU was at reduced clocks -type ViolationTime struct { - Power *uint64 - Thermal *uint64 - Reliability *uint64 - BoardLimit *uint64 - LowUtilization *uint64 - SyncBoost *uint64 -} - -type XIDErrorInfo struct { - NumErrors int - Timestamp []uint64 -} - -type ProcessInfo struct { - GPU uint - PID uint - Name string - ProcessUtilization ProcessUtilInfo - PCI PCIStatusInfo - Memory MemoryInfo - GpuUtilization UtilizationInfo - Clocks ClockInfo - Violations ViolationTime - XIDErrors XIDErrorInfo -} - -func watchPidFields(gpus ...uint) (groupId GroupHandle, err error) { - groupName := fmt.Sprintf("watchPids%d", rand.Uint64()) - 
group, err := CreateGroup(groupName) - if err != nil { - return - } - numGpus := len(gpus) - - if numGpus == 0 { - gpus, err = getSupportedDevices() - if err != nil { - return - } - } - - for _, gpu := range gpus { - err = AddToGroup(group, gpu) - if err != nil { - return - } - - } - - result := C.dcgmWatchPidFields(handle.handle, group.handle, C.longlong(updateFreq), C.double(maxKeepAge), C.int(maxKeepSamples)) - - if err = errorString(result); err != nil { - return groupId, fmt.Errorf("Error watching process fields: %s", err) - } - _ = UpdateAllFields() - return group, nil -} - -func getProcessInfo(groupId GroupHandle, pid uint) (processInfo []ProcessInfo, err error) { - var pidInfo C.dcgmPidInfo_t - pidInfo.version = makeVersion2(unsafe.Sizeof(pidInfo)) - pidInfo.pid = C.uint(pid) - - result := C.dcgmGetPidInfo(handle.handle, groupId.handle, &pidInfo) - - if err = errorString(result); err != nil { - return processInfo, fmt.Errorf("Error getting process info: %s", err) - } - - name, err := processName(pid) - if err != nil { - return processInfo, fmt.Errorf("Error getting process name: %s", err) - } - - for i := 0; i < int(pidInfo.numGpus); i++ { - - var energy uint64 - e := *uint64Ptr(pidInfo.gpus[i].energyConsumed) - if !IsInt64Blank(int64(e)) { - energy = e / 1000 // mWs to joules - } - - processUtil := ProcessUtilInfo{ - StartTime: Time(uint64(pidInfo.gpus[i].startTime) / 1000000), - EndTime: Time(uint64(pidInfo.gpus[i].endTime) / 1000000), - EnergyConsumed: &energy, - SmUtil: roundFloat(dblToFloat(pidInfo.gpus[i].processUtilization.smUtil)), - MemUtil: roundFloat(dblToFloat(pidInfo.gpus[i].processUtilization.memUtil)), - } - - // TODO figure out how to deal with blanks - pci := PCIStatusInfo{ - Throughput: PCIThroughputInfo{ - Rx: *int64Ptr(pidInfo.gpus[i].pcieRxBandwidth.average), - Tx: *int64Ptr(pidInfo.gpus[i].pcieTxBandwidth.average), - Replays: *int64Ptr(pidInfo.gpus[i].pcieReplays), - }, - } - - memory := MemoryInfo{ - GlobalUsed: 
*int64Ptr(pidInfo.gpus[i].maxGpuMemoryUsed), // max gpu memory used for this process - ECCErrors: ECCErrorsInfo{ - SingleBit: *int64Ptr(C.longlong(pidInfo.gpus[i].eccSingleBit)), - DoubleBit: *int64Ptr(C.longlong(pidInfo.gpus[i].eccDoubleBit)), - }, - } - - gpuUtil := UtilizationInfo{ - GPU: int64(pidInfo.gpus[i].smUtilization.average), - Memory: int64(pidInfo.gpus[i].memoryUtilization.average), - } - - violations := ViolationTime{ - Power: uint64Ptr(pidInfo.gpus[i].powerViolationTime), - Thermal: uint64Ptr(pidInfo.gpus[i].thermalViolationTime), - Reliability: uint64Ptr(pidInfo.gpus[i].reliabilityViolationTime), - BoardLimit: uint64Ptr(pidInfo.gpus[i].boardLimitViolationTime), - LowUtilization: uint64Ptr(pidInfo.gpus[i].lowUtilizationTime), - SyncBoost: uint64Ptr(pidInfo.gpus[i].syncBoostTime), - } - - clocks := ClockInfo{ - Cores: *int64Ptr(C.longlong(pidInfo.gpus[i].smClock.average)), - Memory: *int64Ptr(C.longlong(pidInfo.gpus[i].memoryClock.average)), - } - - numErrs := int(pidInfo.gpus[i].numXidCriticalErrors) - ts := make([]uint64, numErrs) - for i := 0; i < numErrs; i++ { - ts[i] = uint64(pidInfo.gpus[i].xidCriticalErrorsTs[i]) - } - xidErrs := XIDErrorInfo{ - NumErrors: numErrs, - Timestamp: ts, - } - - pInfo := ProcessInfo{ - GPU: uint(pidInfo.summary.gpuId), - PID: uint(pidInfo.pid), - Name: name, - ProcessUtilization: processUtil, - PCI: pci, - Memory: memory, - GpuUtilization: gpuUtil, - Clocks: clocks, - Violations: violations, - XIDErrors: xidErrs, - } - processInfo = append(processInfo, pInfo) - } - _ = DestroyGroup(groupId) - return -} - -func processName(pid uint) (string, error) { - f := fmt.Sprintf("/proc/%d/comm", pid) - b, err := ioutil.ReadFile(f) - if err != nil { - // TOCTOU: process terminated - if os.IsNotExist(err) { - return "", nil - } - return "", err - } - return strings.TrimSuffix(string(b), "\n"), nil -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/profile.go 
b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/profile.go deleted file mode 100644 index 25ca7524..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/profile.go +++ /dev/null @@ -1,47 +0,0 @@ -package dcgm - -/* -#include "dcgm_agent.h" -#include "dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "unsafe" -) - -type MetricGroup struct { - major uint - minor uint - fieldIds []uint -} - -func getSupportedMetricGroups(grpid uint) (groups []MetricGroup, err error) { - - var groupInfo C.dcgmProfGetMetricGroups_t - groupInfo.version = makeVersion2(unsafe.Sizeof(groupInfo)) - groupInfo.groupId = C.ulong(grpid) - - result := C.dcgmProfGetSupportedMetricGroups(handle.handle, &groupInfo) - - if err = errorString(result); err != nil { - return groups, fmt.Errorf("Error getting supported metrics: %s", err) - } - - var count = uint(groupInfo.numMetricGroups) - - for i := uint(0); i < count; i++ { - var group MetricGroup - group.major = uint(groupInfo.metricGroups[i].majorId) - group.minor = uint(groupInfo.metricGroups[i].minorId) - - var fieldCount = uint(groupInfo.metricGroups[i].numFieldIds) - - for j := uint(0); j < fieldCount; j++ { - group.fieldIds = append(group.fieldIds, uint(groupInfo.metricGroups[i].fieldIds[j])) - } - groups = append(groups, group) - } - - return groups, nil -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/topology.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/topology.go deleted file mode 100644 index f3afc380..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/topology.go +++ /dev/null @@ -1,136 +0,0 @@ -package dcgm - -/* -#include "dcgm_agent.h" -#include "dcgm_structs.h" -*/ -import "C" -import ( - "fmt" - "io/ioutil" - "strings" - "unsafe" -) - -type P2PLinkType uint - -const ( - P2PLinkUnknown P2PLinkType = iota - P2PLinkCrossCPU - P2PLinkSameCPU - P2PLinkHostBridge - P2PLinkMultiSwitch - P2PLinkSingleSwitch - 
P2PLinkSameBoard - SingleNVLINKLink - TwoNVLINKLinks - ThreeNVLINKLinks - FourNVLINKLinks -) - -func (l P2PLinkType) PCIPaths() string { - switch l { - case P2PLinkSameBoard: - return "PSB" - case P2PLinkSingleSwitch: - return "PIX" - case P2PLinkMultiSwitch: - return "PXB" - case P2PLinkHostBridge: - return "PHB" - case P2PLinkSameCPU: - return "NODE" - case P2PLinkCrossCPU: - return "SYS" - case SingleNVLINKLink: - return "NV1" - case TwoNVLINKLinks: - return "NV2" - case ThreeNVLINKLinks: - return "NV3" - case FourNVLINKLinks: - return "NV4" - case P2PLinkUnknown: - } - return "N/A" -} - -type P2PLink struct { - GPU uint - BusID string - Link P2PLinkType -} - -func getP2PLink(path uint) P2PLinkType { - switch path { - case C.DCGM_TOPOLOGY_BOARD: - return P2PLinkSameBoard - case C.DCGM_TOPOLOGY_SINGLE: - return P2PLinkSingleSwitch - case C.DCGM_TOPOLOGY_MULTIPLE: - return P2PLinkMultiSwitch - case C.DCGM_TOPOLOGY_HOSTBRIDGE: - return P2PLinkHostBridge - case C.DCGM_TOPOLOGY_CPU: - return P2PLinkSameCPU - case C.DCGM_TOPOLOGY_SYSTEM: - return P2PLinkCrossCPU - case C.DCGM_TOPOLOGY_NVLINK1: - return SingleNVLINKLink - case C.DCGM_TOPOLOGY_NVLINK2: - return TwoNVLINKLinks - case C.DCGM_TOPOLOGY_NVLINK3: - return ThreeNVLINKLinks - case C.DCGM_TOPOLOGY_NVLINK4: - return FourNVLINKLinks - } - return P2PLinkUnknown -} - -func getCPUAffinity(busid string) (string, error) { - b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/local_cpulist", strings.ToLower(busid[4:]))) - if err != nil { - return "", fmt.Errorf("Error getting device cpu affinity: %v", err) - } - return strings.TrimSuffix(string(b), "\n"), nil -} - -func getBusid(gpuid uint) (string, error) { - var device C.dcgmDeviceAttributes_t - device.version = makeVersion2(unsafe.Sizeof(device)) - - result := C.dcgmGetDeviceAttributes(handle.handle, C.uint(gpuid), &device) - if err := errorString(result); err != nil { - return "", fmt.Errorf("Error getting device busid: %s", err) - } - return 
*stringPtr(&device.identifiers.pciBusId[0]), nil -} - -func getDeviceTopology(gpuid uint) (links []P2PLink, err error) { - var topology C.dcgmDeviceTopology_t - topology.version = makeVersion2(unsafe.Sizeof(topology)) - - result := C.dcgmGetDeviceTopology(handle.handle, C.uint(gpuid), &topology) - if result == C.DCGM_ST_NOT_SUPPORTED { - return links, nil - } - if result != C.DCGM_ST_OK { - return links, fmt.Errorf("Error getting device topology: %s", errorString(result)) - } - - busid, err := getBusid(gpuid) - if err != nil { - return - } - - for i := uint(0); i < uint(topology.numGpus); i++ { - gpu := topology.gpuPaths[i].gpuId - p2pLink := P2PLink{ - GPU: uint(gpu), - BusID: busid, - Link: getP2PLink(uint(topology.gpuPaths[i].path)), - } - links = append(links, p2pLink) - } - return -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/utils.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/utils.go deleted file mode 100644 index 219735a7..00000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/utils.go +++ /dev/null @@ -1,148 +0,0 @@ -package dcgm - -/* -#include "stdlib.h" -#include "dcgm_structs.h" -*/ -import "C" - -import ( - "fmt" - "math" - "unsafe" -) - -const ( - dcgmInt32Blank = 0x7ffffff0 // 2147483632 - dcgmInt64Blank = 0x7ffffffffffffff0 // 9223372036854775792 -) - -func uintPtr(c C.uint) *uint { - i := uint(c) - return &i -} - -func uintPtrInt(c C.int) *uint { - i := uint(c) - return &i -} - -func uintPtrUnsafe(p unsafe.Pointer) *uint { - if p == nil { - return nil - } - uintP := (*uint)(unsafe.Pointer(p)) - val := *uintP - return &val -} - -func uint64Ptr(c C.longlong) *uint64 { - i := uint64(c) - return &i -} - -func int64Ptr(c C.longlong) *int64 { - i := int64(c) - return &i -} - -func uint64PtrUint(c C.uint) *uint64 { - i := uint64(c) - return &i -} - -func uint64PtrUnsafe(p unsafe.Pointer) *uint64 { - if p == nil { - return nil - } - uintP := (*uint64)(unsafe.Pointer(p)) - val := 
*uintP - return &val -} - -func toInt64(c C.longlong) int64 { - i := int64(c) - return i -} - -func dblToUint(val C.double) *uint { - i := uint(val) - return &i -} - -func dblToFloat(val C.double) *float64 { - i := float64(val) - return &i -} - -func dblToFloatUnsafe(val unsafe.Pointer) *float64 { - if val == nil { - return nil - } - dblP := (*C.double)(unsafe.Pointer(val)) - floatP := float64(*dblP) - return &floatP -} - -func stringPtr(c *C.char) *string { - s := C.GoString(c) - return &s -} - -func errorString(result C.dcgmReturn_t) error { - if result == C.DCGM_ST_OK { - return nil - } - err := C.GoString(C.errorString(result)) - return fmt.Errorf("%v", err) -} - -func freeCString(cStr *C.char) { - C.free(unsafe.Pointer(cStr)) -} - -func IsInt32Blank(value int) bool { - if value >= dcgmInt32Blank { - return true - } - return false -} - -func IsInt64Blank(value int64) bool { - if value >= dcgmInt64Blank { - return true - } - return false -} - -func blank64(val *int64) *int64 { - if val != nil && IsInt64Blank(*val) { - return nil - } - return val -} - -func blank32(val *uint) *uint { - if val != nil && IsInt32Blank(int(*val)) { - return nil - } - return val -} - -func makeVersion1(struct_type uintptr) C.uint { - version := C.uint(struct_type | 1<<24) - return version -} - -func makeVersion2(struct_type uintptr) C.uint { - version := C.uint(struct_type | 2<<24) - return version -} - -func roundFloat(f *float64) *float64 { - var val float64 - if f != nil { - val = math.Round(*f) - } - return &val -} diff --git a/vendor/github.com/gorilla/mux/AUTHORS b/vendor/github.com/gorilla/mux/AUTHORS deleted file mode 100644 index b722392e..00000000 --- a/vendor/github.com/gorilla/mux/AUTHORS +++ /dev/null @@ -1,8 +0,0 @@ -# This is the official list of gorilla/mux authors for copyright purposes. -# -# Please keep the list sorted. 
- -Google LLC (https://opensource.google.com/) -Kamil Kisielk -Matt Silverlock -Rodrigo Moraes (https://github.com/moraes) diff --git a/vendor/github.com/gorilla/mux/LICENSE b/vendor/github.com/gorilla/mux/LICENSE deleted file mode 100644 index 6903df63..00000000 --- a/vendor/github.com/gorilla/mux/LICENSE +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2012-2018 The Gorilla Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/vendor/github.com/gorilla/mux/README.md b/vendor/github.com/gorilla/mux/README.md deleted file mode 100644 index 35eea9f1..00000000 --- a/vendor/github.com/gorilla/mux/README.md +++ /dev/null @@ -1,805 +0,0 @@ -# gorilla/mux - -[![GoDoc](https://godoc.org/github.com/gorilla/mux?status.svg)](https://godoc.org/github.com/gorilla/mux) -[![CircleCI](https://circleci.com/gh/gorilla/mux.svg?style=svg)](https://circleci.com/gh/gorilla/mux) -[![Sourcegraph](https://sourcegraph.com/github.com/gorilla/mux/-/badge.svg)](https://sourcegraph.com/github.com/gorilla/mux?badge) - -![Gorilla Logo](https://cloud-cdn.questionable.services/gorilla-icon-64.png) - -https://www.gorillatoolkit.org/pkg/mux - -Package `gorilla/mux` implements a request router and dispatcher for matching incoming requests to -their respective handler. - -The name mux stands for "HTTP request multiplexer". Like the standard `http.ServeMux`, `mux.Router` matches incoming requests against a list of registered routes and calls a handler for the route that matches the URL or other conditions. The main features are: - -* It implements the `http.Handler` interface so it is compatible with the standard `http.ServeMux`. -* Requests can be matched based on URL host, path, path prefix, schemes, header and query values, HTTP methods or using custom matchers. -* URL hosts, paths and query values can have variables with an optional regular expression. -* Registered URLs can be built, or "reversed", which helps maintaining references to resources. -* Routes can be used as subrouters: nested routes are only tested if the parent route matches. This is useful to define groups of routes that share common conditions like a host, a path prefix or other repeated attributes. As a bonus, this optimizes request matching. 
- ---- - -* [Install](#install) -* [Examples](#examples) -* [Matching Routes](#matching-routes) -* [Static Files](#static-files) -* [Serving Single Page Applications](#serving-single-page-applications) (e.g. React, Vue, Ember.js, etc.) -* [Registered URLs](#registered-urls) -* [Walking Routes](#walking-routes) -* [Graceful Shutdown](#graceful-shutdown) -* [Middleware](#middleware) -* [Handling CORS Requests](#handling-cors-requests) -* [Testing Handlers](#testing-handlers) -* [Full Example](#full-example) - ---- - -## Install - -With a [correctly configured](https://golang.org/doc/install#testing) Go toolchain: - -```sh -go get -u github.com/gorilla/mux -``` - -## Examples - -Let's start registering a couple of URL paths and handlers: - -```go -func main() { - r := mux.NewRouter() - r.HandleFunc("/", HomeHandler) - r.HandleFunc("/products", ProductsHandler) - r.HandleFunc("/articles", ArticlesHandler) - http.Handle("/", r) -} -``` - -Here we register three routes mapping URL paths to handlers. This is equivalent to how `http.HandleFunc()` works: if an incoming request URL matches one of the paths, the corresponding handler is called passing (`http.ResponseWriter`, `*http.Request`) as parameters. - -Paths can have variables. They are defined using the format `{name}` or `{name:pattern}`. If a regular expression pattern is not defined, the matched variable will be anything until the next slash. For example: - -```go -r := mux.NewRouter() -r.HandleFunc("/products/{key}", ProductHandler) -r.HandleFunc("/articles/{category}/", ArticlesCategoryHandler) -r.HandleFunc("/articles/{category}/{id:[0-9]+}", ArticleHandler) -``` - -The names are used to create a map of route variables which can be retrieved calling `mux.Vars()`: - -```go -func ArticlesCategoryHandler(w http.ResponseWriter, r *http.Request) { - vars := mux.Vars(r) - w.WriteHeader(http.StatusOK) - fmt.Fprintf(w, "Category: %v\n", vars["category"]) -} -``` - -And this is all you need to know about the basic usage. 
More advanced options are explained below. - -### Matching Routes - -Routes can also be restricted to a domain or subdomain. Just define a host pattern to be matched. They can also have variables: - -```go -r := mux.NewRouter() -// Only matches if domain is "www.example.com". -r.Host("www.example.com") -// Matches a dynamic subdomain. -r.Host("{subdomain:[a-z]+}.example.com") -``` - -There are several other matchers that can be added. To match path prefixes: - -```go -r.PathPrefix("/products/") -``` - -...or HTTP methods: - -```go -r.Methods("GET", "POST") -``` - -...or URL schemes: - -```go -r.Schemes("https") -``` - -...or header values: - -```go -r.Headers("X-Requested-With", "XMLHttpRequest") -``` - -...or query values: - -```go -r.Queries("key", "value") -``` - -...or to use a custom matcher function: - -```go -r.MatcherFunc(func(r *http.Request, rm *RouteMatch) bool { - return r.ProtoMajor == 0 -}) -``` - -...and finally, it is possible to combine several matchers in a single route: - -```go -r.HandleFunc("/products", ProductsHandler). - Host("www.example.com"). - Methods("GET"). - Schemes("http") -``` - -Routes are tested in the order they were added to the router. If two routes match, the first one wins: - -```go -r := mux.NewRouter() -r.HandleFunc("/specific", specificHandler) -r.PathPrefix("/").Handler(catchAllHandler) -``` - -Setting the same matching conditions again and again can be boring, so we have a way to group several routes that share the same requirements. We call it "subrouting". - -For example, let's say we have several URLs that should only match when the host is `www.example.com`. 
Create a route for that host and get a "subrouter" from it: - -```go -r := mux.NewRouter() -s := r.Host("www.example.com").Subrouter() -``` - -Then register routes in the subrouter: - -```go -s.HandleFunc("/products/", ProductsHandler) -s.HandleFunc("/products/{key}", ProductHandler) -s.HandleFunc("/articles/{category}/{id:[0-9]+}", ArticleHandler) -``` - -The three URL paths we registered above will only be tested if the domain is `www.example.com`, because the subrouter is tested first. This is not only convenient, but also optimizes request matching. You can create subrouters combining any attribute matchers accepted by a route. - -Subrouters can be used to create domain or path "namespaces": you define subrouters in a central place and then parts of the app can register its paths relatively to a given subrouter. - -There's one more thing about subroutes. When a subrouter has a path prefix, the inner routes use it as base for their paths: - -```go -r := mux.NewRouter() -s := r.PathPrefix("/products").Subrouter() -// "/products/" -s.HandleFunc("/", ProductsHandler) -// "/products/{key}/" -s.HandleFunc("/{key}/", ProductHandler) -// "/products/{key}/details" -s.HandleFunc("/{key}/details", ProductDetailsHandler) -``` - - -### Static Files - -Note that the path provided to `PathPrefix()` represents a "wildcard": calling -`PathPrefix("/static/").Handler(...)` means that the handler will be passed any -request that matches "/static/\*". This makes it easy to serve static files with mux: - -```go -func main() { - var dir string - - flag.StringVar(&dir, "dir", ".", "the directory to serve files from. Defaults to the current dir") - flag.Parse() - r := mux.NewRouter() - - // This will serve files under http://localhost:8000/static/ - r.PathPrefix("/static/").Handler(http.StripPrefix("/static/", http.FileServer(http.Dir(dir)))) - - srv := &http.Server{ - Handler: r, - Addr: "127.0.0.1:8000", - // Good practice: enforce timeouts for servers you create! 
- WriteTimeout: 15 * time.Second, - ReadTimeout: 15 * time.Second, - } - - log.Fatal(srv.ListenAndServe()) -} -``` - -### Serving Single Page Applications - -Most of the time it makes sense to serve your SPA on a separate web server from your API, -but sometimes it's desirable to serve them both from one place. It's possible to write a simple -handler for serving your SPA (for use with React Router's [BrowserRouter](https://reacttraining.com/react-router/web/api/BrowserRouter) for example), and leverage -mux's powerful routing for your API endpoints. - -```go -package main - -import ( - "encoding/json" - "log" - "net/http" - "os" - "path/filepath" - "time" - - "github.com/gorilla/mux" -) - -// spaHandler implements the http.Handler interface, so we can use it -// to respond to HTTP requests. The path to the static directory and -// path to the index file within that static directory are used to -// serve the SPA in the given static directory. -type spaHandler struct { - staticPath string - indexPath string -} - -// ServeHTTP inspects the URL path to locate a file within the static dir -// on the SPA handler. If a file is found, it will be served. If not, the -// file located at the index path on the SPA handler will be served. This -// is suitable behavior for serving an SPA (single page application). 
-func (h spaHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { - // get the absolute path to prevent directory traversal - path, err := filepath.Abs(r.URL.Path) - if err != nil { - // if we failed to get the absolute path respond with a 400 bad request - // and stop - http.Error(w, err.Error(), http.StatusBadRequest) - return - } - - // prepend the path with the path to the static directory - path = filepath.Join(h.staticPath, path) - - // check whether a file exists at the given path - _, err = os.Stat(path) - if os.IsNotExist(err) { - // file does not exist, serve index.html - http.ServeFile(w, r, filepath.Join(h.staticPath, h.indexPath)) - return - } else if err != nil { - // if we got an error (that wasn't that the file doesn't exist) stating the - // file, return a 500 internal server error and stop - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - - // otherwise, use http.FileServer to serve the static dir - http.FileServer(http.Dir(h.staticPath)).ServeHTTP(w, r) -} - -func main() { - router := mux.NewRouter() - - router.HandleFunc("/api/health", func(w http.ResponseWriter, r *http.Request) { - // an example API handler - json.NewEncoder(w).Encode(map[string]bool{"ok": true}) - }) - - spa := spaHandler{staticPath: "build", indexPath: "index.html"} - router.PathPrefix("/").Handler(spa) - - srv := &http.Server{ - Handler: router, - Addr: "127.0.0.1:8000", - // Good practice: enforce timeouts for servers you create! - WriteTimeout: 15 * time.Second, - ReadTimeout: 15 * time.Second, - } - - log.Fatal(srv.ListenAndServe()) -} -``` - -### Registered URLs - -Now let's see how to build registered URLs. - -Routes can be named. All routes that define a name can have their URLs built, or "reversed". We define a name calling `Name()` on a route. For example: - -```go -r := mux.NewRouter() -r.HandleFunc("/articles/{category}/{id:[0-9]+}", ArticleHandler). 
- Name("article") -``` - -To build a URL, get the route and call the `URL()` method, passing a sequence of key/value pairs for the route variables. For the previous route, we would do: - -```go -url, err := r.Get("article").URL("category", "technology", "id", "42") -``` - -...and the result will be a `url.URL` with the following path: - -``` -"/articles/technology/42" -``` - -This also works for host and query value variables: - -```go -r := mux.NewRouter() -r.Host("{subdomain}.example.com"). - Path("/articles/{category}/{id:[0-9]+}"). - Queries("filter", "{filter}"). - HandlerFunc(ArticleHandler). - Name("article") - -// url.String() will be "http://news.example.com/articles/technology/42?filter=gorilla" -url, err := r.Get("article").URL("subdomain", "news", - "category", "technology", - "id", "42", - "filter", "gorilla") -``` - -All variables defined in the route are required, and their values must conform to the corresponding patterns. These requirements guarantee that a generated URL will always match a registered route -- the only exception is for explicitly defined "build-only" routes which never match. - -Regex support also exists for matching Headers within a route. For example, we could do: - -```go -r.HeadersRegexp("Content-Type", "application/(text|json)") -``` - -...and the route will match both requests with a Content-Type of `application/json` as well as `application/text` - -There's also a way to build only the URL host or path for a route: use the methods `URLHost()` or `URLPath()` instead. For the previous route, we would do: - -```go -// "http://news.example.com/" -host, err := r.Get("article").URLHost("subdomain", "news") - -// "/articles/technology/42" -path, err := r.Get("article").URLPath("category", "technology", "id", "42") -``` - -And if you use subrouters, host and path defined separately can be built as well: - -```go -r := mux.NewRouter() -s := r.Host("{subdomain}.example.com").Subrouter() -s.Path("/articles/{category}/{id:[0-9]+}"). 
- HandlerFunc(ArticleHandler). - Name("article") - -// "http://news.example.com/articles/technology/42" -url, err := r.Get("article").URL("subdomain", "news", - "category", "technology", - "id", "42") -``` - -### Walking Routes - -The `Walk` function on `mux.Router` can be used to visit all of the routes that are registered on a router. For example, -the following prints all of the registered routes: - -```go -package main - -import ( - "fmt" - "net/http" - "strings" - - "github.com/gorilla/mux" -) - -func handler(w http.ResponseWriter, r *http.Request) { - return -} - -func main() { - r := mux.NewRouter() - r.HandleFunc("/", handler) - r.HandleFunc("/products", handler).Methods("POST") - r.HandleFunc("/articles", handler).Methods("GET") - r.HandleFunc("/articles/{id}", handler).Methods("GET", "PUT") - r.HandleFunc("/authors", handler).Queries("surname", "{surname}") - err := r.Walk(func(route *mux.Route, router *mux.Router, ancestors []*mux.Route) error { - pathTemplate, err := route.GetPathTemplate() - if err == nil { - fmt.Println("ROUTE:", pathTemplate) - } - pathRegexp, err := route.GetPathRegexp() - if err == nil { - fmt.Println("Path regexp:", pathRegexp) - } - queriesTemplates, err := route.GetQueriesTemplates() - if err == nil { - fmt.Println("Queries templates:", strings.Join(queriesTemplates, ",")) - } - queriesRegexps, err := route.GetQueriesRegexp() - if err == nil { - fmt.Println("Queries regexps:", strings.Join(queriesRegexps, ",")) - } - methods, err := route.GetMethods() - if err == nil { - fmt.Println("Methods:", strings.Join(methods, ",")) - } - fmt.Println() - return nil - }) - - if err != nil { - fmt.Println(err) - } - - http.Handle("/", r) -} -``` - -### Graceful Shutdown - -Go 1.8 introduced the ability to [gracefully shutdown](https://golang.org/doc/go1.8#http_shutdown) a `*http.Server`. 
Here's how to do that alongside `mux`: - -```go -package main - -import ( - "context" - "flag" - "log" - "net/http" - "os" - "os/signal" - "time" - - "github.com/gorilla/mux" -) - -func main() { - var wait time.Duration - flag.DurationVar(&wait, "graceful-timeout", time.Second * 15, "the duration for which the server gracefully wait for existing connections to finish - e.g. 15s or 1m") - flag.Parse() - - r := mux.NewRouter() - // Add your routes as needed - - srv := &http.Server{ - Addr: "0.0.0.0:8080", - // Good practice to set timeouts to avoid Slowloris attacks. - WriteTimeout: time.Second * 15, - ReadTimeout: time.Second * 15, - IdleTimeout: time.Second * 60, - Handler: r, // Pass our instance of gorilla/mux in. - } - - // Run our server in a goroutine so that it doesn't block. - go func() { - if err := srv.ListenAndServe(); err != nil { - log.Println(err) - } - }() - - c := make(chan os.Signal, 1) - // We'll accept graceful shutdowns when quit via SIGINT (Ctrl+C) - // SIGKILL, SIGQUIT or SIGTERM (Ctrl+/) will not be caught. - signal.Notify(c, os.Interrupt) - - // Block until we receive our signal. - <-c - - // Create a deadline to wait for. - ctx, cancel := context.WithTimeout(context.Background(), wait) - defer cancel() - // Doesn't block if no connections, but will otherwise wait - // until the timeout deadline. - srv.Shutdown(ctx) - // Optionally, you could run srv.Shutdown in a goroutine and block on - // <-ctx.Done() if your application should wait for other services - // to finalize based on context cancellation. - log.Println("shutting down") - os.Exit(0) -} -``` - -### Middleware - -Mux supports the addition of middlewares to a [Router](https://godoc.org/github.com/gorilla/mux#Router), which are executed in the order they are added if a match is found, including its subrouters. -Middlewares are (typically) small pieces of code which take one request, do something with it, and pass it down to another middleware or the final handler. 
Some common use cases for middleware are request logging, header manipulation, or `ResponseWriter` hijacking. - -Mux middlewares are defined using the de facto standard type: - -```go -type MiddlewareFunc func(http.Handler) http.Handler -``` - -Typically, the returned handler is a closure which does something with the http.ResponseWriter and http.Request passed to it, and then calls the handler passed as parameter to the MiddlewareFunc. This takes advantage of closures being able access variables from the context where they are created, while retaining the signature enforced by the receivers. - -A very basic middleware which logs the URI of the request being handled could be written as: - -```go -func loggingMiddleware(next http.Handler) http.Handler { - return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - // Do stuff here - log.Println(r.RequestURI) - // Call the next handler, which can be another middleware in the chain, or the final handler. - next.ServeHTTP(w, r) - }) -} -``` - -Middlewares can be added to a router using `Router.Use()`: - -```go -r := mux.NewRouter() -r.HandleFunc("/", handler) -r.Use(loggingMiddleware) -``` - -A more complex authentication middleware, which maps session token to users, could be written as: - -```go -// Define our struct -type authenticationMiddleware struct { - tokenUsers map[string]string -} - -// Initialize it somewhere -func (amw *authenticationMiddleware) Populate() { - amw.tokenUsers["00000000"] = "user0" - amw.tokenUsers["aaaaaaaa"] = "userA" - amw.tokenUsers["05f717e5"] = "randomUser" - amw.tokenUsers["deadbeef"] = "user0" -} - -// Middleware function, which will be called for each request -func (amw *authenticationMiddleware) Middleware(next http.Handler) http.Handler { - return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - token := r.Header.Get("X-Session-Token") - - if user, found := amw.tokenUsers[token]; found { - // We found the token in our map - log.Printf("Authenticated 
user %s\n", user) - // Pass down the request to the next middleware (or final handler) - next.ServeHTTP(w, r) - } else { - // Write an error and stop the handler chain - http.Error(w, "Forbidden", http.StatusForbidden) - } - }) -} -``` - -```go -r := mux.NewRouter() -r.HandleFunc("/", handler) - -amw := authenticationMiddleware{} -amw.Populate() - -r.Use(amw.Middleware) -``` - -Note: The handler chain will be stopped if your middleware doesn't call `next.ServeHTTP()` with the corresponding parameters. This can be used to abort a request if the middleware writer wants to. Middlewares _should_ write to `ResponseWriter` if they _are_ going to terminate the request, and they _should not_ write to `ResponseWriter` if they _are not_ going to terminate it. - -### Handling CORS Requests - -[CORSMethodMiddleware](https://godoc.org/github.com/gorilla/mux#CORSMethodMiddleware) intends to make it easier to strictly set the `Access-Control-Allow-Methods` response header. - -* You will still need to use your own CORS handler to set the other CORS headers such as `Access-Control-Allow-Origin` -* The middleware will set the `Access-Control-Allow-Methods` header to all the method matchers (e.g. `r.Methods(http.MethodGet, http.MethodPut, http.MethodOptions)` -> `Access-Control-Allow-Methods: GET,PUT,OPTIONS`) on a route -* If you do not specify any methods, then: -> _Important_: there must be an `OPTIONS` method matcher for the middleware to set the headers. 
- -Here is an example of using `CORSMethodMiddleware` along with a custom `OPTIONS` handler to set all the required CORS headers: - -```go -package main - -import ( - "net/http" - "github.com/gorilla/mux" -) - -func main() { - r := mux.NewRouter() - - // IMPORTANT: you must specify an OPTIONS method matcher for the middleware to set CORS headers - r.HandleFunc("/foo", fooHandler).Methods(http.MethodGet, http.MethodPut, http.MethodPatch, http.MethodOptions) - r.Use(mux.CORSMethodMiddleware(r)) - - http.ListenAndServe(":8080", r) -} - -func fooHandler(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Access-Control-Allow-Origin", "*") - if r.Method == http.MethodOptions { - return - } - - w.Write([]byte("foo")) -} -``` - -And an request to `/foo` using something like: - -```bash -curl localhost:8080/foo -v -``` - -Would look like: - -```bash -* Trying ::1... -* TCP_NODELAY set -* Connected to localhost (::1) port 8080 (#0) -> GET /foo HTTP/1.1 -> Host: localhost:8080 -> User-Agent: curl/7.59.0 -> Accept: */* -> -< HTTP/1.1 200 OK -< Access-Control-Allow-Methods: GET,PUT,PATCH,OPTIONS -< Access-Control-Allow-Origin: * -< Date: Fri, 28 Jun 2019 20:13:30 GMT -< Content-Length: 3 -< Content-Type: text/plain; charset=utf-8 -< -* Connection #0 to host localhost left intact -foo -``` - -### Testing Handlers - -Testing handlers in a Go web application is straightforward, and _mux_ doesn't complicate this any further. Given two files: `endpoints.go` and `endpoints_test.go`, here's how we'd test an application using _mux_. - -First, our simple HTTP handler: - -```go -// endpoints.go -package main - -func HealthCheckHandler(w http.ResponseWriter, r *http.Request) { - // A very simple health check. - w.Header().Set("Content-Type", "application/json") - w.WriteHeader(http.StatusOK) - - // In the future we could report back on the status of our DB, or our cache - // (e.g. Redis) by performing a simple PING, and include them in the response. 
- io.WriteString(w, `{"alive": true}`) -} - -func main() { - r := mux.NewRouter() - r.HandleFunc("/health", HealthCheckHandler) - - log.Fatal(http.ListenAndServe("localhost:8080", r)) -} -``` - -Our test code: - -```go -// endpoints_test.go -package main - -import ( - "net/http" - "net/http/httptest" - "testing" -) - -func TestHealthCheckHandler(t *testing.T) { - // Create a request to pass to our handler. We don't have any query parameters for now, so we'll - // pass 'nil' as the third parameter. - req, err := http.NewRequest("GET", "/health", nil) - if err != nil { - t.Fatal(err) - } - - // We create a ResponseRecorder (which satisfies http.ResponseWriter) to record the response. - rr := httptest.NewRecorder() - handler := http.HandlerFunc(HealthCheckHandler) - - // Our handlers satisfy http.Handler, so we can call their ServeHTTP method - // directly and pass in our Request and ResponseRecorder. - handler.ServeHTTP(rr, req) - - // Check the status code is what we expect. - if status := rr.Code; status != http.StatusOK { - t.Errorf("handler returned wrong status code: got %v want %v", - status, http.StatusOK) - } - - // Check the response body is what we expect. - expected := `{"alive": true}` - if rr.Body.String() != expected { - t.Errorf("handler returned unexpected body: got %v want %v", - rr.Body.String(), expected) - } -} -``` - -In the case that our routes have [variables](#examples), we can pass those in the request. We could write -[table-driven tests](https://dave.cheney.net/2013/06/09/writing-table-driven-tests-in-go) to test multiple -possible route variables as needed. 
- -```go -// endpoints.go -func main() { - r := mux.NewRouter() - // A route with a route variable: - r.HandleFunc("/metrics/{type}", MetricsHandler) - - log.Fatal(http.ListenAndServe("localhost:8080", r)) -} -``` - -Our test file, with a table-driven test of `routeVariables`: - -```go -// endpoints_test.go -func TestMetricsHandler(t *testing.T) { - tt := []struct{ - routeVariable string - shouldPass bool - }{ - {"goroutines", true}, - {"heap", true}, - {"counters", true}, - {"queries", true}, - {"adhadaeqm3k", false}, - } - - for _, tc := range tt { - path := fmt.Sprintf("/metrics/%s", tc.routeVariable) - req, err := http.NewRequest("GET", path, nil) - if err != nil { - t.Fatal(err) - } - - rr := httptest.NewRecorder() - - // Need to create a router that we can pass the request through so that the vars will be added to the context - router := mux.NewRouter() - router.HandleFunc("/metrics/{type}", MetricsHandler) - router.ServeHTTP(rr, req) - - // In this case, our MetricsHandler returns a non-200 response - // for a route variable it doesn't know about. - if rr.Code == http.StatusOK && !tc.shouldPass { - t.Errorf("handler should have failed on routeVariable %s: got %v want %v", - tc.routeVariable, rr.Code, http.StatusOK) - } - } -} -``` - -## Full Example - -Here's a complete, runnable example of a small `mux` based server: - -```go -package main - -import ( - "net/http" - "log" - "github.com/gorilla/mux" -) - -func YourHandler(w http.ResponseWriter, r *http.Request) { - w.Write([]byte("Gorilla!\n")) -} - -func main() { - r := mux.NewRouter() - // Routes consist of a path and a handler function. - r.HandleFunc("/", YourHandler) - - // Bind to a port and pass our router in - log.Fatal(http.ListenAndServe(":8000", r)) -} -``` - -## License - -BSD licensed. See the LICENSE file for details. 
diff --git a/vendor/github.com/gorilla/mux/doc.go b/vendor/github.com/gorilla/mux/doc.go deleted file mode 100644 index bd5a38b5..00000000 --- a/vendor/github.com/gorilla/mux/doc.go +++ /dev/null @@ -1,306 +0,0 @@ -// Copyright 2012 The Gorilla Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -/* -Package mux implements a request router and dispatcher. - -The name mux stands for "HTTP request multiplexer". Like the standard -http.ServeMux, mux.Router matches incoming requests against a list of -registered routes and calls a handler for the route that matches the URL -or other conditions. The main features are: - - * Requests can be matched based on URL host, path, path prefix, schemes, - header and query values, HTTP methods or using custom matchers. - * URL hosts, paths and query values can have variables with an optional - regular expression. - * Registered URLs can be built, or "reversed", which helps maintaining - references to resources. - * Routes can be used as subrouters: nested routes are only tested if the - parent route matches. This is useful to define groups of routes that - share common conditions like a host, a path prefix or other repeated - attributes. As a bonus, this optimizes request matching. - * It implements the http.Handler interface so it is compatible with the - standard http.ServeMux. - -Let's start registering a couple of URL paths and handlers: - - func main() { - r := mux.NewRouter() - r.HandleFunc("/", HomeHandler) - r.HandleFunc("/products", ProductsHandler) - r.HandleFunc("/articles", ArticlesHandler) - http.Handle("/", r) - } - -Here we register three routes mapping URL paths to handlers. This is -equivalent to how http.HandleFunc() works: if an incoming request URL matches -one of the paths, the corresponding handler is called passing -(http.ResponseWriter, *http.Request) as parameters. - -Paths can have variables. 
They are defined using the format {name} or -{name:pattern}. If a regular expression pattern is not defined, the matched -variable will be anything until the next slash. For example: - - r := mux.NewRouter() - r.HandleFunc("/products/{key}", ProductHandler) - r.HandleFunc("/articles/{category}/", ArticlesCategoryHandler) - r.HandleFunc("/articles/{category}/{id:[0-9]+}", ArticleHandler) - -Groups can be used inside patterns, as long as they are non-capturing (?:re). For example: - - r.HandleFunc("/articles/{category}/{sort:(?:asc|desc|new)}", ArticlesCategoryHandler) - -The names are used to create a map of route variables which can be retrieved -calling mux.Vars(): - - vars := mux.Vars(request) - category := vars["category"] - -Note that if any capturing groups are present, mux will panic() during parsing. To prevent -this, convert any capturing groups to non-capturing, e.g. change "/{sort:(asc|desc)}" to -"/{sort:(?:asc|desc)}". This is a change from prior versions which behaved unpredictably -when capturing groups were present. - -And this is all you need to know about the basic usage. More advanced options -are explained below. - -Routes can also be restricted to a domain or subdomain. Just define a host -pattern to be matched. They can also have variables: - - r := mux.NewRouter() - // Only matches if domain is "www.example.com". - r.Host("www.example.com") - // Matches a dynamic subdomain. - r.Host("{subdomain:[a-z]+}.domain.com") - -There are several other matchers that can be added. 
To match path prefixes: - - r.PathPrefix("/products/") - -...or HTTP methods: - - r.Methods("GET", "POST") - -...or URL schemes: - - r.Schemes("https") - -...or header values: - - r.Headers("X-Requested-With", "XMLHttpRequest") - -...or query values: - - r.Queries("key", "value") - -...or to use a custom matcher function: - - r.MatcherFunc(func(r *http.Request, rm *RouteMatch) bool { - return r.ProtoMajor == 0 - }) - -...and finally, it is possible to combine several matchers in a single route: - - r.HandleFunc("/products", ProductsHandler). - Host("www.example.com"). - Methods("GET"). - Schemes("http") - -Setting the same matching conditions again and again can be boring, so we have -a way to group several routes that share the same requirements. -We call it "subrouting". - -For example, let's say we have several URLs that should only match when the -host is "www.example.com". Create a route for that host and get a "subrouter" -from it: - - r := mux.NewRouter() - s := r.Host("www.example.com").Subrouter() - -Then register routes in the subrouter: - - s.HandleFunc("/products/", ProductsHandler) - s.HandleFunc("/products/{key}", ProductHandler) - s.HandleFunc("/articles/{category}/{id:[0-9]+}"), ArticleHandler) - -The three URL paths we registered above will only be tested if the domain is -"www.example.com", because the subrouter is tested first. This is not -only convenient, but also optimizes request matching. You can create -subrouters combining any attribute matchers accepted by a route. - -Subrouters can be used to create domain or path "namespaces": you define -subrouters in a central place and then parts of the app can register its -paths relatively to a given subrouter. - -There's one more thing about subroutes. 
When a subrouter has a path prefix, -the inner routes use it as base for their paths: - - r := mux.NewRouter() - s := r.PathPrefix("/products").Subrouter() - // "/products/" - s.HandleFunc("/", ProductsHandler) - // "/products/{key}/" - s.HandleFunc("/{key}/", ProductHandler) - // "/products/{key}/details" - s.HandleFunc("/{key}/details", ProductDetailsHandler) - -Note that the path provided to PathPrefix() represents a "wildcard": calling -PathPrefix("/static/").Handler(...) means that the handler will be passed any -request that matches "/static/*". This makes it easy to serve static files with mux: - - func main() { - var dir string - - flag.StringVar(&dir, "dir", ".", "the directory to serve files from. Defaults to the current dir") - flag.Parse() - r := mux.NewRouter() - - // This will serve files under http://localhost:8000/static/ - r.PathPrefix("/static/").Handler(http.StripPrefix("/static/", http.FileServer(http.Dir(dir)))) - - srv := &http.Server{ - Handler: r, - Addr: "127.0.0.1:8000", - // Good practice: enforce timeouts for servers you create! - WriteTimeout: 15 * time.Second, - ReadTimeout: 15 * time.Second, - } - - log.Fatal(srv.ListenAndServe()) - } - -Now let's see how to build registered URLs. - -Routes can be named. All routes that define a name can have their URLs built, -or "reversed". We define a name calling Name() on a route. For example: - - r := mux.NewRouter() - r.HandleFunc("/articles/{category}/{id:[0-9]+}", ArticleHandler). - Name("article") - -To build a URL, get the route and call the URL() method, passing a sequence of -key/value pairs for the route variables. For the previous route, we would do: - - url, err := r.Get("article").URL("category", "technology", "id", "42") - -...and the result will be a url.URL with the following path: - - "/articles/technology/42" - -This also works for host and query value variables: - - r := mux.NewRouter() - r.Host("{subdomain}.domain.com"). - Path("/articles/{category}/{id:[0-9]+}"). 
- Queries("filter", "{filter}"). - HandlerFunc(ArticleHandler). - Name("article") - - // url.String() will be "http://news.domain.com/articles/technology/42?filter=gorilla" - url, err := r.Get("article").URL("subdomain", "news", - "category", "technology", - "id", "42", - "filter", "gorilla") - -All variables defined in the route are required, and their values must -conform to the corresponding patterns. These requirements guarantee that a -generated URL will always match a registered route -- the only exception is -for explicitly defined "build-only" routes which never match. - -Regex support also exists for matching Headers within a route. For example, we could do: - - r.HeadersRegexp("Content-Type", "application/(text|json)") - -...and the route will match both requests with a Content-Type of `application/json` as well as -`application/text` - -There's also a way to build only the URL host or path for a route: -use the methods URLHost() or URLPath() instead. For the previous route, -we would do: - - // "http://news.domain.com/" - host, err := r.Get("article").URLHost("subdomain", "news") - - // "/articles/technology/42" - path, err := r.Get("article").URLPath("category", "technology", "id", "42") - -And if you use subrouters, host and path defined separately can be built -as well: - - r := mux.NewRouter() - s := r.Host("{subdomain}.domain.com").Subrouter() - s.Path("/articles/{category}/{id:[0-9]+}"). - HandlerFunc(ArticleHandler). - Name("article") - - // "http://news.domain.com/articles/technology/42" - url, err := r.Get("article").URL("subdomain", "news", - "category", "technology", - "id", "42") - -Mux supports the addition of middlewares to a Router, which are executed in the order they are added if a match is found, including its subrouters. Middlewares are (typically) small pieces of code which take one request, do something with it, and pass it down to another middleware or the final handler. 
Some common use cases for middleware are request logging, header manipulation, or ResponseWriter hijacking. - - type MiddlewareFunc func(http.Handler) http.Handler - -Typically, the returned handler is a closure which does something with the http.ResponseWriter and http.Request passed to it, and then calls the handler passed as parameter to the MiddlewareFunc (closures can access variables from the context where they are created). - -A very basic middleware which logs the URI of the request being handled could be written as: - - func simpleMw(next http.Handler) http.Handler { - return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - // Do stuff here - log.Println(r.RequestURI) - // Call the next handler, which can be another middleware in the chain, or the final handler. - next.ServeHTTP(w, r) - }) - } - -Middlewares can be added to a router using `Router.Use()`: - - r := mux.NewRouter() - r.HandleFunc("/", handler) - r.Use(simpleMw) - -A more complex authentication middleware, which maps session token to users, could be written as: - - // Define our struct - type authenticationMiddleware struct { - tokenUsers map[string]string - } - - // Initialize it somewhere - func (amw *authenticationMiddleware) Populate() { - amw.tokenUsers["00000000"] = "user0" - amw.tokenUsers["aaaaaaaa"] = "userA" - amw.tokenUsers["05f717e5"] = "randomUser" - amw.tokenUsers["deadbeef"] = "user0" - } - - // Middleware function, which will be called for each request - func (amw *authenticationMiddleware) Middleware(next http.Handler) http.Handler { - return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - token := r.Header.Get("X-Session-Token") - - if user, found := amw.tokenUsers[token]; found { - // We found the token in our map - log.Printf("Authenticated user %s\n", user) - next.ServeHTTP(w, r) - } else { - http.Error(w, "Forbidden", http.StatusForbidden) - } - }) - } - - r := mux.NewRouter() - r.HandleFunc("/", handler) - - amw := 
authenticationMiddleware{tokenUsers: make(map[string]string)} - amw.Populate() - - r.Use(amw.Middleware) - -Note: The handler chain will be stopped if your middleware doesn't call `next.ServeHTTP()` with the corresponding parameters. This can be used to abort a request if the middleware writer wants to. - -*/ -package mux diff --git a/vendor/github.com/gorilla/mux/go.mod b/vendor/github.com/gorilla/mux/go.mod deleted file mode 100644 index df170a39..00000000 --- a/vendor/github.com/gorilla/mux/go.mod +++ /dev/null @@ -1,3 +0,0 @@ -module github.com/gorilla/mux - -go 1.12 diff --git a/vendor/github.com/gorilla/mux/middleware.go b/vendor/github.com/gorilla/mux/middleware.go deleted file mode 100644 index cb51c565..00000000 --- a/vendor/github.com/gorilla/mux/middleware.go +++ /dev/null @@ -1,74 +0,0 @@ -package mux - -import ( - "net/http" - "strings" -) - -// MiddlewareFunc is a function which receives an http.Handler and returns another http.Handler. -// Typically, the returned handler is a closure which does something with the http.ResponseWriter and http.Request passed -// to it, and then calls the handler passed as parameter to the MiddlewareFunc. -type MiddlewareFunc func(http.Handler) http.Handler - -// middleware interface is anything which implements a MiddlewareFunc named Middleware. -type middleware interface { - Middleware(handler http.Handler) http.Handler -} - -// Middleware allows MiddlewareFunc to implement the middleware interface. -func (mw MiddlewareFunc) Middleware(handler http.Handler) http.Handler { - return mw(handler) -} - -// Use appends a MiddlewareFunc to the chain. Middleware can be used to intercept or otherwise modify requests and/or responses, and are executed in the order that they are applied to the Router. -func (r *Router) Use(mwf ...MiddlewareFunc) { - for _, fn := range mwf { - r.middlewares = append(r.middlewares, fn) - } -} - -// useInterface appends a middleware to the chain. 
Middleware can be used to intercept or otherwise modify requests and/or responses, and are executed in the order that they are applied to the Router. -func (r *Router) useInterface(mw middleware) { - r.middlewares = append(r.middlewares, mw) -} - -// CORSMethodMiddleware automatically sets the Access-Control-Allow-Methods response header -// on requests for routes that have an OPTIONS method matcher to all the method matchers on -// the route. Routes that do not explicitly handle OPTIONS requests will not be processed -// by the middleware. See examples for usage. -func CORSMethodMiddleware(r *Router) MiddlewareFunc { - return func(next http.Handler) http.Handler { - return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { - allMethods, err := getAllMethodsForRoute(r, req) - if err == nil { - for _, v := range allMethods { - if v == http.MethodOptions { - w.Header().Set("Access-Control-Allow-Methods", strings.Join(allMethods, ",")) - } - } - } - - next.ServeHTTP(w, req) - }) - } -} - -// getAllMethodsForRoute returns all the methods from method matchers matching a given -// request. -func getAllMethodsForRoute(r *Router, req *http.Request) ([]string, error) { - var allMethods []string - - for _, route := range r.routes { - var match RouteMatch - if route.Match(req, &match) || match.MatchErr == ErrMethodMismatch { - methods, err := route.GetMethods() - if err != nil { - return nil, err - } - - allMethods = append(allMethods, methods...) - } - } - - return allMethods, nil -} diff --git a/vendor/github.com/gorilla/mux/mux.go b/vendor/github.com/gorilla/mux/mux.go deleted file mode 100644 index c9ba6470..00000000 --- a/vendor/github.com/gorilla/mux/mux.go +++ /dev/null @@ -1,607 +0,0 @@ -// Copyright 2012 The Gorilla Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -package mux - -import ( - "context" - "errors" - "fmt" - "net/http" - "path" - "regexp" -) - -var ( - // ErrMethodMismatch is returned when the method in the request does not match - // the method defined against the route. - ErrMethodMismatch = errors.New("method is not allowed") - // ErrNotFound is returned when no route match is found. - ErrNotFound = errors.New("no matching route was found") -) - -// NewRouter returns a new router instance. -func NewRouter() *Router { - return &Router{namedRoutes: make(map[string]*Route)} -} - -// Router registers routes to be matched and dispatches a handler. -// -// It implements the http.Handler interface, so it can be registered to serve -// requests: -// -// var router = mux.NewRouter() -// -// func main() { -// http.Handle("/", router) -// } -// -// Or, for Google App Engine, register it in a init() function: -// -// func init() { -// http.Handle("/", router) -// } -// -// This will send all incoming requests to the router. -type Router struct { - // Configurable Handler to be used when no route matches. - NotFoundHandler http.Handler - - // Configurable Handler to be used when the request method does not match the route. - MethodNotAllowedHandler http.Handler - - // Routes to be matched, in order. - routes []*Route - - // Routes by name for URL building. - namedRoutes map[string]*Route - - // If true, do not clear the request context after handling the request. - // - // Deprecated: No effect, since the context is stored on the request itself. - KeepContext bool - - // Slice of middlewares to be called after a match is found - middlewares []middleware - - // configuration shared with `Route` - routeConf -} - -// common route configuration shared between `Router` and `Route` -type routeConf struct { - // If true, "/path/foo%2Fbar/to" will match the path "/path/{var}/to" - useEncodedPath bool - - // If true, when the path pattern is "/path/", accessing "/path" will - // redirect to the former and vice versa. 
- strictSlash bool - - // If true, when the path pattern is "/path//to", accessing "/path//to" - // will not redirect - skipClean bool - - // Manager for the variables from host and path. - regexp routeRegexpGroup - - // List of matchers. - matchers []matcher - - // The scheme used when building URLs. - buildScheme string - - buildVarsFunc BuildVarsFunc -} - -// returns an effective deep copy of `routeConf` -func copyRouteConf(r routeConf) routeConf { - c := r - - if r.regexp.path != nil { - c.regexp.path = copyRouteRegexp(r.regexp.path) - } - - if r.regexp.host != nil { - c.regexp.host = copyRouteRegexp(r.regexp.host) - } - - c.regexp.queries = make([]*routeRegexp, 0, len(r.regexp.queries)) - for _, q := range r.regexp.queries { - c.regexp.queries = append(c.regexp.queries, copyRouteRegexp(q)) - } - - c.matchers = make([]matcher, len(r.matchers)) - copy(c.matchers, r.matchers) - - return c -} - -func copyRouteRegexp(r *routeRegexp) *routeRegexp { - c := *r - return &c -} - -// Match attempts to match the given request against the router's registered routes. -// -// If the request matches a route of this router or one of its subrouters the Route, -// Handler, and Vars fields of the the match argument are filled and this function -// returns true. -// -// If the request does not match any of this router's or its subrouters' routes -// then this function returns false. If available, a reason for the match failure -// will be filled in the match argument's MatchErr field. If the match failure type -// (eg: not found) has a registered handler, the handler is assigned to the Handler -// field of the match argument. 
-func (r *Router) Match(req *http.Request, match *RouteMatch) bool { - for _, route := range r.routes { - if route.Match(req, match) { - // Build middleware chain if no error was found - if match.MatchErr == nil { - for i := len(r.middlewares) - 1; i >= 0; i-- { - match.Handler = r.middlewares[i].Middleware(match.Handler) - } - } - return true - } - } - - if match.MatchErr == ErrMethodMismatch { - if r.MethodNotAllowedHandler != nil { - match.Handler = r.MethodNotAllowedHandler - return true - } - - return false - } - - // Closest match for a router (includes sub-routers) - if r.NotFoundHandler != nil { - match.Handler = r.NotFoundHandler - match.MatchErr = ErrNotFound - return true - } - - match.MatchErr = ErrNotFound - return false -} - -// ServeHTTP dispatches the handler registered in the matched route. -// -// When there is a match, the route variables can be retrieved calling -// mux.Vars(request). -func (r *Router) ServeHTTP(w http.ResponseWriter, req *http.Request) { - if !r.skipClean { - path := req.URL.Path - if r.useEncodedPath { - path = req.URL.EscapedPath() - } - // Clean path to canonical form and redirect. - if p := cleanPath(path); p != path { - - // Added 3 lines (Philip Schlump) - It was dropping the query string and #whatever from query. - // This matches with fix in go 1.2 r.c. 4 for same problem. 
Go Issue: - // http://code.google.com/p/go/issues/detail?id=5252 - url := *req.URL - url.Path = p - p = url.String() - - w.Header().Set("Location", p) - w.WriteHeader(http.StatusMovedPermanently) - return - } - } - var match RouteMatch - var handler http.Handler - if r.Match(req, &match) { - handler = match.Handler - req = requestWithVars(req, match.Vars) - req = requestWithRoute(req, match.Route) - } - - if handler == nil && match.MatchErr == ErrMethodMismatch { - handler = methodNotAllowedHandler() - } - - if handler == nil { - handler = http.NotFoundHandler() - } - - handler.ServeHTTP(w, req) -} - -// Get returns a route registered with the given name. -func (r *Router) Get(name string) *Route { - return r.namedRoutes[name] -} - -// GetRoute returns a route registered with the given name. This method -// was renamed to Get() and remains here for backwards compatibility. -func (r *Router) GetRoute(name string) *Route { - return r.namedRoutes[name] -} - -// StrictSlash defines the trailing slash behavior for new routes. The initial -// value is false. -// -// When true, if the route path is "/path/", accessing "/path" will perform a redirect -// to the former and vice versa. In other words, your application will always -// see the path as specified in the route. -// -// When false, if the route path is "/path", accessing "/path/" will not match -// this route and vice versa. -// -// The re-direct is a HTTP 301 (Moved Permanently). Note that when this is set for -// routes with a non-idempotent method (e.g. POST, PUT), the subsequent re-directed -// request will be made as a GET by most clients. Use middleware or client settings -// to modify this behaviour as needed. -// -// Special case: when a route sets a path prefix using the PathPrefix() method, -// strict slash is ignored for that route because the redirect behavior can't -// be determined from a prefix alone. However, any subrouters created from that -// route inherit the original StrictSlash setting. 
-func (r *Router) StrictSlash(value bool) *Router { - r.strictSlash = value - return r -} - -// SkipClean defines the path cleaning behaviour for new routes. The initial -// value is false. Users should be careful about which routes are not cleaned -// -// When true, if the route path is "/path//to", it will remain with the double -// slash. This is helpful if you have a route like: /fetch/http://xkcd.com/534/ -// -// When false, the path will be cleaned, so /fetch/http://xkcd.com/534/ will -// become /fetch/http/xkcd.com/534 -func (r *Router) SkipClean(value bool) *Router { - r.skipClean = value - return r -} - -// UseEncodedPath tells the router to match the encoded original path -// to the routes. -// For eg. "/path/foo%2Fbar/to" will match the path "/path/{var}/to". -// -// If not called, the router will match the unencoded path to the routes. -// For eg. "/path/foo%2Fbar/to" will match the path "/path/foo/bar/to" -func (r *Router) UseEncodedPath() *Router { - r.useEncodedPath = true - return r -} - -// ---------------------------------------------------------------------------- -// Route factories -// ---------------------------------------------------------------------------- - -// NewRoute registers an empty route. -func (r *Router) NewRoute() *Route { - // initialize a route with a copy of the parent router's configuration - route := &Route{routeConf: copyRouteConf(r.routeConf), namedRoutes: r.namedRoutes} - r.routes = append(r.routes, route) - return route -} - -// Name registers a new route with a name. -// See Route.Name(). -func (r *Router) Name(name string) *Route { - return r.NewRoute().Name(name) -} - -// Handle registers a new route with a matcher for the URL path. -// See Route.Path() and Route.Handler(). -func (r *Router) Handle(path string, handler http.Handler) *Route { - return r.NewRoute().Path(path).Handler(handler) -} - -// HandleFunc registers a new route with a matcher for the URL path. -// See Route.Path() and Route.HandlerFunc(). 
-func (r *Router) HandleFunc(path string, f func(http.ResponseWriter, - *http.Request)) *Route { - return r.NewRoute().Path(path).HandlerFunc(f) -} - -// Headers registers a new route with a matcher for request header values. -// See Route.Headers(). -func (r *Router) Headers(pairs ...string) *Route { - return r.NewRoute().Headers(pairs...) -} - -// Host registers a new route with a matcher for the URL host. -// See Route.Host(). -func (r *Router) Host(tpl string) *Route { - return r.NewRoute().Host(tpl) -} - -// MatcherFunc registers a new route with a custom matcher function. -// See Route.MatcherFunc(). -func (r *Router) MatcherFunc(f MatcherFunc) *Route { - return r.NewRoute().MatcherFunc(f) -} - -// Methods registers a new route with a matcher for HTTP methods. -// See Route.Methods(). -func (r *Router) Methods(methods ...string) *Route { - return r.NewRoute().Methods(methods...) -} - -// Path registers a new route with a matcher for the URL path. -// See Route.Path(). -func (r *Router) Path(tpl string) *Route { - return r.NewRoute().Path(tpl) -} - -// PathPrefix registers a new route with a matcher for the URL path prefix. -// See Route.PathPrefix(). -func (r *Router) PathPrefix(tpl string) *Route { - return r.NewRoute().PathPrefix(tpl) -} - -// Queries registers a new route with a matcher for URL query values. -// See Route.Queries(). -func (r *Router) Queries(pairs ...string) *Route { - return r.NewRoute().Queries(pairs...) -} - -// Schemes registers a new route with a matcher for URL schemes. -// See Route.Schemes(). -func (r *Router) Schemes(schemes ...string) *Route { - return r.NewRoute().Schemes(schemes...) -} - -// BuildVarsFunc registers a new route with a custom function for modifying -// route variables before building a URL. -func (r *Router) BuildVarsFunc(f BuildVarsFunc) *Route { - return r.NewRoute().BuildVarsFunc(f) -} - -// Walk walks the router and all its sub-routers, calling walkFn for each route -// in the tree. 
The routes are walked in the order they were added. Sub-routers -// are explored depth-first. -func (r *Router) Walk(walkFn WalkFunc) error { - return r.walk(walkFn, []*Route{}) -} - -// SkipRouter is used as a return value from WalkFuncs to indicate that the -// router that walk is about to descend down to should be skipped. -var SkipRouter = errors.New("skip this router") - -// WalkFunc is the type of the function called for each route visited by Walk. -// At every invocation, it is given the current route, and the current router, -// and a list of ancestor routes that lead to the current route. -type WalkFunc func(route *Route, router *Router, ancestors []*Route) error - -func (r *Router) walk(walkFn WalkFunc, ancestors []*Route) error { - for _, t := range r.routes { - err := walkFn(t, r, ancestors) - if err == SkipRouter { - continue - } - if err != nil { - return err - } - for _, sr := range t.matchers { - if h, ok := sr.(*Router); ok { - ancestors = append(ancestors, t) - err := h.walk(walkFn, ancestors) - if err != nil { - return err - } - ancestors = ancestors[:len(ancestors)-1] - } - } - if h, ok := t.handler.(*Router); ok { - ancestors = append(ancestors, t) - err := h.walk(walkFn, ancestors) - if err != nil { - return err - } - ancestors = ancestors[:len(ancestors)-1] - } - } - return nil -} - -// ---------------------------------------------------------------------------- -// Context -// ---------------------------------------------------------------------------- - -// RouteMatch stores information about a matched route. -type RouteMatch struct { - Route *Route - Handler http.Handler - Vars map[string]string - - // MatchErr is set to appropriate matching error - // It is set to ErrMethodMismatch if there is a mismatch in - // the request method and route method - MatchErr error -} - -type contextKey int - -const ( - varsKey contextKey = iota - routeKey -) - -// Vars returns the route variables for the current request, if any. 
-func Vars(r *http.Request) map[string]string { - if rv := r.Context().Value(varsKey); rv != nil { - return rv.(map[string]string) - } - return nil -} - -// CurrentRoute returns the matched route for the current request, if any. -// This only works when called inside the handler of the matched route -// because the matched route is stored in the request context which is cleared -// after the handler returns, unless the KeepContext option is set on the -// Router. -func CurrentRoute(r *http.Request) *Route { - if rv := r.Context().Value(routeKey); rv != nil { - return rv.(*Route) - } - return nil -} - -func requestWithVars(r *http.Request, vars map[string]string) *http.Request { - ctx := context.WithValue(r.Context(), varsKey, vars) - return r.WithContext(ctx) -} - -func requestWithRoute(r *http.Request, route *Route) *http.Request { - ctx := context.WithValue(r.Context(), routeKey, route) - return r.WithContext(ctx) -} - -// ---------------------------------------------------------------------------- -// Helpers -// ---------------------------------------------------------------------------- - -// cleanPath returns the canonical path for p, eliminating . and .. elements. -// Borrowed from the net/http package. -func cleanPath(p string) string { - if p == "" { - return "/" - } - if p[0] != '/' { - p = "/" + p - } - np := path.Clean(p) - // path.Clean removes trailing slash except for root; - // put the trailing slash back if necessary. - if p[len(p)-1] == '/' && np != "/" { - np += "/" - } - - return np -} - -// uniqueVars returns an error if two slices contain duplicated strings. -func uniqueVars(s1, s2 []string) error { - for _, v1 := range s1 { - for _, v2 := range s2 { - if v1 == v2 { - return fmt.Errorf("mux: duplicated route variable %q", v2) - } - } - } - return nil -} - -// checkPairs returns the count of strings passed in, and an error if -// the count is not an even number. 
-func checkPairs(pairs ...string) (int, error) { - length := len(pairs) - if length%2 != 0 { - return length, fmt.Errorf( - "mux: number of parameters must be multiple of 2, got %v", pairs) - } - return length, nil -} - -// mapFromPairsToString converts variadic string parameters to a -// string to string map. -func mapFromPairsToString(pairs ...string) (map[string]string, error) { - length, err := checkPairs(pairs...) - if err != nil { - return nil, err - } - m := make(map[string]string, length/2) - for i := 0; i < length; i += 2 { - m[pairs[i]] = pairs[i+1] - } - return m, nil -} - -// mapFromPairsToRegex converts variadic string parameters to a -// string to regex map. -func mapFromPairsToRegex(pairs ...string) (map[string]*regexp.Regexp, error) { - length, err := checkPairs(pairs...) - if err != nil { - return nil, err - } - m := make(map[string]*regexp.Regexp, length/2) - for i := 0; i < length; i += 2 { - regex, err := regexp.Compile(pairs[i+1]) - if err != nil { - return nil, err - } - m[pairs[i]] = regex - } - return m, nil -} - -// matchInArray returns true if the given string value is in the array. -func matchInArray(arr []string, value string) bool { - for _, v := range arr { - if v == value { - return true - } - } - return false -} - -// matchMapWithString returns true if the given key/value pairs exist in a given map. -func matchMapWithString(toCheck map[string]string, toMatch map[string][]string, canonicalKey bool) bool { - for k, v := range toCheck { - // Check if key exists. - if canonicalKey { - k = http.CanonicalHeaderKey(k) - } - if values := toMatch[k]; values == nil { - return false - } else if v != "" { - // If value was defined as an empty string we only check that the - // key exists. Otherwise we also check for equality. 
- valueExists := false - for _, value := range values { - if v == value { - valueExists = true - break - } - } - if !valueExists { - return false - } - } - } - return true -} - -// matchMapWithRegex returns true if the given key/value pairs exist in a given map compiled against -// the given regex -func matchMapWithRegex(toCheck map[string]*regexp.Regexp, toMatch map[string][]string, canonicalKey bool) bool { - for k, v := range toCheck { - // Check if key exists. - if canonicalKey { - k = http.CanonicalHeaderKey(k) - } - if values := toMatch[k]; values == nil { - return false - } else if v != nil { - // If value was defined as an empty string we only check that the - // key exists. Otherwise we also check for equality. - valueExists := false - for _, value := range values { - if v.MatchString(value) { - valueExists = true - break - } - } - if !valueExists { - return false - } - } - } - return true -} - -// methodNotAllowed replies to the request with an HTTP status code 405. -func methodNotAllowed(w http.ResponseWriter, r *http.Request) { - w.WriteHeader(http.StatusMethodNotAllowed) -} - -// methodNotAllowedHandler returns a simple request handler -// that replies to each request with a status code 405. -func methodNotAllowedHandler() http.Handler { return http.HandlerFunc(methodNotAllowed) } diff --git a/vendor/github.com/gorilla/mux/regexp.go b/vendor/github.com/gorilla/mux/regexp.go deleted file mode 100644 index 96dd94ad..00000000 --- a/vendor/github.com/gorilla/mux/regexp.go +++ /dev/null @@ -1,382 +0,0 @@ -// Copyright 2012 The Gorilla Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -package mux - -import ( - "bytes" - "fmt" - "net/http" - "net/url" - "regexp" - "strconv" - "strings" -) - -type routeRegexpOptions struct { - strictSlash bool - useEncodedPath bool -} - -type regexpType int - -const ( - regexpTypePath regexpType = 0 - regexpTypeHost regexpType = 1 - regexpTypePrefix regexpType = 2 - regexpTypeQuery regexpType = 3 -) - -// newRouteRegexp parses a route template and returns a routeRegexp, -// used to match a host, a path or a query string. -// -// It will extract named variables, assemble a regexp to be matched, create -// a "reverse" template to build URLs and compile regexps to validate variable -// values used in URL building. -// -// Previously we accepted only Python-like identifiers for variable -// names ([a-zA-Z_][a-zA-Z0-9_]*), but currently the only restriction is that -// name and pattern can't be empty, and names can't contain a colon. -func newRouteRegexp(tpl string, typ regexpType, options routeRegexpOptions) (*routeRegexp, error) { - // Check if it is well-formed. - idxs, errBraces := braceIndices(tpl) - if errBraces != nil { - return nil, errBraces - } - // Backup the original. - template := tpl - // Now let's parse it. - defaultPattern := "[^/]+" - if typ == regexpTypeQuery { - defaultPattern = ".*" - } else if typ == regexpTypeHost { - defaultPattern = "[^.]+" - } - // Only match strict slash if not matching - if typ != regexpTypePath { - options.strictSlash = false - } - // Set a flag for strictSlash. - endSlash := false - if options.strictSlash && strings.HasSuffix(tpl, "/") { - tpl = tpl[:len(tpl)-1] - endSlash = true - } - varsN := make([]string, len(idxs)/2) - varsR := make([]*regexp.Regexp, len(idxs)/2) - pattern := bytes.NewBufferString("") - pattern.WriteByte('^') - reverse := bytes.NewBufferString("") - var end int - var err error - for i := 0; i < len(idxs); i += 2 { - // Set all values we are interested in. 
- raw := tpl[end:idxs[i]] - end = idxs[i+1] - parts := strings.SplitN(tpl[idxs[i]+1:end-1], ":", 2) - name := parts[0] - patt := defaultPattern - if len(parts) == 2 { - patt = parts[1] - } - // Name or pattern can't be empty. - if name == "" || patt == "" { - return nil, fmt.Errorf("mux: missing name or pattern in %q", - tpl[idxs[i]:end]) - } - // Build the regexp pattern. - fmt.Fprintf(pattern, "%s(?P<%s>%s)", regexp.QuoteMeta(raw), varGroupName(i/2), patt) - - // Build the reverse template. - fmt.Fprintf(reverse, "%s%%s", raw) - - // Append variable name and compiled pattern. - varsN[i/2] = name - varsR[i/2], err = regexp.Compile(fmt.Sprintf("^%s$", patt)) - if err != nil { - return nil, err - } - } - // Add the remaining. - raw := tpl[end:] - pattern.WriteString(regexp.QuoteMeta(raw)) - if options.strictSlash { - pattern.WriteString("[/]?") - } - if typ == regexpTypeQuery { - // Add the default pattern if the query value is empty - if queryVal := strings.SplitN(template, "=", 2)[1]; queryVal == "" { - pattern.WriteString(defaultPattern) - } - } - if typ != regexpTypePrefix { - pattern.WriteByte('$') - } - - var wildcardHostPort bool - if typ == regexpTypeHost { - if !strings.Contains(pattern.String(), ":") { - wildcardHostPort = true - } - } - reverse.WriteString(raw) - if endSlash { - reverse.WriteByte('/') - } - // Compile full regexp. - reg, errCompile := regexp.Compile(pattern.String()) - if errCompile != nil { - return nil, errCompile - } - - // Check for capturing groups which used to work in older versions - if reg.NumSubexp() != len(idxs)/2 { - panic(fmt.Sprintf("route %s contains capture groups in its regexp. ", template) + - "Only non-capturing groups are accepted: e.g. (?:pattern) instead of (pattern)") - } - - // Done! 
- return &routeRegexp{ - template: template, - regexpType: typ, - options: options, - regexp: reg, - reverse: reverse.String(), - varsN: varsN, - varsR: varsR, - wildcardHostPort: wildcardHostPort, - }, nil -} - -// routeRegexp stores a regexp to match a host or path and information to -// collect and validate route variables. -type routeRegexp struct { - // The unmodified template. - template string - // The type of match - regexpType regexpType - // Options for matching - options routeRegexpOptions - // Expanded regexp. - regexp *regexp.Regexp - // Reverse template. - reverse string - // Variable names. - varsN []string - // Variable regexps (validators). - varsR []*regexp.Regexp - // Wildcard host-port (no strict port match in hostname) - wildcardHostPort bool -} - -// Match matches the regexp against the URL host or path. -func (r *routeRegexp) Match(req *http.Request, match *RouteMatch) bool { - if r.regexpType == regexpTypeHost { - host := getHost(req) - if r.wildcardHostPort { - // Don't be strict on the port match - if i := strings.Index(host, ":"); i != -1 { - host = host[:i] - } - } - return r.regexp.MatchString(host) - } - - if r.regexpType == regexpTypeQuery { - return r.matchQueryString(req) - } - path := req.URL.Path - if r.options.useEncodedPath { - path = req.URL.EscapedPath() - } - return r.regexp.MatchString(path) -} - -// url builds a URL part using the given values. -func (r *routeRegexp) url(values map[string]string) (string, error) { - urlValues := make([]interface{}, len(r.varsN), len(r.varsN)) - for k, v := range r.varsN { - value, ok := values[v] - if !ok { - return "", fmt.Errorf("mux: missing route variable %q", v) - } - if r.regexpType == regexpTypeQuery { - value = url.QueryEscape(value) - } - urlValues[k] = value - } - rv := fmt.Sprintf(r.reverse, urlValues...) - if !r.regexp.MatchString(rv) { - // The URL is checked against the full regexp, instead of checking - // individual variables. 
This is faster but to provide a good error - // message, we check individual regexps if the URL doesn't match. - for k, v := range r.varsN { - if !r.varsR[k].MatchString(values[v]) { - return "", fmt.Errorf( - "mux: variable %q doesn't match, expected %q", values[v], - r.varsR[k].String()) - } - } - } - return rv, nil -} - -// getURLQuery returns a single query parameter from a request URL. -// For a URL with foo=bar&baz=ding, we return only the relevant key -// value pair for the routeRegexp. -func (r *routeRegexp) getURLQuery(req *http.Request) string { - if r.regexpType != regexpTypeQuery { - return "" - } - templateKey := strings.SplitN(r.template, "=", 2)[0] - val, ok := findFirstQueryKey(req.URL.RawQuery, templateKey) - if ok { - return templateKey + "=" + val - } - return "" -} - -// findFirstQueryKey returns the same result as (*url.URL).Query()[key][0]. -// If key was not found, empty string and false is returned. -func findFirstQueryKey(rawQuery, key string) (value string, ok bool) { - query := []byte(rawQuery) - for len(query) > 0 { - foundKey := query - if i := bytes.IndexAny(foundKey, "&;"); i >= 0 { - foundKey, query = foundKey[:i], foundKey[i+1:] - } else { - query = query[:0] - } - if len(foundKey) == 0 { - continue - } - var value []byte - if i := bytes.IndexByte(foundKey, '='); i >= 0 { - foundKey, value = foundKey[:i], foundKey[i+1:] - } - if len(foundKey) < len(key) { - // Cannot possibly be key. - continue - } - keyString, err := url.QueryUnescape(string(foundKey)) - if err != nil { - continue - } - if keyString != key { - continue - } - valueString, err := url.QueryUnescape(string(value)) - if err != nil { - continue - } - return valueString, true - } - return "", false -} - -func (r *routeRegexp) matchQueryString(req *http.Request) bool { - return r.regexp.MatchString(r.getURLQuery(req)) -} - -// braceIndices returns the first level curly brace indices from a string. -// It returns an error in case of unbalanced braces. 
-func braceIndices(s string) ([]int, error) { - var level, idx int - var idxs []int - for i := 0; i < len(s); i++ { - switch s[i] { - case '{': - if level++; level == 1 { - idx = i - } - case '}': - if level--; level == 0 { - idxs = append(idxs, idx, i+1) - } else if level < 0 { - return nil, fmt.Errorf("mux: unbalanced braces in %q", s) - } - } - } - if level != 0 { - return nil, fmt.Errorf("mux: unbalanced braces in %q", s) - } - return idxs, nil -} - -// varGroupName builds a capturing group name for the indexed variable. -func varGroupName(idx int) string { - return "v" + strconv.Itoa(idx) -} - -// ---------------------------------------------------------------------------- -// routeRegexpGroup -// ---------------------------------------------------------------------------- - -// routeRegexpGroup groups the route matchers that carry variables. -type routeRegexpGroup struct { - host *routeRegexp - path *routeRegexp - queries []*routeRegexp -} - -// setMatch extracts the variables from the URL once a route matches. -func (v routeRegexpGroup) setMatch(req *http.Request, m *RouteMatch, r *Route) { - // Store host variables. - if v.host != nil { - host := getHost(req) - matches := v.host.regexp.FindStringSubmatchIndex(host) - if len(matches) > 0 { - extractVars(host, matches, v.host.varsN, m.Vars) - } - } - path := req.URL.Path - if r.useEncodedPath { - path = req.URL.EscapedPath() - } - // Store path variables. - if v.path != nil { - matches := v.path.regexp.FindStringSubmatchIndex(path) - if len(matches) > 0 { - extractVars(path, matches, v.path.varsN, m.Vars) - // Check if we should redirect. - if v.path.options.strictSlash { - p1 := strings.HasSuffix(path, "/") - p2 := strings.HasSuffix(v.path.template, "/") - if p1 != p2 { - u, _ := url.Parse(req.URL.String()) - if p1 { - u.Path = u.Path[:len(u.Path)-1] - } else { - u.Path += "/" - } - m.Handler = http.RedirectHandler(u.String(), http.StatusMovedPermanently) - } - } - } - } - // Store query string variables. 
- for _, q := range v.queries { - queryURL := q.getURLQuery(req) - matches := q.regexp.FindStringSubmatchIndex(queryURL) - if len(matches) > 0 { - extractVars(queryURL, matches, q.varsN, m.Vars) - } - } -} - -// getHost tries its best to return the request host. -// According to section 14.23 of RFC 2616 the Host header -// can include the port number if the default value of 80 is not used. -func getHost(r *http.Request) string { - if r.URL.IsAbs() { - return r.URL.Host - } - return r.Host -} - -func extractVars(input string, matches []int, names []string, output map[string]string) { - for i, name := range names { - output[name] = input[matches[2*i+2]:matches[2*i+3]] - } -} diff --git a/vendor/github.com/gorilla/mux/route.go b/vendor/github.com/gorilla/mux/route.go deleted file mode 100644 index 750afe57..00000000 --- a/vendor/github.com/gorilla/mux/route.go +++ /dev/null @@ -1,736 +0,0 @@ -// Copyright 2012 The Gorilla Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package mux - -import ( - "errors" - "fmt" - "net/http" - "net/url" - "regexp" - "strings" -) - -// Route stores information to match a request and build URLs. -type Route struct { - // Request handler for the route. - handler http.Handler - // If true, this route never matches: it is only used to build URLs. - buildOnly bool - // The name used to build URLs. - name string - // Error resulted from building a route. - err error - - // "global" reference to all named routes - namedRoutes map[string]*Route - - // config possibly passed in from `Router` - routeConf -} - -// SkipClean reports whether path cleaning is enabled for this route via -// Router.SkipClean. -func (r *Route) SkipClean() bool { - return r.skipClean -} - -// Match matches the route against the request. 
-func (r *Route) Match(req *http.Request, match *RouteMatch) bool { - if r.buildOnly || r.err != nil { - return false - } - - var matchErr error - - // Match everything. - for _, m := range r.matchers { - if matched := m.Match(req, match); !matched { - if _, ok := m.(methodMatcher); ok { - matchErr = ErrMethodMismatch - continue - } - - // Ignore ErrNotFound errors. These errors arise from match call - // to Subrouters. - // - // This prevents subsequent matching subrouters from failing to - // run middleware. If not ignored, the middleware would see a - // non-nil MatchErr and be skipped, even when there was a - // matching route. - if match.MatchErr == ErrNotFound { - match.MatchErr = nil - } - - matchErr = nil - return false - } - } - - if matchErr != nil { - match.MatchErr = matchErr - return false - } - - if match.MatchErr == ErrMethodMismatch && r.handler != nil { - // We found a route which matches request method, clear MatchErr - match.MatchErr = nil - // Then override the mis-matched handler - match.Handler = r.handler - } - - // Yay, we have a match. Let's collect some info about it. - if match.Route == nil { - match.Route = r - } - if match.Handler == nil { - match.Handler = r.handler - } - if match.Vars == nil { - match.Vars = make(map[string]string) - } - - // Set variables. - r.regexp.setMatch(req, match, r) - return true -} - -// ---------------------------------------------------------------------------- -// Route attributes -// ---------------------------------------------------------------------------- - -// GetError returns an error resulted from building the route, if any. -func (r *Route) GetError() error { - return r.err -} - -// BuildOnly sets the route to never match: it is only used to build URLs. -func (r *Route) BuildOnly() *Route { - r.buildOnly = true - return r -} - -// Handler -------------------------------------------------------------------- - -// Handler sets a handler for the route. 
-func (r *Route) Handler(handler http.Handler) *Route { - if r.err == nil { - r.handler = handler - } - return r -} - -// HandlerFunc sets a handler function for the route. -func (r *Route) HandlerFunc(f func(http.ResponseWriter, *http.Request)) *Route { - return r.Handler(http.HandlerFunc(f)) -} - -// GetHandler returns the handler for the route, if any. -func (r *Route) GetHandler() http.Handler { - return r.handler -} - -// Name ----------------------------------------------------------------------- - -// Name sets the name for the route, used to build URLs. -// It is an error to call Name more than once on a route. -func (r *Route) Name(name string) *Route { - if r.name != "" { - r.err = fmt.Errorf("mux: route already has name %q, can't set %q", - r.name, name) - } - if r.err == nil { - r.name = name - r.namedRoutes[name] = r - } - return r -} - -// GetName returns the name for the route, if any. -func (r *Route) GetName() string { - return r.name -} - -// ---------------------------------------------------------------------------- -// Matchers -// ---------------------------------------------------------------------------- - -// matcher types try to match a request. -type matcher interface { - Match(*http.Request, *RouteMatch) bool -} - -// addMatcher adds a matcher to the route. -func (r *Route) addMatcher(m matcher) *Route { - if r.err == nil { - r.matchers = append(r.matchers, m) - } - return r -} - -// addRegexpMatcher adds a host or path matcher and builder to a route. 
-func (r *Route) addRegexpMatcher(tpl string, typ regexpType) error { - if r.err != nil { - return r.err - } - if typ == regexpTypePath || typ == regexpTypePrefix { - if len(tpl) > 0 && tpl[0] != '/' { - return fmt.Errorf("mux: path must start with a slash, got %q", tpl) - } - if r.regexp.path != nil { - tpl = strings.TrimRight(r.regexp.path.template, "/") + tpl - } - } - rr, err := newRouteRegexp(tpl, typ, routeRegexpOptions{ - strictSlash: r.strictSlash, - useEncodedPath: r.useEncodedPath, - }) - if err != nil { - return err - } - for _, q := range r.regexp.queries { - if err = uniqueVars(rr.varsN, q.varsN); err != nil { - return err - } - } - if typ == regexpTypeHost { - if r.regexp.path != nil { - if err = uniqueVars(rr.varsN, r.regexp.path.varsN); err != nil { - return err - } - } - r.regexp.host = rr - } else { - if r.regexp.host != nil { - if err = uniqueVars(rr.varsN, r.regexp.host.varsN); err != nil { - return err - } - } - if typ == regexpTypeQuery { - r.regexp.queries = append(r.regexp.queries, rr) - } else { - r.regexp.path = rr - } - } - r.addMatcher(rr) - return nil -} - -// Headers -------------------------------------------------------------------- - -// headerMatcher matches the request against header values. -type headerMatcher map[string]string - -func (m headerMatcher) Match(r *http.Request, match *RouteMatch) bool { - return matchMapWithString(m, r.Header, true) -} - -// Headers adds a matcher for request header values. -// It accepts a sequence of key/value pairs to be matched. For example: -// -// r := mux.NewRouter() -// r.Headers("Content-Type", "application/json", -// "X-Requested-With", "XMLHttpRequest") -// -// The above route will only match if both request header values match. -// If the value is an empty string, it will match any value if the key is set. -func (r *Route) Headers(pairs ...string) *Route { - if r.err == nil { - var headers map[string]string - headers, r.err = mapFromPairsToString(pairs...) 
- return r.addMatcher(headerMatcher(headers)) - } - return r -} - -// headerRegexMatcher matches the request against the route given a regex for the header -type headerRegexMatcher map[string]*regexp.Regexp - -func (m headerRegexMatcher) Match(r *http.Request, match *RouteMatch) bool { - return matchMapWithRegex(m, r.Header, true) -} - -// HeadersRegexp accepts a sequence of key/value pairs, where the value has regex -// support. For example: -// -// r := mux.NewRouter() -// r.HeadersRegexp("Content-Type", "application/(text|json)", -// "X-Requested-With", "XMLHttpRequest") -// -// The above route will only match if both the request header matches both regular expressions. -// If the value is an empty string, it will match any value if the key is set. -// Use the start and end of string anchors (^ and $) to match an exact value. -func (r *Route) HeadersRegexp(pairs ...string) *Route { - if r.err == nil { - var headers map[string]*regexp.Regexp - headers, r.err = mapFromPairsToRegex(pairs...) - return r.addMatcher(headerRegexMatcher(headers)) - } - return r -} - -// Host ----------------------------------------------------------------------- - -// Host adds a matcher for the URL host. -// It accepts a template with zero or more URL variables enclosed by {}. -// Variables can define an optional regexp pattern to be matched: -// -// - {name} matches anything until the next dot. -// -// - {name:pattern} matches the given regexp pattern. -// -// For example: -// -// r := mux.NewRouter() -// r.Host("www.example.com") -// r.Host("{subdomain}.domain.com") -// r.Host("{subdomain:[a-z]+}.domain.com") -// -// Variable names must be unique in a given route. They can be retrieved -// calling mux.Vars(request). -func (r *Route) Host(tpl string) *Route { - r.err = r.addRegexpMatcher(tpl, regexpTypeHost) - return r -} - -// MatcherFunc ---------------------------------------------------------------- - -// MatcherFunc is the function signature used by custom matchers. 
-type MatcherFunc func(*http.Request, *RouteMatch) bool - -// Match returns the match for a given request. -func (m MatcherFunc) Match(r *http.Request, match *RouteMatch) bool { - return m(r, match) -} - -// MatcherFunc adds a custom function to be used as request matcher. -func (r *Route) MatcherFunc(f MatcherFunc) *Route { - return r.addMatcher(f) -} - -// Methods -------------------------------------------------------------------- - -// methodMatcher matches the request against HTTP methods. -type methodMatcher []string - -func (m methodMatcher) Match(r *http.Request, match *RouteMatch) bool { - return matchInArray(m, r.Method) -} - -// Methods adds a matcher for HTTP methods. -// It accepts a sequence of one or more methods to be matched, e.g.: -// "GET", "POST", "PUT". -func (r *Route) Methods(methods ...string) *Route { - for k, v := range methods { - methods[k] = strings.ToUpper(v) - } - return r.addMatcher(methodMatcher(methods)) -} - -// Path ----------------------------------------------------------------------- - -// Path adds a matcher for the URL path. -// It accepts a template with zero or more URL variables enclosed by {}. The -// template must start with a "/". -// Variables can define an optional regexp pattern to be matched: -// -// - {name} matches anything until the next slash. -// -// - {name:pattern} matches the given regexp pattern. -// -// For example: -// -// r := mux.NewRouter() -// r.Path("/products/").Handler(ProductsHandler) -// r.Path("/products/{key}").Handler(ProductsHandler) -// r.Path("/articles/{category}/{id:[0-9]+}"). -// Handler(ArticleHandler) -// -// Variable names must be unique in a given route. They can be retrieved -// calling mux.Vars(request). -func (r *Route) Path(tpl string) *Route { - r.err = r.addRegexpMatcher(tpl, regexpTypePath) - return r -} - -// PathPrefix ----------------------------------------------------------------- - -// PathPrefix adds a matcher for the URL path prefix. 
This matches if the given -// template is a prefix of the full URL path. See Route.Path() for details on -// the tpl argument. -// -// Note that it does not treat slashes specially ("/foobar/" will be matched by -// the prefix "/foo") so you may want to use a trailing slash here. -// -// Also note that the setting of Router.StrictSlash() has no effect on routes -// with a PathPrefix matcher. -func (r *Route) PathPrefix(tpl string) *Route { - r.err = r.addRegexpMatcher(tpl, regexpTypePrefix) - return r -} - -// Query ---------------------------------------------------------------------- - -// Queries adds a matcher for URL query values. -// It accepts a sequence of key/value pairs. Values may define variables. -// For example: -// -// r := mux.NewRouter() -// r.Queries("foo", "bar", "id", "{id:[0-9]+}") -// -// The above route will only match if the URL contains the defined queries -// values, e.g.: ?foo=bar&id=42. -// -// If the value is an empty string, it will match any value if the key is set. -// -// Variables can define an optional regexp pattern to be matched: -// -// - {name} matches anything until the next slash. -// -// - {name:pattern} matches the given regexp pattern. -func (r *Route) Queries(pairs ...string) *Route { - length := len(pairs) - if length%2 != 0 { - r.err = fmt.Errorf( - "mux: number of parameters must be multiple of 2, got %v", pairs) - return nil - } - for i := 0; i < length; i += 2 { - if r.err = r.addRegexpMatcher(pairs[i]+"="+pairs[i+1], regexpTypeQuery); r.err != nil { - return r - } - } - - return r -} - -// Schemes -------------------------------------------------------------------- - -// schemeMatcher matches the request against URL schemes. -type schemeMatcher []string - -func (m schemeMatcher) Match(r *http.Request, match *RouteMatch) bool { - scheme := r.URL.Scheme - // https://golang.org/pkg/net/http/#Request - // "For [most] server requests, fields other than Path and RawQuery will be - // empty." 
- // Since we're an http muxer, the scheme is either going to be http or https - // though, so we can just set it based on the tls termination state. - if scheme == "" { - if r.TLS == nil { - scheme = "http" - } else { - scheme = "https" - } - } - return matchInArray(m, scheme) -} - -// Schemes adds a matcher for URL schemes. -// It accepts a sequence of schemes to be matched, e.g.: "http", "https". -// If the request's URL has a scheme set, it will be matched against. -// Generally, the URL scheme will only be set if a previous handler set it, -// such as the ProxyHeaders handler from gorilla/handlers. -// If unset, the scheme will be determined based on the request's TLS -// termination state. -// The first argument to Schemes will be used when constructing a route URL. -func (r *Route) Schemes(schemes ...string) *Route { - for k, v := range schemes { - schemes[k] = strings.ToLower(v) - } - if len(schemes) > 0 { - r.buildScheme = schemes[0] - } - return r.addMatcher(schemeMatcher(schemes)) -} - -// BuildVarsFunc -------------------------------------------------------------- - -// BuildVarsFunc is the function signature used by custom build variable -// functions (which can modify route variables before a route's URL is built). -type BuildVarsFunc func(map[string]string) map[string]string - -// BuildVarsFunc adds a custom function to be used to modify build variables -// before a route's URL is built. -func (r *Route) BuildVarsFunc(f BuildVarsFunc) *Route { - if r.buildVarsFunc != nil { - // compose the old and new functions - old := r.buildVarsFunc - r.buildVarsFunc = func(m map[string]string) map[string]string { - return f(old(m)) - } - } else { - r.buildVarsFunc = f - } - return r -} - -// Subrouter ------------------------------------------------------------------ - -// Subrouter creates a subrouter for the route. -// -// It will test the inner routes only if the parent route matched. 
For example: -// -// r := mux.NewRouter() -// s := r.Host("www.example.com").Subrouter() -// s.HandleFunc("/products/", ProductsHandler) -// s.HandleFunc("/products/{key}", ProductHandler) -// s.HandleFunc("/articles/{category}/{id:[0-9]+}"), ArticleHandler) -// -// Here, the routes registered in the subrouter won't be tested if the host -// doesn't match. -func (r *Route) Subrouter() *Router { - // initialize a subrouter with a copy of the parent route's configuration - router := &Router{routeConf: copyRouteConf(r.routeConf), namedRoutes: r.namedRoutes} - r.addMatcher(router) - return router -} - -// ---------------------------------------------------------------------------- -// URL building -// ---------------------------------------------------------------------------- - -// URL builds a URL for the route. -// -// It accepts a sequence of key/value pairs for the route variables. For -// example, given this route: -// -// r := mux.NewRouter() -// r.HandleFunc("/articles/{category}/{id:[0-9]+}", ArticleHandler). -// Name("article") -// -// ...a URL for it can be built using: -// -// url, err := r.Get("article").URL("category", "technology", "id", "42") -// -// ...which will return an url.URL with the following path: -// -// "/articles/technology/42" -// -// This also works for host variables: -// -// r := mux.NewRouter() -// r.HandleFunc("/articles/{category}/{id:[0-9]+}", ArticleHandler). -// Host("{subdomain}.domain.com"). 
-// Name("article") -// -// // url.String() will be "http://news.domain.com/articles/technology/42" -// url, err := r.Get("article").URL("subdomain", "news", -// "category", "technology", -// "id", "42") -// -// The scheme of the resulting url will be the first argument that was passed to Schemes: -// -// // url.String() will be "https://example.com" -// r := mux.NewRouter() -// url, err := r.Host("example.com") -// .Schemes("https", "http").URL() -// -// All variables defined in the route are required, and their values must -// conform to the corresponding patterns. -func (r *Route) URL(pairs ...string) (*url.URL, error) { - if r.err != nil { - return nil, r.err - } - values, err := r.prepareVars(pairs...) - if err != nil { - return nil, err - } - var scheme, host, path string - queries := make([]string, 0, len(r.regexp.queries)) - if r.regexp.host != nil { - if host, err = r.regexp.host.url(values); err != nil { - return nil, err - } - scheme = "http" - if r.buildScheme != "" { - scheme = r.buildScheme - } - } - if r.regexp.path != nil { - if path, err = r.regexp.path.url(values); err != nil { - return nil, err - } - } - for _, q := range r.regexp.queries { - var query string - if query, err = q.url(values); err != nil { - return nil, err - } - queries = append(queries, query) - } - return &url.URL{ - Scheme: scheme, - Host: host, - Path: path, - RawQuery: strings.Join(queries, "&"), - }, nil -} - -// URLHost builds the host part of the URL for a route. See Route.URL(). -// -// The route must have a host defined. -func (r *Route) URLHost(pairs ...string) (*url.URL, error) { - if r.err != nil { - return nil, r.err - } - if r.regexp.host == nil { - return nil, errors.New("mux: route doesn't have a host") - } - values, err := r.prepareVars(pairs...) 
- if err != nil { - return nil, err - } - host, err := r.regexp.host.url(values) - if err != nil { - return nil, err - } - u := &url.URL{ - Scheme: "http", - Host: host, - } - if r.buildScheme != "" { - u.Scheme = r.buildScheme - } - return u, nil -} - -// URLPath builds the path part of the URL for a route. See Route.URL(). -// -// The route must have a path defined. -func (r *Route) URLPath(pairs ...string) (*url.URL, error) { - if r.err != nil { - return nil, r.err - } - if r.regexp.path == nil { - return nil, errors.New("mux: route doesn't have a path") - } - values, err := r.prepareVars(pairs...) - if err != nil { - return nil, err - } - path, err := r.regexp.path.url(values) - if err != nil { - return nil, err - } - return &url.URL{ - Path: path, - }, nil -} - -// GetPathTemplate returns the template used to build the -// route match. -// This is useful for building simple REST API documentation and for instrumentation -// against third-party services. -// An error will be returned if the route does not define a path. -func (r *Route) GetPathTemplate() (string, error) { - if r.err != nil { - return "", r.err - } - if r.regexp.path == nil { - return "", errors.New("mux: route doesn't have a path") - } - return r.regexp.path.template, nil -} - -// GetPathRegexp returns the expanded regular expression used to match route path. -// This is useful for building simple REST API documentation and for instrumentation -// against third-party services. -// An error will be returned if the route does not define a path. -func (r *Route) GetPathRegexp() (string, error) { - if r.err != nil { - return "", r.err - } - if r.regexp.path == nil { - return "", errors.New("mux: route does not have a path") - } - return r.regexp.path.regexp.String(), nil -} - -// GetQueriesRegexp returns the expanded regular expressions used to match the -// route queries. -// This is useful for building simple REST API documentation and for instrumentation -// against third-party services. 
-// An error will be returned if the route does not have queries. -func (r *Route) GetQueriesRegexp() ([]string, error) { - if r.err != nil { - return nil, r.err - } - if r.regexp.queries == nil { - return nil, errors.New("mux: route doesn't have queries") - } - queries := make([]string, 0, len(r.regexp.queries)) - for _, query := range r.regexp.queries { - queries = append(queries, query.regexp.String()) - } - return queries, nil -} - -// GetQueriesTemplates returns the templates used to build the -// query matching. -// This is useful for building simple REST API documentation and for instrumentation -// against third-party services. -// An error will be returned if the route does not define queries. -func (r *Route) GetQueriesTemplates() ([]string, error) { - if r.err != nil { - return nil, r.err - } - if r.regexp.queries == nil { - return nil, errors.New("mux: route doesn't have queries") - } - queries := make([]string, 0, len(r.regexp.queries)) - for _, query := range r.regexp.queries { - queries = append(queries, query.template) - } - return queries, nil -} - -// GetMethods returns the methods the route matches against -// This is useful for building simple REST API documentation and for instrumentation -// against third-party services. -// An error will be returned if route does not have methods. -func (r *Route) GetMethods() ([]string, error) { - if r.err != nil { - return nil, r.err - } - for _, m := range r.matchers { - if methods, ok := m.(methodMatcher); ok { - return []string(methods), nil - } - } - return nil, errors.New("mux: route doesn't have methods") -} - -// GetHostTemplate returns the template used to build the -// route match. -// This is useful for building simple REST API documentation and for instrumentation -// against third-party services. -// An error will be returned if the route does not define a host. 
-func (r *Route) GetHostTemplate() (string, error) { - if r.err != nil { - return "", r.err - } - if r.regexp.host == nil { - return "", errors.New("mux: route doesn't have a host") - } - return r.regexp.host.template, nil -} - -// prepareVars converts the route variable pairs into a map. If the route has a -// BuildVarsFunc, it is invoked. -func (r *Route) prepareVars(pairs ...string) (map[string]string, error) { - m, err := mapFromPairsToString(pairs...) - if err != nil { - return nil, err - } - return r.buildVars(m), nil -} - -func (r *Route) buildVars(m map[string]string) map[string]string { - if r.buildVarsFunc != nil { - m = r.buildVarsFunc(m) - } - return m -} diff --git a/vendor/github.com/gorilla/mux/test_helpers.go b/vendor/github.com/gorilla/mux/test_helpers.go deleted file mode 100644 index 5f5c496d..00000000 --- a/vendor/github.com/gorilla/mux/test_helpers.go +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright 2012 The Gorilla Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package mux - -import "net/http" - -// SetURLVars sets the URL variables for the given request, to be accessed via -// mux.Vars for testing route behaviour. Arguments are not modified, a shallow -// copy is returned. -// -// This API should only be used for testing purposes; it provides a way to -// inject variables into the request context. Alternatively, URL variables -// can be set by making a route that captures the required variables, -// starting a server and sending the request to that server. 
-func SetURLVars(r *http.Request, val map[string]string) *http.Request { - return requestWithVars(r, val) -} diff --git a/vendor/modules.txt b/vendor/modules.txt deleted file mode 100644 index dd3e35b4..00000000 --- a/vendor/modules.txt +++ /dev/null @@ -1,31 +0,0 @@ -# github.com/Masterminds/semver v1.5.0 -## explicit -github.com/Masterminds/semver -# github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm v0.0.0-20210325210537-29b4f1784f18 => ./bindings/go/dcgm -## explicit -github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm -# github.com/gorilla/mux v1.7.4 -## explicit -github.com/gorilla/mux -# github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm => ./bindings/go/dcgm -# k8s.io/api => k8s.io/api v0.20.2 -# k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.20.2 -# k8s.io/apimachinery => k8s.io/apimachinery v0.20.2 -# k8s.io/apiserver => k8s.io/apiserver v0.20.2 -# k8s.io/cli-runtime => k8s.io/cli-runtime v0.20.2 -# k8s.io/client-go => k8s.io/client-go v0.20.2 -# k8s.io/cloud-provider => k8s.io/cloud-provider v0.20.2 -# k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.20.2 -# k8s.io/code-generator => k8s.io/code-generator v0.20.2 -# k8s.io/component-base => k8s.io/component-base v0.20.2 -# k8s.io/cri-api => k8s.io/cri-api v0.20.2 -# k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.20.2 -# k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.20.2 -# k8s.io/kube-controller-manager => k8s.io/kube-controller-manager v0.20.2 -# k8s.io/kube-proxy => k8s.io/kube-proxy v0.20.2 -# k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.20.2 -# k8s.io/kubectl => k8s.io/kubectl v0.20.2 -# k8s.io/kubelet => k8s.io/kubelet v0.20.2 -# k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.20.2 -# k8s.io/metrics => k8s.io/metrics v0.20.2 -# k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.20.2