Skip to content

Commit

Permalink
Enable nvswitch/nvlink metric support (NVIDIA#113)
Browse files: browse the repository at this point in the history
  • Loading branch information
glowkey authored Oct 27, 2022
1 parent 7ca01a8 commit e2ba7f8
Show file tree
Hide file tree
Showing 9 changed files with 651 additions and 106 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ REGISTRY ?= nvidia

DCGM_VERSION := 3.0.4
GOLANG_VERSION := 1.17
VERSION := 3.0.0
VERSION := 3.1.0
FULL_VERSION := $(DCGM_VERSION)-$(VERSION)
OUTPUT := type=oci,dest=/tmp/dcgm-exporter.tar
PLATFORMS := linux/amd64,linux/arm64
Expand Down
62 changes: 38 additions & 24 deletions cmd/dcgm-exporter/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package main
import (
"bytes"
"fmt"
"github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter"
"os"
"os/signal"
"strconv"
Expand All @@ -29,15 +28,17 @@ import (
"text/template"
"time"

"github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter"

"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
)

// Selector letters recognised at the start of a device option string. The
// major and minor keys may be followed by ':<range>' to pick specific indices;
// the flex key does not accept a range.
const (
// FlexKey monitors all GPUs if MIG is disabled, or all GPU instances if MIG is enabled.
FlexKey = "f"
// GPUKey monitors GPUs. It has the same value as MajorKey.
// NOTE(review): looks like the earlier name for MajorKey — confirm whether both are meant to coexist.
GPUKey = "g"
// GPUInstanceKey monitors GPU instances; it cannot be specified if MIG is disabled.
// It has the same value as MinorKey.
// NOTE(review): looks like the earlier name for MinorKey — confirm whether both are meant to coexist.
GPUInstanceKey = "i"
// MajorKey monitors top-level entities: GPUs or NvSwitches.
MajorKey = "g"
// MinorKey monitors sub-level entities: GPU instances or NvLinks. It cannot be
// specified if MIG is disabled.
// NOTE(review): the MIG restriction presumably applies only to GPU instances, not NvLinks — confirm.
MinorKey = "i"
// undefinedConfigMapData is presumably the placeholder meaning "no ConfigMap data
// supplied"; its use is not visible in this section — confirm against the flag default.
undefinedConfigMapData = "none"
)

Expand All @@ -51,7 +52,8 @@ var (
CLIKubernetesGPUIDType = "kubernetes-gpu-id-type"
CLIUseOldNamespace = "use-old-namespace"
CLIRemoteHEInfo = "remote-hostengine-info"
CLIDevices = "devices"
CLIGPUDevices = "devices"
CLISwitchDevices = "switch-devices"
CLINoHostname = "no-hostname"
CLIUseFakeGpus = "fake-gpus"
CLIConfigMapData = "configmap-data"
Expand All @@ -65,18 +67,18 @@ func main() {

deviceUsageTemplate := `Specify which devices dcgm-exporter monitors.
Possible values: {{.FlexKey}} or
{{.GPUKey}}[:id1[,-id2...] or
{{.GPUInstanceKey}}[:id1[,-id2...].
{{.MajorKey}}[:id1[,-id2...] or
{{.MinorKey}}[:id1[,-id2...].
If an id list is used, then devices with match IDs must exist on the system. For example:
(default) = monitor all GPU instances in MIG mode, all GPUs if MIG mode is disabled. (See {{.FlexKey}})
{{.GPUKey}} = Monitor all GPUs
{{.GPUInstanceKey}} = Monitor all GPU instances
{{.MajorKey}} = Monitor all GPUs
{{.MinorKey}} = Monitor all GPU instances
{{.FlexKey}} = Monitor all GPUs if MIG is disabled, or all GPU instances if MIG is enabled.
Note: this rule will be applied to each GPU. If it has GPU instances, those
will be monitored. If it doesn't, then the GPU will be monitored.
This is our recommended option for single or mixed MIG Strategies.
{{.GPUKey}}:0,1 = monitor GPUs 0 and 1
{{.GPUInstanceKey}}:0,2-4 = monitor GPU instances 0, 2, 3, and 4.
{{.MajorKey}}:0,1 = monitor GPUs 0 and 1
{{.MinorKey}}:0,2-4 = monitor GPU instances 0, 2, 3, and 4.
NOTE 1: -i cannot be specified unless MIG mode is enabled.
NOTE 2: Any time indices are specified, those indices must exist on the system.
Expand All @@ -85,7 +87,7 @@ func main() {

var deviceUsageBuffer bytes.Buffer
t := template.Must(template.New("").Parse(deviceUsageTemplate))
_ = t.Execute(&deviceUsageBuffer, map[string]string{"FlexKey": FlexKey, "GPUKey": GPUKey, "GPUInstanceKey": GPUInstanceKey})
_ = t.Execute(&deviceUsageBuffer, map[string]string{"FlexKey": FlexKey, "MajorKey": MajorKey, "MinorKey": MinorKey})
DeviceUsageStr := deviceUsageBuffer.String()

c.Flags = []cli.Flag{
Expand Down Expand Up @@ -146,7 +148,7 @@ func main() {
EnvVars: []string{"DCGM_EXPORTER_KUBERNETES_GPU_ID_TYPE"},
},
&cli.StringFlag{
Name: CLIDevices,
Name: CLIGPUDevices,
Aliases: []string{"d"},
Value: FlexKey,
Usage: DeviceUsageStr,
Expand All @@ -159,6 +161,13 @@ func main() {
Usage: "Omit the hostname information from the output, matching older versions.",
EnvVars: []string{"DCGM_EXPORTER_NO_HOSTNAME"},
},
&cli.StringFlag{
Name: CLISwitchDevices,
Aliases: []string{"s"},
Value: FlexKey,
Usage: DeviceUsageStr,
EnvVars: []string{"DCGM_EXPORTER_OTHER_DEVICES_STR"},
},
&cli.BoolFlag{
Name: CLIUseFakeGpus,
Value: false,
Expand Down Expand Up @@ -277,7 +286,7 @@ func parseDeviceOptionsToken(token string, dOpt *dcgmexporter.DeviceOptions) err
if count > 1 {
return fmt.Errorf("No range can be specified with the flex option 'f'")
}
} else if letter == GPUKey || letter == GPUInstanceKey {
} else if letter == MajorKey || letter == MinorKey {
var indices []int
if count == 1 {
// No range means all present devices of the type
Expand Down Expand Up @@ -313,10 +322,10 @@ func parseDeviceOptionsToken(token string, dOpt *dcgmexporter.DeviceOptions) err
}
}

if letter == GPUKey {
dOpt.GpuRange = indices
if letter == MajorKey {
dOpt.MajorRange = indices
} else {
dOpt.GpuInstanceRange = indices
dOpt.MinorRange = indices
}
} else {
return fmt.Errorf("The only valid options preceding ':<range>' are 'g' or 'i', but found '%s'", letter)
Expand All @@ -325,9 +334,8 @@ func parseDeviceOptionsToken(token string, dOpt *dcgmexporter.DeviceOptions) err
return nil
}

func parseDeviceOptions(c *cli.Context) (dcgmexporter.DeviceOptions, error) {
func parseDeviceOptions(devices string) (dcgmexporter.DeviceOptions, error) {
var dOpt dcgmexporter.DeviceOptions
devices := c.String(CLIDevices)

letterAndRange := strings.Split(devices, ":")
count := len(letterAndRange)
Expand All @@ -341,7 +349,7 @@ func parseDeviceOptions(c *cli.Context) (dcgmexporter.DeviceOptions, error) {
if count > 1 {
return dOpt, fmt.Errorf("No range can be specified with the flex option 'f'")
}
} else if letter == GPUKey || letter == GPUInstanceKey {
} else if letter == MajorKey || letter == MinorKey {
var indices []int
if count == 1 {
// No range means all present devices of the type
Expand Down Expand Up @@ -377,10 +385,10 @@ func parseDeviceOptions(c *cli.Context) (dcgmexporter.DeviceOptions, error) {
}
}

if letter == GPUKey {
dOpt.GpuRange = indices
if letter == MajorKey {
dOpt.MajorRange = indices
} else {
dOpt.GpuInstanceRange = indices
dOpt.MinorRange = indices
}
} else {
return dOpt, fmt.Errorf("The only valid options preceding ':<range>' are 'g' or 'i', but found '%s'", letter)
Expand All @@ -390,7 +398,12 @@ func parseDeviceOptions(c *cli.Context) (dcgmexporter.DeviceOptions, error) {
}

func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) {
dOpt, err := parseDeviceOptions(c)
gOpt, err := parseDeviceOptions(c.String(CLIGPUDevices))
if err != nil {
return nil, err
}

sOpt, err := parseDeviceOptions(c.String(CLISwitchDevices))
if err != nil {
return nil, err
}
Expand All @@ -405,7 +418,8 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) {
UseOldNamespace: c.Bool(CLIUseOldNamespace),
UseRemoteHE: c.IsSet(CLIRemoteHEInfo),
RemoteHEInfo: c.String(CLIRemoteHEInfo),
Devices: dOpt,
GPUDevices: gOpt,
SwitchDevices: sOpt,
NoHostname: c.Bool(CLINoHostname),
UseFakeGpus: c.Bool(CLIUseFakeGpus),
ConfigMapData: c.String(CLIConfigMapData),
Expand Down
49 changes: 33 additions & 16 deletions pkg/dcgmexporter/dcgm.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ package dcgmexporter

import (
"fmt"
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"math/rand"

"github.com/NVIDIA/go-dcgm/pkg/dcgm"
)

func NewGroup() (dcgm.GroupHandle, func(), error) {
Expand All @@ -31,10 +32,16 @@ func NewGroup() (dcgm.GroupHandle, func(), error) {
return group, func() { dcgm.DestroyGroup(group) }, nil
}

func NewDeviceFields(counters []Counter) []dcgm.Short {
deviceFields := make([]dcgm.Short, len(counters))
for i, f := range counters {
deviceFields[i] = f.FieldID
func NewDeviceFields(counters []Counter, entityType dcgm.Field_Entity_Group) []dcgm.Short {
var deviceFields []dcgm.Short
for _, f := range counters {
meta := dcgm.FieldGetById(f.FieldID)

if meta.EntityLevel == entityType || meta.EntityLevel == dcgm.FE_NONE {
deviceFields = append(deviceFields, f.FieldID)
} else if entityType == dcgm.FE_GPU && (meta.EntityLevel == dcgm.FE_GPU_CI || meta.EntityLevel == dcgm.FE_GPU_I) {
deviceFields = append(deviceFields, f.FieldID)
}
}

return deviceFields
Expand Down Expand Up @@ -63,26 +70,36 @@ func SetupDcgmFieldsWatch(deviceFields []dcgm.Short, sysInfo SystemInfo, collect
var err error
var cleanups []func()
var cleanup func()
var group dcgm.GroupHandle
var groups []dcgm.GroupHandle
var fieldGroup dcgm.FieldHandle

group, cleanup, err = CreateGroupFromSystemInfo(sysInfo)
if err != nil {
goto fail
if sysInfo.InfoType == dcgm.FE_LINK {
/* for NvLinks, one group is created per NvSwitch */
groups, cleanups, err = CreateLinkGroupsFromSystemInfo(sysInfo)
} else {
group, cleanup, err := CreateGroupFromSystemInfo(sysInfo)
if err == nil {
groups = append(groups, group)
cleanups = append(cleanups, cleanup)
}
}

cleanups = append(cleanups, cleanup)

fieldGroup, cleanup, err = NewFieldGroup(deviceFields)
if err != nil {
goto fail
}

cleanups = append(cleanups, cleanup)
for _, gr := range groups {
fieldGroup, cleanup, err = NewFieldGroup(deviceFields)
if err != nil {
goto fail
}

err = WatchFieldGroup(group, fieldGroup, collectIntervalUsec, 0.0, 1)
if err != nil {
goto fail
cleanups = append(cleanups, cleanup)

err = WatchFieldGroup(gr, fieldGroup, collectIntervalUsec, 0.0, 1)
if err != nil {
goto fail
}
}

return cleanups, nil
Expand Down
Loading

0 comments on commit e2ba7f8

Please sign in to comment.