From e2ba7f830e66b85578059d78860d3ee55ea49403 Mon Sep 17 00:00:00 2001 From: glowkey <4634408+glowkey@users.noreply.github.com> Date: Thu, 27 Oct 2022 09:36:13 -0600 Subject: [PATCH] Enable nvswitch/nvlink metric support (#113) --- Makefile | 2 +- cmd/dcgm-exporter/main.go | 62 +++-- pkg/dcgmexporter/dcgm.go | 49 ++-- pkg/dcgmexporter/gpu_collector.go | 100 +++++++- pkg/dcgmexporter/gpu_collector_test.go | 12 +- pkg/dcgmexporter/pipeline.go | 114 ++++++++- pkg/dcgmexporter/system_info.go | 307 +++++++++++++++++++++++-- pkg/dcgmexporter/system_info_test.go | 89 ++++++- pkg/dcgmexporter/types.go | 22 +- 9 files changed, 651 insertions(+), 106 deletions(-) diff --git a/Makefile b/Makefile index 916e05bf..669feafb 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,7 @@ REGISTRY ?= nvidia DCGM_VERSION := 3.0.4 GOLANG_VERSION := 1.17 -VERSION := 3.0.0 +VERSION := 3.1.0 FULL_VERSION := $(DCGM_VERSION)-$(VERSION) OUTPUT := type=oci,dest=/tmp/dcgm-exporter.tar PLATFORMS := linux/amd64,linux/arm64 diff --git a/cmd/dcgm-exporter/main.go b/cmd/dcgm-exporter/main.go index a125de06..8a3eb05a 100644 --- a/cmd/dcgm-exporter/main.go +++ b/cmd/dcgm-exporter/main.go @@ -19,7 +19,6 @@ package main import ( "bytes" "fmt" - "github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter" "os" "os/signal" "strconv" @@ -29,6 +28,8 @@ import ( "text/template" "time" + "github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter" + "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/sirupsen/logrus" "github.com/urfave/cli/v2" @@ -36,8 +37,8 @@ import ( const ( FlexKey = "f" // Monitor all GPUs if MIG is disabled or all GPU instances if MIG is enabled - GPUKey = "g" // Monitor GPUs - GPUInstanceKey = "i" // Monitor GPU instances - cannot be specified if MIG is disabled + MajorKey = "g" // Monitor top-level entities: GPUs or NvSwitches + MinorKey = "i" // Monitor sub-level entities: GPU instances/NvLinks - cannot be specified if MIG is disabled undefinedConfigMapData = "none" ) @@ -51,7 +52,8 @@ var ( 
CLIKubernetesGPUIDType = "kubernetes-gpu-id-type" CLIUseOldNamespace = "use-old-namespace" CLIRemoteHEInfo = "remote-hostengine-info" - CLIDevices = "devices" + CLIGPUDevices = "devices" + CLISwitchDevices = "switch-devices" CLINoHostname = "no-hostname" CLIUseFakeGpus = "fake-gpus" CLIConfigMapData = "configmap-data" @@ -65,18 +67,18 @@ func main() { deviceUsageTemplate := `Specify which devices dcgm-exporter monitors. Possible values: {{.FlexKey}} or - {{.GPUKey}}[:id1[,-id2...] or - {{.GPUInstanceKey}}[:id1[,-id2...]. + {{.MajorKey}}[:id1[,-id2...] or + {{.MinorKey}}[:id1[,-id2...]. If an id list is used, then devices with match IDs must exist on the system. For example: (default) = monitor all GPU instances in MIG mode, all GPUs if MIG mode is disabled. (See {{.FlexKey}}) - {{.GPUKey}} = Monitor all GPUs - {{.GPUInstanceKey}} = Monitor all GPU instances + {{.MajorKey}} = Monitor all GPUs + {{.MinorKey}} = Monitor all GPU instances {{.FlexKey}} = Monitor all GPUs if MIG is disabled, or all GPU instances if MIG is enabled. Note: this rule will be applied to each GPU. If it has GPU instances, those will be monitored. If it doesn't, then the GPU will be monitored. This is our recommended option for single or mixed MIG Strategies. - {{.GPUKey}}:0,1 = monitor GPUs 0 and 1 - {{.GPUInstanceKey}}:0,2-4 = monitor GPU instances 0, 2, 3, and 4. + {{.MajorKey}}:0,1 = monitor GPUs 0 and 1 + {{.MinorKey}}:0,2-4 = monitor GPU instances 0, 2, 3, and 4. NOTE 1: -i cannot be specified unless MIG mode is enabled. NOTE 2: Any time indices are specified, those indices must exist on the system. 
@@ -85,7 +87,7 @@ func main() { var deviceUsageBuffer bytes.Buffer t := template.Must(template.New("").Parse(deviceUsageTemplate)) - _ = t.Execute(&deviceUsageBuffer, map[string]string{"FlexKey": FlexKey, "GPUKey": GPUKey, "GPUInstanceKey": GPUInstanceKey}) + _ = t.Execute(&deviceUsageBuffer, map[string]string{"FlexKey": FlexKey, "MajorKey": MajorKey, "MinorKey": MinorKey}) DeviceUsageStr := deviceUsageBuffer.String() c.Flags = []cli.Flag{ @@ -146,7 +148,7 @@ func main() { EnvVars: []string{"DCGM_EXPORTER_KUBERNETES_GPU_ID_TYPE"}, }, &cli.StringFlag{ - Name: CLIDevices, + Name: CLIGPUDevices, Aliases: []string{"d"}, Value: FlexKey, Usage: DeviceUsageStr, @@ -159,6 +161,13 @@ func main() { Usage: "Omit the hostname information from the output, matching older versions.", EnvVars: []string{"DCGM_EXPORTER_NO_HOSTNAME"}, }, + &cli.StringFlag{ + Name: CLISwitchDevices, + Aliases: []string{"s"}, + Value: FlexKey, + Usage: DeviceUsageStr, + EnvVars: []string{"DCGM_EXPORTER_OTHER_DEVICES_STR"}, + }, &cli.BoolFlag{ Name: CLIUseFakeGpus, Value: false, @@ -277,7 +286,7 @@ func parseDeviceOptionsToken(token string, dOpt *dcgmexporter.DeviceOptions) err if count > 1 { return fmt.Errorf("No range can be specified with the flex option 'f'") } - } else if letter == GPUKey || letter == GPUInstanceKey { + } else if letter == MajorKey || letter == MinorKey { var indices []int if count == 1 { // No range means all present devices of the type @@ -313,10 +322,10 @@ func parseDeviceOptionsToken(token string, dOpt *dcgmexporter.DeviceOptions) err } } - if letter == GPUKey { - dOpt.GpuRange = indices + if letter == MajorKey { + dOpt.MajorRange = indices } else { - dOpt.GpuInstanceRange = indices + dOpt.MinorRange = indices } } else { return fmt.Errorf("The only valid options preceding ':' are 'g' or 'i', but found '%s'", letter) @@ -325,9 +334,8 @@ func parseDeviceOptionsToken(token string, dOpt *dcgmexporter.DeviceOptions) err return nil } -func parseDeviceOptions(c *cli.Context) 
(dcgmexporter.DeviceOptions, error) { +func parseDeviceOptions(devices string) (dcgmexporter.DeviceOptions, error) { var dOpt dcgmexporter.DeviceOptions - devices := c.String(CLIDevices) letterAndRange := strings.Split(devices, ":") count := len(letterAndRange) @@ -341,7 +349,7 @@ func parseDeviceOptions(c *cli.Context) (dcgmexporter.DeviceOptions, error) { if count > 1 { return dOpt, fmt.Errorf("No range can be specified with the flex option 'f'") } - } else if letter == GPUKey || letter == GPUInstanceKey { + } else if letter == MajorKey || letter == MinorKey { var indices []int if count == 1 { // No range means all present devices of the type @@ -377,10 +385,10 @@ func parseDeviceOptions(c *cli.Context) (dcgmexporter.DeviceOptions, error) { } } - if letter == GPUKey { - dOpt.GpuRange = indices + if letter == MajorKey { + dOpt.MajorRange = indices } else { - dOpt.GpuInstanceRange = indices + dOpt.MinorRange = indices } } else { return dOpt, fmt.Errorf("The only valid options preceding ':' are 'g' or 'i', but found '%s'", letter) @@ -390,7 +398,12 @@ func parseDeviceOptions(c *cli.Context) (dcgmexporter.DeviceOptions, error) { } func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) { - dOpt, err := parseDeviceOptions(c) + gOpt, err := parseDeviceOptions(c.String(CLIGPUDevices)) + if err != nil { + return nil, err + } + + sOpt, err := parseDeviceOptions(c.String(CLISwitchDevices)) if err != nil { return nil, err } @@ -405,7 +418,8 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) { UseOldNamespace: c.Bool(CLIUseOldNamespace), UseRemoteHE: c.IsSet(CLIRemoteHEInfo), RemoteHEInfo: c.String(CLIRemoteHEInfo), - Devices: dOpt, + GPUDevices: gOpt, + SwitchDevices: sOpt, NoHostname: c.Bool(CLINoHostname), UseFakeGpus: c.Bool(CLIUseFakeGpus), ConfigMapData: c.String(CLIConfigMapData), diff --git a/pkg/dcgmexporter/dcgm.go b/pkg/dcgmexporter/dcgm.go index a7e1303c..527d824a 100644 --- a/pkg/dcgmexporter/dcgm.go +++ b/pkg/dcgmexporter/dcgm.go 
@@ -18,8 +18,9 @@ package dcgmexporter import ( "fmt" - "github.com/NVIDIA/go-dcgm/pkg/dcgm" "math/rand" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" ) func NewGroup() (dcgm.GroupHandle, func(), error) { @@ -31,10 +32,16 @@ func NewGroup() (dcgm.GroupHandle, func(), error) { return group, func() { dcgm.DestroyGroup(group) }, nil } -func NewDeviceFields(counters []Counter) []dcgm.Short { - deviceFields := make([]dcgm.Short, len(counters)) - for i, f := range counters { - deviceFields[i] = f.FieldID +func NewDeviceFields(counters []Counter, entityType dcgm.Field_Entity_Group) []dcgm.Short { + var deviceFields []dcgm.Short + for _, f := range counters { + meta := dcgm.FieldGetById(f.FieldID) + + if meta.EntityLevel == entityType || meta.EntityLevel == dcgm.FE_NONE { + deviceFields = append(deviceFields, f.FieldID) + } else if entityType == dcgm.FE_GPU && (meta.EntityLevel == dcgm.FE_GPU_CI || meta.EntityLevel == dcgm.FE_GPU_I) { + deviceFields = append(deviceFields, f.FieldID) + } } return deviceFields @@ -63,26 +70,36 @@ func SetupDcgmFieldsWatch(deviceFields []dcgm.Short, sysInfo SystemInfo, collect var err error var cleanups []func() var cleanup func() - var group dcgm.GroupHandle + var groups []dcgm.GroupHandle var fieldGroup dcgm.FieldHandle - group, cleanup, err = CreateGroupFromSystemInfo(sysInfo) - if err != nil { - goto fail + if sysInfo.InfoType == dcgm.FE_LINK { + /* one group per-nvswitch is created for nvlinks */ + groups, cleanups, err = CreateLinkGroupsFromSystemInfo(sysInfo) + } else { + group, cleanup, err := CreateGroupFromSystemInfo(sysInfo) + if err == nil { + groups = append(groups, group) + cleanups = append(cleanups, cleanup) + } } - cleanups = append(cleanups, cleanup) - - fieldGroup, cleanup, err = NewFieldGroup(deviceFields) if err != nil { goto fail } - cleanups = append(cleanups, cleanup) + for _, gr := range groups { + fieldGroup, cleanup, err = NewFieldGroup(deviceFields) + if err != nil { + goto fail + } - err = WatchFieldGroup(group, 
fieldGroup, collectIntervalUsec, 0.0, 1) - if err != nil { - goto fail + cleanups = append(cleanups, cleanup) + + err = WatchFieldGroup(gr, fieldGroup, collectIntervalUsec, 0.0, 1) + if err != nil { + goto fail + } } return cleanups, nil diff --git a/pkg/dcgmexporter/gpu_collector.go b/pkg/dcgmexporter/gpu_collector.go index 274e10dc..cee4d569 100644 --- a/pkg/dcgmexporter/gpu_collector.go +++ b/pkg/dcgmexporter/gpu_collector.go @@ -18,13 +18,14 @@ package dcgmexporter import ( "fmt" + "os" + "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/sirupsen/logrus" - "os" ) -func NewDCGMCollector(c []Counter, config *Config) (*DCGMCollector, func(), error) { - sysInfo, err := InitializeSystemInfo(config.Devices, config.UseFakeGpus) +func NewDCGMCollector(c []Counter, config *Config, entityType dcgm.Field_Entity_Group) (*DCGMCollector, func(), error) { + sysInfo, err := InitializeSystemInfo(config.GPUDevices, config.SwitchDevices, config.UseFakeGpus, entityType) if err != nil { return nil, func() {}, err } @@ -37,9 +38,15 @@ func NewDCGMCollector(c []Counter, config *Config) (*DCGMCollector, func(), erro } } + var deviceFields = NewDeviceFields(c, entityType) + + if len(deviceFields) <= 0 { + return nil, func() {}, fmt.Errorf("No fields to watch for device type: %d", entityType) + } + collector := &DCGMCollector{ Counters: c, - DeviceFields: NewDeviceFields(c), + DeviceFields: deviceFields, UseOldNamespace: config.UseOldNamespace, SysInfo: sysInfo, Hostname: hostname, @@ -47,7 +54,7 @@ func NewDCGMCollector(c []Counter, config *Config) (*DCGMCollector, func(), erro cleanups, err := SetupDcgmFieldsWatch(collector.DeviceFields, sysInfo, int64(config.CollectInterval)*1000) if err != nil { - return nil, func() {}, err + logrus.Fatal("Failed to watch metrics: ", err) } collector.Cleanups = cleanups @@ -68,7 +75,14 @@ func (c *DCGMCollector) GetMetrics() ([][]Metric, error) { metrics := make([][]Metric, count) for i, mi := range monitoringInfo { - vals, err := 
dcgm.EntityGetLatestValues(mi.Entity.EntityGroupId, mi.Entity.EntityId, c.DeviceFields) + var vals []dcgm.FieldValue_v1 + var err error + if mi.Entity.EntityGroupId == dcgm.FE_LINK { + vals, err = dcgm.LinkGetLatestValues(mi.Entity.EntityId, mi.ParentId, c.DeviceFields) + } else { + vals, err = dcgm.EntityGetLatestValues(mi.Entity.EntityGroupId, mi.Entity.EntityId, c.DeviceFields) + } + if err != nil { if derr, ok := err.(*dcgm.DcgmError); ok { if derr.Code == dcgm.DCGM_ST_CONNECTION_NOT_VALID { @@ -79,24 +93,88 @@ func (c *DCGMCollector) GetMetrics() ([][]Metric, error) { } // InstanceInfo will be nil for GPUs - metrics[i] = ToMetric(vals, c.Counters, mi.DeviceInfo, mi.InstanceInfo, c.UseOldNamespace, c.Hostname) + if c.SysInfo.InfoType == dcgm.FE_SWITCH || c.SysInfo.InfoType == dcgm.FE_LINK { + metrics[i] = ToSwitchMetric(vals, c.Counters, mi, c.UseOldNamespace, c.Hostname) + } else { + metrics[i] = ToMetric(vals, c.Counters, mi.DeviceInfo, mi.InstanceInfo, c.UseOldNamespace, c.Hostname) + } } return metrics, nil } +func FindCounterField(c []Counter, fieldId uint) (*Counter, error) { + for i := 0; i < len(c); i++ { + if uint(c[i].FieldID) == fieldId { + return &c[i], nil + } + } + + return &c[0], fmt.Errorf("Could not find corresponding counter") +} + +func ToSwitchMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) []Metric { + var metrics []Metric + var labels = map[string]string{} + + for _, val := range values { + v := ToString(val) + // Filter out counters with no value and ignored fields for this entity + + counter, err := FindCounterField(c, val.FieldId) + if err != nil { + continue + } + + if counter.PromType == "label" { + labels[counter.FieldName] = v + continue + } + uuid := "UUID" + if useOld { + uuid = "uuid" + } + var m Metric + if v == SkipDCGMValue { + continue + } else { + m = Metric{ + Counter: counter, + Value: v, + UUID: uuid, + GPU: fmt.Sprintf("%d", mi.Entity.EntityId), + GPUUUID: "", + 
GPUDevice: fmt.Sprintf("nvswitch%d", mi.ParentId), + GPUModelName: "", + Hostname: hostname, + Labels: &labels, + Attributes: nil, + } + } + metrics = append(metrics, m) + } + + return metrics +} + func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceInfo *GpuInstanceInfo, useOld bool, hostname string) []Metric { var metrics []Metric var labels = map[string]string{} - for i, val := range values { + for _, val := range values { v := ToString(val) // Filter out counters with no value and ignored fields for this entity if v == SkipDCGMValue { continue } - if c[i].PromType == "label" { - labels[c[i].FieldName] = v + + counter, err := FindCounterField(c, val.FieldId) + if err != nil { + continue + } + + if counter.PromType == "label" { + labels[counter.FieldName] = v continue } uuid := "UUID" @@ -104,7 +182,7 @@ func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceI uuid = "uuid" } m := Metric{ - Counter: &c[i], + Counter: counter, Value: v, UUID: uuid, diff --git a/pkg/dcgmexporter/gpu_collector_test.go b/pkg/dcgmexporter/gpu_collector_test.go index 855e037b..1b2dd024 100644 --- a/pkg/dcgmexporter/gpu_collector_test.go +++ b/pkg/dcgmexporter/gpu_collector_test.go @@ -29,6 +29,9 @@ var sampleCounters = []Counter{ {dcgm.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION", "gauge", "Energy help info"}, {dcgm.DCGM_FI_DEV_POWER_USAGE, "DCGM_FI_DEV_POWER_USAGE", "gauge", "Power help info"}, {dcgm.DCGM_FI_DRIVER_VERSION, "DCGM_FI_DRIVER_VERSION", "label", "Driver version"}, + /* test that switch and link metrics are filtered out automatically when devices are not detected */ + {dcgm.DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT, "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT", "gauge", "switch temperature"}, + {dcgm.DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS, "DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS", "gauge", "per-link flit errors"}, } var expectedMetrics = map[string]bool{ @@ -49,14 +52,19 @@ func 
TestDCGMCollector(t *testing.T) { func testDCGMCollector(t *testing.T, counters []Counter) (*DCGMCollector, func()) { dOpt := DeviceOptions{true, []int{-1}, []int{-1}} cfg := Config{ - Devices: dOpt, + GPUDevices: dOpt, NoHostname: false, UseOldNamespace: false, UseFakeGpus: false, } - c, cleanup, err := NewDCGMCollector(counters, &cfg) + c, cleanup, err := NewDCGMCollector(counters, &cfg, dcgm.FE_GPU) require.NoError(t, err) + /* Test for error when no switches are available to monitor. + NOTE: This test will fail on a system with switches present. */ + _, _, err = NewDCGMCollector(counters, &cfg, dcgm.FE_SWITCH) + require.Error(t, err) + out, err := c.GetMetrics() require.NoError(t, err) require.Greater(t, len(out), 0, "Check that you have a GPU on this node") diff --git a/pkg/dcgmexporter/pipeline.go b/pkg/dcgmexporter/pipeline.go index a7ae09bc..77628418 100644 --- a/pkg/dcgmexporter/pipeline.go +++ b/pkg/dcgmexporter/pipeline.go @@ -23,6 +23,7 @@ import ( "text/template" "time" + "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/sirupsen/logrus" ) @@ -32,11 +33,21 @@ func NewMetricsPipeline(c *Config) (*MetricsPipeline, func(), error) { return nil, func() {}, err } - gpuCollector, cleanup, err := NewDCGMCollector(counters, c) + gpuCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_GPU) if err != nil { return nil, func() {}, err } + switchCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_SWITCH) + if err != nil { + logrus.Info("Not collecting switch metrics: ", err) + } + + linkCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_LINK) + if err != nil { + logrus.Info("Not collecting link metrics: ", err) + } + transformations := []Transform{} if c.Kubernetes { podMapper, err := NewPodMapper(c) @@ -50,11 +61,14 @@ func NewMetricsPipeline(c *Config) (*MetricsPipeline, func(), error) { return &MetricsPipeline{ config: c, - metricsFormat: template.Must(template.New("metrics").Parse(metricsFormat)), - migMetricsFormat: 
template.Must(template.New("migMetrics").Parse(migMetricsFormat)), + migMetricsFormat: template.Must(template.New("migMetrics").Parse(migMetricsFormat)), + switchMetricsFormat: template.Must(template.New("switchMetrics").Parse(switchMetricsFormat)), + linkMetricsFormat: template.Must(template.New("switchMetrics").Parse(linkMetricsFormat)), counters: counters, gpuCollector: gpuCollector, + switchCollector: switchCollector, + linkCollector: linkCollector, transformations: transformations, }, func() { cleanup() @@ -66,8 +80,9 @@ func NewMetricsPipelineWithGPUCollector(c *Config, collector *DCGMCollector) (*M return &MetricsPipeline{ config: c, - metricsFormat: template.Must(template.New("metrics").Parse(metricsFormat)), - migMetricsFormat: template.Must(template.New("migMetrics").Parse(migMetricsFormat)), + migMetricsFormat: template.Must(template.New("migMetrics").Parse(migMetricsFormat)), + switchMetricsFormat: template.Must(template.New("switchMetrics").Parse(switchMetricsFormat)), + linkMetricsFormat: template.Must(template.New("switchMetrics").Parse(linkMetricsFormat)), counters: collector.Counters, gpuCollector: collector, @@ -107,10 +122,22 @@ func (m *MetricsPipeline) Run(out chan string, stop chan interface{}, wg *sync.W } } +func GetLinkStatMetricString(c DCGMCollector) string { + var linkStr string + for _, sw := range c.SysInfo.Switches { + for _, link := range sw.NvLinks { + linkStr = linkStr + fmt.Sprintf("DCGM_FI_DEV_NSWITCH_NVLINK_STATUS{nvlink=\"link%d\" nvswitch=\"nvswitch%d\",Hostname=\"%s\"} %d\n", link.Index, sw.EntityId, c.Hostname, link.State) + } + } + + return linkStr +} + func (m *MetricsPipeline) run() (string, error) { + /* Collect GPU Metrics */ metrics, err := m.gpuCollector.GetMetrics() if err != nil { - return "", fmt.Errorf("Failed to collect metrics with error: %v", err) + return "", fmt.Errorf("Failed to collect gpu metrics with error: %v", err) } for _, transform := range m.transformations { @@ -125,6 +152,59 @@ func (m 
*MetricsPipeline) run() (string, error) { return "", fmt.Errorf("Failed to format metrics with error: %v", err) } + if m.switchCollector != nil { + /* Collect Switch Metrics */ + metrics, err = m.switchCollector.GetMetrics() + if err != nil { + return "", fmt.Errorf("Failed to collect switch metrics with error: %v", err) + } + + if len(metrics) > 0 { + for _, transform := range m.transformations { + err := transform.Process(metrics, m.switchCollector.SysInfo) + if err != nil { + return "", fmt.Errorf("Failed to transform switch metrics for transform %s: %v", err, transform.Name()) + } + } + + switchFormated, err := FormatMetrics(m.switchMetricsFormat, metrics) + if err != nil { + logrus.Warnf("Failed to format switch metrics with error: %v", err) + } + + formated = formated + switchFormated + } + } + + if m.linkCollector != nil { + /* Collect Link Metrics */ + metrics, err = m.linkCollector.GetMetrics() + if err != nil { + return "", fmt.Errorf("Failed to collect link metrics with error: %v", err) + } + + if len(metrics) > 0 { + for _, transform := range m.transformations { + err := transform.Process(metrics, m.linkCollector.SysInfo) + if err != nil { + return "", fmt.Errorf("Failed to transform link metrics for transform %s: %v", err, transform.Name()) + } + } + + switchFormated, err := FormatMetrics(m.linkMetricsFormat, metrics) + if err != nil { + logrus.Warnf("Failed to format link metrics with error: %v", err) + } + + formated = formated + switchFormated + } + + /* Add link state output */ + + linkStates := GetLinkStatMetricString(*m.linkCollector) + formated = formated + linkStates + } + return formated, nil } @@ -139,12 +219,12 @@ func (m *MetricsPipeline) run() (string, error) { * ``` */ -var metricsFormat = ` +var migMetricsFormat = ` {{- range $counter, $metrics := . 
-}} # HELP {{ $counter.FieldName }} {{ $counter.Help }} # TYPE {{ $counter.FieldName }} {{ $counter.PromType }} {{- range $metric := $metrics }} -{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}" +{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} {{- range $k, $v := $metric.Labels -}} ,{{ $k }}="{{ $v }}" @@ -157,20 +237,30 @@ var metricsFormat = ` {{- end }} {{ end }}` -var migMetricsFormat = ` +var switchMetricsFormat = ` {{- range $counter, $metrics := . -}} # HELP {{ $counter.FieldName }} {{ $counter.Help }} # TYPE {{ $counter.FieldName }} {{ $counter.PromType }} {{- range $metric := $metrics }} -{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} +{{ $counter.FieldName }}{nvswitch="{{ $metric.GPU }}"{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} {{- range $k, $v := $metric.Labels -}} ,{{ $k }}="{{ $v }}" {{- end -}} -{{- range $k, $v := $metric.Attributes -}} +} {{ $metric.Value -}} +{{- end }} +{{ end }}` + +var linkMetricsFormat = ` +{{- range $counter, $metrics := . 
-}} +# HELP {{ $counter.FieldName }} {{ $counter.Help }} +# TYPE {{ $counter.FieldName }} {{ $counter.PromType }} +{{- range $metric := $metrics }} +{{ $counter.FieldName }}{nvlink="{{ $metric.GPU }}",nvswitch="{{ $metric.GPUDevice }}"{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} + +{{- range $k, $v := $metric.Labels -}} ,{{ $k }}="{{ $v }}" {{- end -}} - } {{ $metric.Value -}} {{- end }} {{ end }}` diff --git a/pkg/dcgmexporter/system_info.go b/pkg/dcgmexporter/system_info.go index c8d2fa8b..066eeb8d 100644 --- a/pkg/dcgmexporter/system_info.go +++ b/pkg/dcgmexporter/system_info.go @@ -18,10 +18,19 @@ package dcgmexporter import ( "fmt" - "github.com/NVIDIA/go-dcgm/pkg/dcgm" "math/rand" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/sirupsen/logrus" ) +const PARENT_ID_IGNORED = 0 + +type GroupInfo struct { + groupHandle dcgm.GroupHandle + groupType dcgm.Field_Entity_Group +} + type ComputeInstanceInfo struct { InstanceInfo dcgm.MigEntityInfo ProfileName string @@ -41,16 +50,25 @@ type GpuInfo struct { MigEnabled bool } +type SwitchInfo struct { + EntityId uint + NvLinks []dcgm.NvLinkStatus +} + type SystemInfo struct { GpuCount uint Gpus [dcgm.MAX_NUM_DEVICES]GpuInfo - dOpt DeviceOptions + gOpt DeviceOptions + sOpt DeviceOptions + InfoType dcgm.Field_Entity_Group + Switches []SwitchInfo } type MonitoringInfo struct { Entity dcgm.GroupEntityPair DeviceInfo dcgm.Device InstanceInfo *GpuInstanceInfo + ParentId uint } func SetGpuInstanceProfileName(sysInfo *SystemInfo, entityId uint, profileName string) bool { @@ -111,6 +129,15 @@ func GpuIdExists(sysInfo *SystemInfo, gpuId int) bool { return false } +func SwitchIdExists(sysInfo *SystemInfo, switchId int) bool { + for _, sw := range sysInfo.Switches { + if sw.EntityId == uint(switchId) { + return true + } + } + return false +} + func GpuInstanceIdExists(sysInfo *SystemInfo, gpuInstanceId int) bool { for i := uint(0); i < sysInfo.GpuCount; i++ { for _, instance := range 
sysInfo.Gpus[i].GpuInstances { @@ -122,22 +149,58 @@ func GpuInstanceIdExists(sysInfo *SystemInfo, gpuInstanceId int) bool { return false } -func VerifyDevicePresence(sysInfo *SystemInfo, dOpt DeviceOptions) error { - if dOpt.Flex { +func LinkIdExists(sysInfo *SystemInfo, linkId int) bool { + for _, sw := range sysInfo.Switches { + for _, link := range sw.NvLinks { + if link.Index == uint(linkId) { + return true + } + } + } + return false +} + +func VerifySwitchDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error { + if sOpt.Flex { + return nil + } + + if len(sOpt.MajorRange) > 0 && sOpt.MajorRange[0] != -1 { + // Verify we can find all the specified Switches + for _, swId := range sOpt.MajorRange { + if !SwitchIdExists(sysInfo, swId) { + return fmt.Errorf("couldn't find requested NvSwitch id %d", swId) + } + } + } + + if len(sOpt.MinorRange) > 0 && sOpt.MinorRange[0] != -1 { + for _, linkId := range sOpt.MinorRange { + if !LinkIdExists(sysInfo, linkId) { + return fmt.Errorf("couldn't find requested NvLink %d", linkId) + } + } + } + + return nil +} + +func VerifyDevicePresence(sysInfo *SystemInfo, gOpt DeviceOptions) error { + if gOpt.Flex { return nil } - if len(dOpt.GpuRange) > 0 && dOpt.GpuRange[0] != -1 { + if len(gOpt.MajorRange) > 0 && gOpt.MajorRange[0] != -1 { // Verify we can find all the specified GPUs - for _, gpuId := range dOpt.GpuRange { + for _, gpuId := range gOpt.MajorRange { if GpuIdExists(sysInfo, gpuId) == false { return fmt.Errorf("Couldn't find requested GPU id %d", gpuId) } } } - if len(dOpt.GpuInstanceRange) > 0 && dOpt.GpuInstanceRange[0] != -1 { - for _, gpuInstanceId := range dOpt.GpuInstanceRange { + if len(gOpt.MinorRange) > 0 && gOpt.MinorRange[0] != -1 { + for _, gpuInstanceId := range gOpt.MinorRange { if GpuInstanceIdExists(sysInfo, gpuInstanceId) == false { return fmt.Errorf("Couldn't find requested GPU instance id %d", gpuInstanceId) } @@ -147,8 +210,44 @@ func VerifyDevicePresence(sysInfo *SystemInfo, dOpt DeviceOptions) 
error { return nil } -func InitializeSystemInfo(dOpt DeviceOptions, useFakeGpus bool) (SystemInfo, error) { - sysInfo := SystemInfo{} +func InitializeNvSwitchInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo, error) { + switches, err := dcgm.GetEntityGroupEntities(dcgm.FE_SWITCH) + if err != nil { + return sysInfo, err + } + + if len(switches) <= 0 { + return sysInfo, fmt.Errorf("no switches to monitor") + } + + links, err := dcgm.GetNvLinkLinkStatus() + if err != nil { + return sysInfo, err + } + + for i := 0; i < len(switches); i++ { + var matchingLinks []dcgm.NvLinkStatus + for _, link := range links { + if link.ParentType == dcgm.FE_SWITCH && link.ParentId == uint(switches[i]) { + matchingLinks = append(matchingLinks, link) + } + } + + sw := SwitchInfo{ + switches[i], + matchingLinks, + } + + sysInfo.Switches = append(sysInfo.Switches, sw) + } + + sysInfo.sOpt = sOpt + err = VerifySwitchDevicePresence(&sysInfo, sOpt) + + return sysInfo, nil +} + +func InitializeGpuInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGpus bool) (SystemInfo, error) { gpuCount, err := dcgm.GetAllDeviceCount() if err != nil { return sysInfo, err @@ -207,12 +306,70 @@ func InitializeSystemInfo(dOpt DeviceOptions, useFakeGpus bool) (SystemInfo, err } } - sysInfo.dOpt = dOpt - err = VerifyDevicePresence(&sysInfo, dOpt) + sysInfo.gOpt = gOpt + err = VerifyDevicePresence(&sysInfo, gOpt) return sysInfo, nil } +func InitializeSystemInfo(gOpt DeviceOptions, sOpt DeviceOptions, useFakeGpus bool, entityType dcgm.Field_Entity_Group) (SystemInfo, error) { + sysInfo := SystemInfo{} + + logrus.Info("Initializing system entities of type: ", entityType) + switch entityType { + case dcgm.FE_LINK: + sysInfo.InfoType = dcgm.FE_LINK + return InitializeNvSwitchInfo(sysInfo, sOpt) + case dcgm.FE_SWITCH: + sysInfo.InfoType = dcgm.FE_SWITCH + return InitializeNvSwitchInfo(sysInfo, sOpt) + case dcgm.FE_GPU: + sysInfo.InfoType = dcgm.FE_GPU + return InitializeGpuInfo(sysInfo, gOpt, useFakeGpus) + } + 
+ return sysInfo, fmt.Errorf("unhandled entity type: %d", entityType) +} + +func CreateLinkGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []func(), error) { + var groups []dcgm.GroupHandle + var cleanups []func() + + /* Create per-switch link groups */ + for _, sw := range sysInfo.Switches { + if !IsSwitchWatched(sw.EntityId, sysInfo) { + continue + } + + groupId, err := dcgm.CreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64())) + if err != nil { + return nil, cleanups, err + } + + groups = append(groups, groupId) + + for _, link := range sw.NvLinks { + if link.State != dcgm.LS_UP { + continue + } + + if !IsLinkWatched(link.Index, sw.EntityId, sysInfo) { + continue + } + + err = dcgm.AddLinkEntityToGroup(groupId, link.Index, link.ParentId) + + if err != nil { + return groups, cleanups, err + } + + cleanups = append(cleanups, func() { dcgm.DestroyGroup(groupId) }) + } + } + + return groups, cleanups, nil +} + func CreateGroupFromSystemInfo(sysInfo SystemInfo) (dcgm.GroupHandle, func(), error) { monitoringInfo := GetMonitoredEntities(sysInfo) groupId, err := dcgm.CreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64())) @@ -238,6 +395,7 @@ func AddAllGpus(sysInfo SystemInfo) []MonitoringInfo { dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.Gpus[i].DeviceInfo.GPU}, sysInfo.Gpus[i].DeviceInfo, nil, + PARENT_ID_IGNORED, } monitoring = append(monitoring, mi) } @@ -245,6 +403,105 @@ func AddAllGpus(sysInfo SystemInfo) []MonitoringInfo { return monitoring } +func AddAllSwitches(sysInfo SystemInfo) []MonitoringInfo { + var monitoring []MonitoringInfo + + for _, sw := range sysInfo.Switches { + if !IsSwitchWatched(sw.EntityId, sysInfo) { + continue + } + + mi := MonitoringInfo{ + dcgm.GroupEntityPair{dcgm.FE_SWITCH, sw.EntityId}, + dcgm.Device{ + 0, "", "", 0, + dcgm.PCIInfo{"", 0, 0, 0}, + dcgm.DeviceIdentifiers{"", "", "", "", "", ""}, + nil, "", + }, + nil, + PARENT_ID_IGNORED, + } + monitoring = append(monitoring, mi) + } + + return 
monitoring +} + +func AddAllLinks(sysInfo SystemInfo) []MonitoringInfo { + var monitoring []MonitoringInfo + + for _, sw := range sysInfo.Switches { + for _, link := range sw.NvLinks { + if link.State != dcgm.LS_UP { + continue + } + + if !IsLinkWatched(link.Index, sw.EntityId, sysInfo) { + continue + } + + mi := MonitoringInfo{ + dcgm.GroupEntityPair{dcgm.FE_LINK, link.Index}, + dcgm.Device{ + 0, "", "", 0, + dcgm.PCIInfo{"", 0, 0, 0}, + dcgm.DeviceIdentifiers{"", "", "", "", "", ""}, + nil, "", + }, + nil, + link.ParentId, + } + monitoring = append(monitoring, mi) + } + } + + return monitoring +} + +func IsSwitchWatched(switchId uint, sysInfo SystemInfo) bool { + if sysInfo.sOpt.Flex { + return true + } + + if len(sysInfo.sOpt.MajorRange) <= 0 { + return true + } + + for _, sw := range sysInfo.sOpt.MajorRange { + if uint(sw) == switchId { + return true + } + + } + return false +} + +func IsLinkWatched(linkId uint, switchId uint, sysInfo SystemInfo) bool { + if sysInfo.sOpt.Flex { + return true + } + + for _, sw := range sysInfo.Switches { + if !IsSwitchWatched(sw.EntityId, sysInfo) { + return false + } + + if len(sysInfo.sOpt.MinorRange) <= 0 { + return true + } + + for _, link := range sysInfo.sOpt.MinorRange { + if uint(link) == linkId { + return true + } + } + return false + } + + return false +} + func AddAllGpuInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo { var monitoring []MonitoringInfo @@ -254,6 +511,7 @@ func AddAllGpuInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo { dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.Gpus[i].DeviceInfo.GPU}, sysInfo.Gpus[i].DeviceInfo, nil, + PARENT_ID_IGNORED, } monitoring = append(monitoring, mi) } else { @@ -262,6 +520,7 @@ func AddAllGpuInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo { dcgm.GroupEntityPair{dcgm.FE_GPU_I, sysInfo.Gpus[i].GpuInstances[j].EntityId}, sysInfo.Gpus[i].DeviceInfo, &sysInfo.Gpus[i].GpuInstances[j], + PARENT_ID_IGNORED, } monitoring = 
append(monitoring, mi)
 			}
@@ -278,6 +537,7 @@ func GetMonitoringInfoForGpu(sysInfo SystemInfo, gpuId int) *MonitoringInfo {
 				dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.Gpus[i].DeviceInfo.GPU},
 				sysInfo.Gpus[i].DeviceInfo,
 				nil,
+				PARENT_ID_IGNORED,
 			}
 		}
 	}
@@ -293,6 +553,7 @@ func GetMonitoringInfoForGpuInstance(sysInfo SystemInfo, gpuInstanceId int) *Mon
 					dcgm.GroupEntityPair{dcgm.FE_GPU_I, uint(gpuInstanceId)},
 					sysInfo.Gpus[i].DeviceInfo,
 					&instance,
+					PARENT_ID_IGNORED,
 				}
 			}
 		}
@@ -304,22 +565,26 @@ func GetMonitoringInfoForGpuInstance(sysInfo SystemInfo, gpuInstanceId int) *Mon
 func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo {
 	var monitoring []MonitoringInfo
 
-	if sysInfo.dOpt.Flex == true {
-		return AddAllGpuInstances(sysInfo, true)
+	if sysInfo.InfoType == dcgm.FE_SWITCH { // switch and link collectors ignore the GPU options entirely
+		monitoring = AddAllSwitches(sysInfo)
+	} else if sysInfo.InfoType == dcgm.FE_LINK {
+		monitoring = AddAllLinks(sysInfo)
+	} else if sysInfo.gOpt.Flex == true {
+		monitoring = AddAllGpuInstances(sysInfo, true)
 	} else {
-		if len(sysInfo.dOpt.GpuRange) > 0 && sysInfo.dOpt.GpuRange[0] == -1 {
-			return AddAllGpus(sysInfo)
+		if len(sysInfo.gOpt.MajorRange) > 0 && sysInfo.gOpt.MajorRange[0] == -1 {
+			monitoring = AddAllGpus(sysInfo)
 		} else {
-			for _, gpuId := range sysInfo.dOpt.GpuRange {
-				// We've already verified that everying in the options list exists
+			for _, gpuId := range sysInfo.gOpt.MajorRange {
+				// We've already verified that everything in the options list exists
 				monitoring = append(monitoring, *GetMonitoringInfoForGpu(sysInfo, gpuId))
 			}
 		}
 
-		if len(sysInfo.dOpt.GpuInstanceRange) > 0 && sysInfo.dOpt.GpuInstanceRange[0] == -1 {
-			return AddAllGpuInstances(sysInfo, false)
+		if len(sysInfo.gOpt.MinorRange) > 0 && sysInfo.gOpt.MinorRange[0] == -1 {
+			monitoring = AddAllGpuInstances(sysInfo, false) // NOTE(review): this assignment replaces, rather than extends, any GPUs collected just above — confirm that is intended
 		} else {
-			for _, gpuInstanceId := range sysInfo.dOpt.GpuInstanceRange {
+			for _, gpuInstanceId := range sysInfo.gOpt.MinorRange {
 				// We've already verified that everything in the options list exists
 				monitoring = append(monitoring, *GetMonitoringInfoForGpuInstance(sysInfo, gpuInstanceId))
 			}
diff --git a/pkg/dcgmexporter/system_info_test.go b/pkg/dcgmexporter/system_info_test.go
index 2b679dca..5a7fd886 100644
--- a/pkg/dcgmexporter/system_info_test.go
+++ b/pkg/dcgmexporter/system_info_test.go
@@ -27,6 +27,55 @@ const (
 	fakeProfileName string = "2fake.4gb"
 )
 
+func SpoofSwitchSystemInfo() SystemInfo { // fixture: two switches (ids 0 and 1), each carrying two links with State 2 and State 3
+	var sysInfo SystemInfo
+	sysInfo.InfoType = dcgm.FE_SWITCH
+	sw1 := SwitchInfo{
+		EntityId: 0,
+	}
+	sw2 := SwitchInfo{
+		EntityId: 1,
+	}
+
+	l1 := dcgm.NvLinkStatus{
+		ParentId:   0,
+		ParentType: dcgm.FE_SWITCH,
+		State:      2, // NOTE(review): raw numeric link state; AddAllLinks keeps only State == dcgm.LS_UP — confirm which literal that is
+		Index:      0,
+	}
+
+	l2 := dcgm.NvLinkStatus{
+		ParentId:   0,
+		ParentType: dcgm.FE_SWITCH,
+		State:      3,
+		Index:      1,
+	}
+
+	l3 := dcgm.NvLinkStatus{
+		ParentId:   1,
+		ParentType: dcgm.FE_SWITCH,
+		State:      2,
+		Index:      0,
+	}
+
+	l4 := dcgm.NvLinkStatus{
+		ParentId:   1,
+		ParentType: dcgm.FE_SWITCH,
+		State:      3,
+		Index:      1,
+	}
+
+	sw1.NvLinks = append(sw1.NvLinks, l1)
+	sw1.NvLinks = append(sw1.NvLinks, l2)
+	sw2.NvLinks = append(sw2.NvLinks, l3)
+	sw2.NvLinks = append(sw2.NvLinks, l4)
+
+	sysInfo.Switches = append(sysInfo.Switches, sw1)
+	sysInfo.Switches = append(sysInfo.Switches, sw2)
+
+	return sysInfo
+}
+
 func SpoofSystemInfo() SystemInfo {
 	var sysInfo SystemInfo
 	sysInfo.GpuCount = 2
@@ -50,7 +99,7 @@ func SpoofSystemInfo() SystemInfo {
 
 func TestMonitoredEntities(t *testing.T) {
 	sysInfo := SpoofSystemInfo()
-	sysInfo.dOpt.Flex = true
+	sysInfo.gOpt.Flex = true
 
 	monitoring := GetMonitoredEntities(sysInfo)
 	require.Equal(t, len(monitoring), 2, fmt.Sprintf("Should have 2 monitored entities but found %d", len(monitoring)))
@@ -92,25 +141,25 @@ func TestVerifyDevicePresence(t *testing.T) {
 	require.Equal(t, err, nil, "Expected to have no error, but found %s", err)
 
 	dOpt.Flex = false
-	dOpt.GpuRange = append(dOpt.GpuRange, -1)
-	dOpt.GpuInstanceRange = append(dOpt.GpuInstanceRange, -1)
+	dOpt.MajorRange = append(dOpt.MajorRange, -1)
+	dOpt.MinorRange = append(dOpt.MinorRange, -1)
 	err = VerifyDevicePresence(&sysInfo, dOpt)
 	require.Equal(t, err, nil, "Expected to have no error, but found %s", err)
 
-	dOpt.GpuInstanceRange[0] = 10 // this GPU instance doesn't exist
+	dOpt.MinorRange[0] = 10 // this GPU instance doesn't exist
 	err = VerifyDevicePresence(&sysInfo, dOpt)
 	require.NotEqual(t, err, nil, "Expected to have an error for a non-existent GPU instance, but none found")
 
-	dOpt.GpuRange[0] = 10 // this GPU doesn't exist
-	dOpt.GpuInstanceRange[0] = -1
+	dOpt.MajorRange[0] = 10 // this GPU doesn't exist
+	dOpt.MinorRange[0] = -1
 	err = VerifyDevicePresence(&sysInfo, dOpt)
 	require.NotEqual(t, err, nil, "Expected to have an error for a non-existent GPU, but none found")
 
 	// Add GPUs and instances that exist
-	dOpt.GpuRange[0] = 0
-	dOpt.GpuRange = append(dOpt.GpuRange, 1)
-	dOpt.GpuInstanceRange[0] = 0
-	dOpt.GpuInstanceRange = append(dOpt.GpuInstanceRange, 14)
+	dOpt.MajorRange[0] = 0
+	dOpt.MajorRange = append(dOpt.MajorRange, 1)
+	dOpt.MinorRange[0] = 0
+	dOpt.MinorRange = append(dOpt.MinorRange, 14)
 	err = VerifyDevicePresence(&sysInfo, dOpt)
 	require.Equal(t, err, nil, "Expected to have no error, but found %s", err)
 }
@@ -119,3 +168,23 @@ func TestVerifyDevicePresence(t *testing.T) {
 //	sysInfo := SpoofSystemInfo()
 //	SetMigProfileNames(sysInfo, values)
 //}
+
+func TestMonitoredSwitches(t *testing.T) { // runs GetMonitoredEntities on the spoofed switch system, first as FE_SWITCH and then as FE_LINK
+	sysInfo := SpoofSwitchSystemInfo()
+
+	/* test that only switches are returned */
+	monitoring := GetMonitoredEntities(sysInfo)
+	require.Equal(t, len(monitoring), 2, fmt.Sprintf("Should have 2 monitored switches but found %d", len(monitoring)))
+	for _, mi := range monitoring {
+		require.Equal(t, mi.Entity.EntityGroupId, dcgm.FE_SWITCH, fmt.Sprintf("Should have only returned switches but returned %d", mi.Entity.EntityGroupId))
+	}
+
+	/* test that only "up" links are monitored and 1 from each switch */
+	sysInfo.InfoType = dcgm.FE_LINK
+	monitoring = GetMonitoredEntities(sysInfo)
+	require.Equal(t, len(monitoring), 2, fmt.Sprintf("Should have 2 monitored links but found %d", len(monitoring)))
+	for i, mi := range monitoring { // relies on links coming back in switch order, so index i doubles as the expected parent id
+		require.Equal(t, mi.Entity.EntityGroupId, dcgm.FE_LINK, fmt.Sprintf("Should have only returned links but returned %d", mi.Entity.EntityGroupId))
+		require.Equal(t, mi.ParentId, uint(i), fmt.Sprint("Link should reference switch parent"))
+	}
+}
diff --git a/pkg/dcgmexporter/types.go b/pkg/dcgmexporter/types.go
index 1699cc5b..cae5a2e4 100644
--- a/pkg/dcgmexporter/types.go
+++ b/pkg/dcgmexporter/types.go
@@ -53,9 +53,9 @@ const (
 )
 
 type DeviceOptions struct {
-	Flex             bool  // If true, then monitor all GPUs if MIG mode is disabled or all GPU instances if MIG is enabled.
-	GpuRange         []int // The indices of each GPU to monitor, or -1 to monitor all
-	GpuInstanceRange []int // The indices of each GPU instance to monitor, or -1 to monitor all
+	Flex       bool  // If true, then monitor all GPUs if MIG mode is disabled or all GPU instances if MIG is enabled.
+	MajorRange []int // The indices of each GPU/NvSwitch to monitor, or -1 to monitor all
+	MinorRange []int // The indices of each GPUInstance/NvLink to monitor, or -1 to monitor all
 }
 
 type Config struct {
@@ -68,7 +68,8 @@ type Config struct {
 	UseOldNamespace     bool
 	UseRemoteHE         bool
 	RemoteHEInfo        string
-	Devices             DeviceOptions
+	GPUDevices          DeviceOptions
+	SwitchDevices       DeviceOptions
 	NoHostname          bool
 	UseFakeGpus         bool
 	ConfigMapData       string
@@ -83,12 +84,15 @@ type Transform interface {
 type MetricsPipeline struct {
 	config *Config
 
-	transformations  []Transform
-	metricsFormat    *template.Template
-	migMetricsFormat *template.Template
+	transformations     []Transform
+	migMetricsFormat    *template.Template
+	switchMetricsFormat *template.Template
+	linkMetricsFormat   *template.Template
 
-	counters     []Counter
-	gpuCollector *DCGMCollector
+	counters        []Counter
+	gpuCollector    *DCGMCollector
+	switchCollector *DCGMCollector
+	linkCollector   *DCGMCollector
 }
 
 type DCGMCollector struct {