Skip to content

Commit

Permalink
Merge branch 'formatting-fix' into 'master'
Browse files Browse the repository at this point in the history
Fixed grouping of prometheus metrics

See merge request nvidia/container-toolkit/gpu-monitoring-tools!77
  • Loading branch information
glowkey committed Jul 27, 2021
2 parents d5b5675 + e2b1019 commit 5b9f6d1
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 55 deletions.
5 changes: 2 additions & 3 deletions pkg/gpu_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceI
uuid = "uuid"
}
m := Metric{
Name: c[i].FieldName,
Value: v,
Counter: &c[i],
Value: v,

UUID: uuid,
GPU: fmt.Sprintf("%d", d.GPU),
Expand All @@ -116,7 +116,6 @@ func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceI
}

return metrics

}

func ToString(value dcgm.FieldValue_v1) string {
Expand Down
79 changes: 30 additions & 49 deletions pkg/pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,6 @@ func NewMetricsPipeline(c *Config) (*MetricsPipeline, func(), error) {
return nil, func() {}, err
}

// Note this is an optimisation, we don't need to format these
// at every pipeline run.
countersText, err := FormatCounters(counters)
if err != nil {
return nil, func() {}, err
}

gpuCollector, cleanup, err := NewDCGMCollector(counters, c)
if err != nil {
return nil, func() {}, err
Expand All @@ -54,8 +47,8 @@ func NewMetricsPipeline(c *Config) (*MetricsPipeline, func(), error) {

metricsFormat: template.Must(template.New("metrics").Parse(metricsFormat)),
migMetricsFormat: template.Must(template.New("migMetrics").Parse(migMetricsFormat)),
countersText: countersText,

counters: counters,
gpuCollector: gpuCollector,
transformations: transformations,
}, func() {
Expand All @@ -65,18 +58,13 @@ func NewMetricsPipeline(c *Config) (*MetricsPipeline, func(), error) {

// Primarily for testing; the caller is expected to clean up the collector.
func NewMetricsPipelineWithGPUCollector(c *Config, collector *DCGMCollector) (*MetricsPipeline, func(), error) {
countersText, err := FormatCounters(collector.Counters)
if err != nil {
return nil, func() {}, err
}

return &MetricsPipeline{
config: c,

metricsFormat: template.Must(template.New("metrics").Parse(metricsFormat)),
migMetricsFormat: template.Must(template.New("migMetrics").Parse(migMetricsFormat)),
countersText: countersText,

counters: collector.Counters,
gpuCollector: collector,
}, func() {}, nil
}
Expand Down Expand Up @@ -125,7 +113,7 @@ func (m *MetricsPipeline) run() (string, error) {
}
}

formated, err := FormatMetrics(m.countersText, m.migMetricsFormat, metrics)
formated, err := FormatMetrics(m.migMetricsFormat, metrics)
if err != nil {
return "", fmt.Errorf("Failed to format metrics with error: %v", err)
}
Expand All @@ -138,64 +126,57 @@ func (m *MetricsPipeline) run() (string, error) {
* ```
* # HELP FIELD_ID HELP_MSG
* # TYPE FIELD_ID PROM_TYPE
* ...
* FIELD_ID{gpu="GPU_INDEX_0",uuid="GPU_UUID", attr...} VALUE
* ...
* FIELD_ID{gpu="GPU_INDEX_N",uuid="GPU_UUID", attr...} VALUE
* ...
* ```
*
* The expectation is that the template will be given a map from each
* counter to the list of metrics collected for that counter across all devices.
*
*/

// countersFormat is the text/template source parsed by FormatCounters. For
// every element of the slice it is executed with, it emits one "# HELP" line
// and one "# TYPE" line built from that element's FieldName, Help and
// PromType fields.
var countersFormat = `{{- range $c := . -}}
# HELP {{ $c.FieldName }} {{ $c.Help }}
# TYPE {{ $c.FieldName }} {{ $c.PromType }}
{{ end }}`

// FormatCounters renders the "# HELP" / "# TYPE" header lines for the given
// counters by executing the countersFormat template over c.
//
// It returns the rendered text, or an empty string and the error if template
// execution fails.
func FormatCounters(c []Counter) (string, error) {
var res bytes.Buffer

// The template is parsed on every call; template.Must panics if
// countersFormat does not parse.
t := template.Must(template.New("counters").Parse(countersFormat))
if err := t.Execute(&res, c); err != nil {
return "", err
}

return res.String(), nil
}

var metricsFormat = `
{{ range $dev := . }}{{ range $val := $dev }}
{{ $val.Name }}{gpu="{{ $val.GPU }}",{{ $val.UUID }}="{{ $val.GPUUUID }}",device="{{ $val.GPUDevice }}",modelName="{{ $val.GPUModelName }}"
{{- range $counter, $metrics := . -}}
# HELP {{ $counter.FieldName }} {{ $counter.Help }}
# TYPE {{ $counter.FieldName }} {{ $counter.PromType }}
{{- range $metric := $metrics }}
{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"
{{- range $k, $v := $val.Attributes -}}
{{- range $k, $v := $metric.Attributes -}}
,{{ $k }}="{{ $v }}"
{{- end -}}
} {{ $val.Value }}
} {{ $metric.Value -}}
{{- end }}
{{ end }}`

var migMetricsFormat = `
{{ range $dev := . }}{{ range $val := $dev }}
{{ $val.Name }}{gpu="{{ $val.GPU }}",{{ $val.UUID }}="{{ $val.GPUUUID }}",device="{{ $val.GPUDevice }}",modelName="{{ $val.GPUModelName }}"{{if $val.MigProfile}},GPU_I_PROFILE="{{ $val.MigProfile }}",GPU_I_ID="{{ $val.GPUInstanceID }}"{{end}}{{if $val.Hostname }},Hostname="{{ $val.Hostname }}"{{end}}
{{- range $counter, $metrics := . -}}
# HELP {{ $counter.FieldName }} {{ $counter.Help }}
# TYPE {{ $counter.FieldName }} {{ $counter.PromType }}
{{- range $metric := $metrics }}
{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}}
{{- range $k, $v := $val.Attributes -}}
{{- range $k, $v := $metric.Attributes -}}
,{{ $k }}="{{ $v }}"
{{- end -}}
} {{ $val.Value }}
} {{ $metric.Value -}}
{{- end }}
{{ end }}`

// Template is passed here so that it isn't recompiled at each iteration
func FormatMetrics(countersText string, t *template.Template, m [][]Metric) (string, error) {
var res bytes.Buffer
func FormatMetrics(t *template.Template, m [][]Metric) (string, error) {
// Group metrics by counter instead of by device
groupedMetrics := make(map[*Counter][]Metric)
for _, deviceMetrics := range m {
for _, deviceMetric := range deviceMetrics {
groupedMetrics[deviceMetric.Counter] = append(groupedMetrics[deviceMetric.Counter], deviceMetric)
}
}

if err := t.Execute(&res, m); err != nil {
// Format metrics
var res bytes.Buffer
if err := t.Execute(&res, groupedMetrics); err != nil {
return "", err
}

return countersText + res.String(), nil
return res.String(), nil
}
6 changes: 3 additions & 3 deletions pkg/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,8 @@ type MetricsPipeline struct {
transformations []Transform
metricsFormat *template.Template
migMetricsFormat *template.Template
countersText string

counters []Counter
gpuCollector *DCGMCollector
}

Expand All @@ -110,8 +110,8 @@ type Counter struct {
}

type Metric struct {
Name string
Value string
Counter *Counter
Value string

GPU string
GPUUUID string
Expand Down

0 comments on commit 5b9f6d1

Please sign in to comment.