Skip to content

Commit

Permalink
Expose NVMe Volume Metrics (#2216)
Browse files Browse the repository at this point in the history
* Expose NVMe volume metrics

Signed-off-by: torredil <[email protected]>

* Add unit/e2e tests for NVMe log page metrics

Signed-off-by: torredil <[email protected]>

* Add Service for ebs-csi-node

Signed-off-by: torredil <[email protected]>

---------

Signed-off-by: torredil <[email protected]>
  • Loading branch information
torredil authored Nov 7, 2024
1 parent 2fae5d5 commit 966da33
Show file tree
Hide file tree
Showing 16 changed files with 1,022 additions and 42 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ e2e/single-az: bin/helm bin/ginkgo
TEST_PATH=./tests/e2e/... \
GINKGO_FOCUS="\[ebs-csi-e2e\] \[single-az\]" \
GINKGO_PARALLEL=5 \
HELM_EXTRA_FLAGS="--set=controller.volumeModificationFeature.enabled=true,sidecars.provisioner.additionalArgs[0]='--feature-gates=VolumeAttributesClass=true',sidecars.resizer.additionalArgs[0]='--feature-gates=VolumeAttributesClass=true'" \
HELM_EXTRA_FLAGS="--set=controller.volumeModificationFeature.enabled=true,sidecars.provisioner.additionalArgs[0]='--feature-gates=VolumeAttributesClass=true',sidecars.resizer.additionalArgs[0]='--feature-gates=VolumeAttributesClass=true',node.enableMetrics=true" \
./hack/e2e/run.sh

.PHONY: e2e/multi-az
Expand Down
6 changes: 6 additions & 0 deletions charts/aws-ebs-csi-driver/templates/_node.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,12 @@ spec:
{{- with .Values.node.reservedVolumeAttachments }}
- --reserved-volume-attachments={{ . }}
{{- end }}
{{- if .Values.node.enableMetrics }}
- --http-endpoint=0.0.0.0:3302
{{- end}}
{{- with .Values.node.kubeletPath }}
- --csi-mount-point-prefix={{ . }}
{{- end}}
{{- with .Values.node.volumeAttachLimit }}
- --volume-attach-limit={{ . }}
{{- end }}
Expand Down
18 changes: 18 additions & 0 deletions charts/aws-ebs-csi-driver/templates/metrics.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,21 @@ spec:
interval: {{ .Values.controller.serviceMonitor.interval | default "15s"}}
{{- end }}
{{- end }}
{{- if .Values.node.enableMetrics }}
---
# Service exposing the ebs-csi-node metrics endpoint. Rendered only when
# node.enableMetrics is set.
# The document separator lives inside the conditional, on its own line, and the
# opening action does not trim to its right: otherwise the separator is fused
# onto the apiVersion key, and a bare separator is emitted when metrics are off.
apiVersion: v1
kind: Service
metadata:
  name: ebs-csi-node
  namespace: {{ .Release.Namespace }}
  labels:
    app: ebs-csi-node
spec:
  selector:
    app: ebs-csi-node
  ports:
    - name: metrics
      # Same port the node container is given via --http-endpoint=0.0.0.0:3302.
      port: 3302
      targetPort: 3302
  type: ClusterIP
{{- end }}
14 changes: 9 additions & 5 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,11 +134,6 @@ func main() {
}()
}

if options.HTTPEndpoint != "" {
r := metrics.InitializeRecorder()
r.InitializeMetricsHandler(options.HTTPEndpoint, "/metrics", options.MetricsCertFile, options.MetricsKeyFile)
}

cfg := metadata.MetadataServiceConfig{
EC2MetadataClient: metadata.DefaultEC2MetadataClient,
K8sAPIClient: metadata.DefaultKubernetesAPIClient(options.Kubeconfig),
Expand All @@ -159,6 +154,15 @@ func main() {
md, metadataErr = metadata.NewMetadataService(cfg, region)
}

if options.HTTPEndpoint != "" {
r := metrics.InitializeRecorder()
r.InitializeMetricsHandler(options.HTTPEndpoint, "/metrics", options.MetricsCertFile, options.MetricsKeyFile)

if options.Mode == driver.NodeMode || options.Mode == driver.AllMode {
metrics.InitializeNVME(r, options.CsiMountPointPath, md.GetInstanceID())
}
}

if metadataErr != nil {
klog.ErrorS(metadataErr, "Failed to initialize metadata when it is required")
if options.Mode == driver.ControllerMode {
Expand Down
1 change: 1 addition & 0 deletions deploy/kubernetes/base/node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ spec:
args:
- node
- --endpoint=$(CSI_ENDPOINT)
- --csi-mount-point-prefix=/var/lib/kubelet
- --logging-format=text
- --v=2
env:
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ require (
github.com/kubernetes-csi/external-snapshotter/client/v4 v4.2.0
github.com/onsi/ginkgo/v2 v2.21.0
github.com/onsi/gomega v1.35.0
github.com/prometheus/client_golang v1.20.5
github.com/spf13/pflag v1.0.5
github.com/stretchr/testify v1.9.0
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.56.0
Expand Down Expand Up @@ -99,7 +100,6 @@ require (
github.com/opencontainers/selinux v1.11.1 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_golang v1.20.5 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.60.1 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
Expand Down
3 changes: 3 additions & 0 deletions pkg/driver/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,9 @@ func (d *NodeService) nodePublishVolumeForBlock(req *csi.NodePublishVolumeReques
}

// Create the mount point as a file, since bind-mounting a device node requires the target to be a file.
// The NVMECollector relies on this implementation detail: it discovers block devices by
// parsing /proc/self/mountinfo, and the bind mount created here ensures block devices
// appear in mountinfo even without a filesystem.
klog.V(4).InfoS("NodePublishVolume [block]: making target file", "target", target)
if err = d.mounter.MakeFile(target); err != nil {
if removeErr := os.Remove(target); removeErr != nil {
Expand Down
3 changes: 3 additions & 0 deletions pkg/driver/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ type Options struct {
WindowsHostProcess bool
// LegacyXFSProgs formats XFS volumes with `bigtime=0,inobtcount=0,reflink=0`, so that they can be mounted onto nodes with linux kernel ≤ v5.4. Volumes formatted with this option may experience issues after 2038, and will be unable to use some XFS features (for example, reflinks).
LegacyXFSProgs bool
// CsiMountPointPath is the path prefix under which CSI volumes are expected to be mounted on the node (set by the --csi-mount-point-prefix flag).
CsiMountPointPath string
}

func (o *Options) AddFlags(f *flag.FlagSet) {
Expand Down Expand Up @@ -118,6 +120,7 @@ func (o *Options) AddFlags(f *flag.FlagSet) {
f.IntVar(&o.ReservedVolumeAttachments, "reserved-volume-attachments", -1, "Number of volume attachments reserved for system use. Not used when --volume-attach-limit is specified. The total amount of volume attachments for a node is computed as: <nr. of attachments for corresponding instance type> - <number of NICs, if relevant to the instance type> - <reserved-volume-attachments value>. When -1, the amount of reserved attachments is loaded from instance metadata that captured state at node boot and may include not only system disks but also CSI volumes.")
f.BoolVar(&o.WindowsHostProcess, "windows-host-process", false, "ALPHA: Indicates whether the driver is running in a Windows privileged container")
f.BoolVar(&o.LegacyXFSProgs, "legacy-xfs", false, "Warning: This option will be removed in a future version of EBS CSI Driver. Formats XFS volumes with `bigtime=0,inobtcount=0,reflink=0`, so that they can be mounted onto nodes with linux kernel ≤ v5.4. Volumes formatted with this option may experience issues after 2038, and will be unable to use some XFS features (for example, reflinks).")
f.StringVar(&o.CsiMountPointPath, "csi-mount-point-prefix", "", "A prefix of the mountpoints of all CSI-managed volumes. If this value is non-empty, all volumes mounted to a path beginning with the provided value are assumed to be CSI volumes owned by the EBS CSI Driver and safe to treat as such (for example, by exposing volume metrics).")
}
}

Expand Down
4 changes: 4 additions & 0 deletions pkg/driver/options_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ func TestAddFlags(t *testing.T) {
t.Errorf("error setting legacy-xfs: %v", err)
}

if err := f.Set("csi-mount-point-prefix", "/var/lib/kubelet"); err != nil {
t.Errorf("error setting csi-mount-point-prefix: %v", err)
}

if o.Endpoint != "custom-endpoint" {
t.Errorf("unexpected Endpoint: got %s, want custom-endpoint", o.Endpoint)
}
Expand Down
58 changes: 26 additions & 32 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ import (
"sync"
"time"

"k8s.io/component-base/metrics"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"k8s.io/klog/v2"
)

Expand All @@ -29,7 +30,7 @@ var (
)

type metricRecorder struct {
registry metrics.KubeRegistry
registry *prometheus.Registry
metrics map[string]interface{}
}

Expand All @@ -43,13 +44,18 @@ func Recorder() *metricRecorder {
func InitializeRecorder() *metricRecorder {
once.Do(func() {
r = &metricRecorder{
registry: metrics.NewKubeRegistry(),
registry: prometheus.NewRegistry(),
metrics: make(map[string]interface{}),
}
})
return r
}

// InitializeNVME attaches the NVMe collector to the recorder r by delegating
// to registerNVMECollector, forwarding csiMountPointPath and instanceID
// unchanged.
func InitializeNVME(r *metricRecorder, csiMountPointPath string, instanceID string) {
	registerNVMECollector(r, csiMountPointPath, instanceID)
}

// IncreaseCount increases the counter metric by 1.
func (m *metricRecorder) IncreaseCount(name string, labels map[string]string) {
if m == nil {
Expand All @@ -65,7 +71,7 @@ func (m *metricRecorder) IncreaseCount(name string, labels map[string]string) {
return
}

metricAsCounterVec, ok := metric.(*metrics.CounterVec)
metricAsCounterVec, ok := metric.(*prometheus.CounterVec)
if ok {
metricAsCounterVec.With(labels).Inc()
} else {
Expand All @@ -87,7 +93,7 @@ func (m *metricRecorder) ObserveHistogram(name string, value float64, labels map
return
}

metricAsHistogramVec, ok := metric.(*metrics.HistogramVec)
metricAsHistogramVec, ok := metric.(*prometheus.HistogramVec)
if ok {
metricAsHistogramVec.With(labels).Observe(value)
} else {
Expand All @@ -103,11 +109,7 @@ func (m *metricRecorder) InitializeMetricsHandler(address, path, certFile, keyFi
}

mux := http.NewServeMux()
mux.Handle(path, metrics.HandlerFor(
m.registry,
metrics.HandlerOpts{
ErrorHandling: metrics.ContinueOnError,
}))
mux.Handle(path, promhttp.HandlerFor(m.registry, promhttp.HandlerOpts{ErrorHandling: promhttp.ContinueOnError}))

server := &http.Server{
Addr: address,
Expand Down Expand Up @@ -136,7 +138,14 @@ func (m *metricRecorder) registerHistogramVec(name, help string, labels []string
if _, exists := m.metrics[name]; exists {
return
}
histogram := createHistogramVec(name, help, labels, buckets)
histogram := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: name,
Help: help,
Buckets: buckets,
},
labels,
)
m.metrics[name] = histogram
m.registry.MustRegister(histogram)
}
Expand All @@ -145,30 +154,15 @@ func (m *metricRecorder) registerCounterVec(name, help string, labels []string)
if _, exists := m.metrics[name]; exists {
return
}
counter := createCounterVec(name, help, labels)
m.metrics[name] = counter
m.registry.MustRegister(counter)
}

func createHistogramVec(name, help string, labels []string, buckets []float64) *metrics.HistogramVec {
opts := &metrics.HistogramOpts{
Name: name,
Help: help,
StabilityLevel: metrics.ALPHA,
Buckets: buckets,
}
return metrics.NewHistogramVec(opts, labels)
}

func createCounterVec(name, help string, labels []string) *metrics.CounterVec {
return metrics.NewCounterVec(
&metrics.CounterOpts{
Name: name,
Help: help,
StabilityLevel: metrics.ALPHA,
counter := prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: name,
Help: help,
},
labels,
)
m.metrics[name] = counter
m.registry.MustRegister(counter)
}

func getLabelNames(labels map[string]string) []string {
Expand Down
6 changes: 3 additions & 3 deletions pkg/metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ func TestMetricRecorder(t *testing.T) {
m.IncreaseCount("test_total", map[string]string{"key": "value"})
},
expected: `
# HELP test_total [ALPHA] ebs_csi_aws_com metric
# HELP test_total ebs_csi_aws_com metric
# TYPE test_total counter
test_total{key="value"} 1
`,
Expand All @@ -46,7 +46,7 @@ test_total{key="value"} 1
m.ObserveHistogram("test", 1.5, map[string]string{"key": "value"}, []float64{1, 2, 3})
},
expected: `
# HELP test [ALPHA] ebs_csi_aws_com metric
# HELP test ebs_csi_aws_com metric
# TYPE test histogram
test{key="value",le="1"} 0
test{key="value",le="2"} 1
Expand All @@ -66,7 +66,7 @@ test_count{key="value"} 1
m.IncreaseCount("test_re_register_total", map[string]string{"key": "value2"})
},
expected: `
# HELP test_re_register_total [ALPHA] ebs_csi_aws_com metric
# HELP test_re_register_total ebs_csi_aws_com metric
# TYPE test_re_register_total counter
test_re_register_total{key="value1"} 2
test_re_register_total{key="value2"} 1
Expand Down
Loading

0 comments on commit 966da33

Please sign in to comment.