Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

nfd-topology-updater: serve metrics and healthz on the same port #1947

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 2 additions & 8 deletions cmd/nfd-topology-updater/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ import (

topology "sigs.k8s.io/node-feature-discovery/pkg/nfd-topology-updater"
"sigs.k8s.io/node-feature-discovery/pkg/resourcemonitor"
"sigs.k8s.io/node-feature-discovery/pkg/utils"
"sigs.k8s.io/node-feature-discovery/pkg/utils/hostpath"
"sigs.k8s.io/node-feature-discovery/pkg/version"
)
Expand All @@ -52,9 +51,6 @@ func main() {
klog.InfoS("version not set! Set -ldflags \"-X sigs.k8s.io/node-feature-discovery/pkg/version.version=`git describe --tags --dirty --always --match 'v*'`\" during build or run.")
}

// Plug klog into grpc logging infrastructure
utils.ConfigureGrpcKlog()

// Get new TopologyUpdater instance
instance, err := topology.NewTopologyUpdater(*args, *resourcemonitorArgs)
if err != nil {
Expand Down Expand Up @@ -111,10 +107,8 @@ func initFlags(flagset *flag.FlagSet) (*topology.Args, *resourcemonitor.Args) {
"Do not create or update NodeResourceTopology objects.")
flagset.StringVar(&args.KubeConfigFile, "kubeconfig", "",
"Kube config file.")
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
"Port on which to expose metrics.")
flagset.IntVar(&args.GrpcHealthPort, "grpc-health", 8082,
"Port on which to expose the grpc health endpoint.")
flagset.IntVar(&args.Port, "port", 8080,
"Port on which to metrics and healthz endpoints are served")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"Port on which to metrics and healthz endpoints are served")
"Port on which metrics and healthz endpoints are served")

nit

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and it would be nice to fix it on #1946 too

flagset.DurationVar(&resourcemonitorArgs.SleepInterval, "sleep-interval", time.Duration(60)*time.Second,
"Time to sleep between CR updates. zero means no CR updates on interval basis. [Default: 60s]")
flagset.StringVar(&resourcemonitorArgs.Namespace, "watch-namespace", "*",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@ spec:
image: gcr.io/k8s-staging-nfd/node-feature-discovery:master
imagePullPolicy: Always
livenessProbe:
grpc:
port: 8082
httpGet:
path: /healthz
port: http
initialDelaySeconds: 10
periodSeconds: 10
readinessProbe:
grpc:
port: 8082
httpGet:
path: /healthz
port: http
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 10
Expand All @@ -41,5 +43,5 @@ spec:
cpu: 50m
memory: 40Mi
ports:
- name: metrics
containerPort: 8081
- name: http
containerPort: 8080
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,9 @@ spec:
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: "{{ .Values.image.pullPolicy }}"
livenessProbe:
grpc:
port: {{ .Values.topologyUpdater.healthPort | default "8082" }}
httpGet:
path: /healthz
port: http
{{- with .Values.topologyUpdater.livenessProbe.initialDelaySeconds }}
initialDelaySeconds: {{ . }}
{{- end }}
Expand All @@ -60,8 +61,9 @@ spec:
timeoutSeconds: {{ . }}
{{- end }}
readinessProbe:
grpc:
port: {{ .Values.topologyUpdater.healthPort | default "8082" }}
httpGet:
path: /healthz
port: http
{{- with .Values.topologyUpdater.readinessProbe.initialDelaySeconds }}
initialDelaySeconds: {{ . }}
{{- end }}
Expand Down Expand Up @@ -113,16 +115,13 @@ spec:
# Disable kubelet state tracking by giving an empty path
- "-kubelet-state-dir="
{{- end }}
- "-metrics={{ .Values.topologyUpdater.metricsPort | default "8081"}}"
- "-grpc-health={{ .Values.topologyUpdater.healthPort | default "8082" }}"
- "-port={{ .Values.topologyUpdater.port | default "8080"}}"
{{- with .Values.topologyUpdater.extraArgs }}
{{- toYaml . | nindent 10 }}
{{- end }}
ports:
- containerPort: {{ .Values.topologyUpdater.metricsPort | default "8081"}}
name: metrics
- containerPort: {{ .Values.topologyUpdater.healthPort | default "8082" }}
name: health
- containerPort: {{ .Values.topologyUpdater.port | default "8080"}}
name: http
volumeMounts:
{{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }}
- name: kubelet-config
Expand Down
9 changes: 2 additions & 7 deletions deployment/helm/node-feature-discovery/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -511,8 +511,7 @@ topologyUpdater:
rbac:
create: true

metricsPort: 8081
healthPort: 8082
port: 8080
kubeletConfigPath:
kubeletPodResourcesSockPath:
updateInterval: 60s
Expand All @@ -526,17 +525,13 @@ topologyUpdater:
drop: [ "ALL" ]
readOnlyRootFilesystem: true
runAsUser: 0

livenessProbe:
grpc:
port: 8082
initialDelaySeconds: 10
# failureThreshold: 3
# periodSeconds: 10
# timeoutSeconds: 1
readinessProbe:
grpc:
port: 8082
initialDelaySeconds: 5
failureThreshold: 10
# periodSeconds: 10
Expand Down
3 changes: 1 addition & 2 deletions docs/deployment/helm.md
Original file line number Diff line number Diff line change
Expand Up @@ -274,8 +274,7 @@ API's you need to install the prometheus operator in your cluster.
| `topologyUpdater.serviceAccount.annotations` | dict | {} | Annotations to add to the service account for topology updater |
| `topologyUpdater.serviceAccount.name` | string | | The name of the service account for topology updater to use. If not set and create is true, a name is generated using the fullname template and `-topology-updater` suffix |
| `topologyUpdater.rbac.create` | bool | true | Specifies whether to create [RBAC][rbac] configuration for topology updater |
| `topologyUpdater.metricsPort` | integer | 8081 | Port on which to expose prometheus metrics. **DEPRECATED**: will be replaced by `topologyUpdater.port` in NFD v0.18. |
| `topologyUpdater.healthPort` | integer | 8082 | Port on which to expose the grpc health endpoint, will be also used for the probes. **DEPRECATED**: will be replaced by `topologyUpdater.port` in NFD v0.18. |
| `topologyUpdater.port` | integer | 8080 | Port on which to serve http for metrics and healthz endpoints. |
| `topologyUpdater.kubeletConfigPath` | string | "" | Specifies the kubelet config host path |
| `topologyUpdater.kubeletPodResourcesSockPath` | string | "" | Specifies the kubelet sock path to read pod resources |
| `topologyUpdater.updateInterval` | string | 60s | Time to sleep between CR updates. Non-positive value implies no CR update. |
Expand Down
70 changes: 21 additions & 49 deletions pkg/nfd-topology-updater/nfd-topology-updater.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,13 @@ package nfdtopologyupdater

import (
"fmt"
"net"
"net/http"
"net/url"
"os"
"path/filepath"

"golang.org/x/net/context"

"google.golang.org/grpc"
"google.golang.org/grpc/health"
"google.golang.org/grpc/health/grpc_health_v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
Expand All @@ -37,6 +34,7 @@ import (

"github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2"
topologyclientset "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/generated/clientset/versioned"
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/node-feature-discovery/pkg/nfd-topology-updater/kubeletnotifier"
"sigs.k8s.io/node-feature-discovery/pkg/podres"
"sigs.k8s.io/node-feature-discovery/pkg/resourcemonitor"
Expand All @@ -56,13 +54,12 @@ const (

// Args are the command line arguments
type Args struct {
MetricsPort int
Port int
NoPublish bool
Oneshot bool
KubeConfigFile string
ConfigFile string
KubeletStateDir string
GrpcHealthPort int

Klog map[string]*utils.KlogFlagVal
}
Expand Down Expand Up @@ -90,7 +87,6 @@ type nfdTopologyUpdater struct {
ownerRefs []metav1.OwnerReference
k8sClient k8sclient.Interface
kubeletConfigFunc func() (*kubeletconfigv1beta1.KubeletConfiguration, error)
healthServer *grpc.Server
}

// NewTopologyUpdater creates a new NfdTopologyUpdater instance.
Expand Down Expand Up @@ -134,27 +130,8 @@ func (w *nfdTopologyUpdater) detectTopologyPolicyAndScope() (string, string, err
return klConfig.TopologyManagerPolicy, klConfig.TopologyManagerScope, nil
}

func (w *nfdTopologyUpdater) startGrpcHealthServer(errChan chan<- error) error {
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", w.args.GrpcHealthPort))
if err != nil {
return fmt.Errorf("failed to listen: %w", err)
}

s := grpc.NewServer()
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
klog.InfoS("gRPC health server serving", "port", w.args.GrpcHealthPort)

go func() {
defer func() {
lis.Close()
}()
if err := s.Serve(lis); err != nil {
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
}
klog.InfoS("gRPC health server stopped")
}()
w.healthServer = s
return nil
func (w *nfdTopologyUpdater) Healthz(writer http.ResponseWriter, _ *http.Request) {
writer.WriteHeader(http.StatusOK)
}

// Run nfdTopologyUpdater. Returns if a fatal error is encountered, or, after
Expand Down Expand Up @@ -187,15 +164,14 @@ func (w *nfdTopologyUpdater) Run() error {
return fmt.Errorf("faild to configure Node Feature Discovery Topology Updater: %w", err)
}

httpMux := http.NewServeMux()

// Register to metrics server
if w.args.MetricsPort > 0 {
m := utils.CreateMetricsServer(w.args.MetricsPort,
buildInfo,
scanErrors)
go m.Run()
registerVersion(version.Get())
defer m.Stop()
}
promRegistry := prometheus.NewRegistry()
promRegistry.MustRegister(
buildInfo,
scanErrors)
registerVersion(version.Get())

var resScan resourcemonitor.ResourcesScanner

Expand All @@ -215,20 +191,19 @@ func (w *nfdTopologyUpdater) Run() error {
return fmt.Errorf("failed to obtain node resource information: %w", err)
}

grpcErr := make(chan error)
// Register health probe (at this point we're "ready and live")
httpMux.HandleFunc("/healthz", w.Healthz)

// Start gRPC server for liveness probe (at this point we're "live")
if w.args.GrpcHealthPort != 0 {
if err := w.startGrpcHealthServer(grpcErr); err != nil {
return fmt.Errorf("failed to start gRPC health server: %w", err)
}
}
// Start HTTP server
httpServer := http.Server{Addr: fmt.Sprintf(":%d", w.args.Port), Handler: httpMux}
go func() {
klog.InfoS("http server starting", "port", httpServer.Addr)
klog.InfoS("http server stopped", "exitCode", httpServer.ListenAndServe())
}()
defer httpServer.Close()

for {
select {
case err := <-grpcErr:
return fmt.Errorf("error in serving gRPC: %w", err)

case info := <-w.eventSource:
klog.V(4).InfoS("event received, scanning...", "event", info.Event)
scanResponse, err := resScan.Scan()
Expand Down Expand Up @@ -257,9 +232,6 @@ func (w *nfdTopologyUpdater) Run() error {

case <-w.stop:
klog.InfoS("shutting down nfd-topology-updater")
if w.healthServer != nil {
w.healthServer.GracefulStop()
}
return nil
}
}
Expand Down