diff --git a/cmd/nfd-topology-updater/main.go b/cmd/nfd-topology-updater/main.go index 89fe8c57ec..62d3afcf9b 100644 --- a/cmd/nfd-topology-updater/main.go +++ b/cmd/nfd-topology-updater/main.go @@ -29,7 +29,6 @@ import ( topology "sigs.k8s.io/node-feature-discovery/pkg/nfd-topology-updater" "sigs.k8s.io/node-feature-discovery/pkg/resourcemonitor" - "sigs.k8s.io/node-feature-discovery/pkg/utils" "sigs.k8s.io/node-feature-discovery/pkg/utils/hostpath" "sigs.k8s.io/node-feature-discovery/pkg/version" ) @@ -52,9 +51,6 @@ func main() { klog.InfoS("version not set! Set -ldflags \"-X sigs.k8s.io/node-feature-discovery/pkg/version.version=`git describe --tags --dirty --always --match 'v*'`\" during build or run.") } - // Plug klog into grpc logging infrastructure - utils.ConfigureGrpcKlog() - // Get new TopologyUpdater instance instance, err := topology.NewTopologyUpdater(*args, *resourcemonitorArgs) if err != nil { @@ -111,10 +107,8 @@ func initFlags(flagset *flag.FlagSet) (*topology.Args, *resourcemonitor.Args) { "Do not create or update NodeResourceTopology objects.") flagset.StringVar(&args.KubeConfigFile, "kubeconfig", "", "Kube config file.") - flagset.IntVar(&args.MetricsPort, "metrics", 8081, - "Port on which to expose metrics.") - flagset.IntVar(&args.GrpcHealthPort, "grpc-health", 8082, - "Port on which to expose the grpc health endpoint.") + flagset.IntVar(&args.Port, "port", 8080, + "Port on which metrics and healthz endpoints are served.") flagset.DurationVar(&resourcemonitorArgs.SleepInterval, "sleep-interval", time.Duration(60)*time.Second, "Time to sleep between CR updates. zero means no CR updates on interval basis. 
[Default: 60s]") flagset.StringVar(&resourcemonitorArgs.Namespace, "watch-namespace", "*", diff --git a/deployment/base/topologyupdater-daemonset/topologyupdater-daemonset.yaml b/deployment/base/topologyupdater-daemonset/topologyupdater-daemonset.yaml index 5d11e9b157..27d804b54c 100644 --- a/deployment/base/topologyupdater-daemonset/topologyupdater-daemonset.yaml +++ b/deployment/base/topologyupdater-daemonset/topologyupdater-daemonset.yaml @@ -20,13 +20,15 @@ spec: image: gcr.io/k8s-staging-nfd/node-feature-discovery:master imagePullPolicy: Always livenessProbe: - grpc: - port: 8082 + httpGet: + path: /healthz + port: http initialDelaySeconds: 10 periodSeconds: 10 readinessProbe: - grpc: - port: 8082 + httpGet: + path: /healthz + port: http initialDelaySeconds: 5 periodSeconds: 10 failureThreshold: 10 @@ -41,5 +43,5 @@ spec: cpu: 50m memory: 40Mi ports: - - name: metrics - containerPort: 8081 + - name: http + containerPort: 8080 diff --git a/deployment/helm/node-feature-discovery/templates/topologyupdater.yaml b/deployment/helm/node-feature-discovery/templates/topologyupdater.yaml index 9a466f88ef..c6d233ece7 100644 --- a/deployment/helm/node-feature-discovery/templates/topologyupdater.yaml +++ b/deployment/helm/node-feature-discovery/templates/topologyupdater.yaml @@ -45,8 +45,9 @@ spec: image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: "{{ .Values.image.pullPolicy }}" livenessProbe: - grpc: - port: {{ .Values.topologyUpdater.healthPort | default "8082" }} + httpGet: + path: /healthz + port: http {{- with .Values.topologyUpdater.livenessProbe.initialDelaySeconds }} initialDelaySeconds: {{ . }} {{- end }} @@ -60,8 +61,9 @@ spec: timeoutSeconds: {{ . }} {{- end }} readinessProbe: - grpc: - port: {{ .Values.topologyUpdater.healthPort | default "8082" }} + httpGet: + path: /healthz + port: http {{- with .Values.topologyUpdater.readinessProbe.initialDelaySeconds }} initialDelaySeconds: {{ . 
}} {{- end }} @@ -113,16 +115,13 @@ spec: # Disable kubelet state tracking by giving an empty path - "-kubelet-state-dir=" {{- end }} - - "-metrics={{ .Values.topologyUpdater.metricsPort | default "8081"}}" - - "-grpc-health={{ .Values.topologyUpdater.healthPort | default "8082" }}" + - "-port={{ .Values.topologyUpdater.port | default "8080"}}" {{- with .Values.topologyUpdater.extraArgs }} {{- toYaml . | nindent 10 }} {{- end }} ports: - - containerPort: {{ .Values.topologyUpdater.metricsPort | default "8081"}} - name: metrics - - containerPort: {{ .Values.topologyUpdater.healthPort | default "8082" }} - name: health + - containerPort: {{ .Values.topologyUpdater.port | default "8080"}} + name: http volumeMounts: {{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }} - name: kubelet-config diff --git a/deployment/helm/node-feature-discovery/values.yaml b/deployment/helm/node-feature-discovery/values.yaml index 68440f9c6e..b46cedb600 100644 --- a/deployment/helm/node-feature-discovery/values.yaml +++ b/deployment/helm/node-feature-discovery/values.yaml @@ -511,8 +511,7 @@ topologyUpdater: rbac: create: true - metricsPort: 8081 - healthPort: 8082 + port: 8080 kubeletConfigPath: kubeletPodResourcesSockPath: updateInterval: 60s @@ -526,17 +525,13 @@ topologyUpdater: drop: [ "ALL" ] readOnlyRootFilesystem: true runAsUser: 0 - + livenessProbe: - grpc: - port: 8082 initialDelaySeconds: 10 # failureThreshold: 3 # periodSeconds: 10 # timeoutSeconds: 1 readinessProbe: - grpc: - port: 8082 initialDelaySeconds: 5 failureThreshold: 10 # periodSeconds: 10 diff --git a/docs/deployment/helm.md b/docs/deployment/helm.md index cdcf4ca720..2033cdd9bf 100644 --- a/docs/deployment/helm.md +++ b/docs/deployment/helm.md @@ -274,8 +274,7 @@ API's you need to install the prometheus operator in your cluster. 
| `topologyUpdater.serviceAccount.annotations` | dict | {} | Annotations to add to the service account for topology updater | | `topologyUpdater.serviceAccount.name` | string | | The name of the service account for topology updater to use. If not set and create is true, a name is generated using the fullname template and `-topology-updater` suffix | | `topologyUpdater.rbac.create` | bool | true | Specifies whether to create [RBAC][rbac] configuration for topology updater | -| `topologyUpdater.metricsPort` | integer | 8081 | Port on which to expose prometheus metrics. **DEPRECATED**: will be replaced by `topologyUpdater.port` in NFD v0.18. | -| `topologyUpdater.healthPort` | integer | 8082 | Port on which to expose the grpc health endpoint, will be also used for the probes. **DEPRECATED**: will be replaced by `topologyUpdater.port` in NFD v0.18. | +| `topologyUpdater.port` | integer | 8080 | Port on which the HTTP server exposes the metrics and healthz endpoints; also used by the liveness and readiness probes. | | `topologyUpdater.kubeletConfigPath` | string | "" | Specifies the kubelet config host path | | `topologyUpdater.kubeletPodResourcesSockPath` | string | "" | Specifies the kubelet sock path to read pod resources | | `topologyUpdater.updateInterval` | string | 60s | Time to sleep between CR updates. Non-positive value implies no CR update. 
| diff --git a/pkg/nfd-topology-updater/nfd-topology-updater.go b/pkg/nfd-topology-updater/nfd-topology-updater.go index e65edbea2b..d99bc61bd6 100644 --- a/pkg/nfd-topology-updater/nfd-topology-updater.go +++ b/pkg/nfd-topology-updater/nfd-topology-updater.go @@ -18,16 +18,13 @@ package nfdtopologyupdater import ( "fmt" - "net" + "net/http" "net/url" "os" "path/filepath" "golang.org/x/net/context" - "google.golang.org/grpc" - "google.golang.org/grpc/health" - "google.golang.org/grpc/health/grpc_health_v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -37,6 +34,8 @@ import ( "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2" topologyclientset "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/generated/clientset/versioned" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" "sigs.k8s.io/node-feature-discovery/pkg/nfd-topology-updater/kubeletnotifier" "sigs.k8s.io/node-feature-discovery/pkg/podres" "sigs.k8s.io/node-feature-discovery/pkg/resourcemonitor" @@ -56,13 +55,12 @@ const ( // Args are the command line arguments type Args struct { - MetricsPort int + Port int NoPublish bool Oneshot bool KubeConfigFile string ConfigFile string KubeletStateDir string - GrpcHealthPort int Klog map[string]*utils.KlogFlagVal } @@ -90,7 +88,6 @@ type nfdTopologyUpdater struct { ownerRefs []metav1.OwnerReference k8sClient k8sclient.Interface kubeletConfigFunc func() (*kubeletconfigv1beta1.KubeletConfiguration, error) - healthServer *grpc.Server } // NewTopologyUpdater creates a new NfdTopologyUpdater instance. 
@@ -134,27 +131,9 @@ func (w *nfdTopologyUpdater) detectTopologyPolicyAndScope() (string, string, err return klConfig.TopologyManagerPolicy, klConfig.TopologyManagerScope, nil } -func (w *nfdTopologyUpdater) startGrpcHealthServer(errChan chan<- error) error { - lis, err := net.Listen("tcp", fmt.Sprintf(":%d", w.args.GrpcHealthPort)) - if err != nil { - return fmt.Errorf("failed to listen: %w", err) - } - - s := grpc.NewServer() - grpc_health_v1.RegisterHealthServer(s, health.NewServer()) - klog.InfoS("gRPC health server serving", "port", w.args.GrpcHealthPort) - - go func() { - defer func() { - lis.Close() - }() - if err := s.Serve(lis); err != nil { - errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err) - } - klog.InfoS("gRPC health server stopped") - }() - w.healthServer = s - return nil +// Healthz is the HTTP handler for the /healthz probe endpoint; it always replies 200 OK. +func (w *nfdTopologyUpdater) Healthz(writer http.ResponseWriter, _ *http.Request) { + writer.WriteHeader(http.StatusOK) } // Run nfdTopologyUpdater. Returns if a fatal error is encountered, or, after @@ -187,15 +166,15 @@ func (w *nfdTopologyUpdater) Run() error { return fmt.Errorf("faild to configure Node Feature Discovery Topology Updater: %w", err) } + httpMux := http.NewServeMux() + // Register to metrics server - if w.args.MetricsPort > 0 { - m := utils.CreateMetricsServer(w.args.MetricsPort, - buildInfo, - scanErrors) - go m.Run() - registerVersion(version.Get()) - defer m.Stop() - } + promRegistry := prometheus.NewRegistry() + promRegistry.MustRegister( + buildInfo, + scanErrors) + httpMux.Handle("/metrics", promhttp.HandlerFor(promRegistry, promhttp.HandlerOpts{})) + registerVersion(version.Get()) var resScan resourcemonitor.ResourcesScanner @@ -215,20 +194,28 @@ func (w *nfdTopologyUpdater) Run() error { return fmt.Errorf("failed to obtain node resource information: %w", err) } - grpcErr := make(chan error) + // Register health probe (at this point we're "ready and live") + httpMux.HandleFunc("/healthz", w.Healthz) - // Start gRPC server for liveness probe (at this point we're "live") - if w.args.GrpcHealthPort != 0 { - if err := 
w.startGrpcHealthServer(grpcErr); err != nil { - return fmt.Errorf("failed to start gRPC health server: %w", err) - } - } + // Start HTTP server + httpServer := http.Server{Addr: fmt.Sprintf(":%d", w.args.Port), Handler: httpMux} + // Buffered so that the server goroutine never blocks once Run has returned + httpErr := make(chan error, 1) + go func() { + klog.InfoS("http server starting", "port", httpServer.Addr) + err := httpServer.ListenAndServe() + klog.InfoS("http server stopped", "exitCode", err) + if err != http.ErrServerClosed { + httpErr <- err + } + }() + defer httpServer.Close() for { select { - case err := <-grpcErr: - return fmt.Errorf("error in serving gRPC: %w", err) + case err := <-httpErr: + return fmt.Errorf("error in serving http: %w", err) case info := <-w.eventSource: klog.V(4).InfoS("event received, scanning...", "event", info.Event) scanResponse, err := resScan.Scan() @@ -257,9 +244,6 @@ func (w *nfdTopologyUpdater) Run() error { case <-w.stop: klog.InfoS("shutting down nfd-topology-updater") - if w.healthServer != nil { - w.healthServer.GracefulStop() - } return nil } }