diff --git a/cmd/nfd-master/main.go b/cmd/nfd-master/main.go index 61078e3859..418008e563 100644 --- a/cmd/nfd-master/main.go +++ b/cmd/nfd-master/main.go @@ -89,9 +89,6 @@ func main() { klog.InfoS("version not set! Set -ldflags \"-X sigs.k8s.io/node-feature-discovery/pkg/version.version=`git describe --tags --dirty --always`\" during build or run.") } - // Plug klog into grpc logging infrastructure - utils.ConfigureGrpcKlog() - // Get new NfdMaster instance instance, err := master.NewNfdMaster(master.WithArgs(args)) if err != nil { @@ -114,10 +111,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs) "Config file to use.") flagset.StringVar(&args.Kubeconfig, "kubeconfig", "", "Kubeconfig to use") - flagset.IntVar(&args.MetricsPort, "metrics", 8081, - "Port on which to expose metrics.") - flagset.IntVar(&args.GrpcHealthPort, "grpc-health", 8082, - "Port on which to expose the grpc health endpoint.") + flagset.IntVar(&args.Port, "port", 8080, + "Port on which metrics and healthz endpoints are served") flagset.BoolVar(&args.Prune, "prune", false, "Prune all NFD related attributes from all nodes of the cluster and exit.") flagset.StringVar(&args.Options, "options", "", diff --git a/deployment/base/master/master-deployment.yaml b/deployment/base/master/master-deployment.yaml index e0b659d016..20786a5902 100644 --- a/deployment/base/master/master-deployment.yaml +++ b/deployment/base/master/master-deployment.yaml @@ -29,18 +29,20 @@ spec: cpu: 100m memory: 128Mi livenessProbe: - grpc: - port: 8082 + httpGet: + path: /healthz + port: http initialDelaySeconds: 10 periodSeconds: 10 readinessProbe: - grpc: - port: 8082 + httpGet: + path: /healthz + port: http initialDelaySeconds: 5 periodSeconds: 10 failureThreshold: 10 command: - "nfd-master" ports: - - name: metrics - containerPort: 8081 + - name: http + containerPort: 8080 diff --git a/deployment/helm/node-feature-discovery/templates/master.yaml 
b/deployment/helm/node-feature-discovery/templates/master.yaml index 4795fc2cb4..38817d3165 100644 --- a/deployment/helm/node-feature-discovery/templates/master.yaml +++ b/deployment/helm/node-feature-discovery/templates/master.yaml @@ -48,8 +48,9 @@ spec: image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.image.pullPolicy }} livenessProbe: - grpc: - port: {{ .Values.master.healthPort | default "8082" }} + httpGet: + path: /healthz + port: http {{- with .Values.master.livenessProbe.initialDelaySeconds }} initialDelaySeconds: {{ . }} {{- end }} @@ -63,8 +64,9 @@ spec: timeoutSeconds: {{ . }} {{- end }} readinessProbe: - grpc: - port: {{ .Values.master.healthPort | default "8082" }} + httpGet: + path: /healthz + port: http {{- with .Values.master.readinessProbe.initialDelaySeconds }} initialDelaySeconds: {{ . }} {{- end }} @@ -82,11 +84,7 @@ spec: {{- end }} ports: - containerPort: {{ .Values.master.port | default "8080" }} - name: grpc - - containerPort: {{ .Values.master.metricsPort | default "8081" }} - name: metrics - - containerPort: {{ .Values.master.healthPort | default "8082" }} - name: health + name: http env: - name: NODE_NAME valueFrom: @@ -129,8 +127,7 @@ spec: {{- range $key, $value := .Values.featureGates }} - "-feature-gates={{ $key }}={{ $value }}" {{- end }} - - "-metrics={{ .Values.master.metricsPort | default "8081" }}" - - "-grpc-health={{ .Values.master.healthPort | default "8082" }}" + - "-port={{ .Values.master.port | default "8080" }}" {{- with .Values.master.extraArgs }} {{- toYaml . 
| nindent 12 }} {{- end }} diff --git a/deployment/helm/node-feature-discovery/values.yaml b/deployment/helm/node-feature-discovery/values.yaml index 5ff0df68b5..dee20b5be7 100644 --- a/deployment/helm/node-feature-discovery/values.yaml +++ b/deployment/helm/node-feature-discovery/values.yaml @@ -68,8 +68,7 @@ master: # retryPeriod: 2s # nfdApiParallelism: 10 ### - metricsPort: 8081 - healthPort: 8082 + port: 8080 instance: featureApi: resyncPeriod: @@ -153,17 +152,13 @@ master: - key: "node-role.kubernetes.io/control-plane" operator: In values: [""] - + livenessProbe: - grpc: - port: 8082 initialDelaySeconds: 10 # failureThreshold: 3 # periodSeconds: 10 # timeoutSeconds: 1 readinessProbe: - grpc: - port: 8082 initialDelaySeconds: 5 failureThreshold: 10 # periodSeconds: 10 diff --git a/docs/deployment/helm.md b/docs/deployment/helm.md index a469c41b11..e10a9c2a5c 100644 --- a/docs/deployment/helm.md +++ b/docs/deployment/helm.md @@ -177,8 +177,7 @@ API's you need to install the prometheus operator in your cluster. | `master.*` | dict | | NFD master deployment configuration | | `master.enable` | bool | true | Specifies whether nfd-master should be deployed | | `master.hostNetwork` | bool | false | Specifies whether to enable or disable running the container in the host's network namespace | -| `master.metricsPort` | integer | 8081 | Port on which to expose metrics from components to prometheus operator | -| `master.healthPort` | integer | 8082 | Port on which to expose the grpc health endpoint, will be also used for the probes | +| `master.port` | integer | 8080 | Port on which to serve http for metrics and healthz endpoints. | | `master.instance` | string | | Instance name. Used to separate annotation namespaces for multiple parallel deployments | | `master.resyncPeriod` | string | | NFD API controller resync period. 
| | `master.extraLabelNs` | array | [] | List of allowed extra label namespaces | diff --git a/pkg/nfd-master/nfd-master.go b/pkg/nfd-master/nfd-master.go index 822ed8ffd4..674b04db55 100644 --- a/pkg/nfd-master/nfd-master.go +++ b/pkg/nfd-master/nfd-master.go @@ -20,7 +20,7 @@ import ( "encoding/json" "fmt" "maps" - "net" + "net/http" "os" "path" "path/filepath" @@ -31,10 +31,9 @@ import ( "time" "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" "golang.org/x/net/context" - "google.golang.org/grpc" - "google.golang.org/grpc/health" - "google.golang.org/grpc/health/grpc_health_v1" corev1 "k8s.io/api/core/v1" apiequality "k8s.io/apimachinery/pkg/api/equality" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -117,18 +116,14 @@ type ConfigOverrideArgs struct { // Args holds command line arguments type Args struct { - ConfigFile string - Instance string - Klog map[string]*utils.KlogFlagVal - Kubeconfig string - Port int - // GrpcHealthPort is only needed to avoid races between tests (by skipping the health server). - // Could be removed when gRPC labler service is dropped (when nfd-worker tests stop running nfd-master). 
- GrpcHealthPort int + ConfigFile string + Instance string + Klog map[string]*utils.KlogFlagVal + Kubeconfig string + Port int Prune bool Options string EnableLeaderElection bool - MetricsPort int Overrides ConfigOverrideArgs } @@ -141,7 +136,6 @@ type deniedNs struct { type NfdMaster interface { Run() error Stop() - WaitForReady(time.Duration) bool } type nfdMaster struct { @@ -151,10 +145,7 @@ type nfdMaster struct { namespace string nodeName string configFilePath string - server *grpc.Server - healthServer *grpc.Server stop chan struct{} - ready chan struct{} kubeconfig *restclient.Config k8sClient k8sclient.Interface nfdClient nfdclientset.Interface @@ -168,7 +159,6 @@ func NewNfdMaster(opts ...NfdMasterOption) (NfdMaster, error) { nfd := &nfdMaster{ nodeName: utils.NodeName(), namespace: utils.GetKubernetesNamespace(), - ready: make(chan struct{}), stop: make(chan struct{}), } @@ -301,22 +291,22 @@ func (m *nfdMaster) Run() error { } } + httpMux := http.NewServeMux() + // Register to metrics server - if m.args.MetricsPort > 0 { - m := utils.CreateMetricsServer(m.args.MetricsPort, - buildInfo, - nodeUpdateRequests, - nodeUpdates, - nodeUpdateFailures, - nodeLabelsRejected, - nodeERsRejected, - nodeTaintsRejected, - nfrProcessingTime, - nfrProcessingErrors) - go m.Run() - registerVersion(version.Get()) - defer m.Stop() - } + promRegistry := prometheus.NewRegistry() + promRegistry.MustRegister( + buildInfo, + nodeUpdateRequests, + nodeUpdates, + nodeUpdateFailures, + nodeLabelsRejected, + nodeERsRejected, + nodeTaintsRejected, + nfrProcessingTime, + nfrProcessingErrors) + httpMux.Handle("/metrics", promhttp.HandlerFor(promRegistry, promhttp.HandlerOpts{})) + registerVersion(version.Get()) // Run updater that handles events from the nfd CRD API. 
if m.nfdController != nil { @@ -327,60 +317,29 @@ func (m *nfdMaster) Run() error { } } - // Start gRPC server for liveness probe (at this point we're "live") - grpcErr := make(chan error) - if m.args.GrpcHealthPort != 0 { - if err := m.startGrpcHealthServer(grpcErr); err != nil { - return fmt.Errorf("failed to start gRPC health server: %w", err) - } - } - - // Notify that we're ready to accept connections - close(m.ready) - - // NFD-Master main event loop - for { - select { - case err := <-grpcErr: - return fmt.Errorf("error in serving gRPC: %w", err) - - case <-m.stop: - klog.InfoS("shutting down nfd-master") - return nil - } - } -} - -// startGrpcHealthServer starts a gRPC health server for Kubernetes readiness/liveness probes. -// TODO: improve status checking e.g. with watchdog in the main event loop and -// cheking that node updater pool is alive. -func (m *nfdMaster) startGrpcHealthServer(errChan chan<- error) error { - lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.GrpcHealthPort)) - if err != nil { - return fmt.Errorf("failed to listen: %w", err) - } - - s := grpc.NewServer() - grpc_health_v1.RegisterHealthServer(s, health.NewServer()) - klog.InfoS("gRPC health server serving", "port", m.args.GrpcHealthPort) + // Register health probe (at this point we're "ready and live") + httpMux.HandleFunc("/healthz", m.Healthz) + // Start HTTP server + httpServer := http.Server{Addr: fmt.Sprintf(":%d", m.args.Port), Handler: httpMux} go func() { - defer func() { - lis.Close() - }() - if err := s.Serve(lis); err != nil { - errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err) - } - klog.InfoS("gRPC health server stopped") + klog.InfoS("http server starting", "port", httpServer.Addr) + klog.InfoS("http server stopped", "exitCode", httpServer.ListenAndServe()) }() - m.healthServer = s + defer httpServer.Close() + + <-m.stop + klog.InfoS("shutting down nfd-master") return nil } +func (m *nfdMaster) Healthz(writer http.ResponseWriter, _ 
*http.Request) { + writer.WriteHeader(http.StatusOK) +} + // nfdAPIUpdateHandler handles events from the nfd API controller. func (m *nfdMaster) nfdAPIUpdateHandler() { - // We want to unconditionally update all nodes at startup if gRPC is - // disabled (i.e. NodeFeature API is enabled) + // We want to unconditionally update all nodes at startup updateAll := true updateNodes := make(map[string]struct{}) nodeFeatureGroup := make(map[string]struct{}) @@ -434,13 +393,6 @@ func (m *nfdMaster) nfdAPIUpdateHandler() { // Stop NfdMaster func (m *nfdMaster) Stop() { - if m.server != nil { - m.server.GracefulStop() - } - if m.healthServer != nil { - m.healthServer.GracefulStop() - } - if m.nfdController != nil { m.nfdController.stop() } @@ -450,16 +402,6 @@ func (m *nfdMaster) Stop() { close(m.stop) } -// Wait until NfdMaster is able able to accept connections. -func (m *nfdMaster) WaitForReady(timeout time.Duration) bool { - select { - case <-m.ready: - return true - case <-time.After(timeout): - } - return false -} - // Prune erases all NFD related properties from the node objects of the cluster. func (m *nfdMaster) prune() error { if m.config.NoPublish {