Skip to content

Commit

Permalink
nfd-master: serve metrics and healthz endpoint on the same port
Browse files Browse the repository at this point in the history
Changes the gRPC health endpoint to plain http. At the same time starts
serving both the metrics and healthz endpoints on a single port.
Replaces the -metrics and -grpc-health command line flags with a single
-port flag.

Changes the Helm and kustomize deployments correspondingly.
  • Loading branch information
marquiz committed Jan 2, 2025
1 parent 00ecb01 commit 2c44e38
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 131 deletions.
9 changes: 2 additions & 7 deletions cmd/nfd-master/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,6 @@ func main() {
klog.InfoS("version not set! Set -ldflags \"-X sigs.k8s.io/node-feature-discovery/pkg/version.version=`git describe --tags --dirty --always --match 'v*'`\" during build or run.")
}

// Plug klog into grpc logging infrastructure
utils.ConfigureGrpcKlog()

// Get new NfdMaster instance
instance, err := master.NewNfdMaster(master.WithArgs(args))
if err != nil {
Expand All @@ -111,10 +108,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs)
"Config file to use.")
flagset.StringVar(&args.Kubeconfig, "kubeconfig", "",
"Kubeconfig to use")
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
"Port on which to expose metrics.")
flagset.IntVar(&args.GrpcHealthPort, "grpc-health", 8082,
"Port on which to expose the grpc health endpoint.")
flagset.IntVar(&args.Port, "port", 8080,
"Port on which to metrics and healthz endpoints are served")
flagset.BoolVar(&args.Prune, "prune", false,
"Prune all NFD related attributes from all nodes of the cluster and exit.")
flagset.StringVar(&args.Options, "options", "",
Expand Down
14 changes: 8 additions & 6 deletions deployment/base/master/master-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,16 @@ spec:
port: 8082
failureThreshold: 30
livenessProbe:
grpc:
port: 8082
httpGet:
path: /healthz
port: http
readinessProbe:
grpc:
port: 8082
httpGet:
path: /healthz
port: http
failureThreshold: 10
command:
- "nfd-master"
ports:
- name: metrics
containerPort: 8081
- name: http
containerPort: 8080
24 changes: 12 additions & 12 deletions deployment/helm/node-feature-discovery/templates/master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ spec:
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
startupProbe:
grpc:
port: {{ .Values.master.healthPort | default "8082" }}
httpGet:
path: /healthz
port: http
{{- with .Values.master.startupProbe.initialDelaySeconds }}
initialDelaySeconds: {{ . }}
{{- end }}
Expand All @@ -63,8 +64,9 @@ spec:
timeoutSeconds: {{ . }}
{{- end }}
livenessProbe:
grpc:
port: {{ .Values.master.healthPort | default "8082" }}
httpGet:
path: /healthz
port: http
{{- with .Values.master.livenessProbe.initialDelaySeconds }}
initialDelaySeconds: {{ . }}
{{- end }}
Expand All @@ -78,8 +80,9 @@ spec:
timeoutSeconds: {{ . }}
{{- end }}
readinessProbe:
grpc:
port: {{ .Values.master.healthPort | default "8082" }}
httpGet:
path: /healthz
port: http
{{- with .Values.master.readinessProbe.initialDelaySeconds }}
initialDelaySeconds: {{ . }}
{{- end }}
Expand All @@ -96,10 +99,8 @@ spec:
successThreshold: {{ . }}
{{- end }}
ports:
- containerPort: {{ .Values.master.metricsPort | default "8081" }}
name: metrics
- containerPort: {{ .Values.master.healthPort | default "8082" }}
name: health
- containerPort: {{ .Values.master.port | default "8080" }}
name: http
env:
- name: NODE_NAME
valueFrom:
Expand Down Expand Up @@ -139,8 +140,7 @@ spec:
{{- range $key, $value := .Values.featureGates }}
- "-feature-gates={{ $key }}={{ $value }}"
{{- end }}
- "-metrics={{ .Values.master.metricsPort | default "8081" }}"
- "-grpc-health={{ .Values.master.healthPort | default "8082" }}"
- "-port={{ .Values.master.port | default "8080" }}"
{{- with .Values.master.extraArgs }}
{{- toYaml . | nindent 12 }}
{{- end }}
Expand Down
9 changes: 1 addition & 8 deletions deployment/helm/node-feature-discovery/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,7 @@ master:
# retryPeriod: 2s
# nfdApiParallelism: 10
### <NFD-MASTER-CONF-END-DO-NOT-REMOVE>
metricsPort: 8081
healthPort: 8082
port: 8080
instance:
featureApi:
resyncPeriod:
Expand Down Expand Up @@ -148,20 +147,14 @@ master:
values: [""]

startupProbe:
grpc:
port: 8082
failureThreshold: 30
# periodSeconds: 10
livenessProbe:
grpc:
port: 8082
# failureThreshold: 3
# initialDelaySeconds: 0
# periodSeconds: 10
# timeoutSeconds: 1
readinessProbe:
grpc:
port: 8082
failureThreshold: 10
# initialDelaySeconds: 0
# periodSeconds: 10
Expand Down
3 changes: 1 addition & 2 deletions docs/deployment/helm.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,7 @@ API's you need to install the prometheus operator in your cluster.
| `master.*` | dict | | NFD master deployment configuration |
| `master.enable` | bool | true | Specifies whether nfd-master should be deployed |
| `master.hostNetwork` | bool | false | Specifies whether to enable or disable running the container in the host's network namespace |
| `master.metricsPort` | integer | 8081 | Port on which to expose metrics from components to prometheus operator. **DEPRECATED**: will be replaced by `master.port` in NFD v0.18. |
| `master.healthPort` | integer | 8082 | Port on which to expose the grpc health endpoint, will be also used for the probes. **DEPRECATED**: will be replaced by `master.port` in NFD v0.18. |
| `master.port` | integer | 8080 | Port on which to serve http for metrics and healthz endpoints. |
| `master.instance` | string | | Instance name. Used to separate annotation namespaces for multiple parallel deployments |
| `master.resyncPeriod` | string | | NFD API controller resync period. |
| `master.extraLabelNs` | array | [] | List of allowed extra label namespaces |
Expand Down
134 changes: 38 additions & 96 deletions pkg/nfd-master/nfd-master.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import (
"encoding/json"
"fmt"
"maps"
"net"
"net/http"
"os"
"path"
"path/filepath"
Expand All @@ -31,10 +31,9 @@ import (
"time"

"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"golang.org/x/net/context"
"google.golang.org/grpc"
"google.golang.org/grpc/health"
"google.golang.org/grpc/health/grpc_health_v1"
corev1 "k8s.io/api/core/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -115,18 +114,14 @@ type ConfigOverrideArgs struct {

// Args holds command line arguments
type Args struct {
ConfigFile string
Instance string
Klog map[string]*utils.KlogFlagVal
Kubeconfig string
Port int
// GrpcHealthPort is only needed to avoid races between tests (by skipping the health server).
// Could be removed when gRPC labler service is dropped (when nfd-worker tests stop running nfd-master).
GrpcHealthPort int
ConfigFile string
Instance string
Klog map[string]*utils.KlogFlagVal
Kubeconfig string
Port int
Prune bool
Options string
EnableLeaderElection bool
MetricsPort int

Overrides ConfigOverrideArgs
}
Expand All @@ -139,7 +134,6 @@ type deniedNs struct {
type NfdMaster interface {
Run() error
Stop()
WaitForReady(time.Duration) bool
}

type nfdMaster struct {
Expand All @@ -149,10 +143,7 @@ type nfdMaster struct {
namespace string
nodeName string
configFilePath string
server *grpc.Server
healthServer *grpc.Server
stop chan struct{}
ready chan struct{}
kubeconfig *restclient.Config
k8sClient k8sclient.Interface
nfdClient nfdclientset.Interface
Expand All @@ -166,7 +157,6 @@ func NewNfdMaster(opts ...NfdMasterOption) (NfdMaster, error) {
nfd := &nfdMaster{
nodeName: utils.NodeName(),
namespace: utils.GetKubernetesNamespace(),
ready: make(chan struct{}),
stop: make(chan struct{}),
}

Expand Down Expand Up @@ -298,22 +288,22 @@ func (m *nfdMaster) Run() error {
}
}

httpMux := http.NewServeMux()

// Register to metrics server
if m.args.MetricsPort > 0 {
m := utils.CreateMetricsServer(m.args.MetricsPort,
buildInfo,
nodeUpdateRequests,
nodeUpdates,
nodeUpdateFailures,
nodeLabelsRejected,
nodeERsRejected,
nodeTaintsRejected,
nfrProcessingTime,
nfrProcessingErrors)
go m.Run()
registerVersion(version.Get())
defer m.Stop()
}
promRegistry := prometheus.NewRegistry()
promRegistry.MustRegister(
buildInfo,
nodeUpdateRequests,
nodeUpdates,
nodeUpdateFailures,
nodeLabelsRejected,
nodeERsRejected,
nodeTaintsRejected,
nfrProcessingTime,
nfrProcessingErrors)
httpMux.Handle("/metrics", promhttp.HandlerFor(promRegistry, promhttp.HandlerOpts{}))
registerVersion(version.Get())

// Run updater that handles events from the nfd CRD API.
if m.nfdController != nil {
Expand All @@ -324,60 +314,29 @@ func (m *nfdMaster) Run() error {
}
}

// Start gRPC server for liveness probe (at this point we're "live")
grpcErr := make(chan error)
if m.args.GrpcHealthPort != 0 {
if err := m.startGrpcHealthServer(grpcErr); err != nil {
return fmt.Errorf("failed to start gRPC health server: %w", err)
}
}

// Notify that we're ready to accept connections
close(m.ready)

// NFD-Master main event loop
for {
select {
case err := <-grpcErr:
return fmt.Errorf("error in serving gRPC: %w", err)

case <-m.stop:
klog.InfoS("shutting down nfd-master")
return nil
}
}
}

// startGrpcHealthServer starts a gRPC health server for Kubernetes readiness/liveness probes.
// TODO: improve status checking e.g. with watchdog in the main event loop and
// cheking that node updater pool is alive.
func (m *nfdMaster) startGrpcHealthServer(errChan chan<- error) error {
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.GrpcHealthPort))
if err != nil {
return fmt.Errorf("failed to listen: %w", err)
}

s := grpc.NewServer()
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
klog.InfoS("gRPC health server serving", "port", m.args.GrpcHealthPort)
// Register health probe (at this point we're "ready and live")
httpMux.HandleFunc("/healthz", m.Healthz)

// Start HTTP server
httpServer := http.Server{Addr: fmt.Sprintf(":%d", m.args.Port), Handler: httpMux}
go func() {
defer func() {
lis.Close()
}()
if err := s.Serve(lis); err != nil {
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
}
klog.InfoS("gRPC health server stopped")
klog.InfoS("http server starting", "port", httpServer.Addr)
klog.InfoS("http server stopped", "exitCode", httpServer.ListenAndServe())
}()
m.healthServer = s
defer httpServer.Close()

<-m.stop
klog.InfoS("shutting down nfd-master")
return nil
}

func (m *nfdMaster) Healthz(writer http.ResponseWriter, _ *http.Request) {
writer.WriteHeader(http.StatusOK)
}

// nfdAPIUpdateHandler handles events from the nfd API controller.
func (m *nfdMaster) nfdAPIUpdateHandler() {
// We want to unconditionally update all nodes at startup if gRPC is
// disabled (i.e. NodeFeature API is enabled)
// We want to unconditionally update all nodes at startup
updateAll := true
updateNodes := make(map[string]struct{})
nodeFeatureGroup := make(map[string]struct{})
Expand Down Expand Up @@ -431,13 +390,6 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {

// Stop NfdMaster
func (m *nfdMaster) Stop() {
if m.server != nil {
m.server.GracefulStop()
}
if m.healthServer != nil {
m.healthServer.GracefulStop()
}

if m.nfdController != nil {
m.nfdController.stop()
}
Expand All @@ -447,16 +399,6 @@ func (m *nfdMaster) Stop() {
close(m.stop)
}

// Wait until NfdMaster is able able to accept connections.
func (m *nfdMaster) WaitForReady(timeout time.Duration) bool {
select {
case <-m.ready:
return true
case <-time.After(timeout):
}
return false
}

// Prune erases all NFD related properties from the node objects of the cluster.
func (m *nfdMaster) prune() error {
if m.config.NoPublish {
Expand Down

0 comments on commit 2c44e38

Please sign in to comment.