From b16ca4105138ec3ca7a8c2916d8ab8c07f182740 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Brigitte?= Date: Mon, 16 Dec 2024 18:24:50 +0100 Subject: [PATCH] Add Alertmanager controller (#201) Reconciles the Alertmanager secret created by the observability-operator Helm chart and loads the configuration and templates from the secret into Mimir Alertmanager. It also watches the Mimir Alertmanager pod and re-queues events to the controller when the pod is restarted, so the configuration is reloaded. Predicates are used to filter the watched secrets and pods so that the controller only acts on Alertmanager-related resources. Finalizers are not used by this controller as the configuration is never deleted, only updated. --------- Co-authored-by: Taylor Bot Co-authored-by: Quentin Bisson --- CHANGELOG.md | 4 + .../templates/deployment.yaml | 3 + helm/observability-operator/values.yaml | 1 + .../controller/alertmanager_controller.go | 95 +++++++++++++++++++ .../predicates/alertmanager_predicates.go | 78 +++++++++++++++ main.go | 15 +++ pkg/config/config.go | 1 + pkg/monitoring/config.go | 4 +- 8 files changed, 200 insertions(+), 1 deletion(-) create mode 100644 internal/controller/alertmanager_controller.go create mode 100644 internal/controller/predicates/alertmanager_predicates.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 875d234d..5ab378b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Add Alertmanager controller + ## [0.10.1] - 2024-12-12 ### Fixed diff --git a/helm/observability-operator/templates/deployment.yaml b/helm/observability-operator/templates/deployment.yaml index 3b8a0a91..99739760 100644 --- a/helm/observability-operator/templates/deployment.yaml +++ b/helm/observability-operator/templates/deployment.yaml @@ -31,12 +31,15 @@ spec: - --management-cluster-pipeline={{ $.Values.managementCluster.pipeline }} - 
--management-cluster-region={{ $.Values.managementCluster.region }} # Monitoring configuration + - --alertmanager-enabled={{ $.Values.alerting.enabled }} + - --alertmanager-secret-name={{ include "alertmanager-secret.name" . }} - --alertmanager-url={{ $.Values.alerting.alertmanagerURL }} - --monitoring-enabled={{ $.Values.monitoring.enabled }} - --monitoring-agent={{ $.Values.monitoring.agent }} - --monitoring-sharding-scale-up-series-count={{ $.Values.monitoring.sharding.scaleUpSeriesCount }} - --monitoring-sharding-scale-down-percentage={{ $.Values.monitoring.sharding.scaleDownPercentage }} - --monitoring-wal-truncate-frequency={{ $.Values.monitoring.wal.truncateFrequency }} + - --operator-namespace={{ include "resource.default.namespace" . }} {{- if .Values.monitoring.prometheusVersion }} - --prometheus-version={{ $.Values.monitoring.prometheusVersion }} {{- end }} diff --git a/helm/observability-operator/values.yaml b/helm/observability-operator/values.yaml index 23e398b9..981a52ce 100644 --- a/helm/observability-operator/values.yaml +++ b/helm/observability-operator/values.yaml @@ -16,6 +16,7 @@ managementCluster: region: region alerting: + enabled: false alertmanagerURL: "" grafanaAddress: "" proxyURL: "" diff --git a/internal/controller/alertmanager_controller.go b/internal/controller/alertmanager_controller.go new file mode 100644 index 00000000..8799353b --- /dev/null +++ b/internal/controller/alertmanager_controller.go @@ -0,0 +1,95 @@ +package controller + +import ( + "context" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/pkg/errors" + + "github.com/giantswarm/observability-operator/internal/controller/predicates" + 
"github.com/giantswarm/observability-operator/pkg/alertmanager" + "github.com/giantswarm/observability-operator/pkg/config" +) + +// AlertmanagerReconciler reconciles the Alertmanager secret created by the observability-operator Helm chart +// and configures the Alertmanager instance with the configuration stored in the secret. +// This controller does not make use of finalizers as the configuration is not removed from Alertmanager when the secret is deleted. +type AlertmanagerReconciler struct { + client client.Client + + alertmanagerService alertmanager.Service +} + +// SetupAlertmanagerReconciler adds a controller into mgr that reconciles the Alertmanager secret and also watches Pods, mapping each pod event that passes the pod predicate back to that secret. +func SetupAlertmanagerReconciler(mgr ctrl.Manager, conf config.Config) error { + r := &AlertmanagerReconciler{ + client: mgr.GetClient(), + alertmanagerService: alertmanager.New(conf), + } + + // Filter only the Alertmanager secret created by the observability-operator Helm chart + secretPredicate := predicates.NewAlertmanagerSecretPredicate(conf) + + // Filter only the Mimir Alertmanager pod + podPredicate := predicates.NewAlertmanagerPodPredicate() + + // Requeue the Alertmanager secret when the Mimir Alertmanager pod changes + p := podEventHandler(conf) + + // Setup the controller + return ctrl.NewControllerManagedBy(mgr). + For(&v1.Secret{}, builder.WithPredicates(secretPredicate)). + Watches(&v1.Pod{}, p, builder.WithPredicates(podPredicate)). + Complete(r) +} + +// podEventHandler returns an event handler that enqueues requests for the Alertmanager secret only. +// For now there is only one Alertmanager secret to be reconciled. 
+func podEventHandler(conf config.Config) handler.EventHandler { + return handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, obj client.Object) []ctrl.Request { + return []reconcile.Request{ + { + NamespacedName: types.NamespacedName{ + Name: conf.Monitoring.AlertmanagerSecretName, + Namespace: conf.OperatorNamespace, + }, + }, + } + }) +} + +// Reconcile fetches the secret named by req and passes it to the alertmanager service's Configure; a secret that does not exist or is being deleted is a no-op and returns without error. +func (r AlertmanagerReconciler) Reconcile(ctx context.Context, req reconcile.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx) + + logger.Info("Started reconciling") + + // Retrieve the secret being reconciled; NotFound is ignored because pod events enqueue the secret unconditionally and a missing secret leaves nothing to configure + secret := &v1.Secret{} + if err := r.client.Get(ctx, req.NamespacedName, secret); err != nil { + return ctrl.Result{}, errors.WithStack(client.IgnoreNotFound(err)) + } + + if !secret.DeletionTimestamp.IsZero() { + // Nothing to do if the secret is being deleted + // Configuration is not removed from Alertmanager when the secret is deleted. + return ctrl.Result{}, nil + } + + err := r.alertmanagerService.Configure(ctx, secret) + if err != nil { + return ctrl.Result{}, errors.WithStack(err) + } + + logger.Info("Finished reconciling") + + return ctrl.Result{}, nil +} diff --git a/internal/controller/predicates/alertmanager_predicates.go b/internal/controller/predicates/alertmanager_predicates.go new file mode 100644 index 00000000..9347c095 --- /dev/null +++ b/internal/controller/predicates/alertmanager_predicates.go @@ -0,0 +1,78 @@ +package predicates + +import ( + v1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/predicate" + + "github.com/giantswarm/observability-operator/pkg/config" +) + +// NewAlertmanagerSecretPredicate returns a predicate that filters only the Alertmanager secret created by the observability-operator Helm chart. 
+func NewAlertmanagerSecretPredicate(conf config.Config) predicate.Predicate { + filter := func(object client.Object) bool { + if object == nil { + return false + } + + secret, ok := object.(*v1.Secret) + if !ok { + return false + } + + if !secret.DeletionTimestamp.IsZero() { + return false + } + + labels := secret.GetLabels() + + ok = secret.GetName() == conf.Monitoring.AlertmanagerSecretName && + secret.GetNamespace() == conf.OperatorNamespace && + labels != nil && + labels["app.kubernetes.io/name"] == "observability-operator" + + return ok + } + + p := predicate.NewPredicateFuncs(filter) + + return p +} + +const ( + mimirNamespace = "mimir" + mimirInstance = "mimir" + mimirAlertmanagerComponent = "alertmanager" +) + +// NewAlertmanagerPodPredicate returns a predicate that filters only the Mimir Alertmanager pod: a Pod in the mimir namespace that carries the alertmanager component and mimir instance labels, is not marked for deletion, and is accepted by isPodReady (helper not shown in this patch). +func NewAlertmanagerPodPredicate() predicate.Predicate { + filter := func(object client.Object) bool { + if object == nil { + return false + } + + pod, ok := object.(*v1.Pod) + if !ok { + return false + } + + if !pod.DeletionTimestamp.IsZero() { + return false + } + + labels := pod.GetLabels() + + ok = pod.GetNamespace() == mimirNamespace && + labels != nil && + labels["app.kubernetes.io/component"] == mimirAlertmanagerComponent && + labels["app.kubernetes.io/instance"] == mimirInstance && + isPodReady(pod) + + return ok + } + + p := predicate.NewPredicateFuncs(filter) + + return p +} diff --git a/main.go b/main.go index 3e27f8a9..57fb9adb 100644 --- a/main.go +++ b/main.go @@ -74,6 +74,8 @@ func main() { "If set the metrics endpoint is served securely") flag.BoolVar(&conf.EnableHTTP2, "enable-http2", false, "If set, HTTP/2 will be enabled for the metrics and webhook servers") + flag.StringVar(&conf.OperatorNamespace, "operator-namespace", "", + "The namespace where the observability-operator is running.") // Management cluster configuration flags. 
flag.StringVar(&conf.ManagementCluster.BaseDomain, "management-cluster-base-domain", "", @@ -90,6 +92,10 @@ func main() { "The region of the management cluster.") // Monitoring configuration flags. + flag.BoolVar(&conf.Monitoring.AlertmanagerEnabled, "alertmanager-enabled", false, + "Enable Alertmanager controller.") + flag.StringVar(&conf.Monitoring.AlertmanagerSecretName, "alertmanager-secret-name", "", + "The name of the secret containing the Alertmanager configuration.") flag.StringVar(&conf.Monitoring.AlertmanagerURL, "alertmanager-url", "", "The URL of the Alertmanager API.") flag.StringVar(&conf.Monitoring.MonitoringAgent, "monitoring-agent", commonmonitoring.MonitoringAgentAlloy, @@ -184,6 +190,15 @@ func main() { setupLog.Error(err, "unable to setup controller", "controller", "GrafanaOrganizationReconciler") os.Exit(1) } + + if conf.Monitoring.AlertmanagerEnabled { + // Setup controller for Alertmanager + err = controller.SetupAlertmanagerReconciler(mgr, conf) + if err != nil { + setupLog.Error(err, "unable to setup controller", "controller", "AlertmanagerReconciler") + os.Exit(1) + } + } //+kubebuilder:scaffold:builder if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { diff --git a/pkg/config/config.go b/pkg/config/config.go index 03702834..68925239 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -11,6 +11,7 @@ type Config struct { ProbeAddr string SecureMetrics bool EnableHTTP2 bool + OperatorNamespace string ManagementCluster common.ManagementCluster diff --git a/pkg/monitoring/config.go b/pkg/monitoring/config.go index b90c6653..ec06309d 100644 --- a/pkg/monitoring/config.go +++ b/pkg/monitoring/config.go @@ -15,7 +15,9 @@ const MonitoringLabel = "giantswarm.io/monitoring" type Config struct { Enabled bool - AlertmanagerURL string + AlertmanagerSecretName string + AlertmanagerURL string + AlertmanagerEnabled bool MonitoringAgent string DefaultShardingStrategy sharding.Strategy