Skip to content

Commit

Permalink
Implement handling of Alertmanager config (#180)
Browse files Browse the repository at this point in the history
Co-authored-by: Taylor Bot <[email protected]>
Co-authored-by: Quentin Bisson <[email protected]>
  • Loading branch information
3 people authored Dec 16, 2024
1 parent 6be2b18 commit fbfd0f1
Show file tree
Hide file tree
Showing 11 changed files with 614 additions and 5 deletions.
9 changes: 9 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ require (
github.com/opsgenie/opsgenie-go-sdk-v2 v1.2.23
github.com/pkg/errors v0.9.1
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.79.0
github.com/prometheus/alertmanager v0.27.0
github.com/prometheus/client_golang v1.20.5
github.com/prometheus/common v0.61.0
github.com/sirupsen/logrus v1.9.3
Expand Down Expand Up @@ -87,7 +88,10 @@ require (
github.com/Masterminds/goutils v1.1.1 // indirect
github.com/Masterminds/semver/v3 v3.3.0 // indirect
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
github.com/aws/aws-sdk-go v1.50.8 // indirect
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
github.com/go-kit/log v0.2.1 // indirect
github.com/go-logfmt/logfmt v0.5.1 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-openapi/analysis v0.23.0 // indirect
github.com/go-openapi/errors v0.22.0 // indirect
Expand All @@ -97,12 +101,16 @@ require (
github.com/go-openapi/strfmt v0.23.0 // indirect
github.com/go-openapi/validate v0.24.0 // indirect
github.com/huandu/xstrings v1.5.0 // indirect
github.com/jmespath/go-jmespath v0.4.0 // indirect
github.com/jpillora/backoff v1.0.0 // indirect
github.com/klauspost/compress v1.17.9 // indirect
github.com/mitchellh/copystructure v1.2.0 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/mitchellh/reflectwalk v1.0.2 // indirect
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
github.com/oklog/ulid v1.3.1 // indirect
github.com/opentracing/opentracing-go v1.2.0 // indirect
github.com/prometheus/common/sigv4 v0.1.0 // indirect
github.com/shopspring/decimal v1.4.0 // indirect
github.com/spf13/cast v1.7.0 // indirect
github.com/x448/float16 v0.8.4 // indirect
Expand All @@ -112,6 +120,7 @@ require (
go.opentelemetry.io/otel/trace v1.28.0 // indirect
golang.org/x/sync v0.10.0 // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
)

replace (
Expand Down
414 changes: 414 additions & 0 deletions go.sum

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{{/* vim: set filetype=mustache: */}}

{{- define "alertmanager-secret.name" -}}
{{- include "resource.default.name" . -}}-alertmanager
{{- end }}
1 change: 1 addition & 0 deletions helm/observability-operator/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ spec:
- --management-cluster-pipeline={{ $.Values.managementCluster.pipeline }}
- --management-cluster-region={{ $.Values.managementCluster.region }}
# Monitoring configuration
- --alertmanager-url={{ $.Values.alerting.alertmanagerURL }}
- --monitoring-enabled={{ $.Values.monitoring.enabled }}
- --monitoring-agent={{ $.Values.monitoring.agent }}
- --monitoring-sharding-scale-up-series-count={{ $.Values.monitoring.sharding.scaleUpSeriesCount }}
Expand Down
7 changes: 7 additions & 0 deletions helm/observability-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ managementCluster:
pipeline: pipeline
region: region

alerting:
alertmanagerURL: ""
grafanaAddress: ""
proxyURL: ""
slackAPIToken: ""
slackAPIURL: ""

monitoring:
agent: alloy
enabled: false
Expand Down
6 changes: 4 additions & 2 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ func main() {
"The region of the management cluster.")

// Monitoring configuration flags.
flag.StringVar(&conf.Monitoring.AlertmanagerURL, "alertmanager-url", "",
"The URL of the Alertmanager API.")
flag.StringVar(&conf.Monitoring.MonitoringAgent, "monitoring-agent", commonmonitoring.MonitoringAgentAlloy,
fmt.Sprintf("select monitoring agent to use (%s or %s)", commonmonitoring.MonitoringAgentPrometheus, commonmonitoring.MonitoringAgentAlloy))
flag.BoolVar(&conf.Monitoring.Enabled, "monitoring-enabled", false,
Expand All @@ -109,15 +111,15 @@ func main() {
opts.BindFlags(flag.CommandLine)
flag.Parse()

ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))

// Load environment variables.
_, err := env.UnmarshalFromEnviron(&conf.Environment)
if err != nil {
setupLog.Error(err, "failed to unmarshal environment variables")
os.Exit(1)
}

ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))

// if the enable-http2 flag is false (the default), http/2 should be disabled
// due to its vulnerabilities. More specifically, disabling http/2 will
// prevent from being vulnerable to the HTTP/2 Stream Cancelation and
Expand Down
150 changes: 150 additions & 0 deletions pkg/alertmanager/alertmanager.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
package alertmanager

import (
"bytes"
"context"
"fmt"
"io"
"maps"
"net/http"
"path"
"slices"
"strings"

"github.com/pkg/errors"
"github.com/prometheus/alertmanager/config"
v1 "k8s.io/api/core/v1"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/yaml"

common "github.com/giantswarm/observability-operator/pkg/common/monitoring"
pkgconfig "github.com/giantswarm/observability-operator/pkg/config"
)

const (
// These values are used to retrieve the Alertmanager configuration from the secret named after conf.Monitoring.AlertmanagerSecretName.
// alertmanagerConfigKey is the key to the alertmanager configuration in the secret.
alertmanagerConfigKey = "alertmanager.yaml"
// templatesSuffix is the suffix used to identify the templates in the secret.
templatesSuffix = ".tmpl"

// alertmanagerAPIPath is the path appended to the Alertmanager URL when sending the configuration.
alertmanagerAPIPath = "/api/v1/alerts"

// tenantID is the tenant sent in the common.OrgIDHeader header of the configuration request.
// TODO: get this from somewhere
tenantID = "anonymous"
)

// Service sends Alertmanager configuration and templates to the Alertmanager API.
type Service struct {
// alertmanagerURL is the base URL of the Alertmanager API; New stores it without a trailing "/".
alertmanagerURL string
}

// configRequest is the structure used to send the configuration to Alertmanager's API.
// It is marshalled to YAML before being sent; the json tags also define the yaml field names.
type configRequest struct {
// TemplateFiles maps each template name to the template's content.
TemplateFiles map[string]string `json:"template_files"`
// AlertmanagerConfig is the Alertmanager configuration rendered as a string.
AlertmanagerConfig string `json:"alertmanager_config"`
}

// New builds a Service that targets the Alertmanager URL found in
// conf.Monitoring.AlertmanagerURL.
func New(conf pkgconfig.Config) Service {
	// Drop a trailing slash so that appending an API path never yields "//".
	baseURL := strings.TrimSuffix(conf.Monitoring.AlertmanagerURL, "/")

	return Service{alertmanagerURL: baseURL}
}

// Configure extracts the Alertmanager configuration and its templates from the
// given secret and sends them to the Alertmanager API.
//
// The configuration is read from the alertmanagerConfigKey entry of the secret;
// every entry whose key ends with templatesSuffix is treated as a template.
// An error is returned when the secret is nil, when the configuration entry is
// missing, or when sending the configuration fails.
func (s Service) Configure(ctx context.Context, secret *v1.Secret) error {
	logger := log.FromContext(ctx)
	logger.Info("Alertmanager: configuring")

	if secret == nil {
		return errors.WithStack(fmt.Errorf("alertmanager: failed to get secret"))
	}

	// The main configuration is mandatory.
	rawConfig, found := secret.Data[alertmanagerConfigKey]
	if !found {
		return errors.WithStack(fmt.Errorf("alertmanager: config not found"))
	}

	// Gather every template stored alongside the configuration.
	tmpls := map[string]string{}
	for name, content := range secret.Data {
		if !strings.HasSuffix(name, templatesSuffix) {
			continue
		}

		// Template key/name should not be a path otherwise the request will fail with:
		// > error validating Alertmanager config: invalid template name "/etc/dummy.tmpl": the template name cannot contain any path
		tmpls[path.Base(name)] = string(content)
	}

	if err := s.configure(ctx, rawConfig, tmpls, tenantID); err != nil {
		return errors.WithStack(fmt.Errorf("alertmanager: failed to configure: %w", err))
	}

	logger.Info("Alertmanager: configured")
	return nil
}

// configure sends the configuration and templates to Mimir Alertmanager's API
// https://grafana.com/docs/mimir/latest/references/http-api/#set-alertmanager-configuration
//
// alertmanagerConfigContent is the raw Alertmanager configuration, templates maps
// template names to their content, and tenantID is sent in the common.OrgIDHeader
// header. The request is bound to ctx, so cancelling ctx (or reaching its deadline)
// aborts the HTTP call.
//
// It returns an error when the configuration cannot be loaded or marshalled, when
// the request cannot be created or sent, or when the API answers with a status
// other than 201 Created; in the latter case the error wraps an APIError carrying
// the status code and response body.
func (s Service) configure(ctx context.Context, alertmanagerConfigContent []byte, templates map[string]string, tenantID string) error {
	logger := log.FromContext(ctx)

	// Load alertmanager configuration
	alertmanagerConfig, err := config.Load(string(alertmanagerConfigContent))
	if err != nil {
		return errors.WithStack(fmt.Errorf("alertmanager: failed to load configuration: %w", err))
	}

	// Set template names
	// Values set here must match the keys set in requestData.TemplateFiles.
	// Map iteration order is random, so the names are sorted to render the same
	// configuration string for the same input on every call.
	alertmanagerConfig.Templates = slices.Sorted(maps.Keys(templates))
	alertmanagerConfigString := alertmanagerConfig.String()

	// Prepare request for Alertmanager API
	requestData := configRequest{
		AlertmanagerConfig: alertmanagerConfigString,
		TemplateFiles:      templates,
	}
	data, err := yaml.Marshal(requestData)
	if err != nil {
		return errors.WithStack(fmt.Errorf("alertmanager: failed to marshal yaml: %w", err))
	}

	url := s.alertmanagerURL + alertmanagerAPIPath
	logger.WithValues("url", url, "data_size", len(data), "config_size", len(alertmanagerConfigString), "templates_count", len(templates)).Info("Alertmanager: sending configuration")

	// Send request to Alertmanager's API
	// The request carries ctx: http.DefaultClient has no timeout of its own, so
	// without it an unresponsive server would block the caller indefinitely.
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(data))
	if err != nil {
		return errors.WithStack(fmt.Errorf("alertmanager: failed to create request: %w", err))
	}
	req.Header.Set(common.OrgIDHeader, tenantID)

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return errors.WithStack(fmt.Errorf("alertmanager: failed to send request: %w", err))
	}
	defer resp.Body.Close() // nolint: errcheck

	logger.WithValues("status_code", resp.StatusCode).Info("Alertmanager: configuration sent")

	if resp.StatusCode != http.StatusCreated {
		// Surface the response body: it contains the reason for the rejection.
		respBody, err := io.ReadAll(resp.Body)
		if err != nil {
			return errors.WithStack(fmt.Errorf("alertmanager: failed to read response: %w", err))
		}

		e := APIError{
			Code:    resp.StatusCode,
			Message: string(respBody),
		}

		return errors.WithStack(fmt.Errorf("alertmanager: failed to send configuration: %w", e))
	}

	return nil
}
12 changes: 12 additions & 0 deletions pkg/alertmanager/error.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package alertmanager

import "fmt"

// APIError describes a failed call to the Alertmanager API.
type APIError struct {
	// Code is the HTTP status code of the response.
	Code int
	// Message is the body of the response.
	Message string
}

// Error implements the error interface, rendering the error as "<Code>: <Message>".
func (e APIError) Error() string {
	status := fmt.Sprintf("%d", e.Code)

	return status + ": " + e.Message
}
2 changes: 2 additions & 0 deletions pkg/common/monitoring/monitoring.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ const (
RemoteWriteName = "mimir"
RemoteWriteEndpointTemplateURL = "https://mimir.%s/api/v1/push"
RemoteWriteTimeout = "60s"

OrgIDHeader = "X-Scope-OrgID"
)

func GetServicePriority(cluster *clusterv1.Cluster) string {
Expand Down
8 changes: 6 additions & 2 deletions pkg/grafana/types.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
package grafana

import "strings"
import (
"strings"

common "github.com/giantswarm/observability-operator/pkg/common/monitoring"
)

type Organization struct {
ID int64
Expand Down Expand Up @@ -29,7 +33,7 @@ func (d Datasource) buildJSONData() map[string]interface{} {
}

// Add tenant header name
d.JSONData["httpHeaderName1"] = "X-Scope-OrgID"
d.JSONData["httpHeaderName1"] = common.OrgIDHeader

return d.JSONData
}
Expand Down
5 changes: 4 additions & 1 deletion pkg/monitoring/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ const MonitoringLabel = "giantswarm.io/monitoring"

// Config represents the configuration used by the monitoring package.
type Config struct {
Enabled bool
Enabled bool

AlertmanagerURL string

MonitoringAgent string
DefaultShardingStrategy sharding.Strategy
// WALTruncateFrequency is the frequency at which the WAL segments should be truncated.
Expand Down

0 comments on commit fbfd0f1

Please sign in to comment.