Set alloy wal truncate_frequency to 15m (#130)

* Try truncate frequency
* Add flag to be able to set truncate_frequency
giantswarm · Oct 17, 2024 · fa6b6eb · fa6b6eb
1 parent 3b383d7
commit fa6b6eb
Show file tree

Hide file tree

Showing 8 changed files with 48 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Add WAL `truncate_frequency` configuration to alloy-metrics, with a default of 15m.
 - Add grafanaOrganization CRD in helm chart.
 
 ## [0.7.1] - 2024-10-10

diff --git a/helm/observability-operator/templates/deployment.yaml b/helm/observability-operator/templates/deployment.yaml
@@ -30,10 +30,12 @@ spec:
         - --management-cluster-name={{ $.Values.managementCluster.name }}
         - --management-cluster-pipeline={{ $.Values.managementCluster.pipeline }}
         - --management-cluster-region={{ $.Values.managementCluster.region }}
-        - --monitoring-agent={{ $.Values.monitoring.agent }}
+        # Monitoring configuration
         - --monitoring-enabled={{ $.Values.monitoring.enabled }}
+        - --monitoring-agent={{ $.Values.monitoring.agent }}
         - --monitoring-sharding-scale-up-series-count={{ $.Values.monitoring.sharding.scaleUpSeriesCount }}
         - --monitoring-sharding-scale-down-percentage={{ $.Values.monitoring.sharding.scaleDownPercentage }}
+        - --monitoring-wal-truncate-frequency={{ $.Values.monitoring.wal.truncateFrequency }}
         {{- if .Values.monitoring.prometheusVersion }}
         - --prometheus-version={{ $.Values.monitoring.prometheusVersion }}
         {{- end }}

diff --git a/helm/observability-operator/values.schema.json b/helm/observability-operator/values.schema.json
@@ -55,6 +55,9 @@
         "monitoring": {
             "type": "object",
             "properties": {
+                "agent": {
+                    "type": "string"
+                },
                 "enabled": {
                     "type": "boolean"
                 },
@@ -63,6 +66,25 @@
                 },
                 "prometheusVersion": {
                     "type": "string"
+                },
+                "sharding": {
+                    "type": "object",
+                    "properties": {
+                        "scaleDownPercentage": {
+                            "type": "number"
+                        },
+                        "scaleUpSeriesCount": {
+                            "type": "integer"
+                        }
+                    }
+                },
+                "wal": {
+                    "type": "object",
+                    "properties": {
+                        "truncateFrequency": {
+                            "type": "string"
+                        }
+                    }
                 }
             }
         },

diff --git a/helm/observability-operator/values.yaml b/helm/observability-operator/values.yaml
@@ -23,6 +23,9 @@ monitoring:
   sharding:
     scaleUpSeriesCount: 1000000
     scaleDownPercentage: 0.20
+  wal:
+    # -- Configures how frequently WAL segments are truncated, as a Go duration string (e.g. 15m, 2h)
+    truncateFrequency: 15m
 
 operator:
   # -- Configures the resources for the operator deployment

diff --git a/main.go b/main.go
@@ -21,6 +21,7 @@ import (
 	"flag"
 	"fmt"
 	"os"
+	"time"
 
 	// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
 	// to ensure that exec-entrypoint and run can make use of them.
@@ -75,6 +76,7 @@ var (
 	monitoringEnabled                     bool
 	monitoringShardingScaleUpSeriesCount  float64
 	monitoringShardingScaleDownPercentage float64
+	monitoringWALTruncateFrequency        time.Duration
 	prometheusVersion                     string
 )
 
@@ -114,16 +116,19 @@ func main() {
 		"The pipeline of the management cluster.")
 	flag.StringVar(&managementClusterRegion, "management-cluster-region", "",
 		"The region of the management cluster.")
-	flag.StringVar(&monitoringAgent, "monitoring-agent", commonmonitoring.MonitoringAgentPrometheus,
-		fmt.Sprintf("select monitoring agent to use (%s or %s)", commonmonitoring.MonitoringAgentPrometheus, commonmonitoring.MonitoringAgentAlloy))
+	// Monitoring configuration flags.
 	flag.BoolVar(&monitoringEnabled, "monitoring-enabled", false,
 		"Enable monitoring at the management cluster level.")
+	flag.StringVar(&monitoringAgent, "monitoring-agent", commonmonitoring.MonitoringAgentPrometheus,
+		fmt.Sprintf("select monitoring agent to use (%s or %s)", commonmonitoring.MonitoringAgentPrometheus, commonmonitoring.MonitoringAgentAlloy))
 	flag.Float64Var(&monitoringShardingScaleUpSeriesCount, "monitoring-sharding-scale-up-series-count", 0,
 		"Configures the number of time series needed to add an extra prometheus agent shard.")
 	flag.Float64Var(&monitoringShardingScaleDownPercentage, "monitoring-sharding-scale-down-percentage", 0,
 		"Configures the percentage of removed series to scale down the number of prometheus agent shards.")
 	flag.StringVar(&prometheusVersion, "prometheus-version", "",
 		"The version of Prometheus Agents to deploy.")
+	flag.DurationVar(&monitoringWALTruncateFrequency, "monitoring-wal-truncate-frequency", 2*time.Hour,
+		"Configures how frequently the Write-Ahead Log (WAL) truncates segments.")
 	opts := zap.Options{
 		Development: false,
 	}
@@ -213,7 +218,8 @@ func main() {
 			ScaleUpSeriesCount:  monitoringShardingScaleUpSeriesCount,
 			ScaleDownPercentage: monitoringShardingScaleDownPercentage,
 		},
-		PrometheusVersion: prometheusVersion,
+		WALTruncateFrequency: monitoringWALTruncateFrequency,
+		PrometheusVersion:    prometheusVersion,
 	}
 
 	prometheusAgentService := prometheusagent.PrometheusAgentService{

diff --git a/pkg/monitoring/alloy/configmap.go b/pkg/monitoring/alloy/configmap.go
@@ -126,6 +126,8 @@ func (a *Service) generateAlloyConfig(ctx context.Context, cluster *clusterv1.Cl
 		QueueConfigMaxSamplesPerSend int
 		QueueConfigMaxShards         int
 
+		WALTruncateFrequency string
+
 		ExternalLabels map[string]string
 	}{
 		RemoteWriteURLEnvVarName:               AlloyRemoteWriteURLEnvVarName,
@@ -139,6 +141,8 @@ func (a *Service) generateAlloyConfig(ctx context.Context, cluster *clusterv1.Cl
 		QueueConfigMaxSamplesPerSend: commonmonitoring.QueueConfigMaxSamplesPerSend,
 		QueueConfigMaxShards:         commonmonitoring.QueueConfigMaxShards,
 
+		WALTruncateFrequency: a.MonitoringConfig.WALTruncateFrequency.String(),
+
 		ExternalLabels: map[string]string{
 			"cluster_id":       cluster.Name,
 			"cluster_type":     common.GetClusterType(cluster, a.ManagementCluster),

diff --git a/pkg/monitoring/alloy/templates/alloy-config.alloy.template b/pkg/monitoring/alloy/templates/alloy-config.alloy.template
@@ -49,6 +49,9 @@ prometheus.remote_write "default" {
       max_shards = {{ .QueueConfigMaxShards }}
     }
   }
+  wal {
+    truncate_frequency = "{{ .WALTruncateFrequency }}"
+  }
   external_labels = {
     {{- range $key, $value := .ExternalLabels }}
     "{{ $key }}" = "{{ $value }}",

diff --git a/pkg/monitoring/config.go b/pkg/monitoring/config.go
@@ -2,6 +2,7 @@ package monitoring
 
 import (
 	"strconv"
+	"time"
 
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
 
@@ -15,6 +16,8 @@ type Config struct {
 	Enabled                 bool
 	MonitoringAgent         string
 	DefaultShardingStrategy sharding.Strategy
+	// WALTruncateFrequency is the frequency at which the WAL segments should be truncated.
+	WALTruncateFrequency time.Duration
 	// TODO(atlas): validate prometheus version using SemVer
 	PrometheusVersion string
 }