From 4ee344a4f3b7cecb5a7ad9a4439d01f4890c6018 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wilson=20J=C3=BAnior?=
Date: Wed, 16 Oct 2019 09:06:55 -0300
Subject: [PATCH] Add support to record quantiles

---
Reviewer notes (not part of the commit message; ignored by `git am`):

  * slo/slo.go adds a package-level `quantiles` table (p50 = 0.5,
    p95 = 0.95, p99 = 0.99) and ExprBlock.ComputeQuantile, which replaces
    the `$window` and `$quantile` placeholders in the block's Expr; the
    quantile is rendered with fmt's %g verb.
  * SLO gains the `latencyQuantileRecord` YAML field. When its Expr is
    non-empty, generateRules appends one recording rule per quantile named
    `slo:service_latency:<quantile name>_<bucket>`; each rule receives a
    `service` label set to the SLO name unless HonorLabels is true.
  * Behaviour change: the `slo:service_errors_total:ratio_rate_<bucket>`
    rule is now appended only when ErrorRateRecord.Expr is non-empty;
    before this patch it was appended unconditionally.
  * slo/slo_test.go adds TestSLOGenerateGroupRulesWithLatencyQuantile,
    which asserts the generated rules of the short, medium and daily
    groups.
  * NOTE(review): the line breaks, tab indentation, column alignment and
    blank context lines of this patch were reconstructed from the hunk
    header counts and gofmt conventions; confirm with `git apply --check`
    against the parent commit before merging.

 slo/slo.go      |  69 ++++++++++++++++++-----
 slo/slo_test.go | 147 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 203 insertions(+), 13 deletions(-)

diff --git a/slo/slo.go b/slo/slo.go
index 585f280..69747aa 100644
--- a/slo/slo.go
+++ b/slo/slo.go
@@ -1,6 +1,7 @@
 package slo
 
 import (
+	"fmt"
 	"log"
 	"strings"
 
@@ -9,6 +10,24 @@ import (
 	"github.com/prometheus/prometheus/pkg/rulefmt"
 )
 
+var quantiles = []struct {
+	name     string
+	quantile float64
+}{
+	{
+		name:     "p50",
+		quantile: 0.5,
+	},
+	{
+		name:     "p95",
+		quantile: 0.95,
+	},
+	{
+		name:     "p99",
+		quantile: 0.99,
+	},
+}
+
 type SLOSpec struct {
 	SLOS []SLO
 }
@@ -23,17 +42,23 @@ func (block *ExprBlock) ComputeExpr(window, le string) string {
 	return replacer.Replace(block.Expr)
 }
 
+func (block *ExprBlock) ComputeQuantile(window string, quantile float64) string {
+	replacer := strings.NewReplacer("$window", window, "$quantile", fmt.Sprintf("%g", quantile))
+	return replacer.Replace(block.Expr)
+}
+
 type SLO struct {
 	Name       string `yaml:"name"`
 	Objectives Objectives
 
 	HonorLabels bool `yaml:"honorLabels"`
 
-	TrafficRateRecord ExprBlock         `yaml:"trafficRateRecord"`
-	ErrorRateRecord   ExprBlock         `yaml:"errorRateRecord"`
-	LatencyRecord     ExprBlock         `yaml:"latencyRecord"`
-	Labels            map[string]string `yaml:"labels"`
-	Annotations       map[string]string `yaml:"annotations"`
+	TrafficRateRecord     ExprBlock         `yaml:"trafficRateRecord"`
+	ErrorRateRecord       ExprBlock         `yaml:"errorRateRecord"`
+	LatencyRecord         ExprBlock         `yaml:"latencyRecord"`
+	LatencyQuantileRecord ExprBlock         `yaml:"latencyQuantileRecord"`
+	Labels                map[string]string `yaml:"labels"`
+	Annotations           map[string]string `yaml:"annotations"`
 }
 
 type Objectives struct {
@@ -113,17 +138,35 @@ func (slo SLO) generateRules(bucket string) []rulefmt.Rule {
 		rules = append(rules, trafficRateRecord)
 	}
 
-	errorRateRecord := rulefmt.Rule{
-		Record: "slo:service_errors_total:ratio_rate_" + bucket,
-		Expr:   slo.ErrorRateRecord.ComputeExpr(bucket, ""),
-		Labels: map[string]string{},
-	}
+	if slo.ErrorRateRecord.Expr != "" {
+		errorRateRecord := rulefmt.Rule{
+			Record: "slo:service_errors_total:ratio_rate_" + bucket,
+			Expr:   slo.ErrorRateRecord.ComputeExpr(bucket, ""),
+			Labels: map[string]string{},
+		}
 
-	if !slo.HonorLabels {
-		errorRateRecord.Labels["service"] = slo.Name
+		if !slo.HonorLabels {
+			errorRateRecord.Labels["service"] = slo.Name
+		}
+
+		rules = append(rules, errorRateRecord)
 	}
 
-	rules = append(rules, errorRateRecord)
+	if slo.LatencyQuantileRecord.Expr != "" {
+		for _, quantile := range quantiles {
+			latencyQuantileRecord := rulefmt.Rule{
+				Record: "slo:service_latency:" + quantile.name + "_" + bucket,
+				Expr:   slo.LatencyQuantileRecord.ComputeQuantile(bucket, quantile.quantile),
+				Labels: map[string]string{},
+			}
+
+			if !slo.HonorLabels {
+				latencyQuantileRecord.Labels["service"] = slo.Name
+			}
+
+			rules = append(rules, latencyQuantileRecord)
+		}
+	}
 
 	for _, latencyBucket := range slo.Objectives.Latency {
 		latencyRateRecord := rulefmt.Rule{
diff --git a/slo/slo_test.go b/slo/slo_test.go
index 65f1251..ce188f8 100644
--- a/slo/slo_test.go
+++ b/slo/slo_test.go
@@ -290,6 +290,153 @@ func TestSLOGenerateGroupRules(t *testing.T) {
 	})
 }
 
+func TestSLOGenerateGroupRulesWithLatencyQuantile(t *testing.T) {
+	slo := &SLO{
+		Name:        "auto-discover-services",
+		HonorLabels: true,
+		LatencyQuantileRecord: ExprBlock{
+			Expr: "histogram_quantile($quantile, sum by (le) (rate(http_total[$window])))",
+		},
+	}
+
+	groupRules := slo.GenerateGroupRules()
+	assert.Len(t, groupRules, 3)
+
+	assert.Equal(t, rulefmt.RuleGroup{
+		Name:     "slo:auto-discover-services:short",
+		Interval: model.Duration(time.Second * 30),
+		Rules: []rulefmt.Rule{
+			// 5m
+			{
+				Record: "slo:service_latency:p50_5m",
+				Expr:   "histogram_quantile(0.5, sum by (le) (rate(http_total[5m])))",
+				Labels: map[string]string{},
+			},
+			{
+				Record: "slo:service_latency:p95_5m",
+				Expr:   "histogram_quantile(0.95, sum by (le) (rate(http_total[5m])))",
+				Labels: map[string]string{},
+			},
+			{
+				Record: "slo:service_latency:p99_5m",
+				Expr:   "histogram_quantile(0.99, sum by (le) (rate(http_total[5m])))",
+				Labels: map[string]string{},
+			},
+			// 30m
+			{
+				Record: "slo:service_latency:p50_30m",
+				Expr:   "histogram_quantile(0.5, sum by (le) (rate(http_total[30m])))",
+				Labels: map[string]string{},
+			},
+			{
+				Record: "slo:service_latency:p95_30m",
+				Expr:   "histogram_quantile(0.95, sum by (le) (rate(http_total[30m])))",
+				Labels: map[string]string{},
+			},
+			{
+				Record: "slo:service_latency:p99_30m",
+				Expr:   "histogram_quantile(0.99, sum by (le) (rate(http_total[30m])))",
+				Labels: map[string]string{},
+			},
+			// 1h
+			{
+				Record: "slo:service_latency:p50_1h",
+				Expr:   "histogram_quantile(0.5, sum by (le) (rate(http_total[1h])))",
+				Labels: map[string]string{},
+			},
+			{
+				Record: "slo:service_latency:p95_1h",
+				Expr:   "histogram_quantile(0.95, sum by (le) (rate(http_total[1h])))",
+				Labels: map[string]string{},
+			},
+			{
+				Record: "slo:service_latency:p99_1h",
+				Expr:   "histogram_quantile(0.99, sum by (le) (rate(http_total[1h])))",
+				Labels: map[string]string{},
+			},
+		},
+	}, groupRules[0])
+
+	assert.Equal(t, rulefmt.RuleGroup{
+		Name:     "slo:auto-discover-services:medium",
+		Interval: model.Duration(time.Second * 120),
+		Rules: []rulefmt.Rule{
+			// 2h
+			{
+				Record: "slo:service_latency:p50_2h",
+				Expr:   "histogram_quantile(0.5, sum by (le) (rate(http_total[2h])))",
+				Labels: map[string]string{},
+			},
+			{
+				Record: "slo:service_latency:p95_2h",
+				Expr:   "histogram_quantile(0.95, sum by (le) (rate(http_total[2h])))",
+				Labels: map[string]string{},
+			},
+			{
+				Record: "slo:service_latency:p99_2h",
+				Expr:   "histogram_quantile(0.99, sum by (le) (rate(http_total[2h])))",
+				Labels: map[string]string{},
+			},
+			// 6h
+			{
+				Record: "slo:service_latency:p50_6h",
+				Expr:   "histogram_quantile(0.5, sum by (le) (rate(http_total[6h])))",
+				Labels: map[string]string{},
+			},
+			{
+				Record: "slo:service_latency:p95_6h",
+				Expr:   "histogram_quantile(0.95, sum by (le) (rate(http_total[6h])))",
+				Labels: map[string]string{},
+			},
+			{
+				Record: "slo:service_latency:p99_6h",
+				Expr:   "histogram_quantile(0.99, sum by (le) (rate(http_total[6h])))",
+				Labels: map[string]string{},
+			},
+		},
+	}, groupRules[1])
+
+	assert.Equal(t, rulefmt.RuleGroup{
+		Name:     "slo:auto-discover-services:daily",
+		Interval: model.Duration(time.Second * 300),
+		Rules: []rulefmt.Rule{
+			// 1d
+			{
+				Record: "slo:service_latency:p50_1d",
+				Expr:   "histogram_quantile(0.5, sum by (le) (rate(http_total[1d])))",
+				Labels: map[string]string{},
+			},
+			{
+				Record: "slo:service_latency:p95_1d",
+				Expr:   "histogram_quantile(0.95, sum by (le) (rate(http_total[1d])))",
+				Labels: map[string]string{},
+			},
+			{
+				Record: "slo:service_latency:p99_1d",
+				Expr:   "histogram_quantile(0.99, sum by (le) (rate(http_total[1d])))",
+				Labels: map[string]string{},
+			},
+
+			// 3d
+			{
+				Record: "slo:service_latency:p50_3d",
+				Expr:   "histogram_quantile(0.5, sum by (le) (rate(http_total[3d])))",
+				Labels: map[string]string{},
+			},
+			{
+				Record: "slo:service_latency:p95_3d",
+				Expr:   "histogram_quantile(0.95, sum by (le) (rate(http_total[3d])))",
+				Labels: map[string]string{},
+			},
+			{
+				Record: "slo:service_latency:p99_3d",
+				Expr:   "histogram_quantile(0.99, sum by (le) (rate(http_total[3d])))",
+				Labels: map[string]string{},
+			},
+		},
+	}, groupRules[2])
+}
+
 func TestSLOGenerateGroupRulesWithAutoDiscovery(t *testing.T) {
 	slo := &SLO{
 		Name:        "auto-discover-services",