# Patch summary for rules/mimir_alerts.yml (blob 9706361 -> 6a5ff57).
#
# Hunk 1 (group gossip_alerts): replaces the single MimirGossipMembersMismatch
#   alert (avg member count != sum of `up`, for 15m) with two alerts, both
#   severity warning and `for: 20m`:
#     - MimirGossipMembersTooHigh: max member count > sum(up) + 10
#     - MimirGossipMembersTooLow:  min member count < sum(up) * 0.5
#   Both aggregate by (cluster, namespace), so their messages no longer
#   reference {{ $labels.instance }}.
# Hunk 2: adds MimirCompactorSkippedBlocksWithOutOfOrderChunks (critical), firing
#   when the 5m increase of cortex_compactor_blocks_marked_for_no_compaction_total
#   with reason="block-index-out-of-order-chunk" stays > 1 for 30m.
# Hunk 3 (group mimir_autoscaling, MimirAutoscalerNotActive): wraps the HPA side
#   of the expression in label_replace to derive a "scaledObject" label from
#   "horizontalpodautoscaler" via the regex "keda-hpa-(.*)", adds scaledObject to
#   the `and on (...)` matching labels, and switches the metric from
#   keda_metrics_adapter_scaler_metrics_value to keda_scaler_metrics_value.
# Hunk 4: in the rule that finds KEDA scalers reporting errors, switches
#   keda_metrics_adapter_scaler_errors to keda_scaler_errors.
#
# NOTE(review): in this copy the per-line structure of the hunks and the YAML
#   indentation have been collapsed into single spaces. Unified diff hunks need
#   one file line per patch line, so this text is unlikely to apply as-is;
#   regenerate the patch from the repository rather than re-indenting by hand.
# NOTE(review): the two new gossip messages read "instances ... consistently
#   sees"; "see" would be grammatical. Left untouched here because it is
#   runtime annotation text inside the patch payload.
diff --git a/rules/mimir_alerts.yml b/rules/mimir_alerts.yml index 9706361..6a5ff57 100644 --- a/rules/mimir_alerts.yml +++ b/rules/mimir_alerts.yml @@ -473,15 +473,30 @@ groups: severity: warning - name: gossip_alerts rules: - - alert: MimirGossipMembersMismatch + - alert: MimirGossipMembersTooHigh annotations: message: - Mimir instance {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} sees incorrect number of gossip members. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmembersmismatch + One or more Mimir instances in {{ $labels.cluster }}/{{ $labels.namespace + }} consistently sees a higher than expected number of gossip members. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmemberstoohigh expr: | - avg by (cluster, namespace) (memberlist_client_cluster_members_count) != sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) - for: 15m + max by (cluster, namespace) (memberlist_client_cluster_members_count) + > + (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) + for: 20m + labels: + severity: warning + - alert: MimirGossipMembersTooLow + annotations: + message: + One or more Mimir instances in {{ $labels.cluster }}/{{ $labels.namespace + }} consistently sees a lower than expected number of gossip members. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmemberstoolow + expr: | + min by (cluster, namespace) (memberlist_client_cluster_members_count) + < + (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) * 0.5) + for: 20m labels: severity: warning - name: etcd_alerts @@ -905,6 +920,17 @@ groups: for: 1m labels: severity: warning + - alert: MimirCompactorSkippedBlocksWithOutOfOrderChunks + annotations: + message: + Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ + $labels.namespace }} has found and ignored blocks with out of order chunks. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorskippedblockswithoutoforderchunks + expr: | + increase(cortex_compactor_blocks_marked_for_no_compaction_total{reason="block-index-out-of-order-chunk"}[5m]) > 1 + for: 30m + labels: + severity: critical - name: mimir_autoscaling rules: - alert: MimirAutoscalerNotActive @@ -915,18 +941,21 @@ groups: runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirautoscalernotactive expr: | ( - kube_horizontalpodautoscaler_status_condition{condition="ScalingActive",status="false"} - # Match only Mimir namespaces. - * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) - # Add "metric" label. - + on(cluster, namespace, horizontalpodautoscaler) group_right label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") - > 0 + label_replace(( + kube_horizontalpodautoscaler_status_condition{condition="ScalingActive",status="false"} + # Match only Mimir namespaces. + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + # Add "metric" label. 
+ + on(cluster, namespace, horizontalpodautoscaler) group_right label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") + > 0), + "scaledObject", "$1", "horizontalpodautoscaler", "keda-hpa-(.*)" + ) ) # Alert only if the scaling metric exists and is > 0. If the KEDA ScaledObject is configured to scale down 0, # then HPA ScalingActive may be false when expected to run 0 replicas. In this case, the scaling metric exported # by KEDA could not exist at all or being exposed with a value of 0. - and on (cluster, namespace, metric) - (label_replace(keda_metrics_adapter_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") > 0) + and on (cluster, namespace, metric, scaledObject) + (label_replace(keda_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") > 0) for: 1h labels: severity: critical @@ -939,7 +968,7 @@ groups: expr: | ( # Find KEDA scalers reporting errors. - label_replace(rate(keda_metrics_adapter_scaler_errors[5m]), "namespace", "$1", "exported_namespace", "(.*)") + label_replace(rate(keda_scaler_errors[5m]), "namespace", "$1", "exported_namespace", "(.*)") # Match only Mimir namespaces. * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) )