Skip to content

Commit

Permalink
chore: update mimir rules
Browse files Browse the repository at this point in the history
  • Loading branch information
bdossantos committed Nov 8, 2023
1 parent d7e492c commit 7581970
Showing 1 changed file with 44 additions and 15 deletions.
59 changes: 44 additions & 15 deletions rules/mimir_alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -473,15 +473,30 @@ groups:
severity: warning
- name: gossip_alerts
rules:
- alert: MimirGossipMembersMismatch
- alert: MimirGossipMembersTooHigh
annotations:
message:
Mimir instance {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
}} sees incorrect number of gossip members.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmembersmismatch
One or more Mimir instances in {{ $labels.cluster }}/{{ $labels.namespace
}} consistently sees a higher than expected number of gossip members.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmemberstoohigh
expr: |
avg by (cluster, namespace) (memberlist_client_cluster_members_count) != sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"})
for: 15m
max by (cluster, namespace) (memberlist_client_cluster_members_count)
>
(sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10)
for: 20m
labels:
severity: warning
# MimirGossipMembersTooLow (severity: warning)
# Per cluster/namespace, compares the smallest value of
# memberlist_client_cluster_members_count reported by any instance against
# half (0.5x) of the summed `up` series for the Mimir job names listed in the
# matcher. Because `min` is used, a single instance reporting too few members
# is enough to satisfy the condition. The condition must hold for 20m.
- alert: MimirGossipMembersTooLow
annotations:
message:
One or more Mimir instances in {{ $labels.cluster }}/{{ $labels.namespace
}} consistently sees a lower than expected number of gossip members.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmemberstoolow
# Left side: lowest member count seen in the cluster/namespace.
# Right side: 50% of the summed `up` values for the matched jobs.
expr: |
min by (cluster, namespace) (memberlist_client_cluster_members_count)
<
(sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) * 0.5)
for: 20m
labels:
severity: warning
- name: etcd_alerts
Expand Down Expand Up @@ -905,6 +920,17 @@ groups:
for: 1m
labels:
severity: warning
# MimirCompactorSkippedBlocksWithOutOfOrderChunks (severity: critical)
# Watches cortex_compactor_blocks_marked_for_no_compaction_total restricted to
# reason="block-index-out-of-order-chunk". The condition is met when that
# counter's increase over a 5m window is greater than 1, and it must hold
# for 30m.
- alert: MimirCompactorSkippedBlocksWithOutOfOrderChunks
annotations:
message:
Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{
$labels.namespace }} has found and ignored blocks with out of order chunks.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorskippedblockswithoutoforderchunks
# The threshold is strictly greater than 1 (not 0) within each 5m window.
expr: |
increase(cortex_compactor_blocks_marked_for_no_compaction_total{reason="block-index-out-of-order-chunk"}[5m]) > 1
for: 30m
labels:
severity: critical
- name: mimir_autoscaling
rules:
- alert: MimirAutoscalerNotActive
Expand All @@ -915,18 +941,21 @@ groups:
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirautoscalernotactive
expr: |
(
kube_horizontalpodautoscaler_status_condition{condition="ScalingActive",status="false"}
# Match only Mimir namespaces.
* on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info)
# Add "metric" label.
+ on(cluster, namespace, horizontalpodautoscaler) group_right label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)")
> 0
label_replace((
kube_horizontalpodautoscaler_status_condition{condition="ScalingActive",status="false"}
# Match only Mimir namespaces.
* on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info)
# Add "metric" label.
+ on(cluster, namespace, horizontalpodautoscaler) group_right label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)")
> 0),
"scaledObject", "$1", "horizontalpodautoscaler", "keda-hpa-(.*)"
)
)
# Alert only if the scaling metric exists and is > 0. If the KEDA ScaledObject is configured to scale down to 0,
# then HPA ScalingActive may be false when expected to run 0 replicas. In this case, the scaling metric exported
# by KEDA may not exist at all or may be exposed with a value of 0.
and on (cluster, namespace, metric)
(label_replace(keda_metrics_adapter_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") > 0)
and on (cluster, namespace, metric, scaledObject)
(label_replace(keda_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") > 0)
for: 1h
labels:
severity: critical
Expand All @@ -939,7 +968,7 @@ groups:
expr: |
(
# Find KEDA scalers reporting errors.
label_replace(rate(keda_metrics_adapter_scaler_errors[5m]), "namespace", "$1", "exported_namespace", "(.*)")
label_replace(rate(keda_scaler_errors[5m]), "namespace", "$1", "exported_namespace", "(.*)")
# Match only Mimir namespaces.
* on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info)
)
Expand Down

0 comments on commit 7581970

Please sign in to comment.