Skip to content

Commit

Permalink
some WIP longhorn stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
tuxpeople committed Nov 22, 2023
1 parent 8f0a9e3 commit 4c6512d
Show file tree
Hide file tree
Showing 9 changed files with 191 additions and 0 deletions.
8 changes: 8 additions & 0 deletions kubernetes/apps/storage/longhorn/conf/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Entry point for the extra Longhorn configuration: pulls in the monitoring
# manifests; the recurring-job manifests are present but currently disabled.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# NOTE(review): other manifests in this change use the `longhorn-system`
# namespace while this file sets `storage` - confirm which one is intended.
namespace: storage
resources:
  - monitoring
  # - recurringjobs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Bundles the Prometheus Operator resources (alert rules and scrape config)
# for Longhorn.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: longhorn-system
resources:
  - prometheusrule.yaml
  - servicemonitor.yaml
104 changes: 104 additions & 0 deletions kubernetes/apps/storage/longhorn/conf/monitoring/prometheusrule.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
---
# Alerting rules for Longhorn volumes, nodes, disks and instance managers.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-longhorn-rules
  labels:
    # NOTE(review): presumably these labels must match the ruleSelector of the
    # Prometheus instance that should load this rule - confirm.
    prometheus: longhorn
    role: alert-rules
spec:
  groups:
    - name: longhorn.rules
      rules:
        # Volume is using more than 90% of its provisioned capacity.
        - alert: LonghornVolumeActualSpaceUsedWarning
          annotations:
            description: >-
              The actual space used by Longhorn volume {{$labels.volume}}
              on {{$labels.node}} is at {{$value}}% capacity for more than 5 minutes.
            summary: The actual used space of Longhorn volume is over 90% of the capacity.
          expr: >-
            (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes)
            * 100 > 90
          for: 5m
          labels:
            issue: >-
              The actual used space of Longhorn volume {{$labels.volume}} on
              {{$labels.node}} is high.
            severity: warning

        # Volume robustness value 3 is reported as "Fault" by the texts below.
        - alert: LonghornVolumeStatusCritical
          annotations:
            # The duration in the text now matches the `for: 5m` below
            # (it previously claimed 2 minutes).
            description: >-
              Longhorn volume {{$labels.volume}} on {{$labels.node}} is
              Fault for more than 5 minutes.
            summary: Longhorn volume {{$labels.volume}} is Fault
          expr: longhorn_volume_robustness == 3
          for: 5m
          labels:
            issue: Longhorn volume {{$labels.volume}} is Fault.
            severity: critical

        # Volume robustness value 2 is reported as "Degraded" by the texts below.
        - alert: LonghornVolumeStatusWarning
          annotations:
            description: >-
              Longhorn volume {{$labels.volume}} on {{$labels.node}} is
              Degraded for more than 5 minutes.
            summary: Longhorn volume {{$labels.volume}} is Degraded
          expr: longhorn_volume_robustness == 2
          for: 5m
          labels:
            issue: Longhorn volume {{$labels.volume}} is Degraded.
            severity: warning

        # Node-level storage usage above 70%.
        - alert: LonghornNodeStorageWarning
          annotations:
            description: >-
              The used storage of node {{$labels.node}} is at {{$value}}%
              capacity for more than 5 minutes.
            summary: The used storage of node is over 70% of the capacity.
          expr: >-
            (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes)
            * 100 > 70
          for: 5m
          labels:
            issue: The used storage of node {{$labels.node}} is high.
            severity: warning

        # Per-disk storage usage above 70%.
        - alert: LonghornDiskStorageWarning
          annotations:
            description: >-
              The used storage of disk {{$labels.disk}} on node {{$labels.node}}
              is at {{$value}}% capacity for more than 5 minutes.
            summary: The used storage of disk is over 70% of the capacity.
          expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
          for: 5m
          labels:
            issue: >-
              The used storage of disk {{$labels.disk}} on node {{$labels.node}}
              is high.
            severity: warning

        # Fires when fewer nodes report condition "ready" than exist in total.
        - alert: LonghornNodeDown
          annotations:
            description: >-
              There are {{$value}} Longhorn nodes which have been offline
              for more than 5 minutes.
            summary: Longhorn nodes is offline
          # Both operands are reduced to label-less vectors so the subtraction
          # can match, each falls back to 0 when the series is absent, and the
          # trailing `> 0` keeps the alert silent while every node is ready.
          expr: >-
            (avg(longhorn_node_count_total) or on() vector(0))
            - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0))
            > 0
          for: 5m
          labels:
            # No {{$value}} here: labels are part of the alert identity, so a
            # changing value would restart the pending period.
            issue: Longhorn nodes are offline.
            severity: critical

        # Instance manager uses more than three times its CPU request.
        # The alert name (including its spelling) is kept unchanged because
        # Alertmanager routes or silences may reference it.
        - alert: LonghornIntanceManagerCPUUsageWarning
          annotations:
            description: >-
              Longhorn instance manager {{$labels.instance_manager}} on
              {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for more
              than 5 minutes.
            summary: >-
              Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}}
              has CPU Usage / CPU request is over 300%.
          expr: >-
            (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu)
            * 100 > 300
          for: 5m
          labels:
            issue: >-
              Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}}
              consumes 3 times the CPU request.
            severity: warning

        # Node CPU usage above 90% of its capacity.
        - alert: LonghornNodeCPUUsageWarning
          annotations:
            description: >-
              Longhorn node {{$labels.node}} has CPU Usage / CPU capacity
              is {{$value}}% for more than 5 minutes.
            summary: >-
              Longhorn node {{$labels.node}} experiences high CPU pressure
              for more than 5m.
          expr: >-
            (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu)
            * 100 > 90
          for: 5m
          labels:
            issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
            severity: warning
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
---
# Scrape configuration for Longhorn: selects Services labelled
# `app: longhorn-manager` in the `longhorn-system` namespace.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: longhorn-prometheus-servicemonitor
  labels:
    name: longhorn-prometheus-servicemonitor
spec:
  namespaceSelector:
    matchNames:
      - longhorn-system
  selector:
    matchLabels:
      app: longhorn-manager
  endpoints:
    # NOTE(review): presumably the named port on the longhorn-manager Service
    # that serves metrics - confirm.
    - port: manager
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
# Recurring snapshot for volumes in the "normal" group: runs at minute 0 and
# 30 of every hour and retains 4 snapshots.
# v1beta2 replaces the deprecated longhorn.io/v1beta1 API version.
apiVersion: longhorn.io/v1beta2
kind: RecurringJob
metadata:
  name: 30min-snapshot
  namespace: longhorn-system
spec:
  task: snapshot
  # Quoted so the schedule is always read as a string.
  cron: "0/30 * * * *"
  groups:
    - normal
  retain: 4
  concurrency: 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
# Recurring backup for volumes in the "normal" group: runs daily at 00:45
# and retains 7 backups.
# v1beta2 replaces the deprecated longhorn.io/v1beta1 API version.
apiVersion: longhorn.io/v1beta2
kind: RecurringJob
metadata:
  name: daily-backup
  namespace: longhorn-system
spec:
  task: backup
  # Quoted so the schedule is always read as a string.
  cron: "45 0 * * *"
  groups:
    - normal
  retain: 7
  concurrency: 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
# Recurring backup for volumes in the "normal" group: runs at minute 15 of
# every hour and retains 6 backups.
# v1beta2 replaces the deprecated longhorn.io/v1beta1 API version.
apiVersion: longhorn.io/v1beta2
kind: RecurringJob
metadata:
  name: hourly-backup
  namespace: longhorn-system
spec:
  task: backup
  # Quoted so the schedule is always read as a string.
  cron: "15 * * * *"
  groups:
    - normal
  retain: 6
  concurrency: 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Bundles the Longhorn RecurringJob manifests (one snapshot job, two backup
# jobs).
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: longhorn-system
resources:
  - 30min-snapshot.yaml
  - daily-backup.yaml
  - hourly-backup.yaml
7 changes: 7 additions & 0 deletions kubernetes/apps/storage/longhorn/conf/snap-class.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# CSI snapshot class served by the `driver.longhorn.io` driver.
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshotClass
metadata:
  name: longhorn
driver: driver.longhorn.io
# NOTE(review): no `parameters` are set, so the driver's default snapshot
# type applies - confirm that is the intended behaviour.
deletionPolicy: Delete

0 comments on commit 4c6512d

Please sign in to comment.