-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
191 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Entry point for the Longhorn "conf" resources; pulls in the sub-directories
# listed under `resources`.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# NOTE(review): a namespace set at this level is applied to every resource
# produced by the entries below, including nested kustomizations that declare
# their own namespace - confirm that `storage` is the intended target.
namespace: storage
resources:
  - monitoring
  # Recurring snapshot/backup jobs are currently disabled.
  # - recurringjobs
8 changes: 8 additions & 0 deletions
8
kubernetes/apps/storage/longhorn/conf/monitoring/kustomization.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Bundles the Longhorn monitoring manifests (alert rules + scrape config).
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: longhorn-system
resources:
  - prometheusrule.yaml
  - servicemonitor.yaml
104 changes: 104 additions & 0 deletions
104
kubernetes/apps/storage/longhorn/conf/monitoring/prometheusrule.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
---
# Prometheus Operator alerting rules for Longhorn volumes, nodes, disks and
# instance managers. Every rule must hold for 5 minutes (`for: 5m`) before it
# fires. Long strings use folded scalars (`>-`), which join the lines with a
# single space and strip the trailing newline.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: longhorn
    role: alert-rules
  name: prometheus-longhorn-rules
spec:
  groups:
    - name: longhorn.rules
      rules:
        # Volume actual size above 90% of its capacity.
        - alert: LonghornVolumeActualSpaceUsedWarning
          annotations:
            description: >-
              The actual space used by Longhorn volume {{$labels.volume}}
              on {{$labels.node}} is at {{$value}}% capacity for more than
              5 minutes.
            summary: >-
              The actual used space of Longhorn volume is over 90% of the
              capacity.
          expr: >-
            (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes)
            * 100 > 90
          for: 5m
          labels:
            issue: >-
              The actual used space of Longhorn volume {{$labels.volume}} on
              {{$labels.node}} is high.
            severity: warning

        # Volume robustness value 3 (reported as "Fault" by this rule).
        - alert: LonghornVolumeStatusCritical
          annotations:
            # Duration in the text matches the `for: 5m` below.
            description: >-
              Longhorn volume {{$labels.volume}} on {{$labels.node}} is
              Fault for more than 5 minutes.
            summary: Longhorn volume {{$labels.volume}} is Fault
          expr: longhorn_volume_robustness == 3
          for: 5m
          labels:
            issue: Longhorn volume {{$labels.volume}} is Fault.
            severity: critical

        # Volume robustness value 2 (reported as "Degraded" by this rule).
        - alert: LonghornVolumeStatusWarning
          annotations:
            description: >-
              Longhorn volume {{$labels.volume}} on {{$labels.node}} is
              Degraded for more than 5 minutes.
            summary: Longhorn volume {{$labels.volume}} is Degraded
          expr: longhorn_volume_robustness == 2
          for: 5m
          labels:
            issue: Longhorn volume {{$labels.volume}} is Degraded.
            severity: warning

        # Node storage usage above 70% of capacity.
        - alert: LonghornNodeStorageWarning
          annotations:
            description: >-
              The used storage of node {{$labels.node}} is at {{$value}}%
              capacity for more than 5 minutes.
            summary: The used storage of node is over 70% of the capacity.
          expr: >-
            (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes)
            * 100 > 70
          for: 5m
          labels:
            issue: The used storage of node {{$labels.node}} is high.
            severity: warning

        # Disk usage above 70% of capacity.
        - alert: LonghornDiskStorageWarning
          annotations:
            description: >-
              The used storage of disk {{$labels.disk}} on node {{$labels.node}}
              is at {{$value}}% capacity for more than 5 minutes.
            summary: The used storage of disk is over 70% of the capacity.
          expr: >-
            (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 >
            70
          for: 5m
          labels:
            issue: >-
              The used storage of disk {{$labels.disk}} on node {{$labels.node}}
              is high.
            severity: warning

        # Number of nodes that are not reporting condition "ready".
        - alert: LonghornNodeDown
          annotations:
            description: >-
              There are {{$value}} Longhorn nodes which have been offline
              for more than 5 minutes.
            summary: Longhorn nodes are offline
          # An alert fires for every sample the expression returns, so the
          # difference is compared with `> 0`; without it a result of 0 would
          # still alert. The total is aggregated with avg() so that both sides
          # of the subtraction are label-less and can be matched one-to-one.
          # NOTE(review): confirm the metric name exposed by the deployed
          # Longhorn version (newer releases document
          # `longhorn_node_count_total`).
          expr: >-
            (avg(longhorn_node_total) or on() vector(0))
            - (count(longhorn_node_status{condition="ready"} == 1)
            or on() vector(0)) > 0
          for: 5m
          labels:
            # NOTE(review): `{{$value}}` inside a label changes the alert's
            # identity whenever the value changes - consider moving it to an
            # annotation.
            issue: There are {{$value}} Longhorn nodes offline
            severity: critical

        # Instance manager CPU usage above 300% of its CPU request.
        # NOTE(review): the alert name is misspelled ("Intance"); it is kept
        # as-is because routes or silences may already reference it.
        - alert: LonghornIntanceManagerCPUUsageWarning
          annotations:
            description: >-
              Longhorn instance manager {{$labels.instance_manager}} on
              {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for more
              than 5 minutes.
            summary: >-
              Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}}
              has CPU Usage / CPU request is over 300%.
          expr: >-
            (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu)
            * 100 > 300
          for: 5m
          labels:
            issue: >-
              Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}}
              consumes 3 times the CPU request.
            severity: warning

        # Node CPU usage above 90% of its CPU capacity.
        - alert: LonghornNodeCPUUsageWarning
          annotations:
            description: >-
              Longhorn node {{$labels.node}} has CPU Usage / CPU capacity
              is {{$value}}% for more than 5 minutes.
            summary: >-
              Longhorn node {{$labels.node}} experiences high CPU pressure
              for more than 5m.
          expr: >-
            (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu)
            * 100 > 90
          for: 5m
          labels:
            issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
            severity: warning
16 changes: 16 additions & 0 deletions
16
kubernetes/apps/storage/longhorn/conf/monitoring/servicemonitor.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
---
# Scrape configuration for Longhorn: selects Services labelled
# `app: longhorn-manager` in the `longhorn-system` namespace and scrapes
# their port named `manager`.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: longhorn-prometheus-servicemonitor
  labels:
    name: longhorn-prometheus-servicemonitor
spec:
  selector:
    matchLabels:
      app: longhorn-manager
  namespaceSelector:
    matchNames:
      - longhorn-system
  endpoints:
    - port: manager
13 changes: 13 additions & 0 deletions
13
kubernetes/apps/storage/longhorn/conf/recurringjobs/30min-snapshot.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
---
# Recurring snapshot task for volumes in the `normal` group.
# NOTE(review): confirm `longhorn.io/v1beta1` is still served by the deployed
# Longhorn version.
apiVersion: longhorn.io/v1beta1
kind: RecurringJob
metadata:
  name: 30min-snapshot
  namespace: longhorn-system
spec:
  concurrency: 2
  # Minute field `0/30`: starts at minute 0 with a step of 30.
  cron: '0/30 * * * *'
  groups:
    - normal
  retain: 4
  task: snapshot
13 changes: 13 additions & 0 deletions
13
kubernetes/apps/storage/longhorn/conf/recurringjobs/daily-backup.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
---
# Recurring backup task for volumes in the `normal` group.
# NOTE(review): confirm `longhorn.io/v1beta1` is still served by the deployed
# Longhorn version.
apiVersion: longhorn.io/v1beta1
kind: RecurringJob
metadata:
  name: daily-backup
  namespace: longhorn-system
spec:
  concurrency: 2
  # Once a day at 00:45.
  cron: '45 0 * * *'
  groups:
    - normal
  retain: 7
  task: backup
13 changes: 13 additions & 0 deletions
13
kubernetes/apps/storage/longhorn/conf/recurringjobs/hourly-backup.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
---
# Recurring backup task for volumes in the `normal` group.
# NOTE(review): confirm `longhorn.io/v1beta1` is still served by the deployed
# Longhorn version.
apiVersion: longhorn.io/v1beta1
kind: RecurringJob
metadata:
  name: hourly-backup
  namespace: longhorn-system
spec:
  concurrency: 2
  # Every hour at minute 15.
  cron: '15 * * * *'
  groups:
    - normal
  retain: 6
  task: backup
9 changes: 9 additions & 0 deletions
9
kubernetes/apps/storage/longhorn/conf/recurringjobs/kustomization.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Bundles the Longhorn RecurringJob manifests (one snapshot job, two backup
# jobs).
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: longhorn-system
resources:
  - 30min-snapshot.yaml
  - daily-backup.yaml
  - hourly-backup.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
---
# CSI snapshot class named `longhorn`, handled by the `driver.longhorn.io`
# driver, with deletion policy `Delete`.
kind: VolumeSnapshotClass
apiVersion: snapshot.storage.k8s.io/v1
metadata:
  name: longhorn
driver: driver.longhorn.io
deletionPolicy: Delete