Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Asserting when reconciliation loop exceeds certain threshold #1078

Merged
merged 11 commits into from
Nov 4, 2024
3 changes: 2 additions & 1 deletion .github/workflows/e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ jobs:
- name: "run tests"
env:
KUBE_VERSION: ${{ matrix.kube-version }}
run: make prepare-e2e e2e KUBE_VERSION=$KUBE_VERSION
run: make prepare-e2e e2e test-operator-metrics KUBE_VERSION=$KUBE_VERSION


upgrade-tests:
name: Upgrade tests ${{ matrix.name }}
Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,10 @@ prepare-e2e: chainsaw start-kind cert-manager set-test-image-vars build docker-b
# Run the chainsaw suite located in ./tests/e2e.
e2e:
	$(CHAINSAW) test --test-dir ./tests/e2e

# Run the chainsaw suite located in ./tests/operator-metrics.
.PHONY: test-operator-metrics
test-operator-metrics:
	$(CHAINSAW) test --test-dir ./tests/operator-metrics

# OpenShift end-to-end tests
.PHONY: e2e-openshift
e2e-openshift:
Expand Down
10 changes: 10 additions & 0 deletions tests/operator-metrics/max-loops/00-assert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Expected state: the ServiceAccount used by the metrics check exists in the
# operator namespace.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: tempo-operator-system
  name: sa-assert-metrics
---
# Expected state: the cluster-wide role binding for that ServiceAccount exists.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: sa-assert-metrics-role-binding
18 changes: 18 additions & 0 deletions tests/operator-metrics/max-loops/00-metrics-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Identity for the metrics check, created in the operator namespace.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: tempo-operator-system
  name: sa-assert-metrics
---
# Binds the ServiceAccount above to the tempo-operator-metrics-reader
# ClusterRole.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: sa-assert-metrics-role-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: tempo-operator-metrics-reader
subjects:
  - kind: ServiceAccount
    namespace: tempo-operator-system
    name: sa-assert-metrics
9 changes: 9 additions & 0 deletions tests/operator-metrics/max-loops/01-assert-job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Expected state: the verify-metrics Job carries a Complete condition whose
# status is "True".
apiVersion: batch/v1
kind: Job
metadata:
  namespace: tempo-operator-system
  name: verify-metrics
status:
  conditions:
    - type: Complete
      status: "True"
62 changes: 62 additions & 0 deletions tests/operator-metrics/max-loops/01-verify-metrics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Job that scrapes the operator's metrics endpoint and fails when the number
# of successful reconciliations reported for a controller exceeds its
# threshold. Thresholds are supplied through the *_THRESHOLD environment
# variables below.
apiVersion: batch/v1
kind: Job
metadata:
  name: verify-metrics
  namespace: tempo-operator-system
spec:
  template:
    spec:
      containers:
        - name: verify-metrics
          image: ghcr.io/grafana/tempo-operator/test-utils:main
          env:
            - name: TEMPOMONOLITHIC_THRESHOLD
              value: "1000"
            - name: TEMPOSTACK_THRESHOLD
              value: "1000"
          # No `-x`: xtrace would print the bearer token to the pod logs and
          # emit one trace per scraped metrics line.
          command:
            - /bin/bash
            - -eu
            - -c
          args:
            - |
              TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)

              # --fail turns HTTP errors (401/403/5xx) into a non-zero exit so
              # an error page is never parsed as an empty, "passing" scrape.
              # -k skips TLS verification of the metrics endpoint.
              curl --fail --silent --show-error -k \
                -H "Authorization: Bearer $TOKEN" \
                -o /tmp/tempo.out \
                https://tempo-operator-controller-manager-metrics-service:8443/metrics

              # Define thresholds
              declare -A SUCCESS_THRESHOLDS=(
                ["tempomonolithic"]=${TEMPOMONOLITHIC_THRESHOLD:-0}
                ["tempostack"]=${TEMPOSTACK_THRESHOLD:-0}
              )

              # Initialize counters
              declare -A success_counts=()
              samples_seen=0

              # Labels are looked up by name, not by position in the label set.
              controller_re='[{,]controller="([^"]*)"'
              result_re='[{,]result="([^"]*)"'

              while IFS= read -r line; do
                if [[ $line =~ ^controller_runtime_reconcile_total ]]; then
                  echo "$line"
                  samples_seen=$((samples_seen + 1))

                  controller=""
                  result=""
                  if [[ $line =~ $controller_re ]]; then
                    controller=${BASH_REMATCH[1]}
                  fi
                  if [[ $line =~ $result_re ]]; then
                    result=${BASH_REMATCH[1]}
                  fi
                  # The sample value is the last whitespace-separated field.
                  value=${line##* }

                  if [[ $result == "success" && -n $controller ]]; then
                    success_counts["$controller"]=$value
                  fi
                fi
              done < /tmp/tempo.out

              # A scrape without any reconcile samples cannot prove anything;
              # fail instead of reporting every controller as zero.
              if [[ $samples_seen -eq 0 ]]; then
                echo "Error: no controller_runtime_reconcile_total samples found in scraped metrics."
                exit 1
              fi

              # Validate counts against thresholds
              for controller in "${!SUCCESS_THRESHOLDS[@]}"; do
                success_count=${success_counts["$controller"]:-0}
                threshold=${SUCCESS_THRESHOLDS["$controller"]}
                # Compare with awk rather than (( )): sample values may be
                # written in float/exponent notation (e.g. 1e+06), which bash
                # integer arithmetic rejects and an `if` would treat as false.
                if awk -v count="$success_count" -v limit="$threshold" \
                  'BEGIN { exit !(count + 0 > limit + 0) }'; then
                  echo "Alert: Success count for $controller ($success_count) exceeds threshold (${SUCCESS_THRESHOLDS["$controller"]})."
                  exit 1
                fi
              done

              # Print all success counts at the end
              echo "All metrics are within acceptable limits."
              echo "Success counts:"
              declare -p success_counts
      restartPolicy: Never
      serviceAccountName: sa-assert-metrics
27 changes: 27 additions & 0 deletions tests/operator-metrics/max-loops/chainsaw-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json
apiVersion: chainsaw.kyverno.io/v1alpha1
kind: Test
metadata:
  creationTimestamp: null
  name: operator-metrics
spec:
  steps:
    # Apply the ServiceAccount and ClusterRoleBinding, then assert they exist.
    - name: step-00
      try:
        - apply:
            file: 00-metrics-service.yaml
        - assert:
            file: 00-assert.yaml

    # Apply the verification Job, then assert that it completes.
    - name: step-01
      try:
        - apply:
            file: 01-verify-metrics.yaml
        - assert:
            file: 01-assert-job.yaml
      # Diagnostics gathered if the step fails: events plus the last lines of
      # the Job's pod logs.
      catch:
        - events: {}
        - podLogs:
            namespace: tempo-operator-system
            selector: job-name=verify-metrics
            tail: 10