Skip to content

Commit

Permalink
fix: add a small tolerance window within which raft indexes are considered equal
Browse files Browse the repository at this point in the history
  • Loading branch information
Juanadelacuesta committed Jan 31, 2025
1 parent d29e3d9 commit 09fb837
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 9 deletions.
4 changes: 2 additions & 2 deletions enos/modules/test_cluster_health/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ resource "enos_local_exec" "run_tests" {
]
}

resource "enos_local_exec" "verify_versions" {
resource "enos_local_exec" "verify_versions" {
environment = {
NOMAD_ADDR = var.nomad_addr
NOMAD_CACERT = var.ca_file
Expand All @@ -49,5 +49,5 @@ resource "enos_local_exec" "run_tests" {
scripts = [
abspath("${path.module}/scripts/versions.sh"),
]
}
}

8 changes: 3 additions & 5 deletions enos/modules/test_cluster_health/scripts/allocs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,13 @@ MAX_WAIT_TIME=30 # Maximum wait time in seconds
POLL_INTERVAL=2 # Interval between status checks

# Pick one entry of $running_allocs at a random index and ask Nomad to stop it.
# NOTE(review): RANDOM % ($allocs_length + 1) yields values 0..$allocs_length.
# If allocs_length is the element count of running_allocs (its definition is
# not visible here), the largest value indexes one past the end and jq -r
# would emit "null" instead of an allocation ID — confirm and drop the "+ 1".
random_alloc_id=$(echo "$running_allocs" | jq -r ".[$((RANDOM % ($allocs_length + 1)))].ID")
echo "about to stop alloc $random_alloc_id"
# The stop is issued with -detach; the script polls the allocation's
# ClientStatus itself in the wait loop that follows.
nomad alloc stop -detach "$random_alloc_id" || error_exit "Failed to stop allocation $random_alloc_id."

echo "Waiting for allocation $random_alloc_id to reach 'complete' status..."
elapsed_time=0
while alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus'); [ "$alloc_status" != "complete" ]; do
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
echo "Error: Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds."
exit 1
error_exit "Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds."
fi

echo "Current status: $alloc_status. Retrying in $POLL_INTERVAL seconds..."
Expand All @@ -49,10 +47,10 @@ done

echo "Waiting for all the allocations to be running again"
elapsed_time=0

while new_allocs=$(nomad alloc status -json | jq '[.[] | select(.ClientStatus == "running")]'); [ $(echo "$new_allocs" | jq 'length') != "$ALLOC_COUNT" ]; do
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
echo "Error: Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds."
exit 1
error_exit "Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds."
fi

echo "Current status: $alloc_status. Retrying in $POLL_INTERVAL seconds..."
Expand Down
8 changes: 6 additions & 2 deletions enos/modules/test_cluster_health/scripts/servers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,15 @@ fi

last_index=""

# Maximum allowed difference in raft last_log_index between a server and the
# server read immediately before it in the loop below. The comparison is made
# against the previously read server only, not across every pair of servers.
INDEX_WINDOW=5 # Each server should be within +5/-5 raft log indexes of the previously checked one.

for ip in $SERVERS; do

last_log_index=$(nomad agent-info -address "https://$ip:4646" -json | jq -r '.stats.raft.last_log_index')
if [ -n "$last_index" ] && [ "$last_log_index" -ne "$last_index" ]; then
error_exit "Servers not on the same index. $ip on index: $last_index, previous read index: $last_log_index"
if [ -n "$last_index" ]; then
if (( last_log_index < last_index - INDEX_WINDOW || last_log_index > last_index + INDEX_WINDOW )); then
error_exit "Servers not on the same index! $ip is at index: $last_log_index, previous index: $last_index"
fi
fi

last_index="$last_log_index"
Expand Down

0 comments on commit 09fb837

Please sign in to comment.