diff --git a/enos/modules/test_cluster_health/main.tf b/enos/modules/test_cluster_health/main.tf index 15ea729ef56..b29fb5ffa75 100644 --- a/enos/modules/test_cluster_health/main.tf +++ b/enos/modules/test_cluster_health/main.tf @@ -35,7 +35,7 @@ resource "enos_local_exec" "run_tests" { ] } - resource "enos_local_exec" "verify_versions" { +resource "enos_local_exec" "verify_versions" { environment = { NOMAD_ADDR = var.nomad_addr NOMAD_CACERT = var.ca_file @@ -49,5 +49,5 @@ resource "enos_local_exec" "run_tests" { scripts = [ abspath("${path.module}/scripts/versions.sh"), ] -} +} diff --git a/enos/modules/test_cluster_health/scripts/allocs.sh b/enos/modules/test_cluster_health/scripts/allocs.sh index 89f35108f08..5bd6b8e9791 100755 --- a/enos/modules/test_cluster_health/scripts/allocs.sh +++ b/enos/modules/test_cluster_health/scripts/allocs.sh @@ -31,15 +31,13 @@ MAX_WAIT_TIME=30 # Maximum wait time in seconds POLL_INTERVAL=2 # Interval between status checks random_alloc_id=$(echo "$running_allocs" | jq -r ".[$((RANDOM % ($allocs_length + 1)))].ID") -echo "about to stop alloc $random_alloc_id" nomad alloc stop -detach "$random_alloc_id" || error_exit "Failed to stop allocation $random_alloc_id." echo "Waiting for allocation $random_alloc_id to reach 'complete' status..." elapsed_time=0 while alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus'); [ "$alloc_status" != "complete" ]; do if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then - echo "Error: Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds." - exit 1 + error_exit "Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds." fi echo "Current status: $alloc_status. Retrying in $POLL_INTERVAL seconds..." @@ -49,10 +47,10 @@ done echo "Waiting for all the allocations to be running again" elapsed_time=0 + while new_allocs=$(nomad alloc status -json | jq '[.[] | select(.ClientStatus == "running")]'); [ $(echo "$new_allocs" | jq 'length') != "$ALLOC_COUNT" ]; do if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then - echo "Error: Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds." - exit 1 + error_exit "Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds." fi echo "Current status: $alloc_status. Retrying in $POLL_INTERVAL seconds..." diff --git a/enos/modules/test_cluster_health/scripts/servers.sh b/enos/modules/test_cluster_health/scripts/servers.sh index 8a5a4680d9c..a5f2caf2a51 100755 --- a/enos/modules/test_cluster_health/scripts/servers.sh +++ b/enos/modules/test_cluster_health/scripts/servers.sh @@ -27,11 +27,15 @@ fi last_index="" +INDEX_WINDOW=5 # All the servers should be within +5/-5 raft log indexes from one another. + for ip in $SERVERS; do last_log_index=$(nomad agent-info -address "https://$ip:4646" -json | jq -r '.stats.raft.last_log_index') - if [ -n "$last_index" ] && [ "$last_log_index" -ne "$last_index" ]; then - error_exit "Servers not on the same index. $ip on index: $last_index, previous read index: $last_log_index" + if [ -n "$last_index" ]; then + if (( last_log_index < last_index - INDEX_WINDOW || last_log_index > last_index + INDEX_WINDOW )); then + error_exit "Servers not on the same index! $ip is at index: $last_log_index, previous index: $last_index" + fi fi last_index="$last_log_index"