Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the last_log_index check and add a versions check #24989

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions enos/modules/test_cluster_health/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ terraform {
}

locals {
clean_token = trimspace(var.nomad_token) #Somewhere in the process, a newline is added to the token.
servers_addr = join(" ", var.servers)
}

resource "enos_local_exec" "run_tests" {
Expand All @@ -19,11 +19,12 @@ resource "enos_local_exec" "run_tests" {
NOMAD_CACERT = var.ca_file
NOMAD_CLIENT_CERT = var.cert_file
NOMAD_CLIENT_KEY = var.key_file
NOMAD_TOKEN = local.clean_token
NOMAD_TOKEN = var.nomad_token
SERVER_COUNT = var.server_count
CLIENT_COUNT = var.client_count
JOB_COUNT = var.jobs_count
ALLOC_COUNT = var.alloc_count
SERVERS = local.servers_addr
}

scripts = [
Expand All @@ -33,3 +34,20 @@ resource "enos_local_exec" "run_tests" {
abspath("${path.module}/scripts/allocs.sh")
]
}

# Runs scripts/versions.sh locally. The script receives the expected server and
# client versions through the environment, together with the address, TLS files
# and token it needs to query the cluster with the nomad CLI.
resource "enos_local_exec" "verify_versions" {
  scripts = [
    abspath("${path.module}/scripts/versions.sh")
  ]

  environment = {
    # Cluster connection settings
    NOMAD_ADDR        = var.nomad_addr
    NOMAD_TOKEN       = var.nomad_token
    NOMAD_CACERT      = var.ca_file
    NOMAD_CLIENT_CERT = var.cert_file
    NOMAD_CLIENT_KEY  = var.key_file

    # Versions the script compares the running cluster against
    SERVERS_VERSION = var.servers_version
    CLIENTS_VERSION = var.clients_version
  }
}

8 changes: 3 additions & 5 deletions enos/modules/test_cluster_health/scripts/allocs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,13 @@ MAX_WAIT_TIME=30 # Maximum wait time in seconds
POLL_INTERVAL=2 # Interval between status checks

# Pick one running allocation at random and stop it.
# Valid indexes into the running_allocs array are 0..length-1, so the random
# index is taken modulo the array length. A modulus of length+1 would allow the
# index one past the end, for which jq prints "null" instead of an ID.
running_total=$(echo "$running_allocs" | jq 'length')
if [ "$running_total" -lt 1 ]; then
  error_exit "No running allocations to pick from."
fi

random_alloc_id=$(echo "$running_allocs" | jq -r ".[$((RANDOM % running_total))].ID")
echo "about to stop alloc $random_alloc_id"
nomad alloc stop -detach "$random_alloc_id" || error_exit "Failed to stop allocation $random_alloc_id."

echo "Waiting for allocation $random_alloc_id to reach 'complete' status..."
elapsed_time=0
while alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus'); [ "$alloc_status" != "complete" ]; do
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
echo "Error: Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds."
exit 1
error_exit "Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds."
fi

echo "Current status: $alloc_status. Retrying in $POLL_INTERVAL seconds..."
Expand All @@ -49,10 +47,10 @@ done

echo "Waiting for all the allocations to be running again"
elapsed_time=0

while new_allocs=$(nomad alloc status -json | jq '[.[] | select(.ClientStatus == "running")]'); [ $(echo "$new_allocs" | jq 'length') != "$ALLOC_COUNT" ]; do
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
echo "Error: Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds."
exit 1
error_exit "Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds."
fi

echo "Current status: $alloc_status. Retrying in $POLL_INTERVAL seconds..."
Expand Down
20 changes: 17 additions & 3 deletions enos/modules/test_cluster_health/scripts/servers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,22 @@ if [ "$servers_length" -ne "$SERVER_COUNT" ]; then
error_exit "Unexpected number of servers are alive: $servers_length\n$(echo $servers | jq '.[] | select(.Status != "alive") | .Name')"
fi

if [ $(echo "$running_servers" | jq -r "map(.last_log_index ) | unique | length == 1") != "true" ]; then
error_exit "Servers not up to date"
fi
# Quality: nomad_agent_info_self: A GET call to /v1/agent/self against every server returns the same last_log_index for all of them
tgross marked this conversation as resolved.
Show resolved Hide resolved

INDEX_WINDOW=5 # Maximum allowed difference between the lowest and the highest raft log index.

#######################################
# Checks that a set of raft log indexes all lie within a given window.
# Comparing the extremes (rather than each server with the previous one)
# guarantees that every pair of servers is within the window.
# Arguments: $1 - maximum allowed spread
#            $2.. - one or more non-negative integer indexes
# Returns:   0 when (highest - lowest) <= window, 1 otherwise
#######################################
raft_indexes_within_window() {
  local window=$1
  shift
  local lowest=$1 highest=$1 index
  for index in "$@"; do
    if (( index < lowest )); then lowest=$index; fi
    if (( index > highest )); then highest=$index; fi
  done
  (( highest - lowest <= window ))
}

server_indexes=()
index_report=""

for ip in $SERVERS; do
  last_log_index=$(nomad agent-info -address "https://$ip:4646" -json | jq -r '.stats.raft.last_log_index')

  # jq -r prints "null" when the key is absent; refuse anything that is not a
  # plain number instead of letting it reach the arithmetic comparison.
  if ! [[ "$last_log_index" =~ ^[0-9]+$ ]]; then
    error_exit "Unable to read last_log_index from $ip, got: $last_log_index"
  fi

  server_indexes+=("$last_log_index")
  index_report+=" $ip=$last_log_index"
done

if (( ${#server_indexes[@]} > 0 )) && ! raft_indexes_within_window "$INDEX_WINDOW" "${server_indexes[@]}"; then
  error_exit "Servers not on the same index!$index_report"
fi

echo "All SERVERS are alive and up to date."
52 changes: 52 additions & 0 deletions enos/modules/test_cluster_health/scripts/versions.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

set -euo pipefail

#######################################
# Prints an error message and aborts the script.
# Arguments: $1 - message describing the failure
# Outputs:   "Error: <message>" followed by a newline, on stderr
# Returns:   never returns; exits the shell with status 1
#######################################
error_exit() {
  printf 'Error: %s\n' "${1}" >&2
  exit 1
}

# Servers version
# Gather the distinct build tags reported by servers whose status is "alive".
alive_builds=$(nomad server members -json \
  | jq -r '[.[] | select(.Status == "alive") | .Tags.build] | unique')
build_count=$(jq 'length' <<<"$alive_builds")

case "$build_count" in
  0) error_exit "Unable to get servers version" ;;
  1) ;; # exactly one build across the alive servers
  *) error_exit "Servers are running different versions: $(jq -c '.' <<<"$alive_builds")" ;;
esac

# xargs without a command echoes its input back with the whitespace normalised,
# which strips any stray leading/trailing blanks or newlines from both values.
running_build=$(jq -r '.[0]' <<<"$alive_builds" | xargs)
SERVERS_VERSION=$(xargs <<<"$SERVERS_VERSION")

[ "$running_build" = "$SERVERS_VERSION" ] \
  || error_exit "Servers are not running the correct version. Found: $running_build, Expected: $SERVERS_VERSION"

printf '%s\n' "All servers are running Nomad version $SERVERS_VERSION"

# Clients version
tgross marked this conversation as resolved.
Show resolved Hide resolved
# Gather the distinct versions reported by client nodes whose status is "ready".
ready_versions=$(nomad node status -json \
  | jq -r '[.[] | select(.Status == "ready") | .Version] | unique')
version_count=$(jq 'length' <<<"$ready_versions")

case "$version_count" in
  0) error_exit "Unable to get clients version" ;;
  1) ;; # exactly one version across the ready clients
  *) error_exit "Clients are running different versions: $(jq -c '.' <<<"$ready_versions")" ;;
esac

# xargs without a command echoes its input back with the whitespace normalised,
# which strips any stray leading/trailing blanks or newlines from both values.
running_version=$(jq -r '.[0]' <<<"$ready_versions" | xargs)
CLIENTS_VERSION=$(xargs <<<"$CLIENTS_VERSION")

[ "$running_version" = "$CLIENTS_VERSION" ] \
  || error_exit "Clients are not running the correct version. Found: $running_version, Expected: $CLIENTS_VERSION"

printf '%s\n' "All clients are running Nomad version $CLIENTS_VERSION"
15 changes: 15 additions & 0 deletions enos/modules/test_cluster_health/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,18 @@ variable "jobs_count" {
# Handed to the run_tests scripts as the ALLOC_COUNT environment variable.
variable "alloc_count" {
  description = "Number of allocation that should be running in the cluster"
}

# Expected version of the client nodes; exported to versions.sh as CLIENTS_VERSION.
variable "clients_version" {
  type        = string
  description = "Binary version running on the clients"
}

# Expected version of the servers; exported to versions.sh as SERVERS_VERSION.
variable "servers_version" {
  type        = string
  description = "Binary version running on the servers"
}

# Addresses of the Nomad servers. The module joins the elements with spaces to
# build the SERVERS environment variable, so every element must be a string;
# the element type is spelled out instead of relying on the legacy bare `list`
# shorthand.
variable "servers" {
  description = "List of public IP address of the nomad servers"
  type        = list(string)
}