Skip to content

Commit

Permalink
Module to upgrade servers (#24971)
Browse files Browse the repository at this point in the history
* func: add initial enos skeleton

* style: add headers

* func: change the variables input to a map of objects to simplify the workloads creation

* style: formatting

* Add tests for servers and clients

* style: separate the tests in different scripts

* style: add missing headers

* func: add tests for allocs

* style: improve output

* func: add step to copy remote upgrade version

* style: hcl formatting

* fix: remove the terraform nomad provider

* fix: Add clean token to remove extra new line added in provision

* fix: Add clean token to remove extra new line added in provision

* fix: Add clean token to remove extra new line added in provision

* fix: add missing license headers

* style: hcl fmt

* style: rename variables and fix format

* func: remove the template step on the workloads module and chop the nomad token output on the provide module

* fix: correct the jobspec path on the workloads module

* fix: add missing variable definitions on job specs for workloads

* style: formatting

* fix: Add clean token to remove extra new line added in provision

* func: add module to upgrade servers

* style: missing headers

* func: add upgrade module

* func: add install for windows as well

* func: add an intermediate module that runs the upgrade server for each server

* fix: add missing license headers

* fix: remove extra input variables and connect upgrade servers to the scenario

* fix: rename missing env variables for cluster health scripts

* func: move the cluster health test outside of the modules and into the upgrade scenario

* fix: fix the regex to ignore snap files on the gitignore file

* fix: Add clean token to remove extra new line added in provision

* fix: Add clean token to remove extra new line added in provision

* fix: Add clean token to remove extra new line added in provision

* fix: remove extra input variables and connect upgrade servers to the scenario

* style: formatting

* fix: move taking and restoring snapshots out of the upgrade_single_server to avoid possible race conditions

* fix: rename variable in health test

* fix: Add clean token to remove extra new line added in provision

* func: add an intermediate module that runs the upgrade server for each server

* fix: Add clean token to remove extra new line added in provision

* fix: Add clean token to remove extra new line added in provision

* fix: Add clean token to remove extra new line added in provision

* func: fix the last_log_index check and add a versions check

* func: don't use for_each when upgrading the servers, hardcode each one to ensure they are upgraded one by one

* Update enos/modules/upgrade_instance/variables.tf

Co-authored-by: Tim Gross <[email protected]>

* Update enos/modules/upgrade_instance/variables.tf

Co-authored-by: Tim Gross <[email protected]>

* Update enos/modules/upgrade_instance/variables.tf

Co-authored-by: Tim Gross <[email protected]>

* func: make snapshot by calling every server and allowing stale data

* style: formatting

* fix: make the source for the upgrade binary unknown until apply

* func: use enos bundle to install remote upgrade version, enos_files is not meant for dynamic files

---------

Co-authored-by: Tim Gross <[email protected]>
  • Loading branch information
Juanadelacuesta and tgross authored Feb 7, 2025
1 parent a914888 commit cf0a046
Show file tree
Hide file tree
Showing 14 changed files with 499 additions and 47 deletions.
6 changes: 4 additions & 2 deletions enos/enos-modules.hcl
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

// Find any released RPM or Deb in Artifactory. Requires the version, edition, distro, and distro
// version.
module "build_artifactory" {
source = "./modules/fetch_artifactory"
}
Expand All @@ -18,3 +16,7 @@ module "run_workloads" {
module "test_cluster_health" {
source = "./modules/test_cluster_health"
}

// Registers the server-upgrade module so scenarios can reference it as
// module.upgrade_servers.
module "upgrade_servers" {
source = "./modules/upgrade_servers"
}
125 changes: 81 additions & 44 deletions enos/enos-scenario-upgrade.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,14 @@ scenario "upgrade" {
linux_count = matrix.os == "linux" ? "4" : "0"
windows_count = matrix.os == "windows" ? "4" : "0"
arch = matrix.arch
clients_count = local.linux_count + local.windows_count
}

step "copy_initial_binary" {
description = <<-EOF
Determine which Nomad artifact we want to use for the scenario, depending on the
'arch', 'edition' and 'os' and bring it from the artifactory to a local instance.
'arch', 'edition' and 'os' and bring it from the artifactory to the local instance
running enos.
EOF

module = module.build_artifactory
Expand All @@ -52,9 +54,11 @@ scenario "upgrade" {
}

step "provision_cluster" {
depends_on = [step.copy_initial_binary]
depends_on = [step.copy_initial_binary]

description = <<-EOF
Using the binary from the previous step, provision a Nomad cluster using the e2e
module.
EOF

module = module.provision_cluster
Expand All @@ -73,7 +77,8 @@ scenario "upgrade" {
}

step "run_initial_workloads" {
depends_on = [step.provision_cluster]
depends_on = [step.provision_cluster]

description = <<-EOF
Verify the health of the cluster by running new workloads
EOF
Expand All @@ -86,28 +91,34 @@ scenario "upgrade" {
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
}

verifies = [
quality.nomad_register_job,
]
}

step "initial_test_cluster_health" {
depends_on = [step.run_initial_workloads]
depends_on = [step.run_initial_workloads]

description = <<-EOF
Verify the health of the cluster by checking the status of all servers, nodes, jobs and allocs and stopping random allocs to check for correct reschedules"
Verify the health of the cluster by checking the status of all servers, nodes,
jobs and allocs and stopping random allocs to check for correct reschedules"
EOF

module = module.test_cluster_health
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
server_count = var.server_count
client_count = local.linux_count + local.windows_count
jobs_count = step.run_initial_workloads.jobs_count
alloc_count = step.run_initial_workloads.allocs_count
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
server_count = var.server_count
client_count = local.clients_count
jobs_count = step.run_initial_workloads.jobs_count
alloc_count = step.run_initial_workloads.allocs_count
servers = step.provision_cluster.servers
clients_version = var.product_version
servers_version = var.product_version
}

verifies = [
Expand All @@ -120,10 +131,11 @@ scenario "upgrade" {
]
}

step "copy_upgrade_binary" {
depends_on = [step.provision_cluster]
step "fetch_upgrade_binary" {
depends_on = [step.provision_cluster]

description = <<-EOF
Bring the new upgraded binary from the artifactory
Bring the new upgraded binary from the artifactory to the instance running enos.
EOF

module = module.build_artifactory
Expand All @@ -135,51 +147,71 @@ scenario "upgrade" {
edition = matrix.edition
product_version = var.upgrade_version
os = matrix.os
binary_path = "${var.nomad_local_binary}/${matrix.os}-${matrix.arch}-${matrix.edition}-${var.upgrade_version}"
download_binary = false
}
}
/*

step "upgrade_servers" {
depends_on = [step.fetch_upgrade_binary]

description = <<-EOF
Upgrade the cluster's servers by invoking nomad-cc ...
EOF
Takes the servers one by one, makes a snapshot, updates the binary with the
new one previously fetched and restarts the servers.
module = module.run_cc_nomad
Important: The path where the binary will be placed is hardcoded to match
what the provision-cluster module does. It can be configurable in the future
but for now it is:
* "C:/opt/nomad.exe" for windows
* "/usr/local/bin/nomad" for linux
To ensure the servers are upgraded one by one, they use the depends_on meta,
there are ONLY 3 SERVERS being upgraded in the module.
EOF
module = module.upgrade_servers

verifies = [
quality.nomad_agent_info,
quality.nomad_agent_info_self,
nomad_restore_snapshot
quality.nomad_agent_info,
quality.nomad_agent_info_self,
quality.nomad_restore_snapshot
]

variables {
cc_update_type = "server"
nomad_upgraded_binary = step.copy_initial_binary.nomad_local_binary
// ...
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
servers = step.provision_cluster.servers
ssh_key_path = step.provision_cluster.ssh_key_file
artifactory_username = var.artifactory_username
artifactory_token = var.artifactory_token
artifact_url = step.fetch_upgrade_binary.artifact_url
artifact_sha = step.fetch_upgrade_binary.artifact_sha
}
}

step "run_servers_workloads" {
// ...
}
step "server_upgrade_test_cluster_health" {
depends_on = [step.run_initial_workloads]
depends_on = [step.upgrade_servers]
description = <<-EOF
Verify the health of the cluster by checking the status of all servers, nodes, jobs and allocs and stopping random allocs to check for correct reschedules"
Verify the health of the cluster by checking the status of all servers, nodes,
jobs and allocs and stopping random allocs to check for correct reschedules"
EOF

module = module.test_cluster_health
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
server_count = var.server_count
client_count = local.linux_count + local.windows_count
jobs_count = step.run_initial_workloads.jobs_count
alloc_count = step.run_initial_workloads.allocs_count
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
server_count = var.server_count
client_count = local.linux_count + local.windows_count
jobs_count = step.run_initial_workloads.jobs_count
alloc_count = step.run_initial_workloads.allocs_count
servers = step.provision_cluster.servers
clients_version = var.product_version
servers_version = var.upgrade_version
}

verifies = [
Expand All @@ -192,6 +224,11 @@ scenario "upgrade" {
]
}

/*
step "run_servers_workloads" {
// ...
}
step "upgrade_client" {
description = <<-EOF
Upgrade the cluster's clients by invoking nomad-cc ...
Expand Down Expand Up @@ -244,6 +281,7 @@ scenario "upgrade" {
]
}
*/

output "servers" {
value = step.provision_cluster.servers
}
Expand Down Expand Up @@ -280,5 +318,4 @@ scenario "upgrade" {
value = step.provision_cluster.nomad_token
sensitive = true
}

}
2 changes: 2 additions & 0 deletions enos/modules/fetch_artifactory/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ data "enos_artifactory_item" "nomad" {
}

resource "enos_local_exec" "install_binary" {
count = var.download_binary ? 1 : 0

environment = {
URL = data.enos_artifactory_item.nomad.results[0].url
BINARY_PATH = var.binary_path
Expand Down
10 changes: 10 additions & 0 deletions enos/modules/fetch_artifactory/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,13 @@ output "nomad_local_binary" {
description = "Path where the binary will be placed"
value = var.os == "windows" ? "${var.binary_path}/nomad.exe" : "${var.binary_path}/nomad"
}

# Download URL of the first item returned by the Artifactory lookup.
output "artifact_url" {
  value       = data.enos_artifactory_item.nomad.results[0].url
  description = "URL to fetch the artifact"
}

# sha256 attribute of the first item returned by the Artifactory lookup.
output "artifact_sha" {
  value       = data.enos_artifactory_item.nomad.results[0].sha256
  description = "sha256 to fetch the artifact"
}
5 changes: 5 additions & 0 deletions enos/modules/fetch_artifactory/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,8 @@ variable "binary_path" {
type = string
default = "/home/ubuntu/nomad"
}

# Gates the local install step: the install_binary resource is created only
# when this is true. Declared as bool so a non-boolean value is rejected at
# plan time instead of being passed through untyped.
variable "download_binary" {
  description = "Used to control if the artifact should be downloaded to the local instance or not"
  type        = bool
  default     = true
}
1 change: 1 addition & 0 deletions enos/modules/test_cluster_health/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,4 @@ resource "enos_local_exec" "verify_versions" {
]
}


3 changes: 2 additions & 1 deletion enos/modules/test_cluster_health/scripts/allocs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ MAX_WAIT_TIME=30 # Maximum wait time in seconds
POLL_INTERVAL=2 # Interval between status checks

# Pick a random index in [0, allocs_length). The last valid jq array index is
# length - 1, so a modulus of length + 1 could select a missing element and
# make jq print "null" instead of an allocation ID.
# NOTE(review): assumes allocs_length is the element count of $running_allocs
# (set above this hunk) — confirm.
random_alloc_id=$(echo "$running_allocs" | jq -r ".[$((RANDOM % allocs_length))].ID")
nomad alloc stop -detach "$random_alloc_id" || error_exit "Failed to stop allocation $random_alloc_id."
nomad alloc stop "$random_alloc_id" || error_exit "Failed to stop allocation $random_alloc_id."


echo "Waiting for allocation $random_alloc_id to reach 'complete' status..."
elapsed_time=0
Expand Down
1 change: 1 addition & 0 deletions enos/modules/test_cluster_health/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ variable "jobs_count" {

# Expected count of running allocations, supplied by the calling scenario.
variable "alloc_count" {
  type        = number
  description = "Number of allocation that should be running in the cluster"
}

variable "clients_version" {
Expand Down
2 changes: 2 additions & 0 deletions enos/modules/upgrade_instance/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Don't commit cluster snapshots
*.snap
64 changes: 64 additions & 0 deletions enos/modules/upgrade_instance/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

# Declare the enos provider and the registry address it is resolved from.
terraform {
required_providers {
enos = {
source = "registry.terraform.io/hashicorp-forge/enos"
}
}
}

locals {
  # True when the target instance is Windows; any other platform value takes
  # the non-Windows branch below.
  is_windows = var.platform == "windows"

  # Directory the bundle is installed into on the target host.
  binary_destination = local.is_windows ? "C:/opt/" : "/usr/local/bin/"

  # Account used by the SSH transport.
  ssh_user = local.is_windows ? "Administrator" : "ubuntu"
}

# Installs the artifact described by var.artifactory_release into
# local.binary_destination on the target host, connecting over SSH.
resource "enos_bundle_install" "nomad" {
  transport = {
    ssh = {
      user             = local.ssh_user
      host             = var.server_address
      private_key_path = var.ssh_key_path
    }
  }

  artifactory = var.artifactory_release
  destination = local.binary_destination
}

# Linux only: restart the nomad systemd unit after the bundle install has
# finished. Not created for any other platform value.
resource "enos_remote_exec" "restart_linux_services" {
  depends_on = [enos_bundle_install.nomad]
  count      = var.platform == "linux" ? 1 : 0

  inline = ["sudo systemctl restart nomad"]

  transport = {
    ssh = {
      user             = local.ssh_user
      host             = var.server_address
      private_key_path = var.ssh_key_path
    }
  }
}

# Windows only: restart the Nomad service through PowerShell after the bundle
# install has finished. Not created for any other platform value.
resource "enos_remote_exec" "restart_windows_services" {
  depends_on = [enos_bundle_install.nomad]
  count      = var.platform == "windows" ? 1 : 0

  inline = ["powershell Restart-Service Nomad"]

  transport = {
    ssh = {
      user             = local.ssh_user
      host             = var.server_address
      private_key_path = var.ssh_key_path
    }
  }
}
Loading

0 comments on commit cf0a046

Please sign in to comment.