From a26b43c9114fd714bc6d7f5e2247dbe0159df601 Mon Sep 17 00:00:00 2001 From: dabradley Date: Tue, 20 Jun 2023 21:58:21 -0400 Subject: [PATCH] Use new SHA with LNET fix (#130) The changes to allow LNET to work correctly with namespaces have been completed. We no longer need the logic that was previously working around this issue. The changes here will ensure that all available ethernet interfaces are added during setup, which should prevent another issue where multiple NICs were preventing the setup from adding the correct interface for AMLFS connectivity. Co-authored-by: David A. Bradley --- pkg/azurelustreplugin/Dockerfile | 2 - pkg/azurelustreplugin/entrypoint.sh | 107 ++++++++++-------- pkg/azurelustreplugin/fix-lnet.sh | 32 ------ .../image/run_integration_test.sh | 2 +- test/long-haul/start-long-haul.sh | 13 +-- 5 files changed, 68 insertions(+), 88 deletions(-) delete mode 100644 pkg/azurelustreplugin/fix-lnet.sh diff --git a/pkg/azurelustreplugin/Dockerfile b/pkg/azurelustreplugin/Dockerfile index c018a327d..37645a2c2 100644 --- a/pkg/azurelustreplugin/Dockerfile +++ b/pkg/azurelustreplugin/Dockerfile @@ -16,10 +16,8 @@ FROM ubuntu:18.04 COPY "./_output/azurelustreplugin" "/app/azurelustreplugin" COPY "./pkg/azurelustreplugin/entrypoint.sh" "/app/entrypoint.sh" -COPY "./pkg/azurelustreplugin/fix-lnet.sh" "/app/fix-lnet.sh" RUN chmod +x "/app/entrypoint.sh" -RUN chmod +x "/app/fix-lnet.sh" RUN apt-get update && apt --only-upgrade -y install tar libudev1 libsystemd0 RUN apt-get clean all diff --git a/pkg/azurelustreplugin/entrypoint.sh b/pkg/azurelustreplugin/entrypoint.sh index 33749fb2e..7e36bd367 100755 --- a/pkg/azurelustreplugin/entrypoint.sh +++ b/pkg/azurelustreplugin/entrypoint.sh @@ -24,13 +24,51 @@ set -o errexit set -o pipefail set -o nounset +function add_net_interfaces() { + echo "$(date -u) Determining ethernet interfaces." 
+ echo "$(date -u) Route table is:" + ip route list + interface_list=$(ip route show | sed -n 's/.*\s\+dev\s\+\([^ ]\+\).*/\1/p' | sort -u) + ethernet_interfaces=() + for interface in $interface_list; do + interface_info=$(ip link show "${interface}") + if [[ "$interface_info" =~ 'SLAVE' ]]; then + echo "$(date -u) Not adding slave interface: ${interface}" + continue + elif [[ "$interface_info" =~ 'link-netns' ]]; then + echo "$(date -u) Not adding namespaced interface: ${interface}" + continue + elif [[ "$interface_info" =~ 'link/ether' ]]; then + echo "$(date -u) Including ethernet interface: ${interface}" + ethernet_interfaces+=("$interface") + else + echo "$(date -u) Skipping non-ethernet interface: ${interface}" + fi + done + echo "$(date -u) List of found ethernet interfaces is: ${ethernet_interfaces[*]}" + + if [[ "${#ethernet_interfaces[@]}" -eq 0 ]]; then + echo "$(date -u) Cannot find any ethernet network interface" + exit 1 + fi + + for interface in "${ethernet_interfaces[@]}"; do + if lnetctl net show --net tcp | grep -q "\b${interface}\b"; then + echo "$(date -u) Interface already added, skipping: ${interface}" + else + echo "$(date -u) Adding interface: ${interface}" + lnetctl net add --net tcp --if "${interface}" + fi + done +} + installClientPackages=${AZURELUSTRE_CSI_INSTALL_LUSTRE_CLIENT:-yes} echo "installClientPackages: ${installClientPackages}" requiredLustreVersion=${LUSTRE_VERSION:-"2.15.1"} echo "requiredLustreVersion: ${requiredLustreVersion}" -pkgVersion="${requiredLustreVersion}-24-gbaa21ca" +pkgVersion="${requiredLustreVersion}-29-gbae0abe" echo "pkgVersion: ${pkgVersion}" pkgName="amlfs-lustre-client-${pkgVersion}" @@ -80,16 +118,19 @@ if [[ "${installClientPackages}" == "yes" ]]; then echo "$(date -u) Installed Lustre client packages." 
- # Issue #115 Remove workaround for LNET fix - # Revert below LNET fix, please don't remove the lines to cleanup rule files - init_lnet="true" - + if lsmod | grep "^lnet"; then if lnetctl net show --net tcp | grep interfaces; then - echo "$(date -u) LNet is loaded skip the load." + echo "$(date -u) LNet is loaded skip the load" + echo "$(date -u) Adding missing interfaces" + add_net_interfaces init_lnet="false" - fi + elif lnetctl net show | grep "net type: tcp"; then + # There may be a default configuration with no interface. + # This is configured by an old version CSI. + lnetctl net del --net tcp + fi fi if [[ "${init_lnet}" == "true" ]]; then @@ -97,46 +138,22 @@ if [[ "${installClientPackages}" == "yes" ]]; then modprobe -v lnet lnetctl lnet configure - echo "$(date -u) Determining the default network interface." - # perl will be installed as dependency by luster client - echo "$(date -u) Route table is:" - ip route list - default_interface=$(ip route list | perl -n -e'/default via [0-9.]+ dev ([0-9a-zA-Z]+) / && print $1') - echo "$(date -u) Default network interface is ${default_interface}" - - if [[ "${default_interface}" == "" ]]; then - echo "$(date -u) Cannot determine the default network interface" - exit 1 - fi - - if lnetctl net show | grep "net type: tcp"; then - # There may be a default configuration with no interface. - # This is configured by an old version CSI. - lnetctl net del --net tcp + add_net_interfaces + + # Remove old udev rules + should_reload_udev="false" + for rule_file in /etc/udev/rules.d/{73-netadd,74-netremove,98-netadd,99-netremove}.rules; do + if [[ -e ${rule_file} ]]; then + echo "Deleting unnecessary udev rule: ${rule_file}" + rm -f "${rule_file}" + should_reload_udev="true" + fi + done + if [[ "${should_reload_udev}" == "true" ]]; then + echo "$(date -u) Reloading udevadm" + udevadm control --reload fi - lnetctl net add --net tcp --if "${default_interface}" - - echo "$(date -u) Adding the udev script." 
- test -e /etc/lustre || mkdir /etc/lustre - touch /etc/lustre/.lock - test -e /etc/lustre/fix-lnet.sh && rm -f /etc/lustre/fix-lnet.sh - sed -i "s/{default_interface}/${default_interface}/g;" ./fix-lnet.sh - cp ./fix-lnet.sh /etc/lustre - - # legacy rules 73 & 74 - test -e /etc/udev/rules.d/73-netadd.rules && rm -f /etc/udev/rules.d/73-netadd.rules - test -e /etc/udev/rules.d/74-netremove.rules && rm -f /etc/udev/rules.d/74-netremove.rules - - # current rules 98 & 99 - test -e /etc/udev/rules.d/98-netadd.rules && rm -f /etc/udev/rules.d/98-netadd.rules - test -e /etc/udev/rules.d/99-netremove.rules && rm -f /etc/udev/rules.d/99-netremove.rules - - echo 'SUBSYSTEM=="net", ACTION=="add", RUN+="/etc/lustre/fix-lnet.sh"' | tee /etc/udev/rules.d/98-netadd.rules - echo 'SUBSYSTEM=="net", ACTION=="remove", RUN+="/etc/lustre/fix-lnet.sh"' | tee /etc/udev/rules.d/99-netremove.rules - - echo "$(date -u) Reloading udevadm" - udevadm control --reload echo "$(date -u) Done" fi diff --git a/pkg/azurelustreplugin/fix-lnet.sh b/pkg/azurelustreplugin/fix-lnet.sh deleted file mode 100644 index 11972be75..000000000 --- a/pkg/azurelustreplugin/fix-lnet.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# Issue #115 Remove workaround for LNET fix -# Delete this bash script - -/usr/bin/logger "PID $$: Start fix-lnet" -count=1; - -# try fix lnet 5 times maximum -for sleep_in_secs in 0 0.5 0.5 0.5 0.5; do - sleep $sleep_in_secs - - break_flag=$( - ( - break_flag_inner=false - flock -w 60 -e ${FD} - if sudo lnetctl net show --net tcp | grep "status: down"; then - /usr/sbin/lnetctl net del --net tcp - /usr/sbin/lnetctl net add --net tcp --if {default_interface} - break_flag_inner=true - fi - echo $break_flag_inner - ) {FD}< /etc/lustre/.lock - ) - - if [[ $break_flag == true ]]; then - break - else - /usr/bin/logger "PID $$: Skipped fix-lnet, count=$count" - count=$((count+1)) - fi -done diff --git a/test/integration_aks/image/run_integration_test.sh 
b/test/integration_aks/image/run_integration_test.sh index 17a9f587d..9d1d0f4cf 100644 --- a/test/integration_aks/image/run_integration_test.sh +++ b/test/integration_aks/image/run_integration_test.sh @@ -26,7 +26,7 @@ readonly target_path="/tmp/target_path" readonly lustre_fs_name=$1 readonly lustre_fs_ip=$2 readonly lustre_client_version="2.15.1" -readonly pkgVersion="${lustre_client_version}-24-gbaa21ca" +readonly pkgVersion="${lustre_client_version}-29-gbae0abe" readonly pkgName="amlfs-lustre-client-${pkgVersion}" mkdir -p $target_path diff --git a/test/long-haul/start-long-haul.sh b/test/long-haul/start-long-haul.sh index e8dc5b8e6..a30ee7e98 100755 --- a/test/long-haul/start-long-haul.sh +++ b/test/long-haul/start-long-haul.sh @@ -43,14 +43,11 @@ print_logs_case "Executing fault test" print_logs_case "Executing update test" ./update-test.sh -# Issue #115 Remove workaround for LNET fix -# Enable perf/scale test -# print_logs_case "Executing perf/scale test" -# ./perf-scale-test.sh - -# Issue #115 Remove workaround for LNET fix -# print_logs_case "Executing external e2e test" -# ./external-e2e.sh +print_logs_case "Executing perf/scale test" +./perf-scale-test.sh + +print_logs_case "Executing external e2e test" +./external-e2e.sh print_logs_case "Executing cleanup" kubectl apply -f ./cleanup/cleanupjob.yaml