diff --git a/pkg/azurelustreplugin/Dockerfile b/pkg/azurelustreplugin/Dockerfile index c018a327d..37645a2c2 100644 --- a/pkg/azurelustreplugin/Dockerfile +++ b/pkg/azurelustreplugin/Dockerfile @@ -16,10 +16,8 @@ FROM ubuntu:18.04 COPY "./_output/azurelustreplugin" "/app/azurelustreplugin" COPY "./pkg/azurelustreplugin/entrypoint.sh" "/app/entrypoint.sh" -COPY "./pkg/azurelustreplugin/fix-lnet.sh" "/app/fix-lnet.sh" RUN chmod +x "/app/entrypoint.sh" -RUN chmod +x "/app/fix-lnet.sh" RUN apt-get update && apt --only-upgrade -y install tar libudev1 libsystemd0 RUN apt-get clean all diff --git a/pkg/azurelustreplugin/entrypoint.sh b/pkg/azurelustreplugin/entrypoint.sh index 33749fb2e..7e36bd367 100755 --- a/pkg/azurelustreplugin/entrypoint.sh +++ b/pkg/azurelustreplugin/entrypoint.sh @@ -24,13 +24,51 @@ set -o errexit set -o pipefail set -o nounset +function add_net_interfaces() { + echo "$(date -u) Determining ethernet interfaces." + echo "$(date -u) Route table is:" + ip route list + interface_list=$(ip route show | sed -n 's/.*\s\+dev\s\+\([^ ]\+\).*/\1/p' | sort -u) + ethernet_interfaces=() + for interface in $interface_list; do + interface_info=$(ip link show "${interface}") + if [[ "$interface_info" =~ 'SLAVE' ]]; then + echo "$(date -u) Not adding slave interface: ${interface}" + continue + elif [[ "$interface_info" =~ 'link-netns' ]]; then + echo "$(date -u) Not adding namespaced interface: ${interface}" + continue + elif [[ "$interface_info" =~ 'link/ether' ]]; then + echo "$(date -u) Including ethernet interface: ${interface}" + ethernet_interfaces+=("$interface") + else + echo "$(date -u) Skipping non-ethernet interface: ${interface}" + fi + done + echo "$(date -u) List of found ethernet interfaces is: ${ethernet_interfaces[*]}" + + if [[ "${#ethernet_interfaces[@]}" -eq 0 ]]; then + echo "$(date -u) Cannot find any ethernet network interface" + exit 1 + fi + + for interface in "${ethernet_interfaces[@]}"; do + if lnetctl net show --net tcp | grep -q "\b${interface}\b"; then + echo "$(date -u) Interface already added, skipping: ${interface}" + else + echo "$(date -u) Adding interface: ${interface}" + lnetctl net add --net tcp --if "${interface}" + fi + done +} + installClientPackages=${AZURELUSTRE_CSI_INSTALL_LUSTRE_CLIENT:-yes} echo "installClientPackages: ${installClientPackages}" requiredLustreVersion=${LUSTRE_VERSION:-"2.15.1"} echo "requiredLustreVersion: ${requiredLustreVersion}" -pkgVersion="${requiredLustreVersion}-24-gbaa21ca" +pkgVersion="${requiredLustreVersion}-29-gbae0abe" echo "pkgVersion: ${pkgVersion}" pkgName="amlfs-lustre-client-${pkgVersion}" @@ -80,16 +118,19 @@ if [[ "${installClientPackages}" == "yes" ]]; then echo "$(date -u) Installed Lustre client packages." - # Issue #115 Remove workaround for LNET fix - # Revert below LNET fix, please don't remove the lines to cleanup rule files - init_lnet="true" - + if lsmod | grep "^lnet"; then if lnetctl net show --net tcp | grep interfaces; then - echo "$(date -u) LNet is loaded skip the load." + echo "$(date -u) LNet is loaded skip the load" + echo "$(date -u) Adding missing interfaces" + add_net_interfaces init_lnet="false" - fi + elif lnetctl net show | grep "net type: tcp"; then + # There may be a default configuration with no interface. + # This is configured by an old version CSI. + lnetctl net del --net tcp + fi fi if [[ "${init_lnet}" == "true" ]]; then @@ -97,46 +138,22 @@ if [[ "${installClientPackages}" == "yes" ]]; then modprobe -v lnet lnetctl lnet configure - echo "$(date -u) Determining the default network interface." - # perl will be installed as dependency by luster client - echo "$(date -u) Route table is:" - ip route list - default_interface=$(ip route list | perl -n -e'/default via [0-9.]+ dev ([0-9a-zA-Z]+) / && print $1') - echo "$(date -u) Default network interface is ${default_interface}" - - if [[ "${default_interface}" == "" ]]; then - echo "$(date -u) Cannot determine the default network interface" - exit 1 - fi - - if lnetctl net show | grep "net type: tcp"; then - # There may be a default configuration with no interface. - # This is configured by an old version CSI. - lnetctl net del --net tcp + add_net_interfaces + + # Remove old udev rules + should_reload_udev="false" + for rule_file in /etc/udev/rules.d/{73-netadd,74-netremove,98-netadd,99-netremove}.rules; do + if [[ -e ${rule_file} ]]; then + echo "Deleting unnecessary udev rule: ${rule_file}" + rm -f "${rule_file}" + should_reload_udev="true" + fi + done + if [[ "${should_reload_udev}" == "true" ]]; then + echo "$(date -u) Reloading udevadm" + udevadm control --reload fi - lnetctl net add --net tcp --if "${default_interface}" - - echo "$(date -u) Adding the udev script." - test -e /etc/lustre || mkdir /etc/lustre - touch /etc/lustre/.lock - test -e /etc/lustre/fix-lnet.sh && rm -f /etc/lustre/fix-lnet.sh - sed -i "s/{default_interface}/${default_interface}/g;" ./fix-lnet.sh - cp ./fix-lnet.sh /etc/lustre - - # legacy rules 73 & 74 - test -e /etc/udev/rules.d/73-netadd.rules && rm -f /etc/udev/rules.d/73-netadd.rules - test -e /etc/udev/rules.d/74-netremove.rules && rm -f /etc/udev/rules.d/74-netremove.rules - - # current rules 98 & 99 - test -e /etc/udev/rules.d/98-netadd.rules && rm -f /etc/udev/rules.d/98-netadd.rules - test -e /etc/udev/rules.d/99-netremove.rules && rm -f /etc/udev/rules.d/99-netremove.rules - - echo 'SUBSYSTEM=="net", ACTION=="add", RUN+="/etc/lustre/fix-lnet.sh"' | tee /etc/udev/rules.d/98-netadd.rules - echo 'SUBSYSTEM=="net", ACTION=="remove", RUN+="/etc/lustre/fix-lnet.sh"' | tee /etc/udev/rules.d/99-netremove.rules - - echo "$(date -u) Reloading udevadm" - udevadm control --reload echo "$(date -u) Done" fi diff --git a/pkg/azurelustreplugin/fix-lnet.sh b/pkg/azurelustreplugin/fix-lnet.sh deleted file mode 100644 index 11972be75..000000000 --- a/pkg/azurelustreplugin/fix-lnet.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# Issue #115 Remove workaround for LNET fix -# Delete this bash script - -/usr/bin/logger "PID $$: Start fix-lnet" -count=1; - -# try fix lnet 5 times maximum -for sleep_in_secs in 0 0.5 0.5 0.5 0.5; do - sleep $sleep_in_secs - - break_flag=$( - ( - break_flag_inner=false - flock -w 60 -e ${FD} - if sudo lnetctl net show --net tcp | grep "status: down"; then - /usr/sbin/lnetctl net del --net tcp - /usr/sbin/lnetctl net add --net tcp --if {default_interface} - break_flag_inner=true - fi - echo $break_flag_inner - ) {FD}< /etc/lustre/.lock - ) - - if [[ $break_flag == true ]]; then - break - else - /usr/bin/logger "PID $$: Skipped fix-lnet, count=$count" - count=$((count+1)) - fi -done diff --git a/test/integration_aks/image/run_integration_test.sh b/test/integration_aks/image/run_integration_test.sh index 17a9f587d..9d1d0f4cf 100644 --- a/test/integration_aks/image/run_integration_test.sh +++ b/test/integration_aks/image/run_integration_test.sh @@ -26,7 +26,7 @@ readonly target_path="/tmp/target_path" readonly lustre_fs_name=$1 readonly lustre_fs_ip=$2 readonly lustre_client_version="2.15.1" -readonly pkgVersion="${lustre_client_version}-24-gbaa21ca" +readonly pkgVersion="${lustre_client_version}-29-gbae0abe" readonly pkgName="amlfs-lustre-client-${pkgVersion}" mkdir -p $target_path diff --git a/test/long-haul/start-long-haul.sh b/test/long-haul/start-long-haul.sh index e8dc5b8e6..a30ee7e98 100755 --- a/test/long-haul/start-long-haul.sh +++ b/test/long-haul/start-long-haul.sh @@ -43,14 +43,11 @@ print_logs_case "Executing fault test" print_logs_case "Executing update test" ./update-test.sh -# Issue #115 Remove workaround for LNET fix -# Enable perf/scale test -# print_logs_case "Executing perf/scale test" -# ./perf-scale-test.sh - -# Issue #115 Remove workaround for LNET fix -# print_logs_case "Executing external e2e test" -# ./external-e2e.sh +print_logs_case "Executing perf/scale test" +./perf-scale-test.sh + +print_logs_case "Executing external e2e test" +./external-e2e.sh print_logs_case "Executing cleanup" kubectl apply -f ./cleanup/cleanupjob.yaml