Skip to content

Commit

Permalink
CA-392674: nbd_client_manager retry connect on nbd device busy (#6021)
Browse files Browse the repository at this point in the history
to connect to nbd devices, nbd_client_manager will
1. protect the operation with /var/run/nonpersistent/nbd_client_manager
file lock
2. check whether nbd is being used by `nbd-client -check`
3. load nbd kernel module by `modprobe nbd`
4. call `nbd-client` to connect to nbd device

However, step 3 will trigger systemd-udevd run asyncly, which would open
and lock the same nbd devices, run udev rules, etc. This introduce races
with step 4, e.g. both process want to open and lock the nbd device.

Note: the file lock in step 1 does NOT resovle the issue here, as it
only coordinate multiple nbd_client_manager processes.

To fix the issue,
- we patch nbd-client to report the device busy from kernel to
nbd_client_manager
- nbd_client_manager should check nbd-client exit code, and retry on
device busy
  • Loading branch information
liulinC authored Oct 14, 2024
2 parents 7670247 + 1e55415 commit 34a8796
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 12 deletions.
36 changes: 26 additions & 10 deletions python3/libexec/nbd_client_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
# Don't wait more than 10 minutes for the NBD device
MAX_DEVICE_WAIT_MINUTES = 10

# According to https://github.com/thom311/libnl/blob/main/include/netlink/errno.h#L38
NLE_BUSY = 25

class InvalidNbdDevName(Exception):
"""
Expand Down Expand Up @@ -80,7 +82,7 @@ def __exit__(self, *args):
FILE_LOCK = FileLock(path=LOCK_FILE)


def _call(cmd_args, error=True):
def _call(cmd_args, raise_err=True, log_err=True):
"""
[call cmd_args] executes [cmd_args] and returns the exit code.
If [error] and exit code != 0, log and throws a CalledProcessError.
Expand All @@ -94,14 +96,16 @@ def _call(cmd_args, error=True):

_, stderr = proc.communicate()

if error and proc.returncode != 0:
LOGGER.error(
"%s exited with code %d: %s", " ".join(cmd_args), proc.returncode, stderr
)
if proc.returncode != 0:
if log_err:
LOGGER.error(
"%s exited with code %d: %s", " ".join(cmd_args), proc.returncode, stderr
)

raise subprocess.CalledProcessError(
returncode=proc.returncode, cmd=cmd_args, output=stderr
)
if raise_err:
raise subprocess.CalledProcessError(
returncode=proc.returncode, cmd=cmd_args, output=stderr
)

return proc.returncode

Expand All @@ -116,7 +120,7 @@ def _is_nbd_device_connected(nbd_device):
if not os.path.exists(nbd_device):
raise NbdDeviceNotFound(nbd_device)
cmd = ["nbd-client", "-check", nbd_device]
returncode = _call(cmd, error=False)
returncode = _call(cmd, raise_err=False, log_err=False)
if returncode == 0:
return True
if returncode == 1:
Expand Down Expand Up @@ -191,6 +195,8 @@ def connect_nbd(path, exportname):
"""Connects to a free NBD device using nbd-client and returns its path"""
# We should not ask for too many nbds, as we might not have enough memory
_call(["modprobe", "nbd", "nbds_max=24"])
# Wait for systemd-udevd to process the udev rules
_call(["udevadm", "settle", "--timeout=30"])
retries = 0
while True:
try:
Expand All @@ -206,7 +212,17 @@ def connect_nbd(path, exportname):
"-name",
exportname,
]
_call(cmd)
ret = _call(cmd, raise_err=False, log_err=True)
if NLE_BUSY == ret:
# Although _find_unused_nbd_device tell us the nbd devcie is
# not connected by other nbd-client, it may be opened and locked
# by other process like systemd-udev, raise NbdDeviceNotFound to retry
LOGGER.warning("Device %s is busy, will retry", nbd_device)
raise NbdDeviceNotFound(nbd_device)

if 0 != ret:
raise subprocess.CalledProcessError(returncode=ret, cmd=cmd)

_wait_for_nbd_device(nbd_device=nbd_device, connected=True)
_persist_connect_info(nbd_device, path, exportname)
nbd = (
Expand Down
6 changes: 4 additions & 2 deletions python3/tests/test_nbd_client_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ def test_nbd_device_connected(self, mock_call, mock_exists):
result = nbd_client_manager._is_nbd_device_connected('/dev/nbd0')

self.assertTrue(result)
mock_call.assert_called_once_with(["nbd-client", "-check", "/dev/nbd0"], error=False)
mock_call.assert_called_once_with(["nbd-client", "-check", "/dev/nbd0"],
raise_err=False, log_err=False)

@patch('nbd_client_manager._call')
def test_nbd_device_not_connected(self, mock_call, mock_exists):
Expand All @@ -53,7 +54,8 @@ def test_nbd_device_not_connected(self, mock_call, mock_exists):
result = nbd_client_manager._is_nbd_device_connected('/dev/nbd1')

self.assertFalse(result)
mock_call.assert_called_once_with(["nbd-client", "-check", "/dev/nbd1"], error=False)
mock_call.assert_called_once_with(["nbd-client", "-check", "/dev/nbd1"],
raise_err=False, log_err=False)

def test_nbd_device_not_found(self, mock_exists):
mock_exists.return_value = False
Expand Down

0 comments on commit 34a8796

Please sign in to comment.