Skip to content

Commit

Permalink
add retries to file utils
Browse files Browse the repository at this point in the history
Summary:
# This diff

Adds some functionality to the Sapling path utils to retry certain kinds of IO errors. Currently, these retries are only configured on macOS, and the only type of error that is retried is ErrorKind::TimedOut

# Context

On macOS NFS mounts, IO operations can sometimes return "OS Error 60" which indicates that an operation timed out. This can happen if the NFS Server doesn't respond to an NFS request quickly enough. The error doesn't necessarily mean the file is unavailable; it just means the server didn't respond quickly enough.

In many cases, retrying the IO request would yield a successful result. However, many tools don't know that these error cases can be retried.

This diff teaches some of Sapling's file utilities to retry OS Error 60 errors on macOS.

Reviewed By: muirdm

Differential Revision: D68801396

fbshipit-source-id: e6e1239a484dde64fa2381263c844f6330be1fcf
  • Loading branch information
MichaelCuevas authored and facebook-github-bot committed Feb 1, 2025
1 parent 8c27713 commit 39a650c
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 11 deletions.
3 changes: 1 addition & 2 deletions eden/scm/lib/util/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ anyhow = "1.0.95"
dirs = "2.0"
fn-error-context = "0.2"
fs2 = "0.4"
once_cell = "1.12"
rand = { version = "0.8", features = ["small_rng"] }
sapling-atomicfile = { version = "0.1.0", path = "../atomicfile" }
sapling-lazystr = { version = "0.1.0", path = "../lazystr" }
Expand All @@ -34,12 +35,10 @@ tempfile = "3.8"

[target.'cfg(target_os = "linux")'.dependencies]
libc = "0.2.139"
once_cell = "1.12"
tempfile = "3.8"

[target.'cfg(target_os = "macos")'.dependencies]
libc = "0.2.139"
once_cell = "1.12"
tempfile = "3.8"

[target.'cfg(target_os = "windows")'.dependencies]
Expand Down
111 changes: 102 additions & 9 deletions eden/scm/lib/util/src/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,17 @@ use std::fs::OpenOptions;
use std::io;
use std::path::Path;

#[cfg(unix)]
use once_cell::sync::Lazy;

use crate::errors::IOContext;

static MAX_IO_RETRIES: Lazy<u32> = Lazy::new(|| {
std::env::var("SL_IO_RETRIES")
.unwrap_or("3".to_string())
.parse::<u32>()
.unwrap_or(3)
});

#[cfg(unix)]
static UMASK: Lazy<u32> = Lazy::new(|| unsafe {
let umask = libc::umask(0);
Expand All @@ -29,12 +35,20 @@ pub fn apply_umask(mode: u32) -> u32 {
}

pub fn atomic_write(path: &Path, op: impl FnOnce(&mut File) -> io::Result<()>) -> io::Result<File> {
// Can't implement retries on IO timeouts because op is FnOnce
atomicfile::atomic_write(path, 0o644, false, op).path_context("error atomic writing file", path)
}

/// Open a path for atomic writing.
pub fn atomic_open(path: &Path) -> io::Result<atomicfile::AtomicFile> {
atomicfile::AtomicFile::open(path, 0o644, false).path_context("error atomic opening file", path)
let mut open_fn = |p: &Path| -> io::Result<atomicfile::AtomicFile> {
atomicfile::AtomicFile::open(p, 0o644, false)
};

match with_retry(&mut open_fn, path) {
Ok(m) => Ok(m),
Err(err) => Err(err).path_context("error opening file", path),
}
}

pub fn open(path: impl AsRef<Path>, mode: &str) -> io::Result<File> {
Expand All @@ -58,40 +72,72 @@ pub fn open(path: impl AsRef<Path>, mode: &str) -> io::Result<File> {
}
};
}
let mut open_fn = |p: &Path| -> io::Result<File> { opts.open(p) };

opts.open(path).path_context("error opening file", path)
match with_retry(&mut open_fn, path) {
Ok(m) => Ok(m),
Err(err) => Err(err).path_context("error opening file", path),
}
}

pub fn create(path: impl AsRef<Path>) -> io::Result<File> {
open(path, "wct")
}

fn is_retryable(err: &io::Error) -> bool {
cfg!(target_os = "macos") && err.kind() == io::ErrorKind::TimedOut
}

fn with_retry<'a, F, T>(io_operation: &mut F, path: &'a Path) -> io::Result<T>
where
F: FnMut(&'a Path) -> io::Result<T>,
{
let mut retries: u32 = 0;
loop {
match io_operation(path) {
Ok(v) => return Ok(v),
Err(err) if is_retryable(&err) => {
if retries >= *MAX_IO_RETRIES {
return Err(err);
}
retries += 1;
}
Err(err) => return Err(err),
}
}
}

pub fn exists(path: impl AsRef<Path>) -> io::Result<Option<std::fs::Metadata>> {
match std::fs::metadata(path.as_ref()) {
let path = path.as_ref();
match with_retry(&mut std::fs::metadata, path) {
Ok(m) => Ok(Some(m)),
Err(err) if err.kind() == io::ErrorKind::NotFound => Ok(None),
Err(err) => Err(err).path_context("error reading file", path.as_ref()),
Err(err) => Err(err).path_context("error reading file", path),
}
}

pub fn unlink_if_exists(path: impl AsRef<Path>) -> io::Result<()> {
match std::fs::remove_file(path.as_ref()) {
let path = path.as_ref();
match with_retry(&mut std::fs::remove_file, path) {
Ok(()) => Ok(()),
Err(err) if err.kind() == io::ErrorKind::NotFound => Ok(()),
Err(err) => Err(err).path_context("error deleting file", path.as_ref()),
Err(err) => Err(err).path_context("error deleting file", path),
}
}

pub fn read_to_string_if_exists(path: impl AsRef<Path>) -> io::Result<Option<String>> {
match std::fs::read_to_string(path.as_ref()) {
let path = path.as_ref();
match with_retry(&mut std::fs::read_to_string, path) {
Ok(contents) => Ok(Some(contents)),
Err(err) if err.kind() == io::ErrorKind::NotFound => Ok(None),
Err(err) => Err(err).path_context("error reading file", path.as_ref()),
Err(err) => Err(err).path_context("error reading file", path),
}
}

#[cfg(test)]
mod test {
use std::io::Read;

use anyhow::Result;
use tempfile::tempdir;

Expand All @@ -113,4 +159,51 @@ mod test {

Ok(())
}

#[test]
fn test_retry_io() -> Result<()> {
// NOTE: These test cases are run together because running them separately sometimes fails
// when testing with cargo. This is because the tests are run in parallel, and
// std::evn::var() can overwrite environment variable values for other running tests.
//
// To avoid this, we run the tests serially in a single test case.
let dir = tempdir()?;
let path = dir.path().join("test");
std::fs::write(&path, "test")?;
let mut retries = 0;

let mut open_fn = |p: &Path| -> io::Result<File> {
retries += 1;
if retries >= (*MAX_IO_RETRIES) {
std::fs::File::open(p)
} else {
Err(io::Error::new(io::ErrorKind::TimedOut, "timed out"))
}
};

let file = with_retry(&mut open_fn, &path);

if cfg!(target_os = "macos") {
let mut file = file?;
assert_eq!(retries, *MAX_IO_RETRIES);
let mut buf = String::new();
file.read_to_string(&mut buf)?;
assert_eq!(buf, "test");
} else {
file.as_ref().err();
assert_eq!(
file.err().map(|e| e.kind()).unwrap(),
io::ErrorKind::TimedOut
);
assert_eq!(retries, 1);
}

// Test error case
let dir = tempdir()?;
let path = dir.path().join("does_not_exist");
let res = read_to_string_if_exists(path)?;
assert_eq!(res, Option::None);

Ok(())
}
}

0 comments on commit 39a650c

Please sign in to comment.