Skip to content

Commit

Permalink
validate files after checkout on NFS
Browse files Browse the repository at this point in the history
Summary:
## Context
S439820 is an ongoing SEV that has been challenging to reproduce. Initially, we had some repros, but they suddenly disappeared. We suspect that NFS cache invalidation might be causing stale files on macOS. There is no direct way to invalidate cache in NFS. Therefore, we have to use some hacky ways to invalidate cache on macOS see this diff summary for detailed explanation D35435764

## This Diff Stack
This diff stack aims to implement an invalidation check method to verify the invalidation after each checkout on macOS. This will help us detect how many users are affected by the SEV and confirm whether our NFS cache invalidation approach is working correctly.
Also, if we find a large number of users are affected by the SEV, and we try any fix for macOS invalidation this check can confirm if any fix works in prod.

## This diff

After checkout we can verify all the invalidations, and get log of any invalidation which didn't happen. It compares the sha1 of the file from disk with the sha1 from inode.

Reviewed By: jdelliot, genevievehelsel

Differential Revision: D63544196

fbshipit-source-id: e92b064fab1675f96782a913c0cafbf99f29f00e
  • Loading branch information
kavehahmadi60 authored and facebook-github-bot committed Jan 30, 2025
1 parent ee57405 commit df7fc3b
Show file tree
Hide file tree
Showing 4 changed files with 165 additions and 0 deletions.
7 changes: 7 additions & 0 deletions eden/fs/inodes/ServerState.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

#include "eden/fs/inodes/ServerState.h"

#include <folly/executors/CPUThreadPoolExecutor.h>
#include <folly/executors/task_queue/LifoSemMPMCQueue.h>
#include <folly/logging/xlog.h>
#include <gflags/gflags.h>

Expand Down Expand Up @@ -64,6 +66,11 @@ ServerState::ServerState(
privHelper_{std::move(privHelper)},
threadPool_{std::move(threadPool)},
fsChannelThreadPool_{std::move(fsChannelThreadPool)},
validationThreadPool_{std::make_shared<folly::CPUThreadPoolExecutor>(
initialConfig.numVerifierThreads.getValue(),
std::make_unique<
folly::LifoSemMPMCQueue<folly::CPUThreadPoolExecutor::CPUTask>>(
10 * initialConfig.maxNumberOfInvlidationsToVerify.getValue()))},
clock_{std::move(clock)},
processInfoCache_{std::move(processInfoCache)},
structuredLogger_{std::move(structuredLogger)},
Expand Down
18 changes: 18 additions & 0 deletions eden/fs/inodes/ServerState.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,23 @@ class ServerState {
return fsChannelThreadPool_;
}

/**
* Gets a thread pool for running validation. Validation will read file
* contents through the filesystem. Reads through the filesystem can call
* back into EdenFS, so we need to ensure that validation does not block
* any of the threads that EdenFS uses to serve filesystem operations.
*
* It's pretty similar to the invalidation threadpool that the channels use.
* However, this thread pool also errors when reaches capacity rather than
* blocking. We want this threadpoool to be bounded because we don't want
* blocking here to increase memory usage until we OOM. Additionally, we don't
* want to block because this could block checkout. Validation is an
* asynchronous action that should not effect EdenFS behavior.
*/
const std::shared_ptr<folly::Executor>& getValidationThreadPool() const {
return validationThreadPool_;
}

/**
* Get the Clock.
*/
Expand Down Expand Up @@ -216,6 +233,7 @@ class ServerState {
std::shared_ptr<PrivHelper> privHelper_;
std::shared_ptr<UnboundedQueueExecutor> threadPool_;
std::shared_ptr<folly::Executor> fsChannelThreadPool_;
std::shared_ptr<folly::Executor> validationThreadPool_;
std::shared_ptr<Clock> clock_;
std::shared_ptr<ProcessInfoCache> processInfoCache_;
std::shared_ptr<StructuredLogger> structuredLogger_;
Expand Down
123 changes: 123 additions & 0 deletions eden/fs/service/EdenServer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
#include "eden/fs/config/MountProtocol.h"
#include "eden/fs/config/TomlConfig.h"
#include "eden/fs/inodes/EdenMount.h"
#include "eden/fs/inodes/FileInode.h"
#include "eden/fs/inodes/InodeAccessLogger.h"
#include "eden/fs/inodes/InodeBase.h"
#include "eden/fs/inodes/InodeMap.h"
Expand Down Expand Up @@ -2119,6 +2120,128 @@ ImmediateFuture<CheckoutResult> EdenServer::checkOutRevision(
}
});
}
// Verify if cache file invalidation works as expected after
// checkout
// TODO T213849128: This is to collect data for S439820.
// We can remove this once SEV close.
if (isNfs &&
serverState_->getReloadableConfig()
->getEdenConfig()
->verifyFilesAfterCheckout.getValue()) {
std::vector<InodeNumber> inodesToValidate;
inodesToValidate.swap(result.sampleInodesToValidate);
folly::via(
this->getServerState()->getValidationThreadPool().get(),
[this,
inodesToValidate = std::move(inodesToValidate),
mountPath = mountPath.copy(),
maxSizeOfFileToVerifyInvalidation =
this->serverState_->getReloadableConfig()
->getEdenConfig()
->maxSizeOfFileToVerifyInvalidation.getValue(),
structuredLogger =
this->serverState_->getStructuredLogger()]() {
// for each inode
std::vector<ImmediateFuture<std::tuple<
RelativePath,
InodeNumber,
std::optional<Hash20>>>>
expectedContents;
{
// note we don't want to hold the mount handle in the
// capture because that would block shutdown until this
// future completes. This check can block waiting for the
// eden fuse threads to complete an fs request. We don't
// want to make a cycle, so to be safe we just attempt to
// lookup the mount and gracefully handle errors.
auto mountHandle = this->getMount(mountPath);
auto inodeMap = mountHandle.getEdenMount().getInodeMap();
for (auto& inodeNum : inodesToValidate) {
auto inode = inodeMap->lookupLoadedInode(inodeNum);
if (!inode) {
// Skip if inode doesn't exist because it is already
// invalidated
continue;
}
auto path = inode->getPath();
if (!path.has_value()) {
// skip if we can't get the path because we
// can't validate it anyway
continue;
}
if (auto fileInode = inode.asFile()) {
expectedContents.emplace_back(
fileInode
->getSha1(
ObjectFetchContext::getNullContext())
.thenValue(
[path = inode->getPath(),
inodeNum](auto&& sha1) mutable
-> std::tuple<
RelativePath,
InodeNumber,
std::optional<Hash20>> {
return {
std::move(path).value(),
inodeNum,
std::move(sha1)};
}));
} else {
expectedContents.emplace_back(std::tuple<
RelativePath,
InodeNumber,
std::optional<Hash20>>(
std::move(path).value(), inodeNum, std::nullopt));
}
}
}

return collectAll(std::move(expectedContents))
.thenValue([mountPath = mountPath.copy(),
maxSizeOfFileToVerifyInvalidation,
structuredLogger](auto&& expectedResults) {
for (auto& result : expectedResults) {
if (!result.hasValue()) {
continue;
}
auto [path, inodeNum, sha1Result] = result.value();
auto fileStat =
getFileStat((mountPath + path).value().c_str());
if (!fileStat.hasValue()) {
// Skip if file stat is not available
continue;
} else if (
fileStat.value().size >
maxSizeOfFileToVerifyInvalidation) {
// Skip verification for large files
continue;
}
// sha1 the path
std::string contents;
folly::readFile(
(mountPath + path).value().c_str(), contents);

auto actualSha1 = Hash20::sha1(contents);

if (actualSha1 != sha1Result) {
// log to scuba.
structuredLogger->logEvent(StaleContents{
path.value(), inodeNum.getRawValue()});
std::string sha1ResultStr =
sha1Result ? sha1Result->toString() : "none";
XLOG(
WARN,
"Stale file contents after checkout for path {} (ino {}) - actual sha1: {} , expected sha1: {}",
path.value(),
inodeNum.getRawValue(),
actualSha1.toString(),
sha1ResultStr);
}
}
});
});
}

return std::move(result);
});

Expand Down
17 changes: 17 additions & 0 deletions eden/fs/telemetry/LogEvent.h
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,23 @@ struct FinishedCheckout : public EdenFSEvent {
}
};

struct StaleContents : public EdenFSEvent {
std::string path;
uint64_t ino;

explicit StaleContents(std::string path, uint64_t ino)
: path(std::move(path)), ino(ino) {}

void populate(DynamicEvent& event) const override {
event.addString("path", path);
event.addInt("ino", ino);
}

const char* getType() const override {
return "stale_contents";
}
};

struct FinishedMount : public EdenFSEvent {
std::string backing_store_type;
std::string repo_type;
Expand Down

0 comments on commit df7fc3b

Please sign in to comment.