diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..44766c5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +/debug/ +/target/ +/qemu/ + +*.lock diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..876772d --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,15 @@ +[workspace] +members = [ + "permanent_common", + "permanent_plugin", + "permanent_trace", + "permanent_cig", + "permanent_tester", +] +exclude = [ + # Needs to be built with musl. + "fs-testing/fs-dump", +] + +[profile.release] +debug = true diff --git a/README.md b/README.md new file mode 100644 index 0000000..dbdf17e --- /dev/null +++ b/README.md @@ -0,0 +1,33 @@ +# Permanent: Persistent Memory and NVMe Non-Deterministic Tester + +This is the source code of Permanent, a tool for automated crash consistency testing for file systems using persistent memory and NVMe. +It is a combination of [Vinter](https://os.itec.kit.edu/65_3814.php) [(source)](https://github.com/KIT-OSGroup/vinter) and [Revin](https://os.itec.kit.edu/97_3853.php). + +## Setup + +```sh +# install dependencies +# TODO + +# Rust via rustup (see https://rustup.rs) +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +rustup target add x86_64-unknown-linux-musl + +# get QEMU +wget https://download.qemu.org/qemu-8.0.4.tar.xz +tar xvJf qemu-8.0.4.tar.xz && mv qemu-8.0.4 qemu +# apply patch +pushd qemu && patch -p1 < ../qemu.patch && popd +# configure QEMU +mkdir qemu/build && pushd qemu/build && ../configure --target-list=x86_64-softmmu --enable-debug --enable-plugins && popd +# build QEMU +make -C qemu/build -j$(nproc) + +# build permanent +./build.sh + +``` + +## License + +Permanent is released under the MIT license, see `LICENSE` for details. 
diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..ba87703 --- /dev/null +++ b/build.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +set -eux + +cargo build --release + +# build fs-dump +(cd fs-testing/fs-dump && cargo build --release --target=x86_64-unknown-linux-musl) + +# build checkpoint +(cd permanent_plugin && ./build_checkpoint.sh) + +# build initramfs +make -C fs-testing/initramfs diff --git a/fs-testing/fs-dump/.gitignore b/fs-testing/fs-dump/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/fs-testing/fs-dump/.gitignore @@ -0,0 +1 @@ +/target diff --git a/fs-testing/fs-dump/Cargo.toml b/fs-testing/fs-dump/Cargo.toml new file mode 100644 index 0000000..89b3f2a --- /dev/null +++ b/fs-testing/fs-dump/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "fs-dump" +version = "0.1.0" +edition = "2018" + +[dependencies] +walkdir = "2" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" diff --git a/fs-testing/fs-dump/src/main.rs b/fs-testing/fs-dump/src/main.rs new file mode 100644 index 0000000..0259ecc --- /dev/null +++ b/fs-testing/fs-dump/src/main.rs @@ -0,0 +1,85 @@ +use serde::Serialize; +use std::collections::BTreeMap; +use std::os::unix::fs::MetadataExt; +use std::fs::File; +use std::io::Read; +use walkdir::WalkDir; + +#[derive(Serialize)] +struct FileAttrs { + typeflag: String, + #[serde(skip_serializing_if = "Option::is_none")] + content: Option, + #[serde(skip_serializing_if = "Option::is_none")] + target: Option, + st_ino: u64, + st_mode: u32, + st_nlink: u64, + st_uid: u32, + st_gid: u32, + st_size: u64, + st_blocks: u64, + st_atim_sec: i64, + st_atim_nsec: i64, + st_mtim_sec: i64, + st_mtim_nsec: i64, + st_ctim_sec: i64, + st_ctim_nsec: i64, +} + +fn main() { + let args: Vec = std::env::args().collect(); + let (path, dump_contents) = match args.len() { + 2 => (&args[1], false), + 3 if args[1] == "--contents" => (&args[2], true), + _ => { + println!("usage: {} [--contents] ", args[0]); + 
std::process::exit(1); + } + }; + let mut result = BTreeMap::new(); + for entry in WalkDir::new(path) { + let entry = entry.expect("could not read dir entry"); + let metadata = entry.metadata().expect("could not retrieve file metadata"); + result.insert( + entry.path().to_string_lossy().into_owned(), + FileAttrs { + typeflag: match entry.file_type() { + t if t.is_file() => "F", + t if t.is_dir() => "D", + t if t.is_symlink() => "SL", + _ => panic!("unexpected file type at {}", entry.path().display()) + }.to_string(), + content: if dump_contents && entry.file_type().is_file() { + let mut file = File::open(entry.path()).expect("could not open file"); + let mut contents = String::new(); + file.read_to_string(&mut contents).expect("could not read file contents"); + Some(contents) + } else { + None + }, + target: if dump_contents && entry.file_type().is_symlink() { + Some(std::fs::read_link(entry.path()).expect("could not read symlink").to_string_lossy().into_owned()) + } else { + None} + , + st_ino: metadata.ino(), + st_mode: metadata.mode(), + st_nlink: metadata.nlink(), + st_uid: metadata.uid(), + st_gid: metadata.gid(), + st_size: metadata.size(), + st_blocks: metadata.blocks(), + st_atim_sec: metadata.atime(), + st_atim_nsec: metadata.atime_nsec(), + st_mtim_sec: metadata.mtime(), + st_mtim_nsec: metadata.mtime_nsec(), + st_ctim_sec: metadata.ctime(), + st_ctim_nsec: metadata.ctime_nsec(), + }, + ); + } + print!("PERMANENT START"); + serde_json::to_writer_pretty(std::io::stdout(), &result).expect("could not serialize JSON"); + print!("PERMANENT END"); +} diff --git a/fs-testing/initramfs/.gitignore b/fs-testing/initramfs/.gitignore new file mode 100644 index 0000000..b49a42e --- /dev/null +++ b/fs-testing/initramfs/.gitignore @@ -0,0 +1,4 @@ +/initramfs +/busybox +/busybox_pmfs +*.cpio.gz diff --git a/fs-testing/initramfs/Makefile b/fs-testing/initramfs/Makefile new file mode 100644 index 0000000..64a4565 --- /dev/null +++ b/fs-testing/initramfs/Makefile @@ -0,0 
# Files that every initramfs variant is built from.
INITRAMFS_DEPS := \
	create-initramfs.sh \
	init \
	busybox \
	../../permanent_plugin/checkpoint \
	../fs-dump/target/x86_64-unknown-linux-musl/release/fs-dump

default: initramfs.cpio.gz
.PHONY: default

# The archives are produced with "> $@". Without this, a failing recipe leaves
# a truncated target behind that make would treat as up to date on the next run.
.DELETE_ON_ERROR:

initramfs.cpio.gz: $(INITRAMFS_DEPS)
	./create-initramfs.sh > $@

initramfs_mainline.cpio.gz: $(INITRAMFS_DEPS) copy-binary.sh
	rm -rf initramfs_mainline
	# Recipes run under /bin/sh, which need not support brace expansion,
	# so the directories are spelled out.
	mkdir -p initramfs_mainline/bin initramfs_mainline/lib
	ln -s lib initramfs_mainline/lib64
	./copy-binary.sh `which mkfs.ext4` initramfs_mainline
	./copy-binary.sh `which fsck.ext4` initramfs_mainline
	./copy-binary.sh `which mkfs.xfs` initramfs_mainline
	./copy-binary.sh `which xfs_repair` initramfs_mainline
	./create-initramfs.sh initramfs_mainline > $@

initramfs_zilpmem.cpio.gz: $(INITRAMFS_DEPS) ../zil-pmem/initramfs_zfs
	./create-initramfs.sh ../zil-pmem/initramfs_zfs > $@

busybox:
	# -f: fail on HTTP errors instead of saving the error page as "busybox".
	curl -f -o $@ https://busybox.net/downloads/binaries/1.35.0-x86_64-linux-musl/busybox
	chmod +x $@
#!/usr/bin/env bash
# pipefail: the archive is produced by a find | cpio | gzip pipeline. Without
# it, a failure of find or cpio is hidden by gzip's exit status and a broken
# archive is written without any error.
set -euo pipefail

# Usage: create-initramfs.sh [folder with extra files] > initramfs.cpio.gz
#
# Assembles the initramfs tree in ./initramfs (next to this script) and writes
# the gzip-compressed newc cpio archive to stdout.

cd "$( dirname "${BASH_SOURCE[0]}" )"

# Always start from a clean tree.
rm -rf initramfs

# put extra stuff in $1
if [[ -n "${1:-}" ]]; then
    cp -rH "$1" initramfs
fi

mkdir -p initramfs/{bin,mnt,proc,sys}

cp init initramfs
"${BUSYBOX:-./busybox}" --install initramfs/bin
cp ../../permanent_plugin/checkpoint initramfs/bin
cp ../fs-dump/target/x86_64-unknown-linux-musl/release/fs-dump initramfs/bin

(cd initramfs && find . -print0 | cpio --owner root:root --null -ov --format=newc | gzip -9)
"$f" + done +fi + +exec /bin/sh diff --git a/fs-testing/tests/test_append.yaml b/fs-testing/tests/test_append.yaml new file mode 100644 index 0000000..2078144 --- /dev/null +++ b/fs-testing/tests/test_append.yaml @@ -0,0 +1,3 @@ +trace_cmd_suffix: "checkpoint 0 && echo -n test > /mnt/myfile && sync && checkpoint 1 && sleep 2 && echo -n appendedalignedtext01234 >> /mnt/myfile && sync && checkpoint 2" +checkpoint_range: [0, 2] +dump_cmd_suffix: "echo test >> /mnt/myfile && rm -r /mnt/*" diff --git a/fs-testing/tests/test_atime.yaml b/fs-testing/tests/test_atime.yaml new file mode 100644 index 0000000..16d043e --- /dev/null +++ b/fs-testing/tests/test_atime.yaml @@ -0,0 +1,5 @@ +# atomically update access time +# touch with old timestamp first to trick relatime feature +trace_cmd_suffix: "checkpoint 0 && echo -n test > /mnt/myfile && touch -d '2020-01-01 00:00:00' /mnt/myfile && sync && checkpoint 1 && sleep 2 && cat /mnt/myfile > /dev/null && sync && checkpoint 2" +checkpoint_range: [0, 2] +dump_cmd_suffix: "echo test >> /mnt/myfile && rm -r /mnt/*" diff --git a/fs-testing/tests/test_chmod.yaml b/fs-testing/tests/test_chmod.yaml new file mode 100644 index 0000000..7fb25f7 --- /dev/null +++ b/fs-testing/tests/test_chmod.yaml @@ -0,0 +1,3 @@ +trace_cmd_suffix: "checkpoint 0 && echo -n test > /mnt/myfile && sync && checkpoint 1 && sleep 2 && chmod 666 /mnt/myfile && sync && checkpoint 2" +checkpoint_range: [0, 2] +dump_cmd_suffix: "echo test >> /mnt/myfile && rm -r /mnt/*" diff --git a/fs-testing/tests/test_chown.yaml b/fs-testing/tests/test_chown.yaml new file mode 100644 index 0000000..225aaa8 --- /dev/null +++ b/fs-testing/tests/test_chown.yaml @@ -0,0 +1,3 @@ +trace_cmd_suffix: "checkpoint 0 && echo -n test > /mnt/myfile && sync && checkpoint 1 && sleep 2 && chown 321:789 /mnt/myfile && sync && checkpoint 2" +checkpoint_range: [0, 2] +dump_cmd_suffix: "echo test >> /mnt/myfile && rm -r /mnt/*" diff --git a/fs-testing/tests/test_ctime-mtime.yaml 
b/fs-testing/tests/test_ctime-mtime.yaml new file mode 100644 index 0000000..754d1d8 --- /dev/null +++ b/fs-testing/tests/test_ctime-mtime.yaml @@ -0,0 +1,4 @@ +# touch with old timestamp first to trick relatime feature +trace_cmd_suffix: "checkpoint 0 && mkdir /mnt/mydir && touch /mnt/mydir/myfile && touch -d '2020-01-01 00:00:00' /mnt/mydir && sync && checkpoint 1 && sleep 2 && rm /mnt/mydir/myfile && sync && checkpoint 2" +checkpoint_range: [0, 2] +dump_cmd_suffix: "echo test >> /mnt/myfile && if [ -e /mnt/mydir ] ; then echo test >> /mnt/mydir/myfile ; else true ; fi && rm -r /mnt/*" diff --git a/fs-testing/tests/test_hello-world.yaml b/fs-testing/tests/test_hello-world.yaml new file mode 100644 index 0000000..ff27ff8 --- /dev/null +++ b/fs-testing/tests/test_hello-world.yaml @@ -0,0 +1,3 @@ +trace_cmd_suffix: "checkpoint 0 && sync && checkpoint 1 && echo HelloWorld > /mnt/myfile && checkpoint 2 && sync && checkpoint 3" +checkpoint_range: [0, 3] +dump_cmd_suffix: "echo test >> /mnt/myfile && rm /mnt/myfile" diff --git a/fs-testing/tests/test_link-hard.yaml b/fs-testing/tests/test_link-hard.yaml new file mode 100644 index 0000000..c15d267 --- /dev/null +++ b/fs-testing/tests/test_link-hard.yaml @@ -0,0 +1,3 @@ +trace_cmd_suffix: "checkpoint 0 && echo -n test > /mnt/myfile && sync && checkpoint 1 && ln /mnt/myfile /mnt/hardlink && sync && checkpoint 2 && sleep 2 && rm /mnt/myfile && sync && checkpoint 3" +checkpoint_range: [0, 3] +dump_cmd_suffix: "echo test > /mnt/myfile && rm -r /mnt/*" diff --git a/fs-testing/tests/test_link-sym.yaml b/fs-testing/tests/test_link-sym.yaml new file mode 100644 index 0000000..ef3166e --- /dev/null +++ b/fs-testing/tests/test_link-sym.yaml @@ -0,0 +1,3 @@ +trace_cmd_suffix: "checkpoint 0 && echo -n test > /mnt/myfile && sync && checkpoint 1 && ln -s /mnt/myfile /mnt/symlink && sync && checkpoint 2" +checkpoint_range: [0, 2] +dump_cmd_suffix: "echo test >> /mnt/myfile && rm -r /mnt/*" diff --git 
a/fs-testing/tests/test_mkdir-rmdir.yaml b/fs-testing/tests/test_mkdir-rmdir.yaml new file mode 100644 index 0000000..05dfc84 --- /dev/null +++ b/fs-testing/tests/test_mkdir-rmdir.yaml @@ -0,0 +1,3 @@ +trace_cmd_suffix: "checkpoint 0 && mkdir /mnt/mydir && sync && checkpoint 1 && sleep 2 && rmdir /mnt/mydir && sync && checkpoint 2" +checkpoint_range: [0, 2] +dump_cmd_suffix: "if [ -e /mnt/mydir ] ; then echo -n hoho > /mnt/mydir/file ; else echo -n test > /mnt/mydir ; fi && echo -n test > /mnt/myfile && rm -r /mnt/*" diff --git a/fs-testing/tests/test_rename-dir.yaml b/fs-testing/tests/test_rename-dir.yaml new file mode 100644 index 0000000..00f7d03 --- /dev/null +++ b/fs-testing/tests/test_rename-dir.yaml @@ -0,0 +1,3 @@ +trace_cmd_suffix: "checkpoint 0 && mkdir /mnt/newdir && sync && checkpoint 1 && mkdir /mnt/newdir2 && sync && checkpoint 2 && echo -n test > /mnt/newdir/testfile && sync && checkpoint 3 && mv /mnt/newdir /mnt/newdir2 && sync && checkpoint 4" +checkpoint_range: [0, 4] +dump_cmd_suffix: "echo test >> /mnt/myfile && if [ -f /mnt/newdir2/newdir/testfile ] ; then echo test >> /mnt/newdir2/newdir/testfile ; else true ; fi && rm -r /mnt/*" diff --git a/fs-testing/tests/test_rename-long-name.yaml b/fs-testing/tests/test_rename-long-name.yaml new file mode 100644 index 0000000..45849a6 --- /dev/null +++ b/fs-testing/tests/test_rename-long-name.yaml @@ -0,0 +1,3 @@ +trace_cmd_suffix: "checkpoint 0 && echo -n test > /mnt/myfile && sync && checkpoint 1 && mv /mnt/myfile /mnt/testfile_renamed_to_a_long_filename2222222222222222222222222222222222222222222222222223200 && sync && checkpoint 2" +checkpoint_range: [0, 2] +dump_cmd_suffix: "echo test >> /mnt/testfile_renamed_to_a_long_filename2222222222222222222222222222222222222222222222222223200 && rm -r /mnt/*" diff --git a/fs-testing/tests/test_rename.yaml b/fs-testing/tests/test_rename.yaml new file mode 100644 index 0000000..5858ca8 --- /dev/null +++ b/fs-testing/tests/test_rename.yaml @@ -0,0 +1,3 @@ 
+trace_cmd_suffix: "checkpoint 0 && echo -n tes1 > /mnt/myfile && echo -n tes2 > /mnt/myfile2 && sync && checkpoint 1 && mv /mnt/myfile2 /mnt/myfile && sync && checkpoint 2" +checkpoint_range: [0, 2] +dump_cmd_suffix: "echo test >> /mnt/myfile && rm /mnt/myfile" diff --git a/fs-testing/tests/test_touch-long-name.yaml b/fs-testing/tests/test_touch-long-name.yaml new file mode 100644 index 0000000..473ba2d --- /dev/null +++ b/fs-testing/tests/test_touch-long-name.yaml @@ -0,0 +1,3 @@ +trace_cmd_suffix: "checkpoint 0 && touch /mnt/eizAKifFfyOn72ieKYxbCraXxNonCfH8CargS4xDIbOGGW6BPBCPEc1RYyNyZWZgXXX && sync && checkpoint 1 && sleep 2 && echo -n helo > /mnt/eizAKifFfyOn72ieKYxbCraXxNonCfH8CargS4xDIbOGGW6BPBCPEc1RYyNyZWZgXXX && sync && checkpoint 2" +checkpoint_range: [0, 2] +dump_cmd_suffix: "echo test >> /mnt/eizAKifFfyOn72ieKYxbCraXxNonCfH8CargS4xDIbOGGW6BPBCPEc1RYyNyZWZgXXX && rm -r /mnt/*" diff --git a/fs-testing/tests/test_touch.yaml b/fs-testing/tests/test_touch.yaml new file mode 100644 index 0000000..3442896 --- /dev/null +++ b/fs-testing/tests/test_touch.yaml @@ -0,0 +1,3 @@ +trace_cmd_suffix: "checkpoint 0 && touch /mnt/myfile && sync && checkpoint 1 && sleep 2 && touch /mnt/myfile && sync && checkpoint 2" +checkpoint_range: [0, 2] +dump_cmd_suffix: "echo test >> /mnt/myfile && rm -r /mnt/*" diff --git a/fs-testing/tests/test_unlink.yaml b/fs-testing/tests/test_unlink.yaml new file mode 100644 index 0000000..f37834b --- /dev/null +++ b/fs-testing/tests/test_unlink.yaml @@ -0,0 +1,3 @@ +trace_cmd_suffix: "checkpoint 0 && echo -n test > /mnt/myfile && sync && checkpoint 1 && rm /mnt/myfile && sync && checkpoint 2" +checkpoint_range: [0, 2] +dump_cmd_suffix: "echo test > /mnt/myfile && rm -r /mnt/*" diff --git a/fs-testing/tests/test_update-middle.yaml b/fs-testing/tests/test_update-middle.yaml new file mode 100644 index 0000000..83ba454 --- /dev/null +++ b/fs-testing/tests/test_update-middle.yaml @@ -0,0 +1,4 @@ +# update part of file +trace_cmd_suffix: 
'checkpoint 0 && for i in `seq 71` ; do printf MjOf1E3x18E3R5EP6hq7WjzALMtjsAXY ; done > /mnt/myfile && sync && checkpoint 1 && echo -n hohoho | dd of=/mnt/myfile seek=171 bs=6 conv=notrunc && checkpoint 2 && sync && checkpoint 3' +checkpoint_range: [0, 3] +dump_cmd_suffix: "echo test >> /mnt/myfile && rm -r /mnt/*" diff --git a/fs-testing/vms/vm_zilpmem.yaml b/fs-testing/vms/vm_zilpmem.yaml new file mode 100644 index 0000000..e460e39 --- /dev/null +++ b/fs-testing/vms/vm_zilpmem.yaml @@ -0,0 +1,12 @@ +fs_type: "hybrid" +pmem_start: 536870912 # 512 * 2**20 +pmem_len: 134217728 # 128 * 2**20 +qemu_path: "qemu/build/qemu-system-x86_64" +kernel_path: "fs-testing/zil-pmem/linux_build/arch/x86/boot/bzImage" +initrd_path: "fs-testing/initramfs/initramfs_zilpmem.cpio.gz" +qemu_args: [ "-m", "1G", "-append", "console=ttyS0,115200n8 memmap=128M!512M" ] +trace_cmd_prefix: 'echo 1 > /proc/sys/kernel/printk && echo 1 > /sys/module/zfs/parameters/zfs_zil_pmem_prb_ncommitters && echo 2 > /sys/module/zfs/parameters/zil_default_kind && zpool create -O mountpoint=legacy testpool /dev/nvme0n1 log dax:/dev/pmem0 && mount -t zfs -o sync=always testpool /mnt' +# Recovery: Import pool read-write to allow replay, then mount dataset read-only. 
+recovery_cmd: 'echo 1 > /proc/sys/kernel/printk && echo 1 > /sys/module/zfs/parameters/zfs_zil_pmem_prb_ncommitters && zpool import testpool && mount -t zfs -oro testpool /mnt && ls -lah /mnt && fs-dump --contents /mnt > /dev/null' +# reduce linux console level to prevent garbage in the output +dump_cmd_prefix: 'echo 1 > /proc/sys/kernel/printk && echo 1 > /sys/module/zfs/parameters/zfs_zil_pmem_prb_ncommitters && zpool import testpool && mount -t zfs -oro testpool /mnt && fs-dump --contents /mnt && umount /mnt && mount -t zfs testpool /mnt' diff --git a/fs-testing/zil-pmem/README.md b/fs-testing/zil-pmem/README.md new file mode 100644 index 0000000..df7baa4 --- /dev/null +++ b/fs-testing/zil-pmem/README.md @@ -0,0 +1,34 @@ +# Analyzing ZIL-PMEM + +https://github.com/openzfs/zfs/pull/12731 + +## Building + +Clone ZFS (with ZIL-PMEM) into openzfs/ and a suitable Linux version into linux/ (e.g., 5.11). + +```sh +# upstream +git clone -b zil-pmem/upstreaming https://github.com/problame/zfs/ openzfs +# with some patches (e.g., smaller chunk size) to ease analysis +git clone -b zil-pmem/vinter https://github.com/lluchs/zfs/ openzfs +``` + +Build the kernel: + +```sh +./build-zfs-builtin.sh +# builds linux_build/arch/x86/boot/bzImage +``` + +Build the ZFS tools: + +```sh +./build-zfs-tools.sh +# builds initramfs_zfs/ +``` + +Build the initramfs: + +```sh +make -C../initramfs initramfs_zilpmem.cpio.gz +``` diff --git a/fs-testing/zil-pmem/build-zfs-builtin.sh b/fs-testing/zil-pmem/build-zfs-builtin.sh new file mode 100755 index 0000000..577df45 --- /dev/null +++ b/fs-testing/zil-pmem/build-zfs-builtin.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash + +set -eux + +kernel=linux +build=${kernel}_build +kernel_abs=$PWD/$kernel +build_abs=$PWD/$build +zfs=openzfs + +# see https://github.com/openzfs/zfs/issues/10450#issuecomment-643654436 + +make -C"$kernel" O="../$build" defconfig + +# Enable ZFS and additional dependencies. 
+cat >>"$build/.config" < \// { print $3 } /ld-linux/ { print $1 }' | \ + xargs cp -t lib + +# Link required binaries to /bin +mkdir bin +ln -st bin ../sbin/mount.zfs ../usr/local/sbin/zpool +ln -s lib lib64 + +# Fix library path for ZFS tools. +mkdir init.d +echo "export LD_LIBRARY_PATH=/usr/local/lib" > init.d/zfs_path + +popd + +du -sh initramfs_zfs diff --git a/permanent_cig/Cargo.toml b/permanent_cig/Cargo.toml new file mode 100644 index 0000000..35c0a91 --- /dev/null +++ b/permanent_cig/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "permanent_cig" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = "1.0.74" +bitvec = "1.0.1" +blake3 = "1.4.1" +clap = { version = "4.3.23", features = ["derive"] } +fastrand = "2.0.0" +itertools = "0.11.0" +libc = "0.2.147" +linux-raw-sys = "0.4.5" +serde = "1.0.183" +serde_json = "1.0.105" +serde_yaml = "0.9.25" +permanent_common = { path = "../permanent_common" } diff --git a/permanent_cig/src/image.rs b/permanent_cig/src/image.rs new file mode 100644 index 0000000..b27805b --- /dev/null +++ b/permanent_cig/src/image.rs @@ -0,0 +1,60 @@ +use std::fs::File; +use std::io::Write; +use std::collections::HashSet; + +use anyhow::{Context, Result}; + +#[derive(PartialEq, Eq, Hash, Clone)] +pub struct CrashHash(blake3::Hash); + +impl serde::Serialize for CrashHash { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer + { + serializer.serialize_str(&self.0.to_hex()) + } +} + +pub struct ImagePool { + crash_dir: String, + size: usize, + size_max: usize, + hashes: HashSet, +} + +impl ImagePool { + pub fn new(work_dir: &String) -> Result { + let crash_dir = format!("{}/crash_images", work_dir); + std::fs::create_dir(crash_dir.as_str()).context("could not create crash images directory")?; + Ok(Self { + crash_dir, + size: 0, + size_max: usize::MAX, + hashes: HashSet::new(), + }) + } + + pub fn with_limit(work_dir: &String, limit: usize) -> Result { + let mut pool = Self::new(work_dir)?; + 
pool.size_max = limit; + Ok(pool) + } + + pub fn persist(&mut self, data: &[u8]) -> Result<(bool, CrashHash)> { + let hash = CrashHash(blake3::hash(data)); + if self.hashes.insert(hash.clone()) { + // first time encountering this hash + self.size += data.len(); + if self.size > self.size_max { + panic!("image pool size limit exceeded"); + } + let path = format!("{}/{}.raw", self.crash_dir, hash.0.to_hex()); + let mut file = File::create(path.as_str()).with_context(|| format!("could not open {}", path))?; + file.write_all(data).context("could not dump image")?; + Ok((true, hash)) + } else { + Ok((false, hash)) + } + } +} diff --git a/permanent_cig/src/lib.rs b/permanent_cig/src/lib.rs new file mode 100644 index 0000000..50d2e67 --- /dev/null +++ b/permanent_cig/src/lib.rs @@ -0,0 +1,269 @@ +use std::collections::{HashSet, HashMap}; +use std::io::{BufReader, BufWriter}; +use std::fs::File; +use std::time::SystemTime; +use std::marker::PhantomData; +use anyhow::{bail, Result}; +use permanent_common::action::Action; +use permanent_common::config::{VmConfig, TestConfig}; +use permanent_common::profiler::{Profile, Measurement}; +use permanent_common::trace::{TraceEntry, PmemEvent, NvmeEvent, parse_trace_file_bin}; + +mod set; + +mod image; +use image::{CrashHash, ImagePool}; + +mod models; +use models::{X86PersistentMemory, NvmeDevice}; + +enum CrashPersistenceType { + NoWrites, + NothingPersisted, + FullyPersisted, // { /* TODO */ }, + StrictSubsetPersisted, // { /* TODO */ } +} + +struct CrashMetadata { + trace_entry_id: usize, + prev_checkpoint_value: Option, + persistence_type: CrashPersistenceType, +} + +// TODO has_changed optimization +struct DeviceData { + device: D, + changed: bool, + last_generated_index: Option, + generated: HashMap>, + // metadata: HashMap>, +} + +pub struct CrashImageGenerator { + work_dir: String, + vm_config: VmConfig, + test_config: TestConfig, + + pool: ImagePool, + pmem: Option>, + nvme: Option>, + rng: fastrand::Rng, +} + +const 
POOL_LIMIT: usize = 20*1024*1024*1024; + +impl CrashImageGenerator { + pub fn new(work_dir: &String, vm_config: &VmConfig, test_config: &TestConfig) -> Self { + let (p, n) = vm_config.have_pmem_nvme(); + Self { + work_dir: work_dir.clone(), + vm_config: vm_config.clone(), + test_config: test_config.clone(), + + pool: ImagePool::with_limit(work_dir, POOL_LIMIT).unwrap(), + pmem: p.then(|| DeviceData { + device: X86PersistentMemory::new(std::fs::read(format!("{}/pmem_base.raw", &work_dir).as_str()).unwrap()), + changed: true, + last_generated_index: None, + generated: HashMap::new(), + }), + nvme: n.then(|| DeviceData { + device: NvmeDevice::new(std::fs::read(format!("{}/nvme_base.raw", &work_dir).as_str()).unwrap()), + changed: true, + last_generated_index: None, + generated: HashMap::new(), + }), + rng: fastrand::Rng::new(), + } + } + + fn generate_crash_images_at(&mut self, trace_entry_id: usize) { + println!("generate crash images at id {}", trace_entry_id); + let (p, n) = self.vm_config.have_pmem_nvme(); + if p { + if self.get_pmem_mut().changed { + let nothing_hash = self.pmem.as_ref().unwrap().device.generate_nothing_persisted_image(&mut self.pool); + let everything_hash = self.pmem.as_ref().unwrap().device.generate_everything_persisted_image(&mut self.pool); + let mut hashes = self.pmem.as_ref().unwrap().device.generate_random_images(&mut self.pool, &mut self.rng); + hashes.insert(nothing_hash); + hashes.insert(everything_hash); + self.get_pmem_mut().changed = false; + self.get_pmem_mut().last_generated_index = Some(trace_entry_id); + self.get_pmem_mut().generated.insert(trace_entry_id, hashes.clone()); + } else { + // reuse last set of images + let last_index = self.pmem.as_ref().unwrap().last_generated_index.expect("no last_generated_index"); + let last_images = self.pmem.as_ref().unwrap().generated.get(&last_index) + .expect("last_generated_index hashes not found") + .clone(); + self.get_pmem_mut().generated.insert(trace_entry_id, last_images); + } + } + 
if n { + if self.get_nvme_mut().changed { + let nothing_hash = self.nvme.as_ref().unwrap().device.generate_nothing_persisted_image(&mut self.pool); + let everything_hash = self.nvme.as_ref().unwrap().device.generate_everything_persisted_image(&mut self.pool); + let mut hashes = self.nvme.as_ref().unwrap().device.generate_random_images(&mut self.pool, &mut self.rng); + hashes.insert(nothing_hash); + hashes.insert(everything_hash); + self.get_nvme_mut().changed = false; + self.get_nvme_mut().last_generated_index = Some(trace_entry_id); + self.get_nvme_mut().generated.insert(trace_entry_id, hashes.clone()); + } else { + // reuse last set of images + let last_index = self.nvme.as_ref().unwrap().last_generated_index.expect("no last_generated_index"); + let last_images = self.nvme.as_ref().unwrap().generated.get(&last_index) + .expect("last_generated_index hashes not found") + .clone(); + self.get_nvme_mut().generated.insert(trace_entry_id, last_images); + } + } + } + + fn get_pmem_mut(&mut self) -> &mut DeviceData { + self.pmem.as_mut().unwrap() + } + + fn get_nvme_mut(&mut self) -> &mut DeviceData { + self.nvme.as_mut().unwrap() + } + + pub fn replay_trace(&mut self) { // TODO use anyhow results + let mut had_init = false; + let mut prev_checkpoint_value: Option = None; + let mut checkpoint_ids: HashMap = HashMap::new(); + + let checkpoint_range = self.test_config.checkpoint_range.0 + .. self.test_config.checkpoint_range.1; + let within_checkpoint_range = |maybe_value: Option| { maybe_value.is_some_and(|value| checkpoint_range.contains(&value)) }; + + // TODO path + let trace_file = File::open(format!("{}/analyse/trace.bin", self.work_dir).as_str()) + .expect("could not open trace file"); + for entry in parse_trace_file_bin(BufReader::new(trace_file)) { + match entry.unwrap() { + TraceEntry::Pmem { id, event } => { + match event { + PmemEvent::Read { .. 
} => { }, + PmemEvent::Write { address, size: _, content, non_temporal } => { + if !had_init { + panic!("pmem event before test script"); + } + self.get_pmem_mut().changed = true; + self.get_pmem_mut().device.write(address as usize, content.as_slice(), non_temporal); + }, + PmemEvent::Clflush { address } => { + if !had_init { + panic!("pmem event before test script"); + } + self.get_pmem_mut().device.clwb(address as usize, None); + // approximate Clflush by adding a fence + // this necessitates crash image generation. + if within_checkpoint_range(prev_checkpoint_value) { + // we do not generate crash images before the first or after the + // last checkpoint. + if !self.get_pmem_mut().device.pending_lines.is_empty() { + self.generate_crash_images_at(id as usize); + self.get_pmem_mut().changed = true; // after a fence with flushes, different + // crash images are possible + } + } + self.get_pmem_mut().device.fence(); + }, + PmemEvent::Clflushopt { address } => { + if !had_init { + panic!("pmem event before test script"); + } + self.get_pmem_mut().device.clwb(address as usize, None); + }, + PmemEvent::Clwb { address } => { + if !had_init { + panic!("pmem event before test script"); + } + self.get_pmem_mut().device.clwb(address as usize, None); + }, + PmemEvent::Wbinvd => { + if had_init { // yes, there should be no ! here. + panic!("wbinvd should not appear during normal test execution") + } + }, + PmemEvent::Fence => { + if within_checkpoint_range(prev_checkpoint_value) { + // we do not generate crash images before the first or after the + // last checkpoint. + if !self.get_pmem_mut().device.pending_lines.is_empty() { + self.generate_crash_images_at(id as usize); + self.get_pmem_mut().changed = true; // after a fence with flushes, different + // crash images are possible + } + } + self.get_pmem_mut().device.fence(); + }, + } + }, + TraceEntry::Nvme { id, event } => { + match event { + NvmeEvent::Read { .. 
} => { }, + NvmeEvent::Write { offset, length: _, data } => { + if !had_init { + panic!("nvme event before test script"); + } + self.get_nvme_mut().changed = true; + self.get_nvme_mut().device.write(offset as usize, data); + } + NvmeEvent::Flush => { + if !had_init { + panic!("nvme event before test script"); + } + if within_checkpoint_range(prev_checkpoint_value) { + if !self.get_nvme_mut().device.unpersisted_content.is_empty() { + self.generate_crash_images_at(id as usize); + self.get_nvme_mut().changed = true; // after flush with writes, different + // crash images are possible + } + } + self.get_nvme_mut().device.flush(); + }, + } + }, + TraceEntry::Checkpoint { id, value } => { + if value == 255 { + had_init = true; + } else { + prev_checkpoint_value = Some(value); + if value > 0 && !checkpoint_ids.contains_key(&(value - 1)) { // TODO do we want this? + panic!("non-contiguous checkpoints; missing: {}", value - 1); + } + if checkpoint_ids.insert(value, id as usize).is_some() { + panic!("duplicate checkpoint value: {}", value); + } + // we create crash images at every checkpoint including the last (for SFS) + if within_checkpoint_range(prev_checkpoint_value) || self.test_config.checkpoint_range.1 == value { + self.generate_crash_images_at(id as usize); + } + } + }, + } + } + + if !checkpoint_ids.contains_key(&self.test_config.checkpoint_range.1) { + panic!("ERROR: not all checkpoints are present in the trace. 
abort.") + } + + // write index information + let (p, n) = self.vm_config.have_pmem_nvme(); + if p { + let pmem = self.pmem.as_ref().unwrap(); + let file = File::create(format!("{}/pmem.index", self.work_dir).as_str()).unwrap(); + serde_json::to_writer_pretty(BufWriter::new(file), &pmem.generated).unwrap(); + } + if n { + let nvme = self.nvme.as_ref().unwrap(); + let file = File::create(format!("{}/nvme.index", self.work_dir).as_str()).unwrap(); + serde_json::to_writer_pretty(BufWriter::new(file), &nvme.generated).unwrap(); + } + // write checkpoint information + let file = File::create(format!("{}/checkpoint.index", self.work_dir).as_str()).unwrap(); + serde_json::to_writer_pretty(BufWriter::new(file), &checkpoint_ids).unwrap(); + } +} diff --git a/permanent_cig/src/main.rs b/permanent_cig/src/main.rs new file mode 100644 index 0000000..333f30a --- /dev/null +++ b/permanent_cig/src/main.rs @@ -0,0 +1,45 @@ +use std::io::BufReader; +use std::fs::File; +use clap::Parser; +use std::path::Path; +use permanent_common::action::Action; +use permanent_common::config::{VmConfig, TestConfig}; +use permanent_cig::CrashImageGenerator; + +fn remove_dir(path: &String) -> Result<(), std::io::Error> { + if Path::new(path).exists() { + std::fs::remove_dir_all(path)?; + } + Ok(()) +} + +fn remove_file(path: &String) -> Result<(), std::io::Error> { + if Path::new(path).exists() { + std::fs::remove_file(path)?; + } + Ok(()) +} + +fn main() { + let args = Args::parse(); + let vm_config: VmConfig = serde_yaml::from_reader(BufReader::new(File::open(format!("{}/vm_config.yaml", args.work_dir).as_str()).unwrap())).unwrap(); + let test_config: TestConfig = serde_yaml::from_reader(BufReader::new(File::open(format!("{}/test_config.yaml", args.work_dir).as_str()).unwrap())).unwrap(); + + let make_path = |suffix| format!("{}/{}", args.work_dir, suffix); + + if args.force { + remove_dir(&make_path("crash_images")).unwrap(); + remove_file(&make_path("pmem.index")).unwrap(); + 
/// A single write: `data` is placed at byte offset `address` of an image.
#[derive(Debug, Clone)]
pub struct Store {
    /// Byte offset of the first written byte.
    pub address: usize,
    /// The bytes written, starting at `address`.
    pub data: Vec<u8>,
}

impl Store {
    /// First byte offset covered by this store (inclusive).
    pub fn address_start(&self) -> usize {
        self.address
    }

    /// One past the last byte offset covered by this store (exclusive).
    pub fn address_end(&self) -> usize {
        self.address + self.data.len()
    }

    /// Half-open byte range `[address_start, address_end)` covered by this store.
    pub fn address_range(&self) -> Range<usize> {
        self.address_start()..self.address_end()
    }
}
+ } + + pub fn drain_flushed_writes(&mut self) -> std::vec::Drain<'_, Store> { + let idx = self.flushed_index; + self.flushed_index = 0; + self.writes.drain(0..idx) + } + + /// Do any pending writes overlap with an access at the specified address and size? + pub fn overlaps_access(&self, address: usize, size: usize) -> bool { + let access_range = address..(address + size); + self.writes + .iter() + .any(|w| !range_overlap(&w.address_range(), &access_range).is_empty()) + } +} + +/// x86 memory persistency model. +/// +/// writes to the same cache line are always ordered in respect to each other. +/// writes to different cache lines may be reordered. +pub struct X86PersistentMemory { + pub persisted_content: Vec, + pub pending_lines: HashSet, + /// maps line number (== address / line_granularity) to OrderedWriteLine + pub unpersisted_content: HashMap, + /// 8 or 64 + line_granularity: usize, +} + +// TODO +const LINE_GRANULARITY: usize = 64; +const MAX_UNPERSISTED_SUBSETS: usize = 5; +const MAX_PARTIAL_FLUSHES_COUNT: usize = 5; + +impl X86PersistentMemory { + pub fn new(persisted_content: Vec) -> Self { + Self { + persisted_content, + pending_lines: HashSet::new(), + unpersisted_content: HashMap::new(), + line_granularity: LINE_GRANULARITY, + } + } + + pub fn generate_nothing_persisted_image(&self, pool: &mut ImagePool) -> CrashHash { + let (_, hash) = pool.persist(self.persisted_content.as_slice()).unwrap(); + hash + } + + pub fn generate_everything_persisted_image(&self, pool: &mut ImagePool) -> CrashHash { + let mut img: Vec = self.persisted_content.clone(); + for ordered_write_line in self.unpersisted_content.values() { + for store in ordered_write_line.all_writes().iter() { + img[store.address_range()].copy_from_slice(store.data.as_slice()); + } + } + let (_, hash) = pool.persist(img.as_slice()).unwrap(); + hash + } + + pub fn generate_random_images(&self, pool: &mut ImagePool, rng: &mut fastrand::Rng) -> HashSet { + let mut img: Vec = vec![0u8; 
self.persisted_content.len()]; + let mut hashes = HashSet::new(); + + let unpersisted_reads_lines: Vec = self.unpersisted_content.keys().copied().collect(); // TODO heuristic + if !unpersisted_reads_lines.is_empty() { + let random_subsets: Vec> = if 1usize.checked_shl(unpersisted_reads_lines.len().try_into().unwrap()) + .is_some_and(|val| val <= (MAX_UNPERSISTED_SUBSETS + 1).try_into().unwrap()) + { + unpersisted_reads_lines + .iter() + .copied() + .powerset() + .skip(1) // empty set + .collect() + } else { + set::random_subsets(rng, &unpersisted_reads_lines) + .filter(|vec| !vec.is_empty()) + .take(MAX_UNPERSISTED_SUBSETS) + .collect() + }; + for random_lines in random_subsets { + let partial_flushes_count = random_lines + .iter() + .map(|line_number| self.unpersisted_content[line_number].all_writes().len()) + .fold(1, |acc, x| acc * x); + let line_partial_writes: Vec> = random_lines + .iter() + .map(|line_number| { + let writes_count = self.unpersisted_content[line_number].all_writes().len(); + if partial_flushes_count > MAX_PARTIAL_FLUSHES_COUNT { + if writes_count <= 1 { + vec![writes_count] + } else { + vec![writes_count, rng.usize(1..writes_count)] + } + } else { + (1..=writes_count).collect() + } + }) + .collect(); + for partial_writes_indices in line_partial_writes.iter().multi_cartesian_product() + { + img[..].copy_from_slice(self.persisted_content.as_slice()); + for (line_number, flush_writes_limit) in random_lines + .iter() + .copied() + .zip(partial_writes_indices.iter().copied()) + { + for store in self.unpersisted_content[&line_number].all_writes().iter().take(*flush_writes_limit) { + img[store.address_range()].copy_from_slice(store.data.as_slice()); + } + let (_, hash) = pool.persist(img.as_slice()).unwrap(); + hashes.insert(hash); + } + } + } + } + hashes + } + + pub fn write(&mut self, address: usize, value: &[u8], non_temporal: bool) { + // test to see if we even get larger stores + assert!(matches!(value.len(), 1 | 2 | 4 | 8)); + let address_stop 
= address + value.len(); + let split_address_ranges = { + let start = address - address % 8; + let stop = if address_stop % 8 == 0 { + address_stop + } else { + address_stop + 8 - (address_stop % 8) + }; + (start..stop) + .step_by(8) + .map(|a| max(a, address)..min(a + 8, address_stop)) + }; + + for address_range in split_address_ranges { + let line_number = address_range.start / self.line_granularity; + let line = self + .unpersisted_content + .entry(line_number) + .or_insert_with(OrderedWriteLine::new); + line.writes.push(Store { + address: address_range.start, + data: value[(address_range.start - address)..(address_range.end - address)].into(), + }); + + // approximation of non-temporal stores + if non_temporal { + self.pending_lines.insert(line_number); + // note that for cache line granularity, this is probably not quite correct + line.flush_all(); + } + } + } + + // TODO: what do we need flush_writes_limit for? + pub fn clwb(&mut self, address: usize, flush_writes_limit: Option) { + let cache_line_base = (address >> 6) << 6; + for a in (cache_line_base..(cache_line_base + 64)).step_by(self.line_granularity) { + let line_number = a / self.line_granularity; + if let Some(line) = self.unpersisted_content.get_mut(&line_number) { + self.pending_lines.insert(line_number); + if let Some(limit) = flush_writes_limit { + line.flushed_index = limit; + } else { + line.flush_all(); + } + } + } + } + + pub fn fence(&mut self) { + // A fence consumes all pending lines. Swap in a new set to avoid double borrow of self. 
+ let mut pending_lines = HashSet::new(); + std::mem::swap(&mut pending_lines, &mut self.pending_lines); + for line in pending_lines { + self.fence_line(line); + } + } + + fn fence_line(&mut self, line: usize) { + if let Some(content) = self.unpersisted_content.get_mut(&line) { + assert!(content.flushed_index > 0); + for write in content.drain_flushed_writes() { + self.persisted_content[write.address_range()].copy_from_slice(&write.data); + } + if content.writes.is_empty() { + self.unpersisted_content.remove(&line); + } + self.pending_lines.remove(&line); + } else { + unreachable!(); + } + } + + pub fn persist_unpersisted(&mut self) { + let lines: Vec = self.unpersisted_content.keys().copied().collect(); + for line_number in lines { + self.clwb(line_number * self.line_granularity, None); + } + self.fence(); + assert!(self.unpersisted_content.is_empty()); + assert!(self.pending_lines.is_empty()); + } + + pub fn print_unpersisted(&self) { + let mut lines: Vec<(&usize, &OrderedWriteLine)> = self.unpersisted_content.iter().collect(); + lines.sort_by_key(|(line_number, _)| *line_number); + for (line_number, line) in lines { + println!("unpersisted line {}: {:?}", *line_number, line.writes); + } + } +} + +/// nvme persistency model. +/// +/// all writes that are not separated by a flush may be reordered. +pub struct NvmeDevice { + pub persisted_content: Vec, + // NOTE: we do not use a construct like OrderedWriteLines here. + // It might happen that when we create permutations, writes to the same block are reordered. + // But that doesn't matter because when we take partial permutations, no state can appear + // that could not have appeared otherwise. 
+ pub unpersisted_content: Vec, +} + +// TODO +const NVME_RANDOM_IMAGES_MAX_AMOUNT: Option = Some(25); + +const NVME_ATOMIC_BLOCK_SIZE_SHIFT: usize = 9; +const NVME_ATOMIC_BLOCK_SIZE: usize = 1 << NVME_ATOMIC_BLOCK_SIZE_SHIFT; +const NVME_ATOMIC_BLOCK_SIZE_MASK: usize = (1 << NVME_ATOMIC_BLOCK_SIZE_SHIFT) - 1; + +impl NvmeDevice { + pub fn new(persisted_content: Vec) -> Self { + Self { + persisted_content, + unpersisted_content: Vec::new(), + } + } + + pub fn generate_nothing_persisted_image(&self, pool: &mut ImagePool) -> CrashHash { + let (_, hash) = pool.persist(self.persisted_content.as_slice()).unwrap(); + hash + } + + pub fn generate_everything_persisted_image(&self, pool: &mut ImagePool) -> CrashHash { + let mut img: Vec = self.persisted_content.clone(); + for store in self.unpersisted_content.iter() { + img[store.address_range()].copy_from_slice(store.data.as_slice()); + } + let (_, hash) = pool.persist(img.as_slice()).unwrap(); + hash + } + + pub fn generate_random_images(&self, pool: &mut ImagePool, rng: &mut fastrand::Rng) -> HashSet { + let mut img: Vec = vec![0u8; self.persisted_content.len()]; + let mut hashes = HashSet::new(); + + if self.unpersisted_content.is_empty() { + return hashes; + } + + // TODO use exhaustive if there are less than NVME_RANDOM_IMAGES_MAX_AMOUNT exhaustive + // images, like vinter does. 
+ + if let Some(amount) = NVME_RANDOM_IMAGES_MAX_AMOUNT { + let mut indices: Vec = (0..self.unpersisted_content.len()).collect(); + for _ in 0..amount { + rng.shuffle(indices.as_mut_slice()); + let partial_index = rng.usize(1..=self.unpersisted_content.len()); + img[..].copy_from_slice(self.persisted_content.as_slice()); + for store in indices[..partial_index].iter().map(|idx| &self.unpersisted_content[*idx]) { + img[store.address_range()].copy_from_slice(store.data.as_slice()); + } + let (_, hash) = pool.persist(img.as_slice()).unwrap(); + hashes.insert(hash); + } + } else { + for indices in (0..self.unpersisted_content.len()).permutations(self.unpersisted_content.len()) { + img[..].copy_from_slice(self.persisted_content.as_slice()); + for store in indices.into_iter().map(|idx| &self.unpersisted_content[idx]) { + img[store.address_range()].copy_from_slice(store.data.as_slice()); + // create one crash image in every loop execution here, to simulate partial + // permutations + let (_, hash) = pool.persist(img.as_slice()).unwrap(); + hashes.insert(hash); + } + } + } + hashes + } + + pub fn write(&mut self, address: usize, data: Vec) { + if (address & NVME_ATOMIC_BLOCK_SIZE_MASK) != 0 || (data.len() & NVME_ATOMIC_BLOCK_SIZE_MASK) != 0 { + panic!("unaligned NVMe access: addr={} len={}", address, data.len()); + } + for offset in (0..data.len()).step_by(NVME_ATOMIC_BLOCK_SIZE) { + self.unpersisted_content.push(Store { + address: address + offset, + data: data[offset..(offset + NVME_ATOMIC_BLOCK_SIZE)].to_vec(), + }); + } + } + + pub fn flush(&mut self) { + for store in self.unpersisted_content.drain(..) 
/// Returns the intersection of two half-open ranges.
///
/// The result starts at the larger of the two starts and ends at the smaller
/// of the two ends. If the inputs do not overlap, the returned range is empty
/// (its `start` is not below its `end`).
fn range_overlap<T>(r1: &Range<T>, r2: &Range<T>) -> Range<T>
where
    T: std::cmp::Ord + Copy,
{
    Range {
        start: max(r1.start, r2.start),
        end: min(r1.end, r2.end),
    }
}
+pub fn random_subsets<'a, T>(rng: &'a mut fastrand::Rng, vec: &'a [T]) -> RandomSubsets<'a, T> { + RandomSubsets { rng, vec } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_random_subsets() { + let vec = vec![1, 2, 3, 4, 5]; + let mut rng = fastrand::Rng::with_seed(0); + let mut subsets = random_subsets(&mut rng, &vec); + assert_eq!(subsets.next().as_deref(), Some([2, 3, 4].as_slice())); + assert_eq!(subsets.next().as_deref(), Some([1, 3, 4].as_slice())); + assert_eq!(subsets.next().as_deref(), Some([3].as_slice())); + } +} diff --git a/permanent_common/.gitignore b/permanent_common/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/permanent_common/.gitignore @@ -0,0 +1 @@ +/target diff --git a/permanent_common/Cargo.toml b/permanent_common/Cargo.toml new file mode 100644 index 0000000..c39944a --- /dev/null +++ b/permanent_common/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "permanent_common" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = "1.0" +bincode = "1.3.3" +enumset = "1.1.2" +serde = { version = "1.0.183", features = ["derive"] } +snap = "1.0.5" diff --git a/permanent_common/src/action.rs b/permanent_common/src/action.rs new file mode 100644 index 0000000..2a34e46 --- /dev/null +++ b/permanent_common/src/action.rs @@ -0,0 +1,47 @@ +use crate::profiler::{Profile, Profiler}; + +pub trait Action { + fn run(&self) -> Option>; +} + +pub struct ActionChain { + actions: Vec>, + profile_file: Option, +} + +impl ActionChain { + pub fn new() -> Self { + Self { actions: vec![], profile_file: None } + } + + pub fn with_profiling(profile_file: &str) -> Self { + Self { actions: vec![], profile_file: Some(profile_file.to_string()) } + } + + pub fn append(&mut self, action: Box) { + self.actions.push(action); + } + +} + +impl Action for ActionChain { + fn run(&self) -> Option> { + + let mut profiler = Profiler::new(); + + for action in &self.actions { + match action.run() { + None => break, + Some(res) => 
profiler.register_vec(res), + } + } + + let profiles = profiler.get_profiles(); + if let Some(profile_file) = &self.profile_file { + println!("== Write Profiling data"); + profiler.to_file(&profile_file[..]).expect("Could not write profiling data"); + } + + return Some(profiles); + } +} diff --git a/permanent_common/src/bin/trace_read.rs b/permanent_common/src/bin/trace_read.rs new file mode 100644 index 0000000..58daa03 --- /dev/null +++ b/permanent_common/src/bin/trace_read.rs @@ -0,0 +1,43 @@ +use std::env; +use std::io::BufReader; +use std::fs::File; + +use permanent_common::trace::parse_trace_file_bin; +use permanent_common::trace::{PmemEvent, NvmeEvent, TraceEntry}; + +fn main() { + let args: Vec = env::args().collect(); + if args.len() < 2 || args.len() > 3 { + println!("usage: {} [--nodata]", args[0]); + return; + } + + let file = File::open(args[1].as_str()).unwrap(); + let nodata = args.len() == 3 && args[2] == "--nodata"; + for item in parse_trace_file_bin(BufReader::new(file)) { + let mut item = item.unwrap(); + if nodata { + remove_data(&mut item); + } + println!("{:?}", item); + } +} + +fn remove_data(item: &mut TraceEntry) { + match item { + //TraceEntry::Pmem { id: _, event } => { + // match event { + // PmemEvent::Read { address: _, size: _, content } => { content.clear(); }, + // PmemEvent::Write { address: _, size: _, content, non_temporal: _ } => { content.clear(); }, + // _ => { }, + // } + //}, + TraceEntry::Nvme { id: _, event } => { + match event { + NvmeEvent::Write { offset: _, length: _, data } => { data.clear(); }, + _ => { }, + } + }, + _ => { }, + } +} diff --git a/permanent_common/src/config.rs b/permanent_common/src/config.rs new file mode 100644 index 0000000..60eb9c2 --- /dev/null +++ b/permanent_common/src/config.rs @@ -0,0 +1,194 @@ +use enumset::{EnumSet, EnumSetType}; +use serde::Deserialize; + +#[derive(Debug, EnumSetType)] +pub enum TraceOption { + PmemRead, + PmemWrite, + PmemFence, + PmemFlush, + + NvmeRead, + NvmeWrite, + 
NvmeFlush, + + Checkpoint +} + +impl TraceOption { + pub fn to_qemu_str(&self) -> &'static str { + match self { + TraceOption::PmemRead => "pmem_read", + TraceOption::PmemWrite => "pmem_write", + TraceOption::PmemFence => "pmem_fence", + TraceOption::PmemFlush => "pmem_flush", + + TraceOption::NvmeRead => "nvme_read", + TraceOption::NvmeWrite => "nvme_write", + TraceOption::NvmeFlush => "nvme_flush", + + TraceOption::Checkpoint => "checkpoint", + } + } + + pub fn from_qemu_str(s: &str) -> Result { + match s { + "pmem_read" => Ok(TraceOption::PmemRead), + "pmem_write" => Ok(TraceOption::PmemWrite), + "pmem_fence" => Ok(TraceOption::PmemFence), + "pmem_flush" => Ok(TraceOption::PmemFlush), + + "nvme_read" => Ok(TraceOption::NvmeRead), + "nvme_write" => Ok(TraceOption::NvmeWrite), + "nvme_flush" => Ok(TraceOption::NvmeFlush), + + "checkpoint" => Ok(TraceOption::Checkpoint), + + _ => Err(()), + } + } +} + +#[derive(Debug)] +pub struct TcgPluginConfig { + pub pmem_start: u64, + pub pmem_len: u64, + pub pmem_base_image_path: Option, + pub trace_what: EnumSet, + pub out_trace_file: String, +} + +impl TcgPluginConfig { + pub fn to_qemu_plugin_arg_string(&self, plugin_path: &str) -> String { + // TODO there is probably a nicer way, but intersperse() is unstable + let maybe_trace_what_string = if self.trace_what.is_empty() { + None + } else { + let mut trace_what_string = String::new(); + for opt in self.trace_what.iter().map(|o| o.to_qemu_str()) { + trace_what_string.push_str(opt); + trace_what_string.push('/'); + } + trace_what_string.pop(); // remove last separator + Some(trace_what_string) + }; + + let mut s = format!("{},pmem_start={},pmem_len={}", + plugin_path, + self.pmem_start, + self.pmem_len, + ); + if let Some(path) = &self.pmem_base_image_path { + s.push_str(format!(",pmem_base_image_path={}", path).as_str()); + } + if let Some(trace_what_string) = maybe_trace_what_string { + s.push_str(format!(",trace_what={}", trace_what_string).as_str()); + } + 
s.push_str(format!(",out_trace_file={}", self.out_trace_file).as_str()); + s + } +} + +/// vm.yaml files +#[derive(Clone, Debug, Deserialize)] +pub struct VmConfig { + pub fs_type: String, + pub pmem_start: Option, // only used for pmem/hybrid; yaml files can simply leave it out + pub pmem_len: Option, + pub qemu_path: String, + pub kernel_path: String, + pub initrd_path: String, + pub qemu_args: Vec, + pub trace_cmd_prefix: String, + pub dump_cmd_prefix: String, + pub recovery_cmd: String, +} + +impl VmConfig { + pub fn have_pmem_nvme(&self) -> (bool, bool) { + match self.fs_type.as_str() { + "pmem" => (true, false), + "nvme" => (false, true), + "hybrid" => (true, true), + _ => panic!("invalid fs_type in vm config"), + } + } +} + +/// test.yaml files +#[derive(Clone, Debug, Deserialize)] +pub struct TestConfig { + pub trace_cmd_suffix: String, + pub checkpoint_range: (u8, u8), + pub dump_cmd_suffix: String, +} + +#[derive(Clone)] +pub enum TraceType { + /// execute test case. Trace all writes/fences/flushes/checkpoints + Analyse, + /// do recovery trace. Trace all reads/checkpoints + PostSuccess, + /// dump file system and verify integrity. 
Trace all checkpoints + PostFailure { pmem_hash: Option, nvme_hash: Option }, +} + +/// configuration for a single tracing operation +#[derive(Clone)] +pub struct TraceConfig { + pub trace_type: TraceType, + dir: String, +} + +impl TraceConfig { + pub fn new(work_dir: &String, trace_type: TraceType) -> Self { + let prefix = match &trace_type { + TraceType::Analyse => "analyse".to_string(), + TraceType::PostSuccess => "post_success".to_string(), + TraceType::PostFailure { pmem_hash, nvme_hash } => { + let mut s = "post".to_string(); + if let Some(hash) = pmem_hash { + s.push('_'); + s.push_str(hash); + } + if let Some(hash) = nvme_hash { + s.push('_'); + s.push_str(hash); + } + s + } + }; + Self { + trace_type, + dir: format!("{}/{}", work_dir, prefix), + } + } + + pub fn trace_dir(&self) -> String { + self.dir.clone() + } + + pub fn pipe_path(&self) -> String { + format!("{}/pipe", self.dir) + } + + pub fn trace_path(&self) -> String { + format!("{}/trace.bin", self.dir) + } + + pub fn pmem_image_path(&self) -> String { + format!("{}/pmem.raw", self.dir) + } + + pub fn nvme_image_path(&self) -> String { + format!("{}/nvme.raw", self.dir) + } + + pub fn log_path(&self) -> String { + format!("{}/log", self.dir) + } + + pub fn io_log_path(&self) -> String { + format!("{}/io_log", self.dir) + } +} diff --git a/permanent_common/src/lib.rs b/permanent_common/src/lib.rs new file mode 100644 index 0000000..3092804 --- /dev/null +++ b/permanent_common/src/lib.rs @@ -0,0 +1,4 @@ +pub mod profiler; +pub mod action; +pub mod config; +pub mod trace; diff --git a/permanent_common/src/profiler.rs b/permanent_common/src/profiler.rs new file mode 100644 index 0000000..b27d55b --- /dev/null +++ b/permanent_common/src/profiler.rs @@ -0,0 +1,131 @@ +use std::time::{SystemTime, Duration, SystemTimeError}; +use std::iter::Sum; +use std::fs::File; +use std::io::BufWriter; +use std::io; +use std::io::Write; + +#[derive(Clone)] +pub struct Profiler { + profiles: Vec, +} + +impl Profiler { 
+ pub fn new() -> Self { + Self { profiles: vec!() } + } + + pub fn from_profiles(profiles: Vec) -> Self { + Self { profiles } + } + + pub fn to_file(&self, filename: &str) -> Result<(), io::Error> { + + let file = File::create(filename)?; + let mut writer = BufWriter::new(file); + + for profile in &self.profiles { + writer.write_fmt(format_args!("{}\n", profile))?; + } + + writer.flush()?; + Ok(()) + + } + + pub fn register(&mut self, profile: Profile) { + self.profiles.push(profile); + } + + pub fn register_vec(&mut self, profiles: Vec) { + for profile in profiles { + self.profiles.push(profile); + } + } + + pub fn get_profiles(&self) -> Vec { + self.profiles.clone() + } + + +} + +#[derive(Clone)] +pub struct Profile { + name: String, + measurements: Vec, + logs: Vec>, +} + +impl Profile { + pub fn new(name: &str) -> Self { + Self { name: name.to_string(), measurements: vec!(), logs: vec!() } + } + + pub fn from_measurement(name: &str, measurement: Measurement) -> Self { + Self { name: name.to_string(), measurements: vec!(measurement), logs: vec!() } + } + + pub fn measure(&mut self, measurement: Measurement) { + self.measurements.push(measurement); + } + + pub fn measure_vec(&mut self, measurements: Vec) { + for meas in measurements { + self.measure(meas); + } + } + + pub fn get_measurements(&self) -> Vec { + return self.measurements.clone(); + } + + pub fn log(&mut self, text: &str) { + self.logs.push(Box::new(text.to_string())); + } + + pub fn durations(&self) -> Vec { + let mut durations = vec!(); + + for meas in &self.measurements { + match meas.duration() { + Ok(d) => durations.push(d), + Err(_) => {}, + } + } + + return durations + } + + pub fn average_duration(&self) -> Duration { + let durations = self.durations(); + let dur = Duration::sum(durations.iter()); + return if durations.len() == 0 { + Duration::new(0, 0) + } else { + dur / durations.len() as u32 + } + } + +} + +impl std::fmt::Display for Profile { + fn fmt(&self, f: &mut 
/// A time span described by its start and end instants.
#[derive(Clone)]
pub struct Measurement {
    start: SystemTime,
    end: SystemTime,
}

impl Measurement {
    /// Creates a measurement from `start` to `end`.
    pub fn new(start: SystemTime, end: SystemTime) -> Self {
        Self { start, end }
    }

    /// Time elapsed between `start` and `end`.
    ///
    /// # Errors
    /// Returns an error if `end` lies before `start`.
    pub fn duration(&self) -> Result<Duration, SystemTimeError> {
        self.end.duration_since(self.start)
    }
}
TraceEntry::deserialize_from(&mut self.file) { + Ok(e) => Some(Ok(e)), + Err(e) if is_eof(&*e) => None, + Err(e) => Some(Err(e.into())), + } + } +} + +pub type TraceWriter = snap::write::FrameEncoder; + +/// Create a trace writer with compression. +pub fn new_trace_writer_bin(file: W) -> TraceWriter { + snap::write::FrameEncoder::new(file) +} + +/// Parse a binary trace file. +pub fn parse_trace_file_bin(file: R) -> BinTraceIterator> { + BinTraceIterator { + file: snap::read::FrameDecoder::new(file), + } +} diff --git a/permanent_plugin/.gitignore b/permanent_plugin/.gitignore new file mode 100644 index 0000000..2ee06a8 --- /dev/null +++ b/permanent_plugin/.gitignore @@ -0,0 +1,2 @@ +/target +/checkpoint diff --git a/permanent_plugin/Cargo.toml b/permanent_plugin/Cargo.toml new file mode 100644 index 0000000..43ced9a --- /dev/null +++ b/permanent_plugin/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "permanent_plugin" +version = "0.1.0" +edition = "2021" + +[lib] +crate-type = ["cdylib"] + +[dependencies] +permanent_common = { path = "../permanent_common" } + +crossbeam-channel = "0.5" +lazy_static = "1.4" +iced-x86 = { version = "1.19.0", default-features = false, features = ["std", "decoder", "intel"] } +enumset = "1.1.2" diff --git a/permanent_plugin/build_checkpoint.sh b/permanent_plugin/build_checkpoint.sh new file mode 100755 index 0000000..5ec5d59 --- /dev/null +++ b/permanent_plugin/build_checkpoint.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +exec gcc -O1 -static -o checkpoint checkpoint.c diff --git a/permanent_plugin/checkpoint.c b/permanent_plugin/checkpoint.c new file mode 100644 index 0000000..cb625c5 --- /dev/null +++ b/permanent_plugin/checkpoint.c @@ -0,0 +1,56 @@ +#include +#include +#include +#include +#include + +static volatile uint8_t global_value; + +int main(int argc, char** argv) { + if (argc != 2) { + fprintf(stderr, "usage: checkpoint \n"); + return 1; + } + char *s = argv[1]; + uint8_t value; + + if (*s >= 'a' && *s <= 'z') { + // string 
message (used to signal success) + do { + *s++ -= 32; // uppercase + } while (*s >= 'a' && *s <= 'z'); + if (*s) { + printf("expected only lowercase letters but found %c (%02x)\n", *s, *s); + return 1; + } + printf("\nPERMANENT %s\n", argv[1]); + return 0; + } else if (isdigit(s[0])) { + // pass number as value + char *error = NULL; + unsigned long value_long = strtoul(s, &error, 10); + if (*error != '\0') { + printf("expected a number but found %c (%02x)\n", *error, *error); + return 1; + } else if (value_long > 255) { + printf("only checkpoint values up to 255 supported\n"); + return 1; + } + value = (uint8_t) value_long; + } else { + printf("invalid argument\n"); + return 1; + } + + + // key: ascii bytes "perm" + asm __volatile__( + "mov $0x6d726570, %%eax \t\n" + "mov %1, %0 \t\n" + : "=m" (global_value) /* output */ + : "r" (value) /* input */ + : "eax" /* clobbers */ + ); + + return 0; +} diff --git a/permanent_plugin/src/lib.rs b/permanent_plugin/src/lib.rs new file mode 100644 index 0000000..009232f --- /dev/null +++ b/permanent_plugin/src/lib.rs @@ -0,0 +1,443 @@ +use std::mem; +use std::fs::File; +use std::sync::{Mutex, OnceLock}; +use std::thread::JoinHandle; +use core::ffi; +use lazy_static::lazy_static; +use iced_x86::{Decoder, DecoderOptions, Mnemonic}; +use crossbeam_channel::Sender; +use enumset::EnumSet; + +use permanent_common::config::{TraceOption, TcgPluginConfig}; +use permanent_common::trace::{PmemEvent, new_trace_writer_bin}; + +mod qemu_plugin_bindings; +use qemu_plugin_bindings as qp; + +mod writer; +use writer::{TraceMessage, writer_main}; + +#[no_mangle] +pub static qemu_plugin_version: ffi::c_int = qp::QEMU_PLUGIN_VERSION as ffi::c_int; +#[no_mangle] +pub static permanent_trace_version: ffi::c_int = 1; + +//------------------------------------------------------------------------------ + +#[derive(Debug)] +enum UserdataMem { + ReadWrite { disas: String, nt: bool }, + Clflush { disas: String }, + Clflushopt { disas: String }, + Clwb { 
disas: String }, + Checkpoint, +} + +#[derive(Debug)] +enum UserdataExec { + Wbinvd { disas: String }, + Fence { disas: String }, +} + +struct WriterThread { + trace_send: Sender, + done_send: Sender<()>, + handle: JoinHandle<()>, +} + + +// save all userdata to make sure the callbacks don't do use-after-free +// use Box here so that the address doesn't change on vector insertion/reallocation +lazy_static! { + static ref USERDATA_MEM_VEC: Mutex>> = Mutex::new(Vec::new()); + static ref USERDATA_EXEC_VEC: Mutex>> = Mutex::new(Vec::new()); + // static ref HAVE_WRITES: Mutex = Mutex::new(false); + static ref HAVE_CHECKPOINT_SIGNAL_INSN: Mutex = Mutex::new(false); + static ref HAVE_PMEM_INIT: Mutex = Mutex::new(false); + + static ref WRITER_THREAD: Mutex> = Mutex::new(None); +} +static CONFIG: OnceLock = OnceLock::new(); + +fn get_conf() -> &'static TcgPluginConfig { + &CONFIG.get().unwrap() +} + +fn send_msg(msg: TraceMessage) { + WRITER_THREAD.lock().unwrap() + .as_mut().unwrap() + .trace_send.send(msg) + .expect("failed send to writer thread"); +} + +#[no_mangle] +extern "C" fn my_vcpu_insn_exec_cb(_vcpu_index: ffi::c_uint, userdata: *mut ffi::c_void) { + let u: &UserdataExec = unsafe { &*(userdata as *const UserdataExec) }; + + // filtering happens in hook_insn + match u { + UserdataExec::Wbinvd { .. } => { + // *HAVE_WRITES.lock().unwrap() = true; + send_msg(TraceMessage::Pmem(PmemEvent::Wbinvd)); + }, + UserdataExec::Fence { .. 
} => { + // let mut have_writes = HAVE_WRITES.lock().unwrap(); + // if *have_writes { + // *have_writes = false; + // send_msg(TraceMessage::Pmem(PmemEvent::Fence)); + // } + send_msg(TraceMessage::Pmem(PmemEvent::Fence)); + }, + } +} + +#[no_mangle] +extern "C" fn my_vcpu_mem_cb(vcpu_index: ffi::c_uint, info: qp::QemuPluginMeminfo, vaddr: u64, userdata: *mut ffi::c_void) { + let conf = get_conf(); + + let u: &UserdataMem = unsafe { &*(userdata as *const UserdataMem) }; + + // handle checkpoint up here because it doesn't count as a normal memory access + // otherwise we would skip it as it is not in pmem range + if let UserdataMem::Checkpoint = u { + let mut value: u8 = 0; + unsafe { qp::qemu_plugin_vcpu_memory_rw(vcpu_index, vaddr, &mut value as *mut _ as *mut ffi::c_void, 1, false, false) }; + if value == 255 { // special value when kernel is booted + initialize_pmem_area(); + } + + if conf.trace_what.contains(TraceOption::Checkpoint) { + send_msg(TraceMessage::Checkpoint { value }); + } + return; + } + + let paddr = unsafe { qp::qemu_plugin_hwaddr_phys_addr(qp::qemu_plugin_get_hwaddr(info, vaddr)) }; + if paddr < conf.pmem_start || paddr >= conf.pmem_start + conf.pmem_len { + return; + } + let address = paddr - conf.pmem_start; + // *HAVE_WRITES.lock().unwrap() = true; + + match u { + // filtering of checkpoint and flush happens in hook_insn + UserdataMem::Checkpoint => panic!("checkpoints handled above"), + UserdataMem::Clflush { .. } => { + // *HAVE_WRITES.lock().unwrap() = true; + send_msg(TraceMessage::Pmem(PmemEvent::Clflush { address })); + }, + UserdataMem::Clflushopt { .. } => { + // *HAVE_WRITES.lock().unwrap() = true; + send_msg(TraceMessage::Pmem(PmemEvent::Clflushopt { address })); + }, + UserdataMem::Clwb { .. 
} => { + // *HAVE_WRITES.lock().unwrap() = true; + send_msg(TraceMessage::Pmem(PmemEvent::Clwb { address })); + }, + UserdataMem::ReadWrite { disas: _, nt: is_nt } => { + let is_store = unsafe { qp::qemu_plugin_mem_is_store(info) }; + if (is_store && conf.trace_what.contains(TraceOption::PmemWrite)) + || (!is_store && conf.trace_what.contains(TraceOption::PmemRead)) { + let nb = unsafe { 1usize << qp::qemu_plugin_mem_size_shift(info) }; + let mut buf: Vec = Vec::with_capacity(nb); + unsafe { + // TODO we could now do this with paddr as well. + qp::qemu_plugin_vcpu_memory_rw(vcpu_index, vaddr, buf.as_mut_ptr() as *mut ffi::c_void, nb as u64, false, false); + // safety: we assume that nb elements have been read (and are now initialized) + buf.set_len(nb); + } + + if is_store { + send_msg(TraceMessage::Pmem(PmemEvent::Write { address, size: nb as u64, content: buf, non_temporal: *is_nt })); + } else { + send_msg(TraceMessage::Pmem(PmemEvent::Read { address, size: nb as u64, content: buf })); + } + } + }, + } +} + +// TODO instead of having one exec_cb and one mem_cb, we could have several ones, and avoid +// branching inside the callbacks + +fn hook_insn(insn: *mut qp::QemuPluginInsn) { + let conf = get_conf(); + + let data = unsafe { + let dataptr = qp::qemu_plugin_insn_data(insn) as *const u8; + let datasize = qp::qemu_plugin_insn_size(insn); + if dataptr.is_null() || datasize == 0 { + panic!("qemu_plugin_insn_(data|size) invalid return value"); + } + std::slice::from_raw_parts(dataptr, datasize) + }; + let this_is_checkpoint = *HAVE_CHECKPOINT_SIGNAL_INSN.lock().unwrap(); + let checkpoint_signal_insn: [u8; 5] = [0xb8, 0x70, 0x65, 0x72, 0x6d]; + // next instruction coming up is going to be a memory write with the value of the checkpoint + // we set this globally at the end of the function so that the next invocation sees it + *HAVE_CHECKPOINT_SIGNAL_INSN.lock().unwrap() = data == &checkpoint_signal_insn[..]; + + let mut decoder = Decoder::new(64, data, 
DecoderOptions::NONE); + let decoded_insn = decoder.decode(); + if decoded_insn.is_invalid() { + if cfg!(permanent_trace_insn_invalid = "panic") { + panic!("invalid instruction: {:02x?}", data); + } else if cfg!(permanent_trace_insn_invalid = "print") { + println!("permanent_plugin: invalid instruction: {:02x?}", data); + // NOTE: this leaks memory + let qemu_disas = unsafe { ffi::CStr::from_ptr(qp::qemu_plugin_insn_disas(insn)).to_str().unwrap() }; + println!("permanent_plugin: {}", qemu_disas); + } + return; + } + let disas = decoded_insn.to_string(); + + let maybe_exec_udat = match decoded_insn.mnemonic() { + Mnemonic::Wbinvd => conf.trace_what.contains(TraceOption::PmemFlush) + .then(|| Box::new(UserdataExec::Wbinvd { disas: disas.clone() })), + Mnemonic::Mfence | Mnemonic::Sfence => conf.trace_what.contains(TraceOption::PmemFence) + .then(|| Box::new(UserdataExec::Fence { disas: disas.clone() })), + _ => None + }; + + if let Some(mut exec_udat) = maybe_exec_udat { + unsafe { + qp::qemu_plugin_register_vcpu_insn_exec_cb(insn, + Some(my_vcpu_insn_exec_cb), + qp::QEMU_PLUGIN_CB_R_REGS, + &mut *exec_udat as *mut _ as *mut ffi::c_void); + } + USERDATA_EXEC_VEC.lock().unwrap().push(exec_udat); + } + + let trace_rw = conf.trace_what.contains(TraceOption::PmemRead) + || conf.trace_what.contains(TraceOption::PmemWrite); + // we do some filtering of mem events inside the callback, because we don't know at this point (and QEMU is weird) + let maybe_mem_udat = match decoded_insn.mnemonic() { + _ if this_is_checkpoint => Some(Box::new(UserdataMem::Checkpoint)), // always checkpoint, + // because we use it + // for pmem init + Mnemonic::Clflush => conf.trace_what.contains(TraceOption::PmemFlush) + .then(|| Box::new(UserdataMem::Clflush { disas })), + Mnemonic::Clflushopt => conf.trace_what.contains(TraceOption::PmemFlush) + .then(|| Box::new(UserdataMem::Clflushopt { disas })), + Mnemonic::Clwb => conf.trace_what.contains(TraceOption::PmemFlush) + .then(|| 
Box::new(UserdataMem::Clwb { disas })), + Mnemonic::Movntdq + | Mnemonic::Movntdqa + | Mnemonic::Movnti + | Mnemonic::Movntpd + | Mnemonic::Movntps + | Mnemonic::Movntq + | Mnemonic::Movntsd + | Mnemonic::Movntss + => trace_rw.then(|| Box::new(UserdataMem::ReadWrite { disas, nt: true })), + _ => trace_rw.then(|| Box::new(UserdataMem::ReadWrite { disas, nt: false })), + }; + + if let Some(mut mem_udat) = maybe_mem_udat { + unsafe { + qp::qemu_plugin_register_vcpu_mem_cb(insn, + Some(my_vcpu_mem_cb), + qp::QEMU_PLUGIN_CB_R_REGS, + qp::QEMU_PLUGIN_MEM_RW, + &mut *mem_udat as *mut _ as *mut ffi::c_void); + } + USERDATA_MEM_VEC.lock().unwrap().push(mem_udat); + } +} + +fn initialize_pmem_area() { + let mut have_pmem_init = HAVE_PMEM_INIT.lock().unwrap(); + if *have_pmem_init { + panic!("pmem initialized twice"); + } + *have_pmem_init = true; + + let conf = get_conf(); + if conf.pmem_len == 0 { // no pmem + return; + } + let mut data = match &conf.pmem_base_image_path { + Some(path) => { + println!("permanent_plugin: initialize pmem from file {}", path); + let content = std::fs::read(path).unwrap(); + if content.len() != conf.pmem_len as usize { + panic!("pmem_base_image file has the wrong size"); + } + content + }, + None => { + println!("permanent_plugin: initialize pmem as zero"); + vec![0u8; conf.pmem_len as usize] + } + }; + unsafe { qp::qemu_plugin_vcpu_memory_rw(0, conf.pmem_start, data.as_mut_ptr() as *mut ffi::c_void, conf.pmem_len, true, true) }; + println!("permanent_plugin: pmem initialized"); +} + +#[no_mangle] +extern "C" fn my_vcpu_tb_trans_cb(_id: qp::QemuPluginId, tb: *mut qp::QemuPluginTb) { + let n = unsafe { qp::qemu_plugin_tb_n_insns(tb) }; + for i in 0..n { + let insn = unsafe { qp::qemu_plugin_tb_get_insn(tb, i) }; + hook_insn(insn); + } +} + +#[no_mangle] +extern "C" fn my_atexit_cb(_id: qp::QemuPluginId, _userdata: *mut ffi::c_void) { + // now we can drop the userdata + USERDATA_MEM_VEC.lock().unwrap().clear(); + 
USERDATA_EXEC_VEC.lock().unwrap().clear(); + + // collect writer thread + let wt = mem::take(&mut *WRITER_THREAD.lock().unwrap()).unwrap(); + wt.done_send.send(()).expect("couldn't signal writer thread termination"); + wt.handle.join().expect("couldn't join writer thread"); +} + +#[no_mangle] +pub extern "C" fn qemu_plugin_install( + id: qp::QemuPluginId, + _info: *const qp::QemuInfo, + argc: ffi::c_int, + argv: *mut *mut ffi::c_char, + ) -> ffi::c_int +{ + println!("permanent_plugin: install"); + // create config + let argc: usize = argc.try_into().unwrap(); + let argv_slice = unsafe { std::slice::from_raw_parts(argv, argc) }; + let mut args = Vec::new(); + for i in 0..argc { + let arg = unsafe { ffi::CStr::from_ptr(argv_slice[i]).to_str().unwrap() }; + args.push(arg); + } + let mut conf = TcgPluginConfig { + pmem_start: 0, + pmem_len: 0, + pmem_base_image_path: None, + trace_what: EnumSet::empty(), + out_trace_file: String::new(), + }; + for arg in args { + let (key, value) = arg.split_once("=").expect("invalid argument"); + // TODO return 1 instead of unwrap + match key { + "pmem_start" => { conf.pmem_start = value.parse().unwrap(); }, // TODO might overflow if we set pmem_start but not pmem_len + "pmem_len" => { conf.pmem_len = value.parse().unwrap(); }, + "trace_what" => { + let trace_what_args: Vec<&str> = value.split("/").collect(); + for arg in trace_what_args { + match TraceOption::from_qemu_str(arg) { + Ok(opt) => { conf.trace_what.insert(opt); } + Err(()) => panic!("unknown trace_what argument: {}", arg), + } + } + }, + "pmem_base_image_path" => { conf.pmem_base_image_path = Some(value.to_string()); }, + "out_trace_file" => { + conf.out_trace_file = value.to_string(); + }, + _ => panic!("unknown argument: {}", key), + } + } + + unsafe { + // NOTE: removed performance optimization for post-failure tracing because we + // initialize pmem area inside the mem callback + qp::qemu_plugin_register_vcpu_tb_trans_cb(id, Some(my_vcpu_tb_trans_cb)); + 
qp::qemu_plugin_register_atexit_cb(id, Some(my_atexit_cb), std::ptr::null_mut::()); + } + + let trace_out = new_trace_writer_bin(File::create(&conf.out_trace_file).expect("could not open out_trace_file")); + CONFIG.set(conf).expect("could not set config"); + + // create writer thread + let (trace_send, trace_recv) = crossbeam_channel::unbounded::(); + let (done_send, done_recv) = crossbeam_channel::bounded(0); + println!("permanent_plugin: start writer thread"); + let handle = std::thread::spawn(move || { writer_main(trace_recv, done_recv, trace_out); }); + *WRITER_THREAD.lock().unwrap() = Some(WriterThread { trace_send, done_send, handle }); + + println!("permanent_plugin: install successful"); + 0 +} + +//------------------------------------------------------------------------------ + +#[no_mangle] +pub extern "C" fn permanent_trace_pci_nvme_read(_c_cid: u16, _nsid: u32, _nlb: u32, _count: u64, _lba: u64) { + /* ignore */ +} + +#[no_mangle] +pub extern "C" fn permanent_trace_pci_nvme_write(_c_cid: u16, _verb: *const ffi::c_char, _nsid: u32, _nlb: u32, _count: u64, _lba: u64) { + /* ignore */ +} + +#[no_mangle] +pub extern "C" fn permanent_trace_pci_nvme_io_cmd(_cid: u16, _nsid: u32, _sqid: u16, _opcode: u8, opname: *const ffi::c_char) { + if get_conf().trace_what.contains(TraceOption::NvmeFlush) { + unsafe { + match ffi::CStr::from_ptr(opname).to_str().unwrap() { + "NVME_NVM_CMD_FLUSH" => { + send_msg(TraceMessage::NvmeFlush); + }, + _ => { /* ignore */ }, + } + } + } +} + +#[no_mangle] +pub extern "C" fn permanent_trace_pci_nvme_blk_read(req: *const ffi::c_void, offset: u64) { + if get_conf().trace_what.contains(TraceOption::NvmeRead) { + send_msg(TraceMessage::PciNvmeBlkRead { req: req as u64, offset }); + } +} + +#[no_mangle] +pub extern "C" fn permanent_trace_pci_nvme_blk_write(req: *const ffi::c_void, offset: u64) { + if get_conf().trace_what.contains(TraceOption::NvmeWrite) { + send_msg(TraceMessage::PciNvmeBlkWrite { req: req as u64, offset }); + } +} + 
+#[no_mangle] +pub extern "C" fn permanent_trace_pci_nvme_enqueue_req_completion(req: *const ffi::c_void, status: u16) { + if (get_conf().trace_what.contains(TraceOption::NvmeRead) + || get_conf().trace_what.contains(TraceOption::NvmeWrite)) + && status == 0 { + send_msg(TraceMessage::PciNvmeEnqueueReqCompletion { req: req as u64 }); + } +} + +#[no_mangle] +pub extern "C" fn permanent_trace_dma_blk_read(dbs: *const ffi::c_void, offset: i64, bytes: i64) { + if get_conf().trace_what.contains(TraceOption::NvmeRead) { + send_msg(TraceMessage::DmaBlkRead { dbs: dbs as u64, offset, length: bytes }); + } +} + +#[no_mangle] +pub extern "C" fn permanent_trace_dma_blk_write(dbs: *const ffi::c_void, offset: i64, bytes: i64, buf: *const ffi::c_void) { + if get_conf().trace_what.contains(TraceOption::NvmeWrite) { + let nb: usize = bytes.try_into().unwrap(); + let mut data: Vec = Vec::with_capacity(nb); + unsafe { + std::ptr::copy_nonoverlapping(buf as *const u8, data.as_mut_ptr(), nb); + data.set_len(nb); + } + send_msg(TraceMessage::DmaBlkWrite { dbs: dbs as u64, offset, length: bytes, data }); + } +} + +#[no_mangle] +pub extern "C" fn permanent_trace_dma_blk_io(req: *const ffi::c_void, dbs: *const ffi::c_void) { + if get_conf().trace_what.contains(TraceOption::NvmeRead) + || get_conf().trace_what.contains(TraceOption::NvmeWrite) { + send_msg(TraceMessage::DmaBlkIo { req: req as u64, dbs: dbs as u64 }); + } +} diff --git a/permanent_plugin/src/qemu_plugin_bindings.rs b/permanent_plugin/src/qemu_plugin_bindings.rs new file mode 100644 index 0000000..2b36ee7 --- /dev/null +++ b/permanent_plugin/src/qemu_plugin_bindings.rs @@ -0,0 +1,369 @@ +#![allow(dead_code)] + +pub const QEMU_PLUGIN_VERSION: u32 = 2; +#[doc = " typedef QemuPluginId - Unique plugin ID"] +pub type QemuPluginId = u64; +extern "C" { + pub static mut qemu_plugin_version: ::core::ffi::c_int; +} +#[doc = " struct QemuInfo - system information for plugins\n\n This structure provides for some limited information about 
the\n system to allow the plugin to make decisions on how to proceed. For\n example it might only be suitable for running on some guest\n architectures or when under full system emulation."] +#[repr(C)] +#[derive(Copy, Clone)] +pub struct QemuInfo { + #[doc = " @target_name: string describing architecture"] + pub target_name: *const ::core::ffi::c_char, + pub version: QemuInfoBindgenTy1, + #[doc = " @system_emulation: is this a full system emulation?"] + pub system_emulation: bool, + pub __bindgen_anon_1: QemuInfoBindgenTy2, +} +#[doc = " @version: minimum and current plugin API level"] +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct QemuInfoBindgenTy1 { + pub min: ::core::ffi::c_int, + pub cur: ::core::ffi::c_int, +} +#[repr(C)] +#[derive(Copy, Clone)] +pub union QemuInfoBindgenTy2 { + pub system: QemuInfoBindgenTy2BindgenTy1, +} +#[doc = " @system: information relevant to system emulation"] +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct QemuInfoBindgenTy2BindgenTy1 { + #[doc = " @system.smp_vcpus: initial number of vCPUs"] + pub smp_vcpus: ::core::ffi::c_int, + #[doc = " @system.max_vcpus: maximum possible number of vCPUs"] + pub max_vcpus: ::core::ffi::c_int, +} +extern "C" { + #[doc = " qemu_plugin_install() - Install a plugin\n @id: this plugin's opaque ID\n @info: a block describing some details about the guest\n @argc: number of arguments\n @argv: array of arguments (@argc elements)\n\n All plugins must export this symbol which is called when the plugin\n is first loaded. Calling qemu_plugin_uninstall() from this function\n is a bug.\n\n Note: @info is only live during the call. Copy any information we\n want to keep. 
@argv remains valid throughout the lifetime of the\n loaded plugin.\n\n Return: 0 on successful loading, !0 for an error."] + pub fn qemu_plugin_install( + id: QemuPluginId, + info: *const QemuInfo, + argc: ::core::ffi::c_int, + argv: *mut *mut ::core::ffi::c_char, + ) -> ::core::ffi::c_int; +} +#[doc = " typedef QemuPluginSimpleCb - simple callback\n @id: the unique QemuPluginId\n\n This callback passes no information aside from the unique @id."] +pub type QemuPluginSimpleCb = + ::core::option::Option; +#[doc = " typedef QemuPluginUdataCb - callback with user data\n @id: the unique QemuPluginId\n @userdata: a pointer to some user data supplied when the callback\n was registered."] +pub type QemuPluginUdataCb = ::core::option::Option< + unsafe extern "C" fn(id: QemuPluginId, userdata: *mut ::core::ffi::c_void), +>; +#[doc = " typedef QemuPluginVcpuSimpleCb - vcpu callback\n @id: the unique QemuPluginId\n @vcpu_index: the current vcpu context"] +pub type QemuPluginVcpuSimpleCb = ::core::option::Option< + unsafe extern "C" fn(id: QemuPluginId, vcpu_index: ::core::ffi::c_uint), +>; +#[doc = " typedef QemuPluginVcpuUdataCb - vcpu callback\n @vcpu_index: the current vcpu context\n @userdata: a pointer to some user data supplied when the callback\n was registered."] +pub type QemuPluginVcpuUdataCb = ::core::option::Option< + unsafe extern "C" fn(vcpu_index: ::core::ffi::c_uint, userdata: *mut ::core::ffi::c_void), +>; +extern "C" { + #[doc = " qemu_plugin_uninstall() - Uninstall a plugin\n @id: this plugin's opaque ID\n @cb: callback to be called once the plugin has been removed\n\n Do NOT assume that the plugin has been uninstalled once this function\n returns. 
Plugins are uninstalled asynchronously, and therefore the given\n plugin receives callbacks until @cb is called.\n\n Note: Calling this function from qemu_plugin_install() is a bug."] + pub fn qemu_plugin_uninstall(id: QemuPluginId, cb: QemuPluginSimpleCb); +} +extern "C" { + #[doc = " qemu_plugin_reset() - Reset a plugin\n @id: this plugin's opaque ID\n @cb: callback to be called once the plugin has been reset\n\n Unregisters all callbacks for the plugin given by @id.\n\n Do NOT assume that the plugin has been reset once this function returns.\n Plugins are reset asynchronously, and therefore the given plugin receives\n callbacks until @cb is called."] + pub fn qemu_plugin_reset(id: QemuPluginId, cb: QemuPluginSimpleCb); +} +extern "C" { + #[doc = " qemu_plugin_register_vcpu_init_cb() - register a vCPU initialization callback\n @id: plugin ID\n @cb: callback function\n\n The @cb function is called every time a vCPU is initialized.\n\n See also: qemu_plugin_register_vcpu_exit_cb()"] + pub fn qemu_plugin_register_vcpu_init_cb( + id: QemuPluginId, + cb: QemuPluginVcpuSimpleCb, + ); +} +extern "C" { + #[doc = " qemu_plugin_register_vcpu_exit_cb() - register a vCPU exit callback\n @id: plugin ID\n @cb: callback function\n\n The @cb function is called every time a vCPU exits.\n\n See also: qemu_plugin_register_vcpu_init_cb()"] + pub fn qemu_plugin_register_vcpu_exit_cb( + id: QemuPluginId, + cb: QemuPluginVcpuSimpleCb, + ); +} +extern "C" { + #[doc = " qemu_plugin_register_vcpu_idle_cb() - register a vCPU idle callback\n @id: plugin ID\n @cb: callback function\n\n The @cb function is called every time a vCPU idles."] + pub fn qemu_plugin_register_vcpu_idle_cb( + id: QemuPluginId, + cb: QemuPluginVcpuSimpleCb, + ); +} +extern "C" { + #[doc = " qemu_plugin_register_vcpu_resume_cb() - register a vCPU resume callback\n @id: plugin ID\n @cb: callback function\n\n The @cb function is called every time a vCPU resumes execution."] + pub fn qemu_plugin_register_vcpu_resume_cb( + 
id: QemuPluginId, + cb: QemuPluginVcpuSimpleCb, + ); +} +#[doc = " struct QemuPluginTb - Opaque handle for a translation block"] +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct QemuPluginTb { + _unused: [u8; 0], +} +#[doc = " struct QemuPluginInsn - Opaque handle for a translated instruction"] +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct QemuPluginInsn { + _unused: [u8; 0], +} +pub const QEMU_PLUGIN_CB_NO_REGS: QemuPluginCbFlags = 0; +pub const QEMU_PLUGIN_CB_R_REGS: QemuPluginCbFlags = 1; +pub const QEMU_PLUGIN_CB_RW_REGS: QemuPluginCbFlags = 2; +#[doc = " enum QemuPluginCbFlags - type of callback\n\n @QEMU_PLUGIN_CB_NO_REGS: callback does not access the CPU's regs\n @QEMU_PLUGIN_CB_R_REGS: callback reads the CPU's regs\n @QEMU_PLUGIN_CB_RW_REGS: callback reads and writes the CPU's regs\n\n Note: currently unused, plugins cannot read or change system\n register state."] +pub type QemuPluginCbFlags = ::core::ffi::c_uint; +pub const QEMU_PLUGIN_MEM_R: QemuPluginMemRw = 1; +pub const QEMU_PLUGIN_MEM_W: QemuPluginMemRw = 2; +pub const QEMU_PLUGIN_MEM_RW: QemuPluginMemRw = 3; +pub type QemuPluginMemRw = ::core::ffi::c_uint; +#[doc = " typedef QemuPluginVcpuTbTransCb - translation callback\n @id: unique plugin id\n @tb: opaque handle used for querying and instrumenting a block."] +pub type QemuPluginVcpuTbTransCb = + ::core::option::Option; +extern "C" { + #[doc = " qemu_plugin_register_vcpu_tb_trans_cb() - register a translate cb\n @id: plugin ID\n @cb: callback function\n\n The @cb function is called every time a translation occurs. The @cb\n function is passed an opaque qemu_plugin_type which it can query\n for additional information including the list of translated\n instructions. 
At this point the plugin can register further\n callbacks to be triggered when the block or individual instruction\n executes."] + pub fn qemu_plugin_register_vcpu_tb_trans_cb( + id: QemuPluginId, + cb: QemuPluginVcpuTbTransCb, + ); +} +extern "C" { + #[doc = " qemu_plugin_register_vcpu_tb_exec_cb() - register execution callback\n @tb: the opaque QemuPluginTb handle for the translation\n @cb: callback function\n @flags: does the plugin read or write the CPU's registers?\n @userdata: any plugin data to pass to the @cb?\n\n The @cb function is called every time a translated unit executes."] + pub fn qemu_plugin_register_vcpu_tb_exec_cb( + tb: *mut QemuPluginTb, + cb: QemuPluginVcpuUdataCb, + flags: QemuPluginCbFlags, + userdata: *mut ::core::ffi::c_void, + ); +} +pub const QEMU_PLUGIN_INLINE_ADD_U64: QemuPluginOp = 0; +#[doc = " enum QemuPluginOp - describes an inline op\n\n @QEMU_PLUGIN_INLINE_ADD_U64: add an immediate value uint64_t\n\n Note: currently only a single inline op is supported."] +pub type QemuPluginOp = ::core::ffi::c_uint; +extern "C" { + #[doc = " qemu_plugin_register_vcpu_tb_exec_inline() - execution inline op\n @tb: the opaque QemuPluginTb handle for the translation\n @op: the type of QemuPluginOp (e.g. ADD_U64)\n @ptr: the target memory location for the op\n @imm: the op data (e.g. 
1)\n\n Insert an inline op to every time a translated unit executes.\n Useful if you just want to increment a single counter somewhere in\n memory.\n\n Note: ops are not atomic so in multi-threaded/multi-smp situations\n you will get inexact results."] + pub fn qemu_plugin_register_vcpu_tb_exec_inline( + tb: *mut QemuPluginTb, + op: QemuPluginOp, + ptr: *mut ::core::ffi::c_void, + imm: u64, + ); +} +extern "C" { + #[doc = " qemu_plugin_register_vcpu_insn_exec_cb() - register insn execution cb\n @insn: the opaque QemuPluginInsn handle for an instruction\n @cb: callback function\n @flags: does the plugin read or write the CPU's registers?\n @userdata: any plugin data to pass to the @cb?\n\n The @cb function is called every time an instruction is executed"] + pub fn qemu_plugin_register_vcpu_insn_exec_cb( + insn: *mut QemuPluginInsn, + cb: QemuPluginVcpuUdataCb, + flags: QemuPluginCbFlags, + userdata: *mut ::core::ffi::c_void, + ); +} +extern "C" { + #[doc = " qemu_plugin_register_vcpu_insn_exec_inline() - insn execution inline op\n @insn: the opaque QemuPluginInsn handle for an instruction\n @op: the type of QemuPluginOp (e.g. ADD_U64)\n @ptr: the target memory location for the op\n @imm: the op data (e.g. 1)\n\n Insert an inline op to every time an instruction executes. 
Useful\n if you just want to increment a single counter somewhere in memory."] + pub fn qemu_plugin_register_vcpu_insn_exec_inline( + insn: *mut QemuPluginInsn, + op: QemuPluginOp, + ptr: *mut ::core::ffi::c_void, + imm: u64, + ); +} +extern "C" { + #[doc = " qemu_plugin_tb_n_insns() - query helper for number of insns in TB\n @tb: opaque handle to TB passed to callback\n\n Returns: number of instructions in this block"] + pub fn qemu_plugin_tb_n_insns(tb: *const QemuPluginTb) -> usize; +} +extern "C" { + #[doc = " qemu_plugin_tb_vaddr() - query helper for vaddr of TB start\n @tb: opaque handle to TB passed to callback\n\n Returns: virtual address of block start"] + pub fn qemu_plugin_tb_vaddr(tb: *const QemuPluginTb) -> u64; +} +extern "C" { + #[doc = " qemu_plugin_tb_get_insn() - retrieve handle for instruction\n @tb: opaque handle to TB passed to callback\n @idx: instruction number, 0 indexed\n\n The returned handle can be used in follow up helper queries as well\n as when instrumenting an instruction. It is only valid for the\n lifetime of the callback.\n\n Returns: opaque handle to instruction"] + pub fn qemu_plugin_tb_get_insn(tb: *const QemuPluginTb, idx: usize) -> *mut QemuPluginInsn; +} +extern "C" { + #[doc = " qemu_plugin_insn_data() - return ptr to instruction data\n @insn: opaque instruction handle from qemu_plugin_tb_get_insn()\n\n Note: data is only valid for duration of callback. 
See\n qemu_plugin_insn_size() to calculate size of stream.\n\n Returns: pointer to a stream of bytes containing the value of this\n instructions opcode."] + pub fn qemu_plugin_insn_data(insn: *const QemuPluginInsn) -> *const ::core::ffi::c_void; +} +extern "C" { + #[doc = " qemu_plugin_insn_size() - return size of instruction\n @insn: opaque instruction handle from qemu_plugin_tb_get_insn()\n\n Returns: size of instruction in bytes"] + pub fn qemu_plugin_insn_size(insn: *const QemuPluginInsn) -> usize; +} +extern "C" { + #[doc = " qemu_plugin_insn_vaddr() - return vaddr of instruction\n @insn: opaque instruction handle from qemu_plugin_tb_get_insn()\n\n Returns: virtual address of instruction"] + pub fn qemu_plugin_insn_vaddr(insn: *const QemuPluginInsn) -> u64; +} +extern "C" { + #[doc = " qemu_plugin_insn_haddr() - return hardware addr of instruction\n @insn: opaque instruction handle from qemu_plugin_tb_get_insn()\n\n Returns: hardware (physical) target address of instruction"] + pub fn qemu_plugin_insn_haddr(insn: *const QemuPluginInsn) -> *mut ::core::ffi::c_void; +} +#[doc = " typedef QemuPluginMeminfo - opaque memory transaction handle\n\n This can be further queried using the qemu_plugin_mem_* query\n functions."] +pub type QemuPluginMeminfo = u32; +#[doc = " struct QemuPluginHwaddr - opaque hw address handle"] +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct QemuPluginHwaddr { + _unused: [u8; 0], +} +extern "C" { + #[doc = " qemu_plugin_mem_size_shift() - get size of access\n @info: opaque memory transaction handle\n\n Returns: size of access in ^2 (0=byte, 1=16bit, 2=32bit etc...)"] + pub fn qemu_plugin_mem_size_shift(info: QemuPluginMeminfo) -> ::core::ffi::c_uint; +} +extern "C" { + #[doc = " qemu_plugin_mem_is_sign_extended() - was the access sign extended\n @info: opaque memory transaction handle\n\n Returns: true if it was, otherwise false"] + pub fn qemu_plugin_mem_is_sign_extended(info: QemuPluginMeminfo) -> bool; +} +extern "C" { + #[doc = " 
qemu_plugin_mem_is_big_endian() - was the access big endian\n @info: opaque memory transaction handle\n\n Returns: true if it was, otherwise false"] + pub fn qemu_plugin_mem_is_big_endian(info: QemuPluginMeminfo) -> bool; +} +extern "C" { + #[doc = " qemu_plugin_mem_is_store() - was the access a store\n @info: opaque memory transaction handle\n\n Returns: true if it was, otherwise false"] + pub fn qemu_plugin_mem_is_store(info: QemuPluginMeminfo) -> bool; +} +extern "C" { + #[doc = " qemu_plugin_get_hwaddr() - return handle for memory operation\n @info: opaque memory info structure\n @vaddr: the virtual address of the memory operation\n\n For system emulation returns a QemuPluginHwaddr handle to query\n details about the actual physical address backing the virtual\n address. For linux-user guests it just returns NULL.\n\n This handle is *only* valid for the duration of the callback. Any\n information about the handle should be recovered before the\n callback returns."] + pub fn qemu_plugin_get_hwaddr( + info: QemuPluginMeminfo, + vaddr: u64, + ) -> *mut QemuPluginHwaddr; +} +extern "C" { + #[doc = " qemu_plugin_hwaddr_is_io() - query whether memory operation is IO\n @haddr: address handle from qemu_plugin_get_hwaddr()\n\n Returns true if the handle's memory operation is to memory-mapped IO, or\n false if it is to RAM"] + pub fn qemu_plugin_hwaddr_is_io(haddr: *const QemuPluginHwaddr) -> bool; +} +extern "C" { + #[doc = " qemu_plugin_hwaddr_phys_addr() - query physical address for memory operation\n @haddr: address handle from qemu_plugin_get_hwaddr()\n\n Returns the physical address associated with the memory operation\n\n Note that the returned physical address may not be unique if you are dealing\n with multiple address spaces."] + pub fn qemu_plugin_hwaddr_phys_addr(haddr: *const QemuPluginHwaddr) -> u64; +} +extern "C" { + pub fn qemu_plugin_hwaddr_device_name( + h: *const QemuPluginHwaddr, + ) -> *const ::core::ffi::c_char; +} +#[doc = " typedef 
QemuPluginVcpuMemCb - memory callback function type\n @vcpu_index: the executing vCPU\n @info: an opaque handle for further queries about the memory\n @vaddr: the virtual address of the transaction\n @userdata: any user data attached to the callback"] +pub type QemuPluginVcpuMemCb = ::core::option::Option< + unsafe extern "C" fn( + vcpu_index: ::core::ffi::c_uint, + info: QemuPluginMeminfo, + vaddr: u64, + userdata: *mut ::core::ffi::c_void, + ), +>; +extern "C" { + #[doc = " qemu_plugin_register_vcpu_mem_cb() - register memory access callback\n @insn: handle for instruction to instrument\n @cb: callback of type QemuPluginVcpuMemCb\n @flags: (currently unused) callback flags\n @rw: monitor reads, writes or both\n @userdata: opaque pointer for userdata\n\n This registers a full callback for every memory access generated by\n an instruction. If the instruction doesn't access memory no\n callback will be made.\n\n The callback reports the vCPU the access took place on, the virtual\n address of the access and a handle for further queries. The user\n can attach some userdata to the callback for additional purposes.\n\n Other execution threads will continue to execute during the\n callback so the plugin is responsible for ensuring it doesn't get\n confused by making appropriate use of locking if required."] + pub fn qemu_plugin_register_vcpu_mem_cb( + insn: *mut QemuPluginInsn, + cb: QemuPluginVcpuMemCb, + flags: QemuPluginCbFlags, + rw: QemuPluginMemRw, + userdata: *mut ::core::ffi::c_void, + ); +} +extern "C" { + #[doc = " qemu_plugin_register_vcpu_mem_inline() - register an inline op to any memory access\n @insn: handle for instruction to instrument\n @rw: apply to reads, writes or both\n @op: the op, of type QemuPluginOp\n @ptr: pointer memory for the op\n @imm: immediate data for @op\n\n This registers a inline op every memory access generated by the\n instruction. 
This provides for a lightweight but not thread-safe\n way of counting the number of operations done."] + pub fn qemu_plugin_register_vcpu_mem_inline( + insn: *mut QemuPluginInsn, + rw: QemuPluginMemRw, + op: QemuPluginOp, + ptr: *mut ::core::ffi::c_void, + imm: u64, + ); +} +pub type QemuPluginVcpuSyscallCb = ::core::option::Option< + unsafe extern "C" fn( + id: QemuPluginId, + vcpu_index: ::core::ffi::c_uint, + num: i64, + a1: u64, + a2: u64, + a3: u64, + a4: u64, + a5: u64, + a6: u64, + a7: u64, + a8: u64, + ), +>; +extern "C" { + pub fn qemu_plugin_register_vcpu_syscall_cb( + id: QemuPluginId, + cb: QemuPluginVcpuSyscallCb, + ); +} +pub type QemuPluginVcpuSyscallRetCb = ::core::option::Option< + unsafe extern "C" fn(id: QemuPluginId, vcpu_idx: ::core::ffi::c_uint, num: i64, ret: i64), +>; +extern "C" { + pub fn qemu_plugin_register_vcpu_syscall_ret_cb( + id: QemuPluginId, + cb: QemuPluginVcpuSyscallRetCb, + ); +} +extern "C" { + #[doc = " qemu_plugin_insn_disas() - return disassembly string for instruction\n @insn: instruction reference\n\n Returns an allocated string containing the disassembly"] + pub fn qemu_plugin_insn_disas(insn: *const QemuPluginInsn) -> *mut ::core::ffi::c_char; +} +extern "C" { + #[doc = " qemu_plugin_insn_symbol() - best effort symbol lookup\n @insn: instruction reference\n\n Return a static string referring to the symbol. 
This is dependent\n on the binary QEMU is running having provided a symbol table."] + pub fn qemu_plugin_insn_symbol(insn: *const QemuPluginInsn) -> *const ::core::ffi::c_char; +} +extern "C" { + #[doc = " qemu_plugin_vcpu_for_each() - iterate over the existing vCPU\n @id: plugin ID\n @cb: callback function\n\n The @cb function is called once for each existing vCPU.\n\n See also: qemu_plugin_register_vcpu_init_cb()"] + pub fn qemu_plugin_vcpu_for_each(id: QemuPluginId, cb: QemuPluginVcpuSimpleCb); +} +extern "C" { + pub fn qemu_plugin_register_flush_cb(id: QemuPluginId, cb: QemuPluginSimpleCb); +} +extern "C" { + #[doc = " qemu_plugin_register_atexit_cb() - register exit callback\n @id: plugin ID\n @cb: callback\n @userdata: user data for callback\n\n The @cb function is called once execution has finished. Plugins\n should be able to free all their resources at this point much like\n after a reset/uninstall callback is called.\n\n In user-mode it is possible a few un-instrumented instructions from\n child threads may run before the host kernel reaps the threads."] + pub fn qemu_plugin_register_atexit_cb( + id: QemuPluginId, + cb: QemuPluginUdataCb, + userdata: *mut ::core::ffi::c_void, + ); +} +extern "C" { + pub fn qemu_plugin_n_vcpus() -> ::core::ffi::c_int; +} +extern "C" { + pub fn qemu_plugin_n_max_vcpus() -> ::core::ffi::c_int; +} +extern "C" { + #[doc = " qemu_plugin_outs() - output string via QEMU's logging system\n @string: a string"] + pub fn qemu_plugin_outs(string: *const ::core::ffi::c_char); +} +extern "C" { + #[doc = " qemu_plugin_bool_parse() - parses a boolean argument in the form of\n \"=[on|yes|true|off|no|false]\"\n\n @name: argument name, the part before the equals sign\n @val: argument value, what's after the equals sign\n @ret: output return value\n\n returns true if the combination @name=@val parses correctly to a boolean\n argument, and false otherwise"] + pub fn qemu_plugin_bool_parse( + name: *const ::core::ffi::c_char, + val: *const 
::core::ffi::c_char, + ret: *mut bool, + ) -> bool; +} +extern "C" { + #[doc = " qemu_plugin_path_to_binary() - path to binary file being executed\n\n Return a string representing the path to the binary. For user-mode\n this is the main executable. For system emulation we currently\n return NULL. The user should g_free() the string once no longer\n needed."] + pub fn qemu_plugin_path_to_binary() -> *const ::core::ffi::c_char; +} +extern "C" { + #[doc = " qemu_plugin_start_code() - returns start of text segment\n\n Returns the nominal start address of the main text segment in\n user-mode. Currently returns 0 for system emulation."] + pub fn qemu_plugin_start_code() -> u64; +} +extern "C" { + #[doc = " qemu_plugin_end_code() - returns end of text segment\n\n Returns the nominal end address of the main text segment in\n user-mode. Currently returns 0 for system emulation."] + pub fn qemu_plugin_end_code() -> u64; +} +extern "C" { + #[doc = " qemu_plugin_entry_code() - returns start address for module\n\n Returns the nominal entry address of the main text segment in\n user-mode. 
Currently returns 0 for system emulation."] + pub fn qemu_plugin_entry_code() -> u64; +} +extern "C" { + #[doc = " qemu_plugin_vcpu_memory_rw() - reads or writes guest's virtual or physicalmemory\n\n @vcpu_index: vcpu index\n @addr: guest's address\n @buf: data buffer\n @len: number of bytes to transfer\n @is_write: whether to read from buf or write to buf\n @is_phys: whether to interpret addr as virtual or physical address"] + pub fn qemu_plugin_vcpu_memory_rw( + vcpu_index: ::core::ffi::c_uint, + addr: u64, + buf: *mut ::core::ffi::c_void, + len: u64, + is_write: bool, + is_phys: bool, + ); +} diff --git a/permanent_plugin/src/writer.rs b/permanent_plugin/src/writer.rs new file mode 100644 index 0000000..b698727 --- /dev/null +++ b/permanent_plugin/src/writer.rs @@ -0,0 +1,174 @@ +use std::io::Write; +use crossbeam_channel::{Receiver, select}; +use std::collections::{VecDeque}; + +use permanent_common::trace::{PmemEvent, NvmeEvent, TraceEntry, TraceWriter}; + +#[derive(Debug)] +pub enum TraceMessage { + Pmem(PmemEvent), + NvmeFlush, + PciNvmeBlkRead { + req: u64, + offset: u64, + }, + PciNvmeBlkWrite { + req: u64, + offset: u64, + }, + DmaBlkIo { + req: u64, + dbs: u64, + }, + DmaBlkRead { + dbs: u64, + offset: i64, + length: i64, + }, + DmaBlkWrite { + dbs: u64, + offset: i64, + length: i64, + data: Vec, + }, + PciNvmeEnqueueReqCompletion { + req: u64, + }, + Checkpoint { + value: u8, + }, +} + +#[derive(Debug)] +struct NvmeConsolidateInfo { + id: usize, + req: u64, + dbs: u64, +} + +struct TraceQueue { + tail_id: usize, + queue: VecDeque, + consolidate: VecDeque, // entries are ordered by id + trace_out: TraceWriter +} + +fn write_entry(entry: TraceEntry, dst: &mut W) { + if cfg!(permanent_trace_debug = "entry") { + println!("{:?}", entry); + } else { + entry.serialize_into(dst).expect("failed encoding trace entry"); + } +} + +impl TraceQueue { + fn queue_index(&self, id: usize) -> usize { + id - (self.tail_id - self.queue.len()) + } + + fn insert_complete(&mut 
self, entry: TraceEntry) { + if self.queue.len() == 0 { + write_entry(entry, &mut self.trace_out); + } else { + self.queue.push_back(entry); + } + self.tail_id += 1; + } + + fn insert_incomplete(&mut self, entry: TraceEntry, req: u64) { + self.queue.push_back(entry); + self.consolidate.push_back(NvmeConsolidateInfo { id: self.tail_id, req, dbs: 0 }); + self.tail_id += 1; + } + + fn insert(&mut self, msg: TraceMessage) { + let id64 = self.tail_id as u64; + match msg { + TraceMessage::Checkpoint { value } => { + self.insert_complete(TraceEntry::Checkpoint { id: id64, value }); + }, + TraceMessage::Pmem(event) => { + self.insert_complete(TraceEntry::Pmem { id: id64, event }); + }, + TraceMessage::NvmeFlush => { + self.insert_complete(TraceEntry::Nvme { id: id64, event: NvmeEvent::Flush }); + }, + TraceMessage::PciNvmeBlkRead { req, offset } => { + let entry = TraceEntry::Nvme { id: id64, event: NvmeEvent::Read { offset, length: 0 }}; + self.insert_incomplete(entry, req); + }, + TraceMessage::PciNvmeBlkWrite { req, offset } => { + let entry = TraceEntry::Nvme { id: id64, event: NvmeEvent::Write { offset, length: 0, data: Vec::new() }}; + self.insert_incomplete(entry, req); + }, + TraceMessage::DmaBlkIo { req, dbs } => { + // we do not panic on unfound req, because revin doesn't do this either. + // might have a problem here if req and dbs pointers get reused. Revin fills this in reverse order, we don't. 
+ // TODO find out if revin has a reason for this besides performance + if let Some(info) = self.consolidate.iter_mut().find(|x| x.req == req) { + (*info).dbs = dbs; + } + }, + TraceMessage::DmaBlkRead { dbs, offset: _, length } => { + if let Some(info) = self.consolidate.iter().find(|x| x.dbs == dbs) { + match self.queue.get_mut(self.queue_index(info.id)).unwrap() { + TraceEntry::Nvme { id: _, event: NvmeEvent::Read { offset: _, length: length_ref } } => { + *length_ref = length.try_into().unwrap(); + }, + other => panic!("TraceEntry should be NvmeRead but is {:?}", other), + } + } + }, + TraceMessage::DmaBlkWrite { dbs, offset: _, length, data } => { + if let Some(info) = self.consolidate.iter().find(|x| x.dbs == dbs) { + match self.queue.get_mut(self.queue_index(info.id)).unwrap() { + TraceEntry::Nvme { id: _, event: NvmeEvent::Write { offset: _, length: length_ref, data: data_ref } } => { + *length_ref = length.try_into().unwrap(); + *data_ref = data; + }, + other => panic!("TraceEntry should be NvmeWrite but is {:?}", other), + } + } + }, + TraceMessage::PciNvmeEnqueueReqCompletion { req } => { + if let Some(i) = self.consolidate.iter().position(|x| x.req == req) { + self.consolidate.remove(i); // O(n) worst case, but we don't use swap_remove because we want to preserve id order + // (most often we remove the front element anyways) + if i == 0 { // oldest entry has been freed, so we can write something out + let drain_until = match self.consolidate.get(0) { + Some(cons_entry) => self.queue_index(cons_entry.id), + None => self.queue.len(), // drain everything; we don't have remaining entries. + }; + for entry in self.queue.drain(0..drain_until) { + write_entry(entry, &mut self.trace_out); + } + } + } + } + } + } +} + +pub fn writer_main(trace_recv: Receiver, done_recv: Receiver<()>, trace_out: TraceWriter) { + let mut q = TraceQueue { tail_id: 0, queue: VecDeque::new(), consolidate: VecDeque::new(), trace_out }; + loop { + select! 
{ + recv(trace_recv) -> msg => { + let msg = msg.unwrap(); + if cfg!(permanent_trace_debug = "message") { + println!("{:?}", msg); + } else { + q.insert(msg); + } + } + recv(done_recv) -> _ => { + println!("permanent_plugin: writer thread shut down"); + if q.queue.len() > 0 { + panic!("writer thread quit with queue non-empty"); + } + q.trace_out.flush().unwrap(); + break; + } + } + } +} diff --git a/permanent_tester/Cargo.toml b/permanent_tester/Cargo.toml new file mode 100644 index 0000000..5e88c2f --- /dev/null +++ b/permanent_tester/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "permanent_tester" +version = "0.1.0" +edition = "2021" + +[dependencies] +permanent_common = { path = "../permanent_common" } +clap = { version="4.3.22", features=["derive"] } +serde = "1.0.183" +serde_yaml = "0.9.25" +serde_json = "1.0.105" +blake3 = "1.4.1" +itertools = "0.11.0" diff --git a/permanent_tester/src/main.rs b/permanent_tester/src/main.rs new file mode 100644 index 0000000..5ba8b3f --- /dev/null +++ b/permanent_tester/src/main.rs @@ -0,0 +1,197 @@ +use std::fs::File; +use std::io::{BufReader, BufWriter, Write}; +use std::path::Path; +use std::process::Command; +use std::collections::{HashMap, HashSet}; +use clap::Parser; +use itertools::Itertools; +use permanent_common::config::{VmConfig, TestConfig, TraceConfig, TraceType}; + +const START_MSG: &'static str = "PERMANENT START"; +const END_MSG: &'static str = "PERMANENT END"; +const SUCCESS_MSG: &'static str = "PERMANENT SUCCESS"; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +struct StateHash(blake3::Hash); + +impl serde::Serialize for StateHash { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer + { + serializer.serialize_str(&self.0.to_hex()) + } +} + +fn main() { + let args = Args::parse(); + + let vm_config_file = File::open(format!("{}/vm_config.yaml", args.work_dir).as_str()).expect("Could not open vm config file"); + let vm_config: VmConfig = 
serde_yaml::from_reader(BufReader::new(vm_config_file)).expect("Could not deserialize vm config file"); + + std::fs::create_dir(format!("{}/states", args.work_dir).as_str()).expect("could not create states dir"); + let mut state_hashes: HashMap> = HashMap::new(); + + let (p, n) = vm_config.have_pmem_nvme(); + if p && n { + // TODO CrashHash instead of String + let pmem_index: HashMap> = serde_json::from_reader( + BufReader::new(File::open(format!("{}/pmem.index", args.work_dir).as_str()).unwrap()) + ).unwrap(); + let nvme_index: HashMap> = serde_json::from_reader( + BufReader::new(File::open(format!("{}/nvme.index", args.work_dir).as_str()).unwrap()) + ).unwrap(); + let mut gen_indices_pmem: Vec = pmem_index.keys().copied().collect(); + gen_indices_pmem.sort(); + let mut gen_indices_nvme: Vec = nvme_index.keys().copied().collect(); + gen_indices_nvme.sort(); + if gen_indices_pmem != gen_indices_nvme { + panic!("index file key discrepancy"); + } + let gen_indices = gen_indices_pmem; + + let total_amount: usize = gen_indices.iter() + .map(|id| pmem_index.get(id).unwrap().len() * nvme_index.get(id).unwrap().len()) + .sum(); + let mut c = 0; + let mut seen: HashSet<(String, String)> = HashSet::new(); + + for id in gen_indices { + let pmem_hashes = pmem_index.get(&id).unwrap(); + let nvme_hashes = nvme_index.get(&id).unwrap(); + for (pmem_hash, nvme_hash) in pmem_hashes.iter().cartesian_product(nvme_hashes.iter()) { + c += 1; + let combination = (pmem_hash.clone(), nvme_hash.clone()); + if seen.contains(&combination) { + println!("[{}/{}] trace {} {} ... 
already done.", c, total_amount, pmem_hash, nvme_hash); + continue; + } + seen.insert(combination); + println!("[{}/{}] trace {} {}", c, total_amount, pmem_hash, nvme_hash); + + let dir = trace_dir(&args.work_dir, Some(pmem_hash), Some(nvme_hash)); + let success = Command::new("target/release/permanent_trace") + .arg("post-failure") + .arg(args.work_dir.as_str()) + .args(["--pmem-hash", pmem_hash]) + .args(["--nvme-hash", nvme_hash]) + .spawn() + .expect("could not start permanent_trace") + .wait() + .expect("could not collect permanent_trace process") + .success(); + if !success { + eprintln!("WARNING: trace {} {} returned non-zero exit status. skipped.", pmem_hash, nvme_hash); + } else { + let log = std::fs::read(format!("{}/log", dir).as_str()).expect("could not read log"); + let success = log.windows(SUCCESS_MSG.len()).any(|win| win == SUCCESS_MSG.as_bytes()); + let state_dump = if success { + extract_state_dump(log.as_slice()) + } else { + b"FAILED" + }; + let state_hash = StateHash(blake3::hash(state_dump)); + let state_hash_string = state_hash.0.to_hex(); + let crash_hashes = state_hashes.entry(state_hash).or_insert(Vec::new()); + if crash_hashes.is_empty() { + let mut f = File::create(format!("{}/states/{}.state", args.work_dir, state_hash_string).as_str()) + .expect("could not create state file"); + f.write_all(state_dump).expect("could not write state file"); + } + crash_hashes.push(format!("{}_{}", pmem_hash, nvme_hash)); + } + clean_dir(&dir); + } + } + + + } else if p || n { + // TODO total_amount + let hash_type_arg = if p { "--pmem-hash" } else { "--nvme-hash" }; + for path in std::fs::read_dir(format!("{}/crash_images", args.work_dir).as_str()) + .expect("could not read crash_image dir") + { + let filename = path.unwrap().file_name(); + let pathref: &Path = filename.as_ref(); + let crash_hash: String = pathref.file_stem().unwrap().to_str().unwrap().to_string(); + let dir = if p { + trace_dir(&args.work_dir, Some(&crash_hash), None) + } else { + 
trace_dir(&args.work_dir, None, Some(&crash_hash)) + }; + let success = Command::new("target/release/permanent_trace") // TODO release + .arg("post-failure") + .arg(args.work_dir.as_str()) + .args([hash_type_arg, crash_hash.as_str()]) + .spawn() + .expect("could not start permanent_trace") + .wait() + .expect("could not collect permanent_trace process") + .success(); + if !success { + eprintln!("WARNING: permanent_trace {} returned non-zero exit status. skipped.", crash_hash); + } else { + let log = std::fs::read(format!("{}/log", dir).as_str()).expect("could not read log"); + let success = log.windows(SUCCESS_MSG.len()).any(|win| win == SUCCESS_MSG.as_bytes()); + let state_dump = if success { + extract_state_dump(log.as_slice()) + } else { + b"FAILED" + }; + let state_hash = StateHash(blake3::hash(state_dump)); + let state_hash_string = state_hash.0.to_hex(); + let crash_hashes = state_hashes.entry(state_hash).or_insert(Vec::new()); + if crash_hashes.is_empty() { + let mut f = File::create(format!("{}/states/{}.state", args.work_dir, state_hash_string).as_str()) + .expect("could not create state file"); + f.write_all(state_dump).expect("could not write state file"); + } + crash_hashes.push(crash_hash); + } + clean_dir(&dir); + } + } else { + unreachable!(); + } + let out_file = File::create(format!("{}/states.index", args.work_dir).as_str()).expect("could not create output file"); + serde_json::to_writer_pretty(BufWriter::new(out_file), &state_hashes).expect("could not write output"); +} + +fn extract_state_dump(data: &[u8]) -> &[u8] { + let start_pos = data.windows(START_MSG.len()).position(|win| win == START_MSG.as_bytes()).expect("no START") + + START_MSG.len(); + let end_pos = data.windows(END_MSG.len()).position(|win| win == END_MSG.as_bytes()).expect("no END"); + &data[start_pos..end_pos] +} + +// TODO this is already implemented in permanent_common +fn trace_dir(work_dir: &String, pmem_hash: Option<&String>, nvme_hash: Option<&String>) -> String { + let mut 
dir = format!("{}/post", work_dir); + if let Some(hash) = pmem_hash { + dir.push('_'); + dir.push_str(hash.as_str()); + } + if let Some(hash) = nvme_hash { + dir.push('_'); + dir.push_str(hash.as_str()); + } + dir +} + +// remove everything except logs for debugging +fn clean_dir(dir: &String) { + let files = ["trace.bin", "pmem.raw", "nvme.raw", "pipe.in", "pipe.out"]; + for file in files { + let file = format!("{}/{}", dir, file); + if Path::new(file.as_str()).exists() { + if let Err(_) = std::fs::remove_file(file.as_str()) { + eprintln!("WARNING: could not remove {}", file); + } + } + } +} + +#[derive(Debug, Parser)] +pub struct Args { + work_dir: String, +} diff --git a/permanent_trace/Cargo.toml b/permanent_trace/Cargo.toml new file mode 100644 index 0000000..c5a5e0d --- /dev/null +++ b/permanent_trace/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "permanent_trace" +version = "0.1.0" +edition = "2021" + +[dependencies] +permanent_common = { path = "../permanent_common" } +libc = "0.2.126" +regex = "1.9.3" +clap = { version="4.3.22", features=["derive"] } +serde = "1.0.183" +serde_yaml = "0.9.25" +enumset = "1.1.2" diff --git a/permanent_trace/src/main.rs b/permanent_trace/src/main.rs new file mode 100644 index 0000000..1a2736c --- /dev/null +++ b/permanent_trace/src/main.rs @@ -0,0 +1,91 @@ +use std::fs::File; +use std::io::BufReader; +use std::path::Path; +use clap::{Parser, Subcommand}; +use permanent_common::config::{VmConfig, TestConfig, TraceConfig, TraceType}; + +mod pipe; +mod vm; +mod tracer; + +fn remove_dir(path: &String) -> Result<(), std::io::Error> { + if Path::new(path).exists() { + std::fs::remove_dir_all(path)?; + } + Ok(()) +} + +fn read_vm_config(work_dir: &String) -> VmConfig { + let vm_config_file = File::open(format!("{}/vm_config.yaml", work_dir).as_str()).expect("Could not open vm config file"); + serde_yaml::from_reader(BufReader::new(vm_config_file)).expect("Could not deserialize vm config file") +} + +fn read_test_config(work_dir: 
&String) -> TestConfig { + let test_config_file = File::open(format!("{}/test_config.yaml", work_dir).as_str()).expect("Could not open test config file"); + serde_yaml::from_reader(BufReader::new(test_config_file)).expect("Could not deserialize test config file") +} + +fn main() { + let args = Args::parse(); + + match args.command { + Command::Analyse { work_dir, force } => { + let vm_config = read_vm_config(&work_dir); + let test_config = read_test_config(&work_dir); + let trace_config = TraceConfig::new(&work_dir, TraceType::Analyse); + + if force { + remove_dir(&trace_config.trace_dir()).unwrap(); + } + std::fs::create_dir(trace_config.trace_dir()).expect("could not create trace dir"); + tracer::trace_vm(&work_dir, &vm_config, Some(&test_config), &trace_config); + }, + Command::PostSuccess { work_dir, pmem_hash, nvme_hash, force } => { + todo!(); + }, + Command::PostFailure { work_dir, pmem_hash, nvme_hash, force } => { + let vm_config = read_vm_config(&work_dir); + let test_config = read_test_config(&work_dir); + let trace_config = TraceConfig::new(&work_dir, TraceType::PostFailure { pmem_hash, nvme_hash }); + + if force { + remove_dir(&trace_config.trace_dir()).unwrap(); + } + std::fs::create_dir(trace_config.trace_dir()).expect("could not create trace dir"); + tracer::trace_vm(&work_dir, &vm_config, Some(&test_config), &trace_config); + } + } +} + +#[derive(Debug, Parser)] +pub struct Args { + #[clap(subcommand)] + command: Command, +} + +#[derive(Debug, Subcommand)] +enum Command { + Analyse { + work_dir: String, + #[clap(short, long, action)] + force: bool, + }, + PostSuccess { + work_dir: String, + #[arg(short, long)] + pmem_hash: Option, + #[arg(short, long)] + nvme_hash: Option, + #[clap(short, long, action)] + force: bool, + }, + PostFailure { + work_dir: String, + #[arg(short, long)] + pmem_hash: Option, + #[arg(short, long)] + nvme_hash: Option, + #[clap(short, long, action)] + force: bool, + } +} diff --git a/permanent_trace/src/pipe.rs 
b/permanent_trace/src/pipe.rs new file mode 100644 index 0000000..2d71428 --- /dev/null +++ b/permanent_trace/src/pipe.rs @@ -0,0 +1,107 @@ +use std::os::unix::fs::OpenOptionsExt; +use std::io; +use std::fs; +use std::path::Path; +use std::process::Command; +use std::io::BufReader; +use std::io::BufWriter; +use std::fs::File; +use std::io::BufRead; +use std::io::Write; +extern crate libc; + +pub struct Pipe { + pub reader: BufReader, + pub writer: BufWriter, + logger: BufWriter, +} + +impl Pipe { + pub fn open(path: &String, logger: BufWriter) -> io::Result { + let pipe_out = fs::OpenOptions::new() + .read(true) + .custom_flags(libc::O_NONBLOCK) + .open(&format!("{}.out", path))?; + let pipe_in = fs::OpenOptions::new() + .write(true) + .append(true) + .open(&format!("{}.in", path))?; + let pipe = Pipe { + reader: BufReader::new(pipe_out), + writer: BufWriter::new(pipe_in), + logger + }; + + Ok(pipe) + } + + pub fn make(path: &String) -> io::Result<()> { + if Path::new(&format!("{}.in", path)).exists() { + fs::remove_file(format!("{}.in", path))?; + } + if Path::new(&format!("{}.out", path)).exists() { + fs::remove_file(format!("{}.out", path))?; + } + + let mut handler = Command::new("mkfifo") + .arg(format!("{}.in", path)) + .arg(format!("{}.out", path)) + .spawn()?; + handler.wait()?; + + Ok(()) + } + + pub fn wait_for_any(&mut self, variants: &[&[u8]]) -> Result { + let mut wait_iters = 200; + let mut buf: Vec = Vec::new(); + + loop { + let result = self.reader.read_until(b'\n', &mut buf); + if let Err(e) = result { + if e.kind() == io::ErrorKind::WouldBlock { + if wait_iters <= 0 { + self.logger.write_fmt(format_args!("== Pipe broken")).unwrap(); + return Err(io::Error::new(io::ErrorKind::BrokenPipe, "Pipe locked")); + } + wait_iters -= 1; + std::thread::sleep(std::time::Duration::from_millis(1000)); + continue; + } else { + return Err(e); + } + } else { + wait_iters = 200; + } + + let n = result.unwrap(); + if n == 0 { + return 
Err(io::Error::new(io::ErrorKind::InvalidData, "Unexpected EOF")); + } + + self.logger.write_all(buf.as_slice()).unwrap(); + self.logger.flush().unwrap(); // we might want to see output immediately to debug + + for (i, variant) in variants.iter().enumerate() { + if buf.windows(variant.len()).any(|win| win == *variant) { + return Ok(i) + } + } + buf.clear(); + } + } + + pub fn wait_for(&mut self, bytes: &[u8]) -> Result<(), io::Error> { + let single_variant = [bytes]; + match self.wait_for_any(&single_variant) { + Ok(_) => Ok(()), + Err(e) => Err(e), + } + } + + pub fn send(&mut self, text: &str) -> Result<(), io::Error> { + self.writer.write_fmt(format_args!("{}", text))?; + self.writer.flush()?; + Ok(()) + } +} diff --git a/permanent_trace/src/tracer.rs b/permanent_trace/src/tracer.rs new file mode 100644 index 0000000..06f750f --- /dev/null +++ b/permanent_trace/src/tracer.rs @@ -0,0 +1,70 @@ +use std::time::SystemTime; + +use permanent_common::config::{VmConfig, TestConfig, TraceConfig, TraceType}; +use permanent_common::profiler::Measurement; + +use crate::vm::VM; +use crate::pipe::Pipe; + +pub fn trace_vm(work_dir: &String, vm_config: &VmConfig, test_config: Option<&TestConfig>, trace_config: &TraceConfig) { + // TODO we always copy, even on PostSuccess. + // might be a little inefficient, but we can't risk the post recovery to write to NVME. + // it also writes on mount -oro, in case of recovery. 
+ let (p, n) = vm_config.have_pmem_nvme(); + match &trace_config.trace_type { + TraceType::Analyse => { + if p { + std::fs::copy(format!("{}/pmem_base.raw", work_dir).as_str(), + trace_config.pmem_image_path().as_str()) + .expect("could not copy base image"); + } + if n { + std::fs::copy(format!("{}/nvme_base.raw", work_dir).as_str(), + trace_config.nvme_image_path().as_str()) + .expect("could not copy base image"); + } + }, + TraceType::PostSuccess => { todo!(); }, + TraceType::PostFailure { pmem_hash, nvme_hash } => { + if p { + std::fs::copy(format!("{}/crash_images/{}.raw", work_dir, pmem_hash.as_ref().unwrap()).as_str(), + trace_config.pmem_image_path().as_str()) + .expect("could not copy base image"); + } + if n { + std::fs::copy(format!("{}/crash_images/{}.raw", work_dir, nvme_hash.as_ref().unwrap()).as_str(), + trace_config.nvme_image_path().as_str()) + .expect("could not copy base image"); + } + } + } + + // 2. create pipe + Pipe::make(&trace_config.pipe_path()).expect("Could not create control pipe"); + + // 3. init vm & wait for startup + let mut vm = VM::init(&vm_config, &trace_config); + + // 4. run tests & wait for end + let text = match &trace_config.trace_type { + TraceType::Analyse => { + format!("(checkpoint 255 && {} && {} && checkpoint success) || checkpoint fail\n", &vm_config.trace_cmd_prefix, test_config.unwrap().trace_cmd_suffix) + }, + TraceType::PostSuccess => { + format!("(checkpoint 255 && {} && checkpoint success) || checkpoint fail\n", &vm_config.recovery_cmd) + }, + TraceType::PostFailure { .. } => { + format!("(checkpoint 255 && {} && {} && checkpoint success) || checkpoint fail\n", &vm_config.dump_cmd_prefix, &test_config.unwrap().dump_cmd_suffix) + }, + }; + println!("== sh command: {}", text); + vm.send(text.as_str()).unwrap(); + + // 5. shutdown vm + let success = vm.teardown(); + if let TraceType::Analyse = &trace_config.trace_type { + if !success { + panic!("trace analyse was not successful! 
try tracing with a different shell command"); + } + } +} diff --git a/permanent_trace/src/vm.rs b/permanent_trace/src/vm.rs new file mode 100644 index 0000000..e490fdd --- /dev/null +++ b/permanent_trace/src/vm.rs @@ -0,0 +1,108 @@ +use std::io::{self, BufWriter}; +use std::fs::File; +use std::os::unix::io::{FromRawFd, IntoRawFd}; +use std::process::{Child, Command, Stdio}; +use enumset::EnumSet; +use crate::pipe::Pipe; +extern crate libc; + +use permanent_common::config::{VmConfig, TraceConfig, TraceType, TraceOption, TcgPluginConfig}; + +pub struct VM { + pipe: Pipe, + process: Child, +} + +impl VM { + // NOTE: we don't need TestConfig here, because we only start the VM (independent of test conf) + pub fn init(vm_config: &VmConfig, trace_config: &TraceConfig) -> Self { + println!("Create VM"); + + let io_log_file = File::create(&trace_config.io_log_path()).expect("Could not create io log file"); + let log_file = File::create(&trace_config.log_path()).expect("Could not create log file"); + + let mut command = Command::new(&vm_config.qemu_path); + // add kernel, initrd + command.args(["-kernel", vm_config.kernel_path.as_str()]); + command.args(["-initrd", vm_config.initrd_path.as_str()]); + + // pipe interface + command.args(["-serial", format!("pipe:{}", &trace_config.pipe_path()).as_str()]); + command.arg("-nographic"); + + // add nvme drive, if required + let (_, uses_nvme) = vm_config.have_pmem_nvme(); + if uses_nvme { + command.args([ + "-drive", format!("file={},format=raw,if=none,id=nvm", trace_config.nvme_image_path()).as_str(), + "-device", "nvme,serial=deadbeef,drive=nvm", + ]); + } + + let (p, n) = vm_config.have_pmem_nvme(); + + // add plugin information + let pmem_trace_what = TraceOption::PmemWrite | TraceOption::PmemFence | TraceOption::PmemFlush; + let nvme_trace_what = TraceOption::NvmeWrite | TraceOption::NvmeFlush; + let plugin_config = TcgPluginConfig { + pmem_start: vm_config.pmem_start.unwrap_or(0), + pmem_len: vm_config.pmem_len.unwrap_or(0), 
+ pmem_base_image_path: p.then(|| trace_config.pmem_image_path()), + trace_what: match trace_config.trace_type { + TraceType::Analyse => { + let mut opts = TraceOption::Checkpoint.into(); + if p { opts |= pmem_trace_what; } + if n { opts |= nvme_trace_what; } + opts + }, + TraceType::PostSuccess => { + let mut opts = TraceOption::Checkpoint.into(); + if p { opts |= TraceOption::PmemRead; } + if n { opts |= TraceOption::NvmeRead; } + opts + }, + TraceType::PostFailure { .. } => EnumSet::empty(), + }, + out_trace_file: trace_config.trace_path(), + }; + command.args([ + "-plugin", + plugin_config.to_qemu_plugin_arg_string("target/release/libpermanent_plugin.so").as_str() + ]); + + // add free-form qemu args + command.args(vm_config.qemu_args.clone()); + + command.stderr(unsafe { Stdio::from_raw_fd(io_log_file.into_raw_fd()) }); + + println!("== Start QEMU VM"); + println!("{:?}", command); + + let child = command.spawn().expect("Could not start qemu vm"); + + // TODO lower? + std::thread::sleep(std::time::Duration::from_millis(2000)); + let mut pipe = Pipe::open(&trace_config.pipe_path(), BufWriter::new(log_file)).expect("Could not open control pipe"); + println!("Pipes opened"); + + pipe.wait_for(b"/bin/sh: can't access tty; job control turned off").unwrap(); + println!("VM ready"); + + return Self { pipe, process: child }; + } + + pub fn teardown(&mut self) -> bool { + let variants = [b"PERMANENT SUCCESS".as_slice(), b"PERMANENT FAIL".as_slice()]; + let success = self.pipe.wait_for_any(&variants).unwrap() == 0; // make sure we collected all output + // send SIGTERM for qemu to terminate gracefully + unsafe { libc::kill(self.process.id() as i32, libc::SIGTERM); } + + self.process.wait().expect("Could not collect qemu"); + println!("== Exit QEMU VM"); + return success; + } + + pub fn send(&mut self, text: &str) -> Result<(), io::Error> { + self.pipe.send(text) + } +} diff --git a/pipeline.sh b/pipeline.sh new file mode 100755 index 0000000..99136b2 --- /dev/null +++ 
b/pipeline.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +echo "---trace" +TRACE_START=$(date +%s) +target/release/permanent_trace analyse $1 > $1/trace.out +TRACE_END=$(date +%s) +echo "time: " $(expr $TRACE_END - $TRACE_START) +echo "time: " $(expr $TRACE_END - $TRACE_START) > $1/time.out + +echo "---cig" +CIG_START=$(date +%s) +target/release/permanent_cig $1 > $1/cig.out +CIG_END=$(date +%s) +echo "time: " $(expr $CIG_END - $CIG_START) +echo "time: " $(expr $CIG_END - $CIG_START) >> $1/time.out + +echo "---tester" +TESTER_START=$(date +%s) +target/release/permanent_tester $1 > $1/tester.out +TESTER_END=$(date +%s) +echo "time: " $(expr $TESTER_END - $TESTER_START) +echo "time: " $(expr $TESTER_END - $TESTER_START) >> $1/time.out diff --git a/qemu.patch b/qemu.patch new file mode 100644 index 0000000..bb6651f --- /dev/null +++ b/qemu.patch @@ -0,0 +1,312 @@ +diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c +index 90687b1..c80acc4 100644 +--- a/hw/nvme/ctrl.c ++++ b/hw/nvme/ctrl.c +@@ -208,6 +208,7 @@ + #include "nvme.h" + #include "dif.h" + #include "trace.h" ++#include "permanent_trace.h" + + #define NVME_MAX_IOQPAIRS 0xffff + #define NVME_DB_SIZE 4 +@@ -1447,6 +1448,9 @@ static inline void nvme_blk_read(BlockBackend *blk, int64_t offset, + assert(req->sg.flags & NVME_SG_ALLOC); + + if (req->sg.flags & NVME_SG_DMA) { ++ if (permanent_trace_funcs.pci_nvme_blk_read) { ++ permanent_trace_funcs.pci_nvme_blk_read((void*)req, offset); ++ } + req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, align, cb, req); + } else { + req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req); +@@ -1460,6 +1464,9 @@ static inline void nvme_blk_write(BlockBackend *blk, int64_t offset, + assert(req->sg.flags & NVME_SG_ALLOC); + + if (req->sg.flags & NVME_SG_DMA) { ++ if (permanent_trace_funcs.pci_nvme_blk_write) { ++ permanent_trace_funcs.pci_nvme_blk_write((void*)req, offset); ++ } + req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, align, cb, req); + } else { + req->aiocb = 
blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req); +@@ -1538,6 +1545,10 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req) + le32_to_cpu(req->cqe.dw1), + req->status); + ++ if (permanent_trace_funcs.pci_nvme_enqueue_req_completion) { ++ permanent_trace_funcs.pci_nvme_enqueue_req_completion((void*)req, req->status); ++ } ++ + if (req->status) { + trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns), + req->status, req->cmd.opcode); +@@ -3429,6 +3440,9 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) + } + + trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba); ++ if (permanent_trace_funcs.pci_nvme_read) { ++ permanent_trace_funcs.pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba); ++ } + + status = nvme_check_mdts(n, mapped_size); + if (status) { +@@ -3542,6 +3556,10 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, + + trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode), + nvme_nsid(ns), nlb, mapped_size, slba); ++ if (permanent_trace_funcs.pci_nvme_write) { ++ permanent_trace_funcs.pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode), ++ nvme_nsid(ns), nlb, mapped_size, slba); ++ } + + if (!wrz) { + status = nvme_check_mdts(n, mapped_size); +@@ -4412,6 +4430,10 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) + + trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req), + req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode)); ++ if (permanent_trace_funcs.pci_nvme_io_cmd) { ++ permanent_trace_funcs.pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req), ++ req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode)); ++ } + + if (!nvme_nsid_valid(n, nsid)) { + return NVME_INVALID_NSID | NVME_DNR; +diff --git a/include/qemu/qemu-plugin.h b/include/qemu/qemu-plugin.h +index 50a9957..024653f 100644 +--- a/include/qemu/qemu-plugin.h ++++ b/include/qemu/qemu-plugin.h +@@ -51,7 +51,7 @@ typedef uint64_t qemu_plugin_id_t; + + extern 
QEMU_PLUGIN_EXPORT int qemu_plugin_version; + +-#define QEMU_PLUGIN_VERSION 1 ++#define QEMU_PLUGIN_VERSION 2 + + /** + * struct qemu_info_t - system information for plugins +@@ -664,4 +664,21 @@ uint64_t qemu_plugin_end_code(void); + */ + uint64_t qemu_plugin_entry_code(void); + ++/** ++ * qemu_plugin_vcpu_memory_rw() - reads or writes guest's virtual or physical memory ++ * ++ * @vcpu_index: vcpu index ++ * @addr: guest's address ++ * @buf: data buffer ++ * @len: number of bytes to transfer ++ * @is_write: whether to read from buf or write to buf ++ * @is_phys: whether to interpret addr as virtual or physical address ++ */ ++void qemu_plugin_vcpu_memory_rw(unsigned int vcpu_index, ++ uint64_t addr, ++ void *buf, ++ uint64_t len, ++ bool is_write, ++ bool is_phys); ++ + #endif /* QEMU_QEMU_PLUGIN_H */ +diff --git a/include/permanent_trace.h b/include/permanent_trace.h +new file mode 100644 +index 0000000..d09813a +--- /dev/null ++++ b/include/permanent_trace.h +@@ -0,0 +1,23 @@ ++#ifndef PERMANENT_TRACE_H ++#define PERMANENT_TRACE_H ++ ++#include ++#include ++ ++struct permanent_trace_fn { ++ void (*pci_nvme_read)(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba); ++ void (*pci_nvme_write)(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba); ++ void (*pci_nvme_io_cmd)(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname); ++ ++ void (*pci_nvme_blk_read)(const void *req, int64_t offset); ++ void (*pci_nvme_blk_write)(const void *req, int64_t offset); ++ void (*pci_nvme_enqueue_req_completion)(const void *req, uint16_t status); ++ ++ void (*dma_blk_read)(const void *dbs, int64_t offset, int64_t bytes); ++ void (*dma_blk_write)(const void *dbs, int64_t offset, int64_t bytes, const void *buf); ++ void (*dma_blk_io)(const void *req, const void *dbs); ++}; ++ ++extern struct permanent_trace_fn permanent_trace_funcs; ++ ++#endif +diff --git a/plugins/api.c b/plugins/api.c +index 
2078b16..40efc4b 100644
+--- a/plugins/api.c
++++ b/plugins/api.c
+@@ -442,3 +442,26 @@ uint64_t qemu_plugin_entry_code(void)
+ #endif
+     return entry;
+ }
++
++void qemu_plugin_vcpu_memory_rw(unsigned int vcpu_index,
++                                uint64_t addr,
++                                void *buf,
++                                uint64_t len,
++                                bool is_write,
++                                bool is_phys)
++{
++    CPUClass *cc;
++    CPUState *cpu;
++
++    cpu = qemu_get_cpu(vcpu_index);
++    cc = CPU_GET_CLASS(cpu);
++    if (is_phys) {
++        cpu_physical_memory_rw(addr, buf, len, is_write);
++    } else {
++        if (cc->memory_rw_debug) {
++            cc->memory_rw_debug(cpu, addr, buf, len, is_write);
++        } else {
++            cpu_memory_rw_debug(cpu, addr, buf, len, is_write);
++        }
++    }
++}
+diff --git a/plugins/loader.c b/plugins/loader.c
+index 809f3f9..9eecba3 100644
+--- a/plugins/loader.c
++++ b/plugins/loader.c
+@@ -34,6 +34,7 @@
+ #include "hw/boards.h"
+ #endif
+ #include "qemu/compiler.h"
++#include "permanent_trace.h"
+ 
+ #include "plugin.h"
+ 
+@@ -168,6 +169,9 @@ static uint64_t xorshift64star(uint64_t x)
+     return x * UINT64_C(2685821657736338717);
+ }
+ 
++/* permanent_trace hooks; all NULL until plugin_load() resolves them */
++struct permanent_trace_fn permanent_trace_funcs = {0};
++
+ /*
+  * Disable CFI checks.
+  * The install and version functions have been loaded from an external library
+@@ -255,6 +259,33 @@ static int plugin_load(struct qemu_plugin_desc *desc, const qemu_info_t *info, E
+         }
+     }
+ 
++    /* permanent_trace plugin: resolve the hooks QEMU itself calls (NVMe, DMA) */
++ if (g_module_symbol(ctx->handle, "permanent_trace_version", &sym)) { ++ g_module_symbol(ctx->handle, "permanent_trace_pci_nvme_read", (gpointer*)&permanent_trace_funcs.pci_nvme_read); ++ g_module_symbol(ctx->handle, "permanent_trace_pci_nvme_write", (gpointer*)&permanent_trace_funcs.pci_nvme_write); ++ g_module_symbol(ctx->handle, "permanent_trace_pci_nvme_io_cmd", (gpointer*)&permanent_trace_funcs.pci_nvme_io_cmd); ++ ++ g_module_symbol(ctx->handle, "permanent_trace_pci_nvme_blk_read", (gpointer*)&permanent_trace_funcs.pci_nvme_blk_read); ++ g_module_symbol(ctx->handle, "permanent_trace_pci_nvme_blk_write", (gpointer*)&permanent_trace_funcs.pci_nvme_blk_write); ++ g_module_symbol(ctx->handle, "permanent_trace_pci_nvme_enqueue_req_completion", (gpointer*)&permanent_trace_funcs.pci_nvme_enqueue_req_completion); ++ ++ g_module_symbol(ctx->handle, "permanent_trace_dma_blk_read", (gpointer*)&permanent_trace_funcs.dma_blk_read); ++ g_module_symbol(ctx->handle, "permanent_trace_dma_blk_write", (gpointer*)&permanent_trace_funcs.dma_blk_write); ++ g_module_symbol(ctx->handle, "permanent_trace_dma_blk_io", (gpointer*)&permanent_trace_funcs.dma_blk_io); ++ ++ g_assert_nonnull(permanent_trace_funcs.pci_nvme_read); ++ g_assert_nonnull(permanent_trace_funcs.pci_nvme_write); ++ g_assert_nonnull(permanent_trace_funcs.pci_nvme_io_cmd); ++ ++ g_assert_nonnull(permanent_trace_funcs.pci_nvme_blk_read); ++ g_assert_nonnull(permanent_trace_funcs.pci_nvme_blk_write); ++ g_assert_nonnull(permanent_trace_funcs.pci_nvme_enqueue_req_completion); ++ ++ g_assert_nonnull(permanent_trace_funcs.dma_blk_read); ++ g_assert_nonnull(permanent_trace_funcs.dma_blk_write); ++ g_assert_nonnull(permanent_trace_funcs.dma_blk_io); ++ } ++ + qemu_rec_mutex_unlock(&plugin.lock); + return rc; + +diff --git a/plugins/qemu-plugins.symbols b/plugins/qemu-plugins.symbols +index 71f6c90..3cdcd39 100644 +--- a/plugins/qemu-plugins.symbols ++++ b/plugins/qemu-plugins.symbols +@@ -42,4 +42,5 @@ + 
qemu_plugin_tb_vaddr; + qemu_plugin_uninstall; + qemu_plugin_vcpu_for_each; ++ qemu_plugin_vcpu_memory_rw; + }; +diff --git a/softmmu/dma-helpers.c b/softmmu/dma-helpers.c +index 2463964..1cd1095 100644 +--- a/softmmu/dma-helpers.c ++++ b/softmmu/dma-helpers.c +@@ -15,6 +15,7 @@ + #include "qemu/main-loop.h" + #include "sysemu/cpu-timers.h" + #include "qemu/range.h" ++#include "permanent_trace.h" + + /* #define DEBUG_IOMMU */ + +@@ -228,6 +229,9 @@ BlockAIOCB *dma_blk_io(AioContext *ctx, + DMAAIOCB *dbs = qemu_aio_get(&dma_aiocb_info, NULL, cb, opaque); + + trace_dma_blk_io(dbs, io_func_opaque, offset, (dir == DMA_DIRECTION_TO_DEVICE)); ++ if (permanent_trace_funcs.dma_blk_io) { ++ permanent_trace_funcs.dma_blk_io(opaque, (void*)dbs); ++ } + + dbs->acb = NULL; + dbs->sg = sg; +@@ -252,6 +256,11 @@ BlockAIOCB *dma_blk_read_io_func(int64_t offset, QEMUIOVector *iov, + void *opaque) + { + BlockBackend *blk = opaque; ++ ++ if (permanent_trace_funcs.dma_blk_read) { ++ permanent_trace_funcs.dma_blk_read(cb_opaque, offset, iov->iov->iov_len); ++ } ++ + return blk_aio_preadv(blk, offset, iov, 0, cb, cb_opaque); + } + +@@ -270,6 +279,11 @@ BlockAIOCB *dma_blk_write_io_func(int64_t offset, QEMUIOVector *iov, + void *opaque) + { + BlockBackend *blk = opaque; ++ ++ if (permanent_trace_funcs.dma_blk_write) { ++ permanent_trace_funcs.dma_blk_write(cb_opaque, offset, iov->iov->iov_len, iov->iov->iov_base); ++ } ++ + return blk_aio_pwritev(blk, offset, iov, 0, cb, cb_opaque); + } + +diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c +index e0a6229..659d804 100644 +--- a/target/i386/tcg/translate.c ++++ b/target/i386/tcg/translate.c +@@ -6656,7 +6656,11 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) + if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_CLWB)) { + goto illegal_op; + } +- gen_nop_modrm(env, s, modrm); ++ // load (and ignore) a single byte from memory, triggering a memory callback. 
++                // only one byte because we don't want inter-cacheline reads
++                // value in s->T0 is ignored afterwards.
++                gen_lea_modrm(env, s, modrm);
++                gen_op_ld_v(s, MO_8, s->T0, s->A0);
+             } else {
+                 /* xsaveopt */
+                 if ((s->cpuid_ext_features & CPUID_EXT_XSAVE) == 0
+@@ -6687,7 +6691,11 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
+                     goto illegal_op;
+                 }
+             }
+-            gen_nop_modrm(env, s, modrm);
++            // load (and ignore) a single byte from memory, triggering a memory callback.
++            // only one byte because we don't want inter-cacheline reads
++            // value in s->T0 is ignored afterwards.
++            gen_lea_modrm(env, s, modrm);
++            gen_op_ld_v(s, MO_8, s->T0, s->A0);
+             break;
+ 
+         case 0xc0 ... 0xc7: /* rdfsbase (f3 0f ae /0) */
diff --git a/report_results.py b/report_results.py
new file mode 100644
index 0000000..5c3f9f7
--- /dev/null
+++ b/report_results.py
@@ -0,0 +1,108 @@
+"""Summarize the index files of a Permanent run.
+
+Usage: report_results.py RESULT_DIR
+
+RESULT_DIR must contain the JSON files pmem.index, nvme.index, states.index
+and checkpoint.index.
+"""
+import functools
+import json
+import os
+import sys
+from bisect import bisect_left
+
+if len(sys.argv) < 2:
+    sys.exit(f"usage: {sys.argv[0]} RESULT_DIR")
+
+@functools.lru_cache(maxsize=None)
+def file_to_json(name):
+    """Parse the JSON file `name` inside RESULT_DIR (each file is read only once)."""
+    with open(os.path.join(sys.argv[1], name), "r") as f:
+        return json.load(f)
+
+def get_keys(name):
+    """Return the sorted top-level keys of index file `name`."""
+    return sorted(file_to_json(name))
+
+def get_unique_hashes(name):
+    """Return the set of all hashes listed under any key of index file `name`."""
+    return {h for hashes in file_to_json(name).values() for h in hashes}
+
+print("pmem images:", len(get_unique_hashes("pmem.index")))
+print("nvme images:", len(get_unique_hashes("nvme.index")))
+hybrid_states = get_unique_hashes("states.index")
+print("hybrid images:", len(hybrid_states))
+print("number of semantic states:", len(get_keys("states.index")))
+
+# JSON object keys are always strings, whereas the ids stored as values in
+# checkpoint.index may be numbers. Convert every id to int so that sorting is
+# numeric and the membership test in the SFS section compares like with like.
+checkpoint_ids = sorted(int(cid) for cid in file_to_json("checkpoint.index").values())
+pmem_index = file_to_json("pmem.index")
+nvme_index = file_to_json("nvme.index")
+state_index = file_to_json("states.index")
+
+def hash_time(index):
+    """Map every hash in `index` to the set of integer trace ids listing it."""
+    times = {}
+    for trace_id, hashes in index.items():
+        for h in hashes:
+            times.setdefault(h, set()).add(int(trace_id))
+    return times
+
+pmem_hash_time = hash_time(pmem_index)
+nvme_hash_time = hash_time(nvme_index)
+
+def hybrid_hash_time(hybrid_hash):
+    """Return the trace ids at which both halves of a "PMEM_NVME" hash occur."""
+    pmem_part, nvme_part = hybrid_hash.split('_')
+    return pmem_hash_time[pmem_part] & nvme_hash_time[nvme_part]
+
+# semantic state hash -> every trace id at which one of its images occurs
+state_time = {
+    state_hash: set().union(*(hybrid_hash_time(h) for h in hybrid_hashes))
+    for state_hash, hybrid_hashes in state_index.items()
+}
+
+def id_to_prev_checkpoint_value(trace_id):
+    """Return the index of the checkpoint interval containing `trace_id`.
+
+    Interval i ends at checkpoint_ids[i + 1]; ids up to the first checkpoint
+    count towards interval 0. Returns None for ids after the last checkpoint.
+    """
+    i = bisect_left(checkpoint_ids, int(trace_id))
+    if i == len(checkpoint_ids):
+        return None
+    return max(i - 1, 0)
+
+state_time_cp = {}
+for state_hash, trace_ids in state_time.items():
+    state_time_cp[state_hash] = {id_to_prev_checkpoint_value(tid) for tid in trace_ids}
+
+num_states_for_cp = {}
+for val in range(len(checkpoint_ids) - 1):
+    num_states_for_cp[val] = sum(1 if val in cids else 0 for cids in state_time_cp.values())
+
+COL_GREEN = '\033[92m'
+COL_RED = '\033[91m'
+COL_END = '\033[0m'
+
+print()
+print("number of semantic states per logical operation:")
+for val in range(len(checkpoint_ids) - 1):
+    print(f"[{val}..{val+1}]: {num_states_for_cp[val]}", end="")
+    if num_states_for_cp[val] <= 2:
+        print(f" -> {COL_GREEN}atomic{COL_END}")
+    else:
+        print(f" -> {COL_RED}not atomic{COL_END}")
+
+print()
+print("single final state:")
+for val in range(len(checkpoint_ids)):
+    states_at_exactly_this_cp = set()
+    for state_hash, trace_ids in state_time.items():
+        if checkpoint_ids[val] in trace_ids:
+            states_at_exactly_this_cp.add(state_hash)
+    sfs = len(states_at_exactly_this_cp) <= 1
+    msg = f"{COL_GREEN}SFS{COL_END}" if sfs else f"{COL_RED}not SFS{COL_END}"
+    print(f"checkpoint {val}: {msg}")