Skip to content

Commit

Permalink
cmp: print verbose diffs as we find them
Browse files Browse the repository at this point in the history
Before this change, we would first find all changes so we could obtain
the largest offset we will report and use that to set up the padding.

Now we use the file sizes to estimate the largest possible offset.
Not only does this allow us to print earlier, reduces memory usage, as
we do not store diffs to report later, but it also fixes a case in
which our output was different to GNU cmp's - because it also seems
to estimate based on size.

Memory usage drops by a factor of 1000(!), without losing performance
while comparing 2 binaries of hundreds of MBs:

Before:

 Maximum resident set size (kbytes): 2489260

 Benchmark 1: ../target/release/diffutils \
 cmp -l -b /usr/lib64/chromium-browser/chromium-browser /usr/lib64/firefox/libxul.so
   Time (mean ± σ):     14.466 s ±  0.166 s    [User: 12.367 s, System: 2.012 s]
   Range (min … max):   14.350 s … 14.914 s    10 runs

After:

 Maximum resident set size (kbytes): 2636

 Benchmark 1: ../target/release/diffutils \
 cmp -l -b /usr/lib64/chromium-browser/chromium-browser /usr/lib64/firefox/libxul.so
   Time (mean ± σ):     13.724 s ±  0.038 s    [User: 12.263 s, System: 1.372 s]
   Range (min … max):   13.667 s … 13.793 s    10 runs
  • Loading branch information
kov committed Oct 5, 2024
1 parent 0bf04b4 commit bca3d30
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 99 deletions.
194 changes: 96 additions & 98 deletions src/cmp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use std::ffi::OsString;
use std::io::{BufRead, BufReader, BufWriter, Read, Write};
use std::iter::Peekable;
use std::process::ExitCode;
use std::{fs, io};
use std::{cmp, fs, io};

#[cfg(not(target_os = "windows"))]
use std::os::fd::{AsRawFd, FromRawFd};
Expand Down Expand Up @@ -320,10 +320,39 @@ pub fn cmp(params: &Params) -> Result<Cmp, String> {
let mut from = prepare_reader(&params.from, &params.skip_a, params)?;
let mut to = prepare_reader(&params.to, &params.skip_b, params)?;

let mut offset_width = if let Some(max_bytes) = params.max_bytes {
max_bytes
} else {
usize::MAX
};

if let (Ok(a_meta), Ok(b_meta)) = (fs::metadata(&params.from), fs::metadata(&params.to)) {
#[cfg(not(target_os = "windows"))]
let (a_size, b_size) = (a_meta.size(), b_meta.size());

#[cfg(target_os = "windows")]
let (a_size, b_size) = (a_meta.file_size(), b_meta.file_size());

let smaller = cmp::min(a_size, b_size) as usize;
offset_width = cmp::min(smaller, offset_width) as usize;

// If the files have different sizes, we already know they are not identical. If we have not
// been asked to show even the first difference, we can quit early.
if params.quiet && a_size != b_size {
return Ok(Cmp::Different);
}
}

let offset_width = 1 + offset_width.checked_ilog10().unwrap_or(1) as usize;

// Capacity calc: at_byte width + 2 x 3-byte octal numbers + 2 x 4-byte value + 4 spaces
let mut output = Vec::<u8>::with_capacity(offset_width + 3 * 2 + 4 * 2 + 4);

let mut at_byte = 1;
let mut at_line = 1;
let mut start_of_line = true;
let mut verbose_diffs = vec![];
let mut stdout = BufWriter::new(io::stdout().lock());
let mut compare = Cmp::Equal;
loop {
// Fill up our buffers.
let from_buf = match from.fill_buf() {
Expand Down Expand Up @@ -360,10 +389,6 @@ pub fn cmp(params: &Params) -> Result<Cmp, String> {
&params.to.to_string_lossy()
};

if params.verbose {
report_verbose_diffs(verbose_diffs, params)?;
}

report_eof(at_byte, at_line, start_of_line, eof_on, params);
return Ok(Cmp::Different);
}
Expand Down Expand Up @@ -395,8 +420,24 @@ pub fn cmp(params: &Params) -> Result<Cmp, String> {
// first one runs out.
for (&from_byte, &to_byte) in from_buf.iter().zip(to_buf.iter()) {
if from_byte != to_byte {
compare = Cmp::Different;

if params.verbose {
verbose_diffs.push((at_byte, from_byte, to_byte));
format_verbose_difference(
from_byte,
to_byte,
at_byte,
offset_width,
&mut output,
params,
)?;
stdout.write_all(output.as_slice()).map_err(|e| {
format!(
"{}: error printing output: {e}",
params.executable.to_string_lossy()
)

Check warning on line 438 in src/cmp.rs

View check run for this annotation

Codecov / codecov/patch

src/cmp.rs#L435-L438

Added lines #L435 - L438 were not covered by tests
})?;
output.clear();
} else {
report_difference(from_byte, to_byte, at_byte, at_line, params);
return Ok(Cmp::Different);
Expand All @@ -422,12 +463,7 @@ pub fn cmp(params: &Params) -> Result<Cmp, String> {
to.consume(consumed);
}

if params.verbose && !verbose_diffs.is_empty() {
report_verbose_diffs(verbose_diffs, params)?;
return Ok(Cmp::Different);
}

Ok(Cmp::Equal)
Ok(compare)
}

// Exit codes are documented at
Expand All @@ -450,21 +486,6 @@ pub fn main(opts: Peekable<ArgsOs>) -> ExitCode {
return ExitCode::SUCCESS;
}

// If the files have different sizes, we already know they are not identical. If we have not
// been asked to show even the first difference, we can quit early.
if params.quiet {
if let (Ok(a_meta), Ok(b_meta)) = (fs::metadata(&params.from), fs::metadata(&params.to)) {
#[cfg(not(target_os = "windows"))]
if a_meta.size() != b_meta.size() {
return ExitCode::from(1);
}
#[cfg(target_os = "windows")]
if a_meta.file_size() != b_meta.file_size() {
return ExitCode::from(1);
}
}
}

match cmp(&params) {
Ok(Cmp::Equal) => ExitCode::SUCCESS,
Ok(Cmp::Different) => ExitCode::from(1),
Expand Down Expand Up @@ -533,99 +554,76 @@ fn format_byte(byte: u8) -> String {
// This function has been optimized to not use the Rust fmt system, which
// leads to a massive speed up when processing large files: cuts the time
// for comparing 2 ~36MB completely different files in half on an M1 Max.
fn report_verbose_diffs(diffs: Vec<(usize, u8, u8)>, params: &Params) -> Result<(), String> {
#[inline]
fn format_verbose_difference(
from_byte: u8,
to_byte: u8,
at_byte: usize,
offset_width: usize,
output: &mut Vec<u8>,
params: &Params,
) -> Result<(), String> {
assert!(!params.quiet);

let mut stdout = BufWriter::new(io::stdout().lock());
if let Some((offset, _, _)) = diffs.last() {
// Obtain the width of the first column from the last byte offset.
let width = format!("{}", offset).len();

let mut at_byte_buf = itoa::Buffer::new();
let mut from_oct = [0u8; 3]; // for octal conversions
let mut to_oct = [0u8; 3];

// Capacity calc: at_byte width + 2 x 3-byte octal numbers + 4-byte value + up to 2 byte value + 4 spaces
let mut output = Vec::<u8>::with_capacity(width + 3 * 2 + 4 + 2 + 4);

if params.print_bytes {
for (at_byte, from_byte, to_byte) in diffs {
output.clear();
let mut at_byte_buf = itoa::Buffer::new();
let mut from_oct = [0u8; 3]; // for octal conversions
let mut to_oct = [0u8; 3];

// "{:>width$} {:>3o} {:4} {:>3o} {}",
let at_byte_str = at_byte_buf.format(at_byte);
let at_byte_padding = width - at_byte_str.len();

for _ in 0..at_byte_padding {
output.push(b' ')
}

output.extend_from_slice(at_byte_str.as_bytes());
if params.print_bytes {
// "{:>width$} {:>3o} {:4} {:>3o} {}",
let at_byte_str = at_byte_buf.format(at_byte);
let at_byte_padding = offset_width.saturating_sub(at_byte_str.len());

output.push(b' ');
for _ in 0..at_byte_padding {
output.push(b' ')
}

output.extend_from_slice(format_octal(from_byte, &mut from_oct).as_bytes());
output.extend_from_slice(at_byte_str.as_bytes());

output.push(b' ');
output.push(b' ');

let from_byte_str = format_byte(from_byte);
let from_byte_padding = 4 - from_byte_str.len();
output.extend_from_slice(format_octal(from_byte, &mut from_oct).as_bytes());

output.extend_from_slice(from_byte_str.as_bytes());
output.push(b' ');

for _ in 0..from_byte_padding {
output.push(b' ')
}
let from_byte_str = format_byte(from_byte);
let from_byte_padding = 4 - from_byte_str.len();

output.push(b' ');
output.extend_from_slice(from_byte_str.as_bytes());

output.extend_from_slice(format_octal(to_byte, &mut to_oct).as_bytes());

output.push(b' ');
for _ in 0..from_byte_padding {
output.push(b' ')
}

output.extend_from_slice(format_byte(to_byte).as_bytes());
output.push(b' ');

output.push(b'\n');
output.extend_from_slice(format_octal(to_byte, &mut to_oct).as_bytes());

stdout.write_all(output.as_slice()).map_err(|e| {
format!(
"{}: error printing output: {e}",
params.executable.to_string_lossy()
)
})?;
}
} else {
for (at_byte, from_byte, to_byte) in diffs {
output.clear();
output.push(b' ');

// "{:>width$} {:>3o} {:>3o}"
let at_byte_str = at_byte_buf.format(at_byte);
let at_byte_padding = width - at_byte_str.len();
output.extend_from_slice(format_byte(to_byte).as_bytes());

for _ in 0..at_byte_padding {
output.push(b' ')
}
output.push(b'\n');
} else {
// "{:>width$} {:>3o} {:>3o}"
let at_byte_str = at_byte_buf.format(at_byte);
let at_byte_padding = offset_width - at_byte_str.len();

output.extend_from_slice(at_byte_str.as_bytes());
for _ in 0..at_byte_padding {
output.push(b' ')

Check warning on line 613 in src/cmp.rs

View check run for this annotation

Codecov / codecov/patch

src/cmp.rs#L613

Added line #L613 was not covered by tests
}

output.push(b' ');
output.extend_from_slice(at_byte_str.as_bytes());

output.extend_from_slice(format_octal(from_byte, &mut from_oct).as_bytes());
output.push(b' ');

output.push(b' ');
output.extend_from_slice(format_octal(from_byte, &mut from_oct).as_bytes());

output.extend_from_slice(format_octal(to_byte, &mut to_oct).as_bytes());
output.push(b' ');

output.push(b'\n');
output.extend_from_slice(format_octal(to_byte, &mut to_oct).as_bytes());

stdout.write_all(output.as_slice()).map_err(|e| {
format!(
"{}: error printing output: {e}",
params.executable.to_string_lossy()
)
})?;
}
}
output.push(b'\n');
}

Ok(())
Expand Down
2 changes: 1 addition & 1 deletion tests/integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -616,7 +616,7 @@ mod cmp {
.code(predicate::eq(1))
.failure()
.stderr(predicate::str::is_empty())
.stdout(predicate::eq("4 40 144 d\n8 40 150 h\n"));
.stdout(predicate::eq(" 4 40 144 d\n 8 40 150 h\n"));
Ok(())
}

Expand Down

0 comments on commit bca3d30

Please sign in to comment.