From 1e16b610ac18d67e420e899db87aec4bbb52dfeb Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Mon, 13 Apr 2026 11:41:07 -0400 Subject: [PATCH 1/2] test(replay): replace git subprocess with `git cat-file --batch` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So we use stdout pipe to avoid thousands of forc-exec over `git ls-tree` and `git show`. Benchmark result for Apple M1 Pro (10 cores): | Repo | Before | After | Speedup | |-----------------------|-------:|-------:|--------:| | bmwill/diffy (full) | 31.76s | 11.06s | 2.9× | | rust-lang/cargo (200) | 93.04s | 14.60s | 6.4× | --- tests/replay.rs | 199 ++++++++++++++++++++++++++++++------------------ 1 file changed, 126 insertions(+), 73 deletions(-) diff --git a/tests/replay.rs b/tests/replay.rs index d599b20..f1701e4 100644 --- a/tests/replay.rs +++ b/tests/replay.rs @@ -54,14 +54,109 @@ use std::{ env, + io::{BufRead, BufReader, Read, Write}, path::{Path, PathBuf}, - process::Command, + process::{Command, Stdio}, sync::Mutex, }; use diffy::patch_set::{FileOperation, ParseOptions, PatchKind, PatchSet}; use rayon::prelude::*; +/// Persistent `git cat-file --batch` process for fast object lookups. +/// +/// See for more. +struct CatFile { + // Field order matters: stdin must drop (close) before child is reaped. + stdin: std::process::ChildStdin, + stdout: BufReader, + #[allow(dead_code)] // held for drop order: reaped after stdin closes + child: std::process::Child, +} + +impl CatFile { + fn new(repo: &Path) -> Self { + let mut child = Command::new("git") + .env("GIT_CONFIG_NOSYSTEM", "1") + .env("GIT_CONFIG_GLOBAL", "/dev/null") + .arg("-C") + .arg(repo) + .args(["cat-file", "--batch"]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::null()) + .spawn() + .expect("failed to spawn git cat-file --batch"); + let stdin = child.stdin.take().unwrap(); + let stdout = BufReader::new(child.stdout.take().unwrap()); + Self { + stdin, + stdout, + child, + } + } + + /// Look up an object by `:`. + /// + /// Returns `None` for submodules, commit/tree/tag object types, and missing objects. + fn get(&mut self, rev: &str, path: &str) -> Option> { + writeln!(self.stdin, "{rev}:{path}").expect("cat-file stdin write failed"); + + let mut header = String::new(); + self.stdout + .read_line(&mut header) + .expect("cat-file stdout read failed"); + + // Response formats: + // + // * regular file: ` blob \n\n` + // * submodule: + // * ` commit \n\n` + // * ` submodule\n` + // * not found: ` missing\n` + // + // `tag` and `tree` object type are not relevant here. + // + // See + + let header = header.trim_end(); + let mut it = header.splitn(3, ' '); + + let Some(_oid) = it.next() else { + panic!("unexpected cat-file header on {rev}: {header}"); + }; + let Some(ty) = it.next() else { + panic!("unexpected cat-file header on {rev}: {header}"); + }; + + // Types may have no `size` field, like "missing" or "submodule" + let size: usize = it + .next()? + .parse() + .unwrap_or_else(|e| panic!("invalid size in cat-file header on {rev}: {header}: {e}")); + + let mut buf = vec![0u8; size]; + self.stdout.read_exact(&mut buf).expect("short read"); + + let mut nl = [0]; + self.stdout + .read_exact(&mut nl) + .expect("missing trailing LF"); + + // Only blobs are regular file content + if ty != "blob" { + return None; + } + + Some(buf) + } + + /// Like [`CatFile::get`] but returns only UTF-8 string. + fn get_text(&mut self, rev: &str, path: &str) -> Option { + self.get(rev, path).and_then(|b| String::from_utf8(b).ok()) + } +} + /// Local enum for test configuration (maps to ParseOptions). #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum TestMode { @@ -154,57 +249,6 @@ fn git(repo: &Path, args: &[&str]) -> String { String::from_utf8_lossy(&output.stdout).into_owned() } -/// Check if a path is a submodule at a specific commit. -fn is_submodule(repo: &Path, commit: &str, path: &str) -> bool { - let mut cmd = Command::new("git"); - cmd.env("GIT_CONFIG_NOSYSTEM", "1"); - cmd.env("GIT_CONFIG_GLOBAL", "/dev/null"); - cmd.arg("-C").arg(repo); - cmd.args(["ls-tree", "--format=%(objectmode)", commit, "--", path]); - - let output = cmd.output().expect("failed to execute git ls-tree"); - - if !output.status.success() { - return false; - } - - String::from_utf8_lossy(&output.stdout).trim() == "160000" -} - -/// Get file content at a specific commit as bytes. -/// -/// Returns `None` if the path is a submodule. -fn file_at_commit_bytes(repo: &Path, commit: &str, path: &str) -> Option> { - if is_submodule(repo, commit, path) { - return None; - } - - let mut cmd = Command::new("git"); - cmd.env("GIT_CONFIG_NOSYSTEM", "1"); - cmd.env("GIT_CONFIG_GLOBAL", "/dev/null"); - cmd.arg("-C").arg(repo); - cmd.args(["show", &format!("{commit}:{path}")]); - - let output = cmd.output().expect("failed to execute git show"); - - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - panic!("file {path} doesn't exist at {commit}: {stderr}"); - } - - Some(output.stdout) -} - -/// Get file content at a specific commit as text. -/// -/// Returns `None` if: -/// -/// * The path is a submodule -/// * The file is binary (not valid UTF-8) -fn file_at_commit(repo: &Path, commit: &str, path: &str) -> Option { - file_at_commit_bytes(repo, commit, path).and_then(|b| String::from_utf8(b).ok()) -} - /// Get the list of commits from oldest to newest. fn commit_history(repo: &Path, selection: &CommitSelection) -> Vec { match selection { @@ -268,7 +312,13 @@ fn count_type_changes(raw: &str) -> usize { .count() } -fn process_commit(repo: &Path, parent: &str, child: &str, mode: TestMode) -> CommitResult { +fn process_commit( + cat: &mut CatFile, + repo: &Path, + parent: &str, + child: &str, + mode: TestMode, +) -> CommitResult { let parent_short = parent[..8].to_string(); let child_short = child[..8].to_string(); let mut files = Vec::new(); @@ -391,7 +441,7 @@ fn process_commit(repo: &Path, parent: &str, child: &str, mode: TestMode) -> Com match file_patch.patch() { PatchKind::Text(patch) => { let base_content = if let Some(path) = base_path { - let Some(content) = file_at_commit(repo, parent, path) else { + let Some(content) = cat.get_text(parent, path) else { skipped += 1; continue; }; @@ -401,7 +451,7 @@ fn process_commit(repo: &Path, parent: &str, child: &str, mode: TestMode) -> Com }; let expected_content = if let Some(path) = target_path { - let Some(content) = file_at_commit(repo, child, path) else { + let Some(content) = cat.get_text(child, path) else { skipped += 1; continue; }; @@ -482,25 +532,28 @@ fn replay() { total_skipped: 0, }); - (0..total_diffs).into_par_iter().for_each(|i| { - let result = process_commit(&repo, &commits[i], &commits[i + 1], mode); - - let completed = { - let mut p = progress.lock().unwrap(); - p.completed += 1; - p.total_applied += result.applied; - p.total_skipped += result.skipped; - p.completed - }; + (0..total_diffs).into_par_iter().for_each_init( + || CatFile::new(&repo), + |cat, i| { + let result = process_commit(cat, &repo, &commits[i], &commits[i + 1], mode); + + let completed = { + let mut p = progress.lock().unwrap(); + p.completed += 1; + p.total_applied += result.applied; + p.total_skipped += result.skipped; + p.completed + }; - eprintln!( - "[{completed}/{total_diffs}] ({repo_name}, {mode_name}) Processing {}..{}", - result.parent_short, result.child_short - ); - for desc in &result.files { - eprintln!(" ✓ {desc}"); - } - }); + eprintln!( + "[{completed}/{total_diffs}] ({repo_name}, {mode_name}) Processing {}..{}", + result.parent_short, result.child_short + ); + for desc in &result.files { + eprintln!(" ✓ {desc}"); + } + }, + ); let p = progress.lock().unwrap(); eprintln!( From 3586742eebbee34b14d7d9ad5f13f11d7e4b49c6 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Mon, 13 Apr 2026 14:02:14 -0400 Subject: [PATCH 2/2] test(replay): combine `--raw` and `--numstat` into single git call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduces subprocess count from 3 to 2 per commit pair. Benchmark result for full history replay (on 96-cores Linux host): | Repo | master | this | Speedup | |-----------------|----------|---------|---------| | rust-lang/cargo | 59.40s | 5.61s | 10.6× | | rust-lang/rust | 1682.06s | 101.66s | 16.5× | | python/cpython | 439.20s | 106.31s | 4.1× | On macOS M1 10 cores mahchine: | Repo | master | this | Speedup | |-----------------------|-------:|-------:|--------:| | bmwill/diffy (full) | 34.01s | 8.91s | 3.8× | | rust-lang/cargo (200) | 96.22s | 11.58s | 8.3× | --- tests/replay.rs | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/tests/replay.rs b/tests/replay.rs index f1701e4..7eaa56b 100644 --- a/tests/replay.rs +++ b/tests/replay.rs @@ -281,7 +281,7 @@ fn commit_history(repo: &Path, selection: &CommitSelection) -> Vec { } } -/// Count type-change entries (`T` status) in `git diff --raw` output. +/// Check if a `git diff --raw` line is a type change (status `T`). /// /// Type changes (e.g., symlink → regular file) produce two patches /// (delete + create) but only one `--raw` line. @@ -300,16 +300,12 @@ fn commit_history(repo: &Path, selection: &CommitSelection) -> Vec { /// /// See for /// the `--raw` format specification. -fn count_type_changes(raw: &str) -> usize { - raw.lines() - .filter(|l| !l.is_empty()) - .filter(|line| { - // --raw format: `:old_mode new_mode old_hash new_hash status\tpath` - line.split('\t') - .next() - .is_some_and(|meta| meta.ends_with(" T")) - }) - .count() +fn is_type_change(raw_line: &str) -> bool { + // --raw format: `:old_mode new_mode old_hash new_hash status\tpath` + raw_line + .split('\t') + .next() + .is_some_and(|meta| meta.ends_with(" T")) } fn process_commit( @@ -352,24 +348,29 @@ fn process_commit( // 2b08718b..06c93976 for examples. let expected_file_count = match mode { TestMode::UniDiff => { + // Combine `--raw` and `--numstat` into a single git call. + // Output: raw lines (start with `:`) followed by numstat lines. + // // `--numstat` format: // - `added\tdeleted\tpath` for text files // - `-\t-\tpath` for binary files (skipped - no patch data in unidiff) // - `0\t0\tpath` for empty/no-content changes (skipped) - let numstat = git(repo, &["diff", "--numstat", "--no-renames", parent, child]); - let text_files = numstat - .lines() - .filter(|l| !l.is_empty()) - .fold(0, |count, line| { - if line.starts_with("-\t-\t") || line.starts_with("0\t0\t") { - skipped += 1; - count - } else { - count + 1 + let raw_numstat = git( + repo, + &["diff", "--raw", "--numstat", "--no-renames", parent, child], + ); + let (mut type_changes, mut text_files) = (0, 0); + for line in raw_numstat.lines().filter(|l| !l.is_empty()) { + if line.starts_with(':') { + if is_type_change(line) { + type_changes += 1; } - }); - let raw = git(repo, &["diff", "--raw", "--no-renames", parent, child]); - let type_changes = count_type_changes(&raw); + } else if line.starts_with("-\t-\t") || line.starts_with("0\t0\t") { + skipped += 1; + } else { + text_files += 1; + } + } text_files + type_changes } };