372 KiB
PR #1770: Add a TurnDiffTracker to create a unified diff for an entire turn
- URL: https://github.com/openai/codex/pull/1770
- Author: gpeal
- Created: 2025-07-31 23:42:44 UTC
- Updated: 2025-08-04 16:55:01 UTC
- Changes: +998/-18, Files changed: 9, Commits: 25
Description
This lets us show an accumulating diff across all patches in a turn. Refer to the docs for TurnDiffTracker for implementation details.
There are multiple ways this could have been done, and this felt like the right tradeoff between reliability and completeness.
Pros:
- It will pick up all changes to files that the model touched, including changes made by prettier or any other command that updates those files.
- It will not pick up changes made by the user or other agents to files it didn't modify.
Cons
- It will pick up changes that the user made to a file that the model also touched
- It will not pick up changes to codegen output or any other files that were not modified with apply_patch
Full Diff
diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock
index 7d4e41d0b1..eb4eccd897 100644
--- a/codex-rs/Cargo.lock
+++ b/codex-rs/Cargo.lock
@@ -699,6 +699,7 @@ dependencies = [
"serde_json",
"sha1",
"shlex",
+ "similar",
"strum_macros 0.27.2",
"tempfile",
"thiserror 2.0.12",
diff --git a/codex-rs/core/Cargo.toml b/codex-rs/core/Cargo.toml
index db3fd4f834..466e9adf02 100644
--- a/codex-rs/core/Cargo.toml
+++ b/codex-rs/core/Cargo.toml
@@ -34,6 +34,7 @@ serde_json = "1"
serde_bytes = "0.11"
sha1 = "0.10.6"
shlex = "1.3.0"
+similar = "2.7.0"
strum_macros = "0.27.2"
thiserror = "2.0.12"
time = { version = "0.3", features = ["formatting", "local-offset", "macros"] }
diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs
index 7004dcfcb7..568d87c4a8 100644
--- a/codex-rs/core/src/codex.rs
+++ b/codex-rs/core/src/codex.rs
@@ -85,11 +85,13 @@ use crate::protocol::SandboxPolicy;
use crate::protocol::SessionConfiguredEvent;
use crate::protocol::Submission;
use crate::protocol::TaskCompleteEvent;
+use crate::protocol::TurnDiffEvent;
use crate::rollout::RolloutRecorder;
use crate::safety::SafetyCheck;
use crate::safety::assess_command_safety;
use crate::safety::assess_safety_for_untrusted_command;
use crate::shell;
+use crate::turn_diff_tracker::TurnDiffTracker;
use crate::user_notification::UserNotification;
use crate::util::backoff;
@@ -362,7 +364,11 @@ impl Session {
}
}
- async fn notify_exec_command_begin(&self, exec_command_context: ExecCommandContext) {
+ async fn on_exec_command_begin(
+ &self,
+ turn_diff_tracker: &mut TurnDiffTracker,
+ exec_command_context: ExecCommandContext,
+ ) {
let ExecCommandContext {
sub_id,
call_id,
@@ -374,11 +380,15 @@ impl Session {
Some(ApplyPatchCommandContext {
user_explicitly_approved_this_action,
changes,
- }) => EventMsg::PatchApplyBegin(PatchApplyBeginEvent {
- call_id,
- auto_approved: !user_explicitly_approved_this_action,
- changes,
- }),
+ }) => {
+ turn_diff_tracker.on_patch_begin(&changes);
+
+ EventMsg::PatchApplyBegin(PatchApplyBeginEvent {
+ call_id,
+ auto_approved: !user_explicitly_approved_this_action,
+ changes,
+ })
+ }
None => EventMsg::ExecCommandBegin(ExecCommandBeginEvent {
call_id,
command: command_for_display.clone(),
@@ -392,8 +402,10 @@ impl Session {
let _ = self.tx_event.send(event).await;
}
- async fn notify_exec_command_end(
+ #[allow(clippy::too_many_arguments)]
+ async fn on_exec_command_end(
&self,
+ turn_diff_tracker: &mut TurnDiffTracker,
sub_id: &str,
call_id: &str,
output: &ExecToolCallOutput,
@@ -433,6 +445,20 @@ impl Session {
msg,
};
let _ = self.tx_event.send(event).await;
+
+ // If this is an apply_patch, after we emit the end patch, emit a second event
+ // with the full turn diff if there is one.
+ if is_apply_patch {
+ let unified_diff = turn_diff_tracker.get_unified_diff();
+ if let Ok(Some(unified_diff)) = unified_diff {
+ let msg = EventMsg::TurnDiff(TurnDiffEvent { unified_diff });
+ let event = Event {
+ id: sub_id.into(),
+ msg,
+ };
+ let _ = self.tx_event.send(event).await;
+ }
+ }
}
/// Helper that emits a BackgroundEvent with the given message. This keeps
@@ -1006,6 +1032,10 @@ async fn run_task(sess: Arc<Session>, sub_id: String, input: Vec<InputItem>) {
.await;
let last_agent_message: Option<String>;
+ // Although from the perspective of codex.rs, TurnDiffTracker has the lifecycle of a Task which contains
+ // many turns, from the perspective of the user, it is a single turn.
+ let mut turn_diff_tracker = TurnDiffTracker::new();
+
loop {
// Note that pending_input would be something like a message the user
// submitted through the UI while the model was running. Though the UI
@@ -1037,7 +1067,7 @@ async fn run_task(sess: Arc<Session>, sub_id: String, input: Vec<InputItem>) {
})
})
.collect();
- match run_turn(&sess, sub_id.clone(), turn_input).await {
+ match run_turn(&sess, &mut turn_diff_tracker, sub_id.clone(), turn_input).await {
Ok(turn_output) => {
let mut items_to_record_in_conversation_history = Vec::<ResponseItem>::new();
let mut responses = Vec::<ResponseInputItem>::new();
@@ -1163,6 +1193,7 @@ async fn run_task(sess: Arc<Session>, sub_id: String, input: Vec<InputItem>) {
async fn run_turn(
sess: &Session,
+ turn_diff_tracker: &mut TurnDiffTracker,
sub_id: String,
input: Vec<ResponseItem>,
) -> CodexResult<Vec<ProcessedResponseItem>> {
@@ -1177,7 +1208,7 @@ async fn run_turn(
let mut retries = 0;
loop {
- match try_run_turn(sess, &sub_id, &prompt).await {
+ match try_run_turn(sess, turn_diff_tracker, &sub_id, &prompt).await {
Ok(output) => return Ok(output),
Err(CodexErr::Interrupted) => return Err(CodexErr::Interrupted),
Err(CodexErr::EnvVar(var)) => return Err(CodexErr::EnvVar(var)),
@@ -1223,6 +1254,7 @@ struct ProcessedResponseItem {
async fn try_run_turn(
sess: &Session,
+ turn_diff_tracker: &mut TurnDiffTracker,
sub_id: &str,
prompt: &Prompt,
) -> CodexResult<Vec<ProcessedResponseItem>> {
@@ -1310,7 +1342,8 @@ async fn try_run_turn(
match event {
ResponseEvent::Created => {}
ResponseEvent::OutputItemDone(item) => {
- let response = handle_response_item(sess, sub_id, item.clone()).await?;
+ let response =
+ handle_response_item(sess, turn_diff_tracker, sub_id, item.clone()).await?;
output.push(ProcessedResponseItem { item, response });
}
@@ -1328,6 +1361,16 @@ async fn try_run_turn(
.ok();
}
+ let unified_diff = turn_diff_tracker.get_unified_diff();
+ if let Ok(Some(unified_diff)) = unified_diff {
+ let msg = EventMsg::TurnDiff(TurnDiffEvent { unified_diff });
+ let event = Event {
+ id: sub_id.to_string(),
+ msg,
+ };
+ let _ = sess.tx_event.send(event).await;
+ }
+
return Ok(output);
}
ResponseEvent::OutputTextDelta(delta) => {
@@ -1432,6 +1475,7 @@ async fn run_compact_task(
async fn handle_response_item(
sess: &Session,
+ turn_diff_tracker: &mut TurnDiffTracker,
sub_id: &str,
item: ResponseItem,
) -> CodexResult<Option<ResponseInputItem>> {
@@ -1469,7 +1513,17 @@ async fn handle_response_item(
..
} => {
info!("FunctionCall: {arguments}");
- Some(handle_function_call(sess, sub_id.to_string(), name, arguments, call_id).await)
+ Some(
+ handle_function_call(
+ sess,
+ turn_diff_tracker,
+ sub_id.to_string(),
+ name,
+ arguments,
+ call_id,
+ )
+ .await,
+ )
}
ResponseItem::LocalShellCall {
id,
@@ -1504,6 +1558,7 @@ async fn handle_response_item(
handle_container_exec_with_params(
exec_params,
sess,
+ turn_diff_tracker,
sub_id.to_string(),
effective_call_id,
)
@@ -1521,6 +1576,7 @@ async fn handle_response_item(
async fn handle_function_call(
sess: &Session,
+ turn_diff_tracker: &mut TurnDiffTracker,
sub_id: String,
name: String,
arguments: String,
@@ -1534,7 +1590,8 @@ async fn handle_function_call(
return *output;
}
};
- handle_container_exec_with_params(params, sess, sub_id, call_id).await
+ handle_container_exec_with_params(params, sess, turn_diff_tracker, sub_id, call_id)
+ .await
}
"update_plan" => handle_update_plan(sess, arguments, sub_id, call_id).await,
_ => {
@@ -1608,6 +1665,7 @@ fn maybe_run_with_user_profile(params: ExecParams, sess: &Session) -> ExecParams
async fn handle_container_exec_with_params(
params: ExecParams,
sess: &Session,
+ turn_diff_tracker: &mut TurnDiffTracker,
sub_id: String,
call_id: String,
) -> ResponseInputItem {
@@ -1755,7 +1813,7 @@ async fn handle_container_exec_with_params(
},
),
};
- sess.notify_exec_command_begin(exec_command_context.clone())
+ sess.on_exec_command_begin(turn_diff_tracker, exec_command_context.clone())
.await;
let params = maybe_run_with_user_profile(params, sess);
@@ -1782,7 +1840,8 @@ async fn handle_container_exec_with_params(
duration,
} = &output;
- sess.notify_exec_command_end(
+ sess.on_exec_command_end(
+ turn_diff_tracker,
&sub_id,
&call_id,
&output,
@@ -1806,7 +1865,15 @@ async fn handle_container_exec_with_params(
}
}
Err(CodexErr::Sandbox(error)) => {
- handle_sandbox_error(params, exec_command_context, error, sandbox_type, sess).await
+ handle_sandbox_error(
+ turn_diff_tracker,
+ params,
+ exec_command_context,
+ error,
+ sandbox_type,
+ sess,
+ )
+ .await
}
Err(e) => {
// Handle non-sandbox errors
@@ -1822,6 +1889,7 @@ async fn handle_container_exec_with_params(
}
async fn handle_sandbox_error(
+ turn_diff_tracker: &mut TurnDiffTracker,
params: ExecParams,
exec_command_context: ExecCommandContext,
error: SandboxErr,
@@ -1878,7 +1946,8 @@ async fn handle_sandbox_error(
sess.notify_background_event(&sub_id, "retrying command without sandbox")
.await;
- sess.notify_exec_command_begin(exec_command_context).await;
+ sess.on_exec_command_begin(turn_diff_tracker, exec_command_context)
+ .await;
// This is an escalated retry; the policy will not be
// examined and the sandbox has been set to `None`.
@@ -1905,8 +1974,14 @@ async fn handle_sandbox_error(
duration,
} = &retry_output;
- sess.notify_exec_command_end(&sub_id, &call_id, &retry_output, is_apply_patch)
- .await;
+ sess.on_exec_command_end(
+ turn_diff_tracker,
+ &sub_id,
+ &call_id,
+ &retry_output,
+ is_apply_patch,
+ )
+ .await;
let is_success = *exit_code == 0;
let content = format_exec_output(
diff --git a/codex-rs/core/src/lib.rs b/codex-rs/core/src/lib.rs
index 80f9014954..4f083d9e56 100644
--- a/codex-rs/core/src/lib.rs
+++ b/codex-rs/core/src/lib.rs
@@ -42,6 +42,7 @@ pub(crate) mod safety;
pub mod seatbelt;
pub mod shell;
pub mod spawn;
+pub mod turn_diff_tracker;
mod user_notification;
pub mod util;
diff --git a/codex-rs/core/src/protocol.rs b/codex-rs/core/src/protocol.rs
index cbb211d955..82591a2c78 100644
--- a/codex-rs/core/src/protocol.rs
+++ b/codex-rs/core/src/protocol.rs
@@ -387,6 +387,8 @@ pub enum EventMsg {
/// Notification that a patch application has finished.
PatchApplyEnd(PatchApplyEndEvent),
+ TurnDiff(TurnDiffEvent),
+
/// Response to GetHistoryEntryRequest.
GetHistoryEntryResponse(GetHistoryEntryResponseEvent),
@@ -598,6 +600,11 @@ pub struct PatchApplyEndEvent {
pub success: bool,
}
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct TurnDiffEvent {
+ pub unified_diff: String,
+}
+
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct GetHistoryEntryResponseEvent {
pub offset: usize,
diff --git a/codex-rs/core/src/turn_diff_tracker.rs b/codex-rs/core/src/turn_diff_tracker.rs
new file mode 100644
index 0000000000..7026d7bb32
--- /dev/null
+++ b/codex-rs/core/src/turn_diff_tracker.rs
@@ -0,0 +1,887 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use sha1::digest::Output;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+const ZERO_OID: &str = "0000000000000000000000000000000000000000";
+const DEV_NULL: &str = "/dev/null";
+
+struct BaselineFileInfo {
+ path: PathBuf,
+ content: Vec<u8>,
+ mode: FileMode,
+ oid: String,
+}
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Maintain an in-memory baseline snapshot of files when they are first seen.
+/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null).
+/// 2. Keep a stable internal filename (uuid) per external path for rename tracking.
+/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory
+/// using the `similar` crate and emit unified diffs with rewritten external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Map external path -> internal filename (uuid).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> baseline file info.
+ baseline_file_info: HashMap<String, BaselineFileInfo>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_path: HashMap<String, PathBuf>,
+ /// Cache of known git worktree roots to avoid repeated filesystem walks.
+ git_root_cache: Vec<PathBuf>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) {
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = Uuid::new_v4().to_string();
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_current_path
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ let baseline_file_info = if path.exists() {
+ let mode = file_mode_for_path(path);
+ let mode_val = mode.unwrap_or(FileMode::Regular);
+ let content = blob_bytes(path, &mode_val).unwrap_or_default();
+ let oid = if mode == Some(FileMode::Symlink) {
+ format!("{:x}", git_blob_sha1_hex_bytes(&content))
+ } else {
+ self.git_blob_oid_for_path(path)
+ .unwrap_or_else(|| format!("{:x}", git_blob_sha1_hex_bytes(&content)))
+ };
+ Some(BaselineFileInfo {
+ path: path.clone(),
+ content,
+ mode: mode_val,
+ oid,
+ })
+ } else {
+ Some(BaselineFileInfo {
+ path: path.clone(),
+ content: vec![],
+ mode: FileMode::Regular,
+ oid: ZERO_OID.to_string(),
+ })
+ };
+
+ if let Some(baseline_file_info) = baseline_file_info {
+ self.baseline_file_info
+ .insert(internal.clone(), baseline_file_info);
+ }
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ if let FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } = change
+ {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = Uuid::new_v4().to_string();
+ self.baseline_file_info.insert(
+ i.clone(),
+ BaselineFileInfo {
+ path: path.clone(),
+ content: vec![],
+ mode: FileMode::Regular,
+ oid: ZERO_OID.to_string(),
+ },
+ );
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_path
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ };
+ }
+ }
+
+ fn get_path_for_internal(&self, internal: &str) -> Option<PathBuf> {
+ self.temp_name_to_current_path
+ .get(internal)
+ .cloned()
+ .or_else(|| {
+ self.baseline_file_info
+ .get(internal)
+ .map(|info| info.path.clone())
+ })
+ }
+
+ /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry.
+ /// Uses a simple cache of known roots and avoids negative-result caching for simplicity.
+ fn find_git_root_cached(&mut self, start: &Path) -> Option<PathBuf> {
+ let dir = if start.is_dir() {
+ start
+ } else {
+ start.parent()?
+ };
+
+ // Fast path: if any cached root is an ancestor of this path, use it.
+ if let Some(root) = self
+ .git_root_cache
+ .iter()
+ .find(|r| dir.starts_with(r))
+ .cloned()
+ {
+ return Some(root);
+ }
+
+ // Walk up to find a `.git` marker.
+ let mut cur = dir.to_path_buf();
+ loop {
+ let git_marker = cur.join(".git");
+ if git_marker.is_dir() || git_marker.is_file() {
+ if !self.git_root_cache.iter().any(|r| r == &cur) {
+ self.git_root_cache.push(cur.clone());
+ }
+ return Some(cur);
+ }
+
+ // On Windows, avoid walking above the drive or UNC share root.
+ #[cfg(windows)]
+ {
+ if is_windows_drive_or_unc_root(&cur) {
+ return None;
+ }
+ }
+
+ if let Some(parent) = cur.parent() {
+ cur = parent.to_path_buf();
+ } else {
+ return None;
+ }
+ }
+ }
+
+ /// Return a display string for `path` relative to its git root if found, else absolute.
+ fn relative_to_git_root_str(&mut self, path: &Path) -> String {
+ let s = if let Some(root) = self.find_git_root_cached(path) {
+ if let Ok(rel) = path.strip_prefix(&root) {
+ rel.display().to_string()
+ } else {
+ path.display().to_string()
+ }
+ } else {
+ path.display().to_string()
+ };
+ s.replace('\\', "/")
+ }
+
+ /// Ask git to compute the blob SHA-1 for the file at `path` within its repository.
+ /// Returns None if no repository is found or git invocation fails.
+ fn git_blob_oid_for_path(&mut self, path: &Path) -> Option<String> {
+ let root = self.find_git_root_cached(path)?;
+ // Compute a path relative to the repo root for better portability across platforms.
+ let rel = path.strip_prefix(&root).unwrap_or(path);
+ let output = Command::new("git")
+ .arg("-C")
+ .arg(&root)
+ .arg("hash-object")
+ .arg("--")
+ .arg(rel)
+ .output()
+ .ok()?;
+ if !output.status.success() {
+ return None;
+ }
+ let s = String::from_utf8_lossy(&output.stdout).trim().to_string();
+ if s.len() == 40 { Some(s) } else { None }
+ }
+
+ /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were
+ /// collected before the first time they were touched by apply_patch during this turn with
+ /// the current repo state.
+ pub fn get_unified_diff(&mut self) -> Result<Option<String>> {
+ let mut aggregated = String::new();
+
+ // Compute diffs per tracked internal file in a stable order by external path.
+ let mut baseline_file_names: Vec<String> =
+ self.baseline_file_info.keys().cloned().collect();
+ // Sort lexicographically by full repo-relative path to match git behavior.
+ baseline_file_names.sort_by_key(|internal| {
+ self.get_path_for_internal(internal)
+ .map(|p| self.relative_to_git_root_str(&p))
+ .unwrap_or_default()
+ });
+
+ for internal in baseline_file_names {
+ aggregated.push_str(self.get_file_diff(&internal).as_str());
+ if !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ }
+
+ if aggregated.trim().is_empty() {
+ Ok(None)
+ } else {
+ Ok(Some(aggregated))
+ }
+ }
+
+ fn get_file_diff(&mut self, internal_file_name: &str) -> String {
+ let mut aggregated = String::new();
+
+ // Snapshot lightweight fields only.
+ let (baseline_external_path, baseline_mode, left_oid) = {
+ if let Some(info) = self.baseline_file_info.get(internal_file_name) {
+ (info.path.clone(), info.mode, info.oid.clone())
+ } else {
+ (PathBuf::new(), FileMode::Regular, ZERO_OID.to_string())
+ }
+ };
+ let current_external_path = match self.get_path_for_internal(internal_file_name) {
+ Some(p) => p,
+ None => return aggregated,
+ };
+
+ let current_mode = file_mode_for_path(¤t_external_path).unwrap_or(FileMode::Regular);
+ let right_bytes = blob_bytes(¤t_external_path, ¤t_mode);
+
+ // Compute displays with &mut self before borrowing any baseline content.
+ let left_display = self.relative_to_git_root_str(&baseline_external_path);
+ let right_display = self.relative_to_git_root_str(¤t_external_path);
+
+ // Compute right oid before borrowing baseline content.
+ let right_oid = if let Some(b) = right_bytes.as_ref() {
+ if current_mode == FileMode::Symlink {
+ format!("{:x}", git_blob_sha1_hex_bytes(b))
+ } else {
+ self.git_blob_oid_for_path(¤t_external_path)
+ .unwrap_or_else(|| format!("{:x}", git_blob_sha1_hex_bytes(b)))
+ }
+ } else {
+ ZERO_OID.to_string()
+ };
+
+ // Borrow baseline content only after all &mut self uses are done.
+ let left_present = left_oid.as_str() != ZERO_OID;
+ let left_bytes: Option<&[u8]> = if left_present {
+ self.baseline_file_info
+ .get(internal_file_name)
+ .map(|i| i.content.as_slice())
+ } else {
+ None
+ };
+
+ // Fast path: identical bytes or both missing.
+ if left_bytes == right_bytes.as_deref() {
+ return aggregated;
+ }
+
+ aggregated.push_str(&format!("diff --git a/{left_display} b/{right_display}\n"));
+
+ let is_add = !left_present && right_bytes.is_some();
+ let is_delete = left_present && right_bytes.is_none();
+
+ if is_add {
+ aggregated.push_str(&format!("new file mode {current_mode}\n"));
+ } else if is_delete {
+ aggregated.push_str(&format!("deleted file mode {baseline_mode}\n"));
+ } else if baseline_mode != current_mode {
+ aggregated.push_str(&format!("old mode {baseline_mode}\n"));
+ aggregated.push_str(&format!("new mode {current_mode}\n"));
+ }
+
+ let left_text = left_bytes.and_then(|b| std::str::from_utf8(b).ok());
+ let right_text = right_bytes
+ .as_deref()
+ .and_then(|b| std::str::from_utf8(b).ok());
+
+ let can_text_diff = matches!(
+ (left_text, right_text, is_add, is_delete),
+ (Some(_), Some(_), _, _) | (_, Some(_), true, _) | (Some(_), _, _, true)
+ );
+
+ if can_text_diff {
+ let l = left_text.unwrap_or("");
+ let r = right_text.unwrap_or("");
+
+ aggregated.push_str(&format!("index {left_oid}..{right_oid}\n"));
+
+ let old_header = if left_present {
+ format!("a/{left_display}")
+ } else {
+ DEV_NULL.to_string()
+ };
+ let new_header = if right_bytes.is_some() {
+ format!("b/{right_display}")
+ } else {
+ DEV_NULL.to_string()
+ };
+
+ let diff = similar::TextDiff::from_lines(l, r);
+ let unified = diff
+ .unified_diff()
+ .context_radius(3)
+ .header(&old_header, &new_header)
+ .to_string();
+
+ aggregated.push_str(&unified);
+ } else {
+ aggregated.push_str(&format!("index {left_oid}..{right_oid}\n"));
+ let old_header = if left_present {
+ format!("a/{left_display}")
+ } else {
+ DEV_NULL.to_string()
+ };
+ let new_header = if right_bytes.is_some() {
+ format!("b/{right_display}")
+ } else {
+ DEV_NULL.to_string()
+ };
+ aggregated.push_str(&format!("--- {old_header}\n"));
+ aggregated.push_str(&format!("+++ {new_header}\n"));
+ aggregated.push_str("Binary files differ\n");
+ }
+ aggregated
+ }
+}
+
+/// Compute the Git SHA-1 blob object ID for the given content (bytes).
+fn git_blob_sha1_hex_bytes(data: &[u8]) -> Output<sha1::Sha1> {
+ // Git blob hash is sha1 of: "blob <len>\0<data>"
+ let header = format!("blob {}\0", data.len());
+ use sha1::Digest;
+ let mut hasher = sha1::Sha1::new();
+ hasher.update(header.as_bytes());
+ hasher.update(data);
+ hasher.finalize()
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum FileMode {
+ Regular,
+ #[cfg(unix)]
+ Executable,
+ Symlink,
+}
+
+impl FileMode {
+ fn as_str(&self) -> &'static str {
+ match self {
+ FileMode::Regular => "100644",
+ #[cfg(unix)]
+ FileMode::Executable => "100755",
+ FileMode::Symlink => "120000",
+ }
+ }
+}
+
+impl std::fmt::Display for FileMode {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ f.write_str(self.as_str())
+ }
+}
+
+#[cfg(unix)]
+fn file_mode_for_path(path: &Path) -> Option<FileMode> {
+ use std::os::unix::fs::PermissionsExt;
+ let meta = fs::symlink_metadata(path).ok()?;
+ let ft = meta.file_type();
+ if ft.is_symlink() {
+ return Some(FileMode::Symlink);
+ }
+ let mode = meta.permissions().mode();
+ let is_exec = (mode & 0o111) != 0;
+ Some(if is_exec {
+ FileMode::Executable
+ } else {
+ FileMode::Regular
+ })
+}
+
+#[cfg(not(unix))]
+fn file_mode_for_path(_path: &Path) -> Option<FileMode> {
+ // Default to non-executable on non-unix.
+ Some(FileMode::Regular)
+}
+
+fn blob_bytes(path: &Path, mode: &FileMode) -> Option<Vec<u8>> {
+ if path.exists() {
+ let contents = if *mode == FileMode::Symlink {
+ symlink_blob_bytes(path)
+ .ok_or_else(|| anyhow!("failed to read symlink target for {}", path.display()))
+ } else {
+ fs::read(path)
+ .with_context(|| format!("failed to read current file for diff {}", path.display()))
+ };
+ contents.ok()
+ } else {
+ None
+ }
+}
+
+#[cfg(unix)]
+fn symlink_blob_bytes(path: &Path) -> Option<Vec<u8>> {
+ use std::os::unix::ffi::OsStrExt;
+ let target = std::fs::read_link(path).ok()?;
+ Some(target.as_os_str().as_bytes().to_vec())
+}
+
+#[cfg(not(unix))]
+fn symlink_blob_bytes(_path: &Path) -> Option<Vec<u8>> {
+ None
+}
+
+#[cfg(windows)]
+fn is_windows_drive_or_unc_root(p: &std::path::Path) -> bool {
+ use std::path::Component;
+ let mut comps = p.components();
+ matches!(
+ (comps.next(), comps.next(), comps.next()),
+ (Some(Component::Prefix(_)), Some(Component::RootDir), None)
+ )
+}
+
+#[cfg(test)]
+mod tests {
+ #![allow(clippy::unwrap_used)]
+ use super::*;
+ use pretty_assertions::assert_eq;
+ use tempfile::tempdir;
+
+ /// Compute the Git SHA-1 blob object ID for the given content (string).
+ /// This delegates to the bytes version to avoid UTF-8 lossy conversions here.
+ fn git_blob_sha1_hex(data: &str) -> String {
+ format!("{:x}", git_blob_sha1_hex_bytes(data.as_bytes()))
+ }
+
+ fn normalize_diff_for_test(input: &str, root: &Path) -> String {
+ let root_str = root.display().to_string().replace('\\', "/");
+ let replaced = input.replace(&root_str, "<TMP>");
+ // Split into blocks on lines starting with "diff --git ", sort blocks for determinism, and rejoin
+ let mut blocks: Vec<String> = Vec::new();
+ let mut current = String::new();
+ for line in replaced.lines() {
+ if line.starts_with("diff --git ") && !current.is_empty() {
+ blocks.push(current);
+ current = String::new();
+ }
+ if !current.is_empty() {
+ current.push('\n');
+ }
+ current.push_str(line);
+ }
+ if !current.is_empty() {
+ blocks.push(current);
+ }
+ blocks.sort();
+ let mut out = blocks.join("\n");
+ if !out.ends_with('\n') {
+ out.push('\n');
+ }
+ out
+ }
+
+ #[test]
+ fn accumulates_add_and_update() {
+ let mut acc = TurnDiffTracker::new();
+
+ let dir = tempdir().unwrap();
+ let file = dir.path().join("a.txt");
+
+ // First patch: add file (baseline should be /dev/null).
+ let add_changes = HashMap::from([(
+ file.clone(),
+ FileChange::Add {
+ content: "foo\n".to_string(),
+ },
+ )]);
+ acc.on_patch_begin(&add_changes);
+
+ // Simulate apply: create the file on disk.
+ fs::write(&file, "foo\n").unwrap();
+ let first = acc.get_unified_diff().unwrap().unwrap();
+ let first = normalize_diff_for_test(&first, dir.path());
+ let expected_first = {
+ let mode = file_mode_for_path(&file).unwrap_or(FileMode::Regular);
+ let right_oid = git_blob_sha1_hex("foo\n");
+ format!(
+ r#"diff --git a/<TMP>/a.txt b/<TMP>/a.txt
+new file mode {mode}
+index {ZERO_OID}..{right_oid}
+--- {DEV_NULL}
++++ b/<TMP>/a.txt
+@@ -0,0 +1 @@
++foo
+"#,
+ )
+ };
+ assert_eq!(first, expected_first);
+
+ // Second patch: update the file on disk.
+ let update_changes = HashMap::from([(
+ file.clone(),
+ FileChange::Update {
+ unified_diff: "".to_owned(),
+ move_path: None,
+ },
+ )]);
+ acc.on_patch_begin(&update_changes);
+
+ // Simulate apply: append a new line.
+ fs::write(&file, "foo\nbar\n").unwrap();
+ let combined = acc.get_unified_diff().unwrap().unwrap();
+ let combined = normalize_diff_for_test(&combined, dir.path());
+ let expected_combined = {
+ let mode = file_mode_for_path(&file).unwrap_or(FileMode::Regular);
+ let right_oid = git_blob_sha1_hex("foo\nbar\n");
+ format!(
+ r#"diff --git a/<TMP>/a.txt b/<TMP>/a.txt
+new file mode {mode}
+index {ZERO_OID}..{right_oid}
+--- {DEV_NULL}
++++ b/<TMP>/a.txt
+@@ -0,0 +1,2 @@
++foo
++bar
+"#,
+ )
+ };
+ assert_eq!(combined, expected_combined);
+ }
+
+ #[test]
+ fn accumulates_delete() {
+ let dir = tempdir().unwrap();
+ let file = dir.path().join("b.txt");
+ fs::write(&file, "x\n").unwrap();
+
+ let mut acc = TurnDiffTracker::new();
+ let del_changes = HashMap::from([(file.clone(), FileChange::Delete)]);
+ acc.on_patch_begin(&del_changes);
+
+ // Simulate apply: delete the file from disk.
+ let baseline_mode = file_mode_for_path(&file).unwrap_or(FileMode::Regular);
+ fs::remove_file(&file).unwrap();
+ let diff = acc.get_unified_diff().unwrap().unwrap();
+ let diff = normalize_diff_for_test(&diff, dir.path());
+ let expected = {
+ let left_oid = git_blob_sha1_hex("x\n");
+ format!(
+ r#"diff --git a/<TMP>/b.txt b/<TMP>/b.txt
+deleted file mode {baseline_mode}
+index {left_oid}..{ZERO_OID}
+--- a/<TMP>/b.txt
++++ {DEV_NULL}
+@@ -1 +0,0 @@
+-x
+"#,
+ )
+ };
+ assert_eq!(diff, expected);
+ }
+
+ #[test]
+ fn accumulates_move_and_update() {
+ let dir = tempdir().unwrap();
+ let src = dir.path().join("src.txt");
+ let dest = dir.path().join("dst.txt");
+ fs::write(&src, "line\n").unwrap();
+
+ let mut acc = TurnDiffTracker::new();
+ let mv_changes = HashMap::from([(
+ src.clone(),
+ FileChange::Update {
+ unified_diff: "".to_owned(),
+ move_path: Some(dest.clone()),
+ },
+ )]);
+ acc.on_patch_begin(&mv_changes);
+
+ // Simulate apply: move and update content.
+ fs::rename(&src, &dest).unwrap();
+ fs::write(&dest, "line2\n").unwrap();
+
+ let out = acc.get_unified_diff().unwrap().unwrap();
+ let out = normalize_diff_for_test(&out, dir.path());
+ let expected = {
+ let left_oid = git_blob_sha1_hex("line\n");
+ let right_oid = git_blob_sha1_hex("line2\n");
+ format!(
+ r#"diff --git a/<TMP>/src.txt b/<TMP>/dst.txt
+index {left_oid}..{right_oid}
+--- a/<TMP>/src.txt
++++ b/<TMP>/dst.txt
+@@ -1 +1 @@
+-line
++line2
+"#
+ )
+ };
+ assert_eq!(out, expected);
+ }
+
+ #[test]
+ fn move_without_1change_yields_no_diff() {
+ let dir = tempdir().unwrap();
+ let src = dir.path().join("moved.txt");
+ let dest = dir.path().join("renamed.txt");
+ fs::write(&src, "same\n").unwrap();
+
+ let mut acc = TurnDiffTracker::new();
+ let mv_changes = HashMap::from([(
+ src.clone(),
+ FileChange::Update {
+ unified_diff: "".to_owned(),
+ move_path: Some(dest.clone()),
+ },
+ )]);
+ acc.on_patch_begin(&mv_changes);
+
+ // Simulate apply: move only, no content change.
+ fs::rename(&src, &dest).unwrap();
+
+ let diff = acc.get_unified_diff().unwrap();
+ assert_eq!(diff, None);
+ }
+
+ #[test]
+ fn move_declared_but_file_only_appears_at_dest_is_add() {
+ let dir = tempdir().unwrap();
+ let src = dir.path().join("src.txt");
+ let dest = dir.path().join("dest.txt");
+ let mut acc = TurnDiffTracker::new();
+ let mv = HashMap::from([(
+ src.clone(),
+ FileChange::Update {
+ unified_diff: "".into(),
+ move_path: Some(dest.clone()),
+ },
+ )]);
+ acc.on_patch_begin(&mv);
+ // No file existed initially; create only dest
+ fs::write(&dest, "hello\n").unwrap();
+ let diff = acc.get_unified_diff().unwrap().unwrap();
+ let diff = normalize_diff_for_test(&diff, dir.path());
+ let expected = {
+ let mode = file_mode_for_path(&dest).unwrap_or(FileMode::Regular);
+ let right_oid = git_blob_sha1_hex("hello\n");
+ format!(
+ r#"diff --git a/<TMP>/src.txt b/<TMP>/dest.txt
+new file mode {mode}
+index {ZERO_OID}..{right_oid}
+--- {DEV_NULL}
++++ b/<TMP>/dest.txt
+@@ -0,0 +1 @@
++hello
+"#,
+ )
+ };
+ assert_eq!(diff, expected);
+ }
+
+ #[test]
+ fn update_persists_across_new_baseline_for_new_file() {
+ let dir = tempdir().unwrap();
+ let a = dir.path().join("a.txt");
+ let b = dir.path().join("b.txt");
+ fs::write(&a, "foo\n").unwrap();
+ fs::write(&b, "z\n").unwrap();
+
+ let mut acc = TurnDiffTracker::new();
+
+ // First: update existing a.txt (baseline snapshot is created for a).
+ let update_a = HashMap::from([(
+ a.clone(),
+ FileChange::Update {
+ unified_diff: "".to_owned(),
+ move_path: None,
+ },
+ )]);
+ acc.on_patch_begin(&update_a);
+ // Simulate apply: modify a.txt on disk.
+ fs::write(&a, "foo\nbar\n").unwrap();
+ let first = acc.get_unified_diff().unwrap().unwrap();
+ let first = normalize_diff_for_test(&first, dir.path());
+ let expected_first = {
+ let left_oid = git_blob_sha1_hex("foo\n");
+ let right_oid = git_blob_sha1_hex("foo\nbar\n");
+ format!(
+ r#"diff --git a/<TMP>/a.txt b/<TMP>/a.txt
+index {left_oid}..{right_oid}
+--- a/<TMP>/a.txt
++++ b/<TMP>/a.txt
+@@ -1 +1,2 @@
+ foo
++bar
+"#
+ )
+ };
+ assert_eq!(first, expected_first);
+
+ // Next: introduce a brand-new path b.txt into baseline snapshots via a delete change.
+ let del_b = HashMap::from([(b.clone(), FileChange::Delete)]);
+ acc.on_patch_begin(&del_b);
+ // Simulate apply: delete b.txt.
+ let baseline_mode = file_mode_for_path(&b).unwrap_or(FileMode::Regular);
+ fs::remove_file(&b).unwrap();
+
+ let combined = acc.get_unified_diff().unwrap().unwrap();
+ let combined = normalize_diff_for_test(&combined, dir.path());
+ let expected = {
+ let left_oid_a = git_blob_sha1_hex("foo\n");
+ let right_oid_a = git_blob_sha1_hex("foo\nbar\n");
+ let left_oid_b = git_blob_sha1_hex("z\n");
+ format!(
+ r#"diff --git a/<TMP>/a.txt b/<TMP>/a.txt
+index {left_oid_a}..{right_oid_a}
+--- a/<TMP>/a.txt
++++ b/<TMP>/a.txt
+@@ -1 +1,2 @@
+ foo
++bar
+diff --git a/<TMP>/b.txt b/<TMP>/b.txt
+deleted file mode {baseline_mode}
+index {left_oid_b}..{ZERO_OID}
+--- a/<TMP>/b.txt
++++ {DEV_NULL}
+@@ -1 +0,0 @@
+-z
+"#,
+ )
+ };
+ assert_eq!(combined, expected);
+ }
+
+ #[test]
+ fn binary_files_differ_update() {
+ let dir = tempdir().unwrap();
+ let file = dir.path().join("bin.dat");
+
+ // Initial non-UTF8 bytes
+ let left_bytes: Vec<u8> = vec![0xff, 0xfe, 0xfd, 0x00];
+ // Updated non-UTF8 bytes
+ let right_bytes: Vec<u8> = vec![0x01, 0x02, 0x03, 0x00];
+
+ fs::write(&file, &left_bytes).unwrap();
+
+ let mut acc = TurnDiffTracker::new();
+ let update_changes = HashMap::from([(
+ file.clone(),
+ FileChange::Update {
+ unified_diff: "".to_owned(),
+ move_path: None,
+ },
+ )]);
+ acc.on_patch_begin(&update_changes);
+
+ // Apply update on disk
+ fs::write(&file, &right_bytes).unwrap();
+
+ let diff = acc.get_unified_diff().unwrap().unwrap();
+ let diff = normalize_diff_for_test(&diff, dir.path());
+ let expected = {
+ let left_oid = format!("{:x}", git_blob_sha1_hex_bytes(&left_bytes));
+ let right_oid = format!("{:x}", git_blob_sha1_hex_bytes(&right_bytes));
+ format!(
+ r#"diff --git a/<TMP>/bin.dat b/<TMP>/bin.dat
+index {left_oid}..{right_oid}
+--- a/<TMP>/bin.dat
++++ b/<TMP>/bin.dat
+Binary files differ
+"#
+ )
+ };
+ assert_eq!(diff, expected);
+ }
+
+ #[test]
+ fn filenames_with_spaces_add_and_update() {
+ let mut acc = TurnDiffTracker::new();
+
+ let dir = tempdir().unwrap();
+ let file = dir.path().join("name with spaces.txt");
+
+ // First patch: add file (baseline should be /dev/null).
+ let add_changes = HashMap::from([(
+ file.clone(),
+ FileChange::Add {
+ content: "foo\n".to_string(),
+ },
+ )]);
+ acc.on_patch_begin(&add_changes);
+
+ // Simulate apply: create the file on disk.
+ fs::write(&file, "foo\n").unwrap();
+ let first = acc.get_unified_diff().unwrap().unwrap();
+ let first = normalize_diff_for_test(&first, dir.path());
+ let expected_first = {
+ let mode = file_mode_for_path(&file).unwrap_or(FileMode::Regular);
+ let right_oid = git_blob_sha1_hex("foo\n");
+ format!(
+ r#"diff --git a/<TMP>/name with spaces.txt b/<TMP>/name with spaces.txt
+new file mode {mode}
+index {ZERO_OID}..{right_oid}
+--- {DEV_NULL}
++++ b/<TMP>/name with spaces.txt
+@@ -0,0 +1 @@
++foo
+"#,
+ )
+ };
+ assert_eq!(first, expected_first);
+
+ // Second patch: update the file on disk.
+ let update_changes = HashMap::from([(
+ file.clone(),
+ FileChange::Update {
+ unified_diff: "".to_owned(),
+ move_path: None,
+ },
+ )]);
+ acc.on_patch_begin(&update_changes);
+
+ // Simulate apply: append a new line with a space.
+ fs::write(&file, "foo\nbar baz\n").unwrap();
+ let combined = acc.get_unified_diff().unwrap().unwrap();
+ let combined = normalize_diff_for_test(&combined, dir.path());
+ let expected_combined = {
+ let mode = file_mode_for_path(&file).unwrap_or(FileMode::Regular);
+ let right_oid = git_blob_sha1_hex("foo\nbar baz\n");
+ format!(
+ r#"diff --git a/<TMP>/name with spaces.txt b/<TMP>/name with spaces.txt
+new file mode {mode}
+index {ZERO_OID}..{right_oid}
+--- {DEV_NULL}
++++ b/<TMP>/name with spaces.txt
+@@ -0,0 +1,2 @@
++foo
++bar baz
+"#,
+ )
+ };
+ assert_eq!(combined, expected_combined);
+ }
+}
diff --git a/codex-rs/exec/src/event_processor_with_human_output.rs b/codex-rs/exec/src/event_processor_with_human_output.rs
index 72e2f9298f..c290d9336b 100644
--- a/codex-rs/exec/src/event_processor_with_human_output.rs
+++ b/codex-rs/exec/src/event_processor_with_human_output.rs
@@ -20,6 +20,7 @@ use codex_core::protocol::PatchApplyEndEvent;
use codex_core::protocol::SessionConfiguredEvent;
use codex_core::protocol::TaskCompleteEvent;
use codex_core::protocol::TokenUsage;
+use codex_core::protocol::TurnDiffEvent;
use owo_colors::OwoColorize;
use owo_colors::Style;
use shlex::try_join;
@@ -399,6 +400,7 @@ impl EventProcessor for EventProcessorWithHumanOutput {
stdout,
stderr,
success,
+ ..
}) => {
let patch_begin = self.call_id_to_patch.remove(&call_id);
@@ -428,6 +430,10 @@ impl EventProcessor for EventProcessorWithHumanOutput {
println!("{}", line.style(self.dimmed));
}
}
+ EventMsg::TurnDiff(TurnDiffEvent { unified_diff }) => {
+ ts_println!(self, "{}", "turn diff:".style(self.magenta));
+ println!("{unified_diff}");
+ }
EventMsg::ExecApprovalRequest(_) => {
// Should we exit?
}
diff --git a/codex-rs/mcp-server/src/codex_tool_runner.rs b/codex-rs/mcp-server/src/codex_tool_runner.rs
index d489ffe076..205dfa4631 100644
--- a/codex-rs/mcp-server/src/codex_tool_runner.rs
+++ b/codex-rs/mcp-server/src/codex_tool_runner.rs
@@ -263,6 +263,7 @@ async fn run_codex_tool_session_inner(
| EventMsg::BackgroundEvent(_)
| EventMsg::PatchApplyBegin(_)
| EventMsg::PatchApplyEnd(_)
+ | EventMsg::TurnDiff(_)
| EventMsg::GetHistoryEntryResponse(_)
| EventMsg::PlanUpdate(_)
| EventMsg::ShutdownComplete => {
diff --git a/codex-rs/mcp-server/src/conversation_loop.rs b/codex-rs/mcp-server/src/conversation_loop.rs
index 534275181a..1db39a2306 100644
--- a/codex-rs/mcp-server/src/conversation_loop.rs
+++ b/codex-rs/mcp-server/src/conversation_loop.rs
@@ -97,6 +97,7 @@ pub async fn run_conversation_loop(
| EventMsg::McpToolCallEnd(_)
| EventMsg::ExecCommandBegin(_)
| EventMsg::ExecCommandEnd(_)
+ | EventMsg::TurnDiff(_)
| EventMsg::BackgroundEvent(_)
| EventMsg::ExecCommandOutputDelta(_)
| EventMsg::PatchApplyBegin(_)
Review Comments
codex-rs/core/Cargo.toml
- Created: 2025-08-01 16:35:11 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248403984
@@ -51,6 +51,7 @@ tree-sitter-bash = "0.25.0"
uuid = { version = "1", features = ["serde", "v4"] }
whoami = "1.6.0"
wildmatch = "2.4.0"
+tempfile = "3"
@pakrym-oai alpha sort is from saving in the editor, right?
- Created: 2025-08-04 02:42:24 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250289252
@@ -34,6 +34,7 @@ serde_json = "1"
serde_bytes = "0.11"
sha1 = "0.10.6"
shlex = "1.3.0"
+similar = "2"
Maybe we should match
e3565a3f43/codex-rs/apply-patch/Cargo.toml (L15)(or change that one to be"2"?)
codex-rs/core/src/codex.rs
- Created: 2025-08-01 16:48:22 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248425776
@@ -1471,6 +1528,7 @@ fn maybe_run_with_user_profile(params: ExecParams, sess: &Session) -> ExecParams
async fn handle_container_exec_with_params(
params: ExecParams,
sess: &Session,
+ turn_diff_tracker: &mut TurnDiffTracker,
Hmm, what would happen if we wanted to support parallel tool calls at one point. This would be a problem, no?
- Created: 2025-08-01 17:25:22 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248497145
@@ -1471,6 +1528,7 @@ fn maybe_run_with_user_profile(params: ExecParams, sess: &Session) -> ExecParams
async fn handle_container_exec_with_params(
params: ExecParams,
sess: &Session,
+ turn_diff_tracker: &mut TurnDiffTracker,
Because only one tool call could take ownership of TurnDiffTracker.
- Created: 2025-08-04 02:46:36 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250292735
@@ -374,11 +380,15 @@ impl Session {
Some(ApplyPatchCommandContext {
user_explicitly_approved_this_action,
changes,
- }) => EventMsg::PatchApplyBegin(PatchApplyBeginEvent {
- call_id,
- auto_approved: !user_explicitly_approved_this_action,
- changes,
- }),
+ }) => {
+ let _ = turn_diff_tracker.on_patch_begin(&changes);
If this doesn't have to return
Result, thenlet _can go away, of course, but depending on what sort ofErrwe expect, perhaps we should at leastwarn!()orerror!()?
- Created: 2025-08-04 02:47:22 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250293562
@@ -392,8 +402,10 @@ impl Session {
let _ = self.tx_event.send(event).await;
}
- async fn notify_exec_command_end(
+ #[allow(clippy::too_many_arguments)]
We should maybe introduce a struct in a follow-up PR.
- Created: 2025-08-04 02:49:08 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250294909
@@ -1163,6 +1193,7 @@ async fn run_task(sess: Arc<Session>, sub_id: String, input: Vec<InputItem>) {
async fn run_turn(
sess: &Session,
+ turn_diff_tracker: &mut TurnDiffTracker,
We'll probably want a
struct TurnContextor somesuch in the near future.
- Created: 2025-08-04 02:51:47 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250296938
@@ -1471,6 +1528,7 @@ fn maybe_run_with_user_profile(params: ExecParams, sess: &Session) -> ExecParams
async fn handle_container_exec_with_params(
params: ExecParams,
sess: &Session,
+ turn_diff_tracker: &mut TurnDiffTracker,
Yes, though also, if we introduce a
struct TurnContextas mentioned above, that may also force the move toMutex. But yes, does not have to be done in this PR.
- Created: 2025-08-04 02:55:26 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250299871
@@ -1328,6 +1361,16 @@ async fn try_run_turn(
.ok();
}
+ let unified_diff = turn_diff_tracker.get_unified_diff();
+ if let Ok(Some(unified_diff)) = unified_diff {
+ let msg = EventMsg::TurnDiff(TurnDiffEvent { unified_diff });
+ let event = Event {
+ id: sub_id.to_string(),
+ msg,
+ };
+ let _ = sess.tx_event.send(event).await;
+ }
+
I'm starting to think that we should do
break token_usage;to get out of the loop and then do all of this post-loop stuff below just in case there ever ends up being another way to break out.It would also eliminate this
returnstatement buried in here (though admittedly it would bury thebreakstatement instead).
codex-rs/core/src/protocol.rs
- Created: 2025-08-01 16:41:09 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248413365
@@ -525,6 +527,11 @@ pub struct PatchApplyEndEvent {
pub success: bool,
}
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct TurnDiffEvent {
+ pub unified_diff: String,
I feel like this would be easier to work with programmatically if this were keyed by path, more like
changesinPatchApplyBeginEvent. Maybe for a full add or a full delete for an individual file, we still want the unified diff, but it's nice to have added/modified/removed metadata for each path so it's easy to build a compact summary for the diff (maybe with +/- line counts)?
- Created: 2025-08-01 16:49:48 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248429044
@@ -525,6 +527,11 @@ pub struct PatchApplyEndEvent {
pub success: bool,
}
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct TurnDiffEvent {
+ pub unified_diff: String,
What guarantees, if any, can we make about the paths in the
unified_diff: will they all be absolute paths?
codex-rs/core/src/turn_diff_tracker.rs
- Created: 2025-08-01 16:54:37 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248437475
@@ -0,0 +1,476 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use tempfile::TempDir;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Create a temp directory to store baseline snapshots of files when they are first seen.
+/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk.
+/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null).
+/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs.
+/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using
+/// `git diff --no-index` and rewrite paths to external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Temp directory holding baseline snapshots of files as first seen.
+ baseline_files_dir: Option<TempDir>,
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> external path as of baseline snapshot.
+ temp_name_to_baseline_external: HashMap<String, PathBuf>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_external: HashMap<String, PathBuf>,
+ /// Aggregated unified diff for all accumulated changes across files.
+ pub unified_diff: Option<String>,
I'm surprised to see this as a field as opposed to always derived?
- Created: 2025-08-01 16:57:12 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248442328
@@ -0,0 +1,476 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use tempfile::TempDir;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Create a temp directory to store baseline snapshots of files when they are first seen.
+/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk.
+/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null).
+/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs.
+/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using
+/// `git diff --no-index` and rewrite paths to external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Temp directory holding baseline snapshots of files as first seen.
+ baseline_files_dir: Option<TempDir>,
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> external path as of baseline snapshot.
+ temp_name_to_baseline_external: HashMap<String, PathBuf>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_external: HashMap<String, PathBuf>,
+ /// Aggregated unified diff for all accumulated changes across files.
+ pub unified_diff: Option<String>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates a baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ self.ensure_baseline_dir()?;
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_baseline_external
+ .insert(internal.clone(), path.clone());
+ self.temp_name_to_current_external
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ if path.exists() {
+ let contents = fs::read(path)
+ .with_context(|| format!("failed to read original {}", path.display()))?;
+ let internal_path = baseline_dir.join(&internal);
+ fs::write(&internal_path, contents).with_context(|| {
+ format!("failed to write baseline file {}", internal_path.display())
+ })?;
+ }
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ let move_path = match change {
+ FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } => Some(dest),
+ _ => None,
+ };
+ if let Some(dest) = move_path {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.external_to_temp_name.insert(path.clone(), i.clone());
+ self.temp_name_to_baseline_external
+ .insert(i.clone(), path.clone());
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_external
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Recompute the aggregated unified diff by comparing all baseline snapshots against
+ /// current files on disk using `git diff --no-index` and rewriting paths to external paths.
+ pub fn update_and_get_unified_diff(&mut self) -> Result<Option<String>> {
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+ let current_dir = baseline_dir.join("current");
+ if current_dir.exists() {
+ // Best-effort cleanup of previous run's mirror.
+ let _ = fs::remove_dir_all(&current_dir);
+ }
+ fs::create_dir_all(&current_dir).with_context(|| {
+ format!(
+ "failed to create current mirror dir {}",
+ current_dir.display()
+ )
+ })?;
+
+ let mut aggregated = String::new();
+
+ // Compute diffs per tracked internal file.
+ for (internal, baseline_external) in &self.temp_name_to_baseline_external {
+ let baseline_path = baseline_dir.join(internal);
+ let current_external = self
+ .temp_name_to_current_external
+ .get(internal)
+ .cloned()
+ .unwrap_or_else(|| baseline_external.clone());
+
+ let left_is_dev_null = !baseline_path.exists();
+ let right_exists = current_external.exists();
+
+ // Prepare right side mirror file if exists; otherwise use /dev/null for deletions.
+ let right_arg = if right_exists {
+ let mirror_path = current_dir.join(internal);
+ let contents = fs::read(&current_external).with_context(|| {
+ format!(
+ "failed to read current file for diff {}",
+ current_external.display()
+ )
+ })?;
+ fs::write(&mirror_path, contents).with_context(|| {
+ format!(
+ "failed to write current mirror file {}",
+ mirror_path.display()
+ )
+ })?;
+ // Use relative path from baseline_dir (so headers say a/<uuid> b/current/<uuid>).
+ format!("current/{internal}")
+ } else {
+ // Deletion: right side is /dev/null to show proper deleted file diff.
+ "/dev/null".to_string()
+ };
+
+ // Prepare left arg: baseline file path or /dev/null for additions.
+ let left_arg = if left_is_dev_null {
+ "/dev/null".to_string()
+ } else {
+ internal.clone()
+ };
+
+ // Run git diff --no-index from baseline_dir to keep paths predictable.
+ let raw = run_git_allow_exit_codes(
+ &baseline_dir,
+ &[
+ "-c",
+ "color.ui=false",
+ "diff",
+ "--no-color",
+ "--no-index",
+ "--",
+ &left_arg,
+ &right_arg,
+ ],
+ &[0, 1], // 0: no changes, 1: differences
+ )?;
+
+ if raw.trim().is_empty() {
+ continue;
+ }
+ let rewritten = self.rewrite_diff_paths(&raw);
+ if !rewritten.trim().is_empty() {
+ if !aggregated.is_empty() && !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ aggregated.push_str(&rewritten);
+ }
+ }
+
+ self.unified_diff = if aggregated.trim().is_empty() {
+ None
+ } else {
+ Some(aggregated)
+ };
+
+ // Clean up the current dir.
+ let _ = fs::remove_dir_all(&current_dir);
+
+ Ok(self.unified_diff.clone())
+ }
+
+ fn baseline_dir(&self) -> Result<&Path> {
+ self.baseline_files_dir
+ .as_ref()
+ .map(|d| d.path())
+ .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized"))
+ }
+
+ fn ensure_baseline_dir(&mut self) -> Result<()> {
+ if self.baseline_files_dir.is_some() {
+ return Ok(());
+ }
+ let tmp = TempDir::new().context("create baseline temp dir")?;
+ self.baseline_files_dir = Some(tmp);
+ Ok(())
+ }
+
+ /// Rewrites the internal filenames to external paths in diff headers.
+ /// Handles inputs like:
+ /// diff --git a/<uuid> b/current/<uuid>
+ /// --- a/<uuid> | /dev/null
+ /// +++ b/current/<uuid> | /dev/null
+ /// and replaces uuid with the external paths tracking baseline/current.
+ fn rewrite_diff_paths(&self, diff: &str) -> String {
+ let mut out = String::new();
+ for line in diff.lines() {
+ if let Some(rest) = line.strip_prefix("diff --git ") {
+ // Format: diff --git a/<f> b/<f>
+ let parts: Vec<&str> = rest.split_whitespace().collect();
+ if parts.len() == 2 {
+ let a = parts[0].strip_prefix("a/").unwrap_or(parts[0]);
+ let b = parts[1].strip_prefix("b/").unwrap_or(parts[1]);
+
+ let a_ext_display = if a == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let a_base = Path::new(a)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(a);
+ let mapped = self
+ .temp_name_to_baseline_external
+ .get(a_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(a));
+ mapped.display().to_string()
+ };
+
+ let b_ext_display = if b == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let b_base = Path::new(b)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(b);
+ let mapped = self
+ .temp_name_to_current_external
+ .get(b_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(b));
+ mapped.display().to_string()
+ };
+
+ out.push_str(&format!("diff --git a/{a_ext_display} b/{b_ext_display}\n"));
+ continue;
+ }
+ }
+ if let Some(rest) = line.strip_prefix("--- ") {
+ if let Some(path) = rest.strip_prefix("a/") {
+ let external_display = if path == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let p_base = Path::new(path)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(path);
+ self.temp_name_to_baseline_external
+ .get(p_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(path))
+ .display()
+ .to_string()
+ };
+ out.push_str(&format!("--- {external_display}\n"));
+ continue;
+ }
+ }
+ if let Some(rest) = line.strip_prefix("+++ ") {
+ if let Some(path) = rest.strip_prefix("b/") {
+ let external_display = if path == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let p_base = Path::new(path)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(path);
+ self.temp_name_to_current_external
+ .get(p_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(path))
+ .display()
+ .to_string()
+ };
+ out.push_str(&format!("+++ {external_display}\n"));
+ continue;
+ }
+ }
+ out.push_str(line);
+ out.push('\n');
+ }
+ out
+ }
+}
+
+fn uuid_filename_for(path: &Path) -> String {
+ let id = Uuid::new_v4().to_string();
+ match path.extension().and_then(|e| e.to_str()) {
+ Some(ext) if !ext.is_empty() => format!("{id}.{ext}"),
+ _ => id,
None => id,
- Created: 2025-08-01 16:59:40 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248446717
@@ -0,0 +1,476 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use tempfile::TempDir;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Create a temp directory to store baseline snapshots of files when they are first seen.
+/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk.
+/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null).
+/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs.
+/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using
+/// `git diff --no-index` and rewrite paths to external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Temp directory holding baseline snapshots of files as first seen.
+ baseline_files_dir: Option<TempDir>,
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> external path as of baseline snapshot.
+ temp_name_to_baseline_external: HashMap<String, PathBuf>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_external: HashMap<String, PathBuf>,
+ /// Aggregated unified diff for all accumulated changes across files.
+ pub unified_diff: Option<String>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates a baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ self.ensure_baseline_dir()?;
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_baseline_external
+ .insert(internal.clone(), path.clone());
+ self.temp_name_to_current_external
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ if path.exists() {
+ let contents = fs::read(path)
+ .with_context(|| format!("failed to read original {}", path.display()))?;
+ let internal_path = baseline_dir.join(&internal);
+ fs::write(&internal_path, contents).with_context(|| {
+ format!("failed to write baseline file {}", internal_path.display())
+ })?;
+ }
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ let move_path = match change {
+ FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } => Some(dest),
+ _ => None,
+ };
+ if let Some(dest) = move_path {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.external_to_temp_name.insert(path.clone(), i.clone());
+ self.temp_name_to_baseline_external
+ .insert(i.clone(), path.clone());
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_external
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Recompute the aggregated unified diff by comparing all baseline snapshots against
+ /// current files on disk using `git diff --no-index` and rewriting paths to external paths.
+ pub fn update_and_get_unified_diff(&mut self) -> Result<Option<String>> {
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+ let current_dir = baseline_dir.join("current");
+ if current_dir.exists() {
+ // Best-effort cleanup of previous run's mirror.
+ let _ = fs::remove_dir_all(&current_dir);
+ }
+ fs::create_dir_all(&current_dir).with_context(|| {
+ format!(
+ "failed to create current mirror dir {}",
+ current_dir.display()
+ )
+ })?;
+
+ let mut aggregated = String::new();
+
+ // Compute diffs per tracked internal file.
+ for (internal, baseline_external) in &self.temp_name_to_baseline_external {
+ let baseline_path = baseline_dir.join(internal);
+ let current_external = self
+ .temp_name_to_current_external
+ .get(internal)
+ .cloned()
+ .unwrap_or_else(|| baseline_external.clone());
+
+ let left_is_dev_null = !baseline_path.exists();
+ let right_exists = current_external.exists();
+
+ // Prepare right side mirror file if exists; otherwise use /dev/null for deletions.
+ let right_arg = if right_exists {
+ let mirror_path = current_dir.join(internal);
+ let contents = fs::read(&current_external).with_context(|| {
+ format!(
+ "failed to read current file for diff {}",
+ current_external.display()
+ )
+ })?;
+ fs::write(&mirror_path, contents).with_context(|| {
+ format!(
+ "failed to write current mirror file {}",
+ mirror_path.display()
+ )
+ })?;
+ // Use relative path from baseline_dir (so headers say a/<uuid> b/current/<uuid>).
+ format!("current/{internal}")
+ } else {
+ // Deletion: right side is /dev/null to show proper deleted file diff.
+ "/dev/null".to_string()
+ };
+
+ // Prepare left arg: baseline file path or /dev/null for additions.
+ let left_arg = if left_is_dev_null {
+ "/dev/null".to_string()
+ } else {
+ internal.clone()
+ };
+
+ // Run git diff --no-index from baseline_dir to keep paths predictable.
+ let raw = run_git_allow_exit_codes(
+ &baseline_dir,
+ &[
+ "-c",
+ "color.ui=false",
+ "diff",
+ "--no-color",
+ "--no-index",
+ "--",
+ &left_arg,
+ &right_arg,
+ ],
+ &[0, 1], // 0: no changes, 1: differences
+ )?;
+
+ if raw.trim().is_empty() {
+ continue;
+ }
+ let rewritten = self.rewrite_diff_paths(&raw);
+ if !rewritten.trim().is_empty() {
+ if !aggregated.is_empty() && !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ aggregated.push_str(&rewritten);
+ }
+ }
+
+ self.unified_diff = if aggregated.trim().is_empty() {
+ None
+ } else {
+ Some(aggregated)
+ };
+
+ // Clean up the current dir.
+ let _ = fs::remove_dir_all(&current_dir);
+
+ Ok(self.unified_diff.clone())
+ }
+
+ fn baseline_dir(&self) -> Result<&Path> {
+ self.baseline_files_dir
+ .as_ref()
+ .map(|d| d.path())
+ .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized"))
+ }
+
+ fn ensure_baseline_dir(&mut self) -> Result<()> {
+ if self.baseline_files_dir.is_some() {
+ return Ok(());
+ }
+ let tmp = TempDir::new().context("create baseline temp dir")?;
+ self.baseline_files_dir = Some(tmp);
+ Ok(())
+ }
+
+ /// Rewrites the internal filenames to external paths in diff headers.
+ /// Handles inputs like:
+ /// diff --git a/<uuid> b/current/<uuid>
+ /// --- a/<uuid> | /dev/null
+ /// +++ b/current/<uuid> | /dev/null
+ /// and replaces uuid with the external paths tracking baseline/current.
+ fn rewrite_diff_paths(&self, diff: &str) -> String {
+ let mut out = String::new();
+ for line in diff.lines() {
+ if let Some(rest) = line.strip_prefix("diff --git ") {
+ // Format: diff --git a/<f> b/<f>
+ let parts: Vec<&str> = rest.split_whitespace().collect();
+ if parts.len() == 2 {
+ let a = parts[0].strip_prefix("a/").unwrap_or(parts[0]);
+ let b = parts[1].strip_prefix("b/").unwrap_or(parts[1]);
+
+ let a_ext_display = if a == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let a_base = Path::new(a)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(a);
+ let mapped = self
+ .temp_name_to_baseline_external
+ .get(a_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(a));
+ mapped.display().to_string()
+ };
+
+ let b_ext_display = if b == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let b_base = Path::new(b)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(b);
+ let mapped = self
+ .temp_name_to_current_external
+ .get(b_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(b));
+ mapped.display().to_string()
+ };
+
+ out.push_str(&format!("diff --git a/{a_ext_display} b/{b_ext_display}\n"));
+ continue;
+ }
+ }
+ if let Some(rest) = line.strip_prefix("--- ") {
+ if let Some(path) = rest.strip_prefix("a/") {
+ let external_display = if path == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let p_base = Path::new(path)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(path);
+ self.temp_name_to_baseline_external
+ .get(p_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(path))
+ .display()
+ .to_string()
+ };
+ out.push_str(&format!("--- {external_display}\n"));
+ continue;
+ }
+ }
+ if let Some(rest) = line.strip_prefix("+++ ") {
+ if let Some(path) = rest.strip_prefix("b/") {
+ let external_display = if path == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let p_base = Path::new(path)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(path);
+ self.temp_name_to_current_external
+ .get(p_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(path))
+ .display()
+ .to_string()
+ };
+ out.push_str(&format!("+++ {external_display}\n"));
+ continue;
+ }
+ }
+ out.push_str(line);
+ out.push('\n');
+ }
+ out
+ }
+}
+
+fn uuid_filename_for(path: &Path) -> String {
+ let id = Uuid::new_v4().to_string();
+ match path.extension().and_then(|e| e.to_str()) {
+ Some(ext) if !ext.is_empty() => format!("{id}.{ext}"),
+ _ => id,
+ }
+}
+
+fn run_git_allow_exit_codes(
+ repo: &Path,
+ args: &[&str],
+ allowed_exit_codes: &[i32],
+) -> Result<String> {
+ let output = Command::new("git")
+ .current_dir(repo)
+ .args(args)
+ .output()
+ .with_context(|| format!("failed to run git {:?} in {}", args, repo.display()))?;
.with_context(|| format!("failed to run `git {args:?}` in {repo}"))?;
- Created: 2025-08-01 17:08:30 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248464099
@@ -0,0 +1,476 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use tempfile::TempDir;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Create a temp directory to store baseline snapshots of files when they are first seen.
+/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk.
+/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null).
+/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs.
+/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using
+/// `git diff --no-index` and rewrite paths to external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Temp directory holding baseline snapshots of files as first seen.
+ baseline_files_dir: Option<TempDir>,
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> external path as of baseline snapshot.
+ temp_name_to_baseline_external: HashMap<String, PathBuf>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_external: HashMap<String, PathBuf>,
+ /// Aggregated unified diff for all accumulated changes across files.
+ pub unified_diff: Option<String>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates a baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ self.ensure_baseline_dir()?;
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_baseline_external
+ .insert(internal.clone(), path.clone());
+ self.temp_name_to_current_external
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ if path.exists() {
+ let contents = fs::read(path)
+ .with_context(|| format!("failed to read original {}", path.display()))?;
+ let internal_path = baseline_dir.join(&internal);
+ fs::write(&internal_path, contents).with_context(|| {
+ format!("failed to write baseline file {}", internal_path.display())
+ })?;
+ }
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ let move_path = match change {
+ FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } => Some(dest),
+ _ => None,
+ };
+ if let Some(dest) = move_path {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.external_to_temp_name.insert(path.clone(), i.clone());
+ self.temp_name_to_baseline_external
+ .insert(i.clone(), path.clone());
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_external
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Recompute the aggregated unified diff by comparing all baseline snapshots against
+ /// current files on disk using `git diff --no-index` and rewriting paths to external paths.
+ pub fn update_and_get_unified_diff(&mut self) -> Result<Option<String>> {
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+ let current_dir = baseline_dir.join("current");
+ if current_dir.exists() {
+ // Best-effort cleanup of previous run's mirror.
+ let _ = fs::remove_dir_all(&current_dir);
+ }
+ fs::create_dir_all(&current_dir).with_context(|| {
+ format!(
+ "failed to create current mirror dir {}",
+ current_dir.display()
+ )
+ })?;
+
+ let mut aggregated = String::new();
+
+ // Compute diffs per tracked internal file.
+ for (internal, baseline_external) in &self.temp_name_to_baseline_external {
+ let baseline_path = baseline_dir.join(internal);
+ let current_external = self
+ .temp_name_to_current_external
+ .get(internal)
+ .cloned()
+ .unwrap_or_else(|| baseline_external.clone());
+
+ let left_is_dev_null = !baseline_path.exists();
+ let right_exists = current_external.exists();
+
+ // Prepare right side mirror file if exists; otherwise use /dev/null for deletions.
+ let right_arg = if right_exists {
+ let mirror_path = current_dir.join(internal);
+ let contents = fs::read(&current_external).with_context(|| {
+ format!(
+ "failed to read current file for diff {}",
+ current_external.display()
+ )
+ })?;
+ fs::write(&mirror_path, contents).with_context(|| {
+ format!(
+ "failed to write current mirror file {}",
+ mirror_path.display()
+ )
+ })?;
+ // Use relative path from baseline_dir (so headers say a/<uuid> b/current/<uuid>).
+ format!("current/{internal}")
+ } else {
+ // Deletion: right side is /dev/null to show proper deleted file diff.
+ "/dev/null".to_string()
+ };
+
+ // Prepare left arg: baseline file path or /dev/null for additions.
+ let left_arg = if left_is_dev_null {
+ "/dev/null".to_string()
+ } else {
+ internal.clone()
+ };
+
+ // Run git diff --no-index from baseline_dir to keep paths predictable.
+ let raw = run_git_allow_exit_codes(
+ &baseline_dir,
+ &[
+ "-c",
+ "color.ui=false",
+ "diff",
+ "--no-color",
+ "--no-index",
+ "--",
+ &left_arg,
+ &right_arg,
+ ],
+ &[0, 1], // 0: no changes, 1: differences
+ )?;
+
+ if raw.trim().is_empty() {
+ continue;
+ }
+ let rewritten = self.rewrite_diff_paths(&raw);
+ if !rewritten.trim().is_empty() {
+ if !aggregated.is_empty() && !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ aggregated.push_str(&rewritten);
+ }
+ }
+
+ self.unified_diff = if aggregated.trim().is_empty() {
+ None
+ } else {
+ Some(aggregated)
+ };
+
+ // Clean up the current dir.
+ let _ = fs::remove_dir_all(&current_dir);
+
+ Ok(self.unified_diff.clone())
+ }
+
+ fn baseline_dir(&self) -> Result<&Path> {
+ self.baseline_files_dir
+ .as_ref()
+ .map(|d| d.path())
+ .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized"))
+ }
+
+ fn ensure_baseline_dir(&mut self) -> Result<()> {
+ if self.baseline_files_dir.is_some() {
+ return Ok(());
+ }
+ let tmp = TempDir::new().context("create baseline temp dir")?;
+ self.baseline_files_dir = Some(tmp);
+ Ok(())
+ }
+
+ /// Rewrites the internal filenames to external paths in diff headers.
+ /// Handles inputs like:
+ /// diff --git a/<uuid> b/current/<uuid>
+ /// --- a/<uuid> | /dev/null
+ /// +++ b/current/<uuid> | /dev/null
+ /// and replaces uuid with the external paths tracking baseline/current.
+ fn rewrite_diff_paths(&self, diff: &str) -> String {
+ let mut out = String::new();
+ for line in diff.lines() {
+ if let Some(rest) = line.strip_prefix("diff --git ") {
This gets indented pretty far, so maybe it's worth moving to a helper function that takes
(&mut String, temp_name_to_current_external, temp_name_to_baseline_external)
- Created: 2025-08-01 17:09:15 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248466128
@@ -0,0 +1,476 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use tempfile::TempDir;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Create a temp directory to store baseline snapshots of files when they are first seen.
+/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk.
+/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null).
+/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs.
+/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using
+/// `git diff --no-index` and rewrite paths to external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Temp directory holding baseline snapshots of files as first seen.
+ baseline_files_dir: Option<TempDir>,
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> external path as of baseline snapshot.
+ temp_name_to_baseline_external: HashMap<String, PathBuf>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_external: HashMap<String, PathBuf>,
+ /// Aggregated unified diff for all accumulated changes across files.
+ pub unified_diff: Option<String>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates a baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ self.ensure_baseline_dir()?;
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_baseline_external
+ .insert(internal.clone(), path.clone());
+ self.temp_name_to_current_external
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ if path.exists() {
+ let contents = fs::read(path)
+ .with_context(|| format!("failed to read original {}", path.display()))?;
+ let internal_path = baseline_dir.join(&internal);
+ fs::write(&internal_path, contents).with_context(|| {
+ format!("failed to write baseline file {}", internal_path.display())
+ })?;
+ }
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ let move_path = match change {
+ FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } => Some(dest),
+ _ => None,
+ };
+ if let Some(dest) = move_path {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.external_to_temp_name.insert(path.clone(), i.clone());
+ self.temp_name_to_baseline_external
+ .insert(i.clone(), path.clone());
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_external
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Recompute the aggregated unified diff by comparing all baseline snapshots against
+ /// current files on disk using `git diff --no-index` and rewriting paths to external paths.
+ pub fn update_and_get_unified_diff(&mut self) -> Result<Option<String>> {
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+ let current_dir = baseline_dir.join("current");
+ if current_dir.exists() {
+ // Best-effort cleanup of previous run's mirror.
+ let _ = fs::remove_dir_all(&current_dir);
+ }
+ fs::create_dir_all(&current_dir).with_context(|| {
+ format!(
+ "failed to create current mirror dir {}",
+ current_dir.display()
+ )
+ })?;
+
+ let mut aggregated = String::new();
+
+ // Compute diffs per tracked internal file.
+ for (internal, baseline_external) in &self.temp_name_to_baseline_external {
+ let baseline_path = baseline_dir.join(internal);
+ let current_external = self
+ .temp_name_to_current_external
+ .get(internal)
+ .cloned()
+ .unwrap_or_else(|| baseline_external.clone());
+
+ let left_is_dev_null = !baseline_path.exists();
+ let right_exists = current_external.exists();
+
+ // Prepare right side mirror file if exists; otherwise use /dev/null for deletions.
+ let right_arg = if right_exists {
+ let mirror_path = current_dir.join(internal);
+ let contents = fs::read(&current_external).with_context(|| {
+ format!(
+ "failed to read current file for diff {}",
+ current_external.display()
+ )
+ })?;
+ fs::write(&mirror_path, contents).with_context(|| {
+ format!(
+ "failed to write current mirror file {}",
+ mirror_path.display()
+ )
+ })?;
+ // Use relative path from baseline_dir (so headers say a/<uuid> b/current/<uuid>).
+ format!("current/{internal}")
+ } else {
+ // Deletion: right side is /dev/null to show proper deleted file diff.
+ "/dev/null".to_string()
+ };
+
+ // Prepare left arg: baseline file path or /dev/null for additions.
+ let left_arg = if left_is_dev_null {
+ "/dev/null".to_string()
+ } else {
+ internal.clone()
+ };
+
+ // Run git diff --no-index from baseline_dir to keep paths predictable.
+ let raw = run_git_allow_exit_codes(
+ &baseline_dir,
+ &[
+ "-c",
+ "color.ui=false",
+ "diff",
+ "--no-color",
+ "--no-index",
+ "--",
+ &left_arg,
+ &right_arg,
+ ],
+ &[0, 1], // 0: no changes, 1: differences
+ )?;
+
+ if raw.trim().is_empty() {
+ continue;
+ }
+ let rewritten = self.rewrite_diff_paths(&raw);
+ if !rewritten.trim().is_empty() {
+ if !aggregated.is_empty() && !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ aggregated.push_str(&rewritten);
+ }
+ }
+
+ self.unified_diff = if aggregated.trim().is_empty() {
+ None
+ } else {
+ Some(aggregated)
+ };
+
+ // Clean up the current dir.
+ let _ = fs::remove_dir_all(&current_dir);
+
+ Ok(self.unified_diff.clone())
+ }
+
+ fn baseline_dir(&self) -> Result<&Path> {
+ self.baseline_files_dir
+ .as_ref()
+ .map(|d| d.path())
+ .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized"))
+ }
+
+ fn ensure_baseline_dir(&mut self) -> Result<()> {
+ if self.baseline_files_dir.is_some() {
+ return Ok(());
+ }
+ let tmp = TempDir::new().context("create baseline temp dir")?;
+ self.baseline_files_dir = Some(tmp);
+ Ok(())
+ }
+
+ /// Rewrites the internal filenames to external paths in diff headers.
+ /// Handles inputs like:
+ /// diff --git a/<uuid> b/current/<uuid>
+ /// --- a/<uuid> | /dev/null
+ /// +++ b/current/<uuid> | /dev/null
+ /// and replaces uuid with the external paths tracking baseline/current.
+ fn rewrite_diff_paths(&self, diff: &str) -> String {
+ let mut out = String::new();
+ for line in diff.lines() {
+ if let Some(rest) = line.strip_prefix("diff --git ") {
+ // Format: diff --git a/<f> b/<f>
+ let parts: Vec<&str> = rest.split_whitespace().collect();
+ if parts.len() == 2 {
We don't have to worry about paths with spaces because they're all UUIDs?
- Created: 2025-08-01 17:10:28 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248468317
@@ -0,0 +1,476 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use tempfile::TempDir;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Create a temp directory to store baseline snapshots of files when they are first seen.
+/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk.
+/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null).
+/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs.
+/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using
+/// `git diff --no-index` and rewrite paths to external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Temp directory holding baseline snapshots of files as first seen.
+ baseline_files_dir: Option<TempDir>,
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> external path as of baseline snapshot.
+ temp_name_to_baseline_external: HashMap<String, PathBuf>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_external: HashMap<String, PathBuf>,
+ /// Aggregated unified diff for all accumulated changes across files.
+ pub unified_diff: Option<String>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates a baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ self.ensure_baseline_dir()?;
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_baseline_external
+ .insert(internal.clone(), path.clone());
+ self.temp_name_to_current_external
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ if path.exists() {
+ let contents = fs::read(path)
+ .with_context(|| format!("failed to read original {}", path.display()))?;
+ let internal_path = baseline_dir.join(&internal);
+ fs::write(&internal_path, contents).with_context(|| {
+ format!("failed to write baseline file {}", internal_path.display())
+ })?;
+ }
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ let move_path = match change {
+ FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } => Some(dest),
+ _ => None,
+ };
+ if let Some(dest) = move_path {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.external_to_temp_name.insert(path.clone(), i.clone());
+ self.temp_name_to_baseline_external
+ .insert(i.clone(), path.clone());
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_external
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Recompute the aggregated unified diff by comparing all baseline snapshots against
+ /// current files on disk using `git diff --no-index` and rewriting paths to external paths.
+ pub fn update_and_get_unified_diff(&mut self) -> Result<Option<String>> {
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+ let current_dir = baseline_dir.join("current");
+ if current_dir.exists() {
+ // Best-effort cleanup of previous run's mirror.
+ let _ = fs::remove_dir_all(&current_dir);
+ }
+ fs::create_dir_all(&current_dir).with_context(|| {
+ format!(
+ "failed to create current mirror dir {}",
+ current_dir.display()
+ )
+ })?;
+
+ let mut aggregated = String::new();
+
+ // Compute diffs per tracked internal file.
+ for (internal, baseline_external) in &self.temp_name_to_baseline_external {
+ let baseline_path = baseline_dir.join(internal);
+ let current_external = self
+ .temp_name_to_current_external
+ .get(internal)
+ .cloned()
+ .unwrap_or_else(|| baseline_external.clone());
+
+ let left_is_dev_null = !baseline_path.exists();
+ let right_exists = current_external.exists();
+
+ // Prepare right side mirror file if exists; otherwise use /dev/null for deletions.
+ let right_arg = if right_exists {
+ let mirror_path = current_dir.join(internal);
+ let contents = fs::read(&current_external).with_context(|| {
+ format!(
+ "failed to read current file for diff {}",
+ current_external.display()
+ )
+ })?;
+ fs::write(&mirror_path, contents).with_context(|| {
+ format!(
+ "failed to write current mirror file {}",
+ mirror_path.display()
+ )
+ })?;
+ // Use relative path from baseline_dir (so headers say a/<uuid> b/current/<uuid>).
+ format!("current/{internal}")
+ } else {
+ // Deletion: right side is /dev/null to show proper deleted file diff.
+ "/dev/null".to_string()
+ };
+
+ // Prepare left arg: baseline file path or /dev/null for additions.
+ let left_arg = if left_is_dev_null {
+ "/dev/null".to_string()
+ } else {
+ internal.clone()
+ };
+
+ // Run git diff --no-index from baseline_dir to keep paths predictable.
+ let raw = run_git_allow_exit_codes(
+ &baseline_dir,
+ &[
+ "-c",
+ "color.ui=false",
+ "diff",
+ "--no-color",
+ "--no-index",
+ "--",
+ &left_arg,
+ &right_arg,
+ ],
+ &[0, 1], // 0: no changes, 1: differences
+ )?;
+
+ if raw.trim().is_empty() {
+ continue;
+ }
+ let rewritten = self.rewrite_diff_paths(&raw);
+ if !rewritten.trim().is_empty() {
+ if !aggregated.is_empty() && !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ aggregated.push_str(&rewritten);
+ }
+ }
+
+ self.unified_diff = if aggregated.trim().is_empty() {
+ None
+ } else {
+ Some(aggregated)
+ };
+
+ // Clean up the current dir.
+ let _ = fs::remove_dir_all(&current_dir);
+
+ Ok(self.unified_diff.clone())
+ }
+
+ fn baseline_dir(&self) -> Result<&Path> {
+ self.baseline_files_dir
+ .as_ref()
+ .map(|d| d.path())
+ .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized"))
+ }
+
+ fn ensure_baseline_dir(&mut self) -> Result<()> {
+ if self.baseline_files_dir.is_some() {
+ return Ok(());
+ }
+ let tmp = TempDir::new().context("create baseline temp dir")?;
+ self.baseline_files_dir = Some(tmp);
+ Ok(())
+ }
+
+ /// Rewrites the internal filenames to external paths in diff headers.
+ /// Handles inputs like:
+ /// diff --git a/<uuid> b/current/<uuid>
+ /// --- a/<uuid> | /dev/null
+ /// +++ b/current/<uuid> | /dev/null
+ /// and replaces uuid with the external paths tracking baseline/current.
+ fn rewrite_diff_paths(&self, diff: &str) -> String {
+ let mut out = String::new();
+ for line in diff.lines() {
+ if let Some(rest) = line.strip_prefix("diff --git ") {
+ // Format: diff --git a/<f> b/<f>
+ let parts: Vec<&str> = rest.split_whitespace().collect();
+ if parts.len() == 2 {
+ let a = parts[0].strip_prefix("a/").unwrap_or(parts[0]);
+ let b = parts[1].strip_prefix("b/").unwrap_or(parts[1]);
+
+ let a_ext_display = if a == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let a_base = Path::new(a)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(a);
+ let mapped = self
+ .temp_name_to_baseline_external
+ .get(a_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(a));
+ mapped.display().to_string()
+ };
+
+ let b_ext_display = if b == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let b_base = Path::new(b)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(b);
+ let mapped = self
+ .temp_name_to_current_external
+ .get(b_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(b));
+ mapped.display().to_string()
+ };
+
+ out.push_str(&format!("diff --git a/{a_ext_display} b/{b_ext_display}\n"));
+ continue;
+ }
+ }
+ if let Some(rest) = line.strip_prefix("--- ") {
+ if let Some(path) = rest.strip_prefix("a/") {
+ let external_display = if path == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let p_base = Path::new(path)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(path);
+ self.temp_name_to_baseline_external
+ .get(p_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(path))
+ .display()
+ .to_string()
+ };
+ out.push_str(&format!("--- {external_display}\n"));
+ continue;
+ }
+ }
+ if let Some(rest) = line.strip_prefix("+++ ") {
+ if let Some(path) = rest.strip_prefix("b/") {
+ let external_display = if path == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let p_base = Path::new(path)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(path);
+ self.temp_name_to_current_external
+ .get(p_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(path))
+ .display()
+ .to_string()
+ };
+ out.push_str(&format!("+++ {external_display}\n"));
+ continue;
+ }
+ }
+ out.push_str(line);
+ out.push('\n');
+ }
+ out
+ }
+}
+
+fn uuid_filename_for(path: &Path) -> String {
+ let id = Uuid::new_v4().to_string();
+ match path.extension().and_then(|e| e.to_str()) {
Why preserve the ext, btw? It could, in theory, contain a space, right?
- Created: 2025-08-01 17:10:59 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248469411
@@ -0,0 +1,476 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use tempfile::TempDir;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Create a temp directory to store baseline snapshots of files when they are first seen.
+/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk.
+/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null).
+/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs.
+/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using
+/// `git diff --no-index` and rewrite paths to external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Temp directory holding baseline snapshots of files as first seen.
+ baseline_files_dir: Option<TempDir>,
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> external path as of baseline snapshot.
+ temp_name_to_baseline_external: HashMap<String, PathBuf>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_external: HashMap<String, PathBuf>,
+ /// Aggregated unified diff for all accumulated changes across files.
+ pub unified_diff: Option<String>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates a baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ self.ensure_baseline_dir()?;
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_baseline_external
+ .insert(internal.clone(), path.clone());
+ self.temp_name_to_current_external
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ if path.exists() {
+ let contents = fs::read(path)
+ .with_context(|| format!("failed to read original {}", path.display()))?;
+ let internal_path = baseline_dir.join(&internal);
+ fs::write(&internal_path, contents).with_context(|| {
+ format!("failed to write baseline file {}", internal_path.display())
+ })?;
+ }
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ let move_path = match change {
+ FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } => Some(dest),
+ _ => None,
+ };
+ if let Some(dest) = move_path {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.external_to_temp_name.insert(path.clone(), i.clone());
+ self.temp_name_to_baseline_external
+ .insert(i.clone(), path.clone());
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_external
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Recompute the aggregated unified diff by comparing all baseline snapshots against
+ /// current files on disk using `git diff --no-index` and rewriting paths to external paths.
+ pub fn update_and_get_unified_diff(&mut self) -> Result<Option<String>> {
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+ let current_dir = baseline_dir.join("current");
+ if current_dir.exists() {
+ // Best-effort cleanup of previous run's mirror.
+ let _ = fs::remove_dir_all(&current_dir);
+ }
+ fs::create_dir_all(&current_dir).with_context(|| {
+ format!(
+ "failed to create current mirror dir {}",
+ current_dir.display()
+ )
+ })?;
+
+ let mut aggregated = String::new();
+
+ // Compute diffs per tracked internal file.
+ for (internal, baseline_external) in &self.temp_name_to_baseline_external {
+ let baseline_path = baseline_dir.join(internal);
+ let current_external = self
+ .temp_name_to_current_external
+ .get(internal)
+ .cloned()
+ .unwrap_or_else(|| baseline_external.clone());
+
+ let left_is_dev_null = !baseline_path.exists();
+ let right_exists = current_external.exists();
+
+ // Prepare right side mirror file if exists; otherwise use /dev/null for deletions.
+ let right_arg = if right_exists {
+ let mirror_path = current_dir.join(internal);
+ let contents = fs::read(&current_external).with_context(|| {
+ format!(
+ "failed to read current file for diff {}",
+ current_external.display()
+ )
+ })?;
+ fs::write(&mirror_path, contents).with_context(|| {
+ format!(
+ "failed to write current mirror file {}",
+ mirror_path.display()
+ )
+ })?;
+ // Use relative path from baseline_dir (so headers say a/<uuid> b/current/<uuid>).
+ format!("current/{internal}")
+ } else {
+ // Deletion: right side is /dev/null to show proper deleted file diff.
+ "/dev/null".to_string()
+ };
+
+ // Prepare left arg: baseline file path or /dev/null for additions.
+ let left_arg = if left_is_dev_null {
+ "/dev/null".to_string()
+ } else {
+ internal.clone()
+ };
+
+ // Run git diff --no-index from baseline_dir to keep paths predictable.
+ let raw = run_git_allow_exit_codes(
+ &baseline_dir,
+ &[
+ "-c",
+ "color.ui=false",
+ "diff",
+ "--no-color",
+ "--no-index",
+ "--",
+ &left_arg,
+ &right_arg,
+ ],
+ &[0, 1], // 0: no changes, 1: differences
+ )?;
+
+ if raw.trim().is_empty() {
+ continue;
+ }
+ let rewritten = self.rewrite_diff_paths(&raw);
+ if !rewritten.trim().is_empty() {
+ if !aggregated.is_empty() && !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ aggregated.push_str(&rewritten);
+ }
+ }
+
+ self.unified_diff = if aggregated.trim().is_empty() {
+ None
+ } else {
+ Some(aggregated)
+ };
+
+ // Clean up the current dir.
+ let _ = fs::remove_dir_all(&current_dir);
+
+ Ok(self.unified_diff.clone())
+ }
+
+ fn baseline_dir(&self) -> Result<&Path> {
+ self.baseline_files_dir
+ .as_ref()
+ .map(|d| d.path())
+ .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized"))
+ }
+
+ fn ensure_baseline_dir(&mut self) -> Result<()> {
+ if self.baseline_files_dir.is_some() {
+ return Ok(());
+ }
+ let tmp = TempDir::new().context("create baseline temp dir")?;
+ self.baseline_files_dir = Some(tmp);
+ Ok(())
+ }
+
+ /// Rewrites the internal filenames to external paths in diff headers.
+ /// Handles inputs like:
+ /// diff --git a/<uuid> b/current/<uuid>
+ /// --- a/<uuid> | /dev/null
+ /// +++ b/current/<uuid> | /dev/null
+ /// and replaces uuid with the external paths tracking baseline/current.
+ fn rewrite_diff_paths(&self, diff: &str) -> String {
+ let mut out = String::new();
+ for line in diff.lines() {
+ if let Some(rest) = line.strip_prefix("diff --git ") {
+ // Format: diff --git a/<f> b/<f>
+ let parts: Vec<&str> = rest.split_whitespace().collect();
+ if parts.len() == 2 {
+ let a = parts[0].strip_prefix("a/").unwrap_or(parts[0]);
+ let b = parts[1].strip_prefix("b/").unwrap_or(parts[1]);
+
+ let a_ext_display = if a == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let a_base = Path::new(a)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(a);
+ let mapped = self
+ .temp_name_to_baseline_external
+ .get(a_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(a));
+ mapped.display().to_string()
+ };
+
+ let b_ext_display = if b == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let b_base = Path::new(b)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(b);
+ let mapped = self
+ .temp_name_to_current_external
+ .get(b_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(b));
+ mapped.display().to_string()
+ };
+
+ out.push_str(&format!("diff --git a/{a_ext_display} b/{b_ext_display}\n"));
+ continue;
+ }
+ }
+ if let Some(rest) = line.strip_prefix("--- ") {
+ if let Some(path) = rest.strip_prefix("a/") {
+ let external_display = if path == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let p_base = Path::new(path)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(path);
+ self.temp_name_to_baseline_external
+ .get(p_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(path))
+ .display()
+ .to_string()
+ };
+ out.push_str(&format!("--- {external_display}\n"));
+ continue;
+ }
+ }
+ if let Some(rest) = line.strip_prefix("+++ ") {
+ if let Some(path) = rest.strip_prefix("b/") {
+ let external_display = if path == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let p_base = Path::new(path)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(path);
+ self.temp_name_to_current_external
+ .get(p_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(path))
+ .display()
+ .to_string()
+ };
+ out.push_str(&format!("+++ {external_display}\n"));
+ continue;
+ }
+ }
+ out.push_str(line);
+ out.push('\n');
+ }
+ out
+ }
+}
+
+fn uuid_filename_for(path: &Path) -> String {
+ let id = Uuid::new_v4().to_string();
+ match path.extension().and_then(|e| e.to_str()) {
+ Some(ext) if !ext.is_empty() => format!("{id}.{ext}"),
+ _ => id,
+ }
+}
+
+fn run_git_allow_exit_codes(
+ repo: &Path,
+ args: &[&str],
+ allowed_exit_codes: &[i32],
+) -> Result<String> {
+ let output = Command::new("git")
Should we make this async and use `tokio::process::Command`?
- Created: 2025-08-01 17:12:45 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248474046
@@ -0,0 +1,476 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use tempfile::TempDir;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
/// Tracks sets of changes to files and exposes the overall unified diff.
/// Internally, the way this works is now:
/// 1. Create a temp directory to store baseline snapshots of files when they are first seen.
/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk.
///    For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null).
/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs.
/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using
///    `git diff --no-index` and rewrite paths to external paths.
#[derive(Default)]
pub struct TurnDiffTracker {
    /// Temp directory holding baseline snapshots of files as first seen.
    /// Created lazily on the first `on_patch_begin` call; `None` until then.
    baseline_files_dir: Option<TempDir>,
    /// Map external path -> internal filename (uuid + same extension).
    /// Keys follow the file's *current* external location, so they are rewritten on renames.
    external_to_temp_name: HashMap<PathBuf, String>,
    /// Internal filename -> external path as of baseline snapshot.
    temp_name_to_baseline_external: HashMap<String, PathBuf>,
    /// Internal filename -> external path as of current accumulated state (after applying all changes).
    /// This is where renames are tracked.
    temp_name_to_current_external: HashMap<String, PathBuf>,
    /// Aggregated unified diff for all accumulated changes across files.
    /// Refreshed by `update_and_get_unified_diff`; `None` when nothing differs from baseline.
    pub unified_diff: Option<String>,
}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates a baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ self.ensure_baseline_dir()?;
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_baseline_external
+ .insert(internal.clone(), path.clone());
+ self.temp_name_to_current_external
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ if path.exists() {
+ let contents = fs::read(path)
+ .with_context(|| format!("failed to read original {}", path.display()))?;
+ let internal_path = baseline_dir.join(&internal);
+ fs::write(&internal_path, contents).with_context(|| {
+ format!("failed to write baseline file {}", internal_path.display())
+ })?;
+ }
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ let move_path = match change {
+ FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } => Some(dest),
+ _ => None,
+ };
+ if let Some(dest) = move_path {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.external_to_temp_name.insert(path.clone(), i.clone());
+ self.temp_name_to_baseline_external
+ .insert(i.clone(), path.clone());
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_external
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Recompute the aggregated unified diff by comparing all baseline snapshots against
+ /// current files on disk using `git diff --no-index` and rewriting paths to external paths.
+ pub fn update_and_get_unified_diff(&mut self) -> Result<Option<String>> {
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+ let current_dir = baseline_dir.join("current");
+ if current_dir.exists() {
+ // Best-effort cleanup of previous run's mirror.
+ let _ = fs::remove_dir_all(¤t_dir);
+ }
+ fs::create_dir_all(¤t_dir).with_context(|| {
+ format!(
+ "failed to create current mirror dir {}",
+ current_dir.display()
+ )
+ })?;
+
+ let mut aggregated = String::new();
+
+ // Compute diffs per tracked internal file.
+ for (internal, baseline_external) in &self.temp_name_to_baseline_external {
+ let baseline_path = baseline_dir.join(internal);
+ let current_external = self
+ .temp_name_to_current_external
+ .get(internal)
+ .cloned()
+ .unwrap_or_else(|| baseline_external.clone());
+
+ let left_is_dev_null = !baseline_path.exists();
+ let right_exists = current_external.exists();
+
+ // Prepare right side mirror file if exists; otherwise use /dev/null for deletions.
+ let right_arg = if right_exists {
+ let mirror_path = current_dir.join(internal);
+ let contents = fs::read(¤t_external).with_context(|| {
+ format!(
+ "failed to read current file for diff {}",
+ current_external.display()
+ )
+ })?;
+ fs::write(&mirror_path, contents).with_context(|| {
+ format!(
+ "failed to write current mirror file {}",
+ mirror_path.display()
+ )
+ })?;
+ // Use relative path from baseline_dir (so headers say a/<uuid> b/current/<uuid>).
+ format!("current/{internal}")
+ } else {
+ // Deletion: right side is /dev/null to show proper deleted file diff.
+ "/dev/null".to_string()
+ };
+
+ // Prepare left arg: baseline file path or /dev/null for additions.
+ let left_arg = if left_is_dev_null {
+ "/dev/null".to_string()
+ } else {
+ internal.clone()
+ };
+
+ // Run git diff --no-index from baseline_dir to keep paths predictable.
+ let raw = run_git_allow_exit_codes(
+ &baseline_dir,
+ &[
+ "-c",
+ "color.ui=false",
+ "diff",
+ "--no-color",
+ "--no-index",
+ "--",
+ &left_arg,
+ &right_arg,
+ ],
+ &[0, 1], // 0: no changes, 1: differences
+ )?;
+
+ if raw.trim().is_empty() {
+ continue;
+ }
+ let rewritten = self.rewrite_diff_paths(&raw);
+ if !rewritten.trim().is_empty() {
+ if !aggregated.is_empty() && !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ aggregated.push_str(&rewritten);
+ }
+ }
+
+ self.unified_diff = if aggregated.trim().is_empty() {
+ None
+ } else {
+ Some(aggregated)
+ };
+
+ // Clean up the current dir.
+ let _ = fs::remove_dir_all(¤t_dir);
+
+ Ok(self.unified_diff.clone())
+ }
+
+ fn baseline_dir(&self) -> Result<&Path> {
+ self.baseline_files_dir
+ .as_ref()
+ .map(|d| d.path())
+ .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized"))
+ }
+
+ fn ensure_baseline_dir(&mut self) -> Result<()> {
+ if self.baseline_files_dir.is_some() {
+ return Ok(());
+ }
+ let tmp = TempDir::new().context("create baseline temp dir")?;
+ self.baseline_files_dir = Some(tmp);
+ Ok(())
+ }
+
+ /// Rewrites the internal filenames to external paths in diff headers.
+ /// Handles inputs like:
+ /// diff --git a/<uuid> b/current/<uuid>
+ /// --- a/<uuid> | /dev/null
+ /// +++ b/current/<uuid> | /dev/null
+ /// and replaces uuid with the external paths tracking baseline/current.
+ fn rewrite_diff_paths(&self, diff: &str) -> String {
+ let mut out = String::new();
+ for line in diff.lines() {
+ if let Some(rest) = line.strip_prefix("diff --git ") {
+ // Format: diff --git a/<f> b/<f>
+ let parts: Vec<&str> = rest.split_whitespace().collect();
+ if parts.len() == 2 {
+ let a = parts[0].strip_prefix("a/").unwrap_or(parts[0]);
+ let b = parts[1].strip_prefix("b/").unwrap_or(parts[1]);
+
+ let a_ext_display = if a == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let a_base = Path::new(a)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(a);
+ let mapped = self
+ .temp_name_to_baseline_external
+ .get(a_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(a));
+ mapped.display().to_string()
+ };
+
+ let b_ext_display = if b == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let b_base = Path::new(b)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(b);
+ let mapped = self
+ .temp_name_to_current_external
+ .get(b_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(b));
+ mapped.display().to_string()
+ };
+
+ out.push_str(&format!("diff --git a/{a_ext_display} b/{b_ext_display}\n"));
+ continue;
+ }
+ }
+ if let Some(rest) = line.strip_prefix("--- ") {
+ if let Some(path) = rest.strip_prefix("a/") {
+ let external_display = if path == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let p_base = Path::new(path)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(path);
+ self.temp_name_to_baseline_external
+ .get(p_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(path))
+ .display()
+ .to_string()
+ };
+ out.push_str(&format!("--- {external_display}\n"));
+ continue;
+ }
+ }
+ if let Some(rest) = line.strip_prefix("+++ ") {
+ if let Some(path) = rest.strip_prefix("b/") {
+ let external_display = if path == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let p_base = Path::new(path)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(path);
+ self.temp_name_to_current_external
+ .get(p_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(path))
+ .display()
+ .to_string()
+ };
+ out.push_str(&format!("+++ {external_display}\n"));
+ continue;
+ }
+ }
+ out.push_str(line);
+ out.push('\n');
+ }
+ out
+ }
+}
+
+fn uuid_filename_for(path: &Path) -> String {
+ let id = Uuid::new_v4().to_string();
+ match path.extension().and_then(|e| e.to_str()) {
+ Some(ext) if !ext.is_empty() => format!("{id}.{ext}"),
+ _ => id,
+ }
+}
+
+fn run_git_allow_exit_codes(
+ repo: &Path,
+ args: &[&str],
+ allowed_exit_codes: &[i32],
+) -> Result<String> {
+ let output = Command::new("git")
+ .current_dir(repo)
+ .args(args)
+ .output()
+ .with_context(|| format!("failed to run git {:?} in {}", args, repo.display()))?;
+ let code = output.status.code().unwrap_or(-1);
+ if !allowed_exit_codes.contains(&code) {
+ anyhow::bail!(
+ "git {:?} failed with status {:?}: {}",
+ args,
+ output.status,
+ String::from_utf8_lossy(&output.stderr)
+ );
+ }
+ Ok(String::from_utf8_lossy(&output.stdout).into_owned())
+}
+
+#[cfg(test)]
+mod tests {
+ #![allow(clippy::unwrap_used)]
+ use super::*;
+ use tempfile::tempdir;
+
+ #[test]
+ fn accumulates_add_and_update() {
+ let mut acc = TurnDiffTracker::new();
+
+ let dir = tempdir().unwrap();
+ let file = dir.path().join("a.txt");
+
+ // First patch: add file (baseline should be /dev/null).
+ let add_changes = HashMap::from([(
+ file.clone(),
+ FileChange::Add {
+ content: "foo\n".to_string(),
+ },
+ )]);
+ acc.on_patch_begin(&add_changes).unwrap();
+
+ // Simulate apply: create the file on disk.
+ // This must happen after on_patch_begin.
+ fs::write(&file, "foo\n").unwrap();
+ acc.update_and_get_unified_diff().unwrap();
+ let first = acc.unified_diff.clone().unwrap();
+ assert!(first.contains("+foo"));
Instead of `contains()` checks, can these all be full `assert_eq!()` checks?
- Created: 2025-08-01 17:13:43 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248476408
@@ -0,0 +1,476 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use tempfile::TempDir;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Create a temp directory to store baseline snapshots of files when they are first seen.
+/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk.
+/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null).
+/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs.
+/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using
+/// `git diff --no-index` and rewrite paths to external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Temp directory holding baseline snapshots of files as first seen.
+ baseline_files_dir: Option<TempDir>,
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> external path as of baseline snapshot.
+ temp_name_to_baseline_external: HashMap<String, PathBuf>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_external: HashMap<String, PathBuf>,
+ /// Aggregated unified diff for all accumulated changes across files.
+ pub unified_diff: Option<String>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates a baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ self.ensure_baseline_dir()?;
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_baseline_external
+ .insert(internal.clone(), path.clone());
+ self.temp_name_to_current_external
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ if path.exists() {
+ let contents = fs::read(path)
+ .with_context(|| format!("failed to read original {}", path.display()))?;
+ let internal_path = baseline_dir.join(&internal);
+ fs::write(&internal_path, contents).with_context(|| {
+ format!("failed to write baseline file {}", internal_path.display())
+ })?;
+ }
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ let move_path = match change {
+ FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } => Some(dest),
+ _ => None,
+ };
+ if let Some(dest) = move_path {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.external_to_temp_name.insert(path.clone(), i.clone());
+ self.temp_name_to_baseline_external
+ .insert(i.clone(), path.clone());
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_external
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Recompute the aggregated unified diff by comparing all baseline snapshots against
+ /// current files on disk using `git diff --no-index` and rewriting paths to external paths.
+ pub fn update_and_get_unified_diff(&mut self) -> Result<Option<String>> {
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+ let current_dir = baseline_dir.join("current");
+ if current_dir.exists() {
+ // Best-effort cleanup of previous run's mirror.
+ let _ = fs::remove_dir_all(&current_dir);
+ }
+ fs::create_dir_all(&current_dir).with_context(|| {
+ format!(
+ "failed to create current mirror dir {}",
+ current_dir.display()
+ )
+ })?;
+
+ let mut aggregated = String::new();
+
+ // Compute diffs per tracked internal file.
+ for (internal, baseline_external) in &self.temp_name_to_baseline_external {
+ let baseline_path = baseline_dir.join(internal);
+ let current_external = self
+ .temp_name_to_current_external
+ .get(internal)
+ .cloned()
+ .unwrap_or_else(|| baseline_external.clone());
+
+ let left_is_dev_null = !baseline_path.exists();
+ let right_exists = current_external.exists();
+
+ // Prepare right side mirror file if exists; otherwise use /dev/null for deletions.
+ let right_arg = if right_exists {
+ let mirror_path = current_dir.join(internal);
+ let contents = fs::read(&current_external).with_context(|| {
+ format!(
+ "failed to read current file for diff {}",
+ current_external.display()
+ )
+ })?;
+ fs::write(&mirror_path, contents).with_context(|| {
+ format!(
+ "failed to write current mirror file {}",
+ mirror_path.display()
+ )
+ })?;
+ // Use relative path from baseline_dir (so headers say a/<uuid> b/current/<uuid>).
+ format!("current/{internal}")
+ } else {
+ // Deletion: right side is /dev/null to show proper deleted file diff.
+ "/dev/null".to_string()
+ };
+
+ // Prepare left arg: baseline file path or /dev/null for additions.
+ let left_arg = if left_is_dev_null {
+ "/dev/null".to_string()
+ } else {
+ internal.clone()
+ };
+
+ // Run git diff --no-index from baseline_dir to keep paths predictable.
+ let raw = run_git_allow_exit_codes(
+ &baseline_dir,
+ &[
+ "-c",
+ "color.ui=false",
+ "diff",
+ "--no-color",
+ "--no-index",
+ "--",
+ &left_arg,
+ &right_arg,
+ ],
+ &[0, 1], // 0: no changes, 1: differences
+ )?;
+
+ if raw.trim().is_empty() {
+ continue;
+ }
+ let rewritten = self.rewrite_diff_paths(&raw);
+ if !rewritten.trim().is_empty() {
+ if !aggregated.is_empty() && !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ aggregated.push_str(&rewritten);
+ }
+ }
+
+ self.unified_diff = if aggregated.trim().is_empty() {
+ None
+ } else {
+ Some(aggregated)
+ };
+
+ // Clean up the current dir.
+ let _ = fs::remove_dir_all(&current_dir);
+
+ Ok(self.unified_diff.clone())
+ }
+
+ fn baseline_dir(&self) -> Result<&Path> {
+ self.baseline_files_dir
+ .as_ref()
+ .map(|d| d.path())
+ .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized"))
+ }
+
+ fn ensure_baseline_dir(&mut self) -> Result<()> {
+ if self.baseline_files_dir.is_some() {
+ return Ok(());
+ }
+ let tmp = TempDir::new().context("create baseline temp dir")?;
+ self.baseline_files_dir = Some(tmp);
+ Ok(())
+ }
+
+ /// Rewrites the internal filenames to external paths in diff headers.
+ /// Handles inputs like:
+ /// diff --git a/<uuid> b/current/<uuid>
+ /// --- a/<uuid> | /dev/null
+ /// +++ b/current/<uuid> | /dev/null
+ /// and replaces uuid with the external paths tracking baseline/current.
+ fn rewrite_diff_paths(&self, diff: &str) -> String {
+ let mut out = String::new();
+ for line in diff.lines() {
+ if let Some(rest) = line.strip_prefix("diff --git ") {
+ // Format: diff --git a/<f> b/<f>
+ let parts: Vec<&str> = rest.split_whitespace().collect();
+ if parts.len() == 2 {
+ let a = parts[0].strip_prefix("a/").unwrap_or(parts[0]);
+ let b = parts[1].strip_prefix("b/").unwrap_or(parts[1]);
+
+ let a_ext_display = if a == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let a_base = Path::new(a)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(a);
+ let mapped = self
+ .temp_name_to_baseline_external
+ .get(a_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(a));
+ mapped.display().to_string()
+ };
+
+ let b_ext_display = if b == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let b_base = Path::new(b)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(b);
+ let mapped = self
+ .temp_name_to_current_external
+ .get(b_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(b));
+ mapped.display().to_string()
+ };
+
+ out.push_str(&format!("diff --git a/{a_ext_display} b/{b_ext_display}\n"));
+ continue;
+ }
+ }
+ if let Some(rest) = line.strip_prefix("--- ") {
+ if let Some(path) = rest.strip_prefix("a/") {
+ let external_display = if path == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let p_base = Path::new(path)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(path);
+ self.temp_name_to_baseline_external
+ .get(p_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(path))
+ .display()
+ .to_string()
+ };
+ out.push_str(&format!("--- {external_display}\n"));
+ continue;
+ }
+ }
+ if let Some(rest) = line.strip_prefix("+++ ") {
+ if let Some(path) = rest.strip_prefix("b/") {
+ let external_display = if path == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let p_base = Path::new(path)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(path);
+ self.temp_name_to_current_external
+ .get(p_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(path))
+ .display()
+ .to_string()
+ };
+ out.push_str(&format!("+++ {external_display}\n"));
+ continue;
+ }
+ }
+ out.push_str(line);
+ out.push('\n');
+ }
+ out
+ }
+}
+
+fn uuid_filename_for(path: &Path) -> String {
+ let id = Uuid::new_v4().to_string();
+ match path.extension().and_then(|e| e.to_str()) {
+ Some(ext) if !ext.is_empty() => format!("{id}.{ext}"),
+ _ => id,
+ }
+}
+
+fn run_git_allow_exit_codes(
+ repo: &Path,
+ args: &[&str],
+ allowed_exit_codes: &[i32],
+) -> Result<String> {
+ let output = Command::new("git")
+ .current_dir(repo)
+ .args(args)
+ .output()
+ .with_context(|| format!("failed to run git {:?} in {}", args, repo.display()))?;
+ let code = output.status.code().unwrap_or(-1);
+ if !allowed_exit_codes.contains(&code) {
+ anyhow::bail!(
+ "git {:?} failed with status {:?}: {}",
+ args,
+ output.status,
+ String::from_utf8_lossy(&output.stderr)
+ );
+ }
+ Ok(String::from_utf8_lossy(&output.stdout).into_owned())
+}
+
+#[cfg(test)]
+mod tests {
+ #![allow(clippy::unwrap_used)]
Can you also test that paths with spaces work as intended?
- Created: 2025-08-01 17:15:26 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248479943
@@ -0,0 +1,476 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use tempfile::TempDir;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Create a temp directory to store baseline snapshots of files when they are first seen.
+/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk.
+/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null).
+/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs.
+/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using
+/// `git diff --no-index` and rewrite paths to external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Temp directory holding baseline snapshots of files as first seen.
+ baseline_files_dir: Option<TempDir>,
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> external path as of baseline snapshot.
+ temp_name_to_baseline_external: HashMap<String, PathBuf>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_external: HashMap<String, PathBuf>,
+ /// Aggregated unified diff for all accumulated changes across files.
+ pub unified_diff: Option<String>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates a baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ self.ensure_baseline_dir()?;
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_baseline_external
+ .insert(internal.clone(), path.clone());
+ self.temp_name_to_current_external
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ if path.exists() {
+ let contents = fs::read(path)
+ .with_context(|| format!("failed to read original {}", path.display()))?;
+ let internal_path = baseline_dir.join(&internal);
+ fs::write(&internal_path, contents).with_context(|| {
+ format!("failed to write baseline file {}", internal_path.display())
+ })?;
+ }
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ let move_path = match change {
+ FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } => Some(dest),
+ _ => None,
+ };
+ if let Some(dest) = move_path {
This could be folded into the above `match` statement?
- Created: 2025-08-01 17:17:12 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248482907
@@ -0,0 +1,476 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use tempfile::TempDir;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Create a temp directory to store baseline snapshots of files when they are first seen.
+/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk.
+/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null).
+/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs.
+/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using
+/// `git diff --no-index` and rewrite paths to external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Temp directory holding baseline snapshots of files as first seen.
+ baseline_files_dir: Option<TempDir>,
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> external path as of baseline snapshot.
+ temp_name_to_baseline_external: HashMap<String, PathBuf>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_external: HashMap<String, PathBuf>,
+ /// Aggregated unified diff for all accumulated changes across files.
+ pub unified_diff: Option<String>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates a baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ self.ensure_baseline_dir()?;
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_baseline_external
+ .insert(internal.clone(), path.clone());
+ self.temp_name_to_current_external
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ if path.exists() {
+ let contents = fs::read(path)
+ .with_context(|| format!("failed to read original {}", path.display()))?;
+ let internal_path = baseline_dir.join(&internal);
+ fs::write(&internal_path, contents).with_context(|| {
+ format!("failed to write baseline file {}", internal_path.display())
+ })?;
+ }
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ let move_path = match change {
+ FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } => Some(dest),
+ _ => None,
+ };
+ if let Some(dest) = move_path {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.external_to_temp_name.insert(path.clone(), i.clone());
+ self.temp_name_to_baseline_external
+ .insert(i.clone(), path.clone());
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_external
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Recompute the aggregated unified diff by comparing all baseline snapshots against
+ /// current files on disk using `git diff --no-index` and rewriting paths to external paths.
+ pub fn update_and_get_unified_diff(&mut self) -> Result<Option<String>> {
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+ let current_dir = baseline_dir.join("current");
+ if current_dir.exists() {
+ // Best-effort cleanup of previous run's mirror.
+ let _ = fs::remove_dir_all(¤t_dir);
+ }
+ fs::create_dir_all(¤t_dir).with_context(|| {
+ format!(
+ "failed to create current mirror dir {}",
+ current_dir.display()
+ )
+ })?;
+
+ let mut aggregated = String::new();
+
+ // Compute diffs per tracked internal file.
+ for (internal, baseline_external) in &self.temp_name_to_baseline_external {
+ let baseline_path = baseline_dir.join(internal);
+ let current_external = self
+ .temp_name_to_current_external
+ .get(internal)
+ .cloned()
+ .unwrap_or_else(|| baseline_external.clone());
+
+ let left_is_dev_null = !baseline_path.exists();
+ let right_exists = current_external.exists();
+
+ // Prepare right side mirror file if exists; otherwise use /dev/null for deletions.
+ let right_arg = if right_exists {
+ let mirror_path = current_dir.join(internal);
+ let contents = fs::read(¤t_external).with_context(|| {
+ format!(
+ "failed to read current file for diff {}",
+ current_external.display()
+ )
+ })?;
+ fs::write(&mirror_path, contents).with_context(|| {
+ format!(
+ "failed to write current mirror file {}",
+ mirror_path.display()
+ )
+ })?;
+ // Use relative path from baseline_dir (so headers say a/<uuid> b/current/<uuid>).
+ format!("current/{internal}")
+ } else {
+ // Deletion: right side is /dev/null to show proper deleted file diff.
+ "/dev/null".to_string()
+ };
+
+ // Prepare left arg: baseline file path or /dev/null for additions.
+ let left_arg = if left_is_dev_null {
+ "/dev/null".to_string()
+ } else {
+ internal.clone()
+ };
+
+ // Run git diff --no-index from baseline_dir to keep paths predictable.
+ let raw = run_git_allow_exit_codes(
+ &baseline_dir,
+ &[
+ "-c",
+ "color.ui=false",
+ "diff",
+ "--no-color",
+ "--no-index",
+ "--",
+ &left_arg,
+ &right_arg,
+ ],
+ &[0, 1], // 0: no changes, 1: differences
+ )?;
+
+ if raw.trim().is_empty() {
+ continue;
+ }
+ let rewritten = self.rewrite_diff_paths(&raw);
+ if !rewritten.trim().is_empty() {
+ if !aggregated.is_empty() && !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ aggregated.push_str(&rewritten);
+ }
+ }
+
+ self.unified_diff = if aggregated.trim().is_empty() {
+ None
+ } else {
+ Some(aggregated)
+ };
+
+ // Clean up the current dir.
+ let _ = fs::remove_dir_all(¤t_dir);
+
+ Ok(self.unified_diff.clone())
+ }
+
+ fn baseline_dir(&self) -> Result<&Path> {
+ self.baseline_files_dir
+ .as_ref()
+ .map(|d| d.path())
+ .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized"))
+ }
+
+ fn ensure_baseline_dir(&mut self) -> Result<()> {
+ if self.baseline_files_dir.is_some() {
+ return Ok(());
+ }
+ let tmp = TempDir::new().context("create baseline temp dir")?;
+ self.baseline_files_dir = Some(tmp);
+ Ok(())
+ }
+
+ /// Rewrites the internal filenames to external paths in diff headers.
+ /// Handles inputs like:
+ /// diff --git a/<uuid> b/current/<uuid>
+ /// --- a/<uuid> | /dev/null
+ /// +++ b/current/<uuid> | /dev/null
+ /// and replaces uuid with the external paths tracking baseline/current.
+ fn rewrite_diff_paths(&self, diff: &str) -> String {
+ let mut out = String::new();
+ for line in diff.lines() {
+ if let Some(rest) = line.strip_prefix("diff --git ") {
+ // Format: diff --git a/<f> b/<f>
+ let parts: Vec<&str> = rest.split_whitespace().collect();
+ if parts.len() == 2 {
+ let a = parts[0].strip_prefix("a/").unwrap_or(parts[0]);
+ let b = parts[1].strip_prefix("b/").unwrap_or(parts[1]);
+
+ let a_ext_display = if a == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let a_base = Path::new(a)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(a);
+ let mapped = self
+ .temp_name_to_baseline_external
+ .get(a_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(a));
+ mapped.display().to_string()
+ };
+
+ let b_ext_display = if b == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let b_base = Path::new(b)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(b);
+ let mapped = self
+ .temp_name_to_current_external
+ .get(b_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(b));
+ mapped.display().to_string()
+ };
+
+ out.push_str(&format!("diff --git a/{a_ext_display} b/{b_ext_display}\n"));
+ continue;
+ }
+ }
+ if let Some(rest) = line.strip_prefix("--- ") {
+ if let Some(path) = rest.strip_prefix("a/") {
+ let external_display = if path == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let p_base = Path::new(path)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(path);
+ self.temp_name_to_baseline_external
+ .get(p_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(path))
+ .display()
+ .to_string()
+ };
+ out.push_str(&format!("--- {external_display}\n"));
+ continue;
+ }
+ }
+ if let Some(rest) = line.strip_prefix("+++ ") {
+ if let Some(path) = rest.strip_prefix("b/") {
+ let external_display = if path == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let p_base = Path::new(path)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(path);
+ self.temp_name_to_current_external
+ .get(p_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(path))
+ .display()
+ .to_string()
+ };
+ out.push_str(&format!("+++ {external_display}\n"));
+ continue;
+ }
+ }
+ out.push_str(line);
+ out.push('\n');
+ }
+ out
+ }
+}
+
+fn uuid_filename_for(path: &Path) -> String {
+ let id = Uuid::new_v4().to_string();
+ match path.extension().and_then(|e| e.to_str()) {
+ Some(ext) if !ext.is_empty() => format!("{id}.{ext}"),
+ _ => id,
+ }
+}
+
+fn run_git_allow_exit_codes(
From https://github.com/openai/codex/pull/1747 I would include:
`let envs = vec![("GIT_CONFIG_GLOBAL", "/dev/null"), ("GIT_CONFIG_NOSYSTEM", "1")];`
- Created: 2025-08-01 17:20:55 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248489326
@@ -0,0 +1,476 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use tempfile::TempDir;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Create a temp directory to store baseline snapshots of files when they are first seen.
+/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk.
+/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null).
+/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs.
+/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using
+/// `git diff --no-index` and rewrite paths to external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Temp directory holding baseline snapshots of files as first seen.
+ baseline_files_dir: Option<TempDir>,
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> external path as of baseline snapshot.
+ temp_name_to_baseline_external: HashMap<String, PathBuf>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_external: HashMap<String, PathBuf>,
+ /// Aggregated unified diff for all accumulated changes across files.
+ pub unified_diff: Option<String>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates a baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ self.ensure_baseline_dir()?;
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_baseline_external
+ .insert(internal.clone(), path.clone());
+ self.temp_name_to_current_external
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ if path.exists() {
+ let contents = fs::read(path)
+ .with_context(|| format!("failed to read original {}", path.display()))?;
+ let internal_path = baseline_dir.join(&internal);
+ fs::write(&internal_path, contents).with_context(|| {
+ format!("failed to write baseline file {}", internal_path.display())
+ })?;
+ }
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ let move_path = match change {
+ FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } => Some(dest),
+ _ => None,
+ };
+ if let Some(dest) = move_path {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.external_to_temp_name.insert(path.clone(), i.clone());
+ self.temp_name_to_baseline_external
+ .insert(i.clone(), path.clone());
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_external
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Recompute the aggregated unified diff by comparing all baseline snapshots against
Can you expand this comment? I don't have a great mental model of the structure you're trying to set up for the ultimate `git diff` call. I want to understand why this isn't something simpler like `diff -u backed-up-file current-file`.
- Created: 2025-08-04 02:45:51 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250292032
@@ -0,0 +1,766 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+struct BaselineFileInfo {
+ path: Option<PathBuf>,
+ contents_bytes: Option<Vec<u8>>,
+ mode: Option<String>,
+ oid: Option<String>,
+}
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Maintain an in-memory baseline snapshot of files when they are first seen.
+/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null).
+/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking.
+/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory
+/// using the `similar` crate and emit unified diffs with rewritten external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> baseline file info.
+ baseline_file_info: HashMap<String, BaselineFileInfo>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_path: HashMap<String, PathBuf>,
+ /// Cache of known git worktree roots to avoid repeated filesystem walks.
+ git_root_cache: Vec<PathBuf>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_current_path
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ let (contents_bytes, mode, oid) = if path.exists() {
+ let mode = file_mode_for_path(path);
+ let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE);
+ let contents_bytes = blob_bytes(path, mode_str)
+ .unwrap_or_default()
+ .unwrap_or_default();
+ let oid = if mode.as_deref() == Some(SYMLINK_MODE) {
+ git_blob_sha1_hex_bytes(&contents_bytes)
+ } else {
+ self.git_blob_oid_for_path(path)
+ .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes))
+ };
+ (Some(contents_bytes), mode, Some(oid))
+ } else {
+ (None, None, Some(ZERO_OID.to_string()))
+ };
+
+ self.baseline_file_info.insert(
+ internal.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes,
+ mode,
+ oid,
+ },
+ );
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ if let FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } = change
+ {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.baseline_file_info.insert(
+ i.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes: None,
+ mode: None,
+ oid: Some(ZERO_OID.to_string()),
+ },
+ );
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_path
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ };
+ }
+
+ Ok(())
I don't see a `?` or a place where the `Err` variant is constructed, so does this need to return `Result`?
- Created: 2025-08-04 02:57:10 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250301151
@@ -0,0 +1,766 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+struct BaselineFileInfo {
+ path: Option<PathBuf>,
+ contents_bytes: Option<Vec<u8>>,
I think the field name `file_contents` implies bytes given the type.
- Created: 2025-08-04 02:57:49 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250301670
@@ -0,0 +1,766 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+struct BaselineFileInfo {
+ path: Option<PathBuf>,
+ contents_bytes: Option<Vec<u8>>,
+ mode: Option<String>,
I think an `enum` instead of a `String` would be clearer here.
- Created: 2025-08-04 02:58:20 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250302031
@@ -0,0 +1,766 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+struct BaselineFileInfo {
+ path: Option<PathBuf>,
Add a comment since it is surprising that `path` could be `None`?
- Created: 2025-08-04 02:59:46 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250303137
@@ -0,0 +1,766 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+struct BaselineFileInfo {
+ path: Option<PathBuf>,
+ contents_bytes: Option<Vec<u8>>,
+ mode: Option<String>,
+ oid: Option<String>,
+}
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Maintain an in-memory baseline snapshot of files when they are first seen.
+/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null).
+/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking.
+/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory
+/// using the `similar` crate and emit unified diffs with rewritten external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> baseline file info.
+ baseline_file_info: HashMap<String, BaselineFileInfo>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_path: HashMap<String, PathBuf>,
+ /// Cache of known git worktree roots to avoid repeated filesystem walks.
+ git_root_cache: Vec<PathBuf>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_current_path
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ let (contents_bytes, mode, oid) = if path.exists() {
+ let mode = file_mode_for_path(path);
+ let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE);
If `mode` becomes an `enum`, using `match` would be cleaner here.
- Created: 2025-08-04 03:00:33 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250303694
@@ -0,0 +1,766 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+struct BaselineFileInfo {
+ path: Option<PathBuf>,
I don't see a case where it is `None` in the code, but maybe I'm missing something?
- Created: 2025-08-04 03:03:14 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250305839
@@ -0,0 +1,766 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+struct BaselineFileInfo {
+ path: Option<PathBuf>,
+ contents_bytes: Option<Vec<u8>>,
+ mode: Option<String>,
+ oid: Option<String>,
+}
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Maintain an in-memory baseline snapshot of files when they are first seen.
+/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null).
+/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking.
+/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory
+/// using the `similar` crate and emit unified diffs with rewritten external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> baseline file info.
+ baseline_file_info: HashMap<String, BaselineFileInfo>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_path: HashMap<String, PathBuf>,
+ /// Cache of known git worktree roots to avoid repeated filesystem walks.
+ git_root_cache: Vec<PathBuf>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_current_path
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ let (contents_bytes, mode, oid) = if path.exists() {
+ let mode = file_mode_for_path(path);
+ let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE);
+ let contents_bytes = blob_bytes(path, mode_str)
+ .unwrap_or_default()
+ .unwrap_or_default();
+ let oid = if mode.as_deref() == Some(SYMLINK_MODE) {
+ git_blob_sha1_hex_bytes(&contents_bytes)
+ } else {
+ self.git_blob_oid_for_path(path)
+ .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes))
+ };
+ (Some(contents_bytes), mode, Some(oid))
+ } else {
+ (None, None, Some(ZERO_OID.to_string()))
+ };
+
+ self.baseline_file_info.insert(
+ internal.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes,
+ mode,
+ oid,
+ },
+ );
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ if let FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } = change
+ {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.baseline_file_info.insert(
+ i.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes: None,
+ mode: None,
+ oid: Some(ZERO_OID.to_string()),
+ },
+ );
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_path
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ };
+ }
+
+ Ok(())
+ }
+
+ fn get_path_for_internal(&self, internal: &str) -> Option<PathBuf> {
+ self.temp_name_to_current_path
+ .get(internal)
+ .cloned()
+ .or_else(|| {
+ self.baseline_file_info
+ .get(internal)
+ .and_then(|info| info.path.clone())
+ })
+ }
+
+ /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry.
+ /// Uses a simple cache of known roots and avoids negative-result caching for simplicity.
+ fn find_git_root_cached(&mut self, start: &Path) -> Option<PathBuf> {
+ let dir = if start.is_dir() {
+ start
+ } else {
+ start.parent()?
+ };
+
+ // Fast path: if any cached root is an ancestor of this path, use it.
+ if let Some(root) = self
+ .git_root_cache
+ .iter()
+ .find(|r| dir.starts_with(r))
+ .cloned()
+ {
+ return Some(root);
+ }
+
+ // Walk up to find a `.git` marker.
+ let mut cur = dir.to_path_buf();
+ loop {
+ let git_marker = cur.join(".git");
+ if git_marker.is_dir() || git_marker.is_file() {
+ if !self.git_root_cache.iter().any(|r| r == &cur) {
+ self.git_root_cache.push(cur.clone());
+ }
+ return Some(cur);
+ }
+
+ // On Windows, avoid walking above the drive or UNC share root.
+ #[cfg(windows)]
+ {
+ if is_windows_drive_or_unc_root(&cur) {
+ return None;
+ }
+ }
+
+ if let Some(parent) = cur.parent() {
+ cur = parent.to_path_buf();
+ } else {
+ return None;
+ }
+ }
+ }
+
+ /// Return a display string for `path` relative to its git root if found, else absolute.
+ fn relative_to_git_root_str(&mut self, path: &Path) -> String {
+ let s = if let Some(root) = self.find_git_root_cached(path) {
+ if let Ok(rel) = path.strip_prefix(&root) {
+ rel.display().to_string()
+ } else {
+ path.display().to_string()
+ }
+ } else {
+ path.display().to_string()
+ };
+ s.replace('\\', "/")
+ }
+
+ /// Ask git to compute the blob SHA-1 for the file at `path` within its repository.
+ /// Returns None if no repository is found or git invocation fails.
+ fn git_blob_oid_for_path(&mut self, path: &Path) -> Option<String> {
+ let root = self.find_git_root_cached(path)?;
+ // Compute a path relative to the repo root for better portability across platforms.
+ let rel = path.strip_prefix(&root).unwrap_or(path);
+ let output = Command::new("git")
+ .arg("-C")
+ .arg(&root)
+ .arg("hash-object")
+ .arg("--")
+ .arg(rel)
+ .output()
+ .ok()?;
+ if !output.status.success() {
+ return None;
+ }
+ let s = String::from_utf8_lossy(&output.stdout).trim().to_string();
+ if s.len() == 40 { Some(s) } else { None }
If it is not 40, is this an error / unexpected situation?
- Created: 2025-08-04 03:03:31 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250306088
@@ -0,0 +1,766 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+struct BaselineFileInfo {
+ path: Option<PathBuf>,
+ contents_bytes: Option<Vec<u8>>,
+ mode: Option<String>,
+ oid: Option<String>,
+}
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Maintain an in-memory baseline snapshot of files when they are first seen.
+/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null).
+/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking.
+/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory
+/// using the `similar` crate and emit unified diffs with rewritten external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> baseline file info.
+ baseline_file_info: HashMap<String, BaselineFileInfo>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_path: HashMap<String, PathBuf>,
+ /// Cache of known git worktree roots to avoid repeated filesystem walks.
+ git_root_cache: Vec<PathBuf>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_current_path
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ let (contents_bytes, mode, oid) = if path.exists() {
+ let mode = file_mode_for_path(path);
+ let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE);
+ let contents_bytes = blob_bytes(path, mode_str)
+ .unwrap_or_default()
+ .unwrap_or_default();
+ let oid = if mode.as_deref() == Some(SYMLINK_MODE) {
+ git_blob_sha1_hex_bytes(&contents_bytes)
+ } else {
+ self.git_blob_oid_for_path(path)
+ .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes))
+ };
+ (Some(contents_bytes), mode, Some(oid))
+ } else {
+ (None, None, Some(ZERO_OID.to_string()))
+ };
+
+ self.baseline_file_info.insert(
+ internal.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes,
+ mode,
+ oid,
+ },
+ );
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ if let FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } = change
+ {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.baseline_file_info.insert(
+ i.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes: None,
+ mode: None,
+ oid: Some(ZERO_OID.to_string()),
+ },
+ );
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_path
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ };
+ }
+
+ Ok(())
+ }
+
+ fn get_path_for_internal(&self, internal: &str) -> Option<PathBuf> {
+ self.temp_name_to_current_path
+ .get(internal)
+ .cloned()
+ .or_else(|| {
+ self.baseline_file_info
+ .get(internal)
+ .and_then(|info| info.path.clone())
+ })
+ }
+
+ /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry.
+ /// Uses a simple cache of known roots and avoids negative-result caching for simplicity.
+ fn find_git_root_cached(&mut self, start: &Path) -> Option<PathBuf> {
+ let dir = if start.is_dir() {
+ start
+ } else {
+ start.parent()?
+ };
+
+ // Fast path: if any cached root is an ancestor of this path, use it.
+ if let Some(root) = self
+ .git_root_cache
+ .iter()
+ .find(|r| dir.starts_with(r))
+ .cloned()
+ {
+ return Some(root);
+ }
+
+ // Walk up to find a `.git` marker.
+ let mut cur = dir.to_path_buf();
+ loop {
+ let git_marker = cur.join(".git");
+ if git_marker.is_dir() || git_marker.is_file() {
+ if !self.git_root_cache.iter().any(|r| r == &cur) {
+ self.git_root_cache.push(cur.clone());
+ }
+ return Some(cur);
+ }
+
+ // On Windows, avoid walking above the drive or UNC share root.
+ #[cfg(windows)]
+ {
+ if is_windows_drive_or_unc_root(&cur) {
+ return None;
+ }
+ }
+
+ if let Some(parent) = cur.parent() {
+ cur = parent.to_path_buf();
+ } else {
+ return None;
+ }
+ }
+ }
+
+ /// Return a display string for `path` relative to its git root if found, else absolute.
+ fn relative_to_git_root_str(&mut self, path: &Path) -> String {
+ let s = if let Some(root) = self.find_git_root_cached(path) {
+ if let Ok(rel) = path.strip_prefix(&root) {
+ rel.display().to_string()
+ } else {
+ path.display().to_string()
+ }
+ } else {
+ path.display().to_string()
+ };
+ s.replace('\\', "/")
+ }
+
+ /// Ask git to compute the blob SHA-1 for the file at `path` within its repository.
+ /// Returns None if no repository is found or git invocation fails.
+ fn git_blob_oid_for_path(&mut self, path: &Path) -> Option<String> {
+ let root = self.find_git_root_cached(path)?;
+ // Compute a path relative to the repo root for better portability across platforms.
+ let rel = path.strip_prefix(&root).unwrap_or(path);
+ let output = Command::new("git")
+ .arg("-C")
+ .arg(&root)
+ .arg("hash-object")
+ .arg("--")
+ .arg(rel)
+ .output()
+ .ok()?;
+ if !output.status.success() {
Is this something expected or exceptional?
- Created: 2025-08-04 03:03:57 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250306484
@@ -0,0 +1,766 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+struct BaselineFileInfo {
+ path: Option<PathBuf>,
+ contents_bytes: Option<Vec<u8>>,
+ mode: Option<String>,
+ oid: Option<String>,
+}
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Maintain an in-memory baseline snapshot of files when they are first seen.
+/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null).
+/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking.
+/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory
+/// using the `similar` crate and emit unified diffs with rewritten external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> baseline file info.
+ baseline_file_info: HashMap<String, BaselineFileInfo>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_path: HashMap<String, PathBuf>,
+ /// Cache of known git worktree roots to avoid repeated filesystem walks.
+ git_root_cache: Vec<PathBuf>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_current_path
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ let (contents_bytes, mode, oid) = if path.exists() {
+ let mode = file_mode_for_path(path);
+ let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE);
+ let contents_bytes = blob_bytes(path, mode_str)
+ .unwrap_or_default()
+ .unwrap_or_default();
+ let oid = if mode.as_deref() == Some(SYMLINK_MODE) {
+ git_blob_sha1_hex_bytes(&contents_bytes)
+ } else {
+ self.git_blob_oid_for_path(path)
+ .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes))
+ };
+ (Some(contents_bytes), mode, Some(oid))
+ } else {
+ (None, None, Some(ZERO_OID.to_string()))
+ };
+
+ self.baseline_file_info.insert(
+ internal.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes,
+ mode,
+ oid,
+ },
+ );
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ if let FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } = change
+ {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.baseline_file_info.insert(
+ i.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes: None,
+ mode: None,
+ oid: Some(ZERO_OID.to_string()),
+ },
+ );
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_path
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ };
+ }
+
+ Ok(())
+ }
+
+ fn get_path_for_internal(&self, internal: &str) -> Option<PathBuf> {
+ self.temp_name_to_current_path
+ .get(internal)
+ .cloned()
+ .or_else(|| {
+ self.baseline_file_info
+ .get(internal)
+ .and_then(|info| info.path.clone())
+ })
+ }
+
+ /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry.
+ /// Uses a simple cache of known roots and avoids negative-result caching for simplicity.
+ fn find_git_root_cached(&mut self, start: &Path) -> Option<PathBuf> {
+ let dir = if start.is_dir() {
+ start
+ } else {
+ start.parent()?
+ };
+
+ // Fast path: if any cached root is an ancestor of this path, use it.
+ if let Some(root) = self
+ .git_root_cache
+ .iter()
+ .find(|r| dir.starts_with(r))
+ .cloned()
+ {
+ return Some(root);
+ }
+
+ // Walk up to find a `.git` marker.
+ let mut cur = dir.to_path_buf();
+ loop {
+ let git_marker = cur.join(".git");
+ if git_marker.is_dir() || git_marker.is_file() {
+ if !self.git_root_cache.iter().any(|r| r == &cur) {
+ self.git_root_cache.push(cur.clone());
+ }
+ return Some(cur);
+ }
+
+ // On Windows, avoid walking above the drive or UNC share root.
+ #[cfg(windows)]
+ {
+ if is_windows_drive_or_unc_root(&cur) {
+ return None;
+ }
+ }
+
+ if let Some(parent) = cur.parent() {
+ cur = parent.to_path_buf();
+ } else {
+ return None;
+ }
+ }
+ }
+
+ /// Return a display string for `path` relative to its git root if found, else absolute.
+ fn relative_to_git_root_str(&mut self, path: &Path) -> String {
+ let s = if let Some(root) = self.find_git_root_cached(path) {
+ if let Ok(rel) = path.strip_prefix(&root) {
+ rel.display().to_string()
+ } else {
+ path.display().to_string()
+ }
+ } else {
+ path.display().to_string()
+ };
+ s.replace('\\', "/")
+ }
+
+ /// Ask git to compute the blob SHA-1 for the file at `path` within its repository.
+ /// Returns None if no repository is found or git invocation fails.
+ fn git_blob_oid_for_path(&mut self, path: &Path) -> Option<String> {
+ let root = self.find_git_root_cached(path)?;
+ // Compute a path relative to the repo root for better portability across platforms.
+ let rel = path.strip_prefix(&root).unwrap_or(path);
+ let output = Command::new("git")
+ .arg("-C")
+ .arg(&root)
+ .arg("hash-object")
+ .arg("--")
+ .arg(rel)
+ .output()
+ .ok()?;
+ if !output.status.success() {
+ return None;
+ }
+ let s = String::from_utf8_lossy(&output.stdout).trim().to_string();
+ if s.len() == 40 { Some(s) } else { None }
+ }
+
+ /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were
+ /// collected before the first time they were touched by apply_patch during this turn with
+ /// the current repo state.
+ pub fn get_unified_diff(&mut self) -> Result<Option<String>> {
+ let mut aggregated = String::new();
+
+ // Compute diffs per tracked internal file in a stable order by external path.
+ let mut baseline_file_names: Vec<String> =
+ self.baseline_file_info.keys().cloned().collect();
+ // Sort lexicographically by full repo-relative path to match git behavior.
+ baseline_file_names.sort_by_key(|internal| {
👍
- Created: 2025-08-04 03:05:24 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250307567
@@ -0,0 +1,766 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+struct BaselineFileInfo {
+ path: Option<PathBuf>,
+ contents_bytes: Option<Vec<u8>>,
+ mode: Option<String>,
+ oid: Option<String>,
+}
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Maintain an in-memory baseline snapshot of files when they are first seen.
+/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null).
+/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking.
+/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory
+/// using the `similar` crate and emit unified diffs with rewritten external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> baseline file info.
+ baseline_file_info: HashMap<String, BaselineFileInfo>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_path: HashMap<String, PathBuf>,
+ /// Cache of known git worktree roots to avoid repeated filesystem walks.
+ git_root_cache: Vec<PathBuf>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_current_path
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ let (contents_bytes, mode, oid) = if path.exists() {
+ let mode = file_mode_for_path(path);
+ let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE);
+ let contents_bytes = blob_bytes(path, mode_str)
+ .unwrap_or_default()
+ .unwrap_or_default();
+ let oid = if mode.as_deref() == Some(SYMLINK_MODE) {
+ git_blob_sha1_hex_bytes(&contents_bytes)
+ } else {
+ self.git_blob_oid_for_path(path)
+ .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes))
+ };
+ (Some(contents_bytes), mode, Some(oid))
+ } else {
+ (None, None, Some(ZERO_OID.to_string()))
+ };
+
+ self.baseline_file_info.insert(
+ internal.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes,
+ mode,
+ oid,
+ },
+ );
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ if let FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } = change
+ {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.baseline_file_info.insert(
+ i.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes: None,
+ mode: None,
+ oid: Some(ZERO_OID.to_string()),
+ },
+ );
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_path
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ };
+ }
+
+ Ok(())
+ }
+
+ fn get_path_for_internal(&self, internal: &str) -> Option<PathBuf> {
+ self.temp_name_to_current_path
+ .get(internal)
+ .cloned()
+ .or_else(|| {
+ self.baseline_file_info
+ .get(internal)
+ .and_then(|info| info.path.clone())
+ })
+ }
+
+ /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry.
+ /// Uses a simple cache of known roots and avoids negative-result caching for simplicity.
+ fn find_git_root_cached(&mut self, start: &Path) -> Option<PathBuf> {
+ let dir = if start.is_dir() {
+ start
+ } else {
+ start.parent()?
+ };
+
+ // Fast path: if any cached root is an ancestor of this path, use it.
+ if let Some(root) = self
+ .git_root_cache
+ .iter()
+ .find(|r| dir.starts_with(r))
+ .cloned()
+ {
+ return Some(root);
+ }
+
+ // Walk up to find a `.git` marker.
+ let mut cur = dir.to_path_buf();
+ loop {
+ let git_marker = cur.join(".git");
+ if git_marker.is_dir() || git_marker.is_file() {
+ if !self.git_root_cache.iter().any(|r| r == &cur) {
+ self.git_root_cache.push(cur.clone());
+ }
+ return Some(cur);
+ }
+
+ // On Windows, avoid walking above the drive or UNC share root.
+ #[cfg(windows)]
+ {
+ if is_windows_drive_or_unc_root(&cur) {
+ return None;
+ }
+ }
+
+ if let Some(parent) = cur.parent() {
+ cur = parent.to_path_buf();
+ } else {
+ return None;
+ }
+ }
+ }
+
+ /// Return a display string for `path` relative to its git root if found, else absolute.
+ fn relative_to_git_root_str(&mut self, path: &Path) -> String {
+ let s = if let Some(root) = self.find_git_root_cached(path) {
+ if let Ok(rel) = path.strip_prefix(&root) {
+ rel.display().to_string()
+ } else {
+ path.display().to_string()
+ }
+ } else {
+ path.display().to_string()
+ };
+ s.replace('\\', "/")
+ }
+
+ /// Ask git to compute the blob SHA-1 for the file at `path` within its repository.
+ /// Returns None if no repository is found or git invocation fails.
+ fn git_blob_oid_for_path(&mut self, path: &Path) -> Option<String> {
+ let root = self.find_git_root_cached(path)?;
+ // Compute a path relative to the repo root for better portability across platforms.
+ let rel = path.strip_prefix(&root).unwrap_or(path);
+ let output = Command::new("git")
+ .arg("-C")
+ .arg(&root)
+ .arg("hash-object")
+ .arg("--")
+ .arg(rel)
+ .output()
+ .ok()?;
+ if !output.status.success() {
+ return None;
+ }
+ let s = String::from_utf8_lossy(&output.stdout).trim().to_string();
+ if s.len() == 40 { Some(s) } else { None }
+ }
+
+ /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were
+ /// collected before the first time they were touched by apply_patch during this turn with
+ /// the current repo state.
+ pub fn get_unified_diff(&mut self) -> Result<Option<String>> {
+ let mut aggregated = String::new();
+
+ // Compute diffs per tracked internal file in a stable order by external path.
+ let mut baseline_file_names: Vec<String> =
+ self.baseline_file_info.keys().cloned().collect();
+ // Sort lexicographically by full repo-relative path to match git behavior.
+ baseline_file_names.sort_by_key(|internal| {
+ self.get_path_for_internal(internal)
+ .map(|p| self.relative_to_git_root_str(&p))
+ .unwrap_or_default()
+ });
+
+ for internal in baseline_file_names {
+ // Baseline external must exist for any tracked internal.
+ let baseline_external = match self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.path.clone())
+ {
+ Some(p) => p,
+ None => continue,
+ };
+ let current_external = match self.get_path_for_internal(&internal) {
+ Some(p) => p,
+ None => continue,
+ };
+
+ // Determine modes early; needed to read symlink content correctly.
+ let baseline_mode = self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.mode.clone())
+ .unwrap_or_else(|| REGULAR_MODE.to_string());
+            let current_mode =
+                file_mode_for_path(&current_external).unwrap_or_else(|| REGULAR_MODE.to_string());
+
+ let left_bytes = self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.contents_bytes.clone());
+
+            let right_bytes = blob_bytes(&current_external, &current_mode)?;
+
+ // Fast path: identical bytes or both missing.
+ if left_bytes.as_deref() == right_bytes.as_deref() {
+ continue;
+ }
+
+ let left_display = self.relative_to_git_root_str(&baseline_external);
+            let right_display = self.relative_to_git_root_str(&current_external);
+
+ // Emit a git-style header for better readability and parity with previous behavior.
+ aggregated.push_str(&format!("diff --git a/{left_display} b/{right_display}\n"));
+
+ let is_add = left_bytes.is_none() && right_bytes.is_some();
Could do
`match (left_bytes, right_bytes)` to ensure all cases are covered.
- Created: 2025-08-04 03:06:34 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250308415
@@ -0,0 +1,766 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+struct BaselineFileInfo {
+ path: Option<PathBuf>,
+ contents_bytes: Option<Vec<u8>>,
+ mode: Option<String>,
+ oid: Option<String>,
+}
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Maintain an in-memory baseline snapshot of files when they are first seen.
+/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null).
+/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking.
+/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory
+/// using the `similar` crate and emit unified diffs with rewritten external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> baseline file info.
+ baseline_file_info: HashMap<String, BaselineFileInfo>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_path: HashMap<String, PathBuf>,
+ /// Cache of known git worktree roots to avoid repeated filesystem walks.
+ git_root_cache: Vec<PathBuf>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_current_path
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ let (contents_bytes, mode, oid) = if path.exists() {
+ let mode = file_mode_for_path(path);
+ let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE);
+ let contents_bytes = blob_bytes(path, mode_str)
+ .unwrap_or_default()
+ .unwrap_or_default();
+ let oid = if mode.as_deref() == Some(SYMLINK_MODE) {
+ git_blob_sha1_hex_bytes(&contents_bytes)
+ } else {
+ self.git_blob_oid_for_path(path)
+ .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes))
+ };
+ (Some(contents_bytes), mode, Some(oid))
+ } else {
+ (None, None, Some(ZERO_OID.to_string()))
+ };
+
+ self.baseline_file_info.insert(
+ internal.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes,
+ mode,
+ oid,
+ },
+ );
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ if let FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } = change
+ {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.baseline_file_info.insert(
+ i.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes: None,
+ mode: None,
+ oid: Some(ZERO_OID.to_string()),
+ },
+ );
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_path
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ };
+ }
+
+ Ok(())
+ }
+
+ fn get_path_for_internal(&self, internal: &str) -> Option<PathBuf> {
+ self.temp_name_to_current_path
+ .get(internal)
+ .cloned()
+ .or_else(|| {
+ self.baseline_file_info
+ .get(internal)
+ .and_then(|info| info.path.clone())
+ })
+ }
+
+ /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry.
+ /// Uses a simple cache of known roots and avoids negative-result caching for simplicity.
+ fn find_git_root_cached(&mut self, start: &Path) -> Option<PathBuf> {
+ let dir = if start.is_dir() {
+ start
+ } else {
+ start.parent()?
+ };
+
+ // Fast path: if any cached root is an ancestor of this path, use it.
+ if let Some(root) = self
+ .git_root_cache
+ .iter()
+ .find(|r| dir.starts_with(r))
+ .cloned()
+ {
+ return Some(root);
+ }
+
+ // Walk up to find a `.git` marker.
+ let mut cur = dir.to_path_buf();
+ loop {
+ let git_marker = cur.join(".git");
+ if git_marker.is_dir() || git_marker.is_file() {
+ if !self.git_root_cache.iter().any(|r| r == &cur) {
+ self.git_root_cache.push(cur.clone());
+ }
+ return Some(cur);
+ }
+
+ // On Windows, avoid walking above the drive or UNC share root.
+ #[cfg(windows)]
+ {
+ if is_windows_drive_or_unc_root(&cur) {
+ return None;
+ }
+ }
+
+ if let Some(parent) = cur.parent() {
+ cur = parent.to_path_buf();
+ } else {
+ return None;
+ }
+ }
+ }
+
+ /// Return a display string for `path` relative to its git root if found, else absolute.
+ fn relative_to_git_root_str(&mut self, path: &Path) -> String {
+ let s = if let Some(root) = self.find_git_root_cached(path) {
+ if let Ok(rel) = path.strip_prefix(&root) {
+ rel.display().to_string()
+ } else {
+ path.display().to_string()
+ }
+ } else {
+ path.display().to_string()
+ };
+ s.replace('\\', "/")
+ }
+
+ /// Ask git to compute the blob SHA-1 for the file at `path` within its repository.
+ /// Returns None if no repository is found or git invocation fails.
+ fn git_blob_oid_for_path(&mut self, path: &Path) -> Option<String> {
+ let root = self.find_git_root_cached(path)?;
+ // Compute a path relative to the repo root for better portability across platforms.
+ let rel = path.strip_prefix(&root).unwrap_or(path);
+ let output = Command::new("git")
+ .arg("-C")
+ .arg(&root)
+ .arg("hash-object")
+ .arg("--")
+ .arg(rel)
+ .output()
+ .ok()?;
+ if !output.status.success() {
+ return None;
+ }
+ let s = String::from_utf8_lossy(&output.stdout).trim().to_string();
+ if s.len() == 40 { Some(s) } else { None }
+ }
+
+ /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were
+ /// collected before the first time they were touched by apply_patch during this turn with
+ /// the current repo state.
+ pub fn get_unified_diff(&mut self) -> Result<Option<String>> {
This function is quite long and feels like it would benefit from being broken up.
- Created: 2025-08-04 03:07:44 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250309286
@@ -0,0 +1,476 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use tempfile::TempDir;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Create a temp directory to store baseline snapshots of files when they are first seen.
+/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk.
+/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null).
+/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs.
+/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using
+/// `git diff --no-index` and rewrite paths to external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Temp directory holding baseline snapshots of files as first seen.
+ baseline_files_dir: Option<TempDir>,
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> external path as of baseline snapshot.
+ temp_name_to_baseline_external: HashMap<String, PathBuf>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_external: HashMap<String, PathBuf>,
+ /// Aggregated unified diff for all accumulated changes across files.
+ pub unified_diff: Option<String>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates a baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ self.ensure_baseline_dir()?;
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_baseline_external
+ .insert(internal.clone(), path.clone());
+ self.temp_name_to_current_external
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ if path.exists() {
+ let contents = fs::read(path)
+ .with_context(|| format!("failed to read original {}", path.display()))?;
+ let internal_path = baseline_dir.join(&internal);
+ fs::write(&internal_path, contents).with_context(|| {
+ format!("failed to write baseline file {}", internal_path.display())
+ })?;
+ }
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ let move_path = match change {
+ FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } => Some(dest),
+ _ => None,
+ };
+ if let Some(dest) = move_path {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.external_to_temp_name.insert(path.clone(), i.clone());
+ self.temp_name_to_baseline_external
+ .insert(i.clone(), path.clone());
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_external
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Recompute the aggregated unified diff by comparing all baseline snapshots against
+ /// current files on disk using `git diff --no-index` and rewriting paths to external paths.
+ pub fn update_and_get_unified_diff(&mut self) -> Result<Option<String>> {
+ let baseline_dir = self.baseline_dir()?.to_path_buf();
+ let current_dir = baseline_dir.join("current");
+ if current_dir.exists() {
+ // Best-effort cleanup of previous run's mirror.
+ let _ = fs::remove_dir_all(&current_dir);
+ }
+ fs::create_dir_all(&current_dir).with_context(|| {
+ format!(
+ "failed to create current mirror dir {}",
+ current_dir.display()
+ )
+ })?;
+
+ let mut aggregated = String::new();
+
+ // Compute diffs per tracked internal file.
+ for (internal, baseline_external) in &self.temp_name_to_baseline_external {
+ let baseline_path = baseline_dir.join(internal);
+ let current_external = self
+ .temp_name_to_current_external
+ .get(internal)
+ .cloned()
+ .unwrap_or_else(|| baseline_external.clone());
+
+ let left_is_dev_null = !baseline_path.exists();
+ let right_exists = current_external.exists();
+
+ // Prepare right side mirror file if exists; otherwise use /dev/null for deletions.
+ let right_arg = if right_exists {
+ let mirror_path = current_dir.join(internal);
+ let contents = fs::read(&current_external).with_context(|| {
+ format!(
+ "failed to read current file for diff {}",
+ current_external.display()
+ )
+ })?;
+ fs::write(&mirror_path, contents).with_context(|| {
+ format!(
+ "failed to write current mirror file {}",
+ mirror_path.display()
+ )
+ })?;
+ // Use relative path from baseline_dir (so headers say a/<uuid> b/current/<uuid>).
+ format!("current/{internal}")
+ } else {
+ // Deletion: right side is /dev/null to show proper deleted file diff.
+ "/dev/null".to_string()
+ };
+
+ // Prepare left arg: baseline file path or /dev/null for additions.
+ let left_arg = if left_is_dev_null {
+ "/dev/null".to_string()
+ } else {
+ internal.clone()
+ };
+
+ // Run git diff --no-index from baseline_dir to keep paths predictable.
+ let raw = run_git_allow_exit_codes(
+ &baseline_dir,
+ &[
+ "-c",
+ "color.ui=false",
+ "diff",
+ "--no-color",
+ "--no-index",
+ "--",
+ &left_arg,
+ &right_arg,
+ ],
+ &[0, 1], // 0: no changes, 1: differences
+ )?;
+
+ if raw.trim().is_empty() {
+ continue;
+ }
+ let rewritten = self.rewrite_diff_paths(&raw);
+ if !rewritten.trim().is_empty() {
+ if !aggregated.is_empty() && !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ aggregated.push_str(&rewritten);
+ }
+ }
+
+ self.unified_diff = if aggregated.trim().is_empty() {
+ None
+ } else {
+ Some(aggregated)
+ };
+
+ // Clean up the current dir.
+ let _ = fs::remove_dir_all(&current_dir);
+
+ Ok(self.unified_diff.clone())
+ }
+
+ fn baseline_dir(&self) -> Result<&Path> {
+ self.baseline_files_dir
+ .as_ref()
+ .map(|d| d.path())
+ .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized"))
+ }
+
+ fn ensure_baseline_dir(&mut self) -> Result<()> {
+ if self.baseline_files_dir.is_some() {
+ return Ok(());
+ }
+ let tmp = TempDir::new().context("create baseline temp dir")?;
+ self.baseline_files_dir = Some(tmp);
+ Ok(())
+ }
+
+ /// Rewrites the internal filenames to external paths in diff headers.
+ /// Handles inputs like:
+ /// diff --git a/<uuid> b/current/<uuid>
+ /// --- a/<uuid> | /dev/null
+ /// +++ b/current/<uuid> | /dev/null
+ /// and replaces uuid with the external paths tracking baseline/current.
+ fn rewrite_diff_paths(&self, diff: &str) -> String {
+ let mut out = String::new();
+ for line in diff.lines() {
+ if let Some(rest) = line.strip_prefix("diff --git ") {
+ // Format: diff --git a/<f> b/<f>
+ let parts: Vec<&str> = rest.split_whitespace().collect();
+ if parts.len() == 2 {
+ let a = parts[0].strip_prefix("a/").unwrap_or(parts[0]);
+ let b = parts[1].strip_prefix("b/").unwrap_or(parts[1]);
+
+ let a_ext_display = if a == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let a_base = Path::new(a)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(a);
+ let mapped = self
+ .temp_name_to_baseline_external
+ .get(a_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(a));
+ mapped.display().to_string()
+ };
+
+ let b_ext_display = if b == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let b_base = Path::new(b)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(b);
+ let mapped = self
+ .temp_name_to_current_external
+ .get(b_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(b));
+ mapped.display().to_string()
+ };
+
+ out.push_str(&format!("diff --git a/{a_ext_display} b/{b_ext_display}\n"));
+ continue;
+ }
+ }
+ if let Some(rest) = line.strip_prefix("--- ") {
+ if let Some(path) = rest.strip_prefix("a/") {
+ let external_display = if path == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let p_base = Path::new(path)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(path);
+ self.temp_name_to_baseline_external
+ .get(p_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(path))
+ .display()
+ .to_string()
+ };
+ out.push_str(&format!("--- {external_display}\n"));
+ continue;
+ }
+ }
+ if let Some(rest) = line.strip_prefix("+++ ") {
+ if let Some(path) = rest.strip_prefix("b/") {
+ let external_display = if path == "/dev/null" {
+ "/dev/null".to_string()
+ } else {
+ let p_base = Path::new(path)
+ .file_name()
+ .and_then(|s| s.to_str())
+ .unwrap_or(path);
+ self.temp_name_to_current_external
+ .get(p_base)
+ .cloned()
+ .unwrap_or_else(|| PathBuf::from(path))
+ .display()
+ .to_string()
+ };
+ out.push_str(&format!("+++ {external_display}\n"));
+ continue;
+ }
+ }
+ out.push_str(line);
+ out.push('\n');
+ }
+ out
+ }
+}
+
+fn uuid_filename_for(path: &Path) -> String {
+ let id = Uuid::new_v4().to_string();
+ match path.extension().and_then(|e| e.to_str()) {
+ Some(ext) if !ext.is_empty() => format!("{id}.{ext}"),
+ _ => id,
I'm still unclear why `ext` is added.
- Created: 2025-08-04 03:08:45 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250310045
@@ -0,0 +1,766 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+struct BaselineFileInfo {
+ path: Option<PathBuf>,
+ contents_bytes: Option<Vec<u8>>,
+ mode: Option<String>,
+ oid: Option<String>,
+}
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Maintain an in-memory baseline snapshot of files when they are first seen.
+/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null).
+/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking.
+/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory
+/// using the `similar` crate and emit unified diffs with rewritten external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> baseline file info.
+ baseline_file_info: HashMap<String, BaselineFileInfo>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_path: HashMap<String, PathBuf>,
+ /// Cache of known git worktree roots to avoid repeated filesystem walks.
+ git_root_cache: Vec<PathBuf>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_current_path
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ let (contents_bytes, mode, oid) = if path.exists() {
+ let mode = file_mode_for_path(path);
+ let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE);
+ let contents_bytes = blob_bytes(path, mode_str)
+ .unwrap_or_default()
+ .unwrap_or_default();
+ let oid = if mode.as_deref() == Some(SYMLINK_MODE) {
+ git_blob_sha1_hex_bytes(&contents_bytes)
+ } else {
+ self.git_blob_oid_for_path(path)
+ .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes))
+ };
+ (Some(contents_bytes), mode, Some(oid))
+ } else {
+ (None, None, Some(ZERO_OID.to_string()))
+ };
+
+ self.baseline_file_info.insert(
+ internal.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes,
+ mode,
+ oid,
+ },
+ );
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ if let FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } = change
+ {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.baseline_file_info.insert(
+ i.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes: None,
+ mode: None,
+ oid: Some(ZERO_OID.to_string()),
+ },
+ );
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_path
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ };
+ }
+
+ Ok(())
+ }
+
+ fn get_path_for_internal(&self, internal: &str) -> Option<PathBuf> {
+ self.temp_name_to_current_path
+ .get(internal)
+ .cloned()
+ .or_else(|| {
+ self.baseline_file_info
+ .get(internal)
+ .and_then(|info| info.path.clone())
+ })
+ }
+
+ /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry.
+ /// Uses a simple cache of known roots and avoids negative-result caching for simplicity.
+ fn find_git_root_cached(&mut self, start: &Path) -> Option<PathBuf> {
+ let dir = if start.is_dir() {
+ start
+ } else {
+ start.parent()?
+ };
+
+ // Fast path: if any cached root is an ancestor of this path, use it.
+ if let Some(root) = self
+ .git_root_cache
+ .iter()
+ .find(|r| dir.starts_with(r))
+ .cloned()
+ {
+ return Some(root);
+ }
+
+ // Walk up to find a `.git` marker.
+ let mut cur = dir.to_path_buf();
+ loop {
+ let git_marker = cur.join(".git");
+ if git_marker.is_dir() || git_marker.is_file() {
+ if !self.git_root_cache.iter().any(|r| r == &cur) {
+ self.git_root_cache.push(cur.clone());
+ }
+ return Some(cur);
+ }
+
+ // On Windows, avoid walking above the drive or UNC share root.
+ #[cfg(windows)]
+ {
+ if is_windows_drive_or_unc_root(&cur) {
+ return None;
+ }
+ }
+
+ if let Some(parent) = cur.parent() {
+ cur = parent.to_path_buf();
+ } else {
+ return None;
+ }
+ }
+ }
+
+ /// Return a display string for `path` relative to its git root if found, else absolute.
+ fn relative_to_git_root_str(&mut self, path: &Path) -> String {
+ let s = if let Some(root) = self.find_git_root_cached(path) {
+ if let Ok(rel) = path.strip_prefix(&root) {
+ rel.display().to_string()
+ } else {
+ path.display().to_string()
+ }
+ } else {
+ path.display().to_string()
+ };
+ s.replace('\\', "/")
+ }
+
+ /// Ask git to compute the blob SHA-1 for the file at `path` within its repository.
+ /// Returns None if no repository is found or git invocation fails.
+ fn git_blob_oid_for_path(&mut self, path: &Path) -> Option<String> {
+ let root = self.find_git_root_cached(path)?;
+ // Compute a path relative to the repo root for better portability across platforms.
+ let rel = path.strip_prefix(&root).unwrap_or(path);
+ let output = Command::new("git")
+ .arg("-C")
+ .arg(&root)
+ .arg("hash-object")
+ .arg("--")
+ .arg(rel)
+ .output()
+ .ok()?;
+ if !output.status.success() {
+ return None;
+ }
+ let s = String::from_utf8_lossy(&output.stdout).trim().to_string();
+ if s.len() == 40 { Some(s) } else { None }
+ }
+
+ /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were
+ /// collected before the first time they were touched by apply_patch during this turn with
+ /// the current repo state.
+ pub fn get_unified_diff(&mut self) -> Result<Option<String>> {
+ let mut aggregated = String::new();
+
+ // Compute diffs per tracked internal file in a stable order by external path.
+ let mut baseline_file_names: Vec<String> =
+ self.baseline_file_info.keys().cloned().collect();
+ // Sort lexicographically by full repo-relative path to match git behavior.
+ baseline_file_names.sort_by_key(|internal| {
+ self.get_path_for_internal(internal)
+ .map(|p| self.relative_to_git_root_str(&p))
+ .unwrap_or_default()
+ });
+
+ for internal in baseline_file_names {
+ // Baseline external must exist for any tracked internal.
+ let baseline_external = match self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.path.clone())
+ {
+ Some(p) => p,
+ None => continue,
+ };
+ let current_external = match self.get_path_for_internal(&internal) {
+ Some(p) => p,
+ None => continue,
+ };
+
+ // Determine modes early; needed to read symlink content correctly.
+ let baseline_mode = self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.mode.clone())
+ .unwrap_or_else(|| REGULAR_MODE.to_string());
+ let current_mode =
+ file_mode_for_path(&current_external).unwrap_or_else(|| REGULAR_MODE.to_string());
+
+ let left_bytes = self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.contents_bytes.clone());
+
+ let right_bytes = blob_bytes(&current_external, &current_mode)?;
+
+ // Fast path: identical bytes or both missing.
+ if left_bytes.as_deref() == right_bytes.as_deref() {
+ continue;
+ }
+
+ let left_display = self.relative_to_git_root_str(&baseline_external);
+ let right_display = self.relative_to_git_root_str(&current_external);
+
+ // Emit a git-style header for better readability and parity with previous behavior.
+ aggregated.push_str(&format!("diff --git a/{left_display} b/{right_display}\n"));
+
+ let is_add = left_bytes.is_none() && right_bytes.is_some();
+ let is_delete = left_bytes.is_some() && right_bytes.is_none();
+
+ if is_add {
+ aggregated.push_str(&format!("new file mode {current_mode}\n"));
+ } else if is_delete {
+ aggregated.push_str(&format!("deleted file mode {baseline_mode}\n"));
+ } else if baseline_mode != current_mode {
+ aggregated.push_str(&format!("old mode {baseline_mode}\n"));
+ aggregated.push_str(&format!("new mode {current_mode}\n"));
+ }
+
+ // Determine blob object IDs for left and right contents. Prefer stored OIDs
+ // captured from the original repo state when the change was first seen.
+ let left_oid = self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.oid.clone())
+ .or_else(|| {
+ left_bytes
+ .as_ref()
+ .map(|b| git_blob_sha1_hex_bytes(b))
+ .or(Some(ZERO_OID.to_string()))
+ })
+ .unwrap_or_else(|| ZERO_OID.to_string());
+ let right_oid = if let Some(b) = right_bytes.as_ref() {
+ if current_mode == SYMLINK_MODE {
+ git_blob_sha1_hex_bytes(b)
+ } else {
+ self.git_blob_oid_for_path(&current_external)
+ .unwrap_or_else(|| git_blob_sha1_hex_bytes(b))
+ }
+ } else {
+ ZERO_OID.to_string()
+ };
+
+ // If either side isn't valid UTF-8, emit a binary diff header and continue.
+ let left_text = left_bytes
+ .as_deref()
+ .and_then(|b| std::str::from_utf8(b).ok());
+ let right_text = right_bytes
+ .as_deref()
+ .and_then(|b| std::str::from_utf8(b).ok());
+
+ // Prefer text diffs when possible:
+ // - both sides are valid UTF-8
+ // - OR one side is missing (add/delete) and the present side is valid UTF-8
+ let can_text_diff = match (left_text, right_text, is_add, is_delete) {
+ (Some(_), Some(_), _, _) => true,
+ (_, Some(_), true, _) => true, // add: left missing, right text
+ (Some(_), _, _, true) => true, // delete: left text, right missing
+ _ => false,
+ };
+
+ if can_text_diff {
+ // Diff the contents as text, treating missing side as empty string.
+ let l = left_text.unwrap_or("");
+ let r = right_text.unwrap_or("");
+
+ // Emit index line without mode suffix to preserve current test expectations.
+ aggregated.push_str(&format!("index {left_oid}..{right_oid}\n"));
+
+ let old_header = if left_bytes.is_some() {
+ format!("a/{left_display}")
+ } else {
+ "/dev/null".to_string()
+ };
+ let new_header = if right_bytes.is_some() {
+ format!("b/{right_display}")
+ } else {
+ "/dev/null".to_string()
+ };
+
+ let diff = similar::TextDiff::from_lines(l, r);
+ let unified = diff
+ .unified_diff()
+ .context_radius(3)
+ .header(&old_header, &new_header)
+ .to_string();
+
+ aggregated.push_str(&unified);
+ if !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ } else {
+ // Binary or invalid UTF-8: emit header only.
+ aggregated.push_str(&format!("index {left_oid}..{right_oid}\n"));
+ let old_header = if left_bytes.is_some() {
+ format!("a/{left_display}")
+ } else {
+ "/dev/null".to_string()
+ };
+ let new_header = if right_bytes.is_some() {
+ format!("b/{right_display}")
+ } else {
+ "/dev/null".to_string()
+ };
+ aggregated.push_str(&format!("--- {old_header}\n"));
+ aggregated.push_str(&format!("+++ {new_header}\n"));
+ aggregated.push_str("Binary files differ\n");
+ if !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ }
+ }
+
+ if aggregated.trim().is_empty() {
+ Ok(None)
+ } else {
+ Ok(Some(aggregated))
+ }
+ }
+}
+
+fn uuid_filename_for(path: &Path) -> String {
+ let id = Uuid::new_v4().to_string();
+ match path.extension().and_then(|e| e.to_str()) {
+ Some(ext) if !ext.is_empty() => format!("{id}.{ext}"),
+ _ => id,
+ }
+}
+
+/// Compute the Git SHA-1 blob object ID for the given content (bytes).
+fn git_blob_sha1_hex_bytes(data: &[u8]) -> String {
+ // Git blob hash is sha1 of: "blob <len>\0<data>"
+ let header = format!("blob {}\0", data.len());
+ use sha1::Digest;
+ let mut hasher = sha1::Sha1::new();
+ hasher.update(header.as_bytes());
+ hasher.update(data);
+ let digest = hasher.finalize();
+ let mut out = String::with_capacity(40);
+ for b in digest {
+ use std::fmt::Write;
+ let _ = write!(&mut out, "{b:02x}");
+ }
+ out
+}
+
+const ZERO_OID: &str = "0000000000000000000000000000000000000000";
+const REGULAR_MODE: &str = "100644";
+#[cfg(unix)]
Though even on Windows, this has to be readable (and preserved?) in a Git tree object, no?
- Created: 2025-08-04 03:11:58 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250312120
@@ -0,0 +1,766 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+struct BaselineFileInfo {
+ path: Option<PathBuf>,
+ contents_bytes: Option<Vec<u8>>,
+ mode: Option<String>,
+ oid: Option<String>,
+}
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Maintain an in-memory baseline snapshot of files when they are first seen.
+/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null).
+/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking.
+/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory
+/// using the `similar` crate and emit unified diffs with rewritten external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> baseline file info.
+ baseline_file_info: HashMap<String, BaselineFileInfo>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_path: HashMap<String, PathBuf>,
+ /// Cache of known git worktree roots to avoid repeated filesystem walks.
+ git_root_cache: Vec<PathBuf>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_current_path
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ let (contents_bytes, mode, oid) = if path.exists() {
+ let mode = file_mode_for_path(path);
+ let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE);
+ let contents_bytes = blob_bytes(path, mode_str)
+ .unwrap_or_default()
+ .unwrap_or_default();
+ let oid = if mode.as_deref() == Some(SYMLINK_MODE) {
+ git_blob_sha1_hex_bytes(&contents_bytes)
+ } else {
+ self.git_blob_oid_for_path(path)
+ .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes))
+ };
+ (Some(contents_bytes), mode, Some(oid))
+ } else {
+ (None, None, Some(ZERO_OID.to_string()))
+ };
+
+ self.baseline_file_info.insert(
+ internal.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes,
+ mode,
+ oid,
+ },
+ );
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ if let FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } = change
+ {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.baseline_file_info.insert(
+ i.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes: None,
+ mode: None,
+ oid: Some(ZERO_OID.to_string()),
+ },
+ );
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_path
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ };
+ }
+
+ Ok(())
+ }
+
+ fn get_path_for_internal(&self, internal: &str) -> Option<PathBuf> {
+ self.temp_name_to_current_path
+ .get(internal)
+ .cloned()
+ .or_else(|| {
+ self.baseline_file_info
+ .get(internal)
+ .and_then(|info| info.path.clone())
+ })
+ }
+
+ /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry.
+ /// Uses a simple cache of known roots and avoids negative-result caching for simplicity.
+ fn find_git_root_cached(&mut self, start: &Path) -> Option<PathBuf> {
+ let dir = if start.is_dir() {
+ start
+ } else {
+ start.parent()?
+ };
+
+ // Fast path: if any cached root is an ancestor of this path, use it.
+ if let Some(root) = self
+ .git_root_cache
+ .iter()
+ .find(|r| dir.starts_with(r))
+ .cloned()
+ {
+ return Some(root);
+ }
+
+ // Walk up to find a `.git` marker.
+ let mut cur = dir.to_path_buf();
+ loop {
+ let git_marker = cur.join(".git");
+ if git_marker.is_dir() || git_marker.is_file() {
+ if !self.git_root_cache.iter().any(|r| r == &cur) {
+ self.git_root_cache.push(cur.clone());
+ }
+ return Some(cur);
+ }
+
+ // On Windows, avoid walking above the drive or UNC share root.
+ #[cfg(windows)]
+ {
+ if is_windows_drive_or_unc_root(&cur) {
+ return None;
+ }
+ }
+
+ if let Some(parent) = cur.parent() {
+ cur = parent.to_path_buf();
+ } else {
+ return None;
+ }
+ }
+ }
+
+ /// Return a display string for `path` relative to its git root if found, else absolute.
+ fn relative_to_git_root_str(&mut self, path: &Path) -> String {
+ let s = if let Some(root) = self.find_git_root_cached(path) {
+ if let Ok(rel) = path.strip_prefix(&root) {
+ rel.display().to_string()
+ } else {
+ path.display().to_string()
+ }
+ } else {
+ path.display().to_string()
+ };
+ s.replace('\\', "/")
+ }
+
+ /// Ask git to compute the blob SHA-1 for the file at `path` within its repository.
+ /// Returns None if no repository is found or git invocation fails.
+ fn git_blob_oid_for_path(&mut self, path: &Path) -> Option<String> {
+ let root = self.find_git_root_cached(path)?;
+ // Compute a path relative to the repo root for better portability across platforms.
+ let rel = path.strip_prefix(&root).unwrap_or(path);
+ let output = Command::new("git")
+ .arg("-C")
+ .arg(&root)
+ .arg("hash-object")
+ .arg("--")
+ .arg(rel)
+ .output()
+ .ok()?;
+ if !output.status.success() {
+ return None;
+ }
+ let s = String::from_utf8_lossy(&output.stdout).trim().to_string();
+ if s.len() == 40 { Some(s) } else { None }
+ }
+
+ /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were
+ /// collected before the first time they were touched by apply_patch during this turn with
+ /// the current repo state.
+ pub fn get_unified_diff(&mut self) -> Result<Option<String>> {
+ let mut aggregated = String::new();
+
+ // Compute diffs per tracked internal file in a stable order by external path.
+ let mut baseline_file_names: Vec<String> =
+ self.baseline_file_info.keys().cloned().collect();
+ // Sort lexicographically by full repo-relative path to match git behavior.
+ baseline_file_names.sort_by_key(|internal| {
+ self.get_path_for_internal(internal)
+ .map(|p| self.relative_to_git_root_str(&p))
+ .unwrap_or_default()
+ });
+
+ for internal in baseline_file_names {
+ // Baseline external must exist for any tracked internal.
+ let baseline_external = match self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.path.clone())
+ {
+ Some(p) => p,
+ None => continue,
+ };
+ let current_external = match self.get_path_for_internal(&internal) {
+ Some(p) => p,
+ None => continue,
+ };
+
+ // Determine modes early; needed to read symlink content correctly.
+ let baseline_mode = self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.mode.clone())
+ .unwrap_or_else(|| REGULAR_MODE.to_string());
+ let current_mode =
+ file_mode_for_path(&current_external).unwrap_or_else(|| REGULAR_MODE.to_string());
+
+ let left_bytes = self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.contents_bytes.clone());
+
+ let right_bytes = blob_bytes(&current_external, &current_mode)?;
+
+ // Fast path: identical bytes or both missing.
+ if left_bytes.as_deref() == right_bytes.as_deref() {
+ continue;
+ }
+
+ let left_display = self.relative_to_git_root_str(&baseline_external);
+ let right_display = self.relative_to_git_root_str(&current_external);
+
+ // Emit a git-style header for better readability and parity with previous behavior.
+ aggregated.push_str(&format!("diff --git a/{left_display} b/{right_display}\n"));
+
+ let is_add = left_bytes.is_none() && right_bytes.is_some();
+ let is_delete = left_bytes.is_some() && right_bytes.is_none();
+
+ if is_add {
+ aggregated.push_str(&format!("new file mode {current_mode}\n"));
+ } else if is_delete {
+ aggregated.push_str(&format!("deleted file mode {baseline_mode}\n"));
+ } else if baseline_mode != current_mode {
+ aggregated.push_str(&format!("old mode {baseline_mode}\n"));
+ aggregated.push_str(&format!("new mode {current_mode}\n"));
+ }
+
+ // Determine blob object IDs for left and right contents. Prefer stored OIDs
+ // captured from the original repo state when the change was first seen.
+ let left_oid = self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.oid.clone())
+ .or_else(|| {
+ left_bytes
+ .as_ref()
+ .map(|b| git_blob_sha1_hex_bytes(b))
+ .or(Some(ZERO_OID.to_string()))
+ })
+ .unwrap_or_else(|| ZERO_OID.to_string());
+ let right_oid = if let Some(b) = right_bytes.as_ref() {
+ if current_mode == SYMLINK_MODE {
+ git_blob_sha1_hex_bytes(b)
+ } else {
+ self.git_blob_oid_for_path(&current_external)
+ .unwrap_or_else(|| git_blob_sha1_hex_bytes(b))
+ }
+ } else {
+ ZERO_OID.to_string()
+ };
+
+ // If either side isn't valid UTF-8, emit a binary diff header and continue.
+ let left_text = left_bytes
+ .as_deref()
+ .and_then(|b| std::str::from_utf8(b).ok());
+ let right_text = right_bytes
+ .as_deref()
+ .and_then(|b| std::str::from_utf8(b).ok());
+
+ // Prefer text diffs when possible:
+ // - both sides are valid UTF-8
+ // - OR one side is missing (add/delete) and the present side is valid UTF-8
+ let can_text_diff = match (left_text, right_text, is_add, is_delete) {
+ (Some(_), Some(_), _, _) => true,
+ (_, Some(_), true, _) => true, // add: left missing, right text
+ (Some(_), _, _, true) => true, // delete: left text, right missing
+ _ => false,
+ };
+
+ if can_text_diff {
+ // Diff the contents as text, treating missing side as empty string.
+ let l = left_text.unwrap_or("");
+ let r = right_text.unwrap_or("");
+
+ // Emit index line without mode suffix to preserve current test expectations.
+ aggregated.push_str(&format!("index {left_oid}..{right_oid}\n"));
+
+ let old_header = if left_bytes.is_some() {
+ format!("a/{left_display}")
+ } else {
+ "/dev/null".to_string()
+ };
+ let new_header = if right_bytes.is_some() {
+ format!("b/{right_display}")
+ } else {
+ "/dev/null".to_string()
+ };
+
+ let diff = similar::TextDiff::from_lines(l, r);
+ let unified = diff
+ .unified_diff()
+ .context_radius(3)
+ .header(&old_header, &new_header)
+ .to_string();
+
+ aggregated.push_str(&unified);
+ if !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ } else {
+ // Binary or invalid UTF-8: emit header only.
+ aggregated.push_str(&format!("index {left_oid}..{right_oid}\n"));
+ let old_header = if left_bytes.is_some() {
+ format!("a/{left_display}")
+ } else {
+ "/dev/null".to_string()
+ };
+ let new_header = if right_bytes.is_some() {
+ format!("b/{right_display}")
+ } else {
+ "/dev/null".to_string()
+ };
+ aggregated.push_str(&format!("--- {old_header}\n"));
+ aggregated.push_str(&format!("+++ {new_header}\n"));
+ aggregated.push_str("Binary files differ\n");
+ if !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ }
+ }
+
+ if aggregated.trim().is_empty() {
+ Ok(None)
+ } else {
+ Ok(Some(aggregated))
+ }
+ }
+}
+
+fn uuid_filename_for(path: &Path) -> String {
+ let id = Uuid::new_v4().to_string();
+ match path.extension().and_then(|e| e.to_str()) {
+ Some(ext) if !ext.is_empty() => format!("{id}.{ext}"),
+ _ => id,
+ }
+}
+
+/// Compute the Git SHA-1 blob object ID for the given content (bytes).
+fn git_blob_sha1_hex_bytes(data: &[u8]) -> String {
+ // Git blob hash is sha1 of: "blob <len>\0<data>"
+ let header = format!("blob {}\0", data.len());
+ use sha1::Digest;
+ let mut hasher = sha1::Sha1::new();
+ hasher.update(header.as_bytes());
+ hasher.update(data);
+ let digest = hasher.finalize();
+ let mut out = String::with_capacity(40);
+ for b in digest {
+ use std::fmt::Write;
+ let _ = write!(&mut out, "{b:02x}");
+ }
+ out
+}
+
+const ZERO_OID: &str = "0000000000000000000000000000000000000000";
+const REGULAR_MODE: &str = "100644";
+#[cfg(unix)]
+const EXECUTABLE_MODE: &str = "100755";
+const SYMLINK_MODE: &str = "120000";
+
+#[cfg(unix)]
+fn file_mode_for_path(path: &Path) -> Option<String> {
+ use std::os::unix::fs::PermissionsExt;
+ let meta = fs::symlink_metadata(path).ok()?;
+ let ft = meta.file_type();
+ if ft.is_symlink() {
+ return Some(SYMLINK_MODE.to_string());
+ }
+ let mode = meta.permissions().mode();
+ let is_exec = (mode & 0o111) != 0;
+ Some(if is_exec {
+ EXECUTABLE_MODE.into()
+ } else {
+ REGULAR_MODE.into()
+ })
+}
+
+#[cfg(not(unix))]
+fn file_mode_for_path(_path: &Path) -> Option<String> {
+ // Default to non-executable on non-unix.
+ Some(REGULAR_MODE.to_string())
+}
+
+fn blob_bytes(path: &Path, mode: &str) -> Result<Option<Vec<u8>>> {
+ if path.exists() {
+ let contents = if mode == SYMLINK_MODE {
+ symlink_blob_bytes(path)
+ .ok_or_else(|| anyhow!("failed to read symlink target for {}", path.display()))?
+ } else {
+ fs::read(path).with_context(|| {
+ format!("failed to read current file for diff {}", path.display())
+ })?
+ };
+ Ok(Some(contents))
+ } else {
+ Ok(None)
+ }
+}
+
+#[cfg(unix)]
+fn symlink_blob_bytes(path: &Path) -> Option<Vec<u8>> {
+ use std::os::unix::ffi::OsStrExt;
+ let target = std::fs::read_link(path).ok()?;
+ Some(target.as_os_str().as_bytes().to_vec())
One interesting operating system fact is that the contents of a symlink do not have to be a path to a file: you can just use it for arbitrary data storage. (As such, I think the max number of bytes you can store in a symlink is `PATH_MAX`, though.) I knew of one project that did this to save a system call, because `readlink()` is one system call but `open()` plus `read()` for a regular file is two.
- Created: 2025-08-04 03:12:56 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250312732
@@ -0,0 +1,766 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+struct BaselineFileInfo {
+ path: Option<PathBuf>,
+ contents_bytes: Option<Vec<u8>>,
+ mode: Option<String>,
+ oid: Option<String>,
+}
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Maintain an in-memory baseline snapshot of files when they are first seen.
+/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null).
+/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking.
+/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory
+/// using the `similar` crate and emit unified diffs with rewritten external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> baseline file info.
+ baseline_file_info: HashMap<String, BaselineFileInfo>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_path: HashMap<String, PathBuf>,
+ /// Cache of known git worktree roots to avoid repeated filesystem walks.
+ git_root_cache: Vec<PathBuf>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_current_path
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ let (contents_bytes, mode, oid) = if path.exists() {
+ let mode = file_mode_for_path(path);
+ let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE);
+ let contents_bytes = blob_bytes(path, mode_str)
+ .unwrap_or_default()
+ .unwrap_or_default();
+ let oid = if mode.as_deref() == Some(SYMLINK_MODE) {
+ git_blob_sha1_hex_bytes(&contents_bytes)
+ } else {
+ self.git_blob_oid_for_path(path)
+ .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes))
+ };
+ (Some(contents_bytes), mode, Some(oid))
+ } else {
+ (None, None, Some(ZERO_OID.to_string()))
+ };
+
+ self.baseline_file_info.insert(
+ internal.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes,
+ mode,
+ oid,
+ },
+ );
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ if let FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } = change
+ {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.baseline_file_info.insert(
+ i.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes: None,
+ mode: None,
+ oid: Some(ZERO_OID.to_string()),
+ },
+ );
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_path
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ };
+ }
+
+ Ok(())
+ }
+
+ fn get_path_for_internal(&self, internal: &str) -> Option<PathBuf> {
+ self.temp_name_to_current_path
+ .get(internal)
+ .cloned()
+ .or_else(|| {
+ self.baseline_file_info
+ .get(internal)
+ .and_then(|info| info.path.clone())
+ })
+ }
+
+ /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry.
+ /// Uses a simple cache of known roots and avoids negative-result caching for simplicity.
+ fn find_git_root_cached(&mut self, start: &Path) -> Option<PathBuf> {
+ let dir = if start.is_dir() {
+ start
+ } else {
+ start.parent()?
+ };
+
+ // Fast path: if any cached root is an ancestor of this path, use it.
+ if let Some(root) = self
+ .git_root_cache
+ .iter()
+ .find(|r| dir.starts_with(r))
+ .cloned()
+ {
+ return Some(root);
+ }
+
+ // Walk up to find a `.git` marker.
+ let mut cur = dir.to_path_buf();
+ loop {
+ let git_marker = cur.join(".git");
+ if git_marker.is_dir() || git_marker.is_file() {
+ if !self.git_root_cache.iter().any(|r| r == &cur) {
+ self.git_root_cache.push(cur.clone());
+ }
+ return Some(cur);
+ }
+
+ // On Windows, avoid walking above the drive or UNC share root.
+ #[cfg(windows)]
+ {
+ if is_windows_drive_or_unc_root(&cur) {
+ return None;
+ }
+ }
+
+ if let Some(parent) = cur.parent() {
+ cur = parent.to_path_buf();
+ } else {
+ return None;
+ }
+ }
+ }
+
+ /// Return a display string for `path` relative to its git root if found, else absolute.
+ fn relative_to_git_root_str(&mut self, path: &Path) -> String {
+ let s = if let Some(root) = self.find_git_root_cached(path) {
+ if let Ok(rel) = path.strip_prefix(&root) {
+ rel.display().to_string()
+ } else {
+ path.display().to_string()
+ }
+ } else {
+ path.display().to_string()
+ };
+ s.replace('\\', "/")
+ }
+
+ /// Ask git to compute the blob SHA-1 for the file at `path` within its repository.
+ /// Returns None if no repository is found or git invocation fails.
+ fn git_blob_oid_for_path(&mut self, path: &Path) -> Option<String> {
+ let root = self.find_git_root_cached(path)?;
+ // Compute a path relative to the repo root for better portability across platforms.
+ let rel = path.strip_prefix(&root).unwrap_or(path);
+ let output = Command::new("git")
+ .arg("-C")
+ .arg(&root)
+ .arg("hash-object")
+ .arg("--")
+ .arg(rel)
+ .output()
+ .ok()?;
+ if !output.status.success() {
+ return None;
+ }
+ let s = String::from_utf8_lossy(&output.stdout).trim().to_string();
+ if s.len() == 40 { Some(s) } else { None }
+ }
+
+ /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were
+ /// collected before the first time they were touched by apply_patch during this turn with
+ /// the current repo state.
+ pub fn get_unified_diff(&mut self) -> Result<Option<String>> {
+ let mut aggregated = String::new();
+
+ // Compute diffs per tracked internal file in a stable order by external path.
+ let mut baseline_file_names: Vec<String> =
+ self.baseline_file_info.keys().cloned().collect();
+ // Sort lexicographically by full repo-relative path to match git behavior.
+ baseline_file_names.sort_by_key(|internal| {
+ self.get_path_for_internal(internal)
+ .map(|p| self.relative_to_git_root_str(&p))
+ .unwrap_or_default()
+ });
+
+ for internal in baseline_file_names {
+ // Baseline external must exist for any tracked internal.
+ let baseline_external = match self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.path.clone())
+ {
+ Some(p) => p,
+ None => continue,
+ };
+ let current_external = match self.get_path_for_internal(&internal) {
+ Some(p) => p,
+ None => continue,
+ };
+
+ // Determine modes early; needed to read symlink content correctly.
+ let baseline_mode = self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.mode.clone())
+ .unwrap_or_else(|| REGULAR_MODE.to_string());
+ let current_mode =
+ file_mode_for_path(&current_external).unwrap_or_else(|| REGULAR_MODE.to_string());
+
+ let left_bytes = self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.contents_bytes.clone());
+
+ let right_bytes = blob_bytes(&current_external, &current_mode)?;
+
+ // Fast path: identical bytes or both missing.
+ if left_bytes.as_deref() == right_bytes.as_deref() {
+ continue;
+ }
+
+ let left_display = self.relative_to_git_root_str(&baseline_external);
+ let right_display = self.relative_to_git_root_str(&current_external);
+
+ // Emit a git-style header for better readability and parity with previous behavior.
+ aggregated.push_str(&format!("diff --git a/{left_display} b/{right_display}\n"));
+
+ let is_add = left_bytes.is_none() && right_bytes.is_some();
+ let is_delete = left_bytes.is_some() && right_bytes.is_none();
+
+ if is_add {
+ aggregated.push_str(&format!("new file mode {current_mode}\n"));
+ } else if is_delete {
+ aggregated.push_str(&format!("deleted file mode {baseline_mode}\n"));
+ } else if baseline_mode != current_mode {
+ aggregated.push_str(&format!("old mode {baseline_mode}\n"));
+ aggregated.push_str(&format!("new mode {current_mode}\n"));
+ }
+
+ // Determine blob object IDs for left and right contents. Prefer stored OIDs
+ // captured from the original repo state when the change was first seen.
+ let left_oid = self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.oid.clone())
+ .or_else(|| {
+ left_bytes
+ .as_ref()
+ .map(|b| git_blob_sha1_hex_bytes(b))
+ .or(Some(ZERO_OID.to_string()))
+ })
+ .unwrap_or_else(|| ZERO_OID.to_string());
+ let right_oid = if let Some(b) = right_bytes.as_ref() {
+ if current_mode == SYMLINK_MODE {
+ git_blob_sha1_hex_bytes(b)
+ } else {
+ self.git_blob_oid_for_path(&current_external)
+ .unwrap_or_else(|| git_blob_sha1_hex_bytes(b))
+ }
+ } else {
+ ZERO_OID.to_string()
+ };
+
+ // If either side isn't valid UTF-8, emit a binary diff header and continue.
+ let left_text = left_bytes
+ .as_deref()
+ .and_then(|b| std::str::from_utf8(b).ok());
+ let right_text = right_bytes
+ .as_deref()
+ .and_then(|b| std::str::from_utf8(b).ok());
+
+ // Prefer text diffs when possible:
+ // - both sides are valid UTF-8
+ // - OR one side is missing (add/delete) and the present side is valid UTF-8
+ let can_text_diff = match (left_text, right_text, is_add, is_delete) {
+ (Some(_), Some(_), _, _) => true,
+ (_, Some(_), true, _) => true, // add: left missing, right text
+ (Some(_), _, _, true) => true, // delete: left text, right missing
+ _ => false,
+ };
+
+ if can_text_diff {
+ // Diff the contents as text, treating missing side as empty string.
+ let l = left_text.unwrap_or("");
+ let r = right_text.unwrap_or("");
+
+ // Emit index line without mode suffix to preserve current test expectations.
+ aggregated.push_str(&format!("index {left_oid}..{right_oid}\n"));
+
+ let old_header = if left_bytes.is_some() {
+ format!("a/{left_display}")
+ } else {
+ "/dev/null".to_string()
+ };
+ let new_header = if right_bytes.is_some() {
+ format!("b/{right_display}")
+ } else {
+ "/dev/null".to_string()
+ };
+
+ let diff = similar::TextDiff::from_lines(l, r);
+ let unified = diff
+ .unified_diff()
+ .context_radius(3)
+ .header(&old_header, &new_header)
+ .to_string();
+
+ aggregated.push_str(&unified);
+ if !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ } else {
+ // Binary or invalid UTF-8: emit header only.
+ aggregated.push_str(&format!("index {left_oid}..{right_oid}\n"));
+ let old_header = if left_bytes.is_some() {
+ format!("a/{left_display}")
+ } else {
+ "/dev/null".to_string()
+ };
+ let new_header = if right_bytes.is_some() {
+ format!("b/{right_display}")
+ } else {
+ "/dev/null".to_string()
+ };
+ aggregated.push_str(&format!("--- {old_header}\n"));
+ aggregated.push_str(&format!("+++ {new_header}\n"));
+ aggregated.push_str("Binary files differ\n");
+ if !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ }
+ }
+
+ if aggregated.trim().is_empty() {
+ Ok(None)
+ } else {
+ Ok(Some(aggregated))
+ }
+ }
+}
+
+fn uuid_filename_for(path: &Path) -> String {
+ let id = Uuid::new_v4().to_string();
+ match path.extension().and_then(|e| e.to_str()) {
+ Some(ext) if !ext.is_empty() => format!("{id}.{ext}"),
+ _ => id,
+ }
+}
+
+/// Compute the Git SHA-1 blob object ID for the given content (bytes).
+fn git_blob_sha1_hex_bytes(data: &[u8]) -> String {
+ // Git blob hash is sha1 of: "blob <len>\0<data>"
+ let header = format!("blob {}\0", data.len());
+ use sha1::Digest;
Returning this type is slightly stronger since you don't have to verify the integrity of the `String` contents elsewhere.
- Created: 2025-08-04 03:13:59 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250313339
@@ -0,0 +1,766 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+struct BaselineFileInfo {
+ path: Option<PathBuf>,
+ contents_bytes: Option<Vec<u8>>,
+ mode: Option<String>,
+ oid: Option<String>,
+}
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Maintain an in-memory baseline snapshot of files when they are first seen.
+/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null).
+/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking.
+/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory
+/// using the `similar` crate and emit unified diffs with rewritten external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+ /// Map external path -> internal filename (uuid + same extension).
+ external_to_temp_name: HashMap<PathBuf, String>,
+ /// Internal filename -> baseline file info.
+ baseline_file_info: HashMap<String, BaselineFileInfo>,
+ /// Internal filename -> external path as of current accumulated state (after applying all changes).
+ /// This is where renames are tracked.
+ temp_name_to_current_path: HashMap<String, PathBuf>,
+ /// Cache of known git worktree roots to avoid repeated filesystem walks.
+ git_root_cache: Vec<PathBuf>,
+}
+
+impl TurnDiffTracker {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Front-run apply patch calls to track the starting contents of any modified files.
+ /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen.
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+ /// - Also updates internal mappings for move/rename events.
+ pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+ for (path, change) in changes.iter() {
+ // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) {
+ let internal = uuid_filename_for(path);
+ self.external_to_temp_name
+ .insert(path.clone(), internal.clone());
+ self.temp_name_to_current_path
+ .insert(internal.clone(), path.clone());
+
+ // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+ let (contents_bytes, mode, oid) = if path.exists() {
+ let mode = file_mode_for_path(path);
+ let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE);
+ let contents_bytes = blob_bytes(path, mode_str)
+ .unwrap_or_default()
+ .unwrap_or_default();
+ let oid = if mode.as_deref() == Some(SYMLINK_MODE) {
+ git_blob_sha1_hex_bytes(&contents_bytes)
+ } else {
+ self.git_blob_oid_for_path(path)
+ .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes))
+ };
+ (Some(contents_bytes), mode, Some(oid))
+ } else {
+ (None, None, Some(ZERO_OID.to_string()))
+ };
+
+ self.baseline_file_info.insert(
+ internal.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes,
+ mode,
+ oid,
+ },
+ );
+ }
+
+ // Track rename/move in current mapping if provided in an Update.
+ if let FileChange::Update {
+ move_path: Some(dest),
+ ..
+ } = change
+ {
+ let uuid_filename = match self.external_to_temp_name.get(path) {
+ Some(i) => i.clone(),
+ None => {
+ // This should be rare, but if we haven't mapped the source, create it with no baseline.
+ let i = uuid_filename_for(path);
+ self.baseline_file_info.insert(
+ i.clone(),
+ BaselineFileInfo {
+ path: Some(path.clone()),
+ contents_bytes: None,
+ mode: None,
+ oid: Some(ZERO_OID.to_string()),
+ },
+ );
+ i
+ }
+ };
+ // Update current external mapping for temp file name.
+ self.temp_name_to_current_path
+ .insert(uuid_filename.clone(), dest.clone());
+ // Update forward file_mapping: external current -> internal name.
+ self.external_to_temp_name.remove(path);
+ self.external_to_temp_name
+ .insert(dest.clone(), uuid_filename);
+ };
+ }
+
+ Ok(())
+ }
+
+ fn get_path_for_internal(&self, internal: &str) -> Option<PathBuf> {
+ self.temp_name_to_current_path
+ .get(internal)
+ .cloned()
+ .or_else(|| {
+ self.baseline_file_info
+ .get(internal)
+ .and_then(|info| info.path.clone())
+ })
+ }
+
+ /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry.
+ /// Uses a simple cache of known roots and avoids negative-result caching for simplicity.
+ fn find_git_root_cached(&mut self, start: &Path) -> Option<PathBuf> {
+ let dir = if start.is_dir() {
+ start
+ } else {
+ start.parent()?
+ };
+
+ // Fast path: if any cached root is an ancestor of this path, use it.
+ if let Some(root) = self
+ .git_root_cache
+ .iter()
+ .find(|r| dir.starts_with(r))
+ .cloned()
+ {
+ return Some(root);
+ }
+
+ // Walk up to find a `.git` marker.
+ let mut cur = dir.to_path_buf();
+ loop {
+ let git_marker = cur.join(".git");
+ if git_marker.is_dir() || git_marker.is_file() {
+ if !self.git_root_cache.iter().any(|r| r == &cur) {
+ self.git_root_cache.push(cur.clone());
+ }
+ return Some(cur);
+ }
+
+ // On Windows, avoid walking above the drive or UNC share root.
+ #[cfg(windows)]
+ {
+ if is_windows_drive_or_unc_root(&cur) {
+ return None;
+ }
+ }
+
+ if let Some(parent) = cur.parent() {
+ cur = parent.to_path_buf();
+ } else {
+ return None;
+ }
+ }
+ }
+
+ /// Return a display string for `path` relative to its git root if found, else absolute.
+ fn relative_to_git_root_str(&mut self, path: &Path) -> String {
+ let s = if let Some(root) = self.find_git_root_cached(path) {
+ if let Ok(rel) = path.strip_prefix(&root) {
+ rel.display().to_string()
+ } else {
+ path.display().to_string()
+ }
+ } else {
+ path.display().to_string()
+ };
+ s.replace('\\', "/")
+ }
+
+ /// Ask git to compute the blob SHA-1 for the file at `path` within its repository.
+ /// Returns None if no repository is found or git invocation fails.
+ fn git_blob_oid_for_path(&mut self, path: &Path) -> Option<String> {
+ let root = self.find_git_root_cached(path)?;
+ // Compute a path relative to the repo root for better portability across platforms.
+ let rel = path.strip_prefix(&root).unwrap_or(path);
+ let output = Command::new("git")
+ .arg("-C")
+ .arg(&root)
+ .arg("hash-object")
+ .arg("--")
+ .arg(rel)
+ .output()
+ .ok()?;
+ if !output.status.success() {
+ return None;
+ }
+ let s = String::from_utf8_lossy(&output.stdout).trim().to_string();
+ if s.len() == 40 { Some(s) } else { None }
+ }
+
+ /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were
+ /// collected before the first time they were touched by apply_patch during this turn with
+ /// the current repo state.
+ pub fn get_unified_diff(&mut self) -> Result<Option<String>> {
+ let mut aggregated = String::new();
+
+ // Compute diffs per tracked internal file in a stable order by external path.
+ let mut baseline_file_names: Vec<String> =
+ self.baseline_file_info.keys().cloned().collect();
+ // Sort lexicographically by full repo-relative path to match git behavior.
+ baseline_file_names.sort_by_key(|internal| {
+ self.get_path_for_internal(internal)
+ .map(|p| self.relative_to_git_root_str(&p))
+ .unwrap_or_default()
+ });
+
+ for internal in baseline_file_names {
+ // Baseline external must exist for any tracked internal.
+ let baseline_external = match self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.path.clone())
+ {
+ Some(p) => p,
+ None => continue,
+ };
+ let current_external = match self.get_path_for_internal(&internal) {
+ Some(p) => p,
+ None => continue,
+ };
+
+ // Determine modes early; needed to read symlink content correctly.
+ let baseline_mode = self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.mode.clone())
+ .unwrap_or_else(|| REGULAR_MODE.to_string());
+ let current_mode =
+ file_mode_for_path(&current_external).unwrap_or_else(|| REGULAR_MODE.to_string());
+
+ let left_bytes = self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.contents_bytes.clone());
+
+ let right_bytes = blob_bytes(&current_external, &current_mode)?;
+
+ // Fast path: identical bytes or both missing.
+ if left_bytes.as_deref() == right_bytes.as_deref() {
+ continue;
+ }
+
+ let left_display = self.relative_to_git_root_str(&baseline_external);
+ let right_display = self.relative_to_git_root_str(&current_external);
+
+ // Emit a git-style header for better readability and parity with previous behavior.
+ aggregated.push_str(&format!("diff --git a/{left_display} b/{right_display}\n"));
+
+ let is_add = left_bytes.is_none() && right_bytes.is_some();
+ let is_delete = left_bytes.is_some() && right_bytes.is_none();
+
+ if is_add {
+ aggregated.push_str(&format!("new file mode {current_mode}\n"));
+ } else if is_delete {
+ aggregated.push_str(&format!("deleted file mode {baseline_mode}\n"));
+ } else if baseline_mode != current_mode {
+ aggregated.push_str(&format!("old mode {baseline_mode}\n"));
+ aggregated.push_str(&format!("new mode {current_mode}\n"));
+ }
+
+ // Determine blob object IDs for left and right contents. Prefer stored OIDs
+ // captured from the original repo state when the change was first seen.
+ let left_oid = self
+ .baseline_file_info
+ .get(&internal)
+ .and_then(|i| i.oid.clone())
+ .or_else(|| {
+ left_bytes
+ .as_ref()
+ .map(|b| git_blob_sha1_hex_bytes(b))
+ .or(Some(ZERO_OID.to_string()))
+ })
+ .unwrap_or_else(|| ZERO_OID.to_string());
+ let right_oid = if let Some(b) = right_bytes.as_ref() {
+ if current_mode == SYMLINK_MODE {
+ git_blob_sha1_hex_bytes(b)
+ } else {
+ self.git_blob_oid_for_path(&current_external)
+ .unwrap_or_else(|| git_blob_sha1_hex_bytes(b))
+ }
+ } else {
+ ZERO_OID.to_string()
+ };
+
+ // If either side isn't valid UTF-8, emit a binary diff header and continue.
+ let left_text = left_bytes
+ .as_deref()
+ .and_then(|b| std::str::from_utf8(b).ok());
+ let right_text = right_bytes
+ .as_deref()
+ .and_then(|b| std::str::from_utf8(b).ok());
+
+ // Prefer text diffs when possible:
+ // - both sides are valid UTF-8
+ // - OR one side is missing (add/delete) and the present side is valid UTF-8
+ let can_text_diff = match (left_text, right_text, is_add, is_delete) {
+ (Some(_), Some(_), _, _) => true,
+ (_, Some(_), true, _) => true, // add: left missing, right text
+ (Some(_), _, _, true) => true, // delete: left text, right missing
+ _ => false,
+ };
+
+ if can_text_diff {
+ // Diff the contents as text, treating missing side as empty string.
+ let l = left_text.unwrap_or("");
+ let r = right_text.unwrap_or("");
+
+ // Emit index line without mode suffix to preserve current test expectations.
+ aggregated.push_str(&format!("index {left_oid}..{right_oid}\n"));
+
+ let old_header = if left_bytes.is_some() {
+ format!("a/{left_display}")
+ } else {
+ "/dev/null".to_string()
+ };
+ let new_header = if right_bytes.is_some() {
+ format!("b/{right_display}")
+ } else {
+ "/dev/null".to_string()
+ };
+
+ let diff = similar::TextDiff::from_lines(l, r);
+ let unified = diff
+ .unified_diff()
+ .context_radius(3)
+ .header(&old_header, &new_header)
+ .to_string();
+
+ aggregated.push_str(&unified);
+ if !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ } else {
+ // Binary or invalid UTF-8: emit header only.
+ aggregated.push_str(&format!("index {left_oid}..{right_oid}\n"));
+ let old_header = if left_bytes.is_some() {
+ format!("a/{left_display}")
+ } else {
+ "/dev/null".to_string()
+ };
+ let new_header = if right_bytes.is_some() {
+ format!("b/{right_display}")
+ } else {
+ "/dev/null".to_string()
+ };
+ aggregated.push_str(&format!("--- {old_header}\n"));
+ aggregated.push_str(&format!("+++ {new_header}\n"));
+ aggregated.push_str("Binary files differ\n");
+ if !aggregated.ends_with('\n') {
+ aggregated.push('\n');
+ }
+ }
+ }
+
+ if aggregated.trim().is_empty() {
+ Ok(None)
+ } else {
+ Ok(Some(aggregated))
+ }
+ }
+}
+
+fn uuid_filename_for(path: &Path) -> String {
+ let id = Uuid::new_v4().to_string();
+ match path.extension().and_then(|e| e.to_str()) {
+ Some(ext) if !ext.is_empty() => format!("{id}.{ext}"),
+ _ => id,
+ }
+}
+
+/// Compute the Git SHA-1 blob object ID for the given content (bytes).
+fn git_blob_sha1_hex_bytes(data: &[u8]) -> String {
+ // Git blob hash is sha1 of: "blob <len>\0<data>"
+ let header = format!("blob {}\0", data.len());
+ use sha1::Digest;
+ let mut hasher = sha1::Sha1::new();
+ hasher.update(header.as_bytes());
+ hasher.update(data);
+ let digest = hasher.finalize();
+ let mut out = String::with_capacity(40);
+ for b in digest {
+ use std::fmt::Write;
+ let _ = write!(&mut out, "{b:02x}");
+ }
+ out
+}
+
+const ZERO_OID: &str = "0000000000000000000000000000000000000000";
+const REGULAR_MODE: &str = "100644";
+#[cfg(unix)]
+const EXECUTABLE_MODE: &str = "100755";
+const SYMLINK_MODE: &str = "120000";
+
+#[cfg(unix)]
+fn file_mode_for_path(path: &Path) -> Option<String> {
+ use std::os::unix::fs::PermissionsExt;
+ let meta = fs::symlink_metadata(path).ok()?;
+ let ft = meta.file_type();
+ if ft.is_symlink() {
+ return Some(SYMLINK_MODE.to_string());
+ }
+ let mode = meta.permissions().mode();
+ let is_exec = (mode & 0o111) != 0;
+ Some(if is_exec {
+ EXECUTABLE_MODE.into()
+ } else {
+ REGULAR_MODE.into()
+ })
+}
+
+#[cfg(not(unix))]
+fn file_mode_for_path(_path: &Path) -> Option<String> {
+ // Default to non-executable on non-unix.
+ Some(REGULAR_MODE.to_string())
+}
+
+fn blob_bytes(path: &Path, mode: &str) -> Result<Option<Vec<u8>>> {
+ if path.exists() {
+ let contents = if mode == SYMLINK_MODE {
+ symlink_blob_bytes(path)
+ .ok_or_else(|| anyhow!("failed to read symlink target for {}", path.display()))?
+ } else {
+ fs::read(path).with_context(|| {
+ format!("failed to read current file for diff {}", path.display())
+ })?
+ };
+ Ok(Some(contents))
+ } else {
+ Ok(None)
+ }
+}
+
+#[cfg(unix)]
+fn symlink_blob_bytes(path: &Path) -> Option<Vec<u8>> {
+ use std::os::unix::ffi::OsStrExt;
+ let target = std::fs::read_link(path).ok()?;
+ Some(target.as_os_str().as_bytes().to_vec())
+}
+
+#[cfg(not(unix))]
+fn symlink_blob_bytes(_path: &Path) -> Option<Vec<u8>> {
+ None
+}
+
+#[cfg(windows)]
+fn is_windows_drive_or_unc_root(p: &std::path::Path) -> bool {
+ use std::path::Component;
+ let mut comps = p.components();
+ matches!(
+ (comps.next(), comps.next(), comps.next()),
+ (Some(Component::Prefix(_)), Some(Component::RootDir), None)
+ )
+}
+
+#[cfg(test)]
+mod tests {
+ #![allow(clippy::unwrap_used)]
+ use super::*;
+ use pretty_assertions::assert_eq;
+ use tempfile::tempdir;
+
+ /// Compute the Git SHA-1 blob object ID for the given content (string).
+ /// This delegates to the bytes version to avoid UTF-8 lossy conversions here.
+ fn git_blob_sha1_hex(data: &str) -> String {
+ git_blob_sha1_hex_bytes(data.as_bytes())
+ }
+
+ fn normalize_diff_for_test(input: &str, root: &Path) -> String {
+ let root_str = root.display().to_string().replace('\\', "/");
+ let replaced = input.replace(&root_str, "<TMP>");
+ // Split into blocks on lines starting with "diff --git ", sort blocks for determinism, and rejoin
+ let mut blocks: Vec<String> = Vec::new();
+ let mut current = String::new();
+ for line in replaced.lines() {
+ if line.starts_with("diff --git ") && !current.is_empty() {
+ blocks.push(current);
+ current = String::new();
+ }
+ if !current.is_empty() {
+ current.push('\n');
+ }
+ current.push_str(line);
+ }
+ if !current.is_empty() {
+ blocks.push(current);
+ }
+ blocks.sort();
+ let mut out = blocks.join("\n");
+ if !out.ends_with('\n') {
+ out.push('\n');
+ }
+ out
+ }
+
+ #[test]
+ fn accumulates_add_and_update() {
+ let mut acc = TurnDiffTracker::new();
+
+ let dir = tempdir().unwrap();
+ let file = dir.path().join("a.txt");
+
+ // First patch: add file (baseline should be /dev/null).
+ let add_changes = HashMap::from([(
+ file.clone(),
+ FileChange::Add {
+ content: "foo\n".to_string(),
+ },
+ )]);
+ acc.on_patch_begin(&add_changes).unwrap();
+
+ // Simulate apply: create the file on disk.
+ fs::write(&file, "foo\n").unwrap();
+ let first = acc.get_unified_diff().unwrap().unwrap();
+ let first = normalize_diff_for_test(&first, dir.path());
+ let expected_first = {
+ let mode = file_mode_for_path(&file).unwrap_or_else(|| REGULAR_MODE.to_string());
+ let right_oid = git_blob_sha1_hex("foo\n");
+ format!(
+ "diff --git a/<TMP>/a.txt b/<TMP>/a.txt\nnew file mode {mode}\nindex {ZERO_OID}..{right_oid}\n--- /dev/null\n+++ b/<TMP>/a.txt\n@@ -0,0 +1 @@\n+foo\n",
Use a raw string (`r#"…"#`) for better readability?
- Created: 2025-08-04 03:14:59 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250313964
@@ -0,0 +1,766 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
/// Snapshot of a file's state the first time it is touched during a turn.
struct BaselineFileInfo {
    /// External path of the file when it was first seen.
    path: Option<PathBuf>,
    /// Raw baseline contents; `None` represents a file that did not exist
    /// (the /dev/null side of a diff).
    contents_bytes: Option<Vec<u8>>,
    /// Git file mode string ("100644", "100755", or "120000"); `None` when unknown.
    mode: Option<String>,
    /// Git blob object id of the baseline contents; set to ZERO_OID when the
    /// file was absent at snapshot time.
    oid: Option<String>,
}
+
/// Tracks sets of changes to files and exposes the overall unified diff.
/// Internally this works as follows:
/// 1. Maintain an in-memory baseline snapshot of files when they are first seen.
///    For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null).
/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking.
/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory
///    using the `similar` crate and emit unified diffs with rewritten external paths.
#[derive(Default)]
pub struct TurnDiffTracker {
    /// Map external path -> internal filename (uuid + same extension).
    external_to_temp_name: HashMap<PathBuf, String>,
    /// Internal filename -> baseline file info captured when the path was first touched.
    baseline_file_info: HashMap<String, BaselineFileInfo>,
    /// Internal filename -> external path as of current accumulated state (after applying all changes).
    /// This is where renames are tracked.
    temp_name_to_current_path: HashMap<String, PathBuf>,
    /// Cache of known git worktree roots to avoid repeated filesystem walks.
    git_root_cache: Vec<PathBuf>,
}
+
impl TurnDiffTracker {
    /// Create an empty tracker with no baselines and no cached git roots.
    pub fn new() -> Self {
        Self::default()
    }

    /// Front-run apply patch calls to track the starting contents of any modified files.
    /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen.
    /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
    /// - Also updates internal mappings for move/rename events.
    pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
        for (path, change) in changes.iter() {
            // Ensure a stable internal filename exists for this external path.
            if !self.external_to_temp_name.contains_key(path) {
                let internal = uuid_filename_for(path);
                self.external_to_temp_name
                    .insert(path.clone(), internal.clone());
                self.temp_name_to_current_path
                    .insert(internal.clone(), path.clone());

                // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
                let (contents_bytes, mode, oid) = if path.exists() {
                    let mode = file_mode_for_path(path);
                    let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE);
                    // Best effort: a failed read snapshots as an empty baseline
                    // rather than aborting the whole turn.
                    let contents_bytes = blob_bytes(path, mode_str)
                        .unwrap_or_default()
                        .unwrap_or_default();
                    // Symlinks hash their target bytes directly; `git hash-object`
                    // is only consulted for regular files.
                    let oid = if mode.as_deref() == Some(SYMLINK_MODE) {
                        git_blob_sha1_hex_bytes(&contents_bytes)
                    } else {
                        self.git_blob_oid_for_path(path)
                            .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes))
                    };
                    (Some(contents_bytes), mode, Some(oid))
                } else {
                    (None, None, Some(ZERO_OID.to_string()))
                };

                self.baseline_file_info.insert(
                    internal.clone(),
                    BaselineFileInfo {
                        path: Some(path.clone()),
                        contents_bytes,
                        mode,
                        oid,
                    },
                );
            }

            // Track rename/move in current mapping if provided in an Update.
            if let FileChange::Update {
                move_path: Some(dest),
                ..
            } = change
            {
                let uuid_filename = match self.external_to_temp_name.get(path) {
                    Some(i) => i.clone(),
                    None => {
                        // This should be rare, but if we haven't mapped the source, create it with no baseline.
                        let i = uuid_filename_for(path);
                        self.baseline_file_info.insert(
                            i.clone(),
                            BaselineFileInfo {
                                path: Some(path.clone()),
                                contents_bytes: None,
                                mode: None,
                                oid: Some(ZERO_OID.to_string()),
                            },
                        );
                        i
                    }
                };
                // Update current external mapping for temp file name.
                self.temp_name_to_current_path
                    .insert(uuid_filename.clone(), dest.clone());
                // Update forward file_mapping: external current -> internal name.
                self.external_to_temp_name.remove(path);
                self.external_to_temp_name
                    .insert(dest.clone(), uuid_filename);
            };
        }

        Ok(())
    }

    /// Resolve the current external path for an internal name, falling back to
    /// the baseline path when no rename has been recorded.
    fn get_path_for_internal(&self, internal: &str) -> Option<PathBuf> {
        self.temp_name_to_current_path
            .get(internal)
            .cloned()
            .or_else(|| {
                self.baseline_file_info
                    .get(internal)
                    .and_then(|info| info.path.clone())
            })
    }

    /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry.
    /// Uses a simple cache of known roots and avoids negative-result caching for simplicity.
    fn find_git_root_cached(&mut self, start: &Path) -> Option<PathBuf> {
        let dir = if start.is_dir() {
            start
        } else {
            start.parent()?
        };

        // Fast path: if any cached root is an ancestor of this path, use it.
        if let Some(root) = self
            .git_root_cache
            .iter()
            .find(|r| dir.starts_with(r))
            .cloned()
        {
            return Some(root);
        }

        // Walk up to find a `.git` marker.
        // NOTE: `.git` may be a directory (normal checkout) or a file
        // (worktrees/submodules), so both are accepted.
        let mut cur = dir.to_path_buf();
        loop {
            let git_marker = cur.join(".git");
            if git_marker.is_dir() || git_marker.is_file() {
                if !self.git_root_cache.iter().any(|r| r == &cur) {
                    self.git_root_cache.push(cur.clone());
                }
                return Some(cur);
            }

            // On Windows, avoid walking above the drive or UNC share root.
            #[cfg(windows)]
            {
                if is_windows_drive_or_unc_root(&cur) {
                    return None;
                }
            }

            if let Some(parent) = cur.parent() {
                cur = parent.to_path_buf();
            } else {
                return None;
            }
        }
    }

    /// Return a display string for `path` relative to its git root if found, else absolute.
    /// Backslashes are normalized to forward slashes for stable output across platforms.
    fn relative_to_git_root_str(&mut self, path: &Path) -> String {
        let s = if let Some(root) = self.find_git_root_cached(path) {
            if let Ok(rel) = path.strip_prefix(&root) {
                rel.display().to_string()
            } else {
                path.display().to_string()
            }
        } else {
            path.display().to_string()
        };
        s.replace('\\', "/")
    }

    /// Ask git to compute the blob SHA-1 for the file at `path` within its repository.
    /// Returns None if no repository is found or git invocation fails.
    fn git_blob_oid_for_path(&mut self, path: &Path) -> Option<String> {
        let root = self.find_git_root_cached(path)?;
        // Compute a path relative to the repo root for better portability across platforms.
        let rel = path.strip_prefix(&root).unwrap_or(path);
        let output = Command::new("git")
            .arg("-C")
            .arg(&root)
            .arg("hash-object")
            .arg("--")
            .arg(rel)
            .output()
            .ok()?;
        if !output.status.success() {
            return None;
        }
        let s = String::from_utf8_lossy(&output.stdout).trim().to_string();
        // Sanity check: a SHA-1 hex object id is exactly 40 characters.
        if s.len() == 40 { Some(s) } else { None }
    }

    /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were
    /// collected before the first time they were touched by apply_patch during this turn with
    /// the current repo state.
    ///
    /// Returns `Ok(None)` when no tracked file differs from its baseline.
    pub fn get_unified_diff(&mut self) -> Result<Option<String>> {
        let mut aggregated = String::new();

        // Compute diffs per tracked internal file in a stable order by external path.
        let mut baseline_file_names: Vec<String> =
            self.baseline_file_info.keys().cloned().collect();
        // Sort lexicographically by full repo-relative path to match git behavior.
        baseline_file_names.sort_by_key(|internal| {
            self.get_path_for_internal(internal)
                .map(|p| self.relative_to_git_root_str(&p))
                .unwrap_or_default()
        });

        for internal in baseline_file_names {
            // Baseline external must exist for any tracked internal.
            let baseline_external = match self
                .baseline_file_info
                .get(&internal)
                .and_then(|i| i.path.clone())
            {
                Some(p) => p,
                None => continue,
            };
            let current_external = match self.get_path_for_internal(&internal) {
                Some(p) => p,
                None => continue,
            };

            // Determine modes early; needed to read symlink content correctly.
            let baseline_mode = self
                .baseline_file_info
                .get(&internal)
                .and_then(|i| i.mode.clone())
                .unwrap_or_else(|| REGULAR_MODE.to_string());
            let current_mode =
                file_mode_for_path(&current_external).unwrap_or_else(|| REGULAR_MODE.to_string());

            let left_bytes = self
                .baseline_file_info
                .get(&internal)
                .and_then(|i| i.contents_bytes.clone());

            let right_bytes = blob_bytes(&current_external, &current_mode)?;

            // Fast path: identical bytes or both missing.
            if left_bytes.as_deref() == right_bytes.as_deref() {
                continue;
            }

            let left_display = self.relative_to_git_root_str(&baseline_external);
            let right_display = self.relative_to_git_root_str(&current_external);

            // Emit a git-style header for better readability and parity with previous behavior.
            aggregated.push_str(&format!("diff --git a/{left_display} b/{right_display}\n"));

            let is_add = left_bytes.is_none() && right_bytes.is_some();
            let is_delete = left_bytes.is_some() && right_bytes.is_none();

            if is_add {
                aggregated.push_str(&format!("new file mode {current_mode}\n"));
            } else if is_delete {
                aggregated.push_str(&format!("deleted file mode {baseline_mode}\n"));
            } else if baseline_mode != current_mode {
                aggregated.push_str(&format!("old mode {baseline_mode}\n"));
                aggregated.push_str(&format!("new mode {current_mode}\n"));
            }

            // Determine blob object IDs for left and right contents. Prefer stored OIDs
            // captured from the original repo state when the change was first seen.
            let left_oid = self
                .baseline_file_info
                .get(&internal)
                .and_then(|i| i.oid.clone())
                .or_else(|| {
                    left_bytes
                        .as_ref()
                        .map(|b| git_blob_sha1_hex_bytes(b))
                        .or(Some(ZERO_OID.to_string()))
                })
                .unwrap_or_else(|| ZERO_OID.to_string());
            let right_oid = if let Some(b) = right_bytes.as_ref() {
                if current_mode == SYMLINK_MODE {
                    git_blob_sha1_hex_bytes(b)
                } else {
                    self.git_blob_oid_for_path(&current_external)
                        .unwrap_or_else(|| git_blob_sha1_hex_bytes(b))
                }
            } else {
                ZERO_OID.to_string()
            };

            // If either side isn't valid UTF-8, emit a binary diff header and continue.
            let left_text = left_bytes
                .as_deref()
                .and_then(|b| std::str::from_utf8(b).ok());
            let right_text = right_bytes
                .as_deref()
                .and_then(|b| std::str::from_utf8(b).ok());

            // Prefer text diffs when possible:
            // - both sides are valid UTF-8
            // - OR one side is missing (add/delete) and the present side is valid UTF-8
            let can_text_diff = match (left_text, right_text, is_add, is_delete) {
                (Some(_), Some(_), _, _) => true,
                (_, Some(_), true, _) => true, // add: left missing, right text
                (Some(_), _, _, true) => true, // delete: left text, right missing
                _ => false,
            };

            if can_text_diff {
                // Diff the contents as text, treating missing side as empty string.
                let l = left_text.unwrap_or("");
                let r = right_text.unwrap_or("");

                // Emit index line without mode suffix to preserve current test expectations.
                aggregated.push_str(&format!("index {left_oid}..{right_oid}\n"));

                let old_header = if left_bytes.is_some() {
                    format!("a/{left_display}")
                } else {
                    "/dev/null".to_string()
                };
                let new_header = if right_bytes.is_some() {
                    format!("b/{right_display}")
                } else {
                    "/dev/null".to_string()
                };

                let diff = similar::TextDiff::from_lines(l, r);
                let unified = diff
                    .unified_diff()
                    .context_radius(3)
                    .header(&old_header, &new_header)
                    .to_string();

                aggregated.push_str(&unified);
                if !aggregated.ends_with('\n') {
                    aggregated.push('\n');
                }
            } else {
                // Binary or invalid UTF-8: emit header only.
                aggregated.push_str(&format!("index {left_oid}..{right_oid}\n"));
                let old_header = if left_bytes.is_some() {
                    format!("a/{left_display}")
                } else {
                    "/dev/null".to_string()
                };
                let new_header = if right_bytes.is_some() {
                    format!("b/{right_display}")
                } else {
                    "/dev/null".to_string()
                };
                aggregated.push_str(&format!("--- {old_header}\n"));
                aggregated.push_str(&format!("+++ {new_header}\n"));
                aggregated.push_str("Binary files differ\n");
                if !aggregated.ends_with('\n') {
                    aggregated.push('\n');
                }
            }
        }

        // An all-whitespace aggregate means nothing actually changed.
        if aggregated.trim().is_empty() {
            Ok(None)
        } else {
            Ok(Some(aggregated))
        }
    }
}
+
+fn uuid_filename_for(path: &Path) -> String {
+ let id = Uuid::new_v4().to_string();
+ match path.extension().and_then(|e| e.to_str()) {
+ Some(ext) if !ext.is_empty() => format!("{id}.{ext}"),
+ _ => id,
+ }
+}
+
+/// Compute the Git SHA-1 blob object ID for the given content (bytes).
+fn git_blob_sha1_hex_bytes(data: &[u8]) -> String {
+ // Git blob hash is sha1 of: "blob <len>\0<data>"
+ let header = format!("blob {}\0", data.len());
+ use sha1::Digest;
+ let mut hasher = sha1::Sha1::new();
+ hasher.update(header.as_bytes());
+ hasher.update(data);
+ let digest = hasher.finalize();
+ let mut out = String::with_capacity(40);
+ for b in digest {
+ use std::fmt::Write;
+ let _ = write!(&mut out, "{b:02x}");
+ }
+ out
+}
+
/// Object id used for a missing diff side (git's all-zero SHA-1).
const ZERO_OID: &str = "0000000000000000000000000000000000000000";
/// Git mode for a non-executable regular file.
const REGULAR_MODE: &str = "100644";
/// Git mode for an executable regular file (only produced on unix).
#[cfg(unix)]
const EXECUTABLE_MODE: &str = "100755";
/// Git mode for a symbolic link.
const SYMLINK_MODE: &str = "120000";
+
+#[cfg(unix)]
+fn file_mode_for_path(path: &Path) -> Option<String> {
+ use std::os::unix::fs::PermissionsExt;
+ let meta = fs::symlink_metadata(path).ok()?;
+ let ft = meta.file_type();
+ if ft.is_symlink() {
+ return Some(SYMLINK_MODE.to_string());
+ }
+ let mode = meta.permissions().mode();
+ let is_exec = (mode & 0o111) != 0;
+ Some(if is_exec {
+ EXECUTABLE_MODE.into()
+ } else {
+ REGULAR_MODE.into()
+ })
+}
+
/// Non-unix fallback: there are no unix permission bits to inspect, so every
/// path is reported as a non-executable regular file.
#[cfg(not(unix))]
fn file_mode_for_path(_path: &Path) -> Option<String> {
    // Default to non-executable on non-unix.
    Some(REGULAR_MODE.to_string())
}
+
+fn blob_bytes(path: &Path, mode: &str) -> Result<Option<Vec<u8>>> {
+ if path.exists() {
+ let contents = if mode == SYMLINK_MODE {
+ symlink_blob_bytes(path)
+ .ok_or_else(|| anyhow!("failed to read symlink target for {}", path.display()))?
+ } else {
+ fs::read(path).with_context(|| {
+ format!("failed to read current file for diff {}", path.display())
+ })?
+ };
+ Ok(Some(contents))
+ } else {
+ Ok(None)
+ }
+}
+
/// Git stores a symlink blob as the raw bytes of its target path; return
/// those bytes, or `None` if the link cannot be read.
#[cfg(unix)]
fn symlink_blob_bytes(path: &Path) -> Option<Vec<u8>> {
    use std::os::unix::ffi::OsStringExt;
    let target = fs::read_link(path).ok()?;
    Some(target.into_os_string().into_vec())
}
+
/// Non-unix fallback: symlink targets are not read, so callers treat the
/// entry as unreadable.
#[cfg(not(unix))]
fn symlink_blob_bytes(_path: &Path) -> Option<Vec<u8>> {
    None
}
+
/// True when `p` is exactly a drive root (`C:\`) or UNC share root
/// (`\\server\share\`): a prefix component, the root dir, and nothing else.
#[cfg(windows)]
fn is_windows_drive_or_unc_root(p: &std::path::Path) -> bool {
    use std::path::Component;
    let mut comps = p.components();
    let starts_with_prefix = matches!(comps.next(), Some(Component::Prefix(_)));
    let then_root = matches!(comps.next(), Some(Component::RootDir));
    starts_with_prefix && then_root && comps.next().is_none()
}
+
+#[cfg(test)]
+mod tests {
+ #![allow(clippy::unwrap_used)]
+ use super::*;
+ use pretty_assertions::assert_eq;
+ use tempfile::tempdir;
+
/// Compute the Git SHA-1 blob object ID for the given content (string).
/// This delegates to the bytes version to avoid UTF-8 lossy conversions here.
fn git_blob_sha1_hex(data: &str) -> String {
    git_blob_sha1_hex_bytes(data.as_bytes())
}
+
/// Test helper: make a unified diff deterministic by replacing the temp root
/// with `<TMP>`, splitting into per-file blocks (each starting at a
/// `diff --git ` line), sorting the blocks, and rejoining with a trailing
/// newline.
fn normalize_diff_for_test(input: &str, root: &Path) -> String {
    let root_str = root.display().to_string().replace('\\', "/");
    let replaced = input.replace(&root_str, "<TMP>");

    // Group lines into blocks; a `diff --git ` line begins a new block unless
    // it is the very first line seen.
    let mut blocks: Vec<Vec<&str>> = Vec::new();
    for line in replaced.lines() {
        match blocks.last_mut() {
            Some(block) if !line.starts_with("diff --git ") => block.push(line),
            _ => blocks.push(vec![line]),
        }
    }

    // Sort the rendered blocks for determinism, then rejoin.
    let mut rendered: Vec<String> = blocks.iter().map(|block| block.join("\n")).collect();
    rendered.sort();

    let mut out = rendered.join("\n");
    if !out.ends_with('\n') {
        out.push('\n');
    }
    out
}
+
/// Adding a file and then updating it in a later patch should still render as
/// a single addition whose contents are the final on-disk state.
#[test]
fn accumulates_add_and_update() {
    let mut acc = TurnDiffTracker::new();

    let dir = tempdir().unwrap();
    let file = dir.path().join("a.txt");

    // First patch: add file (baseline should be /dev/null).
    let add_changes = HashMap::from([(
        file.clone(),
        FileChange::Add {
            content: "foo\n".to_string(),
        },
    )]);
    acc.on_patch_begin(&add_changes).unwrap();

    // Simulate apply: create the file on disk.
    fs::write(&file, "foo\n").unwrap();
    let first = acc.get_unified_diff().unwrap().unwrap();
    let first = normalize_diff_for_test(&first, dir.path());
    let expected_first = {
        let mode = file_mode_for_path(&file).unwrap_or_else(|| REGULAR_MODE.to_string());
        let right_oid = git_blob_sha1_hex("foo\n");
        format!(
            "diff --git a/<TMP>/a.txt b/<TMP>/a.txt\nnew file mode {mode}\nindex {ZERO_OID}..{right_oid}\n--- /dev/null\n+++ b/<TMP>/a.txt\n@@ -0,0 +1 @@\n+foo\n",
        )
    };
    assert_eq!(first, expected_first);

    // Second patch: update the file on disk.
    let update_changes = HashMap::from([(
        file.clone(),
        FileChange::Update {
            unified_diff: "".to_owned(),
            move_path: None,
        },
    )]);
    acc.on_patch_begin(&update_changes).unwrap();

    // Simulate apply: append a new line.
    fs::write(&file, "foo\nbar\n").unwrap();
    let combined = acc.get_unified_diff().unwrap().unwrap();
    let combined = normalize_diff_for_test(&combined, dir.path());
    let expected_combined = {
        let mode = file_mode_for_path(&file).unwrap_or_else(|| REGULAR_MODE.to_string());
        let right_oid = git_blob_sha1_hex("foo\nbar\n");
        format!(
            "diff --git a/<TMP>/a.txt b/<TMP>/a.txt\nnew file mode {mode}\nindex {ZERO_OID}..{right_oid}\n--- /dev/null\n+++ b/<TMP>/a.txt\n@@ -0,0 +1,2 @@\n+foo\n+bar\n",
        )
    };
    assert_eq!(combined, expected_combined);
}
+
/// Deleting a tracked file should produce a `deleted file mode` diff against
/// the baseline contents.
#[test]
fn accumulates_delete() {
    let dir = tempdir().unwrap();
    let file = dir.path().join("b.txt");
    fs::write(&file, "x\n").unwrap();

    let mut acc = TurnDiffTracker::new();
    let del_changes = HashMap::from([(file.clone(), FileChange::Delete)]);
    acc.on_patch_begin(&del_changes).unwrap();

    // Simulate apply: delete the file from disk.
    // (Capture the mode first; it is unavailable once the file is removed.)
    let baseline_mode = file_mode_for_path(&file).unwrap_or_else(|| REGULAR_MODE.to_string());
    fs::remove_file(&file).unwrap();
    let diff = acc.get_unified_diff().unwrap().unwrap();
    let diff = normalize_diff_for_test(&diff, dir.path());
    let expected = {
        let left_oid = git_blob_sha1_hex("x\n");
        format!(
            "diff --git a/<TMP>/b.txt b/<TMP>/b.txt\ndeleted file mode {baseline_mode}\nindex {left_oid}..{ZERO_OID}\n--- a/<TMP>/b.txt\n+++ /dev/null\n@@ -1 +0,0 @@\n-x\n",
        )
    };
    assert_eq!(diff, expected);
}
+
/// A rename combined with a content edit should diff the old path (left)
/// against the new path (right) with both oids reflecting the edit.
#[test]
fn accumulates_move_and_update() {
    let dir = tempdir().unwrap();
    let src = dir.path().join("src.txt");
    let dest = dir.path().join("dst.txt");
    fs::write(&src, "line\n").unwrap();

    let mut acc = TurnDiffTracker::new();
    let mv_changes = HashMap::from([(
        src.clone(),
        FileChange::Update {
            unified_diff: "".to_owned(),
            move_path: Some(dest.clone()),
        },
    )]);
    acc.on_patch_begin(&mv_changes).unwrap();

    // Simulate apply: move and update content.
    fs::rename(&src, &dest).unwrap();
    fs::write(&dest, "line2\n").unwrap();

    let out = acc.get_unified_diff().unwrap().unwrap();
    let out = normalize_diff_for_test(&out, dir.path());
    let expected = {
        let left_oid = git_blob_sha1_hex("line\n");
        let right_oid = git_blob_sha1_hex("line2\n");
        format!(
            "diff --git a/<TMP>/src.txt b/<TMP>/dst.txt\nindex {left_oid}..{right_oid}\n--- a/<TMP>/src.txt\n+++ b/<TMP>/dst.txt\n@@ -1 +1 @@\n-line\n+line2\n"
        )
    };
    assert_eq!(out, expected);
}
+
/// A pure rename with identical contents should produce no diff at all
/// (the byte-equality fast path skips the file).
#[test]
fn move_without_content_change_yields_no_diff() {
    let dir = tempdir().unwrap();
    let src = dir.path().join("moved.txt");
    let dest = dir.path().join("renamed.txt");
    fs::write(&src, "same\n").unwrap();

    let mut acc = TurnDiffTracker::new();
    let mv_changes = HashMap::from([(
        src.clone(),
        FileChange::Update {
            unified_diff: "".to_owned(),
            move_path: Some(dest.clone()),
        },
    )]);
    acc.on_patch_begin(&mv_changes).unwrap();

    // Simulate apply: move only, no content change.
    fs::rename(&src, &dest).unwrap();

    let diff = acc.get_unified_diff().unwrap();
    assert_eq!(diff, None);
}
+
+ #[test]
+ fn move_declared_but_file_only_appears_at_dest_is_add() {
+ let dir = tempdir().unwrap();
+ let src = dir.path().join("src.txt");
+ let dest = dir.path().join("dest.txt");
+ let mut acc = TurnDiffTracker::new();
+ let mv = HashMap::from([(
+ src.clone(),
+ FileChange::Update {
+ unified_diff: "".into(),
+ move_path: Some(dest.clone()),
+ },
+ )]);
+ acc.on_patch_begin(&mv).unwrap();
+ // No file existed initially; create only dest
+ fs::write(&dest, "hello\n").unwrap();
+ let diff = acc.get_unified_diff().unwrap().unwrap();
+ assert!(diff.contains("new file mode"));
Can't we use assert_eq!() here?
- Created: 2025-08-04 16:55:01 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2252061946
@@ -0,0 +1,766 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
/// Snapshot of a file's state the first time it is touched during a turn.
struct BaselineFileInfo {
    /// External path of the file when it was first seen.
    path: Option<PathBuf>,
    /// Raw baseline contents; `None` represents a file that did not exist
    /// (the /dev/null side of a diff).
    contents_bytes: Option<Vec<u8>>,
    /// Git file mode string ("100644", "100755", or "120000"); `None` when unknown.
    mode: Option<String>,
    /// Git blob object id of the baseline contents; set to ZERO_OID when the
    /// file was absent at snapshot time.
    oid: Option<String>,
}
+
/// Tracks sets of changes to files and exposes the overall unified diff.
/// Internally this works as follows:
/// 1. Maintain an in-memory baseline snapshot of files when they are first seen.
///    For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null).
/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking.
/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory
///    using the `similar` crate and emit unified diffs with rewritten external paths.
#[derive(Default)]
pub struct TurnDiffTracker {
    /// Map external path -> internal filename (uuid + same extension).
    external_to_temp_name: HashMap<PathBuf, String>,
    /// Internal filename -> baseline file info captured when the path was first touched.
    baseline_file_info: HashMap<String, BaselineFileInfo>,
    /// Internal filename -> external path as of current accumulated state (after applying all changes).
    /// This is where renames are tracked.
    temp_name_to_current_path: HashMap<String, PathBuf>,
    /// Cache of known git worktree roots to avoid repeated filesystem walks.
    git_root_cache: Vec<PathBuf>,
}
+
impl TurnDiffTracker {
    /// Create an empty tracker with no baselines and no cached git roots.
    pub fn new() -> Self {
        Self::default()
    }

    /// Front-run apply patch calls to track the starting contents of any modified files.
    /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen.
    /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
    /// - Also updates internal mappings for move/rename events.
    pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
        for (path, change) in changes.iter() {
            // Ensure a stable internal filename exists for this external path.
            if !self.external_to_temp_name.contains_key(path) {
                let internal = uuid_filename_for(path);
                self.external_to_temp_name
                    .insert(path.clone(), internal.clone());
                self.temp_name_to_current_path
                    .insert(internal.clone(), path.clone());

                // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
                let (contents_bytes, mode, oid) = if path.exists() {
                    let mode = file_mode_for_path(path);
                    let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE);
                    // Best effort: a failed read snapshots as an empty baseline
                    // rather than aborting the whole turn.
                    let contents_bytes = blob_bytes(path, mode_str)
                        .unwrap_or_default()
                        .unwrap_or_default();
                    // Symlinks hash their target bytes directly; `git hash-object`
                    // is only consulted for regular files.
                    let oid = if mode.as_deref() == Some(SYMLINK_MODE) {
                        git_blob_sha1_hex_bytes(&contents_bytes)
                    } else {
                        self.git_blob_oid_for_path(path)
                            .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes))
                    };
                    (Some(contents_bytes), mode, Some(oid))
                } else {
                    (None, None, Some(ZERO_OID.to_string()))
                };

                self.baseline_file_info.insert(
                    internal.clone(),
                    BaselineFileInfo {
                        path: Some(path.clone()),
                        contents_bytes,
                        mode,
                        oid,
                    },
                );
            }

            // Track rename/move in current mapping if provided in an Update.
            if let FileChange::Update {
                move_path: Some(dest),
                ..
            } = change
            {
                let uuid_filename = match self.external_to_temp_name.get(path) {
                    Some(i) => i.clone(),
                    None => {
                        // This should be rare, but if we haven't mapped the source, create it with no baseline.
                        let i = uuid_filename_for(path);
                        self.baseline_file_info.insert(
                            i.clone(),
                            BaselineFileInfo {
                                path: Some(path.clone()),
                                contents_bytes: None,
                                mode: None,
                                oid: Some(ZERO_OID.to_string()),
                            },
                        );
                        i
                    }
                };
                // Update current external mapping for temp file name.
                self.temp_name_to_current_path
                    .insert(uuid_filename.clone(), dest.clone());
                // Update forward file_mapping: external current -> internal name.
                self.external_to_temp_name.remove(path);
                self.external_to_temp_name
                    .insert(dest.clone(), uuid_filename);
            };
        }

        Ok(())
    }

    /// Resolve the current external path for an internal name, falling back to
    /// the baseline path when no rename has been recorded.
    fn get_path_for_internal(&self, internal: &str) -> Option<PathBuf> {
        self.temp_name_to_current_path
            .get(internal)
            .cloned()
            .or_else(|| {
                self.baseline_file_info
                    .get(internal)
                    .and_then(|info| info.path.clone())
            })
    }

    /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry.
    /// Uses a simple cache of known roots and avoids negative-result caching for simplicity.
    fn find_git_root_cached(&mut self, start: &Path) -> Option<PathBuf> {
        let dir = if start.is_dir() {
            start
        } else {
            start.parent()?
        };

        // Fast path: if any cached root is an ancestor of this path, use it.
        if let Some(root) = self
            .git_root_cache
            .iter()
            .find(|r| dir.starts_with(r))
            .cloned()
        {
            return Some(root);
        }

        // Walk up to find a `.git` marker.
        // NOTE: `.git` may be a directory (normal checkout) or a file
        // (worktrees/submodules), so both are accepted.
        let mut cur = dir.to_path_buf();
        loop {
            let git_marker = cur.join(".git");
            if git_marker.is_dir() || git_marker.is_file() {
                if !self.git_root_cache.iter().any(|r| r == &cur) {
                    self.git_root_cache.push(cur.clone());
                }
                return Some(cur);
            }

            // On Windows, avoid walking above the drive or UNC share root.
            #[cfg(windows)]
            {
                if is_windows_drive_or_unc_root(&cur) {
                    return None;
                }
            }

            if let Some(parent) = cur.parent() {
                cur = parent.to_path_buf();
            } else {
                return None;
            }
        }
    }

    /// Return a display string for `path` relative to its git root if found, else absolute.
    /// Backslashes are normalized to forward slashes for stable output across platforms.
    fn relative_to_git_root_str(&mut self, path: &Path) -> String {
        let s = if let Some(root) = self.find_git_root_cached(path) {
            if let Ok(rel) = path.strip_prefix(&root) {
                rel.display().to_string()
            } else {
                path.display().to_string()
            }
        } else {
            path.display().to_string()
        };
        s.replace('\\', "/")
    }

    /// Ask git to compute the blob SHA-1 for the file at `path` within its repository.
    /// Returns None if no repository is found or git invocation fails.
    fn git_blob_oid_for_path(&mut self, path: &Path) -> Option<String> {
        let root = self.find_git_root_cached(path)?;
        // Compute a path relative to the repo root for better portability across platforms.
        let rel = path.strip_prefix(&root).unwrap_or(path);
        let output = Command::new("git")
            .arg("-C")
            .arg(&root)
            .arg("hash-object")
            .arg("--")
            .arg(rel)
            .output()
            .ok()?;
        if !output.status.success() {
            return None;
        }
        let s = String::from_utf8_lossy(&output.stdout).trim().to_string();
        // Sanity check: a SHA-1 hex object id is exactly 40 characters.
        if s.len() == 40 { Some(s) } else { None }
    }

    /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were
    /// collected before the first time they were touched by apply_patch during this turn with
    /// the current repo state.
    ///
    /// Returns `Ok(None)` when no tracked file differs from its baseline.
    pub fn get_unified_diff(&mut self) -> Result<Option<String>> {
        let mut aggregated = String::new();

        // Compute diffs per tracked internal file in a stable order by external path.
        let mut baseline_file_names: Vec<String> =
            self.baseline_file_info.keys().cloned().collect();
        // Sort lexicographically by full repo-relative path to match git behavior.
        baseline_file_names.sort_by_key(|internal| {
            self.get_path_for_internal(internal)
                .map(|p| self.relative_to_git_root_str(&p))
                .unwrap_or_default()
        });

        for internal in baseline_file_names {
            // Baseline external must exist for any tracked internal.
            let baseline_external = match self
                .baseline_file_info
                .get(&internal)
                .and_then(|i| i.path.clone())
            {
                Some(p) => p,
                None => continue,
            };
            let current_external = match self.get_path_for_internal(&internal) {
                Some(p) => p,
                None => continue,
            };

            // Determine modes early; needed to read symlink content correctly.
            let baseline_mode = self
                .baseline_file_info
                .get(&internal)
                .and_then(|i| i.mode.clone())
                .unwrap_or_else(|| REGULAR_MODE.to_string());
            let current_mode =
                file_mode_for_path(&current_external).unwrap_or_else(|| REGULAR_MODE.to_string());

            let left_bytes = self
                .baseline_file_info
                .get(&internal)
                .and_then(|i| i.contents_bytes.clone());

            let right_bytes = blob_bytes(&current_external, &current_mode)?;

            // Fast path: identical bytes or both missing.
            if left_bytes.as_deref() == right_bytes.as_deref() {
                continue;
            }

            let left_display = self.relative_to_git_root_str(&baseline_external);
            let right_display = self.relative_to_git_root_str(&current_external);

            // Emit a git-style header for better readability and parity with previous behavior.
            aggregated.push_str(&format!("diff --git a/{left_display} b/{right_display}\n"));

            let is_add = left_bytes.is_none() && right_bytes.is_some();
            let is_delete = left_bytes.is_some() && right_bytes.is_none();

            if is_add {
                aggregated.push_str(&format!("new file mode {current_mode}\n"));
            } else if is_delete {
                aggregated.push_str(&format!("deleted file mode {baseline_mode}\n"));
            } else if baseline_mode != current_mode {
                aggregated.push_str(&format!("old mode {baseline_mode}\n"));
                aggregated.push_str(&format!("new mode {current_mode}\n"));
            }

            // Determine blob object IDs for left and right contents. Prefer stored OIDs
            // captured from the original repo state when the change was first seen.
            let left_oid = self
                .baseline_file_info
                .get(&internal)
                .and_then(|i| i.oid.clone())
                .or_else(|| {
                    left_bytes
                        .as_ref()
                        .map(|b| git_blob_sha1_hex_bytes(b))
                        .or(Some(ZERO_OID.to_string()))
                })
                .unwrap_or_else(|| ZERO_OID.to_string());
            let right_oid = if let Some(b) = right_bytes.as_ref() {
                if current_mode == SYMLINK_MODE {
                    git_blob_sha1_hex_bytes(b)
                } else {
                    self.git_blob_oid_for_path(&current_external)
                        .unwrap_or_else(|| git_blob_sha1_hex_bytes(b))
                }
            } else {
                ZERO_OID.to_string()
            };

            // If either side isn't valid UTF-8, emit a binary diff header and continue.
            let left_text = left_bytes
                .as_deref()
                .and_then(|b| std::str::from_utf8(b).ok());
            let right_text = right_bytes
                .as_deref()
                .and_then(|b| std::str::from_utf8(b).ok());

            // Prefer text diffs when possible:
            // - both sides are valid UTF-8
            // - OR one side is missing (add/delete) and the present side is valid UTF-8
            let can_text_diff = match (left_text, right_text, is_add, is_delete) {
                (Some(_), Some(_), _, _) => true,
                (_, Some(_), true, _) => true, // add: left missing, right text
                (Some(_), _, _, true) => true, // delete: left text, right missing
                _ => false,
            };

            if can_text_diff {
                // Diff the contents as text, treating missing side as empty string.
                let l = left_text.unwrap_or("");
                let r = right_text.unwrap_or("");

                // Emit index line without mode suffix to preserve current test expectations.
                aggregated.push_str(&format!("index {left_oid}..{right_oid}\n"));

                let old_header = if left_bytes.is_some() {
                    format!("a/{left_display}")
                } else {
                    "/dev/null".to_string()
                };
                let new_header = if right_bytes.is_some() {
                    format!("b/{right_display}")
                } else {
                    "/dev/null".to_string()
                };

                let diff = similar::TextDiff::from_lines(l, r);
                let unified = diff
                    .unified_diff()
                    .context_radius(3)
                    .header(&old_header, &new_header)
                    .to_string();

                aggregated.push_str(&unified);
                if !aggregated.ends_with('\n') {
                    aggregated.push('\n');
                }
            } else {
                // Binary or invalid UTF-8: emit header only.
                aggregated.push_str(&format!("index {left_oid}..{right_oid}\n"));
                let old_header = if left_bytes.is_some() {
                    format!("a/{left_display}")
                } else {
                    "/dev/null".to_string()
                };
                let new_header = if right_bytes.is_some() {
                    format!("b/{right_display}")
                } else {
                    "/dev/null".to_string()
                };
                aggregated.push_str(&format!("--- {old_header}\n"));
                aggregated.push_str(&format!("+++ {new_header}\n"));
                aggregated.push_str("Binary files differ\n");
                if !aggregated.ends_with('\n') {
                    aggregated.push('\n');
                }
            }
        }

        // An all-whitespace aggregate means nothing actually changed.
        if aggregated.trim().is_empty() {
            Ok(None)
        } else {
            Ok(Some(aggregated))
        }
    }
}
+
+fn uuid_filename_for(path: &Path) -> String {
+ let id = Uuid::new_v4().to_string();
+ match path.extension().and_then(|e| e.to_str()) {
+ Some(ext) if !ext.is_empty() => format!("{id}.{ext}"),
+ _ => id,
+ }
+}
+
+/// Compute the Git SHA-1 blob object ID for the given content (bytes).
+fn git_blob_sha1_hex_bytes(data: &[u8]) -> String {
+ // Git blob hash is sha1 of: "blob <len>\0<data>"
+ let header = format!("blob {}\0", data.len());
+ use sha1::Digest;
+ let mut hasher = sha1::Sha1::new();
+ hasher.update(header.as_bytes());
+ hasher.update(data);
+ let digest = hasher.finalize();
+ let mut out = String::with_capacity(40);
+ for b in digest {
+ use std::fmt::Write;
+ let _ = write!(&mut out, "{b:02x}");
+ }
+ out
+}
+
/// Object id used for a missing diff side (git's all-zero SHA-1).
const ZERO_OID: &str = "0000000000000000000000000000000000000000";
/// Git mode for a non-executable regular file.
const REGULAR_MODE: &str = "100644";
+#[cfg(unix)]
@gpeal but then that would change the tree hash?