# PR #1770: Add a TurnDiffTracker to create a unified diff for an entire turn - URL: https://github.com/openai/codex/pull/1770 - Author: gpeal - Created: 2025-07-31 23:42:44 UTC - Updated: 2025-08-04 16:55:01 UTC - Changes: +998/-18, Files changed: 9, Commits: 25 ## Description This lets us show an accumulating diff across all patches in a turn. Refer to the docs for TurnDiffTracker for implementation details. There are multiple ways this could have been done and this felt like the right tradeoff between reliability and completeness: *Pros* * It will pick up all changes to files that the model touched, including if they run prettier or another command that updates them. * It will not pick up changes made by the user or other agents to files it didn't modify. *Cons* * It will pick up changes that the user made to a file that the model also touched * It will not pick up changes to codegen or files that were not modified with apply_patch ## Full Diff ```diff diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index 7d4e41d0b1..eb4eccd897 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -699,6 +699,7 @@ dependencies = [ "serde_json", "sha1", "shlex", + "similar", "strum_macros 0.27.2", "tempfile", "thiserror 2.0.12", diff --git a/codex-rs/core/Cargo.toml b/codex-rs/core/Cargo.toml index db3fd4f834..466e9adf02 100644 --- a/codex-rs/core/Cargo.toml +++ b/codex-rs/core/Cargo.toml @@ -34,6 +34,7 @@ serde_json = "1" serde_bytes = "0.11" sha1 = "0.10.6" shlex = "1.3.0" +similar = "2.7.0" strum_macros = "0.27.2" thiserror = "2.0.12" time = { version = "0.3", features = ["formatting", "local-offset", "macros"] } diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index 7004dcfcb7..568d87c4a8 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -85,11 +85,13 @@ use crate::protocol::SandboxPolicy; use crate::protocol::SessionConfiguredEvent; use crate::protocol::Submission; use crate::protocol::TaskCompleteEvent; +use 
crate::protocol::TurnDiffEvent; use crate::rollout::RolloutRecorder; use crate::safety::SafetyCheck; use crate::safety::assess_command_safety; use crate::safety::assess_safety_for_untrusted_command; use crate::shell; +use crate::turn_diff_tracker::TurnDiffTracker; use crate::user_notification::UserNotification; use crate::util::backoff; @@ -362,7 +364,11 @@ impl Session { } } - async fn notify_exec_command_begin(&self, exec_command_context: ExecCommandContext) { + async fn on_exec_command_begin( + &self, + turn_diff_tracker: &mut TurnDiffTracker, + exec_command_context: ExecCommandContext, + ) { let ExecCommandContext { sub_id, call_id, @@ -374,11 +380,15 @@ impl Session { Some(ApplyPatchCommandContext { user_explicitly_approved_this_action, changes, - }) => EventMsg::PatchApplyBegin(PatchApplyBeginEvent { - call_id, - auto_approved: !user_explicitly_approved_this_action, - changes, - }), + }) => { + turn_diff_tracker.on_patch_begin(&changes); + + EventMsg::PatchApplyBegin(PatchApplyBeginEvent { + call_id, + auto_approved: !user_explicitly_approved_this_action, + changes, + }) + } None => EventMsg::ExecCommandBegin(ExecCommandBeginEvent { call_id, command: command_for_display.clone(), @@ -392,8 +402,10 @@ impl Session { let _ = self.tx_event.send(event).await; } - async fn notify_exec_command_end( + #[allow(clippy::too_many_arguments)] + async fn on_exec_command_end( &self, + turn_diff_tracker: &mut TurnDiffTracker, sub_id: &str, call_id: &str, output: &ExecToolCallOutput, @@ -433,6 +445,20 @@ impl Session { msg, }; let _ = self.tx_event.send(event).await; + + // If this is an apply_patch, after we emit the end patch, emit a second event + // with the full turn diff if there is one. 
+ if is_apply_patch { + let unified_diff = turn_diff_tracker.get_unified_diff(); + if let Ok(Some(unified_diff)) = unified_diff { + let msg = EventMsg::TurnDiff(TurnDiffEvent { unified_diff }); + let event = Event { + id: sub_id.into(), + msg, + }; + let _ = self.tx_event.send(event).await; + } + } } /// Helper that emits a BackgroundEvent with the given message. This keeps @@ -1006,6 +1032,10 @@ async fn run_task(sess: Arc, sub_id: String, input: Vec) { .await; let last_agent_message: Option; + // Although from the perspective of codex.rs, TurnDiffTracker has the lifecycle of a Task which contains + // many turns, from the perspective of the user, it is a single turn. + let mut turn_diff_tracker = TurnDiffTracker::new(); + loop { // Note that pending_input would be something like a message the user // submitted through the UI while the model was running. Though the UI @@ -1037,7 +1067,7 @@ async fn run_task(sess: Arc, sub_id: String, input: Vec) { }) }) .collect(); - match run_turn(&sess, sub_id.clone(), turn_input).await { + match run_turn(&sess, &mut turn_diff_tracker, sub_id.clone(), turn_input).await { Ok(turn_output) => { let mut items_to_record_in_conversation_history = Vec::::new(); let mut responses = Vec::::new(); @@ -1163,6 +1193,7 @@ async fn run_task(sess: Arc, sub_id: String, input: Vec) { async fn run_turn( sess: &Session, + turn_diff_tracker: &mut TurnDiffTracker, sub_id: String, input: Vec, ) -> CodexResult> { @@ -1177,7 +1208,7 @@ async fn run_turn( let mut retries = 0; loop { - match try_run_turn(sess, &sub_id, &prompt).await { + match try_run_turn(sess, turn_diff_tracker, &sub_id, &prompt).await { Ok(output) => return Ok(output), Err(CodexErr::Interrupted) => return Err(CodexErr::Interrupted), Err(CodexErr::EnvVar(var)) => return Err(CodexErr::EnvVar(var)), @@ -1223,6 +1254,7 @@ struct ProcessedResponseItem { async fn try_run_turn( sess: &Session, + turn_diff_tracker: &mut TurnDiffTracker, sub_id: &str, prompt: &Prompt, ) -> CodexResult> { @@ 
-1310,7 +1342,8 @@ async fn try_run_turn( match event { ResponseEvent::Created => {} ResponseEvent::OutputItemDone(item) => { - let response = handle_response_item(sess, sub_id, item.clone()).await?; + let response = + handle_response_item(sess, turn_diff_tracker, sub_id, item.clone()).await?; output.push(ProcessedResponseItem { item, response }); } @@ -1328,6 +1361,16 @@ async fn try_run_turn( .ok(); } + let unified_diff = turn_diff_tracker.get_unified_diff(); + if let Ok(Some(unified_diff)) = unified_diff { + let msg = EventMsg::TurnDiff(TurnDiffEvent { unified_diff }); + let event = Event { + id: sub_id.to_string(), + msg, + }; + let _ = sess.tx_event.send(event).await; + } + return Ok(output); } ResponseEvent::OutputTextDelta(delta) => { @@ -1432,6 +1475,7 @@ async fn run_compact_task( async fn handle_response_item( sess: &Session, + turn_diff_tracker: &mut TurnDiffTracker, sub_id: &str, item: ResponseItem, ) -> CodexResult> { @@ -1469,7 +1513,17 @@ async fn handle_response_item( .. 
} => { info!("FunctionCall: {arguments}"); - Some(handle_function_call(sess, sub_id.to_string(), name, arguments, call_id).await) + Some( + handle_function_call( + sess, + turn_diff_tracker, + sub_id.to_string(), + name, + arguments, + call_id, + ) + .await, + ) } ResponseItem::LocalShellCall { id, @@ -1504,6 +1558,7 @@ async fn handle_response_item( handle_container_exec_with_params( exec_params, sess, + turn_diff_tracker, sub_id.to_string(), effective_call_id, ) @@ -1521,6 +1576,7 @@ async fn handle_response_item( async fn handle_function_call( sess: &Session, + turn_diff_tracker: &mut TurnDiffTracker, sub_id: String, name: String, arguments: String, @@ -1534,7 +1590,8 @@ async fn handle_function_call( return *output; } }; - handle_container_exec_with_params(params, sess, sub_id, call_id).await + handle_container_exec_with_params(params, sess, turn_diff_tracker, sub_id, call_id) + .await } "update_plan" => handle_update_plan(sess, arguments, sub_id, call_id).await, _ => { @@ -1608,6 +1665,7 @@ fn maybe_run_with_user_profile(params: ExecParams, sess: &Session) -> ExecParams async fn handle_container_exec_with_params( params: ExecParams, sess: &Session, + turn_diff_tracker: &mut TurnDiffTracker, sub_id: String, call_id: String, ) -> ResponseInputItem { @@ -1755,7 +1813,7 @@ async fn handle_container_exec_with_params( }, ), }; - sess.notify_exec_command_begin(exec_command_context.clone()) + sess.on_exec_command_begin(turn_diff_tracker, exec_command_context.clone()) .await; let params = maybe_run_with_user_profile(params, sess); @@ -1782,7 +1840,8 @@ async fn handle_container_exec_with_params( duration, } = &output; - sess.notify_exec_command_end( + sess.on_exec_command_end( + turn_diff_tracker, &sub_id, &call_id, &output, @@ -1806,7 +1865,15 @@ async fn handle_container_exec_with_params( } } Err(CodexErr::Sandbox(error)) => { - handle_sandbox_error(params, exec_command_context, error, sandbox_type, sess).await + handle_sandbox_error( + turn_diff_tracker, + params, + 
exec_command_context, + error, + sandbox_type, + sess, + ) + .await } Err(e) => { // Handle non-sandbox errors @@ -1822,6 +1889,7 @@ async fn handle_container_exec_with_params( } async fn handle_sandbox_error( + turn_diff_tracker: &mut TurnDiffTracker, params: ExecParams, exec_command_context: ExecCommandContext, error: SandboxErr, @@ -1878,7 +1946,8 @@ async fn handle_sandbox_error( sess.notify_background_event(&sub_id, "retrying command without sandbox") .await; - sess.notify_exec_command_begin(exec_command_context).await; + sess.on_exec_command_begin(turn_diff_tracker, exec_command_context) + .await; // This is an escalated retry; the policy will not be // examined and the sandbox has been set to `None`. @@ -1905,8 +1974,14 @@ async fn handle_sandbox_error( duration, } = &retry_output; - sess.notify_exec_command_end(&sub_id, &call_id, &retry_output, is_apply_patch) - .await; + sess.on_exec_command_end( + turn_diff_tracker, + &sub_id, + &call_id, + &retry_output, + is_apply_patch, + ) + .await; let is_success = *exit_code == 0; let content = format_exec_output( diff --git a/codex-rs/core/src/lib.rs b/codex-rs/core/src/lib.rs index 80f9014954..4f083d9e56 100644 --- a/codex-rs/core/src/lib.rs +++ b/codex-rs/core/src/lib.rs @@ -42,6 +42,7 @@ pub(crate) mod safety; pub mod seatbelt; pub mod shell; pub mod spawn; +pub mod turn_diff_tracker; mod user_notification; pub mod util; diff --git a/codex-rs/core/src/protocol.rs b/codex-rs/core/src/protocol.rs index cbb211d955..82591a2c78 100644 --- a/codex-rs/core/src/protocol.rs +++ b/codex-rs/core/src/protocol.rs @@ -387,6 +387,8 @@ pub enum EventMsg { /// Notification that a patch application has finished. PatchApplyEnd(PatchApplyEndEvent), + TurnDiff(TurnDiffEvent), + /// Response to GetHistoryEntryRequest. 
GetHistoryEntryResponse(GetHistoryEntryResponseEvent), @@ -598,6 +600,11 @@ pub struct PatchApplyEndEvent { pub success: bool, } +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TurnDiffEvent { + pub unified_diff: String, +} + #[derive(Debug, Clone, Deserialize, Serialize)] pub struct GetHistoryEntryResponseEvent { pub offset: usize, diff --git a/codex-rs/core/src/turn_diff_tracker.rs b/codex-rs/core/src/turn_diff_tracker.rs new file mode 100644 index 0000000000..7026d7bb32 --- /dev/null +++ b/codex-rs/core/src/turn_diff_tracker.rs @@ -0,0 +1,887 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use anyhow::anyhow; +use sha1::digest::Output; +use uuid::Uuid; + +use crate::protocol::FileChange; + +const ZERO_OID: &str = "0000000000000000000000000000000000000000"; +const DEV_NULL: &str = "/dev/null"; + +struct BaselineFileInfo { + path: PathBuf, + content: Vec, + mode: FileMode, + oid: String, +} + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Maintain an in-memory baseline snapshot of files when they are first seen. +/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null). +/// 2. Keep a stable internal filename (uuid) per external path for rename tracking. +/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory +/// using the `similar` crate and emit unified diffs with rewritten external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Map external path -> internal filename (uuid). + external_to_temp_name: HashMap, + /// Internal filename -> baseline file info. + baseline_file_info: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). 
+ /// This is where renames are tracked. + temp_name_to_current_path: HashMap, + /// Cache of known git worktree roots to avoid repeated filesystem walks. + git_root_cache: Vec, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) { + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = Uuid::new_v4().to_string(); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_current_path + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. 
+ let baseline_file_info = if path.exists() { + let mode = file_mode_for_path(path); + let mode_val = mode.unwrap_or(FileMode::Regular); + let content = blob_bytes(path, &mode_val).unwrap_or_default(); + let oid = if mode == Some(FileMode::Symlink) { + format!("{:x}", git_blob_sha1_hex_bytes(&content)) + } else { + self.git_blob_oid_for_path(path) + .unwrap_or_else(|| format!("{:x}", git_blob_sha1_hex_bytes(&content))) + }; + Some(BaselineFileInfo { + path: path.clone(), + content, + mode: mode_val, + oid, + }) + } else { + Some(BaselineFileInfo { + path: path.clone(), + content: vec![], + mode: FileMode::Regular, + oid: ZERO_OID.to_string(), + }) + }; + + if let Some(baseline_file_info) = baseline_file_info { + self.baseline_file_info + .insert(internal.clone(), baseline_file_info); + } + } + + // Track rename/move in current mapping if provided in an Update. + if let FileChange::Update { + move_path: Some(dest), + .. + } = change + { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = Uuid::new_v4().to_string(); + self.baseline_file_info.insert( + i.clone(), + BaselineFileInfo { + path: path.clone(), + content: vec![], + mode: FileMode::Regular, + oid: ZERO_OID.to_string(), + }, + ); + i + } + }; + // Update current external mapping for temp file name. + self.temp_name_to_current_path + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. 
+ self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + }; + } + } + + fn get_path_for_internal(&self, internal: &str) -> Option { + self.temp_name_to_current_path + .get(internal) + .cloned() + .or_else(|| { + self.baseline_file_info + .get(internal) + .map(|info| info.path.clone()) + }) + } + + /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry. + /// Uses a simple cache of known roots and avoids negative-result caching for simplicity. + fn find_git_root_cached(&mut self, start: &Path) -> Option { + let dir = if start.is_dir() { + start + } else { + start.parent()? + }; + + // Fast path: if any cached root is an ancestor of this path, use it. + if let Some(root) = self + .git_root_cache + .iter() + .find(|r| dir.starts_with(r)) + .cloned() + { + return Some(root); + } + + // Walk up to find a `.git` marker. + let mut cur = dir.to_path_buf(); + loop { + let git_marker = cur.join(".git"); + if git_marker.is_dir() || git_marker.is_file() { + if !self.git_root_cache.iter().any(|r| r == &cur) { + self.git_root_cache.push(cur.clone()); + } + return Some(cur); + } + + // On Windows, avoid walking above the drive or UNC share root. + #[cfg(windows)] + { + if is_windows_drive_or_unc_root(&cur) { + return None; + } + } + + if let Some(parent) = cur.parent() { + cur = parent.to_path_buf(); + } else { + return None; + } + } + } + + /// Return a display string for `path` relative to its git root if found, else absolute. + fn relative_to_git_root_str(&mut self, path: &Path) -> String { + let s = if let Some(root) = self.find_git_root_cached(path) { + if let Ok(rel) = path.strip_prefix(&root) { + rel.display().to_string() + } else { + path.display().to_string() + } + } else { + path.display().to_string() + }; + s.replace('\\', "/") + } + + /// Ask git to compute the blob SHA-1 for the file at `path` within its repository. 
+ /// Returns None if no repository is found or git invocation fails. + fn git_blob_oid_for_path(&mut self, path: &Path) -> Option { + let root = self.find_git_root_cached(path)?; + // Compute a path relative to the repo root for better portability across platforms. + let rel = path.strip_prefix(&root).unwrap_or(path); + let output = Command::new("git") + .arg("-C") + .arg(&root) + .arg("hash-object") + .arg("--") + .arg(rel) + .output() + .ok()?; + if !output.status.success() { + return None; + } + let s = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if s.len() == 40 { Some(s) } else { None } + } + + /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were + /// collected before the first time they were touched by apply_patch during this turn with + /// the current repo state. + pub fn get_unified_diff(&mut self) -> Result> { + let mut aggregated = String::new(); + + // Compute diffs per tracked internal file in a stable order by external path. + let mut baseline_file_names: Vec = + self.baseline_file_info.keys().cloned().collect(); + // Sort lexicographically by full repo-relative path to match git behavior. + baseline_file_names.sort_by_key(|internal| { + self.get_path_for_internal(internal) + .map(|p| self.relative_to_git_root_str(&p)) + .unwrap_or_default() + }); + + for internal in baseline_file_names { + aggregated.push_str(self.get_file_diff(&internal).as_str()); + if !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + } + + if aggregated.trim().is_empty() { + Ok(None) + } else { + Ok(Some(aggregated)) + } + } + + fn get_file_diff(&mut self, internal_file_name: &str) -> String { + let mut aggregated = String::new(); + + // Snapshot lightweight fields only. 
+ let (baseline_external_path, baseline_mode, left_oid) = { + if let Some(info) = self.baseline_file_info.get(internal_file_name) { + (info.path.clone(), info.mode, info.oid.clone()) + } else { + (PathBuf::new(), FileMode::Regular, ZERO_OID.to_string()) + } + }; + let current_external_path = match self.get_path_for_internal(internal_file_name) { + Some(p) => p, + None => return aggregated, + }; + + let current_mode = file_mode_for_path(¤t_external_path).unwrap_or(FileMode::Regular); + let right_bytes = blob_bytes(¤t_external_path, ¤t_mode); + + // Compute displays with &mut self before borrowing any baseline content. + let left_display = self.relative_to_git_root_str(&baseline_external_path); + let right_display = self.relative_to_git_root_str(¤t_external_path); + + // Compute right oid before borrowing baseline content. + let right_oid = if let Some(b) = right_bytes.as_ref() { + if current_mode == FileMode::Symlink { + format!("{:x}", git_blob_sha1_hex_bytes(b)) + } else { + self.git_blob_oid_for_path(¤t_external_path) + .unwrap_or_else(|| format!("{:x}", git_blob_sha1_hex_bytes(b))) + } + } else { + ZERO_OID.to_string() + }; + + // Borrow baseline content only after all &mut self uses are done. + let left_present = left_oid.as_str() != ZERO_OID; + let left_bytes: Option<&[u8]> = if left_present { + self.baseline_file_info + .get(internal_file_name) + .map(|i| i.content.as_slice()) + } else { + None + }; + + // Fast path: identical bytes or both missing. 
+ if left_bytes == right_bytes.as_deref() { + return aggregated; + } + + aggregated.push_str(&format!("diff --git a/{left_display} b/{right_display}\n")); + + let is_add = !left_present && right_bytes.is_some(); + let is_delete = left_present && right_bytes.is_none(); + + if is_add { + aggregated.push_str(&format!("new file mode {current_mode}\n")); + } else if is_delete { + aggregated.push_str(&format!("deleted file mode {baseline_mode}\n")); + } else if baseline_mode != current_mode { + aggregated.push_str(&format!("old mode {baseline_mode}\n")); + aggregated.push_str(&format!("new mode {current_mode}\n")); + } + + let left_text = left_bytes.and_then(|b| std::str::from_utf8(b).ok()); + let right_text = right_bytes + .as_deref() + .and_then(|b| std::str::from_utf8(b).ok()); + + let can_text_diff = matches!( + (left_text, right_text, is_add, is_delete), + (Some(_), Some(_), _, _) | (_, Some(_), true, _) | (Some(_), _, _, true) + ); + + if can_text_diff { + let l = left_text.unwrap_or(""); + let r = right_text.unwrap_or(""); + + aggregated.push_str(&format!("index {left_oid}..{right_oid}\n")); + + let old_header = if left_present { + format!("a/{left_display}") + } else { + DEV_NULL.to_string() + }; + let new_header = if right_bytes.is_some() { + format!("b/{right_display}") + } else { + DEV_NULL.to_string() + }; + + let diff = similar::TextDiff::from_lines(l, r); + let unified = diff + .unified_diff() + .context_radius(3) + .header(&old_header, &new_header) + .to_string(); + + aggregated.push_str(&unified); + } else { + aggregated.push_str(&format!("index {left_oid}..{right_oid}\n")); + let old_header = if left_present { + format!("a/{left_display}") + } else { + DEV_NULL.to_string() + }; + let new_header = if right_bytes.is_some() { + format!("b/{right_display}") + } else { + DEV_NULL.to_string() + }; + aggregated.push_str(&format!("--- {old_header}\n")); + aggregated.push_str(&format!("+++ {new_header}\n")); + aggregated.push_str("Binary files differ\n"); + } + 
aggregated + } +} + +/// Compute the Git SHA-1 blob object ID for the given content (bytes). +fn git_blob_sha1_hex_bytes(data: &[u8]) -> Output { + // Git blob hash is sha1 of: "blob \0" + let header = format!("blob {}\0", data.len()); + use sha1::Digest; + let mut hasher = sha1::Sha1::new(); + hasher.update(header.as_bytes()); + hasher.update(data); + hasher.finalize() +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum FileMode { + Regular, + #[cfg(unix)] + Executable, + Symlink, +} + +impl FileMode { + fn as_str(&self) -> &'static str { + match self { + FileMode::Regular => "100644", + #[cfg(unix)] + FileMode::Executable => "100755", + FileMode::Symlink => "120000", + } + } +} + +impl std::fmt::Display for FileMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +#[cfg(unix)] +fn file_mode_for_path(path: &Path) -> Option { + use std::os::unix::fs::PermissionsExt; + let meta = fs::symlink_metadata(path).ok()?; + let ft = meta.file_type(); + if ft.is_symlink() { + return Some(FileMode::Symlink); + } + let mode = meta.permissions().mode(); + let is_exec = (mode & 0o111) != 0; + Some(if is_exec { + FileMode::Executable + } else { + FileMode::Regular + }) +} + +#[cfg(not(unix))] +fn file_mode_for_path(_path: &Path) -> Option { + // Default to non-executable on non-unix. 
+ Some(FileMode::Regular) +} + +fn blob_bytes(path: &Path, mode: &FileMode) -> Option> { + if path.exists() { + let contents = if *mode == FileMode::Symlink { + symlink_blob_bytes(path) + .ok_or_else(|| anyhow!("failed to read symlink target for {}", path.display())) + } else { + fs::read(path) + .with_context(|| format!("failed to read current file for diff {}", path.display())) + }; + contents.ok() + } else { + None + } +} + +#[cfg(unix)] +fn symlink_blob_bytes(path: &Path) -> Option> { + use std::os::unix::ffi::OsStrExt; + let target = std::fs::read_link(path).ok()?; + Some(target.as_os_str().as_bytes().to_vec()) +} + +#[cfg(not(unix))] +fn symlink_blob_bytes(_path: &Path) -> Option> { + None +} + +#[cfg(windows)] +fn is_windows_drive_or_unc_root(p: &std::path::Path) -> bool { + use std::path::Component; + let mut comps = p.components(); + matches!( + (comps.next(), comps.next(), comps.next()), + (Some(Component::Prefix(_)), Some(Component::RootDir), None) + ) +} + +#[cfg(test)] +mod tests { + #![allow(clippy::unwrap_used)] + use super::*; + use pretty_assertions::assert_eq; + use tempfile::tempdir; + + /// Compute the Git SHA-1 blob object ID for the given content (string). + /// This delegates to the bytes version to avoid UTF-8 lossy conversions here. 
+ fn git_blob_sha1_hex(data: &str) -> String { + format!("{:x}", git_blob_sha1_hex_bytes(data.as_bytes())) + } + + fn normalize_diff_for_test(input: &str, root: &Path) -> String { + let root_str = root.display().to_string().replace('\\', "/"); + let replaced = input.replace(&root_str, ""); + // Split into blocks on lines starting with "diff --git ", sort blocks for determinism, and rejoin + let mut blocks: Vec = Vec::new(); + let mut current = String::new(); + for line in replaced.lines() { + if line.starts_with("diff --git ") && !current.is_empty() { + blocks.push(current); + current = String::new(); + } + if !current.is_empty() { + current.push('\n'); + } + current.push_str(line); + } + if !current.is_empty() { + blocks.push(current); + } + blocks.sort(); + let mut out = blocks.join("\n"); + if !out.ends_with('\n') { + out.push('\n'); + } + out + } + + #[test] + fn accumulates_add_and_update() { + let mut acc = TurnDiffTracker::new(); + + let dir = tempdir().unwrap(); + let file = dir.path().join("a.txt"); + + // First patch: add file (baseline should be /dev/null). + let add_changes = HashMap::from([( + file.clone(), + FileChange::Add { + content: "foo\n".to_string(), + }, + )]); + acc.on_patch_begin(&add_changes); + + // Simulate apply: create the file on disk. + fs::write(&file, "foo\n").unwrap(); + let first = acc.get_unified_diff().unwrap().unwrap(); + let first = normalize_diff_for_test(&first, dir.path()); + let expected_first = { + let mode = file_mode_for_path(&file).unwrap_or(FileMode::Regular); + let right_oid = git_blob_sha1_hex("foo\n"); + format!( + r#"diff --git a//a.txt b//a.txt +new file mode {mode} +index {ZERO_OID}..{right_oid} +--- {DEV_NULL} ++++ b//a.txt +@@ -0,0 +1 @@ ++foo +"#, + ) + }; + assert_eq!(first, expected_first); + + // Second patch: update the file on disk. 
+ let update_changes = HashMap::from([( + file.clone(), + FileChange::Update { + unified_diff: "".to_owned(), + move_path: None, + }, + )]); + acc.on_patch_begin(&update_changes); + + // Simulate apply: append a new line. + fs::write(&file, "foo\nbar\n").unwrap(); + let combined = acc.get_unified_diff().unwrap().unwrap(); + let combined = normalize_diff_for_test(&combined, dir.path()); + let expected_combined = { + let mode = file_mode_for_path(&file).unwrap_or(FileMode::Regular); + let right_oid = git_blob_sha1_hex("foo\nbar\n"); + format!( + r#"diff --git a//a.txt b//a.txt +new file mode {mode} +index {ZERO_OID}..{right_oid} +--- {DEV_NULL} ++++ b//a.txt +@@ -0,0 +1,2 @@ ++foo ++bar +"#, + ) + }; + assert_eq!(combined, expected_combined); + } + + #[test] + fn accumulates_delete() { + let dir = tempdir().unwrap(); + let file = dir.path().join("b.txt"); + fs::write(&file, "x\n").unwrap(); + + let mut acc = TurnDiffTracker::new(); + let del_changes = HashMap::from([(file.clone(), FileChange::Delete)]); + acc.on_patch_begin(&del_changes); + + // Simulate apply: delete the file from disk. 
+ let baseline_mode = file_mode_for_path(&file).unwrap_or(FileMode::Regular); + fs::remove_file(&file).unwrap(); + let diff = acc.get_unified_diff().unwrap().unwrap(); + let diff = normalize_diff_for_test(&diff, dir.path()); + let expected = { + let left_oid = git_blob_sha1_hex("x\n"); + format!( + r#"diff --git a//b.txt b//b.txt +deleted file mode {baseline_mode} +index {left_oid}..{ZERO_OID} +--- a//b.txt ++++ {DEV_NULL} +@@ -1 +0,0 @@ +-x +"#, + ) + }; + assert_eq!(diff, expected); + } + + #[test] + fn accumulates_move_and_update() { + let dir = tempdir().unwrap(); + let src = dir.path().join("src.txt"); + let dest = dir.path().join("dst.txt"); + fs::write(&src, "line\n").unwrap(); + + let mut acc = TurnDiffTracker::new(); + let mv_changes = HashMap::from([( + src.clone(), + FileChange::Update { + unified_diff: "".to_owned(), + move_path: Some(dest.clone()), + }, + )]); + acc.on_patch_begin(&mv_changes); + + // Simulate apply: move and update content. + fs::rename(&src, &dest).unwrap(); + fs::write(&dest, "line2\n").unwrap(); + + let out = acc.get_unified_diff().unwrap().unwrap(); + let out = normalize_diff_for_test(&out, dir.path()); + let expected = { + let left_oid = git_blob_sha1_hex("line\n"); + let right_oid = git_blob_sha1_hex("line2\n"); + format!( + r#"diff --git a//src.txt b//dst.txt +index {left_oid}..{right_oid} +--- a//src.txt ++++ b//dst.txt +@@ -1 +1 @@ +-line ++line2 +"# + ) + }; + assert_eq!(out, expected); + } + + #[test] + fn move_without_1change_yields_no_diff() { + let dir = tempdir().unwrap(); + let src = dir.path().join("moved.txt"); + let dest = dir.path().join("renamed.txt"); + fs::write(&src, "same\n").unwrap(); + + let mut acc = TurnDiffTracker::new(); + let mv_changes = HashMap::from([( + src.clone(), + FileChange::Update { + unified_diff: "".to_owned(), + move_path: Some(dest.clone()), + }, + )]); + acc.on_patch_begin(&mv_changes); + + // Simulate apply: move only, no content change. 
+ fs::rename(&src, &dest).unwrap(); + + let diff = acc.get_unified_diff().unwrap(); + assert_eq!(diff, None); + } + + #[test] + fn move_declared_but_file_only_appears_at_dest_is_add() { + let dir = tempdir().unwrap(); + let src = dir.path().join("src.txt"); + let dest = dir.path().join("dest.txt"); + let mut acc = TurnDiffTracker::new(); + let mv = HashMap::from([( + src.clone(), + FileChange::Update { + unified_diff: "".into(), + move_path: Some(dest.clone()), + }, + )]); + acc.on_patch_begin(&mv); + // No file existed initially; create only dest + fs::write(&dest, "hello\n").unwrap(); + let diff = acc.get_unified_diff().unwrap().unwrap(); + let diff = normalize_diff_for_test(&diff, dir.path()); + let expected = { + let mode = file_mode_for_path(&dest).unwrap_or(FileMode::Regular); + let right_oid = git_blob_sha1_hex("hello\n"); + format!( + r#"diff --git a//src.txt b//dest.txt +new file mode {mode} +index {ZERO_OID}..{right_oid} +--- {DEV_NULL} ++++ b//dest.txt +@@ -0,0 +1 @@ ++hello +"#, + ) + }; + assert_eq!(diff, expected); + } + + #[test] + fn update_persists_across_new_baseline_for_new_file() { + let dir = tempdir().unwrap(); + let a = dir.path().join("a.txt"); + let b = dir.path().join("b.txt"); + fs::write(&a, "foo\n").unwrap(); + fs::write(&b, "z\n").unwrap(); + + let mut acc = TurnDiffTracker::new(); + + // First: update existing a.txt (baseline snapshot is created for a). + let update_a = HashMap::from([( + a.clone(), + FileChange::Update { + unified_diff: "".to_owned(), + move_path: None, + }, + )]); + acc.on_patch_begin(&update_a); + // Simulate apply: modify a.txt on disk. 
+ fs::write(&a, "foo\nbar\n").unwrap(); + let first = acc.get_unified_diff().unwrap().unwrap(); + let first = normalize_diff_for_test(&first, dir.path()); + let expected_first = { + let left_oid = git_blob_sha1_hex("foo\n"); + let right_oid = git_blob_sha1_hex("foo\nbar\n"); + format!( + r#"diff --git a//a.txt b//a.txt +index {left_oid}..{right_oid} +--- a//a.txt ++++ b//a.txt +@@ -1 +1,2 @@ + foo ++bar +"# + ) + }; + assert_eq!(first, expected_first); + + // Next: introduce a brand-new path b.txt into baseline snapshots via a delete change. + let del_b = HashMap::from([(b.clone(), FileChange::Delete)]); + acc.on_patch_begin(&del_b); + // Simulate apply: delete b.txt. + let baseline_mode = file_mode_for_path(&b).unwrap_or(FileMode::Regular); + fs::remove_file(&b).unwrap(); + + let combined = acc.get_unified_diff().unwrap().unwrap(); + let combined = normalize_diff_for_test(&combined, dir.path()); + let expected = { + let left_oid_a = git_blob_sha1_hex("foo\n"); + let right_oid_a = git_blob_sha1_hex("foo\nbar\n"); + let left_oid_b = git_blob_sha1_hex("z\n"); + format!( + r#"diff --git a//a.txt b//a.txt +index {left_oid_a}..{right_oid_a} +--- a//a.txt ++++ b//a.txt +@@ -1 +1,2 @@ + foo ++bar +diff --git a//b.txt b//b.txt +deleted file mode {baseline_mode} +index {left_oid_b}..{ZERO_OID} +--- a//b.txt ++++ {DEV_NULL} +@@ -1 +0,0 @@ +-z +"#, + ) + }; + assert_eq!(combined, expected); + } + + #[test] + fn binary_files_differ_update() { + let dir = tempdir().unwrap(); + let file = dir.path().join("bin.dat"); + + // Initial non-UTF8 bytes + let left_bytes: Vec = vec![0xff, 0xfe, 0xfd, 0x00]; + // Updated non-UTF8 bytes + let right_bytes: Vec = vec![0x01, 0x02, 0x03, 0x00]; + + fs::write(&file, &left_bytes).unwrap(); + + let mut acc = TurnDiffTracker::new(); + let update_changes = HashMap::from([( + file.clone(), + FileChange::Update { + unified_diff: "".to_owned(), + move_path: None, + }, + )]); + acc.on_patch_begin(&update_changes); + + // Apply update on disk + 
fs::write(&file, &right_bytes).unwrap(); + + let diff = acc.get_unified_diff().unwrap().unwrap(); + let diff = normalize_diff_for_test(&diff, dir.path()); + let expected = { + let left_oid = format!("{:x}", git_blob_sha1_hex_bytes(&left_bytes)); + let right_oid = format!("{:x}", git_blob_sha1_hex_bytes(&right_bytes)); + format!( + r#"diff --git a//bin.dat b//bin.dat +index {left_oid}..{right_oid} +--- a//bin.dat ++++ b//bin.dat +Binary files differ +"# + ) + }; + assert_eq!(diff, expected); + } + + #[test] + fn filenames_with_spaces_add_and_update() { + let mut acc = TurnDiffTracker::new(); + + let dir = tempdir().unwrap(); + let file = dir.path().join("name with spaces.txt"); + + // First patch: add file (baseline should be /dev/null). + let add_changes = HashMap::from([( + file.clone(), + FileChange::Add { + content: "foo\n".to_string(), + }, + )]); + acc.on_patch_begin(&add_changes); + + // Simulate apply: create the file on disk. + fs::write(&file, "foo\n").unwrap(); + let first = acc.get_unified_diff().unwrap().unwrap(); + let first = normalize_diff_for_test(&first, dir.path()); + let expected_first = { + let mode = file_mode_for_path(&file).unwrap_or(FileMode::Regular); + let right_oid = git_blob_sha1_hex("foo\n"); + format!( + r#"diff --git a//name with spaces.txt b//name with spaces.txt +new file mode {mode} +index {ZERO_OID}..{right_oid} +--- {DEV_NULL} ++++ b//name with spaces.txt +@@ -0,0 +1 @@ ++foo +"#, + ) + }; + assert_eq!(first, expected_first); + + // Second patch: update the file on disk. + let update_changes = HashMap::from([( + file.clone(), + FileChange::Update { + unified_diff: "".to_owned(), + move_path: None, + }, + )]); + acc.on_patch_begin(&update_changes); + + // Simulate apply: append a new line with a space. 
+ fs::write(&file, "foo\nbar baz\n").unwrap(); + let combined = acc.get_unified_diff().unwrap().unwrap(); + let combined = normalize_diff_for_test(&combined, dir.path()); + let expected_combined = { + let mode = file_mode_for_path(&file).unwrap_or(FileMode::Regular); + let right_oid = git_blob_sha1_hex("foo\nbar baz\n"); + format!( + r#"diff --git a//name with spaces.txt b//name with spaces.txt +new file mode {mode} +index {ZERO_OID}..{right_oid} +--- {DEV_NULL} ++++ b//name with spaces.txt +@@ -0,0 +1,2 @@ ++foo ++bar baz +"#, + ) + }; + assert_eq!(combined, expected_combined); + } +} diff --git a/codex-rs/exec/src/event_processor_with_human_output.rs b/codex-rs/exec/src/event_processor_with_human_output.rs index 72e2f9298f..c290d9336b 100644 --- a/codex-rs/exec/src/event_processor_with_human_output.rs +++ b/codex-rs/exec/src/event_processor_with_human_output.rs @@ -20,6 +20,7 @@ use codex_core::protocol::PatchApplyEndEvent; use codex_core::protocol::SessionConfiguredEvent; use codex_core::protocol::TaskCompleteEvent; use codex_core::protocol::TokenUsage; +use codex_core::protocol::TurnDiffEvent; use owo_colors::OwoColorize; use owo_colors::Style; use shlex::try_join; @@ -399,6 +400,7 @@ impl EventProcessor for EventProcessorWithHumanOutput { stdout, stderr, success, + .. }) => { let patch_begin = self.call_id_to_patch.remove(&call_id); @@ -428,6 +430,10 @@ impl EventProcessor for EventProcessorWithHumanOutput { println!("{}", line.style(self.dimmed)); } } + EventMsg::TurnDiff(TurnDiffEvent { unified_diff }) => { + ts_println!(self, "{}", "turn diff:".style(self.magenta)); + println!("{unified_diff}"); + } EventMsg::ExecApprovalRequest(_) => { // Should we exit? 
} diff --git a/codex-rs/mcp-server/src/codex_tool_runner.rs b/codex-rs/mcp-server/src/codex_tool_runner.rs index d489ffe076..205dfa4631 100644 --- a/codex-rs/mcp-server/src/codex_tool_runner.rs +++ b/codex-rs/mcp-server/src/codex_tool_runner.rs @@ -263,6 +263,7 @@ async fn run_codex_tool_session_inner( | EventMsg::BackgroundEvent(_) | EventMsg::PatchApplyBegin(_) | EventMsg::PatchApplyEnd(_) + | EventMsg::TurnDiff(_) | EventMsg::GetHistoryEntryResponse(_) | EventMsg::PlanUpdate(_) | EventMsg::ShutdownComplete => { diff --git a/codex-rs/mcp-server/src/conversation_loop.rs b/codex-rs/mcp-server/src/conversation_loop.rs index 534275181a..1db39a2306 100644 --- a/codex-rs/mcp-server/src/conversation_loop.rs +++ b/codex-rs/mcp-server/src/conversation_loop.rs @@ -97,6 +97,7 @@ pub async fn run_conversation_loop( | EventMsg::McpToolCallEnd(_) | EventMsg::ExecCommandBegin(_) | EventMsg::ExecCommandEnd(_) + | EventMsg::TurnDiff(_) | EventMsg::BackgroundEvent(_) | EventMsg::ExecCommandOutputDelta(_) | EventMsg::PatchApplyBegin(_) ``` ## Review Comments ### codex-rs/core/Cargo.toml - Created: 2025-08-01 16:35:11 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248403984 ```diff @@ -51,6 +51,7 @@ tree-sitter-bash = "0.25.0" uuid = { version = "1", features = ["serde", "v4"] } whoami = "1.6.0" wildmatch = "2.4.0" +tempfile = "3" ``` > @pakrym-oai alpha sort is from saving in the editor, right? - Created: 2025-08-04 02:42:24 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250289252 ```diff @@ -34,6 +34,7 @@ serde_json = "1" serde_bytes = "0.11" sha1 = "0.10.6" shlex = "1.3.0" +similar = "2" ``` > Maybe we should match https://github.com/openai/codex/blob/e3565a3f438c30c9d36412d2817346c7accd487c/codex-rs/apply-patch/Cargo.toml#L15 (or change that one to be `"2"`?) 
### codex-rs/core/src/codex.rs - Created: 2025-08-01 16:48:22 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248425776 ```diff @@ -1471,6 +1528,7 @@ fn maybe_run_with_user_profile(params: ExecParams, sess: &Session) -> ExecParams async fn handle_container_exec_with_params( params: ExecParams, sess: &Session, + turn_diff_tracker: &mut TurnDiffTracker, ``` > Hmm, what would happen if we wanted to support parallel tool calls at one point. This would be a problem, no? - Created: 2025-08-01 17:25:22 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248497145 ```diff @@ -1471,6 +1528,7 @@ fn maybe_run_with_user_profile(params: ExecParams, sess: &Session) -> ExecParams async fn handle_container_exec_with_params( params: ExecParams, sess: &Session, + turn_diff_tracker: &mut TurnDiffTracker, ``` > Because only one tool call could take ownership of TurnDiffTracker. - Created: 2025-08-04 02:46:36 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250292735 ```diff @@ -374,11 +380,15 @@ impl Session { Some(ApplyPatchCommandContext { user_explicitly_approved_this_action, changes, - }) => EventMsg::PatchApplyBegin(PatchApplyBeginEvent { - call_id, - auto_approved: !user_explicitly_approved_this_action, - changes, - }), + }) => { + let _ = turn_diff_tracker.on_patch_begin(&changes); ``` > If this doesn't have to return `Result`, then `let _` can go away, of course, but depending on what sort of `Err` we expect, perhaps we should at least `warn!()` or `error!()`? - Created: 2025-08-04 02:47:22 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250293562 ```diff @@ -392,8 +402,10 @@ impl Session { let _ = self.tx_event.send(event).await; } - async fn notify_exec_command_end( + #[allow(clippy::too_many_arguments)] ``` > We should maybe introduce a struct in a follow-up PR. 
- Created: 2025-08-04 02:49:08 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250294909 ```diff @@ -1163,6 +1193,7 @@ async fn run_task(sess: Arc, sub_id: String, input: Vec) { async fn run_turn( sess: &Session, + turn_diff_tracker: &mut TurnDiffTracker, ``` > We'll probably want a `struct TurnContext` or somesuch in the near future. - Created: 2025-08-04 02:51:47 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250296938 ```diff @@ -1471,6 +1528,7 @@ fn maybe_run_with_user_profile(params: ExecParams, sess: &Session) -> ExecParams async fn handle_container_exec_with_params( params: ExecParams, sess: &Session, + turn_diff_tracker: &mut TurnDiffTracker, ``` > Yes, though also, if we introduce a `struct TurnContext` as mentioned above, that may also force the move to `Mutex`. But yes, does not have to be done in this PR. - Created: 2025-08-04 02:55:26 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250299871 ```diff @@ -1328,6 +1361,16 @@ async fn try_run_turn( .ok(); } + let unified_diff = turn_diff_tracker.get_unified_diff(); + if let Ok(Some(unified_diff)) = unified_diff { + let msg = EventMsg::TurnDiff(TurnDiffEvent { unified_diff }); + let event = Event { + id: sub_id.to_string(), + msg, + }; + let _ = sess.tx_event.send(event).await; + } + ``` > I'm starting to think that we should do `break token_usage;` to get out of the loop and then do all of this post-loop stuff below just in case there ever ends up being another way to break out. > > It would also eliminate this `return` statement buried in here (though admittedly it would bury the `break` statement instead). 
### codex-rs/core/src/protocol.rs - Created: 2025-08-01 16:41:09 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248413365 ```diff @@ -525,6 +527,11 @@ pub struct PatchApplyEndEvent { pub success: bool, } +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TurnDiffEvent { + pub unified_diff: String, ``` > I feel like this would be easier to work with programmatically if this were keyed by path, more like `changes` in `PatchApplyBeginEvent`. Maybe for a full add or a full delete for an individual file, we still want the unified diff, but it's nice to have added/modified/removed metadata for each path so it's easy to build a compact summary for the diff (maybe with +/- line counts)? - Created: 2025-08-01 16:49:48 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248429044 ```diff @@ -525,6 +527,11 @@ pub struct PatchApplyEndEvent { pub success: bool, } +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TurnDiffEvent { + pub unified_diff: String, ``` > What guarantees, if any, can we make about the paths in the `unified_diff`: will they all be absolute paths? ### codex-rs/core/src/turn_diff_tracker.rs - Created: 2025-08-01 16:54:37 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248437475 ```diff @@ -0,0 +1,476 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use tempfile::TempDir; +use uuid::Uuid; + +use crate::protocol::FileChange; + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Create a temp directory to store baseline snapshots of files when they are first seen. +/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk. +/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null). +/// 3. 
Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs. +/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using +/// `git diff --no-index` and rewrite paths to external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Temp directory holding baseline snapshots of files as first seen. + baseline_files_dir: Option, + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> external path as of baseline snapshot. + temp_name_to_baseline_external: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_external: HashMap, + /// Aggregated unified diff for all accumulated changes across files. + pub unified_diff: Option, ``` > I'm surprised to see this as a field as opposed to always derived? - Created: 2025-08-01 16:57:12 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248442328 ```diff @@ -0,0 +1,476 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use tempfile::TempDir; +use uuid::Uuid; + +use crate::protocol::FileChange; + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Create a temp directory to store baseline snapshots of files when they are first seen. +/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk. +/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null). +/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs. +/// 4. 
To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using +/// `git diff --no-index` and rewrite paths to external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Temp directory holding baseline snapshots of files as first seen. + baseline_files_dir: Option, + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> external path as of baseline snapshot. + temp_name_to_baseline_external: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_external: HashMap, + /// Aggregated unified diff for all accumulated changes across files. + pub unified_diff: Option, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates a baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + self.ensure_baseline_dir()?; + let baseline_dir = self.baseline_dir()?.to_path_buf(); + + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_baseline_external + .insert(internal.clone(), path.clone()); + self.temp_name_to_current_external + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. 
+ if path.exists() { + let contents = fs::read(path) + .with_context(|| format!("failed to read original {}", path.display()))?; + let internal_path = baseline_dir.join(&internal); + fs::write(&internal_path, contents).with_context(|| { + format!("failed to write baseline file {}", internal_path.display()) + })?; + } + } + + // Track rename/move in current mapping if provided in an Update. + let move_path = match change { + FileChange::Update { + move_path: Some(dest), + .. + } => Some(dest), + _ => None, + }; + if let Some(dest) = move_path { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.external_to_temp_name.insert(path.clone(), i.clone()); + self.temp_name_to_baseline_external + .insert(i.clone(), path.clone()); + i + } + }; + // Update current external mapping for temp file name. + self.temp_name_to_current_external + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + } + } + + Ok(()) + } + + /// Recompute the aggregated unified diff by comparing all baseline snapshots against + /// current files on disk using `git diff --no-index` and rewriting paths to external paths. + pub fn update_and_get_unified_diff(&mut self) -> Result> { + let baseline_dir = self.baseline_dir()?.to_path_buf(); + let current_dir = baseline_dir.join("current"); + if current_dir.exists() { + // Best-effort cleanup of previous run's mirror. + let _ = fs::remove_dir_all(¤t_dir); + } + fs::create_dir_all(¤t_dir).with_context(|| { + format!( + "failed to create current mirror dir {}", + current_dir.display() + ) + })?; + + let mut aggregated = String::new(); + + // Compute diffs per tracked internal file. 
+ for (internal, baseline_external) in &self.temp_name_to_baseline_external { + let baseline_path = baseline_dir.join(internal); + let current_external = self + .temp_name_to_current_external + .get(internal) + .cloned() + .unwrap_or_else(|| baseline_external.clone()); + + let left_is_dev_null = !baseline_path.exists(); + let right_exists = current_external.exists(); + + // Prepare right side mirror file if exists; otherwise use /dev/null for deletions. + let right_arg = if right_exists { + let mirror_path = current_dir.join(internal); + let contents = fs::read(¤t_external).with_context(|| { + format!( + "failed to read current file for diff {}", + current_external.display() + ) + })?; + fs::write(&mirror_path, contents).with_context(|| { + format!( + "failed to write current mirror file {}", + mirror_path.display() + ) + })?; + // Use relative path from baseline_dir (so headers say a/ b/current/). + format!("current/{internal}") + } else { + // Deletion: right side is /dev/null to show proper deleted file diff. + "/dev/null".to_string() + }; + + // Prepare left arg: baseline file path or /dev/null for additions. + let left_arg = if left_is_dev_null { + "/dev/null".to_string() + } else { + internal.clone() + }; + + // Run git diff --no-index from baseline_dir to keep paths predictable. + let raw = run_git_allow_exit_codes( + &baseline_dir, + &[ + "-c", + "color.ui=false", + "diff", + "--no-color", + "--no-index", + "--", + &left_arg, + &right_arg, + ], + &[0, 1], // 0: no changes, 1: differences + )?; + + if raw.trim().is_empty() { + continue; + } + let rewritten = self.rewrite_diff_paths(&raw); + if !rewritten.trim().is_empty() { + if !aggregated.is_empty() && !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + aggregated.push_str(&rewritten); + } + } + + self.unified_diff = if aggregated.trim().is_empty() { + None + } else { + Some(aggregated) + }; + + // Clean up the current dir. 
+ let _ = fs::remove_dir_all(¤t_dir); + + Ok(self.unified_diff.clone()) + } + + fn baseline_dir(&self) -> Result<&Path> { + self.baseline_files_dir + .as_ref() + .map(|d| d.path()) + .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized")) + } + + fn ensure_baseline_dir(&mut self) -> Result<()> { + if self.baseline_files_dir.is_some() { + return Ok(()); + } + let tmp = TempDir::new().context("create baseline temp dir")?; + self.baseline_files_dir = Some(tmp); + Ok(()) + } + + /// Rewrites the internal filenames to external paths in diff headers. + /// Handles inputs like: + /// diff --git a/ b/current/ + /// --- a/ | /dev/null + /// +++ b/current/ | /dev/null + /// and replaces uuid with the external paths tracking baseline/current. + fn rewrite_diff_paths(&self, diff: &str) -> String { + let mut out = String::new(); + for line in diff.lines() { + if let Some(rest) = line.strip_prefix("diff --git ") { + // Format: diff --git a/ b/ + let parts: Vec<&str> = rest.split_whitespace().collect(); + if parts.len() == 2 { + let a = parts[0].strip_prefix("a/").unwrap_or(parts[0]); + let b = parts[1].strip_prefix("b/").unwrap_or(parts[1]); + + let a_ext_display = if a == "/dev/null" { + "/dev/null".to_string() + } else { + let a_base = Path::new(a) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(a); + let mapped = self + .temp_name_to_baseline_external + .get(a_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(a)); + mapped.display().to_string() + }; + + let b_ext_display = if b == "/dev/null" { + "/dev/null".to_string() + } else { + let b_base = Path::new(b) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(b); + let mapped = self + .temp_name_to_current_external + .get(b_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(b)); + mapped.display().to_string() + }; + + out.push_str(&format!("diff --git a/{a_ext_display} b/{b_ext_display}\n")); + continue; + } + } + if let Some(rest) = line.strip_prefix("--- ") { + if let Some(path) = 
rest.strip_prefix("a/") { + let external_display = if path == "/dev/null" { + "/dev/null".to_string() + } else { + let p_base = Path::new(path) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(path); + self.temp_name_to_baseline_external + .get(p_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(path)) + .display() + .to_string() + }; + out.push_str(&format!("--- {external_display}\n")); + continue; + } + } + if let Some(rest) = line.strip_prefix("+++ ") { + if let Some(path) = rest.strip_prefix("b/") { + let external_display = if path == "/dev/null" { + "/dev/null".to_string() + } else { + let p_base = Path::new(path) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(path); + self.temp_name_to_current_external + .get(p_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(path)) + .display() + .to_string() + }; + out.push_str(&format!("+++ {external_display}\n")); + continue; + } + } + out.push_str(line); + out.push('\n'); + } + out + } +} + +fn uuid_filename_for(path: &Path) -> String { + let id = Uuid::new_v4().to_string(); + match path.extension().and_then(|e| e.to_str()) { + Some(ext) if !ext.is_empty() => format!("{id}.{ext}"), + _ => id, ``` > ```suggestion > None => id, > ``` - Created: 2025-08-01 16:59:40 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248446717 ```diff @@ -0,0 +1,476 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use tempfile::TempDir; +use uuid::Uuid; + +use crate::protocol::FileChange; + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Create a temp directory to store baseline snapshots of files when they are first seen. +/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk. 
+/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null). +/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs. +/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using +/// `git diff --no-index` and rewrite paths to external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Temp directory holding baseline snapshots of files as first seen. + baseline_files_dir: Option, + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> external path as of baseline snapshot. + temp_name_to_baseline_external: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_external: HashMap, + /// Aggregated unified diff for all accumulated changes across files. + pub unified_diff: Option, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates a baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + self.ensure_baseline_dir()?; + let baseline_dir = self.baseline_dir()?.to_path_buf(); + + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. 
+ if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_baseline_external + .insert(internal.clone(), path.clone()); + self.temp_name_to_current_external + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. + if path.exists() { + let contents = fs::read(path) + .with_context(|| format!("failed to read original {}", path.display()))?; + let internal_path = baseline_dir.join(&internal); + fs::write(&internal_path, contents).with_context(|| { + format!("failed to write baseline file {}", internal_path.display()) + })?; + } + } + + // Track rename/move in current mapping if provided in an Update. + let move_path = match change { + FileChange::Update { + move_path: Some(dest), + .. + } => Some(dest), + _ => None, + }; + if let Some(dest) = move_path { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.external_to_temp_name.insert(path.clone(), i.clone()); + self.temp_name_to_baseline_external + .insert(i.clone(), path.clone()); + i + } + }; + // Update current external mapping for temp file name. + self.temp_name_to_current_external + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + } + } + + Ok(()) + } + + /// Recompute the aggregated unified diff by comparing all baseline snapshots against + /// current files on disk using `git diff --no-index` and rewriting paths to external paths. 
+ pub fn update_and_get_unified_diff(&mut self) -> Result> { + let baseline_dir = self.baseline_dir()?.to_path_buf(); + let current_dir = baseline_dir.join("current"); + if current_dir.exists() { + // Best-effort cleanup of previous run's mirror. + let _ = fs::remove_dir_all(¤t_dir); + } + fs::create_dir_all(¤t_dir).with_context(|| { + format!( + "failed to create current mirror dir {}", + current_dir.display() + ) + })?; + + let mut aggregated = String::new(); + + // Compute diffs per tracked internal file. + for (internal, baseline_external) in &self.temp_name_to_baseline_external { + let baseline_path = baseline_dir.join(internal); + let current_external = self + .temp_name_to_current_external + .get(internal) + .cloned() + .unwrap_or_else(|| baseline_external.clone()); + + let left_is_dev_null = !baseline_path.exists(); + let right_exists = current_external.exists(); + + // Prepare right side mirror file if exists; otherwise use /dev/null for deletions. + let right_arg = if right_exists { + let mirror_path = current_dir.join(internal); + let contents = fs::read(¤t_external).with_context(|| { + format!( + "failed to read current file for diff {}", + current_external.display() + ) + })?; + fs::write(&mirror_path, contents).with_context(|| { + format!( + "failed to write current mirror file {}", + mirror_path.display() + ) + })?; + // Use relative path from baseline_dir (so headers say a/ b/current/). + format!("current/{internal}") + } else { + // Deletion: right side is /dev/null to show proper deleted file diff. + "/dev/null".to_string() + }; + + // Prepare left arg: baseline file path or /dev/null for additions. + let left_arg = if left_is_dev_null { + "/dev/null".to_string() + } else { + internal.clone() + }; + + // Run git diff --no-index from baseline_dir to keep paths predictable. 
+ let raw = run_git_allow_exit_codes( + &baseline_dir, + &[ + "-c", + "color.ui=false", + "diff", + "--no-color", + "--no-index", + "--", + &left_arg, + &right_arg, + ], + &[0, 1], // 0: no changes, 1: differences + )?; + + if raw.trim().is_empty() { + continue; + } + let rewritten = self.rewrite_diff_paths(&raw); + if !rewritten.trim().is_empty() { + if !aggregated.is_empty() && !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + aggregated.push_str(&rewritten); + } + } + + self.unified_diff = if aggregated.trim().is_empty() { + None + } else { + Some(aggregated) + }; + + // Clean up the current dir. + let _ = fs::remove_dir_all(¤t_dir); + + Ok(self.unified_diff.clone()) + } + + fn baseline_dir(&self) -> Result<&Path> { + self.baseline_files_dir + .as_ref() + .map(|d| d.path()) + .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized")) + } + + fn ensure_baseline_dir(&mut self) -> Result<()> { + if self.baseline_files_dir.is_some() { + return Ok(()); + } + let tmp = TempDir::new().context("create baseline temp dir")?; + self.baseline_files_dir = Some(tmp); + Ok(()) + } + + /// Rewrites the internal filenames to external paths in diff headers. + /// Handles inputs like: + /// diff --git a/ b/current/ + /// --- a/ | /dev/null + /// +++ b/current/ | /dev/null + /// and replaces uuid with the external paths tracking baseline/current. 
+ fn rewrite_diff_paths(&self, diff: &str) -> String { + let mut out = String::new(); + for line in diff.lines() { + if let Some(rest) = line.strip_prefix("diff --git ") { + // Format: diff --git a/ b/ + let parts: Vec<&str> = rest.split_whitespace().collect(); + if parts.len() == 2 { + let a = parts[0].strip_prefix("a/").unwrap_or(parts[0]); + let b = parts[1].strip_prefix("b/").unwrap_or(parts[1]); + + let a_ext_display = if a == "/dev/null" { + "/dev/null".to_string() + } else { + let a_base = Path::new(a) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(a); + let mapped = self + .temp_name_to_baseline_external + .get(a_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(a)); + mapped.display().to_string() + }; + + let b_ext_display = if b == "/dev/null" { + "/dev/null".to_string() + } else { + let b_base = Path::new(b) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(b); + let mapped = self + .temp_name_to_current_external + .get(b_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(b)); + mapped.display().to_string() + }; + + out.push_str(&format!("diff --git a/{a_ext_display} b/{b_ext_display}\n")); + continue; + } + } + if let Some(rest) = line.strip_prefix("--- ") { + if let Some(path) = rest.strip_prefix("a/") { + let external_display = if path == "/dev/null" { + "/dev/null".to_string() + } else { + let p_base = Path::new(path) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(path); + self.temp_name_to_baseline_external + .get(p_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(path)) + .display() + .to_string() + }; + out.push_str(&format!("--- {external_display}\n")); + continue; + } + } + if let Some(rest) = line.strip_prefix("+++ ") { + if let Some(path) = rest.strip_prefix("b/") { + let external_display = if path == "/dev/null" { + "/dev/null".to_string() + } else { + let p_base = Path::new(path) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(path); + self.temp_name_to_current_external + .get(p_base) + .cloned() + 
.unwrap_or_else(|| PathBuf::from(path)) + .display() + .to_string() + }; + out.push_str(&format!("+++ {external_display}\n")); + continue; + } + } + out.push_str(line); + out.push('\n'); + } + out + } +} + +fn uuid_filename_for(path: &Path) -> String { + let id = Uuid::new_v4().to_string(); + match path.extension().and_then(|e| e.to_str()) { + Some(ext) if !ext.is_empty() => format!("{id}.{ext}"), + _ => id, + } +} + +fn run_git_allow_exit_codes( + repo: &Path, + args: &[&str], + allowed_exit_codes: &[i32], +) -> Result { + let output = Command::new("git") + .current_dir(repo) + .args(args) + .output() + .with_context(|| format!("failed to run git {:?} in {}", args, repo.display()))?; ``` > ```suggestion > .with_context(|| format!("failed to run `git {args:?}` in {repo}"))?; > ``` - Created: 2025-08-01 17:08:30 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248464099 ```diff @@ -0,0 +1,476 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use tempfile::TempDir; +use uuid::Uuid; + +use crate::protocol::FileChange; + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Create a temp directory to store baseline snapshots of files when they are first seen. +/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk. +/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null). +/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs. +/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using +/// `git diff --no-index` and rewrite paths to external paths. 
+#[derive(Default)] +pub struct TurnDiffTracker { + /// Temp directory holding baseline snapshots of files as first seen. + baseline_files_dir: Option, + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> external path as of baseline snapshot. + temp_name_to_baseline_external: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_external: HashMap, + /// Aggregated unified diff for all accumulated changes across files. + pub unified_diff: Option, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates a baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + self.ensure_baseline_dir()?; + let baseline_dir = self.baseline_dir()?.to_path_buf(); + + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_baseline_external + .insert(internal.clone(), path.clone()); + self.temp_name_to_current_external + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. 
+ if path.exists() { + let contents = fs::read(path) + .with_context(|| format!("failed to read original {}", path.display()))?; + let internal_path = baseline_dir.join(&internal); + fs::write(&internal_path, contents).with_context(|| { + format!("failed to write baseline file {}", internal_path.display()) + })?; + } + } + + // Track rename/move in current mapping if provided in an Update. + let move_path = match change { + FileChange::Update { + move_path: Some(dest), + .. + } => Some(dest), + _ => None, + }; + if let Some(dest) = move_path { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.external_to_temp_name.insert(path.clone(), i.clone()); + self.temp_name_to_baseline_external + .insert(i.clone(), path.clone()); + i + } + }; + // Update current external mapping for temp file name. + self.temp_name_to_current_external + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + } + } + + Ok(()) + } + + /// Recompute the aggregated unified diff by comparing all baseline snapshots against + /// current files on disk using `git diff --no-index` and rewriting paths to external paths. + pub fn update_and_get_unified_diff(&mut self) -> Result> { + let baseline_dir = self.baseline_dir()?.to_path_buf(); + let current_dir = baseline_dir.join("current"); + if current_dir.exists() { + // Best-effort cleanup of previous run's mirror. + let _ = fs::remove_dir_all(¤t_dir); + } + fs::create_dir_all(¤t_dir).with_context(|| { + format!( + "failed to create current mirror dir {}", + current_dir.display() + ) + })?; + + let mut aggregated = String::new(); + + // Compute diffs per tracked internal file. 
+ for (internal, baseline_external) in &self.temp_name_to_baseline_external { + let baseline_path = baseline_dir.join(internal); + let current_external = self + .temp_name_to_current_external + .get(internal) + .cloned() + .unwrap_or_else(|| baseline_external.clone()); + + let left_is_dev_null = !baseline_path.exists(); + let right_exists = current_external.exists(); + + // Prepare right side mirror file if exists; otherwise use /dev/null for deletions. + let right_arg = if right_exists { + let mirror_path = current_dir.join(internal); + let contents = fs::read(¤t_external).with_context(|| { + format!( + "failed to read current file for diff {}", + current_external.display() + ) + })?; + fs::write(&mirror_path, contents).with_context(|| { + format!( + "failed to write current mirror file {}", + mirror_path.display() + ) + })?; + // Use relative path from baseline_dir (so headers say a/ b/current/). + format!("current/{internal}") + } else { + // Deletion: right side is /dev/null to show proper deleted file diff. + "/dev/null".to_string() + }; + + // Prepare left arg: baseline file path or /dev/null for additions. + let left_arg = if left_is_dev_null { + "/dev/null".to_string() + } else { + internal.clone() + }; + + // Run git diff --no-index from baseline_dir to keep paths predictable. + let raw = run_git_allow_exit_codes( + &baseline_dir, + &[ + "-c", + "color.ui=false", + "diff", + "--no-color", + "--no-index", + "--", + &left_arg, + &right_arg, + ], + &[0, 1], // 0: no changes, 1: differences + )?; + + if raw.trim().is_empty() { + continue; + } + let rewritten = self.rewrite_diff_paths(&raw); + if !rewritten.trim().is_empty() { + if !aggregated.is_empty() && !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + aggregated.push_str(&rewritten); + } + } + + self.unified_diff = if aggregated.trim().is_empty() { + None + } else { + Some(aggregated) + }; + + // Clean up the current dir. 
+ let _ = fs::remove_dir_all(&current_dir); + + Ok(self.unified_diff.clone()) + } + + fn baseline_dir(&self) -> Result<&Path> { + self.baseline_files_dir + .as_ref() + .map(|d| d.path()) + .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized")) + } + + fn ensure_baseline_dir(&mut self) -> Result<()> { + if self.baseline_files_dir.is_some() { + return Ok(()); + } + let tmp = TempDir::new().context("create baseline temp dir")?; + self.baseline_files_dir = Some(tmp); + Ok(()) + } + + /// Rewrites the internal filenames to external paths in diff headers. + /// Handles inputs like: + /// diff --git a/<uuid> b/current/<uuid> + /// --- a/<uuid> | /dev/null + /// +++ b/current/<uuid> | /dev/null + /// and replaces uuid with the external paths tracking baseline/current. + fn rewrite_diff_paths(&self, diff: &str) -> String { + let mut out = String::new(); + for line in diff.lines() { + if let Some(rest) = line.strip_prefix("diff --git ") {
Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs. +/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using +/// `git diff --no-index` and rewrite paths to external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Temp directory holding baseline snapshots of files as first seen. + baseline_files_dir: Option, + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> external path as of baseline snapshot. + temp_name_to_baseline_external: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_external: HashMap, + /// Aggregated unified diff for all accumulated changes across files. + pub unified_diff: Option, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates a baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + self.ensure_baseline_dir()?; + let baseline_dir = self.baseline_dir()?.to_path_buf(); + + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. 
+ if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_baseline_external + .insert(internal.clone(), path.clone()); + self.temp_name_to_current_external + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. + if path.exists() { + let contents = fs::read(path) + .with_context(|| format!("failed to read original {}", path.display()))?; + let internal_path = baseline_dir.join(&internal); + fs::write(&internal_path, contents).with_context(|| { + format!("failed to write baseline file {}", internal_path.display()) + })?; + } + } + + // Track rename/move in current mapping if provided in an Update. + let move_path = match change { + FileChange::Update { + move_path: Some(dest), + .. + } => Some(dest), + _ => None, + }; + if let Some(dest) = move_path { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.external_to_temp_name.insert(path.clone(), i.clone()); + self.temp_name_to_baseline_external + .insert(i.clone(), path.clone()); + i + } + }; + // Update current external mapping for temp file name. + self.temp_name_to_current_external + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + } + } + + Ok(()) + } + + /// Recompute the aggregated unified diff by comparing all baseline snapshots against + /// current files on disk using `git diff --no-index` and rewriting paths to external paths. 
+ pub fn update_and_get_unified_diff(&mut self) -> Result> { + let baseline_dir = self.baseline_dir()?.to_path_buf(); + let current_dir = baseline_dir.join("current"); + if current_dir.exists() { + // Best-effort cleanup of previous run's mirror. + let _ = fs::remove_dir_all(¤t_dir); + } + fs::create_dir_all(¤t_dir).with_context(|| { + format!( + "failed to create current mirror dir {}", + current_dir.display() + ) + })?; + + let mut aggregated = String::new(); + + // Compute diffs per tracked internal file. + for (internal, baseline_external) in &self.temp_name_to_baseline_external { + let baseline_path = baseline_dir.join(internal); + let current_external = self + .temp_name_to_current_external + .get(internal) + .cloned() + .unwrap_or_else(|| baseline_external.clone()); + + let left_is_dev_null = !baseline_path.exists(); + let right_exists = current_external.exists(); + + // Prepare right side mirror file if exists; otherwise use /dev/null for deletions. + let right_arg = if right_exists { + let mirror_path = current_dir.join(internal); + let contents = fs::read(¤t_external).with_context(|| { + format!( + "failed to read current file for diff {}", + current_external.display() + ) + })?; + fs::write(&mirror_path, contents).with_context(|| { + format!( + "failed to write current mirror file {}", + mirror_path.display() + ) + })?; + // Use relative path from baseline_dir (so headers say a/ b/current/). + format!("current/{internal}") + } else { + // Deletion: right side is /dev/null to show proper deleted file diff. + "/dev/null".to_string() + }; + + // Prepare left arg: baseline file path or /dev/null for additions. + let left_arg = if left_is_dev_null { + "/dev/null".to_string() + } else { + internal.clone() + }; + + // Run git diff --no-index from baseline_dir to keep paths predictable. 
+ let raw = run_git_allow_exit_codes( + &baseline_dir, + &[ + "-c", + "color.ui=false", + "diff", + "--no-color", + "--no-index", + "--", + &left_arg, + &right_arg, + ], + &[0, 1], // 0: no changes, 1: differences + )?; + + if raw.trim().is_empty() { + continue; + } + let rewritten = self.rewrite_diff_paths(&raw); + if !rewritten.trim().is_empty() { + if !aggregated.is_empty() && !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + aggregated.push_str(&rewritten); + } + } + + self.unified_diff = if aggregated.trim().is_empty() { + None + } else { + Some(aggregated) + }; + + // Clean up the current dir. + let _ = fs::remove_dir_all(¤t_dir); + + Ok(self.unified_diff.clone()) + } + + fn baseline_dir(&self) -> Result<&Path> { + self.baseline_files_dir + .as_ref() + .map(|d| d.path()) + .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized")) + } + + fn ensure_baseline_dir(&mut self) -> Result<()> { + if self.baseline_files_dir.is_some() { + return Ok(()); + } + let tmp = TempDir::new().context("create baseline temp dir")?; + self.baseline_files_dir = Some(tmp); + Ok(()) + } + + /// Rewrites the internal filenames to external paths in diff headers. + /// Handles inputs like: + /// diff --git a/ b/current/ + /// --- a/ | /dev/null + /// +++ b/current/ | /dev/null + /// and replaces uuid with the external paths tracking baseline/current. + fn rewrite_diff_paths(&self, diff: &str) -> String { + let mut out = String::new(); + for line in diff.lines() { + if let Some(rest) = line.strip_prefix("diff --git ") { + // Format: diff --git a/ b/ + let parts: Vec<&str> = rest.split_whitespace().collect(); + if parts.len() == 2 { ``` > We don't have to worry about paths with spaces because they're all UUIDs? 
- Created: 2025-08-01 17:10:28 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248468317 ```diff @@ -0,0 +1,476 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use tempfile::TempDir; +use uuid::Uuid; + +use crate::protocol::FileChange; + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Create a temp directory to store baseline snapshots of files when they are first seen. +/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk. +/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null). +/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs. +/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using +/// `git diff --no-index` and rewrite paths to external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Temp directory holding baseline snapshots of files as first seen. + baseline_files_dir: Option, + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> external path as of baseline snapshot. + temp_name_to_baseline_external: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_external: HashMap, + /// Aggregated unified diff for all accumulated changes across files. + pub unified_diff: Option, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. 
+ /// - Creates a baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + self.ensure_baseline_dir()?; + let baseline_dir = self.baseline_dir()?.to_path_buf(); + + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_baseline_external + .insert(internal.clone(), path.clone()); + self.temp_name_to_current_external + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. + if path.exists() { + let contents = fs::read(path) + .with_context(|| format!("failed to read original {}", path.display()))?; + let internal_path = baseline_dir.join(&internal); + fs::write(&internal_path, contents).with_context(|| { + format!("failed to write baseline file {}", internal_path.display()) + })?; + } + } + + // Track rename/move in current mapping if provided in an Update. + let move_path = match change { + FileChange::Update { + move_path: Some(dest), + .. + } => Some(dest), + _ => None, + }; + if let Some(dest) = move_path { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.external_to_temp_name.insert(path.clone(), i.clone()); + self.temp_name_to_baseline_external + .insert(i.clone(), path.clone()); + i + } + }; + // Update current external mapping for temp file name. 
+ self.temp_name_to_current_external + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + } + } + + Ok(()) + } + + /// Recompute the aggregated unified diff by comparing all baseline snapshots against + /// current files on disk using `git diff --no-index` and rewriting paths to external paths. + pub fn update_and_get_unified_diff(&mut self) -> Result> { + let baseline_dir = self.baseline_dir()?.to_path_buf(); + let current_dir = baseline_dir.join("current"); + if current_dir.exists() { + // Best-effort cleanup of previous run's mirror. + let _ = fs::remove_dir_all(¤t_dir); + } + fs::create_dir_all(¤t_dir).with_context(|| { + format!( + "failed to create current mirror dir {}", + current_dir.display() + ) + })?; + + let mut aggregated = String::new(); + + // Compute diffs per tracked internal file. + for (internal, baseline_external) in &self.temp_name_to_baseline_external { + let baseline_path = baseline_dir.join(internal); + let current_external = self + .temp_name_to_current_external + .get(internal) + .cloned() + .unwrap_or_else(|| baseline_external.clone()); + + let left_is_dev_null = !baseline_path.exists(); + let right_exists = current_external.exists(); + + // Prepare right side mirror file if exists; otherwise use /dev/null for deletions. + let right_arg = if right_exists { + let mirror_path = current_dir.join(internal); + let contents = fs::read(¤t_external).with_context(|| { + format!( + "failed to read current file for diff {}", + current_external.display() + ) + })?; + fs::write(&mirror_path, contents).with_context(|| { + format!( + "failed to write current mirror file {}", + mirror_path.display() + ) + })?; + // Use relative path from baseline_dir (so headers say a/ b/current/). 
+ format!("current/{internal}") + } else { + // Deletion: right side is /dev/null to show proper deleted file diff. + "/dev/null".to_string() + }; + + // Prepare left arg: baseline file path or /dev/null for additions. + let left_arg = if left_is_dev_null { + "/dev/null".to_string() + } else { + internal.clone() + }; + + // Run git diff --no-index from baseline_dir to keep paths predictable. + let raw = run_git_allow_exit_codes( + &baseline_dir, + &[ + "-c", + "color.ui=false", + "diff", + "--no-color", + "--no-index", + "--", + &left_arg, + &right_arg, + ], + &[0, 1], // 0: no changes, 1: differences + )?; + + if raw.trim().is_empty() { + continue; + } + let rewritten = self.rewrite_diff_paths(&raw); + if !rewritten.trim().is_empty() { + if !aggregated.is_empty() && !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + aggregated.push_str(&rewritten); + } + } + + self.unified_diff = if aggregated.trim().is_empty() { + None + } else { + Some(aggregated) + }; + + // Clean up the current dir. + let _ = fs::remove_dir_all(¤t_dir); + + Ok(self.unified_diff.clone()) + } + + fn baseline_dir(&self) -> Result<&Path> { + self.baseline_files_dir + .as_ref() + .map(|d| d.path()) + .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized")) + } + + fn ensure_baseline_dir(&mut self) -> Result<()> { + if self.baseline_files_dir.is_some() { + return Ok(()); + } + let tmp = TempDir::new().context("create baseline temp dir")?; + self.baseline_files_dir = Some(tmp); + Ok(()) + } + + /// Rewrites the internal filenames to external paths in diff headers. + /// Handles inputs like: + /// diff --git a/ b/current/ + /// --- a/ | /dev/null + /// +++ b/current/ | /dev/null + /// and replaces uuid with the external paths tracking baseline/current. 
+ fn rewrite_diff_paths(&self, diff: &str) -> String { + let mut out = String::new(); + for line in diff.lines() { + if let Some(rest) = line.strip_prefix("diff --git ") { + // Format: diff --git a/ b/ + let parts: Vec<&str> = rest.split_whitespace().collect(); + if parts.len() == 2 { + let a = parts[0].strip_prefix("a/").unwrap_or(parts[0]); + let b = parts[1].strip_prefix("b/").unwrap_or(parts[1]); + + let a_ext_display = if a == "/dev/null" { + "/dev/null".to_string() + } else { + let a_base = Path::new(a) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(a); + let mapped = self + .temp_name_to_baseline_external + .get(a_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(a)); + mapped.display().to_string() + }; + + let b_ext_display = if b == "/dev/null" { + "/dev/null".to_string() + } else { + let b_base = Path::new(b) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(b); + let mapped = self + .temp_name_to_current_external + .get(b_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(b)); + mapped.display().to_string() + }; + + out.push_str(&format!("diff --git a/{a_ext_display} b/{b_ext_display}\n")); + continue; + } + } + if let Some(rest) = line.strip_prefix("--- ") { + if let Some(path) = rest.strip_prefix("a/") { + let external_display = if path == "/dev/null" { + "/dev/null".to_string() + } else { + let p_base = Path::new(path) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(path); + self.temp_name_to_baseline_external + .get(p_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(path)) + .display() + .to_string() + }; + out.push_str(&format!("--- {external_display}\n")); + continue; + } + } + if let Some(rest) = line.strip_prefix("+++ ") { + if let Some(path) = rest.strip_prefix("b/") { + let external_display = if path == "/dev/null" { + "/dev/null".to_string() + } else { + let p_base = Path::new(path) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(path); + self.temp_name_to_current_external + .get(p_base) + .cloned() + 
.unwrap_or_else(|| PathBuf::from(path)) + .display() + .to_string() + }; + out.push_str(&format!("+++ {external_display}\n")); + continue; + } + } + out.push_str(line); + out.push('\n'); + } + out + } +} + +fn uuid_filename_for(path: &Path) -> String { + let id = Uuid::new_v4().to_string(); + match path.extension().and_then(|e| e.to_str()) { ``` > Why preserve the `ext`, btw? It could, in theory, contain a space, right? - Created: 2025-08-01 17:10:59 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248469411 ```diff @@ -0,0 +1,476 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use tempfile::TempDir; +use uuid::Uuid; + +use crate::protocol::FileChange; + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Create a temp directory to store baseline snapshots of files when they are first seen. +/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk. +/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null). +/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs. +/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using +/// `git diff --no-index` and rewrite paths to external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Temp directory holding baseline snapshots of files as first seen. + baseline_files_dir: Option, + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> external path as of baseline snapshot. + temp_name_to_baseline_external: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). 
+ /// This is where renames are tracked. + temp_name_to_current_external: HashMap, + /// Aggregated unified diff for all accumulated changes across files. + pub unified_diff: Option, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates a baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + self.ensure_baseline_dir()?; + let baseline_dir = self.baseline_dir()?.to_path_buf(); + + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_baseline_external + .insert(internal.clone(), path.clone()); + self.temp_name_to_current_external + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. + if path.exists() { + let contents = fs::read(path) + .with_context(|| format!("failed to read original {}", path.display()))?; + let internal_path = baseline_dir.join(&internal); + fs::write(&internal_path, contents).with_context(|| { + format!("failed to write baseline file {}", internal_path.display()) + })?; + } + } + + // Track rename/move in current mapping if provided in an Update. + let move_path = match change { + FileChange::Update { + move_path: Some(dest), + .. 
+ } => Some(dest), + _ => None, + }; + if let Some(dest) = move_path { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.external_to_temp_name.insert(path.clone(), i.clone()); + self.temp_name_to_baseline_external + .insert(i.clone(), path.clone()); + i + } + }; + // Update current external mapping for temp file name. + self.temp_name_to_current_external + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + } + } + + Ok(()) + } + + /// Recompute the aggregated unified diff by comparing all baseline snapshots against + /// current files on disk using `git diff --no-index` and rewriting paths to external paths. + pub fn update_and_get_unified_diff(&mut self) -> Result> { + let baseline_dir = self.baseline_dir()?.to_path_buf(); + let current_dir = baseline_dir.join("current"); + if current_dir.exists() { + // Best-effort cleanup of previous run's mirror. + let _ = fs::remove_dir_all(¤t_dir); + } + fs::create_dir_all(¤t_dir).with_context(|| { + format!( + "failed to create current mirror dir {}", + current_dir.display() + ) + })?; + + let mut aggregated = String::new(); + + // Compute diffs per tracked internal file. + for (internal, baseline_external) in &self.temp_name_to_baseline_external { + let baseline_path = baseline_dir.join(internal); + let current_external = self + .temp_name_to_current_external + .get(internal) + .cloned() + .unwrap_or_else(|| baseline_external.clone()); + + let left_is_dev_null = !baseline_path.exists(); + let right_exists = current_external.exists(); + + // Prepare right side mirror file if exists; otherwise use /dev/null for deletions. 
+ let right_arg = if right_exists { + let mirror_path = current_dir.join(internal); + let contents = fs::read(¤t_external).with_context(|| { + format!( + "failed to read current file for diff {}", + current_external.display() + ) + })?; + fs::write(&mirror_path, contents).with_context(|| { + format!( + "failed to write current mirror file {}", + mirror_path.display() + ) + })?; + // Use relative path from baseline_dir (so headers say a/ b/current/). + format!("current/{internal}") + } else { + // Deletion: right side is /dev/null to show proper deleted file diff. + "/dev/null".to_string() + }; + + // Prepare left arg: baseline file path or /dev/null for additions. + let left_arg = if left_is_dev_null { + "/dev/null".to_string() + } else { + internal.clone() + }; + + // Run git diff --no-index from baseline_dir to keep paths predictable. + let raw = run_git_allow_exit_codes( + &baseline_dir, + &[ + "-c", + "color.ui=false", + "diff", + "--no-color", + "--no-index", + "--", + &left_arg, + &right_arg, + ], + &[0, 1], // 0: no changes, 1: differences + )?; + + if raw.trim().is_empty() { + continue; + } + let rewritten = self.rewrite_diff_paths(&raw); + if !rewritten.trim().is_empty() { + if !aggregated.is_empty() && !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + aggregated.push_str(&rewritten); + } + } + + self.unified_diff = if aggregated.trim().is_empty() { + None + } else { + Some(aggregated) + }; + + // Clean up the current dir. 
+ let _ = fs::remove_dir_all(¤t_dir); + + Ok(self.unified_diff.clone()) + } + + fn baseline_dir(&self) -> Result<&Path> { + self.baseline_files_dir + .as_ref() + .map(|d| d.path()) + .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized")) + } + + fn ensure_baseline_dir(&mut self) -> Result<()> { + if self.baseline_files_dir.is_some() { + return Ok(()); + } + let tmp = TempDir::new().context("create baseline temp dir")?; + self.baseline_files_dir = Some(tmp); + Ok(()) + } + + /// Rewrites the internal filenames to external paths in diff headers. + /// Handles inputs like: + /// diff --git a/ b/current/ + /// --- a/ | /dev/null + /// +++ b/current/ | /dev/null + /// and replaces uuid with the external paths tracking baseline/current. + fn rewrite_diff_paths(&self, diff: &str) -> String { + let mut out = String::new(); + for line in diff.lines() { + if let Some(rest) = line.strip_prefix("diff --git ") { + // Format: diff --git a/ b/ + let parts: Vec<&str> = rest.split_whitespace().collect(); + if parts.len() == 2 { + let a = parts[0].strip_prefix("a/").unwrap_or(parts[0]); + let b = parts[1].strip_prefix("b/").unwrap_or(parts[1]); + + let a_ext_display = if a == "/dev/null" { + "/dev/null".to_string() + } else { + let a_base = Path::new(a) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(a); + let mapped = self + .temp_name_to_baseline_external + .get(a_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(a)); + mapped.display().to_string() + }; + + let b_ext_display = if b == "/dev/null" { + "/dev/null".to_string() + } else { + let b_base = Path::new(b) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(b); + let mapped = self + .temp_name_to_current_external + .get(b_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(b)); + mapped.display().to_string() + }; + + out.push_str(&format!("diff --git a/{a_ext_display} b/{b_ext_display}\n")); + continue; + } + } + if let Some(rest) = line.strip_prefix("--- ") { + if let Some(path) = 
rest.strip_prefix("a/") { + let external_display = if path == "/dev/null" { + "/dev/null".to_string() + } else { + let p_base = Path::new(path) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(path); + self.temp_name_to_baseline_external + .get(p_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(path)) + .display() + .to_string() + }; + out.push_str(&format!("--- {external_display}\n")); + continue; + } + } + if let Some(rest) = line.strip_prefix("+++ ") { + if let Some(path) = rest.strip_prefix("b/") { + let external_display = if path == "/dev/null" { + "/dev/null".to_string() + } else { + let p_base = Path::new(path) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(path); + self.temp_name_to_current_external + .get(p_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(path)) + .display() + .to_string() + }; + out.push_str(&format!("+++ {external_display}\n")); + continue; + } + } + out.push_str(line); + out.push('\n'); + } + out + } +} + +fn uuid_filename_for(path: &Path) -> String { + let id = Uuid::new_v4().to_string(); + match path.extension().and_then(|e| e.to_str()) { + Some(ext) if !ext.is_empty() => format!("{id}.{ext}"), + _ => id, + } +} + +fn run_git_allow_exit_codes( + repo: &Path, + args: &[&str], + allowed_exit_codes: &[i32], +) -> Result { + let output = Command::new("git") ``` > Should we make this async and use `tokio::Command`? - Created: 2025-08-01 17:12:45 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248474046 ```diff @@ -0,0 +1,476 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use tempfile::TempDir; +use uuid::Uuid; + +use crate::protocol::FileChange; + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Create a temp directory to store baseline snapshots of files when they are first seen. +/// 2. 
When a path is first observed, copy its current contents into the baseline dir if it exists on disk.
+///    For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null).
+/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs.
+/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using
+///    `git diff --no-index` and rewrite paths to external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+    /// Temp directory holding baseline snapshots of files as first seen.
+    baseline_files_dir: Option<TempDir>,
+    /// Map external path -> internal filename (uuid + same extension).
+    external_to_temp_name: HashMap<PathBuf, String>,
+    /// Internal filename -> external path as of baseline snapshot.
+    temp_name_to_baseline_external: HashMap<String, PathBuf>,
+    /// Internal filename -> external path as of current accumulated state (after applying all changes).
+    /// This is where renames are tracked.
+    temp_name_to_current_external: HashMap<String, PathBuf>,
+    /// Aggregated unified diff for all accumulated changes across files.
+    pub unified_diff: Option<String>,
+}
+
+impl TurnDiffTracker {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Front-run apply patch calls to track the starting contents of any modified files.
+    /// - Creates a baseline snapshot for files that already exist on disk when first seen.
+    /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+    /// - Also updates internal mappings for move/rename events.
+    pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+        self.ensure_baseline_dir()?;
+        let baseline_dir = self.baseline_dir()?.to_path_buf();
+
+        for (path, change) in changes.iter() {
+            // Ensure a stable internal filename exists for this external path.
+ if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_baseline_external + .insert(internal.clone(), path.clone()); + self.temp_name_to_current_external + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. + if path.exists() { + let contents = fs::read(path) + .with_context(|| format!("failed to read original {}", path.display()))?; + let internal_path = baseline_dir.join(&internal); + fs::write(&internal_path, contents).with_context(|| { + format!("failed to write baseline file {}", internal_path.display()) + })?; + } + } + + // Track rename/move in current mapping if provided in an Update. + let move_path = match change { + FileChange::Update { + move_path: Some(dest), + .. + } => Some(dest), + _ => None, + }; + if let Some(dest) = move_path { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.external_to_temp_name.insert(path.clone(), i.clone()); + self.temp_name_to_baseline_external + .insert(i.clone(), path.clone()); + i + } + }; + // Update current external mapping for temp file name. + self.temp_name_to_current_external + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + } + } + + Ok(()) + } + + /// Recompute the aggregated unified diff by comparing all baseline snapshots against + /// current files on disk using `git diff --no-index` and rewriting paths to external paths. 
+    pub fn update_and_get_unified_diff(&mut self) -> Result<Option<String>> {
+        let baseline_dir = self.baseline_dir()?.to_path_buf();
+        let current_dir = baseline_dir.join("current");
+        if current_dir.exists() {
+            // Best-effort cleanup of previous run's mirror.
+            let _ = fs::remove_dir_all(&current_dir);
+        }
+        fs::create_dir_all(&current_dir).with_context(|| {
+            format!(
+                "failed to create current mirror dir {}",
+                current_dir.display()
+            )
+        })?;
+
+        let mut aggregated = String::new();
+
+        // Compute diffs per tracked internal file.
+        for (internal, baseline_external) in &self.temp_name_to_baseline_external {
+            let baseline_path = baseline_dir.join(internal);
+            let current_external = self
+                .temp_name_to_current_external
+                .get(internal)
+                .cloned()
+                .unwrap_or_else(|| baseline_external.clone());
+
+            let left_is_dev_null = !baseline_path.exists();
+            let right_exists = current_external.exists();
+
+            // Prepare right side mirror file if exists; otherwise use /dev/null for deletions.
+            let right_arg = if right_exists {
+                let mirror_path = current_dir.join(internal);
+                let contents = fs::read(&current_external).with_context(|| {
+                    format!(
+                        "failed to read current file for diff {}",
+                        current_external.display()
+                    )
+                })?;
+                fs::write(&mirror_path, contents).with_context(|| {
+                    format!(
+                        "failed to write current mirror file {}",
+                        mirror_path.display()
+                    )
+                })?;
+                // Use relative path from baseline_dir (so headers say a/<uuid> b/current/<uuid>).
+            let raw = run_git_allow_exit_codes(
+                &baseline_dir,
+                &[
+                    "-c",
+                    "color.ui=false",
+                    "diff",
+                    "--no-color",
+                    "--no-index",
+                    "--",
+                    &left_arg,
+                    &right_arg,
+                ],
+                &[0, 1], // 0: no changes, 1: differences
+            )?;
+
+            if raw.trim().is_empty() {
+                continue;
+            }
+            let rewritten = self.rewrite_diff_paths(&raw);
+            if !rewritten.trim().is_empty() {
+                if !aggregated.is_empty() && !aggregated.ends_with('\n') {
+                    aggregated.push('\n');
+                }
+                aggregated.push_str(&rewritten);
+            }
+        }
+
+        self.unified_diff = if aggregated.trim().is_empty() {
+            None
+        } else {
+            Some(aggregated)
+        };
+
+        // Clean up the current dir.
+        let _ = fs::remove_dir_all(&current_dir);
+
+        Ok(self.unified_diff.clone())
+    }
+
+    fn baseline_dir(&self) -> Result<&Path> {
+        self.baseline_files_dir
+            .as_ref()
+            .map(|d| d.path())
+            .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized"))
+    }
+
+    fn ensure_baseline_dir(&mut self) -> Result<()> {
+        if self.baseline_files_dir.is_some() {
+            return Ok(());
+        }
+        let tmp = TempDir::new().context("create baseline temp dir")?;
+        self.baseline_files_dir = Some(tmp);
+        Ok(())
+    }
+
+    /// Rewrites the internal filenames to external paths in diff headers.
+    /// Handles inputs like:
+    ///   diff --git a/<uuid> b/current/<uuid>
+    ///   --- a/<uuid> | /dev/null
+    ///   +++ b/current/<uuid> | /dev/null
+    /// and replaces uuid with the external paths tracking baseline/current.
+ fn rewrite_diff_paths(&self, diff: &str) -> String { + let mut out = String::new(); + for line in diff.lines() { + if let Some(rest) = line.strip_prefix("diff --git ") { + // Format: diff --git a/ b/ + let parts: Vec<&str> = rest.split_whitespace().collect(); + if parts.len() == 2 { + let a = parts[0].strip_prefix("a/").unwrap_or(parts[0]); + let b = parts[1].strip_prefix("b/").unwrap_or(parts[1]); + + let a_ext_display = if a == "/dev/null" { + "/dev/null".to_string() + } else { + let a_base = Path::new(a) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(a); + let mapped = self + .temp_name_to_baseline_external + .get(a_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(a)); + mapped.display().to_string() + }; + + let b_ext_display = if b == "/dev/null" { + "/dev/null".to_string() + } else { + let b_base = Path::new(b) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(b); + let mapped = self + .temp_name_to_current_external + .get(b_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(b)); + mapped.display().to_string() + }; + + out.push_str(&format!("diff --git a/{a_ext_display} b/{b_ext_display}\n")); + continue; + } + } + if let Some(rest) = line.strip_prefix("--- ") { + if let Some(path) = rest.strip_prefix("a/") { + let external_display = if path == "/dev/null" { + "/dev/null".to_string() + } else { + let p_base = Path::new(path) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(path); + self.temp_name_to_baseline_external + .get(p_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(path)) + .display() + .to_string() + }; + out.push_str(&format!("--- {external_display}\n")); + continue; + } + } + if let Some(rest) = line.strip_prefix("+++ ") { + if let Some(path) = rest.strip_prefix("b/") { + let external_display = if path == "/dev/null" { + "/dev/null".to_string() + } else { + let p_base = Path::new(path) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(path); + self.temp_name_to_current_external + .get(p_base) + .cloned() + 
.unwrap_or_else(|| PathBuf::from(path)) + .display() + .to_string() + }; + out.push_str(&format!("+++ {external_display}\n")); + continue; + } + } + out.push_str(line); + out.push('\n'); + } + out + } +} + +fn uuid_filename_for(path: &Path) -> String { + let id = Uuid::new_v4().to_string(); + match path.extension().and_then(|e| e.to_str()) { + Some(ext) if !ext.is_empty() => format!("{id}.{ext}"), + _ => id, + } +} + +fn run_git_allow_exit_codes( + repo: &Path, + args: &[&str], + allowed_exit_codes: &[i32], +) -> Result { + let output = Command::new("git") + .current_dir(repo) + .args(args) + .output() + .with_context(|| format!("failed to run git {:?} in {}", args, repo.display()))?; + let code = output.status.code().unwrap_or(-1); + if !allowed_exit_codes.contains(&code) { + anyhow::bail!( + "git {:?} failed with status {:?}: {}", + args, + output.status, + String::from_utf8_lossy(&output.stderr) + ); + } + Ok(String::from_utf8_lossy(&output.stdout).into_owned()) +} + +#[cfg(test)] +mod tests { + #![allow(clippy::unwrap_used)] + use super::*; + use tempfile::tempdir; + + #[test] + fn accumulates_add_and_update() { + let mut acc = TurnDiffTracker::new(); + + let dir = tempdir().unwrap(); + let file = dir.path().join("a.txt"); + + // First patch: add file (baseline should be /dev/null). + let add_changes = HashMap::from([( + file.clone(), + FileChange::Add { + content: "foo\n".to_string(), + }, + )]); + acc.on_patch_begin(&add_changes).unwrap(); + + // Simulate apply: create the file on disk. + // This must happen after on_patch_begin. + fs::write(&file, "foo\n").unwrap(); + acc.update_and_get_unified_diff().unwrap(); + let first = acc.unified_diff.clone().unwrap(); + assert!(first.contains("+foo")); ``` > Instead of `contains()` checks, can these all be full `assert_eq!()` checks? 
- Created: 2025-08-01 17:13:43 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248476408 ```diff @@ -0,0 +1,476 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use tempfile::TempDir; +use uuid::Uuid; + +use crate::protocol::FileChange; + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Create a temp directory to store baseline snapshots of files when they are first seen. +/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk. +/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null). +/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs. +/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using +/// `git diff --no-index` and rewrite paths to external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Temp directory holding baseline snapshots of files as first seen. + baseline_files_dir: Option, + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> external path as of baseline snapshot. + temp_name_to_baseline_external: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_external: HashMap, + /// Aggregated unified diff for all accumulated changes across files. + pub unified_diff: Option, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. 
+ /// - Creates a baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + self.ensure_baseline_dir()?; + let baseline_dir = self.baseline_dir()?.to_path_buf(); + + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_baseline_external + .insert(internal.clone(), path.clone()); + self.temp_name_to_current_external + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. + if path.exists() { + let contents = fs::read(path) + .with_context(|| format!("failed to read original {}", path.display()))?; + let internal_path = baseline_dir.join(&internal); + fs::write(&internal_path, contents).with_context(|| { + format!("failed to write baseline file {}", internal_path.display()) + })?; + } + } + + // Track rename/move in current mapping if provided in an Update. + let move_path = match change { + FileChange::Update { + move_path: Some(dest), + .. + } => Some(dest), + _ => None, + }; + if let Some(dest) = move_path { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.external_to_temp_name.insert(path.clone(), i.clone()); + self.temp_name_to_baseline_external + .insert(i.clone(), path.clone()); + i + } + }; + // Update current external mapping for temp file name. 
+ self.temp_name_to_current_external + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + } + } + + Ok(()) + } + + /// Recompute the aggregated unified diff by comparing all baseline snapshots against + /// current files on disk using `git diff --no-index` and rewriting paths to external paths. + pub fn update_and_get_unified_diff(&mut self) -> Result> { + let baseline_dir = self.baseline_dir()?.to_path_buf(); + let current_dir = baseline_dir.join("current"); + if current_dir.exists() { + // Best-effort cleanup of previous run's mirror. + let _ = fs::remove_dir_all(¤t_dir); + } + fs::create_dir_all(¤t_dir).with_context(|| { + format!( + "failed to create current mirror dir {}", + current_dir.display() + ) + })?; + + let mut aggregated = String::new(); + + // Compute diffs per tracked internal file. + for (internal, baseline_external) in &self.temp_name_to_baseline_external { + let baseline_path = baseline_dir.join(internal); + let current_external = self + .temp_name_to_current_external + .get(internal) + .cloned() + .unwrap_or_else(|| baseline_external.clone()); + + let left_is_dev_null = !baseline_path.exists(); + let right_exists = current_external.exists(); + + // Prepare right side mirror file if exists; otherwise use /dev/null for deletions. + let right_arg = if right_exists { + let mirror_path = current_dir.join(internal); + let contents = fs::read(¤t_external).with_context(|| { + format!( + "failed to read current file for diff {}", + current_external.display() + ) + })?; + fs::write(&mirror_path, contents).with_context(|| { + format!( + "failed to write current mirror file {}", + mirror_path.display() + ) + })?; + // Use relative path from baseline_dir (so headers say a/ b/current/). 
+ format!("current/{internal}") + } else { + // Deletion: right side is /dev/null to show proper deleted file diff. + "/dev/null".to_string() + }; + + // Prepare left arg: baseline file path or /dev/null for additions. + let left_arg = if left_is_dev_null { + "/dev/null".to_string() + } else { + internal.clone() + }; + + // Run git diff --no-index from baseline_dir to keep paths predictable. + let raw = run_git_allow_exit_codes( + &baseline_dir, + &[ + "-c", + "color.ui=false", + "diff", + "--no-color", + "--no-index", + "--", + &left_arg, + &right_arg, + ], + &[0, 1], // 0: no changes, 1: differences + )?; + + if raw.trim().is_empty() { + continue; + } + let rewritten = self.rewrite_diff_paths(&raw); + if !rewritten.trim().is_empty() { + if !aggregated.is_empty() && !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + aggregated.push_str(&rewritten); + } + } + + self.unified_diff = if aggregated.trim().is_empty() { + None + } else { + Some(aggregated) + }; + + // Clean up the current dir. + let _ = fs::remove_dir_all(¤t_dir); + + Ok(self.unified_diff.clone()) + } + + fn baseline_dir(&self) -> Result<&Path> { + self.baseline_files_dir + .as_ref() + .map(|d| d.path()) + .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized")) + } + + fn ensure_baseline_dir(&mut self) -> Result<()> { + if self.baseline_files_dir.is_some() { + return Ok(()); + } + let tmp = TempDir::new().context("create baseline temp dir")?; + self.baseline_files_dir = Some(tmp); + Ok(()) + } + + /// Rewrites the internal filenames to external paths in diff headers. + /// Handles inputs like: + /// diff --git a/ b/current/ + /// --- a/ | /dev/null + /// +++ b/current/ | /dev/null + /// and replaces uuid with the external paths tracking baseline/current. 
+ fn rewrite_diff_paths(&self, diff: &str) -> String { + let mut out = String::new(); + for line in diff.lines() { + if let Some(rest) = line.strip_prefix("diff --git ") { + // Format: diff --git a/ b/ + let parts: Vec<&str> = rest.split_whitespace().collect(); + if parts.len() == 2 { + let a = parts[0].strip_prefix("a/").unwrap_or(parts[0]); + let b = parts[1].strip_prefix("b/").unwrap_or(parts[1]); + + let a_ext_display = if a == "/dev/null" { + "/dev/null".to_string() + } else { + let a_base = Path::new(a) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(a); + let mapped = self + .temp_name_to_baseline_external + .get(a_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(a)); + mapped.display().to_string() + }; + + let b_ext_display = if b == "/dev/null" { + "/dev/null".to_string() + } else { + let b_base = Path::new(b) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(b); + let mapped = self + .temp_name_to_current_external + .get(b_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(b)); + mapped.display().to_string() + }; + + out.push_str(&format!("diff --git a/{a_ext_display} b/{b_ext_display}\n")); + continue; + } + } + if let Some(rest) = line.strip_prefix("--- ") { + if let Some(path) = rest.strip_prefix("a/") { + let external_display = if path == "/dev/null" { + "/dev/null".to_string() + } else { + let p_base = Path::new(path) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(path); + self.temp_name_to_baseline_external + .get(p_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(path)) + .display() + .to_string() + }; + out.push_str(&format!("--- {external_display}\n")); + continue; + } + } + if let Some(rest) = line.strip_prefix("+++ ") { + if let Some(path) = rest.strip_prefix("b/") { + let external_display = if path == "/dev/null" { + "/dev/null".to_string() + } else { + let p_base = Path::new(path) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(path); + self.temp_name_to_current_external + .get(p_base) + .cloned() + 
.unwrap_or_else(|| PathBuf::from(path)) + .display() + .to_string() + }; + out.push_str(&format!("+++ {external_display}\n")); + continue; + } + } + out.push_str(line); + out.push('\n'); + } + out + } +} + +fn uuid_filename_for(path: &Path) -> String { + let id = Uuid::new_v4().to_string(); + match path.extension().and_then(|e| e.to_str()) { + Some(ext) if !ext.is_empty() => format!("{id}.{ext}"), + _ => id, + } +} + +fn run_git_allow_exit_codes( + repo: &Path, + args: &[&str], + allowed_exit_codes: &[i32], +) -> Result { + let output = Command::new("git") + .current_dir(repo) + .args(args) + .output() + .with_context(|| format!("failed to run git {:?} in {}", args, repo.display()))?; + let code = output.status.code().unwrap_or(-1); + if !allowed_exit_codes.contains(&code) { + anyhow::bail!( + "git {:?} failed with status {:?}: {}", + args, + output.status, + String::from_utf8_lossy(&output.stderr) + ); + } + Ok(String::from_utf8_lossy(&output.stdout).into_owned()) +} + +#[cfg(test)] +mod tests { + #![allow(clippy::unwrap_used)] ``` > Can you also test that paths with spaces work as intended? - Created: 2025-08-01 17:15:26 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248479943 ```diff @@ -0,0 +1,476 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use tempfile::TempDir; +use uuid::Uuid; + +use crate::protocol::FileChange; + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Create a temp directory to store baseline snapshots of files when they are first seen. +/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk. +/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null). +/// 3. 
Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs. +/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using +/// `git diff --no-index` and rewrite paths to external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Temp directory holding baseline snapshots of files as first seen. + baseline_files_dir: Option, + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> external path as of baseline snapshot. + temp_name_to_baseline_external: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_external: HashMap, + /// Aggregated unified diff for all accumulated changes across files. + pub unified_diff: Option, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates a baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + self.ensure_baseline_dir()?; + let baseline_dir = self.baseline_dir()?.to_path_buf(); + + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. 
+ if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_baseline_external + .insert(internal.clone(), path.clone()); + self.temp_name_to_current_external + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. + if path.exists() { + let contents = fs::read(path) + .with_context(|| format!("failed to read original {}", path.display()))?; + let internal_path = baseline_dir.join(&internal); + fs::write(&internal_path, contents).with_context(|| { + format!("failed to write baseline file {}", internal_path.display()) + })?; + } + } + + // Track rename/move in current mapping if provided in an Update. + let move_path = match change { + FileChange::Update { + move_path: Some(dest), + .. + } => Some(dest), + _ => None, + }; + if let Some(dest) = move_path { ``` > This could be folded into the above `match` statement? - Created: 2025-08-01 17:17:12 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248482907 ```diff @@ -0,0 +1,476 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use tempfile::TempDir; +use uuid::Uuid; + +use crate::protocol::FileChange; + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Create a temp directory to store baseline snapshots of files when they are first seen. +/// 2. When a path is first observed, copy its current contents into the baseline dir if it exists on disk. +/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null). +/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs. +/// 4. 
To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using +/// `git diff --no-index` and rewrite paths to external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Temp directory holding baseline snapshots of files as first seen. + baseline_files_dir: Option, + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> external path as of baseline snapshot. + temp_name_to_baseline_external: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_external: HashMap, + /// Aggregated unified diff for all accumulated changes across files. + pub unified_diff: Option, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates a baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + self.ensure_baseline_dir()?; + let baseline_dir = self.baseline_dir()?.to_path_buf(); + + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_baseline_external + .insert(internal.clone(), path.clone()); + self.temp_name_to_current_external + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. 
+ if path.exists() { + let contents = fs::read(path) + .with_context(|| format!("failed to read original {}", path.display()))?; + let internal_path = baseline_dir.join(&internal); + fs::write(&internal_path, contents).with_context(|| { + format!("failed to write baseline file {}", internal_path.display()) + })?; + } + } + + // Track rename/move in current mapping if provided in an Update. + let move_path = match change { + FileChange::Update { + move_path: Some(dest), + .. + } => Some(dest), + _ => None, + }; + if let Some(dest) = move_path { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.external_to_temp_name.insert(path.clone(), i.clone()); + self.temp_name_to_baseline_external + .insert(i.clone(), path.clone()); + i + } + }; + // Update current external mapping for temp file name. + self.temp_name_to_current_external + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + } + } + + Ok(()) + } + + /// Recompute the aggregated unified diff by comparing all baseline snapshots against + /// current files on disk using `git diff --no-index` and rewriting paths to external paths. + pub fn update_and_get_unified_diff(&mut self) -> Result> { + let baseline_dir = self.baseline_dir()?.to_path_buf(); + let current_dir = baseline_dir.join("current"); + if current_dir.exists() { + // Best-effort cleanup of previous run's mirror. + let _ = fs::remove_dir_all(¤t_dir); + } + fs::create_dir_all(¤t_dir).with_context(|| { + format!( + "failed to create current mirror dir {}", + current_dir.display() + ) + })?; + + let mut aggregated = String::new(); + + // Compute diffs per tracked internal file. 
+ for (internal, baseline_external) in &self.temp_name_to_baseline_external { + let baseline_path = baseline_dir.join(internal); + let current_external = self + .temp_name_to_current_external + .get(internal) + .cloned() + .unwrap_or_else(|| baseline_external.clone()); + + let left_is_dev_null = !baseline_path.exists(); + let right_exists = current_external.exists(); + + // Prepare right side mirror file if exists; otherwise use /dev/null for deletions. + let right_arg = if right_exists { + let mirror_path = current_dir.join(internal); + let contents = fs::read(¤t_external).with_context(|| { + format!( + "failed to read current file for diff {}", + current_external.display() + ) + })?; + fs::write(&mirror_path, contents).with_context(|| { + format!( + "failed to write current mirror file {}", + mirror_path.display() + ) + })?; + // Use relative path from baseline_dir (so headers say a/ b/current/). + format!("current/{internal}") + } else { + // Deletion: right side is /dev/null to show proper deleted file diff. + "/dev/null".to_string() + }; + + // Prepare left arg: baseline file path or /dev/null for additions. + let left_arg = if left_is_dev_null { + "/dev/null".to_string() + } else { + internal.clone() + }; + + // Run git diff --no-index from baseline_dir to keep paths predictable. + let raw = run_git_allow_exit_codes( + &baseline_dir, + &[ + "-c", + "color.ui=false", + "diff", + "--no-color", + "--no-index", + "--", + &left_arg, + &right_arg, + ], + &[0, 1], // 0: no changes, 1: differences + )?; + + if raw.trim().is_empty() { + continue; + } + let rewritten = self.rewrite_diff_paths(&raw); + if !rewritten.trim().is_empty() { + if !aggregated.is_empty() && !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + aggregated.push_str(&rewritten); + } + } + + self.unified_diff = if aggregated.trim().is_empty() { + None + } else { + Some(aggregated) + }; + + // Clean up the current dir. 
+ let _ = fs::remove_dir_all(&current_dir); + + Ok(self.unified_diff.clone()) + } + + fn baseline_dir(&self) -> Result<&Path> { + self.baseline_files_dir + .as_ref() + .map(|d| d.path()) + .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized")) + } + + fn ensure_baseline_dir(&mut self) -> Result<()> { + if self.baseline_files_dir.is_some() { + return Ok(()); + } + let tmp = TempDir::new().context("create baseline temp dir")?; + self.baseline_files_dir = Some(tmp); + Ok(()) + } + + /// Rewrites the internal filenames to external paths in diff headers. + /// Handles inputs like: + /// diff --git a/<uuid> b/current/<uuid> + /// --- a/<uuid> | /dev/null + /// +++ b/current/<uuid> | /dev/null + /// and replaces <uuid> with the external paths tracking baseline/current. + fn rewrite_diff_paths(&self, diff: &str) -> String { + let mut out = String::new(); + for line in diff.lines() { + if let Some(rest) = line.strip_prefix("diff --git ") { + // Format: diff --git a/ b/ + let parts: Vec<&str> = rest.split_whitespace().collect(); + if parts.len() == 2 { + let a = parts[0].strip_prefix("a/").unwrap_or(parts[0]); + let b = parts[1].strip_prefix("b/").unwrap_or(parts[1]); + + let a_ext_display = if a == "/dev/null" { + "/dev/null".to_string() + } else { + let a_base = Path::new(a) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(a); + let mapped = self + .temp_name_to_baseline_external + .get(a_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(a)); + mapped.display().to_string() + }; + + let b_ext_display = if b == "/dev/null" { + "/dev/null".to_string() + } else { + let b_base = Path::new(b) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(b); + let mapped = self + .temp_name_to_current_external + .get(b_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(b)); + mapped.display().to_string() + }; + + out.push_str(&format!("diff --git a/{a_ext_display} b/{b_ext_display}\n")); + continue; + } + } + if let Some(rest) = line.strip_prefix("--- ") { + if let Some(path) = 
rest.strip_prefix("a/") { + let external_display = if path == "/dev/null" { + "/dev/null".to_string() + } else { + let p_base = Path::new(path) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(path); + self.temp_name_to_baseline_external + .get(p_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(path)) + .display() + .to_string() + }; + out.push_str(&format!("--- {external_display}\n")); + continue; + } + } + if let Some(rest) = line.strip_prefix("+++ ") { + if let Some(path) = rest.strip_prefix("b/") { + let external_display = if path == "/dev/null" { + "/dev/null".to_string() + } else { + let p_base = Path::new(path) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(path); + self.temp_name_to_current_external + .get(p_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(path)) + .display() + .to_string() + }; + out.push_str(&format!("+++ {external_display}\n")); + continue; + } + } + out.push_str(line); + out.push('\n'); + } + out + } +} + +fn uuid_filename_for(path: &Path) -> String { + let id = Uuid::new_v4().to_string(); + match path.extension().and_then(|e| e.to_str()) { + Some(ext) if !ext.is_empty() => format!("{id}.{ext}"), + _ => id, + } +} + +fn run_git_allow_exit_codes( ``` > From https://github.com/openai/codex/pull/1747 I would include: > > ```rust > let envs = vec![ > ("GIT_CONFIG_GLOBAL", "/dev/null"), > ("GIT_CONFIG_NOSYSTEM", "1"), > ]; > ``` - Created: 2025-08-01 17:20:55 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2248489326 ```diff @@ -0,0 +1,476 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use tempfile::TempDir; +use uuid::Uuid; + +use crate::protocol::FileChange; + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Create a temp directory to store baseline snapshots of files when they are first seen. +/// 2. 
When a path is first observed, copy its current contents into the baseline dir if it exists on disk. +/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null). +/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs. +/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using +/// `git diff --no-index` and rewrite paths to external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Temp directory holding baseline snapshots of files as first seen. + baseline_files_dir: Option, + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> external path as of baseline snapshot. + temp_name_to_baseline_external: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_external: HashMap, + /// Aggregated unified diff for all accumulated changes across files. + pub unified_diff: Option, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates a baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + self.ensure_baseline_dir()?; + let baseline_dir = self.baseline_dir()?.to_path_buf(); + + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. 
+ if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_baseline_external + .insert(internal.clone(), path.clone()); + self.temp_name_to_current_external + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. + if path.exists() { + let contents = fs::read(path) + .with_context(|| format!("failed to read original {}", path.display()))?; + let internal_path = baseline_dir.join(&internal); + fs::write(&internal_path, contents).with_context(|| { + format!("failed to write baseline file {}", internal_path.display()) + })?; + } + } + + // Track rename/move in current mapping if provided in an Update. + let move_path = match change { + FileChange::Update { + move_path: Some(dest), + .. + } => Some(dest), + _ => None, + }; + if let Some(dest) = move_path { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.external_to_temp_name.insert(path.clone(), i.clone()); + self.temp_name_to_baseline_external + .insert(i.clone(), path.clone()); + i + } + }; + // Update current external mapping for temp file name. + self.temp_name_to_current_external + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + } + } + + Ok(()) + } + + /// Recompute the aggregated unified diff by comparing all baseline snapshots against ``` > Can you expand this comment? I don't have a great mental model of the structure you're trying to set up for the ultimate `git diff` call. 
> > I want to understand why this isn't something simpler like `diff -u backed-up-file current-file`. - Created: 2025-08-04 02:45:51 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250292032 ```diff @@ -0,0 +1,766 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use anyhow::anyhow; +use uuid::Uuid; + +use crate::protocol::FileChange; + +struct BaselineFileInfo { + path: Option, + contents_bytes: Option>, + mode: Option, + oid: Option, +} + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Maintain an in-memory baseline snapshot of files when they are first seen. +/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null). +/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking. +/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory +/// using the `similar` crate and emit unified diffs with rewritten external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> baseline file info. + baseline_file_info: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_path: HashMap, + /// Cache of known git worktree roots to avoid repeated filesystem walks. + git_root_cache: Vec, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen. 
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_current_path + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. + let (contents_bytes, mode, oid) = if path.exists() { + let mode = file_mode_for_path(path); + let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE); + let contents_bytes = blob_bytes(path, mode_str) + .unwrap_or_default() + .unwrap_or_default(); + let oid = if mode.as_deref() == Some(SYMLINK_MODE) { + git_blob_sha1_hex_bytes(&contents_bytes) + } else { + self.git_blob_oid_for_path(path) + .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes)) + }; + (Some(contents_bytes), mode, Some(oid)) + } else { + (None, None, Some(ZERO_OID.to_string())) + }; + + self.baseline_file_info.insert( + internal.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes, + mode, + oid, + }, + ); + } + + // Track rename/move in current mapping if provided in an Update. + if let FileChange::Update { + move_path: Some(dest), + .. + } = change + { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. 
+ let i = uuid_filename_for(path); + self.baseline_file_info.insert( + i.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes: None, + mode: None, + oid: Some(ZERO_OID.to_string()), + }, + ); + i + } + }; + // Update current external mapping for temp file name. + self.temp_name_to_current_path + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + }; + } + + Ok(()) ``` > I don't see a `?` or a place where the `Err` variant is constructed, so does this need to return `Result`? - Created: 2025-08-04 02:57:10 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250301151 ```diff @@ -0,0 +1,766 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use anyhow::anyhow; +use uuid::Uuid; + +use crate::protocol::FileChange; + +struct BaselineFileInfo { + path: Option, + contents_bytes: Option>, ``` > I think the field name `file_contents` implies bytes given the type. - Created: 2025-08-04 02:57:49 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250301670 ```diff @@ -0,0 +1,766 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use anyhow::anyhow; +use uuid::Uuid; + +use crate::protocol::FileChange; + +struct BaselineFileInfo { + path: Option, + contents_bytes: Option>, + mode: Option, ``` > I think an `enum` instead of a `String` would be clearer here. 
- Created: 2025-08-04 02:58:20 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250302031 ```diff @@ -0,0 +1,766 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use anyhow::anyhow; +use uuid::Uuid; + +use crate::protocol::FileChange; + +struct BaselineFileInfo { + path: Option, ``` > Add a comment since it is surprising that `path` could be `None`? - Created: 2025-08-04 02:59:46 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250303137 ```diff @@ -0,0 +1,766 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use anyhow::anyhow; +use uuid::Uuid; + +use crate::protocol::FileChange; + +struct BaselineFileInfo { + path: Option, + contents_bytes: Option>, + mode: Option, + oid: Option, +} + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Maintain an in-memory baseline snapshot of files when they are first seen. +/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null). +/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking. +/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory +/// using the `similar` crate and emit unified diffs with rewritten external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> baseline file info. + baseline_file_info: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. 
+ temp_name_to_current_path: HashMap, + /// Cache of known git worktree roots to avoid repeated filesystem walks. + git_root_cache: Vec, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_current_path + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. + let (contents_bytes, mode, oid) = if path.exists() { + let mode = file_mode_for_path(path); + let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE); ``` > I `mode` becomes an `enum`, using `match` would be cleaner here. - Created: 2025-08-04 03:00:33 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250303694 ```diff @@ -0,0 +1,766 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use anyhow::anyhow; +use uuid::Uuid; + +use crate::protocol::FileChange; + +struct BaselineFileInfo { + path: Option, ``` > I don't see a case where it is `None` in the code, but maybe I'm missing something? 
- Created: 2025-08-04 03:03:14 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250305839 ```diff @@ -0,0 +1,766 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use anyhow::anyhow; +use uuid::Uuid; + +use crate::protocol::FileChange; + +struct BaselineFileInfo { + path: Option, + contents_bytes: Option>, + mode: Option, + oid: Option, +} + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Maintain an in-memory baseline snapshot of files when they are first seen. +/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null). +/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking. +/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory +/// using the `similar` crate and emit unified diffs with rewritten external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> baseline file info. + baseline_file_info: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_path: HashMap, + /// Cache of known git worktree roots to avoid repeated filesystem walks. + git_root_cache: Vec, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. 
+ /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_current_path + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. + let (contents_bytes, mode, oid) = if path.exists() { + let mode = file_mode_for_path(path); + let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE); + let contents_bytes = blob_bytes(path, mode_str) + .unwrap_or_default() + .unwrap_or_default(); + let oid = if mode.as_deref() == Some(SYMLINK_MODE) { + git_blob_sha1_hex_bytes(&contents_bytes) + } else { + self.git_blob_oid_for_path(path) + .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes)) + }; + (Some(contents_bytes), mode, Some(oid)) + } else { + (None, None, Some(ZERO_OID.to_string())) + }; + + self.baseline_file_info.insert( + internal.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes, + mode, + oid, + }, + ); + } + + // Track rename/move in current mapping if provided in an Update. + if let FileChange::Update { + move_path: Some(dest), + .. + } = change + { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.baseline_file_info.insert( + i.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes: None, + mode: None, + oid: Some(ZERO_OID.to_string()), + }, + ); + i + } + }; + // Update current external mapping for temp file name. 
+ self.temp_name_to_current_path + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + }; + } + + Ok(()) + } + + fn get_path_for_internal(&self, internal: &str) -> Option { + self.temp_name_to_current_path + .get(internal) + .cloned() + .or_else(|| { + self.baseline_file_info + .get(internal) + .and_then(|info| info.path.clone()) + }) + } + + /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry. + /// Uses a simple cache of known roots and avoids negative-result caching for simplicity. + fn find_git_root_cached(&mut self, start: &Path) -> Option { + let dir = if start.is_dir() { + start + } else { + start.parent()? + }; + + // Fast path: if any cached root is an ancestor of this path, use it. + if let Some(root) = self + .git_root_cache + .iter() + .find(|r| dir.starts_with(r)) + .cloned() + { + return Some(root); + } + + // Walk up to find a `.git` marker. + let mut cur = dir.to_path_buf(); + loop { + let git_marker = cur.join(".git"); + if git_marker.is_dir() || git_marker.is_file() { + if !self.git_root_cache.iter().any(|r| r == &cur) { + self.git_root_cache.push(cur.clone()); + } + return Some(cur); + } + + // On Windows, avoid walking above the drive or UNC share root. + #[cfg(windows)] + { + if is_windows_drive_or_unc_root(&cur) { + return None; + } + } + + if let Some(parent) = cur.parent() { + cur = parent.to_path_buf(); + } else { + return None; + } + } + } + + /// Return a display string for `path` relative to its git root if found, else absolute. 
+ fn relative_to_git_root_str(&mut self, path: &Path) -> String { + let s = if let Some(root) = self.find_git_root_cached(path) { + if let Ok(rel) = path.strip_prefix(&root) { + rel.display().to_string() + } else { + path.display().to_string() + } + } else { + path.display().to_string() + }; + s.replace('\\', "/") + } + + /// Ask git to compute the blob SHA-1 for the file at `path` within its repository. + /// Returns None if no repository is found or git invocation fails. + fn git_blob_oid_for_path(&mut self, path: &Path) -> Option { + let root = self.find_git_root_cached(path)?; + // Compute a path relative to the repo root for better portability across platforms. + let rel = path.strip_prefix(&root).unwrap_or(path); + let output = Command::new("git") + .arg("-C") + .arg(&root) + .arg("hash-object") + .arg("--") + .arg(rel) + .output() + .ok()?; + if !output.status.success() { + return None; + } + let s = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if s.len() == 40 { Some(s) } else { None } ``` > If it is not 40, is this an error / unexpected situation? - Created: 2025-08-04 03:03:31 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250306088 ```diff @@ -0,0 +1,766 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use anyhow::anyhow; +use uuid::Uuid; + +use crate::protocol::FileChange; + +struct BaselineFileInfo { + path: Option, + contents_bytes: Option>, + mode: Option, + oid: Option, +} + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Maintain an in-memory baseline snapshot of files when they are first seen. +/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null). +/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking. +/// 3. 
To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory +/// using the `similar` crate and emit unified diffs with rewritten external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> baseline file info. + baseline_file_info: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_path: HashMap, + /// Cache of known git worktree roots to avoid repeated filesystem walks. + git_root_cache: Vec, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_current_path + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. 
+ let (contents_bytes, mode, oid) = if path.exists() { + let mode = file_mode_for_path(path); + let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE); + let contents_bytes = blob_bytes(path, mode_str) + .unwrap_or_default() + .unwrap_or_default(); + let oid = if mode.as_deref() == Some(SYMLINK_MODE) { + git_blob_sha1_hex_bytes(&contents_bytes) + } else { + self.git_blob_oid_for_path(path) + .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes)) + }; + (Some(contents_bytes), mode, Some(oid)) + } else { + (None, None, Some(ZERO_OID.to_string())) + }; + + self.baseline_file_info.insert( + internal.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes, + mode, + oid, + }, + ); + } + + // Track rename/move in current mapping if provided in an Update. + if let FileChange::Update { + move_path: Some(dest), + .. + } = change + { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.baseline_file_info.insert( + i.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes: None, + mode: None, + oid: Some(ZERO_OID.to_string()), + }, + ); + i + } + }; + // Update current external mapping for temp file name. + self.temp_name_to_current_path + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + }; + } + + Ok(()) + } + + fn get_path_for_internal(&self, internal: &str) -> Option { + self.temp_name_to_current_path + .get(internal) + .cloned() + .or_else(|| { + self.baseline_file_info + .get(internal) + .and_then(|info| info.path.clone()) + }) + } + + /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry. 
+ /// Uses a simple cache of known roots and avoids negative-result caching for simplicity. + fn find_git_root_cached(&mut self, start: &Path) -> Option { + let dir = if start.is_dir() { + start + } else { + start.parent()? + }; + + // Fast path: if any cached root is an ancestor of this path, use it. + if let Some(root) = self + .git_root_cache + .iter() + .find(|r| dir.starts_with(r)) + .cloned() + { + return Some(root); + } + + // Walk up to find a `.git` marker. + let mut cur = dir.to_path_buf(); + loop { + let git_marker = cur.join(".git"); + if git_marker.is_dir() || git_marker.is_file() { + if !self.git_root_cache.iter().any(|r| r == &cur) { + self.git_root_cache.push(cur.clone()); + } + return Some(cur); + } + + // On Windows, avoid walking above the drive or UNC share root. + #[cfg(windows)] + { + if is_windows_drive_or_unc_root(&cur) { + return None; + } + } + + if let Some(parent) = cur.parent() { + cur = parent.to_path_buf(); + } else { + return None; + } + } + } + + /// Return a display string for `path` relative to its git root if found, else absolute. + fn relative_to_git_root_str(&mut self, path: &Path) -> String { + let s = if let Some(root) = self.find_git_root_cached(path) { + if let Ok(rel) = path.strip_prefix(&root) { + rel.display().to_string() + } else { + path.display().to_string() + } + } else { + path.display().to_string() + }; + s.replace('\\', "/") + } + + /// Ask git to compute the blob SHA-1 for the file at `path` within its repository. + /// Returns None if no repository is found or git invocation fails. + fn git_blob_oid_for_path(&mut self, path: &Path) -> Option { + let root = self.find_git_root_cached(path)?; + // Compute a path relative to the repo root for better portability across platforms. 
+ let rel = path.strip_prefix(&root).unwrap_or(path); + let output = Command::new("git") + .arg("-C") + .arg(&root) + .arg("hash-object") + .arg("--") + .arg(rel) + .output() + .ok()?; + if !output.status.success() { ``` > Is this something expected or exceptional? - Created: 2025-08-04 03:03:57 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250306484 ```diff @@ -0,0 +1,766 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use anyhow::anyhow; +use uuid::Uuid; + +use crate::protocol::FileChange; + +struct BaselineFileInfo { + path: Option, + contents_bytes: Option>, + mode: Option, + oid: Option, +} + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Maintain an in-memory baseline snapshot of files when they are first seen. +/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null). +/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking. +/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory +/// using the `similar` crate and emit unified diffs with rewritten external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> baseline file info. + baseline_file_info: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_path: HashMap, + /// Cache of known git worktree roots to avoid repeated filesystem walks. 
+ git_root_cache: Vec, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_current_path + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. + let (contents_bytes, mode, oid) = if path.exists() { + let mode = file_mode_for_path(path); + let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE); + let contents_bytes = blob_bytes(path, mode_str) + .unwrap_or_default() + .unwrap_or_default(); + let oid = if mode.as_deref() == Some(SYMLINK_MODE) { + git_blob_sha1_hex_bytes(&contents_bytes) + } else { + self.git_blob_oid_for_path(path) + .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes)) + }; + (Some(contents_bytes), mode, Some(oid)) + } else { + (None, None, Some(ZERO_OID.to_string())) + }; + + self.baseline_file_info.insert( + internal.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes, + mode, + oid, + }, + ); + } + + // Track rename/move in current mapping if provided in an Update. + if let FileChange::Update { + move_path: Some(dest), + .. 
+ } = change + { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.baseline_file_info.insert( + i.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes: None, + mode: None, + oid: Some(ZERO_OID.to_string()), + }, + ); + i + } + }; + // Update current external mapping for temp file name. + self.temp_name_to_current_path + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + }; + } + + Ok(()) + } + + fn get_path_for_internal(&self, internal: &str) -> Option { + self.temp_name_to_current_path + .get(internal) + .cloned() + .or_else(|| { + self.baseline_file_info + .get(internal) + .and_then(|info| info.path.clone()) + }) + } + + /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry. + /// Uses a simple cache of known roots and avoids negative-result caching for simplicity. + fn find_git_root_cached(&mut self, start: &Path) -> Option { + let dir = if start.is_dir() { + start + } else { + start.parent()? + }; + + // Fast path: if any cached root is an ancestor of this path, use it. + if let Some(root) = self + .git_root_cache + .iter() + .find(|r| dir.starts_with(r)) + .cloned() + { + return Some(root); + } + + // Walk up to find a `.git` marker. + let mut cur = dir.to_path_buf(); + loop { + let git_marker = cur.join(".git"); + if git_marker.is_dir() || git_marker.is_file() { + if !self.git_root_cache.iter().any(|r| r == &cur) { + self.git_root_cache.push(cur.clone()); + } + return Some(cur); + } + + // On Windows, avoid walking above the drive or UNC share root. 
+ #[cfg(windows)] + { + if is_windows_drive_or_unc_root(&cur) { + return None; + } + } + + if let Some(parent) = cur.parent() { + cur = parent.to_path_buf(); + } else { + return None; + } + } + } + + /// Return a display string for `path` relative to its git root if found, else absolute. + fn relative_to_git_root_str(&mut self, path: &Path) -> String { + let s = if let Some(root) = self.find_git_root_cached(path) { + if let Ok(rel) = path.strip_prefix(&root) { + rel.display().to_string() + } else { + path.display().to_string() + } + } else { + path.display().to_string() + }; + s.replace('\\', "/") + } + + /// Ask git to compute the blob SHA-1 for the file at `path` within its repository. + /// Returns None if no repository is found or git invocation fails. + fn git_blob_oid_for_path(&mut self, path: &Path) -> Option { + let root = self.find_git_root_cached(path)?; + // Compute a path relative to the repo root for better portability across platforms. + let rel = path.strip_prefix(&root).unwrap_or(path); + let output = Command::new("git") + .arg("-C") + .arg(&root) + .arg("hash-object") + .arg("--") + .arg(rel) + .output() + .ok()?; + if !output.status.success() { + return None; + } + let s = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if s.len() == 40 { Some(s) } else { None } + } + + /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were + /// collected before the first time they were touched by apply_patch during this turn with + /// the current repo state. + pub fn get_unified_diff(&mut self) -> Result> { + let mut aggregated = String::new(); + + // Compute diffs per tracked internal file in a stable order by external path. + let mut baseline_file_names: Vec = + self.baseline_file_info.keys().cloned().collect(); + // Sort lexicographically by full repo-relative path to match git behavior. 
+ baseline_file_names.sort_by_key(|internal| { ``` > 👍 - Created: 2025-08-04 03:05:24 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250307567 ```diff @@ -0,0 +1,766 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use anyhow::anyhow; +use uuid::Uuid; + +use crate::protocol::FileChange; + +struct BaselineFileInfo { + path: Option, + contents_bytes: Option>, + mode: Option, + oid: Option, +} + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Maintain an in-memory baseline snapshot of files when they are first seen. +/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null). +/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking. +/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory +/// using the `similar` crate and emit unified diffs with rewritten external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> baseline file info. + baseline_file_info: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_path: HashMap, + /// Cache of known git worktree roots to avoid repeated filesystem walks. + git_root_cache: Vec, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen. 
+ /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_current_path + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. + let (contents_bytes, mode, oid) = if path.exists() { + let mode = file_mode_for_path(path); + let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE); + let contents_bytes = blob_bytes(path, mode_str) + .unwrap_or_default() + .unwrap_or_default(); + let oid = if mode.as_deref() == Some(SYMLINK_MODE) { + git_blob_sha1_hex_bytes(&contents_bytes) + } else { + self.git_blob_oid_for_path(path) + .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes)) + }; + (Some(contents_bytes), mode, Some(oid)) + } else { + (None, None, Some(ZERO_OID.to_string())) + }; + + self.baseline_file_info.insert( + internal.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes, + mode, + oid, + }, + ); + } + + // Track rename/move in current mapping if provided in an Update. + if let FileChange::Update { + move_path: Some(dest), + .. + } = change + { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. 
+ let i = uuid_filename_for(path); + self.baseline_file_info.insert( + i.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes: None, + mode: None, + oid: Some(ZERO_OID.to_string()), + }, + ); + i + } + }; + // Update current external mapping for temp file name. + self.temp_name_to_current_path + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + }; + } + + Ok(()) + } + + fn get_path_for_internal(&self, internal: &str) -> Option { + self.temp_name_to_current_path + .get(internal) + .cloned() + .or_else(|| { + self.baseline_file_info + .get(internal) + .and_then(|info| info.path.clone()) + }) + } + + /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry. + /// Uses a simple cache of known roots and avoids negative-result caching for simplicity. + fn find_git_root_cached(&mut self, start: &Path) -> Option { + let dir = if start.is_dir() { + start + } else { + start.parent()? + }; + + // Fast path: if any cached root is an ancestor of this path, use it. + if let Some(root) = self + .git_root_cache + .iter() + .find(|r| dir.starts_with(r)) + .cloned() + { + return Some(root); + } + + // Walk up to find a `.git` marker. + let mut cur = dir.to_path_buf(); + loop { + let git_marker = cur.join(".git"); + if git_marker.is_dir() || git_marker.is_file() { + if !self.git_root_cache.iter().any(|r| r == &cur) { + self.git_root_cache.push(cur.clone()); + } + return Some(cur); + } + + // On Windows, avoid walking above the drive or UNC share root. 
+ #[cfg(windows)] + { + if is_windows_drive_or_unc_root(&cur) { + return None; + } + } + + if let Some(parent) = cur.parent() { + cur = parent.to_path_buf(); + } else { + return None; + } + } + } + + /// Return a display string for `path` relative to its git root if found, else absolute. + fn relative_to_git_root_str(&mut self, path: &Path) -> String { + let s = if let Some(root) = self.find_git_root_cached(path) { + if let Ok(rel) = path.strip_prefix(&root) { + rel.display().to_string() + } else { + path.display().to_string() + } + } else { + path.display().to_string() + }; + s.replace('\\', "/") + } + + /// Ask git to compute the blob SHA-1 for the file at `path` within its repository. + /// Returns None if no repository is found or git invocation fails. + fn git_blob_oid_for_path(&mut self, path: &Path) -> Option { + let root = self.find_git_root_cached(path)?; + // Compute a path relative to the repo root for better portability across platforms. + let rel = path.strip_prefix(&root).unwrap_or(path); + let output = Command::new("git") + .arg("-C") + .arg(&root) + .arg("hash-object") + .arg("--") + .arg(rel) + .output() + .ok()?; + if !output.status.success() { + return None; + } + let s = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if s.len() == 40 { Some(s) } else { None } + } + + /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were + /// collected before the first time they were touched by apply_patch during this turn with + /// the current repo state. + pub fn get_unified_diff(&mut self) -> Result> { + let mut aggregated = String::new(); + + // Compute diffs per tracked internal file in a stable order by external path. + let mut baseline_file_names: Vec = + self.baseline_file_info.keys().cloned().collect(); + // Sort lexicographically by full repo-relative path to match git behavior. 
+ baseline_file_names.sort_by_key(|internal| { + self.get_path_for_internal(internal) + .map(|p| self.relative_to_git_root_str(&p)) + .unwrap_or_default() + }); + + for internal in baseline_file_names { + // Baseline external must exist for any tracked internal. + let baseline_external = match self + .baseline_file_info + .get(&internal) + .and_then(|i| i.path.clone()) + { + Some(p) => p, + None => continue, + }; + let current_external = match self.get_path_for_internal(&internal) { + Some(p) => p, + None => continue, + }; + + // Determine modes early; needed to read symlink content correctly. + let baseline_mode = self + .baseline_file_info + .get(&internal) + .and_then(|i| i.mode.clone()) + .unwrap_or_else(|| REGULAR_MODE.to_string()); + let current_mode = + file_mode_for_path(¤t_external).unwrap_or_else(|| REGULAR_MODE.to_string()); + + let left_bytes = self + .baseline_file_info + .get(&internal) + .and_then(|i| i.contents_bytes.clone()); + + let right_bytes = blob_bytes(¤t_external, ¤t_mode)?; + + // Fast path: identical bytes or both missing. + if left_bytes.as_deref() == right_bytes.as_deref() { + continue; + } + + let left_display = self.relative_to_git_root_str(&baseline_external); + let right_display = self.relative_to_git_root_str(¤t_external); + + // Emit a git-style header for better readability and parity with previous behavior. + aggregated.push_str(&format!("diff --git a/{left_display} b/{right_display}\n")); + + let is_add = left_bytes.is_none() && right_bytes.is_some(); ``` > Could do `match (left_bytes, right_bytes)` to ensure all cases are covered. 
- Created: 2025-08-04 03:06:34 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250308415 ```diff @@ -0,0 +1,766 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use anyhow::anyhow; +use uuid::Uuid; + +use crate::protocol::FileChange; + +struct BaselineFileInfo { + path: Option, + contents_bytes: Option>, + mode: Option, + oid: Option, +} + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Maintain an in-memory baseline snapshot of files when they are first seen. +/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null). +/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking. +/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory +/// using the `similar` crate and emit unified diffs with rewritten external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> baseline file info. + baseline_file_info: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_path: HashMap, + /// Cache of known git worktree roots to avoid repeated filesystem walks. + git_root_cache: Vec, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. 
+ /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_current_path + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. + let (contents_bytes, mode, oid) = if path.exists() { + let mode = file_mode_for_path(path); + let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE); + let contents_bytes = blob_bytes(path, mode_str) + .unwrap_or_default() + .unwrap_or_default(); + let oid = if mode.as_deref() == Some(SYMLINK_MODE) { + git_blob_sha1_hex_bytes(&contents_bytes) + } else { + self.git_blob_oid_for_path(path) + .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes)) + }; + (Some(contents_bytes), mode, Some(oid)) + } else { + (None, None, Some(ZERO_OID.to_string())) + }; + + self.baseline_file_info.insert( + internal.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes, + mode, + oid, + }, + ); + } + + // Track rename/move in current mapping if provided in an Update. + if let FileChange::Update { + move_path: Some(dest), + .. + } = change + { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.baseline_file_info.insert( + i.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes: None, + mode: None, + oid: Some(ZERO_OID.to_string()), + }, + ); + i + } + }; + // Update current external mapping for temp file name. 
+ self.temp_name_to_current_path + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + }; + } + + Ok(()) + } + + fn get_path_for_internal(&self, internal: &str) -> Option { + self.temp_name_to_current_path + .get(internal) + .cloned() + .or_else(|| { + self.baseline_file_info + .get(internal) + .and_then(|info| info.path.clone()) + }) + } + + /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry. + /// Uses a simple cache of known roots and avoids negative-result caching for simplicity. + fn find_git_root_cached(&mut self, start: &Path) -> Option { + let dir = if start.is_dir() { + start + } else { + start.parent()? + }; + + // Fast path: if any cached root is an ancestor of this path, use it. + if let Some(root) = self + .git_root_cache + .iter() + .find(|r| dir.starts_with(r)) + .cloned() + { + return Some(root); + } + + // Walk up to find a `.git` marker. + let mut cur = dir.to_path_buf(); + loop { + let git_marker = cur.join(".git"); + if git_marker.is_dir() || git_marker.is_file() { + if !self.git_root_cache.iter().any(|r| r == &cur) { + self.git_root_cache.push(cur.clone()); + } + return Some(cur); + } + + // On Windows, avoid walking above the drive or UNC share root. + #[cfg(windows)] + { + if is_windows_drive_or_unc_root(&cur) { + return None; + } + } + + if let Some(parent) = cur.parent() { + cur = parent.to_path_buf(); + } else { + return None; + } + } + } + + /// Return a display string for `path` relative to its git root if found, else absolute. 
+ fn relative_to_git_root_str(&mut self, path: &Path) -> String { + let s = if let Some(root) = self.find_git_root_cached(path) { + if let Ok(rel) = path.strip_prefix(&root) { + rel.display().to_string() + } else { + path.display().to_string() + } + } else { + path.display().to_string() + }; + s.replace('\\', "/") + } + + /// Ask git to compute the blob SHA-1 for the file at `path` within its repository. + /// Returns None if no repository is found or git invocation fails. + fn git_blob_oid_for_path(&mut self, path: &Path) -> Option { + let root = self.find_git_root_cached(path)?; + // Compute a path relative to the repo root for better portability across platforms. + let rel = path.strip_prefix(&root).unwrap_or(path); + let output = Command::new("git") + .arg("-C") + .arg(&root) + .arg("hash-object") + .arg("--") + .arg(rel) + .output() + .ok()?; + if !output.status.success() { + return None; + } + let s = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if s.len() == 40 { Some(s) } else { None } + } + + /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were + /// collected before the first time they were touched by apply_patch during this turn with + /// the current repo state. + pub fn get_unified_diff(&mut self) -> Result> { ``` > This function is quite long and feels like it would benefit from being broken up. - Created: 2025-08-04 03:07:44 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250309286 ```diff @@ -0,0 +1,476 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use tempfile::TempDir; +use uuid::Uuid; + +use crate::protocol::FileChange; + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Create a temp directory to store baseline snapshots of files when they are first seen. +/// 2. 
When a path is first observed, copy its current contents into the baseline dir if it exists on disk. +/// For new additions, do not create a baseline file so that diffs are shown as proper additions (using /dev/null). +/// 3. Keep a stable internal filename (uuid + same extension) per external path for path rewrite in diffs. +/// 4. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk using +/// `git diff --no-index` and rewrite paths to external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Temp directory holding baseline snapshots of files as first seen. + baseline_files_dir: Option, + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> external path as of baseline snapshot. + temp_name_to_baseline_external: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_external: HashMap, + /// Aggregated unified diff for all accumulated changes across files. + pub unified_diff: Option, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates a baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + self.ensure_baseline_dir()?; + let baseline_dir = self.baseline_dir()?.to_path_buf(); + + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. 
+ if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_baseline_external + .insert(internal.clone(), path.clone()); + self.temp_name_to_current_external + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. + if path.exists() { + let contents = fs::read(path) + .with_context(|| format!("failed to read original {}", path.display()))?; + let internal_path = baseline_dir.join(&internal); + fs::write(&internal_path, contents).with_context(|| { + format!("failed to write baseline file {}", internal_path.display()) + })?; + } + } + + // Track rename/move in current mapping if provided in an Update. + let move_path = match change { + FileChange::Update { + move_path: Some(dest), + .. + } => Some(dest), + _ => None, + }; + if let Some(dest) = move_path { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.external_to_temp_name.insert(path.clone(), i.clone()); + self.temp_name_to_baseline_external + .insert(i.clone(), path.clone()); + i + } + }; + // Update current external mapping for temp file name. + self.temp_name_to_current_external + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + } + } + + Ok(()) + } + + /// Recompute the aggregated unified diff by comparing all baseline snapshots against + /// current files on disk using `git diff --no-index` and rewriting paths to external paths. 
+ pub fn update_and_get_unified_diff(&mut self) -> Result> { + let baseline_dir = self.baseline_dir()?.to_path_buf(); + let current_dir = baseline_dir.join("current"); + if current_dir.exists() { + // Best-effort cleanup of previous run's mirror. + let _ = fs::remove_dir_all(¤t_dir); + } + fs::create_dir_all(¤t_dir).with_context(|| { + format!( + "failed to create current mirror dir {}", + current_dir.display() + ) + })?; + + let mut aggregated = String::new(); + + // Compute diffs per tracked internal file. + for (internal, baseline_external) in &self.temp_name_to_baseline_external { + let baseline_path = baseline_dir.join(internal); + let current_external = self + .temp_name_to_current_external + .get(internal) + .cloned() + .unwrap_or_else(|| baseline_external.clone()); + + let left_is_dev_null = !baseline_path.exists(); + let right_exists = current_external.exists(); + + // Prepare right side mirror file if exists; otherwise use /dev/null for deletions. + let right_arg = if right_exists { + let mirror_path = current_dir.join(internal); + let contents = fs::read(¤t_external).with_context(|| { + format!( + "failed to read current file for diff {}", + current_external.display() + ) + })?; + fs::write(&mirror_path, contents).with_context(|| { + format!( + "failed to write current mirror file {}", + mirror_path.display() + ) + })?; + // Use relative path from baseline_dir (so headers say a/ b/current/). + format!("current/{internal}") + } else { + // Deletion: right side is /dev/null to show proper deleted file diff. + "/dev/null".to_string() + }; + + // Prepare left arg: baseline file path or /dev/null for additions. + let left_arg = if left_is_dev_null { + "/dev/null".to_string() + } else { + internal.clone() + }; + + // Run git diff --no-index from baseline_dir to keep paths predictable. 
+ let raw = run_git_allow_exit_codes( + &baseline_dir, + &[ + "-c", + "color.ui=false", + "diff", + "--no-color", + "--no-index", + "--", + &left_arg, + &right_arg, + ], + &[0, 1], // 0: no changes, 1: differences + )?; + + if raw.trim().is_empty() { + continue; + } + let rewritten = self.rewrite_diff_paths(&raw); + if !rewritten.trim().is_empty() { + if !aggregated.is_empty() && !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + aggregated.push_str(&rewritten); + } + } + + self.unified_diff = if aggregated.trim().is_empty() { + None + } else { + Some(aggregated) + }; + + // Clean up the current dir. + let _ = fs::remove_dir_all(¤t_dir); + + Ok(self.unified_diff.clone()) + } + + fn baseline_dir(&self) -> Result<&Path> { + self.baseline_files_dir + .as_ref() + .map(|d| d.path()) + .ok_or_else(|| anyhow::anyhow!("baseline temp dir not initialized")) + } + + fn ensure_baseline_dir(&mut self) -> Result<()> { + if self.baseline_files_dir.is_some() { + return Ok(()); + } + let tmp = TempDir::new().context("create baseline temp dir")?; + self.baseline_files_dir = Some(tmp); + Ok(()) + } + + /// Rewrites the internal filenames to external paths in diff headers. + /// Handles inputs like: + /// diff --git a/ b/current/ + /// --- a/ | /dev/null + /// +++ b/current/ | /dev/null + /// and replaces uuid with the external paths tracking baseline/current. 
+ fn rewrite_diff_paths(&self, diff: &str) -> String { + let mut out = String::new(); + for line in diff.lines() { + if let Some(rest) = line.strip_prefix("diff --git ") { + // Format: diff --git a/ b/ + let parts: Vec<&str> = rest.split_whitespace().collect(); + if parts.len() == 2 { + let a = parts[0].strip_prefix("a/").unwrap_or(parts[0]); + let b = parts[1].strip_prefix("b/").unwrap_or(parts[1]); + + let a_ext_display = if a == "/dev/null" { + "/dev/null".to_string() + } else { + let a_base = Path::new(a) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(a); + let mapped = self + .temp_name_to_baseline_external + .get(a_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(a)); + mapped.display().to_string() + }; + + let b_ext_display = if b == "/dev/null" { + "/dev/null".to_string() + } else { + let b_base = Path::new(b) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(b); + let mapped = self + .temp_name_to_current_external + .get(b_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(b)); + mapped.display().to_string() + }; + + out.push_str(&format!("diff --git a/{a_ext_display} b/{b_ext_display}\n")); + continue; + } + } + if let Some(rest) = line.strip_prefix("--- ") { + if let Some(path) = rest.strip_prefix("a/") { + let external_display = if path == "/dev/null" { + "/dev/null".to_string() + } else { + let p_base = Path::new(path) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(path); + self.temp_name_to_baseline_external + .get(p_base) + .cloned() + .unwrap_or_else(|| PathBuf::from(path)) + .display() + .to_string() + }; + out.push_str(&format!("--- {external_display}\n")); + continue; + } + } + if let Some(rest) = line.strip_prefix("+++ ") { + if let Some(path) = rest.strip_prefix("b/") { + let external_display = if path == "/dev/null" { + "/dev/null".to_string() + } else { + let p_base = Path::new(path) + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(path); + self.temp_name_to_current_external + .get(p_base) + .cloned() + 
.unwrap_or_else(|| PathBuf::from(path)) + .display() + .to_string() + }; + out.push_str(&format!("+++ {external_display}\n")); + continue; + } + } + out.push_str(line); + out.push('\n'); + } + out + } +} + +fn uuid_filename_for(path: &Path) -> String { + let id = Uuid::new_v4().to_string(); + match path.extension().and_then(|e| e.to_str()) { + Some(ext) if !ext.is_empty() => format!("{id}.{ext}"), + _ => id, ``` > I'm still unclear why `ext` is added. - Created: 2025-08-04 03:08:45 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250310045 ```diff @@ -0,0 +1,766 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use anyhow::anyhow; +use uuid::Uuid; + +use crate::protocol::FileChange; + +struct BaselineFileInfo { + path: Option, + contents_bytes: Option>, + mode: Option, + oid: Option, +} + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Maintain an in-memory baseline snapshot of files when they are first seen. +/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null). +/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking. +/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory +/// using the `similar` crate and emit unified diffs with rewritten external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> baseline file info. + baseline_file_info: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. 
+ temp_name_to_current_path: HashMap, + /// Cache of known git worktree roots to avoid repeated filesystem walks. + git_root_cache: Vec, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_current_path + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. + let (contents_bytes, mode, oid) = if path.exists() { + let mode = file_mode_for_path(path); + let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE); + let contents_bytes = blob_bytes(path, mode_str) + .unwrap_or_default() + .unwrap_or_default(); + let oid = if mode.as_deref() == Some(SYMLINK_MODE) { + git_blob_sha1_hex_bytes(&contents_bytes) + } else { + self.git_blob_oid_for_path(path) + .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes)) + }; + (Some(contents_bytes), mode, Some(oid)) + } else { + (None, None, Some(ZERO_OID.to_string())) + }; + + self.baseline_file_info.insert( + internal.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes, + mode, + oid, + }, + ); + } + + // Track rename/move in current mapping if provided in an Update. + if let FileChange::Update { + move_path: Some(dest), + .. 
+ } = change + { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.baseline_file_info.insert( + i.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes: None, + mode: None, + oid: Some(ZERO_OID.to_string()), + }, + ); + i + } + }; + // Update current external mapping for temp file name. + self.temp_name_to_current_path + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + }; + } + + Ok(()) + } + + fn get_path_for_internal(&self, internal: &str) -> Option { + self.temp_name_to_current_path + .get(internal) + .cloned() + .or_else(|| { + self.baseline_file_info + .get(internal) + .and_then(|info| info.path.clone()) + }) + } + + /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry. + /// Uses a simple cache of known roots and avoids negative-result caching for simplicity. + fn find_git_root_cached(&mut self, start: &Path) -> Option { + let dir = if start.is_dir() { + start + } else { + start.parent()? + }; + + // Fast path: if any cached root is an ancestor of this path, use it. + if let Some(root) = self + .git_root_cache + .iter() + .find(|r| dir.starts_with(r)) + .cloned() + { + return Some(root); + } + + // Walk up to find a `.git` marker. + let mut cur = dir.to_path_buf(); + loop { + let git_marker = cur.join(".git"); + if git_marker.is_dir() || git_marker.is_file() { + if !self.git_root_cache.iter().any(|r| r == &cur) { + self.git_root_cache.push(cur.clone()); + } + return Some(cur); + } + + // On Windows, avoid walking above the drive or UNC share root. 
+ #[cfg(windows)] + { + if is_windows_drive_or_unc_root(&cur) { + return None; + } + } + + if let Some(parent) = cur.parent() { + cur = parent.to_path_buf(); + } else { + return None; + } + } + } + + /// Return a display string for `path` relative to its git root if found, else absolute. + fn relative_to_git_root_str(&mut self, path: &Path) -> String { + let s = if let Some(root) = self.find_git_root_cached(path) { + if let Ok(rel) = path.strip_prefix(&root) { + rel.display().to_string() + } else { + path.display().to_string() + } + } else { + path.display().to_string() + }; + s.replace('\\', "/") + } + + /// Ask git to compute the blob SHA-1 for the file at `path` within its repository. + /// Returns None if no repository is found or git invocation fails. + fn git_blob_oid_for_path(&mut self, path: &Path) -> Option { + let root = self.find_git_root_cached(path)?; + // Compute a path relative to the repo root for better portability across platforms. + let rel = path.strip_prefix(&root).unwrap_or(path); + let output = Command::new("git") + .arg("-C") + .arg(&root) + .arg("hash-object") + .arg("--") + .arg(rel) + .output() + .ok()?; + if !output.status.success() { + return None; + } + let s = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if s.len() == 40 { Some(s) } else { None } + } + + /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were + /// collected before the first time they were touched by apply_patch during this turn with + /// the current repo state. + pub fn get_unified_diff(&mut self) -> Result> { + let mut aggregated = String::new(); + + // Compute diffs per tracked internal file in a stable order by external path. + let mut baseline_file_names: Vec = + self.baseline_file_info.keys().cloned().collect(); + // Sort lexicographically by full repo-relative path to match git behavior. 
+ baseline_file_names.sort_by_key(|internal| { + self.get_path_for_internal(internal) + .map(|p| self.relative_to_git_root_str(&p)) + .unwrap_or_default() + }); + + for internal in baseline_file_names { + // Baseline external must exist for any tracked internal. + let baseline_external = match self + .baseline_file_info + .get(&internal) + .and_then(|i| i.path.clone()) + { + Some(p) => p, + None => continue, + }; + let current_external = match self.get_path_for_internal(&internal) { + Some(p) => p, + None => continue, + }; + + // Determine modes early; needed to read symlink content correctly. + let baseline_mode = self + .baseline_file_info + .get(&internal) + .and_then(|i| i.mode.clone()) + .unwrap_or_else(|| REGULAR_MODE.to_string()); + let current_mode = + file_mode_for_path(¤t_external).unwrap_or_else(|| REGULAR_MODE.to_string()); + + let left_bytes = self + .baseline_file_info + .get(&internal) + .and_then(|i| i.contents_bytes.clone()); + + let right_bytes = blob_bytes(¤t_external, ¤t_mode)?; + + // Fast path: identical bytes or both missing. + if left_bytes.as_deref() == right_bytes.as_deref() { + continue; + } + + let left_display = self.relative_to_git_root_str(&baseline_external); + let right_display = self.relative_to_git_root_str(¤t_external); + + // Emit a git-style header for better readability and parity with previous behavior. + aggregated.push_str(&format!("diff --git a/{left_display} b/{right_display}\n")); + + let is_add = left_bytes.is_none() && right_bytes.is_some(); + let is_delete = left_bytes.is_some() && right_bytes.is_none(); + + if is_add { + aggregated.push_str(&format!("new file mode {current_mode}\n")); + } else if is_delete { + aggregated.push_str(&format!("deleted file mode {baseline_mode}\n")); + } else if baseline_mode != current_mode { + aggregated.push_str(&format!("old mode {baseline_mode}\n")); + aggregated.push_str(&format!("new mode {current_mode}\n")); + } + + // Determine blob object IDs for left and right contents. 
Prefer stored OIDs + // captured from the original repo state when the change was first seen. + let left_oid = self + .baseline_file_info + .get(&internal) + .and_then(|i| i.oid.clone()) + .or_else(|| { + left_bytes + .as_ref() + .map(|b| git_blob_sha1_hex_bytes(b)) + .or(Some(ZERO_OID.to_string())) + }) + .unwrap_or_else(|| ZERO_OID.to_string()); + let right_oid = if let Some(b) = right_bytes.as_ref() { + if current_mode == SYMLINK_MODE { + git_blob_sha1_hex_bytes(b) + } else { + self.git_blob_oid_for_path(¤t_external) + .unwrap_or_else(|| git_blob_sha1_hex_bytes(b)) + } + } else { + ZERO_OID.to_string() + }; + + // If either side isn't valid UTF-8, emit a binary diff header and continue. + let left_text = left_bytes + .as_deref() + .and_then(|b| std::str::from_utf8(b).ok()); + let right_text = right_bytes + .as_deref() + .and_then(|b| std::str::from_utf8(b).ok()); + + // Prefer text diffs when possible: + // - both sides are valid UTF-8 + // - OR one side is missing (add/delete) and the present side is valid UTF-8 + let can_text_diff = match (left_text, right_text, is_add, is_delete) { + (Some(_), Some(_), _, _) => true, + (_, Some(_), true, _) => true, // add: left missing, right text + (Some(_), _, _, true) => true, // delete: left text, right missing + _ => false, + }; + + if can_text_diff { + // Diff the contents as text, treating missing side as empty string. + let l = left_text.unwrap_or(""); + let r = right_text.unwrap_or(""); + + // Emit index line without mode suffix to preserve current test expectations. 
+ aggregated.push_str(&format!("index {left_oid}..{right_oid}\n")); + + let old_header = if left_bytes.is_some() { + format!("a/{left_display}") + } else { + "/dev/null".to_string() + }; + let new_header = if right_bytes.is_some() { + format!("b/{right_display}") + } else { + "/dev/null".to_string() + }; + + let diff = similar::TextDiff::from_lines(l, r); + let unified = diff + .unified_diff() + .context_radius(3) + .header(&old_header, &new_header) + .to_string(); + + aggregated.push_str(&unified); + if !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + } else { + // Binary or invalid UTF-8: emit header only. + aggregated.push_str(&format!("index {left_oid}..{right_oid}\n")); + let old_header = if left_bytes.is_some() { + format!("a/{left_display}") + } else { + "/dev/null".to_string() + }; + let new_header = if right_bytes.is_some() { + format!("b/{right_display}") + } else { + "/dev/null".to_string() + }; + aggregated.push_str(&format!("--- {old_header}\n")); + aggregated.push_str(&format!("+++ {new_header}\n")); + aggregated.push_str("Binary files differ\n"); + if !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + } + } + + if aggregated.trim().is_empty() { + Ok(None) + } else { + Ok(Some(aggregated)) + } + } +} + +fn uuid_filename_for(path: &Path) -> String { + let id = Uuid::new_v4().to_string(); + match path.extension().and_then(|e| e.to_str()) { + Some(ext) if !ext.is_empty() => format!("{id}.{ext}"), + _ => id, + } +} + +/// Compute the Git SHA-1 blob object ID for the given content (bytes). 
+fn git_blob_sha1_hex_bytes(data: &[u8]) -> String { + // Git blob hash is sha1 of: "blob \0" + let header = format!("blob {}\0", data.len()); + use sha1::Digest; + let mut hasher = sha1::Sha1::new(); + hasher.update(header.as_bytes()); + hasher.update(data); + let digest = hasher.finalize(); + let mut out = String::with_capacity(40); + for b in digest { + use std::fmt::Write; + let _ = write!(&mut out, "{b:02x}"); + } + out +} + +const ZERO_OID: &str = "0000000000000000000000000000000000000000"; +const REGULAR_MODE: &str = "100644"; +#[cfg(unix)] ``` > Though even on Windows, this has to be readable (and preserved?) in a Git tree object, no? - Created: 2025-08-04 03:11:58 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250312120 ```diff @@ -0,0 +1,766 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use anyhow::anyhow; +use uuid::Uuid; + +use crate::protocol::FileChange; + +struct BaselineFileInfo { + path: Option, + contents_bytes: Option>, + mode: Option, + oid: Option, +} + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Maintain an in-memory baseline snapshot of files when they are first seen. +/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null). +/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking. +/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory +/// using the `similar` crate and emit unified diffs with rewritten external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> baseline file info. 
+ baseline_file_info: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_path: HashMap, + /// Cache of known git worktree roots to avoid repeated filesystem walks. + git_root_cache: Vec, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_current_path + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. 
+ let (contents_bytes, mode, oid) = if path.exists() { + let mode = file_mode_for_path(path); + let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE); + let contents_bytes = blob_bytes(path, mode_str) + .unwrap_or_default() + .unwrap_or_default(); + let oid = if mode.as_deref() == Some(SYMLINK_MODE) { + git_blob_sha1_hex_bytes(&contents_bytes) + } else { + self.git_blob_oid_for_path(path) + .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes)) + }; + (Some(contents_bytes), mode, Some(oid)) + } else { + (None, None, Some(ZERO_OID.to_string())) + }; + + self.baseline_file_info.insert( + internal.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes, + mode, + oid, + }, + ); + } + + // Track rename/move in current mapping if provided in an Update. + if let FileChange::Update { + move_path: Some(dest), + .. + } = change + { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.baseline_file_info.insert( + i.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes: None, + mode: None, + oid: Some(ZERO_OID.to_string()), + }, + ); + i + } + }; + // Update current external mapping for temp file name. + self.temp_name_to_current_path + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + }; + } + + Ok(()) + } + + fn get_path_for_internal(&self, internal: &str) -> Option { + self.temp_name_to_current_path + .get(internal) + .cloned() + .or_else(|| { + self.baseline_file_info + .get(internal) + .and_then(|info| info.path.clone()) + }) + } + + /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry. 
+ /// Uses a simple cache of known roots and avoids negative-result caching for simplicity. + fn find_git_root_cached(&mut self, start: &Path) -> Option { + let dir = if start.is_dir() { + start + } else { + start.parent()? + }; + + // Fast path: if any cached root is an ancestor of this path, use it. + if let Some(root) = self + .git_root_cache + .iter() + .find(|r| dir.starts_with(r)) + .cloned() + { + return Some(root); + } + + // Walk up to find a `.git` marker. + let mut cur = dir.to_path_buf(); + loop { + let git_marker = cur.join(".git"); + if git_marker.is_dir() || git_marker.is_file() { + if !self.git_root_cache.iter().any(|r| r == &cur) { + self.git_root_cache.push(cur.clone()); + } + return Some(cur); + } + + // On Windows, avoid walking above the drive or UNC share root. + #[cfg(windows)] + { + if is_windows_drive_or_unc_root(&cur) { + return None; + } + } + + if let Some(parent) = cur.parent() { + cur = parent.to_path_buf(); + } else { + return None; + } + } + } + + /// Return a display string for `path` relative to its git root if found, else absolute. + fn relative_to_git_root_str(&mut self, path: &Path) -> String { + let s = if let Some(root) = self.find_git_root_cached(path) { + if let Ok(rel) = path.strip_prefix(&root) { + rel.display().to_string() + } else { + path.display().to_string() + } + } else { + path.display().to_string() + }; + s.replace('\\', "/") + } + + /// Ask git to compute the blob SHA-1 for the file at `path` within its repository. + /// Returns None if no repository is found or git invocation fails. + fn git_blob_oid_for_path(&mut self, path: &Path) -> Option { + let root = self.find_git_root_cached(path)?; + // Compute a path relative to the repo root for better portability across platforms. 
+ let rel = path.strip_prefix(&root).unwrap_or(path); + let output = Command::new("git") + .arg("-C") + .arg(&root) + .arg("hash-object") + .arg("--") + .arg(rel) + .output() + .ok()?; + if !output.status.success() { + return None; + } + let s = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if s.len() == 40 { Some(s) } else { None } + } + + /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were + /// collected before the first time they were touched by apply_patch during this turn with + /// the current repo state. + pub fn get_unified_diff(&mut self) -> Result> { + let mut aggregated = String::new(); + + // Compute diffs per tracked internal file in a stable order by external path. + let mut baseline_file_names: Vec = + self.baseline_file_info.keys().cloned().collect(); + // Sort lexicographically by full repo-relative path to match git behavior. + baseline_file_names.sort_by_key(|internal| { + self.get_path_for_internal(internal) + .map(|p| self.relative_to_git_root_str(&p)) + .unwrap_or_default() + }); + + for internal in baseline_file_names { + // Baseline external must exist for any tracked internal. + let baseline_external = match self + .baseline_file_info + .get(&internal) + .and_then(|i| i.path.clone()) + { + Some(p) => p, + None => continue, + }; + let current_external = match self.get_path_for_internal(&internal) { + Some(p) => p, + None => continue, + }; + + // Determine modes early; needed to read symlink content correctly. + let baseline_mode = self + .baseline_file_info + .get(&internal) + .and_then(|i| i.mode.clone()) + .unwrap_or_else(|| REGULAR_MODE.to_string()); + let current_mode = + file_mode_for_path(¤t_external).unwrap_or_else(|| REGULAR_MODE.to_string()); + + let left_bytes = self + .baseline_file_info + .get(&internal) + .and_then(|i| i.contents_bytes.clone()); + + let right_bytes = blob_bytes(¤t_external, ¤t_mode)?; + + // Fast path: identical bytes or both missing. 
+ if left_bytes.as_deref() == right_bytes.as_deref() { + continue; + } + + let left_display = self.relative_to_git_root_str(&baseline_external); + let right_display = self.relative_to_git_root_str(¤t_external); + + // Emit a git-style header for better readability and parity with previous behavior. + aggregated.push_str(&format!("diff --git a/{left_display} b/{right_display}\n")); + + let is_add = left_bytes.is_none() && right_bytes.is_some(); + let is_delete = left_bytes.is_some() && right_bytes.is_none(); + + if is_add { + aggregated.push_str(&format!("new file mode {current_mode}\n")); + } else if is_delete { + aggregated.push_str(&format!("deleted file mode {baseline_mode}\n")); + } else if baseline_mode != current_mode { + aggregated.push_str(&format!("old mode {baseline_mode}\n")); + aggregated.push_str(&format!("new mode {current_mode}\n")); + } + + // Determine blob object IDs for left and right contents. Prefer stored OIDs + // captured from the original repo state when the change was first seen. + let left_oid = self + .baseline_file_info + .get(&internal) + .and_then(|i| i.oid.clone()) + .or_else(|| { + left_bytes + .as_ref() + .map(|b| git_blob_sha1_hex_bytes(b)) + .or(Some(ZERO_OID.to_string())) + }) + .unwrap_or_else(|| ZERO_OID.to_string()); + let right_oid = if let Some(b) = right_bytes.as_ref() { + if current_mode == SYMLINK_MODE { + git_blob_sha1_hex_bytes(b) + } else { + self.git_blob_oid_for_path(¤t_external) + .unwrap_or_else(|| git_blob_sha1_hex_bytes(b)) + } + } else { + ZERO_OID.to_string() + }; + + // If either side isn't valid UTF-8, emit a binary diff header and continue. 
+ let left_text = left_bytes + .as_deref() + .and_then(|b| std::str::from_utf8(b).ok()); + let right_text = right_bytes + .as_deref() + .and_then(|b| std::str::from_utf8(b).ok()); + + // Prefer text diffs when possible: + // - both sides are valid UTF-8 + // - OR one side is missing (add/delete) and the present side is valid UTF-8 + let can_text_diff = match (left_text, right_text, is_add, is_delete) { + (Some(_), Some(_), _, _) => true, + (_, Some(_), true, _) => true, // add: left missing, right text + (Some(_), _, _, true) => true, // delete: left text, right missing + _ => false, + }; + + if can_text_diff { + // Diff the contents as text, treating missing side as empty string. + let l = left_text.unwrap_or(""); + let r = right_text.unwrap_or(""); + + // Emit index line without mode suffix to preserve current test expectations. + aggregated.push_str(&format!("index {left_oid}..{right_oid}\n")); + + let old_header = if left_bytes.is_some() { + format!("a/{left_display}") + } else { + "/dev/null".to_string() + }; + let new_header = if right_bytes.is_some() { + format!("b/{right_display}") + } else { + "/dev/null".to_string() + }; + + let diff = similar::TextDiff::from_lines(l, r); + let unified = diff + .unified_diff() + .context_radius(3) + .header(&old_header, &new_header) + .to_string(); + + aggregated.push_str(&unified); + if !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + } else { + // Binary or invalid UTF-8: emit header only. 
+ aggregated.push_str(&format!("index {left_oid}..{right_oid}\n")); + let old_header = if left_bytes.is_some() { + format!("a/{left_display}") + } else { + "/dev/null".to_string() + }; + let new_header = if right_bytes.is_some() { + format!("b/{right_display}") + } else { + "/dev/null".to_string() + }; + aggregated.push_str(&format!("--- {old_header}\n")); + aggregated.push_str(&format!("+++ {new_header}\n")); + aggregated.push_str("Binary files differ\n"); + if !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + } + } + + if aggregated.trim().is_empty() { + Ok(None) + } else { + Ok(Some(aggregated)) + } + } +} + +fn uuid_filename_for(path: &Path) -> String { + let id = Uuid::new_v4().to_string(); + match path.extension().and_then(|e| e.to_str()) { + Some(ext) if !ext.is_empty() => format!("{id}.{ext}"), + _ => id, + } +} + +/// Compute the Git SHA-1 blob object ID for the given content (bytes). +fn git_blob_sha1_hex_bytes(data: &[u8]) -> String { + // Git blob hash is sha1 of: "blob \0" + let header = format!("blob {}\0", data.len()); + use sha1::Digest; + let mut hasher = sha1::Sha1::new(); + hasher.update(header.as_bytes()); + hasher.update(data); + let digest = hasher.finalize(); + let mut out = String::with_capacity(40); + for b in digest { + use std::fmt::Write; + let _ = write!(&mut out, "{b:02x}"); + } + out +} + +const ZERO_OID: &str = "0000000000000000000000000000000000000000"; +const REGULAR_MODE: &str = "100644"; +#[cfg(unix)] +const EXECUTABLE_MODE: &str = "100755"; +const SYMLINK_MODE: &str = "120000"; + +#[cfg(unix)] +fn file_mode_for_path(path: &Path) -> Option { + use std::os::unix::fs::PermissionsExt; + let meta = fs::symlink_metadata(path).ok()?; + let ft = meta.file_type(); + if ft.is_symlink() { + return Some(SYMLINK_MODE.to_string()); + } + let mode = meta.permissions().mode(); + let is_exec = (mode & 0o111) != 0; + Some(if is_exec { + EXECUTABLE_MODE.into() + } else { + REGULAR_MODE.into() + }) +} + +#[cfg(not(unix))] +fn 
file_mode_for_path(_path: &Path) -> Option { + // Default to non-executable on non-unix. + Some(REGULAR_MODE.to_string()) +} + +fn blob_bytes(path: &Path, mode: &str) -> Result>> { + if path.exists() { + let contents = if mode == SYMLINK_MODE { + symlink_blob_bytes(path) + .ok_or_else(|| anyhow!("failed to read symlink target for {}", path.display()))? + } else { + fs::read(path).with_context(|| { + format!("failed to read current file for diff {}", path.display()) + })? + }; + Ok(Some(contents)) + } else { + Ok(None) + } +} + +#[cfg(unix)] +fn symlink_blob_bytes(path: &Path) -> Option> { + use std::os::unix::ffi::OsStrExt; + let target = std::fs::read_link(path).ok()?; + Some(target.as_os_str().as_bytes().to_vec()) ``` > One interesting operating system fact is that the contents of a symlink do not have to be a path to a file: you can just use it for arbitrary data storage. (As such, I think the max number of bytes you can store in a symlink is `PATH_MAX`, though.) > > I knew of one project that did this to save a system call because `readlink()` is one system call but `open()` plus `read()` for a regular file is two? - Created: 2025-08-04 03:12:56 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250312732 ```diff @@ -0,0 +1,766 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use anyhow::anyhow; +use uuid::Uuid; + +use crate::protocol::FileChange; + +struct BaselineFileInfo { + path: Option, + contents_bytes: Option>, + mode: Option, + oid: Option, +} + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Maintain an in-memory baseline snapshot of files when they are first seen. +/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null). +/// 2. 
Keep a stable internal filename (uuid + same extension) per external path for rename tracking. +/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory +/// using the `similar` crate and emit unified diffs with rewritten external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> baseline file info. + baseline_file_info: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_path: HashMap, + /// Cache of known git worktree roots to avoid repeated filesystem walks. + git_root_cache: Vec, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_current_path + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. 
+ let (contents_bytes, mode, oid) = if path.exists() { + let mode = file_mode_for_path(path); + let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE); + let contents_bytes = blob_bytes(path, mode_str) + .unwrap_or_default() + .unwrap_or_default(); + let oid = if mode.as_deref() == Some(SYMLINK_MODE) { + git_blob_sha1_hex_bytes(&contents_bytes) + } else { + self.git_blob_oid_for_path(path) + .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes)) + }; + (Some(contents_bytes), mode, Some(oid)) + } else { + (None, None, Some(ZERO_OID.to_string())) + }; + + self.baseline_file_info.insert( + internal.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes, + mode, + oid, + }, + ); + } + + // Track rename/move in current mapping if provided in an Update. + if let FileChange::Update { + move_path: Some(dest), + .. + } = change + { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.baseline_file_info.insert( + i.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes: None, + mode: None, + oid: Some(ZERO_OID.to_string()), + }, + ); + i + } + }; + // Update current external mapping for temp file name. + self.temp_name_to_current_path + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + }; + } + + Ok(()) + } + + fn get_path_for_internal(&self, internal: &str) -> Option { + self.temp_name_to_current_path + .get(internal) + .cloned() + .or_else(|| { + self.baseline_file_info + .get(internal) + .and_then(|info| info.path.clone()) + }) + } + + /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry. 
+ /// Uses a simple cache of known roots and avoids negative-result caching for simplicity. + fn find_git_root_cached(&mut self, start: &Path) -> Option { + let dir = if start.is_dir() { + start + } else { + start.parent()? + }; + + // Fast path: if any cached root is an ancestor of this path, use it. + if let Some(root) = self + .git_root_cache + .iter() + .find(|r| dir.starts_with(r)) + .cloned() + { + return Some(root); + } + + // Walk up to find a `.git` marker. + let mut cur = dir.to_path_buf(); + loop { + let git_marker = cur.join(".git"); + if git_marker.is_dir() || git_marker.is_file() { + if !self.git_root_cache.iter().any(|r| r == &cur) { + self.git_root_cache.push(cur.clone()); + } + return Some(cur); + } + + // On Windows, avoid walking above the drive or UNC share root. + #[cfg(windows)] + { + if is_windows_drive_or_unc_root(&cur) { + return None; + } + } + + if let Some(parent) = cur.parent() { + cur = parent.to_path_buf(); + } else { + return None; + } + } + } + + /// Return a display string for `path` relative to its git root if found, else absolute. + fn relative_to_git_root_str(&mut self, path: &Path) -> String { + let s = if let Some(root) = self.find_git_root_cached(path) { + if let Ok(rel) = path.strip_prefix(&root) { + rel.display().to_string() + } else { + path.display().to_string() + } + } else { + path.display().to_string() + }; + s.replace('\\', "/") + } + + /// Ask git to compute the blob SHA-1 for the file at `path` within its repository. + /// Returns None if no repository is found or git invocation fails. + fn git_blob_oid_for_path(&mut self, path: &Path) -> Option { + let root = self.find_git_root_cached(path)?; + // Compute a path relative to the repo root for better portability across platforms. 
+ let rel = path.strip_prefix(&root).unwrap_or(path); + let output = Command::new("git") + .arg("-C") + .arg(&root) + .arg("hash-object") + .arg("--") + .arg(rel) + .output() + .ok()?; + if !output.status.success() { + return None; + } + let s = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if s.len() == 40 { Some(s) } else { None } + } + + /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were + /// collected before the first time they were touched by apply_patch during this turn with + /// the current repo state. + pub fn get_unified_diff(&mut self) -> Result> { + let mut aggregated = String::new(); + + // Compute diffs per tracked internal file in a stable order by external path. + let mut baseline_file_names: Vec = + self.baseline_file_info.keys().cloned().collect(); + // Sort lexicographically by full repo-relative path to match git behavior. + baseline_file_names.sort_by_key(|internal| { + self.get_path_for_internal(internal) + .map(|p| self.relative_to_git_root_str(&p)) + .unwrap_or_default() + }); + + for internal in baseline_file_names { + // Baseline external must exist for any tracked internal. + let baseline_external = match self + .baseline_file_info + .get(&internal) + .and_then(|i| i.path.clone()) + { + Some(p) => p, + None => continue, + }; + let current_external = match self.get_path_for_internal(&internal) { + Some(p) => p, + None => continue, + }; + + // Determine modes early; needed to read symlink content correctly. + let baseline_mode = self + .baseline_file_info + .get(&internal) + .and_then(|i| i.mode.clone()) + .unwrap_or_else(|| REGULAR_MODE.to_string()); + let current_mode = + file_mode_for_path(¤t_external).unwrap_or_else(|| REGULAR_MODE.to_string()); + + let left_bytes = self + .baseline_file_info + .get(&internal) + .and_then(|i| i.contents_bytes.clone()); + + let right_bytes = blob_bytes(¤t_external, ¤t_mode)?; + + // Fast path: identical bytes or both missing. 
+ if left_bytes.as_deref() == right_bytes.as_deref() { + continue; + } + + let left_display = self.relative_to_git_root_str(&baseline_external); + let right_display = self.relative_to_git_root_str(¤t_external); + + // Emit a git-style header for better readability and parity with previous behavior. + aggregated.push_str(&format!("diff --git a/{left_display} b/{right_display}\n")); + + let is_add = left_bytes.is_none() && right_bytes.is_some(); + let is_delete = left_bytes.is_some() && right_bytes.is_none(); + + if is_add { + aggregated.push_str(&format!("new file mode {current_mode}\n")); + } else if is_delete { + aggregated.push_str(&format!("deleted file mode {baseline_mode}\n")); + } else if baseline_mode != current_mode { + aggregated.push_str(&format!("old mode {baseline_mode}\n")); + aggregated.push_str(&format!("new mode {current_mode}\n")); + } + + // Determine blob object IDs for left and right contents. Prefer stored OIDs + // captured from the original repo state when the change was first seen. + let left_oid = self + .baseline_file_info + .get(&internal) + .and_then(|i| i.oid.clone()) + .or_else(|| { + left_bytes + .as_ref() + .map(|b| git_blob_sha1_hex_bytes(b)) + .or(Some(ZERO_OID.to_string())) + }) + .unwrap_or_else(|| ZERO_OID.to_string()); + let right_oid = if let Some(b) = right_bytes.as_ref() { + if current_mode == SYMLINK_MODE { + git_blob_sha1_hex_bytes(b) + } else { + self.git_blob_oid_for_path(¤t_external) + .unwrap_or_else(|| git_blob_sha1_hex_bytes(b)) + } + } else { + ZERO_OID.to_string() + }; + + // If either side isn't valid UTF-8, emit a binary diff header and continue. 
+ let left_text = left_bytes + .as_deref() + .and_then(|b| std::str::from_utf8(b).ok()); + let right_text = right_bytes + .as_deref() + .and_then(|b| std::str::from_utf8(b).ok()); + + // Prefer text diffs when possible: + // - both sides are valid UTF-8 + // - OR one side is missing (add/delete) and the present side is valid UTF-8 + let can_text_diff = match (left_text, right_text, is_add, is_delete) { + (Some(_), Some(_), _, _) => true, + (_, Some(_), true, _) => true, // add: left missing, right text + (Some(_), _, _, true) => true, // delete: left text, right missing + _ => false, + }; + + if can_text_diff { + // Diff the contents as text, treating missing side as empty string. + let l = left_text.unwrap_or(""); + let r = right_text.unwrap_or(""); + + // Emit index line without mode suffix to preserve current test expectations. + aggregated.push_str(&format!("index {left_oid}..{right_oid}\n")); + + let old_header = if left_bytes.is_some() { + format!("a/{left_display}") + } else { + "/dev/null".to_string() + }; + let new_header = if right_bytes.is_some() { + format!("b/{right_display}") + } else { + "/dev/null".to_string() + }; + + let diff = similar::TextDiff::from_lines(l, r); + let unified = diff + .unified_diff() + .context_radius(3) + .header(&old_header, &new_header) + .to_string(); + + aggregated.push_str(&unified); + if !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + } else { + // Binary or invalid UTF-8: emit header only. 
+ aggregated.push_str(&format!("index {left_oid}..{right_oid}\n")); + let old_header = if left_bytes.is_some() { + format!("a/{left_display}") + } else { + "/dev/null".to_string() + }; + let new_header = if right_bytes.is_some() { + format!("b/{right_display}") + } else { + "/dev/null".to_string() + }; + aggregated.push_str(&format!("--- {old_header}\n")); + aggregated.push_str(&format!("+++ {new_header}\n")); + aggregated.push_str("Binary files differ\n"); + if !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + } + } + + if aggregated.trim().is_empty() { + Ok(None) + } else { + Ok(Some(aggregated)) + } + } +} + +fn uuid_filename_for(path: &Path) -> String { + let id = Uuid::new_v4().to_string(); + match path.extension().and_then(|e| e.to_str()) { + Some(ext) if !ext.is_empty() => format!("{id}.{ext}"), + _ => id, + } +} + +/// Compute the Git SHA-1 blob object ID for the given content (bytes). +fn git_blob_sha1_hex_bytes(data: &[u8]) -> String { + // Git blob hash is sha1 of: "blob \0" + let header = format!("blob {}\0", data.len()); + use sha1::Digest; ``` > Returning this type is slightly stronger since you don't have to verify the integrity of the `String` contents elsewhere. - Created: 2025-08-04 03:13:59 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250313339 ```diff @@ -0,0 +1,766 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use anyhow::anyhow; +use uuid::Uuid; + +use crate::protocol::FileChange; + +struct BaselineFileInfo { + path: Option, + contents_bytes: Option>, + mode: Option, + oid: Option, +} + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Maintain an in-memory baseline snapshot of files when they are first seen. 
+/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null). +/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking. +/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory +/// using the `similar` crate and emit unified diffs with rewritten external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> baseline file info. + baseline_file_info: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_path: HashMap, + /// Cache of known git worktree roots to avoid repeated filesystem walks. + git_root_cache: Vec, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. + /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_current_path + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. 
+ let (contents_bytes, mode, oid) = if path.exists() { + let mode = file_mode_for_path(path); + let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE); + let contents_bytes = blob_bytes(path, mode_str) + .unwrap_or_default() + .unwrap_or_default(); + let oid = if mode.as_deref() == Some(SYMLINK_MODE) { + git_blob_sha1_hex_bytes(&contents_bytes) + } else { + self.git_blob_oid_for_path(path) + .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes)) + }; + (Some(contents_bytes), mode, Some(oid)) + } else { + (None, None, Some(ZERO_OID.to_string())) + }; + + self.baseline_file_info.insert( + internal.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes, + mode, + oid, + }, + ); + } + + // Track rename/move in current mapping if provided in an Update. + if let FileChange::Update { + move_path: Some(dest), + .. + } = change + { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.baseline_file_info.insert( + i.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes: None, + mode: None, + oid: Some(ZERO_OID.to_string()), + }, + ); + i + } + }; + // Update current external mapping for temp file name. + self.temp_name_to_current_path + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + }; + } + + Ok(()) + } + + fn get_path_for_internal(&self, internal: &str) -> Option { + self.temp_name_to_current_path + .get(internal) + .cloned() + .or_else(|| { + self.baseline_file_info + .get(internal) + .and_then(|info| info.path.clone()) + }) + } + + /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry. 
+ /// Uses a simple cache of known roots and avoids negative-result caching for simplicity. + fn find_git_root_cached(&mut self, start: &Path) -> Option { + let dir = if start.is_dir() { + start + } else { + start.parent()? + }; + + // Fast path: if any cached root is an ancestor of this path, use it. + if let Some(root) = self + .git_root_cache + .iter() + .find(|r| dir.starts_with(r)) + .cloned() + { + return Some(root); + } + + // Walk up to find a `.git` marker. + let mut cur = dir.to_path_buf(); + loop { + let git_marker = cur.join(".git"); + if git_marker.is_dir() || git_marker.is_file() { + if !self.git_root_cache.iter().any(|r| r == &cur) { + self.git_root_cache.push(cur.clone()); + } + return Some(cur); + } + + // On Windows, avoid walking above the drive or UNC share root. + #[cfg(windows)] + { + if is_windows_drive_or_unc_root(&cur) { + return None; + } + } + + if let Some(parent) = cur.parent() { + cur = parent.to_path_buf(); + } else { + return None; + } + } + } + + /// Return a display string for `path` relative to its git root if found, else absolute. + fn relative_to_git_root_str(&mut self, path: &Path) -> String { + let s = if let Some(root) = self.find_git_root_cached(path) { + if let Ok(rel) = path.strip_prefix(&root) { + rel.display().to_string() + } else { + path.display().to_string() + } + } else { + path.display().to_string() + }; + s.replace('\\', "/") + } + + /// Ask git to compute the blob SHA-1 for the file at `path` within its repository. + /// Returns None if no repository is found or git invocation fails. + fn git_blob_oid_for_path(&mut self, path: &Path) -> Option { + let root = self.find_git_root_cached(path)?; + // Compute a path relative to the repo root for better portability across platforms. 
+ let rel = path.strip_prefix(&root).unwrap_or(path); + let output = Command::new("git") + .arg("-C") + .arg(&root) + .arg("hash-object") + .arg("--") + .arg(rel) + .output() + .ok()?; + if !output.status.success() { + return None; + } + let s = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if s.len() == 40 { Some(s) } else { None } + } + + /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were + /// collected before the first time they were touched by apply_patch during this turn with + /// the current repo state. + pub fn get_unified_diff(&mut self) -> Result> { + let mut aggregated = String::new(); + + // Compute diffs per tracked internal file in a stable order by external path. + let mut baseline_file_names: Vec = + self.baseline_file_info.keys().cloned().collect(); + // Sort lexicographically by full repo-relative path to match git behavior. + baseline_file_names.sort_by_key(|internal| { + self.get_path_for_internal(internal) + .map(|p| self.relative_to_git_root_str(&p)) + .unwrap_or_default() + }); + + for internal in baseline_file_names { + // Baseline external must exist for any tracked internal. + let baseline_external = match self + .baseline_file_info + .get(&internal) + .and_then(|i| i.path.clone()) + { + Some(p) => p, + None => continue, + }; + let current_external = match self.get_path_for_internal(&internal) { + Some(p) => p, + None => continue, + }; + + // Determine modes early; needed to read symlink content correctly. + let baseline_mode = self + .baseline_file_info + .get(&internal) + .and_then(|i| i.mode.clone()) + .unwrap_or_else(|| REGULAR_MODE.to_string()); + let current_mode = + file_mode_for_path(¤t_external).unwrap_or_else(|| REGULAR_MODE.to_string()); + + let left_bytes = self + .baseline_file_info + .get(&internal) + .and_then(|i| i.contents_bytes.clone()); + + let right_bytes = blob_bytes(¤t_external, ¤t_mode)?; + + // Fast path: identical bytes or both missing. 
+ if left_bytes.as_deref() == right_bytes.as_deref() { + continue; + } + + let left_display = self.relative_to_git_root_str(&baseline_external); + let right_display = self.relative_to_git_root_str(¤t_external); + + // Emit a git-style header for better readability and parity with previous behavior. + aggregated.push_str(&format!("diff --git a/{left_display} b/{right_display}\n")); + + let is_add = left_bytes.is_none() && right_bytes.is_some(); + let is_delete = left_bytes.is_some() && right_bytes.is_none(); + + if is_add { + aggregated.push_str(&format!("new file mode {current_mode}\n")); + } else if is_delete { + aggregated.push_str(&format!("deleted file mode {baseline_mode}\n")); + } else if baseline_mode != current_mode { + aggregated.push_str(&format!("old mode {baseline_mode}\n")); + aggregated.push_str(&format!("new mode {current_mode}\n")); + } + + // Determine blob object IDs for left and right contents. Prefer stored OIDs + // captured from the original repo state when the change was first seen. + let left_oid = self + .baseline_file_info + .get(&internal) + .and_then(|i| i.oid.clone()) + .or_else(|| { + left_bytes + .as_ref() + .map(|b| git_blob_sha1_hex_bytes(b)) + .or(Some(ZERO_OID.to_string())) + }) + .unwrap_or_else(|| ZERO_OID.to_string()); + let right_oid = if let Some(b) = right_bytes.as_ref() { + if current_mode == SYMLINK_MODE { + git_blob_sha1_hex_bytes(b) + } else { + self.git_blob_oid_for_path(¤t_external) + .unwrap_or_else(|| git_blob_sha1_hex_bytes(b)) + } + } else { + ZERO_OID.to_string() + }; + + // If either side isn't valid UTF-8, emit a binary diff header and continue. 
+ let left_text = left_bytes + .as_deref() + .and_then(|b| std::str::from_utf8(b).ok()); + let right_text = right_bytes + .as_deref() + .and_then(|b| std::str::from_utf8(b).ok()); + + // Prefer text diffs when possible: + // - both sides are valid UTF-8 + // - OR one side is missing (add/delete) and the present side is valid UTF-8 + let can_text_diff = match (left_text, right_text, is_add, is_delete) { + (Some(_), Some(_), _, _) => true, + (_, Some(_), true, _) => true, // add: left missing, right text + (Some(_), _, _, true) => true, // delete: left text, right missing + _ => false, + }; + + if can_text_diff { + // Diff the contents as text, treating missing side as empty string. + let l = left_text.unwrap_or(""); + let r = right_text.unwrap_or(""); + + // Emit index line without mode suffix to preserve current test expectations. + aggregated.push_str(&format!("index {left_oid}..{right_oid}\n")); + + let old_header = if left_bytes.is_some() { + format!("a/{left_display}") + } else { + "/dev/null".to_string() + }; + let new_header = if right_bytes.is_some() { + format!("b/{right_display}") + } else { + "/dev/null".to_string() + }; + + let diff = similar::TextDiff::from_lines(l, r); + let unified = diff + .unified_diff() + .context_radius(3) + .header(&old_header, &new_header) + .to_string(); + + aggregated.push_str(&unified); + if !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + } else { + // Binary or invalid UTF-8: emit header only. 
+ aggregated.push_str(&format!("index {left_oid}..{right_oid}\n")); + let old_header = if left_bytes.is_some() { + format!("a/{left_display}") + } else { + "/dev/null".to_string() + }; + let new_header = if right_bytes.is_some() { + format!("b/{right_display}") + } else { + "/dev/null".to_string() + }; + aggregated.push_str(&format!("--- {old_header}\n")); + aggregated.push_str(&format!("+++ {new_header}\n")); + aggregated.push_str("Binary files differ\n"); + if !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + } + } + + if aggregated.trim().is_empty() { + Ok(None) + } else { + Ok(Some(aggregated)) + } + } +} + +fn uuid_filename_for(path: &Path) -> String { + let id = Uuid::new_v4().to_string(); + match path.extension().and_then(|e| e.to_str()) { + Some(ext) if !ext.is_empty() => format!("{id}.{ext}"), + _ => id, + } +} + +/// Compute the Git SHA-1 blob object ID for the given content (bytes). +fn git_blob_sha1_hex_bytes(data: &[u8]) -> String { + // Git blob hash is sha1 of: "blob \0" + let header = format!("blob {}\0", data.len()); + use sha1::Digest; + let mut hasher = sha1::Sha1::new(); + hasher.update(header.as_bytes()); + hasher.update(data); + let digest = hasher.finalize(); + let mut out = String::with_capacity(40); + for b in digest { + use std::fmt::Write; + let _ = write!(&mut out, "{b:02x}"); + } + out +} + +const ZERO_OID: &str = "0000000000000000000000000000000000000000"; +const REGULAR_MODE: &str = "100644"; +#[cfg(unix)] +const EXECUTABLE_MODE: &str = "100755"; +const SYMLINK_MODE: &str = "120000"; + +#[cfg(unix)] +fn file_mode_for_path(path: &Path) -> Option { + use std::os::unix::fs::PermissionsExt; + let meta = fs::symlink_metadata(path).ok()?; + let ft = meta.file_type(); + if ft.is_symlink() { + return Some(SYMLINK_MODE.to_string()); + } + let mode = meta.permissions().mode(); + let is_exec = (mode & 0o111) != 0; + Some(if is_exec { + EXECUTABLE_MODE.into() + } else { + REGULAR_MODE.into() + }) +} + +#[cfg(not(unix))] +fn 
file_mode_for_path(_path: &Path) -> Option { + // Default to non-executable on non-unix. + Some(REGULAR_MODE.to_string()) +} + +fn blob_bytes(path: &Path, mode: &str) -> Result>> { + if path.exists() { + let contents = if mode == SYMLINK_MODE { + symlink_blob_bytes(path) + .ok_or_else(|| anyhow!("failed to read symlink target for {}", path.display()))? + } else { + fs::read(path).with_context(|| { + format!("failed to read current file for diff {}", path.display()) + })? + }; + Ok(Some(contents)) + } else { + Ok(None) + } +} + +#[cfg(unix)] +fn symlink_blob_bytes(path: &Path) -> Option> { + use std::os::unix::ffi::OsStrExt; + let target = std::fs::read_link(path).ok()?; + Some(target.as_os_str().as_bytes().to_vec()) +} + +#[cfg(not(unix))] +fn symlink_blob_bytes(_path: &Path) -> Option> { + None +} + +#[cfg(windows)] +fn is_windows_drive_or_unc_root(p: &std::path::Path) -> bool { + use std::path::Component; + let mut comps = p.components(); + matches!( + (comps.next(), comps.next(), comps.next()), + (Some(Component::Prefix(_)), Some(Component::RootDir), None) + ) +} + +#[cfg(test)] +mod tests { + #![allow(clippy::unwrap_used)] + use super::*; + use pretty_assertions::assert_eq; + use tempfile::tempdir; + + /// Compute the Git SHA-1 blob object ID for the given content (string). + /// This delegates to the bytes version to avoid UTF-8 lossy conversions here. 
+ fn git_blob_sha1_hex(data: &str) -> String { + git_blob_sha1_hex_bytes(data.as_bytes()) + } + + fn normalize_diff_for_test(input: &str, root: &Path) -> String { + let root_str = root.display().to_string().replace('\\', "/"); + let replaced = input.replace(&root_str, ""); + // Split into blocks on lines starting with "diff --git ", sort blocks for determinism, and rejoin + let mut blocks: Vec = Vec::new(); + let mut current = String::new(); + for line in replaced.lines() { + if line.starts_with("diff --git ") && !current.is_empty() { + blocks.push(current); + current = String::new(); + } + if !current.is_empty() { + current.push('\n'); + } + current.push_str(line); + } + if !current.is_empty() { + blocks.push(current); + } + blocks.sort(); + let mut out = blocks.join("\n"); + if !out.ends_with('\n') { + out.push('\n'); + } + out + } + + #[test] + fn accumulates_add_and_update() { + let mut acc = TurnDiffTracker::new(); + + let dir = tempdir().unwrap(); + let file = dir.path().join("a.txt"); + + // First patch: add file (baseline should be /dev/null). + let add_changes = HashMap::from([( + file.clone(), + FileChange::Add { + content: "foo\n".to_string(), + }, + )]); + acc.on_patch_begin(&add_changes).unwrap(); + + // Simulate apply: create the file on disk. + fs::write(&file, "foo\n").unwrap(); + let first = acc.get_unified_diff().unwrap().unwrap(); + let first = normalize_diff_for_test(&first, dir.path()); + let expected_first = { + let mode = file_mode_for_path(&file).unwrap_or_else(|| REGULAR_MODE.to_string()); + let right_oid = git_blob_sha1_hex("foo\n"); + format!( + "diff --git a//a.txt b//a.txt\nnew file mode {mode}\nindex {ZERO_OID}..{right_oid}\n--- /dev/null\n+++ b//a.txt\n@@ -0,0 +1 @@\n+foo\n", ``` > `r#` for better readability? 
- Created: 2025-08-04 03:14:59 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2250313964 ```diff @@ -0,0 +1,766 @@ +use std::collections::HashMap; +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::Result; +use anyhow::anyhow; +use uuid::Uuid; + +use crate::protocol::FileChange; + +struct BaselineFileInfo { + path: Option, + contents_bytes: Option>, + mode: Option, + oid: Option, +} + +/// Tracks sets of changes to files and exposes the overall unified diff. +/// Internally, the way this works is now: +/// 1. Maintain an in-memory baseline snapshot of files when they are first seen. +/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null). +/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking. +/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory +/// using the `similar` crate and emit unified diffs with rewritten external paths. +#[derive(Default)] +pub struct TurnDiffTracker { + /// Map external path -> internal filename (uuid + same extension). + external_to_temp_name: HashMap, + /// Internal filename -> baseline file info. + baseline_file_info: HashMap, + /// Internal filename -> external path as of current accumulated state (after applying all changes). + /// This is where renames are tracked. + temp_name_to_current_path: HashMap, + /// Cache of known git worktree roots to avoid repeated filesystem walks. + git_root_cache: Vec, +} + +impl TurnDiffTracker { + pub fn new() -> Self { + Self::default() + } + + /// Front-run apply patch calls to track the starting contents of any modified files. + /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen. + /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions. 
+ /// - Also updates internal mappings for move/rename events. + pub fn on_patch_begin(&mut self, changes: &HashMap) -> Result<()> { + for (path, change) in changes.iter() { + // Ensure a stable internal filename exists for this external path. + if !self.external_to_temp_name.contains_key(path) { + let internal = uuid_filename_for(path); + self.external_to_temp_name + .insert(path.clone(), internal.clone()); + self.temp_name_to_current_path + .insert(internal.clone(), path.clone()); + + // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null. + let (contents_bytes, mode, oid) = if path.exists() { + let mode = file_mode_for_path(path); + let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE); + let contents_bytes = blob_bytes(path, mode_str) + .unwrap_or_default() + .unwrap_or_default(); + let oid = if mode.as_deref() == Some(SYMLINK_MODE) { + git_blob_sha1_hex_bytes(&contents_bytes) + } else { + self.git_blob_oid_for_path(path) + .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes)) + }; + (Some(contents_bytes), mode, Some(oid)) + } else { + (None, None, Some(ZERO_OID.to_string())) + }; + + self.baseline_file_info.insert( + internal.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes, + mode, + oid, + }, + ); + } + + // Track rename/move in current mapping if provided in an Update. + if let FileChange::Update { + move_path: Some(dest), + .. + } = change + { + let uuid_filename = match self.external_to_temp_name.get(path) { + Some(i) => i.clone(), + None => { + // This should be rare, but if we haven't mapped the source, create it with no baseline. + let i = uuid_filename_for(path); + self.baseline_file_info.insert( + i.clone(), + BaselineFileInfo { + path: Some(path.clone()), + contents_bytes: None, + mode: None, + oid: Some(ZERO_OID.to_string()), + }, + ); + i + } + }; + // Update current external mapping for temp file name. 
+ self.temp_name_to_current_path + .insert(uuid_filename.clone(), dest.clone()); + // Update forward file_mapping: external current -> internal name. + self.external_to_temp_name.remove(path); + self.external_to_temp_name + .insert(dest.clone(), uuid_filename); + }; + } + + Ok(()) + } + + fn get_path_for_internal(&self, internal: &str) -> Option { + self.temp_name_to_current_path + .get(internal) + .cloned() + .or_else(|| { + self.baseline_file_info + .get(internal) + .and_then(|info| info.path.clone()) + }) + } + + /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry. + /// Uses a simple cache of known roots and avoids negative-result caching for simplicity. + fn find_git_root_cached(&mut self, start: &Path) -> Option { + let dir = if start.is_dir() { + start + } else { + start.parent()? + }; + + // Fast path: if any cached root is an ancestor of this path, use it. + if let Some(root) = self + .git_root_cache + .iter() + .find(|r| dir.starts_with(r)) + .cloned() + { + return Some(root); + } + + // Walk up to find a `.git` marker. + let mut cur = dir.to_path_buf(); + loop { + let git_marker = cur.join(".git"); + if git_marker.is_dir() || git_marker.is_file() { + if !self.git_root_cache.iter().any(|r| r == &cur) { + self.git_root_cache.push(cur.clone()); + } + return Some(cur); + } + + // On Windows, avoid walking above the drive or UNC share root. + #[cfg(windows)] + { + if is_windows_drive_or_unc_root(&cur) { + return None; + } + } + + if let Some(parent) = cur.parent() { + cur = parent.to_path_buf(); + } else { + return None; + } + } + } + + /// Return a display string for `path` relative to its git root if found, else absolute. 
+    fn relative_to_git_root_str(&mut self, path: &Path) -> String {
+        let s = if let Some(root) = self.find_git_root_cached(path) {
+            if let Ok(rel) = path.strip_prefix(&root) {
+                rel.display().to_string()
+            } else {
+                path.display().to_string()
+            }
+        } else {
+            path.display().to_string()
+        };
+        s.replace('\\', "/")
+    }
+
+    /// Ask git to compute the blob SHA-1 for the file at `path` within its repository.
+    /// Returns None if no repository is found or git invocation fails.
+    fn git_blob_oid_for_path(&mut self, path: &Path) -> Option<String> {
+        let root = self.find_git_root_cached(path)?;
+        // Compute a path relative to the repo root for better portability across platforms.
+        let rel = path.strip_prefix(&root).unwrap_or(path);
+        let output = Command::new("git")
+            .arg("-C")
+            .arg(&root)
+            .arg("hash-object")
+            .arg("--")
+            .arg(rel)
+            .output()
+            .ok()?;
+        if !output.status.success() {
+            return None;
+        }
+        let s = String::from_utf8_lossy(&output.stdout).trim().to_string();
+        if s.len() == 40 { Some(s) } else { None }
+    }
+
+    /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were
+    /// collected before the first time they were touched by apply_patch during this turn with
+    /// the current repo state.
+    pub fn get_unified_diff(&mut self) -> Result<Option<String>> {
+        let mut aggregated = String::new();
+
+        // Compute diffs per tracked internal file in a stable order by external path.
+        let mut baseline_file_names: Vec<String> =
+            self.baseline_file_info.keys().cloned().collect();
+        // Sort lexicographically by full repo-relative path to match git behavior.
+        baseline_file_names.sort_by_key(|internal| {
+            self.get_path_for_internal(internal)
+                .map(|p| self.relative_to_git_root_str(&p))
+                .unwrap_or_default()
+        });
+
+        for internal in baseline_file_names {
+            // Baseline external must exist for any tracked internal.
+            let baseline_external = match self
+                .baseline_file_info
+                .get(&internal)
+                .and_then(|i| i.path.clone())
+            {
+                Some(p) => p,
+                None => continue,
+            };
+            let current_external = match self.get_path_for_internal(&internal) {
+                Some(p) => p,
+                None => continue,
+            };
+
+            // Determine modes early; needed to read symlink content correctly.
+            let baseline_mode = self
+                .baseline_file_info
+                .get(&internal)
+                .and_then(|i| i.mode.clone())
+                .unwrap_or_else(|| REGULAR_MODE.to_string());
+            let current_mode =
+                file_mode_for_path(&current_external).unwrap_or_else(|| REGULAR_MODE.to_string());
+
+            let left_bytes = self
+                .baseline_file_info
+                .get(&internal)
+                .and_then(|i| i.contents_bytes.clone());
+
+            let right_bytes = blob_bytes(&current_external, &current_mode)?;
+
+            // Fast path: identical bytes or both missing.
+            if left_bytes.as_deref() == right_bytes.as_deref() {
+                continue;
+            }
+
+            let left_display = self.relative_to_git_root_str(&baseline_external);
+            let right_display = self.relative_to_git_root_str(&current_external);
+
+            // Emit a git-style header for better readability and parity with previous behavior.
+            aggregated.push_str(&format!("diff --git a/{left_display} b/{right_display}\n"));
+
+            let is_add = left_bytes.is_none() && right_bytes.is_some();
+            let is_delete = left_bytes.is_some() && right_bytes.is_none();
+
+            if is_add {
+                aggregated.push_str(&format!("new file mode {current_mode}\n"));
+            } else if is_delete {
+                aggregated.push_str(&format!("deleted file mode {baseline_mode}\n"));
+            } else if baseline_mode != current_mode {
+                aggregated.push_str(&format!("old mode {baseline_mode}\n"));
+                aggregated.push_str(&format!("new mode {current_mode}\n"));
+            }
+
+            // Determine blob object IDs for left and right contents. Prefer stored OIDs
+            // captured from the original repo state when the change was first seen.
+ let left_oid = self + .baseline_file_info + .get(&internal) + .and_then(|i| i.oid.clone()) + .or_else(|| { + left_bytes + .as_ref() + .map(|b| git_blob_sha1_hex_bytes(b)) + .or(Some(ZERO_OID.to_string())) + }) + .unwrap_or_else(|| ZERO_OID.to_string()); + let right_oid = if let Some(b) = right_bytes.as_ref() { + if current_mode == SYMLINK_MODE { + git_blob_sha1_hex_bytes(b) + } else { + self.git_blob_oid_for_path(¤t_external) + .unwrap_or_else(|| git_blob_sha1_hex_bytes(b)) + } + } else { + ZERO_OID.to_string() + }; + + // If either side isn't valid UTF-8, emit a binary diff header and continue. + let left_text = left_bytes + .as_deref() + .and_then(|b| std::str::from_utf8(b).ok()); + let right_text = right_bytes + .as_deref() + .and_then(|b| std::str::from_utf8(b).ok()); + + // Prefer text diffs when possible: + // - both sides are valid UTF-8 + // - OR one side is missing (add/delete) and the present side is valid UTF-8 + let can_text_diff = match (left_text, right_text, is_add, is_delete) { + (Some(_), Some(_), _, _) => true, + (_, Some(_), true, _) => true, // add: left missing, right text + (Some(_), _, _, true) => true, // delete: left text, right missing + _ => false, + }; + + if can_text_diff { + // Diff the contents as text, treating missing side as empty string. + let l = left_text.unwrap_or(""); + let r = right_text.unwrap_or(""); + + // Emit index line without mode suffix to preserve current test expectations. 
+ aggregated.push_str(&format!("index {left_oid}..{right_oid}\n")); + + let old_header = if left_bytes.is_some() { + format!("a/{left_display}") + } else { + "/dev/null".to_string() + }; + let new_header = if right_bytes.is_some() { + format!("b/{right_display}") + } else { + "/dev/null".to_string() + }; + + let diff = similar::TextDiff::from_lines(l, r); + let unified = diff + .unified_diff() + .context_radius(3) + .header(&old_header, &new_header) + .to_string(); + + aggregated.push_str(&unified); + if !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + } else { + // Binary or invalid UTF-8: emit header only. + aggregated.push_str(&format!("index {left_oid}..{right_oid}\n")); + let old_header = if left_bytes.is_some() { + format!("a/{left_display}") + } else { + "/dev/null".to_string() + }; + let new_header = if right_bytes.is_some() { + format!("b/{right_display}") + } else { + "/dev/null".to_string() + }; + aggregated.push_str(&format!("--- {old_header}\n")); + aggregated.push_str(&format!("+++ {new_header}\n")); + aggregated.push_str("Binary files differ\n"); + if !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + } + } + + if aggregated.trim().is_empty() { + Ok(None) + } else { + Ok(Some(aggregated)) + } + } +} + +fn uuid_filename_for(path: &Path) -> String { + let id = Uuid::new_v4().to_string(); + match path.extension().and_then(|e| e.to_str()) { + Some(ext) if !ext.is_empty() => format!("{id}.{ext}"), + _ => id, + } +} + +/// Compute the Git SHA-1 blob object ID for the given content (bytes). 
+fn git_blob_sha1_hex_bytes(data: &[u8]) -> String {
+    // Git blob hash is sha1 of: "blob {len}\0" + data
+    let header = format!("blob {}\0", data.len());
+    use sha1::Digest;
+    let mut hasher = sha1::Sha1::new();
+    hasher.update(header.as_bytes());
+    hasher.update(data);
+    let digest = hasher.finalize();
+    let mut out = String::with_capacity(40);
+    for b in digest {
+        use std::fmt::Write;
+        let _ = write!(&mut out, "{b:02x}");
+    }
+    out
+}
+
+const ZERO_OID: &str = "0000000000000000000000000000000000000000";
+const REGULAR_MODE: &str = "100644";
+#[cfg(unix)]
+const EXECUTABLE_MODE: &str = "100755";
+const SYMLINK_MODE: &str = "120000";
+
+#[cfg(unix)]
+fn file_mode_for_path(path: &Path) -> Option<String> {
+    use std::os::unix::fs::PermissionsExt;
+    let meta = fs::symlink_metadata(path).ok()?;
+    let ft = meta.file_type();
+    if ft.is_symlink() {
+        return Some(SYMLINK_MODE.to_string());
+    }
+    let mode = meta.permissions().mode();
+    let is_exec = (mode & 0o111) != 0;
+    Some(if is_exec {
+        EXECUTABLE_MODE.into()
+    } else {
+        REGULAR_MODE.into()
+    })
+}
+
+#[cfg(not(unix))]
+fn file_mode_for_path(_path: &Path) -> Option<String> {
+    // Default to non-executable on non-unix.
+    Some(REGULAR_MODE.to_string())
+}
+
+fn blob_bytes(path: &Path, mode: &str) -> Result<Option<Vec<u8>>> {
+    if path.exists() {
+        let contents = if mode == SYMLINK_MODE {
+            symlink_blob_bytes(path)
+                .ok_or_else(|| anyhow!("failed to read symlink target for {}", path.display()))?
+        } else {
+            fs::read(path).with_context(|| {
+                format!("failed to read current file for diff {}", path.display())
+            })?
+        };
+        Ok(Some(contents))
+    } else {
+        Ok(None)
+    }
+}
+
+#[cfg(unix)]
+fn symlink_blob_bytes(path: &Path) -> Option<Vec<u8>> {
+    use std::os::unix::ffi::OsStrExt;
+    let target = std::fs::read_link(path).ok()?;
+    Some(target.as_os_str().as_bytes().to_vec())
+}
+
+#[cfg(not(unix))]
+fn symlink_blob_bytes(_path: &Path) -> Option<Vec<u8>> {
+    None
+}
+
+#[cfg(windows)]
+fn is_windows_drive_or_unc_root(p: &std::path::Path) -> bool {
+    use std::path::Component;
+    let mut comps = p.components();
+    matches!(
+        (comps.next(), comps.next(), comps.next()),
+        (Some(Component::Prefix(_)), Some(Component::RootDir), None)
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    #![allow(clippy::unwrap_used)]
+    use super::*;
+    use pretty_assertions::assert_eq;
+    use tempfile::tempdir;
+
+    /// Compute the Git SHA-1 blob object ID for the given content (string).
+    /// This delegates to the bytes version to avoid UTF-8 lossy conversions here.
+    fn git_blob_sha1_hex(data: &str) -> String {
+        git_blob_sha1_hex_bytes(data.as_bytes())
+    }
+
+    fn normalize_diff_for_test(input: &str, root: &Path) -> String {
+        let root_str = root.display().to_string().replace('\\', "/");
+        let replaced = input.replace(&root_str, "");
+        // Split into blocks on lines starting with "diff --git ", sort blocks for determinism, and rejoin
+        let mut blocks: Vec<String> = Vec::new();
+        let mut current = String::new();
+        for line in replaced.lines() {
+            if line.starts_with("diff --git ") && !current.is_empty() {
+                blocks.push(current);
+                current = String::new();
+            }
+            if !current.is_empty() {
+                current.push('\n');
+            }
+            current.push_str(line);
+        }
+        if !current.is_empty() {
+            blocks.push(current);
+        }
+        blocks.sort();
+        let mut out = blocks.join("\n");
+        if !out.ends_with('\n') {
+            out.push('\n');
+        }
+        out
+    }
+
+    #[test]
+    fn accumulates_add_and_update() {
+        let mut acc = TurnDiffTracker::new();
+
+        let dir = tempdir().unwrap();
+        let file = dir.path().join("a.txt");
+
+        // First patch: add file (baseline should be /dev/null).
+ let add_changes = HashMap::from([( + file.clone(), + FileChange::Add { + content: "foo\n".to_string(), + }, + )]); + acc.on_patch_begin(&add_changes).unwrap(); + + // Simulate apply: create the file on disk. + fs::write(&file, "foo\n").unwrap(); + let first = acc.get_unified_diff().unwrap().unwrap(); + let first = normalize_diff_for_test(&first, dir.path()); + let expected_first = { + let mode = file_mode_for_path(&file).unwrap_or_else(|| REGULAR_MODE.to_string()); + let right_oid = git_blob_sha1_hex("foo\n"); + format!( + "diff --git a//a.txt b//a.txt\nnew file mode {mode}\nindex {ZERO_OID}..{right_oid}\n--- /dev/null\n+++ b//a.txt\n@@ -0,0 +1 @@\n+foo\n", + ) + }; + assert_eq!(first, expected_first); + + // Second patch: update the file on disk. + let update_changes = HashMap::from([( + file.clone(), + FileChange::Update { + unified_diff: "".to_owned(), + move_path: None, + }, + )]); + acc.on_patch_begin(&update_changes).unwrap(); + + // Simulate apply: append a new line. + fs::write(&file, "foo\nbar\n").unwrap(); + let combined = acc.get_unified_diff().unwrap().unwrap(); + let combined = normalize_diff_for_test(&combined, dir.path()); + let expected_combined = { + let mode = file_mode_for_path(&file).unwrap_or_else(|| REGULAR_MODE.to_string()); + let right_oid = git_blob_sha1_hex("foo\nbar\n"); + format!( + "diff --git a//a.txt b//a.txt\nnew file mode {mode}\nindex {ZERO_OID}..{right_oid}\n--- /dev/null\n+++ b//a.txt\n@@ -0,0 +1,2 @@\n+foo\n+bar\n", + ) + }; + assert_eq!(combined, expected_combined); + } + + #[test] + fn accumulates_delete() { + let dir = tempdir().unwrap(); + let file = dir.path().join("b.txt"); + fs::write(&file, "x\n").unwrap(); + + let mut acc = TurnDiffTracker::new(); + let del_changes = HashMap::from([(file.clone(), FileChange::Delete)]); + acc.on_patch_begin(&del_changes).unwrap(); + + // Simulate apply: delete the file from disk. 
+ let baseline_mode = file_mode_for_path(&file).unwrap_or_else(|| REGULAR_MODE.to_string()); + fs::remove_file(&file).unwrap(); + let diff = acc.get_unified_diff().unwrap().unwrap(); + let diff = normalize_diff_for_test(&diff, dir.path()); + let expected = { + let left_oid = git_blob_sha1_hex("x\n"); + format!( + "diff --git a//b.txt b//b.txt\ndeleted file mode {baseline_mode}\nindex {left_oid}..{ZERO_OID}\n--- a//b.txt\n+++ /dev/null\n@@ -1 +0,0 @@\n-x\n", + ) + }; + assert_eq!(diff, expected); + } + + #[test] + fn accumulates_move_and_update() { + let dir = tempdir().unwrap(); + let src = dir.path().join("src.txt"); + let dest = dir.path().join("dst.txt"); + fs::write(&src, "line\n").unwrap(); + + let mut acc = TurnDiffTracker::new(); + let mv_changes = HashMap::from([( + src.clone(), + FileChange::Update { + unified_diff: "".to_owned(), + move_path: Some(dest.clone()), + }, + )]); + acc.on_patch_begin(&mv_changes).unwrap(); + + // Simulate apply: move and update content. + fs::rename(&src, &dest).unwrap(); + fs::write(&dest, "line2\n").unwrap(); + + let out = acc.get_unified_diff().unwrap().unwrap(); + let out = normalize_diff_for_test(&out, dir.path()); + let expected = { + let left_oid = git_blob_sha1_hex("line\n"); + let right_oid = git_blob_sha1_hex("line2\n"); + format!( + "diff --git a//src.txt b//dst.txt\nindex {left_oid}..{right_oid}\n--- a//src.txt\n+++ b//dst.txt\n@@ -1 +1 @@\n-line\n+line2\n" + ) + }; + assert_eq!(out, expected); + } + + #[test] + fn move_without_content_change_yields_no_diff() { + let dir = tempdir().unwrap(); + let src = dir.path().join("moved.txt"); + let dest = dir.path().join("renamed.txt"); + fs::write(&src, "same\n").unwrap(); + + let mut acc = TurnDiffTracker::new(); + let mv_changes = HashMap::from([( + src.clone(), + FileChange::Update { + unified_diff: "".to_owned(), + move_path: Some(dest.clone()), + }, + )]); + acc.on_patch_begin(&mv_changes).unwrap(); + + // Simulate apply: move only, no content change. 
+        fs::rename(&src, &dest).unwrap();
+
+        let diff = acc.get_unified_diff().unwrap();
+        assert_eq!(diff, None);
+    }
+
+    #[test]
+    fn move_declared_but_file_only_appears_at_dest_is_add() {
+        let dir = tempdir().unwrap();
+        let src = dir.path().join("src.txt");
+        let dest = dir.path().join("dest.txt");
+        let mut acc = TurnDiffTracker::new();
+        let mv = HashMap::from([(
+            src.clone(),
+            FileChange::Update {
+                unified_diff: "".into(),
+                move_path: Some(dest.clone()),
+            },
+        )]);
+        acc.on_patch_begin(&mv).unwrap();
+        // No file existed initially; create only dest
+        fs::write(&dest, "hello\n").unwrap();
+        let diff = acc.get_unified_diff().unwrap().unwrap();
+        assert!(diff.contains("new file mode"));
```

> We can't `assert_eq!()` here? - Created: 2025-08-04 16:55:01 UTC | Link: https://github.com/openai/codex/pull/1770#discussion_r2252061946

```diff
@@ -0,0 +1,766 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::anyhow;
+use uuid::Uuid;
+
+use crate::protocol::FileChange;
+
+struct BaselineFileInfo {
+    path: Option<PathBuf>,
+    contents_bytes: Option<Vec<u8>>,
+    mode: Option<String>,
+    oid: Option<String>,
+}
+
+/// Tracks sets of changes to files and exposes the overall unified diff.
+/// Internally, the way this works is now:
+/// 1. Maintain an in-memory baseline snapshot of files when they are first seen.
+/// For new additions, do not create a baseline so that diffs are shown as proper additions (using /dev/null).
+/// 2. Keep a stable internal filename (uuid + same extension) per external path for rename tracking.
+/// 3. To compute the aggregated unified diff, compare each baseline snapshot to the current file on disk entirely in-memory
+/// using the `similar` crate and emit unified diffs with rewritten external paths.
+#[derive(Default)]
+pub struct TurnDiffTracker {
+    /// Map external path -> internal filename (uuid + same extension).
+    external_to_temp_name: HashMap<PathBuf, String>,
+    /// Internal filename -> baseline file info.
+    baseline_file_info: HashMap<String, BaselineFileInfo>,
+    /// Internal filename -> external path as of current accumulated state (after applying all changes).
+    /// This is where renames are tracked.
+    temp_name_to_current_path: HashMap<String, PathBuf>,
+    /// Cache of known git worktree roots to avoid repeated filesystem walks.
+    git_root_cache: Vec<PathBuf>,
+}
+
+impl TurnDiffTracker {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Front-run apply patch calls to track the starting contents of any modified files.
+    /// - Creates an in-memory baseline snapshot for files that already exist on disk when first seen.
+    /// - For additions, we intentionally do not create a baseline snapshot so that diffs are proper additions.
+    /// - Also updates internal mappings for move/rename events.
+    pub fn on_patch_begin(&mut self, changes: &HashMap<PathBuf, FileChange>) -> Result<()> {
+        for (path, change) in changes.iter() {
+            // Ensure a stable internal filename exists for this external path.
+            if !self.external_to_temp_name.contains_key(path) {
+                let internal = uuid_filename_for(path);
+                self.external_to_temp_name
+                    .insert(path.clone(), internal.clone());
+                self.temp_name_to_current_path
+                    .insert(internal.clone(), path.clone());
+
+                // If the file exists on disk now, snapshot as baseline; else leave missing to represent /dev/null.
+                let (contents_bytes, mode, oid) = if path.exists() {
+                    let mode = file_mode_for_path(path);
+                    let mode_str = mode.as_deref().unwrap_or(REGULAR_MODE);
+                    let contents_bytes = blob_bytes(path, mode_str)
+                        .unwrap_or_default()
+                        .unwrap_or_default();
+                    let oid = if mode.as_deref() == Some(SYMLINK_MODE) {
+                        git_blob_sha1_hex_bytes(&contents_bytes)
+                    } else {
+                        self.git_blob_oid_for_path(path)
+                            .unwrap_or_else(|| git_blob_sha1_hex_bytes(&contents_bytes))
+                    };
+                    (Some(contents_bytes), mode, Some(oid))
+                } else {
+                    (None, None, Some(ZERO_OID.to_string()))
+                };
+
+                self.baseline_file_info.insert(
+                    internal.clone(),
+                    BaselineFileInfo {
+                        path: Some(path.clone()),
+                        contents_bytes,
+                        mode,
+                        oid,
+                    },
+                );
+            }
+
+            // Track rename/move in current mapping if provided in an Update.
+            if let FileChange::Update {
+                move_path: Some(dest),
+                ..
+            } = change
+            {
+                let uuid_filename = match self.external_to_temp_name.get(path) {
+                    Some(i) => i.clone(),
+                    None => {
+                        // This should be rare, but if we haven't mapped the source, create it with no baseline.
+                        let i = uuid_filename_for(path);
+                        self.baseline_file_info.insert(
+                            i.clone(),
+                            BaselineFileInfo {
+                                path: Some(path.clone()),
+                                contents_bytes: None,
+                                mode: None,
+                                oid: Some(ZERO_OID.to_string()),
+                            },
+                        );
+                        i
+                    }
+                };
+                // Update current external mapping for temp file name.
+                self.temp_name_to_current_path
+                    .insert(uuid_filename.clone(), dest.clone());
+                // Update forward file_mapping: external current -> internal name.
+                self.external_to_temp_name.remove(path);
+                self.external_to_temp_name
+                    .insert(dest.clone(), uuid_filename);
+            };
+        }
+
+        Ok(())
+    }
+
+    fn get_path_for_internal(&self, internal: &str) -> Option<PathBuf> {
+        self.temp_name_to_current_path
+            .get(internal)
+            .cloned()
+            .or_else(|| {
+                self.baseline_file_info
+                    .get(internal)
+                    .and_then(|info| info.path.clone())
+            })
+    }
+
+    /// Find the git worktree root for a file/directory by walking up to the first ancestor containing a `.git` entry.
+    /// Uses a simple cache of known roots and avoids negative-result caching for simplicity.
+    fn find_git_root_cached(&mut self, start: &Path) -> Option<PathBuf> {
+        let dir = if start.is_dir() {
+            start
+        } else {
+            start.parent()?
+        };
+
+        // Fast path: if any cached root is an ancestor of this path, use it.
+        if let Some(root) = self
+            .git_root_cache
+            .iter()
+            .find(|r| dir.starts_with(r))
+            .cloned()
+        {
+            return Some(root);
+        }
+
+        // Walk up to find a `.git` marker.
+        let mut cur = dir.to_path_buf();
+        loop {
+            let git_marker = cur.join(".git");
+            if git_marker.is_dir() || git_marker.is_file() {
+                if !self.git_root_cache.iter().any(|r| r == &cur) {
+                    self.git_root_cache.push(cur.clone());
+                }
+                return Some(cur);
+            }
+
+            // On Windows, avoid walking above the drive or UNC share root.
+            #[cfg(windows)]
+            {
+                if is_windows_drive_or_unc_root(&cur) {
+                    return None;
+                }
+            }
+
+            if let Some(parent) = cur.parent() {
+                cur = parent.to_path_buf();
+            } else {
+                return None;
+            }
+        }
+    }
+
+    /// Return a display string for `path` relative to its git root if found, else absolute.
+    fn relative_to_git_root_str(&mut self, path: &Path) -> String {
+        let s = if let Some(root) = self.find_git_root_cached(path) {
+            if let Ok(rel) = path.strip_prefix(&root) {
+                rel.display().to_string()
+            } else {
+                path.display().to_string()
+            }
+        } else {
+            path.display().to_string()
+        };
+        s.replace('\\', "/")
+    }
+
+    /// Ask git to compute the blob SHA-1 for the file at `path` within its repository.
+    /// Returns None if no repository is found or git invocation fails.
+    fn git_blob_oid_for_path(&mut self, path: &Path) -> Option<String> {
+        let root = self.find_git_root_cached(path)?;
+        // Compute a path relative to the repo root for better portability across platforms.
+        let rel = path.strip_prefix(&root).unwrap_or(path);
+        let output = Command::new("git")
+            .arg("-C")
+            .arg(&root)
+            .arg("hash-object")
+            .arg("--")
+            .arg(rel)
+            .output()
+            .ok()?;
+        if !output.status.success() {
+            return None;
+        }
+        let s = String::from_utf8_lossy(&output.stdout).trim().to_string();
+        if s.len() == 40 { Some(s) } else { None }
+    }
+
+    /// Recompute the aggregated unified diff by comparing all of the in-memory snapshots that were
+    /// collected before the first time they were touched by apply_patch during this turn with
+    /// the current repo state.
+    pub fn get_unified_diff(&mut self) -> Result<Option<String>> {
+        let mut aggregated = String::new();
+
+        // Compute diffs per tracked internal file in a stable order by external path.
+        let mut baseline_file_names: Vec<String> =
+            self.baseline_file_info.keys().cloned().collect();
+        // Sort lexicographically by full repo-relative path to match git behavior.
+        baseline_file_names.sort_by_key(|internal| {
+            self.get_path_for_internal(internal)
+                .map(|p| self.relative_to_git_root_str(&p))
+                .unwrap_or_default()
+        });
+
+        for internal in baseline_file_names {
+            // Baseline external must exist for any tracked internal.
+            let baseline_external = match self
+                .baseline_file_info
+                .get(&internal)
+                .and_then(|i| i.path.clone())
+            {
+                Some(p) => p,
+                None => continue,
+            };
+            let current_external = match self.get_path_for_internal(&internal) {
+                Some(p) => p,
+                None => continue,
+            };
+
+            // Determine modes early; needed to read symlink content correctly.
+            let baseline_mode = self
+                .baseline_file_info
+                .get(&internal)
+                .and_then(|i| i.mode.clone())
+                .unwrap_or_else(|| REGULAR_MODE.to_string());
+            let current_mode =
+                file_mode_for_path(&current_external).unwrap_or_else(|| REGULAR_MODE.to_string());
+
+            let left_bytes = self
+                .baseline_file_info
+                .get(&internal)
+                .and_then(|i| i.contents_bytes.clone());
+
+            let right_bytes = blob_bytes(&current_external, &current_mode)?;
+
+            // Fast path: identical bytes or both missing.
+            if left_bytes.as_deref() == right_bytes.as_deref() {
+                continue;
+            }
+
+            let left_display = self.relative_to_git_root_str(&baseline_external);
+            let right_display = self.relative_to_git_root_str(&current_external);
+
+            // Emit a git-style header for better readability and parity with previous behavior.
+            aggregated.push_str(&format!("diff --git a/{left_display} b/{right_display}\n"));
+
+            let is_add = left_bytes.is_none() && right_bytes.is_some();
+            let is_delete = left_bytes.is_some() && right_bytes.is_none();
+
+            if is_add {
+                aggregated.push_str(&format!("new file mode {current_mode}\n"));
+            } else if is_delete {
+                aggregated.push_str(&format!("deleted file mode {baseline_mode}\n"));
+            } else if baseline_mode != current_mode {
+                aggregated.push_str(&format!("old mode {baseline_mode}\n"));
+                aggregated.push_str(&format!("new mode {current_mode}\n"));
+            }
+
+            // Determine blob object IDs for left and right contents. Prefer stored OIDs
+            // captured from the original repo state when the change was first seen.
+            let left_oid = self
+                .baseline_file_info
+                .get(&internal)
+                .and_then(|i| i.oid.clone())
+                .or_else(|| {
+                    left_bytes
+                        .as_ref()
+                        .map(|b| git_blob_sha1_hex_bytes(b))
+                        .or(Some(ZERO_OID.to_string()))
+                })
+                .unwrap_or_else(|| ZERO_OID.to_string());
+            let right_oid = if let Some(b) = right_bytes.as_ref() {
+                if current_mode == SYMLINK_MODE {
+                    git_blob_sha1_hex_bytes(b)
+                } else {
+                    self.git_blob_oid_for_path(&current_external)
+                        .unwrap_or_else(|| git_blob_sha1_hex_bytes(b))
+                }
+            } else {
+                ZERO_OID.to_string()
+            };
+
+            // If either side isn't valid UTF-8, emit a binary diff header and continue.
+ let left_text = left_bytes + .as_deref() + .and_then(|b| std::str::from_utf8(b).ok()); + let right_text = right_bytes + .as_deref() + .and_then(|b| std::str::from_utf8(b).ok()); + + // Prefer text diffs when possible: + // - both sides are valid UTF-8 + // - OR one side is missing (add/delete) and the present side is valid UTF-8 + let can_text_diff = match (left_text, right_text, is_add, is_delete) { + (Some(_), Some(_), _, _) => true, + (_, Some(_), true, _) => true, // add: left missing, right text + (Some(_), _, _, true) => true, // delete: left text, right missing + _ => false, + }; + + if can_text_diff { + // Diff the contents as text, treating missing side as empty string. + let l = left_text.unwrap_or(""); + let r = right_text.unwrap_or(""); + + // Emit index line without mode suffix to preserve current test expectations. + aggregated.push_str(&format!("index {left_oid}..{right_oid}\n")); + + let old_header = if left_bytes.is_some() { + format!("a/{left_display}") + } else { + "/dev/null".to_string() + }; + let new_header = if right_bytes.is_some() { + format!("b/{right_display}") + } else { + "/dev/null".to_string() + }; + + let diff = similar::TextDiff::from_lines(l, r); + let unified = diff + .unified_diff() + .context_radius(3) + .header(&old_header, &new_header) + .to_string(); + + aggregated.push_str(&unified); + if !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + } else { + // Binary or invalid UTF-8: emit header only. 
+ aggregated.push_str(&format!("index {left_oid}..{right_oid}\n")); + let old_header = if left_bytes.is_some() { + format!("a/{left_display}") + } else { + "/dev/null".to_string() + }; + let new_header = if right_bytes.is_some() { + format!("b/{right_display}") + } else { + "/dev/null".to_string() + }; + aggregated.push_str(&format!("--- {old_header}\n")); + aggregated.push_str(&format!("+++ {new_header}\n")); + aggregated.push_str("Binary files differ\n"); + if !aggregated.ends_with('\n') { + aggregated.push('\n'); + } + } + } + + if aggregated.trim().is_empty() { + Ok(None) + } else { + Ok(Some(aggregated)) + } + } +} + +fn uuid_filename_for(path: &Path) -> String { + let id = Uuid::new_v4().to_string(); + match path.extension().and_then(|e| e.to_str()) { + Some(ext) if !ext.is_empty() => format!("{id}.{ext}"), + _ => id, + } +} + +/// Compute the Git SHA-1 blob object ID for the given content (bytes). +fn git_blob_sha1_hex_bytes(data: &[u8]) -> String { + // Git blob hash is sha1 of: "blob \0" + let header = format!("blob {}\0", data.len()); + use sha1::Digest; + let mut hasher = sha1::Sha1::new(); + hasher.update(header.as_bytes()); + hasher.update(data); + let digest = hasher.finalize(); + let mut out = String::with_capacity(40); + for b in digest { + use std::fmt::Write; + let _ = write!(&mut out, "{b:02x}"); + } + out +} + +const ZERO_OID: &str = "0000000000000000000000000000000000000000"; +const REGULAR_MODE: &str = "100644"; +#[cfg(unix)] ``` > @gpeal but then that would change the tree hash?