feat: resumable backfill (#10745)

## Summary

This PR makes SQLite rollout backfill resumable and repeatable instead
of one-shot-on-db-create.

## What changed

- Added a persisted backfill state table:
  - state/migrations/0008_backfill_state.sql
  - Tracks status (pending|running|complete), last_watermark, and
    last_success_at.
- Added backfill state model/types in codex-state:
  - BackfillState, BackfillStatus (state/src/model/backfill_state.rs)
- Added runtime APIs to manage backfill lifecycle/progress:
  - get_backfill_state
  - mark_backfill_running
  - checkpoint_backfill
  - mark_backfill_complete
- Updated core startup behavior:
  - Backfill now runs whenever state is not Complete (not only when the DB
    file is newly created).
- Reworked backfill execution:
  - Collect rollout files, derive a deterministic watermark per path, sort,
    and resume from last_watermark.
  - Process in batches (BACKFILL_BATCH_SIZE = 200), checkpointing after each
    batch.
  - Mark complete with last_success_at at the end.

## Why

Previous behavior could leave users permanently partially backfilled if
the process exited during initial async backfill. This change allows
safe continuation across restarts and avoids restarting from scratch.
This commit is contained in:
jif-oai
2026-02-05 14:34:34 +00:00
committed by GitHub
parent f2ffc4e5d0
commit 4033f905c6
8 changed files with 528 additions and 68 deletions

View File

@@ -13,14 +13,15 @@ use codex_protocol::protocol::RolloutItem;
use codex_protocol::protocol::SandboxPolicy;
use codex_protocol::protocol::SessionMetaLine;
use codex_protocol::protocol::SessionSource;
use codex_state::BackfillState;
use codex_state::BackfillStats;
use codex_state::BackfillStatus;
use codex_state::DB_ERROR_METRIC;
use codex_state::DB_METRIC_BACKFILL;
use codex_state::DB_METRIC_BACKFILL_DURATION_MS;
use codex_state::ExtractionOutcome;
use codex_state::ThreadMetadataBuilder;
use codex_state::apply_rollout_item;
use std::cmp::Reverse;
use std::path::Path;
use std::path::PathBuf;
use tracing::info;
@@ -28,6 +29,7 @@ use tracing::warn;
const ROLLOUT_PREFIX: &str = "rollout-";
const ROLLOUT_SUFFIX: &str = ".jsonl";
const BACKFILL_BATCH_SIZE: usize = 200;
pub(crate) fn builder_from_session_meta(
session_meta: &SessionMetaLine,
@@ -130,16 +132,52 @@ pub(crate) async fn backfill_sessions(
otel: Option<&OtelManager>,
) {
let timer = otel.and_then(|otel| otel.start_timer(DB_METRIC_BACKFILL_DURATION_MS, &[]).ok());
let mut backfill_state = match runtime.get_backfill_state().await {
Ok(state) => state,
Err(err) => {
warn!(
"failed to read backfill state at {}: {err}",
config.codex_home.display()
);
if let Some(otel) = otel {
otel.counter(DB_ERROR_METRIC, 1, &[("stage", "backfill_state_read")]);
}
BackfillState::default()
}
};
if backfill_state.status == BackfillStatus::Complete {
return;
}
if let Err(err) = runtime.mark_backfill_running().await {
warn!(
"failed to mark backfill running at {}: {err}",
config.codex_home.display()
);
if let Some(otel) = otel {
otel.counter(
DB_ERROR_METRIC,
1,
&[("stage", "backfill_state_mark_running")],
);
}
} else {
backfill_state.status = BackfillStatus::Running;
}
let sessions_root = config.codex_home.join(rollout::SESSIONS_SUBDIR);
let archived_root = config.codex_home.join(rollout::ARCHIVED_SESSIONS_SUBDIR);
let mut rollout_paths: Vec<(PathBuf, bool)> = Vec::new();
let mut rollout_paths: Vec<BackfillRolloutPath> = Vec::new();
for (root, archived) in [(sessions_root, false), (archived_root, true)] {
if !tokio::fs::try_exists(&root).await.unwrap_or(false) {
continue;
}
match collect_rollout_paths(&root).await {
Ok(paths) => {
rollout_paths.extend(paths.into_iter().map(|path| (path, archived)));
rollout_paths.extend(paths.into_iter().map(|path| BackfillRolloutPath {
watermark: backfill_watermark_for_path(config.codex_home.as_path(), &path),
path,
archived,
}));
}
Err(err) => {
warn!(
@@ -149,75 +187,126 @@ pub(crate) async fn backfill_sessions(
}
}
}
rollout_paths.sort_by_key(|(path, _archived)| {
let parsed = path
.file_name()
.and_then(|name| name.to_str())
.and_then(parse_timestamp_uuid_from_filename)
.unwrap_or((time::OffsetDateTime::UNIX_EPOCH, uuid::Uuid::nil()));
(Reverse(parsed.0), Reverse(parsed.1))
});
rollout_paths.sort_by(|a, b| a.watermark.cmp(&b.watermark));
if let Some(last_watermark) = backfill_state.last_watermark.as_deref() {
rollout_paths.retain(|entry| entry.watermark.as_str() > last_watermark);
}
let mut stats = BackfillStats {
scanned: 0,
upserted: 0,
failed: 0,
};
for (path, archived) in rollout_paths {
stats.scanned = stats.scanned.saturating_add(1);
match extract_metadata_from_rollout(&path, config.model_provider_id.as_str(), otel).await {
Ok(outcome) => {
if outcome.parse_errors > 0
&& let Some(otel) = otel
{
otel.counter(
DB_ERROR_METRIC,
outcome.parse_errors as i64,
&[("stage", "backfill_sessions")],
);
}
let mut metadata = outcome.metadata;
if archived && metadata.archived_at.is_none() {
let fallback_archived_at = metadata.updated_at;
metadata.archived_at = file_modified_time_utc(&path)
.await
.or(Some(fallback_archived_at));
}
if let Err(err) = runtime.upsert_thread(&metadata).await {
stats.failed = stats.failed.saturating_add(1);
warn!("failed to upsert rollout {}: {err}", path.display());
} else {
stats.upserted = stats.upserted.saturating_add(1);
if let Ok(meta_line) = rollout::list::read_session_meta_line(&path).await {
if let Err(err) = runtime
.persist_dynamic_tools(
meta_line.meta.id,
meta_line.meta.dynamic_tools.as_deref(),
)
.await
{
if let Some(otel) = otel {
otel.counter(
DB_ERROR_METRIC,
1,
&[("stage", "backfill_dynamic_tools")],
);
}
warn!("failed to backfill dynamic tools {}: {err}", path.display());
}
} else {
warn!(
"failed to read session meta for dynamic tools {}",
path.display()
let mut last_watermark = backfill_state.last_watermark.clone();
for batch in rollout_paths.chunks(BACKFILL_BATCH_SIZE) {
for rollout in batch {
stats.scanned = stats.scanned.saturating_add(1);
match extract_metadata_from_rollout(
&rollout.path,
config.model_provider_id.as_str(),
otel,
)
.await
{
Ok(outcome) => {
if outcome.parse_errors > 0
&& let Some(otel) = otel
{
otel.counter(
DB_ERROR_METRIC,
outcome.parse_errors as i64,
&[("stage", "backfill_sessions")],
);
}
let mut metadata = outcome.metadata;
if rollout.archived && metadata.archived_at.is_none() {
let fallback_archived_at = metadata.updated_at;
metadata.archived_at = file_modified_time_utc(&rollout.path)
.await
.or(Some(fallback_archived_at));
}
if let Err(err) = runtime.upsert_thread(&metadata).await {
stats.failed = stats.failed.saturating_add(1);
warn!("failed to upsert rollout {}: {err}", rollout.path.display());
} else {
stats.upserted = stats.upserted.saturating_add(1);
if let Ok(meta_line) =
rollout::list::read_session_meta_line(&rollout.path).await
{
if let Err(err) = runtime
.persist_dynamic_tools(
meta_line.meta.id,
meta_line.meta.dynamic_tools.as_deref(),
)
.await
{
if let Some(otel) = otel {
otel.counter(
DB_ERROR_METRIC,
1,
&[("stage", "backfill_dynamic_tools")],
);
}
warn!(
"failed to backfill dynamic tools {}: {err}",
rollout.path.display()
);
}
} else {
warn!(
"failed to read session meta for dynamic tools {}",
rollout.path.display()
);
}
}
}
Err(err) => {
stats.failed = stats.failed.saturating_add(1);
warn!(
"failed to extract rollout {}: {err}",
rollout.path.display()
);
}
}
Err(err) => {
stats.failed = stats.failed.saturating_add(1);
warn!("failed to extract rollout {}: {err}", path.display());
}
if let Some(last_entry) = batch.last() {
if let Err(err) = runtime
.checkpoint_backfill(last_entry.watermark.as_str())
.await
{
warn!(
"failed to checkpoint backfill at {}: {err}",
config.codex_home.display()
);
if let Some(otel) = otel {
otel.counter(
DB_ERROR_METRIC,
1,
&[("stage", "backfill_state_checkpoint")],
);
}
} else {
last_watermark = Some(last_entry.watermark.clone());
}
}
}
if let Err(err) = runtime
.mark_backfill_complete(last_watermark.as_deref())
.await
{
warn!(
"failed to mark backfill complete at {}: {err}",
config.codex_home.display()
);
if let Some(otel) = otel {
otel.counter(
DB_ERROR_METRIC,
1,
&[("stage", "backfill_state_mark_complete")],
);
}
}
info!(
"state db backfill scanned={}, upserted={}, failed={}",
@@ -247,6 +336,20 @@ pub(crate) async fn backfill_sessions(
}
}
/// One rollout file queued for backfill, paired with the key used to order
/// and resume the scan.
#[derive(Debug, Clone)]
struct BackfillRolloutPath {
/// Ordering/resume key produced by `backfill_watermark_for_path`; entries are
/// sorted by it, and it is the value persisted at each batch checkpoint.
watermark: String,
/// Location of the rollout file on disk.
path: PathBuf,
/// `true` when the file was collected from the archived sessions root.
archived: bool,
}
/// Builds the watermark string for a rollout file.
///
/// The watermark is `path` made relative to `codex_home` (or `path` unchanged
/// when it does not live under `codex_home`), converted lossily to UTF-8, with
/// every backslash rewritten to a forward slash.
fn backfill_watermark_for_path(codex_home: &Path, path: &Path) -> String {
    let relative = match path.strip_prefix(codex_home) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    relative
        .to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
async fn file_modified_time_utc(path: &Path) -> Option<DateTime<Utc>> {
let modified = tokio::fs::metadata(path).await.ok()?.modified().ok()?;
let updated_at: DateTime<Utc> = modified.into();
@@ -331,10 +434,13 @@ mod tests {
use codex_protocol::protocol::SessionMeta;
use codex_protocol::protocol::SessionMetaLine;
use codex_protocol::protocol::SessionSource;
use codex_state::BackfillStatus;
use codex_state::ThreadMetadataBuilder;
use pretty_assertions::assert_eq;
use std::fs::File;
use std::io::Write;
use std::path::Path;
use std::path::PathBuf;
use tempfile::tempdir;
use uuid::Uuid;
@@ -412,4 +518,108 @@ mod tests {
assert_eq!(builder, expected);
}
/// A backfill that was checkpointed at the first rollout must skip that
/// rollout on the next run, ingest the later one, and end in `Complete`.
#[tokio::test]
async fn backfill_sessions_resumes_from_watermark_and_marks_complete() {
    let temp = tempdir().expect("tempdir");
    let home = temp.path().to_path_buf();

    // Two rollouts one minute apart; the earlier one plays the role of
    // "already processed before the restart".
    let skipped_uuid = Uuid::new_v4();
    let pending_uuid = Uuid::new_v4();
    let skipped_rollout = write_rollout_in_sessions(
        home.as_path(),
        "2026-01-27T12-34-56",
        "2026-01-27T12:34:56Z",
        skipped_uuid,
    );
    let pending_rollout = write_rollout_in_sessions(
        home.as_path(),
        "2026-01-27T12-35-56",
        "2026-01-27T12:35:56Z",
        pending_uuid,
    );

    let runtime =
        codex_state::StateRuntime::init(home.clone(), "test-provider".to_string(), None)
            .await
            .expect("initialize runtime");

    // Persist an interrupted run: status running, checkpoint at the first file.
    let checkpoint = backfill_watermark_for_path(home.as_path(), skipped_rollout.as_path());
    runtime.mark_backfill_running().await.expect("mark running");
    runtime
        .checkpoint_backfill(checkpoint.as_str())
        .await
        .expect("checkpoint first watermark");

    let mut config = crate::config::test_config();
    config.codex_home = home.clone();
    config.model_provider_id = "test-provider".to_string();

    backfill_sessions(runtime.as_ref(), &config, None).await;

    let skipped_id = ThreadId::from_string(&skipped_uuid.to_string()).expect("first thread id");
    let pending_id = ThreadId::from_string(&pending_uuid.to_string()).expect("second thread id");

    // The rollout at the checkpoint was never upserted by this run.
    let skipped_thread = runtime
        .get_thread(skipped_id)
        .await
        .expect("get first thread");
    assert_eq!(skipped_thread, None);

    // The rollout past the checkpoint was ingested.
    let pending_thread = runtime
        .get_thread(pending_id)
        .await
        .expect("get second thread");
    assert!(pending_thread.is_some());

    let final_state = runtime
        .get_backfill_state()
        .await
        .expect("get backfill state");
    let expected_watermark =
        backfill_watermark_for_path(home.as_path(), pending_rollout.as_path());
    assert_eq!(final_state.status, BackfillStatus::Complete);
    assert_eq!(final_state.last_watermark, Some(expected_watermark));
    assert!(final_state.last_success_at.is_some());
}
/// Test helper: writes a rollout file containing a single session-meta line
/// into `<codex_home>/sessions` and returns the path of the created file.
///
/// `filename_ts` is embedded in the file name, `event_ts` is used for both the
/// session-meta timestamp and the rollout-line timestamp, and `thread_uuid`
/// supplies the thread id and the file-name suffix.
fn write_rollout_in_sessions(
    codex_home: &Path,
    filename_ts: &str,
    event_ts: &str,
    thread_uuid: Uuid,
) -> PathBuf {
    let thread_id = ThreadId::from_string(&thread_uuid.to_string()).expect("thread id");

    let sessions_root = codex_home.join("sessions");
    std::fs::create_dir_all(&sessions_root).expect("create sessions dir");
    let rollout_path = sessions_root.join(format!("rollout-{filename_ts}-{thread_uuid}.jsonl"));

    let line = RolloutLine {
        timestamp: event_ts.to_string(),
        item: RolloutItem::SessionMeta(SessionMetaLine {
            meta: SessionMeta {
                id: thread_id,
                forked_from_id: None,
                timestamp: event_ts.to_string(),
                cwd: codex_home.to_path_buf(),
                originator: "cli".to_string(),
                cli_version: "0.0.0".to_string(),
                source: SessionSource::default(),
                model_provider: Some("test-provider".to_string()),
                base_instructions: None,
                dynamic_tools: None,
            },
            git: None,
        }),
    };

    // One JSON document followed by a newline, i.e. a single JSONL record.
    let serialized = serde_json::to_string(&line).expect("serialize rollout");
    let mut out = File::create(&rollout_path).expect("create rollout");
    out.write_all(format!("{serialized}\n").as_bytes())
        .expect("write rollout");
    rollout_path
}
}

View File

@@ -19,6 +19,7 @@ use tokio::sync::mpsc::Sender;
use tokio::sync::mpsc::{self};
use tokio::sync::oneshot;
use tracing::info;
use tracing::trace;
use tracing::warn;
use super::ARCHIVED_SESSIONS_SUBDIR;
@@ -386,7 +387,7 @@ impl RolloutRecorder {
pub(crate) async fn load_rollout_items(
path: &Path,
) -> std::io::Result<(Vec<RolloutItem>, Option<ThreadId>, usize)> {
info!("Resuming rollout from {path:?}");
trace!("Resuming rollout from {path:?}");
let text = tokio::fs::read_to_string(path).await?;
if text.trim().is_empty() {
return Err(IoError::other("empty session file"));
@@ -433,7 +434,7 @@ impl RolloutRecorder {
}
},
Err(e) => {
warn!("failed to parse rollout line: {e}");
trace!("failed to parse rollout line: {e}");
parse_errors = parse_errors.saturating_add(1);
}
}