feat: resumable backfill (#10745)

## Summary

This PR makes SQLite rollout backfill resumable and repeatable instead
of one-shot-on-db-create.

## What changed

- Added a persisted backfill state table:
  - state/migrations/0008_backfill_state.sql
  - Tracks `status` (`pending` | `running` | `complete`), `last_watermark`, and
    `last_success_at`.
- Added backfill state model/types in codex-state:
  - BackfillState, BackfillStatus (state/src/model/backfill_state.rs)
- Added runtime APIs to manage backfill lifecycle/progress:
  - get_backfill_state
  - mark_backfill_running
  - checkpoint_backfill
  - mark_backfill_complete
- Updated core startup behavior:
  - Backfill now runs whenever state is not `Complete` (not only when the DB
    file is newly created).
- Reworked backfill execution:
  - Collect rollout files, derive a deterministic watermark per path, sort,
    and resume from `last_watermark`.
  - Process in batches (`BACKFILL_BATCH_SIZE = 200`), checkpointing after each
    batch.
  - Mark complete with last_success_at at the end.

## Why

Previous behavior could leave users permanently partially backfilled if
the process exited during initial async backfill. This change allows
safe continuation across restarts and avoids restarting from scratch.
This commit is contained in:
jif-oai
2026-02-05 14:34:34 +00:00
committed by GitHub
parent f2ffc4e5d0
commit 4033f905c6
8 changed files with 528 additions and 68 deletions

View File

@@ -22,7 +22,9 @@ pub use runtime::StateRuntime;
/// Most consumers should prefer [`StateRuntime`].
pub use extract::apply_rollout_item;
pub use model::Anchor;
pub use model::BackfillState;
pub use model::BackfillStats;
pub use model::BackfillStatus;
pub use model::ExtractionOutcome;
pub use model::SortKey;
pub use model::ThreadMemory;
@@ -36,9 +38,9 @@ pub use runtime::state_db_path;
/// Errors encountered during DB operations. Tags: [stage]
pub const DB_ERROR_METRIC: &str = "codex.db.error";
/// Metrics on the rollout backfill process, whenever it runs (no longer only on first DB init). Tags: [status]
pub const DB_METRIC_BACKFILL: &str = "codex.db.backfill";
/// Metrics on backfill duration (milliseconds, per the `duration_ms` suffix). Tags: [status]
pub const DB_METRIC_BACKFILL_DURATION_MS: &str = "codex.db.backfill.duration_ms";
/// Metrics on errors during comparison between DB and rollout file. Tags: [stage]
pub const DB_METRIC_COMPARE_ERROR: &str = "codex.db.compare_error";

View File

@@ -0,0 +1,73 @@
use anyhow::Result;
use chrono::DateTime;
use chrono::Utc;
use sqlx::Row;
use sqlx::sqlite::SqliteRow;
/// Persisted lifecycle state for rollout metadata backfill.
///
/// Decoded from a database row by `BackfillState::try_from_row`; the
/// `Default` value describes a backfill that has not started yet.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BackfillState {
/// Current lifecycle status.
pub status: BackfillStatus,
/// Last processed rollout watermark, or `None` when no watermark has been
/// recorded.
pub last_watermark: Option<String>,
/// Last successful completion time, or `None` when no completion has been
/// recorded. Loaded from whole epoch seconds, so it carries no sub-second
/// precision.
pub last_success_at: Option<DateTime<Utc>>,
}
impl Default for BackfillState {
fn default() -> Self {
Self {
status: BackfillStatus::Pending,
last_watermark: None,
last_success_at: None,
}
}
}
impl BackfillState {
pub(crate) fn try_from_row(row: &SqliteRow) -> Result<Self> {
let status: String = row.try_get("status")?;
let last_success_at = row
.try_get::<Option<i64>, _>("last_success_at")?
.map(epoch_seconds_to_datetime)
.transpose()?;
Ok(Self {
status: BackfillStatus::parse(status.as_str())?,
last_watermark: row.try_get("last_watermark")?,
last_success_at,
})
}
}
/// Backfill lifecycle status.
///
/// Converted to and from its lowercase string form by
/// `BackfillStatus::as_str` and `BackfillStatus::parse`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BackfillStatus {
/// Backfill has not started; this is the status of `BackfillState::default()`.
Pending,
/// Backfill is in progress.
Running,
/// Backfill has finished.
Complete,
}
impl BackfillStatus {
    /// Returns the lowercase string form of this status
    /// (`"pending"`, `"running"` or `"complete"`).
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::Pending => "pending",
            Self::Running => "running",
            Self::Complete => "complete",
        }
    }

    /// Parses the string form produced by [`BackfillStatus::as_str`].
    ///
    /// # Errors
    ///
    /// Returns an error naming the offending value when `value` is not one
    /// of the known status strings. Matching is exact and case-sensitive.
    pub fn parse(value: &str) -> Result<Self> {
        let status = match value {
            "pending" => Self::Pending,
            "running" => Self::Running,
            "complete" => Self::Complete,
            _ => return Err(anyhow::anyhow!("invalid backfill status: {value}")),
        };
        Ok(status)
    }
}
/// Converts whole seconds since the Unix epoch into a UTC timestamp with a
/// zero nanosecond component.
///
/// # Errors
///
/// Returns an error when `DateTime::from_timestamp` rejects `secs`.
fn epoch_seconds_to_datetime(secs: i64) -> Result<DateTime<Utc>> {
    match DateTime::<Utc>::from_timestamp(secs, 0) {
        Some(timestamp) => Ok(timestamp),
        None => Err(anyhow::anyhow!("invalid unix timestamp: {secs}")),
    }
}

View File

@@ -1,7 +1,10 @@
mod backfill_state;
mod log;
mod thread_memory;
mod thread_metadata;
pub use backfill_state::BackfillState;
pub use backfill_state::BackfillStatus;
pub use log::LogEntry;
pub use log::LogQuery;
pub use log::LogRow;

View File

@@ -91,6 +91,80 @@ impl StateRuntime {
self.codex_home.as_path()
}
/// Get persisted rollout metadata backfill state.
///
/// Ensures the singleton state row (`id = 1`) exists before reading it, so a
/// database that has never been backfilled reports the pending state that
/// row is created with.
pub async fn get_backfill_state(&self) -> anyhow::Result<crate::BackfillState> {
    const SELECT_STATE_SQL: &str = r#"
SELECT status, last_watermark, last_success_at
FROM backfill_state
WHERE id = 1
"#;

    self.ensure_backfill_state_row().await?;
    let select_state = sqlx::query(SELECT_STATE_SQL);
    let state_row = select_state.fetch_one(self.pool.as_ref()).await?;
    crate::BackfillState::try_from_row(&state_row)
}
/// Mark rollout metadata backfill as running.
///
/// Creates the singleton state row if it is missing, then sets its status to
/// running and refreshes `updated_at`. The stored watermark and last success
/// time are left untouched.
pub async fn mark_backfill_running(&self) -> anyhow::Result<()> {
    const MARK_RUNNING_SQL: &str = r#"
UPDATE backfill_state
SET status = ?, updated_at = ?
WHERE id = 1
"#;

    self.ensure_backfill_state_row().await?;
    let status = crate::BackfillStatus::Running.as_str();
    let updated_at = Utc::now().timestamp();
    sqlx::query(MARK_RUNNING_SQL)
        .bind(status)
        .bind(updated_at)
        .execute(self.pool.as_ref())
        .await?;
    Ok(())
}
/// Persist rollout metadata backfill progress.
///
/// Records `watermark` as the last processed position, sets the status to
/// running, and refreshes `updated_at` on the singleton state row (created
/// first if missing).
pub async fn checkpoint_backfill(&self, watermark: &str) -> anyhow::Result<()> {
    const CHECKPOINT_SQL: &str = r#"
UPDATE backfill_state
SET status = ?, last_watermark = ?, updated_at = ?
WHERE id = 1
"#;

    self.ensure_backfill_state_row().await?;
    let status = crate::BackfillStatus::Running.as_str();
    let updated_at = Utc::now().timestamp();
    sqlx::query(CHECKPOINT_SQL)
        .bind(status)
        .bind(watermark)
        .bind(updated_at)
        .execute(self.pool.as_ref())
        .await?;
    Ok(())
}
/// Mark rollout metadata backfill as complete.
///
/// Sets the status to complete and stamps both `last_success_at` and
/// `updated_at` with the same current time. When `last_watermark` is `None`,
/// the `COALESCE` in the statement keeps whatever watermark is already stored.
pub async fn mark_backfill_complete(&self, last_watermark: Option<&str>) -> anyhow::Result<()> {
    const MARK_COMPLETE_SQL: &str = r#"
UPDATE backfill_state
SET
status = ?,
last_watermark = COALESCE(?, last_watermark),
last_success_at = ?,
updated_at = ?
WHERE id = 1
"#;

    self.ensure_backfill_state_row().await?;
    let completed_at = Utc::now().timestamp();
    let status = crate::BackfillStatus::Complete.as_str();
    sqlx::query(MARK_COMPLETE_SQL)
        .bind(status)
        .bind(last_watermark)
        .bind(completed_at)
        .bind(completed_at)
        .execute(self.pool.as_ref())
        .await?;
    Ok(())
}
/// Load thread metadata by id using the underlying database.
pub async fn get_thread(&self, id: ThreadId) -> anyhow::Result<Option<crate::ThreadMetadata>> {
let row = sqlx::query(
@@ -637,6 +711,22 @@ ON CONFLICT(thread_id, position) DO NOTHING
}
self.upsert_thread(&metadata).await
}
/// Inserts the singleton backfill state row (`id = 1`) in the pending state
/// with no watermark or success time. An existing row is left untouched
/// because of `ON CONFLICT(id) DO NOTHING`.
async fn ensure_backfill_state_row(&self) -> anyhow::Result<()> {
    const INSERT_DEFAULT_ROW_SQL: &str = r#"
INSERT INTO backfill_state (id, status, last_watermark, last_success_at, updated_at)
VALUES (?, ?, NULL, NULL, ?)
ON CONFLICT(id) DO NOTHING
"#;

    let row_id = 1_i64;
    let status = crate::BackfillStatus::Pending.as_str();
    let updated_at = Utc::now().timestamp();
    sqlx::query(INSERT_DEFAULT_ROW_SQL)
        .bind(row_id)
        .bind(status)
        .bind(updated_at)
        .execute(self.pool.as_ref())
        .await?;
    Ok(())
}
}
fn push_log_filters<'a>(builder: &mut QueryBuilder<'a, Sqlite>, query: &'a LogQuery) {
@@ -889,7 +979,10 @@ mod tests {
let nanos = SystemTime::now()
.duration_since(UNIX_EPOCH)
.map_or(0, |duration| duration.as_nanos());
std::env::temp_dir().join(format!("codex-state-runtime-test-{nanos}"))
std::env::temp_dir().join(format!(
"codex-state-runtime-test-{nanos}-{}",
Uuid::new_v4()
))
}
#[tokio::test]
@@ -967,6 +1060,59 @@ mod tests {
let _ = tokio::fs::remove_dir_all(codex_home).await;
}
#[tokio::test]
async fn backfill_state_persists_progress_and_completion() {
    let checkpoint_watermark = "sessions/2026/01/27/rollout-a.jsonl";
    let final_watermark = "sessions/2026/01/28/rollout-b.jsonl";

    let codex_home = unique_temp_dir();
    let runtime = StateRuntime::init(codex_home.clone(), "test-provider".to_string(), None)
        .await
        .expect("initialize runtime");

    // A fresh database reports a pending backfill with no progress recorded.
    let initial = runtime
        .get_backfill_state()
        .await
        .expect("get initial backfill state");
    assert_eq!(initial.status, crate::BackfillStatus::Pending);
    assert!(initial.last_watermark.is_none());
    assert!(initial.last_success_at.is_none());

    // Starting and checkpointing moves to running and stores the watermark,
    // without recording a success time.
    runtime
        .mark_backfill_running()
        .await
        .expect("mark backfill running");
    runtime
        .checkpoint_backfill(checkpoint_watermark)
        .await
        .expect("checkpoint backfill");
    let running = runtime
        .get_backfill_state()
        .await
        .expect("get running backfill state");
    assert_eq!(running.status, crate::BackfillStatus::Running);
    assert_eq!(running.last_watermark.as_deref(), Some(checkpoint_watermark));
    assert!(running.last_success_at.is_none());

    // Completing replaces the watermark and stamps the success time.
    runtime
        .mark_backfill_complete(Some(final_watermark))
        .await
        .expect("mark backfill complete");
    let completed = runtime
        .get_backfill_state()
        .await
        .expect("get completed backfill state");
    assert_eq!(completed.status, crate::BackfillStatus::Complete);
    assert_eq!(completed.last_watermark.as_deref(), Some(final_watermark));
    assert!(completed.last_success_at.is_some());

    // Best-effort cleanup; a failure here must not fail the test.
    let _ = tokio::fs::remove_dir_all(codex_home).await;
}
#[tokio::test]
async fn upsert_and_get_thread_memory() {
let codex_home = unique_temp_dir();