mirror of
https://github.com/openai/codex.git
synced 2026-05-05 05:42:33 +03:00
feat: resumable backfill (#10745)
## Summary

This PR makes SQLite rollout backfill resumable and repeatable instead of one-shot-on-db-create.

## What changed

- Added a persisted backfill state table:
  - state/migrations/0008_backfill_state.sql
  - Tracks status (pending|running|complete), last_watermark, and last_success_at.
- Added backfill state model/types in codex-state:
  - BackfillState, BackfillStatus (state/src/model/backfill_state.rs)
- Added runtime APIs to manage backfill lifecycle/progress:
  - get_backfill_state
  - mark_backfill_running
  - checkpoint_backfill
  - mark_backfill_complete
- Updated core startup behavior:
  - Backfill now runs whenever state is not Complete (not only when the DB file is newly created).
- Reworked backfill execution:
  - Collect rollout files, derive a deterministic watermark per path, sort, and resume from last_watermark.
  - Process in batches (BACKFILL_BATCH_SIZE = 200), checkpointing after each batch.
  - Mark complete with last_success_at at the end.

## Why

Previous behavior could leave users permanently partially backfilled if the process exited during the initial async backfill. This change allows safe continuation across restarts and avoids restarting from scratch.
This commit is contained in:
73
codex-rs/state/src/model/backfill_state.rs
Normal file
73
codex-rs/state/src/model/backfill_state.rs
Normal file
@@ -0,0 +1,73 @@
|
||||
use anyhow::Result;
|
||||
use chrono::DateTime;
|
||||
use chrono::Utc;
|
||||
use sqlx::Row;
|
||||
use sqlx::sqlite::SqliteRow;
|
||||
|
||||
/// Persisted lifecycle state for rollout metadata backfill.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct BackfillState {
|
||||
/// Current lifecycle status.
|
||||
pub status: BackfillStatus,
|
||||
/// Last processed rollout watermark.
|
||||
pub last_watermark: Option<String>,
|
||||
/// Last successful completion time.
|
||||
pub last_success_at: Option<DateTime<Utc>>,
|
||||
}
|
||||
|
||||
impl Default for BackfillState {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
status: BackfillStatus::Pending,
|
||||
last_watermark: None,
|
||||
last_success_at: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl BackfillState {
|
||||
pub(crate) fn try_from_row(row: &SqliteRow) -> Result<Self> {
|
||||
let status: String = row.try_get("status")?;
|
||||
let last_success_at = row
|
||||
.try_get::<Option<i64>, _>("last_success_at")?
|
||||
.map(epoch_seconds_to_datetime)
|
||||
.transpose()?;
|
||||
Ok(Self {
|
||||
status: BackfillStatus::parse(status.as_str())?,
|
||||
last_watermark: row.try_get("last_watermark")?,
|
||||
last_success_at,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Backfill lifecycle status.
///
/// Persisted as a lowercase string produced by [`BackfillStatus::as_str`]
/// and read back with [`BackfillStatus::parse`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BackfillStatus {
    /// Backfill has not started (the initial, default state).
    Pending,
    /// Backfill is currently in progress.
    Running,
    /// Backfill finished successfully.
    Complete,
}
|
||||
|
||||
impl BackfillStatus {
|
||||
pub const fn as_str(self) -> &'static str {
|
||||
match self {
|
||||
BackfillStatus::Pending => "pending",
|
||||
BackfillStatus::Running => "running",
|
||||
BackfillStatus::Complete => "complete",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn parse(value: &str) -> Result<Self> {
|
||||
match value {
|
||||
"pending" => Ok(Self::Pending),
|
||||
"running" => Ok(Self::Running),
|
||||
"complete" => Ok(Self::Complete),
|
||||
_ => Err(anyhow::anyhow!("invalid backfill status: {value}")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn epoch_seconds_to_datetime(secs: i64) -> Result<DateTime<Utc>> {
|
||||
DateTime::<Utc>::from_timestamp(secs, 0)
|
||||
.ok_or_else(|| anyhow::anyhow!("invalid unix timestamp: {secs}"))
|
||||
}
|
||||
@@ -1,7 +1,10 @@
|
||||
mod backfill_state;
|
||||
mod log;
|
||||
mod thread_memory;
|
||||
mod thread_metadata;
|
||||
|
||||
pub use backfill_state::BackfillState;
|
||||
pub use backfill_state::BackfillStatus;
|
||||
pub use log::LogEntry;
|
||||
pub use log::LogQuery;
|
||||
pub use log::LogRow;
|
||||
|
||||
Reference in New Issue
Block a user