feat: add log db (#10086)

Add a log DB. The goal is just to store our logs in a `.sqlite` DB to
make it easier to crawl them and drop the oldest ones.
This commit is contained in:
jif-oai
2026-01-29 10:23:03 +01:00
committed by GitHub
parent 4d9ae3a298
commit 780482da84
12 changed files with 248 additions and 0 deletions

View File

@@ -0,0 +1,14 @@
use serde::Serialize;
/// A single log record as stored in the log database.
///
/// Timestamps are split into whole seconds plus a nanosecond component
/// (presumably epoch seconds + sub-second nanos — confirm with the writer side).
#[derive(Clone, Debug, Serialize)]
pub struct LogEntry {
    /// Timestamp, whole seconds.
    pub ts: i64,
    /// Sub-second component of the timestamp, in nanoseconds.
    pub ts_nanos: i64,
    /// Log level (e.g. the tracing level rendered as a string — verify against producer).
    pub level: String,
    /// Log target (module/subsystem that emitted the record).
    pub target: String,
    /// The rendered log message, if one was present.
    pub message: Option<String>,
    /// Remaining structured fields, JSON-encoded.
    pub fields_json: String,
    /// Source module path of the emitting statement, if known.
    pub module_path: Option<String>,
    /// Source file of the emitting statement, if known.
    pub file: Option<String>,
    /// Source line of the emitting statement, if known.
    pub line: Option<i64>,
}

View File

@@ -0,0 +1,15 @@
//! Data-model types for the sqlite-backed store.
//!
//! Public metadata types are re-exported here; row-level and conversion
//! helpers stay `pub(crate)`.

mod log;
mod thread_metadata;

// Public API surface.
pub use log::LogEntry;
pub use thread_metadata::Anchor;
pub use thread_metadata::BackfillStats;
pub use thread_metadata::ExtractionOutcome;
pub use thread_metadata::SortKey;
pub use thread_metadata::ThreadMetadata;
pub use thread_metadata::ThreadMetadataBuilder;
pub use thread_metadata::ThreadsPage;

// Crate-internal helpers used by the storage layer.
pub(crate) use thread_metadata::ThreadRow;
pub(crate) use thread_metadata::anchor_from_item;
pub(crate) use thread_metadata::datetime_to_epoch_seconds;

View File

@@ -0,0 +1,352 @@
use anyhow::Result;
use chrono::DateTime;
use chrono::Timelike;
use chrono::Utc;
use codex_protocol::ThreadId;
use codex_protocol::protocol::AskForApproval;
use codex_protocol::protocol::SandboxPolicy;
use codex_protocol::protocol::SessionSource;
use sqlx::Row;
use sqlx::sqlite::SqliteRow;
use std::path::PathBuf;
use uuid::Uuid;
/// The sort key to use when listing threads.
///
/// Combined with [`Anchor`] for keyset pagination: the chosen key decides
/// which timestamp [`anchor_from_item`] copies into the anchor.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SortKey {
    /// Sort by the thread's creation timestamp.
    CreatedAt,
    /// Sort by the thread's last update timestamp.
    UpdatedAt,
}
/// A pagination anchor used for keyset pagination.
///
/// NOTE(review): the UUID presumably breaks ties between rows sharing the
/// same timestamp — confirm against the listing query.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Anchor {
    /// The timestamp component of the anchor.
    pub ts: DateTime<Utc>,
    /// The UUID component of the anchor.
    pub id: Uuid,
}
/// A single page of thread metadata results.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ThreadsPage {
    /// The thread metadata items in this page.
    pub items: Vec<ThreadMetadata>,
    /// The next anchor to use for pagination, if any.
    /// Presumably `None` means there are no further pages — confirm with the
    /// query that builds pages.
    pub next_anchor: Option<Anchor>,
    /// The number of rows scanned to produce this page.
    /// May differ from `items.len()` if rows were filtered — verify caller.
    pub num_scanned_rows: usize,
}
/// The outcome of extracting metadata from a rollout.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractionOutcome {
    /// The extracted thread metadata.
    pub metadata: ThreadMetadata,
    /// The number of rollout lines that failed to parse.
    /// Non-zero means `metadata` is best-effort rather than complete.
    pub parse_errors: usize,
}
/// Canonical thread metadata derived from rollout files.
///
/// Timestamps are canonicalized to whole-second precision (see
/// `canonicalize_datetime`), so round-tripping through epoch seconds in the
/// database is lossless.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ThreadMetadata {
    /// The thread identifier.
    pub id: ThreadId,
    /// The absolute rollout path on disk.
    pub rollout_path: PathBuf,
    /// The creation timestamp.
    pub created_at: DateTime<Utc>,
    /// The last update timestamp.
    pub updated_at: DateTime<Utc>,
    /// The session source (stringified enum).
    pub source: String,
    /// The model provider identifier.
    pub model_provider: String,
    /// The working directory for the thread.
    pub cwd: PathBuf,
    /// A best-effort thread title. Empty when freshly built from a
    /// `ThreadMetadataBuilder`.
    pub title: String,
    /// The sandbox policy (stringified enum).
    pub sandbox_policy: String,
    /// The approval mode (stringified enum).
    pub approval_mode: String,
    /// The last observed token usage. Zero when freshly built.
    pub tokens_used: i64,
    /// Whether the thread has observed a user message.
    pub has_user_event: bool,
    /// The archive timestamp, if the thread is archived.
    pub archived_at: Option<DateTime<Utc>>,
    /// The git commit SHA, if known.
    pub git_sha: Option<String>,
    /// The git branch name, if known.
    pub git_branch: Option<String>,
    /// The git origin URL, if known.
    pub git_origin_url: Option<String>,
}
/// Builder data required to construct [`ThreadMetadata`] without parsing filenames.
///
/// Optional fields left unset are filled with defaults by `build`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ThreadMetadataBuilder {
    /// The thread identifier.
    pub id: ThreadId,
    /// The absolute rollout path on disk.
    pub rollout_path: PathBuf,
    /// The creation timestamp.
    pub created_at: DateTime<Utc>,
    /// The last update timestamp, if known. Falls back to `created_at`.
    pub updated_at: Option<DateTime<Utc>>,
    /// The session source.
    pub source: SessionSource,
    /// The model provider identifier, if known. Falls back to the default
    /// provider passed to `build`.
    pub model_provider: Option<String>,
    /// The working directory for the thread.
    pub cwd: PathBuf,
    /// The sandbox policy.
    pub sandbox_policy: SandboxPolicy,
    /// The approval mode.
    pub approval_mode: AskForApproval,
    /// The archive timestamp, if the thread is archived.
    pub archived_at: Option<DateTime<Utc>>,
    /// The git commit SHA, if known.
    pub git_sha: Option<String>,
    /// The git branch name, if known.
    pub git_branch: Option<String>,
    /// The git origin URL, if known.
    pub git_origin_url: Option<String>,
}
impl ThreadMetadataBuilder {
    /// Create a builder seeded with the required identity fields.
    ///
    /// Every optional field starts unset; `cwd` defaults to an empty path,
    /// the sandbox policy to read-only, and the approval mode to on-request.
    pub fn new(
        id: ThreadId,
        rollout_path: PathBuf,
        created_at: DateTime<Utc>,
        source: SessionSource,
    ) -> Self {
        Self {
            id,
            rollout_path,
            created_at,
            source,
            updated_at: None,
            model_provider: None,
            cwd: PathBuf::new(),
            sandbox_policy: SandboxPolicy::ReadOnly,
            approval_mode: AskForApproval::OnRequest,
            archived_at: None,
            git_sha: None,
            git_branch: None,
            git_origin_url: None,
        }
    }

    /// Build canonical thread metadata, filling missing values from defaults.
    ///
    /// Timestamps are truncated to whole seconds; a missing `updated_at`
    /// falls back to `created_at`, and a missing provider falls back to
    /// `default_provider`. Title, token usage, and the user-event flag start
    /// at their zero values.
    pub fn build(&self, default_provider: &str) -> ThreadMetadata {
        let created_at = canonicalize_datetime(self.created_at);
        let updated_at = match self.updated_at {
            Some(ts) => canonicalize_datetime(ts),
            None => created_at,
        };
        let model_provider = self
            .model_provider
            .clone()
            .unwrap_or_else(|| default_provider.to_string());
        ThreadMetadata {
            id: self.id,
            rollout_path: self.rollout_path.clone(),
            created_at,
            updated_at,
            source: crate::extract::enum_to_string(&self.source),
            model_provider,
            cwd: self.cwd.clone(),
            title: String::new(),
            sandbox_policy: crate::extract::enum_to_string(&self.sandbox_policy),
            approval_mode: crate::extract::enum_to_string(&self.approval_mode),
            tokens_used: 0,
            has_user_event: false,
            archived_at: self.archived_at.map(canonicalize_datetime),
            git_sha: self.git_sha.clone(),
            git_branch: self.git_branch.clone(),
            git_origin_url: self.git_origin_url.clone(),
        }
    }
}
impl ThreadMetadata {
    /// Return the list of field names that differ between `self` and `other`.
    ///
    /// Names are reported in declaration order; an empty vector means the two
    /// records are identical.
    pub fn diff_fields(&self, other: &Self) -> Vec<&'static str> {
        let mut changed = Vec::new();
        // Compare each listed field and record its name on mismatch;
        // `stringify!` keeps the reported name in lockstep with the struct.
        macro_rules! compare {
            ($($field:ident),* $(,)?) => {
                $(
                    if self.$field != other.$field {
                        changed.push(stringify!($field));
                    }
                )*
            };
        }
        compare!(
            id,
            rollout_path,
            created_at,
            updated_at,
            source,
            model_provider,
            cwd,
            title,
            sandbox_policy,
            approval_mode,
            tokens_used,
            has_user_event,
            archived_at,
            git_sha,
            git_branch,
            git_origin_url,
        );
        changed
    }
}
/// Truncate a timestamp to whole-second precision.
///
/// Falls back to the input unchanged in the (practically unreachable) case
/// where zeroing the nanosecond field is rejected.
fn canonicalize_datetime(dt: DateTime<Utc>) -> DateTime<Utc> {
    match dt.with_nanosecond(0) {
        Some(truncated) => truncated,
        None => dt,
    }
}
/// Raw row shape for a thread record as read from sqlite.
///
/// Timestamps are stored as Unix epoch seconds; enums are stored as their
/// stringified names. Converted into [`ThreadMetadata`] via `TryFrom`.
#[derive(Debug)]
pub(crate) struct ThreadRow {
    // Thread id, as stored (parsed into `ThreadId` during conversion).
    id: String,
    // Rollout path, as stored.
    rollout_path: String,
    // Creation time, epoch seconds.
    created_at: i64,
    // Last-update time, epoch seconds.
    updated_at: i64,
    // Stringified session source.
    source: String,
    // Model provider identifier.
    model_provider: String,
    // Working directory, as stored.
    cwd: String,
    // Best-effort title.
    title: String,
    // Stringified sandbox policy.
    sandbox_policy: String,
    // Stringified approval mode.
    approval_mode: String,
    // Last observed token usage.
    tokens_used: i64,
    // Whether a user message was observed.
    has_user_event: bool,
    // Archive time, epoch seconds, if archived.
    archived_at: Option<i64>,
    // Git commit SHA, if known.
    git_sha: Option<String>,
    // Git branch, if known.
    git_branch: Option<String>,
    // Git origin URL, if known.
    git_origin_url: Option<String>,
}
impl ThreadRow {
    /// Decode one sqlite result row into a `ThreadRow`.
    ///
    /// Errors if any expected column is missing or its value cannot be
    /// decoded to the field's type.
    pub(crate) fn try_from_row(row: &SqliteRow) -> Result<Self> {
        Ok(Self {
            id: row.try_get("id")?,
            rollout_path: row.try_get("rollout_path")?,
            created_at: row.try_get("created_at")?,
            updated_at: row.try_get("updated_at")?,
            source: row.try_get("source")?,
            model_provider: row.try_get("model_provider")?,
            cwd: row.try_get("cwd")?,
            title: row.try_get("title")?,
            sandbox_policy: row.try_get("sandbox_policy")?,
            approval_mode: row.try_get("approval_mode")?,
            tokens_used: row.try_get("tokens_used")?,
            has_user_event: row.try_get("has_user_event")?,
            archived_at: row.try_get("archived_at")?,
            git_sha: row.try_get("git_sha")?,
            git_branch: row.try_get("git_branch")?,
            git_origin_url: row.try_get("git_origin_url")?,
        })
    }
}
impl TryFrom<ThreadRow> for ThreadMetadata {
    type Error = anyhow::Error;

    /// Convert a raw database row into canonical metadata.
    ///
    /// Parses the stored thread id and decodes the epoch-second timestamps;
    /// fails if either is invalid.
    fn try_from(row: ThreadRow) -> std::result::Result<Self, Self::Error> {
        Ok(Self {
            id: ThreadId::try_from(row.id)?,
            rollout_path: PathBuf::from(row.rollout_path),
            created_at: epoch_seconds_to_datetime(row.created_at)?,
            updated_at: epoch_seconds_to_datetime(row.updated_at)?,
            source: row.source,
            model_provider: row.model_provider,
            cwd: PathBuf::from(row.cwd),
            title: row.title,
            sandbox_policy: row.sandbox_policy,
            approval_mode: row.approval_mode,
            tokens_used: row.tokens_used,
            has_user_event: row.has_user_event,
            archived_at: row.archived_at.map(epoch_seconds_to_datetime).transpose()?,
            git_sha: row.git_sha,
            git_branch: row.git_branch,
            git_origin_url: row.git_origin_url,
        })
    }
}
/// Build a keyset-pagination anchor from a metadata item.
///
/// The chosen sort key selects which timestamp becomes the anchor's `ts`.
/// Returns `None` when the thread id does not round-trip as a UUID.
pub(crate) fn anchor_from_item(item: &ThreadMetadata, sort_key: SortKey) -> Option<Anchor> {
    let ts = match sort_key {
        SortKey::CreatedAt => item.created_at,
        SortKey::UpdatedAt => item.updated_at,
    };
    let id = Uuid::parse_str(&item.id.to_string()).ok()?;
    Some(Anchor { ts, id })
}
/// Convert a timestamp to Unix epoch seconds (sub-second precision is dropped).
pub(crate) fn datetime_to_epoch_seconds(dt: DateTime<Utc>) -> i64 {
    dt.timestamp()
}
/// Convert Unix epoch seconds back into a UTC timestamp.
///
/// Errors when `secs` is outside the representable range.
pub(crate) fn epoch_seconds_to_datetime(secs: i64) -> Result<DateTime<Utc>> {
    match DateTime::<Utc>::from_timestamp(secs, 0) {
        Some(dt) => Ok(dt),
        None => Err(anyhow::anyhow!("invalid unix timestamp: {secs}")),
    }
}
/// Statistics about a backfill operation.
///
/// Derives `PartialEq`/`Eq` for consistency with the other result types in
/// this module (e.g. [`ThreadsPage`], [`ExtractionOutcome`]) and to make the
/// stats directly assertable in tests; `Default` gives a zeroed starting
/// point for accumulating counters.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct BackfillStats {
    /// The number of rollout files scanned.
    pub scanned: usize,
    /// The number of rows upserted successfully.
    pub upserted: usize,
    /// The number of rows that failed to upsert.
    pub failed: usize,
}