From c8d5c388c12f0e1d19099be78a6f6b320e554ce1 Mon Sep 17 00:00:00 2001 From: "Giau. Tran Minh" Date: Thu, 21 May 2026 14:10:34 +0000 Subject: [PATCH] doc/md: blob storage document --- doc/md/blob-storage.md | 303 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 303 insertions(+) create mode 100644 doc/md/blob-storage.md diff --git a/doc/md/blob-storage.md b/doc/md/blob-storage.md new file mode 100644 index 000000000..1e7538c32 --- /dev/null +++ b/doc/md/blob-storage.md @@ -0,0 +1,303 @@ +--- +id: blob-storage +title: Blob Storage +--- + +## Overview + +Ent's `field.Blob` type stores field data in external blob storage (e.g., S3, GCS, local filesystem) +rather than in the database. The database only stores a **key** (a short string) referencing the blob. +This keeps database rows small while supporting arbitrarily large binary or text payloads. + +## Quick Start + +### 1. Define a schema with blob fields + +```go +package schema + +import ( + "crypto" + + "entgo.io/ent" + "entgo.io/ent/schema/field" +) + +type Document struct { + ent.Schema +} + +func (Document) Fields() []ent.Field { + return []ent.Field{ + field.String("name").Unique(), + field.Blob("content").HashKey(crypto.SHA256), + field.Blob("avatar").Optional(), + } +} +``` + +### 2. Configure blob storage + +Each entity type with blob fields requires a `BlobOpener` — a function that opens a +blob bucket for a given field name: + +```go +import ( + "context" + + "myapp/ent" + "myapp/ent/document" + + _ "gocloud.dev/blob/s3blob" // or fileblob, gcsblob, etc. +) + +client := ent.NewClient( + ent.Driver(drv), + ent.WithBlobOpeners(ent.BlobOpeners{ + Document: func(ctx context.Context, field string) (ent.Blob, error) { + switch field { + case document.FieldContent: + return blob.OpenBucket(ctx, "s3://my-bucket/content") + case document.FieldAvatar: + return blob.OpenBucket(ctx, "s3://my-bucket/avatars") + default: + return nil, fmt.Errorf("unknown blob field: %s", field) + } + }, + }), +) +``` + +### 3. 
Use the generated API + +```go +// Create — pass []byte directly. +doc := client.Document.Create(). + SetName("readme"). + SetContent([]byte("Hello, World!")). + SaveX(ctx) + +// Read — the struct field holds the loaded value. +fmt.Println(string(doc.Content)) // "Hello, World!" + +// Update +doc = doc.Update(). + SetContent([]byte("Updated content")). + SaveX(ctx) +``` + +## `field.Blob` API + +| Method | Description | +|--------|-------------| +| `Optional()` | Field is nullable; not required on create. | +| `Nillable()` | Struct field is a pointer (distinguishes zero value from unset). | +| `Immutable()` | Field can only be set on create, not updated. | +| `Lazy()` | Mutation accepts `io.Reader`; struct field omitted; use Reader method to read. | +| `HashKey(h)` | Content-addressable key via hash (default: `crypto.SHA256`). | +| `UUIDKey()` | Random UUID v7 key per write. | +| `DualWrite(...)` | Migration mode: write to both blob storage and database column. | +| `GoType(typ)` | Override the default `[]byte` Go type. | +| `ValueScanner(vs)` | Custom codec between Go type and raw bytes (required for non-`[]byte`/`string` GoType). | +| `StorageKey(key)` | Override the database column name. | +| `StructTag(s)` | Set the struct tag on the generated field. | +| `Comment(c)` | Set the field comment. | +| `Annotations(...)` | Attach codegen annotations. | +| `Deprecated(...)` | Mark the field as deprecated. | + +## Key Strategies + +Every blob field requires a **key function** that determines the storage key for a given +piece of data. By default (when neither `UUIDKey` nor `HashKey` is called), blobs use +SHA-256 content hashing. + +### HashKey (default) + +Content-addressable storage: the data is hashed to produce the key. Identical content +always maps to the same key, enabling deduplication and write-skip optimizations on update. 
+ +```go +field.Blob("content").HashKey(crypto.SHA256) +``` + +On update, if the new content produces the same hash as the existing key, the write to +blob storage is skipped entirely. + +### UUIDKey + +Each write generates a new random UUID (v7) as the storage key. This guarantees uniqueness +but does not deduplicate. + +```go +field.Blob("content").UUIDKey() +``` + +## Lazy Fields + +By default, blob data is automatically loaded into the entity struct field when the row is +scanned from the database. For large blobs where you don't always need the data in memory, +use `Lazy()`: + +```go +field.Blob("content").Lazy() +``` + +With `Lazy()`: +- The **mutation builder** accepts an `io.Reader` (which is fully buffered before writing). +- The entity **struct field is omitted** — data is not loaded on scan. +- A **Reader method** (e.g., `ContentReader`) is generated to explicitly open a reader from storage. + +```go +// Create with io.Reader. +doc := client.Document.Create(). + SetName("large-file"). + SetContent(bytes.NewReader(largeData)). + SaveX(ctx) + +// Read explicitly via the Reader method. +rc, err := doc.ContentReader(ctx) +if err != nil { ... } +defer rc.Close() +data, _ := io.ReadAll(rc) +``` + +## DualWrite (Migration Mode) + +`DualWrite()` preserves the original bytes column alongside the blob key column. This is +useful when migrating an existing column to blob storage: + +```go +field.Blob("payload"). + DualWrite(map[string]string{ + dialect.MySQL: "json", + dialect.Postgres: "jsonb", + dialect.SQLite: "json", + }) +``` + +In DualWrite mode: +- **Writes** go to both blob storage and the database column. +- **Reads** prefer blob storage (if a key exists) and fall back to the database column. + +The optional `columnType` argument overrides the database column type per dialect to prevent +schema drift when migrating from an existing column definition. + +## Custom GoType + +Blob fields default to `[]byte`. 
You can override the Go type: + +```go +// String type (automatic conversion). +field.Blob("description").GoType("") + +// Custom struct with a ValueScanner. +field.Blob("config"). + GoType(&MyConfig{}). + ValueScanner(configScanner{}) +``` + +When using a custom `GoType` other than `string`, a `ValueScanner` must be provided +to encode/decode between the Go type and the raw bytes stored in blob storage. + +## Blob Interface + +Any blob storage backend must implement the `ent.Blob` interface: + +```go +type Blob interface { + NewReader(ctx context.Context, key string) (io.ReadCloser, error) + NewWriter(ctx context.Context, key string) (io.WriteCloser, error) + Delete(ctx context.Context, key string) error + Close() error +} +``` + +- `NewReader` should return `fs.ErrNotExist` (or a wrapping error) when the key does not exist. +- `Delete` should return `nil` (not an error) when the key does not exist. + +The [Go CDK `blob` package](https://gocloud.dev/howto/blob/) provides implementations +for S3, GCS, Azure, and local filesystem that satisfy this interface via a thin adapter. + +## Lifecycle and Cleanup + +### Create + +Blobs are written **before** the database row is inserted. If the SQL INSERT fails +(e.g., constraint violation), the generated code automatically deletes the just-written blobs. + +### Update + +On update: +1. New blob data is written to storage. +2. The SQL UPDATE executes. +3. On success, old (orphaned) blobs are deleted. +4. On SQL failure, newly-written blobs are rolled back (deleted). + +When using `HashKey`, if the content hasn't changed (same hash), the write is skipped entirely. + +### Delete + +Generated delete builders query existing blob keys before deleting the row, then remove +the blobs from storage after a successful SQL DELETE. + +## OnConflict (Upsert) + +Blob fields work with `OnConflict` / upsert builders. Blobs are written to storage +**before** the SQL executes. If the INSERT succeeds (no conflict), the new key is stored. 
+If there is a conflict, behavior depends on the conflict action and key strategy. + +The following per-field methods are generated on the upsert builder: + +- **`Update()`** — Sets the blob key column (and data column for DualWrite fields) to + the value provided on create. +- **`Clear()`** — Nulls the blob key column (and data column for DualWrite). Only + generated for optional fields. **Note:** this does not delete the old blob from storage — + the previously-referenced blob becomes orphaned. Use a regular Update with `Clear()` + if you need storage cleanup. + +```go +client.Document.Create(). + SetName("readme"). + SetContent([]byte("data")). + SetAvatar([]byte("file")). + OnConflict(). + UpdateContent(). // update only the content blob + UpdateAvatar(). // update only the avatar blob + ExecX(ctx) +``` + +You can also use `UpdateNewValues()` to update all fields (including all blob key columns) +at once. + +### With HashKey (content-addressable) + +Since the key is derived from content, identical data always produces the same key. +This makes all conflict actions safe when the upserted content is identical to the content the existing row already references (if it differs, the orphaning caveats described for `UUIDKey` below apply): + +- **`Update()`** — The SQL updates the key column to the new key. Since the content + is the same, the key is the same, so it's effectively a no-op in storage. +- **`Ignore()` / `DoNothing()`** — The SQL does nothing. The blob written to storage is + identical to what already exists at that key (same content = same key), so no orphan is created. + +```go +client.Document.Create(). + SetName("readme"). + SetContent([]byte("data")). + OnConflict(). + UpdateContent(). + ExecX(ctx) +``` + +### With UUIDKey (random keys) + +Each write generates a new unique key. This means the blob written before the SQL is always +at a **new** key that didn't previously exist: + +- **`Update()`** — The SQL updates the key column to the new UUID key. 
The old blob + at the previous key becomes orphaned and is **not** automatically cleaned up (OnConflict + does not query old keys like a regular Update does). +- **`Ignore()` / `DoNothing()`** — The SQL does nothing; the row keeps its existing key. + The newly-written blob is orphaned in storage with no database reference. + +For these reasons, `UUIDKey` is **not recommended** with `OnConflict`. If you need upsert +semantics, prefer `HashKey`, which is inherently idempotent.