supabase
diff --git a/etl-postgres/src/replication/schema.rs b/etl-postgres/src/replication/schema.rs
Lines changed: 118 additions & 23 deletions
diff --git a/etl-postgres/src/types/schema.rs b/etl-postgres/src/types/schema.rs
Lines changed: 29 additions & 4 deletions
diff --git a/etl/migrations/20251205000000_schema_versioning.sql b/etl/migrations/20251205000000_schema_versioning.sql
Lines changed: 19 additions & 0 deletions
diff --git a/etl/src/conversions/event.rs b/etl/src/conversions/event.rs
Lines changed: 10 additions & 5 deletions
@@ -4,7 +4,7 @@ use sqlx::{PgExecutor, PgPool, Row};
 use std::collections::HashMap;
 use tokio_postgres::types::Type as PgType;
 
-use crate::types::{ColumnSchema, TableId, TableName, TableSchema};
+use crate::types::{ColumnSchema, SnapshotId, TableId, TableName, TableSchema};
 
 macro_rules! define_type_mappings {
     (
@@ -134,44 +134,35 @@ define_type_mappings! {
     DATE_RANGE => "DATE_RANGE"
 }
 
-/// Stores a table schema in the database.
+/// Stores a table schema in the database with a specific snapshot ID.
 ///
-/// Inserts or updates table schema and column information in schema storage tables
-/// using a transaction to ensure atomicity.
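+/// Inserts a new table schema version and its column information into the schema
+/// storage tables, using a transaction to ensure atomicity. Unlike the previous upsert,
+/// every call inserts a fresh row, so each `snapshot_id` becomes its own schema version.
+///
+/// NOTE(review): the insert has no `on conflict` clause, so storing the same
+/// (pipeline_id, table_id, snapshot_id) twice is expected to fail on the unique
+/// constraint added by the schema versioning migration; confirm callers never re-store
+/// an already persisted version (e.g. on retry or restart).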
 pub async fn store_table_schema(
     pool: &PgPool,
     pipeline_id: i64,
     table_schema: &TableSchema,
 ) -> Result<(), sqlx::Error> {
     let mut tx = pool.begin().await?;
 
-    // Insert or update table schema record
+    // Insert new table schema version
     let table_schema_id: i64 = sqlx::query(
         r#"
-        insert into etl.table_schemas (pipeline_id, table_id, schema_name, table_name)
-        values ($1, $2, $3, $4)
-        on conflict (pipeline_id, table_id)
-        do update set 
-            schema_name = excluded.schema_name,
-            table_name = excluded.table_name,
-            updated_at = now()
+        insert into etl.table_schemas (pipeline_id, table_id, schema_name, table_name, snapshot_id)
+        values ($1, $2, $3, $4, $5)
         returning id
         "#,
     )
     .bind(pipeline_id)
     .bind(table_schema.id.into_inner() as i64)
     .bind(&table_schema.name.schema)
     .bind(&table_schema.name.name)
+    .bind(table_schema.snapshot_id)
     .fetch_one(&mut *tx)
     .await?
     .get(0);
 
-    // Delete existing columns for this table schema to handle schema changes
-    sqlx::query("delete from etl.table_columns where table_schema_id = $1")
-        .bind(table_schema_id)
-        .execute(&mut *tx)
-        .await?;
-
     // Insert all columns
     for column_schema in table_schema.column_schemas.iter() {
         sqlx::query(
@@ -199,20 +190,34 @@ pub async fn store_table_schema(
     Ok(())
 }
 
-/// Loads all table schemas for a pipeline from the database.
+/// Loads all table schemas for a pipeline from the database at the latest snapshot.
 ///
 /// Retrieves table schemas and columns from schema storage tables,
-/// reconstructing complete [`TableSchema`] objects.
+/// reconstructing complete [`TableSchema`] objects. Delegates to
+/// [`load_table_schemas_at_snapshot`] with `i64::MAX`, the largest possible [`SnapshotId`].
 pub async fn load_table_schemas(
     pool: &PgPool,
     pipeline_id: i64,
 ) -> Result<Vec<TableSchema>, sqlx::Error> {
+    load_table_schemas_at_snapshot(pool, pipeline_id, i64::MAX).await
+}
+
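+/// Loads a single table schema with the largest snapshot_id <= the requested snapshot.
+///
+/// Returns `None` if no schema version exists for the table at or before the given snapshot.
+/// Because the query inner joins `etl.table_columns`, a stored version without any column
+/// rows also yields `None`.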
+pub async fn load_table_schema_at_snapshot(
+    pool: &PgPool,
+    pipeline_id: i64,
+    table_id: TableId,
+    snapshot_id: SnapshotId,
+) -> Result<Option<TableSchema>, sqlx::Error> {
     let rows = sqlx::query(
         r#"
         select
             ts.table_id,
             ts.schema_name,
             ts.table_name,
+            ts.snapshot_id,
             tc.column_name,
             tc.column_type,
             tc.type_modifier,
@@ -222,11 +227,95 @@ pub async fn load_table_schemas(
             tc.primary_key_ordinal_position
         from etl.table_schemas ts
         inner join etl.table_columns tc on ts.id = tc.table_schema_id
-        where ts.pipeline_id = $1
-        order by ts.table_id, tc.column_order
+        where ts.id = (
+            select id from etl.table_schemas
+            where pipeline_id = $1 and table_id = $2 and snapshot_id <= $3
+            order by snapshot_id desc
+            limit 1
+        )
+        order by tc.column_order
+        "#,
+    )
+    .bind(pipeline_id)
+    .bind(SqlxTableId(table_id.into_inner()))
+    .bind(snapshot_id)
+    .fetch_all(pool)
+    .await?;
+
+    if rows.is_empty() {
+        return Ok(None);
+    }
+
+    let first_row = &rows[0];
+    let table_oid: SqlxTableId = first_row.get("table_id");
+    let table_id = TableId::new(table_oid.0);
+    let schema_name: String = first_row.get("schema_name");
+    let table_name: String = first_row.get("table_name");
+    let snapshot_id: SnapshotId = first_row.get("snapshot_id");
+
+    let mut table_schema = TableSchema::with_snapshot_id(
+        table_id,
+        TableName::new(schema_name, table_name),
+        vec![],
+        snapshot_id,
+    );
+
+    for row in rows {
+        table_schema.add_column_schema(parse_column_schema(&row));
+    }
+
+    Ok(Some(table_schema))
+}
+
+/// Loads all table schemas for a pipeline at a specific snapshot point.
+///
+/// For each table, retrieves the schema version with the largest snapshot_id
+/// that is <= the requested snapshot_id. Tables without any schema version
+/// at or before the snapshot are excluded from the result, as are versions
+/// that have no rows in `etl.table_columns` (the query uses an inner join).
+pub async fn load_table_schemas_at_snapshot(
+    pool: &PgPool,
+    pipeline_id: i64,
+    snapshot_id: SnapshotId,
+) -> Result<Vec<TableSchema>, sqlx::Error> {
+    // Use a window function to find the latest schema version for each table
+    // at or before the requested snapshot
+    let rows = sqlx::query(
+        r#"
+        with latest_schemas as (
+            select
+                ts.id,
+                ts.table_id,
+                ts.schema_name,
+                ts.table_name,
+                ts.snapshot_id,
+                row_number() over (
+                    partition by ts.table_id
+                    order by ts.snapshot_id desc
+                ) as rn
+            from etl.table_schemas ts
+            where ts.pipeline_id = $1
+              and ts.snapshot_id <= $2
+        )
+        select
+            ls.table_id,
+            ls.schema_name,
+            ls.table_name,
+            ls.snapshot_id,
+            tc.column_name,
+            tc.column_type,
+            tc.type_modifier,
+            tc.nullable,
+            tc.primary_key,
+            tc.column_order,
+            tc.primary_key_ordinal_position
+        from latest_schemas ls
+        inner join etl.table_columns tc on ls.id = tc.table_schema_id
+        where ls.rn = 1
+        order by ls.table_id, tc.column_order
         "#,
     )
     .bind(pipeline_id)
+    .bind(snapshot_id)
     .fetch_all(pool)
     .await?;
 
@@ -237,9 +326,15 @@ pub async fn load_table_schemas(
         let table_id = TableId::new(table_oid.0);
         let schema_name: String = row.get("schema_name");
         let table_name: String = row.get("table_name");
+        let row_snapshot_id: SnapshotId = row.get("snapshot_id");
 
         let entry = table_schemas.entry(table_id).or_insert_with(|| {
-            TableSchema::new(table_id, TableName::new(schema_name, table_name), vec![])
+            TableSchema::with_snapshot_id(
+                table_id,
+                TableName::new(schema_name, table_name),
+                vec![],
+                row_snapshot_id,
+            )
         });
 
         entry.add_column_schema(parse_column_schema(&row));
 
@@ -17,6 +17,15 @@ pub enum SchemaError {
 /// An object identifier in Postgres.
 type Oid = u32;
 
+/// Snapshot identifier for schema versioning.
+///
+/// Holds the start_lsn of the DDL message that created a schema version, as a signed `i64`.
+/// A value of 0 ([`INITIAL_SNAPSHOT_ID`]) marks the initial schema before any DDL changes.
+pub type SnapshotId = i64;
+
+/// The snapshot ID (0) assigned to the first schema version of a table.
+pub const INITIAL_SNAPSHOT_ID: SnapshotId = 0;
+
 /// A fully qualified Postgres table name consisting of a schema and table name.
 ///
 /// This type represents a table identifier in Postgres, which requires both a schema name
@@ -189,23 +198,39 @@ impl ToSql for TableId {
 /// Represents the complete schema of a Postgres table.
 ///
 /// This type contains all metadata about a table including its name, OID,
-/// and the schemas of all its columns.
+/// the schemas of all its columns, and a snapshot identifier for versioning.
 #[derive(Debug, Clone, Eq, PartialEq)]
 pub struct TableSchema {
-    /// The Postgres OID of the table
+    /// The Postgres OID of the table.
     pub id: TableId,
-    /// The fully qualified name of the table
+    /// The fully qualified name of the table.
     pub name: TableName,
-    /// The schemas of all columns in the table
+    /// The schemas of all columns in the table.
     pub column_schemas: Vec<ColumnSchema>,
+    /// The snapshot identifier for this schema version.
+    ///
+    /// A value of 0 marks the initial schema; any other value is the start_lsn of a DDL change.
+    pub snapshot_id: SnapshotId,
 }
 
 impl TableSchema {
+    /// Creates a new [`TableSchema`] tagged with [`INITIAL_SNAPSHOT_ID`] (0).
     pub fn new(id: TableId, name: TableName, column_schemas: Vec<ColumnSchema>) -> Self {
+        Self::with_snapshot_id(id, name, column_schemas, INITIAL_SNAPSHOT_ID)
+    }
+
+    /// Creates a new [`TableSchema`] whose version is tagged with the given `snapshot_id`.
+    pub fn with_snapshot_id(
+        id: TableId,
+        name: TableName,
+        column_schemas: Vec<ColumnSchema>,
+        snapshot_id: SnapshotId,
+    ) -> Self {
         Self {
             id,
             name,
             column_schemas,
+            snapshot_id,
         }
     }
 
 
@@ -0,0 +1,19 @@
+-- Add snapshot_id column to table_schemas for schema versioning.
+-- The snapshot_id value is the start_lsn of the DDL message that created this schema version.
+-- Initial schemas use snapshot_id=0.
+
+ALTER TABLE etl.table_schemas
+    ADD COLUMN IF NOT EXISTS snapshot_id BIGINT NOT NULL DEFAULT 0;
+
+-- Change unique constraint from (pipeline_id, table_id) to (pipeline_id, table_id, snapshot_id)
+-- to allow multiple schema versions per table.
+ALTER TABLE etl.table_schemas
+    DROP CONSTRAINT IF EXISTS table_schemas_pipeline_id_table_id_key;
+
+ALTER TABLE etl.table_schemas
+    ADD CONSTRAINT table_schemas_pipeline_id_table_id_snapshot_id_key
+    UNIQUE (pipeline_id, table_id, snapshot_id);
+
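+-- Index for efficient "find largest snapshot_id <= X" queries.
+-- NOTE(review): the UNIQUE constraint above already creates a B-tree index on
+-- (pipeline_id, table_id, snapshot_id), and Postgres can scan B-tree indexes backward,
+-- so this DESC index may be redundant; confirm with EXPLAIN before keeping both.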
+CREATE INDEX IF NOT EXISTS idx_table_schemas_pipeline_table_snapshot_id
+    ON etl.table_schemas (pipeline_id, table_id, snapshot_id DESC);
@@ -2,7 +2,8 @@ use core::str;
 use std::collections::HashSet;
 
 use etl_postgres::types::{
-    ColumnSchema, ReplicatedTableSchema, TableId, TableName, TableSchema, convert_type_oid_to_type,
+    ColumnSchema, ReplicatedTableSchema, SnapshotId, TableId, TableName, TableSchema,
+    convert_type_oid_to_type,
 };
 use postgres_replication::protocol;
 use serde::Deserialize;
@@ -325,11 +326,14 @@ pub fn parse_ddl_schema_change_message(content: &str) -> EtlResult<DdlSchemaChan
     })
 }
 
-/// Converts a [`DdlSchemaChangeMessage`] to a [`TableSchema`].
+/// Converts a [`DdlSchemaChangeMessage`] to a [`TableSchema`] with a specific snapshot ID.
 ///
 /// This is used to update the stored table schema when a DDL change is detected.
-#[allow(dead_code)]
-pub fn ddl_message_to_table_schema(message: &DdlSchemaChangeMessage) -> TableSchema {
+/// The snapshot_id should be the start_lsn of the DDL message.
+pub fn ddl_message_to_table_schema(
+    message: &DdlSchemaChangeMessage,
+    snapshot_id: SnapshotId,
+) -> TableSchema {
     let table_name = TableName::new(message.schema_name.clone(), message.table_name.clone());
     let column_schemas = message
         .columns
@@ -347,9 +351,10 @@ pub fn ddl_message_to_table_schema(message: &DdlSchemaChangeMessage) -> TableSch
         })
         .collect();
 
-    TableSchema::new(
+    TableSchema::with_snapshot_id(
         TableId::new(message.table_id as u32),
         table_name,
         column_schemas,
+        snapshot_id,
     )
 }