diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index 0f611cfbcc..b94ae4fcf3 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -96,6 +96,7 @@ mod log_path; pub mod scan; pub mod schema; pub mod snapshot; +pub mod statistics; pub mod table_changes; pub mod table_configuration; pub mod table_features; diff --git a/kernel/src/statistics.rs b/kernel/src/statistics.rs new file mode 100644 index 0000000000..c7fc2fe11a --- /dev/null +++ b/kernel/src/statistics.rs @@ -0,0 +1,48 @@ +//! Statistics types for Delta tables +//! +//! This module contains types for representing file statistics in both JSON and parsed formats. +//! Statistics are used for data skipping during query execution and are stored in checkpoint files. + +use std::collections::HashMap; + +use crate::expressions::Scalar; + +/// Parsed statistics for a file (alternative to JSON stats string) +/// +/// This represents the structured form of file statistics that can be stored +/// directly in checkpoints as `stats_parsed` instead of as a JSON string. +/// +/// The minValues, maxValues, and nullCount fields contain dynamic structs +/// whose schema matches the table's data columns (using physical column names). +/// +/// # Schema +/// +/// The stats_parsed schema in checkpoints is: +/// ```text +/// struct< +/// numRecords: long, +/// minValues: struct, // Dynamic per table +/// maxValues: struct, // Dynamic per table +/// nullCount: struct, // Dynamic per table +/// tightBounds: boolean +/// > +/// ``` +#[derive(Debug, Clone, PartialEq)] +pub struct StatsParsed { + /// Number of records in the file + pub num_records: Option, + + /// Minimum values per column (physical column names) + /// Dynamic based on table schema + pub min_values: Option>, + + /// Maximum values per column (physical column names) + /// Dynamic based on table schema + pub max_values: Option>, + + /// Null count per column (physical column names) + pub null_count: Option>, + + /// Whether statistics are exact (tight bounds) + pub tight_bounds: Option, +}