From aab78d6c897214033c3cc951d780576304910291 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 4 Nov 2025 08:38:39 +0100 Subject: [PATCH 01/39] Add REE file column helpers --- crates/iceberg/src/arrow/reader.rs | 78 ++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index e0894ad6b4..905d6a3dae 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -492,6 +492,84 @@ impl ArrowReader { Ok(results.into()) } + /// Helper function to add a `_file` column to a RecordBatch. + /// Takes the array and field to add, reducing code duplication. + fn create_file_field( + batch: RecordBatch, + file_array: ArrayRef, + file_field: Field, + field_id: i32, + ) -> Result { + let mut columns = batch.columns().to_vec(); + columns.push(file_array); + + let mut fields: Vec<_> = batch.schema().fields().iter().cloned().collect(); + fields.push(Arc::new(file_field.with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + field_id.to_string(), + )])))); + + let schema = Arc::new(ArrowSchema::new(fields)); + RecordBatch::try_new(schema, columns).map_err(|e| { + Error::new( + ErrorKind::Unexpected, + "Failed to add _file column to RecordBatch", + ) + .with_source(e) + }) + } + + /// Adds a `_file` column to the RecordBatch containing the file path. + /// Uses Run-End Encoding (REE) for maximum memory efficiency when the same + /// file path is repeated across all rows. + /// Note: This is only used in tests for now, for production usage we use the + /// non-REE version as it is Julia-compatible. + #[allow(dead_code)] + pub(crate) fn add_file_path_column_ree( + batch: RecordBatch, + file_path: &str, + field_name: &str, + field_id: i32, + ) -> Result { + let num_rows = batch.num_rows(); + + // Use Run-End Encoded array for optimal memory efficiency + // For a constant value repeated num_rows times, this stores: + // - run_ends: [num_rows] (one i32) for non-empty batches, or [] for empty batches + // - values: [file_path] (one string) for non-empty batches, or [] for empty batches + let run_ends = if num_rows == 0 { + Int32Array::from(Vec::::new()) + } else { + Int32Array::from(vec![num_rows as i32]) + }; + let values = if num_rows == 0 { + StringArray::from(Vec::<&str>::new()) + } else { + StringArray::from(vec![file_path]) + }; + + let file_array = RunArray::try_new(&run_ends, &values).map_err(|e| { + Error::new( + ErrorKind::Unexpected, + "Failed to create RunArray for _file column", + ) + .with_source(e) + })?; + + // Per Iceberg spec, the _file column has reserved field ID RESERVED_FIELD_ID_FILE + // DataType is RunEndEncoded with Int32 run ends and Utf8 values + // Note: values field is nullable to match what RunArray::try_new(..) expects. 
+ let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); + let values_field = Arc::new(Field::new("values", DataType::Utf8, true)); + let file_field = Field::new( + field_name, + DataType::RunEndEncoded(run_ends_field, values_field), + false, + ); + + Self::create_file_field(batch, Arc::new(file_array), file_field, field_id) + } + fn build_field_id_set_and_map( parquet_schema: &SchemaDescriptor, predicate: &BoundPredicate, From ee21cab449becfdd0f32d1ef1ae23318b6f13e6a Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 4 Nov 2025 08:40:04 +0100 Subject: [PATCH 02/39] Add helper tests --- crates/iceberg/src/arrow/reader.rs | 164 +++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 905d6a3dae..c5cd28f2f5 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -2370,6 +2370,170 @@ message schema { assert!(col_b.is_null(2)); } + #[test] + fn test_add_file_path_column_ree() { + use arrow_array::{Array, Int32Array, RecordBatch, StringArray}; + use arrow_schema::{DataType, Field, Schema}; + + // Create a simple test batch with 2 columns and 3 rows + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + ])); + + let id_array = Int32Array::from(vec![1, 2, 3]); + let name_array = StringArray::from(vec!["Alice", "Bob", "Charlie"]); + + let batch = RecordBatch::try_new(schema.clone(), vec![ + Arc::new(id_array), + Arc::new(name_array), + ]) + .unwrap(); + + assert_eq!(batch.num_columns(), 2); + assert_eq!(batch.num_rows(), 3); + + // Add file path column with REE + let file_path = "/path/to/data/file.parquet"; + let result = ArrowReader::add_file_path_column_ree( + batch, + file_path, + RESERVED_COL_NAME_FILE, + RESERVED_FIELD_ID_FILE, + ); + assert!(result.is_ok(), "Should successfully add file path column"); + + let new_batch = result.unwrap(); + + // Verify the new batch has 3 columns + assert_eq!(new_batch.num_columns(), 3); + assert_eq!(new_batch.num_rows(), 3); + + // Verify schema has the _file column + let schema = new_batch.schema(); + assert_eq!(schema.fields().len(), 3); + + let file_field = schema.field(2); + assert_eq!(file_field.name(), RESERVED_COL_NAME_FILE); + assert!(!file_field.is_nullable()); + + // Verify the field has the correct metadata + let metadata = file_field.metadata(); + assert_eq!( + metadata.get(PARQUET_FIELD_ID_META_KEY), + Some(&RESERVED_FIELD_ID_FILE.to_string()) + ); + + // Verify the data type is RunEndEncoded + match file_field.data_type() { + DataType::RunEndEncoded(run_ends_field, values_field) => { + assert_eq!(run_ends_field.name(), "run_ends"); + assert_eq!(run_ends_field.data_type(), &DataType::Int32); + assert!(!run_ends_field.is_nullable()); + + assert_eq!(values_field.name(), "values"); + assert_eq!(values_field.data_type(), &DataType::Utf8); + } + _ => panic!("Expected RunEndEncoded data type for _file column"), + } + + // Verify the original columns are intact + let id_col = new_batch + .column(0) + .as_primitive::(); + assert_eq!(id_col.values(), &[1, 2, 3]); + + let name_col = new_batch.column(1).as_string::(); + assert_eq!(name_col.value(0), "Alice"); + assert_eq!(name_col.value(1), "Bob"); + assert_eq!(name_col.value(2), "Charlie"); + + // Verify the file path column contains the correct value + // The _file column is a RunArray, so we need to decode it + let file_col = new_batch.column(2); + let run_array = 
file_col + .as_any() + .downcast_ref::>() + .expect("Expected RunArray for _file column"); + + // Verify the run array structure (should be optimally encoded) + let run_ends = run_array.run_ends(); + assert_eq!(run_ends.values().len(), 1, "Should have only 1 run end"); + assert_eq!( + run_ends.values()[0], + new_batch.num_rows() as i32, + "Run end should equal number of rows" + ); + + // Check that the single value in the RunArray is the expected file path + let values = run_array.values(); + let string_values = values.as_string::(); + assert_eq!(string_values.len(), 1, "Should have only 1 value"); + assert_eq!(string_values.value(0), file_path); + + assert!( + string_values + .downcast_ref::() + .unwrap() + .iter() + .all(|v| v == Some(file_path)) + ) + } + + #[test] + fn test_add_file_path_column_ree_empty_batch() { + use arrow_array::RecordBatch; + use arrow_schema::{DataType, Field, Schema}; + use parquet::arrow::PARQUET_FIELD_ID_META_KEY; + + // Create an empty batch + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + + let id_array = arrow_array::Int32Array::from(Vec::::new()); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(id_array)]).unwrap(); + + assert_eq!(batch.num_rows(), 0); + + // Add file path column to empty batch with REE + let file_path = "/empty/file.parquet"; + let result = ArrowReader::add_file_path_column_ree( + batch, + file_path, + RESERVED_COL_NAME_FILE, + RESERVED_FIELD_ID_FILE, + ); + + // Should succeed with empty RunArray for empty batches + assert!(result.is_ok()); + let new_batch = result.unwrap(); + assert_eq!(new_batch.num_rows(), 0); + assert_eq!(new_batch.num_columns(), 2); + + // Verify the _file column exists with correct schema + let schema = new_batch.schema(); + let file_field = schema.field(1); + assert_eq!(file_field.name(), RESERVED_COL_NAME_FILE); + + // Should use RunEndEncoded even for empty batches + match file_field.data_type() { + DataType::RunEndEncoded(run_ends_field, values_field) => { + assert_eq!(run_ends_field.data_type(), &DataType::Int32); + assert_eq!(values_field.data_type(), &DataType::Utf8); + } + _ => panic!("Expected RunEndEncoded data type for _file column"), + } + + // Verify metadata with reserved field ID + assert_eq!( + file_field.metadata().get(PARQUET_FIELD_ID_META_KEY), + Some(&RESERVED_FIELD_ID_FILE.to_string()) + ); + + // Verify the file path column is empty but properly structured + let file_path_column = new_batch.column(1); + assert_eq!(file_path_column.len(), 0); + } + /// Test for bug where position deletes in later row groups are not applied correctly. /// /// When a file has multiple row groups and a position delete targets a row in a later From 37b52e2707a0e4717d9143ce323f66efde7cda91 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 4 Nov 2025 08:41:20 +0100 Subject: [PATCH 03/39] Add constants --- crates/iceberg/src/arrow/reader.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index c5cd28f2f5..4c0179b373 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -59,6 +59,14 @@ use crate::spec::{Datum, NestedField, PrimitiveType, Schema, Type}; use crate::utils::available_parallelism; use crate::{Error, ErrorKind}; +/// Column name for the file path metadata column per Iceberg spec +/// This is dead code for now but will be used when we add the _file column support. 
+#[allow(dead_code)] +pub(crate) const RESERVED_COL_NAME_FILE: &str = "_file"; + +/// Reserved field ID for the file path column used in delete file reading. +pub(crate) const RESERVED_FIELD_ID_FILE_PATH: i32 = 2147483546; + /// Builder to create ArrowReader pub struct ArrowReaderBuilder { batch_size: Option, From 44463a0ce1de8809cbb94a06f3707650d4d3e1ee Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 4 Nov 2025 10:05:55 +0100 Subject: [PATCH 04/39] Add support for _file constant --- crates/iceberg/src/arrow/delete_filter.rs | 2 + crates/iceberg/src/arrow/reader.rs | 114 ++++++--- crates/iceberg/src/scan/context.rs | 9 + crates/iceberg/src/scan/mod.rs | 281 +++++++++++++++++++++- crates/iceberg/src/scan/task.rs | 5 + 5 files changed, 366 insertions(+), 45 deletions(-) diff --git a/crates/iceberg/src/arrow/delete_filter.rs b/crates/iceberg/src/arrow/delete_filter.rs index b853baa993..01b71cd4cd 100644 --- a/crates/iceberg/src/arrow/delete_filter.rs +++ b/crates/iceberg/src/arrow/delete_filter.rs @@ -338,6 +338,7 @@ pub(crate) mod tests { schema: data_file_schema.clone(), project_field_ids: vec![], predicate: None, + file_column_position: None, deletes: vec![pos_del_1, pos_del_2.clone()], }, FileScanTask { @@ -349,6 +350,7 @@ pub(crate) mod tests { schema: data_file_schema.clone(), project_field_ids: vec![], predicate: None, + file_column_position: None, deletes: vec![pos_del_3], }, ]; diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 4c0179b373..6eedb37960 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -23,11 +23,14 @@ use std::str::FromStr; use std::sync::Arc; use arrow_arith::boolean::{and, and_kleene, is_not_null, is_null, not, or, or_kleene}; -use arrow_array::{Array, ArrayRef, BooleanArray, Datum as ArrowDatum, RecordBatch, Scalar}; +use arrow_array::{ + Array, ArrayRef, BooleanArray, Datum as ArrowDatum, Int32Array, RecordBatch, RunArray, Scalar, + StringArray, +}; use arrow_cast::cast::cast; use arrow_ord::cmp::{eq, gt, gt_eq, lt, lt_eq, neq}; use arrow_schema::{ - ArrowError, DataType, FieldRef, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, + ArrowError, DataType, Field, FieldRef, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, }; use arrow_string::like::starts_with; use bytes::Bytes; @@ -59,14 +62,12 @@ use crate::spec::{Datum, NestedField, PrimitiveType, Schema, Type}; use crate::utils::available_parallelism; use crate::{Error, ErrorKind}; +/// Reserved field ID for the file path (_file) column per Iceberg spec +pub(crate) const RESERVED_FIELD_ID_FILE: i32 = 2147483646; + /// Column name for the file path metadata column per Iceberg spec -/// This is dead code for now but will be used when we add the _file column support. -#[allow(dead_code)] pub(crate) const RESERVED_COL_NAME_FILE: &str = "_file"; -/// Reserved field ID for the file path column used in delete file reading. -pub(crate) const RESERVED_FIELD_ID_FILE_PATH: i32 = 2147483546; - /// Builder to create ArrowReader pub struct ArrowReaderBuilder { batch_size: Option, @@ -344,13 +345,33 @@ impl ArrowReader { record_batch_stream_builder.with_row_groups(selected_row_group_indices); } + // Get the _file column position from the task (if requested) + let file_column_position = task.file_column_position; + let data_file_path = task.data_file_path.clone(); + // Build the batch stream and send all the RecordBatches that it generates // to the requester. let record_batch_stream = record_batch_stream_builder .build()? 
.map(move |batch| match batch { - Ok(batch) => record_batch_transformer.process_record_batch(batch), + Ok(batch) => { + let processed_batch = + record_batch_transformer.process_record_batch(batch)?; + + // Add the _file column if requested at the correct position + if let Some(position) = file_column_position { + Self::add_file_path_column_ree_at_position( + processed_batch, + &data_file_path, + RESERVED_COL_NAME_FILE, + RESERVED_FIELD_ID_FILE, + position, + ) + } else { + Ok(processed_batch) + } + } Err(err) => Err(err.into()), }); @@ -500,22 +521,34 @@ impl ArrowReader { Ok(results.into()) } - /// Helper function to add a `_file` column to a RecordBatch. - /// Takes the array and field to add, reducing code duplication. - fn create_file_field( + /// Helper function to add a `_file` column to a RecordBatch at a specific position. + /// Takes the array, field to add, and position where to insert. + fn create_file_field_at_position( batch: RecordBatch, file_array: ArrayRef, file_field: Field, field_id: i32, + position: usize, ) -> Result { - let mut columns = batch.columns().to_vec(); - columns.push(file_array); - - let mut fields: Vec<_> = batch.schema().fields().iter().cloned().collect(); - fields.push(Arc::new(file_field.with_metadata(HashMap::from([( + let file_field_with_metadata = Arc::new(file_field.with_metadata(HashMap::from([( PARQUET_FIELD_ID_META_KEY.to_string(), field_id.to_string(), - )])))); + )]))); + + // Build columns vector in a single pass without insert + let original_columns = batch.columns(); + let mut columns = Vec::with_capacity(original_columns.len() + 1); + columns.extend_from_slice(&original_columns[..position]); + columns.push(file_array); + columns.extend_from_slice(&original_columns[position..]); + + // Build fields vector in a single pass without insert + let schema = batch.schema(); + let original_fields = schema.fields(); + let mut fields = Vec::with_capacity(original_fields.len() + 1); + fields.extend(original_fields[..position].iter().cloned()); + fields.push(file_field_with_metadata); + fields.extend(original_fields[position..].iter().cloned()); let schema = Arc::new(ArrowSchema::new(fields)); RecordBatch::try_new(schema, columns).map_err(|e| { @@ -527,24 +560,18 @@ impl ArrowReader { }) } - /// Adds a `_file` column to the RecordBatch containing the file path. - /// Uses Run-End Encoding (REE) for maximum memory efficiency when the same - /// file path is repeated across all rows. - /// Note: This is only used in tests for now, for production usage we use the - /// non-REE version as it is Julia-compatible. - #[allow(dead_code)] - pub(crate) fn add_file_path_column_ree( + /// Adds a `_file` column to the RecordBatch at a specific position. + /// Uses Run-End Encoding (REE) for maximum memory efficiency. 
+ pub(crate) fn add_file_path_column_ree_at_position( batch: RecordBatch, file_path: &str, field_name: &str, field_id: i32, + position: usize, ) -> Result { let num_rows = batch.num_rows(); // Use Run-End Encoded array for optimal memory efficiency - // For a constant value repeated num_rows times, this stores: - // - run_ends: [num_rows] (one i32) for non-empty batches, or [] for empty batches - // - values: [file_path] (one string) for non-empty batches, or [] for empty batches let run_ends = if num_rows == 0 { Int32Array::from(Vec::::new()) } else { @@ -564,9 +591,6 @@ impl ArrowReader { .with_source(e) })?; - // Per Iceberg spec, the _file column has reserved field ID RESERVED_FIELD_ID_FILE - // DataType is RunEndEncoded with Int32 run ends and Utf8 values - // Note: values field is nullable to match what RunArray::try_new(..) expects. let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); let values_field = Arc::new(Field::new("values", DataType::Utf8, true)); let file_field = Field::new( @@ -575,7 +599,13 @@ impl ArrowReader { false, ); - Self::create_file_field(batch, Arc::new(file_array), file_field, field_id) + Self::create_file_field_at_position( + batch, + Arc::new(file_array), + file_field, + field_id, + position, + ) } fn build_field_id_set_and_map( @@ -1573,6 +1603,7 @@ mod tests { use arrow_array::cast::AsArray; use arrow_array::{ArrayRef, LargeStringArray, RecordBatch, StringArray}; use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; + use as_any::Downcast; use futures::TryStreamExt; use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; use parquet::arrow::{ArrowWriter, ProjectionMask}; @@ -1586,7 +1617,9 @@ mod tests { use crate::ErrorKind; use crate::arrow::reader::{CollectFieldIdVisitor, PARQUET_FIELD_ID_META_KEY}; - use crate::arrow::{ArrowReader, ArrowReaderBuilder}; + use crate::arrow::{ + ArrowReader, ArrowReaderBuilder, RESERVED_COL_NAME_FILE, RESERVED_FIELD_ID_FILE, + }; use crate::delete_vector::DeleteVector; use crate::expr::visitors::bound_predicate_visitor::visit; use crate::expr::{Bind, Predicate, Reference}; @@ -1887,6 +1920,7 @@ message schema { schema: schema.clone(), project_field_ids: vec![1], predicate: Some(predicate.bind(schema, true).unwrap()), + file_column_position: None, deletes: vec![], })] .into_iter(), @@ -2205,6 +2239,7 @@ message schema { schema: schema.clone(), project_field_ids: vec![1], predicate: None, + file_column_position: None, deletes: vec![], }; @@ -2218,6 +2253,7 @@ message schema { schema: schema.clone(), project_field_ids: vec![1], predicate: None, + file_column_position: None, deletes: vec![], }; @@ -2342,6 +2378,7 @@ message schema { schema: new_schema.clone(), project_field_ids: vec![1, 2], // Request both columns 'a' and 'b' predicate: None, + file_column_position: None, deletes: vec![], })] .into_iter(), @@ -2401,13 +2438,14 @@ message schema { assert_eq!(batch.num_columns(), 2); assert_eq!(batch.num_rows(), 3); - // Add file path column with REE + // Add file path column with REE at the end (position 2) let file_path = "/path/to/data/file.parquet"; - let result = ArrowReader::add_file_path_column_ree( + let result = ArrowReader::add_file_path_column_ree_at_position( batch, file_path, RESERVED_COL_NAME_FILE, RESERVED_FIELD_ID_FILE, + 2, // Position at the end after id and name columns ); assert!(result.is_ok(), "Should successfully add file path column"); @@ -2502,13 +2540,14 @@ message schema { assert_eq!(batch.num_rows(), 0); - // Add file path column to empty batch with 
REE + // Add file path column to empty batch with REE at position 1 (after id column) let file_path = "/empty/file.parquet"; - let result = ArrowReader::add_file_path_column_ree( + let result = ArrowReader::add_file_path_column_ree_at_position( batch, file_path, RESERVED_COL_NAME_FILE, RESERVED_FIELD_ID_FILE, + 1, // Position 1, after the id column ); // Should succeed with empty RunArray for empty batches @@ -2669,6 +2708,7 @@ message schema { schema: table_schema.clone(), project_field_ids: vec![1], predicate: None, + file_column_position: None, deletes: vec![FileScanTaskDeleteFile { file_path: delete_file_path, file_type: DataContentType::PositionDeletes, @@ -2884,6 +2924,7 @@ message schema { schema: table_schema.clone(), project_field_ids: vec![1], predicate: None, + file_column_position: None, deletes: vec![FileScanTaskDeleteFile { file_path: delete_file_path, file_type: DataContentType::PositionDeletes, @@ -3092,6 +3133,7 @@ message schema { schema: table_schema.clone(), project_field_ids: vec![1], predicate: None, + file_column_position: None, deletes: vec![FileScanTaskDeleteFile { file_path: delete_file_path, file_type: DataContentType::PositionDeletes, diff --git a/crates/iceberg/src/scan/context.rs b/crates/iceberg/src/scan/context.rs index 3f7c29dbf4..48652890dc 100644 --- a/crates/iceberg/src/scan/context.rs +++ b/crates/iceberg/src/scan/context.rs @@ -46,6 +46,7 @@ pub(crate) struct ManifestFileContext { snapshot_schema: SchemaRef, expression_evaluator_cache: Arc, delete_file_index: DeleteFileIndex, + file_column_position: Option, } /// Wraps a [`ManifestEntryRef`] alongside the objects that are needed @@ -59,6 +60,7 @@ pub(crate) struct ManifestEntryContext { pub partition_spec_id: i32, pub snapshot_schema: SchemaRef, pub delete_file_index: DeleteFileIndex, + pub file_column_position: Option, } impl ManifestFileContext { @@ -74,6 +76,7 @@ impl ManifestFileContext { mut sender, expression_evaluator_cache, delete_file_index, + file_column_position, .. 
} = self; @@ -89,6 +92,7 @@ impl ManifestFileContext { bound_predicates: bound_predicates.clone(), snapshot_schema: snapshot_schema.clone(), delete_file_index: delete_file_index.clone(), + file_column_position, }; sender @@ -127,6 +131,8 @@ impl ManifestEntryContext { .bound_predicates .map(|x| x.as_ref().snapshot_bound_predicate.clone()), + file_column_position: self.file_column_position, + deletes, }) } @@ -149,6 +155,8 @@ pub(crate) struct PlanContext { pub partition_filter_cache: Arc, pub manifest_evaluator_cache: Arc, pub expression_evaluator_cache: Arc, + + pub file_column_position: Option, } impl PlanContext { @@ -260,6 +268,7 @@ impl PlanContext { field_ids: self.field_ids.clone(), expression_evaluator_cache: self.expression_evaluator_cache.clone(), delete_file_index, + file_column_position: self.file_column_position, } } } diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 6884e00b9b..05f1c81e9d 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -31,7 +31,7 @@ use futures::stream::BoxStream; use futures::{SinkExt, StreamExt, TryStreamExt}; pub use task::*; -use crate::arrow::ArrowReaderBuilder; +use crate::arrow::{ArrowReaderBuilder, RESERVED_COL_NAME_FILE as RESERVED_COL_NAME_FILE_INTERNAL}; use crate::delete_file_index::DeleteFileIndex; use crate::expr::visitors::inclusive_metrics_evaluator::InclusiveMetricsEvaluator; use crate::expr::{Bind, BoundPredicate, Predicate}; @@ -42,6 +42,27 @@ use crate::table::Table; use crate::utils::available_parallelism; use crate::{Error, ErrorKind, Result}; +/// Reserved column name for the file path metadata column. +/// +/// When this column is selected in a table scan (e.g., `.select(["col1", RESERVED_COL_NAME_FILE])`), +/// each row will include the path of the file from which that row was read. +/// This is useful for debugging, auditing, or tracking data lineage. +/// +/// # Example +/// ```no_run +/// # use iceberg::scan::RESERVED_COL_NAME_FILE; +/// # async fn example() -> iceberg::Result<()> { +/// # let table = todo!(); +/// // Select regular columns along with the file path +/// let scan = table +/// .scan() +/// .select(["id", "name", RESERVED_COL_NAME_FILE]) +/// .build()?; +/// # Ok(()) +/// # } +/// ``` +pub const RESERVED_COL_NAME_FILE: &str = RESERVED_COL_NAME_FILE_INTERNAL; + /// A stream of arrow [`RecordBatch`]es. pub type ArrowRecordBatchStream = BoxStream<'static, Result>; @@ -217,9 +238,13 @@ impl<'a> TableScanBuilder<'a> { let schema = snapshot.schema(self.table.metadata())?; - // Check that all column names exist in the schema. + // Check that all column names exist in the schema (skip reserved columns). 
if let Some(column_names) = self.column_names.as_ref() { for column_name in column_names { + // Skip reserved columns that don't exist in the schema + if column_name == RESERVED_COL_NAME_FILE_INTERNAL { + continue; + } if schema.field_by_name(column_name).is_none() { return Err(Error::new( ErrorKind::DataInvalid, @@ -239,7 +264,16 @@ impl<'a> TableScanBuilder<'a> { .collect() }); - for column_name in column_names.iter() { + // Track the position of the _file column if requested + let mut file_column_position = None; + + for (index, column_name) in column_names.iter().enumerate() { + // Handle special reserved column "_file" + if column_name == RESERVED_COL_NAME_FILE_INTERNAL { + file_column_position = Some(index); + continue; // Don't add to field_ids - it's a virtual column + } + let field_id = schema.field_id_by_name(column_name).ok_or_else(|| { Error::new( ErrorKind::DataInvalid, @@ -254,10 +288,10 @@ impl<'a> TableScanBuilder<'a> { Error::new( ErrorKind::FeatureUnsupported, format!( - "Column {column_name} is not a direct child of schema but a nested field, which is not supported now. Schema: {schema}" - ), - ) - })?; + "Column {column_name} is not a direct child of schema but a nested field, which is not supported now. Schema: {schema}" + ), + ) + })?; field_ids.push(field_id); } @@ -280,6 +314,7 @@ impl<'a> TableScanBuilder<'a> { partition_filter_cache: Arc::new(PartitionFilterCache::new()), manifest_evaluator_cache: Arc::new(ManifestEvaluatorCache::new()), expression_evaluator_cache: Arc::new(ExpressionEvaluatorCache::new()), + file_column_position, }; Ok(TableScan { @@ -559,8 +594,10 @@ pub mod tests { use std::fs::File; use std::sync::Arc; + use arrow_array::cast::AsArray; use arrow_array::{ - ArrayRef, BooleanArray, Float64Array, Int32Array, Int64Array, RecordBatch, StringArray, + Array, ArrayRef, BooleanArray, Float64Array, Int32Array, Int64Array, RecordBatch, + StringArray, }; use futures::{TryStreamExt, stream}; use minijinja::value::Value; @@ -575,7 +612,7 @@ pub mod tests { use crate::arrow::ArrowReaderBuilder; use crate::expr::{BoundPredicate, Reference}; use crate::io::{FileIO, OutputFile}; - use crate::scan::FileScanTask; + use crate::scan::{FileScanTask, RESERVED_COL_NAME_FILE}; use crate::spec::{ DataContentType, DataFileBuilder, DataFileFormat, Datum, Literal, ManifestEntry, ManifestListWriter, ManifestStatus, ManifestWriterBuilder, NestedField, PartitionSpec, @@ -1776,6 +1813,7 @@ pub mod tests { schema: schema.clone(), record_count: Some(100), data_file_format: DataFileFormat::Parquet, + file_column_position: None, deletes: vec![], }; test_fn(task); @@ -1790,8 +1828,233 @@ pub mod tests { schema, record_count: None, data_file_format: DataFileFormat::Avro, + file_column_position: None, deletes: vec![], }; test_fn(task); } + + #[tokio::test] + async fn test_select_with_file_column() { + use arrow_array::cast::AsArray; + + let mut fixture = TableTestFixture::new(); + fixture.setup_manifest_files().await; + + // Select regular columns plus the _file column + let table_scan = fixture + .table + .scan() + .select(["x", RESERVED_COL_NAME_FILE]) + .with_row_selection_enabled(true) + .build() + .unwrap(); + + let batch_stream = table_scan.to_arrow().await.unwrap(); + let batches: Vec<_> = batch_stream.try_collect().await.unwrap(); + + // Verify we have 2 columns: x and _file + assert_eq!(batches[0].num_columns(), 2); + + // Verify the x column exists and has correct data + let x_col = batches[0].column_by_name("x").unwrap(); + let x_arr = x_col.as_primitive::(); + 
assert_eq!(x_arr.value(0), 1); + + // Verify the _file column exists + let file_col = batches[0].column_by_name(RESERVED_COL_NAME_FILE); + assert!( + file_col.is_some(), + "_file column should be present in the batch" + ); + + // Verify the _file column contains a file path + let file_col = file_col.unwrap(); + assert!( + matches!( + file_col.data_type(), + arrow_schema::DataType::RunEndEncoded(_, _) + ), + "_file column should use RunEndEncoded type" + ); + + // Decode the RunArray to verify it contains the file path + let run_array = file_col + .as_any() + .downcast_ref::>() + .expect("_file column should be a RunArray"); + + let values = run_array.values(); + let string_values = values.as_string::(); + assert_eq!(string_values.len(), 1, "Should have a single file path"); + + let file_path = string_values.value(0); + assert!( + file_path.ends_with(".parquet"), + "File path should end with .parquet, got: {}", + file_path + ); + } + + #[tokio::test] + async fn test_select_file_column_position() { + let mut fixture = TableTestFixture::new(); + fixture.setup_manifest_files().await; + + // Select columns in specific order: x, _file, z + let table_scan = fixture + .table + .scan() + .select(["x", RESERVED_COL_NAME_FILE, "z"]) + .with_row_selection_enabled(true) + .build() + .unwrap(); + + let batch_stream = table_scan.to_arrow().await.unwrap(); + let batches: Vec<_> = batch_stream.try_collect().await.unwrap(); + + assert_eq!(batches[0].num_columns(), 3); + + // Verify column order: x at position 0, _file at position 1, z at position 2 + let schema = batches[0].schema(); + assert_eq!(schema.field(0).name(), "x"); + assert_eq!(schema.field(1).name(), RESERVED_COL_NAME_FILE); + assert_eq!(schema.field(2).name(), "z"); + + // Verify columns by name also works + assert!(batches[0].column_by_name("x").is_some()); + assert!(batches[0].column_by_name(RESERVED_COL_NAME_FILE).is_some()); + assert!(batches[0].column_by_name("z").is_some()); + } + + #[tokio::test] + async fn test_select_file_column_only() { + let mut fixture = TableTestFixture::new(); + fixture.setup_manifest_files().await; + + // Select only the _file column + let table_scan = fixture + .table + .scan() + .select([RESERVED_COL_NAME_FILE]) + .with_row_selection_enabled(true) + .build() + .unwrap(); + + let batch_stream = table_scan.to_arrow().await.unwrap(); + let batches: Vec<_> = batch_stream.try_collect().await.unwrap(); + + // Should have exactly 1 column + assert_eq!(batches[0].num_columns(), 1); + + // Verify it's the _file column + let schema = batches[0].schema(); + assert_eq!(schema.field(0).name(), RESERVED_COL_NAME_FILE); + + // Verify the batch has the correct number of rows + // The scan reads files 1.parquet and 3.parquet (2.parquet is deleted) + // Each file has 1024 rows, so total is 2048 rows + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 2048); + } + + #[tokio::test] + async fn test_file_column_with_multiple_files() { + use std::collections::HashSet; + + let mut fixture = TableTestFixture::new(); + fixture.setup_manifest_files().await; + + // Select x and _file columns + let table_scan = fixture + .table + .scan() + .select(["x", RESERVED_COL_NAME_FILE]) + .with_row_selection_enabled(true) + .build() + .unwrap(); + + let batch_stream = table_scan.to_arrow().await.unwrap(); + let batches: Vec<_> = batch_stream.try_collect().await.unwrap(); + + // Collect all unique file paths from the batches + let mut file_paths = HashSet::new(); + for batch in &batches { + let file_col = 
batch.column_by_name(RESERVED_COL_NAME_FILE).unwrap(); + let run_array = file_col + .as_any() + .downcast_ref::>() + .expect("_file column should be a RunArray"); + + let values = run_array.values(); + let string_values = values.as_string::(); + for i in 0..string_values.len() { + file_paths.insert(string_values.value(i).to_string()); + } + } + + // We should have multiple files (the test creates 1.parquet and 3.parquet) + assert!(file_paths.len() >= 1, "Should have at least one file path"); + + // All paths should end with .parquet + for path in &file_paths { + assert!( + path.ends_with(".parquet"), + "All file paths should end with .parquet, got: {}", + path + ); + } + } + + #[tokio::test] + async fn test_file_column_at_start() { + let mut fixture = TableTestFixture::new(); + fixture.setup_manifest_files().await; + + // Select _file at the start + let table_scan = fixture + .table + .scan() + .select([RESERVED_COL_NAME_FILE, "x", "y"]) + .with_row_selection_enabled(true) + .build() + .unwrap(); + + let batch_stream = table_scan.to_arrow().await.unwrap(); + let batches: Vec<_> = batch_stream.try_collect().await.unwrap(); + + assert_eq!(batches[0].num_columns(), 3); + + // Verify _file is at position 0 + let schema = batches[0].schema(); + assert_eq!(schema.field(0).name(), RESERVED_COL_NAME_FILE); + assert_eq!(schema.field(1).name(), "x"); + assert_eq!(schema.field(2).name(), "y"); + } + + #[tokio::test] + async fn test_file_column_at_end() { + let mut fixture = TableTestFixture::new(); + fixture.setup_manifest_files().await; + + // Select _file at the end + let table_scan = fixture + .table + .scan() + .select(["x", "y", RESERVED_COL_NAME_FILE]) + .with_row_selection_enabled(true) + .build() + .unwrap(); + + let batch_stream = table_scan.to_arrow().await.unwrap(); + let batches: Vec<_> = batch_stream.try_collect().await.unwrap(); + + assert_eq!(batches[0].num_columns(), 3); + + // Verify _file is at position 2 (the end) + let schema = batches[0].schema(); + assert_eq!(schema.field(0).name(), "x"); + assert_eq!(schema.field(1).name(), "y"); + assert_eq!(schema.field(2).name(), RESERVED_COL_NAME_FILE); + } } diff --git a/crates/iceberg/src/scan/task.rs b/crates/iceberg/src/scan/task.rs index 32fe3ae309..2ead68ff39 100644 --- a/crates/iceberg/src/scan/task.rs +++ b/crates/iceberg/src/scan/task.rs @@ -52,6 +52,11 @@ pub struct FileScanTask { #[serde(skip_serializing_if = "Option::is_none")] pub predicate: Option, + /// The position of the _file column in the output, if requested. + /// None if the _file column was not requested. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub file_column_position: Option, + /// The list of delete files that may need to be applied to this data file pub deletes: Vec, } From e034009b0e65978e08c02c7e0599da1e826ca1bd Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 4 Nov 2025 10:29:59 +0100 Subject: [PATCH 05/39] Update tests --- crates/iceberg/src/arrow/reader.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index f93e368e9a..d1204d7a99 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -3385,6 +3385,7 @@ message schema { schema: schema.clone(), project_field_ids: vec![1, 2], predicate: None, + file_column_position: None, deletes: vec![], })] .into_iter(), @@ -3479,6 +3480,7 @@ message schema { schema: schema.clone(), project_field_ids: vec![1, 3], predicate: None, + file_column_position: None, deletes: vec![], })] .into_iter(), @@ -3562,6 +3564,7 @@ message schema { schema: schema.clone(), project_field_ids: vec![1, 2, 3], predicate: None, + file_column_position: None, deletes: vec![], })] .into_iter(), @@ -3659,6 +3662,7 @@ message schema { schema: schema.clone(), project_field_ids: vec![1, 2], predicate: None, + file_column_position: None, deletes: vec![], })] .into_iter(), @@ -3785,6 +3789,7 @@ message schema { schema: schema.clone(), project_field_ids: vec![1, 2], predicate: None, + file_column_position: None, deletes: vec![], })] .into_iter(), @@ -3878,6 +3883,7 @@ message schema { schema: schema.clone(), project_field_ids: vec![1, 5, 2], predicate: None, + file_column_position: None, deletes: vec![], })] .into_iter(), @@ -3984,6 +3990,7 @@ message schema { schema: schema.clone(), project_field_ids: vec![1, 2, 3], predicate: Some(predicate.bind(schema, true).unwrap()), + file_column_position: None, deletes: vec![], })] .into_iter(), From 4f0a4f19ccc5830d0fc770b62b7cf3e40467fffe Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 4 Nov 2025 10:41:35 +0100 Subject: [PATCH 06/39] Fix clippy warning --- crates/iceberg/src/scan/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 05f1c81e9d..f41616cde7 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -1994,7 +1994,7 @@ pub mod tests { } // We should have multiple files (the test creates 1.parquet and 3.parquet) - assert!(file_paths.len() >= 1, "Should have at least one file path"); + assert!(!file_paths.is_empty(), "Should have at least one file path"); // All paths should end with .parquet for path in &file_paths { From 51f76d38e7b4c488f34e2254971708e0d8577bc5 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 4 Nov 2025 11:06:53 +0100 Subject: [PATCH 07/39] Fix doc test --- crates/iceberg/src/scan/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index f41616cde7..eae5b05d35 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -51,8 +51,9 @@ use crate::{Error, ErrorKind, Result}; /// # Example /// ```no_run /// # use iceberg::scan::RESERVED_COL_NAME_FILE; +/// # use iceberg::table::Table; /// # async fn example() -> iceberg::Result<()> { -/// # let table = todo!(); +/// # let table: Table = todo!(); /// // Select regular columns along with the file path /// let scan = table /// .scan() From d84e16b8d123963c8b274308002b4f9b0cc63415 Mon Sep 17 
00:00:00 2001 From: Gerald Berger Date: Tue, 4 Nov 2025 12:52:44 +0100 Subject: [PATCH 08/39] Track in field ids --- crates/iceberg/src/arrow/delete_filter.rs | 2 -- crates/iceberg/src/arrow/reader.rs | 37 ++++++++++++----------- crates/iceberg/src/scan/context.rs | 9 ------ crates/iceberg/src/scan/mod.rs | 17 +++++------ crates/iceberg/src/scan/task.rs | 5 --- 5 files changed, 26 insertions(+), 44 deletions(-) diff --git a/crates/iceberg/src/arrow/delete_filter.rs b/crates/iceberg/src/arrow/delete_filter.rs index 01b71cd4cd..b853baa993 100644 --- a/crates/iceberg/src/arrow/delete_filter.rs +++ b/crates/iceberg/src/arrow/delete_filter.rs @@ -338,7 +338,6 @@ pub(crate) mod tests { schema: data_file_schema.clone(), project_field_ids: vec![], predicate: None, - file_column_position: None, deletes: vec![pos_del_1, pos_del_2.clone()], }, FileScanTask { @@ -350,7 +349,6 @@ pub(crate) mod tests { schema: data_file_schema.clone(), project_field_ids: vec![], predicate: None, - file_column_position: None, deletes: vec![pos_del_3], }, ]; diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index d1204d7a99..ef7891066c 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -228,10 +228,26 @@ impl ArrowReader { initial_stream_builder }; + // Check if _file column is requested and filter it out for projection + let mut file_column_position = None; + let project_field_ids_without_virtual: Vec = task + .project_field_ids + .iter() + .enumerate() + .filter_map(|(idx, &field_id)| { + if field_id == RESERVED_FIELD_ID_FILE { + file_column_position = Some(idx); + None + } else { + Some(field_id) + } + }) + .collect(); + // Fallback IDs don't match Parquet's embedded field IDs (since they don't exist), // so we must use position-based projection instead of field-ID matching let projection_mask = Self::get_arrow_projection_mask( - &task.project_field_ids, + &project_field_ids_without_virtual, &task.schema, record_batch_stream_builder.parquet_schema(), record_batch_stream_builder.schema(), @@ -245,7 +261,7 @@ impl ArrowReader { // that come back from the file, such as type promotion, default column insertion // and column re-ordering let mut record_batch_transformer = - RecordBatchTransformer::build(task.schema_ref(), task.project_field_ids()); + RecordBatchTransformer::build(task.schema_ref(), &project_field_ids_without_virtual); if let Some(batch_size) = batch_size { record_batch_stream_builder = record_batch_stream_builder.with_batch_size(batch_size); @@ -377,8 +393,7 @@ impl ArrowReader { record_batch_stream_builder.with_row_groups(selected_row_group_indices); } - // Get the _file column position from the task (if requested) - let file_column_position = task.file_column_position; + // Clone data_file_path for use in the closure let data_file_path = task.data_file_path.clone(); // Build the batch stream and send all the RecordBatches that it generates @@ -2066,7 +2081,6 @@ message schema { schema: schema.clone(), project_field_ids: vec![1], predicate: Some(predicate.bind(schema, true).unwrap()), - file_column_position: None, deletes: vec![], })] .into_iter(), @@ -2385,7 +2399,6 @@ message schema { schema: schema.clone(), project_field_ids: vec![1], predicate: None, - file_column_position: None, deletes: vec![], }; @@ -2399,7 +2412,6 @@ message schema { schema: schema.clone(), project_field_ids: vec![1], predicate: None, - file_column_position: None, deletes: vec![], }; @@ -2524,7 +2536,6 @@ message schema { schema: new_schema.clone(), 
project_field_ids: vec![1, 2], // Request both columns 'a' and 'b' predicate: None, - file_column_position: None, deletes: vec![], })] .into_iter(), @@ -2854,7 +2865,6 @@ message schema { schema: table_schema.clone(), project_field_ids: vec![1], predicate: None, - file_column_position: None, deletes: vec![FileScanTaskDeleteFile { file_path: delete_file_path, file_type: DataContentType::PositionDeletes, @@ -3070,7 +3080,6 @@ message schema { schema: table_schema.clone(), project_field_ids: vec![1], predicate: None, - file_column_position: None, deletes: vec![FileScanTaskDeleteFile { file_path: delete_file_path, file_type: DataContentType::PositionDeletes, @@ -3279,7 +3288,6 @@ message schema { schema: table_schema.clone(), project_field_ids: vec![1], predicate: None, - file_column_position: None, deletes: vec![FileScanTaskDeleteFile { file_path: delete_file_path, file_type: DataContentType::PositionDeletes, @@ -3385,7 +3393,6 @@ message schema { schema: schema.clone(), project_field_ids: vec![1, 2], predicate: None, - file_column_position: None, deletes: vec![], })] .into_iter(), @@ -3480,7 +3487,6 @@ message schema { schema: schema.clone(), project_field_ids: vec![1, 3], predicate: None, - file_column_position: None, deletes: vec![], })] .into_iter(), @@ -3564,7 +3570,6 @@ message schema { schema: schema.clone(), project_field_ids: vec![1, 2, 3], predicate: None, - file_column_position: None, deletes: vec![], })] .into_iter(), @@ -3662,7 +3667,6 @@ message schema { schema: schema.clone(), project_field_ids: vec![1, 2], predicate: None, - file_column_position: None, deletes: vec![], })] .into_iter(), @@ -3789,7 +3793,6 @@ message schema { schema: schema.clone(), project_field_ids: vec![1, 2], predicate: None, - file_column_position: None, deletes: vec![], })] .into_iter(), @@ -3883,7 +3886,6 @@ message schema { schema: schema.clone(), project_field_ids: vec![1, 5, 2], predicate: None, - file_column_position: None, deletes: vec![], })] .into_iter(), @@ -3990,7 +3992,6 @@ message schema { schema: schema.clone(), project_field_ids: vec![1, 2, 3], predicate: Some(predicate.bind(schema, true).unwrap()), - file_column_position: None, deletes: vec![], })] .into_iter(), diff --git a/crates/iceberg/src/scan/context.rs b/crates/iceberg/src/scan/context.rs index 48652890dc..3f7c29dbf4 100644 --- a/crates/iceberg/src/scan/context.rs +++ b/crates/iceberg/src/scan/context.rs @@ -46,7 +46,6 @@ pub(crate) struct ManifestFileContext { snapshot_schema: SchemaRef, expression_evaluator_cache: Arc, delete_file_index: DeleteFileIndex, - file_column_position: Option, } /// Wraps a [`ManifestEntryRef`] alongside the objects that are needed @@ -60,7 +59,6 @@ pub(crate) struct ManifestEntryContext { pub partition_spec_id: i32, pub snapshot_schema: SchemaRef, pub delete_file_index: DeleteFileIndex, - pub file_column_position: Option, } impl ManifestFileContext { @@ -76,7 +74,6 @@ impl ManifestFileContext { mut sender, expression_evaluator_cache, delete_file_index, - file_column_position, .. 
} = self; @@ -92,7 +89,6 @@ impl ManifestFileContext { bound_predicates: bound_predicates.clone(), snapshot_schema: snapshot_schema.clone(), delete_file_index: delete_file_index.clone(), - file_column_position, }; sender @@ -131,8 +127,6 @@ impl ManifestEntryContext { .bound_predicates .map(|x| x.as_ref().snapshot_bound_predicate.clone()), - file_column_position: self.file_column_position, - deletes, }) } @@ -155,8 +149,6 @@ pub(crate) struct PlanContext { pub partition_filter_cache: Arc, pub manifest_evaluator_cache: Arc, pub expression_evaluator_cache: Arc, - - pub file_column_position: Option, } impl PlanContext { @@ -268,7 +260,6 @@ impl PlanContext { field_ids: self.field_ids.clone(), expression_evaluator_cache: self.expression_evaluator_cache.clone(), delete_file_index, - file_column_position: self.file_column_position, } } } diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index eae5b05d35..94e8c2514a 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -31,7 +31,10 @@ use futures::stream::BoxStream; use futures::{SinkExt, StreamExt, TryStreamExt}; pub use task::*; -use crate::arrow::{ArrowReaderBuilder, RESERVED_COL_NAME_FILE as RESERVED_COL_NAME_FILE_INTERNAL}; +use crate::arrow::{ + ArrowReaderBuilder, RESERVED_COL_NAME_FILE as RESERVED_COL_NAME_FILE_INTERNAL, + RESERVED_FIELD_ID_FILE, +}; use crate::delete_file_index::DeleteFileIndex; use crate::expr::visitors::inclusive_metrics_evaluator::InclusiveMetricsEvaluator; use crate::expr::{Bind, BoundPredicate, Predicate}; @@ -265,14 +268,11 @@ impl<'a> TableScanBuilder<'a> { .collect() }); - // Track the position of the _file column if requested - let mut file_column_position = None; - - for (index, column_name) in column_names.iter().enumerate() { + for column_name in column_names.iter() { // Handle special reserved column "_file" if column_name == RESERVED_COL_NAME_FILE_INTERNAL { - file_column_position = Some(index); - continue; // Don't add to field_ids - it's a virtual column + field_ids.push(RESERVED_FIELD_ID_FILE); + continue; } let field_id = schema.field_id_by_name(column_name).ok_or_else(|| { @@ -315,7 +315,6 @@ impl<'a> TableScanBuilder<'a> { partition_filter_cache: Arc::new(PartitionFilterCache::new()), manifest_evaluator_cache: Arc::new(ManifestEvaluatorCache::new()), expression_evaluator_cache: Arc::new(ExpressionEvaluatorCache::new()), - file_column_position, }; Ok(TableScan { @@ -1814,7 +1813,6 @@ pub mod tests { schema: schema.clone(), record_count: Some(100), data_file_format: DataFileFormat::Parquet, - file_column_position: None, deletes: vec![], }; test_fn(task); @@ -1829,7 +1827,6 @@ pub mod tests { schema, record_count: None, data_file_format: DataFileFormat::Avro, - file_column_position: None, deletes: vec![], }; test_fn(task); diff --git a/crates/iceberg/src/scan/task.rs b/crates/iceberg/src/scan/task.rs index 2ead68ff39..32fe3ae309 100644 --- a/crates/iceberg/src/scan/task.rs +++ b/crates/iceberg/src/scan/task.rs @@ -52,11 +52,6 @@ pub struct FileScanTask { #[serde(skip_serializing_if = "Option::is_none")] pub predicate: Option, - /// The position of the _file column in the output, if requested. - /// None if the _file column was not requested. 
- #[serde(skip_serializing_if = "Option::is_none")] - pub file_column_position: Option, - /// The list of delete files that may need to be applied to this data file pub deletes: Vec, } From bd478cb41e82a925d17f56cc3816108323a0ecd7 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 4 Nov 2025 15:32:17 +0100 Subject: [PATCH 09/39] Add test --- crates/iceberg/src/scan/mod.rs | 61 ++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 94e8c2514a..21fa1310a6 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -2055,4 +2055,65 @@ pub mod tests { assert_eq!(schema.field(1).name(), "y"); assert_eq!(schema.field(2).name(), RESERVED_COL_NAME_FILE); } + + #[tokio::test] + async fn test_select_called_twice_uses_last_selection() { + let mut fixture = TableTestFixture::new(); + fixture.setup_manifest_files().await; + + // Call select twice - first with x and y, then with only z and _file + // The second call should override the first + let table_scan = fixture + .table + .scan() + .select(["x", "y", RESERVED_COL_NAME_FILE]) // This should be ignored + .select(["z", RESERVED_COL_NAME_FILE]) // This should be used + .build() + .unwrap(); + + let batch_stream = table_scan.to_arrow().await.unwrap(); + let batches: Vec<_> = batch_stream.try_collect().await.unwrap(); + + // Verify we have exactly 2 columns: z and _file (NOT x or y) + assert_eq!( + batches[0].num_columns(), + 2, + "Should have exactly 2 columns from the last select" + ); + + let schema = batches[0].schema(); + + // Verify the columns are z and _file in that order + assert_eq!(schema.field(0).name(), "z", "First column should be z"); + assert_eq!( + schema.field(1).name(), + RESERVED_COL_NAME_FILE, + "Second column should be _file" + ); + + // Verify x and y are NOT present + assert!( + batches[0].column_by_name("x").is_none(), + "Column x should not be present (it was only in the first select)" + ); + assert!( + batches[0].column_by_name("y").is_none(), + "Column y should not be present (it was only in the first select)" + ); + + // Verify z column has data + let z_col = batches[0].column_by_name("z").unwrap(); + let z_arr = z_col.as_primitive::(); + assert!(z_arr.len() > 0, "Column z should have data"); + + // Verify _file column exists and is properly formatted + let file_col = batches[0].column_by_name(RESERVED_COL_NAME_FILE).unwrap(); + assert!( + matches!( + file_col.data_type(), + arrow_schema::DataType::RunEndEncoded(_, _) + ), + "_file column should use RunEndEncoded type" + ); + } } From 9b186c79e588adfd40c2e7f160bb1c2a06c0a7d9 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 4 Nov 2025 16:04:24 +0100 Subject: [PATCH 10/39] Allow repeated virtual file column selection --- crates/iceberg/src/arrow/reader.rs | 20 ++++---- crates/iceberg/src/scan/mod.rs | 80 ++++++++++++++++++++---------- 2 files changed, 66 insertions(+), 34 deletions(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index ef7891066c..8b65249715 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -229,14 +229,14 @@ impl ArrowReader { }; // Check if _file column is requested and filter it out for projection - let mut file_column_position = None; + let mut file_column_positions = Vec::new(); let project_field_ids_without_virtual: Vec = task .project_field_ids .iter() .enumerate() .filter_map(|(idx, &field_id)| { if field_id == RESERVED_FIELD_ID_FILE { - 
file_column_position = Some(idx); + file_column_positions.push(idx); None } else { Some(field_id) @@ -403,21 +403,23 @@ impl ArrowReader { .build()? .map(move |batch| match batch { Ok(batch) => { - let processed_batch = + let mut processed_batch = record_batch_transformer.process_record_batch(batch)?; - // Add the _file column if requested at the correct position - if let Some(position) = file_column_position { - Self::add_file_path_column_ree_at_position( + // Add the _file column at each requested position + // We insert them back at their original positions since we're reconstructing + // the original column order + for &position in &file_column_positions { + processed_batch = Self::add_file_path_column_ree_at_position( processed_batch, &data_file_path, RESERVED_COL_NAME_FILE, RESERVED_FIELD_ID_FILE, position, - ) - } else { - Ok(processed_batch) + )?; } + + Ok(processed_batch) } Err(err) => Err(err.into()), }); diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 21fa1310a6..740dab94d7 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -2057,63 +2057,93 @@ pub mod tests { } #[tokio::test] - async fn test_select_called_twice_uses_last_selection() { + async fn test_select_with_repeated_column_names() { let mut fixture = TableTestFixture::new(); fixture.setup_manifest_files().await; - // Call select twice - first with x and y, then with only z and _file - // The second call should override the first + // Select with repeated column names - both regular columns and virtual columns + // Repeated columns should appear multiple times in the result (duplicates are allowed) let table_scan = fixture .table .scan() - .select(["x", "y", RESERVED_COL_NAME_FILE]) // This should be ignored - .select(["z", RESERVED_COL_NAME_FILE]) // This should be used + .select([ + "x", + RESERVED_COL_NAME_FILE, + "x", // x repeated + "y", + RESERVED_COL_NAME_FILE, // _file repeated + "y", // y repeated + ]) + .with_row_selection_enabled(true) .build() .unwrap(); let batch_stream = table_scan.to_arrow().await.unwrap(); let batches: Vec<_> = batch_stream.try_collect().await.unwrap(); - // Verify we have exactly 2 columns: z and _file (NOT x or y) + // Verify we have exactly 6 columns (duplicates are allowed and preserved) assert_eq!( batches[0].num_columns(), - 2, - "Should have exactly 2 columns from the last select" + 6, + "Should have exactly 6 columns with duplicates" ); let schema = batches[0].schema(); - // Verify the columns are z and _file in that order - assert_eq!(schema.field(0).name(), "z", "First column should be z"); + // Verify columns appear in the exact order requested: x, _file, x, y, _file, y + assert_eq!(schema.field(0).name(), "x", "Column 0 should be x"); assert_eq!( schema.field(1).name(), RESERVED_COL_NAME_FILE, - "Second column should be _file" + "Column 1 should be _file" + ); + assert_eq!( + schema.field(2).name(), + "x", + "Column 2 should be x (duplicate)" + ); + assert_eq!(schema.field(3).name(), "y", "Column 3 should be y"); + assert_eq!( + schema.field(4).name(), + RESERVED_COL_NAME_FILE, + "Column 4 should be _file (duplicate)" + ); + assert_eq!( + schema.field(5).name(), + "y", + "Column 5 should be y (duplicate)" ); - // Verify x and y are NOT present + // Verify all columns have correct data types assert!( - batches[0].column_by_name("x").is_none(), - "Column x should not be present (it was only in the first select)" + matches!(schema.field(0).data_type(), arrow_schema::DataType::Int64), + "Column x should be Int64" ); 
assert!( - batches[0].column_by_name("y").is_none(), - "Column y should not be present (it was only in the first select)" + matches!(schema.field(2).data_type(), arrow_schema::DataType::Int64), + "Column x (duplicate) should be Int64" + ); + assert!( + matches!(schema.field(3).data_type(), arrow_schema::DataType::Int64), + "Column y should be Int64" + ); + assert!( + matches!(schema.field(5).data_type(), arrow_schema::DataType::Int64), + "Column y (duplicate) should be Int64" ); - - // Verify z column has data - let z_col = batches[0].column_by_name("z").unwrap(); - let z_arr = z_col.as_primitive::(); - assert!(z_arr.len() > 0, "Column z should have data"); - - // Verify _file column exists and is properly formatted - let file_col = batches[0].column_by_name(RESERVED_COL_NAME_FILE).unwrap(); assert!( matches!( - file_col.data_type(), + schema.field(1).data_type(), arrow_schema::DataType::RunEndEncoded(_, _) ), "_file column should use RunEndEncoded type" ); + assert!( + matches!( + schema.field(4).data_type(), + arrow_schema::DataType::RunEndEncoded(_, _) + ), + "_file column (duplicate) should use RunEndEncoded type" + ); } } From adf0da0022c97660725140dec114b5ca2bf873cf Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Fri, 7 Nov 2025 14:03:33 +0100 Subject: [PATCH 11/39] Refactor into own transformer step --- .../src/arrow/metadata_column_transformer.rs | 282 ++++++++++++++++++ crates/iceberg/src/arrow/mod.rs | 2 + crates/iceberg/src/arrow/reader.rs | 53 ++-- 3 files changed, 301 insertions(+), 36 deletions(-) create mode 100644 crates/iceberg/src/arrow/metadata_column_transformer.rs diff --git a/crates/iceberg/src/arrow/metadata_column_transformer.rs b/crates/iceberg/src/arrow/metadata_column_transformer.rs new file mode 100644 index 0000000000..f8a2f70d72 --- /dev/null +++ b/crates/iceberg/src/arrow/metadata_column_transformer.rs @@ -0,0 +1,282 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::HashMap; +use std::sync::Arc; + +use arrow_array::{Int32Array, RecordBatch, RunArray, StringArray}; +use arrow_schema::{DataType, Field}; +use parquet::arrow::PARQUET_FIELD_ID_META_KEY; + +use crate::{Error, ErrorKind, Result}; + +/// Represents different types of metadata column transformations that can be applied to a RecordBatch. +/// Each variant encapsulates the data and logic needed for a specific type of metadata column. +#[derive(Debug, Clone)] +pub(crate) enum MetadataColumnTransformation { + /// Adds a _file column with the file path using Run-End Encoding (REE) for memory efficiency. + /// The _file column stores the file path from which each row was read, with REE ensuring + /// that the same file path value is not repeated in memory for every row. 
+ FilePath { + file_path: String, + field_name: String, + field_id: i32, + }, + // Future metadata columns can be added here, e.g.: + // PartitionValue { partition_values: HashMap, ... }, + // RowNumber { start: u64, ... }, +} + +impl MetadataColumnTransformation { + /// Applies the transformation to a RecordBatch, adding the metadata column at the specified position. + /// + /// # Arguments + /// * `batch` - The input RecordBatch to transform + /// * `position` - The position at which to insert the metadata column + /// + /// # Returns + /// A new RecordBatch with the metadata column inserted at the given position + pub(crate) fn apply(&self, batch: RecordBatch, position: usize) -> Result { + match self { + Self::FilePath { + file_path, + field_name, + field_id, + } => Self::apply_file_path(batch, file_path, field_name, *field_id, position), + } + } + + /// Applies the file path transformation using Run-End Encoding. + fn apply_file_path( + batch: RecordBatch, + file_path: &str, + field_name: &str, + field_id: i32, + position: usize, + ) -> Result { + let num_rows = batch.num_rows(); + + // Use Run-End Encoded array for optimal memory efficiency + let run_ends = if num_rows == 0 { + Int32Array::from(Vec::::new()) + } else { + Int32Array::from(vec![num_rows as i32]) + }; + let values = if num_rows == 0 { + StringArray::from(Vec::<&str>::new()) + } else { + StringArray::from(vec![file_path]) + }; + + let file_array = RunArray::try_new(&run_ends, &values).map_err(|e| { + Error::new( + ErrorKind::Unexpected, + "Failed to create RunArray for _file column", + ) + .with_source(e) + })?; + + let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); + let values_field = Arc::new(Field::new("values", DataType::Utf8, true)); + let file_field = Field::new( + field_name, + DataType::RunEndEncoded(run_ends_field, values_field), + false, + ); + + Self::insert_column_at_position(batch, Arc::new(file_array), file_field, field_id, position) + } + + /// Inserts a column at the specified position in a RecordBatch. + fn insert_column_at_position( + batch: RecordBatch, + column_array: arrow_array::ArrayRef, + field: Field, + field_id: i32, + position: usize, + ) -> Result { + let field_with_metadata = Arc::new(field.with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + field_id.to_string(), + )]))); + + // Build columns vector in a single pass without insert + let original_columns = batch.columns(); + let mut columns = Vec::with_capacity(original_columns.len() + 1); + columns.extend_from_slice(&original_columns[..position]); + columns.push(column_array); + columns.extend_from_slice(&original_columns[position..]); + + // Build fields vector in a single pass without insert + let schema = batch.schema(); + let original_fields = schema.fields(); + let mut fields = Vec::with_capacity(original_fields.len() + 1); + fields.extend(original_fields[..position].iter().cloned()); + fields.push(field_with_metadata); + fields.extend(original_fields[position..].iter().cloned()); + + let schema = Arc::new(arrow_schema::Schema::new(fields)); + RecordBatch::try_new(schema, columns).map_err(|e| { + Error::new( + ErrorKind::Unexpected, + "Failed to add metadata column to RecordBatch", + ) + .with_source(e) + }) + } +} + +/// Composes multiple metadata column transformations. +/// +/// This allows us to apply multiple metadata columns in sequence, where each transformation +/// builds on the previous one. For example, adding both _file and partition value columns. 
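+///
+/// A sketch of the intended call pattern (the bindings are assumed to be in scope at the
+/// call site):
+///
+/// ```ignore
+/// let builder = MetadataTransformer::builder(projected_field_ids)
+///     .with_file_path(data_file_path);
+/// let parquet_field_ids = builder.project_field_ids_without_virtual();
+/// let transformer = builder.build();
+/// let batch_with_metadata = transformer.apply(batch)?;
+/// ```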
+pub(crate) struct MetadataTransformer { + transformations: Vec<(MetadataColumnTransformation, usize)>, +} + +impl MetadataTransformer { + /// Creates a new empty MetadataTransformer. + pub(crate) fn new() -> Self { + Self { + transformations: Vec::new(), + } + } + + /// Creates a builder for constructing a MetadataTransformer from projected field IDs. + pub(crate) fn builder(projected_field_ids: Vec) -> MetadataTransformerBuilder { + MetadataTransformerBuilder::new(projected_field_ids) + } + + /// Applies all registered transformations to the given RecordBatch. + /// + /// Transformations are applied in the order they were added. Each transformation + /// inserts a column at its specified position, so later transformations see the + /// updated batch with previously inserted columns. + pub(crate) fn apply(&self, mut batch: RecordBatch) -> Result { + for (transformation, position) in &self.transformations { + batch = transformation.apply(batch, *position)?; + } + Ok(batch) + } + + /// Returns true if there are any transformations to apply. + pub(crate) fn has_transformations(&self) -> bool { + !self.transformations.is_empty() + } +} + +impl Default for MetadataTransformer { + fn default() -> Self { + Self::new() + } +} + +/// Builder for constructing a MetadataTransformer from projected field IDs. +/// +/// This builder analyzes projected field IDs to identify metadata columns (reserved fields) +/// and builds the appropriate transformations. Reserved fields have special handling and +/// are inserted into the RecordBatch at their projected positions. +pub(crate) struct MetadataTransformerBuilder { + projected_field_ids: Vec, + file_path: Option, +} + +impl MetadataTransformerBuilder { + /// Creates a new MetadataTransformerBuilder for the given projected field IDs. + /// + /// # Arguments + /// * `projected_field_ids` - The list of field IDs being projected, including any reserved fields + pub(crate) fn new(projected_field_ids: Vec) -> Self { + Self { + projected_field_ids, + file_path: None, + } + } + + /// Sets the file path for the _file metadata column. + /// + /// # Arguments + /// * `file_path` - The file path to use for the _file column + /// + /// # Returns + /// Self for method chaining + pub(crate) fn with_file_path(mut self, file_path: String) -> Self { + self.file_path = Some(file_path); + self + } + + /// Builds the MetadataTransformer by analyzing projected field IDs and creating appropriate transformations. + /// + /// This method: + /// 1. Iterates through projected field IDs to find reserved fields + /// 2. Calculates the correct position for each reserved field in the final output + /// 3. Creates transformations for each reserved field found + pub(crate) fn build(self) -> MetadataTransformer { + let mut transformations = Vec::new(); + + // Iterate through the projected field IDs and check for reserved fields + for (position, field_id) in self.projected_field_ids.iter().enumerate() { + // Check if this is a reserved field ID for the _file column + if *field_id == RESERVED_FIELD_ID_FILE { + if let Some(ref path) = self.file_path { + let transformation = MetadataColumnTransformation::FilePath { + file_path: path.clone(), + field_name: RESERVED_COL_NAME_FILE.to_string(), + field_id: *field_id, + }; + transformations.push((transformation, position)); + } + } + // Additional reserved fields can be handled here in the future + } + + MetadataTransformer { transformations } + } + + /// Returns the projected field IDs with virtual/reserved fields filtered out. 
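+    /// For example, projected field IDs `[1, 2, RESERVED_FIELD_ID_FILE]` become `[1, 2]`.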
+ /// + /// This is used to determine which regular (non-virtual) fields should be read from the data file. + /// Virtual fields are handled by the metadata transformer and should not be included in the + /// Parquet projection. + pub(crate) fn project_field_ids_without_virtual(&self) -> Vec { + self.projected_field_ids + .iter() + .filter(|&&field_id| !Self::is_reserved_field(field_id)) + .copied() + .collect() + } + + /// Checks if a field ID is reserved (virtual). + fn is_reserved_field(field_id: i32) -> bool { + field_id == RESERVED_FIELD_ID_FILE + // Additional reserved fields can be checked here + } +} + +impl Default for MetadataTransformerBuilder { + fn default() -> Self { + Self::new(Vec::new()) + } +} + +// Reserved field IDs and names for metadata columns + +/// Reserved field ID for the file path (_file) column per Iceberg spec +pub(crate) const RESERVED_FIELD_ID_FILE: i32 = 2147483646; + +/// Reserved column name for the file path metadata column +pub(crate) const RESERVED_COL_NAME_FILE: &str = "_file"; diff --git a/crates/iceberg/src/arrow/mod.rs b/crates/iceberg/src/arrow/mod.rs index c091c45177..feb8f3ac48 100644 --- a/crates/iceberg/src/arrow/mod.rs +++ b/crates/iceberg/src/arrow/mod.rs @@ -27,6 +27,8 @@ pub(crate) mod caching_delete_file_loader; pub mod delete_file_loader; pub(crate) mod delete_filter; +pub(crate) mod metadata_column_transformer; +pub(crate) use metadata_column_transformer::{RESERVED_COL_NAME_FILE, RESERVED_FIELD_ID_FILE}; mod reader; /// RecordBatch projection utilities pub mod record_batch_projector; diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 8b65249715..d834eb9fc4 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -48,6 +48,7 @@ use parquet::file::metadata::{ use parquet::schema::types::{SchemaDescriptor, Type as ParquetType}; use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader; +use crate::arrow::metadata_column_transformer::MetadataTransformer; use crate::arrow::record_batch_transformer::RecordBatchTransformer; use crate::arrow::{arrow_schema_to_schema, get_arrow_datum}; use crate::delete_vector::DeleteVector; @@ -62,12 +63,6 @@ use crate::spec::{Datum, NestedField, PrimitiveType, Schema, Type}; use crate::utils::available_parallelism; use crate::{Error, ErrorKind}; -/// Reserved field ID for the file path (_file) column per Iceberg spec -pub(crate) const RESERVED_FIELD_ID_FILE: i32 = 2147483646; - -/// Column name for the file path metadata column per Iceberg spec -pub(crate) const RESERVED_COL_NAME_FILE: &str = "_file"; - /// Builder to create ArrowReader pub struct ArrowReaderBuilder { batch_size: Option, @@ -228,21 +223,18 @@ impl ArrowReader { initial_stream_builder }; - // Check if _file column is requested and filter it out for projection - let mut file_column_positions = Vec::new(); - let project_field_ids_without_virtual: Vec = task - .project_field_ids - .iter() - .enumerate() - .filter_map(|(idx, &field_id)| { - if field_id == RESERVED_FIELD_ID_FILE { - file_column_positions.push(idx); - None - } else { - Some(field_id) - } - }) - .collect(); + // Build the metadata transformer from the projected field IDs + // This identifies reserved fields (like _file) and creates transformations for them + let metadata_transformer_builder = + MetadataTransformer::builder(task.project_field_ids.clone()) + .with_file_path(task.data_file_path.clone()); + + // Get the field IDs without virtual fields for Parquet projection + let 
project_field_ids_without_virtual = + metadata_transformer_builder.project_field_ids_without_virtual(); + + // Build the metadata transformer (which will handle adding _file columns) + let metadata_transformer = metadata_transformer_builder.build(); // Fallback IDs don't match Parquet's embedded field IDs (since they don't exist), // so we must use position-based projection instead of field-ID matching @@ -403,23 +395,12 @@ impl ArrowReader { .build()? .map(move |batch| match batch { Ok(batch) => { - let mut processed_batch = + // First apply record batch transformations (type promotion, column reordering, etc.) + let processed_batch = record_batch_transformer.process_record_batch(batch)?; - // Add the _file column at each requested position - // We insert them back at their original positions since we're reconstructing - // the original column order - for &position in &file_column_positions { - processed_batch = Self::add_file_path_column_ree_at_position( - processed_batch, - &data_file_path, - RESERVED_COL_NAME_FILE, - RESERVED_FIELD_ID_FILE, - position, - )?; - } - - Ok(processed_batch) + // Then apply metadata transformations (add _file column, etc.) + metadata_transformer.apply(processed_batch) } Err(err) => Err(err.into()), }); From ef3a965e98b418cc3ef722f99f3d1e29f6ec6a85 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Fri, 7 Nov 2025 15:04:35 +0100 Subject: [PATCH 12/39] Revert "Refactor into own transformer step" This reverts commit adf0da0022c97660725140dec114b5ca2bf873cf. --- .../src/arrow/metadata_column_transformer.rs | 282 ------------------ crates/iceberg/src/arrow/mod.rs | 2 - crates/iceberg/src/arrow/reader.rs | 53 ++-- 3 files changed, 36 insertions(+), 301 deletions(-) delete mode 100644 crates/iceberg/src/arrow/metadata_column_transformer.rs diff --git a/crates/iceberg/src/arrow/metadata_column_transformer.rs b/crates/iceberg/src/arrow/metadata_column_transformer.rs deleted file mode 100644 index f8a2f70d72..0000000000 --- a/crates/iceberg/src/arrow/metadata_column_transformer.rs +++ /dev/null @@ -1,282 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::collections::HashMap; -use std::sync::Arc; - -use arrow_array::{Int32Array, RecordBatch, RunArray, StringArray}; -use arrow_schema::{DataType, Field}; -use parquet::arrow::PARQUET_FIELD_ID_META_KEY; - -use crate::{Error, ErrorKind, Result}; - -/// Represents different types of metadata column transformations that can be applied to a RecordBatch. -/// Each variant encapsulates the data and logic needed for a specific type of metadata column. -#[derive(Debug, Clone)] -pub(crate) enum MetadataColumnTransformation { - /// Adds a _file column with the file path using Run-End Encoding (REE) for memory efficiency. 
- /// The _file column stores the file path from which each row was read, with REE ensuring - /// that the same file path value is not repeated in memory for every row. - FilePath { - file_path: String, - field_name: String, - field_id: i32, - }, - // Future metadata columns can be added here, e.g.: - // PartitionValue { partition_values: HashMap, ... }, - // RowNumber { start: u64, ... }, -} - -impl MetadataColumnTransformation { - /// Applies the transformation to a RecordBatch, adding the metadata column at the specified position. - /// - /// # Arguments - /// * `batch` - The input RecordBatch to transform - /// * `position` - The position at which to insert the metadata column - /// - /// # Returns - /// A new RecordBatch with the metadata column inserted at the given position - pub(crate) fn apply(&self, batch: RecordBatch, position: usize) -> Result { - match self { - Self::FilePath { - file_path, - field_name, - field_id, - } => Self::apply_file_path(batch, file_path, field_name, *field_id, position), - } - } - - /// Applies the file path transformation using Run-End Encoding. - fn apply_file_path( - batch: RecordBatch, - file_path: &str, - field_name: &str, - field_id: i32, - position: usize, - ) -> Result { - let num_rows = batch.num_rows(); - - // Use Run-End Encoded array for optimal memory efficiency - let run_ends = if num_rows == 0 { - Int32Array::from(Vec::::new()) - } else { - Int32Array::from(vec![num_rows as i32]) - }; - let values = if num_rows == 0 { - StringArray::from(Vec::<&str>::new()) - } else { - StringArray::from(vec![file_path]) - }; - - let file_array = RunArray::try_new(&run_ends, &values).map_err(|e| { - Error::new( - ErrorKind::Unexpected, - "Failed to create RunArray for _file column", - ) - .with_source(e) - })?; - - let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); - let values_field = Arc::new(Field::new("values", DataType::Utf8, true)); - let file_field = Field::new( - field_name, - DataType::RunEndEncoded(run_ends_field, values_field), - false, - ); - - Self::insert_column_at_position(batch, Arc::new(file_array), file_field, field_id, position) - } - - /// Inserts a column at the specified position in a RecordBatch. - fn insert_column_at_position( - batch: RecordBatch, - column_array: arrow_array::ArrayRef, - field: Field, - field_id: i32, - position: usize, - ) -> Result { - let field_with_metadata = Arc::new(field.with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - field_id.to_string(), - )]))); - - // Build columns vector in a single pass without insert - let original_columns = batch.columns(); - let mut columns = Vec::with_capacity(original_columns.len() + 1); - columns.extend_from_slice(&original_columns[..position]); - columns.push(column_array); - columns.extend_from_slice(&original_columns[position..]); - - // Build fields vector in a single pass without insert - let schema = batch.schema(); - let original_fields = schema.fields(); - let mut fields = Vec::with_capacity(original_fields.len() + 1); - fields.extend(original_fields[..position].iter().cloned()); - fields.push(field_with_metadata); - fields.extend(original_fields[position..].iter().cloned()); - - let schema = Arc::new(arrow_schema::Schema::new(fields)); - RecordBatch::try_new(schema, columns).map_err(|e| { - Error::new( - ErrorKind::Unexpected, - "Failed to add metadata column to RecordBatch", - ) - .with_source(e) - }) - } -} - -/// Composes multiple metadata column transformations. 
-/// -/// This allows us to apply multiple metadata columns in sequence, where each transformation -/// builds on the previous one. For example, adding both _file and partition value columns. -pub(crate) struct MetadataTransformer { - transformations: Vec<(MetadataColumnTransformation, usize)>, -} - -impl MetadataTransformer { - /// Creates a new empty MetadataTransformer. - pub(crate) fn new() -> Self { - Self { - transformations: Vec::new(), - } - } - - /// Creates a builder for constructing a MetadataTransformer from projected field IDs. - pub(crate) fn builder(projected_field_ids: Vec) -> MetadataTransformerBuilder { - MetadataTransformerBuilder::new(projected_field_ids) - } - - /// Applies all registered transformations to the given RecordBatch. - /// - /// Transformations are applied in the order they were added. Each transformation - /// inserts a column at its specified position, so later transformations see the - /// updated batch with previously inserted columns. - pub(crate) fn apply(&self, mut batch: RecordBatch) -> Result { - for (transformation, position) in &self.transformations { - batch = transformation.apply(batch, *position)?; - } - Ok(batch) - } - - /// Returns true if there are any transformations to apply. - pub(crate) fn has_transformations(&self) -> bool { - !self.transformations.is_empty() - } -} - -impl Default for MetadataTransformer { - fn default() -> Self { - Self::new() - } -} - -/// Builder for constructing a MetadataTransformer from projected field IDs. -/// -/// This builder analyzes projected field IDs to identify metadata columns (reserved fields) -/// and builds the appropriate transformations. Reserved fields have special handling and -/// are inserted into the RecordBatch at their projected positions. -pub(crate) struct MetadataTransformerBuilder { - projected_field_ids: Vec, - file_path: Option, -} - -impl MetadataTransformerBuilder { - /// Creates a new MetadataTransformerBuilder for the given projected field IDs. - /// - /// # Arguments - /// * `projected_field_ids` - The list of field IDs being projected, including any reserved fields - pub(crate) fn new(projected_field_ids: Vec) -> Self { - Self { - projected_field_ids, - file_path: None, - } - } - - /// Sets the file path for the _file metadata column. - /// - /// # Arguments - /// * `file_path` - The file path to use for the _file column - /// - /// # Returns - /// Self for method chaining - pub(crate) fn with_file_path(mut self, file_path: String) -> Self { - self.file_path = Some(file_path); - self - } - - /// Builds the MetadataTransformer by analyzing projected field IDs and creating appropriate transformations. - /// - /// This method: - /// 1. Iterates through projected field IDs to find reserved fields - /// 2. Calculates the correct position for each reserved field in the final output - /// 3. 
Creates transformations for each reserved field found - pub(crate) fn build(self) -> MetadataTransformer { - let mut transformations = Vec::new(); - - // Iterate through the projected field IDs and check for reserved fields - for (position, field_id) in self.projected_field_ids.iter().enumerate() { - // Check if this is a reserved field ID for the _file column - if *field_id == RESERVED_FIELD_ID_FILE { - if let Some(ref path) = self.file_path { - let transformation = MetadataColumnTransformation::FilePath { - file_path: path.clone(), - field_name: RESERVED_COL_NAME_FILE.to_string(), - field_id: *field_id, - }; - transformations.push((transformation, position)); - } - } - // Additional reserved fields can be handled here in the future - } - - MetadataTransformer { transformations } - } - - /// Returns the projected field IDs with virtual/reserved fields filtered out. - /// - /// This is used to determine which regular (non-virtual) fields should be read from the data file. - /// Virtual fields are handled by the metadata transformer and should not be included in the - /// Parquet projection. - pub(crate) fn project_field_ids_without_virtual(&self) -> Vec { - self.projected_field_ids - .iter() - .filter(|&&field_id| !Self::is_reserved_field(field_id)) - .copied() - .collect() - } - - /// Checks if a field ID is reserved (virtual). - fn is_reserved_field(field_id: i32) -> bool { - field_id == RESERVED_FIELD_ID_FILE - // Additional reserved fields can be checked here - } -} - -impl Default for MetadataTransformerBuilder { - fn default() -> Self { - Self::new(Vec::new()) - } -} - -// Reserved field IDs and names for metadata columns - -/// Reserved field ID for the file path (_file) column per Iceberg spec -pub(crate) const RESERVED_FIELD_ID_FILE: i32 = 2147483646; - -/// Reserved column name for the file path metadata column -pub(crate) const RESERVED_COL_NAME_FILE: &str = "_file"; diff --git a/crates/iceberg/src/arrow/mod.rs b/crates/iceberg/src/arrow/mod.rs index feb8f3ac48..c091c45177 100644 --- a/crates/iceberg/src/arrow/mod.rs +++ b/crates/iceberg/src/arrow/mod.rs @@ -27,8 +27,6 @@ pub(crate) mod caching_delete_file_loader; pub mod delete_file_loader; pub(crate) mod delete_filter; -pub(crate) mod metadata_column_transformer; -pub(crate) use metadata_column_transformer::{RESERVED_COL_NAME_FILE, RESERVED_FIELD_ID_FILE}; mod reader; /// RecordBatch projection utilities pub mod record_batch_projector; diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index d834eb9fc4..8b65249715 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -48,7 +48,6 @@ use parquet::file::metadata::{ use parquet::schema::types::{SchemaDescriptor, Type as ParquetType}; use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader; -use crate::arrow::metadata_column_transformer::MetadataTransformer; use crate::arrow::record_batch_transformer::RecordBatchTransformer; use crate::arrow::{arrow_schema_to_schema, get_arrow_datum}; use crate::delete_vector::DeleteVector; @@ -63,6 +62,12 @@ use crate::spec::{Datum, NestedField, PrimitiveType, Schema, Type}; use crate::utils::available_parallelism; use crate::{Error, ErrorKind}; +/// Reserved field ID for the file path (_file) column per Iceberg spec +pub(crate) const RESERVED_FIELD_ID_FILE: i32 = 2147483646; + +/// Column name for the file path metadata column per Iceberg spec +pub(crate) const RESERVED_COL_NAME_FILE: &str = "_file"; + /// Builder to create ArrowReader pub struct 
ArrowReaderBuilder { batch_size: Option, @@ -223,18 +228,21 @@ impl ArrowReader { initial_stream_builder }; - // Build the metadata transformer from the projected field IDs - // This identifies reserved fields (like _file) and creates transformations for them - let metadata_transformer_builder = - MetadataTransformer::builder(task.project_field_ids.clone()) - .with_file_path(task.data_file_path.clone()); - - // Get the field IDs without virtual fields for Parquet projection - let project_field_ids_without_virtual = - metadata_transformer_builder.project_field_ids_without_virtual(); - - // Build the metadata transformer (which will handle adding _file columns) - let metadata_transformer = metadata_transformer_builder.build(); + // Check if _file column is requested and filter it out for projection + let mut file_column_positions = Vec::new(); + let project_field_ids_without_virtual: Vec = task + .project_field_ids + .iter() + .enumerate() + .filter_map(|(idx, &field_id)| { + if field_id == RESERVED_FIELD_ID_FILE { + file_column_positions.push(idx); + None + } else { + Some(field_id) + } + }) + .collect(); // Fallback IDs don't match Parquet's embedded field IDs (since they don't exist), // so we must use position-based projection instead of field-ID matching @@ -395,12 +403,23 @@ impl ArrowReader { .build()? .map(move |batch| match batch { Ok(batch) => { - // First apply record batch transformations (type promotion, column reordering, etc.) - let processed_batch = + let mut processed_batch = record_batch_transformer.process_record_batch(batch)?; - // Then apply metadata transformations (add _file column, etc.) - metadata_transformer.apply(processed_batch) + // Add the _file column at each requested position + // We insert them back at their original positions since we're reconstructing + // the original column order + for &position in &file_column_positions { + processed_batch = Self::add_file_path_column_ree_at_position( + processed_batch, + &data_file_path, + RESERVED_COL_NAME_FILE, + RESERVED_FIELD_ID_FILE, + position, + )?; + } + + Ok(processed_batch) } Err(err) => Err(err.into()), }); From 534490bb8239a87e0cced20529b73018702debbe Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Fri, 7 Nov 2025 15:08:42 +0100 Subject: [PATCH 13/39] Avoid special casing in batch creation --- crates/iceberg/src/arrow/reader.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 8b65249715..d3dde7eaf4 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -229,14 +229,12 @@ impl ArrowReader { }; // Check if _file column is requested and filter it out for projection - let mut file_column_positions = Vec::new(); let project_field_ids_without_virtual: Vec = task .project_field_ids .iter() .enumerate() .filter_map(|(idx, &field_id)| { if field_id == RESERVED_FIELD_ID_FILE { - file_column_positions.push(idx); None } else { Some(field_id) @@ -409,14 +407,16 @@ impl ArrowReader { // Add the _file column at each requested position // We insert them back at their original positions since we're reconstructing // the original column order - for &position in &file_column_positions { - processed_batch = Self::add_file_path_column_ree_at_position( - processed_batch, - &data_file_path, - RESERVED_COL_NAME_FILE, - RESERVED_FIELD_ID_FILE, - position, - )?; + for (position, field_id) in task.project_field_ids.iter().enumerate() { + if *field_id == RESERVED_FIELD_ID_FILE { 
+ processed_batch = Self::add_file_path_column_ree_at_position( + processed_batch, + &data_file_path, + RESERVED_COL_NAME_FILE, + RESERVED_FIELD_ID_FILE, + position, + )?; + } } Ok(processed_batch) From 04bf463623e12a3f1b26cb7576952c30e48d64d6 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Fri, 7 Nov 2025 15:09:25 +0100 Subject: [PATCH 14/39] . --- crates/iceberg/src/arrow/reader.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index d3dde7eaf4..064e0c621f 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -232,8 +232,7 @@ impl ArrowReader { let project_field_ids_without_virtual: Vec = task .project_field_ids .iter() - .enumerate() - .filter_map(|(idx, &field_id)| { + .filter_map(|&field_id| { if field_id == RESERVED_FIELD_ID_FILE { None } else { From 9e88edf2555de6d05d5a3b1796172b44ced07cc7 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Wed, 12 Nov 2025 08:57:49 +0100 Subject: [PATCH 15/39] Modify record batch transformer to support reserved fields --- crates/iceberg/src/arrow/reader.rs | 326 ++---------------- .../src/arrow/record_batch_transformer.rs | 104 +++++- crates/iceberg/src/lib.rs | 1 + crates/iceberg/src/metadata_columns.rs | 60 ++++ crates/iceberg/src/scan/mod.rs | 32 +- 5 files changed, 182 insertions(+), 341 deletions(-) create mode 100644 crates/iceberg/src/metadata_columns.rs diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 064e0c621f..f482a9f6a9 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -23,14 +23,11 @@ use std::str::FromStr; use std::sync::Arc; use arrow_arith::boolean::{and, and_kleene, is_not_null, is_null, not, or, or_kleene}; -use arrow_array::{ - Array, ArrayRef, BooleanArray, Datum as ArrowDatum, Int32Array, RecordBatch, RunArray, Scalar, - StringArray, -}; +use arrow_array::{Array, ArrayRef, BooleanArray, Datum as ArrowDatum, RecordBatch, Scalar}; use arrow_cast::cast::cast; use arrow_ord::cmp::{eq, gt, gt_eq, lt, lt_eq, neq}; use arrow_schema::{ - ArrowError, DataType, Field, FieldRef, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, + ArrowError, DataType, FieldRef, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, }; use arrow_string::like::starts_with; use bytes::Bytes; @@ -57,17 +54,12 @@ use crate::expr::visitors::page_index_evaluator::PageIndexEvaluator; use crate::expr::visitors::row_group_metrics_evaluator::RowGroupMetricsEvaluator; use crate::expr::{BoundPredicate, BoundReference}; use crate::io::{FileIO, FileMetadata, FileRead}; +use crate::metadata_columns::{RESERVED_FIELD_ID_FILE, is_reserved_field}; use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskStream}; -use crate::spec::{Datum, NestedField, PrimitiveType, Schema, Type}; +use crate::spec::{Datum, NestedField, PrimitiveLiteral, PrimitiveType, Schema, Type}; use crate::utils::available_parallelism; use crate::{Error, ErrorKind}; -/// Reserved field ID for the file path (_file) column per Iceberg spec -pub(crate) const RESERVED_FIELD_ID_FILE: i32 = 2147483646; - -/// Column name for the file path metadata column per Iceberg spec -pub(crate) const RESERVED_COL_NAME_FILE: &str = "_file"; - /// Builder to create ArrowReader pub struct ArrowReaderBuilder { batch_size: Option, @@ -228,23 +220,17 @@ impl ArrowReader { initial_stream_builder }; - // Check if _file column is requested and filter it out for projection - let 
project_field_ids_without_virtual: Vec = task + // Fallback IDs don't match Parquet's embedded field IDs (since they don't exist), + // Filter out reserved fields for Parquet projection + let project_field_ids_without_reserved: Vec = task .project_field_ids .iter() - .filter_map(|&field_id| { - if field_id == RESERVED_FIELD_ID_FILE { - None - } else { - Some(field_id) - } - }) + .filter(|&&id| !is_reserved_field(id)) + .copied() .collect(); - - // Fallback IDs don't match Parquet's embedded field IDs (since they don't exist), // so we must use position-based projection instead of field-ID matching let projection_mask = Self::get_arrow_projection_mask( - &project_field_ids_without_virtual, + &project_field_ids_without_reserved, &task.schema, record_batch_stream_builder.parquet_schema(), record_batch_stream_builder.schema(), @@ -255,10 +241,14 @@ impl ArrowReader { record_batch_stream_builder.with_projection(projection_mask.clone()); // RecordBatchTransformer performs any transformations required on the RecordBatches - // that come back from the file, such as type promotion, default column insertion - // and column re-ordering + // that come back from the file, such as type promotion, default column insertion, + // column re-ordering, and virtual field addition (like _file) let mut record_batch_transformer = - RecordBatchTransformer::build(task.schema_ref(), &project_field_ids_without_virtual); + RecordBatchTransformer::build(task.schema_ref(), &task.project_field_ids) + .with_constant( + RESERVED_FIELD_ID_FILE, + PrimitiveLiteral::String(task.data_file_path.clone()), + ); if let Some(batch_size) = batch_size { record_batch_stream_builder = record_batch_stream_builder.with_batch_size(batch_size); @@ -390,9 +380,6 @@ impl ArrowReader { record_batch_stream_builder.with_row_groups(selected_row_group_indices); } - // Clone data_file_path for use in the closure - let data_file_path = task.data_file_path.clone(); - // Build the batch stream and send all the RecordBatches that it generates // to the requester. let record_batch_stream = @@ -400,25 +387,8 @@ impl ArrowReader { .build()? .map(move |batch| match batch { Ok(batch) => { - let mut processed_batch = - record_batch_transformer.process_record_batch(batch)?; - - // Add the _file column at each requested position - // We insert them back at their original positions since we're reconstructing - // the original column order - for (position, field_id) in task.project_field_ids.iter().enumerate() { - if *field_id == RESERVED_FIELD_ID_FILE { - processed_batch = Self::add_file_path_column_ree_at_position( - processed_batch, - &data_file_path, - RESERVED_COL_NAME_FILE, - RESERVED_FIELD_ID_FILE, - position, - )?; - } - } - - Ok(processed_batch) + // Process the record batch (type promotion, column reordering, virtual fields, etc.) + record_batch_transformer.process_record_batch(batch) } Err(err) => Err(err.into()), }); @@ -568,93 +538,6 @@ impl ArrowReader { Ok(results.into()) } - /// Helper function to add a `_file` column to a RecordBatch at a specific position. - /// Takes the array, field to add, and position where to insert. 
- fn create_file_field_at_position( - batch: RecordBatch, - file_array: ArrayRef, - file_field: Field, - field_id: i32, - position: usize, - ) -> Result { - let file_field_with_metadata = Arc::new(file_field.with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - field_id.to_string(), - )]))); - - // Build columns vector in a single pass without insert - let original_columns = batch.columns(); - let mut columns = Vec::with_capacity(original_columns.len() + 1); - columns.extend_from_slice(&original_columns[..position]); - columns.push(file_array); - columns.extend_from_slice(&original_columns[position..]); - - // Build fields vector in a single pass without insert - let schema = batch.schema(); - let original_fields = schema.fields(); - let mut fields = Vec::with_capacity(original_fields.len() + 1); - fields.extend(original_fields[..position].iter().cloned()); - fields.push(file_field_with_metadata); - fields.extend(original_fields[position..].iter().cloned()); - - let schema = Arc::new(ArrowSchema::new(fields)); - RecordBatch::try_new(schema, columns).map_err(|e| { - Error::new( - ErrorKind::Unexpected, - "Failed to add _file column to RecordBatch", - ) - .with_source(e) - }) - } - - /// Adds a `_file` column to the RecordBatch at a specific position. - /// Uses Run-End Encoding (REE) for maximum memory efficiency. - pub(crate) fn add_file_path_column_ree_at_position( - batch: RecordBatch, - file_path: &str, - field_name: &str, - field_id: i32, - position: usize, - ) -> Result { - let num_rows = batch.num_rows(); - - // Use Run-End Encoded array for optimal memory efficiency - let run_ends = if num_rows == 0 { - Int32Array::from(Vec::::new()) - } else { - Int32Array::from(vec![num_rows as i32]) - }; - let values = if num_rows == 0 { - StringArray::from(Vec::<&str>::new()) - } else { - StringArray::from(vec![file_path]) - }; - - let file_array = RunArray::try_new(&run_ends, &values).map_err(|e| { - Error::new( - ErrorKind::Unexpected, - "Failed to create RunArray for _file column", - ) - .with_source(e) - })?; - - let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); - let values_field = Arc::new(Field::new("values", DataType::Utf8, true)); - let file_field = Field::new( - field_name, - DataType::RunEndEncoded(run_ends_field, values_field), - false, - ); - - Self::create_file_field_at_position( - batch, - Arc::new(file_array), - file_field, - field_id, - position, - ) - } - fn build_field_id_set_and_map( parquet_schema: &SchemaDescriptor, predicate: &BoundPredicate, @@ -1758,7 +1641,6 @@ mod tests { use arrow_array::cast::AsArray; use arrow_array::{ArrayRef, LargeStringArray, RecordBatch, StringArray}; use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; - use as_any::Downcast; use futures::TryStreamExt; use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; use parquet::arrow::{ArrowWriter, ProjectionMask}; @@ -1772,9 +1654,7 @@ mod tests { use crate::ErrorKind; use crate::arrow::reader::{CollectFieldIdVisitor, PARQUET_FIELD_ID_META_KEY}; - use crate::arrow::{ - ArrowReader, ArrowReaderBuilder, RESERVED_COL_NAME_FILE, RESERVED_FIELD_ID_FILE, - }; + use crate::arrow::{ArrowReader, ArrowReaderBuilder}; use crate::delete_vector::DeleteVector; use crate::expr::visitors::bound_predicate_visitor::visit; use crate::expr::{Bind, Predicate, Reference}; @@ -2573,172 +2453,6 @@ message schema { assert!(col_b.is_null(2)); } - #[test] - fn test_add_file_path_column_ree() { - use arrow_array::{Array, Int32Array, RecordBatch, 
StringArray}; - use arrow_schema::{DataType, Field, Schema}; - - // Create a simple test batch with 2 columns and 3 rows - let schema = Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, false), - ])); - - let id_array = Int32Array::from(vec![1, 2, 3]); - let name_array = StringArray::from(vec!["Alice", "Bob", "Charlie"]); - - let batch = RecordBatch::try_new(schema.clone(), vec![ - Arc::new(id_array), - Arc::new(name_array), - ]) - .unwrap(); - - assert_eq!(batch.num_columns(), 2); - assert_eq!(batch.num_rows(), 3); - - // Add file path column with REE at the end (position 2) - let file_path = "/path/to/data/file.parquet"; - let result = ArrowReader::add_file_path_column_ree_at_position( - batch, - file_path, - RESERVED_COL_NAME_FILE, - RESERVED_FIELD_ID_FILE, - 2, // Position at the end after id and name columns - ); - assert!(result.is_ok(), "Should successfully add file path column"); - - let new_batch = result.unwrap(); - - // Verify the new batch has 3 columns - assert_eq!(new_batch.num_columns(), 3); - assert_eq!(new_batch.num_rows(), 3); - - // Verify schema has the _file column - let schema = new_batch.schema(); - assert_eq!(schema.fields().len(), 3); - - let file_field = schema.field(2); - assert_eq!(file_field.name(), RESERVED_COL_NAME_FILE); - assert!(!file_field.is_nullable()); - - // Verify the field has the correct metadata - let metadata = file_field.metadata(); - assert_eq!( - metadata.get(PARQUET_FIELD_ID_META_KEY), - Some(&RESERVED_FIELD_ID_FILE.to_string()) - ); - - // Verify the data type is RunEndEncoded - match file_field.data_type() { - DataType::RunEndEncoded(run_ends_field, values_field) => { - assert_eq!(run_ends_field.name(), "run_ends"); - assert_eq!(run_ends_field.data_type(), &DataType::Int32); - assert!(!run_ends_field.is_nullable()); - - assert_eq!(values_field.name(), "values"); - assert_eq!(values_field.data_type(), &DataType::Utf8); - } - _ => panic!("Expected RunEndEncoded data type for _file column"), - } - - // Verify the original columns are intact - let id_col = new_batch - .column(0) - .as_primitive::(); - assert_eq!(id_col.values(), &[1, 2, 3]); - - let name_col = new_batch.column(1).as_string::(); - assert_eq!(name_col.value(0), "Alice"); - assert_eq!(name_col.value(1), "Bob"); - assert_eq!(name_col.value(2), "Charlie"); - - // Verify the file path column contains the correct value - // The _file column is a RunArray, so we need to decode it - let file_col = new_batch.column(2); - let run_array = file_col - .as_any() - .downcast_ref::>() - .expect("Expected RunArray for _file column"); - - // Verify the run array structure (should be optimally encoded) - let run_ends = run_array.run_ends(); - assert_eq!(run_ends.values().len(), 1, "Should have only 1 run end"); - assert_eq!( - run_ends.values()[0], - new_batch.num_rows() as i32, - "Run end should equal number of rows" - ); - - // Check that the single value in the RunArray is the expected file path - let values = run_array.values(); - let string_values = values.as_string::(); - assert_eq!(string_values.len(), 1, "Should have only 1 value"); - assert_eq!(string_values.value(0), file_path); - - assert!( - string_values - .downcast_ref::() - .unwrap() - .iter() - .all(|v| v == Some(file_path)) - ) - } - - #[test] - fn test_add_file_path_column_ree_empty_batch() { - use arrow_array::RecordBatch; - use arrow_schema::{DataType, Field, Schema}; - use parquet::arrow::PARQUET_FIELD_ID_META_KEY; - - // Create an empty batch - let schema = 
Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); - - let id_array = arrow_array::Int32Array::from(Vec::::new()); - let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(id_array)]).unwrap(); - - assert_eq!(batch.num_rows(), 0); - - // Add file path column to empty batch with REE at position 1 (after id column) - let file_path = "/empty/file.parquet"; - let result = ArrowReader::add_file_path_column_ree_at_position( - batch, - file_path, - RESERVED_COL_NAME_FILE, - RESERVED_FIELD_ID_FILE, - 1, // Position 1, after the id column - ); - - // Should succeed with empty RunArray for empty batches - assert!(result.is_ok()); - let new_batch = result.unwrap(); - assert_eq!(new_batch.num_rows(), 0); - assert_eq!(new_batch.num_columns(), 2); - - // Verify the _file column exists with correct schema - let schema = new_batch.schema(); - let file_field = schema.field(1); - assert_eq!(file_field.name(), RESERVED_COL_NAME_FILE); - - // Should use RunEndEncoded even for empty batches - match file_field.data_type() { - DataType::RunEndEncoded(run_ends_field, values_field) => { - assert_eq!(run_ends_field.data_type(), &DataType::Int32); - assert_eq!(values_field.data_type(), &DataType::Utf8); - } - _ => panic!("Expected RunEndEncoded data type for _file column"), - } - - // Verify metadata with reserved field ID - assert_eq!( - file_field.metadata().get(PARQUET_FIELD_ID_META_KEY), - Some(&RESERVED_FIELD_ID_FILE.to_string()) - ); - - // Verify the file path column is empty but properly structured - let file_path_column = new_batch.column(1); - assert_eq!(file_path_column.len(), 0); - } - /// Test for bug where position deletes in later row groups are not applied correctly. /// /// When a file has multiple row groups and a position delete targets a row in a later diff --git a/crates/iceberg/src/arrow/record_batch_transformer.rs b/crates/iceberg/src/arrow/record_batch_transformer.rs index 5fbbbb106a..63f1142bab 100644 --- a/crates/iceberg/src/arrow/record_batch_transformer.rs +++ b/crates/iceberg/src/arrow/record_batch_transformer.rs @@ -20,15 +20,17 @@ use std::sync::Arc; use arrow_array::{ Array as ArrowArray, ArrayRef, BinaryArray, BooleanArray, Date32Array, Float32Array, - Float64Array, Int32Array, Int64Array, NullArray, RecordBatch, RecordBatchOptions, StringArray, + Float64Array, Int32Array, Int64Array, NullArray, RecordBatch, RecordBatchOptions, RunArray, + StringArray, }; use arrow_cast::cast; use arrow_schema::{ - DataType, FieldRef, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, SchemaRef, + DataType, Field, FieldRef, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, SchemaRef, }; use parquet::arrow::PARQUET_FIELD_ID_META_KEY; use crate::arrow::schema_to_arrow_schema; +use crate::metadata_columns::get_reserved_field_name; use crate::spec::{Literal, PrimitiveLiteral, Schema as IcebergSchema}; use crate::{Error, ErrorKind, Result}; @@ -111,6 +113,8 @@ enum SchemaComparison { pub(crate) struct RecordBatchTransformer { snapshot_schema: Arc, projected_iceberg_field_ids: Vec, + // Map from field ID to constant value for virtual/metadata fields + constants_map: HashMap, // BatchTransform gets lazily constructed based on the schema of // the first RecordBatch we receive from the file @@ -129,10 +133,22 @@ impl RecordBatchTransformer { Self { snapshot_schema, projected_iceberg_field_ids, + constants_map: HashMap::new(), batch_transform: None, } } + /// Add a constant value for a specific field ID. 
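+    /// For example, the reader passes `with_constant(RESERVED_FIELD_ID_FILE,
+    /// PrimitiveLiteral::String(data_file_path))` so that a `_file` column is emitted.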
+ /// This is used for virtual/metadata fields like _file that have constant values per batch. + /// + /// # Arguments + /// * `field_id` - The field ID to associate with the constant + /// * `value` - The constant value for this field + pub(crate) fn with_constant(mut self, field_id: i32, value: PrimitiveLiteral) -> Self { + self.constants_map.insert(field_id, value); + self + } + pub(crate) fn process_record_batch( &mut self, record_batch: RecordBatch, @@ -167,6 +183,7 @@ impl RecordBatchTransformer { record_batch.schema_ref(), self.snapshot_schema.as_ref(), &self.projected_iceberg_field_ids, + &self.constants_map, )?); self.process_record_batch(record_batch)? @@ -185,6 +202,7 @@ impl RecordBatchTransformer { source_schema: &ArrowSchemaRef, snapshot_schema: &IcebergSchema, projected_iceberg_field_ids: &[i32], + constants_map: &HashMap, ) -> Result { let mapped_unprojected_arrow_schema = Arc::new(schema_to_arrow_schema(snapshot_schema)?); let field_id_to_mapped_schema_map = @@ -195,11 +213,24 @@ impl RecordBatchTransformer { let fields: Result> = projected_iceberg_field_ids .iter() .map(|field_id| { - Ok(field_id_to_mapped_schema_map - .get(field_id) - .ok_or(Error::new(ErrorKind::Unexpected, "field not found"))? - .0 - .clone()) + // Check if this is a constant/virtual field + if let Some(constant_value) = constants_map.get(field_id) { + // Create a field for the virtual column based on the constant type + let arrow_type = Self::primitive_literal_to_arrow_type(constant_value)?; + let field_name = get_reserved_field_name(*field_id)?; + Ok(Arc::new( + Field::new(field_name, arrow_type, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + field_id.to_string(), + )])), + )) + } else { + Ok(field_id_to_mapped_schema_map + .get(field_id) + .ok_or(Error::new(ErrorKind::Unexpected, "field not found"))? 
+ .0 + .clone()) + } }) .collect(); @@ -214,6 +245,7 @@ impl RecordBatchTransformer { snapshot_schema, projected_iceberg_field_ids, field_id_to_mapped_schema_map, + constants_map, )?, target_schema, }), @@ -270,11 +302,21 @@ impl RecordBatchTransformer { snapshot_schema: &IcebergSchema, projected_iceberg_field_ids: &[i32], field_id_to_mapped_schema_map: HashMap, + constants_map: &HashMap, ) -> Result> { let field_id_to_source_schema_map = Self::build_field_id_to_arrow_schema_map(source_schema)?; projected_iceberg_field_ids.iter().map(|field_id|{ + // Check if this is a constant/virtual field first + if let Some(constant_value) = constants_map.get(field_id) { + // This is a virtual field - add it with the constant value + return Ok(ColumnSource::Add { + value: Some(constant_value.clone()), + target_type: Self::primitive_literal_to_arrow_type(constant_value)?, + }); + } + let (target_field, _) = field_id_to_mapped_schema_map.get(field_id).ok_or( Error::new(ErrorKind::Unexpected, "could not find field in schema") )?; @@ -429,6 +471,27 @@ impl RecordBatchTransformer { let vals: Vec> = vec![None; num_rows]; Arc::new(Float64Array::from(vals)) } + (DataType::RunEndEncoded(_, _), Some(PrimitiveLiteral::String(value))) => { + // Create Run-End Encoded array for constant string values (e.g., file paths) + // This is more memory-efficient than repeating the same value for every row + let run_ends = if num_rows == 0 { + Int32Array::from(Vec::::new()) + } else { + Int32Array::from(vec![num_rows as i32]) + }; + let values = if num_rows == 0 { + StringArray::from(Vec::<&str>::new()) + } else { + StringArray::from(vec![value.as_str()]) + }; + Arc::new(RunArray::try_new(&run_ends, &values).map_err(|e| { + Error::new( + ErrorKind::Unexpected, + "Failed to create RunArray for constant string", + ) + .with_source(e) + })?) + } (DataType::Utf8, Some(PrimitiveLiteral::String(value))) => { Arc::new(StringArray::from(vec![value.clone(); num_rows])) } @@ -452,6 +515,33 @@ impl RecordBatchTransformer { } }) } + + /// Converts a PrimitiveLiteral to its corresponding Arrow DataType. + /// This is used for virtual fields to determine the Arrow type based on the constant value. 
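+    /// For example, `PrimitiveLiteral::Long` maps to `DataType::Int64`, while
+    /// `PrimitiveLiteral::String` maps to `DataType::RunEndEncoded(Int32, Utf8)` so that a
+    /// constant string such as a file path is stored once per batch rather than per row.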
+ fn primitive_literal_to_arrow_type(literal: &PrimitiveLiteral) -> Result { + Ok(match literal { + PrimitiveLiteral::Boolean(_) => DataType::Boolean, + PrimitiveLiteral::Int(_) => DataType::Int32, + PrimitiveLiteral::Long(_) => DataType::Int64, + PrimitiveLiteral::Float(_) => DataType::Float32, + PrimitiveLiteral::Double(_) => DataType::Float64, + PrimitiveLiteral::String(_) => { + // Use Run-End Encoding for constant strings (memory efficient) + let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); + let values_field = Arc::new(Field::new("values", DataType::Utf8, true)); + DataType::RunEndEncoded(run_ends_field, values_field) + } + PrimitiveLiteral::Binary(_) => DataType::Binary, + PrimitiveLiteral::Int128(_) => DataType::Decimal128(38, 0), + PrimitiveLiteral::UInt128(_) => DataType::Decimal128(38, 0), + PrimitiveLiteral::AboveMax | PrimitiveLiteral::BelowMin => { + return Err(Error::new( + ErrorKind::Unexpected, + "Cannot create arrow type for AboveMax/BelowMin literal", + )); + } + }) + } } #[cfg(test)] diff --git a/crates/iceberg/src/lib.rs b/crates/iceberg/src/lib.rs index aae8efed74..8d8f40f72d 100644 --- a/crates/iceberg/src/lib.rs +++ b/crates/iceberg/src/lib.rs @@ -96,4 +96,5 @@ mod utils; pub mod writer; mod delete_vector; +pub mod metadata_columns; pub mod puffin; diff --git a/crates/iceberg/src/metadata_columns.rs b/crates/iceberg/src/metadata_columns.rs new file mode 100644 index 0000000000..c03727bfbc --- /dev/null +++ b/crates/iceberg/src/metadata_columns.rs @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Metadata columns (virtual/reserved fields) for Iceberg tables. +//! +//! This module defines metadata columns that can be requested in projections +//! but are not stored in data files. Instead, they are computed on-the-fly +//! during reading. Examples include the _file column (file path) and future +//! columns like partition values or row numbers. + +use crate::{Error, ErrorKind, Result}; + +/// Reserved field ID for the file path (_file) column per Iceberg spec +pub const RESERVED_FIELD_ID_FILE: i32 = 2147483646; + +/// Reserved column name for the file path metadata column +pub const RESERVED_COL_NAME_FILE: &str = "_file"; + +/// Returns the field name for a reserved field ID. +/// +/// # Arguments +/// * `field_id` - The reserved field ID +/// +/// # Returns +/// The name of the reserved field, or an error if the field ID is not recognized +pub fn get_reserved_field_name(field_id: i32) -> Result<&'static str> { + match field_id { + RESERVED_FIELD_ID_FILE => Ok(RESERVED_COL_NAME_FILE), + _ => Err(Error::new( + ErrorKind::Unexpected, + format!("Unknown reserved field ID: {field_id}"), + )), + } +} + +/// Checks if a field ID is a reserved (virtual/metadata) field. 
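+///
+/// For example, `is_reserved_field(RESERVED_FIELD_ID_FILE)` returns `true`, while any
+/// regular schema field ID returns `false`.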
+/// +/// # Arguments +/// * `field_id` - The field ID to check +/// +/// # Returns +/// `true` if the field ID is reserved, `false` otherwise +pub fn is_reserved_field(field_id: i32) -> bool { + field_id == RESERVED_FIELD_ID_FILE + // Additional reserved fields can be checked here in the future +} diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 740dab94d7..1202d566f7 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -31,42 +31,18 @@ use futures::stream::BoxStream; use futures::{SinkExt, StreamExt, TryStreamExt}; pub use task::*; -use crate::arrow::{ - ArrowReaderBuilder, RESERVED_COL_NAME_FILE as RESERVED_COL_NAME_FILE_INTERNAL, - RESERVED_FIELD_ID_FILE, -}; +use crate::arrow::ArrowReaderBuilder; use crate::delete_file_index::DeleteFileIndex; use crate::expr::visitors::inclusive_metrics_evaluator::InclusiveMetricsEvaluator; use crate::expr::{Bind, BoundPredicate, Predicate}; use crate::io::FileIO; +use crate::metadata_columns::{RESERVED_COL_NAME_FILE, RESERVED_FIELD_ID_FILE}; use crate::runtime::spawn; use crate::spec::{DataContentType, SnapshotRef}; use crate::table::Table; use crate::utils::available_parallelism; use crate::{Error, ErrorKind, Result}; -/// Reserved column name for the file path metadata column. -/// -/// When this column is selected in a table scan (e.g., `.select(["col1", RESERVED_COL_NAME_FILE])`), -/// each row will include the path of the file from which that row was read. -/// This is useful for debugging, auditing, or tracking data lineage. -/// -/// # Example -/// ```no_run -/// # use iceberg::scan::RESERVED_COL_NAME_FILE; -/// # use iceberg::table::Table; -/// # async fn example() -> iceberg::Result<()> { -/// # let table: Table = todo!(); -/// // Select regular columns along with the file path -/// let scan = table -/// .scan() -/// .select(["id", "name", RESERVED_COL_NAME_FILE]) -/// .build()?; -/// # Ok(()) -/// # } -/// ``` -pub const RESERVED_COL_NAME_FILE: &str = RESERVED_COL_NAME_FILE_INTERNAL; - /// A stream of arrow [`RecordBatch`]es. 
pub type ArrowRecordBatchStream = BoxStream<'static, Result>; @@ -246,7 +222,7 @@ impl<'a> TableScanBuilder<'a> { if let Some(column_names) = self.column_names.as_ref() { for column_name in column_names { // Skip reserved columns that don't exist in the schema - if column_name == RESERVED_COL_NAME_FILE_INTERNAL { + if column_name == RESERVED_COL_NAME_FILE { continue; } if schema.field_by_name(column_name).is_none() { @@ -270,7 +246,7 @@ impl<'a> TableScanBuilder<'a> { for column_name in column_names.iter() { // Handle special reserved column "_file" - if column_name == RESERVED_COL_NAME_FILE_INTERNAL { + if column_name == RESERVED_COL_NAME_FILE { field_ids.push(RESERVED_FIELD_ID_FILE); continue; } From 060b45d24cdba620b605be00b1955f2636412e9b Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Wed, 12 Nov 2025 09:31:37 +0100 Subject: [PATCH 16/39] Add metadata column helper functions --- crates/iceberg/src/arrow/reader.rs | 4 +- .../src/arrow/record_batch_transformer.rs | 4 +- crates/iceberg/src/metadata_columns.rs | 47 +++++++++++++++---- crates/iceberg/src/scan/mod.rs | 13 ++--- 4 files changed, 49 insertions(+), 19 deletions(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index f482a9f6a9..a59bd6aba4 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -54,7 +54,7 @@ use crate::expr::visitors::page_index_evaluator::PageIndexEvaluator; use crate::expr::visitors::row_group_metrics_evaluator::RowGroupMetricsEvaluator; use crate::expr::{BoundPredicate, BoundReference}; use crate::io::{FileIO, FileMetadata, FileRead}; -use crate::metadata_columns::{RESERVED_FIELD_ID_FILE, is_reserved_field}; +use crate::metadata_columns::{RESERVED_FIELD_ID_FILE, is_metadata_field}; use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskStream}; use crate::spec::{Datum, NestedField, PrimitiveLiteral, PrimitiveType, Schema, Type}; use crate::utils::available_parallelism; @@ -225,7 +225,7 @@ impl ArrowReader { let project_field_ids_without_reserved: Vec = task .project_field_ids .iter() - .filter(|&&id| !is_reserved_field(id)) + .filter(|&&id| !is_metadata_field(id)) .copied() .collect(); // so we must use position-based projection instead of field-ID matching diff --git a/crates/iceberg/src/arrow/record_batch_transformer.rs b/crates/iceberg/src/arrow/record_batch_transformer.rs index 63f1142bab..4c6386f912 100644 --- a/crates/iceberg/src/arrow/record_batch_transformer.rs +++ b/crates/iceberg/src/arrow/record_batch_transformer.rs @@ -30,7 +30,7 @@ use arrow_schema::{ use parquet::arrow::PARQUET_FIELD_ID_META_KEY; use crate::arrow::schema_to_arrow_schema; -use crate::metadata_columns::get_reserved_field_name; +use crate::metadata_columns::get_metadata_column_name; use crate::spec::{Literal, PrimitiveLiteral, Schema as IcebergSchema}; use crate::{Error, ErrorKind, Result}; @@ -217,7 +217,7 @@ impl RecordBatchTransformer { if let Some(constant_value) = constants_map.get(field_id) { // Create a field for the virtual column based on the constant type let arrow_type = Self::primitive_literal_to_arrow_type(constant_value)?; - let field_name = get_reserved_field_name(*field_id)?; + let field_name = get_metadata_column_name(*field_id)?; Ok(Arc::new( Field::new(field_name, arrow_type, false).with_metadata(HashMap::from([( PARQUET_FIELD_ID_META_KEY.to_string(), diff --git a/crates/iceberg/src/metadata_columns.rs b/crates/iceberg/src/metadata_columns.rs index c03727bfbc..2091dbd304 100644 --- a/crates/iceberg/src/metadata_columns.rs 
+++ b/crates/iceberg/src/metadata_columns.rs @@ -30,31 +30,60 @@ pub const RESERVED_FIELD_ID_FILE: i32 = 2147483646; /// Reserved column name for the file path metadata column pub const RESERVED_COL_NAME_FILE: &str = "_file"; -/// Returns the field name for a reserved field ID. +/// Returns the column name for a metadata field ID. /// /// # Arguments -/// * `field_id` - The reserved field ID +/// * `field_id` - The metadata field ID /// /// # Returns -/// The name of the reserved field, or an error if the field ID is not recognized -pub fn get_reserved_field_name(field_id: i32) -> Result<&'static str> { +/// The name of the metadata column, or an error if the field ID is not recognized +pub fn get_metadata_column_name(field_id: i32) -> Result<&'static str> { match field_id { RESERVED_FIELD_ID_FILE => Ok(RESERVED_COL_NAME_FILE), _ => Err(Error::new( ErrorKind::Unexpected, - format!("Unknown reserved field ID: {field_id}"), + format!("Unknown metadata field ID: {field_id}"), )), } } -/// Checks if a field ID is a reserved (virtual/metadata) field. +/// Returns the field ID for a metadata column name. +/// +/// # Arguments +/// * `column_name` - The metadata column name +/// +/// # Returns +/// The field ID of the metadata column, or an error if the column name is not recognized +pub fn get_metadata_field_id(column_name: &str) -> Result { + match column_name { + RESERVED_COL_NAME_FILE => Ok(RESERVED_FIELD_ID_FILE), + _ => Err(Error::new( + ErrorKind::Unexpected, + format!("Unknown metadata column name: {column_name}"), + )), + } +} + +/// Checks if a field ID is a metadata field. /// /// # Arguments /// * `field_id` - The field ID to check /// /// # Returns -/// `true` if the field ID is reserved, `false` otherwise -pub fn is_reserved_field(field_id: i32) -> bool { +/// `true` if the field ID is a metadata field, `false` otherwise +pub fn is_metadata_field(field_id: i32) -> bool { field_id == RESERVED_FIELD_ID_FILE - // Additional reserved fields can be checked here in the future + // Additional metadata fields can be checked here in the future +} + +/// Checks if a column name is a metadata column. 
+/// +/// # Arguments +/// * `column_name` - The column name to check +/// +/// # Returns +/// `true` if the column name is a metadata column, `false` otherwise +pub fn is_metadata_column_name(column_name: &str) -> bool { + column_name == RESERVED_COL_NAME_FILE + // Additional metadata column names can be checked here in the future } diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 1202d566f7..ae53605317 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -36,7 +36,7 @@ use crate::delete_file_index::DeleteFileIndex; use crate::expr::visitors::inclusive_metrics_evaluator::InclusiveMetricsEvaluator; use crate::expr::{Bind, BoundPredicate, Predicate}; use crate::io::FileIO; -use crate::metadata_columns::{RESERVED_COL_NAME_FILE, RESERVED_FIELD_ID_FILE}; +use crate::metadata_columns::{get_metadata_field_id, is_metadata_column_name}; use crate::runtime::spawn; use crate::spec::{DataContentType, SnapshotRef}; use crate::table::Table; @@ -222,7 +222,7 @@ impl<'a> TableScanBuilder<'a> { if let Some(column_names) = self.column_names.as_ref() { for column_name in column_names { // Skip reserved columns that don't exist in the schema - if column_name == RESERVED_COL_NAME_FILE { + if is_metadata_column_name(column_name) { continue; } if schema.field_by_name(column_name).is_none() { @@ -245,9 +245,9 @@ impl<'a> TableScanBuilder<'a> { }); for column_name in column_names.iter() { - // Handle special reserved column "_file" - if column_name == RESERVED_COL_NAME_FILE { - field_ids.push(RESERVED_FIELD_ID_FILE); + // Handle metadata columns (like "_file") + if is_metadata_column_name(column_name) { + field_ids.push(get_metadata_field_id(column_name)?); continue; } @@ -588,7 +588,8 @@ pub mod tests { use crate::arrow::ArrowReaderBuilder; use crate::expr::{BoundPredicate, Reference}; use crate::io::{FileIO, OutputFile}; - use crate::scan::{FileScanTask, RESERVED_COL_NAME_FILE}; + use crate::metadata_columns::RESERVED_COL_NAME_FILE; + use crate::scan::FileScanTask; use crate::spec::{ DataContentType, DataFileBuilder, DataFileFormat, Datum, Literal, ManifestEntry, ManifestListWriter, ManifestStatus, ManifestWriterBuilder, NestedField, PartitionSpec, From 8572dae2f4b99670c7f8785e77d3741f83c6c77c Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Wed, 12 Nov 2025 11:16:47 +0100 Subject: [PATCH 17/39] Store fields instead of constants --- crates/iceberg/src/arrow/reader.rs | 2 +- .../src/arrow/record_batch_transformer.rs | 47 ++++++++++--------- crates/iceberg/src/metadata_columns.rs | 3 +- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index a59bd6aba4..b22577f85f 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -248,7 +248,7 @@ impl ArrowReader { .with_constant( RESERVED_FIELD_ID_FILE, PrimitiveLiteral::String(task.data_file_path.clone()), - ); + )?; if let Some(batch_size) = batch_size { record_batch_stream_builder = record_batch_stream_builder.with_batch_size(batch_size); diff --git a/crates/iceberg/src/arrow/record_batch_transformer.rs b/crates/iceberg/src/arrow/record_batch_transformer.rs index 4c6386f912..01bfb6287d 100644 --- a/crates/iceberg/src/arrow/record_batch_transformer.rs +++ b/crates/iceberg/src/arrow/record_batch_transformer.rs @@ -113,8 +113,9 @@ enum SchemaComparison { pub(crate) struct RecordBatchTransformer { snapshot_schema: Arc, projected_iceberg_field_ids: Vec, - // Map from field ID to constant 
value for virtual/metadata fields - constants_map: HashMap<i32, PrimitiveLiteral>, + // Pre-computed constant field information: field_id -> (arrow_type, value) + // Avoids duplicate lookups and type conversions during batch processing + constant_fields: HashMap<i32, (DataType, PrimitiveLiteral)>, // BatchTransform gets lazily constructed based on the schema of // the first RecordBatch we receive from the file @@ -133,7 +134,7 @@ impl RecordBatchTransformer { Self { snapshot_schema, projected_iceberg_field_ids, - constants_map: HashMap::new(), + constant_fields: HashMap::new(), batch_transform: None, } } @@ -144,9 +145,10 @@ impl RecordBatchTransformer { /// # Arguments /// * `field_id` - The field ID to associate with the constant /// * `value` - The constant value for this field - pub(crate) fn with_constant(mut self, field_id: i32, value: PrimitiveLiteral) -> Self { - self.constants_map.insert(field_id, value); - self + pub(crate) fn with_constant(mut self, field_id: i32, value: PrimitiveLiteral) -> Result<Self> { + let arrow_type = Self::primitive_literal_to_arrow_type(&value)?; + self.constant_fields.insert(field_id, (arrow_type, value)); + Ok(self) } pub(crate) fn process_record_batch( @@ -183,7 +185,7 @@ impl RecordBatchTransformer { record_batch.schema_ref(), self.snapshot_schema.as_ref(), &self.projected_iceberg_field_ids, - &self.constants_map, + &self.constant_fields, )?); self.process_record_batch(record_batch)? @@ -202,7 +204,7 @@ impl RecordBatchTransformer { source_schema: &ArrowSchemaRef, snapshot_schema: &IcebergSchema, projected_iceberg_field_ids: &[i32], - constants_map: &HashMap<i32, PrimitiveLiteral>, + constant_fields: &HashMap<i32, (DataType, PrimitiveLiteral)>, ) -> Result<BatchTransform> { let mapped_unprojected_arrow_schema = Arc::new(schema_to_arrow_schema(snapshot_schema)?); let field_id_to_mapped_schema_map = @@ -213,16 +215,17 @@ impl RecordBatchTransformer { let fields: Result<Vec<FieldRef>> = projected_iceberg_field_ids .iter() .map(|field_id| { - // Check if this is a constant/virtual field - if let Some(constant_value) = constants_map.get(field_id) { - // Create a field for the virtual column based on the constant type - let arrow_type = Self::primitive_literal_to_arrow_type(constant_value)?; + // Check if this is a constant/virtual field (pre-computed) + if let Some((arrow_type, _)) = constant_fields.get(field_id) { + // Create a field for the virtual column let field_name = get_metadata_column_name(*field_id)?; Ok(Arc::new( - Field::new(field_name, arrow_type, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - field_id.to_string(), - )])), + Field::new(field_name, arrow_type.clone(), false).with_metadata( + HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + field_id.to_string(), + )]), + ), )) } else { Ok(field_id_to_mapped_schema_map @@ -245,7 +248,7 @@ impl RecordBatchTransformer { snapshot_schema, projected_iceberg_field_ids, field_id_to_mapped_schema_map, - constants_map, + constant_fields, )?, target_schema, }), @@ -302,18 +305,18 @@ impl RecordBatchTransformer { snapshot_schema: &IcebergSchema, projected_iceberg_field_ids: &[i32], field_id_to_mapped_schema_map: HashMap<i32, (FieldRef, usize)>, - constants_map: &HashMap<i32, PrimitiveLiteral>, + constant_fields: &HashMap<i32, (DataType, PrimitiveLiteral)>, ) -> Result<Vec<ColumnSource>> { let field_id_to_source_schema_map = Self::build_field_id_to_arrow_schema_map(source_schema)?; projected_iceberg_field_ids.iter().map(|field_id|{ - // Check if this is a constant/virtual field first - if let Some(constant_value) = constants_map.get(field_id) { + // Check if this is a constant/virtual field (pre-computed) + if let Some((arrow_type, value)) = constant_fields.get(field_id) { // This is a virtual field - add it
with the constant value return Ok(ColumnSource::Add { - value: Some(constant_value.clone()), - target_type: Self::primitive_literal_to_arrow_type(constant_value)?, + value: Some(value.clone()), + target_type: arrow_type.clone(), }); } diff --git a/crates/iceberg/src/metadata_columns.rs b/crates/iceberg/src/metadata_columns.rs index 2091dbd304..435be44d17 100644 --- a/crates/iceberg/src/metadata_columns.rs +++ b/crates/iceberg/src/metadata_columns.rs @@ -84,6 +84,5 @@ pub fn is_metadata_field(field_id: i32) -> bool { /// # Returns /// `true` if the column name is a metadata column, `false` otherwise pub fn is_metadata_column_name(column_name: &str) -> bool { - column_name == RESERVED_COL_NAME_FILE - // Additional metadata column names can be checked here in the future + get_metadata_field_id(column_name).is_ok() } From f273add29ee44d033d037871bda69d441dbc6348 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Wed, 12 Nov 2025 11:18:36 +0100 Subject: [PATCH 18/39] Add comment --- crates/iceberg/src/arrow/record_batch_transformer.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/iceberg/src/arrow/record_batch_transformer.rs b/crates/iceberg/src/arrow/record_batch_transformer.rs index 01bfb6287d..67535f58b7 100644 --- a/crates/iceberg/src/arrow/record_batch_transformer.rs +++ b/crates/iceberg/src/arrow/record_batch_transformer.rs @@ -531,6 +531,8 @@ impl RecordBatchTransformer { PrimitiveLiteral::String(_) => { // Use Run-End Encoding for constant strings (memory efficient) let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); + // Note that this is nullable, as Arrow expects this when building the + // final Arrow schema. let values_field = Arc::new(Field::new("values", DataType::Utf8, true)); DataType::RunEndEncoded(run_ends_field, values_field) } From 5aa92ae618d566f747ac20d06df9a82210845f82 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Wed, 12 Nov 2025 11:19:20 +0100 Subject: [PATCH 19/39] Adapt comment --- crates/iceberg/src/metadata_columns.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iceberg/src/metadata_columns.rs b/crates/iceberg/src/metadata_columns.rs index 435be44d17..e24abdb3c5 100644 --- a/crates/iceberg/src/metadata_columns.rs +++ b/crates/iceberg/src/metadata_columns.rs @@ -70,7 +70,7 @@ pub fn get_metadata_field_id(column_name: &str) -> Result { /// * `field_id` - The field ID to check /// /// # Returns -/// `true` if the field ID is a metadata field, `false` otherwise +/// `true` if the field ID is a (currently supported) metadata field, `false` otherwise pub fn is_metadata_field(field_id: i32) -> bool { field_id == RESERVED_FIELD_ID_FILE // Additional metadata fields can be checked here in the future From c05b8860858ebc44c6f81632622c84263519356e Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Wed, 12 Nov 2025 11:25:42 +0100 Subject: [PATCH 20/39] . 
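
A minimal sketch of the constraint the updated comment documents, assuming arrow-rs's `RunArray::try_new` behaviour; the `_file` name matches the reserved column, but the path string below is illustrative only:

    use std::sync::Arc;

    use arrow_array::{Array, Int32Array, RunArray, StringArray};
    use arrow_schema::{DataType, Field};

    fn main() {
        // One run covering three logical rows of the same constant string.
        let run_ends = Int32Array::from(vec![3]);
        let values = StringArray::from(vec!["s3://bucket/data/file.parquet"]);
        let ree = RunArray::try_new(&run_ends, &values).unwrap();

        // arrow-rs reports the inner "values" child of the resulting
        // RunEndEncoded data type as nullable ...
        if let DataType::RunEndEncoded(_, values_field) = ree.data_type() {
            assert!(values_field.is_nullable());
        }

        // ... so a hand-built schema for the same column has to declare the
        // values child as nullable too, otherwise RecordBatch construction
        // rejects the column for a data type mismatch.
        let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false));
        let values_field = Arc::new(Field::new("values", DataType::Utf8, true));
        let _file_field = Field::new(
            "_file",
            DataType::RunEndEncoded(run_ends_field, values_field),
            false,
        );
    }
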
--- crates/iceberg/src/arrow/record_batch_transformer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iceberg/src/arrow/record_batch_transformer.rs b/crates/iceberg/src/arrow/record_batch_transformer.rs index 67535f58b7..e3196f8a69 100644 --- a/crates/iceberg/src/arrow/record_batch_transformer.rs +++ b/crates/iceberg/src/arrow/record_batch_transformer.rs @@ -532,7 +532,7 @@ impl RecordBatchTransformer { // Use Run-End Encoding for constant strings (memory efficient) let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); // Note that this is nullable, as Arrow expects this when building the - // final Arrow schema. + // final Arrow schema with `RunArray::try_new`. let values_field = Arc::new(Field::new("values", DataType::Utf8, true)); DataType::RunEndEncoded(run_ends_field, values_field) } From 33bb0ad5300192af74fa1f0e60df68ebbba93d54 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Wed, 12 Nov 2025 11:45:50 +0100 Subject: [PATCH 21/39] Adapt error message --- crates/iceberg/src/metadata_columns.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/iceberg/src/metadata_columns.rs b/crates/iceberg/src/metadata_columns.rs index e24abdb3c5..51e430d3b2 100644 --- a/crates/iceberg/src/metadata_columns.rs +++ b/crates/iceberg/src/metadata_columns.rs @@ -42,7 +42,7 @@ pub fn get_metadata_column_name(field_id: i32) -> Result<&'static str> { RESERVED_FIELD_ID_FILE => Ok(RESERVED_COL_NAME_FILE), _ => Err(Error::new( ErrorKind::Unexpected, - format!("Unknown metadata field ID: {field_id}"), + format!("Unknown/unsupported metadata field ID: {field_id}"), )), } } @@ -59,7 +59,7 @@ pub fn get_metadata_field_id(column_name: &str) -> Result { RESERVED_COL_NAME_FILE => Ok(RESERVED_FIELD_ID_FILE), _ => Err(Error::new( ErrorKind::Unexpected, - format!("Unknown metadata column name: {column_name}"), + format!("Unknown/unsupported metadata column name: {column_name}"), )), } } From 42167ffec74d048d9cf62822138af91e91947cf3 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Wed, 12 Nov 2025 12:05:52 +0100 Subject: [PATCH 22/39] Consider field_id range --- crates/iceberg/src/metadata_columns.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/crates/iceberg/src/metadata_columns.rs b/crates/iceberg/src/metadata_columns.rs index 51e430d3b2..c69af33d29 100644 --- a/crates/iceberg/src/metadata_columns.rs +++ b/crates/iceberg/src/metadata_columns.rs @@ -40,10 +40,19 @@ pub const RESERVED_COL_NAME_FILE: &str = "_file"; pub fn get_metadata_column_name(field_id: i32) -> Result<&'static str> { match field_id { RESERVED_FIELD_ID_FILE => Ok(RESERVED_COL_NAME_FILE), - _ => Err(Error::new( - ErrorKind::Unexpected, - format!("Unknown/unsupported metadata field ID: {field_id}"), - )), + _ => { + if field_id > 2147483447 { + Err(Error::new( + ErrorKind::Unexpected, + format!("Unsupported metadata field ID: {field_id}"), + )) + } else { + Err(Error::new( + ErrorKind::Unexpected, + format!("Field ID {field_id} is not a metadata field"), + )) + } + } } } From 83443aa8e5a96871c28c87ceeaeb271f726567b9 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Fri, 14 Nov 2025 09:13:35 +0100 Subject: [PATCH 23/39] Use REE encoding in record batch transformer --- crates/iceberg/src/arrow/reader.rs | 74 +- .../src/arrow/record_batch_transformer.rs | 641 ++++++++++-------- crates/iceberg/src/metadata_columns.rs | 68 +- 3 files changed, 458 insertions(+), 325 deletions(-) diff --git a/crates/iceberg/src/arrow/reader.rs 
b/crates/iceberg/src/arrow/reader.rs index de6bd1ed59..0a8aa08064 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -1753,7 +1753,7 @@ mod tests { use std::sync::Arc; use arrow_array::cast::AsArray; - use arrow_array::{ArrayRef, LargeStringArray, RecordBatch, StringArray}; + use arrow_array::{ArrayRef, LargeStringArray, RecordBatch, RunArray, StringArray}; use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; use futures::TryStreamExt; use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; @@ -2569,14 +2569,24 @@ message schema { .as_primitive::(); assert_eq!(col_a.values(), &[1, 2, 3]); - // Column 'b' should be all NULLs (it didn't exist in the old file) - let col_b = batch - .column(1) - .as_primitive::(); - assert_eq!(col_b.null_count(), 3); - assert!(col_b.is_null(0)); - assert!(col_b.is_null(1)); - assert!(col_b.is_null(2)); + // Column 'b' should be all NULLs (it didn't exist in the old file, added with REE) + let col_b = batch.column(1); + // For REE array with null, check the values array + if let Some(run_array) = col_b + .as_any() + .downcast_ref::>() + { + let values = run_array.values(); + assert!( + values.is_null(0), + "REE values should contain null for added column" + ); + } else { + // Fallback to direct null check for simple arrays + assert!(col_b.is_null(0)); + assert!(col_b.is_null(1)); + assert!(col_b.is_null(2)); + } } /// Test for bug where position deletes in later row groups are not applied correctly. @@ -3456,11 +3466,23 @@ message schema { assert_eq!(age_array.value(0), 30); assert_eq!(age_array.value(1), 25); - // Verify missing column filled with NULLs - let city_array = batch.column(2).as_string::(); - assert_eq!(city_array.null_count(), 2); - assert!(city_array.is_null(0)); - assert!(city_array.is_null(1)); + // Verify missing column filled with NULLs (will be REE with null) + let city_col = batch.column(2); + if let Some(run_array) = city_col + .as_any() + .downcast_ref::>() + { + let values = run_array.values(); + assert!( + values.is_null(0), + "REE values should contain null for added column" + ); + } else { + let city_array = city_col.as_string::(); + assert_eq!(city_array.null_count(), 2); + assert!(city_array.is_null(0)); + assert!(city_array.is_null(1)); + } } /// Test reading Parquet files without field IDs that have multiple row groups. 
@@ -3777,13 +3799,23 @@ message schema { assert_eq!(result_col0.value(0), 1); assert_eq!(result_col0.value(1), 2); - // New column should be NULL (doesn't exist in old file) - let result_newcol = batch - .column(1) - .as_primitive::(); - assert_eq!(result_newcol.null_count(), 2); - assert!(result_newcol.is_null(0)); - assert!(result_newcol.is_null(1)); + // New column should be NULL (doesn't exist in old file, added with REE) + let newcol = batch.column(1); + if let Some(run_array) = newcol + .as_any() + .downcast_ref::>() + { + let values = run_array.values(); + assert!( + values.is_null(0), + "REE values should contain null for added column" + ); + } else { + let result_newcol = newcol.as_primitive::(); + assert_eq!(result_newcol.null_count(), 2); + assert!(result_newcol.is_null(0)); + assert!(result_newcol.is_null(1)); + } let result_col1 = batch .column(2) diff --git a/crates/iceberg/src/arrow/record_batch_transformer.rs b/crates/iceberg/src/arrow/record_batch_transformer.rs index 3554191cc5..90b5bddf7f 100644 --- a/crates/iceberg/src/arrow/record_batch_transformer.rs +++ b/crates/iceberg/src/arrow/record_batch_transformer.rs @@ -20,11 +20,8 @@ use std::sync::Arc; use arrow_array::{ Array as ArrowArray, ArrayRef, BinaryArray, BooleanArray, Date32Array, Float32Array, - Float64Array, Int32Array, Int64Array, NullArray, RecordBatch, RecordBatchOptions, RunArray, - StringArray, - StructArray, + Float64Array, Int32Array, Int64Array, RecordBatch, RecordBatchOptions, RunArray, StringArray, }; -use arrow_buffer::NullBuffer; use arrow_cast::cast; use arrow_schema::{ DataType, Field, FieldRef, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, SchemaRef, @@ -32,7 +29,7 @@ use arrow_schema::{ use parquet::arrow::PARQUET_FIELD_ID_META_KEY; use crate::arrow::schema_to_arrow_schema; -use crate::metadata_columns::get_metadata_column_name; +use crate::metadata_columns::get_metadata_field; use crate::spec::{ Literal, PartitionSpec, PrimitiveLiteral, Schema as IcebergSchema, Struct, Transform, }; @@ -155,7 +152,6 @@ pub(crate) struct RecordBatchTransformerBuilder { snapshot_schema: Arc, projected_iceberg_field_ids: Vec, constant_fields: HashMap, - field_id_to_mapped_schema_map: Option>, } impl RecordBatchTransformerBuilder { @@ -167,21 +163,9 @@ impl RecordBatchTransformerBuilder { snapshot_schema, projected_iceberg_field_ids: projected_iceberg_field_ids.to_vec(), constant_fields: HashMap::new(), - field_id_to_mapped_schema_map: None, } } - /// Lazily build and cache the field_id_to_mapped_schema_map - fn get_or_build_field_id_map(&mut self) -> Result<&HashMap> { - if self.field_id_to_mapped_schema_map.is_none() { - let mapped_arrow_schema = Arc::new(schema_to_arrow_schema(&self.snapshot_schema)?); - self.field_id_to_mapped_schema_map = Some( - RecordBatchTransformer::build_field_id_to_arrow_schema_map(&mapped_arrow_schema)?, - ); - } - Ok(self.field_id_to_mapped_schema_map.as_ref().unwrap()) - } - /// Add a constant value for a specific field ID. /// This is used for virtual/metadata fields like _file that have constant values per batch. 
/// @@ -207,17 +191,10 @@ impl RecordBatchTransformerBuilder { // Compute partition constants for identity-transformed fields let partition_constants = constants_map(&partition_spec, &partition_data); - // Ensure field_id_to_mapped_schema_map is built (needed for Arrow type lookup) - self.get_or_build_field_id_map()?; - - // Add partition constants to constant_fields (get Arrow types from schema) - // Safe to borrow field_id_to_mapped_schema_map again now - let field_map = self.field_id_to_mapped_schema_map.as_ref().unwrap(); + // Add partition constants to constant_fields (compute REE types from literals) for (field_id, value) in partition_constants { - if let Some((field, _)) = field_map.get(&field_id) { - let arrow_type = field.data_type().clone(); - self.constant_fields.insert(field_id, (arrow_type, value)); - } + let arrow_type = RecordBatchTransformer::primitive_literal_to_arrow_type(&value)?; + self.constant_fields.insert(field_id, (arrow_type, value)); } Ok(self) @@ -336,6 +313,8 @@ impl RecordBatchTransformer { let mapped_unprojected_arrow_schema = Arc::new(schema_to_arrow_schema(snapshot_schema)?); let field_id_to_mapped_schema_map = Self::build_field_id_to_arrow_schema_map(&mapped_unprojected_arrow_schema)?; + let field_id_to_source_schema_map = + Self::build_field_id_to_arrow_schema_map(source_schema)?; // Create a new arrow schema by selecting fields from mapped_unprojected, // in the order of the field ids in projected_iceberg_field_ids @@ -346,31 +325,46 @@ impl RecordBatchTransformer { if constant_fields.contains_key(field_id) { // For metadata/virtual fields (like _file), get name from metadata_columns // For partition fields, get name from schema (they exist in schema) - if let Ok(field_name) = get_metadata_column_name(*field_id) { - // This is a metadata/virtual field - let (arrow_type, _) = constant_fields.get(field_id).unwrap(); - Ok(Arc::new( - Field::new(field_name, arrow_type.clone(), false).with_metadata( - HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - field_id.to_string(), - )]), - ), - )) + if let Ok(field) = get_metadata_field(*field_id) { + // This is a metadata/virtual field - use the predefined field + Ok(field) } else { - // This is a partition field (exists in schema) - Ok(field_id_to_mapped_schema_map + // This is a partition constant field (exists in schema but uses constant value) + let field = &field_id_to_mapped_schema_map .get(field_id) .ok_or(Error::new(ErrorKind::Unexpected, "field not found"))? - .0 - .clone()) + .0; + let (arrow_type, _) = constant_fields.get(field_id).unwrap(); + // Use the REE type from constant_fields + let ree_field = + Field::new(field.name(), arrow_type.clone(), field.is_nullable()) + .with_metadata(field.metadata().clone()); + Ok(Arc::new(ree_field)) } } else { - Ok(field_id_to_mapped_schema_map + // Get field from schema + let field = &field_id_to_mapped_schema_map .get(field_id) .ok_or(Error::new(ErrorKind::Unexpected, "field not found"))? 
- .0 - .clone()) + .0; + + // Check if this field exists in the source - if not, it will be added with REE + if !field_id_to_source_schema_map.contains_key(field_id) { + // Field will be added - use REE type for the schema + let run_ends_field = + Arc::new(Field::new("run_ends", DataType::Int32, false)); + let values_field = + Arc::new(Field::new("values", field.data_type().clone(), true)); + let ree_field = Field::new( + field.name(), + DataType::RunEndEncoded(run_ends_field, values_field), + field.is_nullable(), + ) + .with_metadata(field.metadata().clone()); + Ok(Arc::new(ree_field)) + } else { + Ok(Arc::clone(field)) + } } }) .collect(); @@ -525,9 +519,15 @@ impl RecordBatchTransformer { None } }); + + // Always use REE for added columns (memory efficient) + let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); + let values_field = Arc::new(Field::new("values", target_type.clone(), true)); + let column_type = DataType::RunEndEncoded(run_ends_field, values_field); + ColumnSource::Add { value: default_value, - target_type: target_type.clone(), + target_type: column_type, } }; @@ -593,129 +593,160 @@ impl RecordBatchTransformer { prim_lit: &Option, num_rows: usize, ) -> Result { - Ok(match (target_type, prim_lit) { - (DataType::Boolean, Some(PrimitiveLiteral::Boolean(value))) => { - Arc::new(BooleanArray::from(vec![*value; num_rows])) - } - (DataType::Boolean, None) => { - let vals: Vec> = vec![None; num_rows]; - Arc::new(BooleanArray::from(vals)) - } - (DataType::Int32, Some(PrimitiveLiteral::Int(value))) => { - Arc::new(Int32Array::from(vec![*value; num_rows])) - } - (DataType::Int32, None) => { - let vals: Vec> = vec![None; num_rows]; - Arc::new(Int32Array::from(vals)) - } - (DataType::Date32, Some(PrimitiveLiteral::Int(value))) => { - Arc::new(Date32Array::from(vec![*value; num_rows])) + // All added columns use Run-End Encoding for memory efficiency + let DataType::RunEndEncoded(_, values_field) = target_type else { + return Err(Error::new( + ErrorKind::Unexpected, + format!( + "Expected RunEndEncoded type for added column, got: {}", + target_type + ), + )); + }; + + // Helper to create a Run-End Encoded array + let create_ree_array = |values_array: ArrayRef| -> Result { + let run_ends = if num_rows == 0 { + Int32Array::from(Vec::::new()) + } else { + Int32Array::from(vec![num_rows as i32]) + }; + Ok(Arc::new( + RunArray::try_new(&run_ends, &values_array).map_err(|e| { + Error::new( + ErrorKind::Unexpected, + "Failed to create RunArray for constant value", + ) + .with_source(e) + })?, + )) + }; + + // Create the values array based on the literal value + let values_array: ArrayRef = match (values_field.data_type(), prim_lit) { + (DataType::Boolean, Some(PrimitiveLiteral::Boolean(v))) => { + Arc::new(BooleanArray::from(vec![*v])) } - (DataType::Date32, None) => { - let vals: Vec> = vec![None; num_rows]; - Arc::new(Date32Array::from(vals)) + (DataType::Boolean, None) => Arc::new(BooleanArray::from(vec![Option::::None])), + (DataType::Int32, Some(PrimitiveLiteral::Int(v))) => { + Arc::new(Int32Array::from(vec![*v])) } - (DataType::Int64, Some(PrimitiveLiteral::Long(value))) => { - Arc::new(Int64Array::from(vec![*value; num_rows])) + (DataType::Int32, None) => Arc::new(Int32Array::from(vec![Option::::None])), + (DataType::Date32, Some(PrimitiveLiteral::Int(v))) => { + Arc::new(Date32Array::from(vec![*v])) } - (DataType::Int64, None) => { - let vals: Vec> = vec![None; num_rows]; - Arc::new(Int64Array::from(vals)) + (DataType::Date32, None) => 
Arc::new(Date32Array::from(vec![Option::::None])), + (DataType::Int64, Some(PrimitiveLiteral::Long(v))) => { + Arc::new(Int64Array::from(vec![*v])) } - (DataType::Float32, Some(PrimitiveLiteral::Float(value))) => { - Arc::new(Float32Array::from(vec![value.0; num_rows])) + (DataType::Int64, None) => Arc::new(Int64Array::from(vec![Option::::None])), + (DataType::Float32, Some(PrimitiveLiteral::Float(v))) => { + Arc::new(Float32Array::from(vec![v.0])) } - (DataType::Float32, None) => { - let vals: Vec> = vec![None; num_rows]; - Arc::new(Float32Array::from(vals)) + (DataType::Float32, None) => Arc::new(Float32Array::from(vec![Option::::None])), + (DataType::Float64, Some(PrimitiveLiteral::Double(v))) => { + Arc::new(Float64Array::from(vec![v.0])) } - (DataType::Float64, Some(PrimitiveLiteral::Double(value))) => { - Arc::new(Float64Array::from(vec![value.0; num_rows])) + (DataType::Float64, None) => Arc::new(Float64Array::from(vec![Option::::None])), + (DataType::Utf8, Some(PrimitiveLiteral::String(v))) => { + Arc::new(StringArray::from(vec![v.as_str()])) } - (DataType::Float64, None) => { - let vals: Vec> = vec![None; num_rows]; - Arc::new(Float64Array::from(vals)) + (DataType::Utf8, None) => Arc::new(StringArray::from(vec![Option::<&str>::None])), + (DataType::Binary, Some(PrimitiveLiteral::Binary(v))) => { + Arc::new(BinaryArray::from_vec(vec![v.as_slice()])) } - (DataType::RunEndEncoded(_, _), Some(PrimitiveLiteral::String(value))) => { - // Create Run-End Encoded array for constant string values (e.g., file paths) - // This is more memory-efficient than repeating the same value for every row - let run_ends = if num_rows == 0 { - Int32Array::from(Vec::::new()) - } else { - Int32Array::from(vec![num_rows as i32]) - }; - let values = if num_rows == 0 { - StringArray::from(Vec::<&str>::new()) - } else { - StringArray::from(vec![value.as_str()]) - }; - Arc::new(RunArray::try_new(&run_ends, &values).map_err(|e| { - Error::new( - ErrorKind::Unexpected, - "Failed to create RunArray for constant string", - ) - .with_source(e) - })?) + (DataType::Binary, None) => { + Arc::new(BinaryArray::from_opt_vec(vec![Option::<&[u8]>::None])) } - (DataType::Utf8, Some(PrimitiveLiteral::String(value))) => { - Arc::new(StringArray::from(vec![value.clone(); num_rows])) + (DataType::Decimal128(_, _), Some(PrimitiveLiteral::Int128(v))) => { + Arc::new(arrow_array::Decimal128Array::from(vec![{ *v }])) } - (DataType::Utf8, None) => { - let vals: Vec> = vec![None; num_rows]; - Arc::new(StringArray::from(vals)) + (DataType::Decimal128(_, _), Some(PrimitiveLiteral::UInt128(v))) => { + Arc::new(arrow_array::Decimal128Array::from(vec![*v as i128])) } - (DataType::Binary, Some(PrimitiveLiteral::Binary(value))) => { - Arc::new(BinaryArray::from_vec(vec![value; num_rows])) - } - (DataType::Binary, None) => { - let vals: Vec> = vec![None; num_rows]; - Arc::new(BinaryArray::from_opt_vec(vals)) + (DataType::Decimal128(_, _), None) => { + Arc::new(arrow_array::Decimal128Array::from(vec![ + Option::::None, + ])) } (DataType::Struct(fields), None) => { - // Create a StructArray filled with nulls. Per Iceberg spec, optional struct fields - // default to null when added to the schema. We defer non-null default struct values - // and leave them as not implemented yet. 
+ // Create a single-element StructArray with nulls let null_arrays: Vec = fields .iter() - .map(|field| Self::create_column(field.data_type(), &None, num_rows)) - .collect::>>()?; - - Arc::new(StructArray::new( + .map(|f| { + // Recursively create null arrays for struct fields + // For primitive fields in structs, use simple null arrays (not REE within struct) + match f.data_type() { + DataType::Boolean => { + Arc::new(BooleanArray::from(vec![Option::::None])) as ArrayRef + } + DataType::Int32 | DataType::Date32 => { + Arc::new(Int32Array::from(vec![Option::::None])) + } + DataType::Int64 => { + Arc::new(Int64Array::from(vec![Option::::None])) + } + DataType::Float32 => { + Arc::new(Float32Array::from(vec![Option::::None])) + } + DataType::Float64 => { + Arc::new(Float64Array::from(vec![Option::::None])) + } + DataType::Utf8 => { + Arc::new(StringArray::from(vec![Option::<&str>::None])) + } + DataType::Binary => { + Arc::new(BinaryArray::from_opt_vec(vec![Option::<&[u8]>::None])) + } + _ => panic!("Unsupported struct field type: {:?}", f.data_type()), + } + }) + .collect(); + Arc::new(arrow_array::StructArray::new( fields.clone(), null_arrays, - Some(NullBuffer::new_null(num_rows)), + Some(arrow_buffer::NullBuffer::new_null(1)), )) } - (DataType::Null, _) => Arc::new(NullArray::new(num_rows)), - (dt, _) => { + _ => { return Err(Error::new( ErrorKind::Unexpected, - format!("unexpected target column type {}", dt), + format!( + "Unsupported constant type combination: {:?} with {:?}", + values_field.data_type(), + prim_lit + ), )); } - }) + }; + + // Wrap in Run-End Encoding + create_ree_array(values_array) } /// Converts a PrimitiveLiteral to its corresponding Arrow DataType. - /// This is used for virtual fields to determine the Arrow type based on the constant value. + /// This is used for constant fields to determine the Arrow type. + /// For constant values, we use Run-End Encoding for all types to save memory. fn primitive_literal_to_arrow_type(literal: &PrimitiveLiteral) -> Result { + // Helper to create REE type with the given values type + // Note: values field is nullable as Arrow expects this when building the + // final Arrow schema with `RunArray::try_new`. + let make_ree = |values_type: DataType| -> DataType { + let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); + let values_field = Arc::new(Field::new("values", values_type, true)); + DataType::RunEndEncoded(run_ends_field, values_field) + }; + Ok(match literal { - PrimitiveLiteral::Boolean(_) => DataType::Boolean, - PrimitiveLiteral::Int(_) => DataType::Int32, - PrimitiveLiteral::Long(_) => DataType::Int64, - PrimitiveLiteral::Float(_) => DataType::Float32, - PrimitiveLiteral::Double(_) => DataType::Float64, - PrimitiveLiteral::String(_) => { - // Use Run-End Encoding for constant strings (memory efficient) - let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); - // Note that this is nullable, as Arrow expects this when building the - // final Arrow schema with `RunArray::try_new`. 
- let values_field = Arc::new(Field::new("values", DataType::Utf8, true)); - DataType::RunEndEncoded(run_ends_field, values_field) - } - PrimitiveLiteral::Binary(_) => DataType::Binary, - PrimitiveLiteral::Int128(_) => DataType::Decimal128(38, 0), - PrimitiveLiteral::UInt128(_) => DataType::Decimal128(38, 0), + PrimitiveLiteral::Boolean(_) => make_ree(DataType::Boolean), + PrimitiveLiteral::Int(_) => make_ree(DataType::Int32), + PrimitiveLiteral::Long(_) => make_ree(DataType::Int64), + PrimitiveLiteral::Float(_) => make_ree(DataType::Float32), + PrimitiveLiteral::Double(_) => make_ree(DataType::Float64), + PrimitiveLiteral::String(_) => make_ree(DataType::Utf8), + PrimitiveLiteral::Binary(_) => make_ree(DataType::Binary), + PrimitiveLiteral::Int128(_) => make_ree(DataType::Decimal128(38, 0)), + PrimitiveLiteral::UInt128(_) => make_ree(DataType::Decimal128(38, 0)), PrimitiveLiteral::AboveMax | PrimitiveLiteral::BelowMin => { return Err(Error::new( ErrorKind::Unexpected, @@ -732,8 +763,8 @@ mod test { use std::sync::Arc; use arrow_array::{ - Array, Date32Array, Float32Array, Float64Array, Int32Array, Int64Array, RecordBatch, - StringArray, + Array, ArrayRef, Date32Array, Float32Array, Float64Array, Int32Array, Int64Array, + RecordBatch, StringArray, }; use arrow_schema::{DataType, Field, Schema as ArrowSchema}; use parquet::arrow::PARQUET_FIELD_ID_META_KEY; @@ -743,6 +774,82 @@ mod test { }; use crate::spec::{Literal, NestedField, PrimitiveType, Schema, Struct, Type}; + /// Helper to extract string values from either StringArray or RunEndEncoded + /// Returns empty string for null values + fn get_string_value(array: &dyn Array, index: usize) -> String { + if let Some(string_array) = array.as_any().downcast_ref::() { + if string_array.is_null(index) { + String::new() + } else { + string_array.value(index).to_string() + } + } else if let Some(run_array) = array + .as_any() + .downcast_ref::>() + { + let values = run_array.values(); + let string_values = values + .as_any() + .downcast_ref::() + .expect("REE values should be StringArray"); + // For REE, all rows have the same value (index 0 in the values array) + if string_values.is_null(0) { + String::new() + } else { + string_values.value(0).to_string() + } + } else { + panic!("Expected StringArray or RunEndEncoded"); + } + } + + /// Helper to extract int values from either Int32Array or RunEndEncoded + fn get_int_value(array: &dyn Array, index: usize) -> i32 { + if let Some(int_array) = array.as_any().downcast_ref::() { + int_array.value(index) + } else if let Some(run_array) = array + .as_any() + .downcast_ref::>() + { + let values = run_array.values(); + let int_values = values + .as_any() + .downcast_ref::() + .expect("REE values should be Int32Array"); + int_values.value(0) + } else { + panic!("Expected Int32Array or RunEndEncoded"); + } + } + + /// Helper to check if value is null in either simple or REE array + fn is_null_value(array: &dyn Array, index: usize) -> bool { + if let Some(run_array) = array + .as_any() + .downcast_ref::>() + { + let values = run_array.values(); + values.is_null(0) // For REE, check the single value + } else { + array.is_null(index) + } + } + + /// Helper to create a RunEndEncoded StringArray for constant values + fn create_ree_string_array(value: Option<&str>, num_rows: usize) -> ArrayRef { + let run_ends = if num_rows == 0 { + Int32Array::from(Vec::::new()) + } else { + Int32Array::from(vec![num_rows as i32]) + }; + let values = if num_rows == 0 { + StringArray::from(Vec::>::new()) + } else { + 
StringArray::from(vec![value]) + }; + Arc::new(arrow_array::RunArray::try_new(&run_ends, &values).unwrap()) + } + #[test] fn build_field_id_to_source_schema_map_works() { let arrow_schema = arrow_schema_already_same_as_target(); @@ -838,30 +945,19 @@ mod test { assert_eq!(result.num_columns(), 3); assert_eq!(result.num_rows(), 3); - let id_column = result - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(id_column.values(), &[1, 2, 3]); + // Use helpers to handle both simple and REE arrays + assert_eq!(get_int_value(result.column(0).as_ref(), 0), 1); + assert_eq!(get_int_value(result.column(0).as_ref(), 1), 2); + assert_eq!(get_int_value(result.column(0).as_ref(), 2), 3); - let name_column = result - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(name_column.value(0), "Alice"); - assert_eq!(name_column.value(1), "Bob"); - assert_eq!(name_column.value(2), "Charlie"); + assert_eq!(get_string_value(result.column(1).as_ref(), 0), "Alice"); + assert_eq!(get_string_value(result.column(1).as_ref(), 1), "Bob"); + assert_eq!(get_string_value(result.column(1).as_ref(), 2), "Charlie"); - let date_column = result - .column(2) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(date_column.is_null(0)); - assert!(date_column.is_null(1)); - assert!(date_column.is_null(2)); + // date_col added with null default - will be REE with null + assert!(is_null_value(result.column(2).as_ref(), 0)); + assert!(is_null_value(result.column(2).as_ref(), 1)); + assert!(is_null_value(result.column(2).as_ref(), 2)); } #[test] @@ -896,7 +992,8 @@ mod test { let projected_iceberg_field_ids = [1, 2, 3]; let mut transformer = - RecordBatchTransformer::build(snapshot_schema, &projected_iceberg_field_ids); + RecordBatchTransformerBuilder::new(snapshot_schema, &projected_iceberg_field_ids) + .build(); let file_schema = Arc::new(ArrowSchema::new(vec![ simple_field("id", DataType::Int32, false, "1"), @@ -914,30 +1011,19 @@ mod test { assert_eq!(result.num_columns(), 3); assert_eq!(result.num_rows(), 3); - let id_column = result - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(id_column.values(), &[1, 2, 3]); + // Use helpers to handle both simple and REE arrays + assert_eq!(get_int_value(result.column(0).as_ref(), 0), 1); + assert_eq!(get_int_value(result.column(0).as_ref(), 1), 2); + assert_eq!(get_int_value(result.column(0).as_ref(), 2), 3); - let data_column = result - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(data_column.value(0), "a"); - assert_eq!(data_column.value(1), "b"); - assert_eq!(data_column.value(2), "c"); + assert_eq!(get_string_value(result.column(1).as_ref(), 0), "a"); + assert_eq!(get_string_value(result.column(1).as_ref(), 1), "b"); + assert_eq!(get_string_value(result.column(1).as_ref(), 2), "c"); - let struct_column = result - .column(2) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(struct_column.is_null(0)); - assert!(struct_column.is_null(1)); - assert!(struct_column.is_null(2)); + // Struct column added with null - will be REE + assert!(is_null_value(result.column(2).as_ref(), 0)); + assert!(is_null_value(result.column(2).as_ref(), 1)); + assert!(is_null_value(result.column(2).as_ref(), 2)); } pub fn source_record_batch() -> RecordBatch { @@ -977,10 +1063,17 @@ mod test { } pub fn expected_record_batch_migration_required() -> RecordBatch { - RecordBatch::try_new(arrow_schema_already_same_as_target(), vec![ - Arc::new(StringArray::from(Vec::>::from([ - None, None, None, - ]))), // a + // Build schema with 
REE type for added fields (a and f) + let schema = Arc::new(ArrowSchema::new(vec![ + ree_field("a", DataType::Utf8, true, "10"), // Added field - REE with null + simple_field("b", DataType::Int64, false, "11"), + simple_field("c", DataType::Float64, false, "12"), + simple_field("e", DataType::Utf8, true, "14"), + ree_field("f", DataType::Utf8, false, "15"), // Added field - REE with default + ])); + + RecordBatch::try_new(schema, vec![ + create_ree_string_array(None, 3), // a - REE with null (not in source) Arc::new(Int64Array::from(vec![Some(1001), Some(1002), Some(1003)])), // b Arc::new(Float64Array::from(vec![ Some(12.125), @@ -992,11 +1085,7 @@ mod test { Some("Iceberg"), Some("Rocks"), ])), // e (d skipped by projection) - Arc::new(StringArray::from(vec![ - Some("(╯°□°)╯"), - Some("(╯°□°)╯"), - Some("(╯°□°)╯"), - ])), // f + create_ree_string_array(Some("(╯°□°)╯"), 3), // f - REE for constant default ]) .unwrap() } @@ -1052,6 +1141,21 @@ mod test { )])) } + /// Helper to create a Field with RunEndEncoded type for constant columns + fn ree_field(name: &str, values_type: DataType, nullable: bool, field_id: &str) -> Field { + let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); + let values_field = Arc::new(Field::new("values", values_type, true)); + Field::new( + name, + DataType::RunEndEncoded(run_ends_field, values_field), + nullable, + ) + .with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + field_id.to_string(), + )])) + } + /// Test for add_files with Parquet files that have NO field IDs (Hive tables). /// /// This reproduces the scenario from Iceberg spec where: @@ -1131,33 +1235,14 @@ mod test { assert_eq!(result.num_columns(), 4); assert_eq!(result.num_rows(), 1); - let id_column = result - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(id_column.value(0), 1); - - let name_column = result - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(name_column.value(0), "John Doe"); - - let dept_column = result - .column(2) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(dept_column.value(0), "hr"); - - let subdept_column = result - .column(3) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(subdept_column.value(0), "communications"); + // Use helpers to handle both simple and REE arrays + assert_eq!(get_int_value(result.column(0).as_ref(), 0), 1); // id from initial_default - REE + assert_eq!(get_string_value(result.column(1).as_ref(), 0), "John Doe"); // name from Parquet + assert_eq!(get_string_value(result.column(2).as_ref(), 0), "hr"); // dept from initial_default - REE + assert_eq!( + get_string_value(result.column(3).as_ref(), 0), + "communications" + ); // subdept from Parquet } /// Test for bucket partitioning where source columns must be read from data files. 
@@ -1376,30 +1461,23 @@ mod test { assert_eq!(result.num_columns(), 3); assert_eq!(result.num_rows(), 2); - let id_column = result - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(id_column.value(0), 100); - assert_eq!(id_column.value(1), 200); + // Use helpers to handle both simple and REE arrays + assert_eq!(get_int_value(result.column(0).as_ref(), 0), 100); + assert_eq!(get_int_value(result.column(0).as_ref(), 1), 200); - let dept_column = result - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - // This value MUST come from partition metadata (constant) - assert_eq!(dept_column.value(0), "engineering"); - assert_eq!(dept_column.value(1), "engineering"); + // dept column comes from partition metadata (constant) - will be REE + assert_eq!( + get_string_value(result.column(1).as_ref(), 0), + "engineering" + ); + assert_eq!( + get_string_value(result.column(1).as_ref(), 1), + "engineering" + ); - let name_column = result - .column(2) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(name_column.value(0), "Alice"); - assert_eq!(name_column.value(1), "Bob"); + // name column comes from file + assert_eq!(get_string_value(result.column(2).as_ref(), 0), "Alice"); + assert_eq!(get_string_value(result.column(2).as_ref(), 1), "Bob"); } /// Test bucket partitioning with renamed source column. @@ -1599,48 +1677,37 @@ mod test { // Verify each column demonstrates the correct spec rule: // Normal case: id from Parquet by field ID - let id_column = result - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(id_column.value(0), 100); - assert_eq!(id_column.value(1), 200); + // Use helpers to handle both simple and REE arrays + assert_eq!(get_int_value(result.column(0).as_ref(), 0), 100); + assert_eq!(get_int_value(result.column(0).as_ref(), 1), 200); + + // Rule #1: dept from partition metadata (identity transform) - will be REE + assert_eq!( + get_string_value(result.column(1).as_ref(), 0), + "engineering" + ); + assert_eq!( + get_string_value(result.column(1).as_ref(), 1), + "engineering" + ); - // Rule #1: dept from partition metadata (identity transform) - let dept_column = result - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(dept_column.value(0), "engineering"); - assert_eq!(dept_column.value(1), "engineering"); + // Rule #2: data from Parquet via name mapping - will be regular array + assert_eq!(get_string_value(result.column(2).as_ref(), 0), "value1"); + assert_eq!(get_string_value(result.column(2).as_ref(), 1), "value2"); - // Rule #2: data from Parquet via name mapping - let data_column = result - .column(2) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(data_column.value(0), "value1"); - assert_eq!(data_column.value(1), "value2"); - - // Rule #3: category from initial_default - let category_column = result - .column(3) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(category_column.value(0), "default_category"); - assert_eq!(category_column.value(1), "default_category"); + // Rule #3: category from initial_default - will be REE + assert_eq!( + get_string_value(result.column(3).as_ref(), 0), + "default_category" + ); + assert_eq!( + get_string_value(result.column(3).as_ref(), 1), + "default_category" + ); - // Rule #4: notes is null (no default, not in Parquet, not in partition) - let notes_column = result - .column(4) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(notes_column.is_null(0)); - assert!(notes_column.is_null(1)); + // Rule #4: notes is null (no default, not in Parquet, not 
in partition) - will be REE with null + // For null REE arrays, we still use the helper which handles extraction + assert_eq!(get_string_value(result.column(4).as_ref(), 0), ""); + assert_eq!(get_string_value(result.column(4).as_ref(), 1), ""); } } diff --git a/crates/iceberg/src/metadata_columns.rs b/crates/iceberg/src/metadata_columns.rs index c69af33d29..ce48651c17 100644 --- a/crates/iceberg/src/metadata_columns.rs +++ b/crates/iceberg/src/metadata_columns.rs @@ -22,37 +22,71 @@ //! during reading. Examples include the _file column (file path) and future //! columns like partition values or row numbers. +use std::collections::HashMap; +use std::sync::Arc; + +use arrow_schema::{DataType, Field}; +use once_cell::sync::Lazy; +use parquet::arrow::PARQUET_FIELD_ID_META_KEY; + use crate::{Error, ErrorKind, Result}; /// Reserved field ID for the file path (_file) column per Iceberg spec -pub const RESERVED_FIELD_ID_FILE: i32 = 2147483646; +pub const RESERVED_FIELD_ID_FILE: i32 = i32::MAX - 1; /// Reserved column name for the file path metadata column pub const RESERVED_COL_NAME_FILE: &str = "_file"; -/// Returns the column name for a metadata field ID. +/// Lazy-initialized Arrow Field definition for the _file metadata column. +/// Uses Run-End Encoding for memory efficiency. +static FILE_PATH_FIELD: Lazy<Arc<Field>> = Lazy::new(|| { + let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); + let values_field = Arc::new(Field::new("values", DataType::Utf8, true)); + Arc::new( + Field::new( + RESERVED_COL_NAME_FILE, + DataType::RunEndEncoded(run_ends_field, values_field), + false, + ) + .with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + RESERVED_FIELD_ID_FILE.to_string(), + )])), + ) +}); + +/// Returns the Arrow Field definition for the _file metadata column. +/// +/// # Returns +/// A reference to the _file field definition (RunEndEncoded type) +pub fn file_path_field() -> &'static Arc<Field> { + &FILE_PATH_FIELD +} + +/// Returns the Arrow Field definition for a metadata field ID.
/// /// # Arguments /// * `field_id` - The metadata field ID /// /// # Returns -/// The name of the metadata column, or an error if the field ID is not recognized -pub fn get_metadata_column_name(field_id: i32) -> Result<&'static str> { +/// The Arrow Field definition for the metadata column, or an error if not a metadata field +pub fn get_metadata_field(field_id: i32) -> Result> { match field_id { - RESERVED_FIELD_ID_FILE => Ok(RESERVED_COL_NAME_FILE), - _ => { - if field_id > 2147483447 { - Err(Error::new( - ErrorKind::Unexpected, - format!("Unsupported metadata field ID: {field_id}"), - )) - } else { - Err(Error::new( - ErrorKind::Unexpected, - format!("Field ID {field_id} is not a metadata field"), - )) - } + RESERVED_FIELD_ID_FILE => Ok(Arc::clone(file_path_field())), + _ if is_metadata_field(field_id) => { + // Future metadata fields can be added here + Err(Error::new( + ErrorKind::Unexpected, + format!( + "Metadata field ID {} recognized but field definition not implemented", + field_id + ), + )) } + _ => Err(Error::new( + ErrorKind::Unexpected, + format!("Field ID {} is not a metadata field", field_id), + )), } } From 35aba12ae711d03a7ed443ba515359ddf0748c42 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Fri, 14 Nov 2025 09:42:43 +0100 Subject: [PATCH 24/39] Fix clippy errors --- crates/iceberg/src/arrow/record_batch_transformer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iceberg/src/arrow/record_batch_transformer.rs b/crates/iceberg/src/arrow/record_batch_transformer.rs index 90b5bddf7f..c205a9de71 100644 --- a/crates/iceberg/src/arrow/record_batch_transformer.rs +++ b/crates/iceberg/src/arrow/record_batch_transformer.rs @@ -763,7 +763,7 @@ mod test { use std::sync::Arc; use arrow_array::{ - Array, ArrayRef, Date32Array, Float32Array, Float64Array, Int32Array, Int64Array, + Array, ArrayRef, Float32Array, Float64Array, Int32Array, Int64Array, RecordBatch, StringArray, }; use arrow_schema::{DataType, Field, Schema as ArrowSchema}; From 830e462e021e483902567077a0a17f11fab84813 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Fri, 14 Nov 2025 10:03:18 +0100 Subject: [PATCH 25/39] Format --- crates/iceberg/src/arrow/record_batch_transformer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/iceberg/src/arrow/record_batch_transformer.rs b/crates/iceberg/src/arrow/record_batch_transformer.rs index c205a9de71..d0cd4c25d4 100644 --- a/crates/iceberg/src/arrow/record_batch_transformer.rs +++ b/crates/iceberg/src/arrow/record_batch_transformer.rs @@ -763,8 +763,8 @@ mod test { use std::sync::Arc; use arrow_array::{ - Array, ArrayRef, Float32Array, Float64Array, Int32Array, Int64Array, - RecordBatch, StringArray, + Array, ArrayRef, Float32Array, Float64Array, Int32Array, Int64Array, RecordBatch, + StringArray, }; use arrow_schema::{DataType, Field, Schema as ArrowSchema}; use parquet::arrow::PARQUET_FIELD_ID_META_KEY; From 4eb8a63b8987bc4109a44da93d570e5e6540e3a8 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Fri, 14 Nov 2025 15:49:30 +0100 Subject: [PATCH 26/39] Add `with_file_path_column` helper --- crates/iceberg/src/scan/mod.rs | 41 ++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 24c03b0b2c..df332a8dd2 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -125,6 +125,47 @@ impl<'a> TableScanBuilder<'a> { self } + /// Include the _file metadata column in the scan. 
+ /// + /// This is a convenience method that adds the _file column to the current selection. + /// If no columns are currently selected (select_all), this will select all columns plus _file. + /// If specific columns are selected, this adds _file to that selection. + /// + /// # Example + /// ```no_run + /// # use iceberg::table::Table; + /// # async fn example(table: Table) -> iceberg::Result<()> { + /// // Select id, name, and _file + /// let scan = table + /// .scan() + /// .select(["id", "name"]) + /// .with_file_path_column() + /// .build()?; + /// # Ok(()) + /// # } + /// ``` + pub fn with_file_path_column(mut self) -> Self { + use crate::metadata_columns::RESERVED_COL_NAME_FILE; + + let mut columns = self.column_names.unwrap_or_else(|| { + // No explicit selection - get all column names from schema + self.table + .metadata() + .current_schema() + .as_struct() + .fields() + .iter() + .map(|f| f.name.clone()) + .collect() + }); + + // Add _file column + columns.push(RESERVED_COL_NAME_FILE.to_string()); + + self.column_names = Some(columns); + self + } + /// Set the snapshot to scan. When not set, it uses current snapshot. pub fn snapshot_id(mut self, snapshot_id: i64) -> Self { self.snapshot_id = Some(snapshot_id); From 0b8f15b46646bb6f088b94a58a21c7867c316a80 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Mon, 17 Nov 2025 11:35:24 +0100 Subject: [PATCH 27/39] Rename field --- crates/iceberg/src/metadata_columns.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/iceberg/src/metadata_columns.rs b/crates/iceberg/src/metadata_columns.rs index ce48651c17..30e4b4f2b9 100644 --- a/crates/iceberg/src/metadata_columns.rs +++ b/crates/iceberg/src/metadata_columns.rs @@ -39,7 +39,7 @@ pub const RESERVED_COL_NAME_FILE: &str = "_file"; /// Lazy-initialized Arrow Field definition for the _file metadata column. /// Uses Run-End Encoding for memory efficiency. -static FILE_PATH_FIELD: Lazy> = Lazy::new(|| { +static FILE_FIELD: Lazy> = Lazy::new(|| { let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); let values_field = Arc::new(Field::new("values", DataType::Utf8, true)); Arc::new( @@ -59,8 +59,8 @@ static FILE_PATH_FIELD: Lazy> = Lazy::new(|| { /// /// # Returns /// A reference to the _file field definition (RunEndEncoded type) -pub fn file_path_field() -> &'static Arc { - &FILE_PATH_FIELD +pub fn file_field() -> &'static Arc { + &FILE_FIELD } /// Returns the Arrow Field definition for a metadata field ID. 
@@ -72,7 +72,7 @@ pub fn file_path_field() -> &'static Arc { /// The Arrow Field definition for the metadata column, or an error if not a metadata field pub fn get_metadata_field(field_id: i32) -> Result> { match field_id { - RESERVED_FIELD_ID_FILE => Ok(Arc::clone(file_path_field())), + RESERVED_FIELD_ID_FILE => Ok(Arc::clone(file_field())), _ if is_metadata_field(field_id) => { // Future metadata fields can be added here Err(Error::new( From fca14bd335db2c95becc78bd7b7be1e37ed31b3f Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Mon, 17 Nov 2025 12:13:09 +0100 Subject: [PATCH 28/39] Rename method --- crates/iceberg/src/scan/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index df332a8dd2..b719524a61 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -139,12 +139,12 @@ impl<'a> TableScanBuilder<'a> { /// let scan = table /// .scan() /// .select(["id", "name"]) - /// .with_file_path_column() + /// .with_file_column() /// .build()?; /// # Ok(()) /// # } /// ``` - pub fn with_file_path_column(mut self) -> Self { + pub fn with_file_column(mut self) -> Self { use crate::metadata_columns::RESERVED_COL_NAME_FILE; let mut columns = self.column_names.unwrap_or_else(|| { From b7da6d3eefdbaf3ee38c72fcd316db2fbc67d9d2 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Mon, 17 Nov 2025 12:15:41 +0100 Subject: [PATCH 29/39] Undo some changes --- .../src/arrow/record_batch_transformer.rs | 396 ++++++++++-------- 1 file changed, 232 insertions(+), 164 deletions(-) diff --git a/crates/iceberg/src/arrow/record_batch_transformer.rs b/crates/iceberg/src/arrow/record_batch_transformer.rs index d0cd4c25d4..385b1e6988 100644 --- a/crates/iceberg/src/arrow/record_batch_transformer.rs +++ b/crates/iceberg/src/arrow/record_batch_transformer.rs @@ -19,9 +19,11 @@ use std::collections::HashMap; use std::sync::Arc; use arrow_array::{ - Array as ArrowArray, ArrayRef, BinaryArray, BooleanArray, Date32Array, Float32Array, - Float64Array, Int32Array, Int64Array, RecordBatch, RecordBatchOptions, RunArray, StringArray, + Array as ArrowArray, ArrayRef, BinaryArray, BooleanArray, Date32Array, Decimal128Array, + Float32Array, Float64Array, Int32Array, Int64Array, NullArray, RecordBatch, RecordBatchOptions, + RunArray, StringArray, StructArray, }; +use arrow_buffer::NullBuffer; use arrow_cast::cast; use arrow_schema::{ DataType, Field, FieldRef, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, SchemaRef, @@ -313,8 +315,6 @@ impl RecordBatchTransformer { let mapped_unprojected_arrow_schema = Arc::new(schema_to_arrow_schema(snapshot_schema)?); let field_id_to_mapped_schema_map = Self::build_field_id_to_arrow_schema_map(&mapped_unprojected_arrow_schema)?; - let field_id_to_source_schema_map = - Self::build_field_id_to_arrow_schema_map(source_schema)?; // Create a new arrow schema by selecting fields from mapped_unprojected, // in the order of the field ids in projected_iceberg_field_ids @@ -335,36 +335,19 @@ impl RecordBatchTransformer { .ok_or(Error::new(ErrorKind::Unexpected, "field not found"))? 
.0; let (arrow_type, _) = constant_fields.get(field_id).unwrap(); - // Use the REE type from constant_fields - let ree_field = + // Use the type from constant_fields (REE for constants) + let constant_field = Field::new(field.name(), arrow_type.clone(), field.is_nullable()) .with_metadata(field.metadata().clone()); - Ok(Arc::new(ree_field)) + Ok(Arc::new(constant_field)) } } else { - // Get field from schema - let field = &field_id_to_mapped_schema_map + // Regular field - use schema as-is + Ok(field_id_to_mapped_schema_map .get(field_id) .ok_or(Error::new(ErrorKind::Unexpected, "field not found"))? - .0; - - // Check if this field exists in the source - if not, it will be added with REE - if !field_id_to_source_schema_map.contains_key(field_id) { - // Field will be added - use REE type for the schema - let run_ends_field = - Arc::new(Field::new("run_ends", DataType::Int32, false)); - let values_field = - Arc::new(Field::new("values", field.data_type().clone(), true)); - let ree_field = Field::new( - field.name(), - DataType::RunEndEncoded(run_ends_field, values_field), - field.is_nullable(), - ) - .with_metadata(field.metadata().clone()); - Ok(Arc::new(ree_field)) - } else { - Ok(Arc::clone(field)) - } + .0 + .clone()) } }) .collect(); @@ -520,14 +503,9 @@ impl RecordBatchTransformer { } }); - // Always use REE for added columns (memory efficient) - let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); - let values_field = Arc::new(Field::new("values", target_type.clone(), true)); - let column_type = DataType::RunEndEncoded(run_ends_field, values_field); - ColumnSource::Add { value: default_value, - target_type: column_type, + target_type: target_type.clone(), } }; @@ -593,135 +571,224 @@ impl RecordBatchTransformer { prim_lit: &Option, num_rows: usize, ) -> Result { - // All added columns use Run-End Encoding for memory efficiency - let DataType::RunEndEncoded(_, values_field) = target_type else { - return Err(Error::new( - ErrorKind::Unexpected, - format!( - "Expected RunEndEncoded type for added column, got: {}", - target_type - ), - )); - }; - - // Helper to create a Run-End Encoded array - let create_ree_array = |values_array: ArrayRef| -> Result { - let run_ends = if num_rows == 0 { - Int32Array::from(Vec::::new()) - } else { - Int32Array::from(vec![num_rows as i32]) + // Check if this is a RunEndEncoded type (for constant fields) + if let DataType::RunEndEncoded(_, values_field) = target_type { + // Helper to create a Run-End Encoded array + let create_ree_array = |values_array: ArrayRef| -> Result { + let run_ends = if num_rows == 0 { + Int32Array::from(Vec::::new()) + } else { + Int32Array::from(vec![num_rows as i32]) + }; + Ok(Arc::new( + RunArray::try_new(&run_ends, &values_array).map_err(|e| { + Error::new( + ErrorKind::Unexpected, + "Failed to create RunArray for constant value", + ) + .with_source(e) + })?, + )) }; - Ok(Arc::new( - RunArray::try_new(&run_ends, &values_array).map_err(|e| { - Error::new( - ErrorKind::Unexpected, - "Failed to create RunArray for constant value", - ) - .with_source(e) - })?, - )) - }; - // Create the values array based on the literal value - let values_array: ArrayRef = match (values_field.data_type(), prim_lit) { - (DataType::Boolean, Some(PrimitiveLiteral::Boolean(v))) => { - Arc::new(BooleanArray::from(vec![*v])) - } - (DataType::Boolean, None) => Arc::new(BooleanArray::from(vec![Option::::None])), - (DataType::Int32, Some(PrimitiveLiteral::Int(v))) => { - Arc::new(Int32Array::from(vec![*v])) - } - (DataType::Int32, 
None) => Arc::new(Int32Array::from(vec![Option::::None])), - (DataType::Date32, Some(PrimitiveLiteral::Int(v))) => { - Arc::new(Date32Array::from(vec![*v])) - } - (DataType::Date32, None) => Arc::new(Date32Array::from(vec![Option::::None])), - (DataType::Int64, Some(PrimitiveLiteral::Long(v))) => { - Arc::new(Int64Array::from(vec![*v])) - } - (DataType::Int64, None) => Arc::new(Int64Array::from(vec![Option::::None])), - (DataType::Float32, Some(PrimitiveLiteral::Float(v))) => { - Arc::new(Float32Array::from(vec![v.0])) - } - (DataType::Float32, None) => Arc::new(Float32Array::from(vec![Option::::None])), - (DataType::Float64, Some(PrimitiveLiteral::Double(v))) => { - Arc::new(Float64Array::from(vec![v.0])) - } - (DataType::Float64, None) => Arc::new(Float64Array::from(vec![Option::::None])), - (DataType::Utf8, Some(PrimitiveLiteral::String(v))) => { - Arc::new(StringArray::from(vec![v.as_str()])) - } - (DataType::Utf8, None) => Arc::new(StringArray::from(vec![Option::<&str>::None])), - (DataType::Binary, Some(PrimitiveLiteral::Binary(v))) => { - Arc::new(BinaryArray::from_vec(vec![v.as_slice()])) - } - (DataType::Binary, None) => { - Arc::new(BinaryArray::from_opt_vec(vec![Option::<&[u8]>::None])) - } - (DataType::Decimal128(_, _), Some(PrimitiveLiteral::Int128(v))) => { - Arc::new(arrow_array::Decimal128Array::from(vec![{ *v }])) - } - (DataType::Decimal128(_, _), Some(PrimitiveLiteral::UInt128(v))) => { - Arc::new(arrow_array::Decimal128Array::from(vec![*v as i128])) - } - (DataType::Decimal128(_, _), None) => { - Arc::new(arrow_array::Decimal128Array::from(vec![ - Option::::None, - ])) - } - (DataType::Struct(fields), None) => { - // Create a single-element StructArray with nulls - let null_arrays: Vec = fields - .iter() - .map(|f| { - // Recursively create null arrays for struct fields - // For primitive fields in structs, use simple null arrays (not REE within struct) - match f.data_type() { - DataType::Boolean => { - Arc::new(BooleanArray::from(vec![Option::::None])) as ArrayRef - } - DataType::Int32 | DataType::Date32 => { - Arc::new(Int32Array::from(vec![Option::::None])) - } - DataType::Int64 => { - Arc::new(Int64Array::from(vec![Option::::None])) - } - DataType::Float32 => { - Arc::new(Float32Array::from(vec![Option::::None])) - } - DataType::Float64 => { - Arc::new(Float64Array::from(vec![Option::::None])) - } - DataType::Utf8 => { - Arc::new(StringArray::from(vec![Option::<&str>::None])) - } - DataType::Binary => { - Arc::new(BinaryArray::from_opt_vec(vec![Option::<&[u8]>::None])) + // Create the values array based on the literal value + let values_array: ArrayRef = match (values_field.data_type(), prim_lit) { + (DataType::Boolean, Some(PrimitiveLiteral::Boolean(v))) => { + Arc::new(BooleanArray::from(vec![*v])) + } + (DataType::Boolean, None) => { + Arc::new(BooleanArray::from(vec![Option::::None])) + } + (DataType::Int32, Some(PrimitiveLiteral::Int(v))) => { + Arc::new(Int32Array::from(vec![*v])) + } + (DataType::Int32, None) => Arc::new(Int32Array::from(vec![Option::::None])), + (DataType::Date32, Some(PrimitiveLiteral::Int(v))) => { + Arc::new(Date32Array::from(vec![*v])) + } + (DataType::Date32, None) => Arc::new(Date32Array::from(vec![Option::::None])), + (DataType::Int64, Some(PrimitiveLiteral::Long(v))) => { + Arc::new(Int64Array::from(vec![*v])) + } + (DataType::Int64, None) => Arc::new(Int64Array::from(vec![Option::::None])), + (DataType::Float32, Some(PrimitiveLiteral::Float(v))) => { + Arc::new(Float32Array::from(vec![v.0])) + } + (DataType::Float32, None) => { + 
Arc::new(Float32Array::from(vec![Option::::None])) + } + (DataType::Float64, Some(PrimitiveLiteral::Double(v))) => { + Arc::new(Float64Array::from(vec![v.0])) + } + (DataType::Float64, None) => { + Arc::new(Float64Array::from(vec![Option::::None])) + } + (DataType::Utf8, Some(PrimitiveLiteral::String(v))) => { + Arc::new(StringArray::from(vec![v.as_str()])) + } + (DataType::Utf8, None) => Arc::new(StringArray::from(vec![Option::<&str>::None])), + (DataType::Binary, Some(PrimitiveLiteral::Binary(v))) => { + Arc::new(BinaryArray::from_vec(vec![v.as_slice()])) + } + (DataType::Binary, None) => { + Arc::new(BinaryArray::from_opt_vec(vec![Option::<&[u8]>::None])) + } + (DataType::Decimal128(_, _), Some(PrimitiveLiteral::Int128(v))) => { + Arc::new(arrow_array::Decimal128Array::from(vec![{ *v }])) + } + (DataType::Decimal128(_, _), Some(PrimitiveLiteral::UInt128(v))) => { + Arc::new(arrow_array::Decimal128Array::from(vec![*v as i128])) + } + (DataType::Decimal128(_, _), None) => { + Arc::new(arrow_array::Decimal128Array::from(vec![ + Option::::None, + ])) + } + (DataType::Struct(fields), None) => { + // Create a single-element StructArray with nulls + let null_arrays: Vec = fields + .iter() + .map(|f| { + // Recursively create null arrays for struct fields + // For primitive fields in structs, use simple null arrays (not REE within struct) + match f.data_type() { + DataType::Boolean => { + Arc::new(BooleanArray::from(vec![Option::::None])) + as ArrayRef + } + DataType::Int32 | DataType::Date32 => { + Arc::new(Int32Array::from(vec![Option::::None])) + } + DataType::Int64 => { + Arc::new(Int64Array::from(vec![Option::::None])) + } + DataType::Float32 => { + Arc::new(Float32Array::from(vec![Option::::None])) + } + DataType::Float64 => { + Arc::new(Float64Array::from(vec![Option::::None])) + } + DataType::Utf8 => { + Arc::new(StringArray::from(vec![Option::<&str>::None])) + } + DataType::Binary => { + Arc::new(BinaryArray::from_opt_vec(vec![Option::<&[u8]>::None])) + } + _ => panic!("Unsupported struct field type: {:?}", f.data_type()), } - _ => panic!("Unsupported struct field type: {:?}", f.data_type()), - } - }) - .collect(); - Arc::new(arrow_array::StructArray::new( - fields.clone(), - null_arrays, - Some(arrow_buffer::NullBuffer::new_null(1)), - )) - } - _ => { - return Err(Error::new( - ErrorKind::Unexpected, - format!( - "Unsupported constant type combination: {:?} with {:?}", - values_field.data_type(), - prim_lit - ), - )); - } - }; + }) + .collect(); + Arc::new(arrow_array::StructArray::new( + fields.clone(), + null_arrays, + Some(arrow_buffer::NullBuffer::new_null(1)), + )) + } + _ => { + return Err(Error::new( + ErrorKind::Unexpected, + format!( + "Unsupported constant type combination: {:?} with {:?}", + values_field.data_type(), + prim_lit + ), + )); + } + }; - // Wrap in Run-End Encoding - create_ree_array(values_array) + // Wrap in Run-End Encoding + create_ree_array(values_array) + } else { + // Non-REE type (simple arrays for non-constant fields) + Ok(match (target_type, prim_lit) { + (DataType::Boolean, Some(PrimitiveLiteral::Boolean(value))) => { + Arc::new(BooleanArray::from(vec![*value; num_rows])) + } + (DataType::Boolean, None) => { + let vals: Vec> = vec![None; num_rows]; + Arc::new(BooleanArray::from(vals)) + } + (DataType::Int32, Some(PrimitiveLiteral::Int(value))) => { + Arc::new(Int32Array::from(vec![*value; num_rows])) + } + (DataType::Int32, None) => { + let vals: Vec> = vec![None; num_rows]; + Arc::new(Int32Array::from(vals)) + } + (DataType::Date32, 
Some(PrimitiveLiteral::Int(value))) => { + Arc::new(Date32Array::from(vec![*value; num_rows])) + } + (DataType::Date32, None) => { + let vals: Vec> = vec![None; num_rows]; + Arc::new(Date32Array::from(vals)) + } + (DataType::Int64, Some(PrimitiveLiteral::Long(value))) => { + Arc::new(Int64Array::from(vec![*value; num_rows])) + } + (DataType::Int64, None) => { + let vals: Vec> = vec![None; num_rows]; + Arc::new(Int64Array::from(vals)) + } + (DataType::Float32, Some(PrimitiveLiteral::Float(value))) => { + Arc::new(Float32Array::from(vec![value.0; num_rows])) + } + (DataType::Float32, None) => { + let vals: Vec> = vec![None; num_rows]; + Arc::new(Float32Array::from(vals)) + } + (DataType::Float64, Some(PrimitiveLiteral::Double(value))) => { + Arc::new(Float64Array::from(vec![value.0; num_rows])) + } + (DataType::Float64, None) => { + let vals: Vec> = vec![None; num_rows]; + Arc::new(Float64Array::from(vals)) + } + (DataType::Utf8, Some(PrimitiveLiteral::String(value))) => { + Arc::new(StringArray::from(vec![value.clone(); num_rows])) + } + (DataType::Utf8, None) => { + let vals: Vec> = vec![None; num_rows]; + Arc::new(StringArray::from(vals)) + } + (DataType::Binary, Some(PrimitiveLiteral::Binary(value))) => { + Arc::new(BinaryArray::from_vec(vec![value; num_rows])) + } + (DataType::Binary, None) => { + let vals: Vec> = vec![None; num_rows]; + Arc::new(BinaryArray::from_opt_vec(vals)) + } + (DataType::Decimal128(_, _), Some(PrimitiveLiteral::Int128(value))) => { + Arc::new(Decimal128Array::from(vec![*value as i128; num_rows])) + } + (DataType::Decimal128(_, _), Some(PrimitiveLiteral::UInt128(value))) => { + Arc::new(Decimal128Array::from(vec![*value as i128; num_rows])) + } + (DataType::Decimal128(_, _), None) => { + let vals: Vec> = vec![None; num_rows]; + Arc::new(Decimal128Array::from(vals)) + } + (DataType::Struct(fields), None) => { + // Create a StructArray filled with nulls + let null_arrays: Vec = fields + .iter() + .map(|field| Self::create_column(field.data_type(), &None, num_rows)) + .collect::>>()?; + + Arc::new(StructArray::new( + fields.clone(), + null_arrays, + Some(NullBuffer::new_null(num_rows)), + )) + } + (DataType::Null, _) => Arc::new(NullArray::new(num_rows)), + (dt, _) => { + return Err(Error::new( + ErrorKind::Unexpected, + format!("unexpected target column type {}", dt), + )); + } + }) + } } /// Converts a PrimitiveLiteral to its corresponding Arrow DataType. 
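To make the constant branch above concrete, here is a small illustrative sketch (the helper name is made up) of the run-end encoded layout it produces for a non-empty batch: a single run end spanning all rows and a values child holding the constant once.

```rust
use std::sync::Arc;
use arrow_array::{Array, Int32Array, RunArray, StringArray};

// Illustrative only: a constant string column of `num_rows` logical rows,
// stored as one run (run_ends = [num_rows], values = [value]).
fn constant_string_column(value: &str, num_rows: usize) -> Arc<dyn Array> {
    let run_ends = Int32Array::from(vec![num_rows as i32]);
    let values = StringArray::from(vec![value]);
    Arc::new(RunArray::try_new(&run_ends, &values).expect("valid run-end encoded array"))
}
```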
@@ -1063,17 +1130,18 @@ mod test { } pub fn expected_record_batch_migration_required() -> RecordBatch { - // Build schema with REE type for added fields (a and f) + // Build schema - only constant fields (from constant_fields) use REE + // Field 'a' has no default, field 'f' has initial_default (not constant) let schema = Arc::new(ArrowSchema::new(vec![ - ree_field("a", DataType::Utf8, true, "10"), // Added field - REE with null + simple_field("a", DataType::Utf8, true, "10"), // Added with null - simple type simple_field("b", DataType::Int64, false, "11"), simple_field("c", DataType::Float64, false, "12"), simple_field("e", DataType::Utf8, true, "14"), - ree_field("f", DataType::Utf8, false, "15"), // Added field - REE with default + simple_field("f", DataType::Utf8, false, "15"), // Added with default - simple type ])); RecordBatch::try_new(schema, vec![ - create_ree_string_array(None, 3), // a - REE with null (not in source) + Arc::new(StringArray::from(vec![Option::::None; 3])), // a - simple with null Arc::new(Int64Array::from(vec![Some(1001), Some(1002), Some(1003)])), // b Arc::new(Float64Array::from(vec![ Some(12.125), @@ -1085,7 +1153,7 @@ mod test { Some("Iceberg"), Some("Rocks"), ])), // e (d skipped by projection) - create_ree_string_array(Some("(╯°□°)╯"), 3), // f - REE for constant default + Arc::new(StringArray::from(vec!["(╯°□°)╯"; 3])), // f - simple for default value ]) .unwrap() } From 7ebdf87ada888388dec12fdd1790661125842a61 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Mon, 17 Nov 2025 12:27:17 +0100 Subject: [PATCH 30/39] . --- .../src/arrow/record_batch_transformer.rs | 54 ++++--------------- 1 file changed, 10 insertions(+), 44 deletions(-) diff --git a/crates/iceberg/src/arrow/record_batch_transformer.rs b/crates/iceberg/src/arrow/record_batch_transformer.rs index 385b1e6988..c546e22d1b 100644 --- a/crates/iceberg/src/arrow/record_batch_transformer.rs +++ b/crates/iceberg/src/arrow/record_batch_transformer.rs @@ -758,7 +758,7 @@ impl RecordBatchTransformer { Arc::new(BinaryArray::from_opt_vec(vals)) } (DataType::Decimal128(_, _), Some(PrimitiveLiteral::Int128(value))) => { - Arc::new(Decimal128Array::from(vec![*value as i128; num_rows])) + Arc::new(Decimal128Array::from(vec![*value; num_rows])) } (DataType::Decimal128(_, _), Some(PrimitiveLiteral::UInt128(value))) => { Arc::new(Decimal128Array::from(vec![*value as i128; num_rows])) @@ -902,21 +902,6 @@ mod test { } } - /// Helper to create a RunEndEncoded StringArray for constant values - fn create_ree_string_array(value: Option<&str>, num_rows: usize) -> ArrayRef { - let run_ends = if num_rows == 0 { - Int32Array::from(Vec::::new()) - } else { - Int32Array::from(vec![num_rows as i32]) - }; - let values = if num_rows == 0 { - StringArray::from(Vec::>::new()) - } else { - StringArray::from(vec![value]) - }; - Arc::new(arrow_array::RunArray::try_new(&run_ends, &values).unwrap()) - } - #[test] fn build_field_id_to_source_schema_map_works() { let arrow_schema = arrow_schema_already_same_as_target(); @@ -1130,18 +1115,10 @@ mod test { } pub fn expected_record_batch_migration_required() -> RecordBatch { - // Build schema - only constant fields (from constant_fields) use REE - // Field 'a' has no default, field 'f' has initial_default (not constant) - let schema = Arc::new(ArrowSchema::new(vec![ - simple_field("a", DataType::Utf8, true, "10"), // Added with null - simple type - simple_field("b", DataType::Int64, false, "11"), - simple_field("c", DataType::Float64, false, "12"), - simple_field("e", DataType::Utf8, 
true, "14"), - simple_field("f", DataType::Utf8, false, "15"), // Added with default - simple type - ])); - - RecordBatch::try_new(schema, vec![ - Arc::new(StringArray::from(vec![Option::::None; 3])), // a - simple with null + RecordBatch::try_new(arrow_schema_already_same_as_target(), vec![ + Arc::new(StringArray::from(Vec::>::from([ + None, None, None, + ]))), // a Arc::new(Int64Array::from(vec![Some(1001), Some(1002), Some(1003)])), // b Arc::new(Float64Array::from(vec![ Some(12.125), @@ -1153,7 +1130,11 @@ mod test { Some("Iceberg"), Some("Rocks"), ])), // e (d skipped by projection) - Arc::new(StringArray::from(vec!["(╯°□°)╯"; 3])), // f - simple for default value + Arc::new(StringArray::from(vec![ + Some("(╯°□°)╯"), + Some("(╯°□°)╯"), + Some("(╯°□°)╯"), + ])), // f ]) .unwrap() } @@ -1209,21 +1190,6 @@ mod test { )])) } - /// Helper to create a Field with RunEndEncoded type for constant columns - fn ree_field(name: &str, values_type: DataType, nullable: bool, field_id: &str) -> Field { - let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); - let values_field = Arc::new(Field::new("values", values_type, true)); - Field::new( - name, - DataType::RunEndEncoded(run_ends_field, values_field), - nullable, - ) - .with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - field_id.to_string(), - )])) - } - /// Test for add_files with Parquet files that have NO field IDs (Hive tables). /// /// This reproduces the scenario from Iceberg spec where: From edbc72ada429bfa0ccff3598eadac6953e37de1e Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Mon, 17 Nov 2025 12:40:02 +0100 Subject: [PATCH 31/39] Re-refactor tests --- .../src/arrow/record_batch_transformer.rs | 124 +++++++++++------- 1 file changed, 76 insertions(+), 48 deletions(-) diff --git a/crates/iceberg/src/arrow/record_batch_transformer.rs b/crates/iceberg/src/arrow/record_batch_transformer.rs index c546e22d1b..8a3018b66a 100644 --- a/crates/iceberg/src/arrow/record_batch_transformer.rs +++ b/crates/iceberg/src/arrow/record_batch_transformer.rs @@ -830,7 +830,7 @@ mod test { use std::sync::Arc; use arrow_array::{ - Array, ArrayRef, Float32Array, Float64Array, Int32Array, Int64Array, RecordBatch, + Array, Date32Array, Float32Array, Float64Array, Int32Array, Int64Array, RecordBatch, StringArray, }; use arrow_schema::{DataType, Field, Schema as ArrowSchema}; @@ -889,19 +889,6 @@ mod test { } } - /// Helper to check if value is null in either simple or REE array - fn is_null_value(array: &dyn Array, index: usize) -> bool { - if let Some(run_array) = array - .as_any() - .downcast_ref::>() - { - let values = run_array.values(); - values.is_null(0) // For REE, check the single value - } else { - array.is_null(index) - } - } - #[test] fn build_field_id_to_source_schema_map_works() { let arrow_schema = arrow_schema_already_same_as_target(); @@ -997,19 +984,30 @@ mod test { assert_eq!(result.num_columns(), 3); assert_eq!(result.num_rows(), 3); - // Use helpers to handle both simple and REE arrays - assert_eq!(get_int_value(result.column(0).as_ref(), 0), 1); - assert_eq!(get_int_value(result.column(0).as_ref(), 1), 2); - assert_eq!(get_int_value(result.column(0).as_ref(), 2), 3); - - assert_eq!(get_string_value(result.column(1).as_ref(), 0), "Alice"); - assert_eq!(get_string_value(result.column(1).as_ref(), 1), "Bob"); - assert_eq!(get_string_value(result.column(1).as_ref(), 2), "Charlie"); - - // date_col added with null default - will be REE with null - assert!(is_null_value(result.column(2).as_ref(), 
0)); - assert!(is_null_value(result.column(2).as_ref(), 1)); - assert!(is_null_value(result.column(2).as_ref(), 2)); + let id_column = result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(id_column.values(), &[1, 2, 3]); + + let name_column = result + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(name_column.value(0), "Alice"); + assert_eq!(name_column.value(1), "Bob"); + assert_eq!(name_column.value(2), "Charlie"); + + let date_column = result + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(date_column.is_null(0)); + assert!(date_column.is_null(1)); + assert!(date_column.is_null(2)); } #[test] @@ -1063,19 +1061,30 @@ mod test { assert_eq!(result.num_columns(), 3); assert_eq!(result.num_rows(), 3); - // Use helpers to handle both simple and REE arrays - assert_eq!(get_int_value(result.column(0).as_ref(), 0), 1); - assert_eq!(get_int_value(result.column(0).as_ref(), 1), 2); - assert_eq!(get_int_value(result.column(0).as_ref(), 2), 3); - - assert_eq!(get_string_value(result.column(1).as_ref(), 0), "a"); - assert_eq!(get_string_value(result.column(1).as_ref(), 1), "b"); - assert_eq!(get_string_value(result.column(1).as_ref(), 2), "c"); - - // Struct column added with null - will be REE - assert!(is_null_value(result.column(2).as_ref(), 0)); - assert!(is_null_value(result.column(2).as_ref(), 1)); - assert!(is_null_value(result.column(2).as_ref(), 2)); + let id_column = result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(id_column.values(), &[1, 2, 3]); + + let data_column = result + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(data_column.value(0), "a"); + assert_eq!(data_column.value(1), "b"); + assert_eq!(data_column.value(2), "c"); + + let struct_column = result + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(struct_column.is_null(0)); + assert!(struct_column.is_null(1)); + assert!(struct_column.is_null(2)); } pub fn source_record_batch() -> RecordBatch { @@ -1269,14 +1278,33 @@ mod test { assert_eq!(result.num_columns(), 4); assert_eq!(result.num_rows(), 1); - // Use helpers to handle both simple and REE arrays - assert_eq!(get_int_value(result.column(0).as_ref(), 0), 1); // id from initial_default - REE - assert_eq!(get_string_value(result.column(1).as_ref(), 0), "John Doe"); // name from Parquet - assert_eq!(get_string_value(result.column(2).as_ref(), 0), "hr"); // dept from initial_default - REE - assert_eq!( - get_string_value(result.column(3).as_ref(), 0), - "communications" - ); // subdept from Parquet + let id_column = result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(id_column.value(0), 1); + + let name_column = result + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(name_column.value(0), "John Doe"); + + let dept_column = result + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(dept_column.value(0), "hr"); + + let subdept_column = result + .column(3) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(subdept_column.value(0), "communications"); } /// Test for bucket partitioning where source columns must be read from data files. 
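Because the restored assertions above downcast plain arrays directly, a run-end encoded column such as `_file` needs one extra step in tests. A hypothetical helper, assuming the column was produced by `add_file_path_column_ree` and therefore holds a single constant run:

```rust
use arrow_array::cast::AsArray;
use arrow_array::types::Int32Type;
use arrow_array::{Array, RunArray};

// Illustrative only: read the constant file path out of a REE `_file` column.
fn file_path_constant(column: &dyn Array) -> String {
    let run_array = column
        .as_any()
        .downcast_ref::<RunArray<Int32Type>>()
        .expect("_file column should be run-end encoded");
    // One constant run means the values child holds exactly one entry.
    run_array.values().as_string::<i32>().value(0).to_string()
}
```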
From 4a08ee601193ec2eadce7f9241f0574e670ce3ec Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Mon, 17 Nov 2025 12:50:46 +0100 Subject: [PATCH 32/39] Undo reader test changes --- crates/iceberg/src/arrow/reader.rs | 72 +++++++++--------------------- 1 file changed, 20 insertions(+), 52 deletions(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 0a8aa08064..21bcce029c 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -2569,24 +2569,14 @@ message schema { .as_primitive::(); assert_eq!(col_a.values(), &[1, 2, 3]); - // Column 'b' should be all NULLs (it didn't exist in the old file, added with REE) - let col_b = batch.column(1); - // For REE array with null, check the values array - if let Some(run_array) = col_b - .as_any() - .downcast_ref::>() - { - let values = run_array.values(); - assert!( - values.is_null(0), - "REE values should contain null for added column" - ); - } else { - // Fallback to direct null check for simple arrays - assert!(col_b.is_null(0)); - assert!(col_b.is_null(1)); - assert!(col_b.is_null(2)); - } + // Column 'b' should be all NULLs (it didn't exist in the old file) + let col_b = batch + .column(1) + .as_primitive::(); + assert_eq!(col_b.null_count(), 3); + assert!(col_b.is_null(0)); + assert!(col_b.is_null(1)); + assert!(col_b.is_null(2)); } /// Test for bug where position deletes in later row groups are not applied correctly. @@ -3466,23 +3456,11 @@ message schema { assert_eq!(age_array.value(0), 30); assert_eq!(age_array.value(1), 25); - // Verify missing column filled with NULLs (will be REE with null) - let city_col = batch.column(2); - if let Some(run_array) = city_col - .as_any() - .downcast_ref::>() - { - let values = run_array.values(); - assert!( - values.is_null(0), - "REE values should contain null for added column" - ); - } else { - let city_array = city_col.as_string::(); - assert_eq!(city_array.null_count(), 2); - assert!(city_array.is_null(0)); - assert!(city_array.is_null(1)); - } + // Verify missing column filled with NULLs + let city_array = batch.column(2).as_string::(); + assert_eq!(city_array.null_count(), 2); + assert!(city_array.is_null(0)); + assert!(city_array.is_null(1)); } /// Test reading Parquet files without field IDs that have multiple row groups. @@ -3799,23 +3777,13 @@ message schema { assert_eq!(result_col0.value(0), 1); assert_eq!(result_col0.value(1), 2); - // New column should be NULL (doesn't exist in old file, added with REE) - let newcol = batch.column(1); - if let Some(run_array) = newcol - .as_any() - .downcast_ref::>() - { - let values = run_array.values(); - assert!( - values.is_null(0), - "REE values should contain null for added column" - ); - } else { - let result_newcol = newcol.as_primitive::(); - assert_eq!(result_newcol.null_count(), 2); - assert!(result_newcol.is_null(0)); - assert!(result_newcol.is_null(1)); - } + // New column should be NULL (doesn't exist in old file) + let result_newcol = batch + .column(1) + .as_primitive::(); + assert_eq!(result_newcol.null_count(), 2); + assert!(result_newcol.is_null(0)); + assert!(result_newcol.is_null(1)); let result_col1 = batch .column(2) From 671fd4f5640643e9fc6a92d3131a554d24c07ea9 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Mon, 17 Nov 2025 12:51:14 +0100 Subject: [PATCH 33/39] . 
--- crates/iceberg/src/arrow/reader.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 21bcce029c..de6bd1ed59 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -1753,7 +1753,7 @@ mod tests { use std::sync::Arc; use arrow_array::cast::AsArray; - use arrow_array::{ArrayRef, LargeStringArray, RecordBatch, RunArray, StringArray}; + use arrow_array::{ArrayRef, LargeStringArray, RecordBatch, StringArray}; use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; use futures::TryStreamExt; use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; From 4757868471e433ab70cca1cea210f3bfbe777437 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 18 Nov 2025 13:26:07 +0100 Subject: [PATCH 34/39] Move import --- crates/iceberg/src/scan/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index b719524a61..78aa068d62 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -36,7 +36,9 @@ use crate::delete_file_index::DeleteFileIndex; use crate::expr::visitors::inclusive_metrics_evaluator::InclusiveMetricsEvaluator; use crate::expr::{Bind, BoundPredicate, Predicate}; use crate::io::FileIO; -use crate::metadata_columns::{get_metadata_field_id, is_metadata_column_name}; +use crate::metadata_columns::{ + RESERVED_COL_NAME_FILE, get_metadata_field_id, is_metadata_column_name, +}; use crate::runtime::spawn; use crate::spec::{DataContentType, SnapshotRef}; use crate::table::Table; @@ -145,8 +147,6 @@ impl<'a> TableScanBuilder<'a> { /// # } /// ``` pub fn with_file_column(mut self) -> Self { - use crate::metadata_columns::RESERVED_COL_NAME_FILE; - let mut columns = self.column_names.unwrap_or_else(|| { // No explicit selection - get all column names from schema self.table From 38e323352ccd9421fc749abf6619f204a2105302 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Mon, 1 Dec 2025 12:49:54 +0100 Subject: [PATCH 35/39] PR comments --- crates/iceberg/src/arrow/reader.rs | 7 +- .../src/arrow/record_batch_transformer.rs | 150 ++++++++++++------ crates/iceberg/src/metadata_columns.rs | 60 ++++--- crates/iceberg/src/scan/mod.rs | 43 +---- 4 files changed, 138 insertions(+), 122 deletions(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index de6bd1ed59..672647d818 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -56,7 +56,7 @@ use crate::expr::{BoundPredicate, BoundReference}; use crate::io::{FileIO, FileMetadata, FileRead}; use crate::metadata_columns::{RESERVED_FIELD_ID_FILE, is_metadata_field}; use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskStream}; -use crate::spec::{Datum, NameMapping, NestedField, PrimitiveLiteral, PrimitiveType, Schema, Type}; +use crate::spec::{Datum, NameMapping, NestedField, PrimitiveType, Schema, Type}; use crate::utils::available_parallelism; use crate::{Error, ErrorKind}; @@ -279,10 +279,7 @@ impl ArrowReader { // column re-ordering, partition constants, and virtual field addition (like _file) let mut record_batch_transformer_builder = RecordBatchTransformerBuilder::new(task.schema_ref(), task.project_field_ids()) - .with_constant( - RESERVED_FIELD_ID_FILE, - PrimitiveLiteral::String(task.data_file_path.clone()), - )?; + .with_reserved_field(RESERVED_FIELD_ID_FILE, task.data_file_path.clone())?; if let 
(Some(partition_spec), Some(partition_data)) = (task.partition_spec.clone(), task.partition.clone()) diff --git a/crates/iceberg/src/arrow/record_batch_transformer.rs b/crates/iceberg/src/arrow/record_batch_transformer.rs index 8a3018b66a..efef8b2f65 100644 --- a/crates/iceberg/src/arrow/record_batch_transformer.rs +++ b/crates/iceberg/src/arrow/record_batch_transformer.rs @@ -31,13 +31,14 @@ use arrow_schema::{ use parquet::arrow::PARQUET_FIELD_ID_META_KEY; use crate::arrow::schema_to_arrow_schema; -use crate::metadata_columns::get_metadata_field; +use crate::metadata_columns::{get_metadata_field, metadata_field_primitive_type}; use crate::spec::{ - Literal, PartitionSpec, PrimitiveLiteral, Schema as IcebergSchema, Struct, Transform, + Datum, Literal, PartitionSpec, PrimitiveLiteral, PrimitiveType, Schema as IcebergSchema, + Struct, Transform, }; use crate::{Error, ErrorKind, Result}; -/// Build a map of field ID to constant value for identity-partitioned fields. +/// Build a map of field ID to constant value (as Datum) for identity-partitioned fields. /// /// Implements Iceberg spec "Column Projection" rule #1: use partition metadata constants /// only for identity-transformed fields. Non-identity transforms (bucket, truncate, year, etc.) @@ -54,7 +55,8 @@ use crate::{Error, ErrorKind, Result}; fn constants_map( partition_spec: &PartitionSpec, partition_data: &Struct, -) -> HashMap { + schema: &IcebergSchema, +) -> Result> { let mut constants = HashMap::new(); for (pos, field) in partition_spec.fields().iter().enumerate() { @@ -62,12 +64,23 @@ fn constants_map( if matches!(field.transform, Transform::Identity) { // Get the partition value for this field if let Some(Literal::Primitive(value)) = &partition_data[pos] { - constants.insert(field.source_id, value.clone()); + // Get the field from schema to extract its type + let iceberg_field = schema.field_by_id(field.source_id).ok_or(Error::new( + ErrorKind::Unexpected, + format!("Field {} not found in schema", field.source_id), + ))?; + + // Extract the primitive type from the field + if let crate::spec::Type::Primitive(prim_type) = &*iceberg_field.field_type { + // Create a Datum from the primitive type and value + let datum = Datum::new(prim_type.clone(), value.clone()); + constants.insert(field.source_id, datum); + } } } } - constants + Ok(constants) } /// Indicates how a particular column in a processed RecordBatch should @@ -153,7 +166,7 @@ enum SchemaComparison { pub(crate) struct RecordBatchTransformerBuilder { snapshot_schema: Arc, projected_iceberg_field_ids: Vec, - constant_fields: HashMap, + constant_fields: HashMap, } impl RecordBatchTransformerBuilder { @@ -173,11 +186,34 @@ impl RecordBatchTransformerBuilder { /// /// # Arguments /// * `field_id` - The field ID to associate with the constant - /// * `value` - The constant value for this field - pub(crate) fn with_constant(mut self, field_id: i32, value: PrimitiveLiteral) -> Result { - let arrow_type = RecordBatchTransformer::primitive_literal_to_arrow_type(&value)?; - self.constant_fields.insert(field_id, (arrow_type, value)); - Ok(self) + /// * `datum` - The constant value (with type) for this field + pub(crate) fn with_constant(mut self, field_id: i32, datum: Datum) -> Self { + self.constant_fields.insert(field_id, datum); + self + } + + /// Add a reserved/metadata field with a constant string value. + /// This is a convenience method for reserved fields like _file that automatically + /// handles type extraction from the field definition. 
+ /// + /// # Arguments + /// * `field_id` - The reserved field ID (e.g., RESERVED_FIELD_ID_FILE) + /// * `value` - The constant string value for this field + /// + /// # Returns + /// Self for method chaining, or an error if the field is not a valid metadata field + pub(crate) fn with_reserved_field(self, field_id: i32, value: String) -> Result { + // Get the Iceberg field definition + let iceberg_field = get_metadata_field(field_id)?; + + // Extract the primitive type from the field + let prim_type = metadata_field_primitive_type(&iceberg_field)?; + + // Create a Datum with the extracted type and value + let datum = Datum::new(prim_type, PrimitiveLiteral::String(value)); + + // Add the constant field + Ok(self.with_constant(field_id, datum)) } /// Set partition spec and data together for identifying identity-transformed partition columns. @@ -190,13 +226,13 @@ impl RecordBatchTransformerBuilder { partition_spec: Arc, partition_data: Struct, ) -> Result { - // Compute partition constants for identity-transformed fields - let partition_constants = constants_map(&partition_spec, &partition_data); + // Compute partition constants for identity-transformed fields (already returns Datum) + let partition_constants = + constants_map(&partition_spec, &partition_data, &self.snapshot_schema)?; - // Add partition constants to constant_fields (compute REE types from literals) - for (field_id, value) in partition_constants { - let arrow_type = RecordBatchTransformer::primitive_literal_to_arrow_type(&value)?; - self.constant_fields.insert(field_id, (arrow_type, value)); + // Add partition constants to constant_fields + for (field_id, datum) in partition_constants { + self.constant_fields.insert(field_id, datum); } Ok(self) @@ -246,10 +282,10 @@ impl RecordBatchTransformerBuilder { pub(crate) struct RecordBatchTransformer { snapshot_schema: Arc, projected_iceberg_field_ids: Vec, - // Pre-computed constant field information: field_id -> (arrow_type, value) + // Pre-computed constant field information: field_id -> Datum // Includes both virtual/metadata fields (like _file) and identity-partitioned fields - // Avoids type conversions during batch processing - constant_fields: HashMap, + // Datum holds both the Iceberg type and the value + constant_fields: HashMap, // BatchTransform gets lazily constructed based on the schema of // the first RecordBatch we receive from the file @@ -310,7 +346,7 @@ impl RecordBatchTransformer { source_schema: &ArrowSchemaRef, snapshot_schema: &IcebergSchema, projected_iceberg_field_ids: &[i32], - constant_fields: &HashMap, + constant_fields: &HashMap, ) -> Result { let mapped_unprojected_arrow_schema = Arc::new(schema_to_arrow_schema(snapshot_schema)?); let field_id_to_mapped_schema_map = @@ -325,19 +361,28 @@ impl RecordBatchTransformer { if constant_fields.contains_key(field_id) { // For metadata/virtual fields (like _file), get name from metadata_columns // For partition fields, get name from schema (they exist in schema) - if let Ok(field) = get_metadata_field(*field_id) { - // This is a metadata/virtual field - use the predefined field - Ok(field) + if let Ok(iceberg_field) = get_metadata_field(*field_id) { + // This is a metadata/virtual field - convert Iceberg field to Arrow + let arrow_type = + Self::datum_to_arrow_type(constant_fields.get(field_id).unwrap()); + let arrow_field = + Field::new(&iceberg_field.name, arrow_type, !iceberg_field.required) + .with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + iceberg_field.id.to_string(), + )])); + 
Ok(Arc::new(arrow_field)) } else { // This is a partition constant field (exists in schema but uses constant value) let field = &field_id_to_mapped_schema_map .get(field_id) .ok_or(Error::new(ErrorKind::Unexpected, "field not found"))? .0; - let (arrow_type, _) = constant_fields.get(field_id).unwrap(); + let datum = constant_fields.get(field_id).unwrap(); + let arrow_type = Self::datum_to_arrow_type(datum); // Use the type from constant_fields (REE for constants) let constant_field = - Field::new(field.name(), arrow_type.clone(), field.is_nullable()) + Field::new(field.name(), arrow_type, field.is_nullable()) .with_metadata(field.metadata().clone()); Ok(Arc::new(constant_field)) } @@ -420,7 +465,7 @@ impl RecordBatchTransformer { snapshot_schema: &IcebergSchema, projected_iceberg_field_ids: &[i32], field_id_to_mapped_schema_map: HashMap, - constant_fields: &HashMap, + constant_fields: &HashMap, ) -> Result> { let field_id_to_source_schema_map = Self::build_field_id_to_arrow_schema_map(source_schema)?; @@ -432,10 +477,11 @@ impl RecordBatchTransformer { // Constant fields always use their pre-computed constant values, regardless of whether // they exist in the Parquet file. This is per Iceberg spec rule #1: partition metadata // is authoritative and should be preferred over file data. - if let Some((arrow_type, value)) = constant_fields.get(field_id) { + if let Some(datum) = constant_fields.get(field_id) { + let arrow_type = Self::datum_to_arrow_type(datum); return Ok(ColumnSource::Add { - value: Some(value.clone()), - target_type: arrow_type.clone(), + value: Some(datum.literal().clone()), + target_type: arrow_type, }); } @@ -791,10 +837,10 @@ impl RecordBatchTransformer { } } - /// Converts a PrimitiveLiteral to its corresponding Arrow DataType. - /// This is used for constant fields to determine the Arrow type. + /// Converts a Datum (Iceberg type + primitive literal) to its corresponding Arrow DataType. + /// Uses the PrimitiveType from the Datum to determine the correct Arrow type. /// For constant values, we use Run-End Encoding for all types to save memory. - fn primitive_literal_to_arrow_type(literal: &PrimitiveLiteral) -> Result { + fn datum_to_arrow_type(datum: &Datum) -> DataType { // Helper to create REE type with the given values type // Note: values field is nullable as Arrow expects this when building the // final Arrow schema with `RunArray::try_new`. 
@@ -804,23 +850,27 @@ impl RecordBatchTransformer { DataType::RunEndEncoded(run_ends_field, values_field) }; - Ok(match literal { - PrimitiveLiteral::Boolean(_) => make_ree(DataType::Boolean), - PrimitiveLiteral::Int(_) => make_ree(DataType::Int32), - PrimitiveLiteral::Long(_) => make_ree(DataType::Int64), - PrimitiveLiteral::Float(_) => make_ree(DataType::Float32), - PrimitiveLiteral::Double(_) => make_ree(DataType::Float64), - PrimitiveLiteral::String(_) => make_ree(DataType::Utf8), - PrimitiveLiteral::Binary(_) => make_ree(DataType::Binary), - PrimitiveLiteral::Int128(_) => make_ree(DataType::Decimal128(38, 0)), - PrimitiveLiteral::UInt128(_) => make_ree(DataType::Decimal128(38, 0)), - PrimitiveLiteral::AboveMax | PrimitiveLiteral::BelowMin => { - return Err(Error::new( - ErrorKind::Unexpected, - "Cannot create arrow type for AboveMax/BelowMin literal", - )); + // Match on the PrimitiveType from the Datum to determine the Arrow type + match datum.data_type() { + PrimitiveType::Boolean => make_ree(DataType::Boolean), + PrimitiveType::Int => make_ree(DataType::Int32), + PrimitiveType::Long => make_ree(DataType::Int64), + PrimitiveType::Float => make_ree(DataType::Float32), + PrimitiveType::Double => make_ree(DataType::Float64), + PrimitiveType::Date => make_ree(DataType::Date32), + PrimitiveType::Time => make_ree(DataType::Int64), + PrimitiveType::Timestamp => make_ree(DataType::Int64), + PrimitiveType::Timestamptz => make_ree(DataType::Int64), + PrimitiveType::TimestampNs => make_ree(DataType::Int64), + PrimitiveType::TimestamptzNs => make_ree(DataType::Int64), + PrimitiveType::String => make_ree(DataType::Utf8), + PrimitiveType::Uuid => make_ree(DataType::Binary), + PrimitiveType::Fixed(_) => make_ree(DataType::Binary), + PrimitiveType::Binary => make_ree(DataType::Binary), + PrimitiveType::Decimal { precision, scale } => { + make_ree(DataType::Decimal128(*precision as u8, *scale as i8)) } - }) + } } } diff --git a/crates/iceberg/src/metadata_columns.rs b/crates/iceberg/src/metadata_columns.rs index 30e4b4f2b9..7759753046 100644 --- a/crates/iceberg/src/metadata_columns.rs +++ b/crates/iceberg/src/metadata_columns.rs @@ -22,13 +22,11 @@ //! during reading. Examples include the _file column (file path) and future //! columns like partition values or row numbers. -use std::collections::HashMap; use std::sync::Arc; -use arrow_schema::{DataType, Field}; use once_cell::sync::Lazy; -use parquet::arrow::PARQUET_FIELD_ID_META_KEY; +use crate::spec::{NestedField, NestedFieldRef, PrimitiveType, Type}; use crate::{Error, ErrorKind, Result}; /// Reserved field ID for the file path (_file) column per Iceberg spec @@ -37,40 +35,52 @@ pub const RESERVED_FIELD_ID_FILE: i32 = i32::MAX - 1; /// Reserved column name for the file path metadata column pub const RESERVED_COL_NAME_FILE: &str = "_file"; -/// Lazy-initialized Arrow Field definition for the _file metadata column. -/// Uses Run-End Encoding for memory efficiency. -static FILE_FIELD: Lazy> = Lazy::new(|| { - let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); - let values_field = Arc::new(Field::new("values", DataType::Utf8, true)); - Arc::new( - Field::new( - RESERVED_COL_NAME_FILE, - DataType::RunEndEncoded(run_ends_field, values_field), - false, - ) - .with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - RESERVED_FIELD_ID_FILE.to_string(), - )])), - ) +/// Lazy-initialized Iceberg field definition for the _file metadata column. +/// This field represents the file path as a required string field. 
+static FILE_FIELD: Lazy = Lazy::new(|| { + Arc::new(NestedField::required( + RESERVED_FIELD_ID_FILE, + RESERVED_COL_NAME_FILE, + Type::Primitive(PrimitiveType::String), + )) }); -/// Returns the Arrow Field definition for the _file metadata column. +/// Returns the Iceberg field definition for the _file metadata column. /// /// # Returns -/// A reference to the _file field definition (RunEndEncoded type) -pub fn file_field() -> &'static Arc { +/// A reference to the _file field definition as an Iceberg NestedField +pub fn file_field() -> &'static NestedFieldRef { &FILE_FIELD } -/// Returns the Arrow Field definition for a metadata field ID. +/// Extracts the primitive type from a metadata field. +/// +/// # Arguments +/// * `field` - The metadata field +/// +/// # Returns +/// The PrimitiveType of the field, or an error if the field is not a primitive type +pub fn metadata_field_primitive_type(field: &NestedFieldRef) -> Result { + field + .field_type + .as_primitive_type() + .cloned() + .ok_or_else(|| { + Error::new( + ErrorKind::Unexpected, + format!("Metadata field '{}' must be a primitive type", field.name), + ) + }) +} + +/// Returns the Iceberg field definition for a metadata field ID. /// /// # Arguments /// * `field_id` - The metadata field ID /// /// # Returns -/// The Arrow Field definition for the metadata column, or an error if not a metadata field -pub fn get_metadata_field(field_id: i32) -> Result> { +/// The Iceberg field definition for the metadata column, or an error if not a metadata field +pub fn get_metadata_field(field_id: i32) -> Result { match field_id { RESERVED_FIELD_ID_FILE => Ok(Arc::clone(file_field())), _ if is_metadata_field(field_id) => { diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 78aa068d62..24c03b0b2c 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -36,9 +36,7 @@ use crate::delete_file_index::DeleteFileIndex; use crate::expr::visitors::inclusive_metrics_evaluator::InclusiveMetricsEvaluator; use crate::expr::{Bind, BoundPredicate, Predicate}; use crate::io::FileIO; -use crate::metadata_columns::{ - RESERVED_COL_NAME_FILE, get_metadata_field_id, is_metadata_column_name, -}; +use crate::metadata_columns::{get_metadata_field_id, is_metadata_column_name}; use crate::runtime::spawn; use crate::spec::{DataContentType, SnapshotRef}; use crate::table::Table; @@ -127,45 +125,6 @@ impl<'a> TableScanBuilder<'a> { self } - /// Include the _file metadata column in the scan. - /// - /// This is a convenience method that adds the _file column to the current selection. - /// If no columns are currently selected (select_all), this will select all columns plus _file. - /// If specific columns are selected, this adds _file to that selection. - /// - /// # Example - /// ```no_run - /// # use iceberg::table::Table; - /// # async fn example(table: Table) -> iceberg::Result<()> { - /// // Select id, name, and _file - /// let scan = table - /// .scan() - /// .select(["id", "name"]) - /// .with_file_column() - /// .build()?; - /// # Ok(()) - /// # } - /// ``` - pub fn with_file_column(mut self) -> Self { - let mut columns = self.column_names.unwrap_or_else(|| { - // No explicit selection - get all column names from schema - self.table - .metadata() - .current_schema() - .as_struct() - .fields() - .iter() - .map(|f| f.name.clone()) - .collect() - }); - - // Add _file column - columns.push(RESERVED_COL_NAME_FILE.to_string()); - - self.column_names = Some(columns); - self - } - /// Set the snapshot to scan. 
When not set, it uses current snapshot. pub fn snapshot_id(mut self, snapshot_id: i64) -> Self { self.snapshot_id = Some(snapshot_id); From 16decc0a7dfad47ec45ca0641583e6fb3e008fde Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Mon, 1 Dec 2025 12:50:35 +0100 Subject: [PATCH 36/39] . --- crates/iceberg/src/arrow/record_batch_transformer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iceberg/src/arrow/record_batch_transformer.rs b/crates/iceberg/src/arrow/record_batch_transformer.rs index efef8b2f65..5e24692254 100644 --- a/crates/iceberg/src/arrow/record_batch_transformer.rs +++ b/crates/iceberg/src/arrow/record_batch_transformer.rs @@ -357,7 +357,7 @@ impl RecordBatchTransformer { let fields: Result> = projected_iceberg_field_ids .iter() .map(|field_id| { - // Check if this is a constant field (virtual or partition) + // Check if this is a constant field if constant_fields.contains_key(field_id) { // For metadata/virtual fields (like _file), get name from metadata_columns // For partition fields, get name from schema (they exist in schema) From bcae8d1b37c9e2de100d98b1a71bb28f05221fbc Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 2 Dec 2025 12:34:37 +0100 Subject: [PATCH 37/39] Add doc field --- crates/iceberg/src/metadata_columns.rs | 36 ++++++++------------------ 1 file changed, 11 insertions(+), 25 deletions(-) diff --git a/crates/iceberg/src/metadata_columns.rs b/crates/iceberg/src/metadata_columns.rs index 7759753046..b11b5cadb2 100644 --- a/crates/iceberg/src/metadata_columns.rs +++ b/crates/iceberg/src/metadata_columns.rs @@ -35,14 +35,20 @@ pub const RESERVED_FIELD_ID_FILE: i32 = i32::MAX - 1; /// Reserved column name for the file path metadata column pub const RESERVED_COL_NAME_FILE: &str = "_file"; +/// Documentation for the _file metadata column +pub const RESERVED_COL_DOC_FILE: &str = "Path of the file in which a row is stored"; + /// Lazy-initialized Iceberg field definition for the _file metadata column. /// This field represents the file path as a required string field. static FILE_FIELD: Lazy = Lazy::new(|| { - Arc::new(NestedField::required( - RESERVED_FIELD_ID_FILE, - RESERVED_COL_NAME_FILE, - Type::Primitive(PrimitiveType::String), - )) + Arc::new( + NestedField::required( + RESERVED_FIELD_ID_FILE, + RESERVED_COL_NAME_FILE, + Type::Primitive(PrimitiveType::String), + ) + .with_doc(RESERVED_COL_DOC_FILE), + ) }); /// Returns the Iceberg field definition for the _file metadata column. @@ -53,26 +59,6 @@ pub fn file_field() -> &'static NestedFieldRef { &FILE_FIELD } -/// Extracts the primitive type from a metadata field. -/// -/// # Arguments -/// * `field` - The metadata field -/// -/// # Returns -/// The PrimitiveType of the field, or an error if the field is not a primitive type -pub fn metadata_field_primitive_type(field: &NestedFieldRef) -> Result { - field - .field_type - .as_primitive_type() - .cloned() - .ok_or_else(|| { - Error::new( - ErrorKind::Unexpected, - format!("Metadata field '{}' must be a primitive type", field.name), - ) - }) -} - /// Returns the Iceberg field definition for a metadata field ID. 
/// /// # Arguments From 1257268f7c634406c537f31e8ac97d1b9d1e3a82 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 2 Dec 2025 12:34:49 +0100 Subject: [PATCH 38/39] PR comments --- crates/iceberg/src/arrow/reader.rs | 10 +- .../src/arrow/record_batch_transformer.rs | 118 +++++++----------- crates/iceberg/src/arrow/schema.rs | 51 ++++++++ 3 files changed, 101 insertions(+), 78 deletions(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 672647d818..de8a1420e4 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -278,8 +278,14 @@ impl ArrowReader { // that come back from the file, such as type promotion, default column insertion, // column re-ordering, partition constants, and virtual field addition (like _file) let mut record_batch_transformer_builder = - RecordBatchTransformerBuilder::new(task.schema_ref(), task.project_field_ids()) - .with_reserved_field(RESERVED_FIELD_ID_FILE, task.data_file_path.clone())?; + RecordBatchTransformerBuilder::new(task.schema_ref(), task.project_field_ids()); + + // Add the _file metadata column if it's in the projected fields + if task.project_field_ids().contains(&RESERVED_FIELD_ID_FILE) { + let file_datum = Datum::string(task.data_file_path.clone()); + record_batch_transformer_builder = + record_batch_transformer_builder.with_constant(RESERVED_FIELD_ID_FILE, file_datum); + } if let (Some(partition_spec), Some(partition_data)) = (task.partition_spec.clone(), task.partition.clone()) diff --git a/crates/iceberg/src/arrow/record_batch_transformer.rs b/crates/iceberg/src/arrow/record_batch_transformer.rs index 5e24692254..b767372ea5 100644 --- a/crates/iceberg/src/arrow/record_batch_transformer.rs +++ b/crates/iceberg/src/arrow/record_batch_transformer.rs @@ -30,11 +30,10 @@ use arrow_schema::{ }; use parquet::arrow::PARQUET_FIELD_ID_META_KEY; -use crate::arrow::schema_to_arrow_schema; -use crate::metadata_columns::{get_metadata_field, metadata_field_primitive_type}; +use crate::arrow::{datum_to_arrow_type_with_ree, schema_to_arrow_schema}; +use crate::metadata_columns::get_metadata_field; use crate::spec::{ - Datum, Literal, PartitionSpec, PrimitiveLiteral, PrimitiveType, Schema as IcebergSchema, - Struct, Transform, + Datum, Literal, PartitionSpec, PrimitiveLiteral, Schema as IcebergSchema, Struct, Transform, }; use crate::{Error, ErrorKind, Result}; @@ -62,20 +61,47 @@ fn constants_map( for (pos, field) in partition_spec.fields().iter().enumerate() { // Only identity transforms should use constant values from partition metadata if matches!(field.transform, Transform::Identity) { - // Get the partition value for this field - if let Some(Literal::Primitive(value)) = &partition_data[pos] { - // Get the field from schema to extract its type - let iceberg_field = schema.field_by_id(field.source_id).ok_or(Error::new( - ErrorKind::Unexpected, - format!("Field {} not found in schema", field.source_id), - ))?; + // Get the field from schema to extract its type + let iceberg_field = schema.field_by_id(field.source_id).ok_or(Error::new( + ErrorKind::Unexpected, + format!("Field {} not found in schema", field.source_id), + ))?; + + // Ensure the field type is primitive + let prim_type = match &*iceberg_field.field_type { + crate::spec::Type::Primitive(prim_type) => prim_type, + _ => { + return Err(Error::new( + ErrorKind::Unexpected, + format!( + "Partition field {} has non-primitive type {:?}", + field.source_id, iceberg_field.field_type + ), + )); + } + }; - // Extract the 
primitive type from the field - if let crate::spec::Type::Primitive(prim_type) = &*iceberg_field.field_type { + // Get the partition value for this field + // Handle both None (null) and Some(Literal::Primitive) cases + match &partition_data[pos] { + None => { + // Null partition values are skipped - they cannot be represented as a constant Datum + // The field will be read from the data file instead (or produce null from missing data) + } + Some(Literal::Primitive(value)) => { // Create a Datum from the primitive type and value let datum = Datum::new(prim_type.clone(), value.clone()); constants.insert(field.source_id, datum); } + Some(literal) => { + return Err(Error::new( + ErrorKind::Unexpected, + format!( + "Partition field {} has non-primitive value: {:?}", + field.source_id, literal + ), + )); + } } } } @@ -192,30 +218,6 @@ impl RecordBatchTransformerBuilder { self } - /// Add a reserved/metadata field with a constant string value. - /// This is a convenience method for reserved fields like _file that automatically - /// handles type extraction from the field definition. - /// - /// # Arguments - /// * `field_id` - The reserved field ID (e.g., RESERVED_FIELD_ID_FILE) - /// * `value` - The constant string value for this field - /// - /// # Returns - /// Self for method chaining, or an error if the field is not a valid metadata field - pub(crate) fn with_reserved_field(self, field_id: i32, value: String) -> Result { - // Get the Iceberg field definition - let iceberg_field = get_metadata_field(field_id)?; - - // Extract the primitive type from the field - let prim_type = metadata_field_primitive_type(&iceberg_field)?; - - // Create a Datum with the extracted type and value - let datum = Datum::new(prim_type, PrimitiveLiteral::String(value)); - - // Add the constant field - Ok(self.with_constant(field_id, datum)) - } - /// Set partition spec and data together for identifying identity-transformed partition columns. /// /// Both partition_spec and partition_data must be provided together since the spec defines @@ -364,7 +366,7 @@ impl RecordBatchTransformer { if let Ok(iceberg_field) = get_metadata_field(*field_id) { // This is a metadata/virtual field - convert Iceberg field to Arrow let arrow_type = - Self::datum_to_arrow_type(constant_fields.get(field_id).unwrap()); + datum_to_arrow_type_with_ree(constant_fields.get(field_id).unwrap()); let arrow_field = Field::new(&iceberg_field.name, arrow_type, !iceberg_field.required) .with_metadata(HashMap::from([( @@ -379,7 +381,7 @@ impl RecordBatchTransformer { .ok_or(Error::new(ErrorKind::Unexpected, "field not found"))? .0; let datum = constant_fields.get(field_id).unwrap(); - let arrow_type = Self::datum_to_arrow_type(datum); + let arrow_type = datum_to_arrow_type_with_ree(datum); // Use the type from constant_fields (REE for constants) let constant_field = Field::new(field.name(), arrow_type, field.is_nullable()) @@ -478,7 +480,7 @@ impl RecordBatchTransformer { // they exist in the Parquet file. This is per Iceberg spec rule #1: partition metadata // is authoritative and should be preferred over file data. if let Some(datum) = constant_fields.get(field_id) { - let arrow_type = Self::datum_to_arrow_type(datum); + let arrow_type = datum_to_arrow_type_with_ree(datum); return Ok(ColumnSource::Add { value: Some(datum.literal().clone()), target_type: arrow_type, @@ -836,42 +838,6 @@ impl RecordBatchTransformer { }) } } - - /// Converts a Datum (Iceberg type + primitive literal) to its corresponding Arrow DataType. 
- /// Uses the PrimitiveType from the Datum to determine the correct Arrow type. - /// For constant values, we use Run-End Encoding for all types to save memory. - fn datum_to_arrow_type(datum: &Datum) -> DataType { - // Helper to create REE type with the given values type - // Note: values field is nullable as Arrow expects this when building the - // final Arrow schema with `RunArray::try_new`. - let make_ree = |values_type: DataType| -> DataType { - let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); - let values_field = Arc::new(Field::new("values", values_type, true)); - DataType::RunEndEncoded(run_ends_field, values_field) - }; - - // Match on the PrimitiveType from the Datum to determine the Arrow type - match datum.data_type() { - PrimitiveType::Boolean => make_ree(DataType::Boolean), - PrimitiveType::Int => make_ree(DataType::Int32), - PrimitiveType::Long => make_ree(DataType::Int64), - PrimitiveType::Float => make_ree(DataType::Float32), - PrimitiveType::Double => make_ree(DataType::Float64), - PrimitiveType::Date => make_ree(DataType::Date32), - PrimitiveType::Time => make_ree(DataType::Int64), - PrimitiveType::Timestamp => make_ree(DataType::Int64), - PrimitiveType::Timestamptz => make_ree(DataType::Int64), - PrimitiveType::TimestampNs => make_ree(DataType::Int64), - PrimitiveType::TimestamptzNs => make_ree(DataType::Int64), - PrimitiveType::String => make_ree(DataType::Utf8), - PrimitiveType::Uuid => make_ree(DataType::Binary), - PrimitiveType::Fixed(_) => make_ree(DataType::Binary), - PrimitiveType::Binary => make_ree(DataType::Binary), - PrimitiveType::Decimal { precision, scale } => { - make_ree(DataType::Decimal128(*precision as u8, *scale as i8)) - } - } - } } #[cfg(test)] diff --git a/crates/iceberg/src/arrow/schema.rs b/crates/iceberg/src/arrow/schema.rs index ec0135bd77..65b58acf66 100644 --- a/crates/iceberg/src/arrow/schema.rs +++ b/crates/iceberg/src/arrow/schema.rs @@ -1019,6 +1019,57 @@ impl TryFrom<&crate::spec::Schema> for ArrowSchema { } } +/// Converts a Datum (Iceberg type + primitive literal) to its corresponding Arrow DataType +/// with Run-End Encoding (REE). +/// +/// This function is used for constant fields in record batches, where all values are the same. +/// Run-End Encoding provides efficient storage for such constant columns. +/// +/// # Arguments +/// * `datum` - The Datum to convert, which contains both type and value information +/// +/// # Returns +/// Arrow DataType with Run-End Encoding applied +/// +/// # Example +/// ```ignore +/// let datum = Datum::string("test_file.parquet"); +/// let ree_type = datum_to_arrow_type_with_ree(&datum); +/// // Returns: RunEndEncoded(Int32, Utf8) +/// ``` +pub fn datum_to_arrow_type_with_ree(datum: &Datum) -> DataType { + // Helper to create REE type with the given values type. + // Note: values field is nullable as Arrow expects this when building the + // final Arrow schema with `RunArray::try_new`. 
+ let make_ree = |values_type: DataType| -> DataType { + let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false)); + let values_field = Arc::new(Field::new("values", values_type, true)); + DataType::RunEndEncoded(run_ends_field, values_field) + }; + + // Match on the PrimitiveType from the Datum to determine the Arrow type + match datum.data_type() { + PrimitiveType::Boolean => make_ree(DataType::Boolean), + PrimitiveType::Int => make_ree(DataType::Int32), + PrimitiveType::Long => make_ree(DataType::Int64), + PrimitiveType::Float => make_ree(DataType::Float32), + PrimitiveType::Double => make_ree(DataType::Float64), + PrimitiveType::Date => make_ree(DataType::Date32), + PrimitiveType::Time => make_ree(DataType::Int64), + PrimitiveType::Timestamp => make_ree(DataType::Int64), + PrimitiveType::Timestamptz => make_ree(DataType::Int64), + PrimitiveType::TimestampNs => make_ree(DataType::Int64), + PrimitiveType::TimestamptzNs => make_ree(DataType::Int64), + PrimitiveType::String => make_ree(DataType::Utf8), + PrimitiveType::Uuid => make_ree(DataType::Binary), + PrimitiveType::Fixed(_) => make_ree(DataType::Binary), + PrimitiveType::Binary => make_ree(DataType::Binary), + PrimitiveType::Decimal { precision, scale } => { + make_ree(DataType::Decimal128(*precision as u8, *scale as i8)) + } + } +} + #[cfg(test)] mod tests { use std::collections::HashMap; From 300d9386f6eb6075c59384e560fb47812211d4bc Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Wed, 3 Dec 2025 10:18:29 +0100 Subject: [PATCH 39/39] Make function pub(crate) --- crates/iceberg/src/arrow/schema.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iceberg/src/arrow/schema.rs b/crates/iceberg/src/arrow/schema.rs index 65b58acf66..309b17d6e3 100644 --- a/crates/iceberg/src/arrow/schema.rs +++ b/crates/iceberg/src/arrow/schema.rs @@ -1037,7 +1037,7 @@ impl TryFrom<&crate::spec::Schema> for ArrowSchema { /// let ree_type = datum_to_arrow_type_with_ree(&datum); /// // Returns: RunEndEncoded(Int32, Utf8) /// ``` -pub fn datum_to_arrow_type_with_ree(datum: &Datum) -> DataType { +pub(crate) fn datum_to_arrow_type_with_ree(datum: &Datum) -> DataType { // Helper to create REE type with the given values type. // Note: values field is nullable as Arrow expects this when building the // final Arrow schema with `RunArray::try_new`.
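
For reference, after patch 38/39 every constant column — the `_file` path registered via `with_constant(RESERVED_FIELD_ID_FILE, ...)` and identity-transformed partition values — flows through the shared `datum_to_arrow_type_with_ree` helper, so such columns are always materialized as run-end encoded arrays. The sketch below is a minimal, illustrative test of that mapping for a string constant; it is not part of the patch series and assumes it lives in a test module inside the `iceberg` crate (for example alongside the existing tests in crates/iceberg/src/arrow/schema.rs), where the pub(crate) helper is visible.

#[cfg(test)]
mod ree_type_sketch {
    use arrow_schema::DataType;

    use crate::arrow::datum_to_arrow_type_with_ree;
    use crate::spec::Datum;

    #[test]
    fn string_constant_maps_to_run_end_encoded_utf8() {
        // A `_file`-style constant: the same string repeated for every row of a batch.
        let datum = Datum::string("/path/to/data/file.parquet");
        let ree = datum_to_arrow_type_with_ree(&datum);

        match ree {
            DataType::RunEndEncoded(run_ends, values) => {
                // Run ends are Int32 and non-nullable; values are nullable,
                // matching what RunArray::try_new expects.
                assert_eq!(run_ends.data_type(), &DataType::Int32);
                assert!(!run_ends.is_nullable());
                assert_eq!(values.data_type(), &DataType::Utf8);
                assert!(values.is_nullable());
            }
            other => panic!("expected RunEndEncoded, got {:?}", other),
        }
    }
}

Because a run-end encoded constant column collapses to a single run end and a single value regardless of batch length, the added `_file` column and partition constants cost effectively constant memory per batch.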