Skip to content

Commit 2a065a3

Browse files
murali-dbclaude
andcommitted
refactor: Optimize schema diff and add nullability tests
Performance optimizations: - Add ColumnName::parent() method for O(1) parent path access - Remove ColumnName::is_descendant_of() (no longer needed) - Optimize has_added_ancestor() from O(N*M) to O(M) using parent chain - Add early return in classify_data_type_change() for identical types Code quality improvements: - Add check_field_nullability_change() helper for better code reuse - Refactor physical name validation into validate_physical_name() - Simplify nested match statements with helper functions Test coverage: - Add Case 6: Field nullability tightening in deeply nested structures - Add Case 7: Container nullability tightening in deeply nested structures All 32 tests passing. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 4802cf6 commit 2a065a3

File tree

2 files changed

+228
-64
lines changed

2 files changed

+228
-64
lines changed

kernel/src/expressions/column_names.rs

Lines changed: 9 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -98,37 +98,24 @@ impl ColumnName {
9898
self.path
9999
}
100100

101-
/// Checks if this column name is a descendant of another column name.
102-
///
103-
/// Returns `true` if `ancestor` is a prefix of `self`, meaning `self` is nested
104-
/// within `ancestor`. For example, `user.address.street` is a descendant of `user`
105-
/// and `user.address`, but not of `user.address.street` (same path) or `profile`.
101+
/// Returns the parent of this column name, or `None` if this is a top-level column.
106102
///
107103
/// # Examples
108104
///
109105
/// ```
110106
/// # use delta_kernel::expressions::ColumnName;
111107
/// let path = ColumnName::new(["user", "address", "street"]);
112-
/// let ancestor = ColumnName::new(["user"]);
113-
/// assert!(path.is_descendant_of(&ancestor));
114-
///
115-
/// let not_ancestor = ColumnName::new(["profile"]);
116-
/// assert!(!path.is_descendant_of(&not_ancestor));
108+
/// assert_eq!(path.parent(), Some(ColumnName::new(["user", "address"])));
117109
///
118-
/// // Same path is not a descendant
119-
/// assert!(!path.is_descendant_of(&path));
110+
/// let path = ColumnName::new(["user"]);
111+
/// assert_eq!(path.parent(), None);
120112
/// ```
121-
pub fn is_descendant_of(&self, ancestor: &ColumnName) -> bool {
122-
let path_parts = self.path();
123-
let ancestor_parts = ancestor.path();
124-
125-
// Path must be longer than ancestor to be a descendant
126-
if path_parts.len() <= ancestor_parts.len() {
127-
return false;
113+
pub fn parent(&self) -> Option<ColumnName> {
114+
if self.path.len() > 1 {
115+
Some(ColumnName::new(&self.path[..self.path.len() - 1]))
116+
} else {
117+
None
128118
}
129-
130-
// Check if ancestor is a prefix of path
131-
path_parts.starts_with(ancestor_parts)
132119
}
133120
}
134121

kernel/src/schema/diff.rs

Lines changed: 219 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -382,10 +382,19 @@ fn filter_ancestor_fields(fields: Vec<FieldChange>) -> Vec<FieldChange> {
382382
///
383383
/// Returns true if any path in `added_ancestor_paths` is a prefix of `path`.
384384
/// For example, "user" is an ancestor of "user.name" and "user.address.street".
385+
///
386+
/// This implementation walks up the parent chain of `path`, checking at each level
387+
/// if that parent exists in the set. This is O(depth) instead of O(N * depth) where
388+
/// N is the number of ancestor paths.
385389
fn has_added_ancestor(path: &ColumnName, added_ancestor_paths: &HashSet<ColumnName>) -> bool {
386-
added_ancestor_paths
387-
.iter()
388-
.any(|ancestor| path.is_descendant_of(ancestor))
390+
let mut curr = path.parent();
391+
while let Some(parent) = curr {
392+
if added_ancestor_paths.contains(&parent) {
393+
return true;
394+
}
395+
curr = parent.parent();
396+
}
397+
false
389398
}
390399

391400
/// Gets the physical name of a field if present
@@ -396,6 +405,50 @@ fn physical_name(field: &StructField) -> Option<&str> {
396405
}
397406
}
398407

408+
/// Validates that physical names are consistent between two versions of the same field.
409+
///
410+
/// Since schema diffing requires column mapping (field IDs), physical names must be present
411+
/// and must not change for the same field ID across schema versions.
412+
///
413+
/// # Errors
414+
/// - `PhysicalNameChanged`: Physical name differs between before and after
415+
/// - `MissingPhysicalName`: Physical name is missing in either version
416+
fn validate_physical_name(
417+
before: &FieldWithPath,
418+
after: &FieldWithPath,
419+
) -> Result<(), SchemaDiffError> {
420+
let before_physical = physical_name(&before.field);
421+
let after_physical = physical_name(&after.field);
422+
423+
match (before_physical, after_physical) {
424+
(Some(b), Some(a)) if b == a => {
425+
// Valid: physical name is present and unchanged
426+
Ok(())
427+
}
428+
(Some(b), Some(a)) => {
429+
// Invalid: physical name changed for the same field ID
430+
Err(SchemaDiffError::PhysicalNameChanged {
431+
field_id: before.field_id,
432+
path: after.path.clone(),
433+
before: b.to_string(),
434+
after: a.to_string(),
435+
})
436+
}
437+
(Some(_), None) | (None, Some(_)) => {
438+
// Invalid: physical name was added or removed
439+
Err(SchemaDiffError::MissingPhysicalName {
440+
path: after.path.clone(),
441+
})
442+
}
443+
(None, None) => {
444+
// Invalid: physical name must be present when column mapping is enabled
445+
Err(SchemaDiffError::MissingPhysicalName {
446+
path: after.path.clone(),
447+
})
448+
}
449+
}
450+
}
451+
399452
/// Recursively collects all struct fields from a data type with their paths
400453
///
401454
/// This helper function handles deep nesting like `array<array<struct<...>>>` or
@@ -522,49 +575,21 @@ fn compute_field_update(
522575
}
523576

524577
// Check for nullability change - distinguish between tightening and loosening
525-
if let Some(change) = check_container_nullability_change(before.field.nullable, after.field.nullable) {
578+
if let Some(change) = check_field_nullability_change(before.field.nullable, after.field.nullable) {
526579
changes.push(change);
527580
}
528581

529582
// Validate physical name consistency
530-
// Since we require column mapping (field IDs) for schema diffing, physical names must be present
531-
let bp = physical_name(&before.field);
532-
let ap = physical_name(&after.field);
533-
match (bp, ap) {
534-
(Some(b), Some(a)) if b == a => {
535-
// Valid: physical name is present and unchanged
536-
}
537-
(Some(b), Some(a)) => {
538-
// Invalid: physical name changed for the same field ID
539-
return Err(SchemaDiffError::PhysicalNameChanged {
540-
field_id: before.field_id,
541-
path: after.path.clone(),
542-
before: b.to_string(),
543-
after: a.to_string(),
544-
});
545-
}
546-
(Some(_), None) | (None, Some(_)) => {
547-
// Invalid: physical name was added or removed
548-
return Err(SchemaDiffError::MissingPhysicalName {
549-
path: after.path.clone(),
550-
});
551-
}
552-
(None, None) => {
553-
// Invalid: physical name must be present when column mapping is enabled
554-
return Err(SchemaDiffError::MissingPhysicalName {
555-
path: after.path.clone(),
556-
});
557-
}
558-
}
583+
validate_physical_name(before, after)?;
559584

560585
// Check for type change (including container changes)
561586
if let Some(change_type) =
562587
classify_data_type_change(before.field.data_type(), after.field.data_type())
563588
{
564589
changes.push(change_type);
565590
}
566-
// If None is returned, the container structure is the same and nested changes
567-
// are already captured via field IDs, so we don't report a change here
591+
// If None is returned, the types are identical or the container structure is the same
592+
// and nested changes are already captured via field IDs, so we don't report a change here
568593

569594
// Check for metadata changes (excluding column mapping metadata)
570595
if has_metadata_changes(&before.field, &after.field) {
@@ -585,6 +610,23 @@ fn compute_field_update(
585610
}))
586611
}
587612

613+
/// Checks for field nullability changes.
614+
///
615+
/// Returns:
616+
/// - `Some(FieldChangeType::NullabilityLoosened)` if nullability was relaxed (false -> true)
617+
/// - `Some(FieldChangeType::NullabilityTightened)` if nullability was restricted (true -> false)
618+
/// - `None` if nullability didn't change
619+
fn check_field_nullability_change(
620+
before_nullable: bool,
621+
after_nullable: bool,
622+
) -> Option<FieldChangeType> {
623+
match (before_nullable, after_nullable) {
624+
(false, true) => Some(FieldChangeType::NullabilityLoosened),
625+
(true, false) => Some(FieldChangeType::NullabilityTightened),
626+
(true, true) | (false, false) => None,
627+
}
628+
}
629+
588630
/// Checks for container nullability changes.
589631
///
590632
/// Returns:
@@ -617,6 +659,11 @@ fn check_container_nullability_change(
617659
/// 3. **Map containers**: Similar logic to arrays, but for both key and value types
618660
/// 4. **Different container types or primitives**: Type change
619661
fn classify_data_type_change(before: &DataType, after: &DataType) -> Option<FieldChangeType> {
662+
// Early return if types are identical - no change to report
663+
if before == after {
664+
return None;
665+
}
666+
620667
match (before, after) {
621668
// Struct-to-struct: nested field changes are handled separately via field IDs
622669
(DataType::Struct(_), DataType::Struct(_)) => None,
@@ -628,7 +675,7 @@ fn classify_data_type_change(before: &DataType, after: &DataType) -> Option<Fiel
628675
match (before_array.element_type(), after_array.element_type()) {
629676
// Both have struct elements - nested changes handled via field IDs
630677
(DataType::Struct(_), DataType::Struct(_)) => None,
631-
// For non-struct elements, recurse to check for changes (but only if types differ)
678+
// For non-struct elements, recurse to check for changes
632679
(e1, e2) => classify_data_type_change(e1, e2),
633680
};
634681

@@ -654,18 +701,16 @@ fn classify_data_type_change(before: &DataType, after: &DataType) -> Option<Fiel
654701
let key_type_change = match (before_map.key_type(), after_map.key_type()) {
655702
// Both have struct keys - nested changes handled via field IDs
656703
(DataType::Struct(_), DataType::Struct(_)) => None,
657-
// For non-struct keys (including arrays/maps containing structs), recurse (but only if types differ)
658-
(k1, k2) if k1 != k2 => classify_data_type_change(k1, k2),
659-
_ => None,
704+
// For non-struct keys (including arrays/maps containing structs), recurse
705+
(k1, k2) => classify_data_type_change(k1, k2),
660706
};
661707

662708
// Recursively check for value type changes
663709
let value_type_change = match (before_map.value_type(), after_map.value_type()) {
664710
// Both have struct values - nested changes handled via field IDs
665711
(DataType::Struct(_), DataType::Struct(_)) => None,
666-
// For non-struct values (including arrays/maps containing structs), recurse (but only if types differ)
667-
(v1, v2) if v1 != v2 => classify_data_type_change(v1, v2),
668-
_ => None,
712+
// For non-struct values (including arrays/maps containing structs), recurse
713+
(v1, v2) => classify_data_type_change(v1, v2),
669714
};
670715

671716
// Check container nullability change
@@ -2540,5 +2585,137 @@ mod tests {
25402585
])));
25412586

25422587
assert!(!diff5.has_breaking_changes()); // Removal is safe, renames are safe
2588+
2589+
// Case 6: Nullability tightening in deeply nested structure - should be breaking
2590+
// Test: array<struct<items: array<struct<value int nullable>>>> -> array<struct<items: array<struct<value int not null>>>>
2591+
let before6 = StructType::new_unchecked([create_field_with_id(
2592+
"wrapper",
2593+
DataType::Array(Box::new(ArrayType::new(
2594+
DataType::try_struct_type([create_field_with_id(
2595+
"items",
2596+
DataType::Array(Box::new(ArrayType::new(
2597+
DataType::try_struct_type([
2598+
create_field_with_id("value", DataType::INTEGER, true, 3), // Nullable
2599+
])
2600+
.unwrap(),
2601+
false,
2602+
))),
2603+
false,
2604+
2,
2605+
)])
2606+
.unwrap(),
2607+
false,
2608+
))),
2609+
false,
2610+
1,
2611+
)]);
2612+
2613+
let after6 = StructType::new_unchecked([create_field_with_id(
2614+
"wrapper",
2615+
DataType::Array(Box::new(ArrayType::new(
2616+
DataType::try_struct_type([create_field_with_id(
2617+
"items",
2618+
DataType::Array(Box::new(ArrayType::new(
2619+
DataType::try_struct_type([
2620+
create_field_with_id("value", DataType::INTEGER, false, 3), // Non-nullable now - BREAKING!
2621+
])
2622+
.unwrap(),
2623+
false,
2624+
))),
2625+
false,
2626+
2,
2627+
)])
2628+
.unwrap(),
2629+
false,
2630+
))),
2631+
false,
2632+
1,
2633+
)]);
2634+
2635+
let diff6 = SchemaDiffArgs {
2636+
before: &before6,
2637+
after: &after6,
2638+
}
2639+
.compute_diff()
2640+
.unwrap();
2641+
2642+
assert_eq!(diff6.added_fields.len(), 0);
2643+
assert_eq!(diff6.removed_fields.len(), 0);
2644+
assert_eq!(diff6.updated_fields.len(), 1);
2645+
assert_eq!(
2646+
diff6.updated_fields[0].path,
2647+
ColumnName::new(["wrapper", "element", "items", "element", "value"])
2648+
);
2649+
assert_eq!(
2650+
diff6.updated_fields[0].change_type,
2651+
FieldChangeType::NullabilityTightened
2652+
);
2653+
assert!(diff6.has_breaking_changes()); // Nullability tightening is breaking
2654+
2655+
// Case 7: Array container nullability tightening in nested structure - should be breaking
2656+
// Test: array<struct<items: array<struct<value int> nullable>>> -> array<struct<items: array<struct<value int> not null>>>
2657+
let before7 = StructType::new_unchecked([create_field_with_id(
2658+
"wrapper",
2659+
DataType::Array(Box::new(ArrayType::new(
2660+
DataType::try_struct_type([create_field_with_id(
2661+
"items",
2662+
DataType::Array(Box::new(ArrayType::new(
2663+
DataType::try_struct_type([
2664+
create_field_with_id("value", DataType::INTEGER, false, 3),
2665+
])
2666+
.unwrap(),
2667+
true, // Array elements are nullable
2668+
))),
2669+
false,
2670+
2,
2671+
)])
2672+
.unwrap(),
2673+
false,
2674+
))),
2675+
false,
2676+
1,
2677+
)]);
2678+
2679+
let after7 = StructType::new_unchecked([create_field_with_id(
2680+
"wrapper",
2681+
DataType::Array(Box::new(ArrayType::new(
2682+
DataType::try_struct_type([create_field_with_id(
2683+
"items",
2684+
DataType::Array(Box::new(ArrayType::new(
2685+
DataType::try_struct_type([
2686+
create_field_with_id("value", DataType::INTEGER, false, 3),
2687+
])
2688+
.unwrap(),
2689+
false, // Array elements now non-nullable - BREAKING!
2690+
))),
2691+
false,
2692+
2,
2693+
)])
2694+
.unwrap(),
2695+
false,
2696+
))),
2697+
false,
2698+
1,
2699+
)]);
2700+
2701+
let diff7 = SchemaDiffArgs {
2702+
before: &before7,
2703+
after: &after7,
2704+
}
2705+
.compute_diff()
2706+
.unwrap();
2707+
2708+
assert_eq!(diff7.added_fields.len(), 0);
2709+
assert_eq!(diff7.removed_fields.len(), 0);
2710+
assert_eq!(diff7.updated_fields.len(), 1);
2711+
assert_eq!(
2712+
diff7.updated_fields[0].path,
2713+
ColumnName::new(["wrapper", "element", "items"])
2714+
);
2715+
assert_eq!(
2716+
diff7.updated_fields[0].change_type,
2717+
FieldChangeType::ContainerNullabilityTightened
2718+
);
2719+
assert!(diff7.has_breaking_changes()); // Container nullability tightening is breaking
25432720
}
25442721
}

0 commit comments

Comments
 (0)