Skip to content

Commit 65ba595

Browse files
somasaysSomasundaram Sekar
andauthored
fix: Allow writing data with missing optional map fields (#2797)
## Summary - Fixes #2684: Writing to a table with an optional map field fails when the data is missing that field - Modified `_SchemaCompatibilityVisitor.field()` to skip child validation when optional parent is missing - Added 5 unit tests for schema compatibility with optional nested fields ## Root Cause When writing data to a table with an optional map field, the schema compatibility check incorrectly failed if the data was missing that field. This happened because the validator descended into the map's internal key field (which is always `required=True` per Iceberg spec) even when the parent map field was optional and missing. ## The Fix Modified `_SchemaCompatibilityVisitor.field()` in `pyiceberg/schema.py` to check if the field exists in the provided schema before descending into children. If an optional parent field is missing, we skip child validation entirely. ## Test plan - [x] All 328 schema tests pass - [x] Lint passes (`make lint`) - [x] New tests cover: - Optional map field missing (should pass) - main fix - Required map field missing (should fail) - Optional list field missing (should pass) - Optional struct field missing (should pass) - Optional map field present (should pass) Co-authored-by: Somasundaram Sekar <somasundaramsekar.1986@gmail.com>
1 parent 454c17f commit 65ba595

File tree

2 files changed

+110
-1
lines changed

2 files changed

+110
-1
lines changed

pyiceberg/schema.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1783,7 +1783,11 @@ def struct(self, struct: StructType, field_results: builtins.list[Callable[[], b
17831783
return all(results)
17841784

17851785
def field(self, field: NestedField, field_result: Callable[[], bool]) -> bool:
1786-
return self._is_field_compatible(field) and field_result()
1786+
# Skip child validation for missing optional fields (#2797)
1787+
is_compatible = self._is_field_compatible(field)
1788+
if field.field_id not in self.provided_schema._lazy_id_to_field:
1789+
return is_compatible
1790+
return is_compatible and field_result()
17871791

17881792
def list(self, list_type: ListType, element_result: Callable[[], bool]) -> bool:
17891793
return self._is_field_compatible(list_type.element_field) and element_result()

tests/test_schema.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from pyiceberg.schema import (
2626
Accessor,
2727
Schema,
28+
_check_schema_compatible,
2829
build_position_accessors,
2930
index_by_id,
3031
index_by_name,
@@ -1687,3 +1688,107 @@ def test_arrow_schema() -> None:
16871688
)
16881689

16891690
assert base_schema.as_arrow() == expected_schema
1691+
1692+
1693+
def test_check_schema_compatible_optional_map_field_missing() -> None:
1694+
"""Test that optional map field missing from provided schema is compatible (issue #2684)."""
1695+
requested_schema = Schema(
1696+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1697+
NestedField(
1698+
field_id=2,
1699+
name="data",
1700+
field_type=MapType(key_id=3, key_type=StringType(), value_id=4, value_type=StringType()),
1701+
required=False, # Optional map field
1702+
),
1703+
)
1704+
# Provided schema is missing the optional map field
1705+
provided_schema = Schema(
1706+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1707+
)
1708+
# Should not raise - optional field can be missing
1709+
_check_schema_compatible(requested_schema, provided_schema)
1710+
1711+
1712+
def test_check_schema_compatible_required_map_field_missing() -> None:
1713+
"""Test that required map field missing from provided schema raises error."""
1714+
requested_schema = Schema(
1715+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1716+
NestedField(
1717+
field_id=2,
1718+
name="data",
1719+
field_type=MapType(key_id=3, key_type=StringType(), value_id=4, value_type=StringType()),
1720+
required=True, # Required map field
1721+
),
1722+
)
1723+
# Provided schema is missing the required map field
1724+
provided_schema = Schema(
1725+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1726+
)
1727+
# Should raise - required field cannot be missing
1728+
with pytest.raises(ValueError, match="Mismatch in fields"):
1729+
_check_schema_compatible(requested_schema, provided_schema)
1730+
1731+
1732+
def test_check_schema_compatible_optional_list_field_missing() -> None:
1733+
"""Test that optional list field missing from provided schema is compatible."""
1734+
requested_schema = Schema(
1735+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1736+
NestedField(
1737+
field_id=2,
1738+
name="items",
1739+
field_type=ListType(element_id=3, element_type=StringType(), element_required=True),
1740+
required=False, # Optional list field
1741+
),
1742+
)
1743+
# Provided schema is missing the optional list field
1744+
provided_schema = Schema(
1745+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1746+
)
1747+
# Should not raise - optional field can be missing
1748+
_check_schema_compatible(requested_schema, provided_schema)
1749+
1750+
1751+
def test_check_schema_compatible_optional_struct_field_missing() -> None:
1752+
"""Test that optional struct field missing from provided schema is compatible."""
1753+
requested_schema = Schema(
1754+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1755+
NestedField(
1756+
field_id=2,
1757+
name="details",
1758+
field_type=StructType(
1759+
NestedField(field_id=3, name="name", field_type=StringType(), required=True),
1760+
NestedField(field_id=4, name="count", field_type=IntegerType(), required=True),
1761+
),
1762+
required=False, # Optional struct field
1763+
),
1764+
)
1765+
# Provided schema is missing the optional struct field
1766+
provided_schema = Schema(
1767+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1768+
)
1769+
# Should not raise - optional field can be missing
1770+
_check_schema_compatible(requested_schema, provided_schema)
1771+
1772+
1773+
def test_check_schema_compatible_optional_map_field_present() -> None:
1774+
"""Test that optional map field present in provided schema is compatible."""
1775+
requested_schema = Schema(
1776+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1777+
NestedField(
1778+
field_id=2,
1779+
name="data",
1780+
field_type=MapType(key_id=3, key_type=StringType(), value_id=4, value_type=StringType()),
1781+
required=False,
1782+
),
1783+
)
1784+
provided_schema = Schema(
1785+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1786+
NestedField(
1787+
field_id=2,
1788+
name="data",
1789+
field_type=MapType(key_id=3, key_type=StringType(), value_id=4, value_type=StringType()),
1790+
required=False,
1791+
),
1792+
)
1793+
# Should not raise - schemas match
1794+
_check_schema_compatible(requested_schema, provided_schema)

0 commit comments

Comments
 (0)