Skip to content
19 changes: 17 additions & 2 deletions docs/dqx/docs/reference/quality_checks.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,10 @@ You can also define your own custom checks (see [Creating custom checks](#creati
| `is_geometrycollection` | Checks whether the values in the input column are geometrycollection geometries/geographies. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) |
| `is_ogc_valid` | Checks whether the values in the input column are valid geometries in the OGC sense. I.e a bowtie polygon is invalid because it has a self intersection. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) |
| `is_non_empty_geometry` | Checks whether the values in the input column are non-empty geometries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) |
| `is_not_null_island` | Checks whether the values in the input column are not null island geometries (POINT(0 0)). This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) |
| `has_dimension` | Checks whether the values in the input column are geometries of the specified dimension (2D projected dimension). This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `dimension`: dimension to check |
| `has_x_coordinate_between` | Checks whether the values in the input column are geometries with x coordinate between the provided boundaries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value |
| `has_y_coordinate_between` | Checks whether the values in the input column are geometries with y coordinate between the provided boundaries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value |
| `has_y_coordinate_between` | Checks whether the values in the input column are geometries with y coordinate between the provided boundaries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value |
</details>

<Admonition type="warning" title="Applicability">
Expand Down Expand Up @@ -580,7 +581,14 @@ For brevity, the `name` field in the examples is omitted and it will be auto-gen
function: is_non_empty_geometry
arguments:
column: point_geom


# is_not_null_island check
- criticality: error
check:
function: is_not_null_island
arguments:
column: point_geom

# has_dimension check
- criticality: error
check:
Expand Down Expand Up @@ -1044,6 +1052,13 @@ checks = [
column="point_geom"
),

# is_not_null_island check
DQRowRule(
criticality="error",
check_func=geo_check_funcs.is_not_null_island,
column="point_geom"
),

# has_dimension check
DQRowRule(
criticality="error",
Expand Down
29 changes: 29 additions & 0 deletions src/databricks/labs/dqx/geo/check_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,35 @@ def is_non_empty_geometry(column: str | Column) -> Column:
)


@register_rule("row")
def is_not_null_island(column: str | Column) -> Column:
    """Checks whether the values in the input column are not null island geometries (POINT(0 0)).

    The returned condition fails (produces a message) exactly when the value parses to a
    valid POINT geometry located at the origin; NULL inputs produce no verdict.

    Args:
        column: column to check; can be a string column name or a column expression

    Returns:
        Column object indicating whether the values in the input column are null island geometries

    Note:
        This function requires Databricks serverless compute or runtime 17.1 or above.
    """
    col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column)
    # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in
    # Databricks SQL, due to the use of the `try_to_geometry`, `st_geometrytype`, `st_x`, and `st_y` functions.
    # Hoist the parsed-geometry expression so it is spelled once instead of four times.
    geom = f"try_to_geometry({col_str_norm})"
    is_parsable = F.expr(f"{geom} IS NOT NULL")
    is_point = F.expr(f"st_geometrytype({geom}) = '{POINT_TYPE}'")
    is_origin = F.expr(f"st_x({geom}) = 0.0 AND st_y({geom}) = 0.0")
    # NULL input -> NULL (no verdict); otherwise fail only for a valid POINT at (0, 0).
    condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(is_parsable & is_point & is_origin)
    condition_str = f"column `{col_expr_str}` contains a null island"

    return make_condition(
        condition,
        F.lit(condition_str),
        f"{col_str_norm}_contains_null_island",
    )


@register_rule("row")
def has_dimension(column: str | Column, dimension: int) -> Column:
"""Checks whether the geometries/geographies in the input column have a given dimension.
Expand Down
11 changes: 11 additions & 0 deletions tests/integration/test_apply_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -5819,6 +5819,17 @@ def test_apply_checks_all_geo_checks_using_classes(skip_if_runtime_not_geo_compa
check_func=geo_check_funcs.is_non_empty_geometry,
column=F.col("point_geom"),
),
# is_not_null_island check
DQRowRule(
criticality="error",
check_func=geo_check_funcs.is_not_null_island,
column="point_geom",
),
DQRowRule(
criticality="error",
check_func=geo_check_funcs.is_not_null_island,
column=F.col("point_geom"),
),
# has_dimension check
DQRowRule(
criticality="error",
Expand Down
24 changes: 24 additions & 0 deletions tests/integration/test_row_checks_geo.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
is_multilinestring,
is_multipoint,
is_multipolygon,
is_not_null_island,
is_point,
is_polygon,
is_ogc_valid,
Expand Down Expand Up @@ -333,6 +334,29 @@ def test_is_non_empty_geometry(skip_if_runtime_not_geo_compatible, spark):
assert_df_equality(actual, expected, ignore_nullable=True)


def test_is_not_null_island(skip_if_runtime_not_geo_compatible, spark):
    # Inputs cover: valid non-island point, the null island, a non-point geometry,
    # an unparsable string, and NULL.
    rows = [["POINT(1 1)"], ["POINT(0 0)"], ["LINESTRING(0 0, 1 1)"], ["nonsense"], [None]]
    test_df = spark.createDataFrame(rows, "geom: string")

    actual = test_df.select(is_not_null_island("geom"))

    # Only the POINT(0 0) row is flagged; every other row yields no verdict.
    failure_msg = "column `geom` contains a null island"
    expected_rows = [[None], [failure_msg], [None], [None], [None]]
    expected = spark.createDataFrame(expected_rows, "geom_contains_null_island: string")

    assert_df_equality(actual, expected, ignore_nullable=True)


def test_has_dimension(skip_if_runtime_not_geo_compatible, spark):
input_schema = "geom: string"
test_df = spark.createDataFrame(
Expand Down
12 changes: 12 additions & 0 deletions tests/perf/test_apply_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1509,6 +1509,18 @@ def test_benchmark_is_non_empty_geometry(skip_if_runtime_not_geo_compatible, ben
actual_count = benchmark(lambda: checked.count())
assert actual_count == EXPECTED_ROWS

def test_benchmark_is_not_null_island(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df):
    # Build the engine and a single is_not_null_island rule, then benchmark only the
    # action (count); plan construction stays outside the timed lambda.
    dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS)
    rule = DQRowRule(
        criticality="error",
        check_func=geo_check_funcs.is_not_null_island,
        column="point_geom",
    )
    checked = dq_engine.apply_checks(generated_geo_df, [rule])
    actual_count = benchmark(lambda: checked.count())
    assert actual_count == EXPECTED_ROWS

def test_benchmark_has_dimension(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df):
dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS)
Expand Down
7 changes: 7 additions & 0 deletions tests/resources/all_row_geo_checks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,13 @@
arguments:
column: point_geom

# is_not_null_island check
- criticality: error
check:
function: is_not_null_island
arguments:
column: point_geom

# has_dimension check
- criticality: error
check:
Expand Down
Loading