Skip to content

Commit 7e5f162

Browse files
committed
improve distinct count approximation to datafusion statistics
1 parent ec54547 commit 7e5f162

File tree

2 files changed

+72
-9
lines changed

2 files changed

+72
-9
lines changed

datafusion_iceberg/src/statistics.rs

Lines changed: 57 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use datafusion::{
44
scalar::ScalarValue,
55
};
66
use iceberg_rust::error::Error;
7+
use iceberg_rust::file_format::parquet::estimate_distinct_count;
78
use iceberg_rust::spec::{
89
manifest::{ManifestEntry, Status},
910
schema::Schema,
@@ -45,12 +46,16 @@ pub(crate) fn statistics_from_datafiles(
4546
.column_statistics
4647
.into_iter()
4748
.zip(column_stats)
48-
.map(|(acc, x)| ColumnStatistics {
49-
null_count: acc.null_count.add(&x.null_count),
50-
max_value: acc.max_value.max(&x.max_value),
51-
min_value: acc.min_value.min(&x.min_value),
52-
distinct_count: acc.distinct_count.add(&x.distinct_count),
53-
sum_value: acc.sum_value.add(&x.sum_value),
49+
.map(|(acc, x)| {
50+
let new_distinct_count = new_distinct_count(&acc, &x);
51+
52+
ColumnStatistics {
53+
null_count: acc.null_count.add(&x.null_count),
54+
max_value: acc.max_value.max(&x.max_value),
55+
min_value: acc.min_value.min(&x.min_value),
56+
distinct_count: new_distinct_count,
57+
sum_value: acc.sum_value.add(&x.sum_value),
58+
}
5459
})
5560
.collect(),
5661
}
@@ -134,3 +139,49 @@ fn convert_value_to_scalar_value(value: Value) -> Result<ScalarValue, Error> {
134139
)),
135140
}
136141
}
142+
143+
fn new_distinct_count(acc: &ColumnStatistics, x: &ColumnStatistics) -> Precision<usize> {
144+
match (
145+
&acc.distinct_count,
146+
&x.distinct_count,
147+
&acc.min_value,
148+
&acc.max_value,
149+
&x.min_value,
150+
&x.max_value,
151+
) {
152+
(
153+
Precision::Exact(old_count),
154+
Precision::Exact(new_count),
155+
Precision::Exact(ScalarValue::Int32(Some(old_min))),
156+
Precision::Exact(ScalarValue::Int32(Some(old_max))),
157+
Precision::Exact(ScalarValue::Int32(Some(new_min))),
158+
Precision::Exact(ScalarValue::Int32(Some(new_max))),
159+
) => {
160+
let estimated = estimate_distinct_count(
161+
&[old_min, old_max],
162+
&[new_min, new_max],
163+
*old_count as i64,
164+
*new_count as i64,
165+
);
166+
Precision::Inexact(*old_count + estimated as usize)
167+
}
168+
(
169+
Precision::Exact(old_count),
170+
Precision::Exact(new_count),
171+
Precision::Exact(ScalarValue::Int64(Some(old_min))),
172+
Precision::Exact(ScalarValue::Int64(Some(old_max))),
173+
Precision::Exact(ScalarValue::Int64(Some(new_min))),
174+
Precision::Exact(ScalarValue::Int64(Some(new_max))),
175+
) => {
176+
let estimated = estimate_distinct_count(
177+
&[old_min, old_max],
178+
&[new_min, new_max],
179+
*old_count as i64,
180+
*new_count as i64,
181+
);
182+
Precision::Inexact(*old_count + estimated as usize)
183+
}
184+
(Precision::Absent, Precision::Exact(_), _, _, _, _) => x.distinct_count,
185+
_ => acc.distinct_count.add(&x.distinct_count),
186+
}
187+
}

iceberg-rust/src/file_format/parquet.rs

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -334,8 +334,20 @@ fn range_overlap<T: Ord + Sub + Copy>(
334334
overlap_end - overlap_start
335335
}
336336

337-
/// Helper trait to convert numeric types to f64 for statistical calculations
338-
trait ToF64 {
337+
/// Helper trait to convert numeric types to f64 for statistical calculations.
338+
///
339+
/// This trait provides a uniform interface for converting integer types to f64,
340+
/// which is necessary for the statistical estimation algorithms. The conversion
341+
/// may be lossy for very large i64 values (beyond 2^53), but this is acceptable
342+
/// for statistical approximations.
343+
pub trait ToF64 {
344+
/// Converts the value to f64.
345+
///
346+
/// # Note
347+
///
348+
/// For i64 values larger than 2^53, precision may be lost in the conversion.
349+
/// This is acceptable for statistical calculations where exact precision is
350+
/// not required.
339351
fn to_f64(self) -> f64;
340352
}
341353

@@ -384,7 +396,7 @@ impl ToF64 for i64 {
384396
/// // New range [500, 1500] with 50 distinct values
385397
/// let new_count = estimate_distinct_count(&[&0, &1000], &[&500, &1500], 100, 50);
386398
/// ```
387-
fn estimate_distinct_count<T>(
399+
pub fn estimate_distinct_count<T>(
388400
old_range: &[&T; 2],
389401
new_range: &[&T; 2],
390402
old_distinct_count: i64,

0 commit comments

Comments
 (0)