Skip to content

Commit 162ad16

Browse files
author
Jan Kaul
committed
use simd to determine distinct partitions
1 parent 80f03e0 commit 162ad16

File tree

1 file changed

+65
-21
lines changed

1 file changed

+65
-21
lines changed

iceberg-rust/src/arrow/partition.rs

Lines changed: 65 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,13 @@ use std::{
1818

1919
use arrow::{
2020
array::{
21-
as_primitive_array, as_string_array, ArrayIter, ArrayRef, BooleanArray,
22-
BooleanBufferBuilder, PrimitiveArray, StringArray,
21+
as_primitive_array, as_string_array, ArrayRef, BooleanArray, BooleanBufferBuilder,
22+
PrimitiveArray, StringArray,
23+
},
24+
compute::{
25+
and, filter, filter_record_batch,
26+
kernels::cmp::{distinct, eq},
2327
},
24-
compute::kernels::cmp::eq,
25-
compute::{and, filter_record_batch},
2628
datatypes::{ArrowPrimitiveType, DataType, Int32Type, Int64Type},
2729
error::ArrowError,
2830
record_batch::RecordBatch,
@@ -260,12 +262,12 @@ fn distinct_values(array: ArrayRef) -> Result<DistinctValues, ArrowError> {
260262
DataType::Int32 => Ok(DistinctValues::Int(distinct_values_primitive::<
261263
i32,
262264
Int32Type,
263-
>(array))),
265+
>(array)?)),
264266
DataType::Int64 => Ok(DistinctValues::Long(distinct_values_primitive::<
265267
i64,
266268
Int64Type,
267-
>(array))),
268-
DataType::Utf8 => Ok(DistinctValues::String(distinct_values_string(array))),
269+
>(array)?)),
270+
DataType::Utf8 => Ok(DistinctValues::String(distinct_values_string(array)?)),
269271
_ => Err(ArrowError::ComputeError(
270272
"Datatype not supported for transform.".to_string(),
271273
)),
@@ -285,15 +287,36 @@ fn distinct_values(array: ArrayRef) -> Result<DistinctValues, ArrowError> {
285287
/// A HashSet containing all unique values from the array
286288
fn distinct_values_primitive<T: Eq + Hash, P: ArrowPrimitiveType<Native = T>>(
287289
array: ArrayRef,
288-
) -> HashSet<P::Native> {
289-
let mut set = HashSet::new();
290+
) -> Result<HashSet<P::Native>, ArrowError> {
290291
let array = as_primitive_array::<P>(&array);
291-
for value in ArrayIter::new(array).flatten() {
292-
if !set.contains(&value) {
293-
set.insert(value);
294-
}
292+
293+
let first = array.value(0);
294+
295+
let slice_len = array.len() - 1;
296+
297+
if slice_len == 0 {
298+
return Ok(HashSet::from_iter([first]));
295299
}
296-
set
300+
301+
let v1 = array.slice(0, slice_len);
302+
let v2 = array.slice(1, slice_len);
303+
304+
// Which consecutive entries are different
305+
let mask = distinct(&v1, &v2)?;
306+
307+
let unique = filter(&v2, &mask)?;
308+
309+
let unique = as_primitive_array::<P>(&unique);
310+
311+
let set = unique
312+
.iter()
313+
.fold(HashSet::from_iter([first]), |mut acc, x| {
314+
if let Some(x) = x {
315+
acc.insert(x);
316+
}
317+
acc
318+
});
319+
Ok(set)
297320
}
298321

299322
/// Extracts distinct string values from an Arrow array into a HashSet
@@ -303,15 +326,36 @@ fn distinct_values_primitive<T: Eq + Hash, P: ArrowPrimitiveType<Native = T>>(
303326
///
304327
/// # Returns
305328
/// A HashSet containing all unique string values from the array
306-
fn distinct_values_string(array: ArrayRef) -> HashSet<String> {
307-
let mut set = HashSet::new();
329+
fn distinct_values_string(array: ArrayRef) -> Result<HashSet<String>, ArrowError> {
330+
let slice_len = array.len() - 1;
331+
308332
let array = as_string_array(&array);
309-
for value in ArrayIter::new(array).flatten() {
310-
if !set.contains(value) {
311-
set.insert(value.to_owned());
312-
}
333+
334+
let first = array.value(0).to_owned();
335+
336+
if slice_len == 0 {
337+
return Ok(HashSet::from_iter([first]));
313338
}
314-
set
339+
340+
let v1 = array.slice(0, slice_len);
341+
let v2 = array.slice(1, slice_len);
342+
343+
// Which consecutive entries are different
344+
let mask = distinct(&v1, &v2)?;
345+
346+
let unique = filter(&v2, &mask)?;
347+
348+
let unique = as_string_array(&unique);
349+
350+
let set = unique
351+
.iter()
352+
.fold(HashSet::from_iter([first]), |mut acc, x| {
353+
if let Some(x) = x {
354+
acc.insert(x.to_owned());
355+
}
356+
acc
357+
});
358+
Ok(set)
315359
}
316360

317361
/// Represents distinct values found in Arrow arrays during partitioning

0 commit comments

Comments
 (0)