@@ -18,11 +18,13 @@ use std::{
1818
1919use arrow:: {
2020 array:: {
21- as_primitive_array, as_string_array, ArrayIter , ArrayRef , BooleanArray ,
22- BooleanBufferBuilder , PrimitiveArray , StringArray ,
21+ as_primitive_array, as_string_array, ArrayRef , BooleanArray , BooleanBufferBuilder ,
22+ PrimitiveArray , StringArray ,
23+ } ,
24+ compute:: {
25+ and, filter, filter_record_batch,
26+ kernels:: cmp:: { distinct, eq} ,
2327 } ,
24- compute:: kernels:: cmp:: eq,
25- compute:: { and, filter_record_batch} ,
2628 datatypes:: { ArrowPrimitiveType , DataType , Int32Type , Int64Type } ,
2729 error:: ArrowError ,
2830 record_batch:: RecordBatch ,
@@ -260,12 +262,12 @@ fn distinct_values(array: ArrayRef) -> Result<DistinctValues, ArrowError> {
260262 DataType :: Int32 => Ok ( DistinctValues :: Int ( distinct_values_primitive :: <
261263 i32 ,
262264 Int32Type ,
263- > ( array) ) ) ,
265+ > ( array) ? ) ) ,
264266 DataType :: Int64 => Ok ( DistinctValues :: Long ( distinct_values_primitive :: <
265267 i64 ,
266268 Int64Type ,
267- > ( array) ) ) ,
268- DataType :: Utf8 => Ok ( DistinctValues :: String ( distinct_values_string ( array) ) ) ,
269+ > ( array) ? ) ) ,
270+ DataType :: Utf8 => Ok ( DistinctValues :: String ( distinct_values_string ( array) ? ) ) ,
269271 _ => Err ( ArrowError :: ComputeError (
270272 "Datatype not supported for transform." . to_string ( ) ,
271273 ) ) ,
@@ -285,15 +287,36 @@ fn distinct_values(array: ArrayRef) -> Result<DistinctValues, ArrowError> {
285287/// A HashSet containing all unique values from the array
286288fn distinct_values_primitive < T : Eq + Hash , P : ArrowPrimitiveType < Native = T > > (
287289 array : ArrayRef ,
288- ) -> HashSet < P :: Native > {
289- let mut set = HashSet :: new ( ) ;
290+ ) -> Result < HashSet < P :: Native > , ArrowError > {
290291 let array = as_primitive_array :: < P > ( & array) ;
291- for value in ArrayIter :: new ( array) . flatten ( ) {
292- if !set. contains ( & value) {
293- set. insert ( value) ;
294- }
292+
293+ let first = array. value ( 0 ) ;
294+
295+ let slice_len = array. len ( ) - 1 ;
296+
297+ if slice_len == 0 {
298+ return Ok ( HashSet :: from_iter ( [ first] ) ) ;
295299 }
296- set
300+
301+ let v1 = array. slice ( 0 , slice_len) ;
302+ let v2 = array. slice ( 1 , slice_len) ;
303+
304+ // Which consecutive entries are different
305+ let mask = distinct ( & v1, & v2) ?;
306+
307+ let unique = filter ( & v2, & mask) ?;
308+
309+ let unique = as_primitive_array :: < P > ( & unique) ;
310+
311+ let set = unique
312+ . iter ( )
313+ . fold ( HashSet :: from_iter ( [ first] ) , |mut acc, x| {
314+ if let Some ( x) = x {
315+ acc. insert ( x) ;
316+ }
317+ acc
318+ } ) ;
319+ Ok ( set)
297320}
298321
299322/// Extracts distinct string values from an Arrow array into a HashSet
@@ -303,15 +326,36 @@ fn distinct_values_primitive<T: Eq + Hash, P: ArrowPrimitiveType<Native = T>>(
303326///
304327/// # Returns
305328/// A HashSet containing all unique string values from the array
306- fn distinct_values_string ( array : ArrayRef ) -> HashSet < String > {
307- let mut set = HashSet :: new ( ) ;
329+ fn distinct_values_string ( array : ArrayRef ) -> Result < HashSet < String > , ArrowError > {
330+ let slice_len = array. len ( ) - 1 ;
331+
308332 let array = as_string_array ( & array) ;
309- for value in ArrayIter :: new ( array) . flatten ( ) {
310- if !set. contains ( value) {
311- set. insert ( value. to_owned ( ) ) ;
312- }
333+
334+ let first = array. value ( 0 ) . to_owned ( ) ;
335+
336+ if slice_len == 0 {
337+ return Ok ( HashSet :: from_iter ( [ first] ) ) ;
313338 }
314- set
339+
340+ let v1 = array. slice ( 0 , slice_len) ;
341+ let v2 = array. slice ( 1 , slice_len) ;
342+
343+ // Which consecutive entries are different
344+ let mask = distinct ( & v1, & v2) ?;
345+
346+ let unique = filter ( & v2, & mask) ?;
347+
348+ let unique = as_string_array ( & unique) ;
349+
350+ let set = unique
351+ . iter ( )
352+ . fold ( HashSet :: from_iter ( [ first] ) , |mut acc, x| {
353+ if let Some ( x) = x {
354+ acc. insert ( x. to_owned ( ) ) ;
355+ }
356+ acc
357+ } ) ;
358+ Ok ( set)
315359}
316360
317361/// Represents distinct values found in Arrow arrays during partitioning
0 commit comments