@@ -12,6 +12,7 @@ use crate::actions::get_log_add_schema;
1212use crate :: engine_data:: { GetData , RowVisitor , TypedGetData as _} ;
1313use crate :: expressions:: { column_name, ColumnName , Expression , ExpressionRef , PredicateRef } ;
1414use crate :: kernel_predicates:: { DefaultKernelPredicateEvaluator , KernelPredicateEvaluator as _} ;
15+ use crate :: log_replay:: deduplicator:: Deduplicator ;
1516use crate :: log_replay:: { ActionsBatch , FileActionDeduplicator , FileActionKey , LogReplayProcessor } ;
1617use crate :: scan:: Scalar ;
1718use crate :: schema:: ToSchema as _;
@@ -46,6 +47,15 @@ pub struct SerializableScanState {
4647 pub seen_file_keys : HashSet < FileActionKey > ,
4748}
4849
50+ // Getter index positions. These must match the order of the columns defined in
51+ // `selected_column_names_and_types()`; update both together if that column list changes.
52+ pub ( crate ) const ADD_PATH_INDEX : usize = 0 ; // Position of "add.path" in getters
53+ const ADD_PARTITION_VALUES_INDEX : usize = 1 ; // Position of "add.partitionValues" in getters
54+ pub ( crate ) const ADD_DV_START_INDEX : usize = 2 ; // Start position of the add deletion vector columns (presumably indices 2..=4, ending before `BASE_ROW_ID_INDEX` - confirm)
55+ const BASE_ROW_ID_INDEX : usize = 5 ; // Position of "add.baseRowId" in getters
56+ pub ( crate ) const REMOVE_PATH_INDEX : usize = 6 ; // Position of "remove.path" in getters
57+ pub ( crate ) const REMOVE_DV_START_INDEX : usize = 7 ; // Start position of the remove deletion vector columns
58+
4959/// [`ScanLogReplayProcessor`] performs log replay (processes actions) specifically for doing a table scan.
5060///
5161/// During a table scan, the processor reads batches of log actions (in reverse chronological order)
@@ -226,40 +236,23 @@ impl ScanLogReplayProcessor {
226236/// replay visits actions newest-first, so once we've seen a file action for a given (path, dvId)
227237/// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the
228238/// first action for a given file is a remove, then that file does not show up in the result at all.
229- struct AddRemoveDedupVisitor < ' seen > {
230- deduplicator : FileActionDeduplicator < ' seen > ,
239+ struct AddRemoveDedupVisitor < D : Deduplicator > {
240+ deduplicator : D , // Any `Deduplicator`; queried through `check_and_record_seen` to skip file actions already seen
231241 selection_vector : Vec < bool > , // Supplied by the caller, which asserts its length equals the number of actions in the batch
232242 state_info : Arc < StateInfo > , // Shared scan state; provides the `transform_spec` and `logical_schema` used when parsing partition values
233243 partition_filter : Option < PredicateRef > , // Optional predicate; NOTE(review): presumably used to prune rows by partition values - confirm
234244 row_transform_exprs : Vec < Option < ExpressionRef > > , // NOTE(review): presumably one optional transform expression per row - confirm
235245}
236246
237- impl AddRemoveDedupVisitor < ' _ > {
238- // These index positions correspond to the order of columns defined in
239- // `selected_column_names_and_types()`
240- const ADD_PATH_INDEX : usize = 0 ; // Position of "add.path" in getters
241- const ADD_PARTITION_VALUES_INDEX : usize = 1 ; // Position of "add.partitionValues" in getters
242- const ADD_DV_START_INDEX : usize = 2 ; // Start position of add deletion vector columns
243- const BASE_ROW_ID_INDEX : usize = 5 ; // Position of add.baseRowId in getters
244- const REMOVE_PATH_INDEX : usize = 6 ; // Position of "remove.path" in getters
245- const REMOVE_DV_START_INDEX : usize = 7 ; // Start position of remove deletion vector columns
246-
247+ impl < D : Deduplicator > AddRemoveDedupVisitor < D > {
247248 fn new (
248- seen : & mut HashSet < FileActionKey > ,
249+ deduplicator : D ,
249250 selection_vector : Vec < bool > ,
250251 state_info : Arc < StateInfo > ,
251252 partition_filter : Option < PredicateRef > ,
252- is_log_batch : bool ,
253- ) -> AddRemoveDedupVisitor < ' _ > {
253+ ) -> AddRemoveDedupVisitor < D > {
254254 AddRemoveDedupVisitor {
255- deduplicator : FileActionDeduplicator :: new (
256- seen,
257- is_log_batch,
258- Self :: ADD_PATH_INDEX ,
259- Self :: REMOVE_PATH_INDEX ,
260- Self :: ADD_DV_START_INDEX ,
261- Self :: REMOVE_DV_START_INDEX ,
262- ) ,
255+ deduplicator,
263256 selection_vector,
264257 state_info,
265258 partition_filter,
@@ -312,7 +305,7 @@ impl AddRemoveDedupVisitor<'_> {
312305 let partition_values = match & self . state_info . transform_spec {
313306 Some ( transform) if is_add => {
314307 let partition_values =
315- getters[ Self :: ADD_PARTITION_VALUES_INDEX ] . get ( i, "add.partitionValues" ) ?;
308+ getters[ ADD_PARTITION_VALUES_INDEX ] . get ( i, "add.partitionValues" ) ?;
316309 let partition_values = parse_partition_values (
317310 & self . state_info . logical_schema ,
318311 transform,
@@ -331,8 +324,7 @@ impl AddRemoveDedupVisitor<'_> {
331324 if self . deduplicator . check_and_record_seen ( file_key) || !is_add {
332325 return Ok ( false ) ;
333326 }
334- let base_row_id: Option < i64 > =
335- getters[ Self :: BASE_ROW_ID_INDEX ] . get_opt ( i, "add.baseRowId" ) ?;
327+ let base_row_id: Option < i64 > = getters[ BASE_ROW_ID_INDEX ] . get_opt ( i, "add.baseRowId" ) ?;
336328 let transform = self
337329 . state_info
338330 . transform_spec
@@ -355,7 +347,7 @@ impl AddRemoveDedupVisitor<'_> {
355347 }
356348}
357349
358- impl RowVisitor for AddRemoveDedupVisitor < ' _ > {
350+ impl < D : Deduplicator > RowVisitor for AddRemoveDedupVisitor < D > {
359351 fn selected_column_names_and_types ( & self ) -> ( & ' static [ ColumnName ] , & ' static [ DataType ] ) {
360352 // NOTE: The visitor assumes a schema with adds first and removes optionally afterward.
361353 static NAMES_AND_TYPES : LazyLock < ColumnNamesAndTypes > = LazyLock :: new ( || {
@@ -501,12 +493,19 @@ impl LogReplayProcessor for ScanLogReplayProcessor {
501493 let selection_vector = self . build_selection_vector ( actions. as_ref ( ) ) ?;
502494 assert_eq ! ( selection_vector. len( ) , actions. len( ) ) ;
503495
504- let mut visitor = AddRemoveDedupVisitor :: new (
496+ let deduplicator = FileActionDeduplicator :: new (
505497 & mut self . seen_file_keys ,
498+ is_log_batch,
499+ ADD_PATH_INDEX ,
500+ REMOVE_PATH_INDEX ,
501+ ADD_DV_START_INDEX ,
502+ REMOVE_DV_START_INDEX ,
503+ ) ;
504+ let mut visitor = AddRemoveDedupVisitor :: new (
505+ deduplicator,
506506 selection_vector,
507507 self . state_info . clone ( ) ,
508508 self . partition_filter . clone ( ) ,
509- is_log_batch,
510509 ) ;
511510 visitor. visit_rows_of ( actions. as_ref ( ) ) ?;
512511
0 commit comments