Skip to content

Commit acc045d

Browse files
committed
use immutable addrmdedup
1 parent b36ac03 commit acc045d

File tree

3 files changed

+54
-15
lines changed

3 files changed

+54
-15
lines changed

kernel/src/log_replay.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,10 @@ impl ActionsBatch {
204204
}
205205
}
206206

207+
pub(crate) trait ParallelizableLogReplayProcessor: LogReplayProcessor {
208+
fn process_actions_batch(&self, actions_batch: ActionsBatch) -> DeltaResult<Self::Output>;
209+
}
210+
207211
/// A trait for processing batches of actions from Delta transaction logs during log replay.
208212
///
209213
/// Log replay processors scan transaction logs in **reverse chronological order** (newest to oldest),

kernel/src/log_replay/deduplicator.rs

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,12 @@
1010
//!
1111
//! [`FileActionDeduplicator`]: crate::log_replay::FileActionDeduplicator
1212
13-
use crate::{
14-
actions::deletion_vector::DeletionVectorDescriptor,
15-
engine_data::{GetData, TypedGetData},
16-
DeltaResult,
17-
};
13+
use std::collections::HashSet;
14+
15+
use crate::actions::deletion_vector::DeletionVectorDescriptor;
16+
use crate::engine_data::{GetData, TypedGetData};
17+
use crate::log_replay::FileActionKey;
18+
use crate::DeltaResult;
1819

1920
pub(crate) trait Deduplicator {
2021
/// Key type for identifying file actions. JSON deduplicators use `FileActionKey`
@@ -67,24 +68,24 @@ pub(crate) trait Deduplicator {
6768
}
6869

6970
#[allow(unused)]
70-
pub(crate) struct CheckpointDeduplicator {
71-
seen_file_keys: HashSet<String>,
71+
pub(crate) struct CheckpointDeduplicator<'a> {
72+
seen_file_keys: &'a HashSet<FileActionKey>,
7273
add_path_index: usize,
7374
}
74-
impl CheckpointDeduplicator {
75+
impl CheckpointDeduplicator<'_> {
7576
#[allow(unused)]
76-
pub(crate) fn try_new(
77-
seen_file_keys: HashSet<String>,
77+
pub(crate) fn try_new<'a>(
78+
seen_file_keys: &'a HashSet<FileActionKey>,
7879
add_path_index: usize,
79-
) -> DeltaResult<Self> {
80-
Ok(Self {
80+
) -> DeltaResult<CheckpointDeduplicator<'a>> {
81+
Ok(CheckpointDeduplicator {
8182
seen_file_keys,
8283
add_path_index,
8384
})
8485
}
8586
}
8687

87-
impl Deduplicator for CheckpointDeduplicator {
88+
impl Deduplicator for CheckpointDeduplicator<'_> {
8889
type Key = String;
8990

9091
fn extract_file_action<'a>(

kernel/src/scan/log_replay.rs

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,11 @@ use crate::actions::get_log_add_schema;
1212
use crate::engine_data::{GetData, RowVisitor, TypedGetData as _};
1313
use crate::expressions::{column_name, ColumnName, Expression, ExpressionRef, PredicateRef};
1414
use crate::kernel_predicates::{DefaultKernelPredicateEvaluator, KernelPredicateEvaluator as _};
15-
use crate::log_replay::deduplicator::Deduplicator;
16-
use crate::log_replay::{ActionsBatch, FileActionDeduplicator, FileActionKey, LogReplayProcessor};
15+
use crate::log_replay::deduplicator::{self, CheckpointDeduplicator, Deduplicator};
16+
use crate::log_replay::{
17+
ActionsBatch, FileActionDeduplicator, FileActionKey, LogReplayProcessor,
18+
ParallelizableLogReplayProcessor,
19+
};
1720
use crate::scan::Scalar;
1821
use crate::schema::ToSchema as _;
1922
use crate::schema::{ColumnNamesAndTypes, DataType, MapType, StructField, StructType};
@@ -480,6 +483,37 @@ pub(crate) fn get_scan_metadata_transform_expr() -> ExpressionRef {
480483
EXPR.clone()
481484
}
482485

486+
impl ParallelizableLogReplayProcessor for ScanLogReplayProcessor {
487+
fn process_actions_batch(&self, actions_batch: ActionsBatch) -> DeltaResult<Self::Output> {
488+
let ActionsBatch {
489+
actions,
490+
is_log_batch: _,
491+
} = actions_batch;
492+
// Build an initial selection vector for the batch which has had the data skipping filter
493+
// applied. The selection vector is further updated by the deduplication visitor to remove
494+
// rows that are not valid adds.
495+
let selection_vector = self.build_selection_vector(actions.as_ref())?;
496+
assert_eq!(selection_vector.len(), actions.len());
497+
498+
let deduplicator = CheckpointDeduplicator::try_new(&self.seen_file_keys, ADD_PATH_INDEX)?;
499+
let mut visitor = AddRemoveDedupVisitor::new(
500+
deduplicator,
501+
selection_vector,
502+
self.state_info.clone(),
503+
self.partition_filter.clone(),
504+
);
505+
visitor.visit_rows_of(actions.as_ref())?;
506+
507+
// TODO: Teach expression eval to respect the selection vector we just computed so carefully!
508+
let result = self.add_transform.evaluate(actions.as_ref())?;
509+
ScanMetadata::try_new(
510+
result,
511+
visitor.selection_vector,
512+
visitor.row_transform_exprs,
513+
)
514+
}
515+
}
516+
483517
impl LogReplayProcessor for ScanLogReplayProcessor {
484518
type Output = ScanMetadata;
485519

0 commit comments

Comments
 (0)