Skip to content

Commit 17553c3

Browse files
committed
move dv extraction to deduplicator
1 parent ddcad9b commit 17553c3

File tree

2 files changed

+35
-32
lines changed

2 files changed

+35
-32
lines changed

kernel/src/log_replay.rs

Lines changed: 1 addition & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,7 @@
1313
//! This module provides structures for efficient batch processing, focusing on file action
1414
//! deduplication with `FileActionDeduplicator` which tracks unique files across log batches
1515
//! to minimize memory usage for tables with extensive history.
16-
use crate::actions::deletion_vector::DeletionVectorDescriptor;
17-
use crate::engine_data::{GetData, TypedGetData};
16+
use crate::engine_data::GetData;
1817
use crate::log_replay::deduplicator::Deduplicator;
1918
use crate::scan::data_skipping::DataSkippingFilter;
2019
use crate::{DeltaResult, EngineData};
@@ -90,35 +89,6 @@ impl<'seen> FileActionDeduplicator<'seen> {
9089
remove_dv_start_index,
9190
}
9291
}
93-
94-
/// Extracts the deletion vector unique ID if it exists.
95-
///
96-
/// This function retrieves the necessary fields for constructing a deletion vector unique ID
97-
/// by accessing `getters` at `dv_start_index` and the following two indices. Specifically:
98-
/// - `dv_start_index` retrieves the storage type (`deletionVector.storageType`).
99-
/// - `dv_start_index + 1` retrieves the path or inline deletion vector (`deletionVector.pathOrInlineDv`).
100-
/// - `dv_start_index + 2` retrieves the optional offset (`deletionVector.offset`).
101-
// NOTE(review): this is the deleted side of the diff (gutter markers `NNN-`): the inherent
// method being removed from `FileActionDeduplicator`. An identical body is added as a method
// on the `Deduplicator` trait in kernel/src/log_replay/deduplicator.rs by the same commit.
fn extract_dv_unique_id<'a>(
102-
&self,
103-
// `i` is forwarded unchanged as the first argument of every getter call below
// (presumably the row index within the batch -- confirm against `GetData`).
i: usize,
104-
getters: &[&'a dyn GetData<'a>],
105-
dv_start_index: usize,
106-
) -> DeltaResult<Option<String>> {
107-
// `getters` is indexed directly, so an out-of-range `dv_start_index` panics rather than
// returning an error. A `None` storage type short-circuits to `Ok(None)` without touching
// the other two getters.
match getters[dv_start_index].get_opt(i, "deletionVector.storageType")? {
108-
Some(storage_type) => {
109-
// Unlike the storage type and offset, this field is read with `get` rather than
// `get_opt`; any error from the getter is propagated by `?`.
let path_or_inline =
110-
getters[dv_start_index + 1].get(i, "deletionVector.pathOrInlineDv")?;
111-
let offset = getters[dv_start_index + 2].get_opt(i, "deletionVector.offset")?;
112-
113-
// The three extracted parts are handed to `unique_id_from_parts`, whose return value
// becomes the `Some(..)` payload.
Ok(Some(DeletionVectorDescriptor::unique_id_from_parts(
114-
storage_type,
115-
path_or_inline,
116-
offset,
117-
)))
118-
}
119-
None => Ok(None),
120-
}
121-
}
12292
}
12393

12494
impl<'seen> Deduplicator for FileActionDeduplicator<'seen> {

kernel/src/log_replay/deduplicator.rs

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,11 @@
1010
//!
1111
//! [`FileActionDeduplicator`]: crate::log_replay::FileActionDeduplicator
1212
13-
use crate::{engine_data::GetData, DeltaResult};
13+
use crate::{
14+
actions::deletion_vector::DeletionVectorDescriptor,
15+
engine_data::{GetData, TypedGetData},
16+
DeltaResult,
17+
};
1418

1519
pub(crate) trait Deduplicator {
1620
/// Key type for identifying file actions. JSON deduplicators use `FileActionKey`
@@ -31,4 +35,33 @@ pub(crate) trait Deduplicator {
3135

3236
/// Returns `true` for commit log batches (updates hashmap), `false` for checkpoints (read-only).
3337
fn is_log_batch(&self) -> bool;
38+
39+
/// Extracts the deletion vector unique ID if it exists.
40+
///
41+
/// This function retrieves the necessary fields for constructing a deletion vector unique ID
42+
/// by accessing `getters` at `dv_start_index` and the following two indices. Specifically:
43+
/// - `dv_start_index` retrieves the storage type (`deletionVector.storageType`).
44+
/// - `dv_start_index + 1` retrieves the path or inline deletion vector (`deletionVector.pathOrInlineDv`).
45+
/// - `dv_start_index + 2` retrieves the optional offset (`deletionVector.offset`).
46+
fn extract_dv_unique_id<'a>(
47+
&self,
48+
i: usize,
49+
getters: &[&'a dyn GetData<'a>],
50+
dv_start_index: usize,
51+
) -> DeltaResult<Option<String>> {
52+
match getters[dv_start_index].get_opt(i, "deletionVector.storageType")? {
53+
Some(storage_type) => {
54+
let path_or_inline =
55+
getters[dv_start_index + 1].get(i, "deletionVector.pathOrInlineDv")?;
56+
let offset = getters[dv_start_index + 2].get_opt(i, "deletionVector.offset")?;
57+
58+
Ok(Some(DeletionVectorDescriptor::unique_id_from_parts(
59+
storage_type,
60+
path_or_inline,
61+
offset,
62+
)))
63+
}
64+
None => Ok(None),
65+
}
66+
}
3467
}

0 commit comments

Comments
 (0)