@@ -71,22 +71,25 @@ pub(crate) trait Deduplicator {
7171pub ( crate ) struct CheckpointDeduplicator < ' a > {
7272 seen_file_keys : & ' a HashSet < FileActionKey > ,
7373 add_path_index : usize ,
74+ add_dv_start_index : usize ,
7475}
7576impl CheckpointDeduplicator < ' _ > {
7677 #[ allow( unused) ]
7778 pub ( crate ) fn try_new < ' a > (
7879 seen_file_keys : & ' a HashSet < FileActionKey > ,
7980 add_path_index : usize ,
81+ add_dv_start_index : usize ,
8082 ) -> DeltaResult < CheckpointDeduplicator < ' a > > {
8183 Ok ( CheckpointDeduplicator {
8284 seen_file_keys,
8385 add_path_index,
86+ add_dv_start_index,
8487 } )
8588 }
8689}
8790
8891impl Deduplicator for CheckpointDeduplicator < ' _ > {
89- type Key = String ;
92+ type Key = FileActionKey ;
9093
9194 fn extract_file_action < ' a > (
9295 & self ,
@@ -96,7 +99,8 @@ impl Deduplicator for CheckpointDeduplicator<'_> {
9699 ) -> DeltaResult < Option < ( Self :: Key , bool ) > > {
97100 // Try to extract an add action by the required path column
98101 if let Some ( path) = getters[ self . add_path_index ] . get_str ( i, "add.path" ) ? {
99- Ok ( Some ( ( path. to_string ( ) , true ) ) )
102+ let dv_unique_id = extract_dv_unique_id ( i, getters, self . add_dv_start_index ) ?;
103+ Ok ( Some ( ( FileActionKey :: new ( path, dv_unique_id) , true ) ) )
100104 } else {
101105 Ok ( None )
102106 }
@@ -110,3 +114,31 @@ impl Deduplicator for CheckpointDeduplicator<'_> {
110114 false
111115 }
112116}
117+
118+ /// Extracts the deletion vector unique ID if it exists.
119+ ///
120+ /// This function retrieves the necessary fields for constructing a deletion vector unique ID
121+ /// by accessing `getters` at `dv_start_index` and the following two indices. Specifically:
122+ /// - `dv_start_index` retrieves the storage type (`deletionVector.storageType`).
123+ /// - `dv_start_index + 1` retrieves the path or inline deletion vector (`deletionVector.pathOrInlineDv`).
124+ /// - `dv_start_index + 2` retrieves the optional offset (`deletionVector.offset`).
125+ pub ( crate ) fn extract_dv_unique_id < ' a > (
126+ i : usize ,
127+ getters : & [ & ' a dyn GetData < ' a > ] ,
128+ dv_start_index : usize ,
129+ ) -> DeltaResult < Option < String > > {
130+ match getters[ dv_start_index] . get_opt ( i, "deletionVector.storageType" ) ? {
131+ Some ( storage_type) => {
132+ let path_or_inline =
133+ getters[ dv_start_index + 1 ] . get ( i, "deletionVector.pathOrInlineDv" ) ?;
134+ let offset = getters[ dv_start_index + 2 ] . get_opt ( i, "deletionVector.offset" ) ?;
135+
136+ Ok ( Some ( DeletionVectorDescriptor :: unique_id_from_parts (
137+ storage_type,
138+ path_or_inline,
139+ offset,
140+ ) ) )
141+ }
142+ None => Ok ( None ) ,
143+ }
144+ }
0 commit comments