Skip to content

Commit be9b59b

Browse files
committed
fix: use murmur3 instead of fasthash
1 parent 67a5074 commit be9b59b

File tree

5 files changed

+31
-24
lines changed

5 files changed

+31
-24
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ itertools = "0.14.0"
3838
lazy_static = "1.5.0"
3939
lru = "0.16.0"
4040
object_store = { version = "0.12", features = ["aws", "gcp"] }
41-
fasthash = { version = "0.4" }
41+
murmur3 = { version = "0.5.2" }
4242
parquet = { version = "56", features = ["async", "object_store"] }
4343
pin-project-lite = "0.2"
4444
regex = "1.11.1"

iceberg-rust-spec/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ derive-getters = { workspace = true }
1616
derive_builder = { workspace = true }
1717
getrandom = { workspace = true }
1818
itertools = { workspace = true }
19-
murmur3 = "0.5.2"
19+
murmur3 = { workspace = true }
2020
ordered-float = { version = "5.0.0", features = ["serde"] }
2121
rust_decimal = "1.36.0"
2222
serde = { workspace = true }

iceberg-rust/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ iceberg-rust-spec = { path = "../iceberg-rust-spec", version = "0.8.0" }
2323
itertools = { workspace = true }
2424
lazy_static = { workspace = true }
2525
lru = { workspace = true }
26+
murmur3 = { workspace = true }
2627
object_store = { workspace = true }
27-
fasthash = { workspace = true }
2828
parquet = { workspace = true }
2929
pin-project-lite = { workspace = true }
3030
regex = { workspace = true }

iceberg-rust/src/arrow/transform.rs

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -125,13 +125,17 @@ pub fn transform_arrow(array: ArrayRef, transform: &Transform) -> Result<ArrayRe
125125
)),
126126
(DataType::Int32, Transform::Bucket(m)) => Ok(Arc::<PrimitiveArray<Int32Type>>::new(
127127
unary(as_primitive_array::<Int32Type>(&array), |i| {
128-
(fasthash::murmur3::hash32_with_seed((i as i64).to_le_bytes(), 0) as i32)
128+
let mut buffer = std::io::Cursor::new((i as i64).to_le_bytes());
129+
(murmur3::murmur3_32(&mut buffer, 0).expect("murmur3 hash failled for some reason")
130+
as i32)
129131
.rem_euclid(*m as i32)
130132
}),
131133
)),
132134
(DataType::Int64, Transform::Bucket(m)) => Ok(Arc::<PrimitiveArray<Int32Type>>::new(
133135
unary(as_primitive_array::<Int64Type>(&array), |i| {
134-
(fasthash::murmur3::hash32_with_seed(i.to_le_bytes(), 0) as i32)
136+
let mut buffer = std::io::Cursor::new((i).to_le_bytes());
137+
(murmur3::murmur3_32(&mut buffer, 0).expect("murmur3 hash failled for some reason")
138+
as i32)
135139
.rem_euclid(*m as i32)
136140
}),
137141
)),
@@ -141,7 +145,9 @@ pub fn transform_arrow(array: ArrayRef, transform: &Transform) -> Result<ArrayRe
141145
Ok(Arc::<PrimitiveArray<Int32Type>>::new(unary(
142146
as_primitive_array::<Int32Type>(&temp),
143147
|i| {
144-
(fasthash::murmur3::hash32_with_seed(i.to_le_bytes(), 0) as i32)
148+
let mut buffer = std::io::Cursor::new((i as i64).to_le_bytes());
149+
(murmur3::murmur3_32(&mut buffer, 0)
150+
.expect("murmur3 hash failled for some reason") as i32)
145151
.rem_euclid(*m as i32)
146152
},
147153
)))
@@ -152,7 +158,9 @@ pub fn transform_arrow(array: ArrayRef, transform: &Transform) -> Result<ArrayRe
152158
Ok(Arc::<PrimitiveArray<Int32Type>>::new(unary(
153159
as_primitive_array::<Int32Type>(&temp),
154160
|i: i32| {
155-
(fasthash::murmur3::hash32_with_seed((i as i64).to_le_bytes(), 0) as i32)
161+
let mut buffer = std::io::Cursor::new((i as i64).to_le_bytes());
162+
(murmur3::murmur3_32(&mut buffer, 0)
163+
.expect("murmur3 hash failled for some reason") as i32)
156164
.rem_euclid(*m as i32)
157165
},
158166
)))
@@ -164,7 +172,9 @@ pub fn transform_arrow(array: ArrayRef, transform: &Transform) -> Result<ArrayRe
164172
Ok(Arc::new(PrimitiveArray::<Int32Type>::new(
165173
ScalarBuffer::from_iter(local_array.iter().map(|a| {
166174
if let Some(value) = a {
167-
fasthash::murmur3::hash32_with_seed(value.as_bytes(), 0) as i32
175+
murmur3::murmur3_32(&mut value.as_bytes(), 0)
176+
.expect("murmur3 hash failled for some reason")
177+
as i32
168178
} else {
169179
0
170180
}
@@ -386,40 +396,36 @@ mod tests {
386396
// Check that the hash values match the reference values in https://iceberg.apache.org/spec/#appendix-b-32-bit-hash-requirements
387397

388398
// 34 -> 2017239379
389-
assert_eq!(
390-
fasthash::murmur3::hash32_with_seed((34i32 as i64).to_le_bytes(), 0),
391-
2017239379
392-
);
399+
let mut buffer = std::io::Cursor::new((34i32 as i64).to_le_bytes());
400+
assert_eq!(murmur3::murmur3_32(&mut buffer, 0).unwrap(), 2017239379);
401+
393402
// 34 -> 2017239379
394-
assert_eq!(
395-
fasthash::murmur3::hash32_with_seed((34i64).to_le_bytes(), 0),
396-
2017239379
397-
);
403+
let mut buffer = std::io::Cursor::new((34i64).to_le_bytes());
404+
assert_eq!(murmur3::murmur3_32(&mut buffer, 0).unwrap(), 2017239379);
405+
398406
// daysFromUnixEpoch(2017-11-16) -> 17_486 -> -653330422
407+
let mut buffer = std::io::Cursor::new((17_486i32 as i64).to_le_bytes());
399408
assert_eq!(
400-
fasthash::murmur3::hash32_with_seed((17_486i32 as i64).to_le_bytes(), 0) as i32,
409+
murmur3::murmur3_32(&mut buffer, 0).unwrap() as i32,
401410
-653330422
402411
);
412+
403413
// microsFromMidnight(22:31:08) -> 81_068_000_000 -> -662762989
414+
let mut buffer = std::io::Cursor::new((81_068_000_000i64).to_le_bytes());
404415
assert_eq!(
405-
fasthash::murmur3::hash32_with_seed((81_068_000_000i64).to_le_bytes(), 0) as i32,
416+
murmur3::murmur3_32(&mut buffer, 0).unwrap() as i32,
406417
-662762989
407418
);
408419

409420
// utf8Bytes(iceberg) -> 1210000089
410421
assert_eq!(
411-
fasthash::murmur3::hash32_with_seed("iceberg".as_bytes(), 0) as i32,
422+
murmur3::murmur3_32(&mut "iceberg".as_bytes(), 0).unwrap() as i32,
412423
1210000089
413424
);
414425
}
415426

416427
#[test]
417428
fn test_int32_bucket_transform() {
418-
assert_eq!(
419-
fasthash::murmur3::hash32_with_seed(17_486i64.to_le_bytes(), 0) as i32,
420-
-653_330_422
421-
);
422-
423429
let array = Arc::new(arrow::array::Int32Array::from(vec![
424430
Some(34), // Spec value
425431
Some(17_486), // number of days between the Unix epoch and 2017-11-16

0 commit comments

Comments
 (0)