
Commit 2453c4f

Updated existing algs, and added documentation
1 parent e0263c2 commit 2453c4f

21 files changed (+520 / -111 lines)

README.md

Lines changed: 62 additions & 1 deletion

@@ -1 +1,62 @@
-# pyarrow-algorithms
+# PyArrow Algorithms Toolkit
+
+[![CI Status](https://github.com/yourusername/pyarrow-algorithms/actions/workflows/ci.yml/badge.svg)](https://github.com/yourusername/pyarrow-algorithms/actions) [![Code Coverage](https://codecov.io/gh/yourusername/pyarrow-algorithms/branch/main/graph/badge.svg)](https://codecov.io/gh/yourusername/pyarrow-algorithms) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+
+High-performance implementations of foundational system-design algorithms, built on PyArrow and modern Python.
+
+## Features
+
+**Distributed Systems Algorithms**
+- Consistent Hashing
+- Merkle Trees for synchronization
+- Raft Consensus Protocol (TODO)
+
+**Data Structures**
+- Bloom Filters
+- HyperLogLog
+- QuadTrees
+- Leaky Bucket rate limiter
+
+**Efficient Computation**
+- Rsync Algorithm
+- Ray Casting
+- Operational Transformation
+
+## Installation
+
+```bash
+# Create and activate a virtual environment
+python -m venv venv
+source venv/bin/activate
+
+# Install with PyArrow
+pip install pyarrow==8.0.0 -r requirements.txt
+```
+
+## Usage
+
+```python
+from pyarrow_algorithms import BloomFilter
+
+bf = BloomFilter(capacity=100000, error_rate=0.01)
+bf.add("important_item")
+print("item exists:", "important_item" in bf)
+```
+
+## Testing
+
+Run the full test suite with property-based testing:
+
+```bash
+pytest tests/ --hypothesis-show-statistics --cov=src
+```
+
+## TODOs
+- [ ] Implement Raft Consensus Protocol
+- [ ] Add distributed implementations of key algorithms
+- [ ] Build out a more robust testing/simulation suite with Hypothesis and Redis
+- [ ] Add GitHub workflows for CI/CD
+- [ ] Profile memory usage and develop benchmarks against vanilla Python/NumPy implementations
+
+## Contributing
+1. Fork the repository
+2. Create a feature branch
+3. Add tests for new algorithms
+4. Submit a pull request
+
+## License
+MIT License - see [LICENSE](LICENSE) for details.
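
For readers unfamiliar with the structure behind the `BloomFilter` usage example above, here is a minimal, standalone sketch of a Bloom filter in plain Python. It is illustrative only: the `SimpleBloomFilter` name, the MD5/SHA-1 double-hashing scheme, and the sizing formulas are assumptions for this sketch, not the package's actual implementation.

```python
import hashlib
import math

class SimpleBloomFilter:
    """Minimal Bloom filter: k hash positions over a fixed bit array (illustrative sketch)."""

    def __init__(self, capacity: int, error_rate: float):
        # Standard sizing: m = -n * ln(p) / (ln 2)^2 bits, k = m/n * ln 2 hash functions
        self.num_bits = math.ceil(-capacity * math.log(error_rate) / math.log(2) ** 2)
        self.num_hashes = max(1, round(self.num_bits / capacity * math.log(2)))
        self.bits = bytearray((self.num_bits + 7) // 8)

    def _positions(self, item: str):
        # Derive k bit positions from two digests via double hashing
        h1 = int(hashlib.md5(item.encode()).hexdigest(), 16)
        h2 = int(hashlib.sha1(item.encode()).hexdigest(), 16)
        return [(h1 + i * h2) % self.num_bits for i in range(self.num_hashes)]

    def add(self, item: str):
        for pos in self._positions(item):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def __contains__(self, item: str) -> bool:
        # Membership may yield false positives, never false negatives
        return all(self.bits[pos // 8] & (1 << (pos % 8)) for pos in self._positions(item))

bf = SimpleBloomFilter(capacity=100_000, error_rate=0.01)
bf.add("important_item")
print("important_item" in bf)  # True
```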

src/algorithms/__init__.py

Whitespace-only changes.

src/algorithms/bloom_filter.py

Lines changed: 0 additions & 36 deletions
This file was deleted.

src/algorithms/consistent_hash.py

Lines changed: 24 additions & 4 deletions

@@ -3,10 +3,30 @@
 import hashlib
 
 class ConsistentHash:
-    """PyArrow-optimized consistent hashing with weighted nodes
-    Args:
-        nodes: Initial nodes with weights {node: weight}
-        replicas: Base number of virtual nodes per weight unit
+    """PyArrow-optimized consistent hashing with weighted nodes.
+
+    This class implements a consistent hashing algorithm using PyArrow for
+    efficient storage and operations. It supports weighted nodes, allowing
+    for uneven distribution of data across nodes based on their capacity
+    or other factors.
+
+    Consistent hashing ensures that when a node is added or removed,
+    only a minimal number of keys need to be remapped to different nodes,
+    minimizing data movement and disruption.
+
+    Attributes:
+        replicas (int): The base number of virtual nodes per weight unit.
+        ring (pyarrow.Table): A PyArrow Table storing the hash values and
+            corresponding nodes, representing the hash ring.
+
+    Example:
+        >>> nodes = {"node1": 1, "node2": 2}  # Node weights
+        >>> ch = ConsistentHash(nodes)
+        >>> ch.get_node("key1")  # Returns the node responsible for "key1"
+        'node2'
+        >>> ch.add_weighted_node("node3", weight=3)
+        >>> ch.remove_node("node1")
+
     """
     def __init__(self, nodes: dict, replicas: int = 100):
         self.replicas = replicas
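
The docstring above describes weighted consistent hashing with virtual nodes. As a point of reference, here is a minimal plain-Python sketch of the same idea: a sorted ring of virtual-node hashes, with heavier nodes receiving proportionally more virtual nodes. The `SimpleConsistentHash` name and the MD5 ring hash are illustrative assumptions, not the PyArrow-backed implementation in this commit.

```python
import bisect
import hashlib

def ring_hash(key: str) -> int:
    # Map a string key onto the ring (128-bit MD5 used purely for distribution)
    return int(hashlib.md5(key.encode()).hexdigest(), 16)

class SimpleConsistentHash:
    """Weighted consistent hashing over a sorted ring of virtual nodes (illustrative sketch)."""

    def __init__(self, nodes: dict[str, int], replicas: int = 100):
        points = []
        for node, weight in nodes.items():
            # Heavier nodes get proportionally more virtual nodes
            for i in range(replicas * weight):
                points.append((ring_hash(f"{node}#{i}"), node))
        points.sort()
        self._keys = [h for h, _ in points]
        self._nodes = [n for _, n in points]

    def get_node(self, key: str) -> str:
        # First virtual node clockwise from the key's hash, wrapping around the ring
        idx = bisect.bisect(self._keys, ring_hash(key)) % len(self._keys)
        return self._nodes[idx]

ch = SimpleConsistentHash({"node1": 1, "node2": 2})
print(ch.get_node("key1"))  # one of 'node1' / 'node2', stable across calls
```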

src/algorithms/geo_hash.py

Lines changed: 25 additions & 3 deletions

@@ -1,9 +1,31 @@
 import pyarrow as pa
 
 class GeoHasher:
-    """Geohash encoder/decoder with PyArrow optimizations
-    Args:
-        precision: Hash length (1-12)
+    """Geohash encoder/decoder with PyArrow optimizations.
+
+    This class provides functionality to encode geographic coordinates
+    (latitude, longitude) into geohashes and decode geohashes back into
+    coordinates. It leverages PyArrow for efficient data handling and
+    vectorized operations.
+
+    Geohashes are short alphanumeric strings that represent rectangular
+    areas on the Earth's surface. They are commonly used for spatial
+    indexing, proximity searches, and data visualization.
+
+    This implementation supports variable precision (hash length)
+    and provides methods to calculate neighboring geohashes.
+
+    Attributes:
+        precision (int): The desired length of the geohash (1-12).
+        BASE32 (pyarrow.Array): A PyArrow array containing the base32 characters.
+        BASE32_MAP (dict): A dictionary mapping base32 characters to their indices.
+
+    Example:
+        >>> geohasher = GeoHasher(precision=10)
+        >>> geohash = geohasher.encode(37.7749, -122.4194)  # Encode coordinates
+        >>> coordinates = geohasher.decode(geohash)  # Decode geohash
+        >>> neighbors = geohasher.neighbors(geohash)  # Get neighboring geohashes
+
     """
     BASE32 = pa.array(list('0123456789bcdefghjkmnpqrstuvwxyz'))
     BASE32_MAP = {c:i for i,c in enumerate(BASE32.to_pylist())}
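
For context on the encoding the `GeoHasher` docstring describes, below is a standalone sketch of standard geohash encoding: longitude and latitude bits are interleaved by repeated interval halving, and a base32 character is emitted every five bits. This is not the PyArrow implementation from this commit; the `geohash_encode` function name is illustrative.

```python
BASE32 = '0123456789bcdefghjkmnpqrstuvwxyz'

def geohash_encode(lat: float, lon: float, precision: int = 10) -> str:
    """Interleave longitude/latitude bits by interval halving; emit one base32 char per 5 bits."""
    lat_range = [-90.0, 90.0]
    lon_range = [-180.0, 180.0]
    chars = []
    bit_count, ch = 0, 0
    even = True  # even bits refine longitude, odd bits refine latitude
    while len(chars) < precision:
        rng, value = (lon_range, lon) if even else (lat_range, lat)
        mid = (rng[0] + rng[1]) / 2
        if value >= mid:
            ch = (ch << 1) | 1  # point is in the upper half of the interval
            rng[0] = mid
        else:
            ch = ch << 1        # point is in the lower half of the interval
            rng[1] = mid
        even = not even
        bit_count += 1
        if bit_count == 5:
            chars.append(BASE32[ch])
            bit_count, ch = 0, 0
    return ''.join(chars)

print(geohash_encode(37.7749, -122.4194))  # begins '9q8...' (San Francisco area)
```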

src/algorithms/hyper_log_log.py

Lines changed: 0 additions & 25 deletions
This file was deleted.

src/algorithms/lossy_counter.py

Lines changed: 35 additions & 4 deletions

@@ -3,10 +3,41 @@
 import pyarrow.compute as pc
 
 class LossyCounter:
-    """Stream frequency estimator with PyArrow optimizations
-    Args:
-        epsilon: Maximum error threshold (0 < ε < 1)
-        delta: Confidence parameter (0 < δ < 1)
+    """Stream frequency estimator with PyArrow optimizations.
+
+    This class implements a Lossy Counting algorithm, a probabilistic data
+    structure used for estimating the frequencies of items in a data stream.
+    It leverages PyArrow for efficient data storage and computation.
+
+    Lossy Counting provides a memory-efficient way to approximate the
+    frequencies of frequent items in large datasets, sacrificing some
+    accuracy for reduced memory usage. It is particularly useful for
+    applications where the exact frequency of every item is not crucial,
+    but identifying the most frequent items is important.
+
+    The algorithm maintains a set of buckets, each containing a subset of
+    items and their estimated counts. Items with lower frequencies are
+    more likely to be discarded, leading to the "lossy" nature of the
+    algorithm. The error in frequency estimates is bounded by the
+    `epsilon` parameter.
+
+    Attributes:
+        epsilon (float): The maximum error threshold (0 < ε < 1).
+        delta (float): The confidence parameter (0 < δ < 1).
+        bucket_width (int): The width of each bucket, calculated based on epsilon.
+        current_count (int): The total number of items processed so far.
+        bucket (pyarrow.StructArray): A PyArrow StructArray storing the items
+            and their estimated counts.
+        error (pyarrow.Array): A PyArrow array storing the error values for
+            each bucket.
+
+    Example:
+        >>> counter = LossyCounter(epsilon=0.01)
+        >>> counter.add("item1")
+        >>> counter.add("item2")
+        >>> counter.add("item1")
+        >>> frequent_items = counter.get_most_frequent(min_support=0.05)
+
     """
     def __init__(self, epsilon: float, delta: float = 0.01):
         self.epsilon = epsilon
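
The docstring above summarizes the Lossy Counting scheme. The following is a small dictionary-based sketch of the classic algorithm by Manku and Motwani: bucket width of ceil(1/epsilon), pruning at bucket boundaries, and frequency error bounded by epsilon times the stream length. Names such as `SimpleLossyCounter` and `most_frequent` are illustrative; this is not the PyArrow-backed class committed here.

```python
import math

class SimpleLossyCounter:
    """Lossy counting sketch: estimated counts are within epsilon * N of the true counts."""

    def __init__(self, epsilon: float):
        self.epsilon = epsilon
        self.bucket_width = math.ceil(1 / epsilon)
        self.n = 0                  # total items seen so far
        self.current_bucket = 1     # id of the bucket currently being filled
        self.entries = {}           # item -> (count, max_error)

    def add(self, item):
        self.n += 1
        count, error = self.entries.get(item, (0, self.current_bucket - 1))
        self.entries[item] = (count + 1, error)
        if self.n % self.bucket_width == 0:
            # End of a bucket: prune entries whose count + error cannot exceed the bucket id
            self.entries = {k: (c, e) for k, (c, e) in self.entries.items()
                            if c + e > self.current_bucket}
            self.current_bucket += 1

    def most_frequent(self, min_support: float):
        # Return items whose true frequency may exceed min_support * N
        threshold = (min_support - self.epsilon) * self.n
        return [k for k, (c, _) in self.entries.items() if c >= threshold]

counter = SimpleLossyCounter(epsilon=0.01)
for item in ["item1", "item2", "item1"]:
    counter.add(item)
print(counter.most_frequent(min_support=0.05))
```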

src/algorithms/merkle_tree.py

Lines changed: 56 additions & 14 deletions

@@ -2,6 +2,55 @@
 import pyarrow as pa
 
 class MerkleTree:
+    """
+    Data structure for efficient and secure data integrity verification.
+
+    A Merkle tree is a tree-like structure where each leaf node represents
+    a data block, and each non-leaf node represents the cryptographic hash
+    of its child nodes. It allows for efficient verification of data
+    integrity by comparing the root hash with a known value.
+
+    This implementation uses PyArrow for efficient data handling and
+    provides methods to generate and verify Merkle inclusion proofs.
+
+    Attributes:
+        leaves (list): A list of data blocks (bytes) representing the leaf nodes
+            of the Merkle tree.
+
+    Example:
+        >>> data_blocks = [b'data1', b'data2', b'data3', b'data4']
+        >>> tree = MerkleTree(data_blocks)  # Tree is built during initialization
+        >>> proof = tree.get_proof(2)  # Generate proof for the third data block
+        >>> is_valid = tree.verify(b'data3', proof)  # Verify the proof
+        # (root hash is accessed internally)
+
+    """
+    def __init__(self, leaves: list[bytes]):
+        """Initialize the MerkleTree with a list of data blocks."""
+        self.leaves = leaves
+        self.root = self._build_tree()
+
+    def _build_tree(self) -> bytes:
+        """Build the Merkle tree and return the root hash."""
+        current_level = pa.array(self.leaves)
+        while len(current_level) > 1:
+            # Handle odd number of nodes by duplicating the last node
+            if len(current_level) % 2 != 0:
+                current_level = current_level.append(current_level[-1])
+
+            # Hash pairs of nodes to create the next level
+            current_level = pa.array([
+                self._hash(current_level[i] + current_level[i + 1])
+                for i in range(0, len(current_level), 2)
+            ], type=pa.binary())  # Specify type as binary for byte strings
+
+        # The final remaining node is the root hash
+        return current_level[0].as_py()  # Convert to Python bytes
+
+    def _hash(self, data: bytes) -> bytes:
+        """Calculate the SHA-256 hash of the given data."""
+        return hashlib.sha256(data).digest()
+
     def get_proof(self, index: int) -> pa.ListArray:
         """
         Generate a Merkle inclusion proof for a leaf node

@@ -31,7 +80,7 @@ def get_proof(self, index: int) -> pa.ListArray:
             # Store proof element with positional flag
             proof.append(pa.struct([
                 ('is_left', pa.scalar(not is_left)),
-                ('hash', sibling_hash)
+                ('hash', self._hash(sibling_hash))
             ]))
 
             # Move up to parent level

@@ -42,19 +91,12 @@ def get_proof(self, index: int) -> pa.ListArray:
         ], type=pa.binary())
         return pa.ListArray.from_arrays(proof)
 
-    @staticmethod
-    def verify(root: bytes, leaf: bytes, proof: pa.ListArray) -> bool:
-        """
-        Verify Merkle proof against known root hash
-        Args:
-            root: Known root hash as bytes
-            leaf: Leaf node content as bytes
-            proof: Proof generated by get_proof()
-        """
-        current = hashlib.sha256(leaf).digest()
+    def verify(self, leaf: bytes, proof: pa.ListArray) -> bool:
+        """Verify a Merkle proof against the known root hash."""
+        current_hash = self._hash(leaf)  # Using the private _hash method
         for node in proof:
             if node['is_left'].as_py():
-                current = hashlib.sha256(node['hash'].as_py() + current).digest()
+                current_hash = self._hash(node['hash'].as_py() + current_hash)  # Using _hash
             else:
-                current = hashlib.sha256(current + node['hash'].as_py()).digest()
-        return current == root
+                current_hash = self._hash(current_hash + node['hash'].as_py())  # Using _hash
+        return current_hash == self.root  # Comparing with self.root
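
To make the proof convention concrete (a left-hand sibling hashes as H(sibling + current), otherwise H(current + sibling), matching the verify logic above), here is a minimal standalone sketch using only hashlib. The function names and the convention of hashing leaves before pairing are assumptions for this sketch, not the committed MerkleTree class.

```python
import hashlib

def sha256(data: bytes) -> bytes:
    return hashlib.sha256(data).digest()

def merkle_root(leaves: list[bytes]) -> bytes:
    """Hash leaves, then repeatedly hash adjacent pairs until one root remains."""
    level = [sha256(leaf) for leaf in leaves]
    while len(level) > 1:
        if len(level) % 2:                  # duplicate the last node when the level is odd
            level.append(level[-1])
        level = [sha256(level[i] + level[i + 1]) for i in range(0, len(level), 2)]
    return level[0]

def verify_proof(leaf: bytes, proof: list[tuple[bool, bytes]], root: bytes) -> bool:
    """proof is a list of (sibling_is_left, sibling_hash) pairs, ordered leaf level first."""
    current = sha256(leaf)
    for is_left, sibling in proof:
        current = sha256(sibling + current) if is_left else sha256(current + sibling)
    return current == root

leaves = [b'data1', b'data2', b'data3', b'data4']
root = merkle_root(leaves)
# Proof for leaves[2]: its right-hand sibling hash, then the hash of the left subtree
proof = [(False, sha256(b'data4')), (True, sha256(sha256(b'data1') + sha256(b'data2')))]
assert verify_proof(b'data3', proof, root)
```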

0 commit comments
