
Commit 2453c4f

Updated existing algs, and added documentation
1 parent e0263c2 commit 2453c4f

21 files changed (+520 / -111 lines)

README.md

Lines changed: 62 additions & 1 deletion

@@ -1 +1,62 @@
-# pyarrow-algorithms
+# PyArrow Algorithms Toolkit
+
+[![CI Status](https://github.com/yourusername/pyarrow-algorithms/actions/workflows/ci.yml/badge.svg)](https://github.com/yourusername/pyarrow-algorithms/actions) [![Code Coverage](https://codecov.io/gh/yourusername/pyarrow-algorithms/branch/main/graph/badge.svg)](https://codecov.io/gh/yourusername/pyarrow-algorithms) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+
+High-performance implementations of foundational system-design algorithms, built on PyArrow and modern Python.
+
+## Features
+
+**Distributed Systems Algorithms**
+- Consistent Hashing
+- Merkle Trees for synchronization
+- Raft Consensus Protocol (TODO)
+
+**Data Structures**
+- Bloom Filters
+- HyperLogLog
+- QuadTrees
+- Leaky Bucket rate limiter
+
+**Efficient Computation**
+- Rsync Algorithm
+- Ray Casting
+- Operational Transformation
+
+## Installation
+
+```bash
+# Create and activate a virtual environment
+python -m venv venv
+source venv/bin/activate
+
+# Install with PyArrow
+pip install pyarrow==8.0.0 -r requirements.txt
+```
+
+## Usage
+
+```python
+from pyarrow_algorithms import BloomFilter
+
+bf = BloomFilter(capacity=100000, error_rate=0.01)
+bf.add("important_item")
+print("item exists:", "important_item" in bf)
+```
+
+## Testing
+
+Run the full test suite with property-based testing:
+
+```bash
+pytest tests/ --hypothesis-show-statistics --cov=src
+```
+
+## TODOs
+- [ ] Implement Raft Consensus Protocol
+- [ ] Add distributed implementations of key algorithms
+- [ ] Build out a more robust testing/simulation suite with Hypothesis and Redis
+- [ ] Add GitHub workflows for CI/CD
+- [ ] Profile memory usage and develop benchmarks against vanilla Python/NumPy implementations
+
+## Contributing
+1. Fork the repository
+2. Create a feature branch
+3. Add tests for new algorithms
+4. Submit a pull request
+
+## License
+MIT License - see [LICENSE](LICENSE) for details.
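
For readers unfamiliar with the structure behind the `BloomFilter` usage example above, here is a minimal, standalone sketch of a Bloom filter in plain Python. It is illustrative only: the `SimpleBloomFilter` name, the MD5/SHA-1 double-hashing scheme, and the sizing formulas are assumptions for this sketch, not the package's actual implementation.

```python
import hashlib
import math

class SimpleBloomFilter:
    """Minimal Bloom filter: k hash positions over a fixed bit array (illustrative sketch)."""

    def __init__(self, capacity: int, error_rate: float):
        # Standard sizing: m = -n * ln(p) / (ln 2)^2 bits, k = m/n * ln 2 hash functions
        self.num_bits = math.ceil(-capacity * math.log(error_rate) / math.log(2) ** 2)
        self.num_hashes = max(1, round(self.num_bits / capacity * math.log(2)))
        self.bits = bytearray((self.num_bits + 7) // 8)

    def _positions(self, item: str):
        # Derive k bit positions from two digests via double hashing
        h1 = int(hashlib.md5(item.encode()).hexdigest(), 16)
        h2 = int(hashlib.sha1(item.encode()).hexdigest(), 16)
        return [(h1 + i * h2) % self.num_bits for i in range(self.num_hashes)]

    def add(self, item: str):
        for pos in self._positions(item):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def __contains__(self, item: str) -> bool:
        # Membership may yield false positives, never false negatives
        return all(self.bits[pos // 8] & (1 << (pos % 8)) for pos in self._positions(item))

bf = SimpleBloomFilter(capacity=100_000, error_rate=0.01)
bf.add("important_item")
print("important_item" in bf)  # True
```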

src/algorithms/__init__.py

Whitespace-only changes.

src/algorithms/bloom_filter.py

Lines changed: 0 additions & 36 deletions
This file was deleted.

src/algorithms/consistent_hash.py

Lines changed: 24 additions & 4 deletions

@@ -3,10 +3,30 @@
 import hashlib
 
 class ConsistentHash:
-    """PyArrow-optimized consistent hashing with weighted nodes
-    Args:
-        nodes: Initial nodes with weights {node: weight}
-        replicas: Base number of virtual nodes per weight unit
+    """PyArrow-optimized consistent hashing with weighted nodes.
+
+    This class implements a consistent hashing algorithm using PyArrow for
+    efficient storage and operations. It supports weighted nodes, allowing
+    for uneven distribution of data across nodes based on their capacity
+    or other factors.
+
+    Consistent hashing ensures that when a node is added or removed,
+    only a minimal number of keys need to be remapped to different nodes,
+    minimizing data movement and disruption.
+
+    Attributes:
+        replicas (int): The base number of virtual nodes per weight unit.
+        ring (pyarrow.Table): A PyArrow Table storing the hash values and
+            corresponding nodes, representing the hash ring.
+
+    Example:
+        >>> nodes = {"node1": 1, "node2": 2}  # Node weights
+        >>> ch = ConsistentHash(nodes)
+        >>> ch.get_node("key1")  # Returns the node responsible for "key1"
+        'node2'
+        >>> ch.add_weighted_node("node3", weight=3)
+        >>> ch.remove_node("node1")
+
     """
     def __init__(self, nodes: dict, replicas: int = 100):
         self.replicas = replicas
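
The docstring above describes weighted consistent hashing with virtual nodes. As a point of reference, here is a minimal plain-Python sketch of the same idea: a sorted ring of virtual-node hashes, with heavier nodes receiving proportionally more virtual nodes. The `SimpleConsistentHash` name and the MD5 ring hash are illustrative assumptions, not the PyArrow-backed implementation in this commit.

```python
import bisect
import hashlib

def ring_hash(key: str) -> int:
    # Map a string key onto the ring (128-bit MD5 used purely for distribution)
    return int(hashlib.md5(key.encode()).hexdigest(), 16)

class SimpleConsistentHash:
    """Weighted consistent hashing over a sorted ring of virtual nodes (illustrative sketch)."""

    def __init__(self, nodes: dict[str, int], replicas: int = 100):
        points = []
        for node, weight in nodes.items():
            # Heavier nodes get proportionally more virtual nodes
            for i in range(replicas * weight):
                points.append((ring_hash(f"{node}#{i}"), node))
        points.sort()
        self._keys = [h for h, _ in points]
        self._nodes = [n for _, n in points]

    def get_node(self, key: str) -> str:
        # First virtual node clockwise from the key's hash, wrapping around the ring
        idx = bisect.bisect(self._keys, ring_hash(key)) % len(self._keys)
        return self._nodes[idx]

ch = SimpleConsistentHash({"node1": 1, "node2": 2})
print(ch.get_node("key1"))  # one of 'node1' / 'node2', stable across calls
```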

src/algorithms/geo_hash.py

Lines changed: 25 additions & 3 deletions

@@ -1,9 +1,31 @@
 import pyarrow as pa
 
 class GeoHasher:
-    """Geohash encoder/decoder with PyArrow optimizations
-    Args:
-        precision: Hash length (1-12)
+    """Geohash encoder/decoder with PyArrow optimizations.
+
+    This class provides functionality to encode geographic coordinates
+    (latitude, longitude) into geohashes and decode geohashes back into
+    coordinates. It leverages PyArrow for efficient data handling and
+    vectorized operations.
+
+    Geohashes are short alphanumeric strings that represent rectangular
+    areas on the Earth's surface. They are commonly used for spatial
+    indexing, proximity searches, and data visualization.
+
+    This implementation supports variable precision (hash length)
+    and provides methods to calculate neighboring geohashes.
+
+    Attributes:
+        precision (int): The desired length of the geohash (1-12).
+        BASE32 (pyarrow.Array): A PyArrow array containing the base32 characters.
+        BASE32_MAP (dict): A dictionary mapping base32 characters to their indices.
+
+    Example:
+        >>> geohasher = GeoHasher(precision=10)
+        >>> geohash = geohasher.encode(37.7749, -122.4194)  # Encode coordinates
+        >>> coordinates = geohasher.decode(geohash)  # Decode geohash
+        >>> neighbors = geohasher.neighbors(geohash)  # Get neighboring geohashes
+
     """
     BASE32 = pa.array(list('0123456789bcdefghjkmnpqrstuvwxyz'))
     BASE32_MAP = {c:i for i,c in enumerate(BASE32.to_pylist())}
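
For context on the encoding the `GeoHasher` docstring describes, below is a standalone sketch of standard geohash encoding: longitude and latitude bits are interleaved by repeated interval halving, and a base32 character is emitted every five bits. This is not the PyArrow implementation from this commit; the `geohash_encode` function name is illustrative.

```python
BASE32 = '0123456789bcdefghjkmnpqrstuvwxyz'

def geohash_encode(lat: float, lon: float, precision: int = 10) -> str:
    """Interleave longitude/latitude bits by interval halving; emit one base32 char per 5 bits."""
    lat_range = [-90.0, 90.0]
    lon_range = [-180.0, 180.0]
    chars = []
    bit_count, ch = 0, 0
    even = True  # even bits refine longitude, odd bits refine latitude
    while len(chars) < precision:
        rng, value = (lon_range, lon) if even else (lat_range, lat)
        mid = (rng[0] + rng[1]) / 2
        if value >= mid:
            ch = (ch << 1) | 1  # point is in the upper half of the interval
            rng[0] = mid
        else:
            ch = ch << 1        # point is in the lower half of the interval
            rng[1] = mid
        even = not even
        bit_count += 1
        if bit_count == 5:
            chars.append(BASE32[ch])
            bit_count, ch = 0, 0
    return ''.join(chars)

print(geohash_encode(37.7749, -122.4194))  # begins '9q8...' (San Francisco area)
```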

src/algorithms/hyper_log_log.py

Lines changed: 0 additions & 25 deletions
This file was deleted.

src/algorithms/lossy_counter.py

Lines changed: 35 additions & 4 deletions

@@ -3,10 +3,41 @@
 import pyarrow.compute as pc
 
 class LossyCounter:
-    """Stream frequency estimator with PyArrow optimizations
-    Args:
-        epsilon: Maximum error threshold (0 < ε < 1)
-        delta: Confidence parameter (0 < δ < 1)
+    """Stream frequency estimator with PyArrow optimizations.
+
+    This class implements a Lossy Counting algorithm, a probabilistic data
+    structure used for estimating the frequencies of items in a data stream.
+    It leverages PyArrow for efficient data storage and computation.
+
+    Lossy Counting provides a memory-efficient way to approximate the
+    frequencies of frequent items in large datasets, sacrificing some
+    accuracy for reduced memory usage. It is particularly useful for
+    applications where the exact frequency of every item is not crucial,
+    but identifying the most frequent items is important.
+
+    The algorithm maintains a set of buckets, each containing a subset of
+    items and their estimated counts. Items with lower frequencies are
+    more likely to be discarded, leading to the "lossy" nature of the
+    algorithm. The error in frequency estimates is bounded by the
+    `epsilon` parameter.
+
+    Attributes:
+        epsilon (float): The maximum error threshold (0 < ε < 1).
+        delta (float): The confidence parameter (0 < δ < 1).
+        bucket_width (int): The width of each bucket, calculated based on epsilon.
+        current_count (int): The total number of items processed so far.
+        bucket (pyarrow.StructArray): A PyArrow StructArray storing the items
+            and their estimated counts.
+        error (pyarrow.Array): A PyArrow array storing the error values for
+            each bucket.
+
+    Example:
+        >>> counter = LossyCounter(epsilon=0.01)
+        >>> counter.add("item1")
+        >>> counter.add("item2")
+        >>> counter.add("item1")
+        >>> frequent_items = counter.get_most_frequent(min_support=0.05)
+
     """
     def __init__(self, epsilon: float, delta: float = 0.01):
         self.epsilon = epsilon
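
The docstring above summarizes the Lossy Counting scheme. The following is a small dictionary-based sketch of the classic algorithm by Manku and Motwani: bucket width of ceil(1/epsilon), pruning at bucket boundaries, and frequency error bounded by epsilon times the stream length. Names such as `SimpleLossyCounter` and `most_frequent` are illustrative; this is not the PyArrow-backed class committed here.

```python
import math

class SimpleLossyCounter:
    """Lossy counting sketch: estimated counts are within epsilon * N of the true counts."""

    def __init__(self, epsilon: float):
        self.epsilon = epsilon
        self.bucket_width = math.ceil(1 / epsilon)
        self.n = 0                  # total items seen so far
        self.current_bucket = 1     # id of the bucket currently being filled
        self.entries = {}           # item -> (count, max_error)

    def add(self, item):
        self.n += 1
        count, error = self.entries.get(item, (0, self.current_bucket - 1))
        self.entries[item] = (count + 1, error)
        if self.n % self.bucket_width == 0:
            # End of a bucket: prune entries whose count + error cannot exceed the bucket id
            self.entries = {k: (c, e) for k, (c, e) in self.entries.items()
                            if c + e > self.current_bucket}
            self.current_bucket += 1

    def most_frequent(self, min_support: float):
        # Return items whose true frequency may exceed min_support * N
        threshold = (min_support - self.epsilon) * self.n
        return [k for k, (c, _) in self.entries.items() if c >= threshold]

counter = SimpleLossyCounter(epsilon=0.01)
for item in ["item1", "item2", "item1"]:
    counter.add(item)
print(counter.most_frequent(min_support=0.05))
```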

src/algorithms/merkle_tree.py

Lines changed: 56 additions & 14 deletions

@@ -2,6 +2,55 @@
 import pyarrow as pa
 
 class MerkleTree:
+    """
+    Data structure for efficient and secure data integrity verification.
+
+    A Merkle tree is a tree-like structure where each leaf node represents
+    a data block, and each non-leaf node represents the cryptographic hash
+    of its child nodes. It allows for efficient verification of data
+    integrity by comparing the root hash with a known value.
+
+    This implementation uses PyArrow for efficient data handling and
+    provides methods to generate and verify Merkle inclusion proofs.
+
+    Attributes:
+        leaves (list): A list of data blocks (bytes) representing the leaf nodes
+            of the Merkle tree.
+
+    Example:
+        >>> data_blocks = [b'data1', b'data2', b'data3', b'data4']
+        >>> tree = MerkleTree(data_blocks)  # Tree is built during initialization
+        >>> proof = tree.get_proof(2)  # Generate proof for the third data block
+        >>> is_valid = tree.verify(b'data3', proof)  # Verify the proof
+        # (root hash is accessed internally)
+
+    """
+    def __init__(self, leaves: list[bytes]):
+        """Initialize the MerkleTree with a list of data blocks."""
+        self.leaves = leaves
+        self.root = self._build_tree()
+
+    def _build_tree(self) -> bytes:
+        """Build the Merkle tree and return the root hash."""
+        current_level = pa.array(self.leaves)
+        while len(current_level) > 1:
+            # Handle odd number of nodes by duplicating the last node
+            if len(current_level) % 2 != 0:
+                current_level = current_level.append(current_level[-1])
+
+            # Hash pairs of nodes to create the next level
+            current_level = pa.array([
+                self._hash(current_level[i] + current_level[i + 1])
+                for i in range(0, len(current_level), 2)
+            ], type=pa.binary())  # Specify type as binary for byte strings
+
+        # The final remaining node is the root hash
+        return current_level[0].as_py()  # Convert to Python bytes
+
+    def _hash(self, data: bytes) -> bytes:
+        """Calculate the SHA-256 hash of the given data."""
+        return hashlib.sha256(data).digest()
+
     def get_proof(self, index: int) -> pa.ListArray:
         """
         Generate a Merkle inclusion proof for a leaf node

@@ -31,7 +80,7 @@ def get_proof(self, index: int) -> pa.ListArray:
             # Store proof element with positional flag
             proof.append(pa.struct([
                 ('is_left', pa.scalar(not is_left)),
-                ('hash', sibling_hash)
+                ('hash', self._hash(sibling_hash))
             ]))
 
             # Move up to parent level

@@ -42,19 +91,12 @@ def get_proof(self, index: int) -> pa.ListArray:
         ], type=pa.binary())
         return pa.ListArray.from_arrays(proof)
 
-    @staticmethod
-    def verify(root: bytes, leaf: bytes, proof: pa.ListArray) -> bool:
-        """
-        Verify Merkle proof against known root hash
-        Args:
-            root: Known root hash as bytes
-            leaf: Leaf node content as bytes
-            proof: Proof generated by get_proof()
-        """
-        current = hashlib.sha256(leaf).digest()
+    def verify(self, leaf: bytes, proof: pa.ListArray) -> bool:
+        """Verify a Merkle proof against the known root hash."""
+        current_hash = self._hash(leaf)  # Using the private _hash method
         for node in proof:
             if node['is_left'].as_py():
-                current = hashlib.sha256(node['hash'].as_py() + current).digest()
+                current_hash = self._hash(node['hash'].as_py() + current_hash)  # Using _hash
             else:
-                current = hashlib.sha256(current + node['hash'].as_py()).digest()
-        return current == root
+                current_hash = self._hash(current_hash + node['hash'].as_py())  # Using _hash
+        return current_hash == self.root  # Comparing with self.root
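
To make the proof convention concrete (a left-hand sibling hashes as H(sibling + current), otherwise H(current + sibling), matching the verify logic above), here is a minimal standalone sketch using only hashlib. The function names and the convention of hashing leaves before pairing are assumptions for this sketch, not the committed MerkleTree class.

```python
import hashlib

def sha256(data: bytes) -> bytes:
    return hashlib.sha256(data).digest()

def merkle_root(leaves: list[bytes]) -> bytes:
    """Hash leaves, then repeatedly hash adjacent pairs until one root remains."""
    level = [sha256(leaf) for leaf in leaves]
    while len(level) > 1:
        if len(level) % 2:                  # duplicate the last node when the level is odd
            level.append(level[-1])
        level = [sha256(level[i] + level[i + 1]) for i in range(0, len(level), 2)]
    return level[0]

def verify_proof(leaf: bytes, proof: list[tuple[bool, bytes]], root: bytes) -> bool:
    """proof is a list of (sibling_is_left, sibling_hash) pairs, ordered leaf level first."""
    current = sha256(leaf)
    for is_left, sibling in proof:
        current = sha256(sibling + current) if is_left else sha256(current + sibling)
    return current == root

leaves = [b'data1', b'data2', b'data3', b'data4']
root = merkle_root(leaves)
# Proof for leaves[2]: its right-hand sibling hash, then the hash of the left subtree
proof = [(False, sha256(b'data4')), (True, sha256(sha256(b'data1') + sha256(b'data2')))]
assert verify_proof(b'data3', proof, root)
```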

0 commit comments
