
Commit e0263c2

committed
first draft alg impl with tests
1 parent f3ace19 commit e0263c2

28 files changed, +1599 −0 lines changed

.devcontainer/Dockerfile

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
FROM mcr.microsoft.com/devcontainers/python:1-3.11-bullseye

# Install system dependencies
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        redis-server \
        libsnappy-dev \
        libzstd-dev \
    && apt-get clean -y && \
    rm -rf /var/lib/apt/lists/*

# Copy requirements first for better cache utilization
COPY requirements.txt /tmp/pip-tmp/
RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \
    && rm -rf /tmp/pip-tmp

# Configure Redis for testing
RUN mkdir -p /var/lib/redis && \
    chown -R redis:redis /var/lib/redis && \
    sed -i 's/bind 127.0.0.1/bind 0.0.0.0/' /etc/redis/redis.conf

.devcontainer/devcontainer.json

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
{
    "name": "algokit-dev",
    "dockerComposeFile": [
        "./docker-compose.yml"
    ],
    "service": "code",
    "workspaceFolder": "/workspace",
    "customizations": {
        "vscode": {
            "extensions": [
                "ms-python.python",
                "ms-python.vscode-pylance",
                "ms-azuretools.vscode-docker",
                "ms-azuretools.vscode-docker-compose"
            ],
            "settings": {
                "python.defaultInterpreterPath": "/usr/local/bin/python",
                "python.linting.enabled": true,
                "python.linting.pylintEnabled": true,
                "python.testing.pytestEnabled": true,
                "python.testing.pytestArgs": ["tests/"]
            }
        }
    },
    "postCreateCommand": "sudo service redis-server start && pytest tests/",
    "forwardPorts": [6379], // optional: exposes Redis port
    "remoteUser": "vscode"
}

.devcontainer/docker-compose.yml

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
version: '3.8'

services:
  code:
    build:
      context: ..
      dockerfile: .devcontainer/Dockerfile
    volumes:
      - ..:/workspace:cached
    command: sleep infinity
    environment:
      - REDIS_HOST=redis
      - PYTHONPATH=/workspace/src
    depends_on:
      redis:
        condition: service_healthy

  redis:
    image: redis:6-alpine
    restart: unless-stopped
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 1s
      timeout: 3s
      retries: 30

volumes:
  redis_data:
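The compose file injects REDIS_HOST into the dev container, so application code reaches Redis by service name rather than a hard-coded address. A minimal connectivity check (a sketch, assuming the redis-py client from requirements.txt and the service names defined above):

import os

import redis

# docker-compose sets REDIS_HOST=redis; fall back to localhost for runs outside the container
client = redis.Redis(host=os.environ.get("REDIS_HOST", "localhost"), port=6379)
assert client.ping()  # same PING command the compose healthcheck uses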

.github/dependabot.yml

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for more information:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
# https://containers.dev/guide/dependabot

version: 2
updates:
  - package-ecosystem: "devcontainers"
    directory: "/"
    schedule:
      interval: weekly

requirements.txt

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
pyarrow>=8.0.0
pytest>=7.0
hypothesis>=6.0
redis>=4.0
numpy>=1.0

src/__init__.py

Whitespace-only changes.

src/algorithms/bloom_filter.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
import hashlib
import math


class BloomFilter:
    """Space-efficient probabilistic membership tester backed by a flat byte array

    Args:
        capacity: Expected maximum number of elements
        error_rate: Acceptable false positive rate in (0, 1)
    """
    def __init__(self, capacity: int, error_rate: float):
        self.size = self._calc_size(capacity, error_rate)
        self.hash_count = self._calc_hash_count(capacity, self.size)
        # Zero-initialized bit array: one byte stores eight bits
        self.bits = bytearray((self.size + 7) // 8)

    def add(self, item: str):
        """Insert item into filter"""
        h = int.from_bytes(hashlib.blake2s(item.encode()).digest(), 'big')
        for s in range(self.hash_count):
            # Derive the k probe positions from a single digest
            idx = (h + s * 1299721) % self.size
            self.bits[idx >> 3] |= 1 << (idx & 7)

    def __contains__(self, item: str) -> bool:
        """Check membership; false positives possible, false negatives are not"""
        h = int.from_bytes(hashlib.blake2s(item.encode()).digest(), 'big')
        for s in range(self.hash_count):
            idx = (h + s * 1299721) % self.size
            if not self.bits[idx >> 3] & (1 << (idx & 7)):
                return False
        return True

    @staticmethod
    def _calc_size(n: int, p: float) -> int:
        """Optimal bit count: m = -n * ln(p) / (ln 2)^2"""
        return math.ceil(-(n * math.log(p)) / (math.log(2) ** 2))

    @staticmethod
    def _calc_hash_count(n: int, m: int) -> int:
        """Optimal hash count: k = (m / n) * ln 2"""
        return math.ceil((m / n) * math.log(2))
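A quick usage sketch (capacity and error rate are illustrative): for capacity 10,000 and a 1% error rate the sizing formulas above yield m ≈ 95,851 bits and k = 7 hash probes.

bf = BloomFilter(capacity=10_000, error_rate=0.01)
bf.add("alice")
assert "alice" in bf      # inserted items are always found
print("bob" in bf)        # usually False; True with roughly 1% probability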

src/algorithms/consistent_hash.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
import hashlib

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc


class ConsistentHash:
    """PyArrow-backed consistent hashing with weighted nodes

    Args:
        nodes: Initial nodes with weights {node: weight}
        replicas: Base number of virtual nodes per weight unit
    """
    def __init__(self, nodes: dict, replicas: int = 100):
        self.replicas = replicas
        schema = pa.schema([('hash', pa.uint64()), ('node', pa.string())])
        self.ring = pa.Table.from_arrays(
            [pa.array([], type=pa.uint64()), pa.array([], type=pa.string())],
            schema=schema
        )
        for node, weight in nodes.items():
            self.add_weighted_node(node, weight)

    def add_weighted_node(self, node: str, weight: int = 1):
        """Add node with weight; the ring stays sorted by hash"""
        virtual_nodes = weight * self.replicas
        hashes = pa.array([
            int.from_bytes(
                hashlib.blake2s(f"{node}-{i}".encode()).digest()[:8],
                'little'
            ) for i in range(virtual_nodes)
        ], type=pa.uint64())
        nodes = pa.array([node] * virtual_nodes, type=pa.string())
        new_entries = pa.Table.from_arrays([hashes, nodes], names=['hash', 'node'])
        self.ring = pa.concat_tables([self.ring, new_entries]).sort_by('hash')

    def remove_node(self, node: str):
        """Remove all virtual nodes for a physical node"""
        mask = pc.not_equal(self.ring['node'], node)
        self.ring = self.ring.filter(mask)

    def get_node(self, key: str) -> str:
        """Find the owning node for key via binary search on the sorted ring"""
        key_hash = int.from_bytes(
            hashlib.blake2s(key.encode()).digest()[:8],
            'little'
        )
        hashes = self.ring['hash'].to_numpy()
        # First virtual node clockwise from key_hash
        idx = int(np.searchsorted(hashes, key_hash))
        if idx == len(hashes):
            idx = 0  # wrap around the ring
        return self.ring['node'][idx].as_py()

    def balance_quality(self) -> float:
        """Coefficient of variation of vnode counts (0 = perfectly balanced)"""
        node_counts = pc.value_counts(self.ring['node'])
        counts = node_counts.field('counts')
        mean = len(self.ring) / len(counts)
        return pc.stddev(counts).as_py() / mean
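A quick usage sketch (node names and key are hypothetical): cache-b carries twice the weight of cache-a, so it receives twice the virtual nodes and roughly twice the keys; removing a node remaps only the keys it owned.

ch = ConsistentHash({"cache-a": 1, "cache-b": 2}, replicas=100)
print(ch.get_node("user:42"))     # stable owner for this key
ch.remove_node("cache-a")         # only keys owned by cache-a move
print(ch.get_node("user:42"), ch.balance_quality())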

src/algorithms/geo_hash.py

Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
class GeoHasher:
    """Geohash encoder/decoder

    Args:
        precision: Hash length (1-12)
    """
    BASE32 = '0123456789bcdefghjkmnpqrstuvwxyz'
    BASE32_MAP = {c: i for i, c in enumerate(BASE32)}

    def __init__(self, precision: int = 10):
        self.precision = min(max(precision, 1), 12)
        self.bits = self.precision * 5

    def encode(self, lat: float, lon: float) -> str:
        """Encode coordinates to geohash"""
        lat = max(-90.0, min(90.0, lat))           # clamp latitude
        lon = ((lon + 180.0) % 360.0) - 180.0      # wrap longitude
        lat_range, lon_range = [-90.0, 90.0], [-180.0, 180.0]
        bits = []
        for i in range(self.bits):
            # Even bits refine longitude, odd bits refine latitude
            rng, value = (lat_range, lat) if i % 2 else (lon_range, lon)
            mid = (rng[0] + rng[1]) / 2
            bit = value >= mid
            rng[0 if bit else 1] = mid  # keep the half containing value
            bits.append(bit)
        return self._pack_bits(bits)

    def decode(self, geohash: str) -> dict:
        """Decode geohash to coordinates with error margins

        Returns:
            Dict with keys: lon, lat, lon_err, lat_err
        """
        lon_range, lat_range = [-180.0, 180.0], [-90.0, 90.0]
        for i, bit in enumerate(self._unpack_bits(geohash)):
            rng = lat_range if i % 2 else lon_range
            mid = (rng[0] + rng[1]) / 2
            rng[0 if bit else 1] = mid
        return {
            'lon': (lon_range[0] + lon_range[1]) / 2,
            'lat': (lat_range[0] + lat_range[1]) / 2,
            'lon_err': (lon_range[1] - lon_range[0]) / 2,
            'lat_err': (lat_range[1] - lat_range[0]) / 2,
        }

    def _pack_bits(self, bits: list) -> str:
        """Pack bits into a base32 string, 5 bits per character, MSB first"""
        chars = []
        for i in range(0, len(bits), 5):
            value = 0
            for bit in bits[i:i + 5]:
                value = (value << 1) | int(bit)
            chars.append(self.BASE32[value])
        return ''.join(chars)

    def _unpack_bits(self, geohash: str) -> list:
        """Unpack a base32 string into a bit list, MSB first"""
        bits = []
        for c in geohash:
            value = self.BASE32_MAP[c]
            bits.extend(bool((value >> s) & 1) for s in range(4, -1, -1))
        return bits

    def neighbors(self, geohash: str) -> dict:
        """
        Calculate all 8 adjacent geohashes

        Args:
            geohash: Input geohash string
        Returns:
            Dict containing:
            - center: the input geohash
            - n/ne/e/se/s/sw/w/nw: neighboring geohashes
            - bounds: cell step sizes (lat_step, lon_step)
        """
        decoded = self.decode(geohash)
        lat, lon = decoded['lat'], decoded['lon']
        # Full cell dimensions: stepping by 2 * err from the center
        # lands on the neighboring cell's center
        lat_step, lon_step = decoded['lat_err'] * 2, decoded['lon_err'] * 2
        steps = {
            'n': (lat_step, 0.0), 'ne': (lat_step, lon_step),
            'e': (0.0, lon_step), 'se': (-lat_step, lon_step),
            's': (-lat_step, 0.0), 'sw': (-lat_step, -lon_step),
            'w': (0.0, -lon_step), 'nw': (lat_step, -lon_step),
        }
        result = {'center': geohash}
        for name, (dlat, dlon) in steps.items():
            # encode() clamps latitude and wraps longitude internally
            result[name] = self.encode(lat + dlat, lon + dlon)
        result['bounds'] = {'lat_step': lat_step, 'lon_step': lon_step}
        return result
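A quick usage sketch, using the well-known geohash test point (57.64911, 10.40744), which encodes to 'u4pruyd' at precision 7:

gh = GeoHasher(precision=7)
code = gh.encode(57.64911, 10.40744)   # -> 'u4pruyd'
decoded = gh.decode(code)
print(code, decoded['lat'], decoded['lon'], gh.neighbors(code)['n'])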

src/algorithms/hyper_log_log.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
import math
import hashlib

import numpy as np


class HyperLogLog:
    """Cardinality estimator backed by a NumPy register array

    Args:
        precision: Accuracy vs memory tradeoff (4-16)
    """
    def __init__(self, precision: int = 12):
        self.p = precision
        self.m = 1 << precision
        # Mutable registers (pyarrow arrays are immutable)
        self.reg = np.zeros(self.m, dtype=np.uint8)

    def add(self, item: str):
        """Add item to cardinality estimate"""
        # Use 64 bits of the digest: low p bits pick the register,
        # the remaining 64-p bits determine the rank
        h = int.from_bytes(hashlib.sha256(item.encode()).digest()[:8], 'big')
        idx = h & (self.m - 1)
        w = h >> self.p
        rank = (64 - self.p) - w.bit_length() + 1  # leading zeros + 1
        self.reg[idx] = max(self.reg[idx], rank)

    def count(self) -> int:
        """Get cardinality estimate"""
        alpha = 0.7213 / (1 + 1.079 / self.m)  # bias correction for m >= 128
        e = alpha * self.m ** 2 / float(np.sum(np.power(2.0, -self.reg.astype(np.float64))))
        zeros = int(np.count_nonzero(self.reg == 0))
        if e <= 2.5 * self.m and zeros:
            # Small-range (linear counting) correction
            return int(self.m * math.log(self.m / zeros))
        return int(e)
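A quick accuracy sketch (values are illustrative; the expected relative error is about 1.04/sqrt(m), roughly 1.6% at precision 12):

hll = HyperLogLog(precision=12)
for i in range(100_000):
    hll.add(f"user-{i}")
print(hll.count())   # close to 100000, typically within a few percent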
