Skip to content

Commit 8e57a29

Browse files
Format Version 1: Adds crc32c checks. (#8)
* feat: FORMAT_VERSION=1, adds a crc32c check value to the format. The implementation is backwards compatible with format version 0, though the format itself is different. * docs: describe format version 1 * test: show ValidationError is raised if bits are corrupted * feat: allow disabling crc checks
1 parent e25a6af commit 8e57a29

File tree

4 files changed

+47
-7
lines changed

4 files changed

+47
-7
lines changed

README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,11 @@ The byte string format consists of a 16 byte header, an index, and a series of (
7171
HEADER|INDEX|DATA_REGION
7272
```
7373

74+
| Format Version | description |
75+
|----------------|----------------------------------------|
76+
| 0 | Initial Release |
77+
| 1 | Adds crc32c check values to each item. |
78+
7479
### Header
7580

7681
```
@@ -93,7 +98,7 @@ The index can be consulted by conducting an Eytzinger binary search over the lab
9398

9499
### Data Region
95100

96-
The data objects are serialized to bytes and compressed individually if the header indicates they should be. They are then concatenated in the same order the index specifies.
101+
The data objects are serialized to bytes and compressed individually if the header indicates they should be. They are then concatenated in the same order the index specifies. As of format version 1, the last four bytes of each item are a little-endian crc32c check value computed over that item's stored (post-compression) bytes.
97102

98103
## Versus Flexbuffers
99104

automated_test.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
import numpy as np
88

9-
from mapbuffer import MapBuffer, HEADER_LENGTH
9+
from mapbuffer import ValidationError, MapBuffer, HEADER_LENGTH
1010

1111
@pytest.mark.parametrize("compress", (None, "gzip", "br", "zstd", "lzma"))
1212
def test_empty(compress):
@@ -56,6 +56,25 @@ def test_full(compress):
5656

5757
assert len(mbuf.buffer) > HEADER_LENGTH
5858

59+
@pytest.mark.parametrize("compress", (None, "gzip", "br", "zstd"))
60+
def test_crc32c(compress):
61+
data = {
62+
1: b"hello",
63+
2: b"world",
64+
}
65+
mbuf = MapBuffer(data, compress=compress)
66+
67+
idx = mbuf.buffer.index(b"hello")
68+
buf = list(mbuf.buffer)
69+
buf[idx] = ord(b'H')
70+
mbuf.buffer = bytes(buf)
71+
72+
try:
73+
mbuf[1]
74+
assert False
75+
except ValidationError:
76+
pass
77+
5978
@pytest.mark.parametrize("compress", (None, "gzip", "br", "zstd"))
6079
def test_mmap_access(compress):
6180
data = {

mapbuffer/mapbuffer.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,26 @@
55
from .lib import nvl
66
from . import compression
77

8+
import crc32c
89
import numpy as np
910

1011
import mapbufferaccel
1112

12-
FORMAT_VERSION = 0
13+
FORMAT_VERSION = 1
1314
MAGIC_NUMBERS = b"mapbufr"
1415
HEADER_LENGTH = 16
1516

1617
class MapBuffer:
1718
"""Represents a usable int->bytes dictionary as a byte string."""
1819
__slots__ = (
1920
"data", "tobytesfn", "frombytesfn",
20-
"dtype", "buffer", "_header",
21-
"_index", "_compress"
21+
"dtype", "buffer", "check_crc",
22+
"_header", "_index", "_compress"
2223
)
2324
def __init__(
2425
self, data=None, compress=None,
25-
tobytesfn=None, frombytesfn=None
26+
tobytesfn=None, frombytesfn=None,
27+
check_crc=True
2628
):
2729
"""
2830
data: dict (int->byte serializable object) or bytes
@@ -41,6 +43,7 @@ def __init__(
4143
self.frombytesfn = frombytesfn
4244
self.dtype = np.uint64
4345
self.buffer = None
46+
self.check_crc = check_crc
4447

4548
self._header = None
4649
self._index = None
@@ -133,6 +136,16 @@ def getindex(self, i):
133136
else:
134137
value = self.buffer[offset:]
135138

139+
if self.format_version == 1:
140+
stored_check_value = int.from_bytes(value[-4:], byteorder='little')
141+
value = value[:-4]
142+
if self.check_crc:
143+
retrieved_check_value = crc32c.crc32c(value)
144+
if retrieved_check_value != stored_check_value:
145+
raise ValidationError(
146+
f"Label {i} failed its crc32c check. Stored: {stored_check_value} Computed: {retrieved_check_value}"
147+
)
148+
136149
encoding = self.compress
137150
if encoding:
138151
value = compression.decompress(value, encoding, str(index[i,0]))
@@ -213,6 +226,8 @@ def dict2buf(self, data, compress=None, tobytesfn=None):
213226
label: compression.compress(tobytesfn(val), method=compress)
214227
for label, val in data.items()
215228
}
229+
for label in bytes_data:
230+
bytes_data[label] += crc32c.crc32c(bytes_data[label]).to_bytes(4, byteorder='little')
216231

217232
data_region = b"".join(
218233
( bytes_data[label] for label in labels )
@@ -244,7 +259,7 @@ def validate_buffer(buf):
244259
if magic != MAGIC_NUMBERS:
245260
raise ValidationError(f"Magic number mismatch. Expected: {MAGIC_NUMBERS} Got: {magic}")
246261

247-
if mapbuf.format_version not in (0,):
262+
if mapbuf.format_version not in (0,1):
248263
raise ValidationError(f"Unsupported format version. Got: {mapbuf.format_version}")
249264

250265
if mapbuf.compress not in compression.COMPRESSION_TYPES:

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
brotli
2+
crc32c
23
deflate>=0.2.0
34
numpy
45
tqdm

0 commit comments

Comments
 (0)