
Commit 59f247e

serhiy-storchaka, claude, and gpshead authored
gh-115952: Fix a potential virtual memory allocation denial of service in pickle (GH-119204)
Loading small data that does not even involve arbitrary code execution could consume an arbitrarily large amount of memory. There were three issues:

* PUT and LONG_BINPUT with a large argument (the C implementation only). Since the memo is implemented in C as a contiguous dynamic array, a single opcode could cause it to be resized to an arbitrary size. Now the sparsity of memo indices is limited.

* BINBYTES, BINBYTES8 and BYTEARRAY8 with a large argument. They allocated the bytes or bytearray object of the specified size before reading into it. Now they read very large data in chunks.

* BINSTRING, BINUNICODE, LONG4, BINUNICODE8 and FRAME with a large argument. They read the whole data by calling the read() method of the underlying file object, which usually allocates a bytes object of the specified size before reading into it. Now they read very large data in chunks.

Also add a comprehensive benchmark suite to measure the performance and memory impact of the chunked reading optimization in PR #119204. Features:

- Normal mode: benchmarks legitimate pickles (time/memory metrics)
- Antagonistic mode: tests malicious pickles (DoS protection)
- Baseline comparison: side-by-side comparison of two Python builds
- Support for truncated data and sparse memo attack vectors

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
Co-authored-by: Gregory P. Smith <greg@krypto.org>
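For a concrete sense of the second issue, here is a minimal sketch, illustrative only and modeled on the new truncated-data tests (the 14-byte payload below is hypothetical test data): a pickle whose BINBYTES8 opcode claims an 8 GiB payload that the stream never delivers. Before this fix, a file-based unpickler could allocate the full claimed size before discovering the truncation; with chunked reading it gives up after at most one ~1 MiB read.

    import io
    import pickle
    import struct

    # 14 bytes total: the BINBYTES8 opcode (\x8e), a little-endian 64-bit
    # length claiming 8 GiB, then only five actual payload bytes.
    bogus = b'\x8e' + struct.pack('<Q', 8 << 30) + b'.....'

    try:
        pickle.Unpickler(io.BytesIO(bogus)).load()
    except (pickle.UnpicklingError, EOFError) as exc:
        # The C unpickler reports truncated input; the pure-Python one
        # raises EOFError. Either way, no 8 GiB allocation happens.
        print('rejected:', type(exc).__name__, exc)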
1 parent 4085ff7 commit 59f247e

File tree

7 files changed: +1692 −102 lines changed

Lib/pickle.py

Lines changed: 28 additions & 6 deletions
@@ -189,6 +189,11 @@ def __init__(self, value):
 __all__.extend(x for x in dir() if x.isupper() and not x.startswith('_'))
 
 
+# Data larger than this will be read in chunks, to prevent extreme
+# overallocation.
+_MIN_READ_BUF_SIZE = (1 << 20)
+
+
 class _Framer:
 
     _FRAME_SIZE_MIN = 4
@@ -287,7 +292,7 @@ def read(self, n):
                     "pickle exhausted before end of frame")
             return data
         else:
-            return self.file_read(n)
+            return self._chunked_file_read(n)
 
     def readline(self):
         if self.current_frame:
@@ -302,11 +307,23 @@ def readline(self):
         else:
             return self.file_readline()
 
+    def _chunked_file_read(self, size):
+        cursize = min(size, _MIN_READ_BUF_SIZE)
+        b = self.file_read(cursize)
+        while cursize < size and len(b) == cursize:
+            delta = min(cursize, size - cursize)
+            b += self.file_read(delta)
+            cursize += delta
+        return b
+
     def load_frame(self, frame_size):
         if self.current_frame and self.current_frame.read() != b'':
             raise UnpicklingError(
                 "beginning of a new frame before end of current frame")
-        self.current_frame = io.BytesIO(self.file_read(frame_size))
+        data = self._chunked_file_read(frame_size)
+        if len(data) < frame_size:
+            raise EOFError
+        self.current_frame = io.BytesIO(data)
 
 
 # Tools used for pickling.
@@ -1496,12 +1513,17 @@ def load_binbytes8(self):
     dispatch[BINBYTES8[0]] = load_binbytes8
 
     def load_bytearray8(self):
-        len, = unpack('<Q', self.read(8))
-        if len > maxsize:
+        size, = unpack('<Q', self.read(8))
+        if size > maxsize:
             raise UnpicklingError("BYTEARRAY8 exceeds system's maximum size "
                                   "of %d bytes" % maxsize)
-        b = bytearray(len)
-        self.readinto(b)
+        cursize = min(size, _MIN_READ_BUF_SIZE)
+        b = bytearray(cursize)
+        if self.readinto(b) == cursize:
+            while cursize < size and len(b) == cursize:
+                delta = min(cursize, size - cursize)
+                b += self.read(delta)
+                cursize += delta
         self.append(b)
     dispatch[BYTEARRAY8[0]] = load_bytearray8

Lib/test/pickletester.py

Lines changed: 164 additions & 3 deletions
@@ -74,6 +74,15 @@ def count_opcode(code, pickle):
 def identity(x):
     return x
 
+def itersize(start, stop):
+    # Produce a geometrically increasing sequence from start to stop
+    # (inclusive) for tests.
+    size = start
+    while size < stop:
+        yield size
+        size <<= 1
+    yield stop
+
 
 class UnseekableIO(io.BytesIO):
     def peek(self, *args):
@@ -853,9 +862,8 @@ def assert_is_copy(self, obj, objcopy, msg=None):
             self.assertEqual(getattr(obj, slot, None),
                              getattr(objcopy, slot, None), msg=msg)
 
-    def check_unpickling_error(self, errors, data):
-        with self.subTest(data=data), \
-             self.assertRaises(errors):
+    def check_unpickling_error_strict(self, errors, data):
+        with self.assertRaises(errors):
             try:
                 self.loads(data)
             except BaseException as exc:
@@ -864,6 +872,10 @@ def check_unpickling_error(self, errors, data):
                           (data, exc.__class__.__name__, exc))
                 raise
 
+    def check_unpickling_error(self, errors, data):
+        with self.subTest(data=data):
+            self.check_unpickling_error_strict(errors, data)
+
     def test_load_from_data0(self):
         self.assert_is_copy(self._testdata, self.loads(DATA0))
 
@@ -1150,6 +1162,155 @@ def test_negative_32b_binput(self):
         dumped = b'\x80\x03X\x01\x00\x00\x00ar\xff\xff\xff\xff.'
         self.check_unpickling_error(ValueError, dumped)
 
+    def test_too_large_put(self):
+        # Test that PUT with a large id does not cause allocation of a
+        # too large memo table. The C implementation uses a dict-based
+        # memo for sparse indices (when idx > memo_len * 2) instead of
+        # allocating a massive array. This test verifies that large
+        # sparse indices work without causing memory exhaustion.
+        #
+        # The following simple pickle creates an empty list, memoizes it
+        # using a large index, then loads it back on the stack, builds
+        # a tuple containing 2 identical empty lists and returns it.
+        data = lambda n: (b'((lp' + str(n).encode() + b'\n' +
+                          b'g' + str(n).encode() + b'\nt.')
+        #  0: ( MARK
+        #  1: ( MARK
+        #  2: l LIST (MARK at 1)
+        #  3: p PUT 1000000000000
+        # 18: g GET 1000000000000
+        # 33: t TUPLE (MARK at 0)
+        # 34: . STOP
+        for idx in [10**6, 10**9, 10**12]:
+            if idx > sys.maxsize:
+                continue
+            self.assertEqual(self.loads(data(idx)), ([],)*2)
+
+    def test_too_large_long_binput(self):
+        # Test that LONG_BINPUT with a large id does not cause allocation
+        # of a too large memo table. The C implementation uses a dict-based
+        # memo for sparse indices (when idx > memo_len * 2) instead of
+        # allocating a massive array. This test verifies that large
+        # sparse indices work without causing memory exhaustion.
+        #
+        # The following simple pickle creates an empty list, memoizes it
+        # using a large index, then loads it back on the stack, builds
+        # a tuple containing 2 identical empty lists and returns it.
+        data = lambda n: (b'(]r' + struct.pack('<I', n) +
+                          b'j' + struct.pack('<I', n) + b't.')
+        #  0: ( MARK
+        #  1: ] EMPTY_LIST
+        #  2: r LONG_BINPUT 4294967295
+        #  7: j LONG_BINGET 4294967295
+        # 12: t TUPLE (MARK at 0)
+        # 13: . STOP
+        for idx in itersize(1 << 20, min(sys.maxsize, (1 << 32) - 1)):
+            self.assertEqual(self.loads(data(idx)), ([],)*2)
+
+    def _test_truncated_data(self, dumped, expected_error=None):
+        # Test that instructions to read large data without providing
+        # that amount of data do not cause large memory usage.
+        if expected_error is None:
+            expected_error = self.truncated_data_error
+        # BytesIO
+        with self.assertRaisesRegex(*expected_error):
+            self.loads(dumped)
+        if hasattr(self, 'unpickler'):
+            try:
+                with open(TESTFN, 'wb') as f:
+                    f.write(dumped)
+                # buffered file
+                with open(TESTFN, 'rb') as f:
+                    u = self.unpickler(f)
+                    with self.assertRaisesRegex(*expected_error):
+                        u.load()
+                # unbuffered file
+                with open(TESTFN, 'rb', buffering=0) as f:
+                    u = self.unpickler(f)
+                    with self.assertRaisesRegex(*expected_error):
+                        u.load()
+            finally:
+                os_helper.unlink(TESTFN)
+
+    def test_truncated_large_binstring(self):
+        data = lambda size: b'T' + struct.pack('<I', size) + b'.' * 5
+        #  0: T BINSTRING '....'
+        #  9: . STOP
+        self.assertEqual(self.loads(data(4)), '....')  # self-testing
+        for size in itersize(1 << 10, min(sys.maxsize - 5, (1 << 31) - 1)):
+            self._test_truncated_data(data(size))
+        self._test_truncated_data(data(1 << 31),
+            (pickle.UnpicklingError, 'truncated|exceeds|negative byte count'))
+
+    def test_truncated_large_binunicode(self):
+        data = lambda size: b'X' + struct.pack('<I', size) + b'.' * 5
+        #  0: X BINUNICODE '....'
+        #  9: . STOP
+        self.assertEqual(self.loads(data(4)), '....')  # self-testing
+        for size in itersize(1 << 10, min(sys.maxsize - 5, (1 << 32) - 1)):
+            self._test_truncated_data(data(size))
+
+    def test_truncated_large_binbytes(self):
+        data = lambda size: b'B' + struct.pack('<I', size) + b'.' * 5
+        #  0: B BINBYTES b'....'
+        #  9: . STOP
+        self.assertEqual(self.loads(data(4)), b'....')  # self-testing
+        for size in itersize(1 << 10, min(sys.maxsize, 1 << 31)):
+            self._test_truncated_data(data(size))
+
+    def test_truncated_large_long4(self):
+        data = lambda size: b'\x8b' + struct.pack('<I', size) + b'.' * 5
+        #  0: \x8b LONG4 0x2e2e2e2e
+        #  9: . STOP
+        self.assertEqual(self.loads(data(4)), 0x2e2e2e2e)  # self-testing
+        for size in itersize(1 << 10, min(sys.maxsize - 5, (1 << 31) - 1)):
+            self._test_truncated_data(data(size))
+        self._test_truncated_data(data(1 << 31),
+            (pickle.UnpicklingError, 'LONG pickle has negative byte count'))
+
+    def test_truncated_large_frame(self):
+        data = lambda size: b'\x95' + struct.pack('<Q', size) + b'N.'
+        #  0: \x95 FRAME 2
+        #  9: N NONE
+        # 10: . STOP
+        self.assertIsNone(self.loads(data(2)))  # self-testing
+        for size in itersize(1 << 10, sys.maxsize - 9):
+            self._test_truncated_data(data(size))
+        if sys.maxsize + 1 < 1 << 64:
+            self._test_truncated_data(data(sys.maxsize + 1),
+                ((OverflowError, ValueError),
+                 'FRAME length exceeds|frame size > sys.maxsize'))
+
+    def test_truncated_large_binunicode8(self):
+        data = lambda size: b'\x8d' + struct.pack('<Q', size) + b'.' * 5
+        #  0: \x8d BINUNICODE8 '....'
+        # 13: . STOP
+        self.assertEqual(self.loads(data(4)), '....')  # self-testing
+        for size in itersize(1 << 10, sys.maxsize - 9):
+            self._test_truncated_data(data(size))
+        if sys.maxsize + 1 < 1 << 64:
+            self._test_truncated_data(data(sys.maxsize + 1),
+                                      self.size_overflow_error)
+
+    def test_truncated_large_binbytes8(self):
+        data = lambda size: b'\x8e' + struct.pack('<Q', size) + b'.' * 5
+        #  0: \x8e BINBYTES8 b'....'
+        # 13: . STOP
+        self.assertEqual(self.loads(data(4)), b'....')  # self-testing
+        for size in itersize(1 << 10, sys.maxsize):
+            self._test_truncated_data(data(size))
+        if sys.maxsize + 1 < 1 << 64:
+            self._test_truncated_data(data(sys.maxsize + 1),
+                                      self.size_overflow_error)
+
+    def test_truncated_large_bytearray8(self):
+        data = lambda size: b'\x96' + struct.pack('<Q', size) + b'.' * 5
+        #  0: \x96 BYTEARRAY8 bytearray(b'....')
+        # 13: . STOP
+        self.assertEqual(self.loads(data(4)), bytearray(b'....'))  # self-testing
+        for size in itersize(1 << 10, sys.maxsize):
+            self._test_truncated_data(data(size))
+        if sys.maxsize + 1 < 1 << 64:
+            self._test_truncated_data(data(sys.maxsize + 1),
+                                      self.size_overflow_error)
+
     def test_badly_escaped_string(self):
         self.check_unpickling_error(ValueError, b"S'\\'\n.")
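A quick sanity check of the itersize helper (standalone copy for illustration): sizes double from start, and the sequence always ends exactly at stop, so each test probes a geometric range of claimed sizes plus the extreme endpoint.

    def itersize(start, stop):
        # Geometrically increasing sequence from start to stop (inclusive).
        size = start
        while size < stop:
            yield size
            size <<= 1
        yield stop

    assert list(itersize(1 << 10, 10 << 10)) == [1024, 2048, 4096, 8192, 10240]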

Lib/test/test_pickle.py

Lines changed: 7 additions & 1 deletion
@@ -59,6 +59,8 @@ class PyUnpicklerTests(AbstractUnpickleTests, unittest.TestCase):
     truncated_errors = (pickle.UnpicklingError, EOFError,
                         AttributeError, ValueError,
                         struct.error, IndexError, ImportError)
+    truncated_data_error = (EOFError, '')
+    size_overflow_error = (pickle.UnpicklingError, 'exceeds')
 
     def loads(self, buf, **kwds):
         f = io.BytesIO(buf)
@@ -103,6 +105,8 @@ class InMemoryPickleTests(AbstractPickleTests, AbstractUnpickleTests,
     truncated_errors = (pickle.UnpicklingError, EOFError,
                         AttributeError, ValueError,
                         struct.error, IndexError, ImportError)
+    truncated_data_error = ((pickle.UnpicklingError, EOFError), '')
+    size_overflow_error = ((OverflowError, pickle.UnpicklingError), 'exceeds')
 
     def dumps(self, arg, protocol=None, **kwargs):
         return pickle.dumps(arg, protocol, **kwargs)
@@ -375,6 +379,8 @@ class CUnpicklerTests(PyUnpicklerTests):
     unpickler = _pickle.Unpickler
     bad_stack_errors = (pickle.UnpicklingError,)
     truncated_errors = (pickle.UnpicklingError,)
+    truncated_data_error = (pickle.UnpicklingError, 'truncated')
+    size_overflow_error = (OverflowError, 'exceeds')
 
 class CPicklingErrorTests(PyPicklingErrorTests):
     pickler = _pickle.Pickler
@@ -478,7 +484,7 @@ def test_pickler(self):
                          0)  # Write buffer is cleared after every dump().
 
     def test_unpickler(self):
-        basesize = support.calcobjsize('2P2n2P 2P2n2i5P 2P3n8P2n2i')
+        basesize = support.calcobjsize('2P2n3P 2P2n2i5P 2P3n8P2n2i')
         unpickler = _pickle.Unpickler
         P = struct.calcsize('P')  # Size of memo table entry.
         n = struct.calcsize('n')  # Size of mark table entry.
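Each new class attribute is an (exception, pattern) pair that _test_truncated_data splats into assertRaisesRegex, so each unpickler implementation declares its own expected failure mode. A sketch of how such a pair is consumed (names taken from the diff above; self and dumped assumed from the test context):

    # e.g. in CUnpicklerTests: the C unpickler names the truncation explicitly.
    expected_error = (pickle.UnpicklingError, 'truncated')
    with self.assertRaisesRegex(*expected_error):
        self.loads(dumped)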
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+Fix a potential memory denial of service in the :mod:`pickle` module.
+When reading pickled data received from an untrusted source, it could cause
+an arbitrary amount of memory to be allocated, even if the code that is
+allowed to execute is restricted by overriding the
+:meth:`~pickle.Unpickler.find_class` method.
+This could have led to symptoms including a :exc:`MemoryError`, swapping,
+out of memory (OOM) killed processes or containers, or even system crashes.
