From 3581be26088618789afae9388d8d98f786b1cab8 Mon Sep 17 00:00:00 2001 From: Guillaume Malette Date: Wed, 10 Dec 2025 16:00:13 -0500 Subject: [PATCH 1/3] Add ref_tracking option for object deduplication during serialization When registering an extension type with ref_tracking: true, repeated objects are serialized as back-references instead of being re-encoded. This reduces payload size when the same object appears multiple times. Uses ext type 127 with a compact wire format: - New reference: nil marker followed by the serialized object - Back-reference: positive integer ref_id pointing to earlier object --- bench/bench.rb | 67 ++++++++- ext/msgpack/factory_class.c | 9 ++ ext/msgpack/packer.c | 80 +++++++++++ ext/msgpack/packer.h | 8 ++ ext/msgpack/packer_class.c | 7 + ext/msgpack/packer_ext_registry.h | 1 + ext/msgpack/unpacker.c | 106 ++++++++++++++ ext/msgpack/unpacker.h | 7 + ext/msgpack/unpacker_ext_registry.h | 1 + spec/ref_tracking_spec.rb | 210 ++++++++++++++++++++++++++++ 10 files changed, 495 insertions(+), 1 deletion(-) create mode 100644 spec/ref_tracking_spec.rb diff --git a/bench/bench.rb b/bench/bench.rb index 91afd548..148b241f 100644 --- a/bench/bench.rb +++ b/bench/bench.rb @@ -22,7 +22,7 @@ 'status' => 200, 'bytes' => 2326, 'referer' => 'http://www.example.com/start.html', - 'agent' => 'Mozilla/4.08 [en] (Win98; I ;Nav)', + 'agent' => 'Mozilla/4.08 [en] (Win98; I ;Nav)' } data_structured = MessagePack.pack(object_structured) @@ -46,6 +46,41 @@ def self.from_msgpack_ext(data) extended_packer.register_type(0x00, Extended, :to_msgpack_ext) data_extended = extended_packer.pack(object_extended).to_s +# Struct for optimized_struct benchmarks +BenchStruct = Struct.new(:name, :age, :email, :score, :active) + +# Factory with optimized_struct (C-level fast path) +factory_optimized = MessagePack::Factory.new +factory_optimized.register_type(0x01, BenchStruct, optimized_struct: true) + +# Factory with recursive packer/unpacker (Ruby-level) +factory_recursive = MessagePack::Factory.new +factory_recursive.register_type( + 0x01, + BenchStruct, + packer: lambda { |obj, packer| + packer.write(obj.name) + packer.write(obj.age) + packer.write(obj.email) + packer.write(obj.score) + packer.write(obj.active) + }, + unpacker: lambda { |unpacker| + BenchStruct.new(unpacker.read, unpacker.read, unpacker.read, unpacker.read, unpacker.read) + }, + recursive: true +) + +object_struct = BenchStruct.new('Alice', 30, 'alice@example.com', 95.5, true) +data_struct_optimized = factory_optimized.dump(object_struct) +data_struct_recursive = factory_recursive.dump(object_struct) + +# Pre-create packers/unpackers for fair comparison (avoid factory overhead in loop) +packer_optimized = factory_optimized.packer +packer_recursive = factory_recursive.packer +unpacker_optimized = factory_optimized.unpacker +unpacker_recursive = factory_recursive.unpacker + Benchmark.ips do |x| x.report('pack-plain') do MessagePack.pack(object_plain) @@ -61,6 +96,22 @@ def self.from_msgpack_ext(data) packer.pack(object_extended).to_s end + x.report('pack-struct-optimized') do + packer_optimized.write(object_struct) + packer_optimized.to_s + packer_optimized.clear + end + + x.report('pack-struct-recursive') do + packer_recursive.write(object_struct) + packer_recursive.to_s + packer_recursive.clear + end + + x.compare! +end + +Benchmark.ips do |x| x.report('unpack-plain') do MessagePack.unpack(data_plain) end @@ -75,4 +126,18 @@ def self.from_msgpack_ext(data) unpacker.feed data_extended unpacker.read end + + x.report('unpack-struct-optimized') do + unpacker_optimized.feed(data_struct_optimized) + unpacker_optimized.read + unpacker_optimized.reset + end + + x.report('unpack-struct-recursive') do + unpacker_recursive.feed(data_struct_recursive) + unpacker_recursive.read + unpacker_recursive.reset + end + + x.compare! end diff --git a/ext/msgpack/factory_class.c b/ext/msgpack/factory_class.c index eb3b81a3..9aa0c4a3 100644 --- a/ext/msgpack/factory_class.c +++ b/ext/msgpack/factory_class.c @@ -34,6 +34,7 @@ struct msgpack_factory_t { bool has_bigint_ext_type; bool has_symbol_ext_type; bool optimized_symbol_ext_type; + bool has_ref_tracking_ext_type; int symbol_ext_type; }; @@ -161,6 +162,7 @@ VALUE MessagePack_Factory_packer(int argc, VALUE* argv, VALUE self) msgpack_packer_ext_registry_borrow(packer, &fc->pkrg, &pk->ext_registry); pk->has_bigint_ext_type = fc->has_bigint_ext_type; pk->has_symbol_ext_type = fc->has_symbol_ext_type; + pk->has_ref_tracking_ext_type = fc->has_ref_tracking_ext_type; return packer; } @@ -176,6 +178,7 @@ VALUE MessagePack_Factory_unpacker(int argc, VALUE* argv, VALUE self) msgpack_unpacker_ext_registry_borrow(fc->ukrg, &uk->ext_registry); uk->optimized_symbol_ext_type = fc->optimized_symbol_ext_type; uk->symbol_ext_type = fc->symbol_ext_type; + uk->has_ref_tracking_ext_type = fc->has_ref_tracking_ext_type; return unpacker; } @@ -271,6 +274,12 @@ static VALUE Factory_register_type_internal(VALUE self, VALUE rb_ext_type, VALUE packer_proc = ext_module; unpacker_proc = ext_module; } + + /* ref_tracking: true enables deduplication of repeated objects */ + if (RTEST(rb_hash_aref(options, ID2SYM(rb_intern("ref_tracking"))))) { + flags |= MSGPACK_EXT_REF_TRACKING; + fc->has_ref_tracking_ext_type = true; + } } msgpack_packer_ext_registry_put(self, &fc->pkrg, ext_module, ext_type, flags, packer_proc); diff --git a/ext/msgpack/packer.c b/ext/msgpack/packer.c index 6beb8029..711e9395 100644 --- a/ext/msgpack/packer.c +++ b/ext/msgpack/packer.c @@ -26,11 +26,17 @@ void msgpack_packer_init(msgpack_packer_t* pk) { msgpack_buffer_init(PACKER_BUFFER_(pk)); + pk->ref_table = NULL; + pk->next_ref_id = 1; /* 1-indexed */ } void msgpack_packer_destroy(msgpack_packer_t* pk) { msgpack_buffer_destroy(PACKER_BUFFER_(pk)); + if (pk->ref_table) { + st_free_table(pk->ref_table); + pk->ref_table = NULL; + } } void msgpack_packer_mark(msgpack_packer_t* pk) @@ -46,6 +52,68 @@ void msgpack_packer_reset(msgpack_packer_t* pk) msgpack_buffer_clear(PACKER_BUFFER_(pk)); pk->buffer_ref = Qnil; + + /* Reset ref tracking state */ + if (pk->ref_table) { + st_clear(pk->ref_table); + } + pk->next_ref_id = 1; +} + +/* + * Write a back-reference to a previously serialized object. + * Wire format: ext type 127 followed by msgpack integer ref_id + * We use fixext 1 with a 0 byte as a marker, then write the ref_id as a normal msgpack int. + */ +static void msgpack_packer_write_back_ref(msgpack_packer_t* pk, long ref_id) +{ + /* fixext 1, type 127, payload 0x01 (marker for back-ref) */ + msgpack_buffer_ensure_writable(PACKER_BUFFER_(pk), 3); + msgpack_buffer_write_2(PACKER_BUFFER_(pk), 0xd4, MSGPACK_EXT_REF_TYPE); + msgpack_buffer_write_1(PACKER_BUFFER_(pk), 0x01); + /* Write ref_id as a variable-length msgpack integer */ + msgpack_packer_write_long(pk, ref_id); +} + +/* + * Write a new reference marker followed by the object. + * Wire format: ext type 127 with payload = [nil, serialized_object] + * The nil indicates this is a new ref (vs back-ref which has positive int). + */ +static void msgpack_packer_write_new_ref_header(msgpack_packer_t* pk) +{ + /* We write: ext header (variable len) + nil (1 byte) + object (variable) + * Since we don't know the total length yet, we use a different approach: + * Write nil as ext payload marker, then the object follows in the stream. + * + * Actually, let's use fixext 1 with nil (0xc0) as the 1-byte payload. + * The unpacker will see ext type 127 with payload [0xc0] and know it's a new ref, + * then read the next object from the stream. + */ + msgpack_buffer_ensure_writable(PACKER_BUFFER_(pk), 3); + msgpack_buffer_write_2(PACKER_BUFFER_(pk), 0xd4, MSGPACK_EXT_REF_TYPE); /* fixext 1, type 127 */ + msgpack_buffer_write_1(PACKER_BUFFER_(pk), 0xc0); /* nil marker */ +} + +/* + * Check if a value was already serialized and return its ref_id if so. + * If not found, registers the value and returns 0. + */ +static long msgpack_packer_check_ref(msgpack_packer_t* pk, VALUE v) +{ + if (!pk->ref_table) { + pk->ref_table = st_init_numtable(); + } + + st_data_t ref_id; + if (st_lookup(pk->ref_table, (st_data_t)v, &ref_id)) { + return (long)ref_id; + } + + /* Not found - register this value */ + st_insert(pk->ref_table, (st_data_t)v, (st_data_t)pk->next_ref_id); + pk->next_ref_id++; + return 0; /* 0 means "not a back-reference" */ } @@ -136,6 +204,18 @@ bool msgpack_packer_try_write_with_ext_type_lookup(msgpack_packer_t* pk, VALUE v return false; } + /* Handle ref_tracking: check if we've seen this object before */ + if (ext_flags & MSGPACK_EXT_REF_TRACKING) { + long ref_id = msgpack_packer_check_ref(pk, v); + if (ref_id > 0) { + /* Already seen - write back-reference */ + msgpack_packer_write_back_ref(pk, ref_id); + return true; + } + /* Not seen before - write new-ref header, then continue to serialize normally */ + msgpack_packer_write_new_ref_header(pk); + } + if(ext_flags & MSGPACK_EXT_STRUCT_FAST_PATH) { /* Fast path for Struct: directly access fields in C, no Ruby callbacks */ VALUE held_buffer = MessagePack_Buffer_hold(&pk->buffer); diff --git a/ext/msgpack/packer.h b/ext/msgpack/packer.h index f3441f5f..94c41461 100644 --- a/ext/msgpack/packer.h +++ b/ext/msgpack/packer.h @@ -21,6 +21,9 @@ #include "buffer.h" #include "packer_ext_registry.h" +/* Extension type for reference tracking (used for deduplication) */ +#define MSGPACK_EXT_REF_TYPE 127 + #ifndef MSGPACK_PACKER_IO_FLUSH_THRESHOLD_TO_WRITE_STRING_BODY #define MSGPACK_PACKER_IO_FLUSH_THRESHOLD_TO_WRITE_STRING_BODY (1024) #endif @@ -44,6 +47,11 @@ struct msgpack_packer_t { bool compatibility_mode; bool has_bigint_ext_type; bool has_symbol_ext_type; + bool has_ref_tracking_ext_type; + + /* reference tracking for deduplication */ + st_table *ref_table; /* maps VALUE -> ref_id (1-indexed) */ + long next_ref_id; /* options */ bool comaptibility_mode; diff --git a/ext/msgpack/packer_class.c b/ext/msgpack/packer_class.c index aebdf39d..45b5851c 100644 --- a/ext/msgpack/packer_class.c +++ b/ext/msgpack/packer_class.c @@ -304,6 +304,13 @@ static VALUE Packer_reset(VALUE self) { msgpack_packer_t *pk = MessagePack_Packer_get(self); msgpack_buffer_clear(PACKER_BUFFER_(pk)); + + /* Reset ref tracking state */ + if (pk->ref_table) { + st_clear(pk->ref_table); + } + pk->next_ref_id = 1; + return Qnil; } diff --git a/ext/msgpack/packer_ext_registry.h b/ext/msgpack/packer_ext_registry.h index 3b5c60d9..67b198b5 100644 --- a/ext/msgpack/packer_ext_registry.h +++ b/ext/msgpack/packer_ext_registry.h @@ -23,6 +23,7 @@ #define MSGPACK_EXT_RECURSIVE 0b0001 #define MSGPACK_EXT_STRUCT_FAST_PATH 0b0010 +#define MSGPACK_EXT_REF_TRACKING 0b0100 struct msgpack_packer_ext_registry_t; typedef struct msgpack_packer_ext_registry_t msgpack_packer_ext_registry_t; diff --git a/ext/msgpack/unpacker.c b/ext/msgpack/unpacker.c index b7c14c51..71196b34 100644 --- a/ext/msgpack/unpacker.c +++ b/ext/msgpack/unpacker.c @@ -17,6 +17,7 @@ */ #include "unpacker.h" +#include "unpacker_class.h" #include "rmem.h" #include "extension_value_class.h" #include @@ -124,6 +125,7 @@ void _msgpack_unpacker_init(msgpack_unpacker_t* uk) uk->last_object = Qnil; uk->reading_raw = Qnil; + uk->ref_array = Qnil; } void _msgpack_unpacker_destroy(msgpack_unpacker_t* uk) @@ -154,6 +156,7 @@ void msgpack_unpacker_mark(msgpack_unpacker_t* uk) { rb_gc_mark(uk->last_object); rb_gc_mark(uk->reading_raw); + rb_gc_mark(uk->ref_array); msgpack_unpacker_mark_stack(&uk->stack); msgpack_unpacker_mark_key_cache(&uk->key_cache); /* See MessagePack_Buffer_wrap */ @@ -173,6 +176,11 @@ void _msgpack_unpacker_reset(msgpack_unpacker_t* uk) uk->last_object = Qnil; uk->reading_raw = Qnil; uk->reading_raw_remaining = 0; + + /* Reset ref tracking state */ + if (uk->ref_array != Qnil) { + rb_ary_clear(uk->ref_array); + } } @@ -218,8 +226,102 @@ static inline int object_complete_symbol(msgpack_unpacker_t* uk, VALUE object) return PRIMITIVE_OBJECT_COMPLETE; } +/* Forward declarations for ref tracking */ +static inline int _msgpack_unpacker_stack_push(msgpack_unpacker_t* uk, enum stack_type_t type, size_t count, VALUE object); +static inline size_t msgpack_unpacker_stack_pop(msgpack_unpacker_t* uk); +int msgpack_unpacker_read(msgpack_unpacker_t* uk, size_t target_stack_depth); + +/* + * Handle ext type 127 (ref tracking). + * Payload format: + * - nil (0xc0): new ref marker, the actual object follows in the main stream + * - positive integer: back-reference to a previously unpacked object + */ +static inline int object_complete_ref_tracking(msgpack_unpacker_t* uk, VALUE str) +{ + if (str == Qnil || RSTRING_LEN(str) == 0) { + rb_raise(rb_eArgError, "Invalid ref tracking payload: empty"); + } + + const unsigned char *data = (const unsigned char *)RSTRING_PTR(str); + size_t len = RSTRING_LEN(str); + + /* Check marker byte: 0xc0 (nil) = new ref, 0x01 = back-ref */ + if (data[0] == 0xc0) { + /* New ref marker - the actual object follows in the stream */ + + /* Initialize ref_array lazily */ + if (uk->ref_array == Qnil) { + uk->ref_array = rb_ary_new(); + } + + /* Reserve a slot in the ref_array BEFORE reading the object. + * This is necessary because the object being read may contain + * nested refs that need to be registered in pre-order (DFS order). + * We use Qnil as a placeholder and fill it in after reading. */ + long slot_index = RARRAY_LEN(uk->ref_array); + rb_ary_push(uk->ref_array, Qnil); /* Reserve slot */ + + /* Read the next object from the main stream */ + _msgpack_unpacker_stack_push(uk, STACK_TYPE_RECURSIVE, 1, Qnil); + int ret = msgpack_unpacker_read(uk, 0); + msgpack_unpacker_stack_pop(uk); + + if (ret < 0) { + return ret; + } + + /* Fill in the reserved slot with the actual object */ + rb_ary_store(uk->ref_array, slot_index, uk->last_object); + return PRIMITIVE_OBJECT_COMPLETE; + } + + if (data[0] != 0x01 || len != 1) { + rb_raise(rb_eArgError, "Invalid ref tracking marker: expected 0x01, got 0x%02x", data[0]); + } + + /* Back-reference marker - read the ref_id as a msgpack integer from the stream */ + _msgpack_unpacker_stack_push(uk, STACK_TYPE_RECURSIVE, 1, Qnil); + int ret = msgpack_unpacker_read(uk, 0); + msgpack_unpacker_stack_pop(uk); + + if (ret < 0) { + return ret; + } + + if (!FIXNUM_P(uk->last_object)) { + rb_raise(rb_eArgError, "Invalid ref_id: expected integer"); + } + long ref_id = FIX2LONG(uk->last_object); + + if (ref_id <= 0) { + rb_raise(rb_eArgError, "Invalid ref_id: %ld (must be positive)", ref_id); + } + + if (uk->ref_array == Qnil) { + rb_raise(rb_eArgError, "Back-reference to ref_id %ld but no objects registered yet", ref_id); + } + + /* ref_id is 1-indexed, array is 0-indexed */ + long index = ref_id - 1; + if (index >= RARRAY_LEN(uk->ref_array)) { + rb_raise(rb_eArgError, "Back-reference to ref_id %ld but only %ld objects registered", + ref_id, RARRAY_LEN(uk->ref_array)); + } + + VALUE obj = rb_ary_entry(uk->ref_array, index); + uk->last_object = obj; + reset_head_byte(uk); + return PRIMITIVE_OBJECT_COMPLETE; +} + static inline int object_complete_ext(msgpack_unpacker_t* uk, int ext_type, VALUE str) { + /* Handle ref tracking ext type 127 */ + if (uk->has_ref_tracking_ext_type && ext_type == MSGPACK_EXT_REF_TYPE) { + return object_complete_ref_tracking(uk, str); + } + if (uk->optimized_symbol_ext_type && ext_type == uk->symbol_ext_type) { if (RB_UNLIKELY(NIL_P(str))) { // empty extension is returned as Qnil return object_complete_symbol(uk, ID2SYM(rb_intern3("", 0, rb_utf8_encoding()))); @@ -496,7 +598,11 @@ static inline int read_raw_body_begin(msgpack_unpacker_t* uk, int raw_type) ret = object_complete(uk, string); } else { VALUE string = msgpack_buffer_read_top_as_string(UNPACKER_BUFFER_(uk), length, false, false); + /* Clear reading_raw_remaining BEFORE calling object_complete_ext + * because ref tracking may recursively call msgpack_unpacker_read */ + uk->reading_raw_remaining = 0; ret = object_complete_ext(uk, raw_type, string); + return ret; } } uk->reading_raw_remaining = 0; diff --git a/ext/msgpack/unpacker.h b/ext/msgpack/unpacker.h index 10e13268..706a0630 100644 --- a/ext/msgpack/unpacker.h +++ b/ext/msgpack/unpacker.h @@ -21,6 +21,9 @@ #include "buffer.h" #include "unpacker_ext_registry.h" +/* Extension type for reference tracking (used for deduplication) */ +#define MSGPACK_EXT_REF_TYPE 127 + #define MSGPACK_UNPACKER_STACK_CAPACITY 128 struct msgpack_unpacker_t; @@ -73,6 +76,10 @@ struct msgpack_unpacker_t { bool freeze: 1; bool allow_unknown_ext: 1; bool optimized_symbol_ext_type: 1; + bool has_ref_tracking_ext_type: 1; + + /* reference tracking for deduplication */ + VALUE ref_array; /* array of previously unpacked objects (1-indexed) */ }; #define UNPACKER_BUFFER_(uk) (&(uk)->buffer) diff --git a/ext/msgpack/unpacker_ext_registry.h b/ext/msgpack/unpacker_ext_registry.h index f76a0175..83dfb4d5 100644 --- a/ext/msgpack/unpacker_ext_registry.h +++ b/ext/msgpack/unpacker_ext_registry.h @@ -23,6 +23,7 @@ #define MSGPACK_EXT_RECURSIVE 0b0001 #define MSGPACK_EXT_STRUCT_FAST_PATH 0b0010 +#define MSGPACK_EXT_REF_TRACKING 0b0100 struct msgpack_unpacker_ext_registry_t; typedef struct msgpack_unpacker_ext_registry_t msgpack_unpacker_ext_registry_t; diff --git a/spec/ref_tracking_spec.rb b/spec/ref_tracking_spec.rb new file mode 100644 index 00000000..b70c9b57 --- /dev/null +++ b/spec/ref_tracking_spec.rb @@ -0,0 +1,210 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe 'ref_tracking option' do + # Define a test struct + TestStruct = Struct.new(:name, :value, :nested) + + let(:factory) do + factory = MessagePack::Factory.new + factory.register_type( + 0x01, + TestStruct, + packer: lambda { |obj, pk| + pk.write(obj.name) + pk.write(obj.value) + pk.write(obj.nested) + }, + unpacker: lambda { |uk| + TestStruct.new(uk.read, uk.read, uk.read) + }, + recursive: true, + ref_tracking: true + ) + factory + end + + describe 'packing with ref_tracking' do + it 'writes new-ref marker for first occurrence' do + obj = TestStruct.new('test', 42, nil) + packed = factory.packer.write(obj).to_s + + # Should be able to unpack it + unpacked = factory.unpacker.feed(packed).read + expect(unpacked).to be_a(TestStruct) + expect(unpacked.name).to eq('test') + expect(unpacked.value).to eq(42) + end + + it 'writes back-reference for repeated objects' do + shared = TestStruct.new('shared', 100, nil) + container = TestStruct.new('parent', 0, shared) + + # Pack the shared object twice in an array + packed = factory.packer.write([shared, shared, container]).to_s + + # Unpack and verify + unpacked = factory.unpacker.feed(packed).read + expect(unpacked).to be_an(Array) + expect(unpacked.length).to eq(3) + + # All three references should resolve to objects with the same values + expect(unpacked[0].name).to eq('shared') + expect(unpacked[1].name).to eq('shared') + expect(unpacked[2].nested.name).to eq('shared') + end + + it 'handles deeply nested structures with shared references' do + leaf = TestStruct.new('leaf', 1, nil) + mid1 = TestStruct.new('mid1', 2, leaf) + mid2 = TestStruct.new('mid2', 3, leaf) # shares leaf + root = TestStruct.new('root', 0, [mid1, mid2]) + + packed = factory.packer.write(root).to_s + unpacked = factory.unpacker.feed(packed).read + + expect(unpacked.name).to eq('root') + expect(unpacked.nested[0].name).to eq('mid1') + expect(unpacked.nested[1].name).to eq('mid2') + expect(unpacked.nested[0].nested.name).to eq('leaf') + expect(unpacked.nested[1].nested.name).to eq('leaf') + end + end + + describe 'deduplication effectiveness' do + it 'produces smaller output when objects are repeated' do + shared = TestStruct.new('a' * 100, 42, nil) + + # Pack array with same object repeated many times + repeated = [shared] * 10 + packed_with_ref_tracking = factory.packer.write(repeated).to_s + + # Pack without ref_tracking + factory_without_ref = MessagePack::Factory.new + factory_without_ref.register_type( + 0x01, + TestStruct, + packer: lambda { |obj, pk| + pk.write(obj.name) + pk.write(obj.value) + pk.write(obj.nested) + }, + unpacker: lambda { |uk| + TestStruct.new(uk.read, uk.read, uk.read) + }, + recursive: true + ) + packed_without_ref_tracking = factory_without_ref.packer.write(repeated).to_s + + # With ref_tracking should be smaller since object is only serialized once + expect(packed_with_ref_tracking.bytesize).to be < packed_without_ref_tracking.bytesize + end + end + + describe 'combined with optimized_struct' do + OptimizedStruct = Struct.new(:x, :y, :ref) + + let(:optimized_factory) do + factory = MessagePack::Factory.new + factory.register_type( + 0x02, + OptimizedStruct, + optimized_struct: true, + ref_tracking: true + ) + factory + end + + it 'works with both optimized_struct and ref_tracking' do + shared = OptimizedStruct.new(1, 2, nil) + parent = OptimizedStruct.new(10, 20, shared) + + packed = optimized_factory.packer.write([shared, parent]).to_s + unpacked = optimized_factory.unpacker.feed(packed).read + + expect(unpacked[0].x).to eq(1) + expect(unpacked[0].y).to eq(2) + expect(unpacked[1].x).to eq(10) + expect(unpacked[1].y).to eq(20) + expect(unpacked[1].ref.x).to eq(1) + end + + it 'deduplicates repeated optimized structs' do + shared = OptimizedStruct.new(1, 2, nil) + + packed = optimized_factory.packer.write([shared, shared, shared]).to_s + unpacked = optimized_factory.unpacker.feed(packed).read + + expect(unpacked.length).to eq(3) + expect(unpacked[0].x).to eq(1) + expect(unpacked[1].x).to eq(1) + expect(unpacked[2].x).to eq(1) + end + end + + describe 'edge cases' do + it 'handles nil nested values' do + obj = TestStruct.new('test', nil, nil) + packed = factory.packer.write(obj).to_s + unpacked = factory.unpacker.feed(packed).read + + expect(unpacked.name).to eq('test') + expect(unpacked.value).to be_nil + expect(unpacked.nested).to be_nil + end + + it 'handles empty arrays' do + packed = factory.packer.write([]).to_s + unpacked = factory.unpacker.feed(packed).read + expect(unpacked).to eq([]) + end + + it 'handles single object' do + obj = TestStruct.new('solo', 1, nil) + packed = factory.packer.write(obj).to_s + unpacked = factory.unpacker.feed(packed).read + + expect(unpacked.name).to eq('solo') + end + + it 'resets ref tracking between pack operations' do + packer = factory.packer + + obj = TestStruct.new('test', 1, nil) + packed1 = packer.write(obj).to_s + packer.clear + + # Pack again - should not reference previous pack's objects + packed2 = packer.write(obj).to_s + + # Both should unpack successfully + unpacker = factory.unpacker + unpacked1 = unpacker.feed(packed1).read + unpacker.reset + unpacked2 = unpacker.feed(packed2).read + + expect(unpacked1.name).to eq('test') + expect(unpacked2.name).to eq('test') + end + end + + describe 'ref_id ranges' do + it 'handles many unique objects (testing ref_id encoding)' do + # Create many unique objects to test ref_id encoding beyond fixint range + objects = 150.times.map { |i| TestStruct.new("obj#{i}", i, nil) } + + # Create array that references each twice + array = objects + objects + + packed = factory.packer.write(array).to_s + unpacked = factory.unpacker.feed(packed).read + + expect(unpacked.length).to eq(300) + expect(unpacked[0].name).to eq('obj0') + expect(unpacked[149].name).to eq('obj149') + expect(unpacked[150].name).to eq('obj0') # Back-reference + expect(unpacked[299].name).to eq('obj149') # Back-reference + end + end +end From 17927ffe16022a43c229a2bf506439090c74d027 Mon Sep 17 00:00:00 2001 From: Guillaume Malette Date: Thu, 11 Dec 2025 15:03:54 -0500 Subject: [PATCH 2/3] Edit benchmark --- bench/various_appraches_bench.rb | 38 +++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/bench/various_appraches_bench.rb b/bench/various_appraches_bench.rb index 727d16d8..2476c830 100644 --- a/bench/various_appraches_bench.rb +++ b/bench/various_appraches_bench.rb @@ -21,11 +21,16 @@ begin factory = MessagePack::Factory.new test_struct = Struct.new(:a, keyword_init: true) - factory.register_type(0x01, test_struct, optimized_struct: true) + factory.register_type(0x01, test_struct, optimized_struct: true, ref_tracking: true) obj = test_struct.new(a: 1) arr = [obj, obj, obj] dump = factory.dump(arr) loaded = factory.load(dump) + + unless loaded[0].object_id == loaded[1].object_id + puts "ERROR: msgpack-ruby does not support ref_tracking; the benchmark cannot run." + exit(1) + end rescue StandardError puts "ERROR: msgpack-ruby does not support optimized_struct; the benchmark cannot run." exit(2) @@ -466,6 +471,37 @@ def self.description end end + module RefTrackingInC + def self.build_factory + factory = MessagePack::Factory.new + Coders.register_time(factory) + + type_id = 0x20 + ALL_STRUCTS.each do |struct| + if SHARED_STRUCTS.include?(struct) + factory.register_type( + type_id, + struct, + optimized_struct: true, + ref_tracking: true, + ) + else + factory.register_type(type_id, struct, optimized_struct: true) + end + type_id += 1 + end + + factory + end + + def self.factory + @factory ||= build_factory + end + + def self.description + "Uses optimized_struct with C-level reference tracking for shared types" + end + end end # ============================================================================= From 40af3f3ba51785350827681c3cd6e59b0ef81e78 Mon Sep 17 00:00:00 2001 From: Guillaume Malette Date: Thu, 11 Dec 2025 16:05:33 -0500 Subject: [PATCH 3/3] Use posargs --- bench/various_appraches_bench.rb | 213 +++++++++++++++---------------- 1 file changed, 104 insertions(+), 109 deletions(-) diff --git a/bench/various_appraches_bench.rb b/bench/various_appraches_bench.rb index 2476c830..ea5a0870 100644 --- a/bench/various_appraches_bench.rb +++ b/bench/various_appraches_bench.rb @@ -77,29 +77,25 @@ :owner_id, :created_at, :updated_at, - :original_type, - keyword_init: true, + :original_type ) SellingPlanPriceAdjustment = Struct.new( :order_count, :position, :value_type, - :value, - keyword_init: true, + :value ) SellingPlanOption = Struct.new( :name, :position, - :value, - keyword_init: true, + :value ) SellingPlanCheckoutCharge = Struct.new( :value_type, - :value, - keyword_init: true, + :value ) # SellingPlan is SHARED - the same selling plan instance is referenced by @@ -111,16 +107,14 @@ :recurring_deliveries, :options, :price_adjustments, - :checkout_charge, - keyword_init: true, + :checkout_charge ) SellingPlanGroup = Struct.new( :id, :name, :options, - :selling_plans, - keyword_init: true, + :selling_plans ) # ProductOptionValue is SHARED - the same option value instance appears in @@ -129,16 +123,14 @@ :id, :name, :position, - :swatch_color, - keyword_init: true, + :swatch_color ) ProductOption = Struct.new( :id, :name, :position, - :values, - keyword_init: true, + :values ) # ProductVariant - matches real ProductLoader::Messages::ProductVariant (37 fields) @@ -172,8 +164,7 @@ :requires_shipping, :selling_plans, # References SHARED SellingPlan objects :metafields, - :variant_unit_price_measurement, - keyword_init: true, + :variant_unit_price_measurement ) # Product - matches real ProductLoader::Messages::Product (28 fields) @@ -196,8 +187,7 @@ :variants, :options, # Contains SHARED ProductOptionValue objects :selling_plan_groups, # Contains SHARED SellingPlan objects - :metafields, - keyword_init: true, + :metafields ) ALL_STRUCTS = [ @@ -254,7 +244,7 @@ def self.build_untracked_packer(struct) end def self.build_tracked_unpacker(struct) - args = struct.members.map { |m| "#{m}: unpacker.read" }.join(", ") + args = struct.members.map { |_m| "unpacker.read" }.join(", ") eval(<<~RUBY, binding, __FILE__, __LINE__ + 1) ->(unpacker) { @@ -273,7 +263,7 @@ def self.build_tracked_unpacker(struct) end def self.build_untracked_unpacker(struct) - args = struct.members.map { |m| "#{m}: unpacker.read" }.join(", ") + args = struct.members.map { |_m| "unpacker.read" }.join(", ") eval(<<~RUBY, binding, __FILE__, __LINE__ + 1) ->(unpacker) { #{struct}.new(#{args}) @@ -340,8 +330,8 @@ def self.build_factory ALL_STRUCTS.each do |struct| unpacker = eval(<<~RUBY, binding, __FILE__, __LINE__ + 1) -> (unpacker) { - #{struct.members.join(", ")} = unpacker.read - #{struct}.new(#{struct.members.map{ |m| "#{m}: #{m}" }.join(", ")}) + #{struct.members.join(",")} = unpacker.read + #{struct}.new(#{struct.members.join(", ")}) } RUBY @@ -510,26 +500,26 @@ def self.description def create_selling_plan(id:) SellingPlan.new( - id: id, - name: "Subscribe & Save #{id}", - description: "Save 10% with a subscription", - recurring_deliveries: true, - options: [ - SellingPlanOption.new(name: "Delivery Frequency", position: 1, value: "1 Month"), + id, + "Subscribe & Save #{id}", + "Save 10% with a subscription", + true, + [ + SellingPlanOption.new("Delivery Frequency", 1, "1 Month") ], - price_adjustments: [ - SellingPlanPriceAdjustment.new(order_count: nil, position: 1, value_type: "percentage", value: 10), + [ + SellingPlanPriceAdjustment.new(nil, 1, "percentage", 10) ], - checkout_charge: SellingPlanCheckoutCharge.new(value_type: "percentage", value: 100), + SellingPlanCheckoutCharge.new("percentage", 100) ) end def create_selling_plan_group(id:, selling_plans:) SellingPlanGroup.new( - id: id, - name: "Subscription Group #{id}", - options: [{ name: "Delivery Frequency", position: 1, values: ["1 Month", "2 Months"] }], - selling_plans: selling_plans, + id, + "Subscription Group #{id}", + [{ name: "Delivery Frequency", position: 1, values: ["1 Month", "2 Months"] }], + selling_plans ) end @@ -540,41 +530,42 @@ def create_metafields(owner_id:, count:, owner_type:) # - Relatively short values (1..count).map do |i| Metafield.new( - id: owner_id * 1000 + i, - namespace: "custom", - key: "field_#{i}", - value: "Value #{i}", - type: "single_line_text_field", # this should be an enum - value_type: "string", - definition_id: nil, - owner_type: owner_type, - owner_id: owner_id, - created_at: Time.now, - updated_at: Time.now, - original_type: nil, + owner_id * 1000 + i, + "custom", + "field_#{i}", + "Value #{i}", + "single_line_text_field", # this should be an enum + "string", + nil, + owner_type, + owner_id, + Time.now, + Time.now, + nil ) end end -def create_product(id:, num_variants:, num_options:, selling_plan_groups:, num_product_metafields:, num_variant_metafields:) +def create_product(id:, num_variants:, num_options:, selling_plan_groups:, num_product_metafields:, + num_variant_metafields:) # Create shared option values option_values_by_option = {} options = (1..num_options).map do |opt_idx| values = (1..3).map do |val_idx| ProductOptionValue.new( - id: id * 1000 + opt_idx * 100 + val_idx, - name: "Option#{opt_idx} Value#{val_idx}", - position: val_idx, - swatch_color: nil, # Most products don't have swatch colors + id * 1000 + opt_idx * 100 + val_idx, + "Option#{opt_idx} Value#{val_idx}", + val_idx, + nil # Most products don"t have swatch colors ) end option_values_by_option[opt_idx] = values ProductOption.new( - id: id * 100 + opt_idx, - name: "Option #{opt_idx}", - position: opt_idx, - values: values, + id * 100 + opt_idx, + "Option #{opt_idx}", + opt_idx, + values ) end @@ -588,59 +579,63 @@ def create_product(id:, num_variants:, num_options:, selling_plan_groups:, num_p # Match real ProductVariant structure with some nil fields (sparse data) ProductVariant.new( - id: id * 1000 + var_idx, - product_id: id, - title: "Variant #{var_idx}", - uncontextualized_title: nil, - price: 1999 + var_idx * 100, - compare_at_price: nil, # Most variants don't have compare_at_price - barcode: nil, # Most variants don't have barcodes - options: variant_options, - option1: variant_options[0]&.name, - option2: variant_options[1]&.name, - option1_id: variant_options[0]&.id, - option2_id: variant_options[1]&.id, - taxable: true, - position: var_idx, - created_at: Time.now, - updated_at: Time.now, - fulfillment_service: "manual", - requires_components: false, - inventory_management: "shopify", - inventory_policy: "deny", - weight_unit: "kg", - weight_value: nil, - sku: "SKU-#{id}-#{var_idx}", - requires_shipping: true, - selling_plans: variant_selling_plans, - metafields: create_metafields(owner_id: id * 1000 + var_idx, count: num_variant_metafields, owner_type: "ProductVariant"), - variant_unit_price_measurement: nil, + id * 1000 + var_idx, + id, + "Variant #{var_idx}", + nil, + 1999 + var_idx * 100, + nil, # Most variants don"t have compare_at_price + nil, # Most variants don"t have barcodes + variant_options, + variant_options[0]&.name, + variant_options[1]&.name, + variant_options[0]&.id, + variant_options[1]&.id, + nil, + true, + nil, + var_idx, + Time.now, + Time.now, + "manual", + false, + "shopify", + "deny", + "kg", + nil, + "SKU-#{id}-#{var_idx}", + true, + variant_selling_plans, + create_metafields(owner_id: id * 1000 + var_idx, count: num_variant_metafields, owner_type: "ProductVariant"), + nil ) end # Match real Product structure with some nil fields (sparse data) Product.new( - id: id, - title: "Product #{id}", - handle: "product-#{id}", - description: "Description for product #{id}", - vendor: "Vendor", - published_at: Time.now, - created_at: Time.now, - updated_at: Time.now, - template_suffix: nil, - gift_card: false, - is_published: true, - requires_selling_plan: selling_plan_groups.any?, - published_scope: :published_scope_global, - variants: variants, - options: options, - selling_plan_groups: selling_plan_groups, - metafields: create_metafields(owner_id: id, count: num_product_metafields, owner_type: "Product"), + id, + "Product #{id}", + "product-#{id}", + "Description for product #{id}", + nil, + "Vendor", + Time.now, + Time.now, + Time.now, + nil, + false, + true, + selling_plan_groups.any?, + :published_scope_global, + variants, + options, + selling_plan_groups, + create_metafields(owner_id: id, count: num_product_metafields, owner_type: "Product") ) end -def create_test_data(num_products:, num_variants:, num_selling_plan_groups:, num_selling_plans_per_group:, num_product_metafields: 0, num_variant_metafields: 0) +def create_test_data(num_products:, num_variants:, num_selling_plan_groups:, num_selling_plans_per_group:, + num_product_metafields: 0, num_variant_metafields: 0) # Create SHARED selling plans - same instances used across all products selling_plan_id = 1 selling_plan_groups = (1..num_selling_plan_groups).map do |group_idx| @@ -705,14 +700,14 @@ def run_benchmark(coders, scenario) num_selling_plan_groups: scenario[:spg], num_selling_plans_per_group: scenario[:sp], num_product_metafields: scenario[:product_metafields] || 0, - num_variant_metafields: scenario[:variant_metafields] || 0, + num_variant_metafields: scenario[:variant_metafields] || 0 ) puts "\nBenchmarking scenario: P:#{scenario[:products]} V:#{scenario[:variants]} PM:#{scenario[:product_metafields]} VM:#{scenario[:variant_metafields]} SPG:#{scenario[:spg]} SP:#{scenario[:sp]}" payloads = coders.map { |coder| coder.factory.dump(data) } - result = Benchmark.ips(quiet: true) do |x| + result = Benchmark.ips(time: ENV.fetch("BENCH_TIME", 5).to_f, warmup: ENV.fetch("BENCH_WARMUP", 2).to_f, quiet: true) do |x| coders.each.with_index do |coder, index| x.report(coder.name.split("::").last) do coder.factory.load(payloads[index]) @@ -760,19 +755,19 @@ def print_results(coders, reports) marshal_size = report.bytesize_results[Coders::Marshal] puts "Scenario: #{scenario_str}" - puts "Winner: #{sorted_results.first[0].name.split("::").last} with #{'%.2f' % sorted_results.first[1]}x speedup" + puts "Winner: #{sorted_results.first[0].name.split("::").last} with #{"%.2f" % sorted_results.first[1]}x speedup" linesize = 56 puts "-" * linesize - puts format("%-20s %12s %10s %10s", "Coder", "Size (bytes)", "Size (%)", "Speedup") + puts format("%-20s %12s %10s %10s", "Coder", "Size (bytes)", "Size factor", "Speedup") puts "-" * linesize sorted_results.each do |result| coder, speedup = result size = report.bytesize_results[coder] - size_pct = (size.to_f / marshal_size) * 100 + size_pct = (size.to_f / marshal_size) coder_name = coder.name.split("::").last - line = format("%-20s %12d %9.1f%% %9.2fx", coder_name, size, size_pct, speedup) + line = format("%-20s %12d %9.2fx %9.2fx", coder_name, size, size_pct, speedup) if coder == Coders::Marshal puts "#{BOLD}#{line}#{RESET}" else