From 04360ab2e98278ce6a8c0a7d4071329b4fe42a53 Mon Sep 17 00:00:00 2001 From: Nia Waldvogel Date: Sun, 30 Nov 2025 15:00:29 -0500 Subject: [PATCH] runtime (gc_blocks.go): make sweep branchless Instead of looping over each block, we can use bit hacks to operate on an entire state byte. This deinterleaves the state bits in order to enable these tricks. --- builder/sizes_test.go | 6 +- src/runtime/gc_blocks.go | 144 +++++++++++++++++++++------------------ 2 files changed, 79 insertions(+), 71 deletions(-) diff --git a/builder/sizes_test.go b/builder/sizes_test.go index c1d44f443d..d9a1de0a4b 100644 --- a/builder/sizes_test.go +++ b/builder/sizes_test.go @@ -42,9 +42,9 @@ func TestBinarySize(t *testing.T) { // This is a small number of very diverse targets that we want to test. tests := []sizeTest{ // microcontrollers - {"hifive1b", "examples/echo", 3896, 280, 0, 2268}, - {"microbit", "examples/serial", 2860, 360, 8, 2272}, - {"wioterminal", "examples/pininterrupt", 7361, 1491, 116, 6912}, + {"hifive1b", "examples/echo", 3848, 296, 0, 2268}, + {"microbit", "examples/serial", 2844, 376, 8, 2272}, + {"wioterminal", "examples/pininterrupt", 7301, 1507, 116, 6912}, // TODO: also check wasm. 
Right now this is difficult, because // wasm binaries are run through wasm-opt and therefore the diff --git a/src/runtime/gc_blocks.go b/src/runtime/gc_blocks.go index 408656b258..e7ca5bdcc6 100644 --- a/src/runtime/gc_blocks.go +++ b/src/runtime/gc_blocks.go @@ -71,19 +71,20 @@ var zeroSizedAlloc uint8 type blockState uint8 const ( - blockStateFree blockState = 0 // 00 - blockStateHead blockState = 1 // 01 - blockStateTail blockState = 2 // 10 - blockStateMark blockState = 3 // 11 - blockStateMask blockState = 3 // 11 + blockStateLow blockState = 1 + blockStateHigh blockState = 1 << blocksPerStateByte + + blockStateFree blockState = 0 + blockStateHead blockState = blockStateLow + blockStateTail blockState = blockStateHigh + blockStateMark blockState = blockStateLow | blockStateHigh + blockStateMask blockState = blockStateLow | blockStateHigh ) +const blockStateEach = 1<>((b%blocksPerStateByte)*stateBits)) & blockStateMask + return blockState(stateByte>>(b%blocksPerStateByte)) & blockStateMask } // State returns the current block state. @@ -193,38 +194,12 @@ func (b gcBlock) state() blockState { // from head to mark. func (b gcBlock) setState(newState blockState) { stateBytePtr := (*uint8)(unsafe.Add(metadataStart, b/blocksPerStateByte)) - *stateBytePtr |= uint8(newState << ((b % blocksPerStateByte) * stateBits)) + *stateBytePtr |= uint8(newState << (b % blocksPerStateByte)) if gcAsserts && b.state() != newState { runtimePanic("gc: setState() was not successful") } } -// markFree sets the block state to free, no matter what state it was in before. 
-func (b gcBlock) markFree() { - stateBytePtr := (*uint8)(unsafe.Add(metadataStart, b/blocksPerStateByte)) - *stateBytePtr &^= uint8(blockStateMask << ((b % blocksPerStateByte) * stateBits)) - if gcAsserts && b.state() != blockStateFree { - runtimePanic("gc: markFree() was not successful") - } - if gcAsserts { - *(*[wordsPerBlock]uintptr)(unsafe.Pointer(b.address())) = [wordsPerBlock]uintptr{} - } -} - -// unmark changes the state of the block from mark to head. It must be marked -// before calling this function. -func (b gcBlock) unmark() { - if gcAsserts && b.state() != blockStateMark { - runtimePanic("gc: unmark() on a block that is not marked") - } - clearMask := blockStateMask ^ blockStateHead // the bits to clear from the state - stateBytePtr := (*uint8)(unsafe.Add(metadataStart, b/blocksPerStateByte)) - *stateBytePtr &^= uint8(clearMask << ((b % blocksPerStateByte) * stateBits)) - if gcAsserts && b.state() != blockStateHead { - runtimePanic("gc: unmark() was not successful") - } -} - func isOnHeap(ptr uintptr) bool { return ptr >= heapStart && ptr < uintptr(metadataStart) } @@ -685,36 +660,69 @@ func markRoot(addr, root uintptr) { // Sweep goes through all memory and frees unmarked memory. // It returns how many bytes are free in the heap after the sweep. func sweep() (freeBytes uintptr) { - freeCurrentObject := false - var freed uint64 - for block := gcBlock(0); block < endBlock; block++ { - switch block.state() { - case blockStateHead: - // Unmarked head. Free it, including all tail blocks following it. - block.markFree() - freeCurrentObject = true - gcFrees++ - freed++ - case blockStateTail: - if freeCurrentObject { - // This is a tail object following an unmarked head. - // Free it now. - block.markFree() - freed++ - } - case blockStateMark: - // This is a marked object. The next tail blocks must not be freed, - // but the mark bit must be removed so the next GC cycle will - // collect this object if it is unreferenced then. 
- block.unmark() - freeCurrentObject = false - case blockStateFree: - freeBytes += bytesPerBlock - } - } - gcFreedBlocks += freed - freeBytes += uintptr(freed) * bytesPerBlock - return + endBlock := endBlock + metadataEnd := unsafe.Add(metadataStart, (endBlock+(blocksPerStateByte-1))/blocksPerStateByte) + var oldFreeBlocks, freedHeads, freedTails uintptr + var carry byte + // Pre-subtract the blocks that do not actually exist from oldFreeBlocks. + oldFreeBlocks -= (blocksPerStateByte - 1) - uintptr(endBlock+(blocksPerStateByte-1))%blocksPerStateByte + for meta := metadataStart; meta != metadataEnd; meta = unsafe.Add(meta, 1) { + // Fetch the state byte. + stateBytePtr := (*byte)(unsafe.Pointer(meta)) + stateByte := *stateBytePtr + + // Count existing free blocks in the state byte. + lowState := stateByte & blockStateEach + highState := stateByte >> blocksPerStateByte + freeBlocks := lowState | highState + oldFreeBlocks += uintptr(count4LUT[freeBlocks]) + + // Count unmarked heads in the state byte. + unmarkedHeads := lowState &^ highState + freedHeads += uintptr(count4LUT[unmarkedHeads]) + + // Identify and separate live and free tails. + // Adding 1 to a run of bits will clear the run. + // We can use this to clear tails after a freed head. + tails := highState &^ lowState + tailClear := tails + (unmarkedHeads << 1) + carry + carry = tailClear >> blocksPerStateByte + freedTails += uintptr(count4LUT[tails&^tailClear]) + tails &= tailClear + + // Construct the new state byte. + markedHeads := highState & lowState + *stateBytePtr = markedHeads | (tails << blocksPerStateByte) + } + + // Update the GC metrics. + gcFrees += uint64(freedHeads) + freedBlocks := freedHeads + freedTails + gcFreedBlocks += uint64(freedBlocks) + freeBlocks := oldFreeBlocks + freedBlocks + + return freeBlocks * bytesPerBlock +} + +// count4LUT is a lookup table used to count set bits in a 4-bit mask. 
+// TODO: replace with popcnt when available +var count4LUT = [16]uint8{ + 0b0000: 0, + 0b0001: 1, + 0b0010: 1, + 0b0011: 2, + 0b0100: 1, + 0b0101: 2, + 0b0110: 2, + 0b0111: 3, + 0b1000: 1, + 0b1001: 2, + 0b1010: 2, + 0b1011: 3, + 0b1100: 2, + 0b1101: 3, + 0b1110: 3, + 0b1111: 4, } // dumpHeap can be used for debugging purposes. It dumps the state of each heap