Skip to content

Commit f58cbff

Browse files
committed
util/eventbus: use unbounded event queues for DeliveredEvents in subscribers
Bounded DeliveredEvent queues reduce memory usage, but they can deadlock under load. Two common scenarios trigger deadlocks when the number of events published in a short period exceeds twice the queue capacity (there's a PublishedEvent queue of the same size): - a subscriber tries to acquire the same mutex as held by a publisher, or - a subscriber for A events publishes B events Avoiding these scenarios is not practical and would limit eventbus usefulness and reduce its adoption, pushing us back to callbacks and other legacy mechanisms. These deadlocks already occurred in customer devices, dev machines, and tests. They also make it harder to identify and fix slow subscribers and similar issues we have been seeing recently. Choosing an arbitrary large fixed queue capacity would only mask the problem. A client running on a sufficiently large and complex customer environment can exceed any meaningful constant limit, since event volume depends on the number of peers and other factors. Behavior also changes based on scheduling of publishers and subscribers by the Go runtime, OS, and hardware, as the issue is essentially a race between publishers and subscribers. Additionally, on lower-end devices, an unreasonably high constant capacity is practically the same as using unbounded queues. Therefore, this PR changes the event queue implementation to be unbounded by default. The PublishedEvent queue keeps its existing capacity of 16 items, while subscribers' DeliveredEvent queues become unbounded. This change fixes known deadlocks and makes the system stable under load, at the cost of higher potential memory usage, including cases where a queue grows during an event burst and does not shrink when load decreases. Further improvements can be implemented in the future as needed. Fixes tailscale#17973 Fixes tailscale#18012 Signed-off-by: Nick Khyl <nickk@tailscale.com> (cherry picked from commit 1ccece0)
1 parent f2100e2 commit f58cbff

File tree

3 files changed

+16
-39
lines changed

3 files changed

+16
-39
lines changed

util/eventbus/bus.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,14 @@ func (b *Bus) Close() {
120120
}
121121

122122
func (b *Bus) pump(ctx context.Context) {
123-
var vals queue[PublishedEvent]
123+
// Limit how many published events we can buffer in the PublishedEvent queue.
124+
//
125+
// Subscribers have unbounded DeliveredEvent queues (see tailscale/tailscale#18020),
126+
// so this queue doesn't need to be unbounded. Keeping it bounded may also help
127+
// catch cases where subscribers stop pumping events completely, such as due to a bug
128+
// in [subscribeState.pump], [Subscriber.dispatch], or [SubscriberFunc.dispatch]).
129+
const maxPublishedEvents = 16
130+
vals := queue[PublishedEvent]{capacity: maxPublishedEvents}
124131
acceptCh := func() chan PublishedEvent {
125132
if vals.Full() {
126133
return nil

util/eventbus/bus_test.go

Lines changed: 2 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -547,23 +547,8 @@ func TestRegression(t *testing.T) {
547547
})
548548
}
549549

550-
const (
551-
maxQueuedItems = 16 // same as in queue.go
552-
totalMaxQueuedItems = maxQueuedItems * 2 // both publisher and subscriber sides
553-
)
554-
555550
func TestPublishWithMutex(t *testing.T) {
556-
t.Run("FewEvents", func(t *testing.T) {
557-
// As of 2025-11-20, publishing up to [totalMaxQueuedItems] is fine.
558-
testPublishWithMutex(t, totalMaxQueuedItems)
559-
})
560-
t.Run("ManyEvents", func(t *testing.T) {
561-
// As of 2025-11-20, publishing more than [totalMaxQueuedItems] may deadlock.
562-
t.Skip("TODO: fix deadlock in https://github.com/tailscale/tailscale/issues/17973")
563-
564-
const N = 3 // N larger than one increases the chance of deadlock.
565-
testPublishWithMutex(t, totalMaxQueuedItems+N)
566-
})
551+
testPublishWithMutex(t, 1024) // arbitrary large number of events
567552
}
568553

569554
// testPublishWithMutex publishes the specified number of events,
@@ -590,13 +575,10 @@ func testPublishWithMutex(t *testing.T, n int) {
590575
var mu sync.Mutex
591576
eventbus.SubscribeFunc[EventA](c, func(e EventA) {
592577
// Acquire the same mutex as the publisher.
593-
// As of 2025-11-20, this can deadlock if n is large enough
594-
// and event queues fill up.
595578
mu.Lock()
596579
mu.Unlock()
597580

598581
// Mark event as received, so we can check for lost events.
599-
// Not required for the deadlock to occur.
600582
exp.Got(e)
601583
})
602584

@@ -619,17 +601,7 @@ func testPublishWithMutex(t *testing.T, n int) {
619601
}
620602

621603
func TestPublishFromSubscriber(t *testing.T) {
622-
t.Run("FewEvents", func(t *testing.T) {
623-
// Publishing up to [totalMaxQueuedItems]-1 is fine.
624-
testPublishFromSubscriber(t, totalMaxQueuedItems-1)
625-
})
626-
t.Run("ManyEvents", func(t *testing.T) {
627-
// As of 2025-11-20, publishing more than [totalMaxQueuedItems] may deadlock.
628-
t.Skip("TODO: fix deadlock in https://github.com/tailscale/tailscale/issues/18012")
629-
630-
// Using 2x to increase chance of deadlock.
631-
testPublishFromSubscriber(t, totalMaxQueuedItems*2)
632-
})
604+
testPublishFromSubscriber(t, 1024) // arbitrary large number of events
633605
}
634606

635607
// testPublishFromSubscriber publishes the specified number of EventA events.
@@ -655,8 +627,6 @@ func testPublishFromSubscriber(t *testing.T, n int) {
655627

656628
eventbus.SubscribeFunc[EventA](c, func(e EventA) {
657629
// Upon receiving EventA, publish EventB.
658-
// As of 2025-11-20, this can deadlock if n is large enough
659-
// and event queues fill up.
660630
pubB.Publish(EventB{Counter: e.Counter})
661631
})
662632
eventbus.SubscribeFunc[EventB](c, func(e EventB) {

util/eventbus/queue.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,18 @@ import (
77
"slices"
88
)
99

10-
const maxQueuedItems = 16
11-
12-
// queue is an ordered queue of length up to maxQueuedItems.
10+
// queue is an ordered queue of length up to capacity,
11+
// if capacity is non-zero. Otherwise it is unbounded.
1312
type queue[T any] struct {
14-
vals []T
15-
start int
13+
vals []T
14+
start int
15+
capacity int // zero means unbounded
1616
}
1717

1818
// canAppend reports whether a value can be appended to q.vals without
1919
// shifting values around.
2020
func (q *queue[T]) canAppend() bool {
21-
return cap(q.vals) < maxQueuedItems || len(q.vals) < cap(q.vals)
21+
return q.capacity == 0 || cap(q.vals) < q.capacity || len(q.vals) < cap(q.vals)
2222
}
2323

2424
func (q *queue[T]) Full() bool {

0 commit comments

Comments
 (0)