Skip to content

Commit 2a301a3

Browse files
authored
[maintnotif] Cluster specific handlers (#3613)
* maint notification handlers for cluster messages * unrelax all conns * trigger ci on feature branches
1 parent fd437ce commit 2a301a3

File tree

7 files changed

+244
-19
lines changed

7 files changed

+244
-19
lines changed

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ on:
44
push:
55
branches: [master, v9, 'v9.*']
66
pull_request:
7-
branches: [master, v9, v9.7, v9.8, 'ndyakov/*', 'ofekshenawa/*', 'htemelski-redis/*', 'ce/*']
7+
branches: [master, v9, v9.7, v9.8, 'ndyakov/**', 'ofekshenawa/**', 'ce/**']
88

99
permissions:
1010
contents: read

internal/maintnotifications/logs/log_messages.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,11 @@ const (
121121
UnrelaxedTimeoutMessage = "clearing relaxed timeout"
122122
ManagerNotInitializedMessage = "manager not initialized"
123123
FailedToMarkForHandoffMessage = "failed to mark connection for handoff"
124+
InvalidSeqIDInSMigratingNotificationMessage = "invalid SeqID in SMIGRATING notification"
125+
InvalidSeqIDInSMigratedNotificationMessage = "invalid SeqID in SMIGRATED notification"
126+
InvalidHostPortInSMigratedNotificationMessage = "invalid host:port in SMIGRATED notification"
127+
SlotMigratingMessage = "slots migrating, applying relaxed timeout"
128+
SlotMigratedMessage = "slots migrated, triggering cluster state reload"
124129

125130
// ========================================
126131
// used in pool/conn
@@ -623,3 +628,43 @@ func ExtractDataFromLogMessage(logMessage string) map[string]interface{} {
623628
// If JSON parsing fails, return empty map
624629
return result
625630
}
631+
632+
// Cluster notification functions
633+
func InvalidSeqIDInSMigratingNotification(seqID interface{}) string {
634+
message := fmt.Sprintf("%s: %v", InvalidSeqIDInSMigratingNotificationMessage, seqID)
635+
return appendJSONIfDebug(message, map[string]interface{}{
636+
"seqID": fmt.Sprintf("%v", seqID),
637+
})
638+
}
639+
640+
func InvalidSeqIDInSMigratedNotification(seqID interface{}) string {
641+
message := fmt.Sprintf("%s: %v", InvalidSeqIDInSMigratedNotificationMessage, seqID)
642+
return appendJSONIfDebug(message, map[string]interface{}{
643+
"seqID": fmt.Sprintf("%v", seqID),
644+
})
645+
}
646+
647+
func InvalidHostPortInSMigratedNotification(hostPort interface{}) string {
648+
message := fmt.Sprintf("%s: %v", InvalidHostPortInSMigratedNotificationMessage, hostPort)
649+
return appendJSONIfDebug(message, map[string]interface{}{
650+
"hostPort": fmt.Sprintf("%v", hostPort),
651+
})
652+
}
653+
654+
func SlotMigrating(connID uint64, seqID int64, slotRanges []string) string {
655+
message := fmt.Sprintf("conn[%d] %s seqID=%d slots=%v", connID, SlotMigratingMessage, seqID, slotRanges)
656+
return appendJSONIfDebug(message, map[string]interface{}{
657+
"connID": connID,
658+
"seqID": seqID,
659+
"slotRanges": slotRanges,
660+
})
661+
}
662+
663+
func SlotMigrated(seqID int64, hostPort string, slotRanges []string) string {
664+
message := fmt.Sprintf("%s seqID=%d host:port=%s slots=%v", SlotMigratedMessage, seqID, hostPort, slotRanges)
665+
return appendJSONIfDebug(message, map[string]interface{}{
666+
"seqID": seqID,
667+
"hostPort": hostPort,
668+
"slotRanges": slotRanges,
669+
})
670+
}

maintnotifications/README.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,14 @@
22

33
Seamless Redis connection handoffs during cluster maintenance operations without dropping connections.
44

5-
## ⚠️ **Important Note**
6-
**Maintenance notifications are currently supported only in standalone Redis clients.** Cluster clients (ClusterClient, FailoverClient, etc.) do not yet support this functionality.
5+
## Cluster Support
6+
7+
**Cluster notifications are now supported for ClusterClient!**
8+
9+
- **SMIGRATING**: `["SMIGRATING", SeqID, slot/range, ...]` - Relaxes timeouts when slots are being migrated
10+
- **SMIGRATED**: `["SMIGRATED", SeqID, host:port, slot/range, ...]` - Reloads cluster state when slot migration completes
11+
12+
**Note:** Other maintenance notifications (MOVING, MIGRATING, MIGRATED, FAILING_OVER, FAILED_OVER) are supported only in standalone Redis clients. Cluster clients support SMIGRATING and SMIGRATED for cluster-specific slot migration handling.
713

814
## Quick Start
915

maintnotifications/manager.go

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,13 @@ import (
1818

1919
// Push notification type constants for maintenance
2020
const (
21-
NotificationMoving = "MOVING"
22-
NotificationMigrating = "MIGRATING"
23-
NotificationMigrated = "MIGRATED"
24-
NotificationFailingOver = "FAILING_OVER"
25-
NotificationFailedOver = "FAILED_OVER"
21+
NotificationMoving = "MOVING" // Per-connection handoff notification
22+
NotificationMigrating = "MIGRATING" // Per-connection migration start notification - relaxes timeouts
23+
NotificationMigrated = "MIGRATED" // Per-connection migration complete notification - clears relaxed timeouts
24+
NotificationFailingOver = "FAILING_OVER" // Per-connection failover start notification - relaxes timeouts
25+
NotificationFailedOver = "FAILED_OVER" // Per-connection failover complete notification - clears relaxed timeouts
26+
NotificationSMigrating = "SMIGRATING" // Cluster slot migrating notification - relaxes timeouts
27+
NotificationSMigrated = "SMIGRATED" // Cluster slot migrated notification - triggers cluster state reload
2628
)
2729

2830
// maintenanceNotificationTypes contains all notification types that maintenance handles
@@ -32,6 +34,8 @@ var maintenanceNotificationTypes = []string{
3234
NotificationMigrated,
3335
NotificationFailingOver,
3436
NotificationFailedOver,
37+
NotificationSMigrating,
38+
NotificationSMigrated,
3539
}
3640

3741
// NotificationHook is called before and after notification processing
@@ -65,6 +69,10 @@ type Manager struct {
6569
// MOVING operation tracking - using sync.Map for better concurrent performance
6670
activeMovingOps sync.Map // map[MovingOperationKey]*MovingOperation
6771

72+
// SMIGRATED notification deduplication - tracks processed SeqIDs
73+
// Multiple connections may receive the same SMIGRATED notification
74+
processedSMigratedSeqIDs sync.Map // map[int64]bool
75+
6876
// Atomic state tracking - no locks needed for state queries
6977
activeOperationCount atomic.Int64 // Number of active operations
7078
closed atomic.Bool // Manager closed state
@@ -73,6 +81,9 @@ type Manager struct {
7381
hooks []NotificationHook
7482
hooksMu sync.RWMutex // Protects hooks slice
7583
poolHooksRef *PoolHook
84+
85+
// Cluster state reload callback for SMIGRATED notifications
86+
clusterStateReloadCallback ClusterStateReloadCallback
7687
}
7788

7889
// MovingOperation tracks an active MOVING operation.
@@ -83,6 +94,14 @@ type MovingOperation struct {
8394
Deadline time.Time
8495
}
8596

97+
// ClusterStateReloadCallback is a callback function that triggers cluster state reload.
98+
// This is used by node clients to notify their parent ClusterClient about SMIGRATED notifications.
99+
// The hostPort parameter indicates the destination node (e.g., "127.0.0.1:6379").
100+
// The slotRanges parameter contains the migrated slots (e.g., ["1234", "5000-6000"]).
101+
// Currently, implementations typically reload the entire cluster state, but in the future
102+
// this could be optimized to reload only the specific slots.
103+
type ClusterStateReloadCallback func(ctx context.Context, hostPort string, slotRanges []string)
104+
86105
// NewManager creates a new simplified manager.
87106
func NewManager(client interfaces.ClientInterface, pool pool.Pooler, config *Config) (*Manager, error) {
88107
if client == nil {
@@ -223,6 +242,15 @@ func (hm *Manager) GetActiveOperationCount() int64 {
223242
return hm.activeOperationCount.Load()
224243
}
225244

245+
// MarkSMigratedSeqIDProcessed attempts to mark a SMIGRATED SeqID as processed.
246+
// Returns true if this is the first time processing this SeqID (should process),
247+
// false if it was already processed (should skip).
248+
// This prevents duplicate processing when multiple connections receive the same notification.
249+
func (hm *Manager) MarkSMigratedSeqIDProcessed(seqID int64) bool {
250+
_, alreadyProcessed := hm.processedSMigratedSeqIDs.LoadOrStore(seqID, true)
251+
return !alreadyProcessed // Return true if NOT already processed
252+
}
253+
226254
// Close closes the manager.
227255
func (hm *Manager) Close() error {
228256
// Use atomic operation for thread-safe close check
@@ -318,3 +346,17 @@ func (hm *Manager) AddNotificationHook(notificationHook NotificationHook) {
318346
defer hm.hooksMu.Unlock()
319347
hm.hooks = append(hm.hooks, notificationHook)
320348
}
349+
350+
// SetClusterStateReloadCallback sets the callback function that will be called when a SMIGRATED notification is received.
351+
// This allows node clients to notify their parent ClusterClient to reload cluster state.
352+
func (hm *Manager) SetClusterStateReloadCallback(callback ClusterStateReloadCallback) {
353+
hm.clusterStateReloadCallback = callback
354+
}
355+
356+
// TriggerClusterStateReload calls the cluster state reload callback if it's set.
357+
// This is called when a SMIGRATED notification is received.
358+
func (hm *Manager) TriggerClusterStateReload(ctx context.Context, hostPort string, slotRanges []string) {
359+
if hm.clusterStateReloadCallback != nil {
360+
hm.clusterStateReloadCallback(ctx, hostPort, slotRanges)
361+
}
362+
}

maintnotifications/manager_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,8 @@ func TestManagerRefactoring(t *testing.T) {
217217
NotificationMigrated,
218218
NotificationFailingOver,
219219
NotificationFailedOver,
220+
NotificationSMigrating,
221+
NotificationSMigrated,
220222
}
221223

222224
if len(maintenanceNotificationTypes) != len(expectedTypes) {

maintnotifications/pool_hook_test.go

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -700,9 +700,25 @@ func TestConnectionHook(t *testing.T) {
700700
t.Errorf("Connection should be pooled after handoff (shouldPool=%v, shouldRemove=%v)", shouldPool, shouldRemove)
701701
}
702702

703-
// Wait for handoff to complete
704-
time.Sleep(50 * time.Millisecond)
703+
// Wait for handoff to complete with polling instead of fixed sleep
704+
// This avoids flakiness on slow CI runners where 50ms may not be enough
705+
maxWait := 500 * time.Millisecond
706+
pollInterval := 10 * time.Millisecond
707+
deadline := time.Now().Add(maxWait)
705708

709+
handoffCompleted := false
710+
for time.Now().Before(deadline) {
711+
if conn.IsUsable() && !processor.IsHandoffPending(conn) {
712+
handoffCompleted = true
713+
break
714+
}
715+
time.Sleep(pollInterval)
716+
}
717+
718+
if !handoffCompleted {
719+
t.Fatalf("Handoff did not complete within %v (IsUsable=%v, IsHandoffPending=%v)",
720+
maxWait, conn.IsUsable(), processor.IsHandoffPending(conn))
721+
}
706722
// After handoff completion, connection should be usable again
707723
if !conn.IsUsable() {
708724
t.Error("Connection should be usable after handoff completion")

maintnotifications/push_notification_handler.go

Lines changed: 123 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@ func (snh *NotificationHandler) HandlePushNotification(ctx context.Context, hand
4949
err = snh.handleFailingOver(ctx, handlerCtx, modifiedNotification)
5050
case NotificationFailedOver:
5151
err = snh.handleFailedOver(ctx, handlerCtx, modifiedNotification)
52+
case NotificationSMigrating:
53+
err = snh.handleSMigrating(ctx, handlerCtx, modifiedNotification)
54+
case NotificationSMigrated:
55+
err = snh.handleSMigrated(ctx, handlerCtx, modifiedNotification)
5256
default:
5357
// Ignore other notification types (e.g., pub/sub messages)
5458
err = nil
@@ -61,7 +65,9 @@ func (snh *NotificationHandler) HandlePushNotification(ctx context.Context, hand
6165
}
6266

6367
// handleMoving processes MOVING notifications.
64-
// ["MOVING", seqNum, timeS, endpoint] - per-connection handoff
68+
// MOVING indicates that a connection should be handed off to a new endpoint.
69+
// This is a per-connection notification that triggers connection handoff.
70+
// Expected format: ["MOVING", seqNum, timeS, endpoint]
6571
func (snh *NotificationHandler) handleMoving(ctx context.Context, handlerCtx push.NotificationHandlerContext, notification []interface{}) error {
6672
if len(notification) < 3 {
6773
internal.Logger.Printf(ctx, logs.InvalidNotification("MOVING", notification))
@@ -167,9 +173,10 @@ func (snh *NotificationHandler) markConnForHandoff(conn *pool.Conn, newEndpoint
167173
}
168174

169175
// handleMigrating processes MIGRATING notifications.
176+
// MIGRATING indicates that a connection migration is starting.
177+
// This is a per-connection notification that applies relaxed timeouts.
178+
// Expected format: ["MIGRATING", ...]
170179
func (snh *NotificationHandler) handleMigrating(ctx context.Context, handlerCtx push.NotificationHandlerContext, notification []interface{}) error {
171-
// MIGRATING notifications indicate that a connection is about to be migrated
172-
// Apply relaxed timeouts to the specific connection that received this notification
173180
if len(notification) < 2 {
174181
internal.Logger.Printf(ctx, logs.InvalidNotification("MIGRATING", notification))
175182
return ErrInvalidNotification
@@ -195,9 +202,10 @@ func (snh *NotificationHandler) handleMigrating(ctx context.Context, handlerCtx
195202
}
196203

197204
// handleMigrated processes MIGRATED notifications.
205+
// MIGRATED indicates that a connection migration has completed.
206+
// This is a per-connection notification that clears relaxed timeouts.
207+
// Expected format: ["MIGRATED", ...]
198208
func (snh *NotificationHandler) handleMigrated(ctx context.Context, handlerCtx push.NotificationHandlerContext, notification []interface{}) error {
199-
// MIGRATED notifications indicate that a connection migration has completed
200-
// Restore normal timeouts for the specific connection that received this notification
201209
if len(notification) < 2 {
202210
internal.Logger.Printf(ctx, logs.InvalidNotification("MIGRATED", notification))
203211
return ErrInvalidNotification
@@ -224,9 +232,10 @@ func (snh *NotificationHandler) handleMigrated(ctx context.Context, handlerCtx p
224232
}
225233

226234
// handleFailingOver processes FAILING_OVER notifications.
235+
// FAILING_OVER indicates that a failover is starting.
236+
// This is a per-connection notification that applies relaxed timeouts.
237+
// Expected format: ["FAILING_OVER", ...]
227238
func (snh *NotificationHandler) handleFailingOver(ctx context.Context, handlerCtx push.NotificationHandlerContext, notification []interface{}) error {
228-
// FAILING_OVER notifications indicate that a connection is about to failover
229-
// Apply relaxed timeouts to the specific connection that received this notification
230239
if len(notification) < 2 {
231240
internal.Logger.Printf(ctx, logs.InvalidNotification("FAILING_OVER", notification))
232241
return ErrInvalidNotification
@@ -253,9 +262,10 @@ func (snh *NotificationHandler) handleFailingOver(ctx context.Context, handlerCt
253262
}
254263

255264
// handleFailedOver processes FAILED_OVER notifications.
265+
// FAILED_OVER indicates that a failover has completed.
266+
// This is a per-connection notification that clears relaxed timeouts.
267+
// Expected format: ["FAILED_OVER", ...]
256268
func (snh *NotificationHandler) handleFailedOver(ctx context.Context, handlerCtx push.NotificationHandlerContext, notification []interface{}) error {
257-
// FAILED_OVER notifications indicate that a connection failover has completed
258-
// Restore normal timeouts for the specific connection that received this notification
259269
if len(notification) < 2 {
260270
internal.Logger.Printf(ctx, logs.InvalidNotification("FAILED_OVER", notification))
261271
return ErrInvalidNotification
@@ -280,3 +290,107 @@ func (snh *NotificationHandler) handleFailedOver(ctx context.Context, handlerCtx
280290
conn.ClearRelaxedTimeout()
281291
return nil
282292
}
293+
294+
// handleSMigrating processes SMIGRATING notifications.
295+
// SMIGRATING indicates that a cluster slot is in the process of migrating to a different node.
296+
// This is a per-connection notification that applies relaxed timeouts during slot migration.
297+
// Expected format: ["SMIGRATING", SeqID, slot/range1-range2, ...]
298+
func (snh *NotificationHandler) handleSMigrating(ctx context.Context, handlerCtx push.NotificationHandlerContext, notification []interface{}) error {
299+
if len(notification) < 3 {
300+
internal.Logger.Printf(ctx, logs.InvalidNotification("SMIGRATING", notification))
301+
return ErrInvalidNotification
302+
}
303+
304+
// Extract SeqID (position 1)
305+
seqID, ok := notification[1].(int64)
306+
if !ok {
307+
internal.Logger.Printf(ctx, logs.InvalidSeqIDInSMigratingNotification(notification[1]))
308+
return ErrInvalidNotification
309+
}
310+
311+
// Extract slot ranges (position 2+)
312+
// For now, we just extract them for logging
313+
// Format can be: single slot "1234" or range "100-200"
314+
var slotRanges []string
315+
for i := 2; i < len(notification); i++ {
316+
if slotRange, ok := notification[i].(string); ok {
317+
slotRanges = append(slotRanges, slotRange)
318+
}
319+
}
320+
321+
if handlerCtx.Conn == nil {
322+
internal.Logger.Printf(ctx, logs.NoConnectionInHandlerContext("SMIGRATING"))
323+
return ErrInvalidNotification
324+
}
325+
326+
conn, ok := handlerCtx.Conn.(*pool.Conn)
327+
if !ok {
328+
internal.Logger.Printf(ctx, logs.InvalidConnectionTypeInHandlerContext("SMIGRATING", handlerCtx.Conn, handlerCtx))
329+
return ErrInvalidNotification
330+
}
331+
332+
// Apply relaxed timeout to this specific connection
333+
if internal.LogLevel.InfoOrAbove() {
334+
internal.Logger.Printf(ctx, logs.SlotMigrating(conn.GetID(), seqID, slotRanges))
335+
}
336+
conn.SetRelaxedTimeout(snh.manager.config.RelaxedTimeout, snh.manager.config.RelaxedTimeout)
337+
return nil
338+
}
339+
340+
// handleSMigrated processes SMIGRATED notifications.
341+
// SMIGRATED indicates that a cluster slot has finished migrating to a different node.
342+
// This is a cluster-level notification that triggers cluster state reload.
343+
// Expected format: ["SMIGRATED", SeqID, host:port, slot1/range1-range2, ...]
344+
// Note: Multiple connections may receive the same notification, so we deduplicate by SeqID before triggering reload.
345+
// However, we still process the notification on each connection to clear the relaxed timeout.
346+
func (snh *NotificationHandler) handleSMigrated(ctx context.Context, handlerCtx push.NotificationHandlerContext, notification []interface{}) error {
347+
if len(notification) < 4 {
348+
internal.Logger.Printf(ctx, logs.InvalidNotification("SMIGRATED", notification))
349+
return ErrInvalidNotification
350+
}
351+
352+
// Extract SeqID (position 1)
353+
seqID, ok := notification[1].(int64)
354+
if !ok {
355+
internal.Logger.Printf(ctx, logs.InvalidSeqIDInSMigratedNotification(notification[1]))
356+
return ErrInvalidNotification
357+
}
358+
359+
// Deduplicate by SeqID - multiple connections may receive the same notification
360+
if snh.manager.MarkSMigratedSeqIDProcessed(seqID) {
361+
// Extract host:port (position 2)
362+
hostPort, ok := notification[2].(string)
363+
if !ok {
364+
internal.Logger.Printf(ctx, logs.InvalidHostPortInSMigratedNotification(notification[2]))
365+
return ErrInvalidNotification
366+
}
367+
368+
// Extract slot ranges (position 3+)
369+
// For now, we just extract them for logging
370+
// Format can be: single slot "1234" or range "100-200"
371+
var slotRanges []string
372+
for i := 3; i < len(notification); i++ {
373+
if slotRange, ok := notification[i].(string); ok {
374+
slotRanges = append(slotRanges, slotRange)
375+
}
376+
}
377+
378+
if internal.LogLevel.InfoOrAbove() {
379+
internal.Logger.Printf(ctx, logs.SlotMigrated(seqID, hostPort, slotRanges))
380+
}
381+
// Trigger cluster state reload via callback, passing host:port and slot ranges
382+
// For now, implementations just log these and trigger a full reload
383+
// In the future, this could be optimized to reload only the specific slots
384+
snh.manager.TriggerClusterStateReload(ctx, hostPort, slotRanges)
385+
}
386+
387+
// Clear the relaxed timeout on the connection that received this notification.
388+
if handlerCtx.Conn != nil {
389+
conn, ok := handlerCtx.Conn.(*pool.Conn)
390+
if ok {
391+
conn.ClearRelaxedTimeout()
392+
}
393+
}
394+
395+
return nil
396+
}

0 commit comments

Comments
 (0)