[azeventhubs] Adding in prefetch value for the Processor, updating stress tests (Azure#19786)

richardpark-msft · web-flow · commit 4474d1e0354d · 2023-01-11T21:23:44.000Z
- Added the option to configure prefetch count for ProcessorPartitionClients created from the Processor - Fixed some issues with the stress tests. Batch tests were using a really large batch size, which was too large for the amount of memory we're using in our test containers. - Processor tests weren't configuring a prefetch value (prior to this PR the setting wasn't exposed). Also, just generally made the stress tests a bit cleaner, leveraging the scenario-matrix more for configuring values. Fixes Azure#19770
diff --git a/sdk/messaging/azeventhubs/CHANGELOG.md b/sdk/messaging/azeventhubs/CHANGELOG.md
@@ -4,6 +4,8 @@
 
 ### Features Added
 
+- Adds ProcessorOptions.Prefetch field, allowing configuration of Prefetch values for PartitionClients created using the Processor. (PR#19786)
+
 ### Breaking Changes
 
 ### Bugs Fixed
diff --git a/sdk/messaging/azeventhubs/internal/eh/stress/Chart.lock b/sdk/messaging/azeventhubs/internal/eh/stress/Chart.lock
@@ -1,6 +1,6 @@
 dependencies:
 - name: stress-test-addons
   repository: https://stresstestcharts.blob.core.windows.net/helm/
-  version: 0.1.21
-digest: sha256:ac7f0861fd54ebba0e3fec92c1fecc058e96f58df9094641605dd4250a7a423f
-generated: "2022-10-26T23:28:36.470982569Z"
+  version: 0.2.0
+digest: sha256:59fff3930e78c4ca9f9c0120433c7695d31db63f36ac61d50abcc91b1f1835a0
+generated: "2023-01-06T23:57:35.96426915Z"
diff --git a/sdk/messaging/azeventhubs/internal/eh/stress/scenarios-matrix.yaml b/sdk/messaging/azeventhubs/internal/eh/stress/scenarios-matrix.yaml
@@ -6,13 +6,39 @@ matrix:
   scenarios:
     batch:
       testTarget: batch
+      type: "batch"
+      rounds: 100
+      prefetch: 0
+      verbose: ""
+      sleepAfter: "5m"
     batchprefetchoff:
-      testTarget: batchprefetchoff
+      testTarget: batch
+      rounds: 100
+      prefetch: -1
+      verbose: ""
+      sleepAfter: "5m"
     batchinfinite:
-      testTarget: batchinfinite
+      testTarget: batch
+      type: "batch"
+      rounds: -1 # no round limit; keeps the "-rounds -1" the old batchinfinite template branch passed
+      prefetch: 0
+      verbose: ""
+      sleepAfter: "5m"
     processor:
       testTarget: processor
+      rounds: 100
+      prefetch: 0
+      verbose: ""
+      sleepAfter: "5m"
     processorprefetchoff:
-      testTarget: processorprefetchoff
+      testTarget: processor
+      rounds: 100
+      prefetch: -1
+      verbose: ""
+      sleepAfter: "5m"
     processorinfinite:
-      testTarget: processorinfinite
+      testTarget: processor
+      rounds: -1 # -1 means math.MaxInt64 rounds (see the processor tester's -rounds flag help)
+      prefetch: 0
+      verbose: ""
+      sleepAfter: "5m"
diff --git a/sdk/messaging/azeventhubs/internal/eh/stress/templates/deploy-job.yaml b/sdk/messaging/azeventhubs/internal/eh/stress/templates/deploy-job.yaml
@@ -29,37 +29,14 @@ spec:
       # NOTE: -verbose activates _all_ the Azure internal logging, which can get quite large.
       # so it's not enabled for every target in here. We also have an issue filed to whittle it
       # down (https://github.com/Azure/azure-sdk-for-go/issues/19459)
-      {{ if (eq .Stress.testTarget "batch") }}
-      - batch
+      - "{{.Stress.testTarget}}"
       - "-rounds"
-      - "100"
-      - "-verbose"
-      {{ else if (eq .Stress.testTarget "batchprefetchoff") }}
-      - batch
-      - "-rounds"
-      - "100"
-      - "-prefetch"
-      - "-1"
-      - "-verbose"
-      {{ else if (eq .Stress.testTarget "batchinfinite") }}
-      - batch
-      - "-rounds"
-      - "-1"      
-      {{ else if (eq .Stress.testTarget "processor") }}
-      - processor
-      - "-rounds"
-      - "100"
-      {{ else if (eq .Stress.testTarget "processorprefetchoff") }}
-      - processor
-      - "-rounds"
-      - "100"
+      - "{{.Stress.rounds}}"
       - "-prefetch"
-      - "-1"
-      {{ else if (eq .Stress.testTarget "processorinfinite") }}
-      - processor
-      - "-rounds"
-      - "-1"
-      {{- end -}}
+      - "{{.Stress.prefetch}}"
+      - "-sleepAfter"
+      - "{{.Stress.sleepAfter}}"
+      - "{{.Stress.verbose}}" # must stay last: renders as "" when unset, and Go's flag parsing stops at the first non-flag argument
       {{- include "stress-test-addons.container-env" . | nindent 6 }}
 {{- end -}}
 
diff --git a/sdk/messaging/azeventhubs/internal/eh/stress/tests/batch_stress_tester.go b/sdk/messaging/azeventhubs/internal/eh/stress/tests/batch_stress_tester.go
@@ -39,7 +39,7 @@ func getBatchTesterParams(args []string) (batchTesterParams, error) {
 	// Look in ../templates/deploy-job.yaml for some of the other parameter variations we use in stress/longevity
 	// testing.
 	fs.IntVar(&params.numToSend, "send", 1000000, "Number of events to send.")
-	fs.IntVar(&params.batchSize, "receive", 1000000, "Size to request each time we call ReceiveEvents()")
+	fs.IntVar(&params.batchSize, "receive", 1000, "Size to request each time we call ReceiveEvents(). Higher batch sizes will require higher amounts of memory for this test.")
 	fs.StringVar(&batchDurationStr, "timeout", "60s", "Time to wait for each batch (ie: 1m, 30s, etc..)")
 	prefetch := fs.Int("prefetch", 0, "Number of events to set for the prefetch. Negative numbers disable prefetch altogether. 0 uses the default for the package.")
 
@@ -48,8 +48,9 @@ func getBatchTesterParams(args []string) (batchTesterParams, error) {
 	fs.StringVar(&params.partitionID, "partition", "0", "Partition ID to send and receive events to")
 	fs.IntVar(&params.maxDeadlineExceeded, "maxtimeouts", 10, "Number of consecutive receive timeouts allowed before quitting")
 	fs.BoolVar(&params.enableVerboseLogging, "verbose", false, "enable verbose azure sdk logging")
+	sleepAfterFn := addSleepAfterFlag(fs)
 
-	if err := fs.Parse(os.Args[2:]); errors.Is(err, flag.ErrHelp) {
+	if err := fs.Parse(os.Args[2:]); err != nil {
 		fs.PrintDefaults()
 		return batchTesterParams{}, err
 	}
@@ -68,6 +69,7 @@ func getBatchTesterParams(args []string) (batchTesterParams, error) {
 	}
 
 	params.batchDuration = batchDuration
+	params.sleepAfterFn = sleepAfterFn
 
 	return params, nil
 }
@@ -81,6 +83,8 @@ func BatchStressTester(ctx context.Context) error {
 		return err
 	}
 
+	defer params.sleepAfterFn()
+
 	testData, err := newStressTestData("batch", params.enableVerboseLogging, map[string]string{
 		"BatchDuration":       params.batchDuration.String(),
 		"BatchSize":           fmt.Sprintf("%d", params.batchSize),
@@ -160,6 +164,7 @@ type batchTesterParams struct {
 	prefetch             int32
 	maxDeadlineExceeded  int
 	enableVerboseLogging bool
+	sleepAfterFn         func()
 }
 
 func consumeForBatchTester(ctx context.Context, round int64, cc *azeventhubs.ConsumerClient, sp azeventhubs.StartPosition, params batchTesterParams, testData *stressTestData) error {
diff --git a/sdk/messaging/azeventhubs/internal/eh/stress/tests/processor_stress_tester.go b/sdk/messaging/azeventhubs/internal/eh/stress/tests/processor_stress_tester.go
@@ -39,6 +39,9 @@ type processorStressTest struct {
 	eventsPerRound int
 	rounds         int64
 
+	prefetch     int32
+	sleepAfterFn func()
+
 	checkpointStore azeventhubs.CheckpointStore
 }
 
@@ -48,9 +51,11 @@ func newProcessorStressTest(args []string) (*processorStressTest, error) {
 	numProcessors := fs.Int("processors", 1, "Number of processors to run, concurrently")
 	eventsPerRound := fs.Int("send", 5000, "Number of events to send per round")
 	rounds := fs.Int64("rounds", 100, "Number of rounds. -1 means math.MaxInt64")
+	prefetch := fs.Int("prefetch", 0, "Number of events to set for the prefetch. Negative numbers disable prefetch altogether. 0 uses the default for the package.")
 	enableVerboseLogging := fs.Bool("verbose", false, "enable verbose azure sdk logging")
+	sleepAfterFn := addSleepAfterFlag(fs)
 
-	if err := fs.Parse(args); errors.Is(err, flag.ErrHelp) {
+	if err := fs.Parse(args); err != nil {
 		fs.PrintDefaults()
 		return nil, err
 	}
@@ -63,6 +68,7 @@ func newProcessorStressTest(args []string) (*processorStressTest, error) {
 		"Processors":     fmt.Sprintf("%d", numProcessors),
 		"EventsPerRound": fmt.Sprintf("%d", eventsPerRound),
 		"Rounds":         fmt.Sprintf("%d", rounds),
+		"Prefetch":       fmt.Sprintf("%d", *prefetch),
 	})
 
 	if err != nil {
@@ -84,6 +90,8 @@ func newProcessorStressTest(args []string) (*processorStressTest, error) {
 		eventsPerRound:  *eventsPerRound,
 		rounds:          *rounds,
 		checkpointStore: blobStore,
+		prefetch:        int32(*prefetch),
+		sleepAfterFn:    sleepAfterFn,
 	}, nil
 }
 
@@ -93,6 +101,8 @@ func (inf *processorStressTest) Run(ctx context.Context) error {
 		inf.eventsPerRound,
 		inf.containerName)
 
+	defer inf.sleepAfterFn()
+
 	checkpoints, err := initCheckpointStore(ctx, inf.containerName, inf.stressTestData)
 
 	if err != nil {
@@ -123,7 +133,7 @@ func (inf *processorStressTest) Run(ctx context.Context) error {
 				}
 
 				go func() {
-					if err := inf.receiveForever(ctx, partClient, logger); err != nil {
+					if err := inf.receiveForever(ctx, partClient, logger, inf.eventsPerRound); err != nil {
 						inf.TC.TrackException(err)
 						panic(err)
 					}
@@ -194,7 +204,8 @@ func (inf *processorStressTest) Run(ctx context.Context) error {
 		// start checking the checkpoint store to see how far along we are, and when
 		// we're at the end.
 		for {
-			header := fmt.Sprintf("round %d, elapsed %s", round, time.Since(start)/time.Second)
+			var elapsed = time.Since(start) / time.Second
+			header := fmt.Sprintf("round %d, elapsed %d seconds", round, elapsed)
 			output, done, err := inf.report(ctx, header, endPositions)
 
 			if err != nil {
@@ -219,7 +230,7 @@ func (inf *processorStressTest) Run(ctx context.Context) error {
 	return nil
 }
 
-func (inf *processorStressTest) receiveForever(ctx context.Context, partClient *azeventhubs.ProcessorPartitionClient, logger logf) error {
+func (inf *processorStressTest) receiveForever(ctx context.Context, partClient *azeventhubs.ProcessorPartitionClient, logger logf, eventsPerRound int) error {
 	defer func() {
 		logger("Closing")
 
@@ -233,9 +244,11 @@ func (inf *processorStressTest) receiveForever(ctx context.Context, partClient *
 
 	logger("Starting receive loop")
 
+	batchSize := int(math.Min(float64(eventsPerRound), 100))
+
 	for {
 		receiveCtx, cancelReceive := context.WithCancel(ctx)
-		events, err := partClient.ReceiveEvents(receiveCtx, 100, nil)
+		events, err := partClient.ReceiveEvents(receiveCtx, batchSize, nil)
 		cancelReceive()
 
 		if errors.Is(err, context.DeadlineExceeded) && ctx.Err() == nil {
@@ -359,7 +372,9 @@ func (inf *processorStressTest) newProcessorForTest(ctx context.Context) (*azeve
 		return nil, nil, err
 	}
 
-	processor, err := azeventhubs.NewProcessor(cc, cps, nil)
+	processor, err := azeventhubs.NewProcessor(cc, cps, &azeventhubs.ProcessorOptions{
+		Prefetch: inf.prefetch,
+	})
 
 	if err != nil {
 		return nil, nil, err
diff --git a/sdk/messaging/azeventhubs/internal/eh/stress/tests/shared.go b/sdk/messaging/azeventhubs/internal/eh/stress/tests/shared.go
@@ -5,6 +5,7 @@ package tests
 import (
 	"context"
 	"errors"
+	"flag"
 	"fmt"
 	"log"
 	"os"
@@ -381,3 +382,30 @@ func enableVerboseLogging() {
 		log.Printf("[%s] %s", e, s)
 	})
 }
+
+// addSleepAfterFlag registers a "-sleepAfter" duration flag on fs and returns a
+// function that sleeps for that duration when called. The flag value is only
+// read when the returned function runs, so it must be called after fs.Parse().
+//
+// An unparseable duration is logged and skipped rather than treated as fatal;
+// a zero or negative duration returns immediately.
+func addSleepAfterFlag(fs *flag.FlagSet) func() {
+	var durationStr string
+	fs.StringVar(&durationStr, "sleepAfter", "0m", "Time to sleep after test completes")
+
+	return func() {
+		sleepAfter, err := time.ParseDuration(durationStr)
+
+		if err != nil {
+			// Log the raw flag text: sleepAfter is always the zero Duration when parsing fails.
+			log.Printf("Invalid sleepAfter duration given: %q: %s", durationStr, err)
+			return
+		}
+
+		if sleepAfter > 0 {
+			log.Printf("Sleeping for %s", sleepAfter)
+			time.Sleep(sleepAfter)
+			log.Printf("Done sleeping for %s", sleepAfter)
+		}
+	}
+}
diff --git a/sdk/messaging/azeventhubs/processor.go b/sdk/messaging/azeventhubs/processor.go
@@ -55,6 +55,16 @@ type ProcessorOptions struct {
 	// from partition clients with a lower OwnerLevel.
 	// Default is 0.
 	OwnerLevel int64
+
+	// Prefetch represents the size of the internal prefetch buffer for each ProcessorPartitionClient
+	// created by this Processor. When set, this client will attempt to always maintain
+	// an internal cache of events of this size, asynchronously, increasing the odds that
+	// ReceiveEvents() will use a locally stored cache of events, rather than having to
+	// wait for events to arrive from the network.
+	//
+	// Defaults to 300 events if Prefetch == 0.
+	// Disabled if Prefetch < 0.
+	Prefetch int32
 }
 
 // StartPositions are used if there is no checkpoint for a partition in
@@ -82,6 +92,7 @@ type Processor struct {
 	defaultStartPositions   StartPositions
 	checkpointStore         CheckpointStore
 	ownerLevel              int64
+	prefetch                int32
 
 	// consumerClient is actually a *azeventhubs.ConsumerClient
 	// it's an interface here to make testing easier.
@@ -156,6 +167,7 @@ func newProcessorImpl(consumerClient consumerClientForProcessor, checkpointStore
 			PerPartition: startPosPerPartition,
 			Default:      options.StartPositions.Default,
 		},
+		prefetch:              options.Prefetch,
 		consumerClientDetails: consumerClient.getDetails(),
 		runCalled:             make(chan struct{}),
 		lb:                    newProcessorLoadBalancer(checkpointStore, consumerClient.getDetails(), strategy, partitionDurationExpiration),
@@ -327,6 +339,7 @@ func (p *Processor) addPartitionClient(ctx context.Context, ownership Ownership,
 	partClient, err := p.consumerClient.NewPartitionClient(ownership.PartitionID, &PartitionClientOptions{
 		StartPosition: sp,
 		OwnerLevel:    &p.ownerLevel,
+		Prefetch:      p.prefetch,
 	})
 
 	if err != nil {