6 changes: 3 additions & 3 deletions Makefile
```diff
@@ -158,7 +158,7 @@ MANIFESTS ?= $(STANDARD_MANIFEST) $(STANDARD_E2E_MANIFEST) $(EXPERIMENTAL_MANIFE
 $(STANDARD_MANIFEST) ?= helm/cert-manager.yaml
 $(STANDARD_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/e2e.yaml
 $(EXPERIMENTAL_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml
-$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml
+$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml helm/replicas-2.yaml
 HELM_SETTINGS ?=
 .PHONY: $(MANIFESTS)
 $(MANIFESTS): $(HELM)
@@ -484,8 +484,8 @@ run-experimental: run-internal #HELP Build the operator-controller then deploy i
 CATD_NAMESPACE := olmv1-system
 .PHONY: wait
 wait:
-	kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=60s
-	kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert # Avoid upgrade test flakes when reissuing cert
+	kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=3m
+	kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert --timeout=3m # Avoid upgrade test flakes when reissuing cert

 .PHONY: docker-build
 docker-build: build-linux #EXHELP Build docker image for operator-controller and catalog with GOOS=linux and local GOARCH.
```
4 changes: 2 additions & 2 deletions hack/test/pre-upgrade-setup.sh
```diff
@@ -155,5 +155,5 @@ spec:
     version: 1.0.0
 EOF

-kubectl wait --for=condition=Serving --timeout=60s ClusterCatalog $TEST_CLUSTER_CATALOG_NAME
-kubectl wait --for=condition=Installed --timeout=60s ClusterExtension $TEST_CLUSTER_EXTENSION_NAME
+kubectl wait --for=condition=Serving --timeout=5m ClusterCatalog $TEST_CLUSTER_CATALOG_NAME
+kubectl wait --for=condition=Installed --timeout=5m ClusterExtension $TEST_CLUSTER_EXTENSION_NAME
```
Comment on lines +158 to +159

Contributor: are these changes really needed in this PR, i.e. can we do them separately?

Member Author: I guess not. I updated it since it failed during the test. You can check the test history.
catalogd controller-manager Deployment template:

```diff
@@ -12,11 +12,11 @@ metadata:
   namespace: {{ .Values.namespaces.olmv1.name }}
 spec:
   minReadySeconds: 5
-  replicas: 1
+  replicas: {{ .Values.options.catalogd.deployment.replicas }}
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
```
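For readers unfamiliar with Helm templating, the new `replicas` line is resolved by Go's template engine against the merged chart values. Below is a minimal sketch of just the substitution step, using Go's standard `text/template`; Helm's real pipeline layers several values files and adds extra functions, so this is illustrative only.

```go
package main

import (
	"os"
	"text/template"
)

func main() {
	// A stripped-down stand-in for the Deployment template above.
	const manifest = "spec:\n  replicas: {{ .Values.options.catalogd.deployment.replicas }}\n"

	// A stand-in for the merged chart values; the real chart has many more keys.
	values := map[string]any{
		"Values": map[string]any{
			"options": map[string]any{
				"catalogd": map[string]any{
					"deployment": map[string]any{"replicas": 2},
				},
			},
		},
	}

	tmpl := template.Must(template.New("deployment").Parse(manifest))
	// Prints "spec:\n  replicas: 2" — the value flows from the values files into the manifest.
	if err := tmpl.Execute(os.Stdout, values); err != nil {
		panic(err)
	}
}
```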
operator-controller controller-manager Deployment template:

```diff
@@ -11,11 +11,11 @@ metadata:
   name: operator-controller-controller-manager
   namespace: {{ .Values.namespaces.olmv1.name }}
 spec:
-  replicas: 1
+  replicas: {{ .Values.options.operatorController.deployment.replicas }}
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
```
2 changes: 2 additions & 0 deletions helm/olmv1/values.yaml
```diff
@@ -8,6 +8,7 @@ options:
     enabled: true
     deployment:
       image: quay.io/operator-framework/operator-controller:devel
+      replicas: 1
       extraArguments: []
     features:
       enabled: []
@@ -19,6 +20,7 @@ options:
     enabled: true
     deployment:
       image: quay.io/operator-framework/catalogd:devel
+      replicas: 1
       extraArguments: []
     features:
       enabled: []
```
9 changes: 9 additions & 0 deletions helm/replicas-2.yaml
```diff
@@ -0,0 +1,9 @@
+# Override values to set replicas to 2 for both operator-controller and catalogd
```

Contributor: can you come up with some better name for this file, given that these are helm values for an HA setup?

```diff
+# This is used in experimental-e2e.yaml to test multi-replica deployments
+options:
+  operatorController:
+    deployment:
+      replicas: 2
+  catalogd:
+    deployment:
+      replicas: 2
```
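The Makefile change above appends this file last in the values list for the experimental e2e manifest, and later values files win. The following is a simplified Go sketch of that override semantics; Helm's actual coalescing handles additional cases (such as null deletions), and `mergeValues` is a hypothetical helper, not Helm API.

```go
package main

import "fmt"

// mergeValues mimics, in simplified form, how a later Helm values file
// overrides an earlier one: nested maps merge recursively, scalars are replaced.
func mergeValues(base, override map[string]any) map[string]any {
	out := make(map[string]any, len(base))
	for k, v := range base {
		out[k] = v
	}
	for k, v := range override {
		if bm, ok := out[k].(map[string]any); ok {
			if om, ok := v.(map[string]any); ok {
				out[k] = mergeValues(bm, om)
				continue
			}
		}
		out[k] = v
	}
	return out
}

func main() {
	// Defaults as in helm/olmv1/values.yaml (replicas: 1).
	defaults := map[string]any{
		"options": map[string]any{
			"catalogd": map[string]any{"deployment": map[string]any{"replicas": 1}},
		},
	}
	// Override as in helm/replicas-2.yaml.
	ha := map[string]any{
		"options": map[string]any{
			"catalogd": map[string]any{"deployment": map[string]any{"replicas": 2}},
		},
	}
	fmt.Println(mergeValues(defaults, ha)) // catalogd deployment replicas resolves to 2
}
```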
8 changes: 4 additions & 4 deletions manifests/experimental-e2e.yaml
```diff
@@ -2107,11 +2107,11 @@ metadata:
   namespace: olmv1-system
 spec:
   minReadySeconds: 5
-  replicas: 1
+  replicas: 2
```

Contributor: same as above.

Contributor: This is a generated file; the change comes from values.yaml.

```diff
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -2258,11 +2258,11 @@ metadata:
   name: operator-controller-controller-manager
   namespace: olmv1-system
 spec:
-  replicas: 1
+  replicas: 2
```

Contributor: same comment as earlier.

```diff
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
```
4 changes: 2 additions & 2 deletions manifests/experimental.yaml
```diff
@@ -2036,7 +2036,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -2174,7 +2174,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
```
4 changes: 2 additions & 2 deletions manifests/standard-e2e.yaml
```diff
@@ -1799,7 +1799,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -1949,7 +1949,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
```
4 changes: 2 additions & 2 deletions manifests/standard.yaml
```diff
@@ -1724,7 +1724,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -1861,7 +1861,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
```
24 changes: 16 additions & 8 deletions test/e2e/cluster_extension_install_test.go
```diff
@@ -27,8 +27,11 @@ import (
 )

 const (
-	artifactName = "operator-controller-e2e"
-	pollDuration = time.Minute
+	artifactName = "operator-controller-e2e"
+	// pollDuration is set to 3 minutes to account for leader election time in multi-replica deployments.
```

Contributor: our e2e tests do not need to run in an HA setup right now; having just a single replica should be fine. If additional HA tests are needed, they should be provided downstream.

Contributor: However, the upstream e2e's do run downstream as well, and we need to ensure they can work there too.

pedjak (Contributor, Dec 5, 2025): Sure, but if I am not mistaken, OCPBUGS-62517 reports a downstream test failing, not an upstream one?

tmshort (Contributor, Dec 5, 2025): Yes, but it's all the same downstream (OCP) configuration. We can't change the OCP configuration just to run our upstream e2e's. Our upstream e2e's are expected to be able to run mostly unmodified on OCP.

The upstream e2e will encounter 2 replicas when run in OCP (because that's the OCP configuration), but will encounter 1 replica upstream (because that's the upstream configuration).

And because we'd also allow users to run multiple replicas upstream, our upstream e2e should accommodate that.

Contributor: > our upstream e2e should accommodate that.

Agree that our e2e tests should be agnostic to the underlying setup, but our upstream e2e CI jobs should run against an OLM deployment with single replicas (as they have until now).

```diff
+	// In the worst case (previous leader crashed), leader election can take up to 163 seconds
+	// (LeaseDuration: 137s + RetryPeriod: 26s). Adding buffer for reconciliation time.
+	pollDuration = 3 * time.Minute
 	pollInterval = time.Second
 	testCatalogRefEnvVar = "CATALOG_IMG"
 	testCatalogName = "test-catalog"
```
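The 163-second worst case cited in the comment is the sum of the lease timings: a replacement replica may wait a full LeaseDuration (137s) for a crashed leader's lease to expire, plus up to one RetryPeriod (26s) before re-attempting acquisition. Below is a hedged sketch of how such timings are typically wired into a controller-runtime manager; the lease name and the 107s RenewDeadline are illustrative assumptions, not values taken from this PR.

```go
package main

import (
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
)

func ptr[T any](v T) *T { return &v }

func main() {
	// Worst case for a new leader after a crash:
	// LeaseDuration (137s) + RetryPeriod (26s) = 163s,
	// which is why pollDuration is padded to 3 minutes.
	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
		LeaderElection:   true,
		LeaderElectionID: "example.operatorframework.io", // hypothetical lease name
		LeaseDuration:    ptr(137 * time.Second),
		RenewDeadline:    ptr(107 * time.Second), // commonly paired value; assumption
		RetryPeriod:      ptr(26 * time.Second),
	})
	if err != nil {
		panic(err)
	}
	_ = mgr // controller registration omitted
}
```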
```diff
@@ -169,18 +172,19 @@ location = "docker-registry.operator-controller-e2e.svc.cluster.local:5000"`,
 		require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
 	}, 2*time.Minute, pollInterval)

-	// Give the check 2 minutes instead of the typical 1 for the pod's
-	// files to update from the configmap change.
+	// Give the check extra time for the pod's files to update from the configmap change.
 	// The theoretical max time is the kubelet sync period of 1 minute +
-	// ConfigMap cache TTL of 1 minute = 2 minutes
+	// ConfigMap cache TTL of 1 minute = 2 minutes.
+	// With multi-replica deployments, add leader election time (up to 163s in worst case).
+	// Total: 2 min (ConfigMap) + 2.7 min (leader election) + buffer = 5 minutes
 	t.Log("By eventually reporting progressing as True")
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
 		cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeProgressing)
 		require.NotNil(ct, cond)
 		require.Equal(ct, metav1.ConditionTrue, cond.Status)
 		require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
-	}, 2*time.Minute, pollInterval)
+	}, 5*time.Minute, pollInterval)

 	t.Log("By eventually installing the package successfully")
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
```
```diff
@@ -655,6 +659,8 @@ func TestClusterExtensionRecoversFromNoNamespaceWhenFailureFixed(t *testing.T) {
 	// backoff of this eventually check we MUST ensure we do not touch the ClusterExtension
 	// after creating the Namespace and ServiceAccount.
 	t.Log("By eventually installing the package successfully")
+	// Use 5 minutes for recovery tests to account for exponential backoff after repeated failures
+	// plus leader election time (up to 163s in worst case)
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
 		cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled)
@@ -663,7 +669,7 @@
 		require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
 		require.Contains(ct, cond.Message, "Installed bundle")
 		require.NotEmpty(ct, clusterExtension.Status.Install)
-	}, pollDuration, pollInterval)
+	}, 5*time.Minute, pollInterval)
```

Contributor: why replace a variable with a value here? We could update the pollDuration variable to 5m.

```diff

 	t.Log("By eventually reporting Progressing == True with Reason Success")
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
```
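One way to resolve the reviewer's concern without inflating every assertion would be to keep `pollDuration` for ordinary checks and add a named constant for the slower recovery paths. A sketch follows; the `recoveryPollDuration` name is hypothetical and does not appear in the PR.

```go
package e2e

import "time"

const (
	// pollDuration covers ordinary reconciliation plus leader election.
	pollDuration = 3 * time.Minute
	// recoveryPollDuration (hypothetical name) bounds recovery tests, which also
	// absorb exponential backoff after repeated failures before succeeding.
	recoveryPollDuration = 5 * time.Minute
	pollInterval         = time.Second
)
```

Call sites like the one above would then read `}, recoveryPollDuration, pollInterval)`, keeping the timeout rationale in one place.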
```diff
@@ -777,6 +783,8 @@ func TestClusterExtensionRecoversFromExistingDeploymentWhenFailureFixed(t *testi
 	// backoff of this eventually check we MUST ensure we do not touch the ClusterExtension
 	// after deleting the Deployment.
 	t.Log("By eventually installing the package successfully")
+	// Use 5 minutes for recovery tests to account for exponential backoff after repeated failures
+	// plus leader election time (up to 163s in worst case)
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
 		cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled)
@@ -785,7 +793,7 @@
 		require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
 		require.Contains(ct, cond.Message, "Installed bundle")
 		require.NotEmpty(ct, clusterExtension.Status.Install)
-	}, pollDuration, pollInterval)
+	}, 5*time.Minute, pollInterval)
```

Contributor: same as above.

```diff

 	t.Log("By eventually reporting Progressing == True with Reason Success")
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
```
6 changes: 5 additions & 1 deletion test/e2e/single_namespace_support_test.go
```diff
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"os"
 	"testing"
+	"time"

 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -404,9 +405,12 @@ func TestClusterExtensionVersionUpdate(t *testing.T) {
 		require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
 	}, pollDuration, pollInterval)
 	t.Log("We should have two ClusterExtensionRevision resources")
+	// Use 5 minutes for checking ClusterExtensionRevision creation after upgrade.
+	// In multi-replica deployments, revision creation happens after the upgrade completes,
+	// which includes leader election time (up to 163s) plus reconciliation overhead.
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		cerList := &ocv1.ClusterExtensionRevisionList{}
 		require.NoError(ct, c.List(context.Background(), cerList))
 		require.Len(ct, cerList.Items, 2)
-	}, pollDuration, pollInterval)
+	}, 5*time.Minute, pollInterval)
```

Contributor: same as above.

```diff
 }
```
5 changes: 4 additions & 1 deletion test/e2e/webhook_support_test.go
```diff
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"os"
 	"testing"
+	"time"

 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -138,6 +139,8 @@ func TestWebhookSupport(t *testing.T) {
 	})

 	t.Log("By waiting for webhook-operator extension to be installed successfully")
+	// Use 5 minutes for webhook installation as it requires webhook cert generation via cert-manager,
+	// which adds additional time on top of leader election and reconciliation in multi-replica deployments.
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		require.NoError(ct, c.Get(t.Context(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
 		cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled)
@@ -147,7 +150,7 @@
 		require.Contains(ct, cond.Message, "Installed bundle")
 		require.NotNil(ct, clusterExtension.Status.Install)
 		require.NotEmpty(ct, clusterExtension.Status.Install.Bundle)
-	}, pollDuration, pollInterval)
+	}, 5*time.Minute, pollInterval)
```

Contributor: same as above.

```diff

 	t.Log("By waiting for webhook-operator deployment to be available")
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
```
18 changes: 12 additions & 6 deletions test/helpers/helpers.go
```diff
@@ -34,7 +34,10 @@ var (
 )

 const (
-	pollDuration = time.Minute
+	// pollDuration is set to 3 minutes to account for leader election time in multi-replica deployments.
+	// In the worst case (previous leader crashed), leader election can take up to 163 seconds
+	// (LeaseDuration: 137s + RetryPeriod: 26s). Adding buffer for reconciliation time.
+	pollDuration = 3 * time.Minute
 	pollInterval = time.Second
 	testCatalogName = "test-catalog"
 	testCatalogRefEnvVar = "CATALOG_IMG"
@@ -289,31 +292,34 @@ func ValidateCatalogUnpackWithName(t *testing.T, catalogName string) {
 func EnsureNoExtensionResources(t *testing.T, clusterExtensionName string) {
 	ls := labels.Set{"olm.operatorframework.io/owner-name": clusterExtensionName}

-	// CRDs may take an extra long time to be deleted, and may run into the following error:
-	// Condition=Terminating Status=True Reason=InstanceDeletionFailed Message="could not list instances: storage is (re)initializing"
+	// CRDs may take extra time to be deleted in multi-replica deployments due to leader election
```

Contributor: CRD deletion is not impacted in any way by whether the controller runs in an HA setup. Please correct/revert the comment.

```diff
+	// and finalizer processing. Use 3 minutes which accounts for leader election (up to 163s)
+	// plus buffer for reconciliation overhead.
 	t.Logf("By waiting for CustomResourceDefinitions of %q to be deleted", clusterExtensionName)
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		list := &apiextensionsv1.CustomResourceDefinitionList{}
 		err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()})
 		require.NoError(ct, err)
 		require.Empty(ct, list.Items)
-	}, 5*pollDuration, pollInterval)
+	}, pollDuration, pollInterval)

+	// ClusterRoleBindings and ClusterRoles deletion is typically faster than CRDs.
+	// Use pollDuration (3 minutes) to account for leader election in multi-replica deployments.
 	t.Logf("By waiting for ClusterRoleBindings of %q to be deleted", clusterExtensionName)
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		list := &rbacv1.ClusterRoleBindingList{}
 		err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()})
 		require.NoError(ct, err)
 		require.Empty(ct, list.Items)
-	}, 2*pollDuration, pollInterval)
+	}, pollDuration, pollInterval)

 	t.Logf("By waiting for ClusterRoles of %q to be deleted", clusterExtensionName)
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		list := &rbacv1.ClusterRoleList{}
 		err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()})
 		require.NoError(ct, err)
 		require.Empty(ct, list.Items)
-	}, 2*pollDuration, pollInterval)
+	}, pollDuration, pollInterval)
 }

 func TestCleanup(t *testing.T, cat *ocv1.ClusterCatalog, clusterExtension *ocv1.ClusterExtension, sa *corev1.ServiceAccount, ns *corev1.Namespace) {
```