6 changes: 3 additions & 3 deletions Makefile
```diff
@@ -158,7 +158,7 @@ MANIFESTS ?= $(STANDARD_MANIFEST) $(STANDARD_E2E_MANIFEST) $(EXPERIMENTAL_MANIFE
 $(STANDARD_MANIFEST) ?= helm/cert-manager.yaml
 $(STANDARD_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/e2e.yaml
 $(EXPERIMENTAL_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml
-$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml
+$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml helm/replicas-2.yaml
 HELM_SETTINGS ?=
 .PHONY: $(MANIFESTS)
 $(MANIFESTS): $(HELM)
@@ -484,8 +484,8 @@ run-experimental: run-internal #HELP Build the operator-controller then deploy i
 CATD_NAMESPACE := olmv1-system
 .PHONY: wait
 wait:
-	kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=60s
-	kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert # Avoid upgrade test flakes when reissuing cert
+	kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=3m
+	kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert --timeout=3m # Avoid upgrade test flakes when reissuing cert

 .PHONY: docker-build
 docker-build: build-linux #EXHELP Build docker image for operator-controller and catalog with GOOS=linux and local GOARCH.
```
4 changes: 2 additions & 2 deletions hack/test/pre-upgrade-setup.sh
```diff
@@ -155,5 +155,5 @@ spec:
     version: 1.0.0
 EOF

-kubectl wait --for=condition=Serving --timeout=60s ClusterCatalog $TEST_CLUSTER_CATALOG_NAME
-kubectl wait --for=condition=Installed --timeout=60s ClusterExtension $TEST_CLUSTER_EXTENSION_NAME
+kubectl wait --for=condition=Serving --timeout=5m ClusterCatalog $TEST_CLUSTER_CATALOG_NAME
+kubectl wait --for=condition=Installed --timeout=5m ClusterExtension $TEST_CLUSTER_EXTENSION_NAME
```
Comment on lines +158 to +159

Contributor: are these changes really needed in this PR, i.e. can we do them separately?

Member Author: I guess not. I updated it since it failed during the test. You can check the test history.
catalogd controller-manager Deployment template:

```diff
@@ -12,11 +12,11 @@ metadata:
   namespace: {{ .Values.namespaces.olmv1.name }}
 spec:
   minReadySeconds: 5
-  replicas: 1
+  replicas: {{ .Values.options.catalogd.deployment.replicas }}
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
```
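For readers unfamiliar with Helm templating, the new `replicas` line is resolved by Go's template engine against the merged chart values. Below is a minimal sketch of just the substitution step, using Go's standard `text/template`; Helm's real pipeline layers several values files and adds extra functions, so this is illustrative only.

```go
package main

import (
	"os"
	"text/template"
)

func main() {
	// A stripped-down stand-in for the Deployment template above.
	const manifest = "spec:\n  replicas: {{ .Values.options.catalogd.deployment.replicas }}\n"

	// A stand-in for the merged chart values; the real chart has many more keys.
	values := map[string]any{
		"Values": map[string]any{
			"options": map[string]any{
				"catalogd": map[string]any{
					"deployment": map[string]any{"replicas": 2},
				},
			},
		},
	}

	tmpl := template.Must(template.New("deployment").Parse(manifest))
	// Prints "spec:\n  replicas: 2" — the value flows from the values files into the manifest.
	if err := tmpl.Execute(os.Stdout, values); err != nil {
		panic(err)
	}
}
```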
operator-controller controller-manager Deployment template:

```diff
@@ -11,11 +11,11 @@ metadata:
   name: operator-controller-controller-manager
   namespace: {{ .Values.namespaces.olmv1.name }}
 spec:
-  replicas: 1
+  replicas: {{ .Values.options.operatorController.deployment.replicas }}
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
```
2 changes: 2 additions & 0 deletions helm/olmv1/values.yaml
```diff
@@ -8,6 +8,7 @@ options:
     enabled: true
     deployment:
       image: quay.io/operator-framework/operator-controller:devel
+      replicas: 1
       extraArguments: []
     features:
       enabled: []
@@ -19,6 +20,7 @@ options:
     enabled: true
     deployment:
       image: quay.io/operator-framework/catalogd:devel
+      replicas: 1
       extraArguments: []
     features:
       enabled: []
```
9 changes: 9 additions & 0 deletions helm/replicas-2.yaml
```diff
@@ -0,0 +1,9 @@
+# Override values to set replicas to 2 for both operator-controller and catalogd
```

Contributor: can you come up with some better name for this file, given that these are helm values for an HA setup?

```diff
+# This is used in experimental-e2e.yaml to test multi-replica deployments
+options:
+  operatorController:
+    deployment:
+      replicas: 2
+  catalogd:
+    deployment:
+      replicas: 2
```
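The Makefile change above appends this file last in the values list for the experimental e2e manifest, and later values files win. The following is a simplified Go sketch of that override semantics; Helm's actual coalescing handles additional cases (such as null deletions), and `mergeValues` is a hypothetical helper, not Helm API.

```go
package main

import "fmt"

// mergeValues mimics, in simplified form, how a later Helm values file
// overrides an earlier one: nested maps merge recursively, scalars are replaced.
func mergeValues(base, override map[string]any) map[string]any {
	out := make(map[string]any, len(base))
	for k, v := range base {
		out[k] = v
	}
	for k, v := range override {
		if bm, ok := out[k].(map[string]any); ok {
			if om, ok := v.(map[string]any); ok {
				out[k] = mergeValues(bm, om)
				continue
			}
		}
		out[k] = v
	}
	return out
}

func main() {
	// Defaults as in helm/olmv1/values.yaml (replicas: 1).
	defaults := map[string]any{
		"options": map[string]any{
			"catalogd": map[string]any{"deployment": map[string]any{"replicas": 1}},
		},
	}
	// Override as in helm/replicas-2.yaml.
	ha := map[string]any{
		"options": map[string]any{
			"catalogd": map[string]any{"deployment": map[string]any{"replicas": 2}},
		},
	}
	fmt.Println(mergeValues(defaults, ha)) // catalogd deployment replicas resolves to 2
}
```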
8 changes: 4 additions & 4 deletions manifests/experimental-e2e.yaml
```diff
@@ -2107,11 +2107,11 @@ metadata:
   namespace: olmv1-system
 spec:
   minReadySeconds: 5
-  replicas: 1
+  replicas: 2
```

Contributor: same as above.

Contributor: This is a generated file; the change comes from values.yaml.

```diff
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -2258,11 +2258,11 @@ metadata:
   name: operator-controller-controller-manager
   namespace: olmv1-system
 spec:
-  replicas: 1
+  replicas: 2
```

Contributor: same comment as earlier.

```diff
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
```
4 changes: 2 additions & 2 deletions manifests/experimental.yaml
```diff
@@ -2036,7 +2036,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -2174,7 +2174,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
```
4 changes: 2 additions & 2 deletions manifests/standard-e2e.yaml
```diff
@@ -1799,7 +1799,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -1949,7 +1949,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
```
4 changes: 2 additions & 2 deletions manifests/standard.yaml
```diff
@@ -1724,7 +1724,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -1861,7 +1861,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0 # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
```
24 changes: 16 additions & 8 deletions test/e2e/cluster_extension_install_test.go
```diff
@@ -27,8 +27,11 @@ import (
 )

 const (
-	artifactName = "operator-controller-e2e"
-	pollDuration = time.Minute
+	artifactName = "operator-controller-e2e"
+	// pollDuration is set to 3 minutes to account for leader election time in multi-replica deployments.
```

Contributor: our e2e tests do not need to run in an HA setup right now; having just a single replica should be fine. If additional HA tests are needed, they should be provided downstream.

Contributor: However, the upstream e2e's do run downstream as well, and we need to ensure they can work there too.

pedjak (Contributor, Dec 5, 2025): Sure, but if I am not mistaken, OCPBUGS-62517 reports a downstream test failing, not an upstream one?

tmshort (Contributor, Dec 5, 2025): Yes, but it's all the same downstream (OCP) configuration. We can't change the OCP configuration just to run our upstream e2e's. Our upstream e2e's are expected to be able to run mostly unmodified on OCP.

The upstream e2e will encounter 2 replicas when run in OCP (because that's the OCP configuration), but will encounter 1 replica upstream (because that's the upstream configuration).

And because we'd also allow users to run multiple replicas upstream, our upstream e2e should accommodate that.

Contributor: > our upstream e2e should accommodate that.

Agree that our e2e tests should be agnostic to the underlying setup, but our upstream e2e CI jobs should run against an OLM deployment with single replicas (as they have until now).

```diff
+	// In the worst case (previous leader crashed), leader election can take up to 163 seconds
+	// (LeaseDuration: 137s + RetryPeriod: 26s). Adding buffer for reconciliation time.
+	pollDuration = 3 * time.Minute
 	pollInterval = time.Second
 	testCatalogRefEnvVar = "CATALOG_IMG"
 	testCatalogName = "test-catalog"
```
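The 163-second worst case cited in the comment is the sum of the lease timings: a replacement replica may wait a full LeaseDuration (137s) for a crashed leader's lease to expire, plus up to one RetryPeriod (26s) before re-attempting acquisition. Below is a hedged sketch of how such timings are typically wired into a controller-runtime manager; the lease name and the 107s RenewDeadline are illustrative assumptions, not values taken from this PR.

```go
package main

import (
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
)

func ptr[T any](v T) *T { return &v }

func main() {
	// Worst case for a new leader after a crash:
	// LeaseDuration (137s) + RetryPeriod (26s) = 163s,
	// which is why pollDuration is padded to 3 minutes.
	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
		LeaderElection:   true,
		LeaderElectionID: "example.operatorframework.io", // hypothetical lease name
		LeaseDuration:    ptr(137 * time.Second),
		RenewDeadline:    ptr(107 * time.Second), // commonly paired value; assumption
		RetryPeriod:      ptr(26 * time.Second),
	})
	if err != nil {
		panic(err)
	}
	_ = mgr // controller registration omitted
}
```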
```diff
@@ -169,18 +172,19 @@ location = "docker-registry.operator-controller-e2e.svc.cluster.local:5000"`,
 		require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
 	}, 2*time.Minute, pollInterval)

-	// Give the check 2 minutes instead of the typical 1 for the pod's
-	// files to update from the configmap change.
+	// Give the check extra time for the pod's files to update from the configmap change.
 	// The theoretical max time is the kubelet sync period of 1 minute +
-	// ConfigMap cache TTL of 1 minute = 2 minutes
+	// ConfigMap cache TTL of 1 minute = 2 minutes.
+	// With multi-replica deployments, add leader election time (up to 163s in worst case).
+	// Total: 2 min (ConfigMap) + 2.7 min (leader election) + buffer = 5 minutes
 	t.Log("By eventually reporting progressing as True")
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
 		cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeProgressing)
 		require.NotNil(ct, cond)
 		require.Equal(ct, metav1.ConditionTrue, cond.Status)
 		require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
-	}, 2*time.Minute, pollInterval)
+	}, 5*time.Minute, pollInterval)

 	t.Log("By eventually installing the package successfully")
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
```
```diff
@@ -655,6 +659,8 @@ func TestClusterExtensionRecoversFromNoNamespaceWhenFailureFixed(t *testing.T) {
 	// backoff of this eventually check we MUST ensure we do not touch the ClusterExtension
 	// after creating the Namespace and ServiceAccount.
 	t.Log("By eventually installing the package successfully")
+	// Use 5 minutes for recovery tests to account for exponential backoff after repeated failures
+	// plus leader election time (up to 163s in worst case)
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
 		cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled)
@@ -663,7 +669,7 @@
 		require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
 		require.Contains(ct, cond.Message, "Installed bundle")
 		require.NotEmpty(ct, clusterExtension.Status.Install)
-	}, pollDuration, pollInterval)
+	}, 5*time.Minute, pollInterval)
```

Contributor: why replace a variable with a value here? We could update the pollDuration variable to 5m.

```diff

 	t.Log("By eventually reporting Progressing == True with Reason Success")
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
```
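One way to resolve the reviewer's concern without inflating every assertion would be to keep `pollDuration` for ordinary checks and add a named constant for the slower recovery paths. A sketch follows; the `recoveryPollDuration` name is hypothetical and does not appear in the PR.

```go
package e2e

import "time"

const (
	// pollDuration covers ordinary reconciliation plus leader election.
	pollDuration = 3 * time.Minute
	// recoveryPollDuration (hypothetical name) bounds recovery tests, which also
	// absorb exponential backoff after repeated failures before succeeding.
	recoveryPollDuration = 5 * time.Minute
	pollInterval         = time.Second
)
```

Call sites like the one above would then read `}, recoveryPollDuration, pollInterval)`, keeping the timeout rationale in one place.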
```diff
@@ -777,6 +783,8 @@ func TestClusterExtensionRecoversFromExistingDeploymentWhenFailureFixed(t *testi
 	// backoff of this eventually check we MUST ensure we do not touch the ClusterExtension
 	// after deleting the Deployment.
 	t.Log("By eventually installing the package successfully")
+	// Use 5 minutes for recovery tests to account for exponential backoff after repeated failures
+	// plus leader election time (up to 163s in worst case)
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		require.NoError(ct, c.Get(context.Background(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
 		cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled)
@@ -785,7 +793,7 @@
 		require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
 		require.Contains(ct, cond.Message, "Installed bundle")
 		require.NotEmpty(ct, clusterExtension.Status.Install)
-	}, pollDuration, pollInterval)
+	}, 5*time.Minute, pollInterval)
```

Contributor: same as above.

```diff

 	t.Log("By eventually reporting Progressing == True with Reason Success")
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
```
6 changes: 5 additions & 1 deletion test/e2e/single_namespace_support_test.go
```diff
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"os"
 	"testing"
+	"time"

 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -404,9 +405,12 @@ func TestClusterExtensionVersionUpdate(t *testing.T) {
 		require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason)
 	}, pollDuration, pollInterval)
 	t.Log("We should have two ClusterExtensionRevision resources")
+	// Use 5 minutes for checking ClusterExtensionRevision creation after upgrade.
+	// In multi-replica deployments, revision creation happens after the upgrade completes,
+	// which includes leader election time (up to 163s) plus reconciliation overhead.
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		cerList := &ocv1.ClusterExtensionRevisionList{}
 		require.NoError(ct, c.List(context.Background(), cerList))
 		require.Len(ct, cerList.Items, 2)
-	}, pollDuration, pollInterval)
+	}, 5*time.Minute, pollInterval)
```

Contributor: same as above.

```diff
 }
```
5 changes: 4 additions & 1 deletion test/e2e/webhook_support_test.go
```diff
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"os"
 	"testing"
+	"time"

 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -138,6 +139,8 @@ func TestWebhookSupport(t *testing.T) {
 	})

 	t.Log("By waiting for webhook-operator extension to be installed successfully")
+	// Use 5 minutes for webhook installation as it requires webhook cert generation via cert-manager,
+	// which adds additional time on top of leader election and reconciliation in multi-replica deployments.
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		require.NoError(ct, c.Get(t.Context(), types.NamespacedName{Name: clusterExtension.Name}, clusterExtension))
 		cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled)
@@ -147,7 +150,7 @@
 		require.Contains(ct, cond.Message, "Installed bundle")
 		require.NotNil(ct, clusterExtension.Status.Install)
 		require.NotEmpty(ct, clusterExtension.Status.Install.Bundle)
-	}, pollDuration, pollInterval)
+	}, 5*time.Minute, pollInterval)
```

Contributor: same as above.

```diff

 	t.Log("By waiting for webhook-operator deployment to be available")
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
```
18 changes: 12 additions & 6 deletions test/helpers/helpers.go
```diff
@@ -34,7 +34,10 @@ var (
 )

 const (
-	pollDuration = time.Minute
+	// pollDuration is set to 3 minutes to account for leader election time in multi-replica deployments.
+	// In the worst case (previous leader crashed), leader election can take up to 163 seconds
+	// (LeaseDuration: 137s + RetryPeriod: 26s). Adding buffer for reconciliation time.
+	pollDuration = 3 * time.Minute
 	pollInterval = time.Second
 	testCatalogName = "test-catalog"
 	testCatalogRefEnvVar = "CATALOG_IMG"
@@ -289,31 +292,34 @@ func ValidateCatalogUnpackWithName(t *testing.T, catalogName string) {
 func EnsureNoExtensionResources(t *testing.T, clusterExtensionName string) {
 	ls := labels.Set{"olm.operatorframework.io/owner-name": clusterExtensionName}

-	// CRDs may take an extra long time to be deleted, and may run into the following error:
-	// Condition=Terminating Status=True Reason=InstanceDeletionFailed Message="could not list instances: storage is (re)initializing"
+	// CRDs may take extra time to be deleted in multi-replica deployments due to leader election
```

Contributor: CRD deletion is not impacted in any way by whether the controller runs in an HA setup. Please correct/revert the comment.

```diff
+	// and finalizer processing. Use 3 minutes which accounts for leader election (up to 163s)
+	// plus buffer for reconciliation overhead.
 	t.Logf("By waiting for CustomResourceDefinitions of %q to be deleted", clusterExtensionName)
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		list := &apiextensionsv1.CustomResourceDefinitionList{}
 		err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()})
 		require.NoError(ct, err)
 		require.Empty(ct, list.Items)
-	}, 5*pollDuration, pollInterval)
+	}, pollDuration, pollInterval)

+	// ClusterRoleBindings and ClusterRoles deletion is typically faster than CRDs.
+	// Use pollDuration (3 minutes) to account for leader election in multi-replica deployments.
 	t.Logf("By waiting for ClusterRoleBindings of %q to be deleted", clusterExtensionName)
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		list := &rbacv1.ClusterRoleBindingList{}
 		err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()})
 		require.NoError(ct, err)
 		require.Empty(ct, list.Items)
-	}, 2*pollDuration, pollInterval)
+	}, pollDuration, pollInterval)

 	t.Logf("By waiting for ClusterRoles of %q to be deleted", clusterExtensionName)
 	require.EventuallyWithT(t, func(ct *assert.CollectT) {
 		list := &rbacv1.ClusterRoleList{}
 		err := c.List(context.Background(), list, client.MatchingLabelsSelector{Selector: ls.AsSelector()})
 		require.NoError(ct, err)
 		require.Empty(ct, list.Items)
-	}, 2*pollDuration, pollInterval)
+	}, pollDuration, pollInterval)
 }

 func TestCleanup(t *testing.T, cat *ocv1.ClusterCatalog, clusterExtension *ocv1.ClusterExtension, sa *corev1.ServiceAccount, ns *corev1.Namespace) {
```