
Commit 484e30c

test(e2e): enhance debugging capabilities with detailed diagnostic explanations (#112)
Improved observability into system state during end-to-end tests by logging state information from Distribution Status, Namespace Events, Deployment, Pods, Services and Service Endpoints, and Operator Health.

Signed-off-by: Matthew F Leader <mleader@redhat.com>
1 parent f45c34a commit 484e30c

File tree: 2 files changed (+219, -3 lines)


tests/e2e/creation_test.go

Lines changed: 3 additions & 3 deletions
@@ -98,7 +98,7 @@ func testCreateDistribution(t *testing.T) *v1alpha1.LlamaStackDistribution {
 		status, statusFound, _ := unstructured.NestedMap(u.Object, "status")
 		return specFound && statusFound && spec != nil && status != nil
 	})
-	require.NoError(t, err)
+	requireNoErrorWithDebugging(t, TestEnv, err, "Service readiness check failed", llsdistributionCR.Namespace, llsdistributionCR.Name)
 
 	return llsdistributionCR
 }
@@ -183,7 +183,7 @@ func testHealthStatus(t *testing.T, distribution *v1alpha1.LlamaStackDistributio
 		}
 		return updatedDistribution.Status.Phase == v1alpha1.LlamaStackDistributionPhaseReady, nil
 	})
-	require.NoError(t, err, "Failed to wait for distribution status update")
+	requireNoErrorWithDebugging(t, TestEnv, err, "Failed to wait for distribution status update", distribution.Namespace, distribution.Name)
 }
 
 func testDistributionStatus(t *testing.T, llsdistributionCR *v1alpha1.LlamaStackDistribution) {
@@ -223,7 +223,7 @@ func testDistributionStatus(t *testing.T, llsdistributionCR *v1alpha1.LlamaStack
 			Namespace: llsdistributionCR.Namespace,
 			Name:      llsdistributionCR.Name,
 		}, finalDistribution)
-	require.NoError(t, err, "Failed to wait for distribution status update", finalDistribution.Status)
+	requireNoErrorWithDebugging(t, TestEnv, err, "Failed to wait for distribution status update", llsdistributionCR.Namespace, llsdistributionCR.Name)
 }
 
 	// Get final state and verify
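
The pattern the three hunks above introduce is: poll for a condition, then hand any timeout error to requireNoErrorWithDebugging (defined in tests/e2e/test_utils.go below) so diagnostics are captured before the test fails. A minimal usage sketch, assuming the TestEnv global from test_utils.go; the polling loop and the ns and crName variables are illustrative, not code from this commit:

	// Illustrative sketch only: poll until the distribution reports Ready,
	// then fail with full diagnostics on timeout. ns and crName are assumed
	// to hold the CR's namespace and name.
	err := wait.PollUntilContextTimeout(TestEnv.Ctx, 2*time.Second, 5*time.Minute, true,
		func(ctx context.Context) (bool, error) {
			d := &v1alpha1.LlamaStackDistribution{}
			if getErr := TestEnv.Client.Get(ctx, client.ObjectKey{Namespace: ns, Name: crName}, d); getErr != nil {
				return false, nil // not found yet; keep polling
			}
			return d.Status.Phase == v1alpha1.LlamaStackDistributionPhaseReady, nil
		})
	// On timeout this logs CR status, namespace events, pods, service endpoints,
	// service spec, and deployment spec, then fails via require.NoError.
	requireNoErrorWithDebugging(t, TestEnv, err, "distribution never became ready", ns, crName)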

tests/e2e/test_utils.go

Lines changed: 216 additions & 0 deletions
@@ -12,6 +12,7 @@ import (
 	"github.com/llamastack/llama-stack-k8s-operator/api/v1alpha1"
 	"github.com/stretchr/testify/require"
 	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
 	apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
@@ -182,3 +183,218 @@ func GetSampleCR(t *testing.T) *v1alpha1.LlamaStackDistribution {
 
 	return distribution
 }
+
+// checkLlamaStackDistributionStatus helps identify if the custom resource reached the expected state during test execution.
+func checkLlamaStackDistributionStatus(t *testing.T, testenv *TestEnvironment, namespace, name string) {
+	t.Helper()
+
+	llsDistro := &v1alpha1.LlamaStackDistribution{}
+	err := testenv.Client.Get(testenv.Ctx, client.ObjectKey{Namespace: namespace, Name: name}, llsDistro)
+	if err != nil {
+		t.Logf("⚠️ Error getting LlamaStackDistribution: %v", err)
+		return
+	}
+
+	t.Logf("LlamaStackDistribution status:")
+	t.Logf("  Phase: %s", llsDistro.Status.Phase)
+	t.Logf("  Generation: %d", llsDistro.Generation)
+	t.Logf("  ResourceVersion: %s", llsDistro.ResourceVersion)
+	t.Logf("  Conditions: %+v", llsDistro.Status.Conditions)
+}
+
+// checkNamespaceEvents reveals what Kubernetes operations occurred and why they may have failed.
+func checkNamespaceEvents(t *testing.T, testenv *TestEnvironment, namespace string) {
+	t.Helper()
+
+	eventList := &corev1.EventList{}
+	err := testenv.Client.List(testenv.Ctx, eventList, client.InNamespace(namespace))
+	if err != nil {
+		t.Logf("⚠️ Error getting events: %v", err)
+		return
+	}
+
+	if len(eventList.Items) == 0 {
+		t.Log("📝 No events found in namespace")
+		return
+	}
+
+	maxEvents := 25
+	if len(eventList.Items) > maxEvents {
+		t.Logf("📝 Showing first %d events (of %d total):", maxEvents, len(eventList.Items))
+		eventList.Items = eventList.Items[:maxEvents]
+	} else {
+		t.Logf("📝 Found %d events in namespace %s:", len(eventList.Items), namespace)
+	}
+
+	for _, event := range eventList.Items {
+		t.Logf("  %s: %s (%s) - %s",
+			event.LastTimestamp.Format("15:04:05"),
+			event.Reason,
+			event.Type,
+			event.Message)
+	}
+}
+
+// requireNoErrorWithDebugging provides comprehensive debugging context when tests fail to help identify root causes quickly.
+func requireNoErrorWithDebugging(t *testing.T, testenv *TestEnvironment, err error, msg string, namespace, crName string) {
+	t.Helper()
+	if err != nil {
+		t.Logf("💥 ERROR OCCURRED: %s - %v", msg, err)
+
+		// Check custom resource status first to see if the operator processed the request correctly
+		checkLlamaStackDistributionStatus(t, testenv, namespace, crName)
+
+		// Check events to understand what Kubernetes operations were attempted and why they failed
+		checkNamespaceEvents(t, testenv, namespace)
+
+		// Check pod details to identify container startup issues or crash loops
+		logPodDetails(t, testenv, namespace)
+
+		// Check service endpoints to see if pods are being discovered by services
+		logServiceEndpoints(t, testenv, namespace, crName+"-service")
+
+		// Check service configuration to identify selector mismatches
+		logServiceSpec(t, testenv, namespace, crName+"-service")
+
+		// Check deployment spec to identify configuration problems preventing pod startup
+		logDeploymentSpec(t, testenv, namespace, crName)
+
+		require.NoError(t, err, msg)
+	}
+}
+
+// logPodDetails helps diagnose pod startup issues and container restart problems during test failures.
+func logPodDetails(t *testing.T, testenv *TestEnvironment, namespace string) {
+	t.Helper()
+
+	podList := &corev1.PodList{}
+	err := testenv.Client.List(testenv.Ctx, podList, client.InNamespace(namespace))
+	if err != nil {
+		t.Logf("Failed to list pods: %v", err)
+		return
+	}
+
+	t.Logf("📦 Found %d pods in namespace %s:", len(podList.Items), namespace)
+	for _, pod := range podList.Items {
+		t.Logf("Pod: %s, Phase: %s", pod.Name, pod.Status.Phase)
+
+		for _, cs := range pod.Status.ContainerStatuses {
+			// RestartCount indicates crash loops or configuration issues
+			t.Logf("  Container %s: Ready=%v, RestartCount=%d",
+				cs.Name, cs.Ready, cs.RestartCount)
+
+			// Container states reveal why pods aren't starting or are crashing
+			if cs.State.Waiting != nil {
+				t.Logf("    Waiting: %s - %s",
+					cs.State.Waiting.Reason, cs.State.Waiting.Message)
+			}
+			if cs.State.Terminated != nil {
+				t.Logf("    Terminated: %s - %s",
+					cs.State.Terminated.Reason, cs.State.Terminated.Message)
+			}
+		}
+
+		// Pod logs would show startup errors but require different client access
+		t.Logf("  (Pod logs require direct kubectl access)")
+	}
+}
+
+// logServiceEndpoints logs service endpoint details to see if pods are ready.
+func logServiceEndpoints(t *testing.T, testenv *TestEnvironment, namespace, serviceName string) {
+	t.Helper()
+
+	endpoints := &corev1.Endpoints{}
+	err := testenv.Client.Get(testenv.Ctx, types.NamespacedName{
+		Name:      serviceName,
+		Namespace: namespace,
+	}, endpoints)
+
+	if err != nil {
+		t.Logf("Failed to get endpoints for service %s: %v", serviceName, err)
+		return
+	}
+
+	t.Logf("🔗 Service %s endpoints:", serviceName)
+	for i, subset := range endpoints.Subsets {
+		t.Logf("  Subset %d:", i)
+		// Ready addresses indicate pods that passed health checks and can receive traffic
+		t.Logf("    Ready addresses: %d", len(subset.Addresses))
+		for _, addr := range subset.Addresses {
+			t.Logf("      - %s", addr.IP)
+		}
+		// Not ready addresses show pods that exist but failed health checks
+		t.Logf("    Not ready addresses: %d", len(subset.NotReadyAddresses))
+		for _, addr := range subset.NotReadyAddresses {
+			t.Logf("      - %s", addr.IP)
+		}
+		t.Logf("    Ports:")
+		for _, port := range subset.Ports {
+			t.Logf("      - %s: %d", port.Name, port.Port)
+		}
+	}
+}
+
+// logDeploymentSpec helps identify configuration mismatches that prevent pods from starting correctly.
+func logDeploymentSpec(t *testing.T, testenv *TestEnvironment, namespace, name string) {
+	t.Helper()
+
+	deployment := &appsv1.Deployment{}
+	err := testenv.Client.Get(testenv.Ctx, types.NamespacedName{
+		Name:      name,
+		Namespace: namespace,
+	}, deployment)
+
+	if err != nil {
+		t.Logf("Failed to get deployment: %v", err)
+		return
+	}
+
+	t.Logf("🚀 Deployment %s spec:", name)
+	t.Logf("  Replicas: %d", *deployment.Spec.Replicas)
+	// Selector must match pod labels or pods won't be managed by deployment
+	t.Logf("  Selector: %+v", deployment.Spec.Selector.MatchLabels)
+	t.Logf("  Template labels: %+v", deployment.Spec.Template.Labels)
+
+	for _, container := range deployment.Spec.Template.Spec.Containers {
+		t.Logf("  Container: %s", container.Name)
+		t.Logf("    Image: %s", container.Image)
+		t.Logf("    Ports:")
+		for _, port := range container.Ports {
+			t.Logf("      - %d", port.ContainerPort)
+		}
+		// Environment variables can cause startup failures if misconfigured
+		t.Logf("    Env vars:")
+		for _, env := range container.Env {
+			t.Logf("      %s=%s", env.Name, env.Value)
+		}
+		// Readiness probe configuration affects when pods become service endpoints
+		if container.ReadinessProbe != nil {
+			t.Logf("    Readiness probe: %+v", container.ReadinessProbe)
+		}
+	}
+}
+
+// logServiceSpec logs the actual service configuration to debug selector issues.
+func logServiceSpec(t *testing.T, testenv *TestEnvironment, namespace, serviceName string) {
+	t.Helper()
+
+	service := &corev1.Service{}
+	err := testenv.Client.Get(testenv.Ctx, types.NamespacedName{
+		Name:      serviceName,
+		Namespace: namespace,
+	}, service)
+
+	if err != nil {
+		t.Logf("Failed to get service %s: %v", serviceName, err)
+		return
+	}
+
+	t.Logf("🔧 Service %s spec:", serviceName)
+	t.Logf("  Type: %s", service.Spec.Type)
+	// Selector must match pod labels or service won't route traffic to pods
+	t.Logf("  Selector: %+v", service.Spec.Selector)
+	t.Logf("  Ports:")
+	for _, port := range service.Spec.Ports {
+		t.Logf("    - %s: %d -> %s", port.Name, port.Port, port.TargetPort.String())
+	}
+}
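
One follow-up note on logPodDetails above: it points out that pod logs need different client access, since the controller-runtime client has no logs subresource. A minimal sketch of how that gap could be filled, assuming a typed kubernetes.Interface clientset were wired into the test environment (it is not part of this commit; the helper name and tail length are illustrative):

	// logPodLogs is hypothetical (not in this commit): streaming the logs
	// subresource requires a typed clientset from k8s.io/client-go/kubernetes.
	func logPodLogs(ctx context.Context, t *testing.T, clientset kubernetes.Interface, namespace, podName string) {
		t.Helper()

		req := clientset.CoreV1().Pods(namespace).GetLogs(podName, &corev1.PodLogOptions{
			TailLines: ptr.To[int64](50), // the last lines usually cover startup errors
		})
		stream, err := req.Stream(ctx)
		if err != nil {
			t.Logf("Failed to stream logs for pod %s: %v", podName, err)
			return
		}
		defer stream.Close()

		body, err := io.ReadAll(stream) // io and k8s.io/utils/ptr are the extra imports
		if err != nil {
			t.Logf("Failed to read logs for pod %s: %v", podName, err)
			return
		}
		t.Logf("📜 Logs for pod %s:\n%s", podName, body)
	}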
