
Commit 484e30c

test(e2e): enhance debugging capabilities with detailed diagnostic explanations (#112)
Improved observability into system state during end-to-end tests by logging state information from Distribution Status, Namespace Events, Deployment, Pods, Services and Service Endpoints, and Operator Health.

Signed-off-by: Matthew F Leader <mleader@redhat.com>
1 parent f45c34a commit 484e30c

File tree: 2 files changed (+219, -3 lines)


tests/e2e/creation_test.go

Lines changed: 3 additions & 3 deletions
@@ -98,7 +98,7 @@ func testCreateDistribution(t *testing.T) *v1alpha1.LlamaStackDistribution {
 		status, statusFound, _ := unstructured.NestedMap(u.Object, "status")
 		return specFound && statusFound && spec != nil && status != nil
 	})
-	require.NoError(t, err)
+	requireNoErrorWithDebugging(t, TestEnv, err, "Service readiness check failed", llsdistributionCR.Namespace, llsdistributionCR.Name)
 
 	return llsdistributionCR
 }
@@ -183,7 +183,7 @@ func testHealthStatus(t *testing.T, distribution *v1alpha1.LlamaStackDistributio
 		}
 		return updatedDistribution.Status.Phase == v1alpha1.LlamaStackDistributionPhaseReady, nil
 	})
-	require.NoError(t, err, "Failed to wait for distribution status update")
+	requireNoErrorWithDebugging(t, TestEnv, err, "Failed to wait for distribution status update", distribution.Namespace, distribution.Name)
 }
 
 func testDistributionStatus(t *testing.T, llsdistributionCR *v1alpha1.LlamaStackDistribution) {
@@ -223,7 +223,7 @@ func testDistributionStatus(t *testing.T, llsdistributionCR *v1alpha1.LlamaStack
 			Namespace: llsdistributionCR.Namespace,
 			Name:      llsdistributionCR.Name,
 		}, finalDistribution)
-	require.NoError(t, err, "Failed to wait for distribution status update", finalDistribution.Status)
+	requireNoErrorWithDebugging(t, TestEnv, err, "Failed to wait for distribution status update", llsdistributionCR.Namespace, llsdistributionCR.Name)
 }
 
 	// Get final state and verify
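
The pattern the three hunks above introduce is: poll for a condition, then hand any timeout error to requireNoErrorWithDebugging (defined in tests/e2e/test_utils.go below) so diagnostics are captured before the test fails. A minimal usage sketch, assuming the TestEnv global from test_utils.go; the polling loop and the ns and crName variables are illustrative, not code from this commit:

	// Illustrative sketch only: poll until the distribution reports Ready,
	// then fail with full diagnostics on timeout. ns and crName are assumed
	// to hold the CR's namespace and name.
	err := wait.PollUntilContextTimeout(TestEnv.Ctx, 2*time.Second, 5*time.Minute, true,
		func(ctx context.Context) (bool, error) {
			d := &v1alpha1.LlamaStackDistribution{}
			if getErr := TestEnv.Client.Get(ctx, client.ObjectKey{Namespace: ns, Name: crName}, d); getErr != nil {
				return false, nil // not found yet; keep polling
			}
			return d.Status.Phase == v1alpha1.LlamaStackDistributionPhaseReady, nil
		})
	// On timeout this logs CR status, namespace events, pods, service endpoints,
	// service spec, and deployment spec, then fails via require.NoError.
	requireNoErrorWithDebugging(t, TestEnv, err, "distribution never became ready", ns, crName)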

tests/e2e/test_utils.go

Lines changed: 216 additions & 0 deletions
@@ -12,6 +12,7 @@ import (
 	"github.com/llamastack/llama-stack-k8s-operator/api/v1alpha1"
 	"github.com/stretchr/testify/require"
 	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
 	apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
@@ -182,3 +183,218 @@ func GetSampleCR(t *testing.T) *v1alpha1.LlamaStackDistribution {
 
 	return distribution
 }
+
+// checkLlamaStackDistributionStatus helps identify if the custom resource reached the expected state during test execution.
+func checkLlamaStackDistributionStatus(t *testing.T, testenv *TestEnvironment, namespace, name string) {
+	t.Helper()
+
+	llsDistro := &v1alpha1.LlamaStackDistribution{}
+	err := testenv.Client.Get(testenv.Ctx, client.ObjectKey{Namespace: namespace, Name: name}, llsDistro)
+	if err != nil {
+		t.Logf("⚠️ Error getting LlamaStackDistribution: %v", err)
+		return
+	}
+
+	t.Logf("LlamaStackDistribution status:")
+	t.Logf("  Phase: %s", llsDistro.Status.Phase)
+	t.Logf("  Generation: %d", llsDistro.Generation)
+	t.Logf("  ResourceVersion: %s", llsDistro.ResourceVersion)
+	t.Logf("  Conditions: %+v", llsDistro.Status.Conditions)
+}
+
+// checkNamespaceEvents reveals what Kubernetes operations occurred and why they may have failed.
+func checkNamespaceEvents(t *testing.T, testenv *TestEnvironment, namespace string) {
+	t.Helper()
+
+	eventList := &corev1.EventList{}
+	err := testenv.Client.List(testenv.Ctx, eventList, client.InNamespace(namespace))
+	if err != nil {
+		t.Logf("⚠️ Error getting events: %v", err)
+		return
+	}
+
+	if len(eventList.Items) == 0 {
+		t.Log("📝 No events found in namespace")
+		return
+	}
+
+	maxEvents := 25
+	if len(eventList.Items) > maxEvents {
+		t.Logf("📝 Showing first %d events (of %d total):", maxEvents, len(eventList.Items))
+		eventList.Items = eventList.Items[:maxEvents]
+	} else {
+		t.Logf("📝 Found %d events in namespace %s:", len(eventList.Items), namespace)
+	}
+
+	for _, event := range eventList.Items {
+		t.Logf("  %s: %s (%s) - %s",
+			event.LastTimestamp.Format("15:04:05"),
+			event.Reason,
+			event.Type,
+			event.Message)
+	}
+}
+
+// requireNoErrorWithDebugging provides comprehensive debugging context when tests fail to help identify root causes quickly.
+func requireNoErrorWithDebugging(t *testing.T, testenv *TestEnvironment, err error, msg string, namespace, crName string) {
+	t.Helper()
+	if err != nil {
+		t.Logf("💥 ERROR OCCURRED: %s - %v", msg, err)
+
+		// Check custom resource status first to see if the operator processed the request correctly
+		checkLlamaStackDistributionStatus(t, testenv, namespace, crName)
+
+		// Check events to understand what Kubernetes operations were attempted and why they failed
+		checkNamespaceEvents(t, testenv, namespace)
+
+		// Check pod details to identify container startup issues or crash loops
+		logPodDetails(t, testenv, namespace)
+
+		// Check service endpoints to see if pods are being discovered by services
+		logServiceEndpoints(t, testenv, namespace, crName+"-service")
+
+		// Check service configuration to identify selector mismatches
+		logServiceSpec(t, testenv, namespace, crName+"-service")
+
+		// Check deployment spec to identify configuration problems preventing pod startup
+		logDeploymentSpec(t, testenv, namespace, crName)
+
+		require.NoError(t, err, msg)
+	}
+}
+
+// logPodDetails helps diagnose pod startup issues and container restart problems during test failures.
+func logPodDetails(t *testing.T, testenv *TestEnvironment, namespace string) {
+	t.Helper()
+
+	podList := &corev1.PodList{}
+	err := testenv.Client.List(testenv.Ctx, podList, client.InNamespace(namespace))
+	if err != nil {
+		t.Logf("Failed to list pods: %v", err)
+		return
+	}
+
+	t.Logf("📦 Found %d pods in namespace %s:", len(podList.Items), namespace)
+	for _, pod := range podList.Items {
+		t.Logf("Pod: %s, Phase: %s", pod.Name, pod.Status.Phase)
+
+		for _, cs := range pod.Status.ContainerStatuses {
+			// RestartCount indicates crash loops or configuration issues
+			t.Logf("  Container %s: Ready=%v, RestartCount=%d",
+				cs.Name, cs.Ready, cs.RestartCount)
+
+			// Container states reveal why pods aren't starting or are crashing
+			if cs.State.Waiting != nil {
+				t.Logf("    Waiting: %s - %s",
+					cs.State.Waiting.Reason, cs.State.Waiting.Message)
+			}
+			if cs.State.Terminated != nil {
+				t.Logf("    Terminated: %s - %s",
+					cs.State.Terminated.Reason, cs.State.Terminated.Message)
+			}
+		}
+
+		// Pod logs would show startup errors but require different client access
+		t.Logf("  (Pod logs require direct kubectl access)")
+	}
+}
+
+// logServiceEndpoints logs service endpoint details to see if pods are ready.
+func logServiceEndpoints(t *testing.T, testenv *TestEnvironment, namespace, serviceName string) {
+	t.Helper()
+
+	endpoints := &corev1.Endpoints{}
+	err := testenv.Client.Get(testenv.Ctx, types.NamespacedName{
+		Name:      serviceName,
+		Namespace: namespace,
+	}, endpoints)
+
+	if err != nil {
+		t.Logf("Failed to get endpoints for service %s: %v", serviceName, err)
+		return
+	}
+
+	t.Logf("🔗 Service %s endpoints:", serviceName)
+	for i, subset := range endpoints.Subsets {
+		t.Logf("  Subset %d:", i)
+		// Ready addresses indicate pods that passed health checks and can receive traffic
+		t.Logf("    Ready addresses: %d", len(subset.Addresses))
+		for _, addr := range subset.Addresses {
+			t.Logf("      - %s", addr.IP)
+		}
+		// Not ready addresses show pods that exist but failed health checks
+		t.Logf("    Not ready addresses: %d", len(subset.NotReadyAddresses))
+		for _, addr := range subset.NotReadyAddresses {
+			t.Logf("      - %s", addr.IP)
+		}
+		t.Logf("    Ports:")
+		for _, port := range subset.Ports {
+			t.Logf("      - %s: %d", port.Name, port.Port)
+		}
+	}
+}
+
+// logDeploymentSpec helps identify configuration mismatches that prevent pods from starting correctly.
+func logDeploymentSpec(t *testing.T, testenv *TestEnvironment, namespace, name string) {
+	t.Helper()
+
+	deployment := &appsv1.Deployment{}
+	err := testenv.Client.Get(testenv.Ctx, types.NamespacedName{
+		Name:      name,
+		Namespace: namespace,
+	}, deployment)
+
+	if err != nil {
+		t.Logf("Failed to get deployment: %v", err)
+		return
+	}
+
+	t.Logf("🚀 Deployment %s spec:", name)
+	t.Logf("  Replicas: %d", *deployment.Spec.Replicas)
+	// Selector must match pod labels or pods won't be managed by deployment
+	t.Logf("  Selector: %+v", deployment.Spec.Selector.MatchLabels)
+	t.Logf("  Template labels: %+v", deployment.Spec.Template.Labels)
+
+	for _, container := range deployment.Spec.Template.Spec.Containers {
+		t.Logf("  Container: %s", container.Name)
+		t.Logf("    Image: %s", container.Image)
+		t.Logf("    Ports:")
+		for _, port := range container.Ports {
+			t.Logf("      - %d", port.ContainerPort)
+		}
+		// Environment variables can cause startup failures if misconfigured
+		t.Logf("    Env vars:")
+		for _, env := range container.Env {
+			t.Logf("      %s=%s", env.Name, env.Value)
+		}
+		// Readiness probe configuration affects when pods become service endpoints
+		if container.ReadinessProbe != nil {
+			t.Logf("    Readiness probe: %+v", container.ReadinessProbe)
+		}
+	}
+}
+
+// logServiceSpec logs the actual service configuration to debug selector issues.
+func logServiceSpec(t *testing.T, testenv *TestEnvironment, namespace, serviceName string) {
+	t.Helper()
+
+	service := &corev1.Service{}
+	err := testenv.Client.Get(testenv.Ctx, types.NamespacedName{
+		Name:      serviceName,
+		Namespace: namespace,
+	}, service)
+
+	if err != nil {
+		t.Logf("Failed to get service %s: %v", serviceName, err)
+		return
+	}
+
+	t.Logf("🔧 Service %s spec:", serviceName)
+	t.Logf("  Type: %s", service.Spec.Type)
+	// Selector must match pod labels or service won't route traffic to pods
+	t.Logf("  Selector: %+v", service.Spec.Selector)
+	t.Logf("  Ports:")
+	for _, port := range service.Spec.Ports {
+		t.Logf("    - %s: %d -> %s", port.Name, port.Port, port.TargetPort.String())
+	}
+}
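
One follow-up note on logPodDetails above: it points out that pod logs need different client access, since the controller-runtime client has no logs subresource. A minimal sketch of how that gap could be filled, assuming a typed kubernetes.Interface clientset were wired into the test environment (it is not part of this commit; the helper name and tail length are illustrative):

	// logPodLogs is hypothetical (not in this commit): streaming the logs
	// subresource requires a typed clientset from k8s.io/client-go/kubernetes.
	func logPodLogs(ctx context.Context, t *testing.T, clientset kubernetes.Interface, namespace, podName string) {
		t.Helper()

		req := clientset.CoreV1().Pods(namespace).GetLogs(podName, &corev1.PodLogOptions{
			TailLines: ptr.To[int64](50), // the last lines usually cover startup errors
		})
		stream, err := req.Stream(ctx)
		if err != nil {
			t.Logf("Failed to stream logs for pod %s: %v", podName, err)
			return
		}
		defer stream.Close()

		body, err := io.ReadAll(stream) // io and k8s.io/utils/ptr are the extra imports
		if err != nil {
			t.Logf("Failed to read logs for pod %s: %v", podName, err)
			return
		}
		t.Logf("📜 Logs for pod %s:\n%s", podName, body)
	}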
