
Commit 2fa21ea

refactor(controller): delegate health checks to k8s readiness probes (#130)
The controller now relies on the Deployment's readiness status, which is driven by a new readiness probe on the container. This removes the operator's redundant, manual health-checking logic and makes the system more robust by using a standard Kubernetes feature.

Approved-by: rhdedgar
1 parent 80df219 commit 2fa21ea
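
The diff below branches on a `deploymentReady` flag whose computation is not part of this change. As a point of reference, the sketch below shows one conventional way an operator derives such a flag from the Deployment's status once readiness probes gate the pods; the helper name and the exact fields checked are assumptions for illustration, not code from this repository.

```go
package sketch

import (
	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
)

// isDeploymentReady is a hypothetical helper: with a readiness probe in place,
// kubelet marks each pod Ready only after the probe succeeds, and the
// Deployment controller aggregates that into ReadyReplicas and conditions.
func isDeploymentReady(d *appsv1.Deployment) bool {
	// All desired replicas must report Ready (i.e. their probes are passing).
	if d.Spec.Replicas != nil && d.Status.ReadyReplicas < *d.Spec.Replicas {
		return false
	}
	// The Available condition confirms the rollout has enough ready replicas.
	for _, cond := range d.Status.Conditions {
		if cond.Type == appsv1.DeploymentAvailable {
			return cond.Status == corev1.ConditionTrue
		}
	}
	return false
}
```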

File tree: 3 files changed (+80, -69 lines)


controllers/llamastackdistribution_controller.go

Lines changed: 21 additions & 58 deletions
@@ -836,28 +836,6 @@ func (r *LlamaStackDistributionReconciler) getServerURL(instance *llamav1alpha1.
 	}
 }
 
-// checkHealth makes an HTTP request to the health endpoint.
-func (r *LlamaStackDistributionReconciler) checkHealth(ctx context.Context, instance *llamav1alpha1.LlamaStackDistribution) (bool, error) {
-	u := r.getServerURL(instance, "/v1/health")
-
-	client := &http.Client{
-		Timeout: 5 * time.Second,
-	}
-
-	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
-	if err != nil {
-		return false, fmt.Errorf("failed to create health check request: %w", err)
-	}
-
-	resp, err := client.Do(req)
-	if err != nil {
-		return false, fmt.Errorf("failed to make health check request: %w", err)
-	}
-	defer resp.Body.Close()
-
-	return resp.StatusCode == http.StatusOK, nil
-}
-
 // getProviderInfo makes an HTTP request to the providers endpoint.
 func (r *LlamaStackDistributionReconciler) getProviderInfo(ctx context.Context, instance *llamav1alpha1.LlamaStackDistribution) ([]llamav1alpha1.ProviderInfo, error) {
 	u := r.getServerURL(instance, "/v1/providers")
@@ -928,6 +906,7 @@ func (r *LlamaStackDistributionReconciler) getVersionInfo(ctx context.Context, i
 
 // updateStatus refreshes the LlamaStack status.
 func (r *LlamaStackDistributionReconciler) updateStatus(ctx context.Context, instance *llamav1alpha1.LlamaStackDistribution, reconcileErr error) error {
+	logger := log.FromContext(ctx)
 	// Initialize OperatorVersion if not set
 	if instance.Status.Version.OperatorVersion == "" {
 		instance.Status.Version.OperatorVersion = os.Getenv("OPERATOR_VERSION")
@@ -949,7 +928,26 @@ func (r *LlamaStackDistributionReconciler) updateStatus(ctx context.Context, ins
 	r.updateDistributionConfig(instance)
 
 	if deploymentReady {
-		r.performHealthChecks(ctx, instance)
+		instance.Status.Phase = llamav1alpha1.LlamaStackDistributionPhaseReady
+
+		providers, err := r.getProviderInfo(ctx, instance)
+		if err != nil {
+			logger.Error(err, "failed to get provider info, clearing provider list")
+			instance.Status.DistributionConfig.Providers = nil
+		} else {
+			instance.Status.DistributionConfig.Providers = providers
+		}
+
+		version, err := r.getVersionInfo(ctx, instance)
+		if err != nil {
+			logger.Error(err, "failed to get version info from API endpoint")
+			// Don't clear the version if we cant fetch it - keep the existing one
+		} else {
+			instance.Status.Version.LlamaStackServerVersion = version
+			logger.V(1).Info("Updated LlamaStack version from API endpoint", "version", version)
+		}
+
+		SetHealthCheckCondition(&instance.Status, true, MessageHealthCheckPassed)
 	} else {
 		// If not ready, health can't be checked. Set condition appropriately.
 		SetHealthCheckCondition(&instance.Status, false, "Deployment not ready")
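
Note that `SetHealthCheckCondition` and `MessageHealthCheckPassed` are defined elsewhere in the operator and are untouched by this diff. For orientation only, here is a hedged sketch of the conventional pattern such a helper follows (recording the outcome as a status condition); the condition type, reasons, and signature are assumptions, not the repository's actual implementation.

```go
package sketch

import (
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// setHealthCheckCondition is a hypothetical stand-in: it records the health
// outcome as a standard status condition so consumers can watch the resource
// instead of polling the server themselves.
func setHealthCheckCondition(conditions *[]metav1.Condition, healthy bool, message string) {
	status, reason := metav1.ConditionFalse, "HealthCheckFailed"
	if healthy {
		status, reason = metav1.ConditionTrue, "HealthCheckPassed"
	}
	meta.SetStatusCondition(conditions, metav1.Condition{
		Type:    "HealthChecked",
		Status:  status,
		Reason:  reason,
		Message: message,
	})
}
```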
@@ -1046,41 +1044,6 @@ func (r *LlamaStackDistributionReconciler) updateDistributionConfig(instance *ll
 	instance.Status.DistributionConfig.ActiveDistribution = activeDistribution
 }
 
-func (r *LlamaStackDistributionReconciler) performHealthChecks(ctx context.Context, instance *llamav1alpha1.LlamaStackDistribution) {
-	logger := log.FromContext(ctx)
-
-	healthy, err := r.checkHealth(ctx, instance)
-	switch {
-	case err != nil:
-		instance.Status.Phase = llamav1alpha1.LlamaStackDistributionPhaseInitializing
-		SetHealthCheckCondition(&instance.Status, false, fmt.Sprintf("Health check failed: %v", err))
-	case !healthy:
-		instance.Status.Phase = llamav1alpha1.LlamaStackDistributionPhaseFailed
-		SetHealthCheckCondition(&instance.Status, false, MessageHealthCheckFailed)
-	default:
-		instance.Status.Phase = llamav1alpha1.LlamaStackDistributionPhaseReady
-		SetHealthCheckCondition(&instance.Status, true, MessageHealthCheckPassed)
-	}
-
-	providers, err := r.getProviderInfo(ctx, instance)
-	if err != nil {
-		logger.Error(err, "failed to get provider info, clearing provider list")
-		instance.Status.DistributionConfig.Providers = nil
-	} else {
-		instance.Status.DistributionConfig.Providers = providers
-	}
-
-	// Get version information from the API endpoint
-	version, err := r.getVersionInfo(ctx, instance)
-	if err != nil {
-		logger.Error(err, "failed to get version info from API endpoint")
-		// Don't clear the version if we cant fetch it - keep the existing one
-	} else {
-		instance.Status.Version.LlamaStackServerVersion = version
-		logger.V(1).Info("Updated LlamaStack version from API endpoint", "version", version)
-	}
-}
-
 // reconcileNetworkPolicy manages the NetworkPolicy for the LlamaStack server.
 func (r *LlamaStackDistributionReconciler) reconcileNetworkPolicy(ctx context.Context, instance *llamav1alpha1.LlamaStackDistribution) error {
 	logger := log.FromContext(ctx)

controllers/resource_helper.go

Lines changed: 23 additions & 0 deletions
@@ -25,6 +25,7 @@ import (
 
 	llamav1alpha1 "github.com/llamastack/llama-stack-k8s-operator/api/v1alpha1"
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	"k8s.io/utils/ptr"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 )
@@ -36,6 +37,15 @@ const (
 	maxConfigMapKeyLength = 253
 )
 
+// Readiness probe configuration.
+const (
+	readinessProbeInitialDelaySeconds = 15 // Time to wait before the first probe
+	readinessProbePeriodSeconds       = 10 // How often to probe
+	readinessProbeTimeoutSeconds      = 5  // When the probe times out
+	readinessProbeFailureThreshold    = 3  // Pod is marked Unhealthy after 3 consecutive failures
+	readinessProbeSuccessThreshold    = 1  // Pod is marked Ready after 1 successful probe
+)
+
 // validConfigMapKeyRegex defines allowed characters for ConfigMap keys.
 // Kubernetes ConfigMap keys must be valid DNS subdomain names or data keys.
 var validConfigMapKeyRegex = regexp.MustCompile(`^[a-zA-Z0-9]([a-zA-Z0-9\-_.]*[a-zA-Z0-9])?$`)
@@ -70,6 +80,19 @@ func buildContainerSpec(ctx context.Context, r *LlamaStackDistributionReconciler
 		Resources:       instance.Spec.Server.ContainerSpec.Resources,
 		ImagePullPolicy: corev1.PullAlways,
 		Ports:           []corev1.ContainerPort{{ContainerPort: getContainerPort(instance)}},
+		ReadinessProbe: &corev1.Probe{
+			ProbeHandler: corev1.ProbeHandler{
+				HTTPGet: &corev1.HTTPGetAction{
+					Path: "/v1/health",
+					Port: intstr.FromInt(int(getContainerPort(instance))),
+				},
+			},
+			InitialDelaySeconds: readinessProbeInitialDelaySeconds,
+			PeriodSeconds:       readinessProbePeriodSeconds,
+			TimeoutSeconds:      readinessProbeTimeoutSeconds,
+			FailureThreshold:    readinessProbeFailureThreshold,
+			SuccessThreshold:    readinessProbeSuccessThreshold,
+		},
 	}
 
 	// Configure environment variables and mounts
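
The probe added above effectively moves the deleted `checkHealth` request into kubelet: on every period, kubelet issues an HTTP GET against `/v1/health` on the container port, bounded by the configured timeout. A rough sketch of a single probe attempt follows, for illustration only; note that kubelet counts any status code in the 200-399 range as success, slightly looser than the removed code's strict 200 check.

```go
package sketch

import (
	"fmt"
	"net/http"
	"time"
)

// probeOnce approximates one kubelet HTTP readiness probe attempt: GET the
// configured path on the container port, bounded by the probe timeout, and
// treat any status code in [200, 400) as success.
func probeOnce(host string, port int32, path string, timeout time.Duration) (bool, error) {
	client := &http.Client{Timeout: timeout}
	resp, err := client.Get(fmt.Sprintf("http://%s:%d%s", host, port, path))
	if err != nil {
		return false, err // network errors count as probe failures
	}
	defer resp.Body.Close()
	return resp.StatusCode >= 200 && resp.StatusCode < 400, nil
}
```

With the constants above, the first probe fires about 15 seconds after the container starts, and a pod is marked not-ready after roughly 30 seconds of consecutive failures (3 failures at a 10-second period).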

controllers/resource_helper_test.go

Lines changed: 36 additions & 11 deletions
@@ -29,6 +29,7 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
 )
 
 func TestBuildContainerSpec(t *testing.T) {
@@ -49,9 +50,10 @@ func TestBuildContainerSpec(t *testing.T) {
 			},
 			image: "test-image:latest",
 			expectedResult: corev1.Container{
-				Name:  llamav1alpha1.DefaultContainerName,
-				Image: "test-image:latest",
-				Ports: []corev1.ContainerPort{{ContainerPort: llamav1alpha1.DefaultServerPort}},
+				Name:           llamav1alpha1.DefaultContainerName,
+				Image:          "test-image:latest",
+				Ports:          []corev1.ContainerPort{{ContainerPort: llamav1alpha1.DefaultServerPort}},
+				ReadinessProbe: newDefaultReadinessProbe(llamav1alpha1.DefaultServerPort),
 				VolumeMounts: []corev1.VolumeMount{{
 					Name:      "lls-storage",
 					MountPath: llamav1alpha1.DefaultMountPath,
@@ -87,9 +89,10 @@ func TestBuildContainerSpec(t *testing.T) {
 			},
 			image: "test-image:latest",
 			expectedResult: corev1.Container{
-				Name:  "custom-container",
-				Image: "test-image:latest",
-				Ports: []corev1.ContainerPort{{ContainerPort: 9000}},
+				Name:           "custom-container",
+				Image:          "test-image:latest",
+				Ports:          []corev1.ContainerPort{{ContainerPort: 9000}},
+				ReadinessProbe: newDefaultReadinessProbe(9000),
 				Resources: corev1.ResourceRequirements{
 					Limits: corev1.ResourceList{
 						corev1.ResourceCPU: resource.MustParse("1"),
@@ -121,11 +124,12 @@ func TestBuildContainerSpec(t *testing.T) {
 			},
 			image: "test-image:latest",
 			expectedResult: corev1.Container{
-				Name:    llamav1alpha1.DefaultContainerName,
-				Image:   "test-image:latest",
-				Command: []string{"/custom/entrypoint.sh"},
-				Args:    []string{"--config", "/etc/config.yaml", "--debug"},
-				Ports:   []corev1.ContainerPort{{ContainerPort: llamav1alpha1.DefaultServerPort}},
+				Name:           llamav1alpha1.DefaultContainerName,
+				Image:          "test-image:latest",
+				Command:        []string{"/custom/entrypoint.sh"},
+				Args:           []string{"--config", "/etc/config.yaml", "--debug"},
+				Ports:          []corev1.ContainerPort{{ContainerPort: llamav1alpha1.DefaultServerPort}},
+				ReadinessProbe: newDefaultReadinessProbe(llamav1alpha1.DefaultServerPort),
 				VolumeMounts: []corev1.VolumeMount{{
 					Name:      "lls-storage",
 					MountPath: llamav1alpha1.DefaultMountPath,
@@ -156,6 +160,7 @@ func TestBuildContainerSpec(t *testing.T) {
 				Image:           "test-image:latest",
 				ImagePullPolicy: corev1.PullAlways,
 				Ports:           []corev1.ContainerPort{{ContainerPort: llamav1alpha1.DefaultServerPort}},
+				ReadinessProbe:  newDefaultReadinessProbe(llamav1alpha1.DefaultServerPort),
 				Command:         []string{"python", "-m", "llama_stack.distribution.server.server"},
 				Args:            []string{"--config", "/etc/llama-stack/run.yaml"},
 				Env: []corev1.EnvVar{
@@ -187,6 +192,7 @@ func TestBuildContainerSpec(t *testing.T) {
 			assert.Equal(t, tc.expectedResult.VolumeMounts, result.VolumeMounts)
 			assert.Equal(t, tc.expectedResult.Command, result.Command)
 			assert.Equal(t, tc.expectedResult.Args, result.Args)
+			assert.Equal(t, tc.expectedResult.ReadinessProbe, result.ReadinessProbe)
 		})
 	}
 }
@@ -641,3 +647,22 @@ func TestValidateConfigMapKeys(t *testing.T) {
 		})
 	}
 }
+
+// newDefaultReadinessProbe returns a Kubernetes HTTP readiness probe that checks
+// the "/v1/health" endpoint on the given port using default timing and
+// threshold settings.
+func newDefaultReadinessProbe(port int32) *corev1.Probe {
+	return &corev1.Probe{
+		ProbeHandler: corev1.ProbeHandler{
+			HTTPGet: &corev1.HTTPGetAction{
+				Path: "/v1/health",
+				Port: intstr.FromInt(int(port)),
+			},
+		},
+		InitialDelaySeconds: readinessProbeInitialDelaySeconds,
+		PeriodSeconds:       readinessProbePeriodSeconds,
+		TimeoutSeconds:      readinessProbeTimeoutSeconds,
+		FailureThreshold:    readinessProbeFailureThreshold,
+		SuccessThreshold:    readinessProbeSuccessThreshold,
+	}
+}
