
Commit 2fa21ea

refactor(controller): delegate health checks to k8s readiness probes (#130)
The controller now relies on the Deployment's readiness status, which is driven by a new readiness probe on the container. This removes the operator's redundant, manual health-checking logic and makes the system more robust by using a standard Kubernetes feature.

Approved-by: rhdedgar
1 parent 80df219 commit 2fa21ea
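
The diff below branches on a `deploymentReady` flag whose computation is not part of this change. As a point of reference, the sketch below shows one conventional way an operator derives such a flag from the Deployment's status once readiness probes gate the pods; the helper name and the exact fields checked are assumptions for illustration, not code from this repository.

```go
package sketch

import (
	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
)

// isDeploymentReady is a hypothetical helper: with a readiness probe in place,
// kubelet marks each pod Ready only after the probe succeeds, and the
// Deployment controller aggregates that into ReadyReplicas and conditions.
func isDeploymentReady(d *appsv1.Deployment) bool {
	// All desired replicas must report Ready (i.e. their probes are passing).
	if d.Spec.Replicas != nil && d.Status.ReadyReplicas < *d.Spec.Replicas {
		return false
	}
	// The Available condition confirms the rollout has enough ready replicas.
	for _, cond := range d.Status.Conditions {
		if cond.Type == appsv1.DeploymentAvailable {
			return cond.Status == corev1.ConditionTrue
		}
	}
	return false
}
```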

File tree: 3 files changed (+80, -69 lines)


controllers/llamastackdistribution_controller.go

Lines changed: 21 additions & 58 deletions
@@ -836,28 +836,6 @@ func (r *LlamaStackDistributionReconciler) getServerURL(instance *llamav1alpha1.
 	}
 }
 
-// checkHealth makes an HTTP request to the health endpoint.
-func (r *LlamaStackDistributionReconciler) checkHealth(ctx context.Context, instance *llamav1alpha1.LlamaStackDistribution) (bool, error) {
-	u := r.getServerURL(instance, "/v1/health")
-
-	client := &http.Client{
-		Timeout: 5 * time.Second,
-	}
-
-	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
-	if err != nil {
-		return false, fmt.Errorf("failed to create health check request: %w", err)
-	}
-
-	resp, err := client.Do(req)
-	if err != nil {
-		return false, fmt.Errorf("failed to make health check request: %w", err)
-	}
-	defer resp.Body.Close()
-
-	return resp.StatusCode == http.StatusOK, nil
-}
-
 // getProviderInfo makes an HTTP request to the providers endpoint.
 func (r *LlamaStackDistributionReconciler) getProviderInfo(ctx context.Context, instance *llamav1alpha1.LlamaStackDistribution) ([]llamav1alpha1.ProviderInfo, error) {
 	u := r.getServerURL(instance, "/v1/providers")
@@ -928,6 +906,7 @@ func (r *LlamaStackDistributionReconciler) getVersionInfo(ctx context.Context, i
 
 // updateStatus refreshes the LlamaStack status.
 func (r *LlamaStackDistributionReconciler) updateStatus(ctx context.Context, instance *llamav1alpha1.LlamaStackDistribution, reconcileErr error) error {
+	logger := log.FromContext(ctx)
 	// Initialize OperatorVersion if not set
 	if instance.Status.Version.OperatorVersion == "" {
 		instance.Status.Version.OperatorVersion = os.Getenv("OPERATOR_VERSION")
@@ -949,7 +928,26 @@ func (r *LlamaStackDistributionReconciler) updateStatus(ctx context.Context, ins
 	r.updateDistributionConfig(instance)
 
 	if deploymentReady {
-		r.performHealthChecks(ctx, instance)
+		instance.Status.Phase = llamav1alpha1.LlamaStackDistributionPhaseReady
+
+		providers, err := r.getProviderInfo(ctx, instance)
+		if err != nil {
+			logger.Error(err, "failed to get provider info, clearing provider list")
+			instance.Status.DistributionConfig.Providers = nil
+		} else {
+			instance.Status.DistributionConfig.Providers = providers
+		}
+
+		version, err := r.getVersionInfo(ctx, instance)
+		if err != nil {
+			logger.Error(err, "failed to get version info from API endpoint")
+			// Don't clear the version if we cant fetch it - keep the existing one
+		} else {
+			instance.Status.Version.LlamaStackServerVersion = version
+			logger.V(1).Info("Updated LlamaStack version from API endpoint", "version", version)
+		}
+
+		SetHealthCheckCondition(&instance.Status, true, MessageHealthCheckPassed)
 	} else {
 		// If not ready, health can't be checked. Set condition appropriately.
 		SetHealthCheckCondition(&instance.Status, false, "Deployment not ready")
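
Note that `SetHealthCheckCondition` and `MessageHealthCheckPassed` are defined elsewhere in the operator and are untouched by this diff. For orientation only, here is a hedged sketch of the conventional pattern such a helper follows (recording the outcome as a status condition); the condition type, reasons, and signature are assumptions, not the repository's actual implementation.

```go
package sketch

import (
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// setHealthCheckCondition is a hypothetical stand-in: it records the health
// outcome as a standard status condition so consumers can watch the resource
// instead of polling the server themselves.
func setHealthCheckCondition(conditions *[]metav1.Condition, healthy bool, message string) {
	status, reason := metav1.ConditionFalse, "HealthCheckFailed"
	if healthy {
		status, reason = metav1.ConditionTrue, "HealthCheckPassed"
	}
	meta.SetStatusCondition(conditions, metav1.Condition{
		Type:    "HealthChecked",
		Status:  status,
		Reason:  reason,
		Message: message,
	})
}
```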
@@ -1046,41 +1044,6 @@ func (r *LlamaStackDistributionReconciler) updateDistributionConfig(instance *ll
 	instance.Status.DistributionConfig.ActiveDistribution = activeDistribution
 }
 
-func (r *LlamaStackDistributionReconciler) performHealthChecks(ctx context.Context, instance *llamav1alpha1.LlamaStackDistribution) {
-	logger := log.FromContext(ctx)
-
-	healthy, err := r.checkHealth(ctx, instance)
-	switch {
-	case err != nil:
-		instance.Status.Phase = llamav1alpha1.LlamaStackDistributionPhaseInitializing
-		SetHealthCheckCondition(&instance.Status, false, fmt.Sprintf("Health check failed: %v", err))
-	case !healthy:
-		instance.Status.Phase = llamav1alpha1.LlamaStackDistributionPhaseFailed
-		SetHealthCheckCondition(&instance.Status, false, MessageHealthCheckFailed)
-	default:
-		instance.Status.Phase = llamav1alpha1.LlamaStackDistributionPhaseReady
-		SetHealthCheckCondition(&instance.Status, true, MessageHealthCheckPassed)
-	}
-
-	providers, err := r.getProviderInfo(ctx, instance)
-	if err != nil {
-		logger.Error(err, "failed to get provider info, clearing provider list")
-		instance.Status.DistributionConfig.Providers = nil
-	} else {
-		instance.Status.DistributionConfig.Providers = providers
-	}
-
-	// Get version information from the API endpoint
-	version, err := r.getVersionInfo(ctx, instance)
-	if err != nil {
-		logger.Error(err, "failed to get version info from API endpoint")
-		// Don't clear the version if we cant fetch it - keep the existing one
-	} else {
-		instance.Status.Version.LlamaStackServerVersion = version
-		logger.V(1).Info("Updated LlamaStack version from API endpoint", "version", version)
-	}
-}
-
 // reconcileNetworkPolicy manages the NetworkPolicy for the LlamaStack server.
 func (r *LlamaStackDistributionReconciler) reconcileNetworkPolicy(ctx context.Context, instance *llamav1alpha1.LlamaStackDistribution) error {
 	logger := log.FromContext(ctx)

controllers/resource_helper.go

Lines changed: 23 additions & 0 deletions
@@ -25,6 +25,7 @@ import (
 
 	llamav1alpha1 "github.com/llamastack/llama-stack-k8s-operator/api/v1alpha1"
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	"k8s.io/utils/ptr"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 )
@@ -36,6 +37,15 @@ const (
 	maxConfigMapKeyLength = 253
 )
 
+// Readiness probe configuration.
+const (
+	readinessProbeInitialDelaySeconds = 15 // Time to wait before the first probe
+	readinessProbePeriodSeconds       = 10 // How often to probe
+	readinessProbeTimeoutSeconds      = 5  // When the probe times out
+	readinessProbeFailureThreshold    = 3  // Pod is marked Unhealthy after 3 consecutive failures
+	readinessProbeSuccessThreshold    = 1  // Pod is marked Ready after 1 successful probe
+)
+
 // validConfigMapKeyRegex defines allowed characters for ConfigMap keys.
 // Kubernetes ConfigMap keys must be valid DNS subdomain names or data keys.
 var validConfigMapKeyRegex = regexp.MustCompile(`^[a-zA-Z0-9]([a-zA-Z0-9\-_.]*[a-zA-Z0-9])?$`)
@@ -70,6 +80,19 @@ func buildContainerSpec(ctx context.Context, r *LlamaStackDistributionReconciler
 		Resources:       instance.Spec.Server.ContainerSpec.Resources,
 		ImagePullPolicy: corev1.PullAlways,
 		Ports:           []corev1.ContainerPort{{ContainerPort: getContainerPort(instance)}},
+		ReadinessProbe: &corev1.Probe{
+			ProbeHandler: corev1.ProbeHandler{
+				HTTPGet: &corev1.HTTPGetAction{
+					Path: "/v1/health",
+					Port: intstr.FromInt(int(getContainerPort(instance))),
+				},
+			},
+			InitialDelaySeconds: readinessProbeInitialDelaySeconds,
+			PeriodSeconds:       readinessProbePeriodSeconds,
+			TimeoutSeconds:      readinessProbeTimeoutSeconds,
+			FailureThreshold:    readinessProbeFailureThreshold,
+			SuccessThreshold:    readinessProbeSuccessThreshold,
+		},
 	}
 
 	// Configure environment variables and mounts
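
The probe added above effectively moves the deleted `checkHealth` request into kubelet: on every period, kubelet issues an HTTP GET against `/v1/health` on the container port, bounded by the configured timeout. A rough sketch of a single probe attempt follows, for illustration only; note that kubelet counts any status code in the 200-399 range as success, slightly looser than the removed code's strict 200 check.

```go
package sketch

import (
	"fmt"
	"net/http"
	"time"
)

// probeOnce approximates one kubelet HTTP readiness probe attempt: GET the
// configured path on the container port, bounded by the probe timeout, and
// treat any status code in [200, 400) as success.
func probeOnce(host string, port int32, path string, timeout time.Duration) (bool, error) {
	client := &http.Client{Timeout: timeout}
	resp, err := client.Get(fmt.Sprintf("http://%s:%d%s", host, port, path))
	if err != nil {
		return false, err // network errors count as probe failures
	}
	defer resp.Body.Close()
	return resp.StatusCode >= 200 && resp.StatusCode < 400, nil
}
```

With the constants above, the first probe fires about 15 seconds after the container starts, and a pod is marked not-ready after roughly 30 seconds of consecutive failures (3 failures at a 10-second period).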

controllers/resource_helper_test.go

Lines changed: 36 additions & 11 deletions
@@ -29,6 +29,7 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
 )
 
 func TestBuildContainerSpec(t *testing.T) {
@@ -49,9 +50,10 @@ func TestBuildContainerSpec(t *testing.T) {
 			},
 			image: "test-image:latest",
 			expectedResult: corev1.Container{
-				Name:  llamav1alpha1.DefaultContainerName,
-				Image: "test-image:latest",
-				Ports: []corev1.ContainerPort{{ContainerPort: llamav1alpha1.DefaultServerPort}},
+				Name:           llamav1alpha1.DefaultContainerName,
+				Image:          "test-image:latest",
+				Ports:          []corev1.ContainerPort{{ContainerPort: llamav1alpha1.DefaultServerPort}},
+				ReadinessProbe: newDefaultReadinessProbe(llamav1alpha1.DefaultServerPort),
 				VolumeMounts: []corev1.VolumeMount{{
 					Name:      "lls-storage",
 					MountPath: llamav1alpha1.DefaultMountPath,
@@ -87,9 +89,10 @@ func TestBuildContainerSpec(t *testing.T) {
 			},
 			image: "test-image:latest",
 			expectedResult: corev1.Container{
-				Name:  "custom-container",
-				Image: "test-image:latest",
-				Ports: []corev1.ContainerPort{{ContainerPort: 9000}},
+				Name:           "custom-container",
+				Image:          "test-image:latest",
+				Ports:          []corev1.ContainerPort{{ContainerPort: 9000}},
+				ReadinessProbe: newDefaultReadinessProbe(9000),
 				Resources: corev1.ResourceRequirements{
 					Limits: corev1.ResourceList{
 						corev1.ResourceCPU: resource.MustParse("1"),
@@ -121,11 +124,12 @@ func TestBuildContainerSpec(t *testing.T) {
 			},
 			image: "test-image:latest",
 			expectedResult: corev1.Container{
-				Name:    llamav1alpha1.DefaultContainerName,
-				Image:   "test-image:latest",
-				Command: []string{"/custom/entrypoint.sh"},
-				Args:    []string{"--config", "/etc/config.yaml", "--debug"},
-				Ports:   []corev1.ContainerPort{{ContainerPort: llamav1alpha1.DefaultServerPort}},
+				Name:           llamav1alpha1.DefaultContainerName,
+				Image:          "test-image:latest",
+				Command:        []string{"/custom/entrypoint.sh"},
+				Args:           []string{"--config", "/etc/config.yaml", "--debug"},
+				Ports:          []corev1.ContainerPort{{ContainerPort: llamav1alpha1.DefaultServerPort}},
+				ReadinessProbe: newDefaultReadinessProbe(llamav1alpha1.DefaultServerPort),
 				VolumeMounts: []corev1.VolumeMount{{
 					Name:      "lls-storage",
 					MountPath: llamav1alpha1.DefaultMountPath,
@@ -156,6 +160,7 @@ func TestBuildContainerSpec(t *testing.T) {
 				Image:           "test-image:latest",
 				ImagePullPolicy: corev1.PullAlways,
 				Ports:           []corev1.ContainerPort{{ContainerPort: llamav1alpha1.DefaultServerPort}},
+				ReadinessProbe:  newDefaultReadinessProbe(llamav1alpha1.DefaultServerPort),
 				Command:         []string{"python", "-m", "llama_stack.distribution.server.server"},
 				Args:            []string{"--config", "/etc/llama-stack/run.yaml"},
 				Env: []corev1.EnvVar{
@@ -187,6 +192,7 @@ func TestBuildContainerSpec(t *testing.T) {
 			assert.Equal(t, tc.expectedResult.VolumeMounts, result.VolumeMounts)
 			assert.Equal(t, tc.expectedResult.Command, result.Command)
 			assert.Equal(t, tc.expectedResult.Args, result.Args)
+			assert.Equal(t, tc.expectedResult.ReadinessProbe, result.ReadinessProbe)
 		})
 	}
 }
@@ -641,3 +647,22 @@ func TestValidateConfigMapKeys(t *testing.T) {
 		})
 	}
 }
+
+// newDefaultReadinessProbe returns a Kubernetes HTTP readiness probe that checks
+// the "/v1/health" endpoint on the given port using default timing and
+// threshold settings.
+func newDefaultReadinessProbe(port int32) *corev1.Probe {
+	return &corev1.Probe{
+		ProbeHandler: corev1.ProbeHandler{
+			HTTPGet: &corev1.HTTPGetAction{
+				Path: "/v1/health",
+				Port: intstr.FromInt(int(port)),
+			},
+		},
+		InitialDelaySeconds: readinessProbeInitialDelaySeconds,
+		PeriodSeconds:       readinessProbePeriodSeconds,
+		TimeoutSeconds:      readinessProbeTimeoutSeconds,
+		FailureThreshold:    readinessProbeFailureThreshold,
+		SuccessThreshold:    readinessProbeSuccessThreshold,
+	}
+}
