Skip to content

Commit 5c03fe0

Browse files
committed
feat: Add support for uvicorn workers for llama-stack
Signed-off-by: Vaishnavi Hire <vhire@redhat.com>
1 parent 16feef0 commit 5c03fe0

File tree

8 files changed

+106
-4
lines changed

8 files changed

+106
-4
lines changed

api/v1alpha1/llamastackdistribution_types.go

Lines changed: 6 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/llamastack.io_llamastackdistributions.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2568,6 +2568,13 @@ spec:
25682568
required:
25692569
- configMapName
25702570
type: object
2571+
workers:
2572+
description: |-
2573+
Workers configures the number of uvicorn worker processes to run.
2574+
When set, the operator will launch llama-stack using uvicorn with the specified worker count.
2575+
format: int32
2576+
minimum: 1
2577+
type: integer
25712578
required:
25722579
- distribution
25732580
type: object

config/samples/_v1alpha1_llamastackdistribution.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ spec:
1414
name: llama-stack
1515
distribution:
1616
name: starter
17+
workers: 2
1718
podDisruptionBudget:
1819
minAvailable: 1
1920
topologySpreadConstraints:

controllers/resource_helper.go

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"errors"
2222
"fmt"
2323
"regexp"
24+
"strconv"
2425
"strings"
2526

2627
llamav1alpha1 "github.com/llamastack/llama-stack-k8s-operator/api/v1alpha1"
@@ -91,21 +92,26 @@ try:
9192
print('Using core module path (llama_stack.core.server.server)', file=sys.stderr)
9293
print(1)
9394
else:
94-
print('Using new CLI command (llama stack run)', file=sys.stderr)
95+
print('Using uvicorn CLI command', file=sys.stderr)
9596
print(2)
9697
except Exception as e:
9798
print(f'Version detection failed, defaulting to new CLI: {e}', file=sys.stderr)
9899
print(2)
99100
")
100101
102+
PORT=${LLS_PORT:-8321}
103+
WORKERS=${LLS_WORKERS:-1}
104+
101105
# Execute the appropriate CLI based on version
102106
case $VERSION_CODE in
103107
0) python3 -m llama_stack.distribution.server.server --config /etc/llama-stack/run.yaml ;;
104108
1) python3 -m llama_stack.core.server.server /etc/llama-stack/run.yaml ;;
105-
2) llama stack run /etc/llama-stack/run.yaml ;;
106-
*) echo "Invalid version code: $VERSION_CODE, using new CLI"; llama stack run /etc/llama-stack/run.yaml ;;
109+
2) exec uvicorn llama_stack.core.server.server:create_app --host 0.0.0.0 --port "$PORT" --workers "$WORKERS" --factory ;;
110+
*) exec uvicorn llama_stack.core.server.server:create_app --host 0.0.0.0 --port "$PORT" --workers "$WORKERS" --factory ;;
107111
esac`
108112

113+
const llamaStackConfigPath = "/etc/llama-stack/run.yaml"
114+
109115
// validateConfigMapKeys validates that all ConfigMap keys contain only safe characters.
110116
// Note: This function validates key names only. PEM content validation is performed
111117
// separately in the controller's reconcileCABundleConfigMap function.
@@ -227,6 +233,27 @@ func configureContainerEnvironment(ctx context.Context, r *LlamaStackDistributio
227233
})
228234
}
229235

236+
// Always provide worker/port/config env for uvicorn; workers default to 1 when unspecified.
237+
workers := instance.Spec.Server.Workers
238+
if workers == nil {
239+
defaultWorkers := int32(1)
240+
workers = &defaultWorkers
241+
}
242+
container.Env = append(container.Env,
243+
corev1.EnvVar{
244+
Name: "LLS_WORKERS",
245+
Value: strconv.Itoa(int(*workers)),
246+
},
247+
corev1.EnvVar{
248+
Name: "LLS_PORT",
249+
Value: strconv.Itoa(int(getContainerPort(instance))),
250+
},
251+
corev1.EnvVar{
252+
Name: "LLAMA_STACK_CONFIG",
253+
Value: llamaStackConfigPath,
254+
},
255+
)
256+
230257
// Finally, add the user provided env vars
231258
container.Env = append(container.Env, instance.Spec.Server.ContainerSpec.Env...)
232259
}

controllers/resource_helper_test.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ import (
3232
"k8s.io/apimachinery/pkg/util/intstr"
3333
)
3434

35+
func int32Ptr(val int32) *int32 {
36+
return &val
37+
}
38+
3539
func TestBuildContainerSpec(t *testing.T) {
3640
testCases := []struct {
3741
name string
@@ -66,6 +70,9 @@ func TestBuildContainerSpec(t *testing.T) {
6670
}},
6771
Env: []corev1.EnvVar{
6872
{Name: "HF_HOME", Value: "/.llama"},
73+
{Name: "LLS_WORKERS", Value: "1"},
74+
{Name: "LLS_PORT", Value: "8321"},
75+
{Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
6976
},
7077
},
7178
},
@@ -111,6 +118,9 @@ func TestBuildContainerSpec(t *testing.T) {
111118
},
112119
Env: []corev1.EnvVar{
113120
{Name: "HF_HOME", Value: "/custom/path"},
121+
{Name: "LLS_WORKERS", Value: "1"},
122+
{Name: "LLS_PORT", Value: "9000"},
123+
{Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
114124
{Name: "TEST_ENV", Value: "test-value"},
115125
},
116126
VolumeMounts: []corev1.VolumeMount{{
@@ -152,7 +162,43 @@ func TestBuildContainerSpec(t *testing.T) {
152162
}},
153163
Env: []corev1.EnvVar{
154164
{Name: "HF_HOME", Value: "/.llama"},
165+
{Name: "LLS_WORKERS", Value: "1"},
166+
{Name: "LLS_PORT", Value: "8321"},
167+
{Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
168+
},
169+
},
170+
},
171+
{
172+
name: "uvicorn workers configured",
173+
instance: &llamav1alpha1.LlamaStackDistribution{
174+
Spec: llamav1alpha1.LlamaStackDistributionSpec{
175+
Server: llamav1alpha1.ServerSpec{
176+
Workers: int32Ptr(4),
177+
},
178+
},
179+
},
180+
image: "test-image:latest",
181+
expectedResult: corev1.Container{
182+
Name: llamav1alpha1.DefaultContainerName,
183+
Image: "test-image:latest",
184+
Resources: corev1.ResourceRequirements{
185+
Requests: corev1.ResourceList{
186+
corev1.ResourceCPU: llamav1alpha1.DefaultServerCPURequest,
187+
corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
188+
},
189+
},
190+
Ports: []corev1.ContainerPort{{ContainerPort: llamav1alpha1.DefaultServerPort}},
191+
StartupProbe: newDefaultStartupProbe(llamav1alpha1.DefaultServerPort),
192+
Env: []corev1.EnvVar{
193+
{Name: "HF_HOME", Value: "/.llama"},
194+
{Name: "LLS_WORKERS", Value: "4"},
195+
{Name: "LLS_PORT", Value: "8321"},
196+
{Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
155197
},
198+
VolumeMounts: []corev1.VolumeMount{{
199+
Name: "lls-storage",
200+
MountPath: llamav1alpha1.DefaultMountPath,
201+
}},
156202
},
157203
},
158204
{
@@ -187,6 +233,9 @@ func TestBuildContainerSpec(t *testing.T) {
187233
Args: []string{},
188234
Env: []corev1.EnvVar{
189235
{Name: "HF_HOME", Value: llamav1alpha1.DefaultMountPath},
236+
{Name: "LLS_WORKERS", Value: "1"},
237+
{Name: "LLS_PORT", Value: "8321"},
238+
{Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
190239
},
191240
VolumeMounts: []corev1.VolumeMount{
192241
{

docs/api-overview.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ _Appears in:_
218218
| --- | --- | --- | --- |
219219
| `distribution` _[DistributionType](#distributiontype)_ | | | |
220220
| `containerSpec` _[ContainerSpec](#containerspec)_ | | | |
221+
| `workers` _integer_ | Workers configures the number of uvicorn worker processes to run.<br />When set, the operator will launch llama-stack using uvicorn with the specified worker count. | | Minimum: 1 <br /> |
221222
| `podOverrides` _[PodOverrides](#podoverrides)_ | | | |
222223
| `podDisruptionBudget` _[PodDisruptionBudgetSpec](#poddisruptionbudgetspec)_ | PodDisruptionBudget controls voluntary disruption tolerance for the server pods | | |
223224
| `topologySpreadConstraints` _[TopologySpreadConstraint](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#topologyspreadconstraint-v1-core) array_ | TopologySpreadConstraints defines fine-grained spreading rules | | |

release/operator.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2577,6 +2577,13 @@ spec:
25772577
required:
25782578
- configMapName
25792579
type: object
2580+
workers:
2581+
description: |-
2582+
Workers configures the number of uvicorn worker processes to run.
2583+
When set, the operator will launch llama-stack using uvicorn with the specified worker count.
2584+
format: int32
2585+
minimum: 1
2586+
type: integer
25802587
required:
25812588
- distribution
25822589
type: object

0 commit comments

Comments (0)