Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,35 @@ kubectl apply -f feature-flags.yaml
Within the next reconciliation loop the operator will begin creating a `<name>-network-policy` resource for each distribution.
Set `enabled: false` (or remove the block) to turn the feature back off; the operator will delete the previously managed policies.

## Image Mapping Overrides

The operator supports ConfigMap-driven image updates for LLS Distribution images. This allows independent patching for security fixes or bug fixes without requiring a new operator version.

### Configuration

Create or update the operator ConfigMap with an `image-overrides` key:

```yaml

image-overrides: |
starter-gpu: quay.io/custom/llama-stack:starter-gpu
starter: quay.io/custom/llama-stack:starter
```

### Configuration Format

Use the distribution name directly as the key (e.g., `starter-gpu`, `starter`). The operator will apply these overrides automatically.

### Example Usage

To update the LLS Distribution image for all `starter` distributions:

```bash
kubectl patch configmap llama-stack-operator-config -n llama-stack-k8s-operator-system --type merge -p '{"data":{"image-overrides":"starter: quay.io/opendatahub/llama-stack:latest"}}'
```
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is admittedly harder when the version is stored in a map's value — so my earlier suggestion of putting the version into a value, not the key, might make these kinds of updates harder.


This will cause all LlamaStackDistribution resources using the `starter` distribution to restart with the new image.

## Developer Guide

### Prerequisites
Expand Down
172 changes: 142 additions & 30 deletions controllers/llamastackdistribution_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (

"github.com/go-logr/logr"
"github.com/google/go-cmp/cmp"
"github.com/google/go-containerregistry/pkg/name"
llamav1alpha1 "github.com/llamastack/llama-stack-k8s-operator/api/v1alpha1"
"github.com/llamastack/llama-stack-k8s-operator/pkg/cluster"
"github.com/llamastack/llama-stack-k8s-operator/pkg/deploy"
Expand Down Expand Up @@ -83,11 +84,17 @@ const (
// When a ConfigMap's data changes, it automatically triggers reconciliation of the referencing
// LlamaStackDistribution, which recalculates a content-based hash and updates the deployment's
// pod template annotations. This causes Kubernetes to restart the pods with the updated configuration.
//
// Operator ConfigMap Watching Feature:
// This reconciler also watches for changes to the operator configuration ConfigMap. When the operator
// config changes, it triggers reconciliation of all LlamaStackDistribution resources.
type LlamaStackDistributionReconciler struct {
client.Client
Scheme *runtime.Scheme
// Feature flags
EnableNetworkPolicy bool
// Image mapping overrides
ImageMappingOverrides map[string]string
// Cluster info
ClusterInfo *cluster.ClusterInfo
httpClient *http.Client
Expand Down Expand Up @@ -593,6 +600,40 @@ func (r *LlamaStackDistributionReconciler) configMapUpdatePredicate(e event.Upda
return false
}

// Check if this is the operator config ConfigMap
if r.handleOperatorConfigUpdate(newConfigMap) {
return true
}

// Handle referenced ConfigMap updates
return r.handleReferencedConfigMapUpdate(oldConfigMap, newConfigMap)
}

// handleOperatorConfigUpdate processes updates to the operator config ConfigMap.
func (r *LlamaStackDistributionReconciler) handleOperatorConfigUpdate(configMap *corev1.ConfigMap) bool {
operatorNamespace, err := deploy.GetOperatorNamespace()
if err != nil {
return false
}

if configMap.Name != operatorConfigData || configMap.Namespace != operatorNamespace {
return false
}

// Update feature flags
EnableNetworkPolicy, err := parseFeatureFlags(configMap.Data)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is network policy enabled/disable via a feature flag (which is used for enabling/disabling experimental features which become eventually permanent features (or dropped)).

For me it looks like the usage of NetworkPolicies is an admin decision so that it shoudl be part of the regular configuration. A feature flag could handle if this configuration has any effect, but the overall switch network policy yes/no should be part of LLSD.

if err != nil {
log.FromContext(context.Background()).Error(err, "Failed to parse feature flags")
} else {
r.EnableNetworkPolicy = EnableNetworkPolicy
}

r.ImageMappingOverrides = ParseImageMappingOverrides(context.Background(), configMap.Data)
return true
}

// handleReferencedConfigMapUpdate processes updates to referenced ConfigMaps.
func (r *LlamaStackDistributionReconciler) handleReferencedConfigMapUpdate(oldConfigMap, newConfigMap *corev1.ConfigMap) bool {
// Only proceed if this ConfigMap is referenced by any LlamaStackDistribution
if !r.isConfigMapReferenced(newConfigMap) {
return false
Expand Down Expand Up @@ -758,6 +799,27 @@ func (r *LlamaStackDistributionReconciler) manuallyCheckConfigMapReference(confi

// findLlamaStackDistributionsForConfigMap maps ConfigMap changes to LlamaStackDistribution reconcile requests.
func (r *LlamaStackDistributionReconciler) findLlamaStackDistributionsForConfigMap(ctx context.Context, configMap client.Object) []reconcile.Request {
logger := log.FromContext(ctx).WithValues(
"configMapName", configMap.GetName(),
"configMapNamespace", configMap.GetNamespace())

operatorNamespace, err := deploy.GetOperatorNamespace()
if err != nil {
logger.Error(err, "Failed to get operator namespace for config map event processing")
return nil
}
// If the operator config was changed, we reconcile all LlamaStackDistributions
if configMap.GetName() == operatorConfigData && configMap.GetNamespace() == operatorNamespace {
// List all LlamaStackDistribution resources across all namespaces
allLlamaStacks := llamav1alpha1.LlamaStackDistributionList{}
err = r.List(ctx, &allLlamaStacks)
if err != nil {
logger.Error(err, "Failed to list all LlamaStackDistributions for operator config change")
return nil
}
return r.convertToReconcileRequests(allLlamaStacks)
}
Comment on lines +811 to +821
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm thinking if there happen to be a lot of LlamaStackDistributions on a large cluster, we might want to introduce some jittering / rate limiting so there's not an immediate thundering herd effect. (More like an optimization than a requirement for this PR.)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea, we could add this to the backlog


// Try field indexer lookup first
attachedLlamaStacks, found := r.tryFieldIndexerLookup(ctx, configMap)
if !found {
Expand Down Expand Up @@ -1632,53 +1694,103 @@ func NewLlamaStackDistributionReconciler(ctx context.Context, client client.Clie
return nil, fmt.Errorf("failed to get operator namespace: %w", err)
}

// Get the ConfigMap
// If the ConfigMap doesn't exist, create it with default feature flags
// If the ConfigMap exists, parse the feature flags from the Configmap
// Initialize operator config ConfigMap
configMap, err := initializeOperatorConfigMap(ctx, client, operatorNamespace)
if err != nil {
return nil, err
}

// Parse feature flags from ConfigMap
enableNetworkPolicy, err := parseFeatureFlags(configMap.Data)
if err != nil {
return nil, fmt.Errorf("failed to parse feature flags: %w", err)
}

// Parse image mapping overrides from ConfigMap
imageMappingOverrides := ParseImageMappingOverrides(ctx, configMap.Data)

return &LlamaStackDistributionReconciler{
Client: client,
Scheme: scheme,
EnableNetworkPolicy: enableNetworkPolicy,
ImageMappingOverrides: imageMappingOverrides,
ClusterInfo: clusterInfo,
httpClient: &http.Client{Timeout: 5 * time.Second},
}, nil
}

// initializeOperatorConfigMap gets or creates the operator config ConfigMap.
func initializeOperatorConfigMap(ctx context.Context, c client.Client, operatorNamespace string) (*corev1.ConfigMap, error) {
configMap := &corev1.ConfigMap{}
configMapName := types.NamespacedName{
Name: operatorConfigData,
Namespace: operatorNamespace,
}

if err = client.Get(ctx, configMapName, configMap); err != nil {
if !k8serrors.IsNotFound(err) {
return nil, fmt.Errorf("failed to get ConfigMap: %w", err)
}
err := c.Get(ctx, configMapName, configMap)
if err == nil {
return configMap, nil
}

// ConfigMap doesn't exist, create it with defaults
configMap, err = createDefaultConfigMap(configMapName)
if err != nil {
return nil, fmt.Errorf("failed to generate default configMap: %w", err)
if !k8serrors.IsNotFound(err) {
return nil, fmt.Errorf("failed to get ConfigMap: %w", err)
}

// ConfigMap doesn't exist, create it with defaults
configMap, err = createDefaultConfigMap(configMapName)
if err != nil {
return nil, fmt.Errorf("failed to generate default configMap: %w", err)
}

if err = c.Create(ctx, configMap); err != nil {
return nil, fmt.Errorf("failed to create ConfigMap: %w", err)
}

return configMap, nil
}

func ParseImageMappingOverrides(ctx context.Context, configMapData map[string]string) map[string]string {
imageMappingOverrides := make(map[string]string)
logger := log.FromContext(ctx)

// Look for the image-overrides key in the ConfigMap data
if overridesYAML, exists := configMapData["image-overrides"]; exists {
// Parse the YAML content
var overrides map[string]string
if err := yaml.Unmarshal([]byte(overridesYAML), &overrides); err != nil {
// Log error but continue with empty overrides
logger.V(1).Info("failed to parse image-overrides YAML", "error", err)
return imageMappingOverrides
}

if err = client.Create(ctx, configMap); err != nil {
return nil, fmt.Errorf("failed to create ConfigMap: %w", err)
// Validate and copy the parsed overrides to our result map
for version, image := range overrides {
// Validate the image reference format
if _, err := name.ParseReference(image); err != nil {
logger.V(1).Info(
"skipping invalid image override",
"version", version,
"image", image,
"error", err,
)
continue
}
imageMappingOverrides[version] = image
}
}

// Parse feature flags from ConfigMap
enableNetworkPolicy, err := parseFeatureFlags(configMap.Data)
if err != nil {
return nil, fmt.Errorf("failed to parse feature flags: %w", err)
}
return &LlamaStackDistributionReconciler{
Client: client,
Scheme: scheme,
EnableNetworkPolicy: enableNetworkPolicy,
ClusterInfo: clusterInfo,
httpClient: &http.Client{Timeout: 5 * time.Second},
}, nil
return imageMappingOverrides
}

// NewTestReconciler creates a reconciler for testing, allowing injection of a
// custom http client and feature flags. ImageMappingOverrides is initialized to
// an empty map so tests can populate it directly without a nil-map panic.
func NewTestReconciler(client client.Client, scheme *runtime.Scheme, clusterInfo *cluster.ClusterInfo,
	httpClient *http.Client, enableNetworkPolicy bool) *LlamaStackDistributionReconciler {
	return &LlamaStackDistributionReconciler{
		Client:                client,
		Scheme:                scheme,
		ClusterInfo:           clusterInfo,
		httpClient:            httpClient,
		EnableNetworkPolicy:   enableNetworkPolicy,
		ImageMappingOverrides: make(map[string]string),
	}
}
Loading
Loading