diff --git a/approval-request-metric-collector/Images/approval-request-metric-collector.png b/approval-request-metric-collector/Images/approval-request-metric-collector.png new file mode 100644 index 0000000..4ffc280 Binary files /dev/null and b/approval-request-metric-collector/Images/approval-request-metric-collector.png differ diff --git a/approval-request-metric-collector/Makefile b/approval-request-metric-collector/Makefile new file mode 100644 index 0000000..d9bf736 --- /dev/null +++ b/approval-request-metric-collector/Makefile @@ -0,0 +1,65 @@ +# Makefile for ApprovalRequest Controller and Metric Collector + +# Image settings +REGISTRY ?= +TAG ?= latest + +# Build settings +GOOS ?= $(shell go env GOOS) +GOARCH ?= $(shell go env GOARCH) + +# Tools +CONTROLLER_GEN_VERSION ?= v0.16.0 +CONTROLLER_GEN = go run sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_GEN_VERSION) + +.PHONY: help +help: ## Display this help + @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) + +##@ Code Generation + +.PHONY: manifests +manifests: ## Generate CRD manifests + $(CONTROLLER_GEN) crd paths="./apis/..." output:crd:artifacts:config=config/crd/bases + +.PHONY: generate +generate: ## Generate DeepCopy code + $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./apis/..." + +##@ Build + +.PHONY: docker-build-approval-controller +docker-build-approval-controller: ## Build approval-request-controller docker image + docker buildx build \ + --file docker/approval-request-controller.Dockerfile \ + --tag $(REGISTRY)/approval-request-controller:$(TAG) \ + --platform=linux/$(GOARCH) \ + --build-arg GOARCH=$(GOARCH) \ + --push \ + . + +.PHONY: docker-build-metric-collector +docker-build-metric-collector: ## Build metric-collector docker image + docker buildx build \ + --file docker/metric-collector.Dockerfile \ + --tag $(REGISTRY)/metric-collector:$(TAG) \ + --platform=linux/$(GOARCH) \ + --build-arg GOARCH=$(GOARCH) \ + --push \ + . + +.PHONY: docker-build-metric-app +docker-build-metric-app: ## Build metric-app docker image + docker buildx build \ + --file docker/metric-app.Dockerfile \ + --tag $(REGISTRY)/metric-app:$(TAG) \ + --platform=linux/$(GOARCH) \ + --build-arg GOARCH=$(GOARCH) \ + --push \ + . + +.PHONY: docker-build-all +docker-build-all: docker-build-approval-controller docker-build-metric-collector docker-build-metric-app ## Build and push all docker images + +.PHONY: docker-build +docker-build: docker-build-all ## Alias for docker-build-all diff --git a/approval-request-metric-collector/README.md b/approval-request-metric-collector/README.md new file mode 100644 index 0000000..af338cd --- /dev/null +++ b/approval-request-metric-collector/README.md @@ -0,0 +1,735 @@ +# Approval Controller and Metric Collector Tutorial + +This tutorial demonstrates how to use the Approval Request Controller and Metric Collector with KubeFleet for automated staged rollout approvals based on workload health metrics. + +> **Note:** This tutorial is self-contained and provides all the steps needed to get started. For additional context on KubeFleet's staged update functionality, you can optionally refer to the [Staged Update How-To Guide](https://github.com/Azure/fleet/blob/main/docs/howtos/staged-update.md). 
+
+## Overview
+
+This directory contains two controllers:
+- **approval-request-controller**: Runs on the hub cluster to automate approval decisions for staged updates
+- **metric-collector**: Runs on member clusters to collect and report workload health metrics
+
+![Approval Controller and Metric Collector Architecture](./Images/approval-request-metric-collector.png)
+
+## How It Works
+
+### Custom Resource Definitions (CRDs)
+
+This solution introduces three new CRDs that work together with KubeFleet's native resources:
+
+#### Hub Cluster CRDs
+
+1. **MetricCollectorReport** (namespaced)
+   - Created by approval-request-controller in `fleet-member-<clusterName>` namespaces on hub (these namespaces are automatically created by KubeFleet when member clusters join)
+   - Watched and updated by metric-collector running on member clusters
+   - Contains specification of Prometheus URL and collected `workload_health` metrics
+   - Updated every 30 seconds by the metric collector with latest health data
+
+2. **ClusterStagedWorkloadTracker** (cluster-scoped)
+   - Defines which workloads to monitor for a ClusterStagedUpdateRun
+   - The name must match the ClusterStagedUpdateRun name
+   - Specifies the workload's name, namespace, and kind (e.g., Deployment, StatefulSet)
+   - Used by approval-request-controller to determine if a stage is ready for approval
+
+3. **StagedWorkloadTracker** (namespaced)
+   - Defines which workloads to monitor for a StagedUpdateRun
+   - The name and namespace must match the StagedUpdateRun name and namespace
+   - Specifies namespace, workload name, and kind
+   - Used by approval-request-controller to determine if a stage is ready for approval
+
+### Automated Approval Flow
+
+1. **Stage Initialization**
+   - User creates an UpdateRun (`ClusterStagedUpdateRun` or `StagedUpdateRun`) on the hub
+   - KubeFleet creates an ApprovalRequest (`ClusterApprovalRequest` or `ApprovalRequest`) for the first stage
+   - The ApprovalRequest enters "Pending" state, waiting for approval
+
+2. **Metric Collector Report Creation**
+   - Approval-request-controller watches the `ClusterApprovalRequest` and `ApprovalRequest` objects
+   - For each cluster in the current stage:
+     - Creates a `MetricCollectorReport` in the `fleet-member-<clusterName>` namespace on hub (this namespace already exists, created by KubeFleet when the member cluster joined)
+     - Sets `spec.prometheusUrl` to the Prometheus endpoint
+     - Each report is specific to one cluster
+
+3. 
**Metric Collection on Member Clusters** + - Metric-collector controller runs on each member cluster + - Watches for `MetricCollectorReport` in its `fleet-member-` namespace on hub + - Every 30 seconds, it: + - Queries local Prometheus using URL from report spec with PromQL: `workload_health` + - Prometheus returns metrics for all pods with `prometheus.io/scrape: "true"` annotation + - Extracts workload health (1.0 = healthy, 0.0 = unhealthy) along with metadata labels + - Updates the `MetricCollectorReport` status on hub with **all** collected metrics + + **Example Prometheus Metric:** + ``` + workload_health{app="sample-metric-app", instance="10.244.0.32:8080", job="kubernetes-pods", namespace="test-ns", pod="sample-metric-app-565fd6595b-7pfb6", pod_template_hash="565fd6595b", workload_kind="Deployment"} 1 + ``` + + **Important Note on Multiple Pods:** When a workload (e.g., a Deployment) has multiple pods/replicas emitting health signals: + - The metric collector **collects all metrics** from Prometheus and stores them in the MetricCollectorReport + - If `sample-metric-app` has 3 replicas, the report will contain 3 separate `WorkloadMetrics` entries + - However, for simplicity, the approval-request-controller only evaluates the **first matching metric** when checking workload health + - This means if the first pod reports healthy, the workload is considered healthy, even if other pods report differently + - This simplified approach works well when all pods of a workload consistently report the same health status + - **Limitation:** If pods have different health states, only the first metric encountered is used for approval decisions + + **Customizing Health Aggregation Logic:** + To implement more sophisticated health checks (e.g., all pods must be healthy, or majority healthy): + 1. Edit `pkg/controllers/approvalrequest/controller.go` in the approval-request-controller + 2. Locate the health check loop (search for "Simplified health check using first matching metric") + 3. Remove the `break` statement that stops at the first match + 4. Collect all matching metrics for the workload into a slice + 5. Implement your aggregation logic: + - **All healthy:** Check that every metric has `Health == true` + - **Majority healthy:** Count healthy metrics and compare to total + - **Threshold-based:** Require N out of M pods to be healthy + 6. Rebuild and redeploy the approval-request-controller image + +4. **Health Evaluation** + - Approval-request-controller monitors `MetricCollectorReports` from all stage clusters + - Every 15 seconds, it: + - Fetches the appropriate workload tracker: + - For cluster-scoped: `ClusterStagedWorkloadTracker` with same name as ClusterStagedUpdateRun + - For namespace-scoped: `StagedWorkloadTracker` with same name and namespace as StagedUpdateRun + - For each cluster in the stage: + - Reads its `MetricCollectorReport` status from `fleet-member-` namespace + - Verifies all tracked workloads are present and healthy + - If any workload is missing or unhealthy, waits for next cycle + - If ALL workloads across ALL clusters are healthy: + - Sets ApprovalRequest condition `Approved: True` + - KubeFleet proceeds to roll out the stage + +5. 
**Stage Progression** + - KubeFleet applies the update to the approved stage clusters + - Creates a new ApprovalRequest for the next stage (if any) + - The cycle repeats for each stage + +## Prerequisites + +- Docker for building images +- Azure CLI (`az`) for ACR operations +- kubectl configured with access to your clusters +- Helm 3.x +- KubeFleet installed on hub and member clusters +- Azure Container Registry (ACR) with anonymous pull enabled + +## Building and Pushing Images to ACR + +Before installing the controllers, you need to build the Docker images and push them to Azure Container Registry (ACR). + +**Critical Note:** Enable anonymous pull on the ACR so that clusters can pull images without authentication. Ensure to disable anonymous pull or delete the ACR after testing. + +### 1. Create ACR with Anonymous Pull + +Create a resource group and ACR with Standard SKU (Basic SKU doesn't support anonymous pull): + +```bash +# Create resource group +az group create --name test-kubefleet-rg --location eastus + +# Create container registry with Standard SKU +az acr create --resource-group test-kubefleet-rg --name myfleetacr --sku Standard + +# Login to ACR +az acr login --name myfleetacr + +# Enable anonymous pull +az acr update --name myfleetacr --anonymous-pull-enabled +``` + +From the `az acr create` output, note down the login server (e.g., `myfleetacr.azurecr.io`). + +> Note: Users can also create their own registry to push their docker images, it doesn't have to be ACR. + +### 2. Build and Push Images + +Export registry and tag variables: + +```bash +export REGISTRY="myfleetacr.azurecr.io" +export TAG="latest" + +cd approval-request-metric-collector +``` + +Build and push all images at once, to build for a specific architecture (default is your system's architecture): + +```bash +# For AMD64 (x86_64), ARCH used by AKS fleet, clusters. +make docker-build-all GOARCH=amd64 + +# For ARM64 (Apple Silicon, ARM servers) +make docker-build-all GOARCH=arm64 +``` + +Or build individual images: + +```bash +# Build and push approval-request-controller image +make docker-build-approval-controller + +# Build and push metric-collector image +make docker-build-metric-collector + +# Build and push metric-app image +make docker-build-metric-app +``` + +### 3. Verify Images in ACR + +List images in your ACR: + +```bash +az acr repository list --name myfleetacr --output table +``` + +Expected output: +``` +Result +--------------------------- +approval-request-controller +metric-app +metric-collector +``` + +Verify tags for a specific image: + +```bash +az acr repository show-tags --name myfleetacr --repository approval-request-controller --output table +``` + +Expected output: +``` +Result +-------- +latest +``` + +**You're now ready to proceed with the setup!** Your ACR contains all three required images that will be pulled by your clusters. + +### 4. Cleanup (After Testing) + +When you're done testing, delete the resource group to clean up all resources: + +```bash +az group delete --name test-kubefleet-rg +``` + +## Setup Overview + +Before diving into the setup steps, here's a bird's eye view of what you'll be building: + +### Architecture Components + +**Hub Cluster** - The control plane where you'll deploy: +1. **3 Member Clusters** (cluster-1, cluster-2, cluster-3) + - Labeled with `environment=staging` or `environment=prod` + - These labels determine which stage each cluster belongs to during rollouts + +2. 
**Prometheus** (propagated to all clusters) + - Monitors workload health via `/metrics` endpoints + - Scrapes pods with `prometheus.io/scrape: "true"` annotation + - Provides `workload_health` metric (1.0 = healthy, 0.0 = unhealthy) + +3. **Approval Request Controller** + - Watches `ClusterApprovalRequest` and `ApprovalRequest` objects + - Creates MetricCollectorReport directly in `fleet-member-` namespaces + - Evaluates workload health from MetricCollectorReport status + - Auto-approves stages when all workloads are healthy + +4. **Sample Metric App** (will be rolled out to clusters) + - Simple Go application exposing `/metrics` endpoint + - Reports `workload_health=1.0` by default + - Used to demonstrate health-based approvals + +**Member Clusters** - Where workloads run: +1. **Metric Collector** + - Connects to hub cluster to watch MetricCollectorReport in its namespace + - Queries local Prometheus every 30 seconds using URL from MetricCollectorReport spec + - Updates MetricCollectorReport status on hub with collected health metrics + +2. **Prometheus** (received from hub) + - Runs on each member cluster + - Scrapes local workload metrics + +3. **Sample Metric App** (received from hub) + - Deployed via staged rollout + - Monitored for health during updates + +### WorkloadTracker - The Decision Maker + +The **WorkloadTracker** is a critical resource that tells the approval controller which workloads must be healthy before approving a stage. Without it, the controller doesn't know what to monitor. + +**Two Types:** + +1. **ClusterStagedWorkloadTracker** (for ClusterStagedUpdateRun) + - Cluster-scoped resource on the hub + - Name must exactly match the ClusterStagedUpdateRun name + - Example: If your UpdateRun is named `example-cluster-staged-run`, the tracker must also be named `example-cluster-staged-run` + - Contains a list of workloads (name, namespace, and kind) to monitor across all clusters in each stage + +2. **StagedWorkloadTracker** (for StagedUpdateRun) + - Namespace-scoped resource on the hub + - Name and namespace must exactly match the StagedUpdateRun + - Example: If your UpdateRun is `example-staged-run` in namespace `test-ns`, the tracker must be `example-staged-run` in `test-ns` + - Contains a list of workloads to monitor + +**How It Works:** +```yaml +# ClusterStagedWorkloadTracker example +workloads: + - name: sample-metric-app # Workload name (matches the app label) + namespace: test-ns # Namespace where it runs + kind: Deployment # Workload kind (optional, enables precise matching) +``` + +When the approval controller evaluates a stage: +1. It fetches the WorkloadTracker that matches the UpdateRun name (and namespace) +2. For each cluster in the stage, it reads the MetricCollectorReport +3. It verifies that every workload listed in the tracker appears in the report as healthy +4. The matching logic compares namespace, name, and kind (if specified) in a case-insensitive manner +5. Only when ALL workloads in ALL clusters are healthy does it approve the stage + +**Critical Rule:** The WorkloadTracker must be created BEFORE starting the UpdateRun. If the controller can't find a matching tracker, it won't approve any stages. + +### The Staged Rollout Flow + +When you create a **ClusterStagedUpdateRun** or **StagedUpdateRun**, here's what happens: + +1. 
**Stage 1 (staging)**: Rollout starts with `cluster-1`
+   - KubeFleet creates an ApprovalRequest for the staging stage
+   - Approval controller creates MetricCollectorReport in `fleet-member-cluster-1` namespace
+   - Metric collector on `cluster-1` watches its report on hub and updates status with health metrics
+   - When `sample-metric-app` is healthy, approval controller auto-approves
+   - KubeFleet proceeds with the rollout to `cluster-1`
+
+2. **Stage 2 (prod)**: After staging succeeds
+   - KubeFleet creates an ApprovalRequest for the prod stage
+   - Approval controller creates MetricCollectorReports in `fleet-member-cluster-2` and `fleet-member-cluster-3`
+   - Metric collectors on both clusters watch their reports and update with health data
+   - When ALL workloads across BOTH prod clusters are healthy, auto-approve
+   - KubeFleet completes the rollout to production clusters
+
+### Key Resources You'll Create
+
+| Resource | Purpose | Where |
+|----------|---------|-------|
+| **ClusterResourcePlacement** | Define what resources to propagate (Prometheus, sample-app) | Hub |
+| **StagedUpdateStrategy** | Define stages with label selectors and approval requirements | Hub |
+| **WorkloadTracker** | Specify which workloads to monitor for health | Hub |
+| **UpdateRun** | Start the staged rollout process | Hub |
+| **MetricCollectorReport** | Created by approval controller, updated by metric collector | Hub (fleet-member-* ns) |
+
+## Setup
+
+### Prerequisites
+
+Before starting this tutorial, ensure you have:
+- A KubeFleet hub cluster with fleet controllers installed
+- Three member clusters joined to the hub cluster
+- kubectl configured with access to the hub cluster context
+
+This can be achieved in a number of ways:
+- https://kubefleet.dev/docs/getting-started/
+- https://learn.microsoft.com/en-us/azure/kubernetes-fleet/quickstart-create-fleet-and-members-portal
+
+### 1. Label Member Clusters for Staged Rollout
+
+The staged rollout uses labels to determine which clusters belong to each stage. Ensure your member clusters have the following labels:
+
+**Stage 1 (staging)** - One cluster:
+- `environment=staging`
+
+**Stage 2 (prod)** - Two or more clusters:
+- `environment=prod`
+
+Expected cluster configuration:
+```
+cluster-1: environment=staging
+cluster-2: environment=prod
+cluster-3: environment=prod
+```
+
+The `StagedUpdateStrategy` uses these labels to select clusters for each stage:
+- **Stage 1 (staging)**: Selects clusters with `environment=staging`
+- **Stage 2 (prod)**: Selects clusters with `environment=prod`
+
+**Labeling Options:**
+
+For **Azure-managed member clusters** (joined via Azure portal/CLI):
+```bash
+az fleet member update -g <resource-group> -f <fleet-name> -n <member-name> --labels "<key>=<value>"
+```
+> **Note:** Member clusters joined via Azure portal or CLI have a validating webhook that prevents direct kubectl modifications. You must use the `az fleet member update` command and cannot use `kubectl label` or `kubectl edit`.
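+
+For example, labeling the three clusters used in this tutorial could look like the following. The resource group and fleet names below are placeholders for illustration; substitute the values from your own environment:
+
+```bash
+# Hypothetical resource group and fleet names -- replace with your own values.
+az fleet member update -g my-fleet-rg -f my-fleet -n cluster-1 --labels "environment=staging"
+az fleet member update -g my-fleet-rg -f my-fleet -n cluster-2 --labels "environment=prod"
+az fleet member update -g my-fleet-rg -f my-fleet -n cluster-3 --labels "environment=prod"
+```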
+
+For **manually created member clusters** (e.g., kind clusters):
+```bash
+# Option 1: Add labels using kubectl label
+kubectl label membercluster <member-cluster-name> environment=staging
+
+# Option 2: Edit the MemberCluster CR directly
+kubectl edit membercluster <member-cluster-name>
+
+# Option 3: Apply example files with labels pre-configured
+# Edit examples/membercluster/fleet_v1beta1_membercluster.yaml with your cluster details and labels
+kubectl apply -f ./examples/membercluster/fleet_v1beta1_membercluster.yaml
+```
+The example files in `examples/membercluster/` show how to create MemberCluster CRs with the appropriate labels already configured.
+
+### 2. Deploy Prometheus
+
+From the kubefleet-cookbook repo, navigate to the approval-request-metric-collector directory and deploy Prometheus for metrics collection:
+
+```bash
+cd approval-request-metric-collector
+
+# Switch to hub cluster context
+kubectl config use-context <hub-cluster-context>
+
+# Create prometheus namespace
+kubectl create ns prometheus
+
+# Deploy Prometheus (ConfigMap, Deployment, Service, RBAC, and CRP)
+# - ConfigMap: Contains Prometheus scrape configuration
+# - Deployment: Runs Prometheus server
+# - Service: Exposes Prometheus on port 9090
+# - RBAC: ServiceAccount, ClusterRole, and ClusterRoleBinding for pod discovery
+# - CRP: ClusterResourcePlacement to propagate Prometheus to all member clusters
+kubectl apply -f ./examples/prometheus/
+```
+
+This deploys Prometheus configured to scrape pods from all namespaces with the proper annotations.
+
+### 3. Deploy Sample Metric Application
+
+Create the test namespace and deploy the sample application:
+
+```bash
+# Create test namespace
+kubectl create ns test-ns
+
+# Create sample-metric-app deployment
+kubectl apply -f ./examples/sample-metric-app/
+```
+
+> Note: If you used different REGISTRY or TAG values during the image build, update examples/sample-metric-app/sample-metric-app.yaml accordingly.
+
+**Important: Configuring WORKLOAD_KIND Environment Variable**
+
+The sample-metric-app emits a `workload_health` metric with a `workload_kind` label that identifies the parent workload type. This label **must match** the `kind` field specified in your WorkloadTracker.
+
+The sample deployment sets `WORKLOAD_KIND=Deployment`:
+```yaml
+env:
+- name: WORKLOAD_KIND
+  value: "Deployment"
+```
+
+For other workload types, update the environment variable accordingly:
+- **StatefulSet**: `WORKLOAD_KIND=StatefulSet`
+- **DaemonSet**: `WORKLOAD_KIND=DaemonSet`
+- **Job**: `WORKLOAD_KIND=Job`
+
+This is necessary because Prometheus's `__meta_kubernetes_pod_controller_kind` label returns the immediate controller (e.g., ReplicaSet for Deployments), not the actual parent resource. By setting this environment variable, the metric app emits the correct workload type that matches your WorkloadTracker configuration.
+
+### 4. Install Approval Request Controller (Hub Cluster)
+
+Install the approval request controller on the hub cluster using the ACR registry:
+
+```bash
+# Set your ACR registry name
+export REGISTRY="myfleetacr.azurecr.io"
+
+# Run the installation script
+scripts/install-on-hub.sh ${REGISTRY}
+```
+
+The script performs the following:
+1. Configures the controller to use the approval-request-controller image from your ACR
+2. Verifies that required KubeFleet CRDs are installed
+3. Installs the controller via Helm with the custom CRDs (MetricCollectorReport, ClusterStagedWorkloadTracker, StagedWorkloadTracker)
+4. Verifies the installation
+
+### 5. 
Configure Workload Tracker + +Apply the appropriate workload tracker based on which type of staged update you'll use: + +#### For Cluster-Scoped Updates (ClusterStagedUpdateRun): + +```bash +# Apply ClusterStagedWorkloadTracker +# This defines which workloads to monitor for the staged rollout +# The name "example-cluster-staged-run" must match the ClusterStagedUpdateRun name +# Tracks: sample-metric-app Deployment in test-ns namespace +kubectl apply -f ./examples/workloadtracker/clusterstagedworkloadtracker.yaml +``` + +#### For Namespace-Scoped Updates (StagedUpdateRun): + +```bash +# Apply StagedWorkloadTracker +# This defines which workloads to monitor for the namespace-scoped staged rollout +# The name "example-staged-run" and namespace "test-ns" must match the StagedUpdateRun +# Tracks: sample-metric-app in test-ns namespace with kind Deployment +kubectl apply -f ./examples/workloadtracker/stagedworkloadtracker.yaml +``` + +### 6. Install Metric Collector (Member Clusters) + +Install the metric collector on all member clusters using the ACR registry: + +```bash +# Find the contexts for hub, member clusters. +kubectl config get-contexts +``` + +```bash +# Run the installation script for all member clusters +# Replace with your actual cluster contexts +scripts/install-on-member.sh ${REGISTRY} + +# Example: +# scripts/install-on-member.sh ${REGISTRY} hub cluster-1 cluster-2 cluster-3 +``` + +The script performs the following: +1. Configures the metric-collector to use the image from your ACR +2. Creates service account with hub cluster access token +3. Installs metric-collector via Helm on each member cluster +4. Configures connection to hub API server and local Prometheus + +### 7. Start Staged Rollout + +Choose one of the following options based on your use case: + +#### Option A: Cluster-Scoped Staged Update (ClusterStagedUpdateRun) + +Create a cluster-scoped staged update run: + +Switch back to hub cluster and create a cluster-scoped staged update run: + +```bash +# Switch to hub cluster +kubectl config use-context + +# Apply ClusterStagedUpdateStrategy +# Defines the stages for the rollout: staging (cluster-1) -> prod (cluster-2, cluster-3) +# Each stage requires approval before proceeding +kubectl apply -f ./examples/updateRun/example-csus.yaml + +# Apply ClusterResourcePlacement for sample-metric-app +# This is the resource that will be updated across stages +# Selects the sample-metric-app deployment in test-ns namespace +kubectl apply -f ./examples/updateRun/example-crp.yaml + +# Verify CRP is created +kubectl get crp -A +``` + +Output: +```bash +NAME GEN SCHEDULED SCHEDULED-GEN AVAILABLE AVAILABLE-GEN AGE +example-crp 1 True 1 4s +prometheus-crp 1 True 1 True 1 3m1s +``` + +```bash +# Apply ClusterStagedUpdateRun to start the staged rollout +# This creates the actual update run that progresses through the defined stages +# Name: example-cluster-staged-run (must match ClusterStagedWorkloadTracker) +# References the ClusterResourcePlacement (example-crp) and ClusterStagedUpdateStrategy +kubectl apply -f ./examples/updateRun/example-csur.yaml + +# Check the staged update run status +kubectl get csur -A +``` + +#### Option B: Namespace-Scoped Staged Update (StagedUpdateRun) + +Alternatively, you can use namespace-scoped resources: + +```bash +# Switch to hub cluster +kubectl config use-context + +# Apply namespace-scoped ClusterResourcePlacement +# This CRP is configured to only place resources in the test-ns namespace +# This resource is needed because we cannot propagate 
Namespace which is a +# cluster-scoped resource via RP +kubectl apply -f ./examples/updateRun/example-ns-only-crp.yaml + +kubectl get crp -A +``` + +Output: +```bash +NAME GEN SCHEDULED SCHEDULED-GEN AVAILABLE AVAILABLE-GEN AGE +ns-only-crp 1 True 1 True 1 4s +prometheus-crp 1 True 1 True 1 31m +``` + +```bash +# Apply StagedUpdateStrategy (namespace-scoped) +# Defines the stages: staging (cluster-1) -> prod (cluster-2, cluster-3) +# Each stage requires approval before proceeding +kubectl apply -f ./examples/updateRun/example-sus.yaml + +# Apply ResourcePlacement (namespace-scoped) +# This is the namespace-scoped version that works with the test-ns namespace +# References the ns-only-crp ClusterResourcePlacement +kubectl apply -f ./examples/updateRun/example-rp.yaml + +# Verify RP is created +kubectl get rp -A +``` + +Output: +```bash +NAMESPACE NAME GEN SCHEDULED SCHEDULED-GEN AVAILABLE AVAILABLE-GEN AGE +test-ns example-rp 1 True 1 35s +``` + +```bash +# Apply StagedUpdateRun to start the staged rollout (namespace-scoped) +# This creates the actual update run that progresses through the defined stages +# Name: example-staged-run (must match StagedWorkloadTracker) +# Namespace: test-ns (must match StagedWorkloadTracker namespace) +# References the ResourcePlacement (example-rp) +kubectl apply -f ./examples/updateRun/example-sur.yaml + +# Check the staged update run status +kubectl get sur -A +``` + +Output: +```bash +NAMESPACE NAME PLACEMENT RESOURCE-SNAPSHOT-INDEX POLICY-SNAPSHOT-INDEX INITIALIZED PROGRESSING SUCCEEDED AGE +test-ns example-staged-run example-rp 0 0 True True 5s +``` + +### 8. Monitor the Staged Rollout + +Watch the staged update progress: + +#### For Cluster-Scoped Updates: + +```bash +# Check the staged update run status +kubectl get csur -A + +# Check approval requests (should be auto-approved based on metrics) +kubectl get clusterapprovalrequest -A +``` + +Output: +```bash +NAME UPDATE-RUN STAGE APPROVED AGE +example-cluster-staged-run-after-staging example-cluster-staged-run staging True 2m9s +``` + +```bash +# Check metric collector reports +kubectl get metriccollectorreport -A +``` + +Output: +```bash +NAMESPACE NAME WORKLOADS LAST-COLLECTION AGE +fleet-member-cluster-1 mc-example-cluster-staged-run-staging 1 27s 2m57s +``` + +#### For Namespace-Scoped Updates: + +```bash +# Check the staged update run status +kubectl get sur -A + +# Check approval requests (should be auto-approved based on metrics) +kubectl get approvalrequest -A +``` + +Output: +```bash +NAMESPACE NAME UPDATE-RUN STAGE APPROVED AGE +test-ns example-staged-run-after-staging example-staged-run staging True 64s +``` + +```bash +# Check metric collector reports +kubectl get metriccollectorreport -A +``` + +Output: +```bash +NAMESPACE NAME WORKLOADS LAST-COLLECTION AGE +fleet-member-cluster-1 mc-example-staged-run-staging 1 27s 57s +``` + +The approval controller will automatically approve stages when the metric collectors report that workloads are healthy. 
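+
+For reference, you can inspect an individual report with `kubectl get metriccollectorreport <report-name> -n fleet-member-cluster-1 -o yaml`. A healthy report looks roughly like the following (illustrative, trimmed output; field names follow the MetricCollectorReport CRD installed by the approval-request-controller chart):
+
+```yaml
+apiVersion: autoapprove.kubernetes-fleet.io/v1alpha1
+kind: MetricCollectorReport
+metadata:
+  name: mc-example-cluster-staged-run-staging
+  namespace: fleet-member-cluster-1
+spec:
+  # Prometheus endpoint the metric collector on the member cluster should query
+  prometheusUrl: http://prometheus.prometheus.svc.cluster.local:9090
+status:
+  conditions:
+  - type: MetricsCollected
+    status: "True"
+    reason: CollectionSucceeded       # other condition fields omitted for brevity
+  workloadsMonitored: 1
+  lastCollectionTime: "2025-01-01T00:00:00Z"   # illustrative timestamp
+  collectedMetrics:
+  - namespace: test-ns
+    workloadName: sample-metric-app
+    workloadKind: Deployment
+    health: true                      # true = workload_health reported as 1.0
+```
+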
+ +## Verification + +### Check Controller Status + +On the hub cluster: +```bash +kubectl config use-context +kubectl get pods -n fleet-system +kubectl logs -n fleet-system deployment/approval-request-controller -f +``` + +On member clusters: +```bash +kubectl config use-context +kubectl get pods -n default +kubectl logs -n default deployment/metric-collector -f +``` + +### Check Metrics Collection + +Verify that MetricCollectorReports are being created and updated on the hub: +```bash +kubectl config use-context +kubectl get metriccollectorreport -A +``` + +## Configuration + +### Approval Request Controller +- Located in `charts/approval-request-controller/values.yaml` +- Key settings: log level, resource limits, RBAC, CRD installation +- Default Prometheus URL: `http://prometheus.prometheus.svc.cluster.local:9090` +- Reconciliation interval: 15 seconds + +### Metric Collector +- Located in `charts/metric-collector/values.yaml` +- Key settings: hub cluster URL, Prometheus URL, member cluster name +- Metric collection interval: 30 seconds +- Connects to hub using service account token + +## Troubleshooting + +### Controller not starting +- Check that all required CRDs are installed: `kubectl get crds | grep autoapprove.kubernetes-fleet.io` +- Verify RBAC permissions are configured correctly + +### Metrics not being collected +- Verify Prometheus is accessible: `kubectl port-forward -n prometheus svc/prometheus 9090:9090` +- Check metric collector logs for connection errors +- Ensure workloads have Prometheus scrape annotations + +### Approvals not happening +- Check the appropriate Workload tracker object exists +- Check that the workload tracker name matches the update run name: + - For ClusterStagedUpdateRun: ClusterStagedWorkloadTracker name must match + - For StagedUpdateRun: StagedWorkloadTracker name and namespace must match +- Verify workloads in the tracker match those reporting metrics (name, namespace, and kind) +- Verify MetricCollectorReports are being created on the hub +- Review approval-request-controller logs for decision-making details + +## Additional Resources + +- [Approval Request Controller README](./approval-request-controller/README.md) +- [Metric Collector README](./metric-collector/README.md) +- [KubeFleet Documentation](https://github.com/Azure/kubefleet) diff --git a/approval-request-metric-collector/apis/autoapprove/v1alpha1/doc.go b/approval-request-metric-collector/apis/autoapprove/v1alpha1/doc.go new file mode 100644 index 0000000..22473a8 --- /dev/null +++ b/approval-request-metric-collector/apis/autoapprove/v1alpha1/doc.go @@ -0,0 +1,20 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +// Package v1alpha1 contains API Schema definitions for the autoapprove v1alpha1 API group +// +kubebuilder:object:generate=true +// +groupName=autoapprove.kubernetes-fleet.io +package v1alpha1 diff --git a/approval-request-metric-collector/apis/autoapprove/v1alpha1/groupversion_info.go b/approval-request-metric-collector/apis/autoapprove/v1alpha1/groupversion_info.go new file mode 100644 index 0000000..6f1fbac --- /dev/null +++ b/approval-request-metric-collector/apis/autoapprove/v1alpha1/groupversion_info.go @@ -0,0 +1,35 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// +kubebuilder:object:generate=true +// +groupName=autoapprove.kubernetes-fleet.io +package v1alpha1 + +import ( + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/controller-runtime/pkg/scheme" +) + +var ( + // GroupVersion is group version used to register these objects + GroupVersion = schema.GroupVersion{Group: "autoapprove.kubernetes-fleet.io", Version: "v1alpha1"} + + // SchemeBuilder is used to add go types to the GroupVersionKind scheme + SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} + + // AddToScheme adds the types in this group-version to the given scheme. + AddToScheme = SchemeBuilder.AddToScheme +) diff --git a/approval-request-metric-collector/apis/autoapprove/v1alpha1/metriccollectorreport_types.go b/approval-request-metric-collector/apis/autoapprove/v1alpha1/metriccollectorreport_types.go new file mode 100644 index 0000000..ef36731 --- /dev/null +++ b/approval-request-metric-collector/apis/autoapprove/v1alpha1/metriccollectorreport_types.go @@ -0,0 +1,119 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +const ( + // MetricCollectorReportConditionTypeMetricsCollected indicates whether metrics have been successfully collected + MetricCollectorReportConditionTypeMetricsCollected = "MetricsCollected" + + // MetricCollectorReportConditionReasonCollectionFailed indicates metric collection failed + MetricCollectorReportConditionReasonCollectionFailed = "CollectionFailed" + + // MetricCollectorReportConditionReasonCollectionSucceeded indicates metric collection succeeded + MetricCollectorReportConditionReasonCollectionSucceeded = "CollectionSucceeded" +) + +// +genclient +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope="Namespaced",shortName=mcr,categories={fleet,fleet-metrics} +// +kubebuilder:storageversion +// +kubebuilder:printcolumn:JSONPath=`.status.workloadsMonitored`,name="Workloads",type=integer +// +kubebuilder:printcolumn:JSONPath=`.status.lastCollectionTime`,name="Last-Collection",type=date +// +kubebuilder:printcolumn:JSONPath=`.metadata.creationTimestamp`,name="Age",type=date + +// MetricCollectorReport is created by the approval-request-controller on the hub cluster +// in the fleet-member-{clusterName} namespace. The metric-collector on the member cluster +// watches these reports and updates their status with collected metrics. +// +// Controller workflow: +// 1. Approval-controller creates MetricCollectorReport with spec on hub +// 2. Metric-collector watches MetricCollectorReport on hub (in fleet-member-{clusterName} namespace) +// 3. Metric-collector queries Prometheus on member cluster +// 4. Metric-collector updates MetricCollectorReport status on hub with collected metrics +// +// Namespace: fleet-member-{clusterName} +// Name: Matches the UpdateRun name +type MetricCollectorReport struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec MetricCollectorReportSpec `json:"spec,omitempty"` + Status MetricCollectorReportStatus `json:"status,omitempty"` +} + +// MetricCollectorReportSpec defines the configuration for metric collection. +type MetricCollectorReportSpec struct { + // PrometheusURL is the URL of the Prometheus server on the member cluster + // Example: "http://prometheus.fleet-system.svc.cluster.local:9090" + PrometheusURL string `json:"prometheusUrl"` +} + +// MetricCollectorReportStatus contains the collected metrics from the member cluster. +type MetricCollectorReportStatus struct { + // Conditions represent the latest available observations of the report's state. + // +optional + Conditions []metav1.Condition `json:"conditions,omitempty"` + + // WorkloadsMonitored is the count of workloads being monitored. + // +optional + WorkloadsMonitored int32 `json:"workloadsMonitored,omitempty"` + + // LastCollectionTime is when metrics were last collected on the member cluster. + // +optional + LastCollectionTime *metav1.Time `json:"lastCollectionTime,omitempty"` + + // CollectedMetrics contains the most recent metrics from each workload. + // +optional + CollectedMetrics []WorkloadMetrics `json:"collectedMetrics,omitempty"` +} + +// WorkloadMetrics represents metrics collected from a single workload. +type WorkloadMetrics struct { + // Namespace of the workload. + // +required + Namespace string `json:"namespace"` + + // Name of the workload. + // +required + WorkloadName string `json:"workloadName"` + + // Kind of the workload controller (e.g., Deployment, StatefulSet, DaemonSet). 
+ // +optional + WorkloadKind string `json:"workloadKind,omitempty"` + + // Health indicates if the workload is healthy (true=healthy, false=unhealthy). + // +required + Health bool `json:"health"` +} + +// +kubebuilder:object:root=true + +// MetricCollectorReportList contains a list of MetricCollectorReport. +type MetricCollectorReportList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []MetricCollectorReport `json:"items"` +} + +func init() { + SchemeBuilder.Register(&MetricCollectorReport{}, &MetricCollectorReportList{}) +} diff --git a/approval-request-metric-collector/apis/autoapprove/v1alpha1/workloadtracker_types.go b/approval-request-metric-collector/apis/autoapprove/v1alpha1/workloadtracker_types.go new file mode 100644 index 0000000..42431ca --- /dev/null +++ b/approval-request-metric-collector/apis/autoapprove/v1alpha1/workloadtracker_types.go @@ -0,0 +1,104 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// WorkloadReference represents a workload to be tracked +type WorkloadReference struct { + // Name is the name of the workload + // +required + Name string `json:"name"` + + // Namespace is the namespace of the workload + // +required + Namespace string `json:"namespace"` + + // Kind is the kind of the workload controller (e.g., Deployment, StatefulSet, DaemonSet) + // +optional + Kind string `json:"kind,omitempty"` +} + +// +genclient +// +genclient:nonNamespaced +// +kubebuilder:object:root=true +// +kubebuilder:resource:scope="Cluster",categories={fleet,fleet-placement} +// +kubebuilder:storageversion +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + +// ClusterStagedWorkloadTracker expresses user intent to track certain workloads for a ClusterStagedUpdateRun. +// The name of this resource should match the name of the ClusterStagedUpdateRun it is used for. +// For example, if the ClusterStagedUpdateRun is named "example-cluster-staged-run", the +// ClusterStagedWorkloadTracker should also be named "example-cluster-staged-run". 
+type ClusterStagedWorkloadTracker struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Workloads is a list of workloads to track + // +optional + Workloads []WorkloadReference `json:"workloads,omitempty"` +} + +// +kubebuilder:object:root=true +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + +// ClusterStagedWorkloadTrackerList contains a list of ClusterStagedWorkloadTracker +type ClusterStagedWorkloadTrackerList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []ClusterStagedWorkloadTracker `json:"items"` +} + +// +genclient +// +kubebuilder:object:root=true +// +kubebuilder:resource:scope="Namespaced",categories={fleet,fleet-placement} +// +kubebuilder:storageversion +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + +// StagedWorkloadTracker expresses user intent to track certain workloads for a StagedUpdateRun. +// The name and namespace of this resource should match the name and namespace of the StagedUpdateRun it is used for. +// For example, if the StagedUpdateRun is named "example-staged-run" in namespace "test-ns", the +// StagedWorkloadTracker should also be named "example-staged-run" in namespace "test-ns". +type StagedWorkloadTracker struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Workloads is a list of workloads to track + // +optional + Workloads []WorkloadReference `json:"workloads,omitempty"` +} + +// +kubebuilder:object:root=true +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object + +// StagedWorkloadTrackerList contains a list of StagedWorkloadTracker +type StagedWorkloadTrackerList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []StagedWorkloadTracker `json:"items"` +} + +func init() { + SchemeBuilder.Register( + &ClusterStagedWorkloadTracker{}, + &ClusterStagedWorkloadTrackerList{}, + &StagedWorkloadTracker{}, + &StagedWorkloadTrackerList{}, + ) +} diff --git a/approval-request-metric-collector/apis/autoapprove/v1alpha1/zz_generated.deepcopy.go b/approval-request-metric-collector/apis/autoapprove/v1alpha1/zz_generated.deepcopy.go new file mode 100644 index 0000000..7e3ca6d --- /dev/null +++ b/approval-request-metric-collector/apis/autoapprove/v1alpha1/zz_generated.deepcopy.go @@ -0,0 +1,285 @@ +//go:build !ignore_autogenerated + +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by controller-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ClusterStagedWorkloadTracker) DeepCopyInto(out *ClusterStagedWorkloadTracker) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + if in.Workloads != nil { + in, out := &in.Workloads, &out.Workloads + *out = make([]WorkloadReference, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterStagedWorkloadTracker. +func (in *ClusterStagedWorkloadTracker) DeepCopy() *ClusterStagedWorkloadTracker { + if in == nil { + return nil + } + out := new(ClusterStagedWorkloadTracker) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ClusterStagedWorkloadTracker) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ClusterStagedWorkloadTrackerList) DeepCopyInto(out *ClusterStagedWorkloadTrackerList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ClusterStagedWorkloadTracker, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterStagedWorkloadTrackerList. +func (in *ClusterStagedWorkloadTrackerList) DeepCopy() *ClusterStagedWorkloadTrackerList { + if in == nil { + return nil + } + out := new(ClusterStagedWorkloadTrackerList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ClusterStagedWorkloadTrackerList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MetricCollectorReport) DeepCopyInto(out *MetricCollectorReport) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MetricCollectorReport. +func (in *MetricCollectorReport) DeepCopy() *MetricCollectorReport { + if in == nil { + return nil + } + out := new(MetricCollectorReport) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *MetricCollectorReport) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MetricCollectorReportList) DeepCopyInto(out *MetricCollectorReportList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]MetricCollectorReport, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MetricCollectorReportList. 
+func (in *MetricCollectorReportList) DeepCopy() *MetricCollectorReportList { + if in == nil { + return nil + } + out := new(MetricCollectorReportList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *MetricCollectorReportList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MetricCollectorReportSpec) DeepCopyInto(out *MetricCollectorReportSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MetricCollectorReportSpec. +func (in *MetricCollectorReportSpec) DeepCopy() *MetricCollectorReportSpec { + if in == nil { + return nil + } + out := new(MetricCollectorReportSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MetricCollectorReportStatus) DeepCopyInto(out *MetricCollectorReportStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.LastCollectionTime != nil { + in, out := &in.LastCollectionTime, &out.LastCollectionTime + *out = (*in).DeepCopy() + } + if in.CollectedMetrics != nil { + in, out := &in.CollectedMetrics, &out.CollectedMetrics + *out = make([]WorkloadMetrics, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MetricCollectorReportStatus. +func (in *MetricCollectorReportStatus) DeepCopy() *MetricCollectorReportStatus { + if in == nil { + return nil + } + out := new(MetricCollectorReportStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *StagedWorkloadTracker) DeepCopyInto(out *StagedWorkloadTracker) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + if in.Workloads != nil { + in, out := &in.Workloads, &out.Workloads + *out = make([]WorkloadReference, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StagedWorkloadTracker. +func (in *StagedWorkloadTracker) DeepCopy() *StagedWorkloadTracker { + if in == nil { + return nil + } + out := new(StagedWorkloadTracker) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *StagedWorkloadTracker) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *StagedWorkloadTrackerList) DeepCopyInto(out *StagedWorkloadTrackerList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]StagedWorkloadTracker, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StagedWorkloadTrackerList. 
+func (in *StagedWorkloadTrackerList) DeepCopy() *StagedWorkloadTrackerList { + if in == nil { + return nil + } + out := new(StagedWorkloadTrackerList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *StagedWorkloadTrackerList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *WorkloadMetrics) DeepCopyInto(out *WorkloadMetrics) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkloadMetrics. +func (in *WorkloadMetrics) DeepCopy() *WorkloadMetrics { + if in == nil { + return nil + } + out := new(WorkloadMetrics) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *WorkloadReference) DeepCopyInto(out *WorkloadReference) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkloadReference. +func (in *WorkloadReference) DeepCopy() *WorkloadReference { + if in == nil { + return nil + } + out := new(WorkloadReference) + in.DeepCopyInto(out) + return out +} diff --git a/approval-request-metric-collector/charts/approval-request-controller/Chart.yaml b/approval-request-metric-collector/charts/approval-request-controller/Chart.yaml new file mode 100644 index 0000000..f5e253c --- /dev/null +++ b/approval-request-metric-collector/charts/approval-request-controller/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: approval-request-controller +description: A Helm chart for ApprovalRequest Controller on Hub Cluster +type: application +version: 0.1.0 +appVersion: "1.0" diff --git a/approval-request-metric-collector/charts/approval-request-controller/templates/_helpers.tpl b/approval-request-metric-collector/charts/approval-request-controller/templates/_helpers.tpl new file mode 100644 index 0000000..a603fac --- /dev/null +++ b/approval-request-metric-collector/charts/approval-request-controller/templates/_helpers.tpl @@ -0,0 +1,60 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "approval-request-controller.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +*/}} +{{- define "approval-request-controller.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "approval-request-controller.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "approval-request-controller.labels" -}} +helm.sh/chart: {{ include "approval-request-controller.chart" . }} +{{ include "approval-request-controller.selectorLabels" . 
}} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "approval-request-controller.selectorLabels" -}} +app.kubernetes.io/name: {{ include "approval-request-controller.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "approval-request-controller.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "approval-request-controller.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/approval-request-metric-collector/charts/approval-request-controller/templates/crds/autoapprove.kubernetes-fleet.io_clusterstagedworkloadtrackers.yaml b/approval-request-metric-collector/charts/approval-request-controller/templates/crds/autoapprove.kubernetes-fleet.io_clusterstagedworkloadtrackers.yaml new file mode 120000 index 0000000..89ed678 --- /dev/null +++ b/approval-request-metric-collector/charts/approval-request-controller/templates/crds/autoapprove.kubernetes-fleet.io_clusterstagedworkloadtrackers.yaml @@ -0,0 +1 @@ +../../../../config/crd/bases/autoapprove.kubernetes-fleet.io_clusterstagedworkloadtrackers.yaml \ No newline at end of file diff --git a/approval-request-metric-collector/charts/approval-request-controller/templates/crds/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml b/approval-request-metric-collector/charts/approval-request-controller/templates/crds/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml new file mode 120000 index 0000000..32b1524 --- /dev/null +++ b/approval-request-metric-collector/charts/approval-request-controller/templates/crds/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml @@ -0,0 +1 @@ +../../../../config/crd/bases/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml \ No newline at end of file diff --git a/approval-request-metric-collector/charts/approval-request-controller/templates/crds/autoapprove.kubernetes-fleet.io_stagedworkloadtrackers.yaml b/approval-request-metric-collector/charts/approval-request-controller/templates/crds/autoapprove.kubernetes-fleet.io_stagedworkloadtrackers.yaml new file mode 120000 index 0000000..db857c7 --- /dev/null +++ b/approval-request-metric-collector/charts/approval-request-controller/templates/crds/autoapprove.kubernetes-fleet.io_stagedworkloadtrackers.yaml @@ -0,0 +1 @@ +../../../../config/crd/bases/autoapprove.kubernetes-fleet.io_stagedworkloadtrackers.yaml \ No newline at end of file diff --git a/approval-request-metric-collector/charts/approval-request-controller/templates/deployment.yaml b/approval-request-metric-collector/charts/approval-request-controller/templates/deployment.yaml new file mode 100644 index 0000000..dd84869 --- /dev/null +++ b/approval-request-metric-collector/charts/approval-request-controller/templates/deployment.yaml @@ -0,0 +1,84 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "approval-request-controller.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "approval-request-controller.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.controller.replicas }} + selector: + matchLabels: + {{- include "approval-request-controller.selectorLabels" . 
| nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "approval-request-controller.selectorLabels" . | nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "approval-request-controller.serviceAccountName" . }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: controller + securityContext: + {{- toYaml .Values.securityContext | nindent 10 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: + - /approval-request-controller + args: + - --metrics-bind-address=:{{ .Values.metrics.port }} + - --health-probe-bind-address=:{{ .Values.healthProbe.port }} + - -v={{ .Values.controller.logLevel }} + + ports: + {{- if .Values.metrics.enabled }} + - name: metrics + containerPort: {{ .Values.metrics.port }} + protocol: TCP + {{- end }} + {{- if .Values.healthProbe.enabled }} + - name: health + containerPort: {{ .Values.healthProbe.port }} + protocol: TCP + {{- end }} + + {{- if .Values.healthProbe.enabled }} + livenessProbe: + httpGet: + path: /healthz + port: health + initialDelaySeconds: 15 + periodSeconds: 20 + + readinessProbe: + httpGet: + path: /readyz + port: health + initialDelaySeconds: 5 + periodSeconds: 10 + {{- end }} + + resources: + {{- toYaml .Values.controller.resources | nindent 10 }} + + {{- with .Values.controller.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.controller.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.controller.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/approval-request-metric-collector/charts/approval-request-controller/templates/rbac.yaml b/approval-request-metric-collector/charts/approval-request-controller/templates/rbac.yaml new file mode 100644 index 0000000..4e59d03 --- /dev/null +++ b/approval-request-metric-collector/charts/approval-request-controller/templates/rbac.yaml @@ -0,0 +1,72 @@ +{{- if .Values.rbac.create }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "approval-request-controller.fullname" . }} + labels: + {{- include "approval-request-controller.labels" . 
| nindent 4 }} +rules: + # CRD access for checking prerequisites + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list"] + + # ApprovalRequest and ClusterApprovalRequest (KubeFleet resources) + - apiGroups: ["placement.kubernetes-fleet.io"] + resources: ["approvalrequests", "clusterapprovalrequests"] + verbs: ["get", "list", "watch", "update", "patch"] + - apiGroups: ["placement.kubernetes-fleet.io"] + resources: ["approvalrequests/status", "clusterapprovalrequests/status"] + verbs: ["update", "patch"] + - apiGroups: ["placement.kubernetes-fleet.io"] + resources: ["approvalrequests/finalizers", "clusterapprovalrequests/finalizers"] + verbs: ["update"] + + # MetricCollector and MetricCollectorReport (our custom resources) + - apiGroups: ["autoapprove.kubernetes-fleet.io"] + resources: ["metriccollectorreports"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["autoapprove.kubernetes-fleet.io"] + resources: ["metriccollectorreports/status"] + verbs: ["update", "patch"] + + # ClusterResourcePlacement and ClusterResourceOverride (KubeFleet resources) + - apiGroups: ["placement.kubernetes-fleet.io"] + resources: ["clusterresourceplacements", "clusterresourceoverrides"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + + # UpdateRuns (KubeFleet resources) + - apiGroups: ["placement.kubernetes-fleet.io"] + resources: ["stagedupdateruns", "clusterstagedupdateruns"] + verbs: ["get", "list", "watch"] + + # WorkloadTracker (our custom resource) + - apiGroups: ["autoapprove.kubernetes-fleet.io"] + resources: ["clusterstagedworkloadtrackers", "stagedworkloadtrackers"] + verbs: ["get", "list", "watch"] + + # Events + - apiGroups: [""] + resources: ["events"] + verbs: ["create", "patch"] + + # Leader election + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["get", "create", "update", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "approval-request-controller.fullname" . }} + labels: + {{- include "approval-request-controller.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "approval-request-controller.fullname" . }} +subjects: + - kind: ServiceAccount + name: {{ include "approval-request-controller.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} +{{- end }} diff --git a/approval-request-metric-collector/charts/approval-request-controller/templates/serviceaccount.yaml b/approval-request-metric-collector/charts/approval-request-controller/templates/serviceaccount.yaml new file mode 100644 index 0000000..ba3fdd1 --- /dev/null +++ b/approval-request-metric-collector/charts/approval-request-controller/templates/serviceaccount.yaml @@ -0,0 +1,13 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "approval-request-controller.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "approval-request-controller.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +{{- end }} diff --git a/approval-request-metric-collector/charts/approval-request-controller/values.yaml b/approval-request-metric-collector/charts/approval-request-controller/values.yaml new file mode 100644 index 0000000..89713c0 --- /dev/null +++ b/approval-request-metric-collector/charts/approval-request-controller/values.yaml @@ -0,0 +1,84 @@ +# Default values for approval-request-controller +# This is a YAML-formatted file. + +# Controller image configuration +image: + repository: approval-request-controller + pullPolicy: IfNotPresent + tag: "latest" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +# Controller configuration +controller: + # Number of replicas + replicas: 1 + + # Log verbosity level (0-10) + logLevel: 2 + + # Resource requests and limits + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 100m + memory: 128Mi + + # Node selector + nodeSelector: {} + + # Tolerations + tolerations: [] + + # Affinity + affinity: {} + +# RBAC configuration +rbac: + create: true + +# ServiceAccount configuration +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +# Pod annotations +podAnnotations: {} + +# Pod security context +podSecurityContext: + runAsNonRoot: true + runAsUser: 65532 + fsGroup: 65532 + +# Container security context +securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + +# Metrics server configuration +metrics: + enabled: true + port: 8080 + +# Health probe configuration +healthProbe: + enabled: true + port: 8081 + +# CRD installation +crds: + # Install MetricCollectorReport CRD + install: true diff --git a/approval-request-metric-collector/charts/metric-collector/Chart.yaml b/approval-request-metric-collector/charts/metric-collector/Chart.yaml new file mode 100644 index 0000000..2ea221d --- /dev/null +++ b/approval-request-metric-collector/charts/metric-collector/Chart.yaml @@ -0,0 +1,16 @@ +apiVersion: v2 +name: metric-collector +description: MetricCollector for Kubernetes Fleet - Collects workload health metrics and reports to hub cluster +type: application +version: 0.1.0 +appVersion: "latest" +keywords: + - kubernetes + - fleet + - metrics + - monitoring +maintainers: + - name: KubeFleet Team +home: https://github.com/kubefleet-dev/kubefleet +sources: + - https://github.com/kubefleet-dev/kubefleet/tree/main/standalone-metric-collector diff --git a/approval-request-metric-collector/charts/metric-collector/templates/_helpers.tpl b/approval-request-metric-collector/charts/metric-collector/templates/_helpers.tpl new file mode 100644 index 0000000..653f3de --- /dev/null +++ b/approval-request-metric-collector/charts/metric-collector/templates/_helpers.tpl @@ -0,0 +1,60 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "metric-collector.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. 
+*/}} +{{- define "metric-collector.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "metric-collector.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "metric-collector.labels" -}} +helm.sh/chart: {{ include "metric-collector.chart" . }} +{{ include "metric-collector.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "metric-collector.selectorLabels" -}} +app.kubernetes.io/name: {{ include "metric-collector.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "metric-collector.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "metric-collector.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/approval-request-metric-collector/charts/metric-collector/templates/crds/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml b/approval-request-metric-collector/charts/metric-collector/templates/crds/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml new file mode 120000 index 0000000..32b1524 --- /dev/null +++ b/approval-request-metric-collector/charts/metric-collector/templates/crds/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml @@ -0,0 +1 @@ +../../../../config/crd/bases/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml \ No newline at end of file diff --git a/approval-request-metric-collector/charts/metric-collector/templates/deployment.yaml b/approval-request-metric-collector/charts/metric-collector/templates/deployment.yaml new file mode 100644 index 0000000..a28a095 --- /dev/null +++ b/approval-request-metric-collector/charts/metric-collector/templates/deployment.yaml @@ -0,0 +1,113 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "metric-collector.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "metric-collector.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.controller.replicas }} + selector: + matchLabels: + {{- include "metric-collector.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "metric-collector.selectorLabels" . | nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "metric-collector.serviceAccountName" . 
}} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: controller + securityContext: + {{- toYaml .Values.securityContext | nindent 10 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: + - /metric-collector + args: + - --v={{ .Values.controller.logLevel }} + - --hub-qps=100 + - --hub-burst=200 + - --metrics-bind-address=:{{ .Values.metrics.port }} + - --health-probe-bind-address=:{{ .Values.healthProbe.port }} + - --leader-elect=false + env: + # Member cluster identity + - name: MEMBER_CLUSTER_NAME + value: {{ .Values.memberCluster.name | quote }} + + # Hub cluster connection + - name: HUB_SERVER_URL + value: {{ .Values.hubCluster.url | quote }} + + # Prometheus URL + - name: PROMETHEUS_URL + value: {{ .Values.prometheus.url | quote }} + + # Token-based authentication (path to token file) + - name: CONFIG_PATH + value: /var/run/secrets/hub/{{ .Values.hubCluster.auth.tokenSecretKey }} + + volumeMounts: + - name: hub-token + mountPath: /var/run/secrets/hub + readOnly: true + + ports: + {{- if .Values.metrics.enabled }} + - name: metrics + containerPort: {{ .Values.metrics.port }} + protocol: TCP + {{- end }} + {{- if .Values.healthProbe.enabled }} + - name: health + containerPort: {{ .Values.healthProbe.port }} + protocol: TCP + {{- end }} + + {{- if .Values.healthProbe.enabled }} + livenessProbe: + httpGet: + path: /healthz + port: health + initialDelaySeconds: 15 + periodSeconds: 20 + + readinessProbe: + httpGet: + path: /readyz + port: health + initialDelaySeconds: 5 + periodSeconds: 10 + {{- end }} + + resources: + {{- toYaml .Values.controller.resources | nindent 10 }} + + volumes: + - name: hub-token + secret: + secretName: {{ .Values.hubCluster.auth.tokenSecretName }} + + {{- with .Values.controller.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.controller.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.controller.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/approval-request-metric-collector/charts/metric-collector/templates/hub-rbac.yaml b/approval-request-metric-collector/charts/metric-collector/templates/hub-rbac.yaml new file mode 100644 index 0000000..5df3812 --- /dev/null +++ b/approval-request-metric-collector/charts/metric-collector/templates/hub-rbac.yaml @@ -0,0 +1,85 @@ +{{- if .Values.hubCluster.createRBAC }} +# This template generates RBAC resources for the hub cluster +# Apply this on the HUB cluster to grant the metric-collector permissions +# to watch/update MetricCollectorReport resources in the fleet-member- namespace +# +# Usage: +# helm template metric-collector ./charts/metric-collector \ +# --set hubCluster.createRBAC=true \ +# --show-only templates/hub-rbac.yaml | kubectl apply -f - --context=hub-cluster +# +--- +# Role for MetricCollectorReport access in fleet-member- namespace +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "metric-collector.fullname" . }}-report-access + namespace: fleet-member-{{ .Values.memberCluster.name }} + labels: + {{- include "metric-collector.labels" . 
| nindent 4 }} + app.kubernetes.io/component: hub-rbac + annotations: + helm.sh/resource-policy: keep +rules: + # MetricCollectorReport access + - apiGroups: ["autoapprove.kubernetes-fleet.io"] + resources: ["metriccollectorreports"] + verbs: ["get", "list", "watch", "update", "patch"] + - apiGroups: ["autoapprove.kubernetes-fleet.io"] + resources: ["metriccollectorreports/status"] + verbs: ["update", "patch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "metric-collector.fullname" . }}-report-access + namespace: fleet-member-{{ .Values.memberCluster.name }} + labels: + {{- include "metric-collector.labels" . | nindent 4 }} + app.kubernetes.io/component: hub-rbac + fleet.kubernetes.io/member-cluster: {{ .Values.memberCluster.name }} + annotations: + helm.sh/resource-policy: keep +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ include "metric-collector.fullname" . }}-report-access +subjects: + - kind: ServiceAccount + name: {{ .Values.hubCluster.auth.serviceAccountName | default (include "metric-collector.serviceAccountName" .) }} + namespace: fleet-member-{{ .Values.memberCluster.name }} +--- +# ClusterRole for reading ClusterStagedWorkloadTracker (cluster-scoped) +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "metric-collector.fullname" . }}-workloadtracker-reader + labels: + {{- include "metric-collector.labels" . | nindent 4 }} + app.kubernetes.io/component: hub-rbac + annotations: + helm.sh/resource-policy: keep +rules: + - apiGroups: ["autoapprove.kubernetes-fleet.io"] + resources: ["clusterstagedworkloadtrackers"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "metric-collector.fullname" . }}-{{ .Values.memberCluster.name }}-workloadtracker + labels: + {{- include "metric-collector.labels" . | nindent 4 }} + app.kubernetes.io/component: hub-rbac + fleet.kubernetes.io/member-cluster: {{ .Values.memberCluster.name }} + annotations: + helm.sh/resource-policy: keep +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "metric-collector.fullname" . }}-workloadtracker-reader +subjects: + - kind: ServiceAccount + name: {{ .Values.hubCluster.auth.serviceAccountName | default (include "metric-collector.serviceAccountName" .) }} + namespace: fleet-member-{{ .Values.memberCluster.name }} +{{- end }} diff --git a/approval-request-metric-collector/charts/metric-collector/templates/rbac-member.yaml b/approval-request-metric-collector/charts/metric-collector/templates/rbac-member.yaml new file mode 100644 index 0000000..7b8102a --- /dev/null +++ b/approval-request-metric-collector/charts/metric-collector/templates/rbac-member.yaml @@ -0,0 +1,39 @@ +{{- if .Values.rbac.create }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "metric-collector.fullname" . }} + labels: + {{- include "metric-collector.labels" . 
| nindent 4 }} +rules: + # MetricCollector CRD access on member cluster + - apiGroups: ["autoapprove.kubernetes-fleet.io"] + resources: ["metriccollectors"] + verbs: ["get", "list", "watch", "update", "patch"] + - apiGroups: ["autoapprove.kubernetes-fleet.io"] + resources: ["metriccollectors/status"] + verbs: ["update", "patch"] + - apiGroups: ["autoapprove.kubernetes-fleet.io"] + resources: ["metriccollectors/finalizers"] + verbs: ["update"] + + # Events + - apiGroups: [""] + resources: ["events"] + verbs: ["create", "patch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "metric-collector.fullname" . }} + labels: + {{- include "metric-collector.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "metric-collector.fullname" . }} +subjects: + - kind: ServiceAccount + name: {{ include "metric-collector.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} +{{- end }} diff --git a/approval-request-metric-collector/charts/metric-collector/templates/serviceaccount.yaml b/approval-request-metric-collector/charts/metric-collector/templates/serviceaccount.yaml new file mode 100644 index 0000000..b5d081d --- /dev/null +++ b/approval-request-metric-collector/charts/metric-collector/templates/serviceaccount.yaml @@ -0,0 +1,13 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "metric-collector.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "metric-collector.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} diff --git a/approval-request-metric-collector/charts/metric-collector/values.yaml b/approval-request-metric-collector/charts/metric-collector/values.yaml new file mode 100644 index 0000000..a034758 --- /dev/null +++ b/approval-request-metric-collector/charts/metric-collector/values.yaml @@ -0,0 +1,119 @@ +# Default values for metric-collector +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. 
+ +# Controller image configuration +image: + repository: metric-collector + pullPolicy: IfNotPresent + tag: "latest" + +# Metric app image configuration (used in sample deployments) +metricApp: + image: + repository: metric-app + tag: "latest" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +# Member cluster configuration +memberCluster: + # Name of the member cluster (required) + # This should match the cluster name in the fleet + name: "" + +# Hub cluster connection configuration +hubCluster: + # Hub API server URL (required) + # Example: https://hub-cluster.example.com:6443 + url: "" + + # Set to true to generate hub RBAC resources + # These resources must be applied on the hub cluster + createRBAC: false + + # Token-based authentication configuration + auth: + # Token secret details + tokenSecretName: "hub-token" + tokenSecretKey: "token" + + # ServiceAccount details for RBAC binding on hub cluster + # Leave empty to use the default serviceAccount from this chart + serviceAccountName: "" + serviceAccountNamespace: "" + +# Prometheus configuration +prometheus: + # Prometheus URL (required) + # Example: http://prometheus.monitoring.svc.cluster.local:9090 + url: "" + +# Controller configuration +controller: + # Number of replicas + replicas: 1 + + # Log verbosity level (0-10) + logLevel: 2 + + # Resource requests and limits + resources: + limits: + cpu: 200m + memory: 256Mi + requests: + cpu: 100m + memory: 128Mi + + # Node selector + nodeSelector: {} + + # Tolerations + tolerations: [] + + # Affinity + affinity: {} + +# RBAC configuration +rbac: + create: true + +# ServiceAccount configuration +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +# Pod annotations +podAnnotations: {} + +# Pod security context +podSecurityContext: + runAsNonRoot: true + runAsUser: 65532 + fsGroup: 65532 + +# Container security context +securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + +# Metrics server configuration +metrics: + enabled: true + port: 8080 + +# Health probe configuration +healthProbe: + enabled: true + port: 8081 diff --git a/approval-request-metric-collector/cmd/approvalrequestcontroller/main.go b/approval-request-metric-collector/cmd/approvalrequestcontroller/main.go new file mode 100644 index 0000000..0bc5835 --- /dev/null +++ b/approval-request-metric-collector/cmd/approvalrequestcontroller/main.go @@ -0,0 +1,167 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package main + +import ( + "context" + "flag" + "fmt" + "os" + + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/klog/v2" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/healthz" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + + autoapprovev1alpha1 "github.com/kubefleet-dev/kubefleet-cookbook/approval-request-metric-collector/apis/autoapprove/v1alpha1" + approvalcontroller "github.com/kubefleet-dev/kubefleet-cookbook/approval-request-metric-collector/pkg/controllers/approvalrequest" + placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" +) + +var ( + scheme = runtime.NewScheme() +) + +func init() { + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(placementv1beta1.AddToScheme(scheme)) + utilruntime.Must(autoapprovev1alpha1.AddToScheme(scheme)) + utilruntime.Must(apiextensionsv1.AddToScheme(scheme)) +} + +func main() { + var metricsAddr string + var probeAddr string + + // Add klog flags to support -v for verbosity + klog.InitFlags(nil) + + flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") + flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") + + opts := zap.Options{ + Development: true, + } + opts.BindFlags(flag.CommandLine) + flag.Parse() + + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + + klog.InfoS("Starting ApprovalRequest Controller") + + config := ctrl.GetConfigOrDie() + + // Check required CRDs are installed before starting + if err := checkRequiredCRDs(config); err != nil { + klog.ErrorS(err, "Required CRDs not found") + os.Exit(1) + } + + mgr, err := ctrl.NewManager(config, ctrl.Options{ + Scheme: scheme, + Metrics: metricsserver.Options{ + BindAddress: metricsAddr, + }, + HealthProbeBindAddress: probeAddr, + }) + if err != nil { + klog.ErrorS(err, "Unable to create manager") + os.Exit(1) + } + + // Setup ApprovalRequest controller + approvalRequestReconciler := &approvalcontroller.Reconciler{ + Client: mgr.GetClient(), + } + if err = approvalRequestReconciler.SetupWithManagerForApprovalRequest(mgr); err != nil { + klog.ErrorS(err, "Unable to create controller", "controller", "ApprovalRequest") + os.Exit(1) + } + + // Setup ClusterApprovalRequest controller + clusterApprovalRequestReconciler := &approvalcontroller.Reconciler{ + Client: mgr.GetClient(), + } + if err = clusterApprovalRequestReconciler.SetupWithManagerForClusterApprovalRequest(mgr); err != nil { + klog.ErrorS(err, "Unable to create controller", "controller", "ClusterApprovalRequest") + os.Exit(1) + } + + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + klog.ErrorS(err, "Unable to set up health check") + os.Exit(1) + } + + if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + klog.ErrorS(err, "Unable to set up ready check") + os.Exit(1) + } + + klog.InfoS("Starting manager") + if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { + klog.ErrorS(err, "Problem running manager") + os.Exit(1) + } +} + +// checkRequiredCRDs checks that all required CRDs are installed +func checkRequiredCRDs(config *rest.Config) error { + requiredCRDs := []string{ + 
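// KubeFleet placement CRDs (installed with the fleet hub agent) and the autoapprove CRDs shipped in this tutorial's charts. +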
"approvalrequests.placement.kubernetes-fleet.io", + "clusterapprovalrequests.placement.kubernetes-fleet.io", + "metriccollectorreports.autoapprove.kubernetes-fleet.io", + "clusterstagedworkloadtrackers.autoapprove.kubernetes-fleet.io", + "stagedworkloadtrackers.autoapprove.kubernetes-fleet.io", + "clusterstagedupdateruns.placement.kubernetes-fleet.io", + "stagedupdateruns.placement.kubernetes-fleet.io", + } + + klog.InfoS("Checking for required CRDs", "count", len(requiredCRDs)) + + c, err := client.New(config, client.Options{Scheme: scheme}) + if err != nil { + return err + } + + ctx := context.Background() + missingCRDs := []string{} + + for _, crdName := range requiredCRDs { + crd := &apiextensionsv1.CustomResourceDefinition{} + err := c.Get(ctx, client.ObjectKey{Name: crdName}, crd) + if err != nil { + klog.ErrorS(err, "CRD not found", "crd", crdName) + missingCRDs = append(missingCRDs, crdName) + } else { + klog.V(3).InfoS("CRD found", "crd", crdName) + } + } + + if len(missingCRDs) > 0 { + return fmt.Errorf("missing required CRDs: %v", missingCRDs) + } + + klog.InfoS("All required CRDs are installed") + return nil +} diff --git a/approval-request-metric-collector/cmd/metricapp/main.go b/approval-request-metric-collector/cmd/metricapp/main.go new file mode 100644 index 0000000..f2b2668 --- /dev/null +++ b/approval-request-metric-collector/cmd/metricapp/main.go @@ -0,0 +1,40 @@ +package main + +import ( + "net/http" + "os" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" +) + +func main() { + // Get the workload kind from environment variable + // This should be set to the actual parent resource (e.g., "Deployment", "StatefulSet", "DaemonSet") + // not the immediate controller like ReplicaSet + workloadKind := os.Getenv("WORKLOAD_KIND") + if workloadKind == "" { + workloadKind = "Unknown" + } + + // Define a simple gauge metric for health with workload_kind label + workloadHealth := prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "workload_health", + Help: "Indicates if the workload is healthy (1=healthy, 0=unhealthy)", + }, + []string{"workload_kind"}, + ) + + // Set it to 1 (healthy) with the workload kind label + workloadHealth.WithLabelValues(workloadKind).Set(1) + + // Register metric with Prometheus default registry + prometheus.MustRegister(workloadHealth) + + // Expose metrics endpoint + http.Handle("/metrics", promhttp.Handler()) + + // Start HTTP server + http.ListenAndServe(":8080", nil) +} diff --git a/approval-request-metric-collector/cmd/metriccollector/main.go b/approval-request-metric-collector/cmd/metriccollector/main.go new file mode 100644 index 0000000..c647985 --- /dev/null +++ b/approval-request-metric-collector/cmd/metriccollector/main.go @@ -0,0 +1,171 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package main + +import ( + "context" + "flag" + "fmt" + "os" + + "k8s.io/apimachinery/pkg/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/klog/v2" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/healthz" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + + autoapprovev1alpha1 "github.com/kubefleet-dev/kubefleet-cookbook/approval-request-metric-collector/apis/autoapprove/v1alpha1" + metriccollector "github.com/kubefleet-dev/kubefleet-cookbook/approval-request-metric-collector/pkg/controllers/metriccollector" + placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" +) + +var ( + hubQPS = flag.Int("hub-qps", 100, "QPS for hub cluster client") + hubBurst = flag.Int("hub-burst", 200, "Burst for hub cluster client") + metricsAddr = flag.String("metrics-bind-address", ":8080", "The address the metric endpoint binds to.") + probeAddr = flag.String("health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") + leaderElectionID = flag.String("leader-election-id", "metric-collector-leader", "The leader election ID.") + enableLeaderElect = flag.Bool("leader-elect", true, "Enable leader election for controller manager.") +) + +func main() { + klog.InitFlags(nil) + flag.Parse() + + klog.InfoS("Starting MetricCollector Controller") + + // Get member cluster identity + memberClusterName := os.Getenv("MEMBER_CLUSTER_NAME") + if memberClusterName == "" { + klog.ErrorS(nil, "MEMBER_CLUSTER_NAME environment variable not set") + os.Exit(1) + } + + // Construct hub namespace + hubNamespace := fmt.Sprintf("fleet-member-%s", memberClusterName) + klog.InfoS("Using hub namespace", "namespace", hubNamespace, "memberCluster", memberClusterName) + + // Build hub cluster config + hubConfig, err := buildHubConfig() + if err != nil { + klog.ErrorS(err, "Failed to build hub cluster config") + os.Exit(1) + } + hubConfig.QPS = float32(*hubQPS) + hubConfig.Burst = *hubBurst + + // Start controller + if err := Start(ctrl.SetupSignalHandler(), hubConfig, memberClusterName, hubNamespace); err != nil { + klog.ErrorS(err, "Failed to start controller") + os.Exit(1) + } +} + +// buildHubConfig creates hub cluster config using token-based authentication +// with TLS verification disabled (insecure mode) +func buildHubConfig() (*rest.Config, error) { + hubURL := os.Getenv("HUB_SERVER_URL") + if hubURL == "" { + return nil, fmt.Errorf("HUB_SERVER_URL environment variable not set") + } + + // Get token path (defaults to /var/run/secrets/hub/token) + configPath := os.Getenv("CONFIG_PATH") + if configPath == "" { + configPath = "/var/run/secrets/hub/token" + } + + // Read token file + tokenData, err := os.ReadFile(configPath) + if err != nil { + return nil, fmt.Errorf("failed to read hub token from %s: %w", configPath, err) + } + + klog.InfoS("Using token-based authentication with insecure TLS for hub cluster") + + // Create hub config with token auth and insecure TLS + return &rest.Config{ + Host: hubURL, + BearerToken: string(tokenData), + TLSClientConfig: rest.TLSClientConfig{ + Insecure: true, + }, + }, nil +} + +// Start starts the controller with hub cluster connection +func Start(ctx context.Context, hubCfg *rest.Config, memberClusterName, hubNamespace string) error { + // Create scheme with required APIs + scheme := runtime.NewScheme() + if err := clientgoscheme.AddToScheme(scheme); err != nil { + return fmt.Errorf("failed to add 
client-go scheme: %w", err) + } + if err := autoapprovev1alpha1.AddToScheme(scheme); err != nil { + return fmt.Errorf("failed to add autoapprove v1alpha1 API to scheme: %w", err) + } + if err := placementv1beta1.AddToScheme(scheme); err != nil { + return fmt.Errorf("failed to add placement v1beta1 API to scheme: %w", err) + } + + // Create hub cluster manager - watches MetricCollectorReport in hub namespace + hubMgr, err := ctrl.NewManager(hubCfg, ctrl.Options{ + Scheme: scheme, + Cache: cache.Options{ + DefaultNamespaces: map[string]cache.Config{ + hubNamespace: {}, // Only watch fleet-member- + }, + }, + Metrics: metricsserver.Options{ + BindAddress: *metricsAddr, + }, + HealthProbeBindAddress: *probeAddr, + LeaderElection: *enableLeaderElect, + LeaderElectionID: *leaderElectionID, + }) + if err != nil { + return fmt.Errorf("failed to create hub manager: %w", err) + } + + // Setup MetricCollectorReport controller (watches hub, queries member Prometheus) + if err := (&metriccollector.Reconciler{ + HubClient: hubMgr.GetClient(), + }).SetupWithManager(hubMgr); err != nil { + return fmt.Errorf("failed to setup controller: %w", err) + } + + // Add health checks + if err := hubMgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + return fmt.Errorf("failed to add healthz check: %w", err) + } + if err := hubMgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + return fmt.Errorf("failed to add readyz check: %w", err) + } + + klog.InfoS("Starting MetricCollector controller", + "hubUrl", hubCfg.Host, + "hubNamespace", hubNamespace, + "memberCluster", memberClusterName, + "metricsAddr", *metricsAddr, + "probeAddr", *probeAddr) + + // Start hub manager (watches MetricCollectorReport on hub, queries Prometheus on member) + klog.InfoS("Starting hub manager", "namespace", hubNamespace) + return hubMgr.Start(ctx) +} diff --git a/approval-request-metric-collector/config/crd/bases/autoapprove.kubernetes-fleet.io_clusterstagedworkloadtrackers.yaml b/approval-request-metric-collector/config/crd/bases/autoapprove.kubernetes-fleet.io_clusterstagedworkloadtrackers.yaml new file mode 100644 index 0000000..375c86b --- /dev/null +++ b/approval-request-metric-collector/config/crd/bases/autoapprove.kubernetes-fleet.io_clusterstagedworkloadtrackers.yaml @@ -0,0 +1,68 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.0 + name: clusterstagedworkloadtrackers.autoapprove.kubernetes-fleet.io +spec: + group: autoapprove.kubernetes-fleet.io + names: + categories: + - fleet + - fleet-placement + kind: ClusterStagedWorkloadTracker + listKind: ClusterStagedWorkloadTrackerList + plural: clusterstagedworkloadtrackers + singular: clusterstagedworkloadtracker + scope: Cluster + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + ClusterStagedWorkloadTracker expresses user intent to track certain workloads for a ClusterStagedUpdateRun. + The name of this resource should match the name of the ClusterStagedUpdateRun it is used for. + For example, if the ClusterStagedUpdateRun is named "example-cluster-staged-run", the + ClusterStagedWorkloadTracker should also be named "example-cluster-staged-run". + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + workloads: + description: Workloads is a list of workloads to track + items: + description: WorkloadReference represents a workload to be tracked + properties: + kind: + description: Kind is the kind of the workload controller (e.g., + Deployment, StatefulSet, DaemonSet) + type: string + name: + description: Name is the name of the workload + type: string + namespace: + description: Namespace is the namespace of the workload + type: string + required: + - name + - namespace + type: object + type: array + type: object + served: true + storage: true diff --git a/approval-request-metric-collector/config/crd/bases/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml b/approval-request-metric-collector/config/crd/bases/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml new file mode 100644 index 0000000..6bc4a88 --- /dev/null +++ b/approval-request-metric-collector/config/crd/bases/autoapprove.kubernetes-fleet.io_metriccollectorreports.yaml @@ -0,0 +1,181 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.0 + name: metriccollectorreports.autoapprove.kubernetes-fleet.io +spec: + group: autoapprove.kubernetes-fleet.io + names: + categories: + - fleet + - fleet-metrics + kind: MetricCollectorReport + listKind: MetricCollectorReportList + plural: metriccollectorreports + shortNames: + - mcr + singular: metriccollectorreport + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.workloadsMonitored + name: Workloads + type: integer + - jsonPath: .status.lastCollectionTime + name: Last-Collection + type: date + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + MetricCollectorReport is created by the approval-request-controller on the hub cluster + in the fleet-member-{clusterName} namespace. The metric-collector on the member cluster + watches these reports and updates their status with collected metrics. + + Controller workflow: + 1. Approval-controller creates MetricCollectorReport with spec on hub + 2. Metric-collector watches MetricCollectorReport on hub (in fleet-member-{clusterName} namespace) + 3. Metric-collector queries Prometheus on member cluster + 4. Metric-collector updates MetricCollectorReport status on hub with collected metrics + + Namespace: fleet-member-{clusterName} + Name: Matches the UpdateRun name + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: MetricCollectorReportSpec defines the configuration for metric + collection. + properties: + prometheusUrl: + description: |- + PrometheusURL is the URL of the Prometheus server on the member cluster + Example: "http://prometheus.fleet-system.svc.cluster.local:9090" + type: string + required: + - prometheusUrl + type: object + status: + description: MetricCollectorReportStatus contains the collected metrics + from the member cluster. + properties: + collectedMetrics: + description: CollectedMetrics contains the most recent metrics from + each workload. + items: + description: WorkloadMetrics represents metrics collected from a + single workload. + properties: + health: + description: Health indicates if the workload is healthy (true=healthy, + false=unhealthy). + type: boolean + namespace: + description: Namespace of the workload. + type: string + workloadKind: + description: Kind of the workload controller (e.g., Deployment, + StatefulSet, DaemonSet). + type: string + workloadName: + description: Name of the workload. + type: string + required: + - health + - namespace + - workloadName + type: object + type: array + conditions: + description: Conditions represent the latest available observations + of the report's state. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + lastCollectionTime: + description: LastCollectionTime is when metrics were last collected + on the member cluster. 
+ format: date-time + type: string + workloadsMonitored: + description: WorkloadsMonitored is the count of workloads being monitored. + format: int32 + type: integer + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/approval-request-metric-collector/config/crd/bases/autoapprove.kubernetes-fleet.io_stagedworkloadtrackers.yaml b/approval-request-metric-collector/config/crd/bases/autoapprove.kubernetes-fleet.io_stagedworkloadtrackers.yaml new file mode 100644 index 0000000..70faaf9 --- /dev/null +++ b/approval-request-metric-collector/config/crd/bases/autoapprove.kubernetes-fleet.io_stagedworkloadtrackers.yaml @@ -0,0 +1,68 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.0 + name: stagedworkloadtrackers.autoapprove.kubernetes-fleet.io +spec: + group: autoapprove.kubernetes-fleet.io + names: + categories: + - fleet + - fleet-placement + kind: StagedWorkloadTracker + listKind: StagedWorkloadTrackerList + plural: stagedworkloadtrackers + singular: stagedworkloadtracker + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + StagedWorkloadTracker expresses user intent to track certain workloads for a StagedUpdateRun. + The name and namespace of this resource should match the name and namespace of the StagedUpdateRun it is used for. + For example, if the StagedUpdateRun is named "example-staged-run" in namespace "test-ns", the + StagedWorkloadTracker should also be named "example-staged-run" in namespace "test-ns". + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + workloads: + description: Workloads is a list of workloads to track + items: + description: WorkloadReference represents a workload to be tracked + properties: + kind: + description: Kind is the kind of the workload controller (e.g., + Deployment, StatefulSet, DaemonSet) + type: string + name: + description: Name is the name of the workload + type: string + namespace: + description: Namespace is the namespace of the workload + type: string + required: + - name + - namespace + type: object + type: array + type: object + served: true + storage: true diff --git a/approval-request-metric-collector/docker/approval-request-controller.Dockerfile b/approval-request-metric-collector/docker/approval-request-controller.Dockerfile new file mode 100644 index 0000000..39c40ae --- /dev/null +++ b/approval-request-metric-collector/docker/approval-request-controller.Dockerfile @@ -0,0 +1,27 @@ +# Build stage +FROM golang:1.24 AS builder + +WORKDIR /workspace + +# Copy go mod files +COPY go.mod go.sum* ./ +RUN go mod download + +# Copy source code +COPY apis/ apis/ +COPY pkg/ pkg/ +COPY cmd/ cmd/ + +# Build the controller +ARG GOARCH +RUN CGO_ENABLED=0 GOOS=linux GOARCH=${GOARCH} go build \ + -a -o approval-request-controller \ + ./cmd/approvalrequestcontroller + +# Runtime stage +FROM gcr.io/distroless/static:nonroot +WORKDIR / +COPY --from=builder /workspace/approval-request-controller . +USER 65532:65532 + +ENTRYPOINT ["/approval-request-controller"] diff --git a/approval-request-metric-collector/docker/metric-app.Dockerfile b/approval-request-metric-collector/docker/metric-app.Dockerfile new file mode 100644 index 0000000..9ddc9d1 --- /dev/null +++ b/approval-request-metric-collector/docker/metric-app.Dockerfile @@ -0,0 +1,27 @@ +# Build stage +FROM golang:1.24 AS builder + +WORKDIR /workspace + +# Copy go mod files +COPY go.mod go.sum* ./ +RUN go mod download + +# Copy source code +COPY apis/ apis/ +COPY pkg/ pkg/ +COPY cmd/ cmd/ + +# Build the application +ARG GOARCH +RUN CGO_ENABLED=0 GOOS=linux GOARCH=${GOARCH} go build \ + -a -o metric-app \ + ./cmd/metricapp + +# Runtime stage +FROM gcr.io/distroless/static:nonroot +WORKDIR / +COPY --from=builder /workspace/metric-app . +USER 65532:65532 + +ENTRYPOINT ["/metric-app"] diff --git a/approval-request-metric-collector/docker/metric-collector.Dockerfile b/approval-request-metric-collector/docker/metric-collector.Dockerfile new file mode 100644 index 0000000..641c08a --- /dev/null +++ b/approval-request-metric-collector/docker/metric-collector.Dockerfile @@ -0,0 +1,27 @@ +# Build stage +FROM golang:1.24 AS builder + +WORKDIR /workspace + +# Copy go mod files +COPY go.mod go.sum* ./ +RUN go mod download + +# Copy source code +COPY apis/ apis/ +COPY pkg/ pkg/ +COPY cmd/ cmd/ + +# Build the collector +ARG GOARCH +RUN CGO_ENABLED=0 GOOS=linux GOARCH=${GOARCH} go build \ + -a -o metric-collector \ + ./cmd/metriccollector + +# Runtime stage +FROM gcr.io/distroless/static:nonroot +WORKDIR / +COPY --from=builder /workspace/metric-collector . 
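+# Run as the non-root user baked into the distroless image (uid 65532, matching the chart's pod securityContext)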
+USER 65532:65532 + +ENTRYPOINT ["/metric-collector"] diff --git a/approval-request-metric-collector/examples/membercluster/fleet_v1beta1_membercluster.yaml b/approval-request-metric-collector/examples/membercluster/fleet_v1beta1_membercluster.yaml new file mode 100644 index 0000000..d45e1bd --- /dev/null +++ b/approval-request-metric-collector/examples/membercluster/fleet_v1beta1_membercluster.yaml @@ -0,0 +1,38 @@ +apiVersion: cluster.kubernetes-fleet.io/v1beta1 +kind: MemberCluster +metadata: + name: kind-cluster-1 + labels: + environment: staging +spec: + identity: + name: fleet-member-agent-cluster-1 + kind: ServiceAccount + namespace: fleet-system + apiGroup: "" +--- +apiVersion: cluster.kubernetes-fleet.io/v1beta1 +kind: MemberCluster +metadata: + name: kind-cluster-2 + labels: + environment: prod +spec: + identity: + name: fleet-member-agent-cluster-2 + kind: ServiceAccount + namespace: fleet-system + apiGroup: "" +--- +apiVersion: cluster.kubernetes-fleet.io/v1beta1 +kind: MemberCluster +metadata: + name: kind-cluster-3 + labels: + environment: prod +spec: + identity: + name: fleet-member-agent-cluster-3 + kind: ServiceAccount + namespace: fleet-system + apiGroup: "" diff --git a/approval-request-metric-collector/examples/prometheus/configmap.yaml b/approval-request-metric-collector/examples/prometheus/configmap.yaml new file mode 100644 index 0000000..e300b16 --- /dev/null +++ b/approval-request-metric-collector/examples/prometheus/configmap.yaml @@ -0,0 +1,41 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config + namespace: prometheus +data: + prometheus.yml: | + global: + scrape_interval: 15s + evaluation_interval: 15s + + scrape_configs: + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: + - role: pod + # Scrape pods from all namespaces + relabel_configs: + # Only scrape pods with prometheus.io/scrape annotation + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + # Use the port from prometheus.io/port annotation or default pod IP + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + # Use the path from prometheus.io/path annotation or default /metrics + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + # Add pod metadata as labels + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + target_label: pod + - source_labels: [__meta_kubernetes_pod_label_app] + target_label: app + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) diff --git a/approval-request-metric-collector/examples/prometheus/deployment.yaml b/approval-request-metric-collector/examples/prometheus/deployment.yaml new file mode 100644 index 0000000..a922073 --- /dev/null +++ b/approval-request-metric-collector/examples/prometheus/deployment.yaml @@ -0,0 +1,48 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus + namespace: prometheus + labels: + app: prometheus +spec: + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + spec: + serviceAccountName: prometheus + containers: + - name: prometheus + image: prom/prometheus:v2.47.0 + args: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - 
'--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + - '--web.enable-lifecycle' + ports: + - name: web + containerPort: 9090 + volumeMounts: + - name: prometheus-config + mountPath: /etc/prometheus + - name: prometheus-storage + mountPath: /prometheus + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + volumes: + - name: prometheus-config + configMap: + name: prometheus-config + - name: prometheus-storage + emptyDir: {} diff --git a/approval-request-metric-collector/examples/prometheus/prometheus-crp.yaml b/approval-request-metric-collector/examples/prometheus/prometheus-crp.yaml new file mode 100644 index 0000000..695d88b --- /dev/null +++ b/approval-request-metric-collector/examples/prometheus/prometheus-crp.yaml @@ -0,0 +1,22 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ClusterResourcePlacement +metadata: + name: prometheus-crp +spec: + resourceSelectors: + - group: "" + version: v1 + kind: Namespace + name: prometheus + - group: "rbac.authorization.k8s.io" + version: v1 + kind: ClusterRole + name: prometheus + - group: "rbac.authorization.k8s.io" + version: v1 + kind: ClusterRoleBinding + name: prometheus + policy: + placementType: PickAll + strategy: + type: RollingUpdate diff --git a/approval-request-metric-collector/examples/prometheus/rbac.yaml b/approval-request-metric-collector/examples/prometheus/rbac.yaml new file mode 100644 index 0000000..4dd638d --- /dev/null +++ b/approval-request-metric-collector/examples/prometheus/rbac.yaml @@ -0,0 +1,39 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus + namespace: prometheus +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] + - apiGroups: + - extensions + resources: + - ingresses + verbs: ["get", "list", "watch"] + - nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: + - kind: ServiceAccount + name: prometheus + namespace: prometheus diff --git a/approval-request-metric-collector/examples/prometheus/service.yaml b/approval-request-metric-collector/examples/prometheus/service.yaml new file mode 100644 index 0000000..ff61964 --- /dev/null +++ b/approval-request-metric-collector/examples/prometheus/service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: prometheus + labels: + app: prometheus +spec: + type: ClusterIP + ports: + - name: web + port: 9090 + targetPort: 9090 + protocol: TCP + selector: + app: prometheus diff --git a/approval-request-metric-collector/examples/sample-metric-app/sample-metric-app.yaml b/approval-request-metric-collector/examples/sample-metric-app/sample-metric-app.yaml new file mode 100644 index 0000000..2714d65 --- /dev/null +++ b/approval-request-metric-collector/examples/sample-metric-app/sample-metric-app.yaml @@ -0,0 +1,30 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: sample-metric-app + namespace: test-ns + labels: + app: sample-metric-app +spec: + replicas: 1 + selector: + matchLabels: + app: sample-metric-app + template: + metadata: + labels: + app: sample-metric-app + annotations: + prometheus.io/scrape: "true" + 
prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + spec: + containers: + - name: metric-app + image: arvindacr.azurecr.io/metric-app:latest + imagePullPolicy: Always + env: + - name: WORKLOAD_KIND + value: "Deployment" + ports: + - containerPort: 8080 diff --git a/approval-request-metric-collector/examples/updateRun/example-crp.yaml b/approval-request-metric-collector/examples/updateRun/example-crp.yaml new file mode 100644 index 0000000..21a0827 --- /dev/null +++ b/approval-request-metric-collector/examples/updateRun/example-crp.yaml @@ -0,0 +1,14 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ClusterResourcePlacement +metadata: + name: example-crp +spec: + resourceSelectors: + - group: "" + kind: Namespace + name: test-ns + version: v1 + policy: + placementType: PickAll + strategy: + type: External diff --git a/approval-request-metric-collector/examples/updateRun/example-csur.yaml b/approval-request-metric-collector/examples/updateRun/example-csur.yaml new file mode 100644 index 0000000..ece9a3b --- /dev/null +++ b/approval-request-metric-collector/examples/updateRun/example-csur.yaml @@ -0,0 +1,9 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ClusterStagedUpdateRun +metadata: + name: example-cluster-staged-run +spec: + placementName: example-crp + resourceSnapshotIndex: "0" + stagedRolloutStrategyName: example-cluster-staged-strategy + state: Started diff --git a/approval-request-metric-collector/examples/updateRun/example-csus.yaml b/approval-request-metric-collector/examples/updateRun/example-csus.yaml new file mode 100644 index 0000000..9b9a9a7 --- /dev/null +++ b/approval-request-metric-collector/examples/updateRun/example-csus.yaml @@ -0,0 +1,18 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ClusterStagedUpdateStrategy +metadata: + name: example-cluster-staged-strategy +spec: + stages: + - name: staging + labelSelector: + matchLabels: + environment: staging + afterStageTasks: + - type: Approval + - name: prod + labelSelector: + matchLabels: + environment: prod + afterStageTasks: + - type: Approval diff --git a/approval-request-metric-collector/examples/updateRun/example-ns-only-crp.yaml b/approval-request-metric-collector/examples/updateRun/example-ns-only-crp.yaml new file mode 100644 index 0000000..54dd705 --- /dev/null +++ b/approval-request-metric-collector/examples/updateRun/example-ns-only-crp.yaml @@ -0,0 +1,15 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ClusterResourcePlacement +metadata: + name: ns-only-crp +spec: + resourceSelectors: + - group: "" + kind: Namespace + name: test-ns + version: v1 + selectionScope: NamespaceOnly + policy: + placementType: PickAll + strategy: + type: RollingUpdate diff --git a/approval-request-metric-collector/examples/updateRun/example-rp.yaml b/approval-request-metric-collector/examples/updateRun/example-rp.yaml new file mode 100644 index 0000000..214d1c3 --- /dev/null +++ b/approval-request-metric-collector/examples/updateRun/example-rp.yaml @@ -0,0 +1,15 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ResourcePlacement +metadata: + name: example-rp + namespace: test-ns +spec: + resourceSelectors: + - group: "apps" + kind: Deployment + name: sample-metric-app + version: v1 + policy: + placementType: PickAll + strategy: + type: External diff --git a/approval-request-metric-collector/examples/updateRun/example-sur.yaml b/approval-request-metric-collector/examples/updateRun/example-sur.yaml new file mode 100644 index 0000000..bb1471f --- /dev/null +++ 
b/approval-request-metric-collector/examples/updateRun/example-sur.yaml @@ -0,0 +1,10 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: StagedUpdateRun +metadata: + name: example-staged-run + namespace: test-ns +spec: + placementName: example-rp + resourceSnapshotIndex: "0" + stagedRolloutStrategyName: example-staged-strategy + state: Started diff --git a/approval-request-metric-collector/examples/updateRun/example-sus.yaml b/approval-request-metric-collector/examples/updateRun/example-sus.yaml new file mode 100644 index 0000000..4505e29 --- /dev/null +++ b/approval-request-metric-collector/examples/updateRun/example-sus.yaml @@ -0,0 +1,19 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: StagedUpdateStrategy +metadata: + name: example-staged-strategy + namespace: test-ns +spec: + stages: + - name: staging + labelSelector: + matchLabels: + environment: staging + afterStageTasks: + - type: Approval + - name: prod + labelSelector: + matchLabels: + environment: prod + afterStageTasks: + - type: Approval diff --git a/approval-request-metric-collector/examples/workloadtracker/clusterstagedworkloadtracker.yaml b/approval-request-metric-collector/examples/workloadtracker/clusterstagedworkloadtracker.yaml new file mode 100644 index 0000000..f521ab4 --- /dev/null +++ b/approval-request-metric-collector/examples/workloadtracker/clusterstagedworkloadtracker.yaml @@ -0,0 +1,9 @@ +apiVersion: autoapprove.kubernetes-fleet.io/v1alpha1 +kind: ClusterStagedWorkloadTracker +metadata: + # The name must match the name of the ClusterStagedUpdateRun it is used for + name: example-cluster-staged-run +workloads: + - name: sample-metric-app + namespace: test-ns + kind: Deployment diff --git a/approval-request-metric-collector/examples/workloadtracker/stagedworkloadtracker.yaml b/approval-request-metric-collector/examples/workloadtracker/stagedworkloadtracker.yaml new file mode 100644 index 0000000..2da34fe --- /dev/null +++ b/approval-request-metric-collector/examples/workloadtracker/stagedworkloadtracker.yaml @@ -0,0 +1,10 @@ +apiVersion: autoapprove.kubernetes-fleet.io/v1alpha1 +kind: StagedWorkloadTracker +metadata: + # The name and namespace must match the name and namespace of the StagedUpdateRun it is used for + name: example-staged-run + namespace: test-ns +workloads: + - name: sample-metric-app + namespace: test-ns + kind: Deployment diff --git a/approval-request-metric-collector/go.mod b/approval-request-metric-collector/go.mod new file mode 100644 index 0000000..1513223 --- /dev/null +++ b/approval-request-metric-collector/go.mod @@ -0,0 +1,72 @@ +module github.com/kubefleet-dev/kubefleet-cookbook/approval-request-metric-collector + +go 1.24.9 + +require ( + github.com/kubefleet-dev/kubefleet v0.1.2 + github.com/prometheus/client_golang v1.22.0 + k8s.io/api v0.34.1 + k8s.io/apiextensions-apiserver v0.34.1 + k8s.io/apimachinery v0.34.1 + k8s.io/client-go v0.34.1 + k8s.io/klog/v2 v2.130.1 + sigs.k8s.io/controller-runtime v0.22.4 +) + +require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/emicklei/go-restful/v3 v3.12.2 // indirect + github.com/evanphx/json-patch/v5 v5.9.11 // indirect + github.com/fsnotify/fsnotify v1.9.0 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/zapr v1.3.0 // indirect + github.com/go-openapi/jsonpointer v0.21.1 // indirect + 
github.com/go-openapi/jsonreference v0.21.0 // indirect + github.com/go-openapi/swag v0.23.1 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/google/btree v1.1.3 // indirect + github.com/google/gnostic-models v0.7.0 // indirect + github.com/google/go-cmp v0.7.0 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/josharian/intern v1.0.0 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/mailru/easyjson v0.9.0 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/onsi/gomega v1.37.0 // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.62.0 // indirect + github.com/prometheus/procfs v0.15.1 // indirect + github.com/spf13/pflag v1.0.6 // indirect + github.com/x448/float16 v0.8.4 // indirect + go.goms.io/fleet-networking v0.3.3 // indirect + go.uber.org/multierr v1.11.0 // indirect + go.uber.org/zap v1.27.0 // indirect + go.yaml.in/yaml/v2 v2.4.2 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/net v0.47.0 // indirect + golang.org/x/oauth2 v0.29.0 // indirect + golang.org/x/sync v0.18.0 // indirect + golang.org/x/sys v0.38.0 // indirect + golang.org/x/term v0.37.0 // indirect + golang.org/x/text v0.31.0 // indirect + golang.org/x/time v0.11.0 // indirect + gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect + google.golang.org/protobuf v1.36.6 // indirect + gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect + k8s.io/metrics v0.32.3 // indirect + k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 // indirect + sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect + sigs.k8s.io/randfill v1.0.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect + sigs.k8s.io/yaml v1.6.0 // indirect +) diff --git a/approval-request-metric-collector/go.sum b/approval-request-metric-collector/go.sum new file mode 100644 index 0000000..90d0995 --- /dev/null +++ b/approval-request-metric-collector/go.sum @@ -0,0 +1,196 @@ +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= +github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= +github.com/evanphx/json-patch v0.5.2/go.mod 
h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= +github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= +github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= +github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= +github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= +github.com/go-openapi/jsonpointer v0.21.1 h1:whnzv/pNXtK2FbX/W9yJfRmE2gsmkfahjMKB0fZvcic= +github.com/go-openapi/jsonpointer v0.21.1/go.mod h1:50I1STOfbY1ycR8jGz8DaMeLCdXiI6aDteEdRNNzpdk= +github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= +github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= +github.com/go-openapi/swag v0.23.1 h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZU= +github.com/go-openapi/swag v0.23.1/go.mod h1:STZs8TbRvEQQKUA+JZNAm3EWlgaOBGpyFDqQnDHMef0= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= +github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= +github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= +github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= 
+github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kubefleet-dev/kubefleet v0.1.2 h1:BUOwehI9iBavU6TEbebrSxtFXHwyOcY1eacHyfHEjxo= +github.com/kubefleet-dev/kubefleet v0.1.2/go.mod h1:EYDCdtdM02qQkH3Gm5/K1cHDy26f2LbM7WzVGn2saLs= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= +github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= +github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= +github.com/onsi/gomega v1.37.0 h1:CdEG8g0S133B4OswTDC/5XPSzE1OeP29QOioj2PID2Y= +github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= +github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io= +github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= +github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= 
+github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= +github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= +github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= +github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.goms.io/fleet-networking v0.3.3 h1:5rwBntaUoLF+E1CzaWAEL4GdvLJPQorKhjgkbLlllPE= +go.goms.io/fleet-networking v0.3.3/go.mod h1:Qgbi8M1fGaz/p5rtb6HJPmTDATWRnMt9HD1gz57WKUc= +go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= +go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= +go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= +go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= +go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= +golang.org/x/oauth2 v0.29.0 h1:WdYw2tdTK1S8olAzWHdgeqfy+Mtm9XNhv/xJsY65d98= +golang.org/x/oauth2 v0.29.0/go.mod 
h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= +golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= +golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= +golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= +golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= +golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= +golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= +gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= +google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= +google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= +gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= 
+gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +k8s.io/api v0.34.1 h1:jC+153630BMdlFukegoEL8E/yT7aLyQkIVuwhmwDgJM= +k8s.io/api v0.34.1/go.mod h1:SB80FxFtXn5/gwzCoN6QCtPD7Vbu5w2n1S0J5gFfTYk= +k8s.io/apiextensions-apiserver v0.34.1 h1:NNPBva8FNAPt1iSVwIE0FsdrVriRXMsaWFMqJbII2CI= +k8s.io/apiextensions-apiserver v0.34.1/go.mod h1:hP9Rld3zF5Ay2Of3BeEpLAToP+l4s5UlxiHfqRaRcMc= +k8s.io/apimachinery v0.34.1 h1:dTlxFls/eikpJxmAC7MVE8oOeP1zryV7iRyIjB0gky4= +k8s.io/apimachinery v0.34.1/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= +k8s.io/client-go v0.34.1 h1:ZUPJKgXsnKwVwmKKdPfw4tB58+7/Ik3CrjOEhsiZ7mY= +k8s.io/client-go v0.34.1/go.mod h1:kA8v0FP+tk6sZA0yKLRG67LWjqufAoSHA2xVGKw9Of8= +k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= +k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= +k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b h1:MloQ9/bdJyIu9lb1PzujOPolHyvO06MXG5TUIj2mNAA= +k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= +k8s.io/metrics v0.32.3 h1:2vsBvw0v8rIIlczZ/lZ8Kcqk9tR6Fks9h+dtFNbc2a4= +k8s.io/metrics v0.32.3/go.mod h1:9R1Wk5cb+qJpCQon9h52mgkVCcFeYxcY+YkumfwHVCU= +k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= +k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327UfMq9A= +sigs.k8s.io/controller-runtime v0.22.4/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8= +sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= +sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= +sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco= +sigs.k8s.io/structured-merge-diff/v6 v6.3.0/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/approval-request-metric-collector/hack/boilerplate.go.txt b/approval-request-metric-collector/hack/boilerplate.go.txt new file mode 100644 index 0000000..1f31a2d --- /dev/null +++ b/approval-request-metric-collector/hack/boilerplate.go.txt @@ -0,0 +1,15 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ diff --git a/approval-request-metric-collector/pkg/controllers/approvalrequest/controller.go b/approval-request-metric-collector/pkg/controllers/approvalrequest/controller.go new file mode 100644 index 0000000..6180838 --- /dev/null +++ b/approval-request-metric-collector/pkg/controllers/approvalrequest/controller.go @@ -0,0 +1,502 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package approvalrequest features a controller to reconcile ApprovalRequest objects +// and create MetricCollectorReport resources on the hub cluster for metric collection. +package approvalrequest + +import ( + "context" + "fmt" + "time" + + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + "k8s.io/klog/v2" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/predicate" + + autoapprovev1alpha1 "github.com/kubefleet-dev/kubefleet-cookbook/approval-request-metric-collector/apis/autoapprove/v1alpha1" + placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" + "github.com/kubefleet-dev/kubefleet/pkg/utils" +) + +const ( + // metricCollectorFinalizer is the finalizer added to ApprovalRequest objects for cleanup. + metricCollectorFinalizer = "kubernetes-fleet.io/metric-collector-report-cleanup" + + // prometheusURL is the default Prometheus URL to use for all clusters + prometheusURL = "http://prometheus.prometheus.svc.cluster.local:9090" + + // parentApprovalRequestLabel is the label key used to track which ApprovalRequest owns the MetricCollectorReport + parentApprovalRequestLabel = "kubernetes-fleet.io/parent-approval-request" +) + +// Reconciler reconciles an ApprovalRequest object and creates MetricCollectorReport resources +// on the hub cluster in fleet-member-{clusterName} namespaces. +type Reconciler struct { + client.Client + recorder record.EventRecorder +} + +// Reconcile reconciles an ApprovalRequest or ClusterApprovalRequest object. 
+func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + startTime := time.Now() + klog.V(2).InfoS("ApprovalRequest reconciliation starts", "request", req.NamespacedName) + defer func() { + latency := time.Since(startTime).Milliseconds() + klog.V(2).InfoS("ApprovalRequest reconciliation ends", "request", req.NamespacedName, "latency", latency) + }() + + approvalReqObj, err := r.getApprovalRequestObj(ctx, req) + if err != nil { + if errors.IsNotFound(err) { + klog.V(2).InfoS("ApprovalRequest not found, ignoring", "request", req.NamespacedName) + return ctrl.Result{}, nil + } + klog.ErrorS(err, "Failed to get ApprovalRequest", "request", req.NamespacedName) + return ctrl.Result{}, err + } + + return r.reconcileApprovalRequestObj(ctx, approvalReqObj) +} + +// getApprovalRequestObj fetches either ApprovalRequest or ClusterApprovalRequest based on the request namespace. +func (r *Reconciler) getApprovalRequestObj(ctx context.Context, req ctrl.Request) (placementv1beta1.ApprovalRequestObj, error) { + if req.Namespace != "" { + // Fetch namespaced ApprovalRequest + approvalReq := &placementv1beta1.ApprovalRequest{} + if err := r.Client.Get(ctx, req.NamespacedName, approvalReq); err != nil { + return nil, err + } + return approvalReq, nil + } + + // Fetch cluster-scoped ClusterApprovalRequest + clusterApprovalReq := &placementv1beta1.ClusterApprovalRequest{} + if err := r.Client.Get(ctx, types.NamespacedName{Name: req.Name}, clusterApprovalReq); err != nil { + return nil, err + } + return clusterApprovalReq, nil +} + +// reconcileApprovalRequestObj reconciles an ApprovalRequestObj (either ApprovalRequest or ClusterApprovalRequest). +func (r *Reconciler) reconcileApprovalRequestObj(ctx context.Context, approvalReqObj placementv1beta1.ApprovalRequestObj) (ctrl.Result, error) { + approvalReqRef := klog.KObj(approvalReqObj) + + // Handle deletion + if !approvalReqObj.GetDeletionTimestamp().IsZero() { + return r.handleDelete(ctx, approvalReqObj) + } + + // Check if the approval request is already approved or rejected - stop reconciliation if so + approvedCond := meta.FindStatusCondition(approvalReqObj.GetApprovalRequestStatus().Conditions, string(placementv1beta1.ApprovalRequestConditionApproved)) + if approvedCond != nil && approvedCond.Status == metav1.ConditionTrue { + klog.V(2).InfoS("ApprovalRequest has been approved, stopping reconciliation", "approvalRequest", approvalReqRef) + return ctrl.Result{}, nil + } + + // Add finalizer if not present + if !controllerutil.ContainsFinalizer(approvalReqObj, metricCollectorFinalizer) { + controllerutil.AddFinalizer(approvalReqObj, metricCollectorFinalizer) + if err := r.Client.Update(ctx, approvalReqObj); err != nil { + klog.ErrorS(err, "Failed to add finalizer", "approvalRequest", approvalReqRef) + return ctrl.Result{}, err + } + klog.V(2).InfoS("Added finalizer to ApprovalRequest", "approvalRequest", approvalReqRef) + } + + // Get the UpdateRun (ClusterStagedUpdateRun or StagedUpdateRun) + spec := approvalReqObj.GetApprovalRequestSpec() + updateRunName := spec.TargetUpdateRun + stageName := spec.TargetStage + + var stageStatus *placementv1beta1.StageUpdatingStatus + if approvalReqObj.GetNamespace() == "" { + updateRun := &placementv1beta1.ClusterStagedUpdateRun{} + if err := r.Client.Get(ctx, types.NamespacedName{Name: updateRunName}, updateRun); err != nil { + klog.ErrorS(err, "Failed to get ClusterStagedUpdateRun", "approvalRequest", approvalReqRef, "updateRun", updateRunName) + return ctrl.Result{}, err + } + + // 
Find the stage + for i := range updateRun.Status.StagesStatus { + if updateRun.Status.StagesStatus[i].StageName == stageName { + stageStatus = &updateRun.Status.StagesStatus[i] + break + } + } + } else { + updateRun := &placementv1beta1.StagedUpdateRun{} + if err := r.Client.Get(ctx, types.NamespacedName{Name: updateRunName, Namespace: approvalReqObj.GetNamespace()}, updateRun); err != nil { + klog.ErrorS(err, "Failed to get StagedUpdateRun", "approvalRequest", approvalReqRef, "updateRun", updateRunName) + return ctrl.Result{}, err + } + + // Find the stage + for i := range updateRun.Status.StagesStatus { + if updateRun.Status.StagesStatus[i].StageName == stageName { + stageStatus = &updateRun.Status.StagesStatus[i] + break + } + } + } + + if stageStatus == nil { + // This should never happen - ApprovalRequest is only created after stage initialization + // If we reach here, it indicates an unexpected state inconsistency + // This is a non-retriable error - retrying won't fix the underlying issue + klog.ErrorS(nil, "Unexpected state: stage not found in UpdateRun - this indicates unexpected behavior as ApprovalRequest should only be created for initialized stages", "approvalRequest", approvalReqRef, "updateRun", updateRunName, "stage", stageName) + r.recorder.Event(approvalReqObj, "Warning", "UnexpectedState", fmt.Sprintf("Stage %s not found in UpdateRun %s", stageName, updateRunName)) + // Don't return error to avoid retries - this won't be fixed by reconciliation + return ctrl.Result{}, nil + } + + // Get all cluster names from the stage + clusterNames := make([]string, 0, len(stageStatus.Clusters)) + for _, cluster := range stageStatus.Clusters { + clusterNames = append(clusterNames, cluster.ClusterName) + } + + if len(clusterNames) == 0 { + klog.V(2).InfoS("No clusters in stage, skipping", "approvalRequest", approvalReqRef, "stage", stageName) + return ctrl.Result{}, nil + } + + klog.V(2).InfoS("Found clusters in stage", "approvalRequest", approvalReqRef, "stage", stageName, "clusters", clusterNames) + + // Create or update MetricCollectorReport resources in fleet-member namespaces + if err := r.ensureMetricCollectorReports(ctx, approvalReqObj, clusterNames, updateRunName, stageName); err != nil { + klog.ErrorS(err, "Failed to ensure MetricCollectorReport resources", "approvalRequest", approvalReqRef) + return ctrl.Result{}, err + } + + klog.V(2).InfoS("Successfully ensured MetricCollectorReport resources", "approvalRequest", approvalReqRef, "clusters", clusterNames) + + // Check workload health and approve if all workloads are healthy + if err := r.checkWorkloadHealthAndApprove(ctx, approvalReqObj, clusterNames, updateRunName, stageName); err != nil { + klog.ErrorS(err, "Failed to check workload health", "approvalRequest", approvalReqRef) + return ctrl.Result{}, err + } + + // Requeue after 15 seconds to check again (will stop if approved in next reconciliation) + return ctrl.Result{RequeueAfter: 15 * time.Second}, nil +} + +// ensureMetricCollectorReports creates MetricCollectorReport in each fleet-member-{clusterName} namespace +func (r *Reconciler) ensureMetricCollectorReports( + ctx context.Context, + approvalReq placementv1beta1.ApprovalRequestObj, + clusterNames []string, + updateRunName, stageName string, +) error { + // Generate report name (same for all clusters, different namespaces) + reportName := fmt.Sprintf("mc-%s-%s", updateRunName, stageName) + + // Create MetricCollectorReport in each fleet-member namespace + // Note: We cannot use owner references here because 
Kubernetes does not allow cross-namespace + // owner references. The ApprovalRequest (in one namespace or cluster-scoped) cannot be set as + // the owner of MetricCollectorReports in different fleet-member-* namespaces. Instead, we use + // a finalizer on the ApprovalRequest to ensure proper cleanup when it's deleted. + for _, clusterName := range clusterNames { + reportNamespace := fmt.Sprintf(utils.NamespaceNameFormat, clusterName) + + report := &autoapprovev1alpha1.MetricCollectorReport{ + ObjectMeta: metav1.ObjectMeta{ + Name: reportName, + Namespace: reportNamespace, + }, + } + + // Create or update MetricCollectorReport using controllerutil + op, err := controllerutil.CreateOrUpdate(ctx, r.Client, report, func() error { + // Set labels + if report.Labels == nil { + report.Labels = make(map[string]string) + } + + // Set parent-approval-request label to uniquely identify the ApprovalRequest + // For cluster-scoped ApprovalRequests: just the name + // For namespace-scoped ApprovalRequests: namespace.name format (using dot as separator) + if approvalReq.GetNamespace() == "" { + // Cluster-scoped: ClusterApprovalRequest + report.Labels[parentApprovalRequestLabel] = approvalReq.GetName() + } else { + // Namespace-scoped: ApprovalRequest (use dot instead of slash for valid label) + report.Labels[parentApprovalRequestLabel] = fmt.Sprintf("%s.%s", approvalReq.GetNamespace(), approvalReq.GetName()) + } + + // Set spec + // PrometheusURL is a configurable spec field that could differ per cluster. + // For setup simplicity, we use a constant value pointing to the Prometheus service + // deployed via examples/prometheus/service.yaml and propagated to all clusters. + // This assumes Prometheus is deployed with the same service name/namespace on all member clusters. + report.Spec.PrometheusURL = prometheusURL + + return nil + }) + + if err != nil { + return fmt.Errorf("failed to create or update MetricCollectorReport in %s: %w", reportNamespace, err) + } + + klog.V(2).InfoS("Ensured MetricCollectorReport", "report", reportName, "namespace", reportNamespace, "cluster", clusterName, "operation", op) + } + + return nil +} + +// checkWorkloadHealthAndApprove checks if all workloads specified in ClusterStagedWorkloadTracker or StagedWorkloadTracker are healthy +// across all clusters in the stage, and approves the ApprovalRequest if they are. 
+func (r *Reconciler) checkWorkloadHealthAndApprove(
+	ctx context.Context,
+	approvalReqObj placementv1beta1.ApprovalRequestObj,
+	clusterNames []string,
+	updateRunName, stageName string,
+) error {
+	approvalReqRef := klog.KObj(approvalReqObj)
+
+	klog.V(2).InfoS("Starting workload health check", "approvalRequest", approvalReqRef, "clusters", clusterNames)
+
+	// Get the appropriate WorkloadTracker based on scope
+	// The WorkloadTracker name matches the UpdateRun name
+	var workloads []autoapprovev1alpha1.WorkloadReference
+	var workloadTrackerName string
+
+	if approvalReqObj.GetNamespace() == "" {
+		// Cluster-scoped: Get ClusterStagedWorkloadTracker with same name as ClusterStagedUpdateRun
+		clusterWorkloadTracker := &autoapprovev1alpha1.ClusterStagedWorkloadTracker{}
+		if err := r.Client.Get(ctx, types.NamespacedName{Name: updateRunName}, clusterWorkloadTracker); err != nil {
+			if errors.IsNotFound(err) {
+				klog.V(2).InfoS("ClusterStagedWorkloadTracker not found, skipping health check", "approvalRequest", approvalReqRef, "updateRun", updateRunName)
+				return nil
+			}
+			klog.ErrorS(err, "Failed to get ClusterStagedWorkloadTracker", "approvalRequest", approvalReqRef, "updateRun", updateRunName)
+			return fmt.Errorf("failed to get ClusterStagedWorkloadTracker: %w", err)
+		}
+		workloads = clusterWorkloadTracker.Workloads
+		workloadTrackerName = clusterWorkloadTracker.Name
+		klog.V(2).InfoS("Found ClusterStagedWorkloadTracker", "approvalRequest", approvalReqRef, "workloadTracker", workloadTrackerName, "workloadCount", len(workloads))
+	} else {
+		// Namespace-scoped: Get StagedWorkloadTracker with same name and namespace as StagedUpdateRun
+		stagedWorkloadTracker := &autoapprovev1alpha1.StagedWorkloadTracker{}
+		if err := r.Client.Get(ctx, types.NamespacedName{Name: updateRunName, Namespace: approvalReqObj.GetNamespace()}, stagedWorkloadTracker); err != nil {
+			if errors.IsNotFound(err) {
+				klog.V(2).InfoS("StagedWorkloadTracker not found, skipping health check", "approvalRequest", approvalReqRef, "updateRun", updateRunName, "namespace", approvalReqObj.GetNamespace())
+				return nil
+			}
+			klog.ErrorS(err, "Failed to get StagedWorkloadTracker", "approvalRequest", approvalReqRef, "updateRun", updateRunName)
+			return fmt.Errorf("failed to get StagedWorkloadTracker: %w", err)
+		}
+		workloads = stagedWorkloadTracker.Workloads
+		workloadTrackerName = stagedWorkloadTracker.Name
+		klog.V(2).InfoS("Found StagedWorkloadTracker", "approvalRequest", approvalReqRef, "workloadTracker", klog.KObj(stagedWorkloadTracker), "workloadCount", len(workloads))
+	}
+
+	if len(workloads) == 0 {
+		klog.V(2).InfoS("WorkloadTracker has no workloads defined, skipping health check", "approvalRequest", approvalReqRef, "workloadTracker", workloadTrackerName)
+		return nil
+	}
+
+	// The MetricCollectorReport name must match the name generated in ensureMetricCollectorReports ("mc-<updateRunName>-<stageName>")
+	metricCollectorName := fmt.Sprintf("mc-%s-%s", updateRunName, stageName)
+
+	// Check each cluster for the required workloads
+	allHealthy := true
+	unhealthyDetails := []string{}
+
+	for _, clusterName := range clusterNames {
+		reportNamespace := fmt.Sprintf(utils.NamespaceNameFormat, clusterName)
+
+		klog.V(2).InfoS("Checking MetricCollectorReport", "approvalRequest", approvalReqRef, "cluster", clusterName, "reportName", metricCollectorName, "reportNamespace", reportNamespace)
+
+		// Get MetricCollectorReport for this cluster
+		report := &autoapprovev1alpha1.MetricCollectorReport{}
+		err := r.Client.Get(ctx, types.NamespacedName{
+			Name:      metricCollectorName,
+			Namespace: 
reportNamespace, + }, report) + + if err != nil { + if errors.IsNotFound(err) { + klog.V(2).InfoS("MetricCollectorReport not found yet", "approvalRequest", approvalReqRef, "cluster", clusterName, "report", metricCollectorName, "namespace", reportNamespace) + allHealthy = false + unhealthyDetails = append(unhealthyDetails, fmt.Sprintf("cluster %s: report not found", clusterName)) + continue + } + klog.ErrorS(err, "Failed to get MetricCollectorReport", "approvalRequest", approvalReqRef, "cluster", clusterName, "report", metricCollectorName, "namespace", reportNamespace) + return fmt.Errorf("failed to get MetricCollectorReport for cluster %s: %w", clusterName, err) + } + + klog.V(2).InfoS("Found MetricCollectorReport", "approvalRequest", approvalReqRef, "cluster", clusterName, "collectedMetrics", len(report.Status.CollectedMetrics), "workloadsMonitored", report.Status.WorkloadsMonitored) + + // Check if all workloads from WorkloadTracker are present and healthy + for _, trackedWorkload := range workloads { + found := false + healthy := false + + // Important: Simplified health check using first matching metric + // When a workload has multiple pods/replicas, the MetricCollectorReport will contain + // multiple WorkloadMetrics entries (one per pod). This implementation uses the FIRST + // matching metric to determine workload health. + // + // Limitation: If different pods report different health states, only the first one + // encountered is used for approval decisions. + // + // To implement aggregation logic (e.g., all pods must be healthy, or majority healthy): + // 1. Remove the 'break' statement below + // 2. Collect all matching metrics into a slice + // 3. Apply your aggregation logic (e.g., allHealthy := all metrics have Health==true) + // 4. Set 'healthy' based on the aggregated result + for _, collectedMetric := range report.Status.CollectedMetrics { + // Match workload by namespace, name, and kind. 
+ if collectedMetric.Namespace == trackedWorkload.Namespace && + collectedMetric.WorkloadName == trackedWorkload.Name && + trackedWorkload.Kind == collectedMetric.WorkloadKind { + found = true + healthy = collectedMetric.Health + klog.V(2).InfoS("Workload metric found", "approvalRequest", approvalReqRef, "cluster", clusterName, "workload", trackedWorkload.Name, "namespace", trackedWorkload.Namespace, "kind", trackedWorkload.Kind, "healthy", healthy) + break // Remove this to collect all metrics for aggregation + } + } + + if !found { + klog.V(2).InfoS("Workload not found in MetricCollectorReport", "approvalRequest", approvalReqRef, "cluster", clusterName, "workload", trackedWorkload.Name, "namespace", trackedWorkload.Namespace) + allHealthy = false + unhealthyDetails = append(unhealthyDetails, + fmt.Sprintf("cluster %s: workload %s/%s not found", clusterName, trackedWorkload.Namespace, trackedWorkload.Name)) + } else if !healthy { + klog.V(2).InfoS("Workload is not healthy", "approvalRequest", approvalReqRef, "cluster", clusterName, "workload", trackedWorkload.Name, "namespace", trackedWorkload.Namespace) + allHealthy = false + unhealthyDetails = append(unhealthyDetails, + fmt.Sprintf("cluster %s: workload %s/%s unhealthy", clusterName, trackedWorkload.Namespace, trackedWorkload.Name)) + } + } + } + + // If all workloads are healthy across all clusters, approve the ApprovalRequest + if allHealthy { + klog.InfoS("All workloads are healthy, approving ApprovalRequest", "approvalRequest", approvalReqRef, "clusters", clusterNames, "workloads", len(workloads)) + + status := approvalReqObj.GetApprovalRequestStatus() + // we have already checked that the condition is not present or not true. + meta.SetStatusCondition(&status.Conditions, metav1.Condition{ + Type: string(placementv1beta1.ApprovalRequestConditionApproved), + Status: metav1.ConditionTrue, + ObservedGeneration: approvalReqObj.GetGeneration(), + Reason: "AllWorkloadsHealthy", + Message: fmt.Sprintf("All %d workloads are healthy across %d clusters", len(workloads), len(clusterNames)), + }) + + approvalReqObj.SetApprovalRequestStatus(*status) + if err := r.Client.Status().Update(ctx, approvalReqObj); err != nil { + klog.ErrorS(err, "Failed to approve ApprovalRequest", "approvalRequest", approvalReqRef) + return fmt.Errorf("failed to approve ApprovalRequest: %w", err) + } + + klog.InfoS("Successfully approved ApprovalRequest", "approvalRequest", approvalReqRef) + r.recorder.Event(approvalReqObj, "Normal", "Approved", fmt.Sprintf("All %d workloads are healthy across %d clusters in stage %s", len(workloads), len(clusterNames), stageName)) + + // Approval successful or already approved + return nil + } + + // Not all workloads are healthy yet, log details and return nil (reconcile will requeue) + klog.V(2).InfoS("Not all workloads are healthy yet", "approvalRequest", approvalReqRef, "unhealthyDetails", unhealthyDetails) + + return nil +} + +// handleDelete handles the deletion of an ApprovalRequest or ClusterApprovalRequest +func (r *Reconciler) handleDelete(ctx context.Context, approvalReqObj placementv1beta1.ApprovalRequestObj) (ctrl.Result, error) { + if !controllerutil.ContainsFinalizer(approvalReqObj, metricCollectorFinalizer) { + return ctrl.Result{}, nil + } + + approvalReqRef := klog.KObj(approvalReqObj) + klog.V(2).InfoS("Cleaning up MetricCollectorReports for ApprovalRequest", "approvalRequest", approvalReqRef) + + // Build the parent-approval-request label value to match + // For cluster-scoped: just the name + // For 
namespace-scoped: namespace.name format (using dot as separator) + var parentApprovalRequestValue string + if approvalReqObj.GetNamespace() == "" { + // Cluster-scoped: ClusterApprovalRequest + parentApprovalRequestValue = approvalReqObj.GetName() + } else { + // Namespace-scoped: ApprovalRequest (use dot instead of slash for valid label) + parentApprovalRequestValue = fmt.Sprintf("%s.%s", approvalReqObj.GetNamespace(), approvalReqObj.GetName()) + } + + // List all MetricCollectorReports with the parent-approval-request label across all namespaces + reportList := &autoapprovev1alpha1.MetricCollectorReportList{} + listOptions := []client.ListOption{ + client.MatchingLabels{ + parentApprovalRequestLabel: parentApprovalRequestValue, + }, + } + + if err := r.Client.List(ctx, reportList, listOptions...); err != nil { + klog.ErrorS(err, "Failed to list MetricCollectorReports for cleanup", "approvalRequest", approvalReqRef, "parentApprovalRequest", parentApprovalRequestValue) + return ctrl.Result{}, fmt.Errorf("failed to list MetricCollectorReports: %w", err) + } + + klog.V(2).InfoS("Found MetricCollectorReports to delete", "approvalRequest", approvalReqRef, "count", len(reportList.Items)) + + // Delete all found MetricCollectorReports + for i := range reportList.Items { + report := &reportList.Items[i] + if err := r.Client.Delete(ctx, report); err != nil && !errors.IsNotFound(err) { + klog.ErrorS(err, "Failed to delete MetricCollectorReport", "report", report.Name, "namespace", report.Namespace) + return ctrl.Result{}, fmt.Errorf("failed to delete MetricCollectorReport %s/%s: %w", report.Namespace, report.Name, err) + } + klog.V(2).InfoS("Deleted MetricCollectorReport", "report", report.Name, "namespace", report.Namespace) + } + + // Remove finalizer + controllerutil.RemoveFinalizer(approvalReqObj, metricCollectorFinalizer) + if err := r.Client.Update(ctx, approvalReqObj); err != nil { + klog.ErrorS(err, "Failed to remove finalizer", "approvalRequest", approvalReqRef) + return ctrl.Result{}, err + } + + klog.V(2).InfoS("Successfully cleaned up MetricCollectorReports", "approvalRequest", approvalReqRef, "deletedCount", len(reportList.Items)) + return ctrl.Result{}, nil +} + +// SetupWithManagerForClusterApprovalRequest sets up the controller with the Manager for ClusterApprovalRequest resources. +func (r *Reconciler) SetupWithManagerForClusterApprovalRequest(mgr ctrl.Manager) error { + r.recorder = mgr.GetEventRecorderFor("clusterapprovalrequest-controller") + return ctrl.NewControllerManagedBy(mgr). + Named("clusterapprovalrequest-controller"). + For(&placementv1beta1.ClusterApprovalRequest{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})). + Complete(r) +} + +// SetupWithManagerForApprovalRequest sets up the controller with the Manager for ApprovalRequest resources. +func (r *Reconciler) SetupWithManagerForApprovalRequest(mgr ctrl.Manager) error { + r.recorder = mgr.GetEventRecorderFor("approvalrequest-controller") + return ctrl.NewControllerManagedBy(mgr). + Named("approvalrequest-controller"). + For(&placementv1beta1.ApprovalRequest{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})). 
+ Complete(r) +} diff --git a/approval-request-metric-collector/pkg/controllers/metriccollector/collector.go b/approval-request-metric-collector/pkg/controllers/metriccollector/collector.go new file mode 100644 index 0000000..6952e27 --- /dev/null +++ b/approval-request-metric-collector/pkg/controllers/metriccollector/collector.go @@ -0,0 +1,148 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metriccollector + +import ( + "context" + "encoding/base64" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "time" + + corev1 "k8s.io/api/core/v1" +) + +// PrometheusClient is the interface for querying Prometheus +type PrometheusClient interface { + Query(ctx context.Context, query string) (PrometheusData, error) +} + +// prometheusClient implements PrometheusClient for querying Prometheus API +type prometheusClient struct { + baseURL string + authType string + authSecret *corev1.Secret + httpClient *http.Client +} + +// NewPrometheusClient creates a new Prometheus client +func NewPrometheusClient(baseURL, authType string, authSecret *corev1.Secret) PrometheusClient { + return &prometheusClient{ + baseURL: baseURL, + authType: authType, + authSecret: authSecret, + httpClient: &http.Client{ + Timeout: 30 * time.Second, + }, + } +} + +// Query executes a PromQL query against Prometheus API +func (c *prometheusClient) Query(ctx context.Context, query string) (PrometheusData, error) { + // Build query URL + queryURL := fmt.Sprintf("%s/api/v1/query", strings.TrimSuffix(c.baseURL, "/")) + params := url.Values{} + params.Add("query", query) + fullURL := fmt.Sprintf("%s?%s", queryURL, params.Encode()) + + // Create request + req, err := http.NewRequestWithContext(ctx, "GET", fullURL, nil) + if err != nil { + return PrometheusData{}, fmt.Errorf("failed to create request: %w", err) + } + + // Add authentication + if err := c.addAuth(req); err != nil { + return PrometheusData{}, fmt.Errorf("failed to add authentication: %w", err) + } + + // Execute request + resp, err := c.httpClient.Do(req) + if err != nil { + return PrometheusData{}, fmt.Errorf("failed to query Prometheus: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return PrometheusData{}, fmt.Errorf("Prometheus query failed with status %d: %s", resp.StatusCode, string(body)) + } + + // Parse response + var result PrometheusResponse + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return PrometheusData{}, fmt.Errorf("failed to decode response: %w", err) + } + + if result.Status != "success" { + return PrometheusData{}, fmt.Errorf("Prometheus query failed: %s", result.Error) + } + + return result.Data, nil +} + +// addAuth adds authentication to the request +func (c *prometheusClient) addAuth(req *http.Request) error { + if c.authType == "" || c.authSecret == nil { + return nil + } + + switch c.authType { + case "bearer": + token, ok := c.authSecret.Data["token"] + if !ok { + return fmt.Errorf("token not 
found in secret") + } + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", string(token))) + case "basic": + username, ok := c.authSecret.Data["username"] + if !ok { + return fmt.Errorf("username not found in secret") + } + password, ok := c.authSecret.Data["password"] + if !ok { + return fmt.Errorf("password not found in secret") + } + auth := base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf("%s:%s", username, password))) + req.Header.Set("Authorization", fmt.Sprintf("Basic %s", auth)) + } + + return nil +} + +// PrometheusResponse represents the Prometheus API response +type PrometheusResponse struct { + Status string `json:"status"` + Data PrometheusData `json:"data"` + Error string `json:"error,omitempty"` +} + +// PrometheusData represents the data section of Prometheus response +type PrometheusData struct { + ResultType string `json:"resultType"` + Result []PrometheusResult `json:"result"` +} + +// PrometheusResult represents a single result from Prometheus +type PrometheusResult struct { + Metric map[string]string `json:"metric"` + Value []interface{} `json:"value"` // [timestamp, value] +} diff --git a/approval-request-metric-collector/pkg/controllers/metriccollector/controller.go b/approval-request-metric-collector/pkg/controllers/metriccollector/controller.go new file mode 100644 index 0000000..4f3675e --- /dev/null +++ b/approval-request-metric-collector/pkg/controllers/metriccollector/controller.go @@ -0,0 +1,188 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metriccollector + +import ( + "context" + "fmt" + "time" + + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/predicate" + + autoapprovev1alpha1 "github.com/kubefleet-dev/kubefleet-cookbook/approval-request-metric-collector/apis/autoapprove/v1alpha1" +) + +const ( + // defaultCollectionInterval is the interval for collecting metrics (30 seconds) + defaultCollectionInterval = 30 * time.Second +) + +// Reconciler reconciles a MetricCollectorReport object on the hub cluster +type Reconciler struct { + // HubClient is the client to access the hub cluster (for MetricCollectorReport and WorkloadTracker) + HubClient client.Client +} + +// Reconcile watches MetricCollectorReport on hub and updates it with metrics from member Prometheus +func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + startTime := time.Now() + klog.V(2).InfoS("MetricCollectorReport reconciliation starts", "report", req.NamespacedName) + defer func() { + latency := time.Since(startTime).Milliseconds() + klog.V(2).InfoS("MetricCollectorReport reconciliation ends", "report", req.NamespacedName, "latency", latency) + }() + + // 1. 
Get MetricCollectorReport from hub cluster
+	report := &autoapprovev1alpha1.MetricCollectorReport{}
+	if err := r.HubClient.Get(ctx, req.NamespacedName, report); err != nil {
+		if errors.IsNotFound(err) {
+			klog.V(2).InfoS("MetricCollectorReport not found, ignoring", "report", req.NamespacedName)
+			return ctrl.Result{}, nil
+		}
+		klog.ErrorS(err, "Failed to get MetricCollectorReport", "report", req.NamespacedName)
+		return ctrl.Result{}, err
+	}
+
+	klog.InfoS("Reconciling MetricCollectorReport", "name", report.Name, "namespace", report.Namespace)
+
+	// 2. Get PrometheusURL from the report spec
+	prometheusURL := report.Spec.PrometheusURL
+
+	// 3. Query Prometheus on member cluster for all workload_health metrics
+	promClient := NewPrometheusClient(prometheusURL, "", nil)
+	collectedMetrics, collectErr := r.collectAllWorkloadMetrics(ctx, promClient)
+
+	// 4. Update MetricCollectorReport status on hub
+	now := metav1.Now()
+	report.Status.LastCollectionTime = &now
+	report.Status.CollectedMetrics = collectedMetrics
+	report.Status.WorkloadsMonitored = int32(len(collectedMetrics))
+
+	if collectErr != nil {
+		klog.ErrorS(collectErr, "Failed to collect metrics", "prometheusUrl", prometheusURL)
+		meta.SetStatusCondition(&report.Status.Conditions, metav1.Condition{
+			Type:               autoapprovev1alpha1.MetricCollectorReportConditionTypeMetricsCollected,
+			Status:             metav1.ConditionFalse,
+			ObservedGeneration: report.Generation,
+			Reason:             autoapprovev1alpha1.MetricCollectorReportConditionReasonCollectionFailed,
+			Message:            fmt.Sprintf("Failed to collect metrics: %v", collectErr),
+		})
+	} else {
+		klog.V(2).InfoS("Successfully collected metrics", "report", report.Name, "workloads", len(collectedMetrics))
+		meta.SetStatusCondition(&report.Status.Conditions, metav1.Condition{
+			Type:               autoapprovev1alpha1.MetricCollectorReportConditionTypeMetricsCollected,
+			Status:             metav1.ConditionTrue,
+			ObservedGeneration: report.Generation,
+			Reason:             autoapprovev1alpha1.MetricCollectorReportConditionReasonCollectionSucceeded,
+			Message:            fmt.Sprintf("Successfully collected metrics from %d workloads", len(collectedMetrics)),
+		})
+	}
+
+	if err := r.HubClient.Status().Update(ctx, report); err != nil {
+		klog.ErrorS(err, "Failed to update MetricCollectorReport status", "report", req.NamespacedName)
+		return ctrl.Result{}, err
+	}
+
+	klog.InfoS("Successfully updated MetricCollectorReport", "metricsCount", len(collectedMetrics), "prometheusUrl", prometheusURL)
+	return ctrl.Result{RequeueAfter: defaultCollectionInterval}, nil
+}
+
+// collectAllWorkloadMetrics queries Prometheus for all workload_health metrics
+func (r *Reconciler) collectAllWorkloadMetrics(ctx context.Context, promClient PrometheusClient) ([]autoapprovev1alpha1.WorkloadMetrics, error) {
+	var collectedMetrics []autoapprovev1alpha1.WorkloadMetrics
+
+	// Query all workload_health metrics (no filtering)
+	query := "workload_health"
+
+	data, err := promClient.Query(ctx, query)
+	if err != nil {
+		klog.ErrorS(err, "Failed to query Prometheus for workload_health metrics")
+		return nil, err
+	}
+
+	if len(data.Result) == 0 {
+		klog.V(4).InfoS("No workload_health metrics found in Prometheus")
+		return collectedMetrics, nil
+	}
+
+	// Extract metrics from Prometheus result
+	for _, res := range data.Result {
+		// Extract labels from the Prometheus metric
+		// The workload_health metric includes labels like: workload_health{namespace="test-ns",app="sample-app",workload_kind="Deployment"}
+		// These labels come from Kubernetes pod labels and are added by
Prometheus during scraping. + // The relabeling configuration is in examples/prometheus/configmap.yaml: + // - namespace: from __meta_kubernetes_namespace (pod's namespace) + // - app: from __meta_kubernetes_pod_label_app (pod's "app" label) + // - workload_kind: from __meta_kubernetes_pod_controller_kind (controller kind) + namespace := res.Metric["namespace"] + workloadName := res.Metric["app"] + workloadKind := res.Metric["workload_kind"] + + if namespace == "" || workloadName == "" { + continue + } + + // Extract health value from Prometheus result + // Prometheus returns values as [timestamp, value_string] array + // We need at least 2 elements: index 0 is timestamp, index 1 is the metric value + var health float64 + if len(res.Value) >= 2 { + if valueStr, ok := res.Value[1].(string); ok { + if _, err := fmt.Sscanf(valueStr, "%f", &health); err != nil { + klog.ErrorS(err, "Failed to parse health value from Prometheus result", "namespace", namespace, "workload", workloadName, "kind", workloadKind, "valueStr", valueStr) + continue + } + } else { + klog.ErrorS(nil, "Health value is not a string in Prometheus result", "namespace", namespace, "workload", workloadName, "kind", workloadKind, "value", res.Value[1]) + continue + } + } else { + klog.ErrorS(nil, "Prometheus result value array has insufficient elements", "namespace", namespace, "workload", workloadName, "kind", workloadKind, "valueLength", len(res.Value)) + continue + } + + // Convert float to bool: workload is healthy if metric value >= 1.0 + // We use >= instead of == to handle floating point precision issues that can occur + // during JSON serialization/deserialization. The metric app emits 1.0 for healthy + // and 0.0 for unhealthy, so >= 1.0 safely distinguishes between the two states. + workloadMetrics := autoapprovev1alpha1.WorkloadMetrics{ + WorkloadName: workloadName, + Namespace: namespace, + WorkloadKind: workloadKind, + Health: health >= 1.0, + } + collectedMetrics = append(collectedMetrics, workloadMetrics) + } + + klog.V(2).InfoS("Collected workload metrics from Prometheus", "count", len(collectedMetrics)) + return collectedMetrics, nil +} + +// SetupWithManager sets up the controller with the Manager. +func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + Named("metriccollector-controller"). + For(&autoapprovev1alpha1.MetricCollectorReport{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})). + Complete(r) +} diff --git a/approval-request-metric-collector/scripts/install-on-hub.sh b/approval-request-metric-collector/scripts/install-on-hub.sh new file mode 100755 index 0000000..602bbd9 --- /dev/null +++ b/approval-request-metric-collector/scripts/install-on-hub.sh @@ -0,0 +1,118 @@ +#!/bin/bash +set -e + +# Detect script directory to support execution from multiple locations +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)"
+
+# Usage: ./install-on-hub.sh <registry> <hub-cluster>
+# Example: ./install-on-hub.sh arvindtestacr.azurecr.io kind-hub
+
+if [ "$#" -lt 2 ]; then
+    echo "Usage: $0 <registry> <hub-cluster>"
+    echo "Example: $0 arvindtestacr.azurecr.io kind-hub"
+    echo ""
+    echo "Parameters:"
+    echo "  registry     - ACR registry URL (e.g., arvindtestacr.azurecr.io)"
+    echo "  hub-cluster  - Hub cluster name (e.g., kind-hub)"
+    exit 1
+fi
+
+# Configuration
+REGISTRY="$1"
+HUB_CLUSTER="$2"
+IMAGE_NAME="approval-request-controller"
+IMAGE_TAG="${IMAGE_TAG:-latest}"
+NAMESPACE="fleet-system"
+CHART_NAME="approval-request-controller"
+
+# Get hub cluster context using kubectl config view (following kubefleet pattern)
+HUB_CONTEXT=$(kubectl config view -o jsonpath="{.contexts[?(@.context.cluster==\"$HUB_CLUSTER\")].name}")
+
+if [ -z "$HUB_CONTEXT" ]; then
+    echo "Error: Could not find context for hub cluster '$HUB_CLUSTER'"
+    echo "Available clusters:"
+    kubectl config view -o jsonpath='{.clusters[*].name}' | tr ' ' '\n'
+    exit 1
+fi
+
+# Construct full image repository path
+IMAGE_REPOSITORY="${REGISTRY}/${IMAGE_NAME}"
+
+echo "=== Installing ApprovalRequest Controller on hub cluster ==="
+echo "Registry: ${REGISTRY}"
+echo "Image: ${IMAGE_REPOSITORY}:${IMAGE_TAG}"
+echo "Hub cluster: ${HUB_CLUSTER}"
+echo "Hub context: ${HUB_CONTEXT}"
+echo "Namespace: ${NAMESPACE}"
+echo ""
+
+# Step 1: Verify kubefleet CRDs are installed
+echo "Step 1: Verifying required kubefleet CRDs..."
+REQUIRED_CRDS=(
+    "approvalrequests.placement.kubernetes-fleet.io"
+    "clusterapprovalrequests.placement.kubernetes-fleet.io"
+    "clusterresourceplacements.placement.kubernetes-fleet.io"
+    "clusterresourceoverrides.placement.kubernetes-fleet.io"
+    "clusterstagedupdateruns.placement.kubernetes-fleet.io"
+    "stagedupdateruns.placement.kubernetes-fleet.io"
+)
+
+MISSING_CRDS=()
+for crd in "${REQUIRED_CRDS[@]}"; do
+    if ! kubectl --context=${HUB_CONTEXT} get crd ${crd} &>/dev/null; then
+        MISSING_CRDS+=("${crd}")
+    fi
+done
+
+if [ ${#MISSING_CRDS[@]} -ne 0 ]; then
+    echo "Error: Missing required CRDs from kubefleet hub-agent:"
+    for crd in "${MISSING_CRDS[@]}"; do
+        echo "  - ${crd}"
+    done
+    echo ""
+    echo "Please ensure kubefleet hub-agent is installed first."
+    exit 1
+fi
+
+echo "✓ All required kubefleet CRDs are installed"
+echo ""
+
+# Step 2: Install helm chart on hub cluster (includes MetricCollector, MetricCollectorReport, WorkloadTracker CRDs)
+echo "Step 2: Installing helm chart on hub cluster..."
+helm upgrade --install ${CHART_NAME} ${REPO_ROOT}/charts/${CHART_NAME} \
+    --kube-context=${HUB_CONTEXT} \
+    --namespace ${NAMESPACE} \
+    --create-namespace \
+    --set image.repository=${IMAGE_REPOSITORY} \
+    --set image.tag=${IMAGE_TAG} \
+    --set image.pullPolicy=Always \
+    --set controller.logLevel=2
+
+echo "✓ Helm chart installed on hub cluster"
+echo ""
+
+# Step 3: Verify installation
+echo "Step 3: Verifying installation..."
+echo "Checking CRDs installed by this chart..."
+kubectl --context=${HUB_CONTEXT} get crd | grep -E "metriccollectors|metriccollectorreports|workloadtrackers" || echo "  (CRDs may take a moment to appear)"
+
+echo ""
+echo "Checking pods in ${NAMESPACE}..."
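+# Optional readiness check (illustrative addition, not part of the original script):
+# the commands below select the controller pod by the chart's standard label, so you
+# could block here until it reports Ready before inspecting it, e.g.:
+#   kubectl --context=${HUB_CONTEXT} wait --for=condition=Ready pod \
+#     -n ${NAMESPACE} -l app.kubernetes.io/name=${CHART_NAME} --timeout=120s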
+kubectl --context=${HUB_CONTEXT} get pods -n ${NAMESPACE} -l app.kubernetes.io/name=${CHART_NAME}
+
+echo ""
+echo "=== Installation Complete ==="
+echo ""
+echo "To check controller logs:"
+echo "  kubectl --context=${HUB_CONTEXT} logs -n ${NAMESPACE} -l app.kubernetes.io/name=${CHART_NAME} -f"
+echo ""
+echo "To verify CRDs:"
+echo "  kubectl --context=${HUB_CONTEXT} get crd | grep autoapprove.kubernetes-fleet.io"
+echo ""
+echo "Next steps:"
+echo "  1. Create a WorkloadTracker to define which workloads to monitor"
+echo "  2. ApprovalRequests will be automatically processed when created by staged updates"
+echo ""
diff --git a/approval-request-metric-collector/scripts/install-on-member.sh b/approval-request-metric-collector/scripts/install-on-member.sh
new file mode 100755
index 0000000..8083630
--- /dev/null
+++ b/approval-request-metric-collector/scripts/install-on-member.sh
@@ -0,0 +1,236 @@
+#!/bin/bash
+set -e
+
+# Detect script directory to support execution from multiple locations
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+# Usage: ./install-on-member.sh <registry> <hub-cluster> <member-cluster-1> [member-cluster-2] [member-cluster-3] ...
+# Example: ./install-on-member.sh arvindtestacr.azurecr.io kind-hub kind-cluster-1 kind-cluster-2 kind-cluster-3
+
+if [ "$#" -lt 3 ]; then
+    echo "Usage: $0 <registry> <hub-cluster> <member-cluster-1> [member-cluster-2] ..."
+    echo "Example: $0 arvindtestacr.azurecr.io kind-hub kind-cluster-1 kind-cluster-2 kind-cluster-3"
+    echo ""
+    echo "Parameters:"
+    echo "  registry        - ACR registry URL (e.g., arvindtestacr.azurecr.io)"
+    echo "  hub-cluster     - Hub cluster name (e.g., kind-hub)"
+    echo "  member-clusters - One or more member cluster names"
+    exit 1
+fi
+
+# Configuration
+REGISTRY="$1"
+HUB_CLUSTER="$2"
+MEMBER_CLUSTERS=("${@:3}")
+MEMBER_NAMESPACE="default"
+PROMETHEUS_URL="http://prometheus.test-ns:9090"
+IMAGE_TAG="${IMAGE_TAG:-latest}"
+METRIC_COLLECTOR_IMAGE="metric-collector"
+METRIC_APP_IMAGE="metric-app"
+
+# Get hub cluster context and API server URL using kubectl config view (following kubefleet pattern)
+HUB_CONTEXT=$(kubectl config view -o jsonpath="{.contexts[?(@.context.cluster==\"$HUB_CLUSTER\")].name}")
+HUB_API_SERVER=$(kubectl config view -o jsonpath="{.clusters[?(@.name==\"$HUB_CLUSTER\")].cluster.server}")
+
+if [ -z "$HUB_CONTEXT" ]; then
+    echo "Error: Could not find context for hub cluster '$HUB_CLUSTER'"
+    echo "Available clusters:"
+    kubectl config view -o jsonpath='{.clusters[*].name}' | tr ' ' '\n'
+    exit 1
+fi
+
+if [ -z "$HUB_API_SERVER" ]; then
+    echo "Error: Could not find API server URL for hub cluster '$HUB_CLUSTER'"
+    exit 1
+fi
+
+# Construct full image repository paths
+METRIC_COLLECTOR_REPOSITORY="${REGISTRY}/${METRIC_COLLECTOR_IMAGE}"
+METRIC_APP_REPOSITORY="${REGISTRY}/${METRIC_APP_IMAGE}"
+
+echo "=== Installing MetricCollector on ${#MEMBER_CLUSTERS[@]} member cluster(s) ==="
+echo "Registry: ${REGISTRY}"
+echo "Metric Collector Image: ${METRIC_COLLECTOR_REPOSITORY}:${IMAGE_TAG}"
+echo "Metric App Image: ${METRIC_APP_REPOSITORY}:${IMAGE_TAG}"
+echo "Hub cluster: ${HUB_CLUSTER}"
+echo "Hub context: ${HUB_CONTEXT}"
+echo "Hub API server: ${HUB_API_SERVER}"
+echo "Member clusters: ${MEMBER_CLUSTERS[@]}"
+echo ""
+
+# Install on each member cluster
+CLUSTER_INDEX=0
+for MEMBER_CLUSTER in "${MEMBER_CLUSTERS[@]}"; do
+    CLUSTER_INDEX=$((CLUSTER_INDEX + 1))
+
+    MEMBER_CONTEXT=$(kubectl config view -o jsonpath="{.contexts[?(@.context.cluster==\"$MEMBER_CLUSTER\")].name}")
+    MEMBER_CLUSTER_NAME="${MEMBER_CLUSTER}"
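+    # KubeFleet creates the hub-side namespace "fleet-member-<cluster name>" when a member
+    # cluster joins; the MetricCollectorReport for this cluster is created and updated there.
+    # Illustrative check (resource plural assumed from the CRDs installed by the hub chart):
+    #   kubectl --context=${HUB_CONTEXT} get metriccollectorreports -n fleet-member-<cluster-name>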
HUB_NAMESPACE="fleet-member-${MEMBER_CLUSTER_NAME}" + + if [ -z "$MEMBER_CONTEXT" ]; then + echo "Error: Could not find context for member cluster '$MEMBER_CLUSTER'" + echo "Available clusters:" + kubectl config view -o jsonpath='{.clusters[*].name}' | tr ' ' '\n' + exit 1 + fi + + echo "========================================" + echo "Installing on Member Cluster ${CLUSTER_INDEX}/${#MEMBER_CLUSTERS[@]}" + echo " Cluster: ${MEMBER_CLUSTER}" + echo " Context: ${MEMBER_CONTEXT}" + echo " Cluster Name: ${MEMBER_CLUSTER_NAME}" + echo "========================================" + echo "" + + # Step 1: Setup RBAC on hub cluster + echo "Step 1: Setting up RBAC on hub cluster..." + + # Verify namespace exists (should be created by KubeFleet when member cluster joins) + if ! kubectl --context=${HUB_CONTEXT} get namespace ${HUB_NAMESPACE} &>/dev/null; then + echo "Error: Namespace ${HUB_NAMESPACE} does not exist on hub cluster" + echo "This namespace should be automatically created by KubeFleet when the member cluster joins the hub" + echo "Please ensure the member cluster is properly registered with the hub" + exit 1 + fi + + cat <