Skip to content

Commit 69a8ed4

Browse files
author
Arvind Thirumurugan
committed
simplify approval controller
Signed-off-by: Arvind Thirumurugan <arvindth@microsoft.com>
1 parent cac604c commit 69a8ed4

File tree

3 files changed

+120
-174
lines changed
  • approval-controller-metric-collector

3 files changed

+120
-174
lines changed

approval-controller-metric-collector/approval-request-controller/cmd/approvalrequestcontroller/main.go

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,12 +127,9 @@ func checkRequiredCRDs(config *rest.Config) error {
127127
requiredCRDs := []string{
128128
"approvalrequests.placement.kubernetes-fleet.io",
129129
"clusterapprovalrequests.placement.kubernetes-fleet.io",
130-
"metriccollectors.metric.kubernetes-fleet.io",
131130
"metriccollectorreports.metric.kubernetes-fleet.io",
132131
"clusterstagedworkloadtrackers.metric.kubernetes-fleet.io",
133132
"stagedworkloadtrackers.metric.kubernetes-fleet.io",
134-
"clusterresourceplacements.placement.kubernetes-fleet.io",
135-
"clusterresourceoverrides.placement.kubernetes-fleet.io",
136133
"clusterstagedupdateruns.placement.kubernetes-fleet.io",
137134
"stagedupdateruns.placement.kubernetes-fleet.io",
138135
}

approval-controller-metric-collector/approval-request-controller/pkg/controller/controller.go

Lines changed: 119 additions & 170 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,14 @@ limitations under the License.
1515
*/
1616

1717
// Package controller features a controller to reconcile ApprovalRequest objects
18-
// and create MetricCollector resources on member clusters for approved stages.
18+
// and create MetricCollectorReport resources on the hub cluster for metric collection.
1919
package controller
2020

2121
import (
2222
"context"
2323
"fmt"
2424
"time"
2525

26-
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
2726
"k8s.io/apimachinery/pkg/api/errors"
2827
"k8s.io/apimachinery/pkg/api/meta"
2928
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -42,15 +41,15 @@ import (
4241
)
4342

4443
const (
45-
// metricCollectorFinalizer is the finalizer added to ApprovalRequest objects
46-
metricCollectorFinalizer = "kubernetes-fleet.io/metric-collector-cleanup"
44+
// metricCollectorFinalizer is the finalizer added to ApprovalRequest objects for cleanup
45+
metricCollectorFinalizer = "kubernetes-fleet.io/metric-collector-report-cleanup"
4746

48-
// prometheusURL is the default Prometheus URL to use
47+
// prometheusURL is the default Prometheus URL to use for all clusters
4948
prometheusURL = "http://prometheus.prometheus.svc.cluster.local:9090"
5049
)
5150

52-
// Reconciler reconciles an ApprovalRequest object and creates MetricCollector resources
53-
// on member clusters when the approval is granted.
51+
// Reconciler reconciles an ApprovalRequest object and creates MetricCollectorReport resources
52+
// on the hub cluster in fleet-member-{clusterName} namespaces.
5453
type Reconciler struct {
5554
client.Client
5655
recorder record.EventRecorder
@@ -182,13 +181,13 @@ func (r *Reconciler) reconcileApprovalRequestObj(ctx context.Context, approvalRe
182181

183182
klog.V(2).InfoS("Found clusters in stage", "approvalRequest", approvalReqRef, "stage", stageName, "clusters", clusterNames)
184183

185-
// Create or update the MetricCollector resource, CRP, and ResourceOverrides
186-
if err := r.ensureMetricCollectorResources(ctx, obj, clusterNames, updateRunName, stageName); err != nil {
187-
klog.ErrorS(err, "Failed to ensure MetricCollector resources", "approvalRequest", approvalReqRef)
184+
// Create or update MetricCollectorReport resources in fleet-member namespaces
185+
if err := r.ensureMetricCollectorReports(ctx, obj, clusterNames, updateRunName, stageName); err != nil {
186+
klog.ErrorS(err, "Failed to ensure MetricCollectorReport resources", "approvalRequest", approvalReqRef)
188187
return ctrl.Result{}, err
189188
}
190189

191-
klog.V(2).InfoS("Successfully ensured MetricCollector resources", "approvalRequest", approvalReqRef, "clusters", clusterNames)
190+
klog.V(2).InfoS("Successfully ensured MetricCollectorReport resources", "approvalRequest", approvalReqRef, "clusters", clusterNames)
192191

193192
// Check workload health and approve if all workloads are healthy
194193
if err := r.checkWorkloadHealthAndApprove(ctx, approvalReqObj, clusterNames, updateRunName, stageName); err != nil {
@@ -200,154 +199,67 @@ func (r *Reconciler) reconcileApprovalRequestObj(ctx context.Context, approvalRe
200199
return ctrl.Result{RequeueAfter: 15 * time.Second}, nil
201200
}
202201

203-
// ensureMetricCollectorResources creates the Namespace, MetricCollector, CRP, and ResourceOverrides
204-
func (r *Reconciler) ensureMetricCollectorResources(
202+
// ensureMetricCollectorReports creates MetricCollectorReport in each fleet-member-{clusterName} namespace
203+
func (r *Reconciler) ensureMetricCollectorReports(
205204
ctx context.Context,
206205
approvalReq client.Object,
207206
clusterNames []string,
208207
updateRunName, stageName string,
209208
) error {
210-
// Generate names
211-
metricCollectorName := fmt.Sprintf("mc-%s-%s", updateRunName, stageName)
212-
crpName := fmt.Sprintf("crp-mc-%s-%s", updateRunName, stageName)
213-
roName := fmt.Sprintf("ro-mc-%s-%s", updateRunName, stageName)
214-
215-
// Create MetricCollector resource (cluster-scoped) on hub
216-
metricCollector := &localv1alpha1.MetricCollector{
217-
ObjectMeta: metav1.ObjectMeta{
218-
Name: metricCollectorName,
219-
Labels: map[string]string{
220-
"app": "metric-collector",
221-
"approval-request": approvalReq.GetName(),
222-
"update-run": updateRunName,
223-
"stage": stageName,
224-
},
225-
},
226-
Spec: localv1alpha1.MetricCollectorSpec{
227-
PrometheusURL: prometheusURL,
228-
// ReportNamespace will be overridden per cluster
229-
ReportNamespace: "placeholder",
230-
},
231-
}
232-
233-
// Create or update MetricCollector
234-
existingMC := &localv1alpha1.MetricCollector{}
235-
err := r.Client.Get(ctx, types.NamespacedName{Name: metricCollectorName}, existingMC)
236-
if err != nil {
237-
if errors.IsNotFound(err) {
238-
if err := r.Client.Create(ctx, metricCollector); err != nil {
239-
return fmt.Errorf("failed to create MetricCollector: %w", err)
240-
}
241-
klog.V(2).InfoS("Created MetricCollector", "metricCollector", klog.KObj(metricCollector))
242-
} else {
243-
return fmt.Errorf("failed to get MetricCollector: %w", err)
244-
}
245-
}
209+
// Generate report name (same for all clusters, different namespaces)
210+
reportName := fmt.Sprintf("mc-%s-%s", updateRunName, stageName)
246211

247-
// Create ResourceOverride with rules for each cluster
248-
overrideRules := make([]placementv1beta1.OverrideRule, 0, len(clusterNames))
212+
// Create MetricCollectorReport in each fleet-member namespace
249213
for _, clusterName := range clusterNames {
250214
reportNamespace := fmt.Sprintf(utils.NamespaceNameFormat, clusterName)
251215

252-
overrideRules = append(overrideRules, placementv1beta1.OverrideRule{
253-
ClusterSelector: &placementv1beta1.ClusterSelector{
254-
ClusterSelectorTerms: []placementv1beta1.ClusterSelectorTerm{
255-
{
256-
LabelSelector: &metav1.LabelSelector{
257-
MatchLabels: map[string]string{
258-
"kubernetes-fleet.io/cluster-name": clusterName,
259-
},
260-
},
261-
},
262-
},
263-
},
264-
JSONPatchOverrides: []placementv1beta1.JSONPatchOverride{
265-
{
266-
Operator: placementv1beta1.JSONPatchOverrideOpReplace,
267-
Path: "/spec/reportNamespace",
268-
Value: apiextensionsv1.JSON{Raw: []byte(fmt.Sprintf(`"%s"`, reportNamespace))},
269-
},
270-
},
271-
})
272-
}
273-
274-
// Create ClusterResourceOverride with rules for each cluster
275-
clusterResourceOverride := &placementv1beta1.ClusterResourceOverride{
276-
ObjectMeta: metav1.ObjectMeta{
277-
Name: roName,
278-
Labels: map[string]string{
279-
"approval-request": approvalReq.GetName(),
280-
"update-run": updateRunName,
281-
"stage": stageName,
282-
},
283-
},
284-
Spec: placementv1beta1.ClusterResourceOverrideSpec{
285-
ClusterResourceSelectors: []placementv1beta1.ResourceSelectorTerm{
286-
{
287-
Group: "metric.kubernetes-fleet.io",
288-
Version: "v1alpha1",
289-
Kind: "MetricCollector",
290-
Name: metricCollectorName,
216+
report := &localv1alpha1.MetricCollectorReport{
217+
ObjectMeta: metav1.ObjectMeta{
218+
Name: reportName,
219+
Namespace: reportNamespace,
220+
Labels: map[string]string{
221+
"approval-request": approvalReq.GetName(),
222+
"update-run": updateRunName,
223+
"stage": stageName,
224+
"cluster": clusterName,
291225
},
292226
},
293-
Policy: &placementv1beta1.OverridePolicy{
294-
OverrideRules: overrideRules,
227+
Spec: localv1alpha1.MetricCollectorReportSpec{
228+
PrometheusURL: prometheusURL,
295229
},
296-
},
297-
}
298-
299-
// Create or update ClusterResourceOverride
300-
existingCRO := &placementv1beta1.ClusterResourceOverride{}
301-
err = r.Client.Get(ctx, types.NamespacedName{Name: roName}, existingCRO)
302-
if err != nil {
303-
if errors.IsNotFound(err) {
304-
if err := r.Client.Create(ctx, clusterResourceOverride); err != nil {
305-
return fmt.Errorf("failed to create ClusterResourceOverride: %w", err)
306-
}
307-
klog.V(2).InfoS("Created ClusterResourceOverride", "clusterResourceOverride", roName)
308-
} else {
309-
return fmt.Errorf("failed to get ClusterResourceOverride: %w", err)
310230
}
311-
}
312231

313-
// Create ClusterResourcePlacement with PickFixed policy
314-
// CRP resource selector selects the MetricCollector directly
315-
crp := &placementv1beta1.ClusterResourcePlacement{
316-
ObjectMeta: metav1.ObjectMeta{
317-
Name: crpName,
318-
Labels: map[string]string{
319-
"approval-request": approvalReq.GetName(),
320-
"update-run": updateRunName,
321-
"stage": stageName,
322-
},
323-
},
324-
Spec: placementv1beta1.PlacementSpec{
325-
ResourceSelectors: []placementv1beta1.ResourceSelectorTerm{
326-
{
327-
Group: "metric.kubernetes-fleet.io",
328-
Version: "v1alpha1",
329-
Kind: "MetricCollector",
330-
Name: metricCollectorName,
331-
},
332-
},
333-
Policy: &placementv1beta1.PlacementPolicy{
334-
PlacementType: placementv1beta1.PickFixedPlacementType,
335-
ClusterNames: clusterNames,
336-
},
337-
},
338-
}
232+
// Create or update MetricCollectorReport
233+
existingReport := &localv1alpha1.MetricCollectorReport{}
234+
err := r.Client.Get(ctx, types.NamespacedName{
235+
Name: reportName,
236+
Namespace: reportNamespace,
237+
}, existingReport)
339238

340-
// Create or update CRP
341-
existingCRP := &placementv1beta1.ClusterResourcePlacement{}
342-
err = r.Client.Get(ctx, types.NamespacedName{Name: crpName}, existingCRP)
343-
if err != nil {
344-
if errors.IsNotFound(err) {
345-
if err := r.Client.Create(ctx, crp); err != nil {
346-
return fmt.Errorf("failed to create ClusterResourcePlacement: %w", err)
239+
if err != nil {
240+
if errors.IsNotFound(err) {
241+
if err := r.Client.Create(ctx, report); err != nil {
242+
return fmt.Errorf("failed to create MetricCollectorReport in %s: %w", reportNamespace, err)
243+
}
244+
klog.V(2).InfoS("Created MetricCollectorReport",
245+
"report", reportName,
246+
"namespace", reportNamespace,
247+
"cluster", clusterName)
248+
} else {
249+
return fmt.Errorf("failed to get MetricCollectorReport in %s: %w", reportNamespace, err)
347250
}
348-
klog.V(2).InfoS("Created ClusterResourcePlacement", "crp", crpName)
349251
} else {
350-
return fmt.Errorf("failed to get ClusterResourcePlacement: %w", err)
252+
// Update spec if needed
253+
if existingReport.Spec.PrometheusURL != prometheusURL {
254+
existingReport.Spec.PrometheusURL = prometheusURL
255+
if err := r.Client.Update(ctx, existingReport); err != nil {
256+
return fmt.Errorf("failed to update MetricCollectorReport in %s: %w", reportNamespace, err)
257+
}
258+
klog.V(2).InfoS("Updated MetricCollectorReport",
259+
"report", reportName,
260+
"namespace", reportNamespace,
261+
"cluster", clusterName)
262+
}
351263
}
352264
}
353265

@@ -465,15 +377,15 @@ func (r *Reconciler) checkWorkloadHealthAndApprove(
465377
klog.V(2).InfoS("Found MetricCollectorReport",
466378
"approvalRequest", approvalReqRef,
467379
"cluster", clusterName,
468-
"collectedMetrics", len(report.CollectedMetrics),
469-
"workloadsMonitored", report.WorkloadsMonitored)
380+
"collectedMetrics", len(report.Status.CollectedMetrics),
381+
"workloadsMonitored", report.Status.WorkloadsMonitored)
470382

471383
// Check if all workloads from WorkloadTracker are present and healthy
472384
for _, trackedWorkload := range workloads {
473385
found := false
474386
healthy := false
475387

476-
for _, collectedMetric := range report.CollectedMetrics {
388+
for _, collectedMetric := range report.Status.CollectedMetrics {
477389
if collectedMetric.Namespace == trackedWorkload.Namespace &&
478390
collectedMetric.WorkloadName == trackedWorkload.Name {
479391
found = true
@@ -562,40 +474,77 @@ func (r *Reconciler) handleDelete(ctx context.Context, approvalReqObj placementv
562474
}
563475

564476
approvalReqRef := klog.KObj(obj)
565-
klog.V(2).InfoS("Cleaning up resources for ApprovalRequest", "approvalRequest", approvalReqRef)
477+
klog.V(2).InfoS("Cleaning up MetricCollectorReports for ApprovalRequest", "approvalRequest", approvalReqRef)
566478

567-
// Delete CRP (it will cascade delete the resources on member clusters)
479+
// Get cluster names from UpdateRun to know which reports to delete
568480
spec := approvalReqObj.GetApprovalRequestSpec()
569481
updateRunName := spec.TargetUpdateRun
570482
stageName := spec.TargetStage
571-
crpName := fmt.Sprintf("crp-mc-%s-%s", updateRunName, stageName)
572-
metricCollectorName := fmt.Sprintf("mc-%s-%s", updateRunName, stageName)
573-
croName := fmt.Sprintf("ro-mc-%s-%s", updateRunName, stageName)
483+
reportName := fmt.Sprintf("mc-%s-%s", updateRunName, stageName)
574484

575-
crp := &placementv1beta1.ClusterResourcePlacement{}
576-
if err := r.Client.Get(ctx, types.NamespacedName{Name: crpName}, crp); err == nil {
577-
if err := r.Client.Delete(ctx, crp); err != nil && !errors.IsNotFound(err) {
578-
return ctrl.Result{}, fmt.Errorf("failed to delete CRP: %w", err)
485+
// Fetch UpdateRun to get cluster names
486+
var clusterNames []string
487+
if obj.GetNamespace() == "" {
488+
// Cluster-scoped: Get ClusterStagedUpdateRun
489+
updateRun := &placementv1beta1.ClusterStagedUpdateRun{}
490+
if err := r.Client.Get(ctx, types.NamespacedName{Name: updateRunName}, updateRun); err != nil {
491+
if !errors.IsNotFound(err) {
492+
klog.ErrorS(err, "Failed to get ClusterStagedUpdateRun for cleanup", "approvalRequest", approvalReqRef)
493+
}
494+
// Continue with finalizer removal even if UpdateRun not found
495+
} else {
496+
// Find the stage
497+
for i := range updateRun.Status.StagesStatus {
498+
if updateRun.Status.StagesStatus[i].StageName == stageName {
499+
for _, cluster := range updateRun.Status.StagesStatus[i].Clusters {
500+
clusterNames = append(clusterNames, cluster.ClusterName)
501+
}
502+
break
503+
}
504+
}
579505
}
580-
klog.V(2).InfoS("Deleted ClusterResourcePlacement", "crp", crpName)
581-
}
582-
583-
// Delete ClusterResourceOverride
584-
cro := &placementv1beta1.ClusterResourceOverride{}
585-
if err := r.Client.Get(ctx, types.NamespacedName{Name: croName}, cro); err == nil {
586-
if err := r.Client.Delete(ctx, cro); err != nil && !errors.IsNotFound(err) {
587-
return ctrl.Result{}, fmt.Errorf("failed to delete ClusterResourceOverride: %w", err)
506+
} else {
507+
// Namespace-scoped: Get StagedUpdateRun
508+
updateRun := &placementv1beta1.StagedUpdateRun{}
509+
if err := r.Client.Get(ctx, types.NamespacedName{Name: updateRunName, Namespace: obj.GetNamespace()}, updateRun); err != nil {
510+
if !errors.IsNotFound(err) {
511+
klog.ErrorS(err, "Failed to get StagedUpdateRun for cleanup", "approvalRequest", approvalReqRef)
512+
}
513+
// Continue with finalizer removal even if UpdateRun not found
514+
} else {
515+
// Find the stage
516+
for i := range updateRun.Status.StagesStatus {
517+
if updateRun.Status.StagesStatus[i].StageName == stageName {
518+
for _, cluster := range updateRun.Status.StagesStatus[i].Clusters {
519+
clusterNames = append(clusterNames, cluster.ClusterName)
520+
}
521+
break
522+
}
523+
}
588524
}
589-
klog.V(2).InfoS("Deleted ClusterResourceOverride", "clusterResourceOverride", croName)
590525
}
591526

592-
// Delete MetricCollector
593-
metricCollector := &localv1alpha1.MetricCollector{}
594-
if err := r.Client.Get(ctx, types.NamespacedName{Name: metricCollectorName}, metricCollector); err == nil {
595-
if err := r.Client.Delete(ctx, metricCollector); err != nil && !errors.IsNotFound(err) {
596-
return ctrl.Result{}, fmt.Errorf("failed to delete MetricCollector: %w", err)
527+
// Delete MetricCollectorReport from each fleet-member namespace
528+
for _, clusterName := range clusterNames {
529+
reportNamespace := fmt.Sprintf(utils.NamespaceNameFormat, clusterName)
530+
report := &localv1alpha1.MetricCollectorReport{}
531+
532+
if err := r.Client.Get(ctx, types.NamespacedName{
533+
Name: reportName,
534+
Namespace: reportNamespace,
535+
}, report); err == nil {
536+
if err := r.Client.Delete(ctx, report); err != nil && !errors.IsNotFound(err) {
537+
klog.ErrorS(err, "Failed to delete MetricCollectorReport",
538+
"report", reportName,
539+
"namespace", reportNamespace,
540+
"cluster", clusterName)
541+
return ctrl.Result{}, fmt.Errorf("failed to delete MetricCollectorReport in %s: %w", reportNamespace, err)
542+
}
543+
klog.V(2).InfoS("Deleted MetricCollectorReport",
544+
"report", reportName,
545+
"namespace", reportNamespace,
546+
"cluster", clusterName)
597547
}
598-
klog.V(2).InfoS("Deleted MetricCollector", "metricCollector", metricCollectorName)
599548
}
600549

601550
// Remove finalizer
@@ -605,7 +554,7 @@ func (r *Reconciler) handleDelete(ctx context.Context, approvalReqObj placementv
605554
return ctrl.Result{}, err
606555
}
607556

608-
klog.V(2).InfoS("Successfully cleaned up resources", "approvalRequest", approvalReqRef)
557+
klog.V(2).InfoS("Successfully cleaned up MetricCollectorReports", "approvalRequest", approvalReqRef, "clusters", clusterNames)
609558
return ctrl.Result{}, nil
610559
}
611560

0 commit comments

Comments
 (0)