Skip to content

Commit 3e02f3e

Browse files
author
Wei
authored
Merge pull request #165 from cloudpilot-ai/cherry-pick-158-release-0.1
Cherry pick PR(158)/fix: calculate the overhead correctly
2 parents b700bf6 + 107f1b6 commit 3e02f3e

File tree

2 files changed

+58
-43
lines changed

2 files changed

+58
-43
lines changed

pkg/providers/instancetype/instancetype.go

Lines changed: 1 addition & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ import (
2929
"github.com/patrickmn/go-cache"
3030
"github.com/samber/lo"
3131
corev1 "k8s.io/api/core/v1"
32-
"k8s.io/apimachinery/pkg/api/resource"
3332
"k8s.io/apimachinery/pkg/util/sets"
3433
"sigs.k8s.io/controller-runtime/pkg/client"
3534
"sigs.k8s.io/controller-runtime/pkg/log"
@@ -179,11 +178,6 @@ func (p *DefaultProvider) List(ctx context.Context, kc *v1alpha1.KubeletConfigur
179178
return nil, fmt.Errorf("failed to get cluster CNI: %w", err)
180179
}
181180

182-
nodeResourceOverhead, err := p.nodeOverhead(ctx)
183-
if err != nil {
184-
return nil, fmt.Errorf("failed to get node resource overhead: %w", err)
185-
}
186-
187181
result := lo.Map(p.instanceTypesInfo, func(i *ecsclient.DescribeInstanceTypesResponseBodyInstanceTypesInstanceType, _ int) *cloudprovider.InstanceType {
188182
zoneData := lo.Map(allZones.UnsortedList(), func(zoneID string, _ int) ZoneData {
189183
if !p.instanceTypesOfferings[lo.FromPtr(i.InstanceTypeId)].Has(zoneID) || !vSwitchsZones.Has(zoneID) {
@@ -203,7 +197,7 @@ func (p *DefaultProvider) List(ctx context.Context, kc *v1alpha1.KubeletConfigur
203197
// so that Karpenter is able to cache the set of InstanceTypes based on values that alter the set of instance types
204198
// !!! Important !!!
205199
offers := p.createOfferings(ctx, *i.InstanceTypeId, zoneData)
206-
return NewInstanceType(ctx, nodeResourceOverhead, i, kc, p.region, nodeClass.Spec.SystemDisk, offers, clusterCNI)
200+
return NewInstanceType(ctx, i, kc, p.region, nodeClass.Spec.SystemDisk, offers, clusterCNI)
207201
})
208202

209203
// Filter out nil values
@@ -213,37 +207,6 @@ func (p *DefaultProvider) List(ctx context.Context, kc *v1alpha1.KubeletConfigur
213207
return result, nil
214208
}
215209

216-
func (p *DefaultProvider) nodeOverhead(ctx context.Context) (corev1.ResourceList, error) {
217-
var nodes corev1.NodeList
218-
if err := p.kubeClient.List(ctx, &nodes); err != nil {
219-
return corev1.ResourceList{}, err
220-
}
221-
222-
// We are not sure how to calculate the overhead of the node, so let's just use the maximum possible
223-
// This avoids a node creation loop
224-
maxCPUOverHead := int64(0)
225-
maxMemoryOverHead := int64(0)
226-
for _, node := range nodes.Items {
227-
capacity := node.Status.Capacity
228-
allocatable := node.Status.Allocatable
229-
230-
cpuOverHead := capacity.Cpu().MilliValue() - allocatable.Cpu().MilliValue()
231-
memoryOverHead := capacity.Memory().Value() - allocatable.Memory().Value()
232-
233-
if cpuOverHead > maxCPUOverHead {
234-
maxCPUOverHead = cpuOverHead
235-
}
236-
if memoryOverHead > maxMemoryOverHead {
237-
maxMemoryOverHead = memoryOverHead
238-
}
239-
}
240-
241-
return corev1.ResourceList{
242-
corev1.ResourceCPU: *resource.NewMilliQuantity(maxCPUOverHead, resource.DecimalSI),
243-
corev1.ResourceMemory: *resource.NewQuantity(maxMemoryOverHead, resource.DecimalSI),
244-
}, nil
245-
}
246-
247210
func (p *DefaultProvider) UpdateInstanceTypes(ctx context.Context) error {
248211
// DO NOT REMOVE THIS LOCK ----------------------------------------------------------------------------
249212
// We lock here so that multiple callers to getInstanceTypesOfferings do not result in cache misses and multiple

pkg/providers/instancetype/types.go

Lines changed: 57 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,52 @@ type ZoneData struct {
5757
Available bool
5858
}
5959

60-
func NewInstanceType(ctx context.Context, overhead corev1.ResourceList,
60+
func calculateResourceOverhead(pods, cpuM, memoryMi int64) corev1.ResourceList {
61+
// referring to: https://help.aliyun.com/zh/ack/ack-managed-and-ack-dedicated/user-guide/resource-reservation-policy#0f5ffe176df7q
62+
// CPU overhead calculation
63+
cpuOverHead := calculateCPUOverhead(cpuM)
64+
65+
// TODO: In a real environment, the formula does not produce accurate results,
66+
// consistently yielding values that are 200MiB larger than expected.
67+
// Memory overhead: min(11*pods + 255, memoryMi*0.25)
68+
memoryOverHead := int64(math.Min(float64(11*pods+255), float64(memoryMi)*0.25)) + 200
69+
70+
return corev1.ResourceList{
71+
corev1.ResourceCPU: *resource.NewMilliQuantity(cpuOverHead, resource.DecimalSI),
72+
corev1.ResourceMemory: *resources.Quantity(fmt.Sprintf("%dMi", memoryOverHead)),
73+
}
74+
}
75+
76+
// thresholds lists the CPU reservation tiers consulted by calculateCPUOverhead.
// cores is the node CPU size, in millicores, at which a tier starts to apply,
// and overhead is the fraction of one core (1000m) reserved once it does.
//
// NOTE(review): the second and third tiers both start at 3000m, so a node
// with 2000m-2999m of CPU only receives the first tier; confirm this matches
// the reservation table these values were taken from.
var thresholds = [...]struct {
	cores    int64
	overhead float64
}{
	{1000, 0.06},
	{3000, 0.01},
	{3000, 0.005},
	{4000, 0.005},
}

// calculateCPUOverhead returns the CPU reservation, in millicores, for a node
// whose CPU capacity is cpuM millicores.
//
// Every tier in thresholds that the node reaches contributes its share of one
// core, and any capacity beyond 4000m contributes a further 0.25%. Each
// contribution is truncated to whole millicores before being summed.
func calculateCPUOverhead(cpuM int64) int64 {
	const (
		coreMilli    = 1000   // one full core, in millicores
		topTierMilli = 4000   // capacity above which the flat rate applies
		topTierRate  = 0.0025 // 0.25% of every millicore above topTierMilli
	)

	reserved := int64(0)

	// Tiered part: add the fixed share of each tier the node has reached.
	for i := range thresholds {
		tier := thresholds[i]
		if cpuM < tier.cores {
			continue
		}
		reserved += int64(coreMilli * tier.overhead)
	}

	// Flat part: everything above the last tier.
	if extra := cpuM - topTierMilli; extra > 0 {
		reserved += int64(float64(extra) * topTierRate)
	}

	return reserved
}
104+
105+
func NewInstanceType(ctx context.Context,
61106
info *ecsclient.DescribeInstanceTypesResponseBodyInstanceTypesInstanceType,
62107
kc *v1alpha1.KubeletConfiguration, region string, systemDisk *v1alpha1.SystemDisk,
63108
offerings cloudprovider.Offerings, clusterCNI string) *cloudprovider.InstanceType {
@@ -71,12 +116,15 @@ func NewInstanceType(ctx context.Context, overhead corev1.ResourceList,
71116
Offerings: offerings,
72117
Capacity: computeCapacity(ctx, info, kc.MaxPods, kc.PodsPerCore, systemDisk, clusterCNI),
73118
Overhead: &cloudprovider.InstanceTypeOverhead{
74-
// The overheads below will be merged, so it is enough to set the total overhead on just one of them
75-
KubeReserved: overhead,
119+
KubeReserved: corev1.ResourceList{},
76120
SystemReserved: corev1.ResourceList{},
77121
EvictionThreshold: corev1.ResourceList{},
78122
},
79123
}
124+
125+
// KubeReserved, SystemReserved and EvictionThreshold will be merged, so it is enough to set the total overhead on just one of them
126+
it.Overhead.KubeReserved = calculateResourceOverhead(it.Capacity.Pods().Value(),
127+
it.Capacity.Cpu().MilliValue(), extractMemory(info).Value()/MiBByteRatio)
80128
if it.Requirements.Compatible(scheduling.NewRequirements(scheduling.NewRequirement(corev1.LabelOSStable, corev1.NodeSelectorOpIn, string(corev1.Windows)))) == nil {
81129
it.Capacity[v1alpha1.ResourcePrivateIPv4Address] = *privateIPv4Address(info)
82130
}
@@ -206,9 +254,13 @@ func cpu(info *ecsclient.DescribeInstanceTypesResponseBodyInstanceTypesInstanceT
206254
return resources.Quantity(fmt.Sprint(*info.CpuCoreCount))
207255
}
208256

209-
func memory(ctx context.Context, info *ecsclient.DescribeInstanceTypesResponseBodyInstanceTypesInstanceType) *resource.Quantity {
257+
func extractMemory(info *ecsclient.DescribeInstanceTypesResponseBodyInstanceTypesInstanceType) *resource.Quantity {
210258
sizeInGib := tea.Float32Value(info.MemorySize)
211-
mem := resources.Quantity(fmt.Sprintf("%fGi", sizeInGib))
259+
return resources.Quantity(fmt.Sprintf("%fGi", sizeInGib))
260+
}
261+
262+
func memory(ctx context.Context, info *ecsclient.DescribeInstanceTypesResponseBodyInstanceTypesInstanceType) *resource.Quantity {
263+
mem := extractMemory(info)
212264
if mem.IsZero() {
213265
return mem
214266
}

0 commit comments

Comments
 (0)