@@ -45,6 +45,7 @@ type OverReserve struct {
 	lh               logr.Logger
 	client           ctrlclient.Reader
 	lock             sync.Mutex
+	generation       uint64
 	nrts             *nrtStore
 	assumedResources map[string]*resourceStore // nodeName -> resourceStore
 	// nodesMaybeOverreserved counts how many times a node is filtered out. This is used as trigger condition to try
@@ -97,30 +98,33 @@ func NewOverReserve(ctx context.Context, lh logr.Logger, cfg *apiconfig.NodeReso
 	return obj, nil
 }
 
-func (ov *OverReserve) GetCachedNRTCopy(ctx context.Context, nodeName string, pod *corev1.Pod) (*topologyv1alpha2.NodeResourceTopology, bool) {
+func (ov *OverReserve) GetCachedNRTCopy(ctx context.Context, nodeName string, pod *corev1.Pod) (*topologyv1alpha2.NodeResourceTopology, CachedNRTInfo) {
 	ov.lock.Lock()
 	defer ov.lock.Unlock()
 	if ov.nodesWithForeignPods.IsSet(nodeName) {
-		return nil, false
+		return nil, CachedNRTInfo{}
 	}
 
+	info := CachedNRTInfo{Fresh: true}
 	nrt := ov.nrts.GetNRTCopyByNodeName(nodeName)
 	if nrt == nil {
-		return nil, true
+		return nil, info
 	}
+
+	info.Generation = ov.generation
 	nodeAssumedResources, ok := ov.assumedResources[nodeName]
 	if !ok {
-		return nrt, true
+		return nrt, info
 	}
 
 	logID := klog.KObj(pod)
-	lh := ov.lh.WithValues(logging.KeyPod, logID, logging.KeyPodUID, logging.PodUID(pod), logging.KeyNode, nodeName)
+	lh := ov.lh.WithValues(logging.KeyPod, logID, logging.KeyPodUID, logging.PodUID(pod), logging.KeyNode, nodeName, logging.KeyGeneration, ov.generation)
 
 	lh.V(6).Info("NRT", "fromcache", stringify.NodeResourceTopologyResources(nrt))
 	nodeAssumedResources.UpdateNRT(nrt, logging.KeyPod, logID)
 
 	lh.V(5).Info("NRT", "withassumed", stringify.NodeResourceTopologyResources(nrt))
-	return nrt, true
+	return nrt, info
 }
 
 func (ov *OverReserve) NodeMaybeOverReserved(nodeName string, pod *corev1.Pod) {
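Note: `CachedNRTInfo` itself is not defined in this hunk. Judging from how `GetCachedNRTCopy` populates and returns it above, it is presumably a small struct along the lines of the sketch below; the field names `Fresh` and `Generation` come straight from the usage, while the comments and zero-value semantics are inferred rather than copied from the source.

```go
// Sketch only: a plausible shape for CachedNRTInfo, inferred from its usage in
// GetCachedNRTCopy above (not the actual definition from the repository).
type CachedNRTInfo struct {
	// Fresh reports whether the returned NRT copy can be trusted as up to date.
	// The zero value (Fresh == false) is what callers get when the node is
	// skipped because it hosts foreign pods.
	Fresh bool

	// Generation is the cache generation the copy was taken from; it is meant
	// to be logged so reads can be cross-correlated with later FlushNodes logs.
	Generation uint64
}
```

Callers that previously branched on the returned bool would now check `info.Fresh` instead.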
@@ -176,6 +180,7 @@ func (ov *OverReserve) UnreserveNodeResources(nodeName string, pod *corev1.Pod)
 }
 
 type DesyncedNodes struct {
+	Generation        uint64
 	MaybeOverReserved []string
 	ConfigChanged     []string
 }
@@ -207,6 +212,10 @@ func (rn DesyncedNodes) DirtyCount() int
 func (ov *OverReserve) GetDesyncedNodes(lh logr.Logger) DesyncedNodes {
 	ov.lock.Lock()
 	defer ov.lock.Unlock()
+
+	// make sure to log the generation to be able to cross-correlate with later logs
+	lh = lh.WithValues(logging.KeyGeneration, ov.generation)
+
 	// this is intentionally aggressive. We don't yet make any attempt to find out if the
 	// node was discarded because pessimistically overrserved (which should indeed trigger
 	// a resync) or if it was discarded because the actual resources on the node really were
@@ -229,6 +238,7 @@ func (ov *OverReserve) GetDesyncedNodes(lh logr.Logger) DesyncedNodes {
 		lh.V(4).Info("found dirty nodes", "foreign", foreignCount, "discarded", overreservedCount, "configChange", configChangeCount, "total", nodes.Len())
 	}
 	return DesyncedNodes{
+		Generation:        ov.generation,
 		MaybeOverReserved: nodes.Keys(),
 		ConfigChanged:     configChangeNodes.Keys(),
 	}
@@ -244,11 +254,14 @@ func (ov *OverReserve) GetDesyncedNodes(lh logr.Logger) DesyncedNodes {
 // too aggressive resync attempts, so to more, likely unnecessary, computation work on the scheduler side.
 func (ov *OverReserve) Resync() {
 	// we are not working with a specific pod, so we need a unique key to track this flow
-	lh_ := ov.lh.WithName(logging.FlowCacheSync).WithValues(logging.KeyLogID, logging.TimeLogID())
+	lh_ := ov.lh.WithName(logging.FlowCacheSync)
 	lh_.V(4).Info(logging.FlowBegin)
 	defer lh_.V(4).Info(logging.FlowEnd)
 
 	nodes := ov.GetDesyncedNodes(lh_)
+	// we start without the generation because of a chicken/egg problem; this is the earliest point we can use its value
+	lh_ = lh_.WithValues(logging.KeyGeneration, nodes.Generation)
+
 	// avoid as much as we can unnecessary work and logs.
 	if nodes.Len() == 0 {
 		lh_.V(5).Info("no dirty nodes detected")
@@ -331,6 +344,7 @@ func (ov *OverReserve) Resync() {
 func (ov *OverReserve) FlushNodes(lh logr.Logger, nrts ...*topologyv1alpha2.NodeResourceTopology) {
 	ov.lock.Lock()
 	defer ov.lock.Unlock()
+
 	for _, nrt := range nrts {
 		lh.V(2).Info("flushing", logging.KeyNode, nrt.Name)
 		ov.nrts.Update(nrt)
@@ -339,6 +353,14 @@ func (ov *OverReserve) FlushNodes(lh logr.Logger, nrts ...*topologyv1alpha2.Node
 		ov.nodesWithForeignPods.Delete(nrt.Name)
 		ov.nodesWithAttrUpdate.Delete(nrt.Name)
 	}
+
+	if len(nrts) == 0 {
+		return
+	}
+
+	// increase only if we mutated the internal state
+	ov.generation += 1
+	lh.V(2).Info("generation", "new", ov.generation)
 }
 
 // to be used only in tests
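The "increase only if we mutated the internal state" rule in `FlushNodes` could be pinned down by a test along these lines. This is only a sketch: `newOverReserveForTest` is a hypothetical helper standing in for however the package's tests construct an empty cache, and the package name and import paths are assumed to match the ones used elsewhere in the plugin.

```go
package cache

import (
	"context"
	"testing"

	"github.com/go-logr/logr"
	topologyv1alpha2 "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestFlushNodesBumpsGenerationOnlyOnMutation(t *testing.T) {
	// newOverReserveForTest is a hypothetical helper building an empty OverReserve cache.
	ov := newOverReserveForTest(t)
	nrt := &topologyv1alpha2.NodeResourceTopology{ObjectMeta: metav1.ObjectMeta{Name: "node-0"}}

	// Seed the cache so that reads for node-0 expose the current generation.
	ov.FlushNodes(logr.Discard(), nrt)
	_, info := ov.GetCachedNRTCopy(context.Background(), "node-0", &corev1.Pod{})
	gen := info.Generation

	// Flushing nothing must not advance the generation.
	ov.FlushNodes(logr.Discard())
	if _, got := ov.GetCachedNRTCopy(context.Background(), "node-0", &corev1.Pod{}); got.Generation != gen {
		t.Fatalf("generation changed without mutation: %d -> %d", gen, got.Generation)
	}

	// Flushing at least one NRT mutates the cache and must advance the generation.
	ov.FlushNodes(logr.Discard(), nrt)
	if _, got := ov.GetCachedNRTCopy(context.Background(), "node-0", &corev1.Pod{}); got.Generation != gen+1 {
		t.Fatalf("expected generation %d after flush, got %d", gen+1, got.Generation)
	}
}
```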