Skip to content

Commit a83ff7b

Browse files
authored
fix(probes): improve timing of k8s lifecycle probes (#6861)
The previous probe timings led to spurious restarts, especially at start-up while components waited to connect to the scheduler. The scheduler image pull could take long enough (and be timed unluckily enough) that other components restarted several times. The new timings reduce that possibility by making the startup probe more generous (failureThreshold 3 -> 10, initialDelaySeconds 3 -> 10, periodSeconds 5 -> 10).
1 parent 1952e43 commit a83ff7b

File tree

4 files changed

+44
-36
lines changed

4 files changed

+44
-36
lines changed

k8s/helm-charts/seldon-core-v2-setup/templates/_components-deployments.tpl

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -607,6 +607,7 @@ spec:
607607
httpGet:
608608
path: /live
609609
port: health
610+
initialDelaySeconds: 10
610611
periodSeconds: 5
611612
name: scheduler
612613
ports:
@@ -630,6 +631,7 @@ spec:
630631
httpGet:
631632
path: /ready
632633
port: health
634+
initialDelaySeconds: 10
633635
periodSeconds: 5
634636
resources:
635637
limits:
@@ -833,12 +835,12 @@ spec:
833835
cpu: '{{ .Values.pipelinegateway.resources.cpu }}'
834836
memory: '{{ .Values.pipelinegateway.resources.memory }}'
835837
startupProbe:
836-
failureThreshold: 3
838+
failureThreshold: 10
837839
httpGet:
838840
path: /startup
839841
port: health
840-
initialDelaySeconds: 3
841-
periodSeconds: 5
842+
initialDelaySeconds: 10
843+
periodSeconds: 10
842844
volumeMounts:
843845
- mountPath: /mnt/kafka
844846
name: kafka-config-volume
@@ -997,12 +999,12 @@ spec:
997999
cpu: '{{ .Values.modelgateway.resources.cpu }}'
9981000
memory: '{{ .Values.modelgateway.resources.memory }}'
9991001
startupProbe:
1000-
failureThreshold: 3
1002+
failureThreshold: 10
10011003
httpGet:
10021004
path: /startup
10031005
port: health
1004-
initialDelaySeconds: 3
1005-
periodSeconds: 5
1006+
initialDelaySeconds: 10
1007+
periodSeconds: 10
10061008
volumeMounts:
10071009
- mountPath: /mnt/kafka
10081010
name: kafka-config-volume
@@ -1266,12 +1268,12 @@ spec:
12661268
cpu: '{{ .Values.dataflow.resources.cpu }}'
12671269
memory: '{{ .Values.dataflow.resources.memory }}'
12681270
startupProbe:
1269-
failureThreshold: 3
1271+
failureThreshold: 10
12701272
httpGet:
12711273
path: /startup
12721274
port: health
1273-
initialDelaySeconds: 3
1274-
periodSeconds: 5
1275+
initialDelaySeconds: 10
1276+
periodSeconds: 10
12751277
volumeMounts:
12761278
- mountPath: /mnt/schema-registry
12771279
name: kafka-schema-volume

k8s/helm-charts/seldon-core-v2-setup/templates/_components-statefulsets.tpl

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -607,6 +607,7 @@ spec:
607607
httpGet:
608608
path: /live
609609
port: health
610+
initialDelaySeconds: 10
610611
periodSeconds: 5
611612
name: scheduler
612613
ports:
@@ -630,6 +631,7 @@ spec:
630631
httpGet:
631632
path: /ready
632633
port: health
634+
initialDelaySeconds: 10
633635
periodSeconds: 5
634636
resources:
635637
limits:
@@ -833,12 +835,12 @@ spec:
833835
cpu: '{{ .Values.pipelinegateway.resources.cpu }}'
834836
memory: '{{ .Values.pipelinegateway.resources.memory }}'
835837
startupProbe:
836-
failureThreshold: 3
838+
failureThreshold: 10
837839
httpGet:
838840
path: /startup
839841
port: health
840-
initialDelaySeconds: 3
841-
periodSeconds: 5
842+
initialDelaySeconds: 10
843+
periodSeconds: 10
842844
volumeMounts:
843845
- mountPath: /mnt/kafka
844846
name: kafka-config-volume
@@ -997,12 +999,12 @@ spec:
997999
cpu: '{{ .Values.modelgateway.resources.cpu }}'
9981000
memory: '{{ .Values.modelgateway.resources.memory }}'
9991001
startupProbe:
1000-
failureThreshold: 3
1002+
failureThreshold: 10
10011003
httpGet:
10021004
path: /startup
10031005
port: health
1004-
initialDelaySeconds: 3
1005-
periodSeconds: 5
1006+
initialDelaySeconds: 10
1007+
periodSeconds: 10
10061008
volumeMounts:
10071009
- mountPath: /mnt/kafka
10081010
name: kafka-config-volume
@@ -1266,12 +1268,12 @@ spec:
12661268
cpu: '{{ .Values.dataflow.resources.cpu }}'
12671269
memory: '{{ .Values.dataflow.resources.memory }}'
12681270
startupProbe:
1269-
failureThreshold: 3
1271+
failureThreshold: 10
12701272
httpGet:
12711273
path: /startup
12721274
port: health
1273-
initialDelaySeconds: 3
1274-
periodSeconds: 5
1275+
initialDelaySeconds: 10
1276+
periodSeconds: 10
12751277
volumeMounts:
12761278
- mountPath: /mnt/schema-registry
12771279
name: kafka-schema-volume

k8s/yaml/components.yaml

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,7 @@ spec:
449449
httpGet:
450450
path: /live
451451
port: health
452+
initialDelaySeconds: 10
452453
periodSeconds: 5
453454
name: scheduler
454455
ports:
@@ -472,6 +473,7 @@ spec:
472473
httpGet:
473474
path: /ready
474475
port: health
476+
initialDelaySeconds: 10
475477
periodSeconds: 5
476478
resources:
477479
limits:
@@ -669,12 +671,12 @@ spec:
669671
cpu: '100m'
670672
memory: '1G'
671673
startupProbe:
672-
failureThreshold: 3
674+
failureThreshold: 10
673675
httpGet:
674676
path: /startup
675677
port: health
676-
initialDelaySeconds: 3
677-
periodSeconds: 5
678+
initialDelaySeconds: 10
679+
periodSeconds: 10
678680
volumeMounts:
679681
- mountPath: /mnt/kafka
680682
name: kafka-config-volume
@@ -828,12 +830,12 @@ spec:
828830
cpu: '100m'
829831
memory: '1G'
830832
startupProbe:
831-
failureThreshold: 3
833+
failureThreshold: 10
832834
httpGet:
833835
path: /startup
834836
port: health
835-
initialDelaySeconds: 3
836-
periodSeconds: 5
837+
initialDelaySeconds: 10
838+
periodSeconds: 10
837839
volumeMounts:
838840
- mountPath: /mnt/kafka
839841
name: kafka-config-volume
@@ -1089,12 +1091,12 @@ spec:
10891091
cpu: '100m'
10901092
memory: '3G'
10911093
startupProbe:
1092-
failureThreshold: 3
1094+
failureThreshold: 10
10931095
httpGet:
10941096
path: /startup
10951097
port: health
1096-
initialDelaySeconds: 3
1097-
periodSeconds: 5
1098+
initialDelaySeconds: 10
1099+
periodSeconds: 10
10981100
volumeMounts:
10991101
- mountPath: /mnt/schema-registry
11001102
name: kafka-schema-volume

operator/config/seldonconfigs/default.yaml

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,12 @@ spec:
4545
- containerPort: 8000
4646
name: health
4747
startupProbe:
48-
failureThreshold: 3
48+
failureThreshold: 10
4949
httpGet:
5050
path: /startup
5151
port: health
52-
initialDelaySeconds: 3
53-
periodSeconds: 5
52+
initialDelaySeconds: 10
53+
periodSeconds: 10
5454
readinessProbe:
5555
failureThreshold: 3
5656
httpGet:
@@ -184,9 +184,9 @@ spec:
184184
httpGet:
185185
path: /startup
186186
port: health
187-
initialDelaySeconds: 3
188-
periodSeconds: 5
189-
failureThreshold: 3
187+
initialDelaySeconds: 10
188+
periodSeconds: 10
189+
failureThreshold: 10
190190
readinessProbe:
191191
httpGet:
192192
path: /ready
@@ -286,9 +286,9 @@ spec:
286286
httpGet:
287287
path: /startup
288288
port: health
289-
initialDelaySeconds: 3
290-
periodSeconds: 5
291-
failureThreshold: 3
289+
initialDelaySeconds: 10
290+
periodSeconds: 10
291+
failureThreshold: 10
292292
readinessProbe:
293293
httpGet:
294294
path: /ready
@@ -399,12 +399,14 @@ spec:
399399
port: health
400400
periodSeconds: 5
401401
failureThreshold: 3
402+
initialDelaySeconds: 10
402403
livenessProbe:
403404
httpGet:
404405
path: /live
405406
port: health
406407
periodSeconds: 5
407408
failureThreshold: 3
409+
initialDelaySeconds: 10
408410
resources:
409411
limits:
410412
memory: 1G

0 commit comments

Comments
 (0)