Skip to content
This repository was archived by the owner on May 25, 2023. It is now read-only.

Commit 323b180

Browse files
authored
Merge pull request #629 from k82cn/automated-cherry-pick-of-#592-upstream-release-0.4
Automated cherry pick of #592: Update prometheus vendor libs
2 parents 66162c4 + 54da657 commit 323b180

40 files changed

+3998
-1039
lines changed

Gopkg.lock

Lines changed: 12 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Gopkg.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,3 +72,7 @@
7272
[prune]
7373
go-tests = true
7474
unused-packages = true
75+
76+
[[constraint]]
77+
name = "github.com/prometheus/client_golang"
78+
version = "0.9.2"

cmd/kube-batch/app/options/options.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ type ServerOption struct {
3434
LockObjectNamespace string
3535
DefaultQueue string
3636
PrintVersion bool
37+
ListenAddress string
3738
}
3839

3940
// NewServerOption creates a new CMServer with a default config.
@@ -56,6 +57,7 @@ func (s *ServerOption) AddFlags(fs *pflag.FlagSet) {
5657
"executing the main loop. Enable this when running replicated kube-batch for high availability")
5758
fs.BoolVar(&s.PrintVersion, "version", false, "Show version and quit")
5859
fs.StringVar(&s.LockObjectNamespace, "lock-object-namespace", s.LockObjectNamespace, "Define the namespace of the lock object")
60+
fs.StringVar(&s.ListenAddress, "listen-address", ":8080", "The address to listen on for HTTP requests.")
5961
}
6062

6163
func (s *ServerOption) CheckOptionOrDie() error {

cmd/kube-batch/app/server.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,15 @@ package app
1919
import (
2020
"context"
2121
"fmt"
22+
"net/http"
2223
"os"
2324
"time"
2425

2526
"github.com/golang/glog"
2627
"github.com/kubernetes-sigs/kube-batch/cmd/kube-batch/app/options"
2728
"github.com/kubernetes-sigs/kube-batch/pkg/scheduler"
2829
"github.com/kubernetes-sigs/kube-batch/pkg/version"
30+
"github.com/prometheus/client_golang/prometheus/promhttp"
2931

3032
v1 "k8s.io/api/core/v1"
3133
"k8s.io/apimachinery/pkg/util/uuid"
@@ -75,6 +77,11 @@ func Run(opt *options.ServerOption) error {
7577
panic(err)
7678
}
7779

80+
go func() {
81+
http.Handle("/metrics", promhttp.Handler())
82+
glog.Fatalf("Prometheus Http Server failed %s", http.ListenAndServe(opt.ListenAddress, nil))
83+
}()
84+
7885
run := func(ctx context.Context) {
7986
sched.Run(ctx.Done())
8087
<-ctx.Done()

pkg/scheduler/actions/allocate/allocate.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ func (alloc *allocateAction) Execute(ssn *framework.Session) {
105105
for !tasks.Empty() {
106106
predicateNodes := []*api.NodeInfo{}
107107
nodeScores := map[int][]*api.NodeInfo{}
108+
108109
task := tasks.Pop().(*api.TaskInfo)
109110
assigned := false
110111

pkg/scheduler/actions/preempt/preempt.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323

2424
"github.com/kubernetes-sigs/kube-batch/pkg/scheduler/api"
2525
"github.com/kubernetes-sigs/kube-batch/pkg/scheduler/framework"
26+
"github.com/kubernetes-sigs/kube-batch/pkg/scheduler/metrics"
2627
"github.com/kubernetes-sigs/kube-batch/pkg/scheduler/util"
2728
)
2829

@@ -212,6 +213,7 @@ func preempt(
212213
}
213214
}
214215
victims := ssn.Preemptable(preemptor, preemptees)
216+
metrics.UpdatePreemptionVictimsCount(len(victims))
215217

216218
if err := validateVictims(victims, resreq); err != nil {
217219
glog.V(3).Infof("No validated victims on Node <%s>: %v", node.Name, err)
@@ -235,6 +237,7 @@ func preempt(
235237
resreq.Sub(preemptee.Resreq)
236238
}
237239

240+
metrics.RegisterPreemptionAttempts()
238241
glog.V(3).Infof("Preempted <%v> for task <%s/%s> requested <%v>.",
239242
preempted, preemptor.Namespace, preemptor.Name, preemptor.Resreq)
240243

pkg/scheduler/framework/framework.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,13 @@ limitations under the License.
1717
package framework
1818

1919
import (
20+
"time"
21+
2022
"github.com/golang/glog"
2123

2224
"github.com/kubernetes-sigs/kube-batch/pkg/scheduler/cache"
2325
"github.com/kubernetes-sigs/kube-batch/pkg/scheduler/conf"
26+
"github.com/kubernetes-sigs/kube-batch/pkg/scheduler/metrics"
2427
)
2528

2629
func OpenSession(cache cache.Cache, tiers []conf.Tier) *Session {
@@ -39,15 +42,19 @@ func OpenSession(cache cache.Cache, tiers []conf.Tier) *Session {
3942
}
4043

4144
for _, plugin := range ssn.plugins {
45+
onSessionOpenStart := time.Now()
4246
plugin.OnSessionOpen(ssn)
47+
metrics.UpdatePluginDuration(plugin.Name(), metrics.OnSessionOpen, metrics.Duration(onSessionOpenStart))
4348
}
4449

4550
return ssn
4651
}
4752

4853
func CloseSession(ssn *Session) {
4954
for _, plugin := range ssn.plugins {
55+
onSessionCloseStart := time.Now()
5056
plugin.OnSessionClose(ssn)
57+
metrics.UpdatePluginDuration(plugin.Name(), metrics.OnSessionClose, metrics.Duration(onSessionCloseStart))
5158
}
5259

5360
closeSession(ssn)

pkg/scheduler/framework/session.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import (
2121

2222
"github.com/golang/glog"
2323

24-
"k8s.io/api/core/v1"
24+
v1 "k8s.io/api/core/v1"
2525
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2626
"k8s.io/apimachinery/pkg/types"
2727
"k8s.io/apimachinery/pkg/util/uuid"
@@ -30,6 +30,7 @@ import (
3030
"github.com/kubernetes-sigs/kube-batch/pkg/scheduler/api"
3131
"github.com/kubernetes-sigs/kube-batch/pkg/scheduler/cache"
3232
"github.com/kubernetes-sigs/kube-batch/pkg/scheduler/conf"
33+
"github.com/kubernetes-sigs/kube-batch/pkg/scheduler/metrics"
3334
)
3435

3536
type Session struct {
@@ -295,6 +296,7 @@ func (ssn *Session) dispatch(task *api.TaskInfo) error {
295296
task.Job, ssn.UID)
296297
}
297298

299+
metrics.UpdateTaskScheduleDuration(metrics.Duration(task.Pod.CreationTimestamp.Time))
298300
return nil
299301
}
300302

pkg/scheduler/metrics/metrics.go

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
/*
2+
Copyright 2018 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package metrics
18+
19+
import (
20+
"time"
21+
22+
"github.com/prometheus/client_golang/prometheus"
23+
"github.com/prometheus/client_golang/prometheus/promauto" // auto-registry collectors in default registry
24+
)
25+
26+
const (
27+
// KubeBatchNamespace - namespace in prometheus used by kube-batch
28+
KubeBatchNamespace = "kube_batch"
29+
30+
// OnSessionOpen label
31+
OnSessionOpen = "OnSessionOpen"
32+
33+
// OnSessionClose label
34+
OnSessionClose = "OnSessionClose"
35+
)
36+
37+
var (
38+
e2eSchedulingLatency = promauto.NewHistogram(
39+
prometheus.HistogramOpts{
40+
Subsystem: KubeBatchNamespace,
41+
Name: "e2e_scheduling_latency_milliseconds",
42+
Help: "E2e scheduling latency in milliseconds (scheduling algorithm + binding)",
43+
Buckets: prometheus.ExponentialBuckets(5, 2, 10),
44+
},
45+
)
46+
47+
pluginSchedulingLatency = promauto.NewHistogramVec(
48+
prometheus.HistogramOpts{
49+
Subsystem: KubeBatchNamespace,
50+
Name: "plugin_scheduling_latency_microseconds",
51+
Help: "Plugin scheduling latency in microseconds",
52+
Buckets: prometheus.ExponentialBuckets(5, 2, 10),
53+
}, []string{"plugin", "OnSession"},
54+
)
55+
56+
actionSchedulingLatency = promauto.NewHistogramVec(
57+
prometheus.HistogramOpts{
58+
Subsystem: KubeBatchNamespace,
59+
Name: "action_scheduling_latency_microseconds",
60+
Help: "Action scheduling latency in microseconds",
61+
Buckets: prometheus.ExponentialBuckets(5, 2, 10),
62+
}, []string{"action"},
63+
)
64+
65+
taskSchedulingLatency = promauto.NewHistogram(
66+
prometheus.HistogramOpts{
67+
Subsystem: KubeBatchNamespace,
68+
Name: "task_scheduling_latency_microseconds",
69+
Help: "Task scheduling latency in microseconds",
70+
Buckets: prometheus.ExponentialBuckets(5, 2, 10),
71+
},
72+
)
73+
74+
scheduleAttempts = promauto.NewCounterVec(
75+
prometheus.CounterOpts{
76+
Subsystem: KubeBatchNamespace,
77+
Name: "schedule_attempts_total",
78+
Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
79+
}, []string{"result"},
80+
)
81+
82+
preemptionVictims = promauto.NewGauge(
83+
prometheus.GaugeOpts{
84+
Subsystem: KubeBatchNamespace,
85+
Name: "pod_preemption_victims",
86+
Help: "Number of selected preemption victims",
87+
},
88+
)
89+
90+
preemptionAttempts = promauto.NewCounter(
91+
prometheus.CounterOpts{
92+
Subsystem: KubeBatchNamespace,
93+
Name: "total_preemption_attempts",
94+
Help: "Total preemption attempts in the cluster till now",
95+
},
96+
)
97+
98+
unscheduleTaskCount = promauto.NewGaugeVec(
99+
prometheus.GaugeOpts{
100+
Subsystem: KubeBatchNamespace,
101+
Name: "unschedule_task_count",
102+
Help: "Number of tasks could not be scheduled",
103+
}, []string{"job_id"},
104+
)
105+
106+
unscheduleJobCount = promauto.NewGauge(
107+
prometheus.GaugeOpts{
108+
Subsystem: KubeBatchNamespace,
109+
Name: "unschedule_job_count",
110+
Help: "Number of jobs could not be scheduled",
111+
},
112+
)
113+
114+
jobRetryCount = promauto.NewCounterVec(
115+
prometheus.CounterOpts{
116+
Subsystem: KubeBatchNamespace,
117+
Name: "job_retry_counts",
118+
Help: "Number of retry counts for one job",
119+
}, []string{"job_id"},
120+
)
121+
)
122+
123+
// UpdatePluginDuration updates latency for every plugin
124+
func UpdatePluginDuration(pluginName, OnSessionStatus string, duration time.Duration) {
125+
pluginSchedulingLatency.WithLabelValues(pluginName, OnSessionStatus).Observe(DurationInMicroseconds(duration))
126+
}
127+
128+
// UpdateActionDuration updates latency for every action
129+
func UpdateActionDuration(actionName string, duration time.Duration) {
130+
actionSchedulingLatency.WithLabelValues(actionName).Observe(DurationInMicroseconds(duration))
131+
}
132+
133+
// UpdateE2eDuration updates entire end to end scheduling latency
134+
func UpdateE2eDuration(duration time.Duration) {
135+
e2eSchedulingLatency.Observe(DurationInMilliseconds(duration))
136+
}
137+
138+
// UpdateTaskScheduleDuration updates single task scheduling latency
139+
func UpdateTaskScheduleDuration(duration time.Duration) {
140+
taskSchedulingLatency.Observe(DurationInMicroseconds(duration))
141+
}
142+
143+
// UpdatePodScheduleStatus update pod schedule decision, could be Success, Failure, Error
144+
func UpdatePodScheduleStatus(label string, count int) {
145+
scheduleAttempts.WithLabelValues(label).Add(float64(count))
146+
}
147+
148+
// UpdatePreemptionVictimsCount updates count of preemption victims
149+
func UpdatePreemptionVictimsCount(victimsCount int) {
150+
preemptionVictims.Set(float64(victimsCount))
151+
}
152+
153+
// RegisterPreemptionAttempts records number of attempts for preemtion
154+
func RegisterPreemptionAttempts() {
155+
preemptionAttempts.Inc()
156+
}
157+
158+
// UpdateUnscheduleTaskCount records total number of unscheduleable tasks
159+
func UpdateUnscheduleTaskCount(jobID string, taskCount int) {
160+
unscheduleTaskCount.WithLabelValues(jobID).Set(float64(taskCount))
161+
}
162+
163+
// UpdateUnscheduleJobCount records total number of unscheduleable jobs
164+
func UpdateUnscheduleJobCount(jobCount int) {
165+
unscheduleJobCount.Set(float64(jobCount))
166+
}
167+
168+
// RegisterJobRetries total number of job retries.
169+
func RegisterJobRetries(jobID string) {
170+
jobRetryCount.WithLabelValues(jobID).Inc()
171+
}
172+
173+
// DurationInMicroseconds gets the time in microseconds.
174+
func DurationInMicroseconds(duration time.Duration) float64 {
175+
return float64(duration.Nanoseconds()) / float64(time.Microsecond.Nanoseconds())
176+
}
177+
178+
// DurationInMilliseconds gets the time in milliseconds.
179+
func DurationInMilliseconds(duration time.Duration) float64 {
180+
return float64(duration.Nanoseconds()) / float64(time.Millisecond.Nanoseconds())
181+
}
182+
183+
// DurationInSeconds gets the time in seconds.
184+
func DurationInSeconds(duration time.Duration) float64 {
185+
return duration.Seconds()
186+
}
187+
188+
// Duration get the time since specified start
189+
func Duration(start time.Time) time.Duration {
190+
return time.Since(start)
191+
}

0 commit comments

Comments
 (0)