Skip to content
Merged
7 changes: 4 additions & 3 deletions k8s/app/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ spec:
spec:
containers:
- name: camera-onboarding
image: mrsert/camera-onboarding:monitoring
image: mrsert/camera-onboarding:metrics-v2
resources:
requests:
cpu: "275m"
Expand All @@ -31,19 +31,20 @@ spec:
envFrom:
- secretRef:
name: camera-onboarding-secrets

# Liveness check: HTTP GET on /actuator/health/liveness, port 8080.
# Per Kubernetes probe semantics, the kubelet restarts the container once the
# check has failed `failureThreshold` times in a row.
livenessProbe:
  httpGet:
    path: /actuator/health/liveness
    port: 8080
  # A mapping may define each key only once; 180 is the value that took effect
  # when this key was listed twice (last entry wins in most parsers).
  # NOTE(review): a fixed 180s delay also postpones detection of a hung
  # process; a startupProbe may be a better fit for slow boots - confirm.
  initialDelaySeconds: 180
  # Probe every 10s and give each attempt 5s to answer.
  periodSeconds: 10
  timeoutSeconds: 5
  # 6 consecutive failures x 10s period = about 60s before a restart.
  failureThreshold: 6
# Readiness check: HTTP GET on /actuator/health/readiness, port 8080.
# Per Kubernetes probe semantics, a pod that fails this check is taken out of
# Service endpoints but is not restarted.
readinessProbe:
  httpGet:
    path: /actuator/health/readiness
    port: 8080
  # A mapping may define each key only once; 180 is the value that took effect
  # when this key was listed twice (last entry wins in most parsers).
  # NOTE(review): with this delay the pod receives no traffic for at least
  # 180s after start, even if the app is up sooner - confirm this is intended.
  initialDelaySeconds: 180
  # Probe every 10s and give each attempt 5s to answer.
  periodSeconds: 10
  timeoutSeconds: 5
  # 6 consecutive failures x 10s period = about 60s before marking unready.
  failureThreshold: 6
Expand Down
252 changes: 252 additions & 0 deletions k8s/logging/grafana/custom-app-dashboard.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
# k8s/logging/grafana/custom-app-dashboard.yaml
#
# ConfigMap holding a single Grafana dashboard, "Custom Application Metrics",
# as a JSON document under the key `custom-app-dashboard.json`.
#
# Panels (all query the Prometheus datasource referenced by uid "Prometheus"
# and filter on kubernetes_namespace="app"):
#   1. Request Latency Percentiles - P95 / P90 / P50 from
#      http_server_requests_seconds_bucket, 2m rate window.
#   2. Request Rate (req/sec)      - total rate over a 1m window.
#   3. Successful Requests (5m)    - increase of requests with status 2xx.
#   4. Failed Requests (5m)        - increase of requests with status 4xx/5xx;
#                                    turns red from 1 upwards.
#   5. Success Rate (%)            - 2xx rate / total rate * 100;
#                                    red below 90, yellow from 90, green from 95.
#   6. Request Status Distribution - request rate split by `status`.
#   7. DEBUG PANEL                 - table of metric names matching
#                                    `.*http.*request.*` in the namespace.
#
# NOTE(review): assumes a datasource with uid "Prometheus" exists and that the
# scrape config attaches a `kubernetes_namespace` label - confirm both.
# NOTE(review): k8s/logging/grafana/dashboard.yaml also defines a
# `custom-app-dashboard.json` entry with the same dashboard title; if both end
# up in the same Grafana dashboards folder one may shadow the other - verify.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-custom-app-dashboard
  namespace: logging
  labels:
    # Quoted so the value stays a string (label values must be strings).
    # NOTE(review): presumably the label a Grafana dashboard sidecar selects
    # on - confirm against the Grafana deployment.
    grafana_dashboard: "1"
data:
  # Literal block scalar: the JSON below is stored verbatim. JSON has no
  # comment syntax, so all documentation lives in the YAML comments above.
  # Every query target carries an explicit refId so queries are identified
  # consistently across panels.
  custom-app-dashboard.json: |
    {
      "id": null,
      "title": "Custom Application Metrics",
      "tags": [
        "spring-boot",
        "app-metrics",
        "camera-onboarding"
      ],
      "style": "dark",
      "timezone": "browser",
      "panels": [
        {
          "id": 1,
          "title": "Request Latency Percentiles",
          "type": "timeseries",
          "datasource": {
            "type": "prometheus",
            "uid": "Prometheus"
          },
          "targets": [
            {
              "expr": "histogram_quantile(0.95, sum(rate(http_server_requests_seconds_bucket{kubernetes_namespace=\"app\"}[2m])) by (le))",
              "legendFormat": "P95 Latency",
              "refId": "A"
            },
            {
              "expr": "histogram_quantile(0.90, sum(rate(http_server_requests_seconds_bucket{kubernetes_namespace=\"app\"}[2m])) by (le))",
              "legendFormat": "P90 Latency",
              "refId": "B"
            },
            {
              "expr": "histogram_quantile(0.50, sum(rate(http_server_requests_seconds_bucket{kubernetes_namespace=\"app\"}[2m])) by (le))",
              "legendFormat": "P50 Latency",
              "refId": "C"
            }
          ],
          "gridPos": {
            "h": 8,
            "w": 24,
            "x": 0,
            "y": 0
          },
          "fieldConfig": {
            "defaults": {
              "unit": "s",
              "min": 0,
              "color": {
                "mode": "palette-classic"
              }
            }
          },
          "options": {
            "legend": {
              "displayMode": "table",
              "values": [
                "last",
                "max"
              ]
            }
          }
        },
        {
          "id": 2,
          "title": "Request Rate (req/sec)",
          "type": "timeseries",
          "datasource": {
            "type": "prometheus",
            "uid": "Prometheus"
          },
          "targets": [
            {
              "expr": "sum(rate(http_server_requests_seconds_count{kubernetes_namespace=\"app\"}[1m]))",
              "legendFormat": "Total RPS",
              "refId": "A"
            }
          ],
          "gridPos": {
            "h": 6,
            "w": 24,
            "x": 0,
            "y": 8
          },
          "fieldConfig": {
            "defaults": {
              "unit": "reqps",
              "color": {
                "mode": "palette-classic"
              }
            }
          }
        },
        {
          "id": 3,
          "title": "Successful Requests (5m)",
          "type": "stat",
          "datasource": {
            "type": "prometheus",
            "uid": "Prometheus"
          },
          "targets": [
            {
              "expr": "sum(increase(http_server_requests_seconds_count{kubernetes_namespace=\"app\", status=~\"2..\"}[5m]))",
              "legendFormat": "Successful",
              "refId": "A"
            }
          ],
          "gridPos": {
            "h": 6,
            "w": 8,
            "x": 0,
            "y": 14
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "thresholds"
              },
              "thresholds": {
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  }
                ]
              }
            }
          }
        },
        {
          "id": 4,
          "title": "Failed Requests (5m)",
          "type": "stat",
          "datasource": {
            "type": "prometheus",
            "uid": "Prometheus"
          },
          "targets": [
            {
              "expr": "sum(increase(http_server_requests_seconds_count{kubernetes_namespace=\"app\", status=~\"[45]..\"}[5m]))",
              "legendFormat": "Failed",
              "refId": "A"
            }
          ],
          "gridPos": {
            "h": 6,
            "w": 8,
            "x": 8,
            "y": 14
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "thresholds"
              },
              "thresholds": {
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "red",
                    "value": 1
                  }
                ]
              }
            }
          }
        },
        {
          "id": 5,
          "title": "Success Rate (%)",
          "type": "stat",
          "datasource": {
            "type": "prometheus",
            "uid": "Prometheus"
          },
          "targets": [
            {
              "expr": "(sum(rate(http_server_requests_seconds_count{kubernetes_namespace=\"app\", status=~\"2..\"}[5m])) / sum(rate(http_server_requests_seconds_count{kubernetes_namespace=\"app\"}[5m]))) * 100",
              "legendFormat": "Success Rate",
              "refId": "A"
            }
          ],
          "gridPos": {
            "h": 6,
            "w": 8,
            "x": 16,
            "y": 14
          },
          "fieldConfig": {
            "defaults": {
              "unit": "percent",
              "color": {
                "mode": "thresholds"
              },
              "thresholds": {
                "steps": [
                  {
                    "color": "red",
                    "value": null
                  },
                  {
                    "color": "yellow",
                    "value": 90
                  },
                  {
                    "color": "green",
                    "value": 95
                  }
                ]
              }
            }
          }
        },
        {
          "id": 6,
          "title": "Request Status Distribution",
          "type": "timeseries",
          "datasource": {
            "type": "prometheus",
            "uid": "Prometheus"
          },
          "targets": [
            {
              "expr": "sum by (status) (rate(http_server_requests_seconds_count{kubernetes_namespace=\"app\"}[1m]))",
              "legendFormat": "{{status}}",
              "refId": "A"
            }
          ],
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 0,
            "y": 20
          },
          "fieldConfig": {
            "defaults": {
              "unit": "reqps",
              "color": {
                "mode": "palette-classic"
              }
            }
          }
        },
        {
          "id": 7,
          "title": "DEBUG PANEL: Available HTTP Metrics",
          "type": "table",
          "datasource": {
            "type": "prometheus",
            "uid": "Prometheus"
          },
          "targets": [
            {
              "expr": "group by (__name__) ({__name__=~\".*http.*request.*\", kubernetes_namespace=\"app\"})",
              "format": "table",
              "refId": "A"
            }
          ],
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 12,
            "y": 20
          }
        }
      ],
      "time": {
        "from": "now-15m",
        "to": "now"
      },
      "refresh": "5s"
    }
105 changes: 104 additions & 1 deletion k8s/logging/grafana/dashboard.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -168,4 +168,107 @@ data:
"from": "now-1h",
"to": "now"},
"refresh": "5s"
}
}
# Grafana dashboard "Custom Application Metrics", stored verbatim as JSON.
# Panels: (1) P95 request latency from two candidate histogram metrics,
# (2) successful 2xx requests over 5m, (3) failed 4xx/5xx requests over 5m,
# (4) debug table of metric names matching `.*http.*request.*`.
# Every query target carries an explicit refId so queries are identified
# consistently across panels.
# NOTE(review): k8s/logging/grafana/custom-app-dashboard.yaml ships a
# dashboard with the same key and title - confirm both are meant to coexist.
custom-app-dashboard.json: |
  {
    "id": null,
    "title": "Custom Application Metrics",
    "tags": [
      "spring-boot",
      "app-metrics"
    ],
    "style": "dark",
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "Request Latency (P95)",
        "type": "timeseries",
        "datasource": {
          "type": "prometheus",
          "uid": "Prometheus"
        },
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum by (le) (rate(http_server_requests_seconds_bucket{kubernetes_namespace=\"app\"}[2m])))",
            "legendFormat": "P95 (http_server_requests)",
            "refId": "A"
          },
          {
            "expr": "histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket{kubernetes_namespace=\"app\"}[2m])))",
            "legendFormat": "P95 (http_request_duration)",
            "refId": "B"
          }
        ],
        "gridPos": {
          "h": 8,
          "w": 24,
          "x": 0,
          "y": 0
        },
        "fieldConfig": {
          "defaults": {
            "unit": "s",
            "min": 0
          }
        }
      },
      {
        "id": 2,
        "title": "Successful Requests (5m)",
        "type": "stat",
        "datasource": {
          "type": "prometheus",
          "uid": "Prometheus"
        },
        "targets": [
          {
            "expr": "sum(increase(http_server_requests_seconds_count{kubernetes_namespace=\"app\", status=~\"2..\"}[5m]))",
            "legendFormat": "Successful",
            "refId": "A"
          }
        ],
        "gridPos": {
          "h": 6,
          "w": 12,
          "x": 0,
          "y": 8
        }
      },
      {
        "id": 3,
        "title": "Failed Requests (5m)",
        "type": "stat",
        "datasource": {
          "type": "prometheus",
          "uid": "Prometheus"
        },
        "targets": [
          {
            "expr": "sum(increase(http_server_requests_seconds_count{kubernetes_namespace=\"app\", status=~\"[45]..\"}[5m]))",
            "legendFormat": "Failed",
            "refId": "A"
          }
        ],
        "gridPos": {
          "h": 6,
          "w": 12,
          "x": 12,
          "y": 8
        }
      },
      {
        "id": 4,
        "title": "DEBUG PANEL: Available HTTP Metrics",
        "type": "table",
        "datasource": {
          "type": "prometheus",
          "uid": "Prometheus"
        },
        "targets": [
          {
            "expr": "group by (__name__) ({__name__=~\".*http.*request.*\", kubernetes_namespace=\"app\"})",
            "format": "table",
            "refId": "A"
          }
        ],
        "gridPos": {
          "h": 9,
          "w": 24,
          "x": 0,
          "y": 14
        }
      }
    ],
    "time": {
      "from": "now-15m",
      "to": "now"
    },
    "refresh": "5s"
  }
Loading