Skip to content

Commit a50a8cf

Browse files
committed
feat: add observability metrics infrastructure
- Add scripts/metrics.py with 7 Prometheus metrics definitions - Add CLI timing logs to register_stac.py - Expose metrics endpoint in workflow pods (port 8000) - Add prometheus-client dependency - Background metrics server with trap cleanup
1 parent 9b5eed4 commit a50a8cf

File tree

8 files changed

+140
-10
lines changed

8 files changed

+140
-10
lines changed

notebooks/02_pyramid_performance.ipynb

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@
399399
"plt.show()\n",
400400
"\n",
401401
"print(\n",
402-
" f\"\\n📊 Key Metric: {np.mean([s for z, s in zip(zooms, [measured[i]/expected[i] for i in range(len(zooms))], strict=False) if z <= 10]):.1f}× average speedup at production-relevant zooms\"\n",
402+
" f\"\\n📊 Key Metric: {np.mean([s for z, s in zip(zooms, [measured[i] / expected[i] for i in range(len(zooms))], strict=False) if z <= 10]):.1f}× average speedup at production-relevant zooms\"\n",
403403
")"
404404
]
405405
},
@@ -426,15 +426,15 @@
426426
"print(\"Return on Investment:\")\n",
427427
"print(\"=\" * 60)\n",
428428
"print(\"Storage Cost:\")\n",
429-
"print(f\" Native only: {native_storage:,} pixels ({native_storage/1e6:.0f} MB uncompressed)\")\n",
430-
"print(f\" With pyramids: {total_storage:,} pixels ({total_storage/1e6:.0f} MB uncompressed)\")\n",
429+
"print(f\" Native only: {native_storage:,} pixels ({native_storage / 1e6:.0f} MB uncompressed)\")\n",
430+
"print(f\" With pyramids: {total_storage:,} pixels ({total_storage / 1e6:.0f} MB uncompressed)\")\n",
431431
"print(f\" Overhead: +{overhead_pct:.0f}%\")\n",
432432
"print(\"\\nPerformance Gain:\")\n",
433433
"print(\n",
434-
" f\" z6-10 (low zoom): {np.mean([measured[i]/expected[i] for i, z in enumerate(zooms) if z <= 10]):.1f}× faster\"\n",
434+
" f\" z6-10 (low zoom): {np.mean([measured[i] / expected[i] for i, z in enumerate(zooms) if z <= 10]):.1f}× faster\"\n",
435435
")\n",
436436
"print(\n",
437-
" f\" z12-14 (high zoom): {np.mean([measured[i]/expected[i] for i, z in enumerate(zooms) if z >= 12]):.1f}× faster\"\n",
437+
" f\" z12-14 (high zoom): {np.mean([measured[i] / expected[i] for i, z in enumerate(zooms) if z >= 12]):.1f}× faster\"\n",
438438
")\n",
439439
"print(\"\\nProduction Impact:\")\n",
440440
"print(\" • Consistent 100-200ms tile generation across all zooms\")\n",

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ dependencies = [
3131
"pika>=1.3.0",
3232
"tenacity>=8.0.0",
3333
"requests>=2.31.0",
34+
"prometheus-client>=0.19.0",
3435
]
3536

3637
[project.optional-dependencies]

scripts/benchmark_geozarr.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ def main(argv: list[str] | None = None) -> int:
110110
if speedup > 1:
111111
logger.info(f"✅ GeoZarr is {speedup}x faster than EOPF")
112112
else:
113-
logger.warning(f"⚠️ EOPF is {1/speedup:.2f}x faster than GeoZarr")
113+
logger.warning(f"⚠️ EOPF is {1 / speedup:.2f}x faster than GeoZarr")
114114

115115
return 0
116116

scripts/benchmark_tile_performance.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ def benchmark_zoom_level(
154154
status = "✓" if result["success"] else "✗"
155155
logger.debug(
156156
f" {status} z{z}/{x}/{y}: {result['latency_ms']:.1f}ms "
157-
f"({result['size_bytes']/1024:.1f}KB)"
157+
f"({result['size_bytes'] / 1024:.1f}KB)"
158158
)
159159

160160
# Calculate statistics

scripts/metrics.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
#!/usr/bin/env python3
2+
"""Prometheus metrics instrumentation for data-pipeline scripts.
3+
4+
This module provides shared metric definitions and a metrics server
5+
for exposing metrics to the Prometheus scraper in Kubernetes.
6+
7+
Usage:
8+
from scripts.metrics import start_metrics_server, CONVERSION_DURATION
9+
10+
start_metrics_server(port=8000) # In main()
11+
12+
with CONVERSION_DURATION.labels(collection="sentinel-2-l2a").time():
13+
convert_data()
14+
"""
15+
16+
from __future__ import annotations
17+
18+
import logging
19+
import os
20+
21+
from prometheus_client import Counter, Histogram, start_http_server
22+
23+
logger = logging.getLogger(__name__)
24+
25+
# Metrics port for Kubernetes ServiceMonitor to scrape
26+
DEFAULT_METRICS_PORT = 8000
27+
28+
# Conversion workflow metrics
29+
CONVERSION_DURATION = Histogram(
30+
"geozarr_conversion_seconds",
31+
"Time to convert source to GeoZarr format",
32+
labelnames=["collection", "resolution"],
33+
)
34+
35+
CONVERSION_DATA_SIZE = Histogram(
36+
"geozarr_conversion_bytes",
37+
"Size of data converted in bytes",
38+
labelnames=["collection"],
39+
buckets=[1e6, 10e6, 100e6, 1e9, 10e9, 100e9], # 1MB to 100GB
40+
)
41+
42+
# STAC API interaction metrics
43+
STAC_REGISTRATION_TOTAL = Counter(
44+
"stac_registration_total",
45+
"Total STAC item registration attempts",
46+
labelnames=["collection", "status"], # status: success|failure|retry
47+
)
48+
49+
STAC_HTTP_REQUEST_DURATION = Histogram(
50+
"stac_http_request_seconds",
51+
"STAC API HTTP request duration",
52+
labelnames=["method", "endpoint", "status_code"],
53+
)
54+
55+
# Preview generation metrics
56+
PREVIEW_GENERATION_DURATION = Histogram(
57+
"preview_generation_seconds",
58+
"Time to generate preview images",
59+
labelnames=["collection", "preview_type"], # preview_type: true_color|quicklook|s1_grd
60+
)
61+
62+
PREVIEW_HTTP_REQUEST_DURATION = Histogram(
63+
"preview_http_request_seconds",
64+
"HTTP request duration for preview-related operations",
65+
labelnames=["operation", "status_code"],
66+
)
67+
68+
# AMQP workflow metrics
69+
AMQP_PUBLISH_TOTAL = Counter(
70+
"amqp_publish_total",
71+
"Total AMQP messages published",
72+
labelnames=["exchange", "status"], # status: success|failure
73+
)
74+
75+
76+
def start_metrics_server(port: int | None = None) -> None:
77+
"""Start Prometheus metrics HTTP server.
78+
79+
Args:
80+
port: Port to listen on. Defaults to METRICS_PORT env var or 8000.
81+
82+
Note:
83+
Should only be called once per process. Safe to call in Kubernetes
84+
pod startup. Metrics exposed at http://localhost:<port>/metrics
85+
"""
86+
if port is None:
87+
port = int(os.getenv("METRICS_PORT", str(DEFAULT_METRICS_PORT)))
88+
89+
try:
90+
start_http_server(port)
91+
logger.info("Metrics server started on port %d", port)
92+
except OSError as e:
93+
# Port already in use (e.g., from previous run)
94+
logger.warning("Failed to start metrics server on port %d: %s", port, e)
95+
96+
97+
def is_metrics_enabled() -> bool:
98+
"""Check if metrics collection is enabled.
99+
100+
Returns:
101+
True if ENABLE_METRICS env var is set to "true" (case-insensitive).
102+
Defaults to True if not set (opt-out model).
103+
"""
104+
return os.getenv("ENABLE_METRICS", "true").lower() == "true"

scripts/register_stac.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import logging
1616
import os
1717
import sys
18+
import time
1819
from typing import Any, cast
1920
from urllib.parse import urlparse
2021

@@ -436,6 +437,8 @@ def register_item(
436437

437438
def main() -> int:
438439
"""CLI entrypoint."""
440+
start_time = time.perf_counter()
441+
439442
parser = argparse.ArgumentParser(description="Register GeoZarr output to STAC API")
440443
parser.add_argument(
441444
"--stac",
@@ -510,11 +513,13 @@ def main() -> int:
510513
headers=headers,
511514
)
512515

513-
logger.info("Registration complete")
516+
duration = time.perf_counter() - start_time
517+
logger.info(f"Registration complete in {duration:.2f}s")
514518
return 0
515519

516520
except Exception as exc:
517-
logger.error(f" {exc}")
521+
duration = time.perf_counter() - start_time
522+
logger.error(f"Registration failed after {duration:.2f}s: {exc}")
518523
import traceback
519524

520525
traceback.print_exc()

uv.lock

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

workflows/template.yaml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,11 @@ spec:
242242
source: |
243243
set -euo pipefail
244244
245+
# Start metrics server in background (for Prometheus scraping)
246+
python -c "from scripts.metrics import start_metrics_server; start_metrics_server()" &
247+
METRICS_PID=$!
248+
trap "kill $METRICS_PID 2>/dev/null || true" EXIT
249+
245250
echo "════════════════════════════════════════════════════════════════════════════"
246251
echo " STEP 3/4: STAC REGISTRATION"
247252
echo "════════════════════════════════════════════════════════════════════════════"
@@ -280,10 +285,14 @@ spec:
280285
limits:
281286
memory: "2Gi"
282287
cpu: "1"
283-
source: |
284288
source: |
285289
set -euo pipefail
286290
291+
# Start metrics server in background (for Prometheus scraping)
292+
python -c "from scripts.metrics import start_metrics_server; start_metrics_server()" &
293+
METRICS_PID=$!
294+
trap "kill $METRICS_PID 2>/dev/null || true" EXIT
295+
287296
echo "════════════════════════════════════════════════════════════════════════════"
288297
echo " STEP 4/4: STAC AUGMENTATION"
289298
echo "════════════════════════════════════════════════════════════════════════════"

0 commit comments

Comments
 (0)