# Metrics & Monitoring

Comprehensive metrics collection and monitoring for Olympus Cloud.

Overview

The monitoring stack provides observability across all services:

| Component | Technology | Purpose |
|---|---|---|
| Infrastructure Metrics | Cloud Monitoring | GCP resource metrics |
| Application Metrics | Custom Metrics API | Business metrics |
| Edge Metrics | Cloudflare Analytics | Edge performance |
| Dashboards | Grafana | Visualization |
| Alerting | Cloud Monitoring | Incident detection |

Metrics Architecture

┌─────────────────────────────────────────────────────────────────┐
│ Data Sources │
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
│ │ Cloud │ │ Cloud │ │ Cloudflare│ │ Custom │ │
│ │ Run │ │ Spanner │ │ Workers │ │ App │ │
│ └────┬─────┘ └────┬─────┘ └────┬─────┘ └────┬─────┘ │
└───────┼─────────────┼─────────────┼─────────────┼───────────────┘
│ │ │ │
└─────────────┴──────┬──────┴─────────────┘


┌──────────────────────────────┐
│ Cloud Monitoring │
│ • Metrics Storage │
│ • Query Engine │
│ • Alerting │
└──────────────┬───────────────┘

┌──────────────┴───────────────┐
│ Grafana │
│ • Dashboards │
│ • Visualization │
│ • Annotations │
└──────────────────────────────┘

Infrastructure Metrics

Cloud Run Metrics

| Metric | Description | Alert Threshold |
|---|---|---|
| request_count | Total requests | N/A |
| request_latencies | Response time distribution | p99 > 500ms |
| container/cpu/utilization | CPU usage | > 80% |
| container/memory/utilization | Memory usage | > 85% |
| container/instance_count | Running instances | < min or > 90% max |
| container/startup_latencies | Cold start time | p99 > 10s |

Cloud Spanner Metrics

| Metric | Description | Alert Threshold |
|---|---|---|
| query_count | Total queries | N/A |
| query_latency | Query response time | p99 > 100ms |
| cpu/utilization | CPU usage | > 65% |
| storage/utilization | Storage used | > 80% |
| transaction_count | Transactions/sec | N/A |
| lock_wait_time | Lock contention | > 10ms avg |

Pub/Sub Metrics

| Metric | Description | Alert Threshold |
|---|---|---|
| num_undelivered_messages | Queue depth | > 10000 |
| oldest_unacked_message_age | Message age | > 5 minutes |
| delivery_latency | End-to-end latency | p99 > 1s |

Application Metrics

Custom Metrics Implementation

// src/metrics.rs
use prometheus::{Counter, CounterVec, Histogram, HistogramOpts, HistogramVec, Opts, Registry};

lazy_static! {
    /// Process-wide registry that every custom collector is
    /// registered into (see `init_metrics`).
    pub static ref REGISTRY: Registry = Registry::new();

    // CounterVec/HistogramVec rather than plain Counter/Histogram:
    // the HTTP middleware records per-request label values via
    // `with_label_values`, which only the *Vec types provide — the
    // original plain types would not compile against that call site.
    pub static ref HTTP_REQUESTS: CounterVec = CounterVec::new(
        Opts::new("http_requests_total", "Total HTTP requests")
            .namespace("olympus")
            .subsystem("api"),
        &["method", "path", "status"],
    ).unwrap();

    pub static ref HTTP_LATENCY: HistogramVec = HistogramVec::new(
        HistogramOpts::new("http_request_duration_seconds", "HTTP request latency")
            .namespace("olympus")
            .subsystem("api")
            // Buckets span 5ms–5s, bracketing the p99 < 500ms alert threshold.
            .buckets(vec![0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0]),
        &["method", "path"],
    ).unwrap();

    /// Total orders created across the commerce subsystem.
    pub static ref ORDERS_CREATED: Counter = Counter::with_opts(
        Opts::new("orders_created_total", "Total orders created")
            .namespace("olympus")
            .subsystem("commerce")
    ).unwrap();

    /// Distribution of order values in dollars.
    pub static ref ORDER_VALUE: Histogram = Histogram::with_opts(
        HistogramOpts::new("order_value_dollars", "Order value distribution")
            .namespace("olympus")
            .subsystem("commerce")
            .buckets(vec![5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0])
    ).unwrap();
}

pub fn init_metrics() {
REGISTRY.register(Box::new(HTTP_REQUESTS.clone())).unwrap();
REGISTRY.register(Box::new(HTTP_LATENCY.clone())).unwrap();
REGISTRY.register(Box::new(ORDERS_CREATED.clone())).unwrap();
REGISTRY.register(Box::new(ORDER_VALUE.clone())).unwrap();
}

// Middleware for automatic HTTP metrics
/// Middleware that records request count and latency for every HTTP
/// request flowing through the service, labeled by method, path, and
/// (for the counter) response status.
pub async fn metrics_middleware<B>(
    request: Request<B>,
    next: Next<B>,
) -> Response {
    // Capture identifying info before the request is consumed.
    let started_at = Instant::now();
    let http_method = request.method().clone();
    let route = request.uri().path().to_string();

    let response = next.run(request).await;

    let elapsed_secs = started_at.elapsed().as_secs_f64();
    let status_code = response.status().as_u16().to_string();

    HTTP_REQUESTS
        .with_label_values(&[http_method.as_str(), &route, &status_code])
        .inc();

    HTTP_LATENCY
        .with_label_values(&[http_method.as_str(), &route])
        .observe(elapsed_secs);

    response
}

Business Metrics

| Metric | Type | Labels | Description |
|---|---|---|---|
| orders_created_total | Counter | tenant_id, location_id, order_type | Orders created |
| order_value_dollars | Histogram | tenant_id, location_id | Order values |
| items_sold_total | Counter | tenant_id, item_id | Items sold |
| payment_processed_total | Counter | tenant_id, method, status | Payments |
| kds_ticket_time_seconds | Histogram | tenant_id, station | Kitchen ticket times |
| inventory_level | Gauge | tenant_id, item_id | Current inventory |

Exporting to Cloud Monitoring

// src/metrics/export.rs
use google_cloud_monitoring::client::MetricServiceClient;

/// Exports in-process metrics to Google Cloud Monitoring as custom
/// time series under the `custom.googleapis.com/olympus/` prefix.
pub struct MetricExporter {
    client: MetricServiceClient,
    // GCP project id the time series are written into.
    project: String,
}

impl MetricExporter {
    /// Push a batch of metrics to Cloud Monitoring in a single
    /// `create_time_series` API call.
    ///
    /// NOTE(review): the Cloud Monitoring API caps createTimeSeries
    /// at 200 series per request — confirm callers batch accordingly.
    pub async fn export(&self, metrics: &[Metric]) -> Result<()> {
        let time_series: Vec<TimeSeries> = metrics
            .iter()
            .map(|m| self.to_time_series(m))
            .collect();

        self.client
            .create_time_series(&self.project, time_series)
            .await?;

        Ok(())
    }

    /// Convert one internal `Metric` into the Cloud Monitoring wire type.
    fn to_time_series(&self, metric: &Metric) -> TimeSeries {
        TimeSeries {
            metric: Some(MonitoringMetric {
                // Custom metrics must live under the custom.googleapis.com
                // type namespace.
                type_: format!("custom.googleapis.com/olympus/{}", metric.name),
                labels: metric.labels.clone(),
            }),
            // Attribute the point to the current Cloud Run revision.
            // K_SERVICE / K_REVISION are injected by the Cloud Run
            // runtime; they fall back to "" outside Cloud Run (local dev).
            resource: Some(MonitoredResource {
                type_: "cloud_run_revision".into(),
                labels: hashmap! {
                    "project_id" => self.project.clone(),
                    "service_name" => std::env::var("K_SERVICE").unwrap_or_default(),
                    "revision_name" => std::env::var("K_REVISION").unwrap_or_default(),
                },
            }),
            // One point per series; only end_time is set (no start_time),
            // which Cloud Monitoring treats as an instantaneous sample —
            // NOTE(review): appropriate for GAUGE metrics; confirm for
            // cumulative ones.
            points: vec![Point {
                interval: Some(TimeInterval {
                    end_time: Some(Timestamp::now()),
                    ..Default::default()
                }),
                value: Some(metric.value.clone()),
            }],
            ..Default::default()
        }
    }
}

Edge Metrics

Cloudflare Analytics

// workers/metrics.ts
// Write one data point to the Workers Analytics Engine dataset.
// `blobs` carries the metric name plus the serialized label set,
// `doubles` carries the numeric value, and the tenant id is used as
// the index so queries can be scoped per tenant.
export async function recordMetric(
  env: Env,
  metric: string,
  value: number,
  labels: Record<string, string>
) {
  const tenantIndex = labels.tenant_id || '';
  const serializedLabels = JSON.stringify(labels);
  env.ANALYTICS.writeDataPoint({
    blobs: [metric, serializedLabels],
    doubles: [value],
    indexes: [tenantIndex],
  });
}

// Usage in worker
export default {
async fetch(request: Request, env: Env): Promise<Response> {
const start = Date.now();

const response = await handleRequest(request, env);

await recordMetric(env, 'edge_request_latency', Date.now() - start, {
path: new URL(request.url).pathname,
status: response.status.toString(),
});

return response;
},
};

Edge Metrics Dashboard

| Metric | Description | Source |
|---|---|---|
| Requests | Total edge requests | Cloudflare Analytics |
| Bandwidth | Data transferred | Cloudflare Analytics |
| Cache Hit Rate | % served from cache | Cloudflare Analytics |
| Edge Latency | Time at edge | Workers Analytics |
| WAF Blocks | Blocked threats | Cloudflare Security |

DORA Metrics

Implementation

# .github/workflows/dora-metrics.yml
name: DORA Metrics

on:
  workflow_run:
    workflows: ["Deploy to Production"]
    types: [completed]

jobs:
  collect-metrics:
    runs-on: ubuntu-latest
    steps:
      # `id: frequency` is required so the final step can read
      # steps.frequency.outputs.deployment_frequency — the original
      # referenced that id without ever declaring it.
      - name: Calculate Deployment Frequency
        id: frequency
        run: |
          # Count completed deploy runs in the last 24 hours
          DEPLOYS=$(gh run list --workflow deploy.yml --created ">=$(date -d '24 hours ago' +%Y-%m-%d)" --json status --jq '[.[] | select(.status=="completed")] | length')
          echo "deployment_frequency=$DEPLOYS" >> $GITHUB_OUTPUT

      # NOTE(review): `git log` needs a checkout (actions/checkout)
      # earlier in the job — confirm before enabling.
      - name: Calculate Lead Time
        id: lead_time
        run: |
          # Time from last commit to this deploy
          COMMIT_TIME=$(git log -1 --format=%ct)
          DEPLOY_TIME=$(date +%s)
          LEAD_TIME=$((DEPLOY_TIME - COMMIT_TIME))
          echo "lead_time_seconds=$LEAD_TIME" >> $GITHUB_OUTPUT

      - name: Push to Cloud Monitoring
        # NOTE(review): `gcloud monitoring metrics create` is not a
        # documented gcloud command — confirm the intended mechanism
        # (custom metrics are normally written via the Monitoring API's
        # projects.timeSeries.create endpoint).
        run: |
          gcloud monitoring metrics create \
            custom.googleapis.com/dora/deployment_frequency \
            --value ${{ steps.frequency.outputs.deployment_frequency }}

DORA Dashboard

| Metric | Target | Current | Status |
|---|---|---|---|
| Deployment Frequency | Daily | Multiple/day | Elite |
| Lead Time for Changes | < 1 day | < 1 hour | Elite |
| Change Failure Rate | < 15% | ~5% | Elite |
| Time to Restore | < 1 hour | < 15 min | Elite |

Dashboards

Service Overview Dashboard

{
"displayName": "Olympus Service Overview",
"gridLayout": {
"columns": 3,
"widgets": [
{
"title": "Request Rate",
"xyChart": {
"dataSets": [{
"timeSeriesQuery": {
"timeSeriesFilter": {
"filter": "resource.type=\"cloud_run_revision\" AND metric.type=\"run.googleapis.com/request_count\"",
"aggregation": {
"alignmentPeriod": "60s",
"perSeriesAligner": "ALIGN_RATE"
}
}
}
}]
}
},
{
"title": "Latency (p99)",
"xyChart": {
"dataSets": [{
"timeSeriesQuery": {
"timeSeriesFilter": {
"filter": "resource.type=\"cloud_run_revision\" AND metric.type=\"run.googleapis.com/request_latencies\"",
"aggregation": {
"alignmentPeriod": "60s",
"perSeriesAligner": "ALIGN_PERCENTILE_99"
}
}
}
}]
}
},
{
"title": "Error Rate",
"xyChart": {
"dataSets": [{
"timeSeriesQuery": {
"timeSeriesFilterRatio": {
"numerator": {
"filter": "resource.type=\"cloud_run_revision\" AND metric.type=\"run.googleapis.com/request_count\" AND metric.labels.response_code_class=\"5xx\""
},
"denominator": {
"filter": "resource.type=\"cloud_run_revision\" AND metric.type=\"run.googleapis.com/request_count\""
}
}
}
}]
}
}
]
}
}

Business Metrics Dashboard

| Panel | Metric | Visualization |
|---|---|---|
| Orders Today | orders_created_total | Counter |
| Revenue Today | sum(order_value_dollars) | Counter |
| Avg Order Value | avg(order_value_dollars) | Gauge |
| Orders by Type | orders_created_total by type | Pie Chart |
| Hourly Orders | orders_created_total | Time Series |
| Top Items | items_sold_total | Bar Chart |

SLOs (Service Level Objectives)

Definition

# slos/platform-service.yaml
# Service level objectives, each evaluated over a rolling 30-day window.
# (Nesting restored — the flattened original was structurally ambiguous.)
slos:
  - name: availability
    description: "Platform service availability"
    target: 99.9%
    window: 30d
    indicator:
      type: availability
      good_events: "response_code < 500"   # any 5xx burns error budget
      total_events: "all requests"

  - name: latency
    description: "API response latency"
    target: 99%            # 99% of requests complete under the threshold
    window: 30d
    indicator:
      type: latency
      threshold: 500ms
      percentile: 99

  - name: error_rate
    description: "Error rate"
    target: 99.9%
    window: 30d
    indicator:
      type: error_rate
      good_events: "response_code < 500"

SLO Burn Rate Alerts

# Alert when burning error budget too fast.
# Multiwindow, multi-burn-rate alerting (Google SRE Workbook): a 14.4x
# burn rate exhausts a 30-day budget in ~2 days, a 6x rate in ~5 days.
# Pairing a long window with a short one avoids paging on brief blips
# while letting the alert clear quickly once the incident ends.
alerts:
  - name: high_burn_rate
    condition: |
      (error_rate_1h > 14.4 * (1 - slo_target)) AND
      (error_rate_5m > 14.4 * (1 - slo_target))
    severity: critical

  - name: medium_burn_rate
    condition: |
      (error_rate_6h > 6 * (1 - slo_target)) AND
      (error_rate_30m > 6 * (1 - slo_target))
    severity: warning

Metrics Endpoints

Prometheus Format

// src/routes/metrics.rs
use axum::{routing::get, Router};
use prometheus::Encoder;

/// Router exposing the metrics endpoints:
/// - `/metrics`      — Prometheus text exposition format
/// - `/metrics/json` — JSON rendering (`json_metrics` is defined
///   elsewhere in this module)
pub fn metrics_routes() -> Router {
    Router::new()
        .route("/metrics", get(prometheus_metrics))
        .route("/metrics/json", get(json_metrics))
}

/// Handler for `/metrics`: render every metric family registered in
/// `REGISTRY` using the Prometheus text exposition format.
async fn prometheus_metrics() -> impl IntoResponse {
    let encoder = prometheus::TextEncoder::new();
    let metric_families = REGISTRY.gather();
    let mut buffer = Vec::new();
    encoder
        .encode(&metric_families, &mut buffer)
        .expect("encoding metrics into an in-memory Vec cannot fail");

    (
        // Advertise the Prometheus exposition content type
        // ("text/plain; version=0.0.4") so scrapers select the right
        // parser, rather than a hand-written charset-only value.
        [(header::CONTENT_TYPE, prometheus::TEXT_FORMAT)],
        buffer,
    )
}