Skip to main content

Alerting

Configure and manage alerts for Olympus Cloud platform health.

Overview

The alerting system provides proactive incident detection:

| Component | Technology | Purpose |
|---|---|---|
| Detection | Cloud Monitoring | Metric-based alerts |
| Routing | Alert Policies | Notification channels |
| Escalation | PagerDuty | On-call management |
| Communication | Slack/Email | Team notifications |
| Tracking | Incident Management | Resolution workflow |

Alert Architecture

┌─────────────────────────────────────────────────────────────────┐
│ Monitoring Data │
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
│ │ Metrics │ │ Logs │ │ Traces │ │ Uptime │ │
│ └────┬─────┘ └────┬─────┘ └────┬─────┘ └────┬─────┘ │
└───────┼─────────────┼─────────────┼─────────────┼───────────────┘
│ │ │ │
└─────────────┴──────┬──────┴─────────────┘

┌──────────────▼───────────────┐
│ Alert Policies │
│ • Threshold conditions │
│ • Duration windows │
│ • Notification channels │
└──────────────┬───────────────┘

┌───────────────────┼───────────────────┐
│ │ │
▼ ▼ ▼
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
│ PagerDuty │ │ Slack │ │ Email │
│ (P1 Critical) │ │ (P2/P3 Alerts) │ │ (All Alerts) │
└─────────────────┘ └─────────────────┘ └─────────────────┘

Alert Policies

Infrastructure Alerts

# terraform/alerting.tf

# High Error Rate
# High Error Rate
# Pages on-call (PagerDuty + #alerts-critical) when more than 1% of
# platform-service requests return 5xx for 5 minutes.
resource "google_monitoring_alert_policy" "high_error_rate" {
  display_name = "High Error Rate - Platform Service"
  combiner     = "OR"

  conditions {
    display_name = "Error rate > 1%"

    # A threshold condition with ALIGN_RATE on request_count filtered to
    # 5xx measures absolute requests/second, not a percentage of traffic
    # (0.01 would mean one error every 100 seconds). An MQL ratio of 5xx
    # responses to all responses makes the 0.01 threshold genuinely "1%".
    condition_monitoring_query_language {
      query = <<-EOT
        fetch cloud_run_revision
        | metric 'run.googleapis.com/request_count'
        | filter resource.service_name == 'platform-service'
        | align rate(1m)
        | every 1m
        | { filter metric.response_code_class == '5xx' ; ident }
        | group_by [], [value_request_count_aggregate: aggregate(value.request_count)]
        | ratio
        | condition val() > 0.01 '1'
      EOT

      duration = "300s" # sustained for 5 minutes

      trigger {
        count = 1
      }
    }
  }

  notification_channels = [
    google_monitoring_notification_channel.pagerduty.name,
    google_monitoring_notification_channel.slack_critical.name,
  ]

  alert_strategy {
    auto_close = "1800s" # auto-close 30 minutes after the condition clears
  }

  documentation {
    content   = <<-EOT
      ## High Error Rate Alert

      The platform service is experiencing elevated error rates.

      ### Investigation Steps
      1. Check Cloud Logging for error details
      2. Verify database connectivity
      3. Check recent deployments

      ### Runbook
      See: https://docs.olympuscloud.ai/runbooks/high-error-rate
    EOT
    mime_type = "text/markdown"
  }
}

# High Latency
# High Latency
# Sends a warning to #alerts-warnings when API gateway P99 latency
# stays above 500 ms for 5 minutes.
resource "google_monitoring_alert_policy" "high_latency" {
  combiner     = "OR"
  display_name = "High Latency - API Gateway"

  conditions {
    display_name = "P99 latency > 500ms"

    condition_threshold {
      # request_latencies is a distribution metric reported in
      # milliseconds; the P99 aligner extracts the 99th percentile.
      filter = <<-EOT
        resource.type = "cloud_run_revision"
        AND resource.labels.service_name = "api-gateway"
        AND metric.type = "run.googleapis.com/request_latencies"
      EOT

      comparison      = "COMPARISON_GT"
      threshold_value = 500    # milliseconds
      duration        = "300s" # sustained for 5 minutes

      aggregations {
        alignment_period   = "60s"
        per_series_aligner = "ALIGN_PERCENTILE_99"
      }

      trigger {
        count = 1
      }
    }
  }

  notification_channels = [
    google_monitoring_notification_channel.slack_warnings.name,
  ]
}

# Database CPU
# Database CPU
# Pages on-call when Cloud Spanner CPU utilization exceeds the 65%
# headroom target for 10 minutes.
resource "google_monitoring_alert_policy" "spanner_cpu" {
  combiner     = "OR"
  display_name = "High CPU - Cloud Spanner"

  conditions {
    display_name = "CPU utilization > 65%"

    condition_threshold {
      # Utilization is a fraction, so 0.65 corresponds to 65%.
      filter = <<-EOT
        resource.type = "spanner_instance"
        AND metric.type = "spanner.googleapis.com/instance/cpu/utilization"
      EOT

      comparison      = "COMPARISON_GT"
      threshold_value = 0.65
      duration        = "600s" # sustained for 10 minutes

      aggregations {
        alignment_period   = "300s"
        per_series_aligner = "ALIGN_MEAN"
      }

      trigger {
        count = 1
      }
    }
  }

  notification_channels = [
    google_monitoring_notification_channel.pagerduty.name,
  ]
}

Application Alerts

# Order Processing Failed
# Order Processing Failed
# Pages on-call when order failures exceed 10 per minute.
resource "google_monitoring_alert_policy" "order_failures" {
  display_name = "Order Processing Failures"
  combiner     = "OR"

  conditions {
    display_name = "Order failures > 10/min"

    condition_threshold {
      filter = <<-EOT
        resource.type = "cloud_run_revision"
        AND metric.type = "custom.googleapis.com/olympus/order_failures_total"
      EOT

      # ALIGN_DELTA over a 60s window yields failures *per minute*, which
      # matches the "> 10/min" condition. The original ALIGN_RATE yields
      # failures/second, so a threshold of 10 would have meant 600/min.
      aggregations {
        alignment_period   = "60s"
        per_series_aligner = "ALIGN_DELTA"
      }

      comparison      = "COMPARISON_GT"
      threshold_value = 10    # failures per 60s window
      duration        = "60s"

      trigger {
        count = 1
      }
    }
  }

  notification_channels = [
    google_monitoring_notification_channel.pagerduty.name,
    google_monitoring_notification_channel.slack_critical.name,
  ]
}

# Payment Processing
# Payment Processing
# Pages on-call when more than 5% of payment attempts fail.
resource "google_monitoring_alert_policy" "payment_failures" {
  display_name = "Payment Processing Failures"
  combiner     = "OR"

  conditions {
    display_name = "Payment failure rate > 5%"

    # The query must compute failed/total as a ratio. Filtering to
    # status == 'failed' alone (as originally written) compares an
    # absolute failures-per-second rate against 0.05, which is not a
    # failure *percentage*.
    condition_monitoring_query_language {
      query = <<-EOT
        fetch cloud_run_revision
        | metric 'custom.googleapis.com/olympus/payment_processed_total'
        | align rate(1m)
        | every 1m
        | { filter metric.status == 'failed' ; ident }
        | group_by [], [value_payment_processed_total_aggregate: aggregate(value.payment_processed_total)]
        | ratio
        | condition val() > 0.05 '1'
      EOT

      duration = "300s" # sustained for 5 minutes

      trigger {
        count = 1
      }
    }
  }

  notification_channels = [
    google_monitoring_notification_channel.pagerduty.name,
  ]
}

SLO-Based Alerts

# SLO Burn Rate Alert
# SLO Burn Rate Alert
# Multiwindow burn-rate policy for the platform availability SLO: the
# fast 1h window catches sudden error-budget burn, the slow 6h window
# catches sustained low-grade burn. Either condition pages on-call.
resource "google_monitoring_alert_policy" "slo_burn_rate" {
  display_name = "SLO Burn Rate - Platform Availability"
  combiner     = "OR"

  # Fast burn: budget consumed at 10x the sustainable rate over 1 hour.
  conditions {
    display_name = "Fast burn rate (1h window)"

    condition_threshold {
      filter = <<-EOT
        select_slo_burn_rate(
          "projects/olympuscloud-prod/services/platform-service/serviceLevelObjectives/availability-slo",
          "3600s"
        )
      EOT

      comparison      = "COMPARISON_GT"
      threshold_value = 10   # 10x normal burn rate
      duration        = "0s" # fire as soon as the window breaches

      trigger {
        count = 1
      }
    }
  }

  # Slow burn: budget consumed at 2x the sustainable rate over 6 hours.
  conditions {
    display_name = "Slow burn rate (6h window)"

    condition_threshold {
      filter = <<-EOT
        select_slo_burn_rate(
          "projects/olympuscloud-prod/services/platform-service/serviceLevelObjectives/availability-slo",
          "21600s"
        )
      EOT

      comparison      = "COMPARISON_GT"
      threshold_value = 2    # 2x normal burn rate
      duration        = "0s"

      trigger {
        count = 1
      }
    }
  }

  notification_channels = [
    google_monitoring_notification_channel.pagerduty.name,
  ]
}

Notification Channels

Channel Configuration

# PagerDuty
# PagerDuty
# Primary paging channel for the platform team.
resource "google_monitoring_notification_channel" "pagerduty" {
  display_name = "PagerDuty - Platform Team"
  type         = "pagerduty"

  # The PagerDuty integration key is a credential. Placing it in plain
  # `labels` exposes it in plan output and API reads; `sensitive_labels`
  # (which supports `service_key`) keeps it redacted — consistent with
  # how the Slack channels handle `auth_token`.
  sensitive_labels {
    service_key = var.pagerduty_service_key
  }
}

# Slack Critical
# Slack Critical
# Delivers critical alerts to the #alerts-critical channel.
resource "google_monitoring_notification_channel" "slack_critical" {
  type         = "slack"
  display_name = "Slack - #alerts-critical"

  labels = {
    channel_name = "#alerts-critical"
  }

  # The Slack token is a secret; sensitive_labels keeps it out of plan
  # output and API reads.
  sensitive_labels {
    auth_token = var.slack_webhook_token
  }
}

# Slack Warnings
# Slack Warnings
# Delivers non-critical warnings to the #alerts-warnings channel.
resource "google_monitoring_notification_channel" "slack_warnings" {
  type         = "slack"
  display_name = "Slack - #alerts-warnings"

  labels = {
    channel_name = "#alerts-warnings"
  }

  # Same shared Slack token as the critical channel, kept sensitive.
  sensitive_labels {
    auth_token = var.slack_webhook_token
  }
}

# Email
# Email
# Team mailing-list channel; per the architecture diagram it receives
# all alerts.
resource "google_monitoring_notification_channel" "email" {
  type         = "email"
  display_name = "Email - Platform Team"

  labels = {
    email_address = "platform-team@olympuscloud.ai"
  }
}

Routing Rules

| Severity | Channel | Response Time |
|---|---|---|
| P1 - Critical | PagerDuty + Slack #critical | 5 minutes |
| P2 - High | Slack #alerts + Email | 30 minutes |
| P3 - Medium | Slack #alerts | 4 hours |
| P4 - Low | Email only | Next business day |

Alert Severity Matrix

Severity Definitions

| Severity | Impact | Examples |
|---|---|---|
| P1 | Complete service outage | Database down, all APIs failing |
| P2 | Degraded service | High latency, elevated errors |
| P3 | Partial impact | Single feature broken |
| P4 | Minimal impact | Non-critical warnings |

Alert Classifications

| Alert | Severity | Threshold | Duration |
|---|---|---|---|
| All services down | P1 | Error rate 100% | Immediate |
| Error rate > 5% | P1 | Error rate > 5% | 5 min |
| Error rate > 1% | P2 | Error rate > 1% | 5 min |
| P99 latency > 1s | P2 | Latency > 1000ms | 5 min |
| P99 latency > 500ms | P3 | Latency > 500ms | 10 min |
| Database CPU > 80% | P2 | CPU > 80% | 10 min |
| Database CPU > 65% | P3 | CPU > 65% | 15 min |
| Disk > 90% | P2 | Storage > 90% | Immediate |
| Certificate expiring | P3 | < 14 days | Daily |

PagerDuty Integration

Service Configuration

# pagerduty/services.yaml
# pagerduty/services.yaml
# PagerDuty service + escalation-policy definitions for the platform
# team. Indentation restored: the flattened version was invalid YAML.
services:
  - name: olympus-platform
    description: Olympus Cloud Platform Services
    escalation_policy: platform-team
    alert_creation: create_alerts_and_incidents
    auto_resolve_timeout: 14400      # 4 hours
    acknowledgement_timeout: 1800    # 30 minutes

    integrations:
      - type: events_api_v2
        name: Cloud Monitoring

    incident_urgency_rule:
      type: constant
      urgency: high

escalation_policies:
  - name: platform-team
    repeat_enabled: true
    num_loops: 3    # cycle through the rules up to 3 times
    rules:
      # Primary on-call is paged first; 5 minutes to acknowledge.
      - escalation_delay_in_minutes: 5
        targets:
          - type: schedule_reference
            id: primary-oncall
      # Secondary on-call after 15 minutes without acknowledgement.
      - escalation_delay_in_minutes: 15
        targets:
          - type: schedule_reference
            id: secondary-oncall
      # Engineering manager is the final escalation target.
      - escalation_delay_in_minutes: 30
        targets:
          - type: user_reference
            id: engineering-manager

On-Call Schedules

# pagerduty/schedules.yaml
# pagerduty/schedules.yaml
# Weekly on-call rotations. The secondary layer rotates through the
# same four engineers, offset by one week so nobody holds primary and
# secondary simultaneously. Indentation restored: the flattened
# version was invalid YAML.
schedules:
  - name: primary-oncall
    time_zone: America/Los_Angeles
    layers:
      - name: Primary
        rotation_virtual_start: "2026-01-01T08:00:00-08:00"
        rotation_turn_length_seconds: 604800    # 1 week
        users:
          - engineer-1
          - engineer-2
          - engineer-3
          - engineer-4

  - name: secondary-oncall
    time_zone: America/Los_Angeles
    layers:
      - name: Secondary
        rotation_virtual_start: "2026-01-01T08:00:00-08:00"
        rotation_turn_length_seconds: 604800
        start_offset: 604800    # Offset by 1 week
        users:
          - engineer-1
          - engineer-2
          - engineer-3
          - engineer-4

Incident Response

Response Workflow

Alert Triggered


┌─────────────────┐
│ Acknowledge │ ← Within 5 min (P1) / 30 min (P2)
│ in PagerDuty │
└────────┬────────┘


┌─────────────────┐
│ Assess Impact │
│ & Severity │
└────────┬────────┘

├─── P1 ──→ Start Incident Call
│ Create Status Page


┌─────────────────┐
│ Investigate │
│ Root Cause │
└────────┬────────┘


┌─────────────────┐
│ Apply Fix │
│ or Mitigation │
└────────┬────────┘


┌─────────────────┐
│ Verify │
│ Resolution │
└────────┬────────┘


┌─────────────────┐
│ Resolve Alert │
│ Update Status │
└────────┬────────┘


┌─────────────────┐
│ Post-Incident │
│ Review (P1/P2) │
└─────────────────┘

Incident Communication Template

## Incident: [Title]

**Status**: Investigating | Identified | Monitoring | Resolved
**Severity**: P1 | P2 | P3
**Started**: YYYY-MM-DD HH:MM UTC
**Resolved**: YYYY-MM-DD HH:MM UTC

### Impact
[Description of customer impact]

### Timeline
- HH:MM - Alert triggered
- HH:MM - Acknowledged by [Name]
- HH:MM - Root cause identified
- HH:MM - Fix deployed
- HH:MM - Verified resolution

### Root Cause
[Description of what caused the incident]

### Resolution
[Description of how the incident was resolved]

### Action Items
- [ ] Item 1
- [ ] Item 2

Uptime Checks

Configuration

# terraform/uptime.tf

# HTTPS health probe against the public API gateway, run once a minute
# from each selected region.
resource "google_monitoring_uptime_check_config" "api_health" {
  display_name = "API Gateway Health Check"
  period       = "60s" # probe interval per region
  timeout      = "10s" # per-probe timeout

  http_check {
    path         = "/health"
    port         = 443
    use_ssl      = true
    validate_ssl = true # fail the check on certificate problems
  }

  monitored_resource {
    type = "uptime_url"
    labels = {
      host       = "api.olympuscloud.ai"
      project_id = var.project_id
    }
  }

  # The /health response body must contain "OK" for the probe to pass.
  content_matchers {
    content = "OK"
    matcher = "CONTAINS_STRING"
  }

  # Probe from three continents so a single-region network issue does
  # not look like a global outage.
  selected_regions = [
    "USA",
    "EUROPE",
    "ASIA_PACIFIC",
  ]
}

# Alert for uptime check failure
# Alert for uptime check failure
# Pages on-call when the api_health uptime check fails from more than
# one probe region.
resource "google_monitoring_alert_policy" "uptime_failure" {
  combiner     = "OR"
  display_name = "API Gateway Uptime Failure"

  conditions {
    display_name = "Uptime check failed"

    condition_threshold {
      filter = <<-EOT
        resource.type = "uptime_url"
        AND metric.type = "monitoring.googleapis.com/uptime_check/check_passed"
        AND metric.labels.check_id = "${google_monitoring_uptime_check_config.api_health.uptime_check_id}"
      EOT

      # ALIGN_NEXT_OLDER takes each region's most recent result and
      # REDUCE_COUNT_FALSE counts regions whose check failed; firing
      # only above 1 avoids paging on a single-region blip.
      aggregations {
        alignment_period     = "60s"
        per_series_aligner   = "ALIGN_NEXT_OLDER"
        cross_series_reducer = "REDUCE_COUNT_FALSE"
        group_by_fields      = ["resource.label.host"]
      }

      comparison      = "COMPARISON_GT"
      threshold_value = 1
      duration        = "60s"
    }
  }

  notification_channels = [
    google_monitoring_notification_channel.pagerduty.name,
  ]
}

Alert Suppression

Maintenance Windows

# Suppress alerts during maintenance
# Suppress alerts during maintenance
# NOTE(review): this is an illustrative fragment, not a complete policy.
resource "google_monitoring_alert_policy" "with_maintenance" {
# ... alert configuration ...

alert_strategy {
# NOTE(review): notification_rate_limit is only honored for log-based
# alert policies in Cloud Monitoring — confirm this policy uses a
# log-match condition before relying on it.
notification_rate_limit {
period = "300s" # Max 1 notification per 5 min
}

# NOTE(review): notification_channel_strategy controls how often an
# already-open incident re-notifies its channels; it does NOT silence
# alerts during maintenance windows. Use a monitoring snooze
# (gcloud alpha monitoring snoozes) for actual suppression.
notification_channel_strategy {
notification_channel_names = [
google_monitoring_notification_channel.pagerduty.name,
]
renotify_interval = "3600s"
}
}
}

Snooze Configuration

warning

Alert suppression and snooze windows should be used sparingly and only during planned maintenance. Always set a defined end time and never snooze P1 alert policies. Forgotten snooze windows are a common cause of missed production incidents.

# Snooze alerts via gcloud
# Creates a time-bounded suppression window for one alert policy
# (replace 123456 with the real policy ID). Always supply --end-time:
# a forgotten open-ended snooze hides real incidents.
gcloud alpha monitoring snoozes create \
--display-name="Deployment maintenance" \
--criteria-policies="projects/olympuscloud-prod/alertPolicies/123456" \
--start-time="2026-01-20T02:00:00Z" \
--end-time="2026-01-20T04:00:00Z"

Testing Alerts

Alert Testing

# Send test alert to PagerDuty
# Uses the Events API v2. Replace YOUR_ROUTING_KEY with the integration
# key from the service's Cloud Monitoring integration; "severity":
# "warning" keeps the test from paging as critical.
curl -X POST https://events.pagerduty.com/v2/enqueue \
-H "Content-Type: application/json" \
-d '{
"routing_key": "YOUR_ROUTING_KEY",
"event_action": "trigger",
"dedup_key": "test-alert-123",
"payload": {
"summary": "Test Alert - Please Ignore",
"severity": "warning",
"source": "manual-test"
}
}'

# Resolve test alert
# dedup_key must match the trigger event exactly so PagerDuty resolves
# the same incident instead of opening a new one.
curl -X POST https://events.pagerduty.com/v2/enqueue \
-H "Content-Type: application/json" \
-d '{
"routing_key": "YOUR_ROUTING_KEY",
"event_action": "resolve",
"dedup_key": "test-alert-123"
}'