Alerting
Configure and manage alerts for Olympus Cloud platform health.
Overview
The alerting system provides proactive incident detection:
| Component | Technology | Purpose |
|---|---|---|
| Detection | Cloud Monitoring | Metric-based alerts |
| Routing | Alert Policies | Notification channels |
| Escalation | PagerDuty | On-call management |
| Communication | Slack/Email | Team notifications |
| Tracking | Incident Management | Resolution workflow |
Alert Architecture
┌─────────────────────────────────────────────────────────────────┐
│ Monitoring Data │
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
│ │ Metrics │ │ Logs │ │ Traces │ │ Uptime │ │
│ └────┬─────┘ └────┬─────┘ └────┬─────┘ └────┬─────┘ │
└───────┼─────────────┼─────────────┼─────────────┼───────────────┘
│ │ │ │
└─────────────┴──────┬──────┴─────────────┘
│
┌──────────────▼───────────────┐
│ Alert Policies │
│ • Threshold conditions │
│ • Duration windows │
│ • Notification channels │
└──────────────┬───────────────┘
│
┌───────────────────┼───────────────────┐
│ │ │
▼ ▼ ▼
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
│ PagerDuty │ │ Slack │ │ Email │
│ (P1 Critical) │ │ (P2/P3 Alerts) │ │ (All Alerts) │
└─────────────────┘ └─────────────────┘ └─────────────────┘
Alert Policies
Infrastructure Alerts
# terraform/alerting.tf
# High Error Rate
#
# Pages (PagerDuty + #alerts-critical) when 5xx responses exceed 1% of
# ALL platform-service requests for 5 consecutive minutes.
# NOTE: the previous version compared the raw 5xx request rate
# (requests/second, via ALIGN_RATE) against 0.01, which is not a
# percentage. The denominator below turns the compared value into a
# true 5xx/total ratio so threshold_value = 0.01 really means 1%.
resource "google_monitoring_alert_policy" "high_error_rate" {
  display_name = "High Error Rate - Platform Service"
  combiner     = "OR"

  conditions {
    display_name = "Error rate > 1%"

    condition_threshold {
      # Numerator: 5xx responses only.
      filter = <<-EOT
        resource.type = "cloud_run_revision"
        AND resource.labels.service_name = "platform-service"
        AND metric.type = "run.googleapis.com/request_count"
        AND metric.labels.response_code_class = "5xx"
      EOT

      # Denominator: all responses for the same service, so the
      # evaluated value is (5xx rate) / (total rate).
      denominator_filter = <<-EOT
        resource.type = "cloud_run_revision"
        AND resource.labels.service_name = "platform-service"
        AND metric.type = "run.googleapis.com/request_count"
      EOT

      aggregations {
        alignment_period   = "60s"
        per_series_aligner = "ALIGN_RATE"
      }

      denominator_aggregations {
        alignment_period   = "60s"
        per_series_aligner = "ALIGN_RATE"
      }

      comparison      = "COMPARISON_GT"
      threshold_value = 0.01   # 1% of requests
      duration        = "300s" # must be sustained for 5 minutes
      trigger {
        count = 1
      }
    }
  }

  notification_channels = [
    google_monitoring_notification_channel.pagerduty.name,
    google_monitoring_notification_channel.slack_critical.name,
  ]

  alert_strategy {
    auto_close = "1800s" # auto-resolve 30 minutes after the condition clears
  }

  documentation {
    content   = <<-EOT
      ## High Error Rate Alert
      The platform service is experiencing elevated error rates.
      ### Investigation Steps
      1. Check Cloud Logging for error details
      2. Verify database connectivity
      3. Check recent deployments
      ### Runbook
      See: https://docs.olympuscloud.ai/runbooks/high-error-rate
    EOT
    mime_type = "text/markdown"
  }
}
# High Latency
# Warns #alerts-warnings when the API gateway's P99 request latency
# stays above 500 ms for 5 consecutive minutes.
# run.googleapis.com/request_latencies is reported in milliseconds, so
# threshold_value = 500 is 500 ms.
resource "google_monitoring_alert_policy" "high_latency" {
display_name = "High Latency - API Gateway"
combiner = "OR"
conditions {
display_name = "P99 latency > 500ms"
condition_threshold {
filter = <<-EOT
resource.type = "cloud_run_revision"
AND resource.labels.service_name = "api-gateway"
AND metric.type = "run.googleapis.com/request_latencies"
EOT
# ALIGN_PERCENTILE_99 collapses each 60s window to its P99 value
# before the threshold comparison.
aggregations {
alignment_period = "60s"
per_series_aligner = "ALIGN_PERCENTILE_99"
}
comparison = "COMPARISON_GT"
threshold_value = 500 # 500ms
duration = "300s"
trigger {
count = 1
}
}
}
# Warning severity: Slack only, no paging.
notification_channels = [
google_monitoring_notification_channel.slack_warnings.name,
]
}
# Database CPU
# Pages PagerDuty when mean Spanner instance CPU utilization exceeds
# 65% for 10 minutes. The metric is a 0-1 fraction, so 0.65 = 65%.
# NOTE(review): the Alert Classifications table in this document lists
# "Database CPU > 65%" as P3, which the routing table maps to
# Slack-only — but this policy pages PagerDuty. Confirm the intended
# routing.
resource "google_monitoring_alert_policy" "spanner_cpu" {
display_name = "High CPU - Cloud Spanner"
combiner = "OR"
conditions {
display_name = "CPU utilization > 65%"
condition_threshold {
filter = <<-EOT
resource.type = "spanner_instance"
AND metric.type = "spanner.googleapis.com/instance/cpu/utilization"
EOT
# 5-minute mean smooths transient spikes before comparison.
aggregations {
alignment_period = "300s"
per_series_aligner = "ALIGN_MEAN"
}
comparison = "COMPARISON_GT"
threshold_value = 0.65
duration = "600s"
trigger {
count = 1
}
}
}
notification_channels = [
google_monitoring_notification_channel.pagerduty.name,
]
}
Application Alerts
# Order Processing Failed
#
# Pages when order failures exceed ~10 per minute for 1 minute.
# NOTE: ALIGN_RATE produces a per-SECOND rate, so "10 failures/minute"
# is 10/60 ≈ 0.167 failures/second. The previous threshold_value of 10
# would only have fired at 10 failures per second (600/min).
resource "google_monitoring_alert_policy" "order_failures" {
  display_name = "Order Processing Failures"
  combiner     = "OR"

  conditions {
    display_name = "Order failures > 10/min"

    condition_threshold {
      filter = <<-EOT
        resource.type = "cloud_run_revision"
        AND metric.type = "custom.googleapis.com/olympus/order_failures_total"
      EOT

      aggregations {
        alignment_period   = "60s"
        per_series_aligner = "ALIGN_RATE"
      }

      comparison      = "COMPARISON_GT"
      threshold_value = 0.167 # ~10 failures/minute, expressed per second
      duration        = "60s"
      trigger {
        count = 1
      }
    }
  }

  notification_channels = [
    google_monitoring_notification_channel.pagerduty.name,
    google_monitoring_notification_channel.slack_critical.name,
  ]
}
# Payment Processing
#
# Pages when more than 5% of payment attempts fail, sustained for
# 5 minutes.
# NOTE: the previous MQL compared the raw failed-payment rate
# (events/second) against 0.05, which is not a percentage. The query
# below divides the 'failed' rate by the total rate so val() is a true
# failure ratio in [0, 1].
resource "google_monitoring_alert_policy" "payment_failures" {
  display_name = "Payment Processing Failures"
  combiner     = "OR"

  conditions {
    display_name = "Payment failure rate > 5%"

    condition_monitoring_query_language {
      query = <<-EOT
        fetch cloud_run_revision
        | metric 'custom.googleapis.com/olympus/payment_processed_total'
        | align rate(1m)
        | every 1m
        | { filter metric.status == 'failed'
          ; ident }
        | group_by [], [value_rate: aggregate(value.payment_processed_total)]
        | ratio
        | condition val() > 0.05
      EOT
      duration = "300s"
      trigger {
        count = 1
      }
    }
  }

  notification_channels = [
    google_monitoring_notification_channel.pagerduty.name,
  ]
}
SLO-Based Alerts
# SLO Burn Rate Alert
# Multiwindow burn-rate alerting on the platform availability SLO.
# Burn rate measures how fast the error budget is being consumed
# relative to the SLO target (1x = budget exactly exhausted at the end
# of the SLO period). combiner = "OR" fires the policy if EITHER
# window's condition is met:
#   - fast burn (10x over 1h) catches sudden, severe breakage quickly;
#   - slow burn (2x over 6h) catches sustained low-grade budget drain.
resource "google_monitoring_alert_policy" "slo_burn_rate" {
display_name = "SLO Burn Rate - Platform Availability"
combiner = "OR"
conditions {
display_name = "Fast burn rate (1h window)"
condition_threshold {
filter = <<-EOT
select_slo_burn_rate(
"projects/olympuscloud-prod/services/platform-service/serviceLevelObjectives/availability-slo",
"3600s"
)
EOT
comparison = "COMPARISON_GT"
threshold_value = 10 # 10x normal burn rate
# duration 0s: fire as soon as the windowed burn rate crosses 10x.
duration = "0s"
trigger {
count = 1
}
}
}
conditions {
display_name = "Slow burn rate (6h window)"
condition_threshold {
filter = <<-EOT
select_slo_burn_rate(
"projects/olympuscloud-prod/services/platform-service/serviceLevelObjectives/availability-slo",
"21600s"
)
EOT
comparison = "COMPARISON_GT"
threshold_value = 2 # 2x normal burn rate
duration = "0s"
trigger {
count = 1
}
}
}
notification_channels = [
google_monitoring_notification_channel.pagerduty.name,
]
}
Notification Channels
Channel Configuration
# PagerDuty
#
# Notification channel for P1 paging via the PagerDuty Events
# integration.
# NOTE: the integration key is a credential. Keeping it in plain
# `labels` stores it in cleartext in Terraform state and plan output;
# `sensitive_labels` is the provider-supported way to mark it
# sensitive (it supports `service_key` for pagerduty-type channels).
resource "google_monitoring_notification_channel" "pagerduty" {
  display_name = "PagerDuty - Platform Team"
  type         = "pagerduty"

  sensitive_labels {
    service_key = var.pagerduty_service_key
  }
}
# Slack Critical
# Delivery target for critical (P1/P2) alerts in #alerts-critical.
# The Slack auth token is kept in sensitive_labels so it is masked in
# Terraform plan output instead of stored as a plain label.
resource "google_monitoring_notification_channel" "slack_critical" {
display_name = "Slack - #alerts-critical"
type = "slack"
labels = {
channel_name = "#alerts-critical"
}
sensitive_labels {
auth_token = var.slack_webhook_token
}
}
# Slack Warnings
# Delivery target for warning-level (non-paging) alerts in
# #alerts-warnings; same token handling as the critical channel.
resource "google_monitoring_notification_channel" "slack_warnings" {
display_name = "Slack - #alerts-warnings"
type = "slack"
labels = {
channel_name = "#alerts-warnings"
}
sensitive_labels {
auth_token = var.slack_webhook_token
}
}
# Email
# Email channel for the platform team. Per the architecture diagram
# and routing table in this document, email receives all alert
# severities.
resource "google_monitoring_notification_channel" "email" {
display_name = "Email - Platform Team"
type = "email"
labels = {
email_address = "platform-team@olympuscloud.ai"
}
}
Routing Rules
| Severity | Channel | Response Time |
|---|---|---|
| P1 - Critical | PagerDuty + Slack #critical | 5 minutes |
| P2 - High | Slack #alerts + Email | 30 minutes |
| P3 - Medium | Slack #alerts | 4 hours |
| P4 - Low | Email only | Next business day |
Alert Severity Matrix
Severity Definitions
| Severity | Impact | Examples |
|---|---|---|
| P1 | Complete service outage | Database down, all APIs failing |
| P2 | Degraded service | High latency, elevated errors |
| P3 | Partial impact | Single feature broken |
| P4 | Minimal impact | Non-critical warnings |
Alert Classifications
| Alert | Severity | Threshold | Duration |
|---|---|---|---|
| All services down | P1 | Error rate 100% | Immediate |
| Error rate > 5% | P1 | Error rate > 5% | 5 min |
| Error rate > 1% | P2 | Error rate > 1% | 5 min |
| P99 latency > 1s | P2 | Latency > 1000ms | 5 min |
| P99 latency > 500ms | P3 | Latency > 500ms | 10 min |
| Database CPU > 80% | P2 | CPU > 80% | 10 min |
| Database CPU > 65% | P3 | CPU > 65% | 15 min |
| Disk > 90% | P2 | Disk usage > 90% | Immediate |
| Certificate expiring | P3 | < 14 days | Daily |
PagerDuty Integration
Service Configuration
# pagerduty/services.yaml
# PagerDuty service and escalation-policy definitions for the
# Olympus platform team.
services:
- name: olympus-platform
description: Olympus Cloud Platform Services
escalation_policy: platform-team
# Create both alerts and incidents so individual alerts can be
# grouped and triaged within an incident.
alert_creation: create_alerts_and_incidents
auto_resolve_timeout: 14400 # 4 hours
acknowledgement_timeout: 1800 # 30 minutes
integrations:
# Events API v2 endpoint that Cloud Monitoring posts alerts to.
- type: events_api_v2
name: Cloud Monitoring
# All incidents from this service are treated as high urgency.
incident_urgency_rule:
type: constant
urgency: high
escalation_policies:
- name: platform-team
# Loop through the full escalation chain up to 3 times if no one
# acknowledges.
repeat_enabled: true
num_loops: 3
rules:
# 1) Primary on-call is notified first; escalate after 5 minutes
#    without acknowledgement.
- escalation_delay_in_minutes: 5
targets:
- type: schedule_reference
id: primary-oncall
# 2) Secondary on-call after a further 15 minutes.
- escalation_delay_in_minutes: 15
targets:
- type: schedule_reference
id: secondary-oncall
# 3) Engineering manager as the final escalation target.
- escalation_delay_in_minutes: 30
targets:
- type: user_reference
id: engineering-manager
On-Call Schedules
# pagerduty/schedules.yaml
# Weekly on-call rotations. Both schedules cycle through the same four
# engineers; the secondary layer is offset by one week so the same
# person does not hold primary and secondary simultaneously.
schedules:
- name: primary-oncall
time_zone: America/Los_Angeles
layers:
- name: Primary
# Anchor point for the rotation; each shift lasts one week.
rotation_virtual_start: "2026-01-01T08:00:00-08:00"
rotation_turn_length_seconds: 604800 # 1 week
users:
- engineer-1
- engineer-2
- engineer-3
- engineer-4
- name: secondary-oncall
time_zone: America/Los_Angeles
layers:
- name: Secondary
rotation_virtual_start: "2026-01-01T08:00:00-08:00"
rotation_turn_length_seconds: 604800
start_offset: 604800 # Offset by 1 week
users:
- engineer-1
- engineer-2
- engineer-3
- engineer-4
Incident Response
Response Workflow
Alert Triggered
│
▼
┌─────────────────┐
│ Acknowledge │ ← Within 5 min (P1) / 30 min (P2)
│ in PagerDuty │
└────────┬────────┘
│
▼
┌─────────────────┐
│ Assess Impact │
│ & Severity │
└────────┬────────┘
│
├─── P1 ──→ Start Incident Call
│ Create Status Page
│
▼
┌─────────────────┐
│ Investigate │
│ Root Cause │
└────────┬────────┘
│
▼
┌─────────────────┐
│ Apply Fix │
│ or Mitigation │
└────────┬────────┘
│
▼
┌─────────────────┐
│ Verify │
│ Resolution │
└────────┬────────┘
│
▼
┌─────────────────┐
│ Resolve Alert │
│ Update Status │
└────────┬────────┘
│
▼
┌─────────────────┐
│ Post-Incident │
│ Review (P1/P2) │
└─────────────────┘
Incident Communication Template
## Incident: [Title]
**Status**: Investigating | Identified | Monitoring | Resolved
**Severity**: P1 | P2 | P3
**Started**: YYYY-MM-DD HH:MM UTC
**Resolved**: YYYY-MM-DD HH:MM UTC
### Impact
[Description of customer impact]
### Timeline
- HH:MM - Alert triggered
- HH:MM - Acknowledged by [Name]
- HH:MM - Root cause identified
- HH:MM - Fix deployed
- HH:MM - Verified resolution
### Root Cause
[Description of what caused the incident]
### Resolution
[Description of how the incident was resolved]
### Action Items
- [ ] Item 1
- [ ] Item 2
Uptime Checks
Configuration
# terraform/uptime.tf
# HTTPS health probe of https://api.olympuscloud.ai/health, run every
# 60s (10s timeout) from three geographic regions. A probe passes only
# if TLS validates and the response body contains "OK".
resource "google_monitoring_uptime_check_config" "api_health" {
display_name = "API Gateway Health Check"
timeout = "10s"
period = "60s"
http_check {
path = "/health"
port = 443
use_ssl = true
validate_ssl = true
}
monitored_resource {
type = "uptime_url"
labels = {
project_id = var.project_id
host = "api.olympuscloud.ai"
}
}
# Body must contain "OK" — a 200 with an error page still fails.
content_matchers {
content = "OK"
matcher = "CONTAINS_STRING"
}
# Probe from multiple regions so a single-region network issue does
# not look like an outage.
selected_regions = [
"USA",
"EUROPE",
"ASIA_PACIFIC",
]
}
# Alert for uptime check failure
# check_passed is a boolean series per probing region. ALIGN_NEXT_OLDER
# carries the most recent sample forward and REDUCE_COUNT_FALSE counts
# how many regions are currently failing; with COMPARISON_GT and
# threshold_value = 1, the policy pages only when at least TWO regions
# fail at once, tolerating a single-region blip.
resource "google_monitoring_alert_policy" "uptime_failure" {
display_name = "API Gateway Uptime Failure"
combiner = "OR"
conditions {
display_name = "Uptime check failed"
condition_threshold {
filter = <<-EOT
resource.type = "uptime_url"
AND metric.type = "monitoring.googleapis.com/uptime_check/check_passed"
AND metric.labels.check_id = "${google_monitoring_uptime_check_config.api_health.uptime_check_id}"
EOT
aggregations {
alignment_period = "60s"
per_series_aligner = "ALIGN_NEXT_OLDER"
cross_series_reducer = "REDUCE_COUNT_FALSE"
group_by_fields = ["resource.label.host"]
}
comparison = "COMPARISON_GT"
threshold_value = 1
duration = "60s"
}
}
notification_channels = [
google_monitoring_notification_channel.pagerduty.name,
]
}
Alert Suppression
Maintenance Windows
# Suppress alerts during maintenance
# Skeleton policy illustrating notification throttling; fill in the
# "..." section with real conditions.
# NOTE(review): notification_channel_strategy controls RE-notification
# cadence while a condition keeps firing — it does not itself suppress
# alerts during maintenance. Actual suppression is done with snoozes
# (see the Snooze Configuration section below).
resource "google_monitoring_alert_policy" "with_maintenance" {
# ... alert configuration ...
alert_strategy {
notification_rate_limit {
period = "300s" # Max 1 notification per 5 min
}
# Re-notify the PagerDuty channel at most once per hour while the
# incident remains open.
notification_channel_strategy {
notification_channel_names = [
google_monitoring_notification_channel.pagerduty.name,
]
renotify_interval = "3600s"
}
}
}
Snooze Configuration
warning
Alert suppression and snooze windows should be used sparingly and only during planned maintenance. Always set a defined end time and never snooze P1 alert policies. Forgotten snooze windows are a common cause of missed production incidents.
# Snooze alerts via gcloud
# Creates a time-bounded snooze for a single alert policy. Always set
# an explicit --end-time (see the warning above this snippet).
# NOTE: this is the `alpha` surface of gcloud; the command shape may
# change between releases.
gcloud alpha monitoring snoozes create \
--display-name="Deployment maintenance" \
--criteria-policies="projects/olympuscloud-prod/alertPolicies/123456" \
--start-time="2026-01-20T02:00:00Z" \
--end-time="2026-01-20T04:00:00Z"
Testing Alerts
Alert Testing
# Send test alert to PagerDuty
# Fires a synthetic event through the Events API v2. Replace
# YOUR_ROUTING_KEY with the integration key of the target service.
# The dedup_key ties this trigger to the resolve call below so both
# act on the same incident.
curl -X POST https://events.pagerduty.com/v2/enqueue \
-H "Content-Type: application/json" \
-d '{
"routing_key": "YOUR_ROUTING_KEY",
"event_action": "trigger",
"dedup_key": "test-alert-123",
"payload": {
"summary": "Test Alert - Please Ignore",
"severity": "warning",
"source": "manual-test"
}
}'
# Resolve test alert
# Uses the same dedup_key to close the incident opened above.
curl -X POST https://events.pagerduty.com/v2/enqueue \
-H "Content-Type: application/json" \
-d '{
"routing_key": "YOUR_ROUTING_KEY",
"event_action": "resolve",
"dedup_key": "test-alert-123"
}'
Related Documentation
- Metrics - Metrics collection
- Logging - Log management
- Cloud Run Deployment - Service deployment