veza/config/alertmanager/routes.yml

# Alertmanager routing tree (v1.0.9 W2 Day 10).
#
# Two channels :
#   * page-oncall  — slack #alerts-page + PagerDuty bridge. Wakes
#                    someone up. Reserved for severity=critical AND
#                    page=true.
#   * ticket-oncall — slack #alerts-ticket. Files a ticket; investigate
#                     during business hours.
#
# Routing key = labels on the alert. The SLO rules in slo.yml set
# `page: "true"` on fast-burn alerts and `page: "false"` on slow-burn,
# so the burn-rate methodology and the routing tree stay coupled.
#
# This file is meant to be merged into the main alertmanager.yml
# (or stitched in via -config-file overrides). Keeping it separate
# makes it easy to diff and review the routing logic without
# touching receiver credentials.
route:
  receiver: 'slack-default'
  group_by: ['alertname', 'job', 'slo']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h

  routes:
    # Page-grade : critical + explicitly tagged page=true.
    - matchers:
        - severity = critical
        - page = "true"
      receiver: page-oncall
      group_wait: 10s          # page faster than the default 30s
      repeat_interval: 30m     # keep paging until ack'd
      continue: false

    # Ticket-grade : warning OR critical-without-page.
    - matchers:
        - page = "false"
      receiver: ticket-oncall
      group_wait: 1m
      repeat_interval: 12h
      continue: false

    # Fallback : critical alerts without a page=… label still go to
    # page-oncall. Better wake someone up for an unlabelled critical
    # than silently route it to ticket.
    - matchers:
        - severity = critical
      receiver: page-oncall
      continue: false

receivers:
  - name: page-oncall
    slack_configs:
      - api_url: '${SLACK_WEBHOOK_URL_PAGE}'
        channel: '#alerts-page'
        send_resolved: true
        title: '🚨 PAGE: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          *Severity:* {{ .Labels.severity }}
          *SLO:* {{ .Labels.slo | default "n/a" }}
          *Description:* {{ .Annotations.description }}
          *Runbook:* {{ .Annotations.runbook_url }}
          {{ end }}
    # PagerDuty integration — populate routing_key from
    # ${PAGERDUTY_ROUTING_KEY} once an account is provisioned. Until
    # then the slack channel is the only page surface.
    pagerduty_configs:
      - routing_key: '${PAGERDUTY_ROUTING_KEY}'
        severity: '{{ .GroupLabels.severity }}'
        send_resolved: true

  - name: ticket-oncall
    slack_configs:
      - api_url: '${SLACK_WEBHOOK_URL_TICKET}'
        channel: '#alerts-ticket'
        send_resolved: true
        title: '🎫 TICKET: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          *Severity:* {{ .Labels.severity }}
          *SLO:* {{ .Labels.slo | default "n/a" }}
          *Description:* {{ .Annotations.description }}
          *Runbook:* {{ .Annotations.runbook_url }}
          {{ end }}

  # slack-default kept as a no-op fallback so a misconfigured alert
  # that escapes the route tree still hits a receiver — Alertmanager
  # logs noise instead of silently dropping the alert.
  - name: slack-default
    slack_configs:
      - api_url: '${SLACK_WEBHOOK_URL}'
        channel: '#alerts'
        send_resolved: true
        title: '{{ .Status | toUpper }}: {{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'