# path: root/monitoring/prometheus/alert_rules.yml
# blob: aca2f1cd2f823b438120df7abc516da81bf35e73 (plain)
groups:
  - name: trading-platform
    rules:
      # ServiceDown: fires for each series whose `up` value equals 0, once that
      # condition has held continuously for 1m. Labelled severity=critical.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          # Annotation values are templates; $labels.* are read from the
          # series that triggered the alert.
          summary: 'Service {{ $labels.job }} is down'
          description: >-
            {{ $labels.instance }} has been unreachable for 1 minute.

      # HighErrorRate: fires when the per-second rate of `errors_total`,
      # computed over a 5m window, stays above 10 for 2m.
      # Labelled severity=warning.
      - alert: HighErrorRate
        expr: rate(errors_total[5m]) > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          # $value is an unrounded float; `humanize` formats it for readers
          # instead of printing every significant digit.
          description: "Error rate is {{ $value | humanize }} errors/sec over 5 minutes."

      # HighProcessingLatency: fires when the 0.95 quantile estimated from the
      # `processing_seconds_bucket` histogram (5m rate window) stays above 5
      # for 5m. Labelled severity=warning.
      - alert: HighProcessingLatency
        expr: histogram_quantile(0.95, rate(processing_seconds_bucket[5m])) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High p95 latency on {{ $labels.job }}"
          # $value is a raw float number of seconds; `humanizeDuration` formats
          # it and supplies the unit, so no literal "s" suffix is appended.
          description: "95th percentile processing time is {{ $value | humanizeDuration }}."