diff options
Diffstat (limited to 'monitoring/prometheus/alert_rules.yml')
| -rw-r--r-- | monitoring/prometheus/alert_rules.yml | 29 |
1 files changed, 29 insertions, 0 deletions
diff --git a/monitoring/prometheus/alert_rules.yml b/monitoring/prometheus/alert_rules.yml new file mode 100644 index 0000000..aca2f1c --- /dev/null +++ b/monitoring/prometheus/alert_rules.yml @@ -0,0 +1,29 @@ +groups: + - name: trading-platform + rules: + - alert: ServiceDown + expr: up == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Service {{ $labels.job }} is down" + description: "{{ $labels.instance }} has been unreachable for 1 minute." + + - alert: HighErrorRate + expr: rate(errors_total[5m]) > 10 + for: 2m + labels: + severity: warning + annotations: + summary: "High error rate on {{ $labels.job }}" + description: "Error rate is {{ $value }} errors/sec over 5 minutes." + + - alert: HighProcessingLatency + expr: histogram_quantile(0.95, rate(processing_seconds_bucket[5m])) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "High p95 latency on {{ $labels.job }}" + description: "95th percentile processing time is {{ $value }}s." |
