# YAML, 330 lines, 13 KiB
# Prometheus-format alerting rule groups; each list entry under this key
# defines one named group with its own evaluation interval and rules.
groups:
# HTTP 5xx error-ratio alerts, evaluated every 30s.
#
# Numerator, denominator and the traffic floor are all aggregated with
# `sum by (service_name)` before being combined. Without aggregation the
# division matches series one-to-one on identical label sets, so every 5xx
# series is divided by itself (ratio always 1) and non-5xx traffic is
# never counted in the denominator.
- name: fluxer_api_alerts
  interval: 30s
  rules:
  # Critical: more than 5% of a service's requests return 5xx for 5 minutes.
  # The `and` clause is a traffic floor; rate() is per-second, so the floor
  # is 10 req/s.
  - alert: FluxerHighErrorRate
    expr: |
      (
        sum by (service_name) (rate(http_server_request_count{http_response_status_code=~"5.."}[5m]))
        /
        sum by (service_name) (rate(http_server_request_count[5m]))
      ) > 0.05
      and sum by (service_name) (rate(http_server_request_count[5m])) > 10
    for: 5m
    labels:
      severity: critical
      service: fluxer-api
      alert_type: error_rate
    annotations:
      summary: 'High error rate on {{ $labels.service_name }}'
      description: 'Error rate is above 5% (minimum 10 req/s) on {{ $labels.service_name }}. Current value: {{ $value | humanizePercentage }}'
      runbook: 'https://docs.fluxer.dev/runbooks/high-error-rate'

  # Warning: same ratio and traffic floor as above, with a 1% threshold
  # that must hold for 10 minutes.
  - alert: FluxerElevatedErrorRate
    expr: |
      (
        sum by (service_name) (rate(http_server_request_count{http_response_status_code=~"5.."}[5m]))
        /
        sum by (service_name) (rate(http_server_request_count[5m]))
      ) > 0.01
      and sum by (service_name) (rate(http_server_request_count[5m])) > 10
    for: 10m
    labels:
      severity: warning
      service: fluxer-api
      alert_type: error_rate
    annotations:
      summary: 'Elevated error rate on {{ $labels.service_name }}'
      description: 'Error rate is above 1% on {{ $labels.service_name }}. Current value: {{ $value | humanizePercentage }}'
      runbook: 'https://docs.fluxer.dev/runbooks/high-error-rate'

# Queue backlog and dead-letter alerts for series labelled
# service_name="fluxer-queue"; evaluated every 30s.
- name: fluxer_queue_alerts
  interval: 30s
  rules:
  # Critical: a queue's depth stays above 10000 for 5 minutes.
  - alert: FluxerQueueDepthCritical
    expr: |
      fluxer_queue_depth{service_name="fluxer-queue"} > 10000
    for: 5m
    labels:
      severity: critical
      service: fluxer-queue
      alert_type: queue_depth
    annotations:
      summary: 'Queue depth critically high for {{ $labels.queue_name }}'
      description: 'Queue {{ $labels.queue_name }} has {{ $value }} jobs pending (threshold: 10,000). Jobs may be delayed or processing is stalled.'
      runbook: 'https://docs.fluxer.dev/runbooks/queue-depth-critical'

  # Warning: a queue's depth stays above 5000 for 10 minutes.
  # This rule carries no runbook annotation.
  - alert: FluxerQueueDepthElevated
    expr: |
      fluxer_queue_depth{service_name="fluxer-queue"} > 5000
    for: 10m
    labels:
      severity: warning
      service: fluxer-queue
      alert_type: queue_depth
    annotations:
      summary: 'Queue depth elevated for {{ $labels.queue_name }}'
      description: 'Queue {{ $labels.queue_name }} has {{ $value }} jobs pending (threshold: 5,000). Monitor for escalation.'

  # Critical: the dead-letter rate, summed over all matching series,
  # exceeds 5 per second for 5 minutes.
  - alert: FluxerDLQRateCritical
    expr: |
      sum(rate(fluxer_queue_dead_letter{service_name="fluxer-queue"}[5m])) > 5
    for: 5m
    labels:
      severity: critical
      service: fluxer-queue
      alert_type: dlq_rate
    annotations:
      summary: 'High dead letter queue rate'
      description: 'Jobs are failing and moving to DLQ at rate {{ $value | humanize }} jobs/sec. Check job failures and error logs.'
      runbook: 'https://docs.fluxer.dev/runbooks/high-dlq-rate'

# WebSocket gateway alerts: error-disconnect rate, disconnect ratio and
# RPC latency; evaluated every 30s.
#
# rate() returns a per-second average, so the disconnect thresholds below
# (10 and 5) and the reported $value are disconnects per second. The
# descriptions say "/sec" to match.
- name: fluxer_gateway_alerts
  interval: 30s
  rules:
  # Critical: more than 10 error disconnects per second per service,
  # sustained for 3 minutes.
  - alert: FluxerGatewayConnectionDropCritical
    expr: |
      sum(rate(gateway_websocket_disconnections{reason="error"}[1m])) by (service_name) > 10
    for: 3m
    labels:
      severity: critical
      service: fluxer-gateway
      alert_type: connection_drop
    annotations:
      summary: 'Critical WebSocket error disconnect rate'
      description: 'Gateway experiencing {{ $value | humanize }} error disconnects/sec. This may indicate service instability or network issues.'
      runbook: 'https://docs.fluxer.dev/runbooks/gateway-connection-drop'

  # Warning: more than 5 error disconnects per second per service,
  # sustained for 5 minutes. This rule carries no runbook annotation.
  - alert: FluxerGatewayDisconnectElevated
    expr: |
      sum(rate(gateway_websocket_disconnections{reason="error"}[1m])) by (service_name) > 5
    for: 5m
    labels:
      severity: warning
      service: fluxer-gateway
      alert_type: connection_drop
    annotations:
      summary: 'Elevated WebSocket error disconnect rate'
      description: 'Gateway experiencing {{ $value | humanize }} error disconnects/sec. Monitor for escalation.'

  # Critical: error disconnects exceed 10% of the connection rate over the
  # same 5m window, per service, for 5 minutes.
  - alert: FluxerGatewayDisconnectRatioHigh
    expr: |
      (
        sum(rate(gateway_websocket_disconnections{reason="error"}[5m])) by (service_name)
        /
        sum(rate(gateway_websocket_connections[5m])) by (service_name)
      ) > 0.1
    for: 5m
    labels:
      severity: critical
      service: fluxer-gateway
      alert_type: disconnect_ratio
    annotations:
      summary: 'Gateway disconnect ratio above 10%'
      description: 'Error disconnects represent {{ $value | humanizePercentage }} of new connections. Check gateway stability.'
      runbook: 'https://docs.fluxer.dev/runbooks/gateway-connection-drop'

  # Warning: P95 of the gateway RPC latency histogram stays above 500 for
  # 10 minutes.
  # NOTE(review): the annotations assume the histogram is recorded in
  # milliseconds - confirm against the instrumentation.
  - alert: FluxerGatewayRPCLatencyHigh
    expr: |
      histogram_quantile(0.95,
        sum(rate(gateway_rpc_latency_bucket{service_name="fluxer-gateway"}[5m])) by (le)
      ) > 500
    for: 10m
    labels:
      severity: warning
      service: fluxer-gateway
      alert_type: rpc_latency
    annotations:
      summary: 'Gateway RPC P95 latency above 500ms'
      description: 'Gateway RPC calls experiencing high latency. Current P95: {{ $value | humanize }}ms'
      runbook: 'https://docs.fluxer.dev/runbooks/gateway-rpc-latency'

# Log-volume alerts computed per service_name from logs_count;
# evaluated every 30s. None of these rules sets a static `service` label.
- name: fluxer_log_alerts
  interval: 30s
  rules:
  # Critical: more than 50 ERROR log records per second for 2 minutes.
  - alert: FluxerLogErrorSpikeCritical
    expr: |
      sum(rate(logs_count{severity_text="ERROR"}[5m])) by (service_name) > 50
    for: 2m
    labels:
      severity: critical
      alert_type: log_error_spike
    annotations:
      summary: 'Critical error log volume spike on {{ $labels.service_name }}'
      description: 'Service {{ $labels.service_name }} logging {{ $value | humanize }} errors/sec. Check logs and traces for root cause.'
      runbook: 'https://docs.fluxer.dev/runbooks/log-error-spike'

  # Warning: more than 20 ERROR log records per second for 10 minutes.
  - alert: FluxerLogErrorElevated
    expr: |
      sum(rate(logs_count{severity_text="ERROR"}[5m])) by (service_name) > 20
    for: 10m
    labels:
      severity: warning
      alert_type: log_error_elevated
    annotations:
      summary: 'Elevated error log volume on {{ $labels.service_name }}'
      description: 'Service {{ $labels.service_name }} logging {{ $value | humanize }} errors/sec. Monitor for escalation.'

  # Warning: more than 100 WARN log records per second for 10 minutes.
  - alert: FluxerLogWarningElevated
    expr: |
      sum(rate(logs_count{severity_text="WARN"}[5m])) by (service_name) > 100
    for: 10m
    labels:
      severity: warning
      alert_type: log_warning_elevated
    annotations:
      summary: 'Elevated warning log volume on {{ $labels.service_name }}'
      description: 'Service {{ $labels.service_name }} logging {{ $value | humanize }} warnings/sec. Review warning patterns.'

# Per-route P95 latency alerts for service_name="fluxer-api";
# evaluated every 30s.
# NOTE(review): thresholds and annotations assume the duration histogram is
# recorded in milliseconds - confirm against the instrumentation.
- name: fluxer_api_performance_alerts
  interval: 30s
  rules:
  # Critical: a route's P95 stays above 2000 for 5 minutes.
  - alert: FluxerAPILatencyCritical
    expr: |
      histogram_quantile(0.95,
        sum(rate(http_server_request_duration_bucket{service_name="fluxer-api"}[5m])) by (le, http_route)
      ) > 2000
    for: 5m
    labels:
      severity: critical
      service: fluxer-api
      alert_type: latency
    annotations:
      summary: 'Critical API latency on route {{ $labels.http_route }}'
      description: 'P95 latency for route {{ $labels.http_route }} is above 2 seconds. Current: {{ $value | humanize }}ms'
      runbook: 'https://docs.fluxer.dev/runbooks/high-api-latency'

  # Warning: a route's P95 stays above 1000 for 10 minutes.
  # This rule carries no runbook annotation.
  - alert: FluxerAPILatencyElevated
    expr: |
      histogram_quantile(0.95,
        sum(rate(http_server_request_duration_bucket{service_name="fluxer-api"}[5m])) by (le, http_route)
      ) > 1000
    for: 10m
    labels:
      severity: warning
      service: fluxer-api
      alert_type: latency
    annotations:
      summary: 'Elevated API latency on route {{ $labels.http_route }}'
      description: 'P95 latency for route {{ $labels.http_route }} is above 1 second. Current: {{ $value | humanize }}ms'

# Database query-latency and connection-pool alerts; evaluated every 30s.
- name: fluxer_database_alerts
  interval: 30s
  rules:
  # Critical: P95 query latency per query_type stays above 1000 for
  # 5 minutes.
  # NOTE(review): the annotations assume the histogram is recorded in
  # milliseconds - confirm against the instrumentation.
  - alert: FluxerDBLatencyCritical
    expr: |
      histogram_quantile(0.95,
        sum(rate(db_query_latency_bucket[5m])) by (le, query_type)
      ) > 1000
    for: 5m
    labels:
      severity: critical
      alert_type: database_latency
    annotations:
      summary: 'Critical database query latency for {{ $labels.query_type }}'
      description: 'P95 {{ $labels.query_type }} query latency above 1 second. Current: {{ $value | humanize }}ms'
      runbook: 'https://docs.fluxer.dev/runbooks/database-latency'

  # Warning: active/max connection ratio stays above 0.8 for 10 minutes.
  # `/` binds tighter than `>`, so the ratio is computed before comparing.
  - alert: FluxerDBConnectionPoolHigh
    expr: |
      db_connection_pool_active / db_connection_pool_max > 0.8
    for: 10m
    labels:
      severity: warning
      alert_type: connection_pool
    annotations:
      summary: 'Database connection pool usage above 80%'
      description: 'Connection pool at {{ $value | humanizePercentage }} capacity. May lead to connection waits.'
      runbook: 'https://docs.fluxer.dev/runbooks/connection-pool'

# Cache efficiency alerts; evaluated every 30s.
- name: fluxer_cache_alerts
  interval: 30s
  rules:
  # Warning: hits / (hits + misses) per cache_name stays below 0.5 for
  # 15 minutes. `/` binds tighter than `<`, so the ratio is formed first.
  - alert: FluxerCacheHitRateLow
    expr: |
      sum(rate(cache_operation{status="hit"}[5m])) by (cache_name)
      /
      sum(rate(cache_operation{status=~"hit|miss"}[5m])) by (cache_name) < 0.5
    for: 15m
    labels:
      severity: warning
      alert_type: cache_efficiency
    annotations:
      summary: 'Low cache hit rate for {{ $labels.cache_name }}'
      description: 'Cache {{ $labels.cache_name }} hit rate below 50%. Current: {{ $value | humanizePercentage }}'
      runbook: 'https://docs.fluxer.dev/runbooks/low-cache-hit-rate'

# Background worker and cron alerts; evaluated every 30s.
- name: fluxer_worker_alerts
  interval: 30s
  rules:
  # Critical: a task_name fails more than once per second for 5 minutes.
  - alert: FluxerWorkerFailureRateCritical
    expr: |
      sum(rate(fluxer_worker_task_failure[5m])) by (task_name) > 1
    for: 5m
    labels:
      severity: critical
      alert_type: worker_failure
    annotations:
      summary: 'Critical worker task failure rate for {{ $labels.task_name }}'
      description: 'Worker task {{ $labels.task_name }} failing at {{ $value | humanize }} tasks/sec. Check task logs.'
      runbook: 'https://docs.fluxer.dev/runbooks/worker-failures'

  # Warning: the newest tick timestamp per cron is more than 3600 seconds
  # old for 5 minutes.
  # The `by (cron)` clause must sit outside the aggregation's argument
  # list; placing it inside the parentheses is a PromQL parse error.
  # $value is the age in seconds (time() minus the last tick), so it is
  # rendered with humanizeDuration rather than as a timestamp.
  - alert: FluxerCronJobOverdue
    expr: |
      time() - max by (cron) (fluxer_queue_cron_tick_timestamp) > 3600
    for: 5m
    labels:
      severity: warning
      service: fluxer-queue
      alert_type: cron
    annotations:
      summary: 'Cron job {{ $labels.cron }} has not executed in over 1 hour'
      description: "Scheduled cron job hasn't run for {{ $value | humanizeDuration }}. May indicate hung process."
      runbook: 'https://docs.fluxer.dev/runbooks/cron-overdue'

# Telemetry pipeline health; this group is evaluated every 60s.
- name: fluxer_telemetry_alerts
  interval: 60s
  rules:
  # Critical: the trace index metric shows zero increase over 15 minutes,
  # and that condition holds for 5 minutes.
  # NOTE(review): `== 0` only matches series that still exist; if the
  # metric disappears entirely this expression returns nothing - confirm
  # whether an absent() check is wanted.
  - alert: FluxerTelemetryIngestionStopped
    expr: |
      increase(signoz_traces_signoz_index_v2[15m]) == 0
    for: 5m
    labels:
      severity: critical
      alert_type: telemetry
    annotations:
      summary: 'No traces being ingested'
      description: "SigNoz collector hasn't received traces in 15 minutes. Check collector health and service instrumentation."
      runbook: 'https://docs.fluxer.dev/runbooks/telemetry-down'

# Media proxy failure and timeout alerts; evaluated every 30s.
- name: fluxer_media_proxy_alerts
  interval: 30s
  rules:
  # Warning: failures exceed 10% of the fluxer-media-proxy request rate for
  # 10 minutes. Both sides are summed to a single series before dividing.
  - alert: FluxerMediaProxyErrorRate
    expr: |
      sum(rate(media_proxy_failure{service_name="fluxer-media-proxy"}[5m]))
      /
      sum(rate(http_server_request_count{service_name="fluxer-media-proxy"}[5m])) > 0.1
    for: 10m
    labels:
      severity: warning
      service: fluxer-media-proxy
      alert_type: error_rate
    annotations:
      summary: 'Media proxy error rate above 10%'
      description: 'Media proxy failing {{ $value | humanizePercentage }} of requests. Check origin servers and cache.'
      runbook: 'https://docs.fluxer.dev/runbooks/media-proxy-errors'

  # Warning: timeout failures exceed 5 per second for 5 minutes.
  # Unlike the rule above, this selector has no service_name matcher.
  - alert: FluxerMediaProxyTimeoutRate
    expr: |
      sum(rate(media_proxy_failure{error_type="timeout"}[5m])) > 5
    for: 5m
    labels:
      severity: warning
      service: fluxer-media-proxy
      alert_type: timeout
    annotations:
      summary: 'Media proxy timeout rate above 5 req/s'
      description: 'Media proxy experiencing high timeout rate. May indicate network issues or slow origins.'
      runbook: 'https://docs.fluxer.dev/runbooks/media-proxy-timeouts'