{ "name": "Fluxer Critical Alerts", "description": "Critical alerts for Fluxer services", "version": 2, "alerts": [ { "id": "high-api-error-rate", "name": "High API Error Rate", "type": "metric", "condition": { "query": "sum(rate(http_server_request_count{service_name='fluxer-api',http_response_status_code=~'5..'}[5m])) > 10", "evaluation_interval": "1m", "for": "5m" }, "severity": "critical", "annotations": { "summary": "API error rate is above 10 req/s", "description": "The fluxer-api service is experiencing a high error rate (5xx responses). This may indicate a service degradation or outage." }, "labels": { "service": "fluxer-api", "alert_type": "error_rate" }, "actions": [ { "type": "notification", "channel": "slack", "target": "#alerts-critical" } ] }, { "id": "high-api-latency", "name": "High API Latency", "type": "metric", "condition": { "query": "histogram_quantile(0.95, sum(rate(http_server_request_duration_bucket{service_name='fluxer-api'}[5m])) by (le)) > 1000", "evaluation_interval": "1m", "for": "10m" }, "severity": "warning", "annotations": { "summary": "API P95 latency is above 1000ms", "description": "The fluxer-api service is experiencing high latency. At least 5% of requests are taking longer than 1 second." }, "labels": { "service": "fluxer-api", "alert_type": "latency" }, "actions": [ { "type": "notification", "channel": "slack", "target": "#alerts-warning" } ] }, { "id": "queue-depth-high", "name": "Queue Depth Too High", "type": "metric", "condition": { "query": "fluxer_queue_depth > 10000", "evaluation_interval": "1m", "for": "15m" }, "severity": "warning", "annotations": { "summary": "Queue depth is above 10,000 jobs", "description": "The job queue has accumulated more than 10,000 jobs. This may indicate processing is slower than job arrival." 
}, "labels": { "service": "fluxer-queue", "alert_type": "queue_depth" }, "actions": [ { "type": "notification", "channel": "slack", "target": "#alerts-warning" } ] }, { "id": "queue-dlq-rate", "name": "High Dead Letter Queue Rate", "type": "metric", "condition": { "query": "sum(rate(fluxer_queue_dead_letter[5m])) > 5", "evaluation_interval": "1m", "for": "10m" }, "severity": "critical", "annotations": { "summary": "DLQ rate is above 5 jobs/sec", "description": "Jobs are being moved to the dead letter queue at a high rate. This may indicate persistent job failures." }, "labels": { "service": "fluxer-queue", "alert_type": "dlq_rate" }, "actions": [ { "type": "notification", "channel": "slack", "target": "#alerts-critical" } ] }, { "id": "gateway-connection-drop", "name": "Gateway Connection Drop Rate", "type": "metric", "condition": { "query": "rate(gateway_websocket_disconnections[1m]) / rate(gateway_websocket_connections[1m]) > 0.5", "evaluation_interval": "1m", "for": "5m" }, "severity": "critical", "annotations": { "summary": "Gateway disconnect rate exceeds 50% of connect rate", "description": "WebSocket connections are dropping at an unusually high rate. This may indicate network issues or service instability." }, "labels": { "service": "fluxer-gateway", "alert_type": "connection_stability" }, "actions": [ { "type": "notification", "channel": "slack", "target": "#alerts-critical" } ] }, { "id": "gateway-rpc-latency-high", "name": "Gateway RPC Latency High", "type": "metric", "condition": { "query": "gateway_rpc_latency_p95 > 500", "evaluation_interval": "1m", "for": "10m" }, "severity": "warning", "annotations": { "summary": "Gateway RPC P95 latency above 500ms", "description": "RPC calls from gateway to backend are experiencing high latency." 
}, "labels": { "service": "fluxer-gateway", "alert_type": "latency" }, "actions": [ { "type": "notification", "channel": "slack", "target": "#alerts-warning" } ] }, { "id": "media-proxy-error-rate", "name": "Media Proxy High Error Rate", "type": "metric", "condition": { "query": "sum(rate(media_proxy_failure{service_name='fluxer-media-proxy'}[5m])) / sum(rate(http_server_request_count{service_name='fluxer-media-proxy'}[5m])) > 0.1", "evaluation_interval": "1m", "for": "10m" }, "severity": "warning", "annotations": { "summary": "Media proxy error rate above 10%", "description": "The media proxy is failing more than 10% of requests. This may indicate origin issues or cache problems." }, "labels": { "service": "fluxer-media-proxy", "alert_type": "error_rate" }, "actions": [ { "type": "notification", "channel": "slack", "target": "#alerts-warning" } ] }, { "id": "media-proxy-timeout-rate", "name": "Media Proxy High Timeout Rate", "type": "metric", "condition": { "query": "sum(rate(media_proxy_failure{error_type='timeout'}[5m])) > 5", "evaluation_interval": "1m", "for": "5m" }, "severity": "warning", "annotations": { "summary": "Media proxy timeout rate above 5 req/s", "description": "The media proxy is experiencing a high rate of timeouts. This may indicate network issues or slow origin servers." }, "labels": { "service": "fluxer-media-proxy", "alert_type": "timeout" }, "actions": [ { "type": "notification", "channel": "slack", "target": "#alerts-warning" } ] }, { "id": "telemetry-ingestion-stopped", "name": "Telemetry Ingestion Stopped", "type": "metric", "condition": { "query": "increase(signoz_traces_signoz_index_v2[15m]) == 0", "evaluation_interval": "1m", "for": "5m" }, "severity": "critical", "annotations": { "summary": "No traces being ingested", "description": "The SigNoz collector has not received any traces in the last 15 minutes. This may indicate a collector issue or service instrumentation failure." 
}, "labels": { "service": "signoz", "alert_type": "telemetry" }, "actions": [ { "type": "notification", "channel": "slack", "target": "#alerts-critical" } ] }, { "id": "cron-job-overdue", "name": "Cron Job Overdue", "type": "metric", "condition": { "query": "time() - max(fluxer_queue_cron_tick_timestamp) by (cron) > 3600", "evaluation_interval": "5m", "for": "5m" }, "severity": "warning", "annotations": { "summary": "Cron job has not executed in over 1 hour", "description": "A scheduled cron job has not run in over an hour. This may indicate a hung cron process or scheduling issue." }, "labels": { "service": "fluxer-queue", "alert_type": "cron" }, "actions": [ { "type": "notification", "channel": "slack", "target": "#alerts-warning" } ] }, { "id": "csam-match-detected", "name": "CSAM Match Detected", "type": "metric", "condition": { "query": "sum(rate(fluxer_csam_matches_total{service_name='fluxer-api'}[1m])) > 0", "evaluation_interval": "1m", "for": "0m" }, "severity": "critical", "annotations": { "summary": "CSAM content has been detected", "description": "CSAM content has been detected. Immediate review required." 
}, "labels": { "service": "fluxer-api", "alert_type": "csam_match" }, "actions": [ { "type": "notification", "channel": "slack", "target": "#alerts-critical" } ] }, { "id": "photodna-api-error-rate-high", "name": "PhotoDNA API Error Rate High", "type": "metric", "condition": { "query": "sum(rate(fluxer_csam_photodna_api_total{service_name='fluxer-api',status='error'}[5m])) / sum(rate(fluxer_csam_photodna_api_total{service_name='fluxer-api'}[5m])) > 0.1", "evaluation_interval": "1m", "for": "5m" }, "severity": "warning", "annotations": { "summary": "PhotoDNA API error rate exceeds 10%", "description": "PhotoDNA API error rate exceeds 10%" }, "labels": { "service": "fluxer-api", "alert_type": "photodna_error_rate" }, "actions": [ { "type": "notification", "channel": "slack", "target": "#alerts-warning" } ] }, { "id": "ncmec-submission-failure", "name": "NCMEC Submission Failure", "type": "metric", "condition": { "query": "sum(rate(fluxer_csam_ncmec_submissions{service_name='fluxer-api',status='error'}[5m])) > 0", "evaluation_interval": "1m", "for": "5m" }, "severity": "critical", "annotations": { "summary": "NCMEC report submission has failed", "description": "NCMEC report submission has failed. Manual intervention required." 
}, "labels": { "service": "fluxer-api", "alert_type": "ncmec_submission" }, "actions": [ { "type": "notification", "channel": "slack", "target": "#alerts-critical" } ] }, { "id": "csam-scan-failure-rate-high", "name": "CSAM Scan Failure Rate High", "type": "metric", "condition": { "query": "sum(rate(fluxer_csam_scans_total{service_name='fluxer-api',status='error'}[5m])) / sum(rate(fluxer_csam_scans_total{service_name='fluxer-api'}[5m])) > 0.05", "evaluation_interval": "1m", "for": "5m" }, "severity": "warning", "annotations": { "summary": "CSAM scan failure rate exceeds 5%", "description": "CSAM scan failure rate exceeds 5%" }, "labels": { "service": "fluxer-api", "alert_type": "csam_scan_failure_rate" }, "actions": [ { "type": "notification", "channel": "slack", "target": "#alerts-warning" } ] }, { "id": "photodna-api-latency-high", "name": "PhotoDNA API Latency High", "type": "metric", "condition": { "query": "histogram_quantile(0.95, sum(rate(fluxer_csam_photodna_api_duration_ms_bucket{service_name='fluxer-api'}[5m])) by (le)) > 5000", "evaluation_interval": "1m", "for": "5m" }, "severity": "warning", "annotations": { "summary": "PhotoDNA API p95 latency exceeds 5 seconds", "description": "PhotoDNA API p95 latency exceeds 5 seconds" }, "labels": { "service": "fluxer-api", "alert_type": "photodna_latency" }, "actions": [ { "type": "notification", "channel": "slack", "target": "#alerts-warning" } ] } ], "notification_channels": { "slack": { "type": "webhook", "url": "${ALERT_WEBHOOK_URL}", "channel_mapping": { "critical": "#alerts-critical", "warning": "#alerts-warning" } } } }