408 lines
11 KiB
JSON
408 lines
11 KiB
JSON
{
|
|
"name": "Fluxer Critical Alerts",
|
|
"description": "Critical alerts for Fluxer services",
|
|
"version": 2,
|
|
"alerts": [
|
|
{
|
|
"id": "high-api-error-rate",
|
|
"name": "High API Error Rate",
|
|
"type": "metric",
|
|
"condition": {
|
|
"query": "sum(rate(http_server_request_count{service_name='fluxer-api',http_response_status_code=~'5..'}[5m])) > 10",
|
|
"evaluation_interval": "1m",
|
|
"for": "5m"
|
|
},
|
|
"severity": "critical",
|
|
"annotations": {
|
|
"summary": "API error rate is above 10 req/s",
|
|
"description": "The fluxer-api service is experiencing a high error rate (5xx responses). This may indicate a service degradation or outage."
|
|
},
|
|
"labels": {
|
|
"service": "fluxer-api",
|
|
"alert_type": "error_rate"
|
|
},
|
|
"actions": [
|
|
{
|
|
"type": "notification",
|
|
"channel": "slack",
|
|
"target": "#alerts-critical"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": "high-api-latency",
|
|
"name": "High API Latency",
|
|
"type": "metric",
|
|
"condition": {
|
|
"query": "histogram_quantile(0.95, sum(rate(http_server_request_duration_bucket{service_name='fluxer-api'}[5m])) > 1000",
|
|
"evaluation_interval": "1m",
|
|
"for": "10m"
|
|
},
|
|
"severity": "warning",
|
|
"annotations": {
|
|
"summary": "API P95 latency is above 1000ms",
|
|
"description": "The fluxer-api service is experiencing high latency. 95% of requests are taking longer than 1 second."
|
|
},
|
|
"labels": {
|
|
"service": "fluxer-api",
|
|
"alert_type": "latency"
|
|
},
|
|
"actions": [
|
|
{
|
|
"type": "notification",
|
|
"channel": "slack",
|
|
"target": "#alerts-warning"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": "queue-depth-high",
|
|
"name": "Queue Depth Too High",
|
|
"type": "metric",
|
|
"condition": {
|
|
"query": "fluxer_queue_depth > 10000",
|
|
"evaluation_interval": "1m",
|
|
"for": "15m"
|
|
},
|
|
"severity": "warning",
|
|
"annotations": {
|
|
"summary": "Queue depth is above 10,000 jobs",
|
|
"description": "The job queue has accumulated more than 10,000 jobs. This may indicate processing is slower than job arrival."
|
|
},
|
|
"labels": {
|
|
"service": "fluxer-queue",
|
|
"alert_type": "queue_depth"
|
|
},
|
|
"actions": [
|
|
{
|
|
"type": "notification",
|
|
"channel": "slack",
|
|
"target": "#alerts-warning"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": "queue-dlq-rate",
|
|
"name": "High Dead Letter Queue Rate",
|
|
"type": "metric",
|
|
"condition": {
|
|
"query": "sum(rate(fluxer_queue_dead_letter[5m])) > 5",
|
|
"evaluation_interval": "1m",
|
|
"for": "10m"
|
|
},
|
|
"severity": "critical",
|
|
"annotations": {
|
|
"summary": "DLQ rate is above 5 jobs/sec",
|
|
"description": "Jobs are being moved to the dead letter queue at a high rate. This may indicate persistent job failures."
|
|
},
|
|
"labels": {
|
|
"service": "fluxer-queue",
|
|
"alert_type": "dlq_rate"
|
|
},
|
|
"actions": [
|
|
{
|
|
"type": "notification",
|
|
"channel": "slack",
|
|
"target": "#alerts-critical"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": "gateway-connection-drop",
|
|
"name": "Gateway Connection Drop Rate",
|
|
"type": "metric",
|
|
"condition": {
|
|
"query": "rate(gateway_websocket_disconnections[1m]) / rate(gateway_websocket_connections[1m]) > 0.5",
|
|
"evaluation_interval": "1m",
|
|
"for": "5m"
|
|
},
|
|
"severity": "critical",
|
|
"annotations": {
|
|
"summary": "Gateway disconnect rate exceeds 50% of connect rate",
|
|
"description": "WebSocket connections are dropping at an unusually high rate. This may indicate network issues or service instability."
|
|
},
|
|
"labels": {
|
|
"service": "fluxer-gateway",
|
|
"alert_type": "connection_stability"
|
|
},
|
|
"actions": [
|
|
{
|
|
"type": "notification",
|
|
"channel": "slack",
|
|
"target": "#alerts-critical"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": "gateway-rpc-latency-high",
|
|
"name": "Gateway RPC Latency High",
|
|
"type": "metric",
|
|
"condition": {
|
|
"query": "gateway_rpc_latency_p95 > 500",
|
|
"evaluation_interval": "1m",
|
|
"for": "10m"
|
|
},
|
|
"severity": "warning",
|
|
"annotations": {
|
|
"summary": "Gateway RPC P95 latency above 500ms",
|
|
"description": "RPC calls from gateway to backend are experiencing high latency."
|
|
},
|
|
"labels": {
|
|
"service": "fluxer-gateway",
|
|
"alert_type": "latency"
|
|
},
|
|
"actions": [
|
|
{
|
|
"type": "notification",
|
|
"channel": "slack",
|
|
"target": "#alerts-warning"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": "media-proxy-error-rate",
|
|
"name": "Media Proxy High Error Rate",
|
|
"type": "metric",
|
|
"condition": {
|
|
"query": "sum(rate(media_proxy_failure{service_name='fluxer-media-proxy'}[5m])) / sum(rate(http_server_request_count{service_name='fluxer-media-proxy'}[5m])) > 0.1",
|
|
"evaluation_interval": "1m",
|
|
"for": "10m"
|
|
},
|
|
"severity": "warning",
|
|
"annotations": {
|
|
"summary": "Media proxy error rate above 10%",
|
|
"description": "The media proxy is failing more than 10% of requests. This may indicate origin issues or cache problems."
|
|
},
|
|
"labels": {
|
|
"service": "fluxer-media-proxy",
|
|
"alert_type": "error_rate"
|
|
},
|
|
"actions": [
|
|
{
|
|
"type": "notification",
|
|
"channel": "slack",
|
|
"target": "#alerts-warning"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": "media-proxy-timeout-rate",
|
|
"name": "Media Proxy High Timeout Rate",
|
|
"type": "metric",
|
|
"condition": {
|
|
"query": "sum(rate(media_proxy_failure{error_type='timeout'}[5m])) > 5",
|
|
"evaluation_interval": "1m",
|
|
"for": "5m"
|
|
},
|
|
"severity": "warning",
|
|
"annotations": {
|
|
"summary": "Media proxy timeout rate above 5 req/s",
|
|
"description": "The media proxy is experiencing a high rate of timeouts. This may indicate network issues or slow origin servers."
|
|
},
|
|
"labels": {
|
|
"service": "fluxer-media-proxy",
|
|
"alert_type": "timeout"
|
|
},
|
|
"actions": [
|
|
{
|
|
"type": "notification",
|
|
"channel": "slack",
|
|
"target": "#alerts-warning"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": "telemetry-ingestion-stopped",
|
|
"name": "Telemetry Ingestion Stopped",
|
|
"type": "metric",
|
|
"condition": {
|
|
"query": "increase(signoz_traces_signoz_index_v2[15m]) == 0",
|
|
"evaluation_interval": "1m",
|
|
"for": "5m"
|
|
},
|
|
"severity": "critical",
|
|
"annotations": {
|
|
"summary": "No traces being ingested",
|
|
"description": "The SigNoz collector has not received any traces in the last 15 minutes. This may indicate a collector issue or service instrumentation failure."
|
|
},
|
|
"labels": {
|
|
"service": "signoz",
|
|
"alert_type": "telemetry"
|
|
},
|
|
"actions": [
|
|
{
|
|
"type": "notification",
|
|
"channel": "slack",
|
|
"target": "#alerts-critical"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": "cron-job-overdue",
|
|
"name": "Cron Job Overdue",
|
|
"type": "metric",
|
|
"condition": {
|
|
"query": "time() - max(fluxer_queue_cron_tick_timestamp by (cron)) > 3600",
|
|
"evaluation_interval": "5m",
|
|
"for": "5m"
|
|
},
|
|
"severity": "warning",
|
|
"annotations": {
|
|
"summary": "Cron job has not executed in over 1 hour",
|
|
"description": "A scheduled cron job has not run in over an hour. This may indicate a hung cron process or scheduling issue."
|
|
},
|
|
"labels": {
|
|
"service": "fluxer-queue",
|
|
"alert_type": "cron"
|
|
},
|
|
"actions": [
|
|
{
|
|
"type": "notification",
|
|
"channel": "slack",
|
|
"target": "#alerts-warning"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": "csam-match-detected",
|
|
"name": "CSAM Match Detected",
|
|
"type": "metric",
|
|
"condition": {
|
|
"query": "sum(rate(fluxer_csam_matches_total{service_name='fluxer-api'}[1m])) > 0",
|
|
"evaluation_interval": "1m",
|
|
"for": "0m"
|
|
},
|
|
"severity": "critical",
|
|
"annotations": {
|
|
"summary": "CSAM content has been detected",
|
|
"description": "CSAM content has been detected. Immediate review required."
|
|
},
|
|
"labels": {
|
|
"service": "fluxer-api",
|
|
"alert_type": "csam_match"
|
|
},
|
|
"actions": [
|
|
{
|
|
"type": "notification",
|
|
"channel": "slack",
|
|
"target": "#alerts-critical"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": "photodna-api-error-rate-high",
|
|
"name": "PhotoDNA API Error Rate High",
|
|
"type": "metric",
|
|
"condition": {
|
|
"query": "sum(rate(fluxer_csam_photodna_api_total{service_name='fluxer-api',status='error'}[5m])) / sum(rate(fluxer_csam_photodna_api_total{service_name='fluxer-api'}[5m])) > 0.1",
|
|
"evaluation_interval": "1m",
|
|
"for": "5m"
|
|
},
|
|
"severity": "warning",
|
|
"annotations": {
|
|
"summary": "PhotoDNA API error rate exceeds 10%",
|
|
"description": "PhotoDNA API error rate exceeds 10%"
|
|
},
|
|
"labels": {
|
|
"service": "fluxer-api",
|
|
"alert_type": "photodna_error_rate"
|
|
},
|
|
"actions": [
|
|
{
|
|
"type": "notification",
|
|
"channel": "slack",
|
|
"target": "#alerts-warning"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": "ncmec-submission-failure",
|
|
"name": "NCMEC Submission Failure",
|
|
"type": "metric",
|
|
"condition": {
|
|
"query": "sum(rate(fluxer_csam_ncmec_submissions{service_name='fluxer-api',status='error'}[5m])) > 0",
|
|
"evaluation_interval": "1m",
|
|
"for": "5m"
|
|
},
|
|
"severity": "critical",
|
|
"annotations": {
|
|
"summary": "NCMEC report submission has failed",
|
|
"description": "NCMEC report submission has failed. Manual intervention required."
|
|
},
|
|
"labels": {
|
|
"service": "fluxer-api",
|
|
"alert_type": "ncmec_submission"
|
|
},
|
|
"actions": [
|
|
{
|
|
"type": "notification",
|
|
"channel": "slack",
|
|
"target": "#alerts-critical"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": "csam-scan-failure-rate-high",
|
|
"name": "CSAM Scan Failure Rate High",
|
|
"type": "metric",
|
|
"condition": {
|
|
"query": "sum(rate(fluxer_csam_scans_total{service_name='fluxer-api',status='error'}[5m])) / sum(rate(fluxer_csam_scans_total{service_name='fluxer-api'}[5m])) > 0.05",
|
|
"evaluation_interval": "1m",
|
|
"for": "5m"
|
|
},
|
|
"severity": "warning",
|
|
"annotations": {
|
|
"summary": "CSAM scan failure rate exceeds 5%",
|
|
"description": "CSAM scan failure rate exceeds 5%"
|
|
},
|
|
"labels": {
|
|
"service": "fluxer-api",
|
|
"alert_type": "csam_scan_failure_rate"
|
|
},
|
|
"actions": [
|
|
{
|
|
"type": "notification",
|
|
"channel": "slack",
|
|
"target": "#alerts-warning"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": "photodna-api-latency-high",
|
|
"name": "PhotoDNA API Latency High",
|
|
"type": "metric",
|
|
"condition": {
|
|
"query": "histogram_quantile(0.95, sum(rate(fluxer_csam_photodna_api_duration_ms_bucket{service_name='fluxer-api'}[5m])) by (le)) > 5000",
|
|
"evaluation_interval": "1m",
|
|
"for": "5m"
|
|
},
|
|
"severity": "warning",
|
|
"annotations": {
|
|
"summary": "PhotoDNA API p95 latency exceeds 5 seconds",
|
|
"description": "PhotoDNA API p95 latency exceeds 5 seconds"
|
|
},
|
|
"labels": {
|
|
"service": "fluxer-api",
|
|
"alert_type": "photodna_latency"
|
|
},
|
|
"actions": [
|
|
{
|
|
"type": "notification",
|
|
"channel": "slack",
|
|
"target": "#alerts-warning"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"notification_channels": {
|
|
"slack": {
|
|
"type": "webhook",
|
|
"url": "${ALERT_WEBHOOK_URL}",
|
|
"channel_mapping": {
|
|
"critical": "#alerts-critical",
|
|
"warning": "#alerts-warning"
|
|
}
|
|
}
|
|
}
|
|
}
|