282 lines
7.9 KiB
Python
282 lines
7.9 KiB
Python
#!/usr/bin/env python3
|
|
|
|
import pathlib
|
|
import sys
|
|
|
|
sys.path.append(str(pathlib.Path(__file__).resolve().parents[1]))
|
|
|
|
from ci_workflow import parse_step_env_args
|
|
from ci_utils import run_step
|
|
|
|
|
|
STEPS = {
|
|
"set_temp_paths": """
|
|
set -euo pipefail
|
|
: "${RUNNER_TEMP:?RUNNER_TEMP is not set}"
|
|
echo "WORKDIR=$RUNNER_TEMP/cassandra-restore-test" >> "$GITHUB_ENV"
|
|
""",
|
|
"pre_clean": """
|
|
set -euo pipefail
|
|
docker rm -f "${CASS_CONTAINER}" "${UTIL_CONTAINER}" 2>/dev/null || true
|
|
docker volume rm "${CASS_VOLUME}" 2>/dev/null || true
|
|
docker volume rm "${BACKUP_VOLUME}" 2>/dev/null || true
|
|
rm -rf "${WORKDIR}" 2>/dev/null || true
|
|
""",
|
|
"install_tools": """
|
|
set -euo pipefail
|
|
sudo apt-get update -y
|
|
sudo apt-get install -y --no-install-recommends rclone age ca-certificates
|
|
""",
|
|
"fetch_backup": """
|
|
set -euo pipefail
|
|
|
|
rm -rf "$WORKDIR"
|
|
mkdir -p "$WORKDIR"
|
|
|
|
export RCLONE_CONFIG_B2S3_TYPE=s3
|
|
export RCLONE_CONFIG_B2S3_PROVIDER=Other
|
|
export RCLONE_CONFIG_B2S3_ACCESS_KEY_ID="${B2_KEY_ID}"
|
|
export RCLONE_CONFIG_B2S3_SECRET_ACCESS_KEY="${B2_APPLICATION_KEY}"
|
|
export RCLONE_CONFIG_B2S3_ENDPOINT="https://s3.eu-central-003.backblazeb2.com"
|
|
export RCLONE_CONFIG_B2S3_REGION="eu-central-003"
|
|
export RCLONE_CONFIG_B2S3_FORCE_PATH_STYLE=true
|
|
|
|
LATEST_BACKUP="$(
|
|
rclone lsf "B2S3:fluxer" --recursive --files-only --fast-list \
|
|
| grep -E '(^|/)cassandra-backup-[0-9]{8}-[0-9]{6}\.tar\.age$' \
|
|
| sort -r \
|
|
| head -n 1
|
|
)"
|
|
|
|
if [ -z "${LATEST_BACKUP}" ]; then
|
|
echo "Error: No backup found in bucket"
|
|
exit 1
|
|
fi
|
|
|
|
echo "LATEST_BACKUP=${LATEST_BACKUP}" >> "$GITHUB_ENV"
|
|
|
|
base="$(basename "${LATEST_BACKUP}")"
|
|
ts="${base#cassandra-backup-}"
|
|
ts="${ts%.tar.age}"
|
|
|
|
if ! [[ "$ts" =~ ^[0-9]{8}-[0-9]{6}$ ]]; then
|
|
echo "Error: Could not extract timestamp from backup filename: ${base}"
|
|
exit 1
|
|
fi
|
|
|
|
BACKUP_EPOCH="$(date -u -d "${ts:0:8} ${ts:9:2}:${ts:11:2}:${ts:13:2}" +%s)"
|
|
CURRENT_EPOCH="$(date -u +%s)"
|
|
AGE_HOURS=$(( (CURRENT_EPOCH - BACKUP_EPOCH) / 3600 ))
|
|
|
|
echo "Backup age: ${AGE_HOURS} hours"
|
|
if [ "${AGE_HOURS}" -ge 3 ]; then
|
|
echo "Error: Latest backup is ${AGE_HOURS} hours old (threshold: 3 hours)"
|
|
exit 1
|
|
fi
|
|
|
|
rclone copyto "B2S3:fluxer/${LATEST_BACKUP}" "${WORKDIR}/backup.tar.age" --fast-list
|
|
|
|
umask 077
|
|
printf '%s' "${AGE_PRIVATE_KEY}" > "${WORKDIR}/age.key"
|
|
|
|
docker volume create "${BACKUP_VOLUME}"
|
|
|
|
age -d -i "${WORKDIR}/age.key" "${WORKDIR}/backup.tar.age" \
|
|
| docker run --rm -i \
|
|
-v "${BACKUP_VOLUME}:/backup" \
|
|
--entrypoint bash \
|
|
"${CASSANDRA_IMAGE}" -lc '
|
|
set -euo pipefail
|
|
rm -rf /backup/*
|
|
mkdir -p /backup/_tmp
|
|
tar -C /backup/_tmp -xf -
|
|
|
|
top="$(find /backup/_tmp -maxdepth 1 -mindepth 1 -type d -name "cassandra-backup-*" | head -n 1 || true)"
|
|
|
|
if [ -n "$top" ] && [ -f "$top/schema.cql" ]; then
|
|
cp -a "$top"/. /backup/
|
|
elif [ -f /backup/_tmp/schema.cql ]; then
|
|
cp -a /backup/_tmp/. /backup/
|
|
else
|
|
echo "Error: schema.cql not found after extraction"
|
|
find /backup/_tmp -maxdepth 3 -type f -print | sed -n "1,80p" || true
|
|
exit 1
|
|
fi
|
|
|
|
rm -rf /backup/_tmp
|
|
'
|
|
|
|
docker run --rm \
|
|
-v "${BACKUP_VOLUME}:/backup:ro" \
|
|
--entrypoint bash \
|
|
"${CASSANDRA_IMAGE}" -lc '
|
|
set -euo pipefail
|
|
test -f /backup/schema.cql
|
|
echo "Extracted backup layout (top 3 levels):"
|
|
find /backup -maxdepth 3 -type d -print | sed -n "1,200p" || true
|
|
echo "Sample SSTables (*Data.db):"
|
|
find /backup -type f -name "*Data.db" | sed -n "1,30p" || true
|
|
'
|
|
""",
|
|
"create_data_volume": """
|
|
set -euo pipefail
|
|
docker volume create "${CASS_VOLUME}"
|
|
""",
|
|
"restore_keyspaces": """
|
|
set -euo pipefail
|
|
|
|
docker run --rm \
|
|
--name "${UTIL_CONTAINER}" \
|
|
-v "${CASS_VOLUME}:/var/lib/cassandra" \
|
|
-v "${BACKUP_VOLUME}:/backup:ro" \
|
|
--entrypoint bash \
|
|
"${CASSANDRA_IMAGE}" -lc '
|
|
set -euo pipefail
|
|
shopt -s nullglob
|
|
|
|
BASE=/var/lib/cassandra
|
|
DATA_DIR="$BASE/data"
|
|
mkdir -p "$DATA_DIR" "$BASE/commitlog" "$BASE/hints" "$BASE/saved_caches"
|
|
|
|
ROOT=/backup
|
|
if [ -d "$ROOT/cassandra_data" ]; then ROOT="$ROOT/cassandra_data"; fi
|
|
if [ -d "$ROOT/data" ]; then ROOT="$ROOT/data"; fi
|
|
|
|
echo "Using backup ROOT=$ROOT"
|
|
echo "Restoring into DATA_DIR=$DATA_DIR"
|
|
|
|
restored=0
|
|
for keyspace_dir in "$ROOT"/*/; do
|
|
[ -d "$keyspace_dir" ] || continue
|
|
ks="$(basename "$keyspace_dir")"
|
|
|
|
if [ "$ks" = "system_schema" ] || ! [[ "$ks" =~ ^system ]]; then
|
|
echo "Restoring keyspace: $ks"
|
|
rm -rf "$DATA_DIR/$ks"
|
|
cp -a "$keyspace_dir" "$DATA_DIR/"
|
|
restored=$((restored + 1))
|
|
fi
|
|
done
|
|
|
|
if [ "$restored" -le 0 ]; then
|
|
echo "Error: No keyspaces restored from backup root: $ROOT"
|
|
echo "Debug: listing $ROOT:"
|
|
ls -la "$ROOT" || true
|
|
find "$ROOT" -maxdepth 2 -type d -print | sed -n "1,100p" || true
|
|
exit 1
|
|
fi
|
|
|
|
promoted=0
|
|
for ks_dir in "$DATA_DIR"/*/; do
|
|
[ -d "$ks_dir" ] || continue
|
|
ks="$(basename "$ks_dir")"
|
|
|
|
if [ "$ks" != "system_schema" ] && [[ "$ks" =~ ^system ]]; then
|
|
continue
|
|
fi
|
|
|
|
for table_dir in "$ks_dir"*/; do
|
|
[ -d "$table_dir" ] || continue
|
|
|
|
snap_root="$table_dir/snapshots"
|
|
[ -d "$snap_root" ] || continue
|
|
|
|
latest_snap="$(ls -1d "$snap_root"/*/ 2>/dev/null | sort -r | head -n 1 || true)"
|
|
[ -n "$latest_snap" ] || continue
|
|
|
|
files=( "$latest_snap"* )
|
|
if [ "${#files[@]}" -gt 0 ]; then
|
|
cp -av "${files[@]}" "$table_dir"
|
|
promoted=$((promoted + $(ls -1 "$latest_snap"/*Data.db 2>/dev/null | wc -l || true)))
|
|
fi
|
|
done
|
|
done
|
|
|
|
chown -R cassandra:cassandra "$BASE"
|
|
|
|
echo "Promoted Data.db files: $promoted"
|
|
if [ "$promoted" -le 0 ]; then
|
|
echo "Error: No *Data.db files were promoted out of snapshots"
|
|
echo "Debug: first snapshot dirs found:"
|
|
find "$DATA_DIR" -type d -path "*/snapshots/*" | sed -n "1,50p" || true
|
|
exit 1
|
|
fi
|
|
'
|
|
""",
|
|
"start_cassandra": """
|
|
set -euo pipefail
|
|
|
|
docker run -d \
|
|
--name "${CASS_CONTAINER}" \
|
|
-v "${CASS_VOLUME}:/var/lib/cassandra" \
|
|
-e MAX_HEAP_SIZE="${MAX_HEAP_SIZE}" \
|
|
-e HEAP_NEWSIZE="${HEAP_NEWSIZE}" \
|
|
-e JVM_OPTS="-Dcassandra.disable_mlock=true" \
|
|
"${CASSANDRA_IMAGE}"
|
|
|
|
for i in $(seq 1 150); do
|
|
status="$(docker inspect -f '{{.State.Status}}' "${CASS_CONTAINER}" 2>/dev/null || true)"
|
|
if [ "${status}" != "running" ]; then
|
|
docker inspect "${CASS_CONTAINER}" --format 'ExitCode={{.State.ExitCode}} OOMKilled={{.State.OOMKilled}} Error={{.State.Error}}' || true
|
|
docker logs --tail 300 "${CASS_CONTAINER}" || true
|
|
exit 1
|
|
fi
|
|
if docker exec "${CASS_CONTAINER}" cqlsh -e "SELECT now() FROM system.local;" >/dev/null 2>&1; then
|
|
break
|
|
fi
|
|
sleep 2
|
|
done
|
|
|
|
docker exec "${CASS_CONTAINER}" cqlsh -e "SELECT now() FROM system.local;" >/dev/null 2>&1
|
|
""",
|
|
"verify_data": """
|
|
set -euo pipefail
|
|
|
|
USER_COUNT=""
|
|
for i in $(seq 1 20); do
|
|
USER_COUNT="$(
|
|
docker exec "${CASS_CONTAINER}" cqlsh -e "SELECT COUNT(*) FROM fluxer.users;" 2>/dev/null \
|
|
| awk "/^[[:space:]]*[0-9]+[[:space:]]*$/ {print \$1; exit}" || true
|
|
)"
|
|
if [ -n "${USER_COUNT}" ]; then
|
|
break
|
|
fi
|
|
sleep 2
|
|
done
|
|
|
|
if [ -n "${USER_COUNT}" ] && [ "${USER_COUNT}" -gt 0 ] 2>/dev/null; then
|
|
echo "Backup restore verification passed"
|
|
else
|
|
echo "Backup restore verification failed"
|
|
docker logs --tail 300 "${CASS_CONTAINER}" || true
|
|
exit 1
|
|
fi
|
|
""",
|
|
"cleanup": """
|
|
set -euo pipefail
|
|
docker rm -f "${CASS_CONTAINER}" 2>/dev/null || true
|
|
docker volume rm "${CASS_VOLUME}" 2>/dev/null || true
|
|
docker volume rm "${BACKUP_VOLUME}" 2>/dev/null || true
|
|
rm -rf "${WORKDIR}" 2>/dev/null || true
|
|
""",
|
|
"report_status": """
|
|
set -euo pipefail
|
|
LATEST_BACKUP_NAME="${LATEST_BACKUP:-unknown}"
|
|
if [ "${JOB_STATUS}" = "success" ]; then
|
|
echo "Backup ${LATEST_BACKUP_NAME} is valid and restorable"
|
|
else
|
|
echo "Backup ${LATEST_BACKUP_NAME} test failed"
|
|
fi
|
|
""",
|
|
}
|
|
|
|
|
|
def main() -> int:
|
|
args = parse_step_env_args()
|
|
run_step(STEPS, args.step)
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
raise SystemExit(main())
|