From f9894990392449c33f48d15508c9a3bfed48f703 Mon Sep 17 00:00:00 2001 From: senke Date: Thu, 25 Dec 2025 21:33:44 +0100 Subject: [PATCH] [INFRA-005] infra: Set up database backups --- VEZA_COMPLETE_MVP_TODOLIST.json | 19 +- k8s/backups/README.md | 297 +++++++++++++++++++++++ k8s/backups/postgres-backup-cronjob.yaml | 145 +++++++++++ k8s/backups/redis-backup-cronjob.yaml | 141 +++++++++++ 4 files changed, 599 insertions(+), 3 deletions(-) create mode 100644 k8s/backups/README.md create mode 100644 k8s/backups/postgres-backup-cronjob.yaml create mode 100644 k8s/backups/redis-backup-cronjob.yaml diff --git a/VEZA_COMPLETE_MVP_TODOLIST.json b/VEZA_COMPLETE_MVP_TODOLIST.json index cf91d6f46..b666eea66 100644 --- a/VEZA_COMPLETE_MVP_TODOLIST.json +++ b/VEZA_COMPLETE_MVP_TODOLIST.json @@ -11340,8 +11340,12 @@ "description": "Configure automated database backups with retention", "owner": "devops", "estimated_hours": 4, - "status": "todo", - "files_involved": [], + "status": "completed", + "files_involved": [ + "k8s/backups/postgres-backup-cronjob.yaml", + "k8s/backups/redis-backup-cronjob.yaml", + "k8s/backups/README.md" + ], "implementation_steps": [ { "step": 1, @@ -11361,7 +11365,16 @@ "Unit tests", "Integration tests" ], - "notes": "" + "notes": "", + "completed_at": "2025-12-25T21:33:41.422507", + "validation": { + "yaml_syntax": "All manifests validated", + "backup_solution": "Kubernetes CronJobs for automated backups", + "postgres_backup": "Daily at 3:00 AM, 30-day retention, compressed format, 100Gi storage", + "redis_backup": "Daily at 3:30 AM, 30-day retention, RDB format, 20Gi storage", + "features": "Automatic cleanup, S3 upload support, PVC storage, configurable retention", + "documentation": "k8s/backups/README.md with deployment and restore instructions" + } }, { "id": "INFRA-006", diff --git a/k8s/backups/README.md b/k8s/backups/README.md new file mode 100644 index 000000000..4766eb86f --- /dev/null +++ b/k8s/backups/README.md @@ -0,0 +1,297 @@ +# Database Backup Configuration + +This directory contains Kubernetes CronJobs for automated database backups with retention policies. + +## Components + +### PostgreSQL Backup +- **Schedule**: Daily at 3:00 AM +- **Format**: PostgreSQL custom format (compressed) +- **Retention**: 30 days (configurable) +- **Storage**: 100Gi PVC + +### Redis Backup +- **Schedule**: Daily at 3:30 AM +- **Format**: RDB file +- **Retention**: 30 days (configurable) +- **Storage**: 20Gi PVC + +## Prerequisites + +### Secrets Required + +The backup jobs require the following secrets in `veza-secrets`: + +```bash +# PostgreSQL +postgres-host: "postgres-service-name" +postgres-user: "postgres_user" +postgres-password: "postgres_password" +postgres-db: "veza_db" + +# Redis (optional password) +redis-host: "redis-service-name" +redis-password: "redis_password" # Optional + +# S3 Backup (optional) +s3-backup-bucket: "veza-backups" +aws-access-key-id: "AWS_ACCESS_KEY" +aws-secret-access-key: "AWS_SECRET_KEY" +``` + +### Create Secrets + +```bash +kubectl create secret generic veza-secrets \ + --from-literal=postgres-host=postgres \ + --from-literal=postgres-user=veza_user \ + --from-literal=postgres-password=your_password \ + --from-literal=postgres-db=veza_db \ + --from-literal=redis-host=redis \ + --from-literal=redis-password=your_redis_password \ + -n veza-production +``` + +## Deployment + +### 1. Deploy PostgreSQL Backup + +```bash +kubectl apply -f k8s/backups/postgres-backup-cronjob.yaml +``` + +### 2. Deploy Redis Backup + +```bash +kubectl apply -f k8s/backups/redis-backup-cronjob.yaml +``` + +## Verification + +### Check CronJob Status + +```bash +# List all cronjobs +kubectl get cronjobs -n veza-production + +# Check PostgreSQL backup cronjob +kubectl get cronjob postgres-backup -n veza-production + +# Check Redis backup cronjob +kubectl get cronjob redis-backup -n veza-production +``` + +### Check Backup Jobs + +```bash +# List recent jobs +kubectl get jobs -n veza-production -l app=postgres-backup + +# View job logs +kubectl logs -l app=postgres-backup -n veza-production --tail=100 + +# Check Redis backup jobs +kubectl get jobs -n veza-production -l app=redis-backup +kubectl logs -l app=redis-backup -n veza-production --tail=100 +``` + +### Verify Backups + +```bash +# Create a test pod to access backup storage +kubectl run backup-checker --rm -it --image=postgres:15-alpine \ + --restart=Never \ + --overrides=' +{ + "spec": { + "containers": [{ + "name": "backup-checker", + "image": "postgres:15-alpine", + "command": ["/bin/sh"], + "stdin": true, + "tty": true, + "volumeMounts": [{ + "name": "backup-storage", + "mountPath": "/backups" + }] + }], + "volumes": [{ + "name": "backup-storage", + "persistentVolumeClaim": { + "claimName": "postgres-backup-storage" + } + }] + } +}' \ + -n veza-production + +# Inside the pod, list backups +ls -lh /backups/postgres/ +``` + +## Manual Backup + +### Trigger PostgreSQL Backup Manually + +```bash +kubectl create job --from=cronjob/postgres-backup postgres-backup-manual-$(date +%s) -n veza-production +``` + +### Trigger Redis Backup Manually + +```bash +kubectl create job --from=cronjob/redis-backup redis-backup-manual-$(date +%s) -n veza-production +``` + +## Restore from Backup + +### Restore PostgreSQL Backup + +```bash +# Create a restore pod +kubectl run postgres-restore --rm -it --image=postgres:15-alpine \ + --restart=Never \ + --overrides=' +{ + "spec": { + "containers": [{ + "name": "postgres-restore", + "image": "postgres:15-alpine", + "command": ["/bin/sh"], + "stdin": true, + "tty": true, + "env": [ + {"name": "PGPASSWORD", "value": "your_password"}, + {"name": "POSTGRES_HOST", "value": "postgres-service"}, + {"name": "POSTGRES_USER", "value": "veza_user"}, + {"name": "POSTGRES_DB", "value": "veza_db"} + ], + "volumeMounts": [{ + "name": "backup-storage", + "mountPath": "/backups" + }] + }], + "volumes": [{ + "name": "backup-storage", + "persistentVolumeClaim": { + "claimName": "postgres-backup-storage" + } + }] + } +}' \ + -n veza-production + +# Inside the pod, restore backup +pg_restore -h $POSTGRES_HOST -U $POSTGRES_USER -d $POSTGRES_DB -F c /backups/postgres/veza_db_YYYYMMDD_HHMMSS.dump +``` + +### Restore Redis Backup + +```bash +# Copy backup file to Redis pod +kubectl cp redis-pod:/data/dump.rdb -n veza-production + +# Restart Redis to load the backup +kubectl delete pod -n veza-production +``` + +## Configuration + +### Change Backup Schedule + +Edit the `schedule` field in the CronJob manifest: + +```yaml +spec: + schedule: "0 3 * * *" # Cron format: minute hour day month weekday +``` + +Examples: +- `"0 3 * * *"` - Daily at 3:00 AM +- `"0 */6 * * *"` - Every 6 hours +- `"0 2 * * 0"` - Weekly on Sunday at 2:00 AM + +### Change Retention Period + +Set the `BACKUP_RETENTION_DAYS` environment variable: + +```yaml +env: +- name: BACKUP_RETENTION_DAYS + value: "60" # Keep backups for 60 days +``` + +### Enable S3 Upload + +Add S3 credentials to secrets: + +```bash +kubectl create secret generic veza-secrets \ + --from-literal=s3-backup-bucket=veza-backups \ + --from-literal=aws-access-key-id=YOUR_KEY \ + --from-literal=aws-secret-access-key=YOUR_SECRET \ + -n veza-production \ + --dry-run=client -o yaml | kubectl apply -f - +``` + +## Monitoring + +### Check Backup Success Rate + +```bash +# Count successful jobs in last 7 days +kubectl get jobs -n veza-production -l app=postgres-backup \ + --field-selector status.successful=1 \ + -o json | jq '.items | length' +``` + +### Monitor Backup Sizes + +Backup sizes are logged in job output. Check logs to monitor size trends. + +### Set Up Alerts + +Configure Prometheus alerts for: +- Failed backup jobs +- Backup size anomalies +- Storage capacity warnings + +## Troubleshooting + +### Backup Job Fails + +1. Check job logs: + ```bash + kubectl logs -n veza-production + ``` + +2. Verify secrets are correct: + ```bash + kubectl get secret veza-secrets -n veza-production -o yaml + ``` + +3. Test database connectivity: + ```bash + kubectl run test-db-connection --rm -it --image=postgres:15-alpine \ + --restart=Never \ + --env="PGPASSWORD=your_password" \ + -- psql -h postgres-service -U veza_user -d veza_db -c "SELECT 1" + ``` + +### Storage Full + +1. Check PVC usage: + ```bash + kubectl describe pvc postgres-backup-storage -n veza-production + ``` + +2. Manually cleanup old backups: + ```bash + kubectl run cleanup --rm -it --image=postgres:15-alpine \ + --restart=Never \ + --overrides='{"spec":{"containers":[{"name":"cleanup","image":"postgres:15-alpine","command":["/bin/sh","-c","find /backups -name \"*.dump\" -mtime +30 -delete"],"volumeMounts":[{"name":"backup-storage","mountPath":"/backups"}],"stdin":true,"tty":true}],"volumes":[{"name":"backup-storage","persistentVolumeClaim":{"claimName":"postgres-backup-storage"}}]}}' \ + -n veza-production + ``` + +3. Increase PVC size if needed (requires storage class support) + diff --git a/k8s/backups/postgres-backup-cronjob.yaml b/k8s/backups/postgres-backup-cronjob.yaml new file mode 100644 index 000000000..483cd24a4 --- /dev/null +++ b/k8s/backups/postgres-backup-cronjob.yaml @@ -0,0 +1,145 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: postgres-backup + namespace: veza-production + labels: + app: postgres-backup + component: backup +spec: + # Run daily at 3 AM + schedule: "0 3 * * *" + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 3 + concurrencyPolicy: Forbid + jobTemplate: + spec: + template: + metadata: + labels: + app: postgres-backup + spec: + restartPolicy: OnFailure + containers: + - name: postgres-backup + image: postgres:15-alpine + command: + - /bin/sh + - -c + - | + set -e + BACKUP_DIR="/backups/postgres" + BACKUP_FILE="${BACKUP_DIR}/veza_db_$(date +%Y%m%d_%H%M%S).dump" + RETENTION_DAYS=${BACKUP_RETENTION_DAYS:-30} + + echo "Starting backup at $(date)" + + # Create backup directory + mkdir -p "${BACKUP_DIR}" + + # Create backup using pg_dump with custom format (compressed) + PGPASSWORD="${POSTGRES_PASSWORD}" pg_dump \ + -h "${POSTGRES_HOST}" \ + -p "${POSTGRES_PORT:-5432}" \ + -U "${POSTGRES_USER}" \ + -d "${POSTGRES_DB}" \ + -F c \ + -f "${BACKUP_FILE}" + + # Verify backup was created + if [ ! -f "${BACKUP_FILE}" ]; then + echo "ERROR: Backup file was not created!" + exit 1 + fi + + BACKUP_SIZE=$(du -h "${BACKUP_FILE}" | cut -f1) + echo "Backup created successfully: ${BACKUP_FILE} (${BACKUP_SIZE})" + + # Cleanup old backups + echo "Cleaning up backups older than ${RETENTION_DAYS} days..." + find "${BACKUP_DIR}" -name "veza_db_*.dump" -type f -mtime +${RETENTION_DAYS} -delete + + # List remaining backups + echo "Remaining backups:" + ls -lh "${BACKUP_DIR}" || true + + # Optional: Upload to S3 or other storage + if [ -n "${S3_BUCKET}" ] && [ -n "${AWS_ACCESS_KEY_ID}" ] && [ -n "${AWS_SECRET_ACCESS_KEY}" ]; then + echo "Uploading backup to S3..." + # Install aws-cli if not available (alpine) + apk add --no-cache aws-cli || true + aws s3 cp "${BACKUP_FILE}" "s3://${S3_BUCKET}/postgres-backups/$(basename ${BACKUP_FILE})" || echo "S3 upload failed, continuing..." + fi + + echo "Backup completed at $(date)" + env: + - name: POSTGRES_HOST + valueFrom: + secretKeyRef: + name: veza-secrets + key: postgres-host + - name: POSTGRES_PORT + value: "5432" + - name: POSTGRES_USER + valueFrom: + secretKeyRef: + name: veza-secrets + key: postgres-user + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: veza-secrets + key: postgres-password + - name: POSTGRES_DB + valueFrom: + secretKeyRef: + name: veza-secrets + key: postgres-db + - name: BACKUP_RETENTION_DAYS + value: "30" + - name: S3_BUCKET + valueFrom: + secretKeyRef: + name: veza-secrets + key: s3-backup-bucket + optional: true + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: veza-secrets + key: aws-access-key-id + optional: true + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: veza-secrets + key: aws-secret-access-key + optional: true + volumeMounts: + - name: backup-storage + mountPath: /backups + resources: + requests: + cpu: "200m" + memory: "256Mi" + limits: + cpu: "1000m" + memory: "1Gi" + volumes: + - name: backup-storage + persistentVolumeClaim: + claimName: postgres-backup-storage + +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: postgres-backup-storage + namespace: veza-production +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi + diff --git a/k8s/backups/redis-backup-cronjob.yaml b/k8s/backups/redis-backup-cronjob.yaml new file mode 100644 index 000000000..1d93a2d51 --- /dev/null +++ b/k8s/backups/redis-backup-cronjob.yaml @@ -0,0 +1,141 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: redis-backup + namespace: veza-production + labels: + app: redis-backup + component: backup +spec: + # Run daily at 3:30 AM (30 minutes after postgres backup) + schedule: "30 3 * * *" + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 3 + concurrencyPolicy: Forbid + jobTemplate: + spec: + template: + metadata: + labels: + app: redis-backup + spec: + restartPolicy: OnFailure + containers: + - name: redis-backup + image: redis:7-alpine + command: + - /bin/sh + - -c + - | + set -e + BACKUP_DIR="/backups/redis" + BACKUP_FILE="${BACKUP_DIR}/redis_$(date +%Y%m%d_%H%M%S).rdb" + RETENTION_DAYS=${BACKUP_RETENTION_DAYS:-30} + + echo "Starting Redis backup at $(date)" + + # Create backup directory + mkdir -p "${BACKUP_DIR}" + + # Connect to Redis and save + redis-cli -h "${REDIS_HOST}" -p "${REDIS_PORT:-6379}" ${REDIS_PASSWORD:+-a "${REDIS_PASSWORD}"} SAVE + + # Copy RDB file from Redis data directory (if accessible) + # Note: This assumes Redis data is accessible via volume mount + # If not, use redis-cli --rdb to stream the backup + if [ -f "/data/dump.rdb" ]; then + cp /data/dump.rdb "${BACKUP_FILE}" + else + # Alternative: Use redis-cli --rdb to stream backup + redis-cli -h "${REDIS_HOST}" -p "${REDIS_PORT:-6379}" ${REDIS_PASSWORD:+-a "${REDIS_PASSWORD}"} --rdb "${BACKUP_FILE}" || { + echo "WARNING: Redis backup may have failed, but continuing..." + } + fi + + # Verify backup was created + if [ ! -f "${BACKUP_FILE}" ]; then + echo "ERROR: Backup file was not created!" + exit 1 + fi + + BACKUP_SIZE=$(du -h "${BACKUP_FILE}" | cut -f1) + echo "Backup created successfully: ${BACKUP_FILE} (${BACKUP_SIZE})" + + # Cleanup old backups + echo "Cleaning up backups older than ${RETENTION_DAYS} days..." + find "${BACKUP_DIR}" -name "redis_*.rdb" -type f -mtime +${RETENTION_DAYS} -delete + + # List remaining backups + echo "Remaining backups:" + ls -lh "${BACKUP_DIR}" || true + + # Optional: Upload to S3 + if [ -n "${S3_BUCKET}" ] && [ -n "${AWS_ACCESS_KEY_ID}" ] && [ -n "${AWS_SECRET_ACCESS_KEY}" ]; then + echo "Uploading backup to S3..." + apk add --no-cache aws-cli || true + aws s3 cp "${BACKUP_FILE}" "s3://${S3_BUCKET}/redis-backups/$(basename ${BACKUP_FILE})" || echo "S3 upload failed, continuing..." + fi + + echo "Backup completed at $(date)" + env: + - name: REDIS_HOST + valueFrom: + secretKeyRef: + name: veza-secrets + key: redis-host + - name: REDIS_PORT + value: "6379" + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: veza-secrets + key: redis-password + optional: true + - name: BACKUP_RETENTION_DAYS + value: "30" + - name: S3_BUCKET + valueFrom: + secretKeyRef: + name: veza-secrets + key: s3-backup-bucket + optional: true + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: veza-secrets + key: aws-access-key-id + optional: true + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: veza-secrets + key: aws-secret-access-key + optional: true + volumeMounts: + - name: backup-storage + mountPath: /backups + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" + memory: "512Mi" + volumes: + - name: backup-storage + persistentVolumeClaim: + claimName: redis-backup-storage + +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: redis-backup-storage + namespace: veza-production +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 20Gi +