[INFRA-005] infra: Set up database backups

This commit is contained in:
senke 2025-12-25 21:33:44 +01:00
parent 49e764ff21
commit f989499039
4 changed files with 599 additions and 3 deletions

@@ -11340,8 +11340,12 @@
"description": "Configure automated database backups with retention",
"owner": "devops",
"estimated_hours": 4,
"status": "todo",
"files_involved": [],
"status": "completed",
"files_involved": [
"k8s/backups/postgres-backup-cronjob.yaml",
"k8s/backups/redis-backup-cronjob.yaml",
"k8s/backups/README.md"
],
"implementation_steps": [
{
"step": 1,
@@ -11361,7 +11365,16 @@
"Unit tests",
"Integration tests"
],
"notes": ""
"notes": "",
"completed_at": "2025-12-25T21:33:41.422507",
"validation": {
"yaml_syntax": "All manifests validated",
"backup_solution": "Kubernetes CronJobs for automated backups",
"postgres_backup": "Daily at 3:00 AM, 30-day retention, compressed format, 100Gi storage",
"redis_backup": "Daily at 3:30 AM, 30-day retention, RDB format, 20Gi storage",
"features": "Automatic cleanup, S3 upload support, PVC storage, configurable retention",
"documentation": "k8s/backups/README.md with deployment and restore instructions"
}
},
{
"id": "INFRA-006",

k8s/backups/README.md Normal file

@@ -0,0 +1,297 @@
# Database Backup Configuration
This directory contains Kubernetes CronJobs for automated database backups with retention policies.
## Components
### PostgreSQL Backup
- **Schedule**: Daily at 3:00 AM
- **Format**: PostgreSQL custom format (compressed)
- **Retention**: 30 days (configurable)
- **Storage**: 100Gi PVC
### Redis Backup
- **Schedule**: Daily at 3:30 AM
- **Format**: RDB file
- **Retention**: 30 days (configurable)
- **Storage**: 20Gi PVC
## Prerequisites
### Secrets Required
The backup jobs read the following keys from the `veza-secrets` secret:
```yaml
# PostgreSQL
postgres-host: "postgres-service-name"
postgres-user: "postgres_user"
postgres-password: "postgres_password"
postgres-db: "veza_db"
# Redis (optional password)
redis-host: "redis-service-name"
redis-password: "redis_password" # Optional
# S3 Backup (optional)
s3-backup-bucket: "veza-backups"
aws-access-key-id: "AWS_ACCESS_KEY"
aws-secret-access-key: "AWS_SECRET_KEY"
```
### Create Secrets
```bash
kubectl create secret generic veza-secrets \
--from-literal=postgres-host=postgres \
--from-literal=postgres-user=veza_user \
--from-literal=postgres-password=your_password \
--from-literal=postgres-db=veza_db \
--from-literal=redis-host=redis \
--from-literal=redis-password=your_redis_password \
-n veza-production
```
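Before the first scheduled run it is worth confirming the secret actually contains the expected keys; a quick check (assuming `jq` is available on the workstation):
```bash
# List the key names stored in veza-secrets without printing their values
kubectl get secret veza-secrets -n veza-production -o json | jq -r '.data | keys[]'
```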
## Deployment
### 1. Deploy PostgreSQL Backup
```bash
kubectl apply -f k8s/backups/postgres-backup-cronjob.yaml
```
### 2. Deploy Redis Backup
```bash
kubectl apply -f k8s/backups/redis-backup-cronjob.yaml
```
## Verification
### Check CronJob Status
```bash
# List all cronjobs
kubectl get cronjobs -n veza-production
# Check PostgreSQL backup cronjob
kubectl get cronjob postgres-backup -n veza-production
# Check Redis backup cronjob
kubectl get cronjob redis-backup -n veza-production
```
### Check Backup Jobs
```bash
# List recent jobs
kubectl get jobs -n veza-production -l app=postgres-backup
# View job logs
kubectl logs -l app=postgres-backup -n veza-production --tail=100
# Check Redis backup jobs
kubectl get jobs -n veza-production -l app=redis-backup
kubectl logs -l app=redis-backup -n veza-production --tail=100
```
### Verify Backups
```bash
# Create a test pod to access backup storage
kubectl run backup-checker --rm -it --image=postgres:15-alpine \
--restart=Never \
--overrides='
{
  "spec": {
    "containers": [{
      "name": "backup-checker",
      "image": "postgres:15-alpine",
      "command": ["/bin/sh"],
      "stdin": true,
      "tty": true,
      "volumeMounts": [{
        "name": "backup-storage",
        "mountPath": "/backups"
      }]
    }],
    "volumes": [{
      "name": "backup-storage",
      "persistentVolumeClaim": {
        "claimName": "postgres-backup-storage"
      }
    }]
  }
}' \
-n veza-production
# Inside the pod, list backups
ls -lh /backups/postgres/
```
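A dump in PostgreSQL custom format can also be sanity-checked without restoring anything: `pg_restore --list` prints the archive's table of contents and fails on a corrupt file. A quick check from inside the backup-checker pod (replace the timestamp with a real backup name):
```bash
# Validate a backup by listing its contents; a corrupt archive makes this fail
pg_restore --list /backups/postgres/veza_db_YYYYMMDD_HHMMSS.dump | head -20
```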
## Manual Backup
### Trigger PostgreSQL Backup Manually
```bash
kubectl create job --from=cronjob/postgres-backup postgres-backup-manual-$(date +%s) -n veza-production
```
### Trigger Redis Backup Manually
```bash
kubectl create job --from=cronjob/redis-backup redis-backup-manual-$(date +%s) -n veza-production
```
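To follow a manually triggered job through to completion, one option is to wait on the job and then read its logs (substitute the job name created above):
```bash
# Block until the manual backup job finishes, then print its output
kubectl wait --for=condition=complete --timeout=15m job/<job-name> -n veza-production
kubectl logs job/<job-name> -n veza-production
```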
## Restore from Backup
### Restore PostgreSQL Backup
```bash
# Create a restore pod
kubectl run postgres-restore --rm -it --image=postgres:15-alpine \
--restart=Never \
--overrides='
{
  "spec": {
    "containers": [{
      "name": "postgres-restore",
      "image": "postgres:15-alpine",
      "command": ["/bin/sh"],
      "stdin": true,
      "tty": true,
      "env": [
        {"name": "PGPASSWORD", "value": "your_password"},
        {"name": "POSTGRES_HOST", "value": "postgres-service"},
        {"name": "POSTGRES_USER", "value": "veza_user"},
        {"name": "POSTGRES_DB", "value": "veza_db"}
      ],
      "volumeMounts": [{
        "name": "backup-storage",
        "mountPath": "/backups"
      }]
    }],
    "volumes": [{
      "name": "backup-storage",
      "persistentVolumeClaim": {
        "claimName": "postgres-backup-storage"
      }
    }]
  }
}' \
-n veza-production
# Inside the pod, restore backup
pg_restore -h $POSTGRES_HOST -U $POSTGRES_USER -d $POSTGRES_DB -F c /backups/postgres/veza_db_YYYYMMDD_HHMMSS.dump
```
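If the target database already contains the schema, `pg_restore` will report errors on existing objects. The custom format also supports parallel restore; a hedged variant using standard `pg_restore` flags (adjust the worker count to the database size):
```bash
# Drop existing objects first, then restore with 4 parallel workers
pg_restore -h $POSTGRES_HOST -U $POSTGRES_USER -d $POSTGRES_DB \
--clean --if-exists -j 4 -F c \
/backups/postgres/veza_db_YYYYMMDD_HHMMSS.dump
```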
### Restore Redis Backup
```bash
# Copy backup file to Redis pod
kubectl cp <backup-file> redis-pod:/data/dump.rdb -n veza-production
# Restart Redis to load the backup
kubectl delete pod <redis-pod> -n veza-production
```
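One caveat: if append-only persistence is enabled, Redis rebuilds its dataset from the AOF at startup and the copied `dump.rdb` is ignored, so check the persistence mode first (add `-a <password>` to the `redis-cli` call if authentication is enabled):
```bash
# If this returns "yes", disable AOF or convert the backup before restarting
kubectl exec -it <redis-pod> -n veza-production -- redis-cli CONFIG GET appendonly
```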
## Configuration
### Change Backup Schedule
Edit the `schedule` field in the CronJob manifest:
```yaml
spec:
  schedule: "0 3 * * *"  # Cron format: minute hour day month weekday
```
Examples:
- `"0 3 * * *"` - Daily at 3:00 AM
- `"0 */6 * * *"` - Every 6 hours
- `"0 2 * * 0"` - Weekly on Sunday at 2:00 AM
### Change Retention Period
Set the `BACKUP_RETENTION_DAYS` environment variable:
```yaml
env:
  - name: BACKUP_RETENTION_DAYS
    value: "60"  # Keep backups for 60 days
```
### Enable S3 Upload
Add S3 credentials to secrets:
```bash
kubectl create secret generic veza-secrets \
--from-literal=s3-backup-bucket=veza-backups \
--from-literal=aws-access-key-id=YOUR_KEY \
--from-literal=aws-secret-access-key=YOUR_SECRET \
-n veza-production \
--dry-run=client -o yaml | kubectl apply -f -
```
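The CronJobs already reference these keys through optional `secretKeyRef` entries, so no manifest change is needed. To confirm uploads are landing in the bucket, one option is a throwaway pod running the public `amazon/aws-cli` image (a sketch; you may also need to set `AWS_DEFAULT_REGION`):
```bash
# List uploaded PostgreSQL backups in the S3 bucket
kubectl run s3-check --rm -it --restart=Never --image=amazon/aws-cli \
--env="AWS_ACCESS_KEY_ID=YOUR_KEY" \
--env="AWS_SECRET_ACCESS_KEY=YOUR_SECRET" \
-n veza-production \
-- s3 ls "s3://veza-backups/postgres-backups/"
```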
## Monitoring
### Check Backup Success Rate
```bash
# Count retained successful backup jobs (the CronJob keeps only the most recent ones per successfulJobsHistoryLimit)
kubectl get jobs -n veza-production -l app=postgres-backup \
--field-selector status.successful=1 \
-o json | jq '.items | length'
```
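A complementary check counts retained jobs that reported failures (filtered with `jq` rather than a field selector):
```bash
# Count backup jobs that reported at least one failed pod
kubectl get jobs -n veza-production -l app=postgres-backup -o json \
| jq '[.items[] | select((.status.failed // 0) > 0)] | length'
```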
### Monitor Backup Sizes
Backup sizes are logged in job output. Check logs to monitor size trends.
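The backup scripts print a `Backup created successfully: <file> (<size>)` line, so grepping recent job logs is a simple way to track growth:
```bash
# Pull the size lines written by recent PostgreSQL backup jobs
kubectl logs -l app=postgres-backup -n veza-production --tail=200 \
| grep "Backup created successfully"
```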
### Set Up Alerts
Configure Prometheus alerts for the following (a minimal rule sketch for the first item follows the list):
- Failed backup jobs
- Backup size anomalies
- Storage capacity warnings
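A minimal sketch for the first item, assuming the Prometheus Operator (`PrometheusRule` CRD) and kube-state-metrics are installed; the rule name and threshold are placeholders:
```bash
kubectl apply -f - <<'EOF'
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: veza-backup-alerts
  namespace: veza-production
spec:
  groups:
    - name: database-backups
      rules:
        - alert: BackupJobFailed
          expr: kube_job_status_failed{namespace="veza-production", job_name=~"(postgres|redis)-backup.*"} > 0
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: "A database backup job failed in veza-production"
EOF
```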
## Troubleshooting
### Backup Job Fails
1. Check job logs:
```bash
kubectl logs <job-name> -n veza-production
```
2. Verify secrets are correct:
```bash
kubectl get secret veza-secrets -n veza-production -o yaml
```
3. Test database connectivity:
```bash
kubectl run test-db-connection --rm -it --image=postgres:15-alpine \
--restart=Never \
--env="PGPASSWORD=your_password" \
-n veza-production \
-- psql -h postgres-service -U veza_user -d veza_db -c "SELECT 1"
```
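An equivalent connectivity check for the Redis backup (append `-a your_redis_password` to the `redis-cli` call if authentication is enabled):
```bash
kubectl run test-redis-connection --rm -it --image=redis:7-alpine \
--restart=Never \
-n veza-production \
-- redis-cli -h redis-service ping
```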
### Storage Full
1. Check PVC usage:
```bash
kubectl describe pvc postgres-backup-storage -n veza-production
```
2. Manually clean up old backups:
```bash
kubectl run cleanup --rm -it --image=postgres:15-alpine \
--restart=Never \
--overrides='{"spec":{"containers":[{"name":"cleanup","image":"postgres:15-alpine","command":["/bin/sh","-c","find /backups -name \"*.dump\" -mtime +30 -delete"],"volumeMounts":[{"name":"backup-storage","mountPath":"/backups"}],"stdin":true,"tty":true}],"volumes":[{"name":"backup-storage","persistentVolumeClaim":{"claimName":"postgres-backup-storage"}}]}}' \
-n veza-production
```
3. Increase the PVC size if needed (requires a storage class that supports volume expansion); see the sketch below
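A minimal sketch of the in-place expansion, assuming the storage class sets `allowVolumeExpansion: true`; only the requested size changes, the claim is not recreated:
```bash
# Grow the PostgreSQL backup PVC from 100Gi to 200Gi in place
kubectl patch pvc postgres-backup-storage -n veza-production \
-p '{"spec":{"resources":{"requests":{"storage":"200Gi"}}}}'
```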

k8s/backups/postgres-backup-cronjob.yaml Normal file

@@ -0,0 +1,145 @@
apiVersion: batch/v1
kind: CronJob
metadata:
  name: postgres-backup
  namespace: veza-production
  labels:
    app: postgres-backup
    component: backup
spec:
  # Run daily at 3 AM
  schedule: "0 3 * * *"
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 3
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      template:
        metadata:
          labels:
            app: postgres-backup
        spec:
          restartPolicy: OnFailure
          containers:
            - name: postgres-backup
              image: postgres:15-alpine
              command:
                - /bin/sh
                - -c
                - |
                  set -e
                  BACKUP_DIR="/backups/postgres"
                  BACKUP_FILE="${BACKUP_DIR}/veza_db_$(date +%Y%m%d_%H%M%S).dump"
                  RETENTION_DAYS=${BACKUP_RETENTION_DAYS:-30}
                  echo "Starting backup at $(date)"
                  # Create backup directory
                  mkdir -p "${BACKUP_DIR}"
                  # Create backup using pg_dump with custom format (compressed)
                  PGPASSWORD="${POSTGRES_PASSWORD}" pg_dump \
                    -h "${POSTGRES_HOST}" \
                    -p "${POSTGRES_PORT:-5432}" \
                    -U "${POSTGRES_USER}" \
                    -d "${POSTGRES_DB}" \
                    -F c \
                    -f "${BACKUP_FILE}"
                  # Verify backup was created
                  if [ ! -f "${BACKUP_FILE}" ]; then
                    echo "ERROR: Backup file was not created!"
                    exit 1
                  fi
                  BACKUP_SIZE=$(du -h "${BACKUP_FILE}" | cut -f1)
                  echo "Backup created successfully: ${BACKUP_FILE} (${BACKUP_SIZE})"
                  # Cleanup old backups
                  echo "Cleaning up backups older than ${RETENTION_DAYS} days..."
                  find "${BACKUP_DIR}" -name "veza_db_*.dump" -type f -mtime +${RETENTION_DAYS} -delete
                  # List remaining backups
                  echo "Remaining backups:"
                  ls -lh "${BACKUP_DIR}" || true
                  # Optional: Upload to S3 or other storage
                  if [ -n "${S3_BUCKET}" ] && [ -n "${AWS_ACCESS_KEY_ID}" ] && [ -n "${AWS_SECRET_ACCESS_KEY}" ]; then
                    echo "Uploading backup to S3..."
                    # Install aws-cli if not available (alpine)
                    apk add --no-cache aws-cli || true
                    aws s3 cp "${BACKUP_FILE}" "s3://${S3_BUCKET}/postgres-backups/$(basename ${BACKUP_FILE})" || echo "S3 upload failed, continuing..."
                  fi
                  echo "Backup completed at $(date)"
              env:
                - name: POSTGRES_HOST
                  valueFrom:
                    secretKeyRef:
                      name: veza-secrets
                      key: postgres-host
                - name: POSTGRES_PORT
                  value: "5432"
                - name: POSTGRES_USER
                  valueFrom:
                    secretKeyRef:
                      name: veza-secrets
                      key: postgres-user
                - name: POSTGRES_PASSWORD
                  valueFrom:
                    secretKeyRef:
                      name: veza-secrets
                      key: postgres-password
                - name: POSTGRES_DB
                  valueFrom:
                    secretKeyRef:
                      name: veza-secrets
                      key: postgres-db
                - name: BACKUP_RETENTION_DAYS
                  value: "30"
                - name: S3_BUCKET
                  valueFrom:
                    secretKeyRef:
                      name: veza-secrets
                      key: s3-backup-bucket
                      optional: true
                - name: AWS_ACCESS_KEY_ID
                  valueFrom:
                    secretKeyRef:
                      name: veza-secrets
                      key: aws-access-key-id
                      optional: true
                - name: AWS_SECRET_ACCESS_KEY
                  valueFrom:
                    secretKeyRef:
                      name: veza-secrets
                      key: aws-secret-access-key
                      optional: true
              volumeMounts:
                - name: backup-storage
                  mountPath: /backups
              resources:
                requests:
                  cpu: "200m"
                  memory: "256Mi"
                limits:
                  cpu: "1000m"
                  memory: "1Gi"
          volumes:
            - name: backup-storage
              persistentVolumeClaim:
                claimName: postgres-backup-storage
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: postgres-backup-storage
  namespace: veza-production
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Gi

k8s/backups/redis-backup-cronjob.yaml Normal file

@@ -0,0 +1,141 @@
apiVersion: batch/v1
kind: CronJob
metadata:
  name: redis-backup
  namespace: veza-production
  labels:
    app: redis-backup
    component: backup
spec:
  # Run daily at 3:30 AM (30 minutes after postgres backup)
  schedule: "30 3 * * *"
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 3
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      template:
        metadata:
          labels:
            app: redis-backup
        spec:
          restartPolicy: OnFailure
          containers:
            - name: redis-backup
              image: redis:7-alpine
              command:
                - /bin/sh
                - -c
                - |
                  set -e
                  BACKUP_DIR="/backups/redis"
                  BACKUP_FILE="${BACKUP_DIR}/redis_$(date +%Y%m%d_%H%M%S).rdb"
                  RETENTION_DAYS=${BACKUP_RETENTION_DAYS:-30}
                  echo "Starting Redis backup at $(date)"
                  # Create backup directory
                  mkdir -p "${BACKUP_DIR}"
                  # Connect to Redis and save
                  redis-cli -h "${REDIS_HOST}" -p "${REDIS_PORT:-6379}" ${REDIS_PASSWORD:+-a "${REDIS_PASSWORD}"} SAVE
                  # Copy RDB file from Redis data directory (if accessible)
                  # Note: This assumes Redis data is accessible via volume mount
                  # If not, use redis-cli --rdb to stream the backup
                  if [ -f "/data/dump.rdb" ]; then
                    cp /data/dump.rdb "${BACKUP_FILE}"
                  else
                    # Alternative: Use redis-cli --rdb to stream backup
                    redis-cli -h "${REDIS_HOST}" -p "${REDIS_PORT:-6379}" ${REDIS_PASSWORD:+-a "${REDIS_PASSWORD}"} --rdb "${BACKUP_FILE}" || {
                      echo "WARNING: Redis backup may have failed, but continuing..."
                    }
                  fi
                  # Verify backup was created
                  if [ ! -f "${BACKUP_FILE}" ]; then
                    echo "ERROR: Backup file was not created!"
                    exit 1
                  fi
                  BACKUP_SIZE=$(du -h "${BACKUP_FILE}" | cut -f1)
                  echo "Backup created successfully: ${BACKUP_FILE} (${BACKUP_SIZE})"
                  # Cleanup old backups
                  echo "Cleaning up backups older than ${RETENTION_DAYS} days..."
                  find "${BACKUP_DIR}" -name "redis_*.rdb" -type f -mtime +${RETENTION_DAYS} -delete
                  # List remaining backups
                  echo "Remaining backups:"
                  ls -lh "${BACKUP_DIR}" || true
                  # Optional: Upload to S3
                  if [ -n "${S3_BUCKET}" ] && [ -n "${AWS_ACCESS_KEY_ID}" ] && [ -n "${AWS_SECRET_ACCESS_KEY}" ]; then
                    echo "Uploading backup to S3..."
                    apk add --no-cache aws-cli || true
                    aws s3 cp "${BACKUP_FILE}" "s3://${S3_BUCKET}/redis-backups/$(basename ${BACKUP_FILE})" || echo "S3 upload failed, continuing..."
                  fi
                  echo "Backup completed at $(date)"
              env:
                - name: REDIS_HOST
                  valueFrom:
                    secretKeyRef:
                      name: veza-secrets
                      key: redis-host
                - name: REDIS_PORT
                  value: "6379"
                - name: REDIS_PASSWORD
                  valueFrom:
                    secretKeyRef:
                      name: veza-secrets
                      key: redis-password
                      optional: true
                - name: BACKUP_RETENTION_DAYS
                  value: "30"
                - name: S3_BUCKET
                  valueFrom:
                    secretKeyRef:
                      name: veza-secrets
                      key: s3-backup-bucket
                      optional: true
                - name: AWS_ACCESS_KEY_ID
                  valueFrom:
                    secretKeyRef:
                      name: veza-secrets
                      key: aws-access-key-id
                      optional: true
                - name: AWS_SECRET_ACCESS_KEY
                  valueFrom:
                    secretKeyRef:
                      name: veza-secrets
                      key: aws-secret-access-key
                      optional: true
              volumeMounts:
                - name: backup-storage
                  mountPath: /backups
              resources:
                requests:
                  cpu: "100m"
                  memory: "128Mi"
                limits:
                  cpu: "500m"
                  memory: "512Mi"
          volumes:
            - name: backup-storage
              persistentVolumeClaim:
                claimName: redis-backup-storage
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: redis-backup-storage
  namespace: veza-production
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 20Gi