veza/scripts/bfg-cleanup.sh

239 lines
7.5 KiB
Bash
Raw Permalink Normal View History

#!/usr/bin/env bash
# ============================================================
# BFG history cleanup for Veza monorepo
# ============================================================
# Goal: strip committed audio (.mp3/.wav), certs (.pem/.key/.crt),
# Go binaries, and AI session artefacts from git history, then
# compact .git from ~2.3 GB down to an expected <500 MB.
#
# WHEN TO RUN: after commits 98ee449f4 + 1f00fb762 (untrack debris
# + dev key regen) have been pushed to origin and reviewed.
#
# CHOICE: this script uses `git-filter-repo` (modern, fast, pure
# Python). BFG (Java) is supported as a fallback — set
# USE_BFG=1 to force it.
#
# ============================================================
# SAFETY MODEL
# ============================================================
# This script NEVER force-pushes by itself. It:
# 1. Verifies prereqs
# 2. Clones repo as bare mirror to /tmp/veza-bfg.git
# 3. Strips blobs > SIZE_THRESHOLD
# 4. Strips files matching FILE_PATTERNS
# 5. Runs aggressive gc
# 6. Prints size-before / size-after
# 7. Prints the exact force-push commands for YOU to run manually
#
# You verify the bare clone by hand before force-pushing. No surprises.
#
# ============================================================
# PREREQS
# ============================================================
# git-filter-repo: pip install --user git-filter-repo
# OR: https://github.com/newren/git-filter-repo
# (fallback) BFG: https://rtyley.github.io/bfg-repo-cleaner/
# Requires Java 8+. `brew install bfg`, or download the .jar and point
# BFG_JAR at it (default: /usr/local/lib/bfg.jar).
#
# ============================================================
set -euo pipefail
# ---------- CONFIG ----------
# Repository root = parent of the directory that holds this script.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# The three settings below can be overridden from the environment.
BARE_CLONE="${BARE_CLONE:-/tmp/veza-bfg.git}"
SIZE_THRESHOLD="${SIZE_THRESHOLD:-5M}"
USE_BFG="${USE_BFG:-0}"
# Files to strip from ALL history (even if they're <SIZE_THRESHOLD).
# Match syntax differs: git-filter-repo uses glob, BFG uses bare name.
#
# git-filter-repo matches --path-glob against the full repository path with
# fnmatch semantics, where `*` also crosses `/`. A leading `*` therefore
# matches at every depth INCLUDING the repo root; a `**/` prefix would demand
# at least one directory component and silently skip root-level files.
FILE_PATTERNS_FILTERREPO=(
  # Audio uploads (44 files, up to 26 MB each)
  "veza-backend-api/uploads/*.mp3"
  "veza-backend-api/uploads/*.wav"
  "veza-backend-api/uploads/*.flac"
  "veza-backend-api/uploads/*.ogg"
  "veza-backend-api/uploads/*.m4a"
  # TLS + JWT secrets (match at any depth, repo root included)
  "*.pem"
  "*.key"
  "*.crt"
  # Go binaries historically committed
  "veza-backend-api/api"
  "veza-backend-api/main"
  "veza-backend-api/veza-api"
  "veza-backend-api/seed"
  "veza-backend-api/seed-v2"
  "veza-backend-api/server"
  "veza-backend-api/modern-server"
  "veza-backend-api/encrypt_oauth_tokens"
  "veza-backend-api/migrate_tool"
  # AI session artefacts
  "CLAUDE_CONTEXT.txt"
  "UI_CONTEXT_SUMMARY.md"
  # Root PNG blobs (all prefixes that were ever committed)
  "design-system-*.png"
  "forgot-password-*.png"
  "register-*.png"
  "reset-password-*.png"
  "settings-*.png"
  "storybook-*.png"
  "dashboard-*.png"
  "login-*.png"
  "audit-*.png"
  # Stale generated scripts
  "generate_page_fix_prompts.sh"
  # Apps/web dead reports
  "apps/web/AUDIT_ISSUES.json"
  "apps/web/audit_remediation.json"
  "apps/web/lint_comprehensive.json"
  "apps/web/storybook-roadmap.json"
  "apps/web/e2e-results.json"
)
# BFG counterpart (bare filenames, no path).
# NOTE: this is NOT a one-to-one equivalent of the list above. Audio patterns
# here apply repo-wide rather than only under veza-backend-api/uploads/, and
# the Go binaries, root PNGs and apps/web reports have no entry at all.
# NOTE(review): presumably those were left out because a bare name such as
# "main" or "server" would hit unrelated files — confirm before extending.
FILE_PATTERNS_BFG=(
  "*.mp3" "*.wav" "*.flac" "*.ogg" "*.m4a"
  "*.pem" "*.key" "*.crt"
  "CLAUDE_CONTEXT.txt" "UI_CONTEXT_SUMMARY.md"
  "generate_page_fix_prompts.sh"
)
# ---------- HELPERS ----------
# Report "ERROR: <args>" on stderr and terminate the script with status 1.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}
# Print a blank line followed by a "━━━ <args> ━━━" banner on stdout.
section() {
  printf '\n━━━ %s ━━━\n' "$*"
}
#######################################
# Pick the history-rewriting backend and announce it.
# Preference order: git-filter-repo (unless USE_BFG=1), then a `bfg`
# executable on PATH, then `java` plus the BFG jar.
# Globals:
#   USE_BFG (read)     - "1" skips git-filter-repo
#   BFG_JAR (written)  - defaulted to /usr/local/lib/bfg.jar when unset/empty,
#                        so later "$BFG_JAR" expansions are safe under set -u
#   TOOL    (written)  - one of: filter-repo | bfg | bfg-jar
# Outputs: "Using: <tool>" on stdout
# Exits:   via die() when no usable backend is found
#######################################
check_tool() {
  BFG_JAR="${BFG_JAR:-/usr/local/lib/bfg.jar}"

  if [[ "$USE_BFG" != "1" ]] && command -v git-filter-repo >/dev/null 2>&1; then
    TOOL="filter-repo"
  elif command -v bfg >/dev/null 2>&1; then
    TOOL="bfg"
  elif command -v java >/dev/null 2>&1 && [[ -f "$BFG_JAR" ]]; then
    TOOL="bfg-jar"
  elif [[ "$USE_BFG" == "1" ]]; then
    # BFG was explicitly requested: do not suggest installing filter-repo.
    die "USE_BFG=1 but BFG was not found (no 'bfg' on PATH, no java + jar at $BFG_JAR). Install BFG (https://rtyley.github.io/bfg-repo-cleaner/) or set BFG_JAR."
  else
    die "Install git-filter-repo (pip install --user git-filter-repo) or BFG (https://rtyley.github.io/bfg-repo-cleaner/)"
  fi
  echo "Using: $TOOL"
}
# Print the human-readable disk usage (first field of `du -sh`) of path $1.
# du's own error output is discarded, so an unreadable or missing path
# simply yields no output.
human_size() {
  local target="$1"
  du -sh "$target" 2>/dev/null | awk '{ print $1 }'
}
# ---------- SECTION 1: PREREQS ----------
section "1. Prereqs"
check_tool
if [[ ! -d "$REPO_ROOT/.git" ]]; then
  die "REPO_ROOT ($REPO_ROOT) is not a git repo"
fi
cd "$REPO_ROOT"
# A dirty working tree is a hard stop: tracked files must match HEAD.
git diff-index --quiet HEAD -- \
  || die "Working tree has uncommitted changes. Commit or stash first."
CURRENT_BRANCH="$(git branch --show-current)"
printf 'Current branch: %s\n' "$CURRENT_BRANCH"
printf 'Current .git size: %s\n' "$(human_size .git)"
# Nothing is cloned until the operator explicitly answers y/Y.
read -r -p "Proceed with bare mirror clone to $BARE_CLONE? [y/N] " ANSWER
case "$ANSWER" in
  y | Y) ;;
  *) die "Aborted by user" ;;
esac
# ---------- SECTION 2: BARE MIRROR CLONE ----------
section "2. Bare mirror clone"
# A leftover path from an earlier run is only wiped after explicit consent.
if [[ -e "$BARE_CLONE" ]]; then
  read -r -p "$BARE_CLONE already exists. Delete and recreate? [y/N] " ANSWER
  case "$ANSWER" in
    y | Y) ;;
    *) die "Aborted" ;;
  esac
  rm -rf "$BARE_CLONE"
fi
# The mirror is taken from the local checkout at $REPO_ROOT, so the clone's
# own "origin" refers to that local path.
git clone --mirror "$REPO_ROOT" "$BARE_CLONE"
BEFORE_SIZE="$(human_size "$BARE_CLONE")"
printf 'Bare clone size BEFORE: %s\n' "$BEFORE_SIZE"
# ---------- SECTION 3: STRIP ----------
# Rewrites history inside the bare mirror only; the working repo is untouched.
section "3. Strip history"
cd "$BARE_CLONE"
if [[ "$TOOL" == "filter-repo" ]]; then
  # Pass 1: drop every blob bigger than the threshold.
  git filter-repo --strip-blobs-bigger-than "$SIZE_THRESHOLD" --force
  # Pass 2: drop the listed path globs.
  PATH_ARGS=()
  for p in "${FILE_PATTERNS_FILTERREPO[@]}"; do
    if [[ "$p" == "!"* ]]; then continue; fi # skip negations for now
    PATH_ARGS+=(--path-glob "$p")
  done
  # filter-repo uses --invert-paths to DELETE matched paths. Skip the pass
  # when nothing was collected, so --invert-paths never runs without a path
  # filter.
  if ((${#PATH_ARGS[@]} > 0)); then
    git filter-repo --invert-paths "${PATH_ARGS[@]}" --force
  fi
elif [[ "$TOOL" == "bfg" || "$TOOL" == "bfg-jar" ]]; then
  # Both BFG flavours take identical arguments; only the launcher differs.
  if [[ "$TOOL" == "bfg-jar" ]]; then
    # Same default as the jar detection uses; a bare "$BFG_JAR" would abort
    # under `set -u` whenever the variable was not exported by the caller.
    BFG_CMD=(java -jar "${BFG_JAR:-/usr/local/lib/bfg.jar}")
  else
    BFG_CMD=(bfg)
  fi
  # BFG: strip by size
  "${BFG_CMD[@]}" --strip-blobs-bigger-than "$SIZE_THRESHOLD" --no-blob-protection .
  # BFG: strip by filename (no path — matches filename anywhere in history)
  for p in "${FILE_PATTERNS_BFG[@]}"; do
    "${BFG_CMD[@]}" --delete-files "$p" --no-blob-protection .
  done
fi
# ---------- SECTION 4: GC ----------
section "4. Aggressive gc"
# Expire all reflog entries, then repack with immediate pruning.
git reflog expire --expire=now --all
git gc --prune=now --aggressive
AFTER_SIZE="$(human_size "$BARE_CLONE")"
# Before/after summary, framed by blank lines.
section "RESULT"
printf 'BEFORE: %s\nAFTER: %s\n\n' "$BEFORE_SIZE" "$AFTER_SIZE"
# ---------- SECTION 5: NEXT STEPS ----------
# Only PRINTS the follow-up commands; nothing below is executed by the script.
section "5. Next steps (manual)"
# The mirror was cloned from the local checkout, so inside it "origin" is
# either $REPO_ROOT itself (BFG path) or gone (git-filter-repo removes it
# after rewriting). Look up the hosted remote from the working repo and print
# it explicitly; fall back to a placeholder when none is configured.
ORIGIN_URL="$(git -C "$REPO_ROOT" remote get-url origin 2>/dev/null || true)"
PUSH_TARGET="${ORIGIN_URL:-<origin-url>}"
REPO_PARENT="$(dirname "$REPO_ROOT")"
REPO_NAME="$(basename "$REPO_ROOT")"
RESET_BRANCH="${CURRENT_BRANCH:-<branch>}"
cat <<MANUAL
The bare clone at $BARE_CLONE is ready. To finalize:
1. INSPECT — check a few refs to make sure history makes sense:
cd $BARE_CLONE
git log --oneline -20 main
git log --oneline -5 chore/v1.0.7-cleanup
git tag | head -20
2. VERIFY size reduction is reasonable:
du -sh $BARE_CLONE
3. FORCE PUSH to the hosted remote (rewrites all refs + tags — all
collaborators must re-clone). Push to the URL, not to "origin": this mirror
was cloned from $REPO_ROOT, so its "origin" is that local checkout (or was
removed by git-filter-repo). Only branches that exist in the local checkout
were rewritten; remote-only branches keep the old history.
cd $BARE_CLONE
git push --force --all $PUSH_TARGET
git push --force --tags $PUSH_TARGET
4. RE-CLONE your working copy (the old one has pre-BFG history):
cd "$REPO_PARENT"
mv ${REPO_NAME} ${REPO_NAME}-prebfg-backup
git clone $PUSH_TARGET ${REPO_NAME}
Or, to keep the existing checkout, move it onto the rewritten history first
(gc alone frees nothing while local refs still point at the old commits):
cd $REPO_ROOT
git fetch --prune --tags --force origin
git reset --hard origin/$RESET_BRANCH   # repeat for every other local branch
git reflog expire --expire=now --all
git gc --prune=now --aggressive
5. REGENERATE local dev secrets that live outside git:
./scripts/generate-jwt-keys.sh
./scripts/generate-ssl-cert.sh
6. DELETE the bare clone once everything is verified stable:
rm -rf $BARE_CLONE
MANUAL