#!/usr/bin/env bash
# validate-cohort.sh — sanity-check a soft-launch beta cohort CSV
# before it gets fed to send-invitations.sh.
#
# The CSV is the operator's curated list of beta-tester emails +
# segmentation. This script catches the avoidable mistakes BEFORE
# we batch-insert 100 rows into beta_invites and start spraying
# emails :
#
#   - Empty file or wrong header
#   - Duplicate emails (would create 2 invites for the same person)
#   - Malformed emails (missing @, leading/trailing whitespace)
#   - Cohort distribution looks off (no creators, only one segment,
#     under-50 total — soft-launch acceptance gate is ≥50 testers)
#   - Email collisions with existing users (already registered = the
#     invite code is wasted)
#
# v1.0.10 Cluster 3.4.
#
# Usage :
#   bash scripts/soft-launch/validate-cohort.sh path/to/cohort.csv
#
# Optional env :
#   DATABASE_URL   if set, also checks for collisions with the users
#                  table (email already registered → flagged but not
#                  fatal — operator may want to invite an existing
#                  user back to test the new flows).
#   MIN_COHORT     minimum total rows required (default 50, matches the
#                  acceptance-gate threshold in SOFT_LAUNCH_BETA_2026.md).
#   MIN_CREATORS   minimum number of creator-* cohort rows (default 5).
#
# Exit codes :
#   0 — cohort valid
#   1 — cohort malformed (will block send-invitations.sh)
#   2 — cohort merely warns (size below minimum, missing collision
#       check) ; operator may proceed with --force

set -euo pipefail

# The cohort CSV path is the only positional argument.
CSV=${1:-}
if [ -z "$CSV" ] || [ ! -f "$CSV" ]; then
  # Tell a wrong path apart from a forgotten argument.
  if [ -n "$CSV" ]; then
    echo "ERROR: cohort CSV not found: $CSV" >&2
  fi
  # NOTE(review): the usage text below was reconstructed because the
  # here-doc was found truncated — confirm the wording with the owner.
  cat >&2 <<EOF
Usage: bash $0 path/to/cohort.csv

The CSV must start with the header line:
  email,cohort,sent_by_email

Cohort labels should follow <role>-<segment> (creator rows are labelled
creator-*) so the post-launch attribution report groups cleanly.
EOF
  exit 1
fi

MIN_COHORT=${MIN_COHORT:-50}
MIN_CREATORS=${MIN_CREATORS:-5}

# Reject non-numeric thresholds up front: a bad value would otherwise make
# the later `-lt` comparisons error out inside an `if`, which silently
# skips the corresponding warning.
for threshold in MIN_COHORT MIN_CREATORS; do
  if [[ ! "${!threshold}" =~ ^[0-9]+$ ]]; then
    echo "ERROR: $threshold must be a non-negative integer (got: ${!threshold})" >&2
    exit 1
  fi
done

# 1. Header check. Carriage returns are stripped so CRLF files pass; an
#    empty file yields an empty header and fails here too.
header=$(head -n 1 "$CSV" | tr -d '\r')
if [ "$header" != "email,cohort,sent_by_email" ]; then
  echo "ERROR: header line must be exactly 'email,cohort,sent_by_email' (got: $header)" >&2
  exit 1
fi

# 2. Row count + duplicates + email shape (single pass over the data rows).
total=0
malformed=0
duplicates=0
declare -A seen=()            # email  -> CSV line number where first seen
declare -A cohort_count=()    # cohort -> number of accepted rows
declare -a malformed_lines=() # human-readable detail for every rejected row

# Accepted email shape : exactly one @, no whitespace, > 5 chars.
email_re='^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'

# trim_into NAME VALUE
# Stores VALUE in the variable called NAME after removing carriage returns
# and leading/trailing whitespace. Uses parameter expansion only, so
# quotes or backslashes in the field cannot abort the script and no
# process is forked per row. CSV quoting is not interpreted.
trim_into() {
  local _cleaned=${2//$'\r'/}
  _cleaned=${_cleaned#"${_cleaned%%[![:space:]]*}"}
  _cleaned=${_cleaned%"${_cleaned##*[![:space:]]}"}
  printf -v "$1" '%s' "$_cleaned"
}

# The `|| [[ -n … ]]` clause keeps the final row when the file does not
# end with a newline (read returns non-zero but still fills the variables).
while IFS=, read -r email cohort sent_by_email \
  || [[ -n "$email$cohort$sent_by_email" ]]; do
  total=$(( total + 1 ))
  line_no=$(( total + 1 )) # +1 because the header occupies line 1
  trim_into email "$email"
  trim_into cohort "$cohort"

  # Email shape.
  if [[ ! "$email" =~ $email_re ]]; then
    malformed=$(( malformed + 1 ))
    malformed_lines+=(" line $line_no : invalid email '$email'")
    continue
  fi

  # Duplicate detection.
  if [[ -n "${seen[$email]:-}" ]]; then
    duplicates=$(( duplicates + 1 ))
    malformed_lines+=(" line $line_no : duplicate email '$email' (first seen at line ${seen[$email]})")
    continue
  fi
  seen[$email]=$line_no

  # An empty cohort cannot be used as an associative-array key (bash
  # aborts with "bad array subscript"), so report it as a malformed row.
  if [[ -z "$cohort" ]]; then
    malformed=$(( malformed + 1 ))
    malformed_lines+=(" line $line_no : missing cohort label for '$email'")
    continue
  fi

  # Cohort tally.
  cohort_count[$cohort]=$(( ${cohort_count[$cohort]:-0} + 1 ))
done < <(tail -n +2 "$CSV")

echo "----------------------------------------------------------------"
echo "Cohort validation report"
echo "----------------------------------------------------------------"
echo " CSV file : $CSV"
echo " Total rows : $total"
echo " Unique emails : ${#seen[@]}"
echo " Malformed rows : $malformed"
echo " Duplicates : $duplicates"
echo ""
echo "Distribution by cohort :"
for c in "${!cohort_count[@]}"; do
  printf " %-40s %d\n" "$c" "${cohort_count[$c]}"
done | sort
echo ""

exit_code=0

# 3. Hard checks (block send).
if [ "$malformed" -gt 0 ] || [ "$duplicates" -gt 0 ]; then
  echo "ERROR: $malformed malformed + $duplicates duplicate row(s) — fix before sending."
  for line in "${malformed_lines[@]}"; do
    echo "$line"
  done
  exit 1
fi

# 4. Soft checks (warn, don't block — operator decides).
if [ "$total" -lt "$MIN_COHORT" ]; then
  echo "WARN : cohort has $total rows ; soft-launch acceptance gate is ≥ $MIN_COHORT."
  exit_code=2
fi

creator_total=0
for c in "${!cohort_count[@]}"; do
  if [[ "$c" == creator-* ]]; then
    # Expand the element with ${…} first: a bare cohort_count[$c] inside
    # $(( )) would have its CSV-derived key re-evaluated by the
    # arithmetic parser.
    creator_total=$(( creator_total + ${cohort_count[$c]} ))
  fi
done
if [ "$creator_total" -lt "$MIN_CREATORS" ]; then
  echo "WARN : only $creator_total creator-* cohort rows ; goal is ≥ $MIN_CREATORS for upload-flow coverage."
  exit_code=2
fi

if [ "${#cohort_count[@]}" -lt 3 ]; then
  echo "WARN : only ${#cohort_count[@]} distinct cohort labels — feedback will be narrow."
  exit_code=2
fi

# 5. Optional : DATABASE_URL collision check.
if [ -n "${DATABASE_URL:-}" ]; then
  if ! command -v psql >/dev/null 2>&1; then
    echo "WARN : DATABASE_URL set but psql not on \$PATH ; skipping collision check."
    exit_code=2
  elif (( ${#seen[@]} > 0 )); then
    # Comma-joined list of the unique emails.
    emails_csv=$(printf '%s,' "${!seen[@]}")
    emails_csv=${emails_csv%,}

    # The list is handed over as a psql variable and quoted by psql
    # (:'emails') rather than spliced into the SQL text. Variables are
    # not interpolated in -c commands, so the query goes in on stdin;
    # ON_ERROR_STOP makes a failing statement give a non-zero exit.
    query="SELECT count(*) FROM users WHERE email = ANY(string_to_array(:'emails', ','));"
    psql_status=0
    # stderr is discarded on purpose: any failure is reported by the WARN below.
    collisions=$(psql "$DATABASE_URL" -A -t -v ON_ERROR_STOP=1 \
      -v "emails=$emails_csv" <<<"$query" 2>/dev/null) || psql_status=$?
    collisions=${collisions//[[:space:]]/}

    if (( psql_status != 0 )) || [[ ! "$collisions" =~ ^[0-9]+$ ]]; then
      echo "WARN : couldn't query users table (psql connection issue) ; skipping collision check."
      exit_code=2
    elif (( collisions > 0 )); then
      echo "INFO : $collisions email(s) in the cohort already exist in the users table — invite codes will be wasted on existing accounts."
      exit_code=2
    fi
  fi
fi

echo ""
case "$exit_code" in
  0) echo "PASS : cohort valid, ready for send-invitations.sh." ;;
  2) echo "WARN : cohort valid but with caveats — review and re-run with --force from send-invitations.sh if intentional." ;;
esac
exit "$exit_code"