Add local database population scripts #1044

Open · wants to merge 2 commits into main
1 change: 1 addition & 0 deletions analytics/dev/db-backup-scripts/.gitignore
@@ -0,0 +1 @@
dumps/*
13 changes: 13 additions & 0 deletions analytics/dev/db-backup-scripts/README.md
@@ -0,0 +1,13 @@
## What is this?

These scripts populate the local database with real data from the prod analytics database, which is useful for testing changes to the analytics app. The scripts have the following functions:

1. `analytics-db-proxy.sh` - This script spins up a `socat` pod within the cluster that forwards all traffic pointed at `localhost:9999` to the analytics database (specified by the `REMOTE_DB_HOST` env var). This is necessary because the database sits inside an AWS VPC, so it's inaccessible from outside the cluster.
2. `perform-queries.sh` - This script dumps data from the dimensional and fact tables to CSV files in the `dumps/` directory.
3. `restore-db.sh` - This script populates the local database from the files in the `dumps/` directory. **Note:** to avoid conflicts, this deletes any existing rows in every table that data is loaded into.

## Requirements

For the `analytics-db-proxy.sh` script, the `REMOTE_DB_HOST` env var must be set to the hostname of the AWS RDS instance, which can be found in the AWS console. You must also have cluster access and a properly configured `KUBECONFIG` so that the cluster proxy can work.

For the `perform-queries.sh` script, the `REMOTE_PGPASSWORD` env var must be set to the password of the `postgres` user on the prod analytics database.
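
## Example usage

A minimal end-to-end usage sketch, assuming the scripts are run from this directory; the hostname and password below are placeholders, and the proxy must stay running in its own terminal:

```sh
# Terminal 1: open the proxy to the prod analytics database (placeholder hostname)
export REMOTE_DB_HOST=analytics-prod.xxxxxxxx.us-east-1.rds.amazonaws.com
./analytics-db-proxy.sh

# Terminal 2: dump prod data into dumps/, then load it into the local database
export REMOTE_PGPASSWORD='<prod postgres password>'
./perform-queries.sh
./restore-db.sh
```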
23 changes: 23 additions & 0 deletions analytics/dev/db-backup-scripts/analytics-db-proxy.sh
@@ -0,0 +1,23 @@
#!/bin/bash
set -euo pipefail

# Derive a per-user pod name so multiple developers can run proxies concurrently
UNIQUE_ID=$(whoami | sed 's/\.//g')

export LOCALPORT=9999
export PORT=5432
export ADDR=$REMOTE_DB_HOST
export PODNAME="pg-bastion-$UNIQUE_ID"

# Trap for deleting the pod on Ctrl-C (SIGINT)
function cleanup {
    kubectl delete pod --now ${PODNAME} || true
}
trap cleanup 2

# Remove any leftover pod from a previous run
if kubectl get pod ${PODNAME} &> /dev/null; then
    kubectl delete pod --now ${PODNAME}
fi

# Run socat in the cluster to relay traffic to the RDS instance, wait for it to
# become Ready, then forward the local port to it
kubectl run --restart=Never --image=alpine/socat ${PODNAME} -- -d -d tcp-listen:${PORT},fork,reuseaddr tcp-connect:${ADDR}:${PORT}
kubectl wait --for=condition=Ready pod/${PODNAME}
kubectl port-forward pod/${PODNAME} ${LOCALPORT}:${PORT}
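
A quick way to confirm the tunnel works while the script above is running, from a second terminal (a hedged check; it assumes `psql` is installed locally and `REMOTE_PGPASSWORD` holds the prod password):

```sh
# Run a trivial query against prod through the local end of the tunnel
PGPASSWORD=$REMOTE_PGPASSWORD psql -h localhost -p 9999 -U postgres -d analytics -c 'SELECT 1'
```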
58 changes: 58 additions & 0 deletions analytics/dev/db-backup-scripts/perform-queries.sh
@@ -0,0 +1,58 @@
#!/usr/bin/env bash
set -euo pipefail

# Connect to the prod analytics database through the local proxy (see analytics-db-proxy.sh)
export PGUSER=postgres
export PGPORT=9999
export PGHOST=localhost
export PGDATABASE=analytics
export PGPASSWORD=$REMOTE_PGPASSWORD

# Fact tables and the number of days of history to export for each
declare -A FACT_TABLES_TO_DAYS=(
    ["core_jobfact"]=90
    ["core_timerfact"]=30
    ["core_timerphasefact"]=7
)
# The date-dimension foreign key column on each fact table
declare -A FACT_TABLES_TO_DATE_FIELD=(
    ["core_jobfact"]="start_date_id"
    ["core_timerfact"]="date_id"
    ["core_timerphasefact"]="date_id"
)

# Join the remaining arguments with the first argument as delimiter,
# e.g. `join_by , a b c` -> "a,b,c"
function join_by {
    local d=${1-} f=${2-}
    if shift 2; then
        printf %s "$f" "${@/#/$d}"
    fi
}


# Query options
NULL_VALUE="<null>"

# Ensure the output directory exists
mkdir -p dumps

# Export dimension tables
DIMENSION_TABLES=$(psql -t -c "SELECT table_name FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE='BASE TABLE' AND table_name LIKE 'core_%dimension'")
for dimension_table in $DIMENSION_TABLES
do
    echo "Querying $dimension_table..."
    # Skip generated columns, since they cannot be inserted on restore
    COLUMNS=$(psql -t -c "SELECT column_name FROM information_schema.columns WHERE table_name = '$dimension_table' AND is_generated = 'NEVER'")
    JOINED_COLUMNS=$(join_by , $COLUMNS)
    psql --pset="null=$NULL_VALUE" --csv -c "SELECT $JOINED_COLUMNS FROM $dimension_table" -o "dumps/$dimension_table.csv"
done


# Export fact tables, limited to the most recent N days per table
for fact_table in "${!FACT_TABLES_TO_DAYS[@]}"
do
    echo "Querying $fact_table..."

    FACT_TABLE_COLUMNS=$(psql -t -c "SELECT column_name FROM information_schema.columns WHERE table_name = '$fact_table' AND is_generated = 'NEVER'")
    JOINED_COLUMNS=$(join_by , $FACT_TABLE_COLUMNS)

    DAY_LIMIT=${FACT_TABLES_TO_DAYS[$fact_table]}
    DATE_FIELD=${FACT_TABLES_TO_DATE_FIELD[$fact_table]}
    psql --pset="null=$NULL_VALUE" --csv \
        -o "dumps/${fact_table}_last_${DAY_LIMIT}_days.csv" \
        -c "SELECT $JOINED_COLUMNS FROM $fact_table \
            LEFT JOIN core_datedimension ON $fact_table.$DATE_FIELD = core_datedimension.date_key \
            WHERE date >= CAST((NOW() + INTERVAL '-$DAY_LIMIT day') AS date)"
done
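
After a successful run, `dumps/` should hold one CSV per dimension table plus one per fact table; a quick spot-check (a hedged example, using the `core_jobfact` filename produced by the 90-day limit above):

```sh
# List the exported files and peek at one header row
ls dumps/
head -n 1 dumps/core_jobfact_last_90_days.csv
```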
39 changes: 39 additions & 0 deletions analytics/dev/db-backup-scripts/restore-db.sh
@@ -0,0 +1,39 @@
#!/usr/bin/env bash
set -euo pipefail

# Connect to the local development database
export PGUSER=postgres
export PGPORT=5432
export PGHOST=localhost
export PGDATABASE=django
export PGPASSWORD=postgres

NULL_VALUE="<null>"


function load_from_file {
    local backup_file=$1

    echo "-------------------------------------------"
    # Derive the table name from the dump filename,
    # e.g. dumps/core_jobfact_last_90_days.csv -> core_jobfact
    table_name=$(basename "$backup_file" | cut -d "." -f 1 | cut -d "_" -f 1,2)
    echo "Dropping existing rows from $table_name ..."
    psql -c "TRUNCATE TABLE $table_name CASCADE"

    # The first line of the CSV holds the column list to copy into
    HEADERS=$(head -n 1 "$backup_file")
    echo "loading $table_name from $backup_file ..."
    psql -c "\copy $table_name($HEADERS) FROM '$backup_file' WITH(FORMAT CSV, HEADER, NULL '$NULL_VALUE')"
}


# Restore dimensional tables before fact tables,
# so that foreign keys on the fact rows resolve correctly

echo "Restoring dimensional tables ..."
for backup_file in ./dumps/*dimension.csv
do
    load_from_file "$backup_file"
done

echo "Restoring fact tables ..."
for backup_file in ./dumps/*fact*.csv
do
    load_from_file "$backup_file"
done
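
Once the restore finishes, a simple row count against the local database confirms the data landed (a hedged check, using the local connection settings at the top of this script):

```sh
# Count restored rows in one of the fact tables in the local django database
PGPASSWORD=postgres psql -h localhost -p 5432 -U postgres -d django -c "SELECT COUNT(*) FROM core_jobfact"
```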