diff --git a/analytics/dev/db-backup-scripts/.gitignore b/analytics/dev/db-backup-scripts/.gitignore
new file mode 100644
index 000000000..fa5a334ed
--- /dev/null
+++ b/analytics/dev/db-backup-scripts/.gitignore
@@ -0,0 +1 @@
+dumps/*
diff --git a/analytics/dev/db-backup-scripts/README.md b/analytics/dev/db-backup-scripts/README.md
new file mode 100644
index 000000000..c18422d7c
--- /dev/null
+++ b/analytics/dev/db-backup-scripts/README.md
@@ -0,0 +1,13 @@
+## What is this?
+
+These scripts populate the local database with real data from the prod analytics database, which is useful for testing changes to the analytics app. The scripts are:
+
+1. `analytics-db-proxy.sh` - Spins up a pod within the cluster that forwards all traffic sent to `localhost:9999` on to the analytics database (specified by the `REMOTE_DB_HOST` env var). This is necessary because the database sits inside an AWS VPC and is inaccessible from outside the cluster.
+2. `perform-queries.sh` - Dumps data from the dimension and fact tables to CSV files in the `dumps/` directory.
+3. `restore-db.sh` - Populates the local database from the files in the `dumps/` directory. **Note:** to avoid conflicts, this deletes any existing rows in each table it populates.
+
+## Requirements
+
+For `analytics-db-proxy.sh`, the `REMOTE_DB_HOST` env var must be set to the hostname of the AWS RDS instance (this can be found in the AWS console). You must also have cluster access and a correctly configured `KUBECONFIG` so the cluster proxy can work.
+
+For `perform-queries.sh`, the `REMOTE_PGPASSWORD` env var must be set to the password of the `postgres` user on the prod analytics database.
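Taken together, the intended workflow is roughly the following. This is a minimal sketch, assuming the scripts are run from `analytics/dev/db-backup-scripts/` with the env vars from the Requirements section exported; the hostname, password, and kubeconfig values are placeholders.

```bash
# Terminal 1: open the in-cluster proxy (blocks while the port-forward is active)
export KUBECONFIG=~/.kube/config            # placeholder path
export REMOTE_DB_HOST=<prod-rds-hostname>   # from the AWS console
./analytics-db-proxy.sh

# Terminal 2: dump prod data through localhost:9999, then load it into the local DB
export REMOTE_PGPASSWORD=<prod-postgres-password>
./perform-queries.sh   # add `-t <table>` to restrict the dump to a single table
./restore-db.sh        # truncates and reloads the matching local tables
```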
diff --git a/analytics/dev/db-backup-scripts/analytics-db-proxy.sh b/analytics/dev/db-backup-scripts/analytics-db-proxy.sh
new file mode 100755
index 000000000..1b8a2ccff
--- /dev/null
+++ b/analytics/dev/db-backup-scripts/analytics-db-proxy.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+[ -z "$KUBECONFIG" ] && echo "KUBECONFIG env var must be set" && exit 1;
+
+set -euo pipefail
+
+UNIQUE_ID=$(whoami | sed 's/\.//g')
+
+export LOCALPORT=9999
+export PORT=5432
+export ADDR=$REMOTE_DB_HOST
+export PODNAME="pg-bastion-$UNIQUE_ID"
+
+# trap for deleting the pod
+function cleanup {
+  kubectl delete pod --now ${PODNAME} || true
+}
+trap cleanup 2
+
+if kubectl get pod ${PODNAME} &> /dev/null; then
+  kubectl delete pod --now ${PODNAME}
+fi
+
+kubectl run --restart=Never --image=alpine/socat ${PODNAME} -- -d -d tcp-listen:${PORT},fork,reuseaddr tcp-connect:${ADDR}:${PORT}
+kubectl wait --for=condition=Ready pod/${PODNAME}
+kubectl port-forward pod/${PODNAME} ${LOCALPORT}:${PORT}
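With the proxy running, the tunnel can be sanity-checked before dumping anything. A sketch only, reusing the connection settings that `perform-queries.sh` assumes (`postgres` user, `analytics` database, forwarded port 9999):

```bash
# One-off query against the prod analytics DB via the local end of the tunnel
PGPASSWORD=$REMOTE_PGPASSWORD psql -h localhost -p 9999 -U postgres -d analytics -c "SELECT NOW()"
```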
diff --git a/analytics/dev/db-backup-scripts/perform-queries.sh b/analytics/dev/db-backup-scripts/perform-queries.sh
new file mode 100755
index 000000000..b383a2b70
--- /dev/null
+++ b/analytics/dev/db-backup-scripts/perform-queries.sh
@@ -0,0 +1,109 @@
+#!/usr/bin/env bash
+
+export PGUSER=postgres
+export PGPORT=9999
+export PGHOST=localhost
+export PGDATABASE=analytics
+export PGPASSWORD=$REMOTE_PGPASSWORD
+
+
+declare -A TABLE_ARGS
+
+# Process arguments
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    -t|--table)
+      # TABLE_ARG="$2"
+      TABLE_ARGS[$2]=1
+      shift # past argument
+      shift # past value
+      ;;
+    -*|--*)
+      echo "Unknown option $1"
+      exit 1
+      ;;
+    *)
+      POSITIONAL_ARGS+=("$1") # save positional arg
+      shift # past argument
+      ;;
+  esac
+done
+
+
+# Fact tables and time limits
+declare -A FACT_TABLES_TO_DAYS=(
+  ["core_jobfact"]=1000
+  ["core_timerfact"]=30
+  ["core_timerphasefact"]=7
+)
+declare -A FACT_TABLES_TO_DATE_FIELD=(
+  ["core_jobfact"]="start_date_id"
+  ["core_timerfact"]="date_id"
+  ["core_timerphasefact"]="date_id"
+)
+declare -A FACT_TABLES_TO_TIME_FIELD=(
+  ["core_jobfact"]="start_time_id"
+  ["core_timerfact"]="time_id"
+  ["core_timerphasefact"]="time_id"
+)
+
+function join_by {
+  local d=${1-} f=${2-}
+  if shift 2; then
+    printf %s "$f" "${@/#/$d}"
+  fi
+}
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+DUMPS_DIR="${SCRIPT_DIR}/dumps"
+
+# Ensure dumps directory exists
+if [ ! -d $DUMPS_DIR ]; then
+  mkdir $DUMPS_DIR
+fi
+
+
+# Ensure all retrieved records have a consistent end timestamp
+LAST_RECORD_TIMESTAMP=$(psql -t -c "select NOW() + INTERVAL '-1 hour' as timestamp")
+
+# Query options
+NULL_VALUE=""
+
+# Export dimension tables
+DIMENSION_TABLES=$(psql -t -c "SELECT table_name FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE='BASE TABLE' AND table_name LIKE 'core_%dimension'")
+for dimension_table in $DIMENSION_TABLES
+do
+  # Skip if table arg supplied and this table name does not match
+  [ "${#TABLE_ARGS[@]}" -eq 0 ] || [ "${TABLE_ARGS[$dimension_table]}" ] || continue
+
+  echo "Querying $dimension_table..."
+  COLUMNS=$(psql -t -c "SELECT column_name FROM information_schema.columns WHERE table_name = '$dimension_table' AND is_generated = 'NEVER'")
+  JOINED_COLUMNS=$(join_by , $COLUMNS)
+  psql --pset="null=$NULL_VALUE" --csv -c "SELECT $JOINED_COLUMNS FROM $dimension_table" -o "$DUMPS_DIR/$dimension_table.csv"
+done
+
+
+# Export fact tables
+for fact_table in ${!FACT_TABLES_TO_DAYS[@]}
+do
+  # Skip if table arg supplied and this table name does not match
+  [ "${#TABLE_ARGS[@]}" -eq 0 ] || [ "${TABLE_ARGS[$fact_table]}" ] || continue
+
+  echo "Querying $fact_table..."
+
+  FACT_TABLE_COLUMNS=$(psql -t -c "SELECT column_name FROM information_schema.columns WHERE table_name = '$fact_table' AND is_generated = 'NEVER'")
+  JOINED_COLUMNS=$(join_by , $FACT_TABLE_COLUMNS)
+
+  DAY_LIMIT=${FACT_TABLES_TO_DAYS[$fact_table]}
+  DATE_FIELD=${FACT_TABLES_TO_DATE_FIELD[$fact_table]}
+  TIME_FIELD=${FACT_TABLES_TO_TIME_FIELD[$fact_table]}
+  psql --pset="null=$NULL_VALUE" --csv \
+    -o "$DUMPS_DIR/${fact_table}_last_${DAY_LIMIT}_days.csv" \
+    -c "SELECT $JOINED_COLUMNS FROM $fact_table \
+        LEFT JOIN core_datedimension dd ON $fact_table.$DATE_FIELD = dd.date_key \
+        LEFT JOIN core_timedimension td ON $fact_table.$TIME_FIELD = td.time_key \
+        WHERE \
+          date >= CAST((NOW() + INTERVAL '-$DAY_LIMIT day') as date) \
+          AND CAST((dd.date + td.time) as timestamp) < '$LAST_RECORD_TIMESTAMP' \
+    "
+done
diff --git a/analytics/dev/db-backup-scripts/restore-db.sh b/analytics/dev/db-backup-scripts/restore-db.sh
new file mode 100755
index 000000000..4bdff506a
--- /dev/null
+++ b/analytics/dev/db-backup-scripts/restore-db.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+
+export PGUSER=postgres
+export PGPORT=5432
+export PGHOST=localhost
+export PGDATABASE=django
+export PGPASSWORD=postgres
+
+NULL_VALUE=""
+
+
+function load_from_file {
+  local backup_file=$1
+
+  echo "-------------------------------------------"
+  table_name=$(echo $backup_file | rev | cut -d '/' -f 1 | rev | cut -d "." -f 1 | cut -d "_" -f 1,2)
+  echo "Dropping existing rows from $table_name ..."
+  psql -c "TRUNCATE TABLE $table_name CASCADE"
+
+  HEADERS=$(head -n 1 $backup_file)
+  echo "loading $table_name from $backup_file ..."
+  psql -c "\copy $table_name($HEADERS) FROM '$backup_file' WITH(FORMAT CSV, HEADER, NULL '')"
+}
+
+
+# Restore dimensional tables before fact tables,
+# to ensure foreign keys are added correctly
+
+echo "Restoring dimensional tables ..."
+for backup_file in ./dumps/*dimension.csv
+do
+  load_from_file $backup_file
+done
+
+echo "Restoring fact tables ..."
+for backup_file in ./dumps/*fact*.csv
+do
+  load_from_file $backup_file
+done
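After a restore it can be worth confirming that rows actually landed. A minimal sketch, using the same local connection settings as `restore-db.sh` and the table names listed in `perform-queries.sh`:

```bash
# Row counts for the restored tables, via the local connection used by restore-db.sh
export PGUSER=postgres PGPASSWORD=postgres PGHOST=localhost PGPORT=5432 PGDATABASE=django
for table in core_datedimension core_timedimension core_jobfact core_timerfact core_timerphasefact; do
  printf '%-22s %s\n' "$table" "$(psql -t -A -c "SELECT COUNT(*) FROM $table")"
done
```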