diff --git a/.github/workflows/build-deploy-pudl.yml b/.github/workflows/build-deploy-pudl.yml
index 141dc76681..be0a3680bd 100644
--- a/.github/workflows/build-deploy-pudl.yml
+++ b/.github/workflows/build-deploy-pudl.yml
@@ -117,6 +117,7 @@ jobs:
           --container-env DAGSTER_PG_HOST="104.154.182.24" \
           --container-env DAGSTER_PG_DB="dagster-storage" \
           --container-env PUDL_SETTINGS_YML="/home/catalyst/src/pudl/package_data/settings/etl_full.yml" \
+          --container-env FLY_ACCESS_TOKEN=${{ secrets.FLY_ACCESS_TOKEN }} \
 
     # Start the VM
     - name: Start the deploy-pudl-vm
diff --git a/.gitignore b/.gitignore
index 2c0293ffae..997dd77884 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,3 +38,9 @@ notebooks/*.tgz
 terraform/.terraform/*
 .env
 .hypothesis/
+
+# generated by datasette/publish.py fresh for every deploy - we shouldn't track changes.
+devtools/datasette/fly/Dockerfile
+devtools/datasette/fly/inspect-data.json
+devtools/datasette/fly/metadata.yml
+devtools/datasette/fly/all_dbs.tar.zst
diff --git a/devtools/datasette/fly/fly.toml b/devtools/datasette/fly/fly.toml
new file mode 100644
index 0000000000..4b8923dacf
--- /dev/null
+++ b/devtools/datasette/fly/fly.toml
@@ -0,0 +1,34 @@
+# fly.toml app configuration file generated for catalyst-coop-pudl on 2023-11-03T15:31:15-04:00
+#
+# See https://fly.io/docs/reference/configuration/ for information about how to use this file.
+#
+app = "catalyst-coop-pudl"
+primary_region = "bos"
+
+[[mounts]]
+  destination = "/data"
+  source = "datasette"
+
+[[services]]
+  internal_port = 8080
+  protocol = "tcp"
+
+  [services.concurrency]
+    hard_limit = 25
+    soft_limit = 20
+
+  [[services.ports]]
+    handlers = ["http"]
+    port = 80
+
+  [[services.ports]]
+    handlers = ["tls", "http"]
+    port = 443
+
+  [[services.tcp_checks]]
+    grace_period = "1m"
+    interval = 10000
+    timeout = 2000
+
+[deploy]
+wait_timeout = "15m"
\ No newline at end of file
diff --git a/devtools/datasette/fly/run.sh b/devtools/datasette/fly/run.sh
new file mode 100755
index 0000000000..9516d73d7a
--- /dev/null
+++ b/devtools/datasette/fly/run.sh
@@ -0,0 +1,10 @@
+#! /usr/bin/env bash
+set -eux
+
+shopt -s nullglob
+
+find /data/ -name '*.sqlite' -delete
+mv all_dbs.tar.zst /data
+zstd -f -d /data/all_dbs.tar.zst -o /data/all_dbs.tar
+tar -xf /data/all_dbs.tar --directory /data
+datasette serve --host 0.0.0.0 /data/*.sqlite --cors --inspect-file inspect-data.json --metadata metadata.yml --setting sql_time_limit_ms 5000 --port $PORT
\ No newline at end of file
diff --git a/devtools/datasette/publish.py b/devtools/datasette/publish.py
new file mode 100644
index 0000000000..a5b3b3123f
--- /dev/null
+++ b/devtools/datasette/publish.py
@@ -0,0 +1,122 @@
+"""Publish the datasette to fly.io.
+
+We use custom logic here because the datasette-publish-fly plugin bakes the
+uncompressed databases into the image, which makes the image too large.
+
+We compress the databases before baking them into the image. Then we decompress
+them at runtime to a Fly volume mounted at /data. This avoids a long download
+at startup, and allows us to stay within the Fly.io 8GB image size limit.
+
+The volume handling is done manually outside of this publish.py script - it
+should be terraformed at some point.
+
+Some static fly.io deployment-related files live in ./fly:
+* fly.toml - service configuration
+* run.sh - service entrypoint
+
+Apart from that, the Dockerfile and dataset-specific
+metadata.yml/inspect-data.json are generated by this script.
+"""
+
+import json
+import logging
+import secrets
+from pathlib import Path
+from subprocess import check_call, check_output
+
+from pudl.metadata.classes import DatasetteMetadata
+from pudl.workspace.setup import PudlPaths
+
+logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)
+
+DOCKERFILE_TEMPLATE = """
+FROM python:3.11.0-slim-bullseye
+COPY . /app
+WORKDIR /app
+
+RUN apt-get update
+RUN apt-get install -y zstd
+
+ENV DATASETTE_SECRET '{datasette_secret}'
+RUN pip install -U datasette datasette-cluster-map datasette-vega datasette-block-robots
+ENV PORT 8080
+EXPOSE 8080
+
+CMD ["./run.sh"]
+"""
+
+
+def make_dockerfile():
+    """Write a dockerfile from template, to use in fly deploy.
+
+    We write this from template so we can generate a datasette secret. This way
+    we don't have to manage secrets at all.
+    """
+    datasette_secret = secrets.token_hex(16)
+    return DOCKERFILE_TEMPLATE.format(datasette_secret=datasette_secret)
+
+
+def inspect_data(datasets, pudl_out):
+    """Pre-inspect databases to generate some metadata for Datasette.
+
+    This is done in the image build process in datasette-publish-fly, but since
+    we don't have access to the databases in the build process we have to
+    inspect before building the Docker image.
+    """
+    inspect_output = json.loads(
+        check_output(
+            [  # noqa: S603
+                "datasette",
+                "inspect",
+            ]
+            + [str(pudl_out / ds) for ds in datasets]
+        )
+    )
+
+    for dataset in inspect_output:
+        name = Path(inspect_output[dataset]["file"]).name
+        new_filepath = Path("/data") / name
+        inspect_output[dataset]["file"] = str(new_filepath)
+    return inspect_output
+
+
+def metadata(pudl_out) -> str:
+    """Return human-readable metadata for Datasette."""
+    return DatasetteMetadata.from_data_source_ids(pudl_out).to_yaml()
+
+
+def main():
+    """Generate deployment files and run the deploy."""
+    fly_dir = Path(__file__).parent.absolute() / "fly"
+    docker_path = fly_dir / "Dockerfile"
+    inspect_path = fly_dir / "inspect-data.json"
+    metadata_path = fly_dir / "metadata.yml"
+
+    pudl_out = PudlPaths().pudl_output
+    datasets = [str(p.name) for p in pudl_out.glob("*.sqlite")]
+    logging.info(f"Inspecting DBs for datasette: {datasets}...")
+    inspect_output = inspect_data(datasets, pudl_out)
+    with inspect_path.open("w") as f:
+        f.write(json.dumps(inspect_output))
+
+    logging.info("Writing metadata...")
+    with metadata_path.open("w") as f:
+        f.write(metadata(pudl_out))
+
+    logging.info("Writing Dockerfile...")
+    with docker_path.open("w") as f:
+        f.write(make_dockerfile())
+
+    logging.info(f"Compressing {datasets} and putting into docker context...")
+    check_call(
+        ["tar", "-a", "-czvf", fly_dir / "all_dbs.tar.zst"] + datasets,  # noqa: S603
+        cwd=pudl_out,
+    )
+
+    logging.info("Running fly deploy...")
+    check_call(["/usr/bin/env", "flyctl", "deploy"], cwd=fly_dir)  # noqa: S603
+    logging.info("Deploy finished!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/devtools/datasette/publish.sh b/devtools/datasette/publish.sh
deleted file mode 100755
index 53bed7d3f7..0000000000
--- a/devtools/datasette/publish.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/sh
-
-PUDL_OUT=`grep "^pudl_out" $HOME/.pudl.yml | sed -e "s/^pudl_out: //"`
-SQLITE_DIR="$PUDL_OUTPUT"
-
-# make metadata.yml
-datasette_metadata_to_yml -o "metadata.yml"
-
-datasette publish cloudrun \
-    --service catalyst-datasette \
-    --memory 32Gi \
-    --install datasette-cluster-map \
-    --install datasette-vega \
-    --install datasette-block-robots \
-    --metadata metadata.yml \
-    --extra-options="--setting sql_time_limit_ms 5000" \
-    $SQLITE_DIR/pudl.sqlite \
-    $SQLITE_DIR/ferc1.sqlite \
-    $SQLITE_DIR/ferc2.sqlite \
-    $SQLITE_DIR/ferc6.sqlite \
-    $SQLITE_DIR/ferc60.sqlite \
-    $SQLITE_DIR/ferc1_xbrl.sqlite \
-    $SQLITE_DIR/ferc2_xbrl.sqlite \
-    $SQLITE_DIR/ferc6_xbrl.sqlite \
-    $SQLITE_DIR/ferc60_xbrl.sqlite \
-    $SQLITE_DIR/ferc714_xbrl.sqlite
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 324bf79061..9095a38f31 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,5 +1,7 @@
 FROM condaforge/mambaforge:23.3.1-1
 
+SHELL [ "/bin/bash", "-exo", "pipefail", "-c" ]
+
 # Install curl and js
 # awscli requires unzip, less, groff and mandoc
 # hadolint ignore=DL3008
@@ -24,6 +26,10 @@ ENV CONTAINER_HOME=/home/catalyst
 USER catalyst
 WORKDIR ${CONTAINER_HOME}
 
+# Install flyctl
+RUN curl -L https://fly.io/install.sh | sh
+ENV PATH="${CONTAINER_HOME}/.fly/bin:$PATH"
+
 ENV CONDA_PREFIX=${CONTAINER_HOME}/env
 ENV PUDL_REPO=${CONTAINER_HOME}/pudl
 ENV CONDA_RUN="conda run --no-capture-output --prefix ${CONDA_PREFIX}"
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index 47b6a730b6..ee49390b09 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -12,6 +12,7 @@ services:
     environment:
       - API_KEY_EIA
      - GCP_BILLING_PROJECT
+      - FLY_ACCESS_TOKEN
     env_file:
       - .env
     build:
diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh
index ba6d1e0f89..e22739802a 100644
--- a/docker/gcp_pudl_etl.sh
+++ b/docker/gcp_pudl_etl.sh
@@ -85,10 +85,8 @@ function notify_slack() {
 # 2>&1 redirects stderr to stdout.
 run_pudl_etl 2>&1 | tee $LOGFILE
 
-# Notify slack if the etl succeeded.
+# if pipeline is successful, distribute + publish datasette
 if [[ ${PIPESTATUS[0]} == 0 ]]; then
-    notify_slack "success"
-
     # Dump outputs to s3 bucket if branch is dev or build was triggered by a tag
     if [ $GITHUB_ACTION_TRIGGER = "push" ] || [ $GITHUB_REF = "dev" ]; then
         copy_outputs_to_distribution_bucket
@@ -96,9 +94,15 @@ if [[ ${PIPESTATUS[0]} == 0 ]]; then
 
     # Deploy the updated data to datasette
     if [ $GITHUB_REF = "dev" ]; then
-        gcloud config set run/region us-central1
-        source ~/devtools/datasette/publish.sh
+        python ~/devtools/datasette/publish.py 2>&1 | tee -a $LOGFILE
     fi
+fi
+
+# Notify slack about entire pipeline's success or failure;
+# PIPESTATUS[0] either refers to the failed ETL run or the last distribution
+# task that was run above
+if [[ ${PIPESTATUS[0]} == 0 ]]; then
+    notify_slack "success"
 else
     notify_slack "failure"
 fi
diff --git a/pyproject.toml b/pyproject.toml
index a0518d645d..12af588f86 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -100,7 +100,6 @@ keywords = [
 metadata_to_rst = "pudl.convert.metadata_to_rst:main"
 epacems_to_parquet = "pudl.convert.epacems_to_parquet:main"
 ferc_to_sqlite = "pudl.ferc_to_sqlite.cli:main"
-datasette_metadata_to_yml = "pudl.convert.datasette_metadata_to_yml:main"
 pudl_datastore = "pudl.workspace.datastore:main"
 pudl_etl = "pudl.cli.etl:main"
 pudl_setup = "pudl.workspace.setup_cli:main"
diff --git a/src/pudl/convert/__init__.py b/src/pudl/convert/__init__.py
index 1085accee9..02676d1eeb 100644
--- a/src/pudl/convert/__init__.py
+++ b/src/pudl/convert/__init__.py
@@ -13,7 +13,6 @@
 """
 from . import (
     censusdp1tract_to_sqlite,
-    datasette_metadata_to_yml,
     epacems_to_parquet,
     metadata_to_rst,
 )
diff --git a/src/pudl/convert/datasette_metadata_to_yml.py b/src/pudl/convert/datasette_metadata_to_yml.py
deleted file mode 100644
index 7d88931788..0000000000
--- a/src/pudl/convert/datasette_metadata_to_yml.py
+++ /dev/null
@@ -1,62 +0,0 @@
-"""Export metadata to YAML for Datasette."""
-
-import argparse
-import os
-import sys
-
-from dotenv import load_dotenv
-
-import pudl
-from pudl.metadata.classes import DatasetteMetadata
-
-logger = pudl.logging_helpers.get_logger(__name__)
-
-
-def parse_command_line(argv):
-    """Parse command line arguments. See the -h option.
-
-    Args:
-        argv (str): Command line arguments, including absolute path to output filename.
-
-    Returns:
-        dict: Dictionary of command line arguments and their parsed values.
-    """
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument(
-        "-o",
-        "--output",
-        help="Path to the file where the YAML output should be written.",
-        default=False,
-    )
-    parser.add_argument(
-        "--logfile",
-        default=None,
-        type=str,
-        help="If specified, write logs to this file.",
-    )
-    parser.add_argument(
-        "--loglevel",
-        help="Set logging level (DEBUG, INFO, WARNING, ERROR, or CRITICAL).",
-        default="INFO",
-    )
-    arguments = parser.parse_args(argv[1:])
-    return arguments
-
-
-def main():
-    """Convert metadata to YAML."""
-    load_dotenv()
-    args = parse_command_line(sys.argv)
-
-    pudl.logging_helpers.configure_root_logger(
-        logfile=args.logfile, loglevel=args.loglevel
-    )
-
-    logger.info(f"Exporting Datasette metadata to: {args.output}")
-
-    dm = DatasetteMetadata.from_data_source_ids(os.getenv("PUDL_OUTPUT"))
-    dm.to_yaml(path=args.output)
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/src/pudl/metadata/classes.py b/src/pudl/metadata/classes.py
index 49b97cfb8f..a5e8f0be31 100644
--- a/src/pudl/metadata/classes.py
+++ b/src/pudl/metadata/classes.py
@@ -2021,7 +2021,7 @@ def from_data_source_ids(
             xbrl_resources=xbrl_resources,
         )
 
-    def to_yaml(self, path: str = None) -> None:
+    def to_yaml(self) -> str:
         """Output database, table, and column metadata to YAML file."""
         template = _get_jinja_environment().get_template("datasette-metadata.yml.jinja")
         rendered = template.render(
@@ -2031,7 +2031,4 @@ def to_yaml(self, path: str = None) -> None:
             xbrl_resources=self.xbrl_resources,
             label_columns=self.label_columns,
         )
-        if path:
-            Path(path).write_text(rendered)
-        else:
-            sys.stdout.write(rendered)
+        return rendered
diff --git a/test/integration/datasette_metadata_test.py b/test/integration/datasette_metadata_test.py
index dfd0f0838f..e039a156b3 100644
--- a/test/integration/datasette_metadata_test.py
+++ b/test/integration/datasette_metadata_test.py
@@ -18,7 +18,8 @@ def test_datasette_metadata_to_yml(ferc1_engine_xbrl):
     logger.info(f"Writing Datasette Metadata to {metadata_yml}")
 
     dm = DatasetteMetadata.from_data_source_ids(PudlPaths().output_dir)
-    dm.to_yaml(path=metadata_yml)
+    with metadata_yml.open("w") as f:
+        f.write(dm.to_yaml())
 
     logger.info("Parsing generated metadata using datasette utils.")
     metadata_json = json.dumps(yaml.safe_load(metadata_yml.open()))
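
The publish.py docstring above notes that the Fly volume mounted at /data is handled manually, outside the script. A minimal sketch of that one-time setup with flyctl, assuming the app name, volume name, and region from fly.toml; the 10 GB size is an arbitrary placeholder, since no size is specified anywhere in this diff:

# One-time setup (hypothetical values where noted): create the volume that fly.toml mounts at /data.
# Requires a Fly access token, e.g. the FLY_ACCESS_TOKEN plumbed through the workflow and
# docker-compose changes above, or an interactive `flyctl auth login`.
flyctl volumes create datasette --app catalyst-coop-pudl --region bos --size 10

Once the volume exists, devtools/datasette/publish.py writes the Dockerfile, metadata.yml, and inspect-data.json into devtools/datasette/fly/, compresses the SQLite databases into all_dbs.tar.zst, and runs flyctl deploy from that directory; run.sh then unpacks the archive onto the volume when the container starts.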