Skip to content

Commit

Permalink
Deploy Datasette to fly.io instead of Cloud Run (#3018)
Browse files Browse the repository at this point in the history
* Try using datasette publish fly
* Pull run command into its own shell script; only deploy one dataset for iteration speed.
* Append publishing logs to the logfile as well
  • Loading branch information
jdangerx authored Nov 9, 2023
1 parent a2bdffa commit d8512b5
Show file tree
Hide file tree
Showing 14 changed files with 193 additions and 101 deletions.
1 change: 1 addition & 0 deletions .github/workflows/build-deploy-pudl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ jobs:
--container-env DAGSTER_PG_HOST="104.154.182.24" \
--container-env DAGSTER_PG_DB="dagster-storage" \
--container-env PUDL_SETTINGS_YML="/home/catalyst/src/pudl/package_data/settings/etl_full.yml" \
--container-env FLY_ACCESS_TOKEN=${{ secrets.FLY_ACCESS_TOKEN }} \
# Start the VM
- name: Start the deploy-pudl-vm
Expand Down
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,9 @@ notebooks/*.tgz
terraform/.terraform/*
.env
.hypothesis/

# generated by datasette/publish.py fresh for every deploy - we shouldn't track changes.
devtools/datasette/fly/Dockerfile
devtools/datasette/fly/inspect-data.json
devtools/datasette/fly/metadata.yml
devtools/datasette/fly/all_dbs.tar.zst
34 changes: 34 additions & 0 deletions devtools/datasette/fly/fly.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# fly.toml app configuration file generated for catalyst-coop-pudl on 2023-11-03T15:31:15-04:00
#
# See https://fly.io/docs/reference/configuration/ for information about how to use this file.
#
app = "catalyst-coop-pudl"
primary_region = "bos"

# Persistent volume; run.sh unpacks the SQLite databases into /data at startup.
[[mounts]]
destination = "/data"
source = "datasette"

# Datasette listens on 8080 inside the container (ENV PORT in the Dockerfile).
[[services]]
internal_port = 8080
protocol = "tcp"

[services.concurrency]
hard_limit = 25
soft_limit = 20

[[services.ports]]
handlers = ["http"]
port = 80

[[services.ports]]
handlers = ["tls", "http"]
port = 443

# NOTE(review): the 1m grace period presumably covers the archive
# decompression done by run.sh before datasette starts — confirm it is long
# enough for a full set of databases.
[[services.tcp_checks]]
grace_period = "1m"
interval = 10000
timeout = 2000

[deploy]
wait_timeout = "15m"
10 changes: 10 additions & 0 deletions devtools/datasette/fly/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#! /usr/bin/env bash
# Container entrypoint: unpack the compressed databases onto the /data
# volume, then serve them all with Datasette.
set -eux

shopt -s nullglob

# Drop databases left over from a previous deploy before unpacking.
find /data/ -name '*.sqlite' -delete
mv all_dbs.tar.zst /data
zstd -f -d /data/all_dbs.tar.zst -o /data/all_dbs.tar
tar -xf /data/all_dbs.tar --directory /data
# Free volume space: neither archive is needed once the databases are extracted.
rm -f /data/all_dbs.tar.zst /data/all_dbs.tar
datasette serve --host 0.0.0.0 /data/*.sqlite --cors --inspect-file inspect-data.json --metadata metadata.yml --setting sql_time_limit_ms 5000 --port "$PORT"
122 changes: 122 additions & 0 deletions devtools/datasette/publish.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""Publish the datasette to fly.io.
We use custom logic here because the datasette-publish-fly plugin bakes the
uncompressed databases into the image, which makes the image too large.
We compress the databases before baking them into the image. Then we decompress
them at runtime to a Fly volume mounted at /data. This avoids a long download
at startup, and allows us to stay within the Fly.io 8GB image size limit.
The volume handling is done manually outside of this publish.py script - it
should be terraformed at some point.
Some static fly.io deployment-related files live in ./fly:
* fly.toml - service configuration
* run.sh - service entrypoint
Apart from that: the Dockerfile and dataset-specific
metadata.yml/inspect-data.json are generated by this script.
"""

import json
import logging
import secrets
from pathlib import Path
from subprocess import check_call, check_output

from pudl.metadata.classes import DatasetteMetadata
from pudl.workspace.setup import PudlPaths

logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)

# Dockerfile for the Fly.io image. The {datasette_secret} placeholder is
# filled in by make_dockerfile() with a fresh random value on every deploy.
# The databases themselves are shipped as a zstd-compressed tarball in the
# build context and unpacked at runtime by run.sh (the CMD below).
DOCKERFILE_TEMPLATE = """
FROM python:3.11.0-slim-bullseye
COPY . /app
WORKDIR /app
RUN apt-get update
RUN apt-get install -y zstd
ENV DATASETTE_SECRET '{datasette_secret}'
RUN pip install -U datasette datasette-cluster-map datasette-vega datasette-block-robots
ENV PORT 8080
EXPOSE 8080
CMD ["./run.sh"]
"""


def make_dockerfile():
    """Render the Dockerfile template used by the fly deploy.

    A fresh Datasette secret is generated on every render, so no secret
    management is needed outside of this script.
    """
    token = secrets.token_hex(16)
    return DOCKERFILE_TEMPLATE.format(datasette_secret=token)


def inspect_data(datasets, pudl_out):
    """Pre-inspect databases to generate some metadata for Datasette.

    This is done in the image build process in datasette-publish-fly, but since
    we don't have access to the databases in the build process we have to
    inspect before building the Docker image.

    The file paths reported by ``datasette inspect`` point at the local output
    directory, so each one is rewritten to live under /data, where run.sh
    unpacks the databases inside the container.
    """
    cmd = ["datasette", "inspect"] + [str(pudl_out / ds) for ds in datasets]
    inspect_output = json.loads(check_output(cmd))  # noqa: S603

    for entry in inspect_output.values():
        entry["file"] = str(Path("/data") / Path(entry["file"]).name)
    return inspect_output


def metadata(pudl_out: Path) -> str:
    """Return human-readable metadata for Datasette, rendered as a YAML string.

    Args:
        pudl_out: directory containing the PUDL output databases.
    """
    return DatasetteMetadata.from_data_source_ids(pudl_out).to_yaml()


def main():
    """Generate deployment files and run the deploy.

    Writes the generated Dockerfile, inspect-data.json and metadata.yml into
    ./fly, compresses every ``*.sqlite`` database from the PUDL output
    directory into the Docker build context, then runs ``flyctl deploy``.

    Raises:
        RuntimeError: if no SQLite databases are found in the PUDL output
            directory — deploying an empty archive would silently publish
            a datasette with no data.
    """
    fly_dir = Path(__file__).parent.absolute() / "fly"
    docker_path = fly_dir / "Dockerfile"
    inspect_path = fly_dir / "inspect-data.json"
    metadata_path = fly_dir / "metadata.yml"

    pudl_out = PudlPaths().pudl_output
    datasets = [str(p.name) for p in pudl_out.glob("*.sqlite")]
    if not datasets:
        # Fail fast instead of shipping an empty tarball to production.
        raise RuntimeError(f"No *.sqlite databases found in {pudl_out}.")

    logging.info(f"Inspecting DBs for datasette: {datasets}...")
    inspect_output = inspect_data(datasets, pudl_out)
    with inspect_path.open("w") as f:
        f.write(json.dumps(inspect_output))

    logging.info("Writing metadata...")
    with metadata_path.open("w") as f:
        f.write(metadata(pudl_out))

    logging.info("Writing Dockerfile...")
    with docker_path.open("w") as f:
        f.write(make_dockerfile())

    logging.info(f"Compressing {datasets} and putting into docker context...")
    # -a picks the compressor from the .zst suffix; archive lands in fly_dir
    # so it is included in the Docker build context.
    check_call(
        ["tar", "-a", "-czvf", fly_dir / "all_dbs.tar.zst"] + datasets,  # noqa: S603
        cwd=pudl_out,
    )

    logging.info("Running fly deploy...")
    check_call(["/usr/bin/env", "flyctl", "deploy"], cwd=fly_dir)  # noqa: S603
    logging.info("Deploy finished!")


if __name__ == "__main__":
    main()
26 changes: 0 additions & 26 deletions devtools/datasette/publish.sh

This file was deleted.

6 changes: 6 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
FROM condaforge/mambaforge:23.3.1-1

SHELL [ "/bin/bash", "-exo", "pipefail", "-c" ]

# Install curl and js
# awscli requires unzip, less, groff and mandoc
# hadolint ignore=DL3008
Expand All @@ -24,6 +26,10 @@ ENV CONTAINER_HOME=/home/catalyst
USER catalyst
WORKDIR ${CONTAINER_HOME}

# Install flyctl
RUN curl -L https://fly.io/install.sh | sh
ENV PATH="${CONTAINER_HOME}/.fly/bin:$PATH"

ENV CONDA_PREFIX=${CONTAINER_HOME}/env
ENV PUDL_REPO=${CONTAINER_HOME}/pudl
ENV CONDA_RUN="conda run --no-capture-output --prefix ${CONDA_PREFIX}"
Expand Down
1 change: 1 addition & 0 deletions docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ services:
environment:
- API_KEY_EIA
- GCP_BILLING_PROJECT
- FLY_ACCESS_TOKEN
env_file:
- .env
build:
Expand Down
14 changes: 9 additions & 5 deletions docker/gcp_pudl_etl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -85,20 +85,24 @@ function notify_slack() {
# 2>&1 redirects stderr to stdout.
run_pudl_etl 2>&1 | tee $LOGFILE

# Notify slack if the etl succeeded.
# if pipeline is successful, distribute + publish datasette
if [[ ${PIPESTATUS[0]} == 0 ]]; then
notify_slack "success"

# Dump outputs to s3 bucket if branch is dev or build was triggered by a tag
if [ $GITHUB_ACTION_TRIGGER = "push" ] || [ $GITHUB_REF = "dev" ]; then
copy_outputs_to_distribution_bucket
fi

# Deploy the updated data to datasette
if [ $GITHUB_REF = "dev" ]; then
gcloud config set run/region us-central1
source ~/devtools/datasette/publish.sh
python ~/devtools/datasette/publish.py 2>&1 | tee -a $LOGFILE
fi
fi

# Notify slack about entire pipeline's success or failure;
# PIPESTATUS[0] either refers to the failed ETL run or the last distribution
# task that was run above
if [[ ${PIPESTATUS[0]} == 0 ]]; then
notify_slack "success"
else
notify_slack "failure"
fi
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,6 @@ keywords = [
metadata_to_rst = "pudl.convert.metadata_to_rst:main"
epacems_to_parquet = "pudl.convert.epacems_to_parquet:main"
ferc_to_sqlite = "pudl.ferc_to_sqlite.cli:main"
datasette_metadata_to_yml = "pudl.convert.datasette_metadata_to_yml:main"
pudl_datastore = "pudl.workspace.datastore:main"
pudl_etl = "pudl.cli.etl:main"
pudl_setup = "pudl.workspace.setup_cli:main"
Expand Down
1 change: 0 additions & 1 deletion src/pudl/convert/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
"""
from . import (
censusdp1tract_to_sqlite,
datasette_metadata_to_yml,
epacems_to_parquet,
metadata_to_rst,
)
62 changes: 0 additions & 62 deletions src/pudl/convert/datasette_metadata_to_yml.py

This file was deleted.

7 changes: 2 additions & 5 deletions src/pudl/metadata/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2021,7 +2021,7 @@ def from_data_source_ids(
xbrl_resources=xbrl_resources,
)

def to_yaml(self, path: str = None) -> None:
def to_yaml(self) -> str:
"""Output database, table, and column metadata to YAML file."""
template = _get_jinja_environment().get_template("datasette-metadata.yml.jinja")
rendered = template.render(
Expand All @@ -2031,7 +2031,4 @@ def to_yaml(self, path: str = None) -> None:
xbrl_resources=self.xbrl_resources,
label_columns=self.label_columns,
)
if path:
Path(path).write_text(rendered)
else:
sys.stdout.write(rendered)
return rendered
3 changes: 2 additions & 1 deletion test/integration/datasette_metadata_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ def test_datasette_metadata_to_yml(ferc1_engine_xbrl):
logger.info(f"Writing Datasette Metadata to {metadata_yml}")

dm = DatasetteMetadata.from_data_source_ids(PudlPaths().output_dir)
dm.to_yaml(path=metadata_yml)
with metadata_yml.open("w") as f:
f.write(dm.to_yaml())

logger.info("Parsing generated metadata using datasette utils.")
metadata_json = json.dumps(yaml.safe_load(metadata_yml.open()))
Expand Down

0 comments on commit d8512b5

Please sign in to comment.