Skip to content

Commit

Permalink
Deploy Datasette to fly.io instead of Cloud Run (#3018)
Browse files Browse the repository at this point in the history
* Try using datasette publish fly
* Pull run command into its own shell script; only deploy one dataset for iteration speed.
* Append publishing logs to the logfile as well
  • Loading branch information
jdangerx authored Nov 9, 2023
1 parent a2bdffa commit d8512b5
Show file tree
Hide file tree
Showing 14 changed files with 193 additions and 101 deletions.
1 change: 1 addition & 0 deletions .github/workflows/build-deploy-pudl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ jobs:
--container-env DAGSTER_PG_HOST="104.154.182.24" \
--container-env DAGSTER_PG_DB="dagster-storage" \
--container-env PUDL_SETTINGS_YML="/home/catalyst/src/pudl/package_data/settings/etl_full.yml" \
--container-env FLY_ACCESS_TOKEN=${{ secrets.FLY_ACCESS_TOKEN }} \
# Start the VM
- name: Start the deploy-pudl-vm
Expand Down
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,9 @@ notebooks/*.tgz
terraform/.terraform/*
.env
.hypothesis/

# generated by datasette/publish.py fresh for every deploy - we shouldn't track changes.
devtools/datasette/fly/Dockerfile
devtools/datasette/fly/inspect-data.json
devtools/datasette/fly/metadata.yml
devtools/datasette/fly/all_dbs.tar.zst
34 changes: 34 additions & 0 deletions devtools/datasette/fly/fly.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# fly.toml app configuration file generated for catalyst-coop-pudl on 2023-11-03T15:31:15-04:00
#
# See https://fly.io/docs/reference/configuration/ for information about how to use this file.
#
app = "catalyst-coop-pudl"
primary_region = "bos"

# Persistent volume; run.sh unpacks the SQLite databases into /data at startup.
[[mounts]]
destination = "/data"
source = "datasette"

# Datasette listens on 8080 inside the container (ENV PORT in the Dockerfile).
[[services]]
internal_port = 8080
protocol = "tcp"

[services.concurrency]
hard_limit = 25
soft_limit = 20

[[services.ports]]
handlers = ["http"]
port = 80

[[services.ports]]
handlers = ["tls", "http"]
port = 443

# NOTE(review): the 1m grace period presumably covers the archive
# decompression done by run.sh before datasette starts — confirm it is long
# enough for a full set of databases.
[[services.tcp_checks]]
grace_period = "1m"
interval = 10000
timeout = 2000

[deploy]
wait_timeout = "15m"
10 changes: 10 additions & 0 deletions devtools/datasette/fly/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#! /usr/bin/env bash
# Container entrypoint: unpack the compressed databases onto the /data
# volume, then serve them all with Datasette.
set -eux

shopt -s nullglob

# Drop databases left over from a previous deploy before unpacking.
find /data/ -name '*.sqlite' -delete
mv all_dbs.tar.zst /data
zstd -f -d /data/all_dbs.tar.zst -o /data/all_dbs.tar
tar -xf /data/all_dbs.tar --directory /data
# Free volume space: neither archive is needed once the databases are extracted.
rm -f /data/all_dbs.tar.zst /data/all_dbs.tar
datasette serve --host 0.0.0.0 /data/*.sqlite --cors --inspect-file inspect-data.json --metadata metadata.yml --setting sql_time_limit_ms 5000 --port "$PORT"
122 changes: 122 additions & 0 deletions devtools/datasette/publish.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""Publish the datasette to fly.io.
We use custom logic here because the datasette-publish-fly plugin bakes the
uncompressed databases into the image, which makes the image too large.
We compress the databases before baking them into the image. Then we decompress
them at runtime to a Fly volume mounted at /data. This avoids a long download
at startup, and allows us to stay within the Fly.io 8GB image size limit.
The volume handling is done manually outside of this publish.py script - it
should be terraformed at some point.
Some static fly.io deployment-related files live in ./fly:
* fly.toml - service configuration
* run.sh - service entrypoint
Apart from that: the Dockerfile and dataset-specific
metadata.yml/inspect-data.json are generated by this script.
"""

import json
import logging
import secrets
from pathlib import Path
from subprocess import check_call, check_output

from pudl.metadata.classes import DatasetteMetadata
from pudl.workspace.setup import PudlPaths

logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)

# Dockerfile for the Fly.io image. The {datasette_secret} placeholder is
# filled in by make_dockerfile() with a fresh random value on every deploy.
# The databases themselves are shipped as a zstd-compressed tarball in the
# build context and unpacked at runtime by run.sh (the CMD below).
DOCKERFILE_TEMPLATE = """
FROM python:3.11.0-slim-bullseye
COPY . /app
WORKDIR /app
RUN apt-get update
RUN apt-get install -y zstd
ENV DATASETTE_SECRET '{datasette_secret}'
RUN pip install -U datasette datasette-cluster-map datasette-vega datasette-block-robots
ENV PORT 8080
EXPOSE 8080
CMD ["./run.sh"]
"""


def make_dockerfile():
    """Render the Dockerfile template used by the fly deploy.

    A fresh Datasette secret is generated on every render, so no secret
    management is needed outside of this script.
    """
    token = secrets.token_hex(16)
    return DOCKERFILE_TEMPLATE.format(datasette_secret=token)


def inspect_data(datasets, pudl_out):
    """Pre-inspect databases to generate some metadata for Datasette.

    This is done in the image build process in datasette-publish-fly, but since
    we don't have access to the databases in the build process we have to
    inspect before building the Docker image.

    The file paths reported by ``datasette inspect`` point at the local output
    directory, so each one is rewritten to live under /data, where run.sh
    unpacks the databases inside the container.
    """
    cmd = ["datasette", "inspect"] + [str(pudl_out / ds) for ds in datasets]
    inspect_output = json.loads(check_output(cmd))  # noqa: S603

    for entry in inspect_output.values():
        entry["file"] = str(Path("/data") / Path(entry["file"]).name)
    return inspect_output


def metadata(pudl_out: Path) -> str:
    """Return human-readable metadata for Datasette, rendered as a YAML string.

    Args:
        pudl_out: directory containing the PUDL output databases.
    """
    return DatasetteMetadata.from_data_source_ids(pudl_out).to_yaml()


def main():
    """Generate deployment files and run the deploy.

    Writes the generated Dockerfile, inspect-data.json and metadata.yml into
    ./fly, compresses every ``*.sqlite`` database from the PUDL output
    directory into the Docker build context, then runs ``flyctl deploy``.

    Raises:
        RuntimeError: if no SQLite databases are found in the PUDL output
            directory — deploying an empty archive would silently publish
            a datasette with no data.
    """
    fly_dir = Path(__file__).parent.absolute() / "fly"
    docker_path = fly_dir / "Dockerfile"
    inspect_path = fly_dir / "inspect-data.json"
    metadata_path = fly_dir / "metadata.yml"

    pudl_out = PudlPaths().pudl_output
    datasets = [str(p.name) for p in pudl_out.glob("*.sqlite")]
    if not datasets:
        # Fail fast instead of shipping an empty tarball to production.
        raise RuntimeError(f"No *.sqlite databases found in {pudl_out}.")

    logging.info(f"Inspecting DBs for datasette: {datasets}...")
    inspect_output = inspect_data(datasets, pudl_out)
    with inspect_path.open("w") as f:
        f.write(json.dumps(inspect_output))

    logging.info("Writing metadata...")
    with metadata_path.open("w") as f:
        f.write(metadata(pudl_out))

    logging.info("Writing Dockerfile...")
    with docker_path.open("w") as f:
        f.write(make_dockerfile())

    logging.info(f"Compressing {datasets} and putting into docker context...")
    # -a picks the compressor from the .zst suffix; archive lands in fly_dir
    # so it is included in the Docker build context.
    check_call(
        ["tar", "-a", "-czvf", fly_dir / "all_dbs.tar.zst"] + datasets,  # noqa: S603
        cwd=pudl_out,
    )

    logging.info("Running fly deploy...")
    check_call(["/usr/bin/env", "flyctl", "deploy"], cwd=fly_dir)  # noqa: S603
    logging.info("Deploy finished!")


if __name__ == "__main__":
    main()
26 changes: 0 additions & 26 deletions devtools/datasette/publish.sh

This file was deleted.

6 changes: 6 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
FROM condaforge/mambaforge:23.3.1-1

SHELL [ "/bin/bash", "-exo", "pipefail", "-c" ]

# Install curl and js
# awscli requires unzip, less, groff and mandoc
# hadolint ignore=DL3008
Expand All @@ -24,6 +26,10 @@ ENV CONTAINER_HOME=/home/catalyst
USER catalyst
WORKDIR ${CONTAINER_HOME}

# Install flyctl
RUN curl -L https://fly.io/install.sh | sh
ENV PATH="${CONTAINER_HOME}/.fly/bin:$PATH"

ENV CONDA_PREFIX=${CONTAINER_HOME}/env
ENV PUDL_REPO=${CONTAINER_HOME}/pudl
ENV CONDA_RUN="conda run --no-capture-output --prefix ${CONDA_PREFIX}"
Expand Down
1 change: 1 addition & 0 deletions docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ services:
environment:
- API_KEY_EIA
- GCP_BILLING_PROJECT
- FLY_ACCESS_TOKEN
env_file:
- .env
build:
Expand Down
14 changes: 9 additions & 5 deletions docker/gcp_pudl_etl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -85,20 +85,24 @@ function notify_slack() {
# 2>&1 redirects stderr to stdout.
run_pudl_etl 2>&1 | tee $LOGFILE

# Notify slack if the etl succeeded.
# if pipeline is successful, distribute + publish datasette
if [[ ${PIPESTATUS[0]} == 0 ]]; then
notify_slack "success"

# Dump outputs to s3 bucket if branch is dev or build was triggered by a tag
if [ $GITHUB_ACTION_TRIGGER = "push" ] || [ $GITHUB_REF = "dev" ]; then
copy_outputs_to_distribution_bucket
fi

# Deploy the updated data to datasette
if [ $GITHUB_REF = "dev" ]; then
gcloud config set run/region us-central1
source ~/devtools/datasette/publish.sh
python ~/devtools/datasette/publish.py 2>&1 | tee -a $LOGFILE
fi
fi

# Notify slack about entire pipeline's success or failure;
# PIPESTATUS[0] either refers to the failed ETL run or the last distribution
# task that was run above
if [[ ${PIPESTATUS[0]} == 0 ]]; then
notify_slack "success"
else
notify_slack "failure"
fi
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,6 @@ keywords = [
metadata_to_rst = "pudl.convert.metadata_to_rst:main"
epacems_to_parquet = "pudl.convert.epacems_to_parquet:main"
ferc_to_sqlite = "pudl.ferc_to_sqlite.cli:main"
datasette_metadata_to_yml = "pudl.convert.datasette_metadata_to_yml:main"
pudl_datastore = "pudl.workspace.datastore:main"
pudl_etl = "pudl.cli.etl:main"
pudl_setup = "pudl.workspace.setup_cli:main"
Expand Down
1 change: 0 additions & 1 deletion src/pudl/convert/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
"""
from . import (
censusdp1tract_to_sqlite,
datasette_metadata_to_yml,
epacems_to_parquet,
metadata_to_rst,
)
62 changes: 0 additions & 62 deletions src/pudl/convert/datasette_metadata_to_yml.py

This file was deleted.

7 changes: 2 additions & 5 deletions src/pudl/metadata/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2021,7 +2021,7 @@ def from_data_source_ids(
xbrl_resources=xbrl_resources,
)

def to_yaml(self, path: str = None) -> None:
def to_yaml(self) -> str:
"""Output database, table, and column metadata to YAML file."""
template = _get_jinja_environment().get_template("datasette-metadata.yml.jinja")
rendered = template.render(
Expand All @@ -2031,7 +2031,4 @@ def to_yaml(self, path: str = None) -> None:
xbrl_resources=self.xbrl_resources,
label_columns=self.label_columns,
)
if path:
Path(path).write_text(rendered)
else:
sys.stdout.write(rendered)
return rendered
3 changes: 2 additions & 1 deletion test/integration/datasette_metadata_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ def test_datasette_metadata_to_yml(ferc1_engine_xbrl):
logger.info(f"Writing Datasette Metadata to {metadata_yml}")

dm = DatasetteMetadata.from_data_source_ids(PudlPaths().output_dir)
dm.to_yaml(path=metadata_yml)
with metadata_yml.open("w") as f:
f.write(dm.to_yaml())

logger.info("Parsing generated metadata using datasette utils.")
metadata_json = json.dumps(yaml.safe_load(metadata_yml.open()))
Expand Down

0 comments on commit d8512b5

Please sign in to comment.