Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deploy Datasette to fly.io instead of Cloud Run #3018

Merged
merged 6 commits into from
Nov 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/build-deploy-pudl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ jobs:
--container-env DAGSTER_PG_HOST="104.154.182.24" \
--container-env DAGSTER_PG_DB="dagster-storage" \
--container-env PUDL_SETTINGS_YML="/home/catalyst/src/pudl/package_data/settings/etl_full.yml" \
--container-env FLY_ACCESS_TOKEN=${{ secrets.FLY_ACCESS_TOKEN }} \

# Start the VM
- name: Start the deploy-pudl-vm
Expand Down
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,9 @@ notebooks/*.tgz
terraform/.terraform/*
.env
.hypothesis/

# generated by datasette/publish.py fresh for every deploy - we shouldn't track changes.
devtools/datasette/fly/Dockerfile
devtools/datasette/fly/inspect-data.json
devtools/datasette/fly/metadata.yml
devtools/datasette/fly/all_dbs.tar.zst
bendnorman marked this conversation as resolved.
Show resolved Hide resolved
34 changes: 34 additions & 0 deletions devtools/datasette/fly/fly.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# fly.toml app configuration file generated for catalyst-coop-pudl on 2023-11-03T15:31:15-04:00
#
# See https://fly.io/docs/reference/configuration/ for information about how to use this file.
#
app = "catalyst-coop-pudl"
primary_region = "bos"

# Persistent volume where run.sh unpacks the SQLite databases at startup.
[[mounts]]
destination = "/data"
source = "datasette"

[[services]]
internal_port = 8080  # matches ENV PORT in the Dockerfile generated by publish.py
protocol = "tcp"

# Shed load before Datasette gets overwhelmed.
[services.concurrency]
hard_limit = 25
soft_limit = 20

[[services.ports]]
handlers = ["http"]
port = 80

[[services.ports]]
handlers = ["tls", "http"]
port = 443

# Generous grace period: run.sh decompresses the database archive before
# Datasette starts listening.
[[services.tcp_checks]]
grace_period = "1m"
interval = 10000  # NOTE(review): presumably milliseconds — confirm against Fly docs
timeout = 2000  # NOTE(review): presumably milliseconds — confirm against Fly docs

# Allow slow first-start (archive decompression) before the deploy times out.
[deploy]
wait_timeout = "15m"
10 changes: 10 additions & 0 deletions devtools/datasette/fly/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#! /usr/bin/env bash
# Container entrypoint: move the compressed database archive onto the /data
# volume, unpack it, and serve everything with Datasette (see publish.py).
set -eux

# Expand unmatched globs to nothing rather than the literal pattern, so the
# datasette invocation below doesn't receive a bogus '/data/*.sqlite' arg.
shopt -s nullglob

# Drop databases left over from a previous deploy before unpacking new ones.
find /data/ -name '*.sqlite' -delete
mv all_dbs.tar.zst /data
bendnorman marked this conversation as resolved.
Show resolved Hide resolved
# Decompress (-d) the archive, overwriting (-f) any existing tar, then
# extract the databases into /data.
zstd -f -d /data/all_dbs.tar.zst -o /data/all_dbs.tar
tar -xf /data/all_dbs.tar --directory /data
# Serve every unpacked database on $PORT (8080 per the Dockerfile/fly.toml);
# inspect-data.json and metadata.yml are generated by publish.py at deploy time.
datasette serve --host 0.0.0.0 /data/*.sqlite --cors --inspect-file inspect-data.json --metadata metadata.yml --setting sql_time_limit_ms 5000 --port $PORT
122 changes: 122 additions & 0 deletions devtools/datasette/publish.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""Publish the datasette to fly.io.

We use custom logic here because the datasette-publish-fly plugin bakes the
uncompressed databases into the image, which makes the image too large.

We compress the databases before baking them into the image. Then we decompress
them at runtime to a Fly volume mounted at /data. This avoids a long download
at startup, and allows us stay within the Fly.io 8GB image size limit.

The volume handling is done manually outside of this publish.py script - it
should be terraformed at some point.

Some static fly.io deployment-related files live in ./fly:
* fly.toml - service configuration
* run.sh - service entrypoint

Apart from that: the Dockerfile and dataset-specific
metadata.yml/inspect-data.json are generated by this script.
"""

import json
import logging
import secrets
from pathlib import Path
from subprocess import check_call, check_output

from pudl.metadata.classes import DatasetteMetadata
from pudl.workspace.setup import PudlPaths

logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)

DOCKERFILE_TEMPLATE = """
FROM python:3.11.0-slim-bullseye
COPY . /app
WORKDIR /app

RUN apt-get update
RUN apt-get install -y zstd

ENV DATASETTE_SECRET '{datasette_secret}'
RUN pip install -U datasette datasette-cluster-map datasette-vega datasette-block-robots
ENV PORT 8080
EXPOSE 8080

CMD ["./run.sh"]
"""


def make_dockerfile():
    """Render the Dockerfile used by ``fly deploy``.

    A fresh Datasette signing secret is generated and baked into every
    rendered Dockerfile, so no external secret management is required.
    """
    return DOCKERFILE_TEMPLATE.format(datasette_secret=secrets.token_hex(16))


def inspect_data(datasets, pudl_out):
    """Pre-inspect databases to generate some metadata for Datasette.

    This is done in the image build process in datasette-publish-fly, but since
    we don't have access to the databases in the build process we have to
    inspect before building the Docker image.
    """
    cmd = ["datasette", "inspect"]
    cmd.extend(str(pudl_out / ds) for ds in datasets)
    inspect_output = json.loads(check_output(cmd))  # noqa: S603

    # The databases live under /data inside the container (see run.sh), so
    # rewrite each inspected file path to point at the mounted volume.
    for info in inspect_output.values():
        info["file"] = str(Path("/data") / Path(info["file"]).name)
    return inspect_output


def metadata(pudl_out) -> str:
    """Return human-readable metadata for Datasette."""
    datasette_metadata = DatasetteMetadata.from_data_source_ids(pudl_out)
    return datasette_metadata.to_yaml()


def main():
    """Generate deployment files and run the deploy.

    In order:

    1. Find every ``*.sqlite`` database in the PUDL output directory.
    2. Pre-inspect them and write ``inspect-data.json``.
    3. Write the Datasette ``metadata.yml``.
    4. Write a ``Dockerfile`` containing a freshly generated secret.
    5. Compress all databases into one archive inside the Docker context.
    6. Run ``flyctl deploy`` from the ``fly/`` directory.
    """
    # All generated files land next to the static fly.toml / run.sh so the
    # fly/ directory is a self-contained Docker build context.
    fly_dir = Path(__file__).parent.absolute() / "fly"
    docker_path = fly_dir / "Dockerfile"
    inspect_path = fly_dir / "inspect-data.json"
    metadata_path = fly_dir / "metadata.yml"

    pudl_out = PudlPaths().pudl_output
    datasets = [str(p.name) for p in pudl_out.glob("*.sqlite")]
    logging.info(f"Inspecting DBs for datasette: {datasets}...")
    inspect_output = inspect_data(datasets, pudl_out)
    with inspect_path.open("w") as f:
        f.write(json.dumps(inspect_output))

    logging.info("Writing metadata...")
    with metadata_path.open("w") as f:
        f.write(metadata(pudl_out))

    logging.info("Writing Dockerfile...")
    with docker_path.open("w") as f:
        f.write(make_dockerfile())

    logging.info(f"Compressing {datasets} and putting into docker context...")
    # NOTE(review): both -a (auto-compress by extension) and -z (gzip) are
    # passed; with GNU tar an explicit -z normally takes precedence, which
    # would produce a gzip stream despite the .zst name, yet run.sh
    # decompresses with zstd. Confirm which compressor actually applies.
    check_call(
        ["tar", "-a", "-czvf", fly_dir / "all_dbs.tar.zst"] + datasets,  # noqa: S603
        cwd=pudl_out,
    )

    logging.info("Running fly deploy...")
    # /usr/bin/env lookup finds flyctl wherever it is on PATH (the deploy
    # image installs it under ~/.fly/bin — see docker/Dockerfile).
    check_call(["/usr/bin/env", "flyctl", "deploy"], cwd=fly_dir)  # noqa: S603
    logging.info("Deploy finished!")


if __name__ == "__main__":
    main()
26 changes: 0 additions & 26 deletions devtools/datasette/publish.sh

This file was deleted.

6 changes: 6 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
FROM condaforge/mambaforge:23.3.1-1

SHELL [ "/bin/bash", "-exo", "pipefail", "-c" ]

# Install curl and js
# awscli requires unzip, less, groff and mandoc
# hadolint ignore=DL3008
Expand All @@ -24,6 +26,10 @@ ENV CONTAINER_HOME=/home/catalyst
USER catalyst
WORKDIR ${CONTAINER_HOME}

# Install flyctl
RUN curl -L https://fly.io/install.sh | sh
ENV PATH="${CONTAINER_HOME}/.fly/bin:$PATH"

ENV CONDA_PREFIX=${CONTAINER_HOME}/env
ENV PUDL_REPO=${CONTAINER_HOME}/pudl
ENV CONDA_RUN="conda run --no-capture-output --prefix ${CONDA_PREFIX}"
Expand Down
1 change: 1 addition & 0 deletions docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ services:
environment:
- API_KEY_EIA
- GCP_BILLING_PROJECT
- FLY_ACCESS_TOKEN
env_file:
- .env
build:
Expand Down
14 changes: 9 additions & 5 deletions docker/gcp_pudl_etl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -85,20 +85,24 @@ function notify_slack() {
# 2>&1 redirects stderr to stdout.
run_pudl_etl 2>&1 | tee $LOGFILE

# Notify slack if the etl succeeded.
# if pipeline is successful, distribute + publish datasette
if [[ ${PIPESTATUS[0]} == 0 ]]; then
notify_slack "success"

# Dump outputs to s3 bucket if branch is dev or build was triggered by a tag
if [ $GITHUB_ACTION_TRIGGER = "push" ] || [ $GITHUB_REF = "dev" ]; then
copy_outputs_to_distribution_bucket
fi

# Deploy the updated data to datasette
if [ $GITHUB_REF = "dev" ]; then
gcloud config set run/region us-central1
source ~/devtools/datasette/publish.sh
python ~/devtools/datasette/publish.py 2>&1 | tee -a $LOGFILE
fi
fi

# Notify slack about entire pipeline's success or failure;
# PIPESTATUS[0] either refers to the failed ETL run or the last distribution
# task that was run above
if [[ ${PIPESTATUS[0]} == 0 ]]; then
notify_slack "success"
else
notify_slack "failure"
fi
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,6 @@ keywords = [
metadata_to_rst = "pudl.convert.metadata_to_rst:main"
epacems_to_parquet = "pudl.convert.epacems_to_parquet:main"
ferc_to_sqlite = "pudl.ferc_to_sqlite.cli:main"
datasette_metadata_to_yml = "pudl.convert.datasette_metadata_to_yml:main"
pudl_datastore = "pudl.workspace.datastore:main"
pudl_etl = "pudl.cli.etl:main"
pudl_setup = "pudl.workspace.setup_cli:main"
Expand Down
1 change: 0 additions & 1 deletion src/pudl/convert/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
"""
from . import (
censusdp1tract_to_sqlite,
datasette_metadata_to_yml,
epacems_to_parquet,
metadata_to_rst,
)
62 changes: 0 additions & 62 deletions src/pudl/convert/datasette_metadata_to_yml.py

This file was deleted.

7 changes: 2 additions & 5 deletions src/pudl/metadata/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2021,7 +2021,7 @@ def from_data_source_ids(
xbrl_resources=xbrl_resources,
)

def to_yaml(self, path: str = None) -> None:
def to_yaml(self) -> str:
"""Output database, table, and column metadata to YAML file."""
template = _get_jinja_environment().get_template("datasette-metadata.yml.jinja")
rendered = template.render(
Expand All @@ -2031,7 +2031,4 @@ def to_yaml(self, path: str = None) -> None:
xbrl_resources=self.xbrl_resources,
label_columns=self.label_columns,
)
if path:
Path(path).write_text(rendered)
else:
sys.stdout.write(rendered)
return rendered
3 changes: 2 additions & 1 deletion test/integration/datasette_metadata_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ def test_datasette_metadata_to_yml(ferc1_engine_xbrl):
logger.info(f"Writing Datasette Metadata to {metadata_yml}")

dm = DatasetteMetadata.from_data_source_ids(PudlPaths().output_dir)
dm.to_yaml(path=metadata_yml)
with metadata_yml.open("w") as f:
f.write(dm.to_yaml())

logger.info("Parsing generated metadata using datasette utils.")
metadata_json = json.dumps(yaml.safe_load(metadata_yml.open()))
Expand Down