Skip to content

Commit

Permalink
Adjust Dockerfile & deployment to use micromamba
Browse files Browse the repository at this point in the history
* Use micromamba, not conda, in the Dockerfile CMD; also use pip install --no-deps
* Use micromamba, not conda, in the command passed to the build container
* Use default mambauser rather than catalyst in docker container
* Remove --no-capture-output, which isn't supported by micromamba. Is this a problem?
* Remove uninterpolated vars in .env and more --no-capture-output
* Separate ETL and pytest commands.
* Stop trying to run tests in parallel. Sigh.
* Add google cloud sdk to conda environment.
* Install Google Cloud SDK from conda-forge.
* Add back in the making of required directories. Oops.
* Attempt to have "micromamba run" pass output through
  • Loading branch information
zaneselvans committed Oct 26, 2023
1 parent 69a6342 commit 4aa5959
Show file tree
Hide file tree
Showing 13 changed files with 3,683 additions and 19,987 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/build-deploy-pudl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,12 @@ jobs:
gcloud compute instances update-container "$GCE_INSTANCE" \
--zone "$GCE_INSTANCE_ZONE" \
--container-image "docker.io/catalystcoop/pudl-etl:${{ env.GITHUB_REF }}" \
--container-command "conda" \
--container-command "micromamba" \
--container-arg="run" \
--container-arg="--no-capture-output" \
--container-arg="-p" \
--container-arg="/home/catalyst/env" \
--container-arg="--prefix" \
--container-arg="/home/mambauser/env" \
--container-arg="--attach" \
--container-arg='' \
--container-arg="bash" \
--container-arg="./docker/gcp_pudl_etl.sh" \
--container-env-file="./docker/.env" \
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/zenodo-cache-sync.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ env:
PUBLIC_ZENODO_CACHE_BUCKET: gs://zenodo-cache.catalyst.coop
GITHUB_REF: ${{ github.ref_name }} # This is changed to dev if running on a schedule
PUDL_OUTPUT: ~/pudl-work/output
PUDL_INPUT: ~/pudl-work/data/
PUDL_INPUT: ~/pudl-work/input/

jobs:
zenodo-cache-sync:
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ endif

# Regenerate the conda lockfile and render platform specific conda environments.
conda-lock:
rm -f environments/conda-lock.yml
rm -f environments/conda-*lock.yml
conda-lock \
--${mamba} \
--file=pyproject.toml \
Expand Down
16 changes: 8 additions & 8 deletions docker/.env
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
HOST_PUDL_IN=./pudl_in
HOST_PUDL_OUT=./pudl_out
CONTAINER_HOME=/home/catalyst
PUDL_INPUT=/home/catalyst/pudl_work/data
PUDL_OUTPUT=/home/catalyst/pudl_work/output
DAGSTER_HOME=/home/catalyst/pudl_work/dagster_home
CONDA_PREFIX=/home/catalyst/env
PUDL_SETTINGS_YML=/home/catalyst/src/pudl/package_data/settings/etl_full.yml
LOGFILE=/home/catalyst/pudl_work/output/pudl-etl.log
CONDA_RUN="conda run --no-capture-output --prefix /home/catalyst/env"
CONTAINER_HOME=/home/mambauser
PUDL_INPUT=/home/mambauser/pudl_work/input
PUDL_OUTPUT=/home/mambauser/pudl_work/output
DAGSTER_HOME=/home/mambauser/pudl_work/dagster_home
CONDA_PREFIX=/home/mambauser/env
PUDL_SETTINGS_YML=/home/mambauser/src/pudl/package_data/settings/etl_full.yml
LOGFILE=/home/mambauser/pudl_work/output/pudl-etl.log
CONDA_RUN="micromamba run --prefix /home/mambauser/env --attach ''"
GCS_CACHE=gs://zenodo-cache.catalyst.coop
7 changes: 5 additions & 2 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ ENV PUDL_INPUT=${CONTAINER_PUDL_WORKSPACE}/input
ENV PUDL_OUTPUT=${CONTAINER_PUDL_WORKSPACE}/output
ENV DAGSTER_HOME=${CONTAINER_PUDL_WORKSPACE}/dagster_home

# Create data input/output directories
RUN mkdir -p ${PUDL_INPUT} ${PUDL_OUTPUT} ${DAGSTER_HOME}

# Create a conda environment based on the specification in the repo
COPY environments/conda-lock.yml environments/conda-lock.yml
RUN micromamba create --prefix ${CONDA_PREFIX} --yes --category main dev docs test datasette --file environments/conda-lock.yml && \
Expand All @@ -46,9 +49,9 @@ ENV LD_LIBRARY_PATH=${CONDA_PREFIX}/lib
# We need information from .git to get version with setuptools_scm so we mount that
# directory without copying it into the image.
RUN --mount=type=bind,source=.git,target=${PUDL_REPO}/.git \
${CONDA_RUN} pip install --no-cache-dir --editable . && \
${CONDA_RUN} pip install --no-cache-dir --no-deps --editable . && \
# Run the PUDL setup script so we know where to read and write data
${CONDA_RUN} pudl_setup

# Run the unit tests:
CMD ["conda", "run", "--no-capture-output", "--prefix", "${CONDA_PREFIX}", "pytest", "test/unit"]
CMD ["micromamba", "run", "--prefix", "${CONDA_PREFIX}", "--attach", "''", "pytest", "test/unit"]
6 changes: 2 additions & 4 deletions docker/gcp_pudl_etl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,14 @@ function run_pudl_etl() {
--max-concurrent 6 \
--gcs-cache-path gs://internal-zenodo-cache.catalyst.coop \
$PUDL_SETTINGS_YML && \
# Run multiple pytest processes in the background and wait for them to exit
pytest \
--gcs-cache-path=gs://internal-zenodo-cache.catalyst.coop \
--etl-settings=$PUDL_SETTINGS_YML \
--live-dbs test/integration test/unit & \
--live-dbs test/integration test/unit && \
pytest \
--gcs-cache-path=gs://internal-zenodo-cache.catalyst.coop \
--etl-settings=$PUDL_SETTINGS_YML \
--live-dbs test/validate & \
wait
--live-dbs test/validate
}

function shutdown_vm() {
Expand Down
15 changes: 8 additions & 7 deletions environments/conda-linux-64.lock.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Generated by conda-lock.
# platform: linux-64
# input_hash: 7c8b2f7fe28cdfc4b83bb1b9bf64f5957d45794c915656c2c040ddb9df08ef23
# input_hash: 06292074438cca8fc2f86928d2604f3a6bed207c4a90a6b035b70dc335244765

channels:
- conda-forge
Expand Down Expand Up @@ -174,6 +174,7 @@ dependencies:
- freexl=2.0.0=h743c826_0
- frozenlist=1.4.0=py311h459d7ec_1
- fsspec=2023.9.2=pyh1a96a4e_0
- google-cloud-sdk=452.0.1=py311h38be061_0
- greenlet=3.0.1=py311hb755f60_0
- grpcio=1.57.0=py311ha6695c7_2
- hpack=4.0.0=pyh9f0ad1d_0
Expand Down Expand Up @@ -294,13 +295,13 @@ dependencies:
- xlsxwriter=3.1.7=pyhd8ed1ab_0
- xorg-libxext=1.3.4=h0b41bf4_2
- xorg-libxrender=0.9.11=hd590300_0
- xyzservices=2023.10.0=pyhd8ed1ab_0
- xyzservices=2023.10.1=pyhd8ed1ab_0
- zipp=3.17.0=pyhd8ed1ab_0
- aiosignal=1.3.1=pyhd8ed1ab_0
- anyio=4.0.0=pyhd8ed1ab_0
- asgi-csrf=0.9=pyhd8ed1ab_0
- asgiref=3.7.2=pyhd8ed1ab_0
- asttokens=2.4.0=pyhd8ed1ab_0
- asttokens=2.4.1=pyhd8ed1ab_0
- async-lru=2.0.4=pyhd8ed1ab_0
- aws-c-auth=0.7.4=h1083cbe_2
- aws-c-mqtt=0.9.7=h55cd26b_0
Expand Down Expand Up @@ -383,7 +384,7 @@ dependencies:
- arrow=1.3.0=pyhd8ed1ab_0
- async-timeout=4.0.3=pyhd8ed1ab_0
- aws-c-s3=0.3.17=hfb4bb88_4
- botocore=1.31.70=pyhd8ed1ab_0
- botocore=1.31.71=pyhd8ed1ab_0
- branca=0.6.0=pyhd8ed1ab_0
- cmarkgfm=0.8.0=py311h459d7ec_3
- croniter=2.0.1=pyhd8ed1ab_0
Expand Down Expand Up @@ -427,10 +428,10 @@ dependencies:
- wcwidth=0.2.8=pyhd8ed1ab_0
- aiohttp=3.8.6=py311h459d7ec_1
- alembic=1.12.0=pyhd8ed1ab_0
- arelle-release=2.16.3=pyhd8ed1ab_0
- arelle-release=2.17.0=pyhd8ed1ab_0
- argon2-cffi=23.1.0=pyhd8ed1ab_0
- aws-crt-cpp=0.24.2=ha28989d_2
- black=23.10.0=py311h38be061_0
- black=23.10.1=py311h38be061_0
- bottleneck=1.3.7=py311h1f0f07a_1
- cachecontrol=0.13.1=pyhd8ed1ab_0
- contourpy=1.1.1=py311h9547e67_1
Expand Down Expand Up @@ -468,7 +469,7 @@ dependencies:
- uvicorn-standard=0.23.2=h38be061_1
- virtualenv=20.24.6=pyhd8ed1ab_0
- aws-sdk-cpp=1.11.156=h314d761_4
- boto3=1.28.70=pyhd8ed1ab_0
- boto3=1.28.71=pyhd8ed1ab_0
- cachecontrol-with-filecache=0.13.1=pyhd8ed1ab_0
- dagster=1.5.4=pyhd8ed1ab_1
- datasette=0.64.4=pyhd8ed1ab_1
Expand Down
Loading

0 comments on commit 4aa5959

Please sign in to comment.