Set up triggers to automate updating of conda lockfiles
* Remove dagster-postgres version to avoid PyPI conda-forge conflict.
* Break up and distribute the nuke target.
* Resolve issues with pandera dependency. Reorganize makefile.
* Add triggers and commit/PR for workflow_dispatch, pull_request, schedule
  and set appropriate GITHUB_REF values for each case.
* Use push instead of pull_request to trigger on path. This avoids re-locking
  the dependencies every single time you push to a PR that had a change
  to pyproject.toml *somewhere* in it.
* Also trigger based on path if .github/workflows/update-lockfile.yml changes.
* Update conda-lock.yml and rendered conda environment files.
zaneselvans committed Nov 8, 2023
1 parent 2da6ba1 commit 614e4ee
Showing 9 changed files with 489 additions and 487 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/tox-pytest.yml
@@ -46,6 +46,7 @@ jobs:
- name: Lint and build PUDL documentation with Sphinx
run: |
pip install --no-deps --editable .
make docs-build
- name: Upload coverage
@@ -90,6 +91,7 @@ jobs:
- name: Run PUDL unit tests and collect test coverage
run: |
pip install --no-deps --editable .
make pytest-unit
- name: Upload coverage
@@ -166,6 +168,7 @@ jobs:

- name: Run integration tests, trying to use GCS cache if possible
run: |
pip install --no-deps --editable .
make pytest-integration
- name: Upload coverage
81 changes: 51 additions & 30 deletions .github/workflows/update-lockfile.yml
@@ -3,28 +3,46 @@ name: update-lockfile

on:
workflow_dispatch:
#schedule:
# - cron: "0 9 * * 1-5" # Weekdays at 9AM UTC
pull_request:
schedule:
- cron: "0 9 * * 1-5" # Weekdays at 9AM UTC
push:
paths:
- "pyproject.toml"
branches:
- conda-lockfile
- "environments/*"
- ".github/workflows/update-lockfile.yml"

# What branch does this action run on?
# - workflow_dispatch: Whatever branch it was run against.
# - schedule: Always the same branch (will be dev or main)
# - push: Base branch of the PR.
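The per-event selection described in the comment above can be sketched as a small shell helper (`pick_ref` is a hypothetical name; in this revision the scheduled run targets the fixed `conda-lockfile` branch):

```shell
# Hypothetical sketch of the branch-selection logic described above.
# $1 mirrors github.event_name; $2 mirrors github.ref_name.
pick_ref() {
  case "$1" in
    schedule)               echo "conda-lockfile" ;;  # fixed branch for scheduled runs
    workflow_dispatch|push) echo "$2" ;;              # whatever branch triggered the run
    *)                      echo "" ;;                # other events don't relock
  esac
}

pick_ref schedule main            # conda-lockfile
pick_ref workflow_dispatch dev    # dev
pick_ref push my-feature          # my-feature
```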

jobs:
conda-lock:
# Don't run scheduled job on forks.
# if: (github.event_name == 'schedule' && github.repository == 'catalyst-cooperative/pudl') || (github.event_name != 'schedule')
update-conda-lockfile:
runs-on: ubuntu-latest
if: ${{ (github.event_name == 'push' && github.actor != 'pudlbot') || (github.event_name == 'schedule' && github.repository == 'catalyst-cooperative/pudl') || (github.event_name == 'workflow_dispatch') }}
defaults:
run:
shell: bash -l {0}
runs-on: ubuntu-latest
steps:
- name: Set GITHUB_REF for use with workflow_dispatch
if: ${{ (github.event_name == 'workflow_dispatch') }}
run: |
echo "GITHUB_REF="${{ github.ref_name }} >> $GITHUB_ENV
- name: Set GITHUB_REF for use with schedule
if: ${{ (github.event_name == 'schedule') }}
run: |
echo "GITHUB_REF=conda-lockfile" >> $GITHUB_ENV
- name: Set GITHUB_REF for use with push
if: ${{ (github.event_name == 'push') }}
run: |
echo "GITHUB_REF="${{ github.ref_name }} >> $GITHUB_ENV
- name: Log final value of GITHUB_REF
run: |
echo "Final GITHUB_REF:" ${{ env.GITHUB_REF }}
- uses: actions/checkout@v4
# If running on a schedule, run on dev.
# If running from workflow_dispatch, run on whatever the chosen branch/ref was.
# with:
# ref: dev
with:
token: ${{ secrets.PUDL_BOT_PAT }}
ref: ${{ env.GITHUB_REF }}
- name: Install Micromamba
uses: mamba-org/setup-micromamba@v1
with:
@@ -33,31 +51,34 @@ jobs:
python=3.11
conda-lock
prettier
- name: Run conda-lock to recreate lockfile from scratch
run: |
make conda-lock
# TODO: Make this step require the success of the previous make conda-lock step
- name: Open a pull request
make conda-clean
make conda-lock.yml
- name: Commit updated conda lockfiles to branch
# If running on push due to dependency changes, commit directly to the base
# branch of the existing PR. Don't trigger the workflow again if we're already
# running it as pudlbot (to avoid infinite recursion).
if: ${{ (github.event_name == 'push' && github.actor != 'pudlbot') }}
uses: stefanzweifel/git-auto-commit-action@v5
with:
file_pattern: "environments/*"
commit_message: "Update conda-lock.yml and rendered conda environment files."
- name: Make a PR to merge updated conda lockfiles
# If we are relocking dependencies on a schedule or workflow_dispatch, we need
# to make our own PR to check whether the updated environment actually solves
# and the tests pass.
if: ${{ (github.event_name == 'schedule' && github.repository == 'catalyst-cooperative/pudl') || (github.event_name == 'workflow_dispatch') }}
uses: peter-evans/create-pull-request@v5
with:
# # The default GITHUB_TOKEN doesn't allow other workflows to trigger.
# # Thus if there are tests to be run, they won't be run. For more info,
# # see the note under
# # <https://github.com/peter-evans/create-pull-request#action-inputs>.
# # One possible workaround is to specify a Personal Access Token (PAT).
# # This PAT should have read-write permissions for "Pull Requests"
# # and read-write permissions for "Contents".
# token: ${{ secrets.GH_PAT_FOR_PR }}
commit-message: Update lockfile
commit-message: "Update conda-lock.yml and rendered conda environment files."
title: Update Lockfile
body: >
This pull request relocks the dependencies with conda-lock.
It is triggered by [update-lockfile](https://github.com/catalyst-cooperative/pudl/blob/main/.github/workflows/update-lockfile.yml).
labels: dependencies, conda-lock
reviewers: zaneselvans
delete-branch: true
branch: update-lockfile
# base: dev
branch: update-conda-lockfile
base: ${{ env.GITHUB_REF }}
draft: true
delete-branch: true
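Taken together, the `if:` guards on the two final steps route each trigger to one of two outcomes. A hedged shell sketch of that routing (function and label names are illustrative, not part of the workflow):

```shell
# Illustrative sketch of how the last two steps' `if:` guards route events.
# Arguments mirror github.event_name, github.actor, and github.repository.
route_event() {
  local event="$1" actor="$2" repo="$3"
  if [ "$event" = "push" ] && [ "$actor" != "pudlbot" ]; then
    echo "commit-to-branch"   # git-auto-commit-action on the existing PR branch
  elif { [ "$event" = "schedule" ] && [ "$repo" = "catalyst-cooperative/pudl" ]; } \
       || [ "$event" = "workflow_dispatch" ]; then
    echo "open-draft-pr"      # create-pull-request with the relocked files
  else
    echo "skip"               # e.g. a push made by pudlbot itself, or a fork
  fi
}

route_event push zaneselvans catalyst-cooperative/pudl    # commit-to-branch
route_event push pudlbot catalyst-cooperative/pudl        # skip: avoids recursion
route_event schedule pudlbot some/fork                    # skip: forks don't relock
```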
84 changes: 46 additions & 38 deletions Makefile
@@ -7,24 +7,31 @@ etl_fast_yml := src/pudl/package_data/settings/etl_fast.yml
etl_full_yml := src/pudl/package_data/settings/etl_full.yml
pip_install_pudl := pip install --no-deps --editable ./

# We use mamba locally, but micromamba in CI, so choose the right binary:
ifdef GITHUB_ACTION
mamba := micromamba
else
mamba := mamba
endif

# Tell make to look in the environments and output directory for targets and sources.
VPATH = environments:${PUDL_OUTPUT}
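As a minimal, self-contained illustration of what `VPATH` buys here (scratch files and paths below are placeholders, not the real project layout; requires `make` on PATH):

```shell
# Demo of make's VPATH: the prerequisite conda-lock.yml lives in
# environments/, but the rule names it bare; make resolves it via VPATH,
# and $< expands to the resolved path environments/conda-lock.yml.
workdir="$(mktemp -d)"
mkdir -p "$workdir/environments"
printf 'name: pudl-dev\n' > "$workdir/environments/conda-lock.yml"
printf 'VPATH = environments\n\nfound: conda-lock.yml\n\tcp $< $@\n' > "$workdir/Makefile"
make -C "$workdir" found
cat "$workdir/found"
```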

########################################################################################
# Start up the Dagster UI
# Targets for starting up interactive web-interfaces
# Note that these commands do not return until you quit out of the server with ^C
########################################################################################
.PHONY: dagster
dagster:
dagster dev -m pudl.etl -m pudl.ferc_to_sqlite

.PHONY: jlab
jlab:
jupyter lab --no-browser

########################################################################################
# Conda lockfile generation and environment creation
########################################################################################
ifdef GITHUB_ACTION
mamba := micromamba
else
mamba := mamba
endif

# Tell make to look in the environments directory for targets and sources.
VPATH = environments

# Remove pre-existing conda lockfile and rendered environment files
.PHONY: conda-clean
@@ -60,23 +67,47 @@ pudl-dev: conda-lock.yml
install-pudl: pudl-dev
${mamba} run --name pudl-dev pip install --no-deps --editable .


########################################################################################
# Build documentation (for local use)
# Build documentation for local use or testing
########################################################################################
.PHONY: docs-clean
docs-clean:
rm -rf docs/_build
rm -f coverage.xml

# Note that there's some PUDL code which only gets run when we generate the docs, so
# we want to generate coverage from the docs build. Then we need to convert that
# coverage output to XML so it matches the coverage reports generated by pytest below, and can
# be combined into a single unified coverage report.
.PHONY: docs-build
docs-build: docs-clean
doc8 docs/ README.rst
coverage run ${covargs} -- ${CONDA_PREFIX}/bin/sphinx-build -W -b html docs docs/_build/html
coverage xml

########################################################################################
# Common pytest cases
# Running the Full ETL
# NOTE: these commands will clobber your existing databases, and may take an hour or
# more to run.
########################################################################################

# Extract all FERC DBF and XBRL data to SQLite.
ferc1.sqlite ferc1_xbrl.sqlite:
coverage run ${covargs} -- \
src/pudl/ferc_to_sqlite/cli.py \
--clobber \
${gcs_cache_path} \
${etl_full_yml}

# Run the full PUDL ETL
pudl.sqlite:
coverage run ${covargs} -- \
src/pudl/cli/etl.py \
${gcs_cache_path} \
${etl_full_yml}

########################################################################################
# pytest
########################################################################################
.PHONY: pytest-unit
pytest-unit:
@@ -103,49 +134,26 @@ pytest-validate:
pytest --live-dbs test/validate
pudl_check_fks

# Extract all FERC DBF and XBRL data to SQLite.
# NOTE: This will clobber your existing databases.
.PHONY: ferc-to-sqlite
ferc-to-sqlite:
coverage run ${covargs} -- \
src/pudl/ferc_to_sqlite/cli.py \
--clobber \
${gcs_cache_path} \
${etl_full_yml}

# Run the full PUDL ETL
# NOTE: This will clobber your existing databases.
.PHONY: pudl-etl-full
pudl-etl-full: ferc-to-sqlite
coverage run ${covargs} -- \
src/pudl/cli/etl.py \
--clobber \
${gcs_cache_path} \
${etl_full_yml}

# Run the full ETL, generating new FERC & PUDL SQLite DBs and EPA CEMS Parquet files.
# Then run the full integration tests and data validations on all years of data.
# NOTE: This will clobber your existing databases and takes hours to run!!!
# Backgrounding the data validation and integration tests and using wait allows them to
# run in parallel.
.PHONY: nuke
nuke: coverage-erase docs-build pytest-unit pytest-integration pudl-etl-full
nuke: coverage-erase docs-build pytest-unit ferc1.sqlite ferc1_xbrl.sqlite pudl.sqlite
pudl_check_fks
pytest ${pytest_args} --live-dbs --etl-settings ${etl_full_yml} test/integration & \
pytest ${pytest_args} --live-dbs test/validate & \
wait
${coverage_report}
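The backgrounding-and-`wait` pattern used by the `nuke` target can be seen in a tiny standalone sketch (the sleeps and marker files are placeholders for the long-running test suites):

```shell
# Standalone sketch of the parallel pattern in the nuke target: two commands
# run concurrently in the background, and `wait` blocks until both finish.
tmpdir="$(mktemp -d)"
(sleep 0.2; touch "$tmpdir/integration.done") &   # stands in for the integration tests
(sleep 0.2; touch "$tmpdir/validate.done") &      # stands in for the data validations
wait   # returns only after both background jobs have exited
ls "$tmpdir"
```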

########################################################################################
# Some miscellaneous test cases
########################################################################################

# Check that designated Jupyter notebooks can be run against the current DB
.PHONY: pytest-jupyter
pytest-jupyter:
pytest --live-dbs test/integration/jupyter_notebooks_test.py

# Compare actual and expected number of rows in many tables:
# Compare actual and expected number of rows in many tables. This will run any test
# whose name contains "minmax_rows", so it's important to follow that naming convention.
.PHONY: pytest-minmax-rows
pytest-minmax-rows:
pytest --live-dbs test/validate -k minmax_rows
