Dbt setup #4011

Draft: wants to merge 17 commits into base: main

Changes from 4 commits
237 changes: 132 additions & 105 deletions environments/conda-linux-64.lock.yml

Large diffs are not rendered by default.

4,318 changes: 2,796 additions & 1,522 deletions environments/conda-lock.yml

Large diffs are not rendered by default.

239 changes: 133 additions & 106 deletions environments/conda-osx-64.lock.yml

Large diffs are not rendered by default.

239 changes: 133 additions & 106 deletions environments/conda-osx-arm64.lock.yml

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions pyproject.toml
@@ -7,7 +7,7 @@ name = "catalystcoop.pudl"
description = "An open data processing pipeline for US energy data"
readme = { file = "README.rst", content-type = "text/x-rst" }
authors = [{ name = "Catalyst Cooperative", email = "[email protected]" }]
-requires-python = ">=3.12,<3.13"
+requires-python = ">=3.10,<3.13"
dynamic = ["version"]
license = { file = "LICENSE.txt" }
dependencies = [
@@ -23,10 +23,12 @@ dependencies = [
"conda-lock>=2.5.7",
"coverage>=7.6",
"dagster>=1.9",
+"dagster-dbt>=0.25.6,<1",
"dagster-postgres>=0.24,<1", # Update when dagster-postgres graduates to 1.x
"dask>=2024",
"dask-expr", # Required for dask[dataframe]
"datasette>=0.65",
+"dbt-duckdb",
"doc8>=1.1",
"duckdb>=1.1.3",
"email-validator>=1.0.3", # pydantic[email]
@@ -83,6 +85,7 @@ dependencies = [
"sphinxcontrib_googleanalytics>=0.4",
"sqlalchemy>=2",
"sqlglot>=25",
+"s3fs>=2024",
"timezonefinder>=6.2",
"universal_pathlib>=0.2",
"urllib3>=1.26.18",
@@ -343,7 +346,7 @@ nodejs = ">=20"
pandoc = ">=2"
pip = ">=24"
prettier = ">=3.0"
-python = ">=3.12,<3.13"
+python = ">=3.10,<3.13"
sqlite = ">=3.47"
zip = ">=3.0"

4 changes: 4 additions & 0 deletions src/pudl/dbt/.gitignore
@@ -0,0 +1,4 @@

target/
dbt_packages/
logs/
1 change: 1 addition & 0 deletions src/pudl/dbt/.user.yml
@@ -0,0 +1 @@
id: 143b9efc-6985-409a-8029-865947b8f8f1
86 changes: 86 additions & 0 deletions src/pudl/dbt/README.md
@@ -0,0 +1,86 @@
### Overview
This directory contains an initial setup of a `dbt` project meant to write
[data tests](https://docs.getdbt.com/docs/build/data-tests) for PUDL data. The
project is set up with profiles that let you run tests against `nightly` builds,
`etl-full` outputs, or `etl-fast` outputs. The `nightly` profile operates directly
on parquet files in our S3 bucket, while the `etl-full` and `etl-fast` profiles
look for parquet files based on your `PUDL_OUTPUT` environment variable. See the
`Usage` section below for examples using these profiles.


### Development
To set up the `dbt` project, install the PUDL `conda` environment as usual, then
run the following command from this directory:

```
dbt deps
```

#### Adding new tables
To add a new table to the project, you must add it as a
[dbt source](https://docs.getdbt.com/docs/build/sources). You can do this by editing
the file `src/pudl/dbt/models/schema.yml`. I've already added the table
`out_vcerare__hourly_available_capacity_factor`, which can be used as a reference.
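
For example, registering tables under the `pudl` source follows this shape (a
sketch; the real entries, including the `external_location` configuration, live
in the `schema.yml` included in this PR):

```
sources:
  - name: pudl
    tables:
      - name: out_eia923__boiler_fuel
      - name: out_vcerare__hourly_available_capacity_factor
```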

#### Adding tests
Once a table is included as a `source`, you can add tests for the table. You can
either add a generic test directly in `src/pudl/dbt/models/schema.yml`, or create
**Member:**
Is it required to have one monster `schema.yml` file that defines everything? Or are there common ways to break it down into more manageable thematic chunks?

**Member:**
Pretty sure you can just shove more YAMLs into /models and DBT will just pick them up... here's a guide for how to structure DBT projects.

a `sql` file in the directory `src/pudl/dbt/tests/`, which references the `source`.
When adding `sql` tests like this, construct a query that selects rows indicating
a failure; that is, if the query returns any rows, `dbt` will flag the test as a
failure.
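
As a sketch, a custom test file (hypothetical name
`tests/capacity_factor_solar_in_range.sql`) that flags out-of-range solar
capacity factors could look like:

```
-- Any row returned by this query is reported by dbt as a test failure.
select *
from {{ source('pudl', 'out_vcerare__hourly_available_capacity_factor') }}
where capacity_factor_solar_pv > 1.02
   or capacity_factor_solar_pv < 0.00
```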

The project includes [dbt-expectations](https://github.com/calogica/dbt-expectations)
and [dbt-utils](https://github.com/dbt-labs/dbt-utils) as dependencies. These
packages include useful tests out of the box that can be applied to any tables
in the project. There are several examples in `src/pudl/dbt/models/schema.yml` which
use `dbt-expectations`.
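
For instance, attaching a couple of these generic tests to a column in
`schema.yml` looks like this (mirroring the
`out_vcerare__hourly_available_capacity_factor` entries in this PR):

```
columns:
  - name: capacity_factor_solar_pv
    data_tests:
      - not_null
      - dbt_expectations.expect_column_max_to_be_between:
          max_value: 1.02
```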

#### Modifying a table before testing
In many cases we modify a table slightly before executing a test. There are a
couple of ways to accomplish this. First, when creating a `sql` test in
`src/pudl/dbt/tests/`, you can structure your query to modify the table/column
before selecting failure rows. The second method is to create a
[model](https://docs.getdbt.com/docs/build/models) in `src/pudl/dbt/models/validation`.
Any model created here becomes a view in the `duckdb` database used by `dbt`. You
can then reference the model in `src/pudl/dbt/models/schema.yml` and apply tests
to it just as you would with `sources`. There's an example of this pattern which
takes the table `out_ferc1__yearly_steam_plants_fuel_by_plant_sched402`, computes
fuel cost per mmbtu in a `sql` model, then applies `dbt_expectations` tests to the
result.

#### Usage
There are a few ways to execute tests. To run all tests with a single command:

```
dbt build
```

This command will first run any models, then execute all tests.

For more fine-grained control, first run:

```
dbt run
```

This will run all models, preparing any `sql` views that will be referenced in
tests. Once you've done this, you can run all tests with:

```
dbt test
```

To run all tests for a single source table:

```
dbt test --select source:pudl.{table_name}
```

To run all tests for a model table:

```
dbt test --select {model_name}
```

##### Selecting target profile
To select between `nightly`, `etl-full`, and `etl-fast` profiles, append
`--target {target_name}` to any of the previous commands.
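
For example, to run the tests for one source table against a local fast ETL
output (a sketch combining the selectors above):

```
dbt test --select source:pudl.out_eia923__boiler_fuel --target etl-fast
```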
**Member:**
Right now only the nightly profile exists, right? And it reads data from the parquet files in S3?

How will the etl-full and etl-fast profiles work? I can imagine just pointing DuckDB at the $PUDL_OUTPUT directory, but there's no way to know whether it is up to date, or whether it contains fast or full outputs, without interacting with Dagster.

14 changes: 14 additions & 0 deletions src/pudl/dbt/dbt_project.yml
@@ -0,0 +1,14 @@
# Name your project! Project names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: "pudl_dbt"
version: "1.0.0"

# This setting configures which "profile" dbt uses for this project.
profile: "pudl_dbt"

# These configurations specify where dbt should look for different types of files.
# The `model-paths` config, for example, states that models in this project can be
# found in the "models/" directory. You probably won't need to change these!
model-paths: ["models"]
test-paths: ["tests"]
90 changes: 90 additions & 0 deletions src/pudl/dbt/models/schema.yml
@@ -0,0 +1,90 @@
version: 2

sources:
- name: pudl
meta:
external_location: |
{%- if target.name == "nightly" -%} 'https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/nightly/{name}.parquet'
**Member:**
Would it also work to point this directly at S3 rather than going through the HTTPS interface?

**Member:**
I changed this to s3://pudl.catalyst.coop/nightly/{name}.parquet and it seems to work. I think going through s3:// directly will probably be more performant, won't it? E.g. in the case where there are efficiencies to be had in querying only small portions of the larger Parquet files.

**Member:**
Interestingly, with the s3:// URL it didn't give me any error, but it also didn't seem to be making much progress. There was just a ton of data being downloaded. Not sure why.

{%- else -%} '{{ env_var('PUDL_OUTPUT') }}/parquet/{name}.parquet'
{%- endif -%}
tables:
- name: out_eia923__boiler_fuel
- name: out_eia923__monthly_boiler_fuel
- name: out_ferc1__yearly_steam_plants_fuel_by_plant_sched402
- name: out_vcerare__hourly_available_capacity_factor
**Member:**
In a data warehouse with hundreds of tables, would this file be created and managed by hand? Or would there be some rule-based way to generate it, or parts of it, along the lines of what we're doing with the Pandera schema checks right now? For example, the not_null tests here are a second place where that restriction is being specified -- it's already present in our table metadata, which seems like a recipe for them getting out of sync.

Or in the case of row counts, is there a clean, non-manual way to update the row counts to reflect whatever the currently observed counts are? Especially if we're trying to regenerate expected row counts for each individual year, filling it all in manually could be pretty tedious and error prone. We've moved toward specifying per-year row counts on the newer assets so that they work transparently in either the fast or full ETL cases, and the asset checks don't need to be aware of which kind of job they're being run in, which seems both more specific and more robust.

**Member:**
Looks like the "X column is not null" checks are currently defined in fields.py under the field constraints, is that what you're thinking about?

I think it would be nice to have auto-generated tests like the non-null tests & row counts defined alongside manually added tests. Then all the tests will be defined in one place, except for the tests that we need to write custom Python code for.

That seems pretty doable - YAML is easy to work with, and dbt lets us tag tests, so we could easily tag all the auto-generated tests so our generation scripts know to replace them but leave the manually-added tests alone.

**Member:**
In addition to the field specific constraints I think we automatically add NOT NULL check constraints to the PK fields when we construct the SQLite database -- but more generally I'm just saying that we need to get all of these generated tests integrated non-duplicatively into the dbt tests somehow.

**Member (author):**
It seems totally possible to auto-generate tests, but there are probably many ways to accomplish this, so we should figure out what we want from it. For example, when we talk about auto-generating row count/not null tests, will these be generated once and committed into the repo, or will some/all of them be dynamically generated at runtime?

It definitely seems tricky to minimize duplication between dbt/our existing python schema info. I also wonder how this plays into any refactoring of our metadata system?

**Member:**
It feels like we may need to clearly define the data tests that are ready to be migrated in a straightforward way, and the things that still need design work, so we can point Margay folks at the stuff that's ready to go and keep thinking about the things that still need some scaffolding?

data_tests:
- dbt_expectations.expect_table_row_count_to_equal:
value: |
{%- if target.name == "etl-fast" -%} 27287400
{%- else -%} 136437000
{%- endif -%}
**Member:**
Is there a clean way to specify the expected row counts for each year of data (or some other meaningful subset) within a table, as we've started doing for the newer assets in Dagster asset checks, so we don't have to differentiate between fast and full validations, and can identify where the changes are?

**Member (author):**
We'd probably need to create a custom macro for this, but that seems totally doable. Big question is how we want to generate/store all of those tests.

**Member:**
The row count tests have functionally become regression tests -- we want to know when they change, and verify that the magnitude and nature of the change is expected based on the code or data that we've changed. Given that there are hundreds of tables (and thousands of table-years) it doesn't seem practical to hand-code all of the expected row counts.

It would be nice to have the per table-year row counts stored in (say) YAML somewhere, and be able to generate a new version of that file based on current ETL outputs. Then we could look at the diffs between the old and the new versions of the file when trying to assess changes in the lengths of the outputs.

- dbt_expectations.expect_compound_columns_to_be_unique:
column_list: ["county_id_fips", "datetime_utc"]
**Member:**
Could be generated based on the PK that's defined for every table?

**Member (author):**
Should be possible. We can also probably come up with a way to generate foreign key checks, so we can actually verify foreign keys for tables that only exist in parquet.

row_condition: "county_id_fips is not null"
columns:
- name: capacity_factor_solar_pv
data_tests:
- not_null
- dbt_expectations.expect_column_max_to_be_between:
max_value: 1.02
- dbt_expectations.expect_column_min_to_be_between:
min_value: 0.00
- name: capacity_factor_offshore_wind
data_tests:
- not_null
- dbt_expectations.expect_column_max_to_be_between:
max_value: 1.00
- dbt_expectations.expect_column_min_to_be_between:
min_value: 0.00
- name: hour_of_year
data_tests:
- not_null
- dbt_expectations.expect_column_max_to_be_between:
min_value: 8759
max_value: 8761
- name: datetime_utc
data_tests:
- not_null
- dbt_expectations.expect_column_values_to_not_be_in_set:
value_set: ["{{ dbt_date.date(2020, 12, 31) }}"]
- name: county_or_lake_name
data_tests:
- not_null
- dbt_expectations.expect_column_values_to_not_be_in_set:
value_set: ["bedford_city", "clifton_forge_city"]
models:
- name: ferc1_fbp_cost_per_mmbtu
columns:
- name: gas_cost_per_mmbtu
data_tests:
- dbt_expectations.expect_column_quantile_values_to_be_between:
quantile: 0.05
min_value: 1.5
**Member:**
I'm guessing these are not using the weighted quantiles?

**Member (author):**
Yeah, these are just basic quantiles. It's not too hard to write a SQL query that does a version of weighted quantiles, but the existing vs_historical tests are hard because they compute a bunch of quantiles, then compare them all.

- dbt_expectations.expect_column_quantile_values_to_be_between:
quantile: 0.90
max_value: 15.0
- dbt_expectations.expect_column_median_to_be_between:
min_value: 2.0
max_value: 10.0
- name: oil_cost_per_mmbtu
data_tests:
- dbt_expectations.expect_column_quantile_values_to_be_between:
quantile: 0.10
min_value: 3.5
- dbt_expectations.expect_column_quantile_values_to_be_between:
quantile: 0.90
max_value: 25.0
- dbt_expectations.expect_column_median_to_be_between:
min_value: 6.5
max_value: 17.0
- name: coal_cost_per_mmbtu
data_tests:
- dbt_expectations.expect_column_quantile_values_to_be_between:
quantile: 0.10
min_value: 0.75
- dbt_expectations.expect_column_quantile_values_to_be_between:
quantile: 0.90
max_value: 4.0
- dbt_expectations.expect_column_median_to_be_between:
min_value: 1.0
max_value: 2.5
6 changes: 6 additions & 0 deletions src/pudl/dbt/models/validation/ferc1_fbp_cost_per_mmbtu.sql
**Member:**
If we do end up needing to define these intermediate tables it seems like we would want to have some kind of clear naming convention for them?

**Member (author):**
Yeah I think that seems like a good idea. Maybe just use a validation_ prefix and otherwise follow existing naming conventions?

@@ -0,0 +1,6 @@

select
{% for fuel_type in ["gas", "oil", "coal"] %}
{{ fuel_type }}_fraction_cost * fuel_cost / ({{ fuel_type }}_fraction_mmbtu * fuel_mmbtu) as {{ fuel_type }}_cost_per_mmbtu,
{% endfor %}
from {{ source('pudl', 'out_ferc1__yearly_steam_plants_fuel_by_plant_sched402') }}
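
For reference, once `dbt` renders the Jinja loop above, the compiled query looks
roughly like the sketch below. The `source()` call resolves to the parquet
location configured via `external_location` in `schema.yml`, so the exact path
prefix depends on the active target, and DuckDB accepts the trailing comma left
by the loop:

```
select
    gas_fraction_cost * fuel_cost / (gas_fraction_mmbtu * fuel_mmbtu) as gas_cost_per_mmbtu,
    oil_fraction_cost * fuel_cost / (oil_fraction_mmbtu * fuel_mmbtu) as oil_cost_per_mmbtu,
    coal_fraction_cost * fuel_cost / (coal_fraction_mmbtu * fuel_mmbtu) as coal_cost_per_mmbtu,
-- path prefix depends on the target: the S3 bucket for nightly, $PUDL_OUTPUT/parquet otherwise
from '.../out_ferc1__yearly_steam_plants_fuel_by_plant_sched402.parquet'
```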
8 changes: 8 additions & 0 deletions src/pudl/dbt/package-lock.yml
@@ -0,0 +1,8 @@
packages:
- package: calogica/dbt_expectations
version: 0.10.4
- package: dbt-labs/dbt_utils
version: 1.3.0
- package: calogica/dbt_date
version: 0.10.1
sha1_hash: 29571f46f50e6393ca399c3db7361c22657f2d6b
5 changes: 5 additions & 0 deletions src/pudl/dbt/packages.yml
@@ -0,0 +1,5 @@
packages:
- package: calogica/dbt_expectations
version: [">=0.10.0", "<0.11.0"]
- package: dbt-labs/dbt_utils
version: [">=1.3.0", "<1.4.0"]
**Member:**
I see neither of these are available in conda-forge. It all works fine now with the dbt CLI, but will we need to get them packaged with conda to make dbt work from within an @asset_check as part of our Dagster pipeline?

17 changes: 17 additions & 0 deletions src/pudl/dbt/profiles.yml
@@ -0,0 +1,17 @@
pudl_dbt:
outputs:
# Define targets for nightly builds, and local ETL full/fast
# See models/schema.yml for further configuration
nightly:
type: duckdb
path: "{{ env_var('PUDL_OUTPUT') }}/pudl.duckdb"
filesystems:
- fs: s3
etl-full:
type: duckdb
path: "{{ env_var('PUDL_OUTPUT') }}/pudl.duckdb"
etl-fast:
type: duckdb
path: "{{ env_var('PUDL_OUTPUT') }}/pudl.duckdb"

target: nightly
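
As a quick usage sketch tying these targets together (assuming the PUDL conda
environment is active, `PUDL_OUTPUT` is set, and, for the local targets, the
parquet outputs already exist under `$PUDL_OUTPUT/parquet/`):

```
cd src/pudl/dbt
dbt deps                     # install dbt-expectations, dbt-utils, and their dependencies
dbt build --target etl-full  # run models, then tests, against local parquet outputs
```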
Empty file added src/pudl/dbt/tests/.gitkeep
Empty file.
2 changes: 1 addition & 1 deletion src/pudl/transform/ferc714.py
@@ -362,7 +362,7 @@ def _filter_for_freshest_data_xbrl(
into the raw instant or duration XBRL table name.
"""
table_name_raw_xbrl = (
-f"{TABLE_NAME_MAP_FERC714[table_name]["xbrl"]}_{instant_or_duration}"
+f"{TABLE_NAME_MAP_FERC714[table_name]['xbrl']}_{instant_or_duration}"
)
xbrl = filter_for_freshest_data_xbrl(
raw_xbrl,