From 85c6fe35612cd23755b4b4fab279323ebb70b5f9 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Tue, 7 Nov 2023 16:27:30 -0900 Subject: [PATCH 1/2] Add deprecation warnings to PudlTabl and add minor naming docs updates --- docs/dev/naming_conventions.rst | 15 +++++++++++++-- docs/release_notes.rst | 17 +++++++++++++++++ src/pudl/output/pudltabl.py | 11 +++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/docs/dev/naming_conventions.rst b/docs/dev/naming_conventions.rst index 178f22934c..3c2d0908b0 100644 --- a/docs/dev/naming_conventions.rst +++ b/docs/dev/naming_conventions.rst @@ -29,7 +29,9 @@ names should generally follow this naming convention: ``eia860``, ``ferc1`` and ``epacems``. * ``asset_type`` describes how the asset in modeled. * ``asset_name`` should describe the entity, categorical code type, or measurement of - the asset. + the asset. Note: FERC Form 1 assets typically include the schedule number in the + ``asset_name`` so users and contributors know which schedule the cleaned asset + refers to. Raw layer ^^^^^^^^^ @@ -55,14 +57,23 @@ These assets are typically stored in parquet files or tables in a database. Naming convention: ``core_{source}__{asset_type}_{asset_name}`` +* ``source`` is sometimes ``pudl``. This means the asset + is a derived connection the contributors of PUDL created to connect multiple + datasets via manual or machine learning methods. + * ``asset_type`` describes how the asset is modeled and its role in PUDL’s collection of core assets. There are a handful of table types in this layer: * ``assn``: Association tables provide connections between entities. This data - can be manually compiled or extracted from data sources. Examples: + can be manually compiled or extracted from data sources. If the asset associates + data from two sources, the source names should be included in the ``asset_name``. + The source names should appear in the same order for all assets that associate + the two sources. 
Examples: * ``core_pudl__assn_plants_eia`` associates EIA Plant IDs and manually assigned PUDL Plant IDs. + * ``core_epa__assn_epacamd_eia`` associates EPA units with EIA plants, boilers, + and generators. * ``codes``: Code tables contain more verbose descriptions of categorical codes typically manually compiled from source data dictionaries. Examples: diff --git a/docs/release_notes.rst b/docs/release_notes.rst index 9b477f8fa6..5c2536dbd9 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -67,6 +67,23 @@ Dagster Adoption * :mod:`pudl.convert.censusdp1tract_to_sqlite` and :mod:`pudl.output.censusdp1tract` are now integrated into dagster. See :issue:`1973` and :pr:`2621`. +New Asset Naming Convention +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +There are hundreds of new tables in ``pudl.sqlite`` now that the methods in ``PudlTabl`` +have been converted to Dagster assets. This significant increase in tables and diversity +of table types prompted us to create a new naming convention to make the table names +more descriptive and organized. You can read about the new naming convention in the +:ref:`docs `. + +To help users migrate away from using ``PudlTabl`` and our temporary table names, +we've created a `google sheet `__ +that maps the old table names and ``PudlTabl`` methods to the new table names. + +We plan to remove ``PudlTabl`` from the pudl package once our known users have +succesfully migrated to pulling data directly from ``pudl.sqlite``. We've added +deprecation warnings to the ``PudlTabl`` class. We expect to remove ``PudlTabl`` +at the end of February 2024. + Data Coverage ^^^^^^^^^^^^^ diff --git a/src/pudl/output/pudltabl.py b/src/pudl/output/pudltabl.py index 08f03a9137..eefc4bb63b 100644 --- a/src/pudl/output/pudltabl.py +++ b/src/pudl/output/pudltabl.py @@ -89,6 +89,12 @@ def __init__( unit_ids: If True, use several heuristics to assign individual generators to functional units. EXPERIMENTAL. 
""" + logger.warning( + "PudlTabl is deprecated and will be removed from the pudl package" + "at the end of February 2024. To acccess the data returned by" + "this class, pull the desired table directly from the pudl.sqlite" + "database." + ) if not isinstance(pudl_engine, sa.engine.base.Engine): raise TypeError( "PudlTabl needs pudl_engine to be a SQLAlchemy Engine, but we " @@ -296,6 +302,11 @@ def _get_table_from_db( "It is retained for backwards compatibility only." ) table_name = self._agg_table_name(table_name) + logger.warning( + "PudlTabl is deprecated and will be removed from the pudl package" + "at the end of February 2024. To access the data returned by this method," + f"use the {table_name} table in the pudl.sqlite database." + ) resource = Resource.from_id(table_name) return pd.concat( [ From 479ec7f921999bfa79af0188220f01f70a1b62fc Mon Sep 17 00:00:00 2001 From: bendnorman Date: Wed, 8 Nov 2023 10:58:32 -0900 Subject: [PATCH 2/2] Remove PudlTabl removal data and make assn table name sources alphabetical --- docs/dev/naming_conventions.rst | 5 ++--- docs/release_notes.rst | 7 +++---- src/pudl/output/pudltabl.py | 12 ++++++------ 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/docs/dev/naming_conventions.rst b/docs/dev/naming_conventions.rst index 3c2d0908b0..5ccf005030 100644 --- a/docs/dev/naming_conventions.rst +++ b/docs/dev/naming_conventions.rst @@ -66,9 +66,8 @@ Naming convention: ``core_{source}__{asset_type}_{asset_name}`` * ``assn``: Association tables provide connections between entities. This data can be manually compiled or extracted from data sources. If the asset associates - data from two sources, the source names should be included in the ``asset_name``. - The source names should appear in the same order for all assets that associate - the two sources. Examples: + data from two sources, the source names should be included in the ``asset_name`` + in alphabetical order. 
Examples: * ``core_pudl__assn_plants_eia`` associates EIA Plant IDs and manually assigned PUDL Plant IDs. diff --git a/docs/release_notes.rst b/docs/release_notes.rst index 5c2536dbd9..5bdc338d95 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -79,10 +79,9 @@ To help users migrate away from using ``PudlTabl`` and our temporary table names we've created a `google sheet `__ that maps the old table names and ``PudlTabl`` methods to the new table names. -We plan to remove ``PudlTabl`` from the pudl package once our known users have -succesfully migrated to pulling data directly from ``pudl.sqlite``. We've added -deprecation warnings to the ``PudlTabl`` class. We expect to remove ``PudlTabl`` -at the end of February 2024. +We've added deprecation warnings to the ``PudlTabl`` class. We plan to remove +``PudlTabl`` from the ``pudl`` package once our known users have +successfully migrated to pulling data directly from ``pudl.sqlite``. Data Coverage ^^^^^^^^^^^^^ diff --git a/src/pudl/output/pudltabl.py b/src/pudl/output/pudltabl.py index eefc4bb63b..ede31c3f00 100644 --- a/src/pudl/output/pudltabl.py +++ b/src/pudl/output/pudltabl.py @@ -90,10 +90,9 @@ def __init__( individual generators to functional units. EXPERIMENTAL. """ logger.warning( - "PudlTabl is deprecated and will be removed from the pudl package" - "at the end of February 2024. To acccess the data returned by" - "this class, pull the desired table directly from the pudl.sqlite" - "database." + "PudlTabl is deprecated and will be removed from the pudl package " + "once known users have migrated to accessing the data directly from " + "pudl.sqlite. " ) if not isinstance(pudl_engine, sa.engine.base.Engine): raise TypeError( @@ -303,8 +302,9 @@ def _get_table_from_db( ) table_name = self._agg_table_name(table_name) logger.warning( - "PudlTabl is deprecated and will be removed from the pudl package" - "at the end of February 2024. 
To access the data returned by this method," + "PudlTabl is deprecated and will be removed from the pudl package " + "once known users have migrated to accessing the data directly from " + "pudl.sqlite. To access the data returned by this method, " f"use the {table_name} table in the pudl.sqlite database." ) resource = Resource.from_id(table_name)