Skip to content

Commit

Permalink
Fix XBRL extraction clobber (#3026)
Browse files Browse the repository at this point in the history
Clobber once per form, instead of once per form-year.
  • Loading branch information
jdangerx authored Nov 10, 2023
1 parent ed6455c commit fa096f2
Show file tree
Hide file tree
Showing 16 changed files with 126 additions and 34 deletions.
2 changes: 1 addition & 1 deletion notebooks/work-in-progress/CEMS_by_utility.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
"from pudl.workspace.setup import PudlPaths\n",
"\n",
"\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db())\n",
"#display(pudl_engine)\n",
Expand Down
2 changes: 1 addition & 1 deletion notebooks/work-in-progress/better-heatrates.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@
"from pudl.workspace.setup import PudlPaths\n",
"\n",
"# TODO(janrous): provide property for accessing ferc db?\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
"\n",
"API_KEY_EIA = os.environ[\"API_KEY_EIA\"]\n",
Expand Down
2 changes: 1 addition & 1 deletion notebooks/work-in-progress/ferc714-output.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"display(ferc1_engine)\n",
"\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
Expand Down
2 changes: 1 addition & 1 deletion notebooks/work-in-progress/jupyterhub-test.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
"pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine=pudl_engine)"
]
Expand Down
2 changes: 1 addition & 1 deletion notebooks/work-in-progress/state-demand.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@
"#HARVEST_ACCOUNT_ID = os.environ[\"HARVEST_ACCOUNT_ID\"]\n",
"\n",
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
"pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine=pudl_engine)"
]
Expand Down
27 changes: 18 additions & 9 deletions src/pudl/extract/xbrl.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,22 @@ def xbrl2sqlite(context) -> None:
logger.info(f"Dataset ferc{form}_xbrl is disabled, skipping")
continue

sql_path = PudlPaths().sqlite_db_path(f"ferc{form.value}_xbrl")

if sql_path.exists():
if clobber:
sql_path.unlink()
else:
raise RuntimeError(
f"Found existing DB at {sql_path} and clobber was set to False. Aborting."
)

convert_form(
settings,
form,
datastore,
output_path=output_path,
clobber=clobber,
sql_path=sql_path,
batch_size=batch_size,
workers=workers,
)
Expand All @@ -101,7 +111,7 @@ def convert_form(
form: XbrlFormNumber,
datastore: FercXbrlDatastore,
output_path: Path,
clobber: bool,
sql_path: Path,
batch_size: int | None = None,
workers: int | None = None,
) -> None:
Expand All @@ -111,8 +121,8 @@ def convert_form(
form_settings: Validated settings for converting the desired XBRL form to SQLite.
form: FERC form number.
datastore: Instance of a FERC XBRL datastore for retrieving data.
pudl_settings: Dictionary containing paths and database URLs
used by PUDL.
output_path: PUDL output directory
sql_path: path to the SQLite DB we'd like to write to.
batch_size: Number of XBRL filings to process in a single CPU process.
workers: Number of CPU processes to create for processing XBRL filings.
Expand All @@ -125,13 +135,12 @@ def convert_form(
for year in form_settings.years:
taxonomy_archive, taxonomy_entry_point = datastore.get_taxonomy(year, form)
filings_archive = datastore.get_filings(year, form)

# if we set clobber=True, clobbers on *every* call to run_main;
# we already delete the existing base on `clobber=True` in `xbrl2sqlite`
run_main(
instance_path=filings_archive,
sql_path=PudlPaths()
.sqlite_db(f"ferc{form.value}_xbrl")
.removeprefix("sqlite:///"), # Temp hacky solution
clobber=clobber,
sql_path=sql_path,
clobber=False,
taxonomy=taxonomy_archive,
entry_point=taxonomy_entry_point,
form_number=form.value,
Expand Down
20 changes: 12 additions & 8 deletions src/pudl/workspace/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,12 @@ class Config:
@property
def input_dir(self) -> Path:
"""Path to PUDL input directory."""
return Path(self.pudl_input)
return Path(self.pudl_input).absolute()

@property
def output_dir(self) -> Path:
"""Path to PUDL output directory."""
return Path(self.pudl_output)
return Path(self.pudl_output).absolute()

@property
def settings_dir(self) -> Path:
Expand All @@ -73,18 +73,22 @@ def data_dir(self) -> Path:
return self.input_dir

@property
def pudl_db(self) -> Path:
def pudl_db(self) -> str:
"""Returns url of locally stored pudl sqlite database."""
return self.sqlite_db("pudl")
return self.sqlite_db_uri("pudl")

def sqlite_db(self, name: str) -> str:
"""Returns url of locally stored pudl slqlite database with given name.
def sqlite_db_uri(self, name: str) -> str:
"""Returns url of locally stored pudl sqlite database with given name.
The name is expected to be the name of the database without the .sqlite
suffix. E.g. pudl, ferc1 and so on.
"""
db_path = PudlPaths().output_dir / f"{name}.sqlite"
return f"sqlite:///{db_path}"
# SQLite URI has 3 slashes - 2 to separate URI scheme, 1 to separate creds
# sqlite://{credentials}/{db_path}
return f"sqlite:///{self.sqlite_db_path(name)}"

def sqlite_db_path(self, name: str) -> Path:
"""Return path to locally stored SQLite DB file."""
return self.output_dir / f"{name}.sqlite"

def output_file(self, filename: str) -> Path:
Expand Down
87 changes: 83 additions & 4 deletions test/unit/extract/xbrl_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,12 +122,91 @@ def test_xbrl2sqlite(settings, forms, mocker):
form,
mock_datastore,
output_path=PudlPaths().output_dir,
sql_path=PudlPaths().output_dir / f"ferc{form.value}_xbrl.sqlite",
batch_size=20,
workers=10,
clobber=True,
)


def test_xbrl2sqlite_db_exists_no_clobber(mocker):
convert_form_mock = mocker.MagicMock()
mocker.patch("pudl.extract.xbrl.convert_form", new=convert_form_mock)

# Mock datastore object to allow comparison
mock_datastore = mocker.MagicMock()
mocker.patch("pudl.extract.xbrl.FercXbrlDatastore", return_value=mock_datastore)

ferc1_sqlite_path = PudlPaths().output_dir / "ferc1_xbrl.sqlite"
ferc1_sqlite_path.touch()
settings = FercToSqliteSettings(
ferc1_dbf_to_sqlite_settings=Ferc1DbfToSqliteSettings(),
ferc1_xbrl_to_sqlite_settings=Ferc1XbrlToSqliteSettings(),
ferc2_xbrl_to_sqlite_settings=None,
ferc6_xbrl_to_sqlite_settings=None,
ferc60_xbrl_to_sqlite_settings=None,
ferc714_xbrl_to_sqlite_settings=None,
)
# Construct xbrl2sqlite op context
context = build_op_context(
resources={
"ferc_to_sqlite_settings": settings,
"datastore": "datastore",
},
config={
"workers": 10,
"batch_size": 20,
"clobber": False,
},
)

with pytest.raises(RuntimeError, match="Found existing DB"):
xbrl2sqlite(context)


def test_xbrl2sqlite_db_exists_yes_clobber(mocker):
convert_form_mock = mocker.MagicMock()
mocker.patch("pudl.extract.xbrl.convert_form", new=convert_form_mock)

# Mock datastore object to allow comparison
mock_datastore = mocker.MagicMock()
mocker.patch("pudl.extract.xbrl.FercXbrlDatastore", return_value=mock_datastore)

ferc1_sqlite_path = PudlPaths().output_dir / "ferc1_xbrl.sqlite"
ferc1_sqlite_path.touch()

# mock the db path so we can assert it gets clobbered
mock_db_path = mocker.MagicMock(spec=ferc1_sqlite_path)
mock_pudl_paths = mocker.MagicMock(
spec=PudlPaths(), sqlite_db_path=lambda _x: mock_db_path
)
mocker.patch("pudl.extract.xbrl.PudlPaths", return_value=mock_pudl_paths)

settings = FercToSqliteSettings(
ferc1_dbf_to_sqlite_settings=Ferc1DbfToSqliteSettings(),
ferc1_xbrl_to_sqlite_settings=Ferc1XbrlToSqliteSettings(),
ferc2_xbrl_to_sqlite_settings=None,
ferc6_xbrl_to_sqlite_settings=None,
ferc60_xbrl_to_sqlite_settings=None,
ferc714_xbrl_to_sqlite_settings=None,
)

context = build_op_context(
resources={
"ferc_to_sqlite_settings": settings,
"datastore": "datastore",
},
config={
"workers": 10,
"batch_size": 20,
"clobber": True,
},
)

xbrl2sqlite(context)

mock_db_path.unlink.assert_any_call()


def test_convert_form(mocker):
"""Test convert_form method is properly calling extractor."""
extractor_mock = mocker.MagicMock()
Expand Down Expand Up @@ -158,7 +237,7 @@ def get_filings(self, year, form: XbrlFormNumber):
form,
FakeDatastore(),
output_path=output_path,
clobber=True,
sql_path=output_path / f"ferc{form.value}_xbrl.sqlite",
batch_size=10,
workers=5,
)
Expand All @@ -169,8 +248,8 @@ def get_filings(self, year, form: XbrlFormNumber):
expected_calls.append(
mocker.call(
instance_path=f"filings_{year}_{form.value}",
sql_path=str(output_path / f"ferc{form.value}_xbrl.sqlite"),
clobber=True,
sql_path=output_path / f"ferc{form.value}_xbrl.sqlite",
clobber=False,
taxonomy=f"raw_archive_{year}_{form.value}",
entry_point=f"taxonomy_entry_point_{year}_{form.value}",
form_number=form.value,
Expand Down
2 changes: 1 addition & 1 deletion test/validate/notebooks/validate_bf_eia923.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
"outputs": [],
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)"
]
},
Expand Down
2 changes: 1 addition & 1 deletion test/validate/notebooks/validate_fbp_ferc1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@
"outputs": [],
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)"
]
},
Expand Down
2 changes: 1 addition & 1 deletion test/validate/notebooks/validate_frc_eia923.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
"outputs": [],
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)"
]
},
Expand Down
2 changes: 1 addition & 1 deletion test/validate/notebooks/validate_fuel_ferc1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
"outputs": [],
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db("ferc1"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri("ferc1"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
"pudl_settings"
]
Expand Down
2 changes: 1 addition & 1 deletion test/validate/notebooks/validate_gens_eia860.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
"outputs": [],
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db("ferc1"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri("ferc1"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)"
]
},
Expand Down
2 changes: 1 addition & 1 deletion test/validate/notebooks/validate_gf_eia923.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
"outputs": [],
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db("ferc1"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri("ferc1"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
"pudl_settings"
]
Expand Down
2 changes: 1 addition & 1 deletion test/validate/notebooks/validate_mcoe.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
"outputs": [],
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db("ferc1"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri("ferc1"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
"pudl_settings"
]
Expand Down
2 changes: 1 addition & 1 deletion test/validate/notebooks/validate_plants_steam_ferc1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
"outputs": [],
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db("ferc1"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri("ferc1"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
"pudl_settings"
]
Expand Down

0 comments on commit fa096f2

Please sign in to comment.