Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix XBRL extraction clobber #3026

Merged
merged 5 commits into from
Nov 10, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
WIP rebase to give a good message
jdangerx committed Nov 9, 2023
commit 90ade22822f1f4ab4a196a51cb782d79beec8fbe
16 changes: 10 additions & 6 deletions src/pudl/extract/xbrl.py
Original file line number Diff line number Diff line change
@@ -2,6 +2,7 @@
import io
from datetime import date
from pathlib import Path
from urllib.parse import urlparse

from dagster import Field, Noneable, op
from ferc_xbrl_extractor.cli import run_main
@@ -85,12 +86,17 @@ def xbrl2sqlite(context) -> None:
logger.info(f"Dataset ferc{form}_xbrl is disabled, skipping")
continue

sql_path = Path(urlparse(PudlPaths().sqlite_db(f"ferc{form.value}_xbrl")).path)

if clobber:
sql_path.unlink(missing_ok=True)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

okay so this is now the guy that removes the db and we always run clobber=True in pudl when we call the extractor's run_main bc each run is per year. cool cool!

convert_form(
settings,
form,
datastore,
output_path=output_path,
clobber=clobber,
sql_path=sql_path,
batch_size=batch_size,
workers=workers,
)
@@ -101,7 +107,7 @@ def convert_form(
form: XbrlFormNumber,
datastore: FercXbrlDatastore,
output_path: Path,
clobber: bool,
sql_path: Path,
jdangerx marked this conversation as resolved.
Show resolved Hide resolved
batch_size: int | None = None,
workers: int | None = None,
) -> None:
@@ -128,10 +134,8 @@ def convert_form(

run_main(
instance_path=filings_archive,
sql_path=PudlPaths()
.sqlite_db(f"ferc{form.value}_xbrl")
.removeprefix("sqlite:///"), # Temp hacky solution
clobber=clobber,
sql_path=sql_path,
clobber=False, # if we set clobber=True, clobbers on *every* call to run_main
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like this makes the clobber option non-functional, at least in this location. The idea with clobber is to functionally rm -f the_database.sqlite before starting a new ETL run. Maybe it should only exist at the highest level in the ferc_to_sqlite script and actually unlink the paths if set -- and then everything in here can just be like "You gave me a DB URL and I'm going to write to it. Not my problem if there was already something incompatible there."

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the unlink() on line 92 does the rm -f you're looking for?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previously when I failed to clobber, the XBRL extractor would write duplicate data - when I ran xbrl_to_sqlite multiple times I still only had one copy of the data afterwards. I guess it could have not written any data at all, the later times. I can run again and see if the database is gone between the unlink() and the database write.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, the database gets deleted at the unlink() step if clobber is set to True.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DELETED. Strongbad would be proud. I guess the other half of the logic we've previously associated with --clobber is that if the DB already exists and you attempt to run the ETL, it bails out early, to avoid unintentionally modifying an existing DB. It's not trying to be anything fancy -- just a binary switch "Do you want me to make something new and blow everything that exists away if it's there? Or do you expect that there's a clean slate, and there should be no possible conflicts.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is the current behavior though. If you don't say clobber and there is an existing DB what happens?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the past if you don't say clobber and there's an existing DB, we actually just append to the existing DB. Which is pretty bad. I'll add a quick check to just bail out if there's an existing DB instead.

taxonomy=taxonomy_archive,
entry_point=taxonomy_entry_point,
form_number=form.value,
5 changes: 3 additions & 2 deletions src/pudl/io_managers.py
Original file line number Diff line number Diff line change
@@ -426,8 +426,9 @@ def __init__(

super().__init__(base_dir, db_name, md, timeout)

existing_schema_context = MigrationContext.configure(self.engine.connect())
metadata_diff = compare_metadata(existing_schema_context, self.md)
with self.engine.connect() as conn:
jdangerx marked this conversation as resolved.
Show resolved Hide resolved
existing_schema_context = MigrationContext.configure(conn)
metadata_diff = compare_metadata(existing_schema_context, self.md)
if metadata_diff:
logger.info(f"Metadata diff:\n\n{metadata_diff}")
raise RuntimeError(
9 changes: 4 additions & 5 deletions src/pudl/workspace/setup.py
Original file line number Diff line number Diff line change
@@ -54,12 +54,12 @@ class Config:
@property
def input_dir(self) -> Path:
"""Path to PUDL input directory."""
return Path(self.pudl_input)
return Path(self.pudl_input).absolute()

@property
def output_dir(self) -> Path:
"""Path to PUDL output directory."""
return Path(self.pudl_output)
return Path(self.pudl_output).absolute()

@property
def settings_dir(self) -> Path:
@@ -83,9 +83,8 @@ def sqlite_db(self, name: str) -> str:
The name is expected to be the name of the database without the .sqlite
suffix. E.g. pudl, ferc1 and so on.
"""
db_path = PudlPaths().output_dir / f"{name}.sqlite"
return f"sqlite:///{db_path}"
return self.output_dir / f"{name}.sqlite"
db_path = self.output_dir / f"{name}.sqlite"
return f"sqlite://{db_path}"

def output_file(self, filename: str) -> Path:
"""Path to file in PUDL output directory."""
8 changes: 4 additions & 4 deletions test/unit/extract/xbrl_test.py
Original file line number Diff line number Diff line change
@@ -122,9 +122,9 @@ def test_xbrl2sqlite(settings, forms, mocker):
form,
mock_datastore,
output_path=PudlPaths().output_dir,
sql_path=PudlPaths().output_dir / f"ferc{form.value}_xbrl.sqlite",
batch_size=20,
workers=10,
clobber=True,
)


@@ -158,7 +158,7 @@ def get_filings(self, year, form: XbrlFormNumber):
form,
FakeDatastore(),
output_path=output_path,
clobber=True,
sql_path=output_path / f"ferc{form.value}_xbrl.sqlite",
batch_size=10,
workers=5,
)
@@ -169,8 +169,8 @@ def get_filings(self, year, form: XbrlFormNumber):
expected_calls.append(
mocker.call(
instance_path=f"filings_{year}_{form.value}",
sql_path=str(output_path / f"ferc{form.value}_xbrl.sqlite"),
clobber=True,
sql_path=output_path / f"ferc{form.value}_xbrl.sqlite",
clobber=False,
taxonomy=f"raw_archive_{year}_{form.value}",
entry_point=f"taxonomy_entry_point_{year}_{form.value}",
form_number=form.value,