Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix XBRL extraction clobber #3026

Merged
merged 5 commits into from
Nov 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion notebooks/work-in-progress/CEMS_by_utility.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
"from pudl.workspace.setup import PudlPaths\n",
"\n",
"\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db())\n",
"#display(pudl_engine)\n",
Expand Down
2 changes: 1 addition & 1 deletion notebooks/work-in-progress/better-heatrates.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@
"from pudl.workspace.setup import PudlPaths\n",
"\n",
"# TODO(janrous): provide property for accessing ferc db?\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
"\n",
"API_KEY_EIA = os.environ[\"API_KEY_EIA\"]\n",
Expand Down
2 changes: 1 addition & 1 deletion notebooks/work-in-progress/ferc714-output.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"display(ferc1_engine)\n",
"\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
Expand Down
2 changes: 1 addition & 1 deletion notebooks/work-in-progress/jupyterhub-test.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
"pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine=pudl_engine)"
]
Expand Down
2 changes: 1 addition & 1 deletion notebooks/work-in-progress/state-demand.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@
"#HARVEST_ACCOUNT_ID = os.environ[\"HARVEST_ACCOUNT_ID\"]\n",
"\n",
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
"pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine=pudl_engine)"
]
Expand Down
27 changes: 18 additions & 9 deletions src/pudl/extract/xbrl.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,22 @@ def xbrl2sqlite(context) -> None:
logger.info(f"Dataset ferc{form}_xbrl is disabled, skipping")
continue

sql_path = PudlPaths().sqlite_db_path(f"ferc{form.value}_xbrl")

if sql_path.exists():
if clobber:
sql_path.unlink()
else:
raise RuntimeError(
f"Found existing DB at {sql_path} and clobber was set to False. Aborting."
)

convert_form(
settings,
form,
datastore,
output_path=output_path,
clobber=clobber,
sql_path=sql_path,
batch_size=batch_size,
workers=workers,
)
Expand All @@ -101,7 +111,7 @@ def convert_form(
form: XbrlFormNumber,
datastore: FercXbrlDatastore,
output_path: Path,
clobber: bool,
sql_path: Path,
jdangerx marked this conversation as resolved.
Show resolved Hide resolved
batch_size: int | None = None,
workers: int | None = None,
) -> None:
Expand All @@ -111,8 +121,8 @@ def convert_form(
form_settings: Validated settings for converting the desired XBRL form to SQLite.
form: FERC form number.
datastore: Instance of a FERC XBRL datastore for retrieving data.
pudl_settings: Dictionary containing paths and database URLs
used by PUDL.
output_path: PUDL output directory
sql_path: path to the SQLite DB we'd like to write to.
batch_size: Number of XBRL filings to process in a single CPU process.
workers: Number of CPU processes to create for processing XBRL filings.

Expand All @@ -125,13 +135,12 @@ def convert_form(
for year in form_settings.years:
taxonomy_archive, taxonomy_entry_point = datastore.get_taxonomy(year, form)
filings_archive = datastore.get_filings(year, form)

# if we set clobber=True, clobbers on *every* call to run_main;
# we already delete the existing base on `clobber=True` in `xbrl2sqlite`
run_main(
instance_path=filings_archive,
sql_path=PudlPaths()
.sqlite_db(f"ferc{form.value}_xbrl")
.removeprefix("sqlite:///"), # Temp hacky solution
clobber=clobber,
sql_path=sql_path,
clobber=False,
taxonomy=taxonomy_archive,
entry_point=taxonomy_entry_point,
form_number=form.value,
Expand Down
20 changes: 12 additions & 8 deletions src/pudl/workspace/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,12 @@ class Config:
@property
def input_dir(self) -> Path:
"""Path to PUDL input directory."""
return Path(self.pudl_input)
return Path(self.pudl_input).absolute()

@property
def output_dir(self) -> Path:
"""Path to PUDL output directory."""
return Path(self.pudl_output)
return Path(self.pudl_output).absolute()

@property
def settings_dir(self) -> Path:
Expand All @@ -73,18 +73,22 @@ def data_dir(self) -> Path:
return self.input_dir

@property
def pudl_db(self) -> Path:
def pudl_db(self) -> str:
"""Returns url of locally stored pudl sqlite database."""
return self.sqlite_db("pudl")
return self.sqlite_db_uri("pudl")

def sqlite_db(self, name: str) -> str:
"""Returns url of locally stored pudl slqlite database with given name.
def sqlite_db_uri(self, name: str) -> str:
"""Returns url of locally stored pudl sqlite database with given name.

The name is expected to be the name of the database without the .sqlite
suffix. E.g. pudl, ferc1 and so on.
"""
db_path = PudlPaths().output_dir / f"{name}.sqlite"
return f"sqlite:///{db_path}"
# SQLite URI has 3 slashes - 2 to separate URI scheme, 1 to separate creds
# sqlite://{credentials}/{db_path}
return f"sqlite:///{self.sqlite_db_path(name)}"

def sqlite_db_path(self, name: str) -> Path:
"""Return path to locally stored SQLite DB file."""
return self.output_dir / f"{name}.sqlite"

def output_file(self, filename: str) -> Path:
Expand Down
87 changes: 83 additions & 4 deletions test/unit/extract/xbrl_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,12 +122,91 @@ def test_xbrl2sqlite(settings, forms, mocker):
form,
mock_datastore,
output_path=PudlPaths().output_dir,
sql_path=PudlPaths().output_dir / f"ferc{form.value}_xbrl.sqlite",
batch_size=20,
workers=10,
clobber=True,
)


def test_xbrl2sqlite_db_exists_no_clobber(mocker):
convert_form_mock = mocker.MagicMock()
mocker.patch("pudl.extract.xbrl.convert_form", new=convert_form_mock)

# Mock datastore object to allow comparison
mock_datastore = mocker.MagicMock()
mocker.patch("pudl.extract.xbrl.FercXbrlDatastore", return_value=mock_datastore)

ferc1_sqlite_path = PudlPaths().output_dir / "ferc1_xbrl.sqlite"
ferc1_sqlite_path.touch()
settings = FercToSqliteSettings(
ferc1_dbf_to_sqlite_settings=Ferc1DbfToSqliteSettings(),
ferc1_xbrl_to_sqlite_settings=Ferc1XbrlToSqliteSettings(),
ferc2_xbrl_to_sqlite_settings=None,
ferc6_xbrl_to_sqlite_settings=None,
ferc60_xbrl_to_sqlite_settings=None,
ferc714_xbrl_to_sqlite_settings=None,
)
# Construct xbrl2sqlite op context
context = build_op_context(
resources={
"ferc_to_sqlite_settings": settings,
"datastore": "datastore",
},
config={
"workers": 10,
"batch_size": 20,
"clobber": False,
},
)

with pytest.raises(RuntimeError, match="Found existing DB"):
xbrl2sqlite(context)


def test_xbrl2sqlite_db_exists_yes_clobber(mocker):
convert_form_mock = mocker.MagicMock()
mocker.patch("pudl.extract.xbrl.convert_form", new=convert_form_mock)

# Mock datastore object to allow comparison
mock_datastore = mocker.MagicMock()
mocker.patch("pudl.extract.xbrl.FercXbrlDatastore", return_value=mock_datastore)

ferc1_sqlite_path = PudlPaths().output_dir / "ferc1_xbrl.sqlite"
ferc1_sqlite_path.touch()

# mock the db path so we can assert it gets clobbered
mock_db_path = mocker.MagicMock(spec=ferc1_sqlite_path)
mock_pudl_paths = mocker.MagicMock(
spec=PudlPaths(), sqlite_db_path=lambda _x: mock_db_path
)
mocker.patch("pudl.extract.xbrl.PudlPaths", return_value=mock_pudl_paths)

settings = FercToSqliteSettings(
ferc1_dbf_to_sqlite_settings=Ferc1DbfToSqliteSettings(),
ferc1_xbrl_to_sqlite_settings=Ferc1XbrlToSqliteSettings(),
ferc2_xbrl_to_sqlite_settings=None,
ferc6_xbrl_to_sqlite_settings=None,
ferc60_xbrl_to_sqlite_settings=None,
ferc714_xbrl_to_sqlite_settings=None,
)

context = build_op_context(
resources={
"ferc_to_sqlite_settings": settings,
"datastore": "datastore",
},
config={
"workers": 10,
"batch_size": 20,
"clobber": True,
},
)

xbrl2sqlite(context)

mock_db_path.unlink.assert_any_call()


def test_convert_form(mocker):
"""Test convert_form method is properly calling extractor."""
extractor_mock = mocker.MagicMock()
Expand Down Expand Up @@ -158,7 +237,7 @@ def get_filings(self, year, form: XbrlFormNumber):
form,
FakeDatastore(),
output_path=output_path,
clobber=True,
sql_path=output_path / f"ferc{form.value}_xbrl.sqlite",
batch_size=10,
workers=5,
)
Expand All @@ -169,8 +248,8 @@ def get_filings(self, year, form: XbrlFormNumber):
expected_calls.append(
mocker.call(
instance_path=f"filings_{year}_{form.value}",
sql_path=str(output_path / f"ferc{form.value}_xbrl.sqlite"),
clobber=True,
sql_path=output_path / f"ferc{form.value}_xbrl.sqlite",
clobber=False,
taxonomy=f"raw_archive_{year}_{form.value}",
entry_point=f"taxonomy_entry_point_{year}_{form.value}",
form_number=form.value,
Expand Down
2 changes: 1 addition & 1 deletion test/validate/notebooks/validate_bf_eia923.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
"outputs": [],
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)"
]
},
Expand Down
2 changes: 1 addition & 1 deletion test/validate/notebooks/validate_fbp_ferc1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@
"outputs": [],
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)"
]
},
Expand Down
2 changes: 1 addition & 1 deletion test/validate/notebooks/validate_frc_eia923.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
"outputs": [],
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)"
]
},
Expand Down
2 changes: 1 addition & 1 deletion test/validate/notebooks/validate_fuel_ferc1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
"outputs": [],
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db("ferc1"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri("ferc1"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
"pudl_settings"
]
Expand Down
2 changes: 1 addition & 1 deletion test/validate/notebooks/validate_gens_eia860.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
"outputs": [],
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db("ferc1"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri("ferc1"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)"
]
},
Expand Down
2 changes: 1 addition & 1 deletion test/validate/notebooks/validate_gf_eia923.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
"outputs": [],
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db("ferc1"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri("ferc1"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
"pudl_settings"
]
Expand Down
2 changes: 1 addition & 1 deletion test/validate/notebooks/validate_mcoe.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
"outputs": [],
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db("ferc1"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri("ferc1"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
"pudl_settings"
]
Expand Down
2 changes: 1 addition & 1 deletion test/validate/notebooks/validate_plants_steam_ferc1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
"outputs": [],
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db("ferc1"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri("ferc1"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
"pudl_settings"
]
Expand Down