Skip to content

Commit

Permalink
Break data down by 5-year increments
Browse files (browse the repository at this point in the history)
  • Loading branch information
b-j-mills committed Jan 24, 2025
1 parent a5666a0 commit d5fc110
Show file tree
Hide file tree
Showing 7 changed files with 6,057 additions and 6,033 deletions.
2 changes: 1 addition & 1 deletion requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ pockets==0.9.1
# via
# -c requirements.txt
# sphinxcontrib-napoleon
pydantic==2.10.5
pydantic==2.10.6
# via
# -c requirements.txt
# frictionless
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ ply==3.11
# libhxl
pockets==0.9.1
# via sphinxcontrib-napoleon
pydantic==2.10.5
pydantic==2.10.6
# via frictionless
pydantic-core==2.27.2
# via pydantic
Expand Down
7 changes: 6 additions & 1 deletion src/hdx/scraper/acled/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from hdx.api.configuration import Configuration
from hdx.data.user import User
from hdx.facades.infer_arguments import facade
from hdx.utilities.dateparse import now_utc
from hdx.utilities.downloader import Download
from hdx.utilities.path import (
wheretostart_tempdir_batch,
Expand Down Expand Up @@ -56,7 +57,11 @@ def main(
)

acled = Acled(configuration, retriever, temp_dir)
acled.download_data()

today = now_utc()
year = today.year
acled.download_data(year)

dataset = acled.generate_dataset()
dataset.update_from_yaml(
path=join(dirname(__file__), "config", "hdx_dataset_static.yaml")
Expand Down
58 changes: 35 additions & 23 deletions src/hdx/scraper/acled/acled.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from hdx.location.country import Country
from hdx.utilities.dateparse import parse_date_range
from hdx.utilities.retriever import Retrieve
from pandas import DataFrame, concat, read_excel
from pandas import concat, read_excel

logger = logging.getLogger(__name__)

Expand All @@ -20,9 +20,9 @@ def __init__(self, configuration: Configuration, retriever: Retrieve, temp_dir:
self._retriever = retriever
self._temp_dir = temp_dir
self.dates = []
self.data = DataFrame()
self.data = {}

def download_data(self):
def download_data(self, year: int):
for dataset_name in self._configuration["datasets"]:
event_type = dataset_name[: dataset_name.index("event") - 1]
event_type = event_type.replace("-", "_")
Expand Down Expand Up @@ -70,12 +70,22 @@ def download_data(self):
},
inplace=True,
)
contents = contents[self._configuration["hxl_tags"].keys()]
# contents = contents[self._configuration["hxl_tags"].keys()]

if len(self.data) == 0:
self.data = contents
else:
self.data = concat([self.data, contents])
for year_start in range(1995, year + 1, 5):
year_end = year_start + 4
year_range = f"{year_start}-{year_end}"
subset = contents.loc[
(contents["Year"] >= year_start) & (contents["Year"] <= year_end),
self._configuration["hxl_tags"].keys(),
]
if len(subset) == 0:
continue

if year_range in self.data:
self.data[year_range] = concat([self.data[year_range], subset])
else:
self.data[year_range] = subset

def generate_dataset(self) -> Optional[Dataset]:
dataset = Dataset(
Expand All @@ -90,22 +100,24 @@ def generate_dataset(self) -> Optional[Dataset]:
end_date = max(self.dates)
dataset.set_time_period(start_date, end_date)

self.data = self.data.to_dict(orient="records")
resourcedata = {
"name": "conflict_events_and_fatalities",
"description": "A weekly dataset providing the total number of reported "
"events and fatalities broken down by country and month.",
}
hxl_tags = self._configuration["hxl_tags"]
headers = list(hxl_tags.keys())
dataset.generate_resource_from_iterable(
headers,
self.data,
hxl_tags,
self._temp_dir,
"conflict_events_and_fatalities.csv",
resourcedata,
encoding="utf-8-sig",
)
for date_range in reversed(self.data.keys()):
data = self.data[date_range].to_dict(orient="records")
resourcedata = {
"name": f"conflict_events_and_fatalities for {date_range}",
"description": f"A weekly dataset providing the total number of reported "
f"conflict events and fatalities broken down by country and month for "
f"{date_range}.",
}
dataset.generate_resource_from_iterable(
headers,
data,
hxl_tags,
self._temp_dir,
f"conflict_events_and_fatalities_{date_range}.csv",
resourcedata,
encoding="utf-8-sig",
)

return dataset
Loading

0 comments on commit d5fc110

Please sign in to comment.