Break data down by 5-year increments

OCHA-DAP · Jan 24, 2025 · d5fc110
1 parent a5666a0
commit d5fc110
Show file tree

Hide file tree

Showing 7 changed files with 6,057 additions and 6,033 deletions.
diff --git a/requirements-test.txt b/requirements-test.txt
@@ -186,7 +186,7 @@ pockets==0.9.1
     # via
     #   -c requirements.txt
     #   sphinxcontrib-napoleon
-pydantic==2.10.5
+pydantic==2.10.6
     # via
     #   -c requirements.txt
     #   frictionless

diff --git a/requirements.txt b/requirements.txt
@@ -101,7 +101,7 @@ ply==3.11
     #   libhxl
 pockets==0.9.1
     # via sphinxcontrib-napoleon
-pydantic==2.10.5
+pydantic==2.10.6
     # via frictionless
 pydantic-core==2.27.2
     # via pydantic

diff --git a/src/hdx/scraper/acled/__main__.py b/src/hdx/scraper/acled/__main__.py
@@ -11,6 +11,7 @@
 from hdx.api.configuration import Configuration
 from hdx.data.user import User
 from hdx.facades.infer_arguments import facade
+from hdx.utilities.dateparse import now_utc
 from hdx.utilities.downloader import Download
 from hdx.utilities.path import (
     wheretostart_tempdir_batch,
@@ -56,7 +57,11 @@ def main(
             )
 
             acled = Acled(configuration, retriever, temp_dir)
-            acled.download_data()
+
+            today = now_utc()
+            year = today.year
+            acled.download_data(year)
+
             dataset = acled.generate_dataset()
             dataset.update_from_yaml(
                 path=join(dirname(__file__), "config", "hdx_dataset_static.yaml")

diff --git a/src/hdx/scraper/acled/acled.py b/src/hdx/scraper/acled/acled.py
@@ -9,7 +9,7 @@
 from hdx.location.country import Country
 from hdx.utilities.dateparse import parse_date_range
 from hdx.utilities.retriever import Retrieve
-from pandas import DataFrame, concat, read_excel
+from pandas import concat, read_excel
 
 logger = logging.getLogger(__name__)
 
@@ -20,9 +20,9 @@ def __init__(self, configuration: Configuration, retriever: Retrieve, temp_dir:
         self._retriever = retriever
         self._temp_dir = temp_dir
         self.dates = []
-        self.data = DataFrame()
+        self.data = {}
 
-    def download_data(self):
+    def download_data(self, year: int):
         for dataset_name in self._configuration["datasets"]:
             event_type = dataset_name[: dataset_name.index("event") - 1]
             event_type = event_type.replace("-", "_")
@@ -70,12 +70,22 @@ def download_data(self):
                     },
                     inplace=True,
                 )
-                contents = contents[self._configuration["hxl_tags"].keys()]
+                # contents = contents[self._configuration["hxl_tags"].keys()]
 
-                if len(self.data) == 0:
-                    self.data = contents
-                else:
-                    self.data = concat([self.data, contents])
+                for year_start in range(1995, year + 1, 5):
+                    year_end = year_start + 4
+                    year_range = f"{year_start}-{year_end}"
+                    subset = contents.loc[
+                        (contents["Year"] >= year_start) & (contents["Year"] <= year_end),
+                        self._configuration["hxl_tags"].keys(),
+                    ]
+                    if len(subset) == 0:
+                        continue
+
+                    if year_range in self.data:
+                        self.data[year_range] = concat([self.data[year_range], subset])
+                    else:
+                        self.data[year_range] = subset
 
     def generate_dataset(self) -> Optional[Dataset]:
         dataset = Dataset(
@@ -90,22 +100,24 @@ def generate_dataset(self) -> Optional[Dataset]:
         end_date = max(self.dates)
         dataset.set_time_period(start_date, end_date)
 
-        self.data = self.data.to_dict(orient="records")
-        resourcedata = {
-            "name": "conflict_events_and_fatalities",
-            "description": "A weekly dataset providing the total number of reported "
-            "events and fatalities broken down by country and month.",
-        }
         hxl_tags = self._configuration["hxl_tags"]
         headers = list(hxl_tags.keys())
-        dataset.generate_resource_from_iterable(
-            headers,
-            self.data,
-            hxl_tags,
-            self._temp_dir,
-            "conflict_events_and_fatalities.csv",
-            resourcedata,
-            encoding="utf-8-sig",
-        )
+        for date_range in reversed(self.data.keys()):
+            data = self.data[date_range].to_dict(orient="records")
+            resourcedata = {
+                "name": f"conflict_events_and_fatalities for {date_range}",
+                "description": f"A weekly dataset providing the total number of reported "
+                f"conflict events and fatalities broken down by country and month for "
+                f"{date_range}.",
+            }
+            dataset.generate_resource_from_iterable(
+                headers,
+                data,
+                hxl_tags,
+                self._temp_dir,
+                f"conflict_events_and_fatalities_{date_range}.csv",
+                resourcedata,
+                encoding="utf-8-sig",
+            )
 
         return dataset