Commit 6696aee

Merge pull request #96 from torresramiro350/fixing_uproot

Fixing uproot

xiaojieww authored May 24, 2024
2 parents cd2b4b5 + 542a493 commit 6696aee
Showing 5 changed files with 58 additions and 90 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -93,7 +93,8 @@ response = ... # This can be either a ROOT or hdf5 file
hawc = HAL("HAWC",
maptree,
response,
roi)
roi,
n_workers=3) # enables faster loading of ROOT files via multiprocessing

# Use from bin 1 to bin 9
hawc.set_active_measurements(1, 9)
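A note on choosing the worker count: one less than the number of available
cores is the heuristic suggested later in from_root_file.py. A minimal sketch,
assuming the HealpixConeROI helper from hawc_hal and hypothetical placeholder
paths:

import os

from hawc_hal import HAL, HealpixConeROI

# leave one core free for the main process
n_workers = max(1, (os.cpu_count() or 2) - 1)

roi = HealpixConeROI(data_radius=3.0, model_radius=8.0, ra=83.63, dec=22.02)

hawc = HAL("HAWC",
           "maptree.root",   # hypothetical maptree path
           "response.root",  # hypothetical response path
           roi,
           n_workers=n_workers)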
89 changes: 25 additions & 64 deletions hawc_hal/HAL.py
@@ -22,17 +22,12 @@
from threeML.utils.statistics.gammaln import logfactorial
from tqdm.auto import tqdm

from hawc_hal.convolved_source import (
ConvolvedExtendedSource2D,
ConvolvedExtendedSource3D,
ConvolvedPointSource,
ConvolvedSourcesContainer,
)
from hawc_hal.healpix_handling import (
FlatSkyToHealpixTransform,
SparseHealpix,
get_gnomonic_projection,
)
from hawc_hal.convolved_source import (ConvolvedExtendedSource2D,
ConvolvedExtendedSource3D,
ConvolvedPointSource,
ConvolvedSourcesContainer)
from hawc_hal.healpix_handling import (FlatSkyToHealpixTransform,
SparseHealpix, get_gnomonic_projection)
from hawc_hal.log_likelihood import log_likelihood
from hawc_hal.maptree import map_tree_factory
from hawc_hal.maptree.data_analysis_bin import DataAnalysisBin
@@ -53,6 +48,9 @@ class HAL(PluginPrototype):
:param response: Response of HAWC (either ROOT or hd5 format)
:param roi: a ROI instance describing the Region Of Interest
:param flat_sky_pixels_size: size of the pixel for the flat sky projection (Hammer Aitoff)
:param n_workers: number of workers to use for parallelized reading of
ROOT files, default=1
:param set_transits: specifies the number of transits to use for the given maptree.
"""

def __init__(
@@ -283,9 +281,7 @@ def set_active_measurements(self, bin_id_min=None, bin_id_max=None, bin_list=Non
for this_bin in range(bin_id_min, bin_id_max + 1):
this_bin = str(this_bin)
if this_bin not in self._all_planes:
raise ValueError(
f"Bin {this_bin} is not contained in this maptree."
)
raise ValueError(f"Bin {this_bin} is not contained in this maptree.")

self._active_planes.append(this_bin)

@@ -301,9 +297,7 @@ def set_active_measurements(self, bin_id_min=None, bin_id_max=None, bin_list=Non
for this_bin in bin_list:
# if not this_bin in self._all_planes:
if this_bin not in self._all_planes:
raise ValueError(
f"Bin {this_bin} is not contained in this maptree."
)
raise ValueError(f"Bin {this_bin} is not contained in this maptree.")

self._active_planes.append(this_bin)
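
For reference, the two calling conventions validated above look like this at
the call site (hawc is the plugin instance from the README example):

hawc.set_active_measurements(1, 9)                      # contiguous range of bins
hawc.set_active_measurements(bin_list=["1", "3", "5"])  # explicit list of bin names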

@@ -433,10 +427,7 @@ def get_excess_background(self, ra: float, dec: float, radius: float):
this_nside = data_analysis_bin.observation_map.nside

radial_bin_pixels = hp.query_disc(
this_nside,
center,
radius_radians,
inclusive=False,
this_nside, center, radius_radians, inclusive=False
)

# calculate the areas per bin by the product
@@ -445,9 +436,7 @@ def get_excess_background(self, ra: float, dec: float, radius: float):

# NOTE: select active pixels according to each radial bin
bin_active_pixel_indexes = np.intersect1d(
self._active_pixels[energy_id],
radial_bin_pixels,
return_indices=True,
self._active_pixels[energy_id], radial_bin_pixels, return_indices=True
)[1]

# obtain the excess, background, and expected excess at
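
The [1] index above relies on np.intersect1d with return_indices=True, whose
second return value gives the positions of the common pixels within the first
array. A minimal sketch with made-up pixel ids:

import numpy as np

active_pixels = np.array([10, 11, 12, 20, 21])  # hypothetical active pixels in a bin
disc_pixels = np.array([11, 21, 30])            # hypothetical hp.query_disc output

common, idx_active, idx_disc = np.intersect1d(
    active_pixels, disc_pixels, return_indices=True
)
print(idx_active)  # [1 4] -> positions of pixels 11 and 21 in active_pixels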
@@ -523,32 +512,23 @@ def get_radial_profile(
# The area of each ring is then given by the difference between two
# subsequent circle areas.
area = np.array(
[
self.get_excess_background(ra, dec, r + offset * delta_r)[0]
for r in radii
]
[self.get_excess_background(ra, dec, r + offset * delta_r)[0] for r in radii]
)

temp = area[1:] - area[:-1]
area[1:] = temp

# signals
signal = np.array(
[
self.get_excess_background(ra, dec, r + offset * delta_r)[1]
for r in radii
]
[self.get_excess_background(ra, dec, r + offset * delta_r)[1] for r in radii]
)

temp = signal[1:] - signal[:-1]
signal[1:] = temp

# backgrounds
bkg = np.array(
[
self.get_excess_background(ra, dec, r + offset * delta_r)[2]
for r in radii
]
[self.get_excess_background(ra, dec, r + offset * delta_r)[2] for r in radii]
)

temp = bkg[1:] - bkg[:-1]
@@ -559,10 +539,7 @@ def get_radial_profile(
# model
# convert 'top hat' excess into 'ring' excesses.
model = np.array(
[
self.get_excess_background(ra, dec, r + offset * delta_r)[3]
for r in radii
]
[self.get_excess_background(ra, dec, r + offset * delta_r)[3] for r in radii]
)

temp = model[1:] - model[:-1]
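
Each of the differencing blocks above converts cumulative disc ("top hat")
totals into per-ring totals with the same in-place idiom; a tiny sketch with
made-up numbers:

import numpy as np

disc_totals = np.array([4.0, 9.0, 16.0])  # hypothetical totals within r1 < r2 < r3

rings = disc_totals.copy()
rings[1:] = disc_totals[1:] - disc_totals[:-1]
print(rings)  # [4. 5. 7.] -> innermost disc, then each successive annulus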
@@ -696,9 +673,7 @@ def plot_radial_profile(

plt.plot(radii, excess_model, color="red", label="Model")

plt.legend(
bbox_to_anchor=(1.0, 1.0), loc="upper right", numpoints=1, fontsize=16
)
plt.legend(bbox_to_anchor=(1.0, 1.0), loc="upper right", numpoints=1, fontsize=16)
plt.axhline(0, color="deepskyblue", linestyle="--")

x_limits = [0, max_radius]
@@ -805,9 +780,7 @@ def display_spectrum(self):

yerr = [yerr_high, yerr_low]

return self._plot_spectrum(
net_counts, yerr, model_only, residuals, residuals_err
)
return self._plot_spectrum(net_counts, yerr, model_only, residuals, residuals_err)

def _plot_spectrum(self, net_counts, yerr, model_only, residuals, residuals_err):
fig, subs = plt.subplots(
@@ -1080,9 +1053,7 @@ def _get_expectation(
)

# Now multiply by the pixel area of the new map to go back to flux
this_model_map_hpx *= hp.nside2pixarea(
data_analysis_bin.nside, degrees=True
)
this_model_map_hpx *= hp.nside2pixarea(data_analysis_bin.nside, degrees=True)

else:
# No sources
@@ -1207,9 +1178,7 @@ def display_fit(self, smoothing_kernel_sigma=0.1, display_colorbar=False):
subs[i][0].set_title("model, bin {}".format(data_analysis_bin.name))

# Plot data map
images[1] = subs[i][1].imshow(
proj_data, origin="lower", vmin=vmin, vmax=vmax
)
images[1] = subs[i][1].imshow(proj_data, origin="lower", vmin=vmin, vmax=vmax)
subs[i][1].set_title("excess, bin {}".format(data_analysis_bin.name))

# Plot background map.
@@ -1329,9 +1298,7 @@ def _get_model_map(self, plane_id, n_pt_src, n_ext_src):
raise ValueError(f"{plane_id} not a plane in the current model")

model_map = SparseHealpix(
self._get_expectation(
self._maptree[plane_id], plane_id, n_pt_src, n_ext_src
),
self._get_expectation(self._maptree[plane_id], plane_id, n_pt_src, n_ext_src),
self._active_pixels[plane_id],
self._maptree[plane_id].observation_map.nside,
)
@@ -1404,17 +1371,13 @@ def _write_a_map(self, file_name, which, fluctuate=False, return_map=False):
if return_map:
return new_map_tree

def write_model_map(
self, file_name, poisson_fluctuate=False, test_return_map=False
):
def write_model_map(self, file_name, poisson_fluctuate=False, test_return_map=False):
"""
This function writes the model map to a file.
The interface is based on HAWCLike for consistency
"""
if test_return_map:
log.warning(
"test_return_map=True should only be used for testing purposes!"
)
log.warning("test_return_map=True should only be used for testing purposes!")
return self._write_a_map(file_name, "model", poisson_fluctuate, test_return_map)

def write_residual_map(self, file_name, test_return_map=False):
@@ -1423,7 +1386,5 @@ def write_residual_map(self, file_name, test_return_map=False):
The interface is based on HAWCLike for consistency
"""
if test_return_map:
log.warning(
"test_return_map=True should only be used for testing purposes!"
)
log.warning("test_return_map=True should only be used for testing purposes!")
return self._write_a_map(file_name, "residual", False, test_return_map)
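
A hedged usage sketch of the two writers above (file names are hypothetical;
hawc is the plugin instance from the README example):

hawc.write_model_map("model_map.hd5", poisson_fluctuate=False)
hawc.write_residual_map("residual_map.hd5")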
33 changes: 17 additions & 16 deletions hawc_hal/maptree/from_root_file.py
@@ -1,4 +1,4 @@
from __future__ import absolute_import,annotations
from __future__ import absolute_import, annotations

import collections
import multiprocessing
@@ -61,9 +61,7 @@ def _counts_npixels(self) -> int:
if self._legacy_convention
else self.analysis_bin_names[0]
)
return self.maptree_ttree_directory[f"nHit{bin_id}/data/count"].member(
"fEntries"
)
return self.maptree_ttree_directory[f"nHit{bin_id}/data/count"].member("fEntries")

@property
def _bkg_npixels(self) -> int:
@@ -73,9 +71,7 @@ def _bkg_npixels(self) -> int:
if self._legacy_convention
else self.analysis_bin_names[0]
)
return self.maptree_ttree_directory[f"nHit{bin_id}/bkg/count"].member(
"fEntries"
)
return self.maptree_ttree_directory[f"nHit{bin_id}/bkg/count"].member("fEntries")

@property
def nside_cnt(self) -> int:
@@ -121,9 +117,9 @@ def get_array_from_file(

if roi is not None:
# NOTE: load only the pixels within the ROI
return bin_id, map_infile[
f"nHit{current_bin_id}/data/count"
].array().to_numpy()[hpx_map > 0.0]
return bin_id, map_infile[f"nHit{current_bin_id}/data/count"].array().to_numpy()[
hpx_map > 0.0
]

return bin_id, map_infile[f"nHit{current_bin_id}/data/count"].array().to_numpy()
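
The ROI selection above is a plain boolean mask over the full-sky array; a
tiny sketch with hypothetical values:

import numpy as np

counts = np.array([5.0, 0.0, 7.0, 2.0])   # hypothetical per-pixel counts
hpx_map = np.array([1.0, 0.0, 1.0, 0.0])  # hypothetical ROI map; > 0 inside the ROI

roi_counts = counts[hpx_map > 0.0]  # keep only ROI pixels -> [5. 7.]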

@@ -210,7 +206,15 @@ def from_root_file(
# cannot perform operations on histograms

# Read the maptree
with uproot.open(str(map_tree_file)) as map_infile:
with multiprocessing.Pool(processes=n_workers) as pool, uproot.open(
map_tree_file.as_posix(),
handler=uproot.MemmapSource,
num_fallback_workers=n_workers,
) as map_infile:
# the MemmapSource handler maps the file into memory and loads it as needed,
# suggested as the best option for large local files;
# for remote files use MultithreadedFileSource instead,
# which requires setting the use_threads option to True
log.info("Reading Maptree!")

maptree_metadata = MaptreeMetaData(map_infile)
Expand Down Expand Up @@ -246,11 +250,8 @@ def from_root_file(
# NOTE: it is suggested to keep the number of workers at one less
# than the number of available cores in the system.

with multiprocessing.Pool(processes=n_workers) as executor:
result_data = list(executor.starmap(get_array_from_file, signal_data_info))
result_bkg = list(
executor.starmap(get_bkg_array_from_file, signal_data_info)
)
result_data = list(pool.starmap(get_array_from_file, signal_data_info))
result_bkg = list(pool.starmap(get_bkg_array_from_file, signal_data_info))

# Processes are not guaranteed to preserve the order of analysis bin names,
# so organize the results into a dictionary for proper readout
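The key change above merges the process pool and the uproot file handle into a
single with statement, so the pool stays available for the whole read instead
of being created after the file is parsed. A minimal sketch of the pattern
under the same assumptions as this diff (uproot 5 MemmapSource handler; the
file name and worker function are hypothetical):

import multiprocessing

import uproot


def read_bin(bin_name: str) -> tuple:
    # hypothetical per-bin worker; the real workers each read one analysis bin
    return bin_name, len(bin_name)


if __name__ == "__main__":
    with multiprocessing.Pool(processes=3) as pool, uproot.open(
        "maptree.root",               # hypothetical local ROOT file
        handler=uproot.MemmapSource,  # memory-mapped reads for local files
        num_fallback_workers=3,
    ) as infile:
        # order of results is not guaranteed, hence the dict readout
        results = dict(pool.starmap(read_bin, [("1",), ("2",)]))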
19 changes: 12 additions & 7 deletions hawc_hal/response/response.py
@@ -406,7 +406,15 @@ def from_root_file(cls, response_file_name: Path, n_workers: int = 1):
f"Response {response_file_name} does not exist or is not readable"
)

with uproot.open(response_file_name) as response_file_directory:
with multiprocessing.Pool(processes=n_workers) as pool, uproot.open(
response_file_name,
handler=uproot.MemmapSource,
num_fallback_workers=n_workers,
) as response_file_directory:
# the MemmapSource handler maps the file into memory and loads it as needed,
# suggested as the best option for large local files;
# for remote files use MultithreadedFileSource instead,
# which requires setting the use_threads option to True
resp_metadata = ResponseMetaData(response_file_directory)

# NOTE: Get the Response function basic information
@@ -426,12 +434,9 @@ def from_root_file(cls, response_file_name: Path, n_workers: int = 1):
for bin_id in analysis_bins_arr
]

with multiprocessing.Pool(processes=n_workers) as executor:
results = list(executor.starmap(resp_metadata.get_energy_hist, args))
results_bkg = list(
executor.starmap(resp_metadata.get_energy_bkg_hist, args)
)
psf_param = list(executor.starmap(resp_metadata.get_psf_params, args))
results = list(pool.starmap(resp_metadata.get_energy_hist, args))
results_bkg = list(pool.starmap(resp_metadata.get_energy_bkg_hist, args))
psf_param = list(pool.starmap(resp_metadata.get_psf_params, args))

energy_hists = cls.create_dict_from_results(results)
energy_bkgs = cls.create_dict_from_results(results_bkg)
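response.py applies the same pool-plus-open pattern as from_root_file.py. A
hedged call-site sketch, assuming the classmethod above belongs to the
HAWCResponse class in hawc_hal/response/response.py (the response path is a
hypothetical placeholder):

from pathlib import Path

from hawc_hal.response.response import HAWCResponse

response = HAWCResponse.from_root_file(Path("response.root"), n_workers=3)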
4 changes: 2 additions & 2 deletions setup.py
@@ -46,15 +46,15 @@ def find_data_files(directory):
"threeml",
"astromodels",
"pandas",
"healpy",
"pytest",
"six",
"astropy",
"scipy",
"matplotlib",
"numba",
"reproject",
"tqdm",
"uproot==5.1.2",
"uproot",
"awkward",
"mplhep",
"hist",
