Commit 01b9449

fix: update CMR search utility to replace deprecated scrolling (#147)

tsutterley authored Oct 17, 2024
1 parent 4b1c32f commit 01b9449
Showing 11 changed files with 116 additions and 67 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-request.yml
@@ -12,7 +12,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
- os: [ubuntu-latest, macos-latest, windows-latest]
+ os: [ubuntu-latest, macos-latest]
python-version: [3.11]
env:
OS: ${{ matrix.os }}
1 change: 1 addition & 0 deletions .gitignore
@@ -30,6 +30,7 @@
.Rhistory
__pycache__
build/
+ _build/
dist/
develop-eggs/
run/
2 changes: 1 addition & 1 deletion doc/environment.yml
@@ -2,7 +2,7 @@ name: gravity-docs
channels:
- conda-forge
dependencies:
- - docutils<0.18
+ - docutils
- fontconfig
- freetype
- future
6 changes: 3 additions & 3 deletions doc/make.bat
@@ -10,8 +10,6 @@ if "%SPHINXBUILD%" == "" (
set SOURCEDIR=source
set BUILDDIR=build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
@@ -21,10 +19,12 @@ if errorlevel 9009 (
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
- echo.http://sphinx-doc.org/
+ echo.https://www.sphinx-doc.org/
exit /b 1
)

if "%1" == "" goto help

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

2 changes: 2 additions & 0 deletions doc/source/api_reference/utilities.rst
@@ -62,6 +62,8 @@ General Methods

.. autofunction:: gravity_toolkit.utilities.from_http

+ .. autofunction:: gravity_toolkit.utilities.from_json
+
.. autofunction:: gravity_toolkit.utilities.attempt_login

.. autofunction:: gravity_toolkit.utilities.build_opener
12 changes: 6 additions & 6 deletions gravity_toolkit/geocenter.py
@@ -131,8 +131,8 @@ def case_insensitive_filename(self, filename):
f = [f.name for f in self.filename.parent.iterdir() if
re.match(self.filename.name, f.name, re.I)]
if not f:
- errmsg = f'{filename} not found in file system'
- raise FileNotFoundError(errmsg)
+ msg = f'{filename} not found in file system'
+ raise FileNotFoundError(msg)
self.filename = self.filename.with_name(f.pop())
# return the filename
return self
@@ -160,8 +160,8 @@ def from_AOD1B(self, release, year, month, product='glo'):
AOD1B_file = self.directory.joinpath(granule)
# check that file exists
if not AOD1B_file.exists():
- errmsg = f'AOD1B File {AOD1B_file} not in File System'
- raise FileNotFoundError(errmsg)
+ msg = f'AOD1B File {AOD1B_file} not in File System'
+ raise FileNotFoundError(msg)
# read AOD1b geocenter skipping over commented header text
with AOD1B_file.open(mode='r', encoding='utf8') as f:
file_contents=[i for i in f.read().splitlines() if not re.match(r'#',i)]
@@ -356,8 +356,8 @@ def from_SLR(self, geocenter_file, **kwargs):
self.directory = base_dir.joinpath('AOD1B', kwargs['release'], 'geocenter')
# check that AOD1B directory exists
if not self.directory.exists():
- errmsg = f'{str(self.directory)} not found in file system'
- raise FileNotFoundError(errmsg)
+ msg = f'{str(self.directory)} not found in file system'
+ raise FileNotFoundError(msg)

# Input geocenter file and split lines
with self.filename.open(mode='r', encoding='utf8') as f:
6 changes: 3 additions & 3 deletions gravity_toolkit/harmonics.py
@@ -185,8 +185,8 @@ def case_insensitive_filename(self, filename):
f = [f.name for f in self.filename.parent.iterdir() if
re.match(self.filename.name, f.name, re.I)]
if not f:
- errmsg = f'{filename} not found in file system'
- raise FileNotFoundError(errmsg)
+ msg = f'{filename} not found in file system'
+ raise FileNotFoundError(msg)
self.filename = self.filename.with_name(f.pop())
# return the filename
return self
@@ -1181,7 +1181,7 @@ def to_masked_array(self):
self.squeeze()
# return the triangular matrix
return Ylms

def to_coo_array(self):
"""
Convert data arrays to COO sparse matrices
23 changes: 11 additions & 12 deletions gravity_toolkit/spatial.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
u"""
spatial.py
- Written by Tyler Sutterley (06/2024)
+ Written by Tyler Sutterley (10/2024)
Data class for reading, writing and processing spatial data
@@ -20,6 +20,7 @@
time.py: utilities for calculating time operations
UPDATE HISTORY:
+ Updated 10/2024: allow 2D and 3D arrays in output netCDF4 files
Updated 06/2024: use wrapper to importlib for optional dependencies
Updated 05/2024: make subscriptable and allow item assignment
Updated 10/2023: place time and month variables in try/except block
@@ -157,8 +158,8 @@ def case_insensitive_filename(self, filename):
f = [f.name for f in self.filename.parent.iterdir() if
re.match(self.filename.name, f.name, re.I)]
if not f:
- errmsg = f'{filename} not found in file system'
- raise FileNotFoundError(errmsg)
+ msg = f'{filename} not found in file system'
+ raise FileNotFoundError(msg)
self.filename = self.filename.with_name(f.pop())
# return the filename
return self
@@ -877,8 +878,9 @@ def to_netCDF4(self, filename, **kwargs):
variables = set(kwargs['field_mapping'].keys()) - set(dimensions)
for field in sorted(variables):
temp = getattr(self,field)
+ ndim = temp.ndim
key = kwargs['field_mapping'][field]
- nc[key] = fileID.createVariable(key, temp.dtype, dims,
+ nc[key] = fileID.createVariable(key, temp.dtype, dims[:ndim],
fill_value=self.fill_value, zlib=True)
# filling NetCDF variables
for field,key in kwargs['field_mapping'].items():
@@ -1360,20 +1362,17 @@ def subset(self, months):
# indices to sort data objects
months_list = [i for i,m in enumerate(self.month) if m in months]
# output spatial object
- temp = spatial(nlat=self.shape[0], nlon=self.shape[1],
- fill_value=self.fill_value)
+ temp = self.zeros_like()
# create output spatial object
- temp.data = np.zeros((temp.shape[0],temp.shape[1],n))
- temp.mask = np.zeros((temp.shape[0],temp.shape[1],n), dtype=bool)
+ temp.data = np.zeros((self.shape[0],self.shape[1],n))
+ temp.mask = np.zeros((self.shape[0],self.shape[1],n), dtype=bool)
# create output spatial error
try:
getattr(self, 'error')
- temp.error = np.zeros((temp.shape[0],temp.shape[1],n))
+ temp.error = np.zeros((self.shape[0],self.shape[1],n))
except AttributeError:
pass
- # copy dimensions
- temp.lon = self.lon.copy()
- temp.lat = self.lat.copy()
# allocate for output dates
temp.time = np.zeros((n))
temp.month = np.zeros((n),dtype=np.int64)
temp.filename = []
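The `subset` refactor above replaces a hand-built output object (and the separate lon/lat copies) with `zeros_like`. Roughly, such a helper copies coordinates and metadata while leaving the data arrays for the caller to allocate; a sketch under that assumption, using a hypothetical stand-in class rather than the toolkit's actual `spatial` implementation:

    import numpy as np

    class Grid:
        # hypothetical stand-in for gravity_toolkit's spatial class
        def __init__(self, lon=None, lat=None, fill_value=None):
            self.lon = lon
            self.lat = lat
            self.fill_value = fill_value

        def zeros_like(self):
            # copy coordinates and metadata; data arrays are
            # allocated afterwards by the caller (as subset does)
            return Grid(lon=np.copy(self.lon), lat=np.copy(self.lat),
                fill_value=self.fill_value)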
89 changes: 64 additions & 25 deletions gravity_toolkit/utilities.py
@@ -1,14 +1,17 @@
#!/usr/bin/env python
u"""
utilities.py
- Written by Tyler Sutterley (06/2024)
+ Written by Tyler Sutterley (10/2024)
Download and management utilities for syncing time and auxiliary files
PYTHON DEPENDENCIES:
lxml: processing XML and HTML in Python
https://pypi.python.org/pypi/lxml
UPDATE HISTORY:
+ Updated 10/2024: update CMR search utility to replace deprecated scrolling
+     https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html
Updated 08/2024: generalize hash function to use any available algorithm
Updated 06/2024: added wrapper to importlib for optional dependencies
make default case for an import exception be a class
Updated 04/2024: added argument for products in CMR shortname query
@@ -114,7 +117,7 @@ def import_dependency(
):
"""
Import an optional dependency
Adapted from ``pandas.compat._optional::import_optional_dependency``
Parameters
@@ -166,7 +169,7 @@ def __get__(self, inst, objtype=None):
# PURPOSE: get the hash value of a file
def get_hash(
local: str | io.IOBase | pathlib.Path,
- algorithm: str = 'MD5'
+ algorithm: str = 'md5'
):
"""
Get the hash value from a local file or ``BytesIO`` object
@@ -175,18 +178,16 @@
----------
local: obj, str or pathlib.Path
BytesIO object or path to file
- algorithm: str, default 'MD5'
+ algorithm: str, default 'md5'
hashing algorithm for checksum validation
- - ``'MD5'``: Message Digest
- - ``'sha1'``: Secure Hash Algorithm
"""
# check if open file object or if local file exists
if isinstance(local, io.IOBase):
- if (algorithm == 'MD5'):
- return hashlib.md5(local.getvalue()).hexdigest()
- elif (algorithm == 'sha1'):
- return hashlib.sha1(local.getvalue()).hexdigest()
+ # generate checksum hash for a given type
+ if algorithm in hashlib.algorithms_available:
+ return hashlib.new(algorithm, local.getvalue()).hexdigest()
+ else:
+ raise ValueError(f'Invalid hashing algorithm: {algorithm}')
elif isinstance(local, (str, pathlib.Path)):
# generate checksum hash for local file
local = pathlib.Path(local).expanduser()
@@ -196,10 +197,10 @@
# open the local_file in binary read mode
with local.open(mode='rb') as local_buffer:
# generate checksum hash for a given type
- if (algorithm == 'MD5'):
- return hashlib.md5(local_buffer.read()).hexdigest()
- elif (algorithm == 'sha1'):
- return hashlib.sha1(local_buffer.read()).hexdigest()
+ if algorithm in hashlib.algorithms_available:
+ return hashlib.new(algorithm, local_buffer.read()).hexdigest()
+ else:
+ raise ValueError(f'Invalid hashing algorithm: {algorithm}')
else:
return ''

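With the branches collapsed into `hashlib.new`, any algorithm reported by `hashlib.algorithms_available` (md5, sha1, sha256, blake2b, ...) now works without further code changes. A self-contained sketch of the same pattern:

    import io
    import hashlib

    def checksum(buffer: io.BytesIO, algorithm: str = 'md5') -> str:
        # generate a checksum hash for any available algorithm
        if algorithm not in hashlib.algorithms_available:
            raise ValueError(f'Invalid hashing algorithm: {algorithm}')
        return hashlib.new(algorithm, buffer.getvalue()).hexdigest()

    # the same call now covers algorithms the old if/elif never handled
    buf = io.BytesIO(b'GRACE/GRACE-FO data')
    print(checksum(buf, 'md5'))
    print(checksum(buf, 'sha256'))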
@@ -813,6 +814,44 @@ def from_http(
remote_buffer.seek(0)
return remote_buffer

+ # PURPOSE: load a JSON response from an http host
+ def from_json(
+ HOST: str | list,
+ timeout: int | None = None,
+ context: ssl.SSLContext = _default_ssl_context
+ ) -> dict:
+ """
+ Load a JSON response from an http host
+ Parameters
+ ----------
+ HOST: str or list
+ remote http host path split as list
+ timeout: int or NoneType, default None
+ timeout in seconds for blocking operations
+ context: obj, default gravity_toolkit.utilities._default_ssl_context
+ SSL context for ``urllib`` opener object
+ """
+ # verify inputs for remote http host
+ if isinstance(HOST, str):
+ HOST = url_split(HOST)
+ # try loading JSON from http
+ try:
+ # Create and submit request for JSON response
+ request = urllib2.Request(posixpath.join(*HOST))
+ request.add_header('Accept', 'application/json')
+ response = urllib2.urlopen(request, timeout=timeout, context=context)
+ except urllib2.HTTPError as exc:
+ logging.debug(exc.code)
+ raise RuntimeError(exc.reason) from exc
+ except urllib2.URLError as exc:
+ logging.debug(exc.reason)
+ msg = 'Load error from {0}'.format(posixpath.join(*HOST))
+ raise Exception(msg) from exc
+ else:
+ # load JSON response
+ return json.loads(response.read())

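A possible usage sketch for the new `from_json` helper; the endpoint below is illustrative (any host returning JSON should work), and assumes the module is imported as `gravity_toolkit.utilities`:

    import gravity_toolkit.utilities as utilities

    # illustrative CMR endpoint returning a JSON payload
    HOST = 'https://cmr.earthdata.nasa.gov/search/collections.json'
    results = utilities.from_json(HOST, timeout=20)
    # results is the parsed JSON response as a dictionary
    print(results.keys())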
# PURPOSE: attempt to build an opener with netrc
def attempt_login(
urs: str,
@@ -1196,7 +1235,7 @@ def s3_region():
boto3 = import_dependency('boto3')
region_name = boto3.session.Session().region_name
return region_name

# PURPOSE: get AWS s3 client for PO.DAAC Cumulus
def s3_client(
HOST: str = _s3_endpoints['podaac'],
@@ -1819,7 +1858,6 @@ def cmr(
CMR_KEYS.append(f'?provider={provider}')
CMR_KEYS.append('&sort_key[]=start_date')
CMR_KEYS.append('&sort_key[]=producer_granule_id')
- CMR_KEYS.append('&scroll=true')
CMR_KEYS.append(f'&page_size={cmr_page_size}')
# dictionary of product shortnames
short_names = cmr_product_shortname(mission, center, release,
@@ -1844,20 +1882,21 @@
granule_names = []
granule_urls = []
granule_mtimes = []
- cmr_scroll_id = None
+ cmr_search_after = None
while True:
req = urllib2.Request(cmr_query_url)
- if cmr_scroll_id:
- req.add_header('cmr-scroll-id', cmr_scroll_id)
+ # add CMR search after header
+ if cmr_search_after:
+ req.add_header('CMR-Search-After', cmr_search_after)
+ logging.debug(f'CMR-Search-After: {cmr_search_after}')
response = opener.open(req)
- # get scroll id for next iteration
- if not cmr_scroll_id:
- headers = {k.lower():v for k,v in dict(response.info()).items()}
- cmr_scroll_id = headers['cmr-scroll-id']
+ # get search after index for next iteration
+ headers = {k.lower():v for k,v in dict(response.info()).items()}
+ cmr_search_after = headers.get('cmr-search-after')
# read the CMR search as JSON
search_page = json.loads(response.read().decode('utf8'))
ids,urls,mtimes = cmr_filter_json(search_page, endpoint=endpoint)
- if not urls:
+ if not urls or cmr_search_after is None:
break
# extend lists
granule_names.extend(ids)
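The loop above is the core of the fix: CMR's scroll sessions (the `scroll=true` parameter and `cmr-scroll-id` header) are deprecated, and search-after pagination replaces them. Each response carries a `CMR-Search-After` header that the client echoes back to fetch the next page; a missing header means the last page was reached. A standalone sketch of the protocol against the CMR granule search (the short name is a placeholder, not a verified collection id):

    import json
    import urllib.request

    # placeholder collection short name; substitute a real CMR product
    url = ('https://cmr.earthdata.nasa.gov/search/granules.json'
        '?short_name=EXAMPLE_PRODUCT&page_size=2000')
    search_after = None
    granules = []
    while True:
        req = urllib.request.Request(url)
        # echo the header from the previous response to get the next page
        if search_after:
            req.add_header('CMR-Search-After', search_after)
        with urllib.request.urlopen(req, timeout=60) as response:
            # header lookup is case-insensitive; None signals the last page
            search_after = response.headers.get('CMR-Search-After')
            page = json.loads(response.read())
        entries = page.get('feed', {}).get('entry', [])
        if not entries:
            break
        granules.extend(entries)
        if search_after is None:
            break
    print(f'{len(granules)} granules found')

Unlike scroll ids, the search-after state lives entirely in the request and response headers, so there is no server-side session to expire or clean up.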
2 changes: 1 addition & 1 deletion scripts/plot_global_grid_3maps.py
@@ -407,7 +407,7 @@ def plot_grid(base_dir, FILENAMES,
cbar.solids.set_rasterized(True)
# Add label to the colorbar
cbar.ax.set_ylabel(CBTITLE, labelpad=10, fontsize=18)
- cbar.ax.set_title(CBUNITS, fontsize=18, va='bottom')
+ cbar.ax.set_title(CBUNITS, fontsize=18, va='bottom', y=-1.4)
# Set the tick levels for the colorbar
cbar.set_ticks(levels)
cbar.set_ticklabels([CBFORMAT.format(ct) for ct in levels])
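The added `y=-1.4` shifts the units label from matplotlib's default title position (above the colorbar axes) to below them, while `va='bottom'` keeps the text anchored by its baseline. A minimal hedged sketch with made-up data:

    import numpy as np
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    im = ax.imshow(np.random.rand(10, 10))
    cbar = fig.colorbar(im, ax=ax, orientation='horizontal')
    # negative y (in axes coordinates) drops the "title" below the colorbar
    cbar.ax.set_title('cm w.e.', fontsize=18, va='bottom', y=-1.4)
    fig.savefig('example.png')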