diff --git a/HISTORY.txt b/HISTORY.txt index 1634370..bc67560 100644 --- a/HISTORY.txt +++ b/HISTORY.txt @@ -354,3 +354,4 @@ Begin updating PdbxLoader and RepoLoadEx to support weekly update workflow CLI requirements 03-Apr-2024 V1.717 Add int_list cifType to DataTypeApplicationInfo 09-Apr-2024 V1.718 Update RepoLoadExec CLI and RepoLoadWorkflow to support CLI usage from weekly-update workflow + 6-May-2024 V1.719 Updates to CLI utilities diff --git a/README.md b/README.md index b7ca6d9..f4bd13b 100644 --- a/README.md +++ b/README.md @@ -195,7 +195,7 @@ usage: exdb_repo_load_cli [-h] [--op OP_TYPE] [--load_type LOAD_TYPE] optional arguments: -h, --help show this help message and exit - --op {pdbx-loader,build-resource-cache,pdbx-db-wiper,pdbx-id-list-splitter,pdbx-loader-check,etl-entity-sequence-clusters,etl-repository-holdings} + --op {pdbx_loader,build_resource_cache,pdbx_db_wiper,pdbx_id_list_splitter,pdbx_loader_check,etl_entity_sequence_clusters,etl_repository_holdings} Loading operation to perform --load_type {replace,full} Type of load ('replace' for incremental and @@ -281,9 +281,9 @@ export OE_LICENSE=/path/to/oe_license.txt export NLTK_DATA=/path/to/nltk_data ``` -`--op build-resource-cache` - Build the external resource cache that will be used for and integrated with the loading of PDB structure data. +`--op build_resource_cache` - Build the external resource cache that will be used for and integrated with the loading of PDB structure data. ```bash -exdb_repo_load_cli --op "build-resource-cache" \ +exdb_repo_load_cli --op "build_resource_cache" \ --config_path "/opt/etl-scratch/config/exdb-loader-config.yml" \ --config_name "site_info_remote_configuration" \ --num_proc 6 \ @@ -291,9 +291,9 @@ exdb_repo_load_cli --op "build-resource-cache" \ ``` -`--op pdbx-db-wiper` - Wipe the pre-existing database (and all of its collections). +`--op pdbx_db_wiper` - Wipe the pre-existing database (and all of its collections). 
```bash -exdb_repo_load_cli --op "pdbx-db-wiper" \ +exdb_repo_load_cli --op "pdbx_db_wiper" \ --database "pdbx_core" \ --config_path "/opt/etl-scratch/config/exdb-loader-config.yml" \ --config_name "site_info_remote_configuration" \ @@ -301,9 +301,9 @@ exdb_repo_load_cli --op "pdbx-db-wiper" \ ``` -`--op pdbx-id-list-splitter` - Split the full list of input IDs into smaller, equally-sized sublists. +`--op pdbx_id_list_splitter` - Split the full list of input IDs into smaller, equally-sized sublists. ```bash -exdb_repo_load_cli --op "pdbx-id-list-splitter" \ +exdb_repo_load_cli --op "pdbx_id_list_splitter" \ --database "pdbx_core" \ --config_path "/opt/etl-scratch/config/exdb-loader-config.yml" \ --config_name "site_info_remote_configuration" \ @@ -314,9 +314,9 @@ exdb_repo_load_cli --op "pdbx-id-list-splitter" \ ``` -`--op pdbx-loader` - Load a list of entry IDs to ExDB. +`--op pdbx_loader` - Load a list of entry IDs to ExDB. ```bash -exdb_repo_load_cli --op "pdbx-loader" \ +exdb_repo_load_cli --op "pdbx_loader" \ --database "pdbx_core" \ --load_type replace \ --config_path /opt/etl-scratch/config/exdb-loader-config.yml \ @@ -329,9 +329,9 @@ exdb_repo_load_cli --op "pdbx-loader" \ ``` -`--op pdbx-loader-check` - Check the resulting ExDB database to confirm that all expected documents were loaded. +`--op pdbx_loader_check` - Check the resulting ExDB database to confirm that all expected documents were loaded. ```bash -exdb_repo_load_cli --op "pdbx-loader-check" \ +exdb_repo_load_cli --op "pdbx_loader_check" \ --database "pdbx_core" \ --config_path "/opt/etl-scratch/config/exdb-loader-config.yml" \ --config_name "site_info_remote_configuration" \ diff --git a/pylintrc b/pylintrc index 4f9de98..bfb3a90 100644 --- a/pylintrc +++ b/pylintrc @@ -575,5 +575,5 @@ min-public-methods=2 # Exceptions that will emit a warning when being caught. Defaults to # "BaseException, Exception". 
-overgeneral-exceptions=BaseException, - Exception +# overgeneral-exceptions=builtins.BaseException, +# builtins.Exception diff --git a/rcsb/db/cli/RepoLoadExec.py b/rcsb/db/cli/RepoLoadExec.py index 40bd79c..de23615 100644 --- a/rcsb/db/cli/RepoLoadExec.py +++ b/rcsb/db/cli/RepoLoadExec.py @@ -24,6 +24,7 @@ # 5-Apr-2024 - dwp Change arguments and execution structure to make more flexible; # Add arguments and logic to support CLI usage from weekly-update workflow; # Add support for logging output to a specific file +# 25-Apr-2024 - dwp Add support for remote config file loading; use underscores instead of hyphens for arg choices ## __docformat__ = "restructuredtext en" __author__ = "John Westbrook" @@ -48,14 +49,12 @@ def main(): parser = argparse.ArgumentParser() # - # defaultConfigName = "site_info_configuration" - # parser.add_argument( "--op", default=None, required=True, help="Loading operation to perform", - choices=["pdbx-loader", "build-resource-cache", "pdbx-db-wiper", "pdbx-id-list-splitter", "pdbx-loader-check", "etl-entity-sequence-clusters", "etl-repository-holdings"] + choices=["pdbx_loader", "build_resource_cache", "pdbx_db_wiper", "pdbx_id_list_splitter", "pdbx_loader_check", "etl_entity_sequence_clusters", "etl_repository_holdings"] ) # parser.add_argument( @@ -115,53 +114,51 @@ def main(): # try: op, commonD, loadD = processArguments(args) - except Exception as e: - logger.exception("Argument processing problem %s", str(e)) - parser.print_help(sys.stderr) - exit(1) + except Exception as err: + logger.exception("Argument processing problem %s", str(err)) + raise ValueError("Argument processing problem") from err + # # # Log input arguments loadLogD = {k: v for d in [commonD, loadD] for k, v in d.items() if k != "inputIdCodeList"} logger.info("running load op %r on loadLogD %r:", op, loadLogD) - - # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- - - # Run the operation 
# + # Run the operation okR = False rlWf = RepoLoadWorkflow(**commonD) - if op in ["pdbx-loader", "etl-entity-sequence-clusters", "etl-repository-holdings"]: + if op in ["pdbx_loader", "etl_entity_sequence_clusters", "etl_repository_holdings"]: okR = rlWf.load(op, **loadD) # - elif op == "build-resource-cache": + elif op == "build_resource_cache": okR = rlWf.buildResourceCache(rebuildCache=True, providerTypeExclude=loadD["providerTypeExclude"]) # - elif op == "pdbx-id-list-splitter": + elif op == "pdbx_id_list_splitter": okR = rlWf.splitIdList(op, **loadD) # - elif op == "pdbx-db-wiper": + elif op == "pdbx_db_wiper": okR = rlWf.removeAndRecreateDbCollections(op, **loadD) # - elif op == "pdbx-loader-check": + elif op == "pdbx_loader_check": okR = rlWf.loadCompleteCheck(op, **loadD) # else: logger.error("Unsupported op %r", op) - + # logger.info("Operation %r completed with status %r", op, okR) - + # if not okR: logger.error("Operation %r failed with status %r", op, okR) - exit(1) + raise ValueError("Operation %r failed" % op) def processArguments(args): + # Logging details logFilePath = args.log_file_path debugFlag = args.debug if debugFlag: logger.setLevel(logging.DEBUG) else: logger.setLevel(logging.INFO) - # if logFilePath: logDir = os.path.dirname(logFilePath) if not os.path.isdir(logDir): @@ -175,37 +172,38 @@ def processArguments(args): handler.setFormatter(formatter) logger.addHandler(handler) # - # Configuration Details + # Configuration details configPath = args.config_path configName = args.config_name - if not configPath: - configPath = os.getenv("DBLOAD_CONFIG_PATH", None) - try: - if os.access(configPath, os.R_OK): - os.environ["DBLOAD_CONFIG_PATH"] = configPath - logger.info("Using configuation path %s (%s)", configPath, configName) - else: - logger.error("Missing or access issue with config file %r", configPath) - exit(1) - mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") if args.mock else None - cfgOb = ConfigUtil(configPath=configPath, 
defaultSectionName=configName, mockTopPath=mockTopPath) - if args.vrpt_repo_path: - vrptPath = args.vrpt_repo_path - if not os.access(vrptPath, os.R_OK): - logger.error("Unreadable validation report repository path %r", vrptPath) - envName = cfgOb.get("VRPT_REPO_PATH_ENV", sectionName=configName) - os.environ[envName] = vrptPath - logger.info("Using alternate validation report path %s", os.getenv(envName)) - except Exception as e: - logger.error("Missing or access issue with config file %r with %s", configPath, str(e)) - exit(1) + if not (configPath and configName): + logger.error("Config path and/or name not provided: %r, %r", configPath, configName) + raise ValueError("Config path and/or name not provided: %r, %r" % (configPath, configName)) + mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") if args.mock else None + logger.info("Using configuration file %r (section %r)", configPath, configName) + cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath) + cfgObTmp = cfgOb.exportConfig() + logger.info("Length of config object (%r)", len(cfgObTmp)) + if len(cfgObTmp) == 0: + logger.error("Missing or access issue for config file %r", configPath) + raise ValueError("Missing or access issue for config file %r" % configPath) + else: + del cfgObTmp + # + if args.vrpt_repo_path: + vrptPath = args.vrpt_repo_path + if not os.access(vrptPath, os.R_OK): + logger.error("Unreadable validation report repository path %r", vrptPath) + raise ValueError("Unreadable validation report repository path %r" % vrptPath) + envName = cfgOb.get("VRPT_REPO_PATH_ENV", sectionName=configName) + os.environ[envName] = vrptPath + logger.info("Using alternate validation report path %s", os.getenv(envName)) # - # First do any needed argument checking + # Do any additional argument checking op = args.op databaseName = args.database if not op: raise ValueError("Must supply a value to '--op' argument") - if op == "pdbx-loader" and not databaseName: + if 
op == "pdbx_loader" and not databaseName: raise ValueError("Must supply a value to '--database' argument for op type 'pdbx_loader'") # if databaseName == "bird_family": # Not sure if this is relevant anymore @@ -261,4 +259,8 @@ def processArguments(args): if __name__ == "__main__": - main() + try: + main() + except Exception as e: + logger.exception("Run failed %s", str(e)) + sys.exit(1) diff --git a/rcsb/db/cli/__init__.py b/rcsb/db/cli/__init__.py index c847c82..697de79 100644 --- a/rcsb/db/cli/__init__.py +++ b/rcsb/db/cli/__init__.py @@ -2,4 +2,4 @@ __author__ = "John Westbrook" __email__ = "john.westbrook@rcsb.org" __license__ = "Apache 2.0" -__version__ = "1.718" +__version__ = "1.719" diff --git a/rcsb/db/mongo/PdbxLoader.py b/rcsb/db/mongo/PdbxLoader.py index 1e9f119..2535267 100644 --- a/rcsb/db/mongo/PdbxLoader.py +++ b/rcsb/db/mongo/PdbxLoader.py @@ -205,6 +205,7 @@ def load( # # -- Check database to see if any entries have already been loaded, and determine the delta for the current load inputIdCodeList = inputIdCodeList if inputIdCodeList else [] + inputIdCodeList = [id.upper() for id in inputIdCodeList] if databaseName in ["pdbx_core", "pdbx_comp_model_core"]: totalIdsAlreadyLoaded = self.__getLoadedRcsbIdList(databaseName=databaseName, collectionName=databaseName + "_entry") # Get the list of IDs from only the given sublist that are already loaded diff --git a/rcsb/db/tests-mongo/testRepoLoadWorkflow.py b/rcsb/db/tests-mongo/testRepoLoadWorkflow.py index 8a10633..b5c843b 100644 --- a/rcsb/db/tests-mongo/testRepoLoadWorkflow.py +++ b/rcsb/db/tests-mongo/testRepoLoadWorkflow.py @@ -92,7 +92,7 @@ def testPdbxLoaderWorkflow(self): self.assertTrue(ok) for ld in self.__ldList: ld.update(self.__loadCommonD) - ok = rlWf.load("pdbx-loader", **ld) + ok = rlWf.load("pdbx_loader", **ld) self.assertTrue(ok) except Exception as e: logger.exception("Failing with %s", str(e)) @@ -112,10 +112,10 @@ def testEtlLoaderWorkflow(self): ok = 
rlWf.buildResourceCache(rebuildCache=False) self.assertTrue(ok) # - ok = rlWf.load("etl-repository-holdings", **etlCommonD) + ok = rlWf.load("etl_repository_holdings", **etlCommonD) self.assertTrue(ok) # - ok = rlWf.load("etl-entity-sequence-clusters", **etlCommonD) + ok = rlWf.load("etl_entity_sequence_clusters", **etlCommonD) self.assertTrue(ok) except Exception as e: logger.exception("Failing with %s", str(e)) diff --git a/rcsb/db/wf/RepoLoadWorkflow.py b/rcsb/db/wf/RepoLoadWorkflow.py index 2165f2b..4d578a9 100644 --- a/rcsb/db/wf/RepoLoadWorkflow.py +++ b/rcsb/db/wf/RepoLoadWorkflow.py @@ -39,7 +39,7 @@ class RepoLoadWorkflow(object): def __init__(self, **kwargs): # Configuration Details configPath = kwargs.get("configPath", "exdb-config-example.yml") - self.__configName = kwargs.get("configName", "site_info_configuration") + self.__configName = kwargs.get("configName", "site_info_remote_configuration") mockTopPath = kwargs.get("mockTopPath", None) self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=self.__configName, mockTopPath=mockTopPath) # @@ -60,7 +60,7 @@ def load(self, op, **kwargs): # logger.error("Resource cache test or rebuild has failed - exiting") # return False # argument processing - if op not in ["pdbx-loader", "etl-repository-holdings", "etl-entity-sequence-clusters"]: + if op not in ["pdbx_loader", "etl_repository_holdings", "etl_entity_sequence_clusters"]: logger.error("Unsupported operation %r - exiting", op) return False try: @@ -112,7 +112,7 @@ def load(self, op, **kwargs): return False # - if op == "pdbx-loader" and dbType == "mongo" and databaseName in databaseNameList: + if op == "pdbx_loader" and dbType == "mongo" and databaseName in databaseNameList: okS = True try: inputPathList, inputIdCodeList = None, None @@ -167,7 +167,7 @@ def load(self, op, **kwargs): okS = self.loadStatus(mw.getLoadStatus(), readBackCheck=readBackCheck) except Exception as e: logger.exception("Operation %r database %r failing with %s", op, 
databaseName, str(e)) - elif op == "etl-entity-sequence-clusters" and dbType == "mongo": + elif op == "etl_entity_sequence_clusters" and dbType == "mongo": cw = SequenceClustersEtlWorker( self.__cfgOb, numProc=numProc, @@ -181,7 +181,7 @@ def load(self, op, **kwargs): ) ok = cw.etl(dataSetId, seqDataLocator, loadType=loadType) okS = self.loadStatus(cw.getLoadStatus(), readBackCheck=readBackCheck) - elif op == "etl-repository-holdings" and dbType == "mongo": + elif op == "etl_repository_holdings" and dbType == "mongo": rhw = RepoHoldingsEtlWorker( self.__cfgOb, sandboxPath, @@ -213,7 +213,7 @@ def loadStatus(self, statusList, readBackCheck=True): logger.exception("Failing with %s", str(e)) return ret - def buildResourceCache(self, rebuildCache=False, providerTypeExclude=None): + def buildResourceCache(self, rebuildCache=False, providerTypeExclude=None, restoreUseStash=True, restoreUseGit=True): """Generate and cache resource dependencies.""" ret = False try: @@ -230,8 +230,8 @@ def buildResourceCache(self, rebuildCache=False, providerTypeExclude=None): self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, - restoreUseStash=True, - restoreUseGit=True, + restoreUseStash=restoreUseStash, + restoreUseGit=restoreUseGit, providerTypeExclude=providerTypeExclude, ) ret = rP.cacheResources(useCache=useCache, doBackup=False, useStash=False, useGit=False) @@ -242,7 +242,7 @@ def buildResourceCache(self, rebuildCache=False, providerTypeExclude=None): return ret def removeAndRecreateDbCollections(self, op, **kwargs): - if op not in ["pdbx-db-wiper"]: + if op not in ["pdbx_db_wiper"]: logger.error("Unsupported operation %r - exiting", op) return False try: @@ -278,7 +278,7 @@ def removeAndRecreateDbCollections(self, op, **kwargs): return ok def splitIdList(self, op, **kwargs): - if op not in ["pdbx-id-list-splitter"]: + if op not in ["pdbx_id_list_splitter"]: logger.error("Unsupported operation %r - exiting", op) return False @@ -384,7 +384,7 @@ def 
splitIdListAndWriteToFiles(self, inputList, nFiles, outfileDir, outfilePrefi return filePathMappingD def loadCompleteCheck(self, op, **kwargs): - if op not in ["pdbx-loader-check"]: + if op not in ["pdbx_loader_check"]: logger.error("Unsupported operation %r - exiting", op) return False try: