Skip to content

Commit

Permalink
Merge pull request #74 from rcsb/RO-4234
Browse files Browse the repository at this point in the history
V1.719 Updates to CLI utilities
  • Loading branch information
piehld authored May 8, 2024
2 parents e4f82f4 + 55a0486 commit 3d4c2c0
Show file tree
Hide file tree
Showing 8 changed files with 76 additions and 72 deletions.
1 change: 1 addition & 0 deletions HISTORY.txt
Original file line number Diff line number Diff line change
Expand Up @@ -354,3 +354,4 @@
Begin updating PdbxLoader and RepoLoadEx to support weekly update workflow CLI requirements
03-Apr-2024 V1.717 Add int_list cifType to DataTypeApplicationInfo
09-Apr-2024 V1.718 Update RepoLoadExec CLI and RepoLoadWorkflow to support CLI usage from weekly-update workflow
06-May-2024 V1.719 Updates to CLI utilities
22 changes: 11 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ usage: exdb_repo_load_cli [-h] [--op OP_TYPE] [--load_type LOAD_TYPE]

optional arguments:
-h, --help show this help message and exit
--op {pdbx-loader,build-resource-cache,pdbx-db-wiper,pdbx-id-list-splitter,pdbx-loader-check,etl-entity-sequence-clusters,etl-repository-holdings}
--op {pdbx_loader,build_resource_cache,pdbx_db_wiper,pdbx_id_list_splitter,pdbx_loader_check,etl_entity_sequence_clusters,etl_repository_holdings}
Loading operation to perform
--load_type {replace,full}
Type of load ('replace' for incremental and
Expand Down Expand Up @@ -281,29 +281,29 @@ export OE_LICENSE=/path/to/oe_license.txt
export NLTK_DATA=/path/to/nltk_data
```
`--op build-resource-cache` - Build the external resource cache that will be used for and integrated with the loading of PDB structure data.
`--op build_resource_cache` - Build the external resource cache that will be used for and integrated with the loading of PDB structure data.
```bash
exdb_repo_load_cli --op "build-resource-cache" \
exdb_repo_load_cli --op "build_resource_cache" \
--config_path "/opt/etl-scratch/config/exdb-loader-config.yml" \
--config_name "site_info_remote_configuration" \
--num_proc 6 \
--cache_path "/opt/etl-scratch/data/CACHE" \
```
`--op pdbx-db-wiper` - Wipe the pre-existing database (and all of its collections).
`--op pdbx_db_wiper` - Wipe the pre-existing database (and all of its collections).
```bash
exdb_repo_load_cli --op "pdbx-db-wiper" \
exdb_repo_load_cli --op "pdbx_db_wiper" \
--database "pdbx_core" \
--config_path "/opt/etl-scratch/config/exdb-loader-config.yml" \
--config_name "site_info_remote_configuration" \
--cache_path "/opt/etl-scratch/data/CACHE" \
```
`--op pdbx-id-list-splitter` - Split the full list of input IDs into smaller, equally-sized sublists.
`--op pdbx_id_list_splitter` - Split the full list of input IDs into smaller, equally-sized sublists.
```bash
exdb_repo_load_cli --op "pdbx-id-list-splitter" \
exdb_repo_load_cli --op "pdbx_id_list_splitter" \
--database "pdbx_core" \
--config_path "/opt/etl-scratch/config/exdb-loader-config.yml" \
--config_name "site_info_remote_configuration" \
Expand All @@ -314,9 +314,9 @@ exdb_repo_load_cli --op "pdbx-id-list-splitter" \
```
`--op pdbx-loader` - Load a list of entry IDs to ExDB.
`--op pdbx_loader` - Load a list of entry IDs to ExDB.
```bash
exdb_repo_load_cli --op "pdbx-loader" \
exdb_repo_load_cli --op "pdbx_loader" \
--database "pdbx_core" \
--load_type replace \
--config_path /opt/etl-scratch/config/exdb-loader-config.yml \
Expand All @@ -329,9 +329,9 @@ exdb_repo_load_cli --op "pdbx-loader" \
```
`--op pdbx-loader-check` - Check the resulting ExDB database to confirm that all expected documents were loaded.
`--op pdbx_loader_check` - Check the resulting ExDB database to confirm that all expected documents were loaded.
```bash
exdb_repo_load_cli --op "pdbx-loader-check" \
exdb_repo_load_cli --op "pdbx_loader_check" \
--database "pdbx_core" \
--config_path "/opt/etl-scratch/config/exdb-loader-config.yml" \
--config_name "site_info_remote_configuration" \
Expand Down
4 changes: 2 additions & 2 deletions pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -575,5 +575,5 @@ min-public-methods=2

# Exceptions that will emit a warning when being caught. Defaults to
# "BaseException, Exception".
overgeneral-exceptions=BaseException,
Exception
# overgeneral-exceptions=builtins.BaseException,
# builtins.Exception
90 changes: 46 additions & 44 deletions rcsb/db/cli/RepoLoadExec.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
# 5-Apr-2024 - dwp Change arguments and execution structure to make more flexible;
# Add arguments and logic to support CLI usage from weekly-update workflow;
# Add support for logging output to a specific file
# 25-Apr-2024 - dwp Add support for remote config file loading; use underscores instead of hyphens for arg choices
##
__docformat__ = "restructuredtext en"
__author__ = "John Westbrook"
Expand All @@ -48,14 +49,12 @@
def main():
parser = argparse.ArgumentParser()
#
# defaultConfigName = "site_info_configuration"
#
parser.add_argument(
"--op",
default=None,
required=True,
help="Loading operation to perform",
choices=["pdbx-loader", "build-resource-cache", "pdbx-db-wiper", "pdbx-id-list-splitter", "pdbx-loader-check", "etl-entity-sequence-clusters", "etl-repository-holdings"]
choices=["pdbx_loader", "build_resource_cache", "pdbx_db_wiper", "pdbx_id_list_splitter", "pdbx_loader_check", "etl_entity_sequence_clusters", "etl_repository_holdings"]
)
#
parser.add_argument(
Expand Down Expand Up @@ -115,53 +114,51 @@ def main():
#
try:
op, commonD, loadD = processArguments(args)
except Exception as e:
logger.exception("Argument processing problem %s", str(e))
parser.print_help(sys.stderr)
exit(1)
except Exception as err:
logger.exception("Argument processing problem %s", str(err))
raise ValueError("Argument processing problem") from err
#
#
# Log input arguments
loadLogD = {k: v for d in [commonD, loadD] for k, v in d.items() if k != "inputIdCodeList"}
logger.info("running load op %r on loadLogD %r:", op, loadLogD)

# ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
# Run the operation
#
# Run the operation
okR = False
rlWf = RepoLoadWorkflow(**commonD)
if op in ["pdbx-loader", "etl-entity-sequence-clusters", "etl-repository-holdings"]:
if op in ["pdbx_loader", "etl_entity_sequence_clusters", "etl_repository_holdings"]:
okR = rlWf.load(op, **loadD)
#
elif op == "build-resource-cache":
elif op == "build_resource_cache":
okR = rlWf.buildResourceCache(rebuildCache=True, providerTypeExclude=loadD["providerTypeExclude"])
#
elif op == "pdbx-id-list-splitter":
elif op == "pdbx_id_list_splitter":
okR = rlWf.splitIdList(op, **loadD)
#
elif op == "pdbx-db-wiper":
elif op == "pdbx_db_wiper":
okR = rlWf.removeAndRecreateDbCollections(op, **loadD)
#
elif op == "pdbx-loader-check":
elif op == "pdbx_loader_check":
okR = rlWf.loadCompleteCheck(op, **loadD)
#
else:
logger.error("Unsupported op %r", op)

#
logger.info("Operation %r completed with status %r", op, okR)

#
if not okR:
logger.error("Operation %r failed with status %r", op, okR)
exit(1)
raise ValueError("Operation %r failed" % op)


def processArguments(args):
# Logging details
logFilePath = args.log_file_path
debugFlag = args.debug
if debugFlag:
logger.setLevel(logging.DEBUG)
else:
logger.setLevel(logging.INFO)
#
if logFilePath:
logDir = os.path.dirname(logFilePath)
if not os.path.isdir(logDir):
Expand All @@ -175,37 +172,38 @@ def processArguments(args):
handler.setFormatter(formatter)
logger.addHandler(handler)
#
# Configuration Details
# Configuration details
configPath = args.config_path
configName = args.config_name
if not configPath:
configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
try:
if os.access(configPath, os.R_OK):
os.environ["DBLOAD_CONFIG_PATH"] = configPath
logger.info("Using configuation path %s (%s)", configPath, configName)
else:
logger.error("Missing or access issue with config file %r", configPath)
exit(1)
mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") if args.mock else None
cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath)
if args.vrpt_repo_path:
vrptPath = args.vrpt_repo_path
if not os.access(vrptPath, os.R_OK):
logger.error("Unreadable validation report repository path %r", vrptPath)
envName = cfgOb.get("VRPT_REPO_PATH_ENV", sectionName=configName)
os.environ[envName] = vrptPath
logger.info("Using alternate validation report path %s", os.getenv(envName))
except Exception as e:
logger.error("Missing or access issue with config file %r with %s", configPath, str(e))
exit(1)
if not (configPath and configName):
logger.error("Config path and/or name not provided: %r, %r", configPath, configName)
raise ValueError("Config path and/or name not provided: %r, %r" % (configPath, configName))
mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") if args.mock else None
logger.info("Using configuration file %r (section %r)", configPath, configName)
cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath)
cfgObTmp = cfgOb.exportConfig()
logger.info("Length of config object (%r)", len(cfgObTmp))
if len(cfgObTmp) == 0:
logger.error("Missing or access issue for config file %r", configPath)
raise ValueError("Missing or access issue for config file %r" % configPath)
else:
del cfgObTmp
#
if args.vrpt_repo_path:
vrptPath = args.vrpt_repo_path
if not os.access(vrptPath, os.R_OK):
logger.error("Unreadable validation report repository path %r", vrptPath)
raise ValueError("Unreadable validation report repository path %r" % vrptPath)
envName = cfgOb.get("VRPT_REPO_PATH_ENV", sectionName=configName)
os.environ[envName] = vrptPath
logger.info("Using alternate validation report path %s", os.getenv(envName))
#
# First do any needed argument checking
# Do any additional argument checking
op = args.op
databaseName = args.database
if not op:
raise ValueError("Must supply a value to '--op' argument")
if op == "pdbx-loader" and not databaseName:
if op == "pdbx_loader" and not databaseName:
raise ValueError("Must supply a value to '--database' argument for op type 'pdbx_loader'")
#
if databaseName == "bird_family": # Not sure if this is relevant anymore
Expand Down Expand Up @@ -261,4 +259,8 @@ def processArguments(args):


if __name__ == "__main__":
main()
try:
main()
except Exception as e:
logger.exception("Run failed %s", str(e))
sys.exit(1)
2 changes: 1 addition & 1 deletion rcsb/db/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
__author__ = "John Westbrook"
__email__ = "[email protected]"
__license__ = "Apache 2.0"
__version__ = "1.718"
__version__ = "1.719"
1 change: 1 addition & 0 deletions rcsb/db/mongo/PdbxLoader.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ def load(
#
# -- Check database to see if any entries have already been loaded, and determine the delta for the current load
inputIdCodeList = inputIdCodeList if inputIdCodeList else []
inputIdCodeList = [id.upper() for id in inputIdCodeList]
if databaseName in ["pdbx_core", "pdbx_comp_model_core"]:
totalIdsAlreadyLoaded = self.__getLoadedRcsbIdList(databaseName=databaseName, collectionName=databaseName + "_entry")
# Get the list of IDs from only the given sublist that are already loaded
Expand Down
6 changes: 3 additions & 3 deletions rcsb/db/tests-mongo/testRepoLoadWorkflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def testPdbxLoaderWorkflow(self):
self.assertTrue(ok)
for ld in self.__ldList:
ld.update(self.__loadCommonD)
ok = rlWf.load("pdbx-loader", **ld)
ok = rlWf.load("pdbx_loader", **ld)
self.assertTrue(ok)
except Exception as e:
logger.exception("Failing with %s", str(e))
Expand All @@ -112,10 +112,10 @@ def testEtlLoaderWorkflow(self):
ok = rlWf.buildResourceCache(rebuildCache=False)
self.assertTrue(ok)
#
ok = rlWf.load("etl-repository-holdings", **etlCommonD)
ok = rlWf.load("etl_repository_holdings", **etlCommonD)
self.assertTrue(ok)
#
ok = rlWf.load("etl-entity-sequence-clusters", **etlCommonD)
ok = rlWf.load("etl_entity_sequence_clusters", **etlCommonD)
self.assertTrue(ok)
except Exception as e:
logger.exception("Failing with %s", str(e))
Expand Down
22 changes: 11 additions & 11 deletions rcsb/db/wf/RepoLoadWorkflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class RepoLoadWorkflow(object):
def __init__(self, **kwargs):
# Configuration Details
configPath = kwargs.get("configPath", "exdb-config-example.yml")
self.__configName = kwargs.get("configName", "site_info_configuration")
self.__configName = kwargs.get("configName", "site_info_remote_configuration")
mockTopPath = kwargs.get("mockTopPath", None)
self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=self.__configName, mockTopPath=mockTopPath)
#
Expand All @@ -60,7 +60,7 @@ def load(self, op, **kwargs):
# logger.error("Resource cache test or rebuild has failed - exiting")
# return False
# argument processing
if op not in ["pdbx-loader", "etl-repository-holdings", "etl-entity-sequence-clusters"]:
if op not in ["pdbx_loader", "etl_repository_holdings", "etl_entity_sequence_clusters"]:
logger.error("Unsupported operation %r - exiting", op)
return False
try:
Expand Down Expand Up @@ -112,7 +112,7 @@ def load(self, op, **kwargs):
return False
#

if op == "pdbx-loader" and dbType == "mongo" and databaseName in databaseNameList:
if op == "pdbx_loader" and dbType == "mongo" and databaseName in databaseNameList:
okS = True
try:
inputPathList, inputIdCodeList = None, None
Expand Down Expand Up @@ -167,7 +167,7 @@ def load(self, op, **kwargs):
okS = self.loadStatus(mw.getLoadStatus(), readBackCheck=readBackCheck)
except Exception as e:
logger.exception("Operation %r database %r failing with %s", op, databaseName, str(e))
elif op == "etl-entity-sequence-clusters" and dbType == "mongo":
elif op == "etl_entity_sequence_clusters" and dbType == "mongo":
cw = SequenceClustersEtlWorker(
self.__cfgOb,
numProc=numProc,
Expand All @@ -181,7 +181,7 @@ def load(self, op, **kwargs):
)
ok = cw.etl(dataSetId, seqDataLocator, loadType=loadType)
okS = self.loadStatus(cw.getLoadStatus(), readBackCheck=readBackCheck)
elif op == "etl-repository-holdings" and dbType == "mongo":
elif op == "etl_repository_holdings" and dbType == "mongo":
rhw = RepoHoldingsEtlWorker(
self.__cfgOb,
sandboxPath,
Expand Down Expand Up @@ -213,7 +213,7 @@ def loadStatus(self, statusList, readBackCheck=True):
logger.exception("Failing with %s", str(e))
return ret

def buildResourceCache(self, rebuildCache=False, providerTypeExclude=None):
def buildResourceCache(self, rebuildCache=False, providerTypeExclude=None, restoreUseStash=True, restoreUseGit=True):
"""Generate and cache resource dependencies."""
ret = False
try:
Expand All @@ -230,8 +230,8 @@ def buildResourceCache(self, rebuildCache=False, providerTypeExclude=None):
self.__cfgOb,
configName=self.__configName,
cachePath=self.__cachePath,
restoreUseStash=True,
restoreUseGit=True,
restoreUseStash=restoreUseStash,
restoreUseGit=restoreUseGit,
providerTypeExclude=providerTypeExclude,
)
ret = rP.cacheResources(useCache=useCache, doBackup=False, useStash=False, useGit=False)
Expand All @@ -242,7 +242,7 @@ def buildResourceCache(self, rebuildCache=False, providerTypeExclude=None):
return ret

def removeAndRecreateDbCollections(self, op, **kwargs):
if op not in ["pdbx-db-wiper"]:
if op not in ["pdbx_db_wiper"]:
logger.error("Unsupported operation %r - exiting", op)
return False
try:
Expand Down Expand Up @@ -278,7 +278,7 @@ def removeAndRecreateDbCollections(self, op, **kwargs):
return ok

def splitIdList(self, op, **kwargs):
if op not in ["pdbx-id-list-splitter"]:
if op not in ["pdbx_id_list_splitter"]:
logger.error("Unsupported operation %r - exiting", op)
return False

Expand Down Expand Up @@ -384,7 +384,7 @@ def splitIdListAndWriteToFiles(self, inputList, nFiles, outfileDir, outfilePrefi
return filePathMappingD

def loadCompleteCheck(self, op, **kwargs):
if op not in ["pdbx-loader-check"]:
if op not in ["pdbx_loader_check"]:
logger.error("Unsupported operation %r - exiting", op)
return False
try:
Expand Down

0 comments on commit 3d4c2c0

Please sign in to comment.