Refactored branch update (rfct-jw-1)

rcsb · Sep 2, 2019 · 7d76480 · 7d76480
1 parent 8cba622
commit 7d76480
Show file tree

Hide file tree

Showing 103 changed files with 12,903 additions and 3,502 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,7 @@
 .env/
 test-output/
 !test-output/.gitkeep
+CACHE/
 LOGTOX
 
 # Byte-compiled / optimized / DLL files

diff --git a/HISTORY.txt b/HISTORY.txt
@@ -104,7 +104,7 @@
 
                Configuration changes to simplify the handling of versioning removing these from database and collection names
                and schema file names, version configuration now managed in separate configuration options,
-               NCBI_TAXONOMY_LOCATOR removed, added configuration options NCBI_TAXONOMY_PATH and ENZYME_CLASSIFICATION_DATA_PATH,
+               NCBI_TAXONOMY_LOCATOR removed, added configuration options NCBI_TAXONOMY_CACHE_PATH and ENZYME_CLASSIFICATION_CACHE_PATH,
                provide separate configuration options schema version assignment within collections,
                up-version api and json schema, add config option VRPT_REPO_PATH_ENV.
 

diff --git a/README.md b/README.md
@@ -119,12 +119,12 @@ optional arguments:
   --update_drugbank_core
                         Update DrugBank schema
   --update_config_all   Update using configuration settings (e.g.
-                        SCHEMA_NAMES_ALL)
+                        DATABASE_NAMES_ALL)
   --update_config_deployed
                         Update using configuration settings (e.g.
-                        SCHEMA_NAMES_DEPLOYED)
+                        DATABASE_NAMES_DEPLOYED)
   --update_config_test  Update using configuration settings (e.g.
-                        SCHEMA_NAMES_TEST)
+                        DATABASE_NAMES_TEST)
   --config_path CONFIG_PATH
                         Path to configuration options file
   --config_name CONFIG_NAME
@@ -398,7 +398,7 @@ For instance, to perform a fresh/full load of all of the chemical component defi
 
 cd rcsb/db/scripts
 python RepoLoadExec.py --full  --load_chem_comp_ref  \
-                      --config_path ../../mock-data/config/dbload-setup-example.yml \
+                      --config_path ../config/exdb-config-example.yml \
                       --config_name site_info \
                       --fail_file_list_path failed-cc-path-list.txt \
                       --read_back_check
@@ -411,13 +411,13 @@ this same data.
 
 cd rcsb/db/scripts
 python RepoLoadExec.py  --mock --full  --load_entry_data \
-                     --config_path ../../mock-data/config/dbload-setup-example.yml \
+                     --config_path ../config/exdb-config-example.yml \
                      --config_name site_info \
                      --save_file_list_path  LATEST_PDBX_LOAD_LIST.txt \
                      --fail_file_list_path failed-entry-path-list.txt
 
 python RepoLoadExec.py --mock --replace  --load_entry_data \
-                      --config_path ../../mock-data/config/dbload-setup-example.yml \
+                      --config_path ../config/exdb-config-example.yml \
                       --config_name site_info \
                       --load_file_list_path  LATEST_PDBX_LOAD_LIST.txt \
                       --fail_file_list_path failed-entry-path-list.txt
@@ -468,7 +468,7 @@ bird_chem_comp_core,.. ).
 #  3-Mar-2019 jdw adjust release filter for chem_comp_* collections to accept only REL and REF_ONLY status codes.
 # 14-Mar-2019 jdw simplify the handling of versioning removing these from database and collection names and schema file names,
 #                 version configuration now managed in separate configuration options, NCBI_TAXONOMY_LOCATOR removed,
-#                 added configuration options NCBI_TAXONOMY_PATH and ENZYME_CLASSIFICATION_DATA_PATH, provide separate
+#                 added configuration options NCBI_TAXONOMY_CACHE_PATH and ENZYME_CLASSIFICATION_CACHE_PATH, provide separate
 #                 configuration options schema version assignment within collections, normalize chem_comp attributes
 #                 between chem_comp_core and bird_chem_comp_core collections, up-version api and json schema, add VRPT_REPO_PATH_ENV.
 # 17-Mar-2019 jdw add subcategory aggregate rcsb_macromolecular_names_combined
@@ -478,7 +478,7 @@ bird_chem_comp_core,.. ).
 # 31-Mar-2019 jdw add block_attributes: REF_PARENT_CATEGORY_NAME: REF_PARENT_ATTRIBUTE_NAME: to provide parent details for this
 #                 synthetic key
 #  1-Apr-2019 jdw add ihm_dev_full to relax schema content exclusions.
-#  6-Apr-2019 jdw add STRUCT_DOMAIN_CLASSIFICATION_DATA_PATH
+#  6-Apr-2019 jdw add STRUCT_DOMAIN_CLASSIFICATION_CACHE_PATH
 #  8-Apr-2019 jdw adding category rcsb_entity_instance_domain
 #  9-Apr-2019 jdw add tree node list, create moved instance level validation to entity_instance_validation
 # 25-Apr-2019 jdw suppress struct_asym in core_entity, add ncbi_taxonomy_scientific_name in source and host organism collections
@@ -526,16 +526,16 @@ site_info:
     DRUGBANK_DATA_LOCATOR: DrugBank/full_database.xml.gz
     CCDC_MAPPING_LOCATOR: chem_comp_models/ccdc_pdb_mapping.json
     #
-    NCBI_TAXONOMY_PATH: NCBI
-    ENZYME_CLASSIFICATION_DATA_PATH: ec
-     SIFTS_SUMMARY_PATH: sifts-summary
+    NCBI_TAXONOMY_CACHE_PATH: NCBI
+    ENZYME_CLASSIFICATION_CACHE_PATH: ec
+    SIFTS_SUMMARY_CACHE_PATH: sifts-summary
 
     #
-    SCHEMA_DEF_LOCATOR_PATH: schema
-    JSON_SCHEMA_LOCATOR_PATH: json-schema
+    SCHEMA_DEFINITION_CACHE_PATH: schema
+    JSON_SCHEMA_DEFINITION_CACHE_PATH: json-schema
     INSTANCE_DATA_TYPE_INFO_LOCATOR_PATH: data_type_info
     #
-    STRUCT_DOMAIN_CLASSIFICATION_DATA_PATH: domains_struct
+    STRUCT_DOMAIN_CLASSIFICATION_CACHE_PATH: domains_struct
     #
     MONGO_DB_HOST: localhost
     MONGO_DB_PORT: '27017'
@@ -555,31 +555,31 @@ site_info:
 #
 # Inventory of current databases and collections -
 #
-schema_catalog_info:
+database_catalog_configuration:
   # All defined schema -
-  SCHEMA_NAMES_ALL: pdbx,pdbx_core,chem_comp,chem_comp_core,bird,bird_family,bird_chem_comp,bird_chem_comp_core,repository_holdings,entity_sequence_clusters,data_exchange,drugbank_core,ihm_dev
+  DATABASE_NAMES_ALL: pdbx,pdbx_core,chem_comp,chem_comp_core,bird,bird_family,bird_chem_comp,bird_chem_comp_core,repository_holdings,entity_sequence_clusters,data_exchange,drugbank_core,ihm_dev
   DATATYPING_ALL: ANY,SQL
-  SCHEMA_TYPES_ALL: rcsb,json,bson
-  SCHEMA_LEVELS_ALL: min,full
+  ENCODING_TYPES_ALL: rcsb,json,bson
+  VALIDATION_LEVELS_ALL: min,full
   #
   #  Schema in active use -
   #
-  SCHEMA_NAMES_DEPLOYED: pdbx_core,chem_comp_core,bird_chem_comp_core,repository_holdings,entity_sequence_clusters,data_exchange,drugbank_core,ihm_dev
+  DATABASE_NAMES_DEPLOYED: pdbx_core,chem_comp_core,bird_chem_comp_core,repository_holdings,entity_sequence_clusters,data_exchange,drugbank_core,ihm_dev
   DATATYPING_DEPLOYED: ANY,SQL
-  SCHEMA_TYPES_DEPLOYED: rcsb,json,bson
-  SCHEMA_LEVELS_DEPLOYED: min,full
+  ENCODING_TYPES_DEPLOYED: rcsb,json,bson
+  VALIDATION_LEVELS_DEPLOYED: min,full
   #
   # Schema subset used for CI testing -
   #
-  SCHEMA_NAMES_TEST: pdbx,pdbx_core,chem_comp_core,bird_chem_comp_core
+  DATABASE_NAMES_TEST: pdbx,pdbx_core,chem_comp_core,bird_chem_comp_core
   DATATYPING_TEST: ANY,SQL
-  SCHEMA_TYPES_TEST: rcsb,json,bson
-  SCHEMA_LEVELS_TEST: min,full
+  ENCODING_TYPES_TEST: rcsb,json,bson
+  VALIDATION_LEVELS_TEST: min,full
   #
-  #SCHEMA_NAMES_TEST: pdbx_core,chem_comp_core,bird_chem_comp_core,repository_holdings,entity_sequence_clusters,data_exchange,drugbank_core
+  #DATABASE_NAMES_TEST: pdbx_core,chem_comp_core,bird_chem_comp_core,repository_holdings,entity_sequence_clusters,data_exchange,drugbank_core
   #DATATYPING_TEST: ANY,SQL
   #SCHEMA_FORMATS_TEST: rcsb,json,bson
-  #SCHEMA_LEVELS_TEST: min,full#
+  #VALIDATION_LEVELS_TEST: min,full#
 #
 # Some schema details for integrated collections -
 #
@@ -1898,7 +1898,7 @@ schemadef_helper:
 #
 document_helper:
     #
-    schema_collection_names:
+    document_collection_names:
         ihm_dev:
             - NAME: ihm_dev
               VERSION: 1.1.0

diff --git a/azure-template-tox-job.yml b/azure-template-tox-job.yml
@@ -4,6 +4,7 @@
 #
 # Updates:
 #  6-Aug-2019  jdw build source and binary wheels by default.
+# 13-Aug-2019  jdw export config support token prior to launching tox runner
 #
 ##
 parameters:
@@ -20,6 +21,9 @@ jobs:
     ${{ if eq(parameters.os, 'linux') }}:
       vmImage: 'ubuntu-latest'
 
+  variables:
+    - group: py-shared-variables
+
   steps:
     #
     # ensure the required Python versions are available
@@ -122,16 +126,24 @@ jobs:
       displayName: 'Install dependencies'
     #
     - ${{ if startsWith(parameters.tox, 'py') }}:
-      - script: ${{ format('python -m tox -e {0}', parameters.tox) }}
+      - script: |
+          export CONFIG_SUPPORT_TOKEN_ENV=$(VAR_CONFIG_SUPPORT_TOKEN_ENV)
+          ${{ format('python -m tox -e {0}', parameters.tox) }}
         displayName: 'Running tox task'
     - ${{ if and(not(startsWith(parameters.tox, 'py')), startsWith(parameters.python, '3.8')) }}:
-      - script: ${{ format('python -m tox -e {0}-py38', parameters.tox) }}
+      - script: |
+          export CONFIG_SUPPORT_TOKEN_ENV=$(VAR_CONFIG_SUPPORT_TOKEN_ENV)
+          ${{ format('python -m tox -e {0}-py38', parameters.tox) }}
         displayName: 'Running tox task'
     - ${{ if and(not(startsWith(parameters.tox, 'py')), startsWith(parameters.python, '3.7')) }}:
-      - script: ${{ format('python -m tox -e {0}-py37', parameters.tox) }}
+      - script: |
+          export CONFIG_SUPPORT_TOKEN_ENV=$(VAR_CONFIG_SUPPORT_TOKEN_ENV)
+          ${{ format('python -m tox -e {0}-py37', parameters.tox) }}
         displayName: 'Running tox task'
     - ${{ if and(not(startsWith(parameters.tox, 'py')), startsWith(parameters.python, '2.7')) }}:
-      - script: ${{ format('python -m tox -e {0}-py27', parameters.tox) }}
+      - script: |
+          export CONFIG_SUPPORT_TOKEN_ENV=$(VAR_CONFIG_SUPPORT_TOKEN_ENV)
+          ${{ format('python -m tox -e {0}-py27', parameters.tox) }}
         displayName: 'Running tox task'
     #
     #  Build artifacts if this is a test target (i.e. labeled as py##)

diff --git a/rcsb/db/scripts/ETLExec.py → rcsb/db/cli/ETLExec.py b/rcsb/db/scripts/ETLExec.py → rcsb/db/cli/ETLExec.py
@@ -10,6 +10,7 @@
 #   4-Jan-2019 jdw differentiate config sections for provenance
 #   9-Apr-2019 jdw add tree node list loader
 #  25-Apr-2019 jdw move the --etl_tree_node_lists function to the rcsb.exdb package.
+#   2-Sep-2019 jdw add cache options and move trees and chemref to module rcsb.exdb
 #
 ##
 __docformat__ = "restructuredtext en"
@@ -22,11 +23,12 @@
 import os
 import sys
 
+from rcsb.db.cli.RepoHoldingsEtlWorker import RepoHoldingsEtlWorker
+from rcsb.db.cli.SequenceClustersEtlWorker import SequenceClustersEtlWorker
+
+# from rcsb.db.cli.TreeNodeListWorker import TreeNodeListWorker
+from rcsb.db.helpers.DictMethodResourceProvider import DictMethodResourceProvider
 from rcsb.db.mongo.DocumentLoader import DocumentLoader
-from rcsb.db.scripts.ChemRefEtlWorker import ChemRefEtlWorker
-from rcsb.db.scripts.RepoHoldingsEtlWorker import RepoHoldingsEtlWorker
-from rcsb.db.scripts.SequenceClustersEtlWorker import SequenceClustersEtlWorker
-from rcsb.db.scripts.TreeNodeListWorker import TreeNodeListWorker
 from rcsb.db.utils.TimeUtil import TimeUtil
 from rcsb.utils.config.ConfigUtil import ConfigUtil
 
@@ -37,26 +39,38 @@
 logger = logging.getLogger()
 
 
-def loadStatus(statusList, cfgOb, readBackCheck=True):
-    sectionName = "data_exchange"
-    dl = DocumentLoader(cfgOb, "MONGO_DB", numProc=2, chunkSize=2, documentLimit=None, verbose=False, readBackCheck=readBackCheck)
+def loadStatus(statusList, cfgOb, cachePath, readBackCheck=True):
+    sectionName = "data_exchange_configuration"
+    dl = DocumentLoader(cfgOb, cachePath, "MONGO_DB", numProc=2, chunkSize=2, documentLimit=None, verbose=False, readBackCheck=readBackCheck)
     #
     databaseName = cfgOb.get("DATABASE_NAME", sectionName=sectionName)
     collectionName = cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=sectionName)
     ok = dl.load(databaseName, collectionName, loadType="append", documentList=statusList, indexAttributeList=["update_id", "database_name", "object_name"], keyNames=None)
     return ok
 
 
+def buildResourceCache(cfgOb, configName, cachePath, rebuildCache=False):
+    """Generate and cache resource dependencies.
+    """
+    ret = False
+    try:
+        rp = DictMethodResourceProvider(cfgOb, configName=configName, cachePath=cachePath)
+        ret = rp.cacheResources(useCache=not rebuildCache)
+    except Exception as e:
+        logger.exception("Failing with %s", str(e))
+    return ret
+
+
 def main():
     parser = argparse.ArgumentParser()
     #
-    defaultConfigName = "site_info"
+    defaultConfigName = "site_info_configuration"
     #
     parser.add_argument("--full", default=True, action="store_true", help="Fresh full load in a new tables/collections (Default)")
     #
     parser.add_argument("--etl_entity_sequence_clusters", default=False, action="store_true", help="ETL entity sequence clusters")
     parser.add_argument("--etl_repository_holdings", default=False, action="store_true", help="ETL repository holdings")
-    parser.add_argument("--etl_chemref", default=False, action="store_true", help="ETL integrated chemical reference data")
+    # parser.add_argument("--etl_chemref", default=False, action="store_true", help="ETL integrated chemical reference data")
     # parser.add_argument("--etl_tree_node_lists", default=False, action='store_true', help="ETL tree node lists")
 
     parser.add_argument("--data_set_id", default=None, help="Data set identifier (default= 2018_14 for current week)")
@@ -80,8 +94,10 @@ def main():
     parser.add_argument("--prune_document_size", default=None, help="Prune large documents to this size limit (MB)")
     parser.add_argument("--debug", default=False, action="store_true", help="Turn on verbose logging")
     parser.add_argument("--mock", default=False, action="store_true", help="Use MOCK repository configuration for testing")
-    parser.add_argument("--working_path", default=None, help="Working path for temporary files")
-    parser.add_argument("--use_cache", default=False, action="store_true", help="Use cache files from remote resources")
+    parser.add_argument("--cache_path", default=None, help="Path containing cache directories")
+    # parser.add_argument("--use_cache", default=False, action="store_true", help="Use cache files from remote resources")
+    parser.add_argument("--rebuild_cache", default=False, action="store_true", help="Rebuild cached resource files")
+    # parser.add_argument("--rebuild_schema", default=False, action="store_true", help="Rebuild schema on-the-fly if not cached")
     #
     #
     args = parser.parse_args()
@@ -93,7 +109,7 @@ def main():
     #                                       Configuration Details
     configPath = args.config_path
     configName = args.config_name
-    useCache = args.use_cache
+    # useCache = args.use_cache
     if not configPath:
         configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
     try:
@@ -126,8 +142,10 @@ def main():
         loadType = "full" if args.full else "replace"
         # loadType = 'replace' if args.replace else 'full'
 
-        workPath = args.working_path if args.working_path else "."
-
+        cachePath = args.cache_path if args.cache_path else "."
+        rebuildCache = args.rebuild_cache if args.rebuild_cache else False
+        # rebuildSchemaFlag = args.rebuild_schema if args.rebuild_schema else False
+        #
         # if args.document_style not in ['rowwise_by_name', 'rowwise_by_name_with_cardinality', 'columnwise_by_name', 'rowwise_by_id', 'rowwise_no_name']:
         #    logger.error("Unsupported document style %s" % args.document_style)
 
@@ -138,41 +156,38 @@ def main():
         parser.print_help(sys.stderr)
         exit(1)
     # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
+    #  Rebuild or check resource cache
+    ok = buildResourceCache(cfgOb, configName, cachePath, rebuildCache=rebuildCache)
+    if not ok:
+        logger.error("Cache rebuild or check failure (rebuild %r) %r", rebuildCache, cachePath)
+        exit(1)
     ##
     if args.db_type == "mongo":
         if args.etl_entity_sequence_clusters:
             cw = SequenceClustersEtlWorker(
-                cfgOb, numProc=numProc, chunkSize=chunkSize, documentLimit=documentLimit, verbose=debugFlag, readBackCheck=readBackCheck, workPath=workPath
+                cfgOb, numProc=numProc, chunkSize=chunkSize, documentLimit=documentLimit, verbose=debugFlag, readBackCheck=readBackCheck, workPath=cachePath
             )
             ok = cw.etl(dataSetId, seqDataLocator, loadType=loadType)
-            okS = loadStatus(cw.getLoadStatus(), cfgOb, readBackCheck=readBackCheck)
+            okS = loadStatus(cw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
 
         if args.etl_repository_holdings:
             rhw = RepoHoldingsEtlWorker(
-                cfgOb, sandboxPath=sandboxPath, numProc=numProc, chunkSize=chunkSize, documentLimit=documentLimit, verbose=debugFlag, readBackCheck=readBackCheck, workPath=workPath
-            )
-            ok = rhw.load(dataSetId, loadType=loadType)
-            okS = loadStatus(rhw.getLoadStatus(), cfgOb, readBackCheck=readBackCheck)
-
-        if args.etl_chemref:
-            crw = ChemRefEtlWorker(cfgOb, numProc=numProc, chunkSize=chunkSize, documentLimit=documentLimit, verbose=debugFlag, readBackCheck=readBackCheck, workPath=workPath)
-            ok = crw.load(dataSetId, extResource="DrugBank", loadType=loadType)
-            okS = loadStatus(crw.getLoadStatus(), cfgOb, readBackCheck=readBackCheck)
-
-        if args.etl_tree_node_lists:
-            rhw = TreeNodeListWorker(
-                cfgOb,
-                mockTopPath=mockTopPath,
-                numProc=numProc,
-                chunkSize=chunkSize,
-                documentLimit=documentLimit,
-                verbose=debugFlag,
-                readBackCheck=readBackCheck,
-                workPath=workPath,
-                useCache=useCache,
+                cfgOb, sandboxPath, cachePath, numProc=numProc, chunkSize=chunkSize, documentLimit=documentLimit, verbose=debugFlag, readBackCheck=readBackCheck
             )
             ok = rhw.load(dataSetId, loadType=loadType)
-            okS = loadStatus(rhw.getLoadStatus(), cfgOb, readBackCheck=readBackCheck)
+            okS = loadStatus(rhw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
+
+        # if args.etl_chemref:
+        #    crw = ChemRefEtlWorker(cfgOb, cachePath=cachePath, numProc=numProc, chunkSize=chunkSize, documentLimit=documentLimit, verbose=debugFlag, readBackCheck=readBackCheck)
+        #    ok = crw.load(dataSetId, extResource="DrugBank", loadType=loadType)
+        #    okS = loadStatus(crw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
+
+        # if args.etl_tree_node_lists:
+        #    rhw = TreeNodeListWorker(
+        #        cfgOb, cachePath, numProc=numProc, chunkSize=chunkSize, documentLimit=documentLimit, verbose=debugFlag, readBackCheck=readBackCheck, useCache=useCache
+        #    )
+        #    ok = rhw.load(dataSetId, loadType=loadType)
+        #    okS = loadStatus(rhw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
         logger.info("Operation completed with status %r " % ok and okS)
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,6 +3,7 @@ @@
     .env/
     test-output/
     !test-output/.gitkeep
+    CACHE/
     LOGTOX
     # Byte-compiled / optimized / DLL files
@@ Expand Down @@