From 4945dd711d8b4c32f5d90ac64b3d4bfbd772a560 Mon Sep 17 00:00:00 2001 From: jwest Date: Mon, 15 Feb 2021 10:24:36 -0500 Subject: [PATCH] V0.11 Created module with content migrated from rcsb.db --- .gitignore | 5 +- .gitmodules | 3 + HISTORY.txt | 3 + MANIFEST.in | 5 + README.md | 45 +- azure-pipelines.yml | 45 + azure-template-publish-job.yml | 61 + azure-template-tox-job.yml | 202 + pylintrc | 588 +++ rcsb/__init__.py | 1 + rcsb/mock-data | 1 + rcsb/utils/__init__.py | 1 + .../dictionary/DictMethodAssemblyHelper.py | 746 +++ .../dictionary/DictMethodChemRefHelper.py | 937 ++++ .../utils/dictionary/DictMethodCommonUtils.py | 3994 +++++++++++++++++ .../dictionary/DictMethodEntityHelper.py | 1960 ++++++++ .../DictMethodEntityInstanceHelper.py | 1776 ++++++++ .../utils/dictionary/DictMethodEntryHelper.py | 1214 +++++ .../dictionary/DictMethodResourceProvider.py | 393 ++ rcsb/utils/dictionary/DictMethodRunner.py | 227 + .../utils/dictionary/DictionaryApiProvider.py | 113 + .../DictionaryApiProviderWrapper.py | 74 + rcsb/utils/dictionary/__init__.py | 5 + rcsb/utils/tests-dictionary/__init__.py | 0 .../tests-dictionary/testDictMethodRunner.py | 129 + .../testDictionaryApiProvider.py | 87 + .../testDictionaryApiProviderWrapper.py | 106 + requirements.txt | 11 + setup.cfg | 7 + setup.py | 71 + tox.ini | 170 + 31 files changed, 12977 insertions(+), 3 deletions(-) create mode 100644 .gitmodules create mode 100644 HISTORY.txt create mode 100644 MANIFEST.in create mode 100644 azure-pipelines.yml create mode 100644 azure-template-publish-job.yml create mode 100644 azure-template-tox-job.yml create mode 100644 pylintrc create mode 100644 rcsb/__init__.py create mode 160000 rcsb/mock-data create mode 100644 rcsb/utils/__init__.py create mode 100644 rcsb/utils/dictionary/DictMethodAssemblyHelper.py create mode 100644 rcsb/utils/dictionary/DictMethodChemRefHelper.py create mode 100644 rcsb/utils/dictionary/DictMethodCommonUtils.py create mode 100644 rcsb/utils/dictionary/DictMethodEntityHelper.py create mode 100644 rcsb/utils/dictionary/DictMethodEntityInstanceHelper.py create mode 100644 rcsb/utils/dictionary/DictMethodEntryHelper.py create mode 100644 rcsb/utils/dictionary/DictMethodResourceProvider.py create mode 100644 rcsb/utils/dictionary/DictMethodRunner.py create mode 100644 rcsb/utils/dictionary/DictionaryApiProvider.py create mode 100644 rcsb/utils/dictionary/DictionaryApiProviderWrapper.py create mode 100644 rcsb/utils/dictionary/__init__.py create mode 100644 rcsb/utils/tests-dictionary/__init__.py create mode 100644 rcsb/utils/tests-dictionary/testDictMethodRunner.py create mode 100644 rcsb/utils/tests-dictionary/testDictionaryApiProvider.py create mode 100644 rcsb/utils/tests-dictionary/testDictionaryApiProviderWrapper.py create mode 100644 requirements.txt create mode 100755 setup.cfg create mode 100755 setup.py create mode 100644 tox.ini diff --git a/.gitignore b/.gitignore index b6e4761..be2084b 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,10 @@ __pycache__/ *.py[cod] *$py.class - +test-output/ +CACHE/ +LOGTOX +.tox/ # C extensions *.so diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..ef7431d --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "rcsb/mock-data"] + path = rcsb/mock-data + url = https://github.com/rcsb/py-rcsb_mock-data.git diff --git a/HISTORY.txt b/HISTORY.txt new file mode 100644 index 0000000..db7c475 --- /dev/null +++ b/HISTORY.txt @@ -0,0 +1,3 @@ +# File: HISTORY.txt +# +14-Feb-2021 - V0.11 Created module with content migrated 
from rcsb.db
\ No newline at end of file
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..d36a05f
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,5 @@
+#
+# File: py-rcsb_utils_dictionary/MANIFEST.in
+#
+include HISTORY.txt
+#
diff --git a/README.md b/README.md
index 850eac0..fc1b546 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,43 @@
-# py-rcsb_utils_dictionary
-RCSB dictionary method utilities
+# RCSB Dictionary Method Utilities
+
+## A collection of Python Dictionary Method Utilities
+
+## Introduction
+
+This module contains a collection of utility classes for defining and managing the execution of
+dynamic dictionary methods.
+
+### Installation
+
+Install via [pip](https://pypi.python.org/pypi/pip).
+
+```bash
+pip install rcsb.utils.dictionary
+```
+
+Or, to install from source, clone the repository (including its submodules):
+
+```bash
+
+git clone --recurse-submodules https://github.com/rcsb/py-rcsb_utils_dictionary.git
+
+```
+
+Optionally, run the test suite (Python versions 3.7+) using
+[setuptools](https://setuptools.readthedocs.io/en/latest/) or
+[tox](http://tox.readthedocs.io/en/latest/example/platform.html):
+
+```bash
+python setup.py test
+
+# or simply run
+
+tox
+```
+
+To run tests from the source tree, the package must be installed in editable mode
+(i.e. with the -e option of [pip](https://pypi.python.org/pypi/pip)):
+
+```bash
+pip install -e .
+```
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
new file mode 100644
index 0000000..060822c
--- /dev/null
+++ b/azure-pipelines.yml
@@ -0,0 +1,45 @@
+# File: azure-pipelines.yml
+# Date: 4-Jul-2019 jdw Created pipeline
+#       8-Jul-2019 jdw add macos test/publish
+#       8-Jul-2019 jdw templatize
+#       9-Jul-2019 jdw add publish jobs
+#      21-Jul-2019 jdw update to Py38
+#      10-Mar-2020 jdw py38 only
+#      10-Dec-2020 jdw py39 only
+#
+name: $(BuildDefinitionName)_$(Date:yyyyMMdd)$(Rev:.rr)
+
+trigger:
+- master
+- utilsdict-*
+
+pr:
+- master
+
+schedules:
+- cron: "0 12 * * 0"
+  displayName: Weekly Sunday build
+  branches:
+    include:
+    - master
+  always: true
+
+jobs:
+  - template: azure-template-tox-job.yml
+    parameters: {tox: 'format_pep8', python: '3.9', os: 'linux'}
+  - template: azure-template-tox-job.yml
+    parameters: {tox: 'lint_pylint', python: '3.9', os: 'linux'}
+  - template: azure-template-tox-job.yml
+    parameters: {tox: 'test_coverage', python: '3.9', os: 'linux'}
+  #
+  - template: azure-template-tox-job.yml
+    parameters: {tox: 'py39', python: '3.9', os: 'linux'}
+  #
+  - template: azure-template-tox-job.yml
+    parameters: {tox: 'py39', python: '3.9', os: 'macos'}
+  #
+  #- template: azure-template-publish-job.yml
+  #  parameters: {tox: 'py39', python: '3.9', os: 'macos'}
+  - template: azure-template-publish-job.yml
+    parameters: {tox: 'py39', python: '3.9', os: 'linux'}
+  #
diff --git a/azure-template-publish-job.yml b/azure-template-publish-job.yml
new file mode 100644
index 0000000..e5cb74e
--- /dev/null
+++ b/azure-template-publish-job.yml
@@ -0,0 +1,61 @@
+# File: azure-template-publish-job.yml
+# Date: 8-Jul-2019 jdw split out from original pipeline
+#
+# Update:
+#   6-Aug-2019 jdw push source wheels (universal) for linux
+#
+##
+parameters:
+  tox: ""
+  python: ""
+  os: "linux"
+  fixtures: ""
+
+jobs:
+- job: ${{ format('publish_{0}_{1}', parameters.tox, parameters.os) }}
+  pool:
+    ${{ if eq(parameters.os, 'macos') }}:
+      vmImage: 'macOS-10.15'
+    ${{ if eq(parameters.os, 'linux') }}:
+      vmImage: 'ubuntu-20.04'
+  dependsOn:
+  - ${{ format('build_test_{0}_{1}',
parameters.tox, parameters.os) }} + condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest'), eq(variables['Build.SourceBranch'], 'refs/heads/master')) + # + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: ${{ parameters.python }} + addToPath: true + displayName: setup python + # + #- checkout: self + # submodules: true + # + - download: current + artifact: ${{ format('sw_{0}_{1}', parameters.tox, parameters.os) }} + + - download: current + artifact: ${{ format('sw_u_{0}_{1}', parameters.tox, parameters.os) }} + # + - script: ls -lR $(Pipeline.Workspace)/${{ format('sw_{0}_{1}', parameters.tox, parameters.os) }} + displayName: "Listing of downloaded artifacts" + # + - script: python -m pip install --upgrade pip twine setuptools wheel + displayName: 'Install packaging tools' + # + - task: DownloadSecureFile@1 + name: pypicred + displayName: 'Download PyPI credentials' + inputs: + secureFile: 'PYPIRC-AZURE' + + - ${{ if startsWith(parameters.os, 'linux') }}: + - script: twine upload --verbose --skip-existing -r pypi --config-file $(pypicred.secureFilePath) $(Pipeline.Workspace)/${{ format('sw_u_{0}_{1}', parameters.tox, parameters.os) }}/* + displayName: "Linux upload sdist and source wheel to PyPi ..." + continueOnError: true + # + - ${{ if startsWith(parameters.os, 'macos') }}: + - script: twine upload --verbose --skip-existing -r pypi --config-file $(pypicred.secureFilePath) $(Pipeline.Workspace)/${{ format('sw_{0}_{1}', parameters.tox, parameters.os) }}/* + displayName: "Mac upload sdist and binary wheel to PyPi ..." + continueOnError: true \ No newline at end of file diff --git a/azure-template-tox-job.yml b/azure-template-tox-job.yml new file mode 100644 index 0000000..d8c1e1a --- /dev/null +++ b/azure-template-tox-job.yml @@ -0,0 +1,202 @@ +# File: azure-template-tox-job.yml +# Date: 8-Jul-2019 jdw split out from original pipeline +# Supports: fixtures=mysql,mongodb (linux) +# +# Updates: +# 6-Aug-2019 jdw build source and binary wheels by default. 
+#  13-Aug-2019 jdw export config support token prior to launching tox runner
+#
+##
+parameters:
+  tox: ""
+  python: ""
+  os: "linux"
+  fixtures: ""
+
+jobs:
+- job: ${{ format('build_test_{0}_{1}', parameters.tox, parameters.os) }}
+  timeoutInMinutes: 0
+  pool:
+    ${{ if eq(parameters.os, 'macos') }}:
+      vmImage: 'macOS-10.15'
+    ${{ if eq(parameters.os, 'linux') }}:
+      vmImage: 'ubuntu-20.04'
+
+  variables:
+    - group: py-shared-variables
+
+  steps:
+    #
+    # ensure the required Python versions are available
+    - task: UsePythonVersion@0
+      inputs:
+        versionSpec: ${{ parameters.python }}
+        addToPath: true
+      displayName: setup python
+    #
+    - checkout: self
+      submodules: true
+    #
+    - ${{ if startsWith(parameters.os, 'macos') }}:
+      - bash: |
+          set -e
+          ls -la /Applications/Xcode*
+          sudo xcode-select --switch /Applications/Xcode_12.2.app/Contents/Developer
+          which g++
+          c++ --version
+        displayName: "setup Xcode"
+      #
+      - script: which brew
+        displayName: 'Check package manager'
+      - script: brew install flex
+        displayName: 'Install flex'
+      - script: which flex
+        displayName: 'Check flex'
+      - script: brew install bison
+        displayName: 'Install bison'
+      - script: which bison
+        displayName: 'Check bison'
+    # ----------------------------------------------
+    - ${{ if startsWith(parameters.os, 'linux') }}:
+      - script: lsb_release -a
+        displayName: 'Ubuntu kernel version'
+      - script: which apt
+        displayName: 'Check package manager'
+      - script: apt-cache policy | grep http | awk '{print $2 $3}' | sort -u
+        displayName: 'Checking for repos'
+      #
+      - script: sudo apt-get update
+        displayName: 'update apt'
+      #- script: sudo apt-get upgrade
+      #  displayName: 'upgrade apt'
+      #- script: sudo apt-get update
+      #  displayName: 'update apt'
+      - script: sudo apt-get install flex
+        displayName: 'Install flex'
+      - script: sudo apt-get install bison
+        displayName: 'Install bison'
+    #
+    - ${{ if and(contains(parameters.fixtures, 'mysql'), startsWith(parameters.os, 'linux')) }}:
+      - bash: |
+          sudo apt-get install python3-dev mysql-server
+          sudo apt-get install default-libmysqlclient-dev
+          sudo apt-get install python-mysqldb
+          sudo apt list --installed | grep -i mysql
+        displayName: 'Install mysql development libraries'
+      - bash: |
+          echo "Restarting mysql service"
+          sudo systemctl restart mysql.service
+          mysql -V
+          mysql --user=root --password=root -e "use mysql; select * from user;"
+          #
+          echo "Try resetting password"
+          mysqladmin --user=root --password=root password 'ChangeMeSoon'
+          #
+          # mysql -u root -p root -e "SET PASSWORD FOR root@'localhost' = PASSWORD('ChangeMeSoon');"
+          # mysql -u root -p root -e "FLUSH PRIVILEGES; update mysql.user set password=password('ChangeMeSoon') where user='root'; FLUSH PRIVILEGES;"
+          # UPDATE mysql.user SET Password=PASSWORD('ChangeMeSoon') WHERE User='root';
+
+          echo "Running preliminary mysql setup"
+          mysql --user=root --password=ChangeMeSoon <<_EOF_
+          DELETE FROM mysql.user WHERE User='';
+          DELETE FROM mysql.user WHERE User='root' AND Host NOT IN ('localhost', '127.0.0.1', '::1');
+          DROP DATABASE IF EXISTS test;
+          DELETE FROM mysql.db WHERE Db='test' OR Db='test\\_%';
+          FLUSH PRIVILEGES;
+          _EOF_
+          ps -ef | grep -i my
+          mysql --user=root --password=ChangeMeSoon -e "show databases;"
+          #
+        displayName: 'Start and configure mysql ...'
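+      # A minimal credential sanity check could follow here (hypothetical extra
+      # step, not part of this pipeline as committed):
+      # - script: mysql --user=root --password=ChangeMeSoon -e "SELECT VERSION();"
+      #   displayName: 'Verify mysql root credentials'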
+ # ----- + + - ${{ if and(contains(parameters.fixtures, 'mongodb'), startsWith(parameters.os, 'linux')) }}: + # Mongo install + - script: | + sudo apt-get install gnupg wget + wget -qO - https://www.mongodb.org/static/pgp/server-4.2.asc | sudo apt-key add - + # sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 9DA31620334BD75D9DCB49F368818C72E52529D4 + sudo apt list --installed | grep mongodb + echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu bionic/mongodb-org/4.2 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-4.2.list + # echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/4.2 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-4.2.list + sudo apt-get update + sudo apt-get install -y mongodb-org + sudo apt list --installed | grep mongo + displayName: "Installing mongodb" + # + - script: | + sudo service mongod start + sudo ss -tulpn + displayName: "Start Mongo service" + # + # + - script: "python -c \"import sys; print(sys.version); print(sys.executable)\"" + displayName: show python information + # + - script: python -m pip install --upgrade pip tox + displayName: 'Install tools' + # + - script: pip install -r requirements.txt + displayName: 'Install dependencies' + # + # + - task: DownloadSecureFile@1 + name: oelicense + displayName: 'Download OE license file' + inputs: + secureFile: 'oe_license.txt' + + - ${{ if startsWith(parameters.tox, 'py') }}: + - script: | + export OE_LICENSE=$(oelicense.secureFilePath) + export CONFIG_SUPPORT_TOKEN_ENV=$(VAR_CONFIG_SUPPORT_TOKEN_ENV) + ${{ format('python -m tox -e {0}', parameters.tox) }} + displayName: 'Running tox task' + - ${{ if and(not(startsWith(parameters.tox, 'py')), startsWith(parameters.python, '3.9')) }}: + - script: | + export OE_LICENSE=$(oelicense.secureFilePath) + export CONFIG_SUPPORT_TOKEN_ENV=$(VAR_CONFIG_SUPPORT_TOKEN_ENV) + ${{ format('python -m tox -e {0}-py39', parameters.tox) }} + displayName: 'Running tox task' + - ${{ if and(not(startsWith(parameters.tox, 'py')), startsWith(parameters.python, '3.8')) }}: + - script: | + export OE_LICENSE=$(oelicense.secureFilePath) + export CONFIG_SUPPORT_TOKEN_ENV=$(VAR_CONFIG_SUPPORT_TOKEN_ENV) + ${{ format('python -m tox -e {0}-py38', parameters.tox) }} + displayName: 'Running tox task' + - ${{ if and(not(startsWith(parameters.tox, 'py')), startsWith(parameters.python, '3.7')) }}: + - script: | + export OE_LICENSE=$(oelicense.secureFilePath) + export CONFIG_SUPPORT_TOKEN_ENV=$(VAR_CONFIG_SUPPORT_TOKEN_ENV) + ${{ format('python -m tox -e {0}-py37', parameters.tox) }} + displayName: 'Running tox task' + - ${{ if and(not(startsWith(parameters.tox, 'py')), startsWith(parameters.python, '2.7')) }}: + - script: | + export OE_LICENSE=$(oelicense.secureFilePath) + export CONFIG_SUPPORT_TOKEN_ENV=$(VAR_CONFIG_SUPPORT_TOKEN_ENV) + ${{ format('python -m tox -e {0}-py27', parameters.tox) }} + displayName: 'Runing tox task' + # + # Build artifacts if this is a test target (i.e. 
labeled as py##) + # + - ${{ if startsWith(parameters.tox, 'py') }}: + - script: pip install --upgrade pip twine setuptools wheel + displayName: "Acquire build tools" + - script: python setup.py sdist --dist-dir "$(System.DefaultWorkingDirectory)/dist" + displayName: "Build source dist" + - script: python setup.py bdist_wheel --dist-dir "$(System.DefaultWorkingDirectory)/dist" + displayName: "Build wheel" + # + - script: python setup.py sdist --dist-dir "$(System.DefaultWorkingDirectory)/udist" + displayName: "Build source dist" + # + # Check the install artifacts + - script: ls -lR "$(System.DefaultWorkingDirectory)/dist" "$(System.DefaultWorkingDirectory)/udist" + displayName: "Listing of installed software" + # + - publish: $(System.DefaultWorkingDirectory)/dist + artifact: ${{ format('sw_{0}_{1}', parameters.tox, parameters.os) }} + # + - publish: $(System.DefaultWorkingDirectory)/udist + artifact: ${{ format('sw_u_{0}_{1}', parameters.tox, parameters.os) }} + # diff --git a/pylintrc b/pylintrc new file mode 100644 index 0000000..6058ff4 --- /dev/null +++ b/pylintrc @@ -0,0 +1,588 @@ +[MASTER] + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. +extension-pkg-whitelist=MySQLdb,rdkit + +# Add files or directories to the blacklist. They should be base names, not +# paths. +ignore=CVS + +# Add files or directories matching the regex patterns to the blacklist. The +# regex matches against base names, not paths. +ignore-patterns= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use. +jobs=1 + +# Control the amount of potential inferred values when inferring a single +# object. This can help the performance when dealing with large functions or +# complex, nested conditions. +limit-inference-results=100 + +# List of plugins (as comma separated values of python modules names) to load, +# usually to register additional checkers. +load-plugins= + +# Pickle collected data for later comparisons. +persistent=yes + +# Specify a configuration file. +#rcfile= + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode=yes + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. +confidence= + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once). You can also use "--disable=all" to +# disable everything first and then reenable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". 
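+# For example, a one-off run of only the similarities checker from the command
+# line (illustrative invocation; the list below is what this project disables):
+#   pylint --disable=all --enable=similarities rcsb/utils/dictionary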
+disable=missing-docstring, + empty-docstring, + bad-continuation, + print-statement, + parameter-unpacking, + unpacking-in-except, + old-raise-syntax, + backtick, + long-suffix, + old-ne-operator, + old-octal-literal, + import-star-module-level, + non-ascii-bytes-literal, + raw-checker-failed, + bad-inline-option, + locally-disabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead, + broad-except, + apply-builtin, + basestring-builtin, + buffer-builtin, + cmp-builtin, + coerce-builtin, + execfile-builtin, + file-builtin, + long-builtin, + raw_input-builtin, + reduce-builtin, + standarderror-builtin, + unicode-builtin, + xrange-builtin, + coerce-method, + delslice-method, + getslice-method, + setslice-method, + no-absolute-import, + old-division, + dict-iter-method, + dict-view-method, + next-method-called, + metaclass-assignment, + indexing-exception, + raising-string, + reload-builtin, + oct-method, + hex-method, + nonzero-method, + cmp-method, + input-builtin, + round-builtin, + intern-builtin, + unichr-builtin, + map-builtin-not-iterating, + zip-builtin-not-iterating, + range-builtin-not-iterating, + filter-builtin-not-iterating, + using-cmp-argument, + eq-without-hash, + div-method, + idiv-method, + rdiv-method, + exception-message-attribute, + invalid-str-codec, + sys-max-int, + bad-python3-import, + deprecated-string-function, + deprecated-str-translate-call, + deprecated-itertools-function, + deprecated-types-field, + next-method-defined, + dict-items-not-iterating, + dict-keys-not-iterating, + dict-values-not-iterating, + deprecated-operator-function, + deprecated-urllib-function, + xreadlines-attribute, + deprecated-sys-function, + exception-escape, + comprehension-escape + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable=c-extension-no-member + + +[REPORTS] + +# Python expression which should return a note less than 10 (10 is the highest +# note). You have access to the variables errors warning, statement which +# respectively contain the number of errors / warnings messages and the total +# number of statements analyzed. This is used by the global evaluation report +# (RP0004). +evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details. +#msg-template= + +# Set the output format. Available formats are text, parseable, colorized, json +# and msvs (visual studio). You can also give a reporter class, e.g. +# mypackage.mymodule.MyReporterClass. +output-format=text + +# Tells whether to display a full report or only the messages. +reports=no + +# Activate the evaluation score. +score=yes + + +[REFACTORING] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 + +# Complete name of functions that never returns. When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=sys.exit + + +[LOGGING] + +# Format style used to check logging format string. 
`old` means using % +# formatting, while `new` is for `{}` formatting. +logging-format-style=old + +# Logging modules to check that the string format arguments are in logging +# function parameter format. +logging-modules=logging + + +[SPELLING] + +# Limits count of emitted suggestions for spelling mistakes. +max-spelling-suggestions=4 + +# Spelling dictionary name. Available dictionaries: none. To make it working +# install python-enchant package.. +spelling-dict= + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to indicated private dictionary in +# --spelling-private-dict-file option instead of raising a message. +spelling-store-unknown-words=no + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME, + XXX, + TODO + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + +# Tells whether missing members accessed in mixin class should be ignored. A +# mixin class is detected if its name ends with "mixin" (case insensitive). +ignore-mixin-members=yes + +# Tells whether to warn about missing members when the owner of the attribute +# is inferred to be None. +ignore-none=yes + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference +# can return multiple potential results while evaluating a Python object, but +# some branches might not be evaluated, which results in partial inference. In +# that case, it might be useful to still emit no-member and other checks for +# the rest of the inferred objects. +ignore-on-opaque-inference=yes + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis. It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# Show a hint with possible names when a member name was not found. The aspect +# of finding the hint is based on edit distance. +missing-member-hint=yes + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance=1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. +missing-member-max-choices=1 + + +[VARIABLES] + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid defining new builtins when possible. +additional-builtins= + +# Tells whether unused global variables should be treated as a violation. 
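+# (with 'yes', e.g., a module-level name that is assigned only for export is
+# not reported as unused)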
+allow-global-unused-variables=yes + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_, + _cb + +# A regular expression matching the name of dummy variables (i.e. expected to +# not be used). +dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ + +# Argument names that match this expression will be ignored. Default to name +# with leading underscore. +ignored-argument-names=_.*|^ignored_|^unused_ + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=185 + +# Maximum number of lines in a module. +max-module-lines=1000 + +# List of optional constructs for which whitespace checking is disabled. `dict- +# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. +# `trailing-comma` allows a space between comma and closing bracket: (a, ). +# `empty-line` allows space-only lines. +no-space-check=trailing-comma, + dict-separator + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[SIMILARITIES] + +# Ignore comments when computing similarities. +ignore-comments=yes + +# Ignore docstrings when computing similarities. +ignore-docstrings=yes + +# Ignore imports when computing similarities. +ignore-imports=no + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[BASIC] + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Regular expression matching correct argument names. Overrides argument- +# naming-style. +argument-rgx=[a-z_][a-zA-Z0-9]{1,30}$ + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Regular expression matching correct attribute names. Overrides attr-naming- +# style. +attr-rgx=_?_?[a-z][A-Za-z0-9]{1,40}$ + +# Bad variable names which should always be refused, separated by a comma. +bad-names=foo, + bar, + baz, + toto, + tutu, + tata + +# Naming style matching correct class attribute names. +class-attribute-naming-style=camelCase + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style. +class-attribute-rgx=_?_?[a-z][A-Za-z0-9]{1,40}$ + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Regular expression matching correct class names. Overrides class-naming- +# style. +#class-rgx= + +# Naming style matching correct constant names. +const-naming-style=any + +# Regular expression matching correct constant names. Overrides const-naming- +# style. +#const-rgx= + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. 
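+# (-1 disables the length exemption, so docstring checks apply regardless of size)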
+docstring-min-length=-1 + +# Naming style matching correct function names. +function-naming-style=camelCase + +# Regular expression matching correct function names. Overrides function- +# naming-style. +#function-rgx= + +# Good variable names which should always be accepted, separated by a comma. +good-names=_, + i, + j, + k, + v, + ii, + jj, + kk, +# t, +# c, +# d, + e, +# r, +# s, +# v, +# p, +# ts, +# tS, + ok, + logger + +# Include a hint for the correct naming format with invalid-name. +include-naming-hint=no + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style. +#inlinevar-rgx= + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Regular expression matching correct method names. Overrides method-naming- +# style. +method-rgx=_?_?[a-z][A-Za-z0-9]{1,40}_?_?$ + +# Naming style matching correct module names. +module-naming-style=any + +# Regular expression matching correct module names. Overrides module-naming- +# style. +#module-rgx= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=^_ + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +# These decorators are taken in consideration only for invalid-name. +property-classes=abc.abstractproperty + +# Naming style matching correct variable names. +variable-naming-style=snake_case + +# Regular expression matching correct variable names. Overrides variable- +# naming-style. +variable-rgx=[a-z_][a-zA-Z0-9]{1,40}$ + + +[STRING] + +# This flag controls whether the implicit-str-concat-in-sequence should +# generate a warning on implicit string concatenation in sequences defined over +# several lines. +check-str-concat-over-line-jumps=no + + +[IMPORTS] + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Deprecated modules which should not be used, separated by a comma. +deprecated-modules=optparse,tkinter.tix + +# Create a graph of external dependencies in the given file (report RP0402 must +# not be disabled). +ext-import-graph= + +# Create a graph of every (i.e. internal and external) dependencies in the +# given file (report RP0402 must not be disabled). +import-graph= + +# Create a graph of internal dependencies in the given file (report RP0402 must +# not be disabled). +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + + +[CLASSES] + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp + +# List of member names, which should be excluded from the protected access +# warning. 
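+# (the default exclusions below are namedtuple internals that are legitimately
+# accessed from outside the class)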
+exclude-protected=_asdict, + _fields, + _replace, + _source, + _make + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=cls + + +[DESIGN] + +# Maximum number of arguments for function / method. +max-args=5 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Maximum number of boolean expressions in an if statement. +max-bool-expr=5 + +# Maximum number of branch for function / method body. +max-branches=12 + +# Maximum number of locals for function / method body. +max-locals=15 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of return / yield for function / method body. +max-returns=6 + +# Maximum number of statements in function / method body. +max-statements=50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when being caught. Defaults to +# "BaseException, Exception". +overgeneral-exceptions=BaseException, + Exception diff --git a/rcsb/__init__.py b/rcsb/__init__.py new file mode 100644 index 0000000..8db66d3 --- /dev/null +++ b/rcsb/__init__.py @@ -0,0 +1 @@ +__path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/rcsb/mock-data b/rcsb/mock-data new file mode 160000 index 0000000..8bfc542 --- /dev/null +++ b/rcsb/mock-data @@ -0,0 +1 @@ +Subproject commit 8bfc542c445225d48d6f98f6b5f6099caf48dc46 diff --git a/rcsb/utils/__init__.py b/rcsb/utils/__init__.py new file mode 100644 index 0000000..8db66d3 --- /dev/null +++ b/rcsb/utils/__init__.py @@ -0,0 +1 @@ +__path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/rcsb/utils/dictionary/DictMethodAssemblyHelper.py b/rcsb/utils/dictionary/DictMethodAssemblyHelper.py new file mode 100644 index 0000000..0f590f3 --- /dev/null +++ b/rcsb/utils/dictionary/DictMethodAssemblyHelper.py @@ -0,0 +1,746 @@ +## +# File: DictMethodAssemblyHelper.py +# Author: J. Westbrook +# Date: 16-Jul-2019 +# Version: 0.001 Initial version +# +## +""" +Helper class implementing external assembly-level methods supporting the RCSB dictionary extension. + +""" +__docformat__ = "restructuredtext en" +__author__ = "John Westbrook" +__email__ = "jwest@rcsb.rutgers.edu" +__license__ = "Apache 2.0" + +import logging +import re +from collections import Counter + +from mmcif.api.DataCategory import DataCategory + +logger = logging.getLogger(__name__) + + +def cmpElements(lhs, rhs): + return 0 if (lhs[-1].isdigit() or lhs[-1] in ["R", "S"]) and rhs[0].isdigit() else -1 + + +class DictMethodAssemblyHelper(object): + """Helper class implementing external assembly-level methods supporting the RCSB dictionary extension.""" + + def __init__(self, **kwargs): + """ + Args: + **kwargs: (dict) Placeholder for future key-value arguments + + """ + # + self._raiseExceptions = kwargs.get("raiseExceptions", False) + # + rP = kwargs.get("resourceProvider") + self.__commonU = rP.getResource("DictMethodCommonUtils instance") if rP else None + self.__dApi = rP.getResource("Dictionary API instance (pdbx_core)") if rP else None + # + logger.debug("Dictionary method helper init") + + def echo(self, msg): + logger.info(msg) + + def addAssemblyInfo(self, dataContainer, catName, **kwargs): + """Build rcsb_assembly_info category. 
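+
+        Aggregates the per-assembly atom, monomer, instance, and entity counts
+        computed by __getAssemblyComposition() into one summary row per assembly.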
+ + Args: + dataContainer (object): mmif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + """ + logger.debug("Starting catName %s kwargs %r", catName, kwargs) + try: + if not (dataContainer.exists("entry") and dataContainer.exists("pdbx_struct_assembly")): + return False + logger.debug("%s beginning for %s", dataContainer.getName(), catName) + # Create the new target category rcsb_assembly_info + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + # + # + logger.debug("%s beginning for %s", dataContainer.getName(), catName) + # + # Get assembly comp details - + # + rD = self.__getAssemblyComposition(dataContainer) + # + cObj = dataContainer.getObj(catName) + + tObj = dataContainer.getObj("entry") + entryId = tObj.getValue("id", 0) + # + tObj = dataContainer.getObj("pdbx_struct_assembly") + assemblyIdL = tObj.getAttributeValueList("id") + # + # + for ii, assemblyId in enumerate(assemblyIdL): + if assemblyId not in rD["assemblyHeavyAtomCountByTypeD"]: + continue + if assemblyId not in rD["assemblyHeavyAtomCountD"]: + continue + dD = rD["assemblyHeavyAtomCountByTypeD"][assemblyId] + # + cObj.setValue(entryId, "entry_id", ii) + cObj.setValue(assemblyId, "assembly_id", ii) + # + + num = dD["polymer"] if "polymer" in dD else 0 + cObj.setValue(num, "polymer_atom_count", ii) + + num = dD["non-polymer"] if "non-polymer" in dD else 0 + cObj.setValue(num, "nonpolymer_atom_count", ii) + + num = dD["water"] if "water" in dD else 0 + cObj.setValue(num, "solvent_atom_count", ii) + + num = dD["branched"] if "branched" in dD else 0 + cObj.setValue(num, "branched_atom_count", ii) + + num = rD["assemblyHeavyAtomCountD"][assemblyId] + cObj.setValue(num, "atom_count", ii) + # + num = rD["assemblyHydrogenAtomCountD"][assemblyId] + cObj.setValue(num, "hydrogen_atom_count", ii) + # + num1 = rD["assemblyModeledMonomerCountD"][assemblyId] + num2 = rD["assemblyUnmodeledMonomerCountD"][assemblyId] + cObj.setValue(num1, "modeled_polymer_monomer_count", ii) + cObj.setValue(num2, "unmodeled_polymer_monomer_count", ii) + cObj.setValue(num1 + num2, "polymer_monomer_count", ii) + # + dD = rD["assemblyPolymerClassD"][assemblyId] + cObj.setValue(dD["polymerCompClass"], "polymer_composition", ii) + cObj.setValue(dD["subsetCompClass"], "selected_polymer_entity_types", ii) + cObj.setValue(dD["naCompClass"], "na_polymer_entity_types", ii) + # + dD = rD["assemblyInstanceCountByTypeD"][assemblyId] + num = dD["polymer"] if "polymer" in dD else 0 + cObj.setValue(num, "polymer_entity_instance_count", ii) + # + num = dD["non-polymer"] if "non-polymer" in dD else 0 + cObj.setValue(num, "nonpolymer_entity_instance_count", ii) + # + num = dD["branched"] if "branched" in dD else 0 + cObj.setValue(num, "branched_entity_instance_count", ii) + # + num = dD["water"] if "water" in dD else 0 + cObj.setValue(num, "solvent_entity_instance_count", ii) + # + dD = rD["assemblyInstanceCountByPolymerTypeD"][assemblyId] + num = dD["Protein"] if "Protein" in dD else 0 + cObj.setValue(num, "polymer_entity_instance_count_protein", ii) + num1 = dD["DNA"] if "DNA" in dD else 0 + cObj.setValue(num1, "polymer_entity_instance_count_DNA", ii) + num2 = dD["RNA"] if "RNA" in dD else 0 + cObj.setValue(num2, "polymer_entity_instance_count_RNA", ii) + cObj.setValue(num1 + num2, "polymer_entity_instance_count_nucleic_acid", ii) + num = dD["NA-hybrid"] if "NA-hybrid" in dD else 0 + 
cObj.setValue(num, "polymer_entity_instance_count_nucleic_acid_hybrid", ii) + # + dD = rD["assemblyEntityCountByPolymerTypeD"][assemblyId] + num = dD["Protein"] if "Protein" in dD else 0 + cObj.setValue(num, "polymer_entity_count_protein", ii) + num1 = dD["DNA"] if "DNA" in dD else 0 + cObj.setValue(num1, "polymer_entity_count_DNA", ii) + num2 = dD["RNA"] if "RNA" in dD else 0 + cObj.setValue(num2, "polymer_entity_count_RNA", ii) + cObj.setValue(num1 + num2, "polymer_entity_count_nucleic_acid", ii) + num = dD["NA-hybrid"] if "NA-hybrid" in dD else 0 + cObj.setValue(num, "polymer_entity_count_nucleic_acid_hybrid", ii) + # + dD = rD["assemblyEntityCountByTypeD"][assemblyId] + num = dD["polymer"] if "polymer" in dD else 0 + cObj.setValue(num, "polymer_entity_count", ii) + # + num = dD["non-polymer"] if "non-polymer" in dD else 0 + cObj.setValue(num, "nonpolymer_entity_count", ii) + # + num = dD["branched"] if "branched" in dD else 0 + cObj.setValue(num, "branched_entity_count", ii) + # + num = dD["water"] if "water" in dD else 0 + cObj.setValue(num, "solvent_entity_count", ii) + # + return + except Exception as e: + logger.exception("For %s failing with %s", catName, str(e)) + return False + + def buildContainerAssemblyIds(self, dataContainer, catName, **kwargs): + """Build category rcsb_assembly_container_identifiers. + + Args: + dataContainer (object): mmif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + For example, + + loop_ + _rcsb_assembly_container_identifiers.entry_id + _rcsb_assembly_container_identifiers.assembly_id + ... + + + """ + logger.debug("Starting catName %s kwargs %r", catName, kwargs) + try: + if not (dataContainer.exists("entry") and dataContainer.exists("pdbx_struct_assembly")): + return False + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + # + cObj = dataContainer.getObj(catName) + + tObj = dataContainer.getObj("entry") + entryId = tObj.getValue("id", 0) + cObj.setValue(entryId, "entry_id", 0) + # + tObj = dataContainer.getObj("pdbx_struct_assembly") + assemblyIdL = tObj.getAttributeValueList("id") + for ii, assemblyId in enumerate(assemblyIdL): + cObj.setValue(entryId, "entry_id", ii) + cObj.setValue(assemblyId, "assembly_id", ii) + cObj.setValue(entryId + "-" + assemblyId, "rcsb_id", ii) + + # + return True + except Exception as e: + logger.exception("For %s failing with %s", catName, str(e)) + return False + + def addDepositedAssembly(self, dataContainer, catName, **kwargs): + """Add the deposited coordinates as an additional separate assembly labeled as 'deposited' + to categories, pdbx_struct_assembly and pdb_struct_assembly_gen. 
+ + Args: + dataContainer (object): mmif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + """ + logger.debug("Starting catName %s kwargs %r", catName, kwargs) + try: + if not dataContainer.exists("struct_asym"): + return False + if not dataContainer.exists("pdbx_struct_assembly"): + dataContainer.append( + DataCategory( + "pdbx_struct_assembly", + attributeNameList=["id", "details", "method_details", "oligomeric_details", "oligomeric_count", "rcsb_details", "rcsb_candidate_assembly"], + ) + ) + if not dataContainer.exists("pdbx_struct_assembly_gen"): + dataContainer.append(DataCategory("pdbx_struct_assembly_gen", attributeNameList=["assembly_id", "oper_expression", "asym_id_list", "ordinal"])) + + if not dataContainer.exists("pdbx_struct_oper_list"): + row = [ + "1", + "identity operation", + "1_555", + "x, y, z", + "1.0000000000", + "0.0000000000", + "0.0000000000", + "0.0000000000", + "0.0000000000", + "1.0000000000", + "0.0000000000", + "0.0000000000", + "0.0000000000", + "0.0000000000", + "1.0000000000", + "0.0000000000", + ] + atList = [ + "id", + "type", + "name", + "symmetry_operation", + "matrix[1][1]", + "matrix[1][2]", + "matrix[1][3]", + "vector[1]", + "matrix[2][1]", + "matrix[2][2]", + "matrix[2][3]", + "vector[2]", + "matrix[3][1]", + "matrix[3][2]", + "matrix[3][3]", + "vector[3]", + ] + dataContainer.append(DataCategory("pdbx_struct_oper_list", attributeNameList=atList, rowList=[row])) + + # + logger.debug("Add deposited assembly for %s", dataContainer.getName()) + cObj = dataContainer.getObj("struct_asym") + asymIdL = cObj.getAttributeValueList("id") + logger.debug("AsymIdL %r", asymIdL) + # + # Ordinal is added by subsequent attribure-level method. + tObj = dataContainer.getObj("pdbx_struct_assembly_gen") + rowIdx = tObj.getRowCount() + tObj.setValue("deposited", "assembly_id", rowIdx) + tObj.setValue("1", "oper_expression", rowIdx) + tObj.setValue(",".join(asymIdL), "asym_id_list", rowIdx) + # + tObj = dataContainer.getObj("pdbx_struct_assembly") + rowIdx = tObj.getRowCount() + tObj.setValue("deposited", "id", rowIdx) + tObj.setValue("deposited_coordinates", "details", rowIdx) + # + for atName in ["oligomeric_details", "method_details", "oligomeric_count"]: + if tObj.hasAttribute(atName): + tObj.setValue("?", atName, rowIdx) + # + # + # + logger.debug("Full row is %r", tObj.getRow(rowIdx)) + # + return True + except Exception as e: + logger.exception("For %s failing with %s", catName, str(e)) + return False + + def filterAssemblyDetails(self, dataContainer, catName, **kwargs): + """Filter _pdbx_struct_assembly.details -> _pdbx_struct_assembly.rcsb_details + with a more limited vocabulary - + + + Args: + dataContainer (object): mmif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + For example, mapping to the following limited enumeration, + + 'author_and_software_defined_assembly' + 'author_defined_assembly' + 'software_defined_assembly' + + """ + logger.debug("Starting catName %s kwargs %r", catName, kwargs) + mD = { + "author_and_software_defined_assembly": "author_and_software_defined_assembly", + "author_defined_assembly": "author_defined_assembly", + "complete icosahedral assembly": "author_and_software_defined_assembly", + "complete point assembly": "author_and_software_defined_assembly", + "crystal asymmetric unit": "software_defined_assembly", + "crystal asymmetric unit, crystal frame": 
"software_defined_assembly", + "details": "software_defined_assembly", + "helical asymmetric unit": "software_defined_assembly", + "helical asymmetric unit, std helical frame": "software_defined_assembly", + "icosahedral 23 hexamer": "software_defined_assembly", + "icosahedral asymmetric unit": "software_defined_assembly", + "icosahedral asymmetric unit, std point frame": "software_defined_assembly", + "icosahedral pentamer": "software_defined_assembly", + "pentasymmetron capsid unit": "software_defined_assembly", + "point asymmetric unit": "software_defined_assembly", + "point asymmetric unit, std point frame": "software_defined_assembly", + "representative helical assembly": "author_and_software_defined_assembly", + "software_defined_assembly": "software_defined_assembly", + "trisymmetron capsid unit": "software_defined_assembly", + "deposited_coordinates": "software_defined_assembly", + } + # + try: + if not dataContainer.exists("pdbx_struct_assembly"): + return False + + logger.debug("Filter assembly details for %s", dataContainer.getName()) + tObj = dataContainer.getObj("pdbx_struct_assembly") + atName = "rcsb_details" + if not tObj.hasAttribute(atName): + tObj.appendAttribute(atName) + # + for iRow in range(tObj.getRowCount()): + details = tObj.getValue("details", iRow) + if details in mD: + tObj.setValue(mD[details], "rcsb_details", iRow) + else: + tObj.setValue("software_defined_assembly", "rcsb_details", iRow) + # logger.debug("Full row is %r", tObj.getRow(iRow)) + return True + except Exception as e: + logger.exception("For %s %s failing with %s", catName, atName, str(e)) + return False + + def assignAssemblyCandidates(self, dataContainer, catName, **kwargs): + """Flag candidate biological assemblies as 'author_defined_assembly' ad author_and_software_defined_assembly' + + Args: + dataContainer (object): mmif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + """ + logger.debug("Starting catName %s kwargs %r", catName, kwargs) + mD = { + "author_and_software_defined_assembly": "author_and_software_defined_assembly", + "author_defined_assembly": "author_defined_assembly", + "complete icosahedral assembly": "author_and_software_defined_assembly", + "complete point assembly": "author_and_software_defined_assembly", + "crystal asymmetric unit": "software_defined_assembly", + "crystal asymmetric unit, crystal frame": "software_defined_assembly", + "details": "software_defined_assembly", + "helical asymmetric unit": "software_defined_assembly", + "helical asymmetric unit, std helical frame": "software_defined_assembly", + "icosahedral 23 hexamer": "software_defined_assembly", + "icosahedral asymmetric unit": "software_defined_assembly", + "icosahedral asymmetric unit, std point frame": "software_defined_assembly", + "icosahedral pentamer": "software_defined_assembly", + "pentasymmetron capsid unit": "software_defined_assembly", + "point asymmetric unit": "software_defined_assembly", + "point asymmetric unit, std point frame": "software_defined_assembly", + "representative helical assembly": "author_and_software_defined_assembly", + "software_defined_assembly": "software_defined_assembly", + "trisymmetron capsid unit": "software_defined_assembly", + "deposited_coordinates": "software_defined_assembly", + } + # + eD = { + k: True + for k in [ + "crystal asymmetric unit", + "crystal asymmetric unit, crystal frame", + "helical asymmetric unit", + "helical asymmetric unit, std helical frame", + "icosahedral 23 
hexamer", + "icosahedral asymmetric unit", + "icosahedral asymmetric unit, std point frame", + "icosahedral pentamer", + "pentasymmetron capsid unit", + "point asymmetric unit", + "point asymmetric unit, std point frame", + "trisymmetron capsid unit", + "deposited_coordinates", + "details", + ] + } + try: + if not dataContainer.exists("pdbx_struct_assembly"): + return False + atName = "rcsb_candidate_assembly" + tObj = dataContainer.getObj("pdbx_struct_assembly") + if not tObj.hasAttribute(atName): + tObj.appendAttribute(atName) + # + for iRow in range(tObj.getRowCount()): + details = tObj.getValue("details", iRow) + if details in mD and details not in eD: + tObj.setValue("Y", "rcsb_candidate_assembly", iRow) + else: + tObj.setValue("N", "rcsb_candidate_assembly", iRow) + # logger.debug("Full row is %r", tObj.getRow(iRow)) + + # + return True + except Exception as e: + logger.exception("For %s %s failing with %s", catName, atName, str(e)) + return False + + def filterAssemblyCandidates(self, dataContainer, catName, **kwargs): + """Filter assemblies to only candidates and deposited cases + + + Args: + dataContainer (object): mmif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + + """ + logger.debug("Starting catName %s kwargs %r", catName, kwargs) + try: + if not dataContainer.exists("pdbx_struct_assembly"): + return False + + logger.debug("Filter candidate assemblyfor %s", dataContainer.getName()) + tObj = dataContainer.getObj("pdbx_struct_assembly") + # + indexList = [] + for iRow in range(tObj.getRowCount()): + isCandidate = tObj.getValue("rcsb_candidate_assembly", iRow) == "Y" + isDeposited = tObj.getValue("id", iRow) == "deposited" + + if not (isCandidate or isDeposited): + indexList.append(iRow) + tObj.removeRows(indexList) + # + # --- + numAssemblies = tObj.getRowCount() + logger.debug("Assembly count is %d", numAssemblies) + if dataContainer.exists("rcsb_entry_info"): + eiObj = dataContainer.getObj("rcsb_entry_info") + eiObj.setValue(numAssemblies, "assembly_count", 0) + # + return True + except Exception as e: + logger.exception("For %s failing with %s", catName, str(e)) + return False + + def __expandOperatorList(self, operExpression): + """ + Operation expressions may have the forms: + + (1) the single operation 1 + (1,2,5) the operations 1, 2, 5 + (1-4) the operations 1,2,3 and 4 + (1,2)(3,4) the combinations of operations + 3 and 4 followed by 1 and 2 (i.e. + the cartesian product of parenthetical + groups applied from right to left) + """ + + rL = [] + opCount = 1 + try: + if operExpression.find("(") < 0: + opL = [operExpression] + else: + opL = [tV.strip().strip("(").rstrip(")") for tV in re.findall(r"\(.*?\)", operExpression)] + # + for op in opL: + teL = [] + tL = op.split(",") + for tV in tL: + trngL = tV.split("-") + if len(trngL) == 2: + rngL = [str(r) for r in range(int(trngL[0]), int(trngL[1]) + 1)] + else: + rngL = trngL + teL.extend(rngL) + rL.append(teL) + opCount *= len(teL) + + except Exception as e: + logger.exception("Failing parsing %r with %s", operExpression, str(e)) + # + if not rL: + opCount = 0 + return opCount, rL + + def __getAssemblyComposition(self, dataContainer): + """Return assembly composition by entity and instance type counts. + + Example - + loop_ + _pdbx_struct_assembly.id + _pdbx_struct_assembly.details + _pdbx_struct_assembly.method_details + _pdbx_struct_assembly.oligomeric_details + _pdbx_struct_assembly.oligomeric_count + 1 'complete icosahedral assembly' ? 
180-meric 180 + 2 'icosahedral asymmetric unit' ? trimeric 3 + 3 'icosahedral pentamer' ? pentadecameric 15 + 4 'icosahedral 23 hexamer' ? octadecameric 18 + 5 'icosahedral asymmetric unit, std point frame' ? trimeric 3 + # + loop_ + _pdbx_struct_assembly_gen.assembly_id + _pdbx_struct_assembly_gen.oper_expression + _pdbx_struct_assembly_gen.asym_id_list + 1 '(1-60)' A,B,C + 2 1 A,B,C + 3 '(1-5)' A,B,C + 4 '(1,2,6,10,23,24)' A,B,C + 5 P A,B,C + # + """ + # + instanceTypeD = self.__commonU.getInstanceTypes(dataContainer) + instancePolymerTypeD = self.__commonU.getInstancePolymerTypes(dataContainer) + instEntityD = self.__commonU.getInstanceEntityMap(dataContainer) + # + epTypeD = self.__commonU.getEntityPolymerTypes(dataContainer) + eTypeD = self.__commonU.getEntityTypes(dataContainer) + epTypeFilteredD = self.__commonU.getPolymerEntityFilteredTypes(dataContainer) + # JDW + instHeavyAtomCount = self.__commonU.getInstanceHeavyAtomCounts(dataContainer, modelId="1") + instHydrogenAtomCount = self.__commonU.getInstanceHydrogenAtomCounts(dataContainer, modelId="1") + # + instModeledMonomerCount = self.__commonU.getInstanceModeledMonomerCounts(dataContainer, modelId="1") + instUnmodeledMonomerCount = self.__commonU.getInstanceUnModeledMonomerCounts(dataContainer, modelId="1") + # ------------------------- + assemblyInstanceCountByTypeD = {} + assemblyHeavyAtomCountByTypeD = {} + assemblyHeavyAtomCountD = {} + assemblyHydrogenAtomCountD = {} + assemblyModeledMonomerCountD = {} + assemblyUnmodeledMonomerCountD = {} + # Pre-generation (source instances) + assemblyInstanceD = {} + # Post-generation (gerated instances) + assemblyInstanceGenD = {} + assemblyInstanceCountByPolymerTypeD = {} + assemblyPolymerInstanceCountD = {} + assemblyPolymerClassD = {} + # + assemblyEntityCountByPolymerTypeD = {} + assemblyEntityCountByTypeD = {} + # -------------- + # + try: + if dataContainer.exists("pdbx_struct_assembly_gen"): + tObj = dataContainer.getObj("pdbx_struct_assembly_gen") + for ii in range(tObj.getRowCount()): + assemblyId = tObj.getValue("assembly_id", ii) + # Initialize instances count + if assemblyId not in assemblyInstanceCountByTypeD: + assemblyInstanceCountByTypeD[assemblyId] = {eType: 0 for eType in ["polymer", "non-polymer", "branched", "macrolide", "water"]} + if assemblyId not in assemblyHeavyAtomCountByTypeD: + assemblyHeavyAtomCountByTypeD[assemblyId] = {eType: 0 for eType in ["polymer", "non-polymer", "branched", "macrolide", "water"]} + if assemblyId not in assemblyModeledMonomerCountD: + assemblyModeledMonomerCountD[assemblyId] = 0 + if assemblyId not in assemblyUnmodeledMonomerCountD: + assemblyUnmodeledMonomerCountD[assemblyId] = 0 + if assemblyId not in assemblyHeavyAtomCountD: + assemblyHeavyAtomCountD[assemblyId] = 0 + if assemblyId not in assemblyHydrogenAtomCountD: + assemblyHydrogenAtomCountD[assemblyId] = 0 + # + opExpression = tObj.getValue("oper_expression", ii) + opCount, opL = self.__expandOperatorList(opExpression) + tS = tObj.getValue("asym_id_list", ii) + asymIdList = [t.strip() for t in tS.strip().split(",")] + assemblyInstanceD.setdefault(assemblyId, []).extend(asymIdList) + assemblyInstanceGenD.setdefault(assemblyId, []).extend(asymIdList * opCount) + # + logger.debug("%s assembly %r opExpression %r opCount %d opL %r", dataContainer.getName(), assemblyId, opExpression, opCount, opL) + logger.debug("%s assembly %r length asymIdList %r", dataContainer.getName(), assemblyId, len(asymIdList)) + # + for eType in ["polymer", "non-polymer", "branched", "macrolide", "water"]: 
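+                        # Scale each per-instance count by opCount: every symmetry
+                        # operation expanded from oper_expression replicates the
+                        # listed asym ids in the generated assembly.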
+ iList = [asymId for asymId in asymIdList if asymId in instanceTypeD and instanceTypeD[asymId] == eType] + assemblyInstanceCountByTypeD[assemblyId][eType] += len(iList) * opCount + # + atCountList = [ + instHeavyAtomCount[asymId] for asymId in asymIdList if asymId in instanceTypeD and instanceTypeD[asymId] == eType and asymId in instHeavyAtomCount + ] + assemblyHeavyAtomCountByTypeD[assemblyId][eType] += sum(atCountList) * opCount + assemblyHeavyAtomCountD[assemblyId] += sum(atCountList) * opCount + # + hAtCountList = [ + instHydrogenAtomCount[asymId] for asymId in asymIdList if asymId in instanceTypeD and instanceTypeD[asymId] == eType and asymId in instHydrogenAtomCount + ] + assemblyHydrogenAtomCountD[assemblyId] += sum(hAtCountList) * opCount + # + modeledMonomerCountList = [ + instModeledMonomerCount[asymId] + for asymId in asymIdList + if asymId in instanceTypeD and instanceTypeD[asymId] == "polymer" and asymId in instModeledMonomerCount + ] + assemblyModeledMonomerCountD[assemblyId] += sum(modeledMonomerCountList) * opCount + # + unmodeledMonomerCountList = [ + instUnmodeledMonomerCount[asymId] + for asymId in asymIdList + if asymId in instanceTypeD and instanceTypeD[asymId] == "polymer" and asymId in instUnmodeledMonomerCount + ] + assemblyUnmodeledMonomerCountD[assemblyId] += sum(unmodeledMonomerCountList) * opCount + + # + assemblyInstanceCountByPolymerTypeD = {} + assemblyPolymerInstanceCountD = {} + assemblyPolymerClassD = {} + # + assemblyEntityCountByPolymerTypeD = {} + assemblyEntityCountByTypeD = {} + # + # Using the generated list of instance assembly components ... + for assemblyId, asymIdList in assemblyInstanceGenD.items(): + # ------ + # Instance polymer composition + pInstTypeList = [instancePolymerTypeD[asymId] for asymId in asymIdList if asymId in instancePolymerTypeD] + pInstTypeD = Counter(pInstTypeList) + assemblyInstanceCountByPolymerTypeD[assemblyId] = {pType: 0 for pType in ["Protein", "DNA", "RNA", "NA-hybrid", "Other"]} + assemblyInstanceCountByPolymerTypeD[assemblyId] = {pType: pInstTypeD[pType] for pType in ["Protein", "DNA", "RNA", "NA-hybrid", "Other"] if pType in pInstTypeD} + assemblyPolymerInstanceCountD[assemblyId] = len(pInstTypeList) + # + logger.debug("%s assemblyId %r pInstTypeD %r", dataContainer.getName(), assemblyId, pInstTypeD.items()) + + # ------------- + # Entity and polymer entity composition + # + entityIdList = list(set([instEntityD[asymId] for asymId in asymIdList if asymId in instEntityD])) + pTypeL = [epTypeD[entityId] for entityId in entityIdList if entityId in epTypeD] + # + polymerCompClass, subsetCompClass, naCompClass, _ = self.__commonU.getPolymerComposition(pTypeL) + assemblyPolymerClassD[assemblyId] = {"polymerCompClass": polymerCompClass, "subsetCompClass": subsetCompClass, "naCompClass": naCompClass} + # + logger.debug( + "%s assemblyId %s polymerCompClass %r subsetCompClass %r naCompClass %r pTypeL %r", + dataContainer.getName(), + assemblyId, + polymerCompClass, + subsetCompClass, + naCompClass, + pTypeL, + ) + pTypeFilteredL = [epTypeFilteredD[entityId] for entityId in entityIdList if entityId in epTypeFilteredD] + # + pEntityTypeD = Counter(pTypeFilteredL) + assemblyEntityCountByPolymerTypeD[assemblyId] = {pType: 0 for pType in ["Protein", "DNA", "RNA", "NA-hybrid", "Other"]} + assemblyEntityCountByPolymerTypeD[assemblyId] = { + pType: pEntityTypeD[pType] for pType in ["Protein", "DNA", "RNA", "NA-hybrid", "Other"] if pType in pEntityTypeD + } + # + eTypeL = [eTypeD[entityId] for entityId in entityIdList if entityId in 
eTypeD] + entityTypeD = Counter(eTypeL) + assemblyEntityCountByTypeD[assemblyId] = {eType: 0 for eType in ["polymer", "non-polymer", "branched", "macrolide", "water"]} + assemblyEntityCountByTypeD[assemblyId].update( + {eType: entityTypeD[eType] for eType in ["polymer", "non-polymer", "branched", "macrolide", "water"] if eType in entityTypeD} + ) + # + # --------------- + # + # + logger.debug("%s assemblyInstanceCountByTypeD %r", dataContainer.getName(), assemblyInstanceCountByTypeD.items()) + logger.debug("%s assemblyHeavyAtomCountByTypeD %r", dataContainer.getName(), assemblyHeavyAtomCountByTypeD.items()) + logger.debug("%s assemblyHeavyAtomCountD %r", dataContainer.getName(), assemblyHeavyAtomCountD.items()) + logger.debug("%s assemblyHydrogenAtomCountD %r", dataContainer.getName(), assemblyHydrogenAtomCountD.items()) + logger.debug("%s assemblyModeledMonomerCountD %r", dataContainer.getName(), assemblyModeledMonomerCountD.items()) + logger.debug("%s assemblyUnmodeledMonomerCountD %r", dataContainer.getName(), assemblyUnmodeledMonomerCountD.items()) + logger.debug("%s assemblyPolymerClassD %r", dataContainer.getName(), assemblyPolymerClassD.items()) + logger.debug("%s assemblyPolymerInstanceCountD %r", dataContainer.getName(), assemblyPolymerInstanceCountD.items()) + logger.debug("%s assemblyInstanceCountByPolymerTypeD %r", dataContainer.getName(), assemblyInstanceCountByPolymerTypeD.items()) + logger.debug("%s assemblyEntityCountByPolymerTypeD %r", dataContainer.getName(), assemblyEntityCountByPolymerTypeD.items()) + logger.debug("%s assemblyEntityCountByTypeD %r", dataContainer.getName(), assemblyEntityCountByTypeD.items()) + # + rD = { + "assemblyInstanceCountByTypeD": assemblyInstanceCountByTypeD, + "assemblyHeavyAtomCountByTypeD": assemblyHeavyAtomCountByTypeD, + "assemblyHeavyAtomCountD": assemblyHeavyAtomCountD, + "assemblyHydrogenAtomCountD": assemblyHydrogenAtomCountD, + "assemblyModeledMonomerCountD": assemblyModeledMonomerCountD, + "assemblyUnmodeledMonomerCountD": assemblyUnmodeledMonomerCountD, + "assemblyInstanceCountByPolymerTypeD": assemblyInstanceCountByPolymerTypeD, + "assemblyPolymerInstanceCountD": assemblyPolymerInstanceCountD, + "assemblyPolymerClassD": assemblyPolymerClassD, + "assemblyEntityCountByPolymerTypeD": assemblyEntityCountByPolymerTypeD, + "assemblyEntityCountByTypeD": assemblyEntityCountByTypeD, + } + except Exception as e: + logger.exception("Failing %s with %s", dataContainer.getName(), str(e)) + return rD diff --git a/rcsb/utils/dictionary/DictMethodChemRefHelper.py b/rcsb/utils/dictionary/DictMethodChemRefHelper.py new file mode 100644 index 0000000..3d7792d --- /dev/null +++ b/rcsb/utils/dictionary/DictMethodChemRefHelper.py @@ -0,0 +1,937 @@ +## +# File: DictMethodChemRefHelper.py +# Author: J. Westbrook +# Date: 16-Jul-2019 +# Version: 0.001 Initial version +# +## +""" +Helper class implements external method references supporting chemical +reference data definitions in the RCSB dictionary extension. +""" +__docformat__ = "restructuredtext en" +__author__ = "John Westbrook" +__email__ = "jwest@rcsb.rutgers.edu" +__license__ = "Apache 2.0" + +import logging + +# from collections import Counter, OrderedDict + +from mmcif.api.DataCategory import DataCategory + +logger = logging.getLogger(__name__) + + +class DictMethodChemRefHelper(object): + """Helper class implements external method references supporting chemical + reference data definitions in the RCSB dictionary extension.
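+ + These methods are normally dispatched by name through DictMethodRunner rather than called directly. A minimal sketch of direct use, assuming a configured DictMethodResourceProvider instance rP and a chemical component dataContainer, might look like: + + helper = DictMethodChemRefHelper(resourceProvider=rP) + ok = helper.addChemCompInfo(dataContainer, "rcsb_chem_comp_info")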
+ """ + + def __init__(self, **kwargs): + """ + Args: + resourceProvider: (obj) instance of DictMethodResourceProvider() + + """ + # + self._raiseExceptions = kwargs.get("raiseExceptions", False) + # + rP = kwargs.get("resourceProvider") + self.__dApi = rP.getResource("Dictionary API instance (pdbx_core)") if rP else None + logger.debug("Dictionary method helper init") + + def echo(self, msg): + logger.info(msg) + + def addChemCompRelated(self, dataContainer, catName, **kwargs): + """Add category rcsb_chem_comp_related. + + Args: + dataContainer (object): mmif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + For example, + + loop_ + _rcsb_chem_comp_related.comp_id + _rcsb_chem_comp_related.ordinal + _rcsb_chem_comp_related.resource_name + _rcsb_chem_comp_related.resource_accession_code + _rcsb_chem_comp_related.related_mapping_method + ATP 1 DrugBank DB00171 'assigned by resource' + """ + try: + logger.debug("Starting with %r %r", dataContainer.getName(), catName) + if not (dataContainer.exists("chem_comp_atom") and dataContainer.exists("chem_comp_bond")): + return False + rP = kwargs.get("resourceProvider") + # ------- new + ccId = self.__getChemCompId(dataContainer) + dbId, atcIdL, mappingType, dbVersion = self.__getDrugBankMapping(dataContainer, rP) + logger.debug("Using DrugBank version %r", dbVersion) + # ------------ ----------------------- ----------------------- ----------------------- ----------- + if dbId: + # + if dataContainer.exists("rcsb_chem_comp_container_identifiers"): + tObj = dataContainer.getObj("rcsb_chem_comp_container_identifiers") + if not tObj.hasAttribute("drugbank_id"): + tObj.appendAttribute("drugbank_id") + tObj.setValue(dbId, "drugbank_id", 0) + if atcIdL: + if not tObj.hasAttribute("atc_codes"): + tObj.appendAttribute("atc_codes") + tObj.setValue(",".join(atcIdL), "atc_codes", 0) + # + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + wObj = dataContainer.getObj(catName) + rL = wObj.selectIndices("DrugBank", "resource_name") + ok = False + if rL: + ok = wObj.removeRows(rL) + if not ok: + logger.debug("Error removing rows in %r %r", catName, rL) + # --- + iRow = wObj.getRowCount() + wObj.setValue(ccId, "comp_id", iRow) + wObj.setValue(iRow + 1, "ordinal", iRow) + wObj.setValue("DrugBank", "resource_name", iRow) + wObj.setValue(dbId, "resource_accession_code", iRow) + wObj.setValue(mappingType, "related_mapping_method", iRow) + # + # ------------ ----------------------- ----------------------- ----------------------- ----------- + ccmProvider = rP.getResource("ChemCompModelProvider instance") if rP else None + csdMapD = ccmProvider.getMapping() + # + if csdMapD and ccId in csdMapD: + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + wObj = dataContainer.getObj(catName) + logger.debug("Using CSD model mapping length %d", len(csdMapD)) + dbId = csdMapD[ccId][0]["db_code"] + rL = wObj.selectIndices("CCDC/CSD", "resource_name") + if rL: + ok = wObj.removeRows(rL) + if not ok: + logger.debug("Error removing rows in %r %r", catName, rL) + iRow = wObj.getRowCount() + wObj.setValue(ccId, "comp_id", iRow) + wObj.setValue(iRow + 1, "ordinal", iRow) + wObj.setValue("CCDC/CSD", "resource_name", iRow) + wObj.setValue(dbId, "resource_accession_code", iRow) + wObj.setValue("assigned by PDB", 
"related_mapping_method", iRow) + # + residProvider = rP.getResource("ResidProvider instance") if rP else None + residMapD = residProvider.getMapping() + # + if residMapD and ccId in residMapD: + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + wObj = dataContainer.getObj(catName) + rL = wObj.selectIndices("RESID", "resource_name") + if rL: + ok = wObj.removeRows(rL) + if not ok: + logger.debug("Error removing rows in %r %r", catName, rL) + logger.debug("Using RESID model mapping length %d", len(residMapD)) + for rD in residMapD[ccId]: + dbId = rD["residCode"] + iRow = wObj.getRowCount() + wObj.setValue(ccId, "comp_id", iRow) + wObj.setValue(iRow + 1, "ordinal", iRow) + wObj.setValue("RESID", "resource_name", iRow) + wObj.setValue(dbId, "resource_accession_code", iRow) + wObj.setValue("matching by RESID resource", "related_mapping_method", iRow) + # + pubchemProvider = rP.getResource("PubChemProvider instance") if rP else None + pubchemMapD = pubchemProvider.getIdentifiers() + if pubchemMapD and ccId in pubchemMapD: + pharosProvider = rP.getResource("PharosProvider instance") if rP else None + pharosChemblD = pharosProvider.getIdentifiers() + + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + wObj = dataContainer.getObj(catName) + for rName in ["ChEBI", "ChEMBL", "CAS", "PubChem"]: + rL = wObj.selectIndices(rName, "resource_name") + if rL: + ok = wObj.removeRows(rL) + if not ok: + logger.debug("Error removing rows in %r %r", catName, rL) + # + logger.debug("Using PubChem mapping length %d", len(pubchemMapD)) + xD = {} + for rD in pubchemMapD[ccId]: + for tName, tObj in rD.items(): + if tName == "pcId": + xD.setdefault("PubChem", set()).add(tObj) + elif tName in ["CAS", "ChEBI"]: + for tId in tObj: + xD.setdefault(tName, set()).add(tId) + elif tName in ["ChEMBL"]: + for tId in tObj: + xD.setdefault(tName, set()).add(tId) + if pharosChemblD and tId in pharosChemblD: + logger.debug("Mapping ccId %r to Pharos %r", ccId, tId) + xD.setdefault("Pharos", set()).add(tId) + + # + for rName, rIdS in xD.items(): + if rName in ["PubChem", "Pharos"]: + aMethod = "matching InChIKey in PubChem" + elif rName in ["CAS", "ChEMBL", "ChEBI"]: + aMethod = "assigned by PubChem resource" + elif rName in ["Pharos"]: + aMethod = "matching ChEMBL ID in Pharos" + for rId in rIdS: + iRow = wObj.getRowCount() + wObj.setValue(ccId, "comp_id", iRow) + wObj.setValue(iRow + 1, "ordinal", iRow) + wObj.setValue(rName, "resource_name", iRow) + wObj.setValue(rId, "resource_accession_code", iRow) + wObj.setValue(aMethod, "related_mapping_method", iRow) + + return True + except Exception as e: + logger.exception("For %s failing with %s", catName, str(e)) + return False + + def __getChemCompId(self, dataContainer): + if not dataContainer.exists("chem_comp"): + return None + ccObj = dataContainer.getObj("chem_comp") + if not ccObj.hasAttribute("pdbx_release_status"): + return None + return ccObj.getValueOrDefault("id", 0, None) + + def __getDrugBankMapping(self, dataContainer, resourceProvider): + """Return the DrugBank mapping for the chemical definition in the input dataContainer. 
+ + Args: + dataContainer (obj): instance of a DataContainer() object + resourceProvider (obj): instance of a ResourceProvider() object + + Returns: + (str, list, str, str): DrugBank accession code, list of ATC assignments, mapping type, and DrugBank version + """ + try: + dbId = None + atcL = [] + mappingType = None + + dbProvider = resourceProvider.getResource("DrugBankProvider instance") if resourceProvider else None + dbD = dbProvider.getMapping() + dbVersion = dbProvider.getVersion() + if dbD: + ccId = self.__getChemCompId(dataContainer) + # + dbMapD = dbD["id_map"] + inKeyD = dbD["inchikey_map"] + atcD = dbD["db_atc_map"] + logger.debug("DrugBank correspondence length is %d", len(dbMapD)) + logger.debug("atcD length is %d", len(atcD)) + logger.debug("inKeyD length is %d", len(inKeyD)) + # + if dataContainer.exists("rcsb_chem_comp_descriptor"): + ccIObj = dataContainer.getObj("rcsb_chem_comp_descriptor") + + if ccIObj.hasAttribute("InChIKey"): + inky = ccIObj.getValue("InChIKey", 0) + logger.debug("inKeyD length is %d testing %r", len(inKeyD), inky) + if inky in inKeyD: + logger.debug("Matching inchikey for %s", ccId) + dbId = inKeyD[inky][0]["drugbank_id"] + mappingType = "matching InChIKey in DrugBank" + # + + if not dbId and dbMapD and ccId in dbMapD: + dbId = dbMapD[ccId]["drugbank_id"] + mappingType = "assigned by DrugBank resource" + logger.debug("Matching db assignment for %s", ccId) + if atcD and dbId in atcD: + atcL = atcD[dbId] + + except Exception as e: + logger.exception("Failing with %s", str(e)) + + return dbId, atcL, mappingType, dbVersion + + def addChemCompAnnotation(self, dataContainer, catName, **kwargs): + """Generate the rcsb_chem_comp_annotation category - + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + loop_ + _rcsb_chem_comp_annotation.ordinal + _rcsb_chem_comp_annotation.entry_id + _rcsb_chem_comp_annotation.entity_id + # + _rcsb_chem_comp_annotation.annotation_id + _rcsb_chem_comp_annotation.type + _rcsb_chem_comp_annotation.name + _rcsb_chem_comp_annotation.description + # + _rcsb_chem_comp_annotation.annotation_lineage_id + _rcsb_chem_comp_annotation.annotation_lineage_name + _rcsb_chem_comp_annotation.annotation_lineage_depth + # + _rcsb_chem_comp_annotation.provenance_source + _rcsb_chem_comp_annotation.assignment_version + # ... + + loop_ + _pdbx_chem_comp_feature.comp_id + _pdbx_chem_comp_feature.type + _pdbx_chem_comp_feature.value + _pdbx_chem_comp_feature.source + _pdbx_chem_comp_feature.support + NAG 'CARBOHYDRATE ISOMER' D PDB ? + NAG 'CARBOHYDRATE RING' pyranose PDB ? + NAG 'CARBOHYDRATE ANOMER' beta PDB ?
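+ + Note that only PDB-provenance carbohydrate feature types are carried over from pdbx_chem_comp_feature; ATC (via DrugBank), RESID, and PSI-MOD annotations are appended when the corresponding resource providers are available.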
+ """ + try: + if not (dataContainer.exists("chem_comp_atom") and dataContainer.exists("chem_comp_bond")): + return False + # + logger.debug("Starting with %r %r", dataContainer.getName(), catName) + rP = kwargs.get("resourceProvider") + ccId = self.__getChemCompId(dataContainer) + # ---- + if dataContainer.exists("pdbx_chem_comp_feature"): + fObj = dataContainer.getObj("pdbx_chem_comp_feature") + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + wObj = dataContainer.getObj(catName) + # + modDate = None + if dataContainer.exists("chem_comp"): + cObj = dataContainer.getObj("chem_comp") + if cObj.hasAttribute("pdbx_modified_date"): + modDate = cObj.getValue("pdbx_modified_date", 0) + else: + logger.info("%r missing modified_date", ccId) + # + fD = {} + for ii in range(fObj.getRowCount()): + pSource = fObj.getValue("source", ii) + pCode = "PDB Reference Data" if pSource.upper() == "PDB" else None + if not pCode: + continue + fType = fObj.getValue("type", ii) + if fType.upper() not in ["CARBOHYDRATE ISOMER", "CARBOHYDRATE RING", "CARBOHYDRATE ANOMER", "CARBOHYDRATE PRIMARY CARBONYL GROUP"]: + continue + fType = fType.title() + fValue = fObj.getValue("value", ii) + if (fType, fValue, pCode) in fD: + continue + fD[(fType, fValue, pCode)] = True + # + iRow = wObj.getRowCount() + wObj.setValue(ccId, "comp_id", iRow) + wObj.setValue(iRow + 1, "ordinal", iRow) + wObj.setValue(fType, "type", iRow) + wObj.setValue("%s_%d" % (ccId, ii + 1), "annotation_id", iRow) + wObj.setValue(fValue, "name", iRow) + wObj.setValue(pCode, "provenance_source", iRow) + av = modDate if modDate else "1.0" + wObj.setValue(av, "assignment_version", iRow) + # + # ---- + + dbId, atcIdL, mappingType, dbVersion = self.__getDrugBankMapping(dataContainer, rP) + atcP = rP.getResource("AtcProvider instance") if rP else None + if atcIdL and atcP: + # + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + # ----- + wObj = dataContainer.getObj(catName) + # + for atcId in atcIdL: + iRow = wObj.getRowCount() + wObj.setValue(ccId, "comp_id", iRow) + wObj.setValue(iRow + 1, "ordinal", iRow) + wObj.setValue("ATC", "type", iRow) + wObj.setValue(atcId, "annotation_id", iRow) + wObj.setValue(atcP.getAtcName(atcId), "name", iRow) + # + wObj.setValue("ATC " + mappingType, "description", iRow) + # --- + wObj.setValue(";".join(atcP.getNameLineage(atcId)), "annotation_lineage_name", iRow) + idLinL = atcP.getIdLineage(atcId) + wObj.setValue(";".join(idLinL), "annotation_lineage_id", iRow) + wObj.setValue(";".join([str(jj) for jj in range(0, len(idLinL) + 1)]), "annotation_lineage_depth", iRow) + # + wObj.setValue("DrugBank", "provenance_source", iRow) + wObj.setValue(dbVersion, "assignment_version", iRow) + logger.debug("dbId %r atcId %r lineage %r", dbId, atcId, idLinL) + # ----- + rsProvider = rP.getResource("ResidProvider instance") if rP else None + residD = rsProvider.getMapping() + # + if residD and (ccId in residD): + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + wObj = dataContainer.getObj(catName) + # ----- + residVersion = rsProvider.getVersion() + jj = 1 + for rD in residD[ccId]: + if "modRes" not in rD: + continue + for modRes in rD["modRes"]: + iRow = wObj.getRowCount() + wObj.setValue(ccId, "comp_id", iRow) + wObj.setValue(iRow + 1, "ordinal", 
iRow) + wObj.setValue("Modification Type", "type", iRow) + wObj.setValue("modres_%d" % jj, "annotation_id", iRow) + wObj.setValue(modRes, "name", iRow) + wObj.setValue("RESID", "provenance_source", iRow) + wObj.setValue(residVersion, "assignment_version", iRow) + jj += 1 + # + jj = 1 + for rD in residD[ccId]: + if "genEnzymes" not in rD: + continue + for genEnzyme in rD["genEnzymes"]: + iRow = wObj.getRowCount() + wObj.setValue(ccId, "comp_id", iRow) + wObj.setValue(iRow + 1, "ordinal", iRow) + wObj.setValue("Generating Enzyme", "type", iRow) + wObj.setValue("enzyme_%d" % jj, "annotation_id", iRow) + wObj.setValue(genEnzyme, "name", iRow) + wObj.setValue("RESID", "provenance_source", iRow) + wObj.setValue(residVersion, "assignment_version", iRow) + jj += 1 + # + psimodP = rP.getResource("PsiModProvider instance") if rP else None + if psimodP: + jj = 1 + for rD in residD[ccId]: + if "ontRefs" not in rD: + continue + for ontId in rD["ontRefs"]: + if ontId[:3] != "MOD": + continue + iRow = wObj.getRowCount() + wObj.setValue(ccId, "comp_id", iRow) + wObj.setValue(iRow + 1, "ordinal", iRow) + wObj.setValue("PSI-MOD", "type", iRow) + wObj.setValue(ontId, "annotation_id", iRow) + wObj.setValue(psimodP.getName(ontId), "name", iRow) + wObj.setValue("RESID", "provenance_source", iRow) + wObj.setValue(residVersion, "assignment_version", iRow) + # + linL = psimodP.getLineage(ontId) + wObj.setValue(";".join([tup[0] for tup in linL]), "annotation_lineage_id", iRow) + wObj.setValue(";".join([tup[1] for tup in linL]), "annotation_lineage_name", iRow) + wObj.setValue(";".join([str(tup[2]) for tup in linL]), "annotation_lineage_depth", iRow) + # + return True + except Exception as e: + logger.exception("For %s failing with %s", catName, str(e)) + return False + + def addChemCompTargets(self, dataContainer, catName, **kwargs): + """Add category rcsb_chem_comp_target using DrugBank annotations. 
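+ Rows are generated from the DrugBank target_interactions mapping for the subject component, as illustrated in the example below.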
+ + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + Example: + loop_ + _rcsb_chem_comp_target.comp_id + _rcsb_chem_comp_target.ordinal + _rcsb_chem_comp_target.name + _rcsb_chem_comp_target.interaction_type + _rcsb_chem_comp_target.target_actions + _rcsb_chem_comp_target.organism_common_name + _rcsb_chem_comp_target.reference_database_name + _rcsb_chem_comp_target.reference_database_accession_code + _rcsb_chem_comp_target.provenance_code + ATP 1 "O-phosphoseryl-tRNA(Sec) selenium transferase" target cofactor Human UniProt Q9HD40 DrugBank + + DrugBank target info: + { + "type": "target", + "name": "Alanine--glyoxylate aminotransferase 2, mitochondrial", + "organism": "Human", + "actions": [ + "cofactor" + ], + "known_action": "unknown", + "uniprot_ids": "Q9BYV1" + }, + + """ + try: + logger.debug("Starting with %r %r", dataContainer.getName(), catName) + # Exit if source categories are missing + if not (dataContainer.exists("chem_comp_atom") and dataContainer.exists("chem_comp_bond")): + return False + + # + rP = kwargs.get("resourceProvider") + dbProvider = rP.getResource("DrugBankProvider instance") if rP else None + dbD = dbProvider.getMapping() + if not dbD: + return False + + dbMapD = dbD["id_map"] if "id_map" in dbD else None + # + ccId = dataContainer.getName() + if dbMapD and ccId in dbMapD and "target_interactions" in dbMapD[ccId]: + # + # Create the new target category + if not dataContainer.exists(catName): + dataContainer.append( + DataCategory( + catName, + attributeNameList=[ + "comp_id", + "ordinal", + "name", + "interaction_type", + "target_actions", + "organism_common_name", + "reference_database_name", + "reference_database_accession_code", + "provenance_code", + ], + ) + ) + wObj = dataContainer.getObj(catName) + logger.debug("Using DrugBank mapping length %d", len(dbMapD)) + rL = wObj.selectIndices("DrugBank", "provenance_code") + if rL: + ok = wObj.removeRows(rL) + if not ok: + logger.debug("Error removing rows in %r %r", catName, rL) + # + iRow = wObj.getRowCount() + for tD in dbMapD[ccId]["target_interactions"]: + wObj.setValue(ccId, "comp_id", iRow) + wObj.setValue(iRow + 1, "ordinal", iRow) + wObj.setValue(tD["name"], "name", iRow) + wObj.setValue(tD["type"], "interaction_type", iRow) + if "actions" in tD and tD["actions"]: + wObj.setValue(";".join(tD["actions"]), "target_actions", iRow) + if "organism" in tD: + wObj.setValue(tD["organism"], "organism_common_name", iRow) + if "uniprot_ids" in tD: + wObj.setValue("UniProt", "reference_database_name", iRow) + wObj.setValue(tD["uniprot_ids"], "reference_database_accession_code", iRow) + wObj.setValue("DrugBank", "provenance_code", iRow) + iRow += 1 + + # + return True + except Exception as e: + logger.exception("For %s failing with %s", catName, str(e)) + return False + + def __getAuditDates(self, dataContainer, catName): + createDate = None + releaseDate = None + reviseDate = None + try: + if dataContainer.exists(catName): + cObj = dataContainer.getObj(catName) + for iRow in range(cObj.getRowCount()): + aType = cObj.getValueOrDefault("action_type", iRow, defaultValue=None) + dateVal = cObj.getValueOrDefault("date", iRow, defaultValue=None) + if aType in ["Create component"]: + createDate = dateVal + elif aType in ["Initial release"]: + releaseDate = dateVal + reviseDate = cObj.getValueOrDefault("date", cObj.getRowCount() - 1, defaultValue=None) + except Exception as e: +
logger.exception("Faling with %s", str(e)) + return createDate, releaseDate, reviseDate + + def addChemCompInfo(self, dataContainer, catName, **kwargs): + """Add category rcsb_chem_comp_info and rcsb_chem_comp_container_identifiers. + + Args: + dataContainer (object): mmif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + For example, + _rcsb_chem_comp_info.comp_id BNZ + _rcsb_chem_comp_info.atom_count 12 + _rcsb_chem_comp_info.atom_count_chiral 0 + _rcsb_chem_comp_info.bond_count 12 + _rcsb_chem_comp_info.bond_count_aromatic 6 + _rcsb_chem_comp_info.atom_count_heavy 6 + """ + try: + logger.debug("Starting with %r %r %r", dataContainer.getName(), catName, kwargs) + # Exit if source categories are missing + if not dataContainer.exists("chem_comp"): + return False + ccObj = dataContainer.getObj("chem_comp") + if not ccObj.hasAttribute("pdbx_release_status"): + return False + ccId = ccObj.getValue("id", 0) + ccReleaseStatus = ccObj.getValue("pdbx_release_status", 0) + subComponentIds = ccObj.getValueOrDefault("pdbx_subcomponent_list", 0, defaultValue=None) + # + # + prdId = prdReleaseStatus = representAs = None + if dataContainer.exists("pdbx_reference_molecule"): + prdObj = dataContainer.getObj("pdbx_reference_molecule") + prdId = prdObj.getValueOrDefault("prd_id", 0, defaultValue=None) + prdReleaseStatus = prdObj.getValueOrDefault("release_status", 0, defaultValue=None) + representAs = prdObj.getValueOrDefault("represent_as", 0, defaultValue=None) + # + # ------- add the canonical identifiers -------- + cN = "rcsb_chem_comp_container_identifiers" + if not dataContainer.exists(cN): + dataContainer.append(DataCategory(cN, attributeNameList=self.__dApi.getAttributeNameList(cN))) + idObj = dataContainer.getObj(cN) + idObj.setValue(ccId, "comp_id", 0) + if prdId: + idObj.setValue(prdId, "prd_id", 0) + idObj.setValue(ccId, "rcsb_id", 0) + if subComponentIds: + tL = [tV.strip() for tV in subComponentIds.split()] + idObj.setValue(",".join(tL), "subcomponent_ids", 0) + # + # Get audit info - + if representAs and representAs.lower() in ["polymer"]: + _, releaseDate, revisionDate = self.__getAuditDates(dataContainer, "pdbx_prd_audit") + else: + _, releaseDate, revisionDate = self.__getAuditDates(dataContainer, "pdbx_chem_comp_audit") + # + # --------- --------- --------- --------- + # Create the new target category + # + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + # + # ------- + wObj = dataContainer.getObj(catName) + # + numAtoms = 0 + numAtomsHeavy = 0 + numAtomsChiral = 0 + try: + cObj = dataContainer.getObj("chem_comp_atom") + numAtoms = cObj.getRowCount() + numAtomsHeavy = 0 + numAtomsChiral = 0 + for ii in range(numAtoms): + el = cObj.getValue("type_symbol", ii) + if el != "H": + numAtomsHeavy += 1 + chFlag = cObj.getValue("pdbx_stereo_config", ii) + if chFlag != "N": + numAtomsChiral += 1 + except Exception: + logger.warning("Missing chem_comp_atom category for %s", ccId) + numAtoms = 0 + numAtomsHeavy = 0 + numAtomsChiral = 0 + # + wObj.setValue(ccId, "comp_id", 0) + if prdReleaseStatus: + wObj.setValue(prdReleaseStatus, "release_status", 0) + else: + wObj.setValue(ccReleaseStatus, "release_status", 0) + # + wObj.setValue(releaseDate, "initial_release_date", 0) + wObj.setValue(revisionDate, "revision_date", 0) + # + wObj.setValue(numAtoms, "atom_count", 0) + wObj.setValue(numAtomsChiral, "atom_count_chiral", 0) + 
wObj.setValue(numAtomsHeavy, "atom_count_heavy", 0) + # + # ------ + numBonds = 0 + numBondsAro = 0 + try: + cObj = dataContainer.getObj("chem_comp_bond") + numBonds = cObj.getRowCount() + numBondsAro = 0 + for ii in range(numBonds): + aroFlag = cObj.getValue("pdbx_aromatic_flag", ii) + if aroFlag != "N": + numBondsAro += 1 + except Exception: + pass + # + wObj.setValue(numBonds, "bond_count", 0) + wObj.setValue(numBondsAro, "bond_count_aromatic", 0) + # + return True + except Exception as e: + logger.exception("For %s failing with %s", catName, str(e)) + return False + + def addChemCompDescriptor(self, dataContainer, catName, **kwargs): + """Add category rcsb_chem_comp_descriptor. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + For example, parse the pdbx_chem_comp_descriptor category and extract SMILES/CACTVS and InChI descriptors - + + loop_ + _pdbx_chem_comp_descriptor.comp_id + _pdbx_chem_comp_descriptor.type + _pdbx_chem_comp_descriptor.program + _pdbx_chem_comp_descriptor.program_version + _pdbx_chem_comp_descriptor.descriptor + ATP SMILES ACDLabs 10.04 "O=P(O)(O)OP(=O)(O)OP(=O)(O)OCC3OC(n2cnc1c(ncnc12)N)C(O)C3O" + ATP SMILES_CANONICAL CACTVS 3.341 "Nc1ncnc2n(cnc12)[C@@H]3O[C@H](CO[P@](O)(=O)O[P@@](O)(=O)O[P](O)(O)=O)[C@@H](O)[C@H]3O" + ATP SMILES CACTVS 3.341 "Nc1ncnc2n(cnc12)[CH]3O[CH](CO[P](O)(=O)O[P](O)(=O)O[P](O)(O)=O)[CH](O)[CH]3O" + ATP SMILES_CANONICAL "OpenEye OEToolkits" 1.5.0 "c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3)CO[P@@](=O)(O)O[P@](=O)(O)OP(=O)(O)O)O)O)N" + ATP SMILES "OpenEye OEToolkits" 1.5.0 "c1nc(c2c(n1)n(cn2)C3C(C(C(O3)COP(=O)(O)OP(=O)(O)OP(=O)(O)O)O)O)N" + ATP InChI InChI 1.03 "InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(3- ...." + ATP InChIKey InChI 1.03 ZKHQWZAMYRWXGA-KQYNXXCUSA-N + + To produce - + _rcsb_chem_comp_descriptor.comp_id ATP + _rcsb_chem_comp_descriptor.SMILES 'Nc1ncnc2n(cnc12)[CH]3O[CH](CO[P](O)(=O)O[P](O)(=O)O[P](O)(O)=O)[CH](O)[CH]3O' + _rcsb_chem_comp_descriptor.SMILES_stereo 'Nc1ncnc2n(cnc12)[C@@H]3O[C@H](CO[P@](O)(=O)O[P@@](O)(=O)O[P](O)(O)=O)[C@@H](O)[C@H]3O' + _rcsb_chem_comp_descriptor.InChI 'InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(17)6(16)4(26-10)1-25 ...'
+ _rcsb_chem_comp_descriptor.InChIKey 'ZKHQWZAMYRWXGA-KQYNXXCUSA-N' + """ + try: + logger.debug("Starting with %r %r %r", dataContainer.getName(), catName, kwargs) + # Exit if source categories are missing + if not (dataContainer.exists("chem_comp") and dataContainer.exists("pdbx_chem_comp_descriptor")): + return False + # + # Create the new target category + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=["comp_id", "SMILES", "SMILES_stereo", "InChI", "InChIKey"])) + # + wObj = dataContainer.getObj(catName) + ccIObj = dataContainer.getObj("pdbx_chem_comp_descriptor") + iRow = 0 + ccId = "" + for ii in range(ccIObj.getRowCount()): + ccId = ccIObj.getValue("comp_id", ii) + nm = ccIObj.getValue("descriptor", ii) + prog = ccIObj.getValue("program", ii) + typ = ccIObj.getValue("type", ii) + # + if typ == "SMILES_CANONICAL" and prog.upper().startswith("OPENEYE"): + wObj.setValue(nm, "SMILES_stereo", iRow) + elif typ == "SMILES" and prog.upper().startswith("OPENEYE"): + wObj.setValue(nm, "SMILES", iRow) + elif typ == "InChI" and prog == "InChI": + wObj.setValue(nm, "InChI", iRow) + elif typ == "InChIKey" and prog == "InChI": + wObj.setValue(nm, "InChIKey", iRow) + # + wObj.setValue(ccId, "comp_id", iRow) + # + return True + except Exception as e: + logger.exception("For %s failing with %s", catName, str(e)) + return False + + def renameCitationCategory(self, dataContainer, catName, **kwargs): + """Rename citation and citation author categories. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + """ + try: + _ = kwargs + logger.debug("Starting with %r %r", dataContainer.getName(), catName) + if not (dataContainer.exists("chem_comp") and dataContainer.exists("pdbx_chem_comp_identifier")): + return False + # + # Rename target categories + if dataContainer.exists("citation"): + dataContainer.rename("citation", "rcsb_bird_citation") + if dataContainer.exists("citation_author"): + dataContainer.rename("citation_author", "rcsb_bird_citation_author") + return True + except Exception as e: + logger.exception("For %s failing with %s", catName, str(e)) + + return False + + def addChemCompSynonyms(self, dataContainer, catName, **kwargs): + """Add category rcsb_chem_comp_synonyms including PDB and DrugBank annotations.
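+ Synonyms are aggregated from the chem_comp preferred name, pdbx_chem_comp_synonyms, pdbx_chem_comp_identifier, and DrugBank aliases and brand names.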
+ + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + For example, + + loop_ + _rcsb_chem_comp_synonyms.comp_id + _rcsb_chem_comp_synonyms.ordinal + _rcsb_chem_comp_synonyms.name + _rcsb_chem_comp_synonyms.provenance_code + _rcsb_chem_comp_synonyms.type + + ATP 1 "adenosine 5'-(tetrahydrogen triphosphate)" 'PDB Reference Data' 'Preferred Name' + ATP 2 "Adenosine 5'-triphosphate" 'PDB Reference Data' 'Preferred Common Name' + ATP 3 Atriphos DrugBank 'Synonym' + ATP 4 Striadyne DrugBank 'Synonym' + + """ + try: + logger.debug("Starting with %r %r", dataContainer.getName(), catName) + if not (dataContainer.exists("chem_comp") and dataContainer.exists("chem_comp_atom") and dataContainer.exists("pdbx_chem_comp_identifier")): + return False + # + # + # Create the new target category + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + else: + # remove the rowlist - + pass + # + tTupL = self.__dApi.getEnumListWithDetail(catName, "type") + typeLookupD = {tTup[0].upper(): tTup[0] for tTup in tTupL} + + pTupL = self.__dApi.getEnumListWithDetail(catName, "provenance_source") + provLookupD = {pTup[0].upper(): pTup[0] for pTup in pTupL} + + provLookupD["ACD-LABS"] = "ACDLabs" + provLookupD["PDB"] = "PDB Reference Data" + + wObj = dataContainer.getObj(catName) + # + # Get all of the relevant names from the definition - + # + iRow = 0 + nmD = {} + provCode = "PDB Reference Data" + ccObj = dataContainer.getObj("chem_comp") + ccId = ccObj.getValue("id", 0) + ccName = ccObj.getValue("name", 0) + # ccSynonymL = [] + # if ccObj.hasAttribute("pdbx_synonyms"): + # ccSynonymL = str(ccObj.getValue("pdbx_synonyms", 0)).split(";") + # + wObj.setValue(ccId, "comp_id", iRow) + wObj.setValue(ccName, "name", iRow) + wObj.setValue(iRow + 1, "ordinal", iRow) + wObj.setValue(provCode, "provenance_source", iRow) + wObj.setValue("Preferred Name", "type", iRow) + iRow += 1 + nmD[ccName] = True + # + if dataContainer.exists("pdbx_chem_comp_synonyms"): + qObj = dataContainer.getObj("pdbx_chem_comp_synonyms") + for ii in range(qObj.getRowCount()): + sType = qObj.getValue("type", ii) + pCode = provCode + pType = "Preferred Synonym" if sType.upper() == "PREFERRED" else "Synonym" + nm = str(qObj.getValue("name", ii)).strip() + if nm in nmD: + continue + nmD[nm] = True + logger.debug("Synonym %r sType %r pCode %r", nm, sType, pCode) + wObj.setValue(ccId, "comp_id", iRow) + wObj.setValue(nm, "name", iRow) + wObj.setValue(iRow + 1, "ordinal", iRow) + wObj.setValue(pCode, "provenance_source", iRow) + wObj.setValue(pType, "type", iRow) + iRow += 1 + else: + logger.debug("No synonyms for %s", ccId) + + # for nm in ccSynonymL: + # if nm in ["?", "."]: + # continue + # if nm in nmD: + # continue + # nmD[nm] = True + # wObj.setValue(ccId, "comp_id", iRow) + # wObj.setValue(nm, "name", iRow) + # wObj.setValue(iRow + 1, "ordinal", iRow) + # wObj.setValue(provCode, "provenance_source", iRow) + # wObj.setValue("Synonym", "type", iRow) + # iRow += 1 + # + ccIObj = dataContainer.getObj("pdbx_chem_comp_identifier") + for ii in range(ccIObj.getRowCount()): + nm = str(ccIObj.getValue("identifier", ii)).strip() + prog = ccIObj.getValue("program", ii) + iType = ccIObj.getValue("type", ii) + if not iType or iType.upper() not in typeLookupD: + continue + if prog and prog.upper() in provLookupD: + sProg =
provLookupD[prog.upper()] + sType = typeLookupD[iType.upper()] + wObj.setValue(ccId, "comp_id", iRow) + wObj.setValue(nm, "name", iRow) + wObj.setValue(iRow + 1, "ordinal", iRow) + wObj.setValue(sProg, "provenance_source", iRow) + wObj.setValue(sType, "type", iRow) + + iRow += 1 + else: + logger.error("%s unknown provenance %r", ccId, prog) + # + rP = kwargs.get("resourceProvider") + dbProvider = rP.getResource("DrugBankProvider instance") if rP else None + dbD = dbProvider.getMapping() + if dbD: + dbMapD = dbD["id_map"] + # + if dbMapD and ccId in dbMapD: + if "aliases" in dbMapD[ccId]: + iRow = wObj.getRowCount() + for nm in dbMapD[ccId]["aliases"]: + wObj.setValue(ccId, "comp_id", iRow) + wObj.setValue(str(nm).strip(), "name", iRow) + wObj.setValue(iRow + 1, "ordinal", iRow) + wObj.setValue("DrugBank", "provenance_source", iRow) + wObj.setValue("Synonym", "type", iRow) + iRow += 1 + if "brand_names" in dbMapD[ccId]: + iRow = wObj.getRowCount() + for nm in dbMapD[ccId]["brand_names"]: + wObj.setValue(ccId, "comp_id", iRow) + wObj.setValue(str(nm).strip(), "name", iRow) + wObj.setValue(iRow + 1, "ordinal", iRow) + wObj.setValue("DrugBank", "provenance_source", iRow) + wObj.setValue("Brand Name", "type", iRow) + iRow += 1 + + return True + except Exception as e: + logger.exception("For %s failing with %s", catName, str(e)) + + return False diff --git a/rcsb/utils/dictionary/DictMethodCommonUtils.py b/rcsb/utils/dictionary/DictMethodCommonUtils.py new file mode 100644 index 0000000..0a33d69 --- /dev/null +++ b/rcsb/utils/dictionary/DictMethodCommonUtils.py @@ -0,0 +1,3994 @@ +## +# File: DictMethodCommonUtils.py +# Author: J. Westbrook +# Date: 16-Jul-2019 +# Version: 0.001 Initial version +# +# Updates: +# 26-Jul-2019 jdw Include struct_mon_prot_cis with secondary structure features +# Add general processing of intermolecular and other connections. +# 19-Sep-2019 jdw Add method getEntityReferenceAlignments() +# 13-Oct-2019 jdw add isoform support +## +""" +Helper class implements common utility external method references supporting the RCSB dictionary extension. 
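+ +Results of the more expensive per-entry extractions are memoized in small bounded caches (CacheUtils) keyed by container name, so repeated getter calls against the same dataContainer are inexpensive.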
+ +""" +__docformat__ = "restructuredtext en" +__author__ = "John Westbrook" +__email__ = "jwest@rcsb.rutgers.edu" +__license__ = "Apache 2.0" + +# pylint: disable=too-many-lines + +import datetime +import itertools +import logging +import re +import sys +from collections import OrderedDict, namedtuple + +from rcsb.utils.io.CacheUtils import CacheUtils +from rcsb.utils.seq.SeqAlign import SeqAlign + +logger = logging.getLogger(__name__) + +OutlierValueFields = ("compId", "seqId", "outlierType", "description", "reported", "reference", "uncertaintyValue", "uncertaintyType") +OutlierValue = namedtuple("OutlierValue", OutlierValueFields, defaults=(None,) * len(OutlierValueFields)) + +BoundEntityFields = ("targetCompId", "connectType", "partnerCompId", "partnerEntityId", "partnerEntityType") +NonpolymerBoundEntity = namedtuple("NonpolymerBoundEntity", BoundEntityFields, defaults=(None,) * len(BoundEntityFields)) + +BoundInstanceFields = ("targetCompId", "connectType", "partnerCompId", "partnerAsymId", "partnerEntityType", "partnerSeqId", "bondDistance", "bondOrder") +NonpolymerBoundInstance = namedtuple("NonpolymerBoundInstance", BoundInstanceFields, defaults=(None,) * len(BoundInstanceFields)) + +NonpolymerValidationFields = ("rsr", "rscc", "mogul_bonds_rmsz", "mogul_angles_rmsz", "missing_heavy_atom_count") +NonpolymerValidationInstance = namedtuple("NonpolymerValidationInstance", NonpolymerValidationFields, defaults=(None,) * len(NonpolymerValidationFields)) + + +class DictMethodCommonUtils(object): + """Helper class implements common utility external method references supporting the RCSB dictionary extension.""" + + # Dictionary of current standard monomers - + aaDict3 = { + "ALA": "A", + "ARG": "R", + "ASN": "N", + "ASP": "D", + "ASX": "B", + "CYS": "C", + "GLN": "Q", + "GLU": "E", + "GLX": "Z", + "GLY": "G", + "HIS": "H", + "ILE": "I", + "LEU": "L", + "LYS": "K", + "MET": "M", + "PHE": "F", + "PRO": "P", + "SER": "S", + "THR": "T", + "TRP": "W", + "TYR": "Y", + "VAL": "V", + "PYL": "O", + "SEC": "U", + } + dnaDict3 = {"DA": "A", "DC": "C", "DG": "G", "DT": "T", "DU": "U", "DI": "I"} + rnaDict3 = {"A": "A", "C": "C", "G": "G", "I": "I", "N": "N", "T": "T", "U": "U"} + # "UNK": "X", + # "MSE":"M", + # ".": "." 
+ monDict3 = {**aaDict3, **dnaDict3, **rnaDict3} + + def __init__(self, **kwargs): + """ + Args: + **kwargs: (dict) Placeholder for future key-value arguments + + """ + # + self._raiseExceptions = kwargs.get("raiseExceptions", False) + self.__wsPattern = re.compile(r"\s+", flags=re.UNICODE | re.MULTILINE) + self.__reNonDigit = re.compile(r"[^\d]+") + # + cacheSize = 5 + self.__entityAndInstanceMapCache = CacheUtils(size=cacheSize, label="instance mapping") + self.__atomInfoCache = CacheUtils(size=cacheSize, label="atom site counts and mapping") + self.__protSSCache = CacheUtils(size=cacheSize, label="protein secondary structure") + self.__instanceConnectionCache = CacheUtils(size=cacheSize, label="instance connections") + self.__entityReferenceSequenceDetailsCache = CacheUtils(size=cacheSize, label="entity reference sequence details") + self.__entitySequenceFeatureCache = CacheUtils(size=cacheSize, label="entity sequence features") + self.__instanceSiteInfoCache = CacheUtils(size=cacheSize, label="instance site details") + self.__instanceUnobservedCache = CacheUtils(size=cacheSize, label="instance unobserved details") + self.__modelOutliersCache = CacheUtils(size=cacheSize, label="model outlier details") + # + logger.debug("Dictionary common utilities init") + + def echo(self, msg): + logger.info(msg) + + def testCache(self): + return True + + def isFloat(self, val): + try: + float(val) + except Exception: + return False + return True + + def __fetchEntityAndInstanceTypes(self, dataContainer): + wD = self.__entityAndInstanceMapCache.get(dataContainer.getName()) + if not wD: + wD = self.__getEntityAndInstanceTypes(dataContainer) + self.__entityAndInstanceMapCache.set(dataContainer.getName(), wD) + return wD + + def getFormulaWeightNonSolvent(self, dataContainer): + """Return the formula weight of the non-solvent entities in the deposited entry. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + float: formula weight (kilodaltons) + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchEntityAndInstanceTypes(dataContainer) + return wD["fwNonSolvent"] if "fwNonSolvent" in wD else {} + + def getInstancePolymerTypes(self, dataContainer): + """Return a dictionary of polymer types for each polymer instance. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + dict: {'asymId': <filtered polymer type>, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchEntityAndInstanceTypes(dataContainer) + return wD["instancePolymerTypeD"] if "instancePolymerTypeD" in wD else {} + + def getInstanceTypes(self, dataContainer): + """Return a dictionary of entity types for each entity instance. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + dict: {'asymId': <entity type>, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchEntityAndInstanceTypes(dataContainer) + return wD["instanceTypeD"] if "instanceTypeD" in wD else {} + + def getInstanceTypeCounts(self, dataContainer): + """Return a dictionary of instance counts for each entity type.
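+ + For example, an entry with two protein chains, one bound ligand, and ordered waters might return {'polymer': 2, 'non-polymer': 1, 'water': 1} (illustrative values).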
+ + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + dict: {'entity type': <# of instances>, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchEntityAndInstanceTypes(dataContainer) + return wD["instanceTypeCountD"] if "instanceTypeCountD" in wD else {} + + def getInstanceEntityMap(self, dataContainer): + """Return a dictionary of entities corresponding to each entity instance. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + dict: {'asymId': <entity id>, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchEntityAndInstanceTypes(dataContainer) + return wD["instEntityD"] if "instEntityD" in wD else {} + + def getEntityPolymerTypes(self, dataContainer): + """Return a dictionary of polymer types for each polymer entity. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + dict: {'entityId': <dictionary polymer type>, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchEntityAndInstanceTypes(dataContainer) + return wD["epTypeD"] if "epTypeD" in wD else {} + + def getEntityTypes(self, dataContainer): + """Return a dictionary of entity types for each entity. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + dict: {'entityId': <dictionary entity type>, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchEntityAndInstanceTypes(dataContainer) + return wD["eTypeD"] if "eTypeD" in wD else {} + + def getPolymerEntityFilteredTypes(self, dataContainer): + """Return a dictionary of filtered entity polymer types for each polymer entity. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + dict: {'entityId': <filtered polymer type>, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchEntityAndInstanceTypes(dataContainer) + return wD["epTypeFilteredD"] if "epTypeFilteredD" in wD else {} + + def getPolymerEntityLengths(self, dataContainer): + """Return a dictionary of entity polymer lengths (from the one-letter-code sequence) for each polymer entity. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + dict: {'entityId': <monomer length>, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchEntityAndInstanceTypes(dataContainer) + return wD["epLengthD"] if "epLengthD" in wD else {} + + def getPolymerEntityLengthsEnumerated(self, dataContainer): + """Return a dictionary of entity polymer lengths (from the enumerated sequence) for each polymer entity. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + dict: {'entityId': <monomer length>, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchEntityAndInstanceTypes(dataContainer) + return wD["entityPolymerLengthD"] if "entityPolymerLengthD" in wD else {} + + def getPolymerEntityMonomerCounts(self, dataContainer): + """Return a dictionary of monomer counts for each polymer entity. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + dict: {'entityId': {'compId': <count>, ...
}} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchEntityAndInstanceTypes(dataContainer) + return wD["entityPolymerMonomerCountD"] if "entityPolymerMonomerCountD" in wD else {} + + def getPolymerEntityModifiedMonomers(self, dataContainer): + """Return a dictionary of nonstandard monomers for each polymer entity. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + dict: {'entityId': [mod_comp_id, mod_comp_id,...]} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchEntityAndInstanceTypes(dataContainer) + return wD["entityPolymerModifiedMonomers"] if "entityPolymerModifiedMonomers" in wD else {} + + def getPolymerModifiedMonomerFeatures(self, dataContainer): + """Return a dictionary of nonstandard monomer features. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + dict: [(entityId, seqId, compId, 'modified_monomer')] = set(compId) + + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchEntityAndInstanceTypes(dataContainer) + return wD["seqModMonomerFeatureD"] if "seqModMonomerFeatureD" in wD else {} + + def getEntityPolymerLengthBounds(self, dataContainer): + """Return the minimum and maximum polymer monomer lengths over all polymer entities. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + tuple: (minLen, maxLen) + """ + if not dataContainer or not dataContainer.getName(): + return () + wD = self.__fetchEntityAndInstanceTypes(dataContainer) + return wD["entityPolymerLengthBounds"] if "entityPolymerLengthBounds" in wD else () + + def getEntityFormulaWeightBounds(self, dataContainer): + """Return a dictionary of formula weight bounds by entity type. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + dict: [entityType] = (minFw, maxFw) + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchEntityAndInstanceTypes(dataContainer) + return wD["fwTypeBoundD"] if "fwTypeBoundD" in wD else {} + + def getTargetComponents(self, dataContainer): + """Return the list of target components (ligands of interest). + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + list: [compId, compId,...] + """ + if not dataContainer or not dataContainer.getName(): + return [] + wD = self.__fetchEntityAndInstanceTypes(dataContainer) + return wD["ccTargets"] if "ccTargets" in wD else [] + + def __getEntityAndInstanceTypes(self, dataContainer): + """Internal method to collect and return entity/instance type, size and mapping information. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + (dict) : Return dictionary of entity types, type counts and polymer type (where applicable) for + each instance in the deposited unit.
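+ + All of the maps below are computed together and cached as a unit (see __fetchEntityAndInstanceTypes()).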
+ + Type and count contents: + + instanceTypeD[asymId] = <entity type> + instanceTypeCountD[<entity type>] = <# of instances> + instancePolymerTypeD[asymId] = <filtered polymer type> + eTypeD[entityId] = <dictionary entity type> + instEntityD[asymId] = entityId + epTypeD[entityId] = <dictionary polymer type> + epTypeFilteredD[entityId] = <filtered polymer type> + epLengthD[entityId] = polymer monomer length (from one-letter-code) + entityPolymerLengthD[entityId] = polymer monomer length (from enumerated sequence) + entityPolymerMonomerCountD[entityId][compId] = monomer count + entityPolymerModifiedMonomers[entity]=[mod compId, mod compId] + seqModMonomerFeatureD[(entityId, seqId, compId, 'modified_monomer')] = set(compId) + fwNonSolvent = float value (kilodaltons) + fwTypeBoundD[entityType] = (minFw, maxFw) + entityPolymerLengthBounds = (minL, maxL) + ccTargets = [compId, compId] + """ + rD = {} + # + try: + # + if not dataContainer.exists("entity") or not dataContainer.exists("struct_asym"): + return {} + eFwD = {} + instanceTypeD = {} + instancePolymerTypeD = {} + instanceTypeCountD = {} + # + eObj = dataContainer.getObj("entity") + eTypeD = {} + for ii in range(eObj.getRowCount()): + # logger.info("Attribute %r %r" % (ii, eObj.getAttributeList())) + entityId = eObj.getValue("id", ii) + eType = eObj.getValue("type", ii) + eTypeD[entityId] = eType + fw = eObj.getValue("formula_weight", ii) + eFwD[entityId] = float(fw) if fw and fw not in [".", "?"] else 0.0 + # + epTypeD = {} + epLengthD = {} + epTypeFilteredD = {} + hasEntityPoly = False + if dataContainer.exists("entity_poly"): + hasEntityPoly = True + epObj = dataContainer.getObj("entity_poly") + for ii in range(epObj.getRowCount()): + entityId = epObj.getValue("entity_id", ii) + pType = epObj.getValue("type", ii) + epTypeFilteredD[entityId] = self.filterEntityPolyType(pType) + epTypeD[entityId] = pType + if epObj.hasAttribute("pdbx_seq_one_letter_code_can"): + sampleSeq = self.__stripWhiteSpace(epObj.getValue("pdbx_seq_one_letter_code_can", ii)) + epLengthD[entityId] = len(sampleSeq) if sampleSeq and sampleSeq not in ["?", "."] else None + + # + seqModMonomerFeatureD = {} + entityPolymerMonomerCountD = {} + entityPolymerLengthD = {} + hasEntityPolySeq = False + epsObj = None + if dataContainer.exists("entity_poly_seq"): + epsObj = dataContainer.getObj("entity_poly_seq") + hasEntityPolySeq = True + tSeqD = {} + for ii in range(epsObj.getRowCount()): + entityId = epsObj.getValue("entity_id", ii) + seqNum = epsObj.getValue("num", ii) + compId = epsObj.getValue("mon_id", ii) + if compId not in DictMethodCommonUtils.monDict3: + seqModMonomerFeatureD.setdefault((entityId, seqNum, compId, "modified_monomer"), set()).add(compId) + # handle heterogeneity with the entityId,seqNum tuple + tSeqD.setdefault(entityId, set()).add((entityId, seqNum)) + if entityId not in entityPolymerMonomerCountD: + entityPolymerMonomerCountD[entityId] = {} + entityPolymerMonomerCountD[entityId][compId] = entityPolymerMonomerCountD[entityId][compId] + 1 if compId in entityPolymerMonomerCountD[entityId] else 1 + # + entityPolymerLengthD = {entityId: len(tSet) for entityId, tSet in tSeqD.items()} + # + if not hasEntityPoly and hasEntityPolySeq: + for entityId, eType in eTypeD.items(): + if eType in ["polymer"]: + monomerL = epsObj.selectValuesWhere("mon_id", entityId, "entity_id") + pType, fpType = self.guessEntityPolyTypes(monomerL) + epTypeFilteredD[entityId] = fpType + epTypeD[entityId] = pType + epLengthD[entityId] = len(monomerL) + + entityPolymerModifiedMonomers = {} + for entityId, cD in entityPolymerMonomerCountD.items(): + tL = [] + for compId, _ in cD.items(): + modFlag = "N" if compId in
DictMethodCommonUtils.monDict3 else "Y" + if modFlag == "Y": + tL.append(compId) + entityPolymerModifiedMonomers[entityId] = sorted(set(tL)) + # + logger.debug("%s entityPolymerModifiedMonomers %r", dataContainer.getName(), entityPolymerModifiedMonomers) + # Add branched here + # + instEntityD = {} + sObj = dataContainer.getObj("struct_asym") + for ii in range(sObj.getRowCount()): + entityId = sObj.getValue("entity_id", ii) + asymId = sObj.getValue("id", ii) + instEntityD[asymId] = entityId + if entityId in eTypeD: + instanceTypeD[asymId] = eTypeD[entityId] + else: + logger.warning("Missing entity id entry %r asymId %r entityId %r", dataContainer.getName(), asymId, entityId) + if entityId in epTypeD: + instancePolymerTypeD[asymId] = epTypeFilteredD[entityId] + # + # + # Count the instance by type - initialize all types + # + instanceTypeCountD = {k: 0 for k in ["polymer", "non-polymer", "branched", "macrolide", "water"]} + for asymId, eType in instanceTypeD.items(): + instanceTypeCountD[eType] += 1 + # + # Compute the total weight of polymer and non-polymer instances (full entities) - (kilodaltons) + # + fwNonSolvent = 0.0 + for asymId, eType in instanceTypeD.items(): + if eType not in ["water"]: + entityId = instEntityD[asymId] + fwNonSolvent += eFwD[entityId] + fwNonSolvent = fwNonSolvent / 1000.0 + # + # Get ligand of interest. + # + ccTargets = [] + if dataContainer.exists("pdbx_entity_instance_feature"): + ifObj = dataContainer.getObj("pdbx_entity_instance_feature") + for ii in range(ifObj.getRowCount()): + compId = ifObj.getValue("comp_id", ii) + ft = ifObj.getValue("feature_type", ii) + if ft.upper() in ["SUBJECT OF INVESTIGATION"]: + ccTargets.append(compId) + # + # + fwTypeBoundD = {} + tBoundD = {et: {"min": float("inf"), "max": -1.0} for eId, et in eTypeD.items()} + for entityId, fw in eFwD.items(): + fw = fw / 1000.0 + eType = eTypeD[entityId] + tBoundD[eType]["min"] = fw if fw < tBoundD[eType]["min"] else tBoundD[eType]["min"] + tBoundD[eType]["max"] = fw if fw > tBoundD[eType]["max"] else tBoundD[eType]["max"] + for eType in tBoundD: + if tBoundD[eType]["min"] > 0.00000001: + fwTypeBoundD[eType] = tBoundD[eType] + # + + entityPolymerLengthBounds = None + maxL = -1 + minL = sys.maxsize + if epLengthD: + for entityId, pLen in epLengthD.items(): + # skip entities with undetermined length (None) + if pLen is None: + continue + minL = pLen if pLen < minL else minL + maxL = pLen if pLen > maxL else maxL + entityPolymerLengthBounds = (minL, maxL) + # + + rD = { + "instanceTypeD": instanceTypeD, + "instancePolymerTypeD": instancePolymerTypeD, + "instanceTypeCountD": instanceTypeCountD, + "instEntityD": instEntityD, + "eTypeD": eTypeD, + "epLengthD": epLengthD, + "epTypeD": epTypeD, + "epTypeFilteredD": epTypeFilteredD, + "entityPolymerMonomerCountD": entityPolymerMonomerCountD, + "entityPolymerLengthD": entityPolymerLengthD, + "entityPolymerModifiedMonomers": entityPolymerModifiedMonomers, + "seqModMonomerFeatureD": seqModMonomerFeatureD, + "fwNonSolvent": fwNonSolvent, + "fwTypeBoundD": fwTypeBoundD, + "entityPolymerLengthBounds": entityPolymerLengthBounds, + "ccTargets": ccTargets, + } + logger.debug("%s length struct_asym %d (%d) instanceTypeD %r", dataContainer.getName(), sObj.getRowCount(), len(instanceTypeD), instanceTypeD) + # + except Exception as e: + logger.exception("Failing %r with %r", dataContainer.getName(), str(e)) + # + return rD + + def getAsymAuthIdMap(self, dataContainer): + """Return a dictionary mapping asymId to authAsymId.
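+ + For example: {'A': 'A', 'B': 'A', 'C': 'B'} (illustrative values only).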
+ + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + dict: {'asymId': authAsymId, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchAtomSiteInfo(dataContainer) + return wD["asymAuthIdD"] if "asymAuthIdD" in wD else {} + + def getInstanceHeavyAtomCounts(self, dataContainer, modelId="1"): + """Return a dictionary of deposited heavy atom counts for each entity instance. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + modelId (str, optional): model index. Defaults to "1". + + + Returns: + dict: {'asymId': <# of deposited atoms>, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchAtomSiteInfo(dataContainer, modelId=modelId) + return wD["instanceHeavyAtomCountD"] if "instanceHeavyAtomCountD" in wD else {} + + def getInstanceHydrogenAtomCounts(self, dataContainer, modelId="1"): + """Return a dictionary of deposited hydrogen atom counts for each entity instance. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + modelId (str, optional): model index. Defaults to "1". + + + Returns: + dict: {'asymId': <# of deposited atoms>, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchAtomSiteInfo(dataContainer, modelId=modelId) + return wD["instanceHydrogenAtomCountD"] if "instanceHydrogenAtomCountD" in wD else {} + + def getModelIdList(self, dataContainer): + """Return a list of model identifiers for the entry. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + list: [1,2,3] + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchAtomSiteInfo(dataContainer) + return wD["modelIdList"] if "modelIdList" in wD else [] + + def getEntityTypeHeavyAtomCounts(self, dataContainer, modelId="1"): + """Return a dictionary of deposited heavy atom counts for each entity type. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + modelId (str, optional): model index. Defaults to "1". + + Returns: + dict: {'entity type': <# of deposited atoms>, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchAtomSiteInfo(dataContainer, modelId=modelId) + return wD["typeHeavyAtomCountD"] if "typeHeavyAtomCountD" in wD else {} + + def getInstanceModeledMonomerCounts(self, dataContainer, modelId="1"): + """Return a dictionary of deposited modeled monomer counts for each entity instance. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + modelId (str, optional): model index. Defaults to "1". + + Returns: + dict: {'asymId': <# of deposited modeled monomers>, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchAtomSiteInfo(dataContainer, modelId=modelId) + return wD["instancePolymerModeledMonomerCountD"] if "instancePolymerModeledMonomerCountD" in wD else {} + + def getInstanceUnModeledMonomerCounts(self, dataContainer, modelId="1"): + """Return a dictionary of deposited unmodeled monomer counts for each entity instance. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + modelId (str, optional): model index. Defaults to "1".
+ + Returns: + dict: {'asymId': <# of deposited unmodeled mononmers>, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchAtomSiteInfo(dataContainer, modelId=modelId) + return wD["instancePolymerUnmodeledMonomerCountD"] if "instancePolymerUnmodeledMonomerCountD" in wD else {} + + def getDepositedMonomerCounts(self, dataContainer, modelId="1"): + """Return deposited modeled and unmodeled polymer monomer counts for the input modelid. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + modelId (str, optional): model index. Defaults to "1". + + + Returns: + (int,int): modeled and unmodeled monomer counts + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchAtomSiteInfo(dataContainer, modelId=modelId) + modeledCount = sum(wD["instancePolymerModeledMonomerCountD"].values()) + unModeledCount = sum(wD["instancePolymerUnmodeledMonomerCountD"].values()) + return modeledCount, unModeledCount + + def getDepositedAtomCounts(self, dataContainer, modelId="1"): + """Return the number of deposited heavy atoms in the input model, the total deposited atom + and the total model count. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + modelId (str, optional): model index. Defaults to "1". + + Returns: + (int, int, int, int) deposited heavy atoms in input model, hydrogen atoms in input model, total deposited atom count, and total deposited model count + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchAtomSiteInfo(dataContainer, modelId=modelId) + numHeavyAtomsModel = wD["numHeavyAtomsModel"] if "numHeavyAtomsModel" in wD else 0 + numHydrogenAtomsModel = wD["numHydrogenAtomsModel"] if "numHydrogenAtomsModel" in wD else 0 + numAtomsTotal = wD["numAtomsAll"] if "numAtomsAll" in wD else 0 + numModelsTotal = wD["numModels"] if "numModels" in wD else 0 + return numHeavyAtomsModel, numHydrogenAtomsModel, numAtomsTotal, numModelsTotal + + def getInstancePolymerRanges(self, dataContainer): + """Return a dictionary of polymer residue range and length for each entity instance. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {"asymId": , {"sampleSeqLen": sampleSeqLen, + "obsSeqLen": obsSeqLen, + "begSeqId": begSeqId, + "endSeqId": endSeqId, + "begAuthSeqId": begAuthSeqId, + "endAuthSeqId": endAuthSeqId, + "begInsCode": begAuthInsCode, + "endInsCode": endAuthInsCode,}...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchAtomSiteInfo(dataContainer) + return wD["asymIdPolymerRangesD"] if "asymIdPolymerRangesD" in wD else {} + + def getInstanceIdMap(self, dataContainer): + """Return a dictionary of cardinal identifiers for each entity instance. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {"asymId": {"entry_id": entryId, + "entity_id": entityId, + "entity_type": entityTypeD[entityId], + "asym_id": asymId, + "auth_asym_id": authAsymId, + "comp_id": monId, + "auth_seq_id": "?",}, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchAtomSiteInfo(dataContainer) + return wD["instanceIdMapD"] if "instanceIdMapD" in wD else {} + + def getNonPolymerIdMap(self, dataContainer): + """Return a dictionary of cardinal identifiers for each non-polymer entity instance. 
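+            Keys are (auth_asym_id, auth_seq_id) tuples; for example (illustrative
+            only) ("A", "401") for a ligand at author residue 401 of chain A.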
+ + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {(authAsymId, resNum): {"entry_id": entryId, + "entity_id": entityId, + "entity_type": entityTypeD[entityId], + "asym_id": asymId, + "auth_asym_id": authAsymId, + "comp_id": monId, + "auth_seq_id": resNum, + }, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchAtomSiteInfo(dataContainer) + return wD["npAuthAsymIdMapD"] if "npAuthAsymIdMapD" in wD else {} + + def getPolymerIdMap(self, dataContainer): + """Return a dictionary of cardinal identifiers for each polymer entity instance. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {(authAsymId, authSeqId, insCode): { + "entry_id": entryId, + "entity_id": entityId, + "entity_type": entityTypeD[entityId], + "asym_id": asymId, + "comp_id": compId, + }, ... } + + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchAtomSiteInfo(dataContainer) + return wD["pAuthAsymIdMapD"] if "pAuthAsymIdMapD" in wD else {} + + def getBranchedIdMap(self, dataContainer): + """Return a dictionary of cardinal identifiers for each branched entity instance. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {(authAsymId, authSeqNum): { + "entry_id": entryId, + "entity_id": entityId, + "entity_type": entityTypeD[entityId], + "asym_id": asymId, + "auth_asym_id": authAsymId, + "comp_id": monId, + "auth_seq_id": authSeqNum, + "seq_num": seqNum, + }, ...} + + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchAtomSiteInfo(dataContainer) + return wD["brAuthAsymIdMapD"] if "brAuthAsymIdMapD" in wD else {} + + def getEntityTypeUniqueIds(self, dataContainer): + """Return a nested dictionary of selected unique identifiers for entity types. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: [][] = {'asymIds': [...],'authAsymIds': [...], 'ccIds': [...]} + + + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchAtomSiteInfo(dataContainer) + return wD["entityTypeUniqueIds"] if "entityTypeUniqueIds" in wD else {} + + def getAuthToSeqIdMap(self, dataContainer): + """Return an instance (asymId) dictionary of auth to entity residue sequence mapping + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: seqIdMapAsymD[asymId] = [, ... ] + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchAtomSiteInfo(dataContainer) + return wD["seqIdMapAsymD"] if "seqIdMapAsymD" in wD else {} + + def __fetchAtomSiteInfo(self, dataContainer, modelId="1"): + wD = self.__atomInfoCache.get((dataContainer.getName(), modelId)) + if not wD: + wD = self.__getAtomSiteInfo(dataContainer, modelId=modelId) + self.__atomInfoCache.set((dataContainer.getName(), modelId), wD) + return wD + + def __getAtomSiteInfo(self, dataContainer, modelId="1"): + """Get counting information for each instance in the deposited coordinates for the input model. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + modelId (str, optional): model index. Defaults to "1". + + Returns: + (dict): with atom site counting and instance mapping details. 
+ + For instance, the following are calculated: + + instanceHeavyAtomCountD[asymId]: number of deposited heavy atoms + typeHeavyAtomCountD[entity type]: number of deposited heavy atoms + + numHeavyAtomsModel: number of deposited heavy atoms in input model_id + modelId: modelId + + instancePolymerModeledMonomerCountD[asymId]: number modeled polymer monomers in deposited coordinates + instancePolymerUnmodeledMonomerCountD[asymId]: number of polymer unmodeled monomers in deposited coordinates + + numModels: total number of deposited models + numAtomsAll: total number of deposited atoms + + asymAuthIdD = {asymId: authAsymId, ... } + + asymIdPolymerRangesD = {asymId: {"sampleSeqLen": sampleSeqLen, + "obsSeqLen": obsSeqLen, + "begSeqId": begSeqId, + "endSeqId": endSeqId, + "begAuthSeqId": begAuthSeqId, + "endAuthSeqId": endAuthSeqId, + "begInsCode": begAuthInsCode, + "endInsCode": endAuthInsCode,}, ...} + instanceIdMapD = {asymId: {"entry_id": entryId, + "entity_id": entityId, + "entity_type": entityTypeD[entityId], + "asym_id": asymId, + "auth_asym_id": authAsymId, + "comp_id": monId, + "auth_seq_id": "?",}, ...} + + pAuthAsymIdMapD[(authAsymId, authSeqId, insCode)] = { + "entry_id": entryId, + "entity_id": entityId, + "entity_type": entityTypeD[entityId], + "asym_id": asymId, + "comp_id": compId, + "seq_id": seqId, + } + + npAuthAsymIdMapD[(authAsymId, resNum)] = { + "entry_id": entryId, + "entity_id": entityId, + "entity_type": entityTypeD[entityId], + "asym_id": asymId, + "auth_asym_id": authAsymId, + "comp_id": monId, + "auth_seq_id": resNum, + } + + brAuthAsymIdMapD[(authAsymId, authSeqNum)] = { + "entry_id": entryId, + "entity_id": entityId, + "entity_type": entityTypeD[entityId], + "asym_id": asymId, + "auth_asym_id": authAsymId, + "comp_id": monId, + "auth_seq_id": authSeqNum, + "seq_num": seqNum, + } + entityTypeUniqueIds[][] = {'asymIds': [...],'authAsymIds': [...], 'ccIds': [...]} + + seqIdMapAsymD[asymId] = [, ... ] + + """ + # + numAtomsAll = 0 + numHeavyAtomsModel = 0 + typeHeavyAtomCountD = {} + instanceHeavyAtomCountD = {} + # + numHydrogenAtomsModel = 0 + typeHydrogenAtomCountD = {} + instanceHydrogenAtomCountD = {} + # + instancePolymerModeledMonomerCountD = {} + instancePolymerUnmodeledMonomerCountD = {} + atomSiteInfoD = {} + modelIdL = [] + asymAuthIdD = {} + instanceTypeD = self.getInstanceTypes(dataContainer) + entityTypeD = self.getEntityTypes(dataContainer) + # + eObj = dataContainer.getObj("entity") + entityIdL = eObj.getAttributeValueList("id") + # + try: + if dataContainer.exists("atom_site"): + tObj = dataContainer.getObj("atom_site") + # All atoms all types deposited - + numAtomsAll = tObj.getRowCount() + # Heavy atoms per model - + cndL = [("type_symbol", "not in", ["H", "D", "T"]), ("pdbx_PDB_model_num", "eq", modelId)] + numHeavyAtomsModel = tObj.countValuesWhereOpConditions(cndL) + # + modelIdL = tObj.getAttributeUniqueValueList("pdbx_PDB_model_num") + cD = tObj.getCombinationCountsWithConditions(["label_asym_id", "pdbx_PDB_model_num"], [("type_symbol", "not in", ["H", "D", "T"])]) + # + for asymId, _ in instanceTypeD.items(): + instanceHeavyAtomCountD[asymId] = cD[(asymId, modelId)] if (asymId, modelId) in cD else 0 + # + # for eType in ['polymer', 'non-polymer', 'branched', 'macrolide', 'solvent']: + typeHeavyAtomCountD = {k: 0 for k in ["polymer", "non-polymer", "branched", "macrolide", "water"]} + for asymId, aCount in instanceHeavyAtomCountD.items(): + tt = instanceTypeD[asymId] + typeHeavyAtomCountD[tt] += aCount + + # Hydrogen counts ... 
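+                # The hydrogen tallies below mirror the heavy-atom logic above with
+                # the type_symbol membership test inverted (H/D/T selected rather
+                # than excluded): per-model totals via countValuesWhereOpConditions()
+                # and per-instance totals from the (label_asym_id, model) counts.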
+ cndL = [("type_symbol", "in", ["H", "D", "T"]), ("pdbx_PDB_model_num", "eq", modelId)] + numHydrogenAtomsModel = tObj.countValuesWhereOpConditions(cndL) + # + cD = tObj.getCombinationCountsWithConditions(["label_asym_id", "pdbx_PDB_model_num"], [("type_symbol", "in", ["H", "D", "T"])]) + for asymId, _ in instanceTypeD.items(): + instanceHydrogenAtomCountD[asymId] = cD[(asymId, modelId)] if (asymId, modelId) in cD else 0 + # + typeHydrogenAtomCountD = {k: 0 for k in ["polymer", "non-polymer", "branched", "macrolide", "water"]} + for asymId, aCount in instanceHydrogenAtomCountD.items(): + tt = instanceTypeD[asymId] + typeHydrogenAtomCountD[tt] += aCount + # + else: + logger.warning("Missing atom_site category for %s", dataContainer.getName()) + # + numModels = len(modelIdL) + if numModels < 1: + logger.warning("Missing model details in atom_site category for %s", dataContainer.getName()) + # + atomSiteInfoD = { + "instanceHeavyAtomCountD": instanceHeavyAtomCountD, + "typeHeavyAtomCountD": typeHeavyAtomCountD, + "numAtomsAll": numAtomsAll, + "numHeavyAtomsModel": numHeavyAtomsModel, + "numModels": len(modelIdL), + "modelId": modelId, + "modelIdList": sorted(modelIdL), + "instancePolymerModeledMonomerCountD": {}, + "instancePolymerUnmodeledMonomerCountD": {}, + "instanceHydrogenAtomCountD": instanceHydrogenAtomCountD, + "typeHydrogenAtomCountD": typeHydrogenAtomCountD, + "numHydrogenAtomsModel": numHydrogenAtomsModel, + } + except Exception as e: + logger.exception("Failing with %r with %r", dataContainer.getName(), str(e)) + + # + entityTypeUniqueIds = {} + tAsymIdD = {} + seqIdObsMapD = {} + seqIdMapAsymD = {} + epLengthD = self.getPolymerEntityLengths(dataContainer) + asymIdPolymerRangesD = {} + instanceIdMapD = {} + npAuthAsymIdMapD = {} + pAuthAsymIdMapD = {} + brAuthAsymIdMapD = {} + try: + eObj = dataContainer.getObj("entry") + entryId = eObj.getValue("id", 0) + # + psObj = dataContainer.getObj("pdbx_poly_seq_scheme") + if psObj is not None: + # -- + for eId in entityIdL: + if entityTypeD[eId] in ["polymer"]: + tAsymIdL = psObj.selectValuesWhere("asym_id", eId, "entity_id") + tAuthAsymIdL = psObj.selectValuesWhere("pdb_strand_id", eId, "entity_id") + tCcIdL = psObj.selectValuesWhere("mon_id", eId, "entity_id") + entityTypeUniqueIds.setdefault(entityTypeD[eId], {}).setdefault(eId, {"asymIds": tAsymIdL, "authAsymIds": tAuthAsymIdL, "ccIds": tCcIdL}) + # --- + aSeqD = {} + aOrgSeqD = {} + for ii in range(psObj.getRowCount()): + asymId = psObj.getValue("asym_id", ii) + # authSeqId = psObj.getValue("auth_seq_num", ii) + authSeqId = psObj.getValue("pdb_seq_num", ii) + authOrgSeqId = psObj.getValue("auth_seq_num", ii) + seqId = psObj.getValue("seq_id", ii) + compId = psObj.getValue("mon_id", ii) + entityId = psObj.getValue("entity_id", ii) + authAsymId = psObj.getValue("pdb_strand_id", ii) + # + insCode = psObj.getValueOrDefault("pdb_ins_code", ii, defaultValue=None) + aSeqD.setdefault(asymId, []).append(authSeqId) + aOrgSeqD.setdefault(asymId, []).append(authOrgSeqId) + # --- + tC = authSeqId + if authSeqId not in [".", "?"]: + seqIdObsMapD.setdefault(asymId, {})[seqId] = (authSeqId, insCode) + else: + tC = "?" 
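+                    # tC carries the author residue number ("?" when unmodeled);
+                    # an insertion code, if any, is appended below before the value
+                    # is accumulated per asymId in seqIdMapAsymD.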
+ if insCode and tC != "?": + tC += insCode + seqIdMapAsymD.setdefault(asymId, []).append(tC) + # --- + # + pAuthAsymIdMapD[(authAsymId, authSeqId, insCode)] = { + "entry_id": entryId, + "entity_id": entityId, + "entity_type": entityTypeD[entityId], + "asym_id": asymId, + "comp_id": compId, + "seq_id": seqId, + } + # + if asymId in tAsymIdD: + continue + tAsymIdD[asymId] = entityId + asymAuthIdD[asymId] = authAsymId + # + instanceIdMapD[asymId] = { + "entry_id": entryId, + "entity_id": entityId, + "entity_type": entityTypeD[entityId], + "asym_id": asymId, + "auth_asym_id": authAsymId, + "rcsb_id": entryId + "." + asymId, + "comp_id": "?", + "auth_seq_id": "?", + } + # + + # + # Get the modeled and unmodeled monomer counts by asymId + # JDW not use aOrgSeqD.items() + for asymId, sL in aOrgSeqD.items(): + instancePolymerModeledMonomerCountD[asymId] = len([t for t in sL if t not in ["?", "."]]) + instancePolymerUnmodeledMonomerCountD[asymId] = len([t for t in sL if t in ["?", "."]]) + # Get polymer range details for each polymer instance + for asymId, entityId in tAsymIdD.items(): + sampleSeqLen = epLengthD[entityId] if entityId in epLengthD else None + sL = list(seqIdObsMapD[asymId].items()) + begSeqId, (begAuthSeqId, begAuthInsCode) = sL[0] + endSeqId, (endAuthSeqId, endAuthInsCode) = sL[-1] + obsSeqLen = len(sL) + # + asymIdPolymerRangesD[asymId] = { + "sampleSeqLen": sampleSeqLen, + "obsSeqLen": obsSeqLen, + "begSeqId": begSeqId, + "endSeqId": endSeqId, + "begAuthSeqId": begAuthSeqId, + "endAuthSeqId": endAuthSeqId, + "begInsCode": begAuthInsCode, + "endInsCode": endAuthInsCode, + } + atomSiteInfoD["instancePolymerModeledMonomerCountD"] = instancePolymerModeledMonomerCountD + atomSiteInfoD["instancePolymerUnmodeledMonomerCountD"] = instancePolymerUnmodeledMonomerCountD + atomSiteInfoD["asymAuthIdD"] = asymAuthIdD + atomSiteInfoD["asymIdPolymerRangesD"] = asymIdPolymerRangesD + atomSiteInfoD["seqIdMapAsymD"] = seqIdMapAsymD + # -------------- + logger.debug( + "%s instancePolymerModeledMonomerCountD(%d) %r", + dataContainer.getName(), + sum(atomSiteInfoD["instancePolymerModeledMonomerCountD"].values()), + atomSiteInfoD["instancePolymerModeledMonomerCountD"], + ) + logger.debug("%s instancePolymerUnmodeledMonomerCountD %r", dataContainer.getName(), atomSiteInfoD["instancePolymerUnmodeledMonomerCountD"]) + # + # -------------- -------------- -------------- -------------- -------------- -------------- -------------- -------------- + # Add nonpolymer instance mapping + # + npsObj = dataContainer.getObj("pdbx_nonpoly_scheme") + if npsObj is not None: + # -- + for eId in entityIdL: + if entityTypeD[eId] in ["non-polymer", "water"]: + tAsymIdL = npsObj.selectValuesWhere("asym_id", eId, "entity_id") + tAuthAsymIdL = npsObj.selectValuesWhere("pdb_strand_id", eId, "entity_id") + tCcIdL = npsObj.selectValuesWhere("mon_id", eId, "entity_id") + entityTypeUniqueIds.setdefault(entityTypeD[eId], {}).setdefault(eId, {"asymIds": tAsymIdL, "authAsymIds": tAuthAsymIdL, "ccIds": tCcIdL}) + # --- + for ii in range(npsObj.getRowCount()): + asymId = npsObj.getValue("asym_id", ii) + entityId = npsObj.getValue("entity_id", ii) + authAsymId = npsObj.getValue("pdb_strand_id", ii) + resNum = npsObj.getValue("pdb_seq_num", ii) + monId = npsObj.getValue("mon_id", ii) + asymAuthIdD[asymId] = authAsymId + if asymId not in instanceIdMapD: + instanceIdMapD[asymId] = { + "entry_id": entryId, + "entity_id": entityId, + "entity_type": entityTypeD[entityId], + "asym_id": asymId, + "auth_asym_id": authAsymId, + "rcsb_id": 
entryId + "." + asymId, + "comp_id": monId, + "auth_seq_id": resNum, + } + npAuthAsymIdMapD[(authAsymId, resNum)] = { + "entry_id": entryId, + "entity_id": entityId, + "entity_type": entityTypeD[entityId], + "asym_id": asymId, + "auth_asym_id": authAsymId, + "comp_id": monId, + "auth_seq_id": resNum, + } + + # --------- + brsObj = dataContainer.getObj("pdbx_branch_scheme") + if brsObj is not None: + # -- + for eId in entityIdL: + if entityTypeD[eId] in ["branched"]: + tAsymIdL = brsObj.selectValuesWhere("asym_id", eId, "entity_id") + # changed to pdb_asym_id on 2020-07-29 + tAuthAsymIdL = brsObj.selectValuesWhere("pdb_asym_id", eId, "entity_id") + tCcIdL = brsObj.selectValuesWhere("mon_id", eId, "entity_id") + entityTypeUniqueIds.setdefault(entityTypeD[eId], {}).setdefault(eId, {"asymIds": tAsymIdL, "authAsymIds": tAuthAsymIdL, "ccIds": tCcIdL}) + # --- + for ii in range(brsObj.getRowCount()): + asymId = brsObj.getValue("asym_id", ii) + entityId = brsObj.getValue("entity_id", ii) + # + authAsymId = brsObj.getValue("pdb_asym_id", ii) + authSeqNum = brsObj.getValue("pdb_seq_num", ii) + monId = brsObj.getValue("mon_id", ii) + seqNum = brsObj.getValue("num", ii) + asymAuthIdD[asymId] = authAsymId + if asymId not in instanceIdMapD: + instanceIdMapD[asymId] = { + "entry_id": entryId, + "entity_id": entityId, + "entity_type": entityTypeD[entityId], + "asym_id": asymId, + "auth_asym_id": authAsymId, + "rcsb_id": entryId + "." + asymId, + "comp_id": monId, + "auth_seq_id": "?", + } + brAuthAsymIdMapD[(authAsymId, authSeqNum)] = { + "entry_id": entryId, + "entity_id": entityId, + "entity_type": entityTypeD[entityId], + "asym_id": asymId, + "auth_asym_id": authAsymId, + "comp_id": monId, + "auth_seq_id": authSeqNum, + "seq_num": seqNum, + } + + # + atomSiteInfoD["instanceIdMapD"] = instanceIdMapD + atomSiteInfoD["npAuthAsymIdMapD"] = npAuthAsymIdMapD + atomSiteInfoD["pAuthAsymIdMapD"] = pAuthAsymIdMapD + atomSiteInfoD["brAuthAsymIdMapD"] = brAuthAsymIdMapD + atomSiteInfoD["entityTypeUniqueIds"] = entityTypeUniqueIds + + except Exception as e: + logger.exception("Failing for %s with %s", dataContainer.getName(), str(e)) + + # + return atomSiteInfoD + + def getProtHelixFeatures(self, dataContainer): + """Return a dictionary protein helical features (entity/label sequence coordinates). + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {: (asymId, begSeqId, endSeqId), ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchProtSecStructFeatures(dataContainer) + return wD["helixRangeD"] if "helixRangeD" in wD else {} + + def getProtUnassignedSecStructFeatures(self, dataContainer): + """Return a dictionary protein regions lacking SS feature assignments (entity/label sequence coordinates). + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {: (asymId, begSeqId, endSeqId), ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchProtSecStructFeatures(dataContainer) + return wD["unassignedRangeD"] if "unassignedRangeD" in wD else {} + + def getProtSheetFeatures(self, dataContainer): + """Return a dictionary protein beta strand features (entity/label sequence coordinates). 
+ + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {: {asymId: [(begSeqId, endSeqId), ...], } + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchProtSecStructFeatures(dataContainer) + return wD["instSheetRangeD"] if "instSheetRangeD" in wD else {} + + def getProtSheetSense(self, dataContainer): + """Return a dictionary protein beta strand sense . + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {: mixed|parallel|anti-parallel, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchProtSecStructFeatures(dataContainer) + return wD["senseTypeD"] if "senseTypeD" in wD else {} + + def getCisPeptides(self, dataContainer): + """Return a dictionary cis-peptides linkages using standard nomenclature. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {: (begAsymId, begSeqId, endSeqId, modelId, omegaAngle), ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchProtSecStructFeatures(dataContainer) + return wD["cisPeptideD"] if "cisPeptideD" in wD else {} + + def __fetchProtSecStructFeatures(self, dataContainer): + wD = self.__protSSCache.get(dataContainer.getName()) + if not wD: + wD = self.getProtSecStructFeatures(dataContainer) + self.__protSSCache.set(dataContainer.getName(), wD) + return wD + + def getProtSecStructFeatures(self, dataContainer): + """Get secondary structure features using standard nomenclature. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + (dict): with secondary structuree details + + For instance, the following are calculated: + { + "helixCountD": {}, + "sheetStrandCountD": {}, + "unassignedCountD": {}, + "helixLengthD": {}, + "sheetStrandLengthD": {}, + "unassignedLengthD": {}, + "helixFracD": {}, + "sheetStrandFracD": {}, + "unassignedFracD": {}, + "sheetSenseD": {}, + "sheetFullStrandCountD": {}, + "featureMonomerSequenceD": {}, + "featureSequenceD": {}, + # + "unassignedRangeD": {}, + "helixRangeD": {}, + "instHelixD": {}, + "sheetRangeD": {}, + "instSheetD": {}, + "senseTypeD": {} + "cisPeptideD": {}, + } + + # -- Target data categories --- + loop_ + _struct_conf.conf_type_id + _struct_conf.id + _struct_conf.pdbx_PDB_helix_id + _struct_conf.beg_label_comp_id + _struct_conf.beg_label_asym_id + _struct_conf.beg_label_seq_id + _struct_conf.pdbx_beg_PDB_ins_code + _struct_conf.end_label_comp_id + _struct_conf.end_label_asym_id + _struct_conf.end_label_seq_id + _struct_conf.pdbx_end_PDB_ins_code + + _struct_conf.beg_auth_comp_id + _struct_conf.beg_auth_asym_id + _struct_conf.beg_auth_seq_id + _struct_conf.end_auth_comp_id + _struct_conf.end_auth_asym_id + _struct_conf.end_auth_seq_id + _struct_conf.pdbx_PDB_helix_class + _struct_conf.details + _struct_conf.pdbx_PDB_helix_length + HELX_P HELX_P1 AA1 SER A 5 ? LYS A 19 ? SER A 2 LYS A 16 1 ? 15 + HELX_P HELX_P2 AA2 GLU A 26 ? LYS A 30 ? GLU A 23 LYS A 27 5 ? 5 + HELX_P HELX_P3 AA3 GLY A 47 ? LYS A 60 ? GLY A 44 LYS A 57 1 ? 14 + HELX_P HELX_P4 AA4 ASP A 111 ? LEU A 125 ? ASP A 108 LEU A 122 1 ? 15 + # + _struct_conf_type.id HELX_P + _struct_conf_type.criteria ? + _struct_conf_type.reference ? 
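+            (Only rows with conf_type_id HELX_P contribute helix ranges below;
+            other struct_conf conformation types fall into the unassigned tally.)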
+ # ------------------------------------------------------------------- + + loop_ + _struct_asym.id + _struct_asym.pdbx_blank_PDB_chainid_flag + _struct_asym.pdbx_modified + _struct_asym.entity_id + _struct_asym.details + A N N 1 ? + B N N 1 ? + # + _struct_sheet.id A + _struct_sheet.type ? + _struct_sheet.number_strands 8 + _struct_sheet.details ? + # + loop_ + _struct_sheet_order.sheet_id + _struct_sheet_order.range_id_1 + _struct_sheet_order.range_id_2 + _struct_sheet_order.offset + _struct_sheet_order.sense + A 1 2 ? anti-parallel + A 2 3 ? anti-parallel + A 3 4 ? anti-parallel + A 4 5 ? anti-parallel + A 5 6 ? anti-parallel + A 6 7 ? anti-parallel + A 7 8 ? anti-parallel + # + loop_ + _struct_sheet_range.sheet_id + _struct_sheet_range.id + _struct_sheet_range.beg_label_comp_id + _struct_sheet_range.beg_label_asym_id + _struct_sheet_range.beg_label_seq_id + _struct_sheet_range.pdbx_beg_PDB_ins_code + _struct_sheet_range.end_label_comp_id + _struct_sheet_range.end_label_asym_id + _struct_sheet_range.end_label_seq_id + _struct_sheet_range.pdbx_end_PDB_ins_code + + _struct_sheet_range.beg_auth_comp_id + _struct_sheet_range.beg_auth_asym_id + _struct_sheet_range.beg_auth_seq_id + _struct_sheet_range.end_auth_comp_id + _struct_sheet_range.end_auth_asym_id + _struct_sheet_range.end_auth_seq_id + A 1 LYS A 5 ? VAL A 8 ? LYS A 5 VAL A 8 + A 2 ARG A 11 ? THR A 16 ? ARG A 11 THR A 16 + A 3 VAL A 19 ? LEU A 26 ? VAL A 19 LEU A 26 + A 4 TYR A 29 ? ALA A 35 ? TYR A 29 ALA A 35 + A 5 TYR B 29 ? ALA B 35 ? TYR B 29 ALA B 35 + A 6 VAL B 19 ? LEU B 26 ? VAL B 19 LEU B 26 + A 7 ARG B 11 ? THR B 16 ? ARG B 11 THR B 16 + A 8 LYS B 5 ? VAL B 8 ? LYS B 5 VAL B 8 + # + _struct_mon_prot_cis.pdbx_id 1 + _struct_mon_prot_cis.label_comp_id ASN + _struct_mon_prot_cis.label_seq_id 189 + _struct_mon_prot_cis.label_asym_id C + _struct_mon_prot_cis.label_alt_id . + _struct_mon_prot_cis.pdbx_PDB_ins_code ? + _struct_mon_prot_cis.auth_comp_id ASN + _struct_mon_prot_cis.auth_seq_id 2007 + _struct_mon_prot_cis.auth_asym_id 2 + + _struct_mon_prot_cis.pdbx_label_comp_id_2 PRO + _struct_mon_prot_cis.pdbx_label_seq_id_2 190 + _struct_mon_prot_cis.pdbx_label_asym_id_2 C + _struct_mon_prot_cis.pdbx_PDB_ins_code_2 ? 
+ _struct_mon_prot_cis.pdbx_auth_comp_id_2 PRO + _struct_mon_prot_cis.pdbx_auth_seq_id_2 2008 + _struct_mon_prot_cis.pdbx_auth_asym_id_2 2 + + _struct_mon_prot_cis.pdbx_PDB_model_num 1 + _struct_mon_prot_cis.pdbx_omega_angle -6.45 + """ + rD = { + "helixCountD": {}, + "sheetStrandCountD": {}, + "unassignedCountD": {}, + "helixLengthD": {}, + "sheetStrandLengthD": {}, + "unassignedLengthD": {}, + "helixFracD": {}, + "sheetStrandFracD": {}, + "unassignedFracD": {}, + "sheetSenseD": {}, + "sheetFullStrandCountD": {}, + "featureMonomerSequenceD": {}, + "featureSequenceD": {}, + # + "unassignedRangeD": {}, + "helixRangeD": {}, + "instHelixD": {}, + "sheetRangeD": {}, + "instSheetD": {}, + "senseTypeD": {}, + "cisPeptideD": {}, + } + try: + instancePolymerTypeD = self.getInstancePolymerTypes(dataContainer) + instEntityD = self.getInstanceEntityMap(dataContainer) + epLengthD = self.getPolymerEntityLengths(dataContainer) + # + helixRangeD = {} + sheetRangeD = {} + sheetSenseD = {} + unassignedRangeD = {} + cisPeptideD = OrderedDict() + # + if dataContainer.exists("struct_mon_prot_cis"): + tObj = dataContainer.getObj("struct_mon_prot_cis") + for ii in range(tObj.getRowCount()): + cId = tObj.getValue("pdbx_id", ii) + begAsymId = tObj.getValue("label_asym_id", ii) + # begCompId = tObj.getValue("label_comp_id", ii) + begSeqId = int(tObj.getValue("label_seq_id", ii)) + endAsymId = tObj.getValue("pdbx_label_asym_id_2", ii) + # endCompId = int(tObj.getValue("pdbx_label_comp_id_2", ii)) + endSeqId = int(tObj.getValue("pdbx_label_seq_id_2", ii)) + modelId = int(tObj.getValue("pdbx_PDB_model_num", ii)) + omegaAngle = float(tObj.getValue("pdbx_omega_angle", ii)) + # + if (begAsymId == endAsymId) and (begSeqId <= endSeqId): + cisPeptideD.setdefault(cId, []).append((begAsymId, begSeqId, endSeqId, modelId, omegaAngle)) + else: + logger.debug("%s inconsistent cis peptide description id = %s", dataContainer.getName(), cId) + + if dataContainer.exists("struct_conf"): + tObj = dataContainer.getObj("struct_conf") + helixRangeD = OrderedDict() + for ii in range(tObj.getRowCount()): + confType = str(tObj.getValue("conf_type_id", ii)).strip().upper() + if confType in ["HELX_P"]: + hId = tObj.getValue("id", ii) + begAsymId = tObj.getValue("beg_label_asym_id", ii) + endAsymId = tObj.getValue("end_label_asym_id", ii) + try: + tbegSeqId = int(tObj.getValue("beg_label_seq_id", ii)) + tendSeqId = int(tObj.getValue("end_label_seq_id", ii)) + begSeqId = min(tbegSeqId, tendSeqId) + endSeqId = max(tbegSeqId, tendSeqId) + except Exception: + continue + if (begAsymId == endAsymId) and (begSeqId <= endSeqId): + helixRangeD.setdefault(hId, []).append((begAsymId, begSeqId, endSeqId)) + else: + logger.debug("%s inconsistent struct_conf description id = %s", dataContainer.getName(), hId) + + logger.debug("%s helixRangeD %r", dataContainer.getName(), helixRangeD.items()) + + if dataContainer.exists("struct_sheet_range"): + tObj = dataContainer.getObj("struct_sheet_range") + sheetRangeD = OrderedDict() + for ii in range(tObj.getRowCount()): + sId = tObj.getValue("sheet_id", ii) + begAsymId = tObj.getValue("beg_label_asym_id", ii) + endAsymId = tObj.getValue("end_label_asym_id", ii) + # Most obsolete entries do no define this + try: + tbegSeqId = int(tObj.getValue("beg_label_seq_id", ii)) + tendSeqId = int(tObj.getValue("end_label_seq_id", ii)) + begSeqId = min(tbegSeqId, tendSeqId) + endSeqId = max(tbegSeqId, tendSeqId) + except Exception: + continue + if (begAsymId == endAsymId) and (begSeqId <= endSeqId): + sheetRangeD.setdefault(sId, 
[]).append((begAsymId, begSeqId, endSeqId)) + else: + logger.debug("%s inconsistent struct_sheet_range description id = %s", dataContainer.getName(), sId) + + logger.debug("%s sheetRangeD %r", dataContainer.getName(), sheetRangeD.items()) + # + if dataContainer.exists("struct_sheet_order"): + tObj = dataContainer.getObj("struct_sheet_order") + # + sheetSenseD = OrderedDict() + for ii in range(tObj.getRowCount()): + sId = tObj.getValue("sheet_id", ii) + sense = str(tObj.getValue("sense", ii)).strip().lower() + sheetSenseD.setdefault(sId, []).append(sense) + # + logger.debug("%s sheetSenseD %r", dataContainer.getName(), sheetSenseD.items()) + # -------- + + unassignedCoverageD = {} + unassignedCountD = {} + unassignedLengthD = {} + unassignedFracD = {} + + helixCoverageD = {} + helixCountD = {} + helixLengthD = {} + helixFracD = {} + instHelixD = {} + + sheetCoverageD = {} + sheetStrandCountD = {} + sheetStrandLengthD = {} + strandsPerBetaSheetD = {} + sheetFullStrandCountD = {} + sheetStrandFracD = {} + instSheetD = {} + instSheetSenseD = {} + # + featureMonomerSequenceD = {} + featureSequenceD = {} + # + # ------------ + # Initialize over all protein instances + for asymId, filteredType in instancePolymerTypeD.items(): + if filteredType != "Protein": + continue + helixCoverageD[asymId] = [] + helixLengthD[asymId] = [] + helixCountD[asymId] = 0 + helixFracD[asymId] = 0.0 + instHelixD[asymId] = [] + # + sheetCoverageD[asymId] = [] + sheetStrandCountD[asymId] = 0 + sheetStrandLengthD[asymId] = [] + sheetFullStrandCountD[asymId] = [] + sheetStrandFracD[asymId] = 0.0 + instSheetD[asymId] = [] + instSheetSenseD[asymId] = [] + # + unassignedCountD[asymId] = 0 + unassignedLengthD[asymId] = [] + unassignedFracD[asymId] = 0.0 + # + featureMonomerSequenceD[asymId] = None + featureSequenceD[asymId] = None + # ------------- + # + for hId, hL in helixRangeD.items(): + for (asymId, begSeqId, endSeqId) in hL: + helixCoverageD.setdefault(asymId, []).extend(range(begSeqId, endSeqId + 1)) + helixLengthD.setdefault(asymId, []).append(abs(begSeqId - endSeqId) + 1) + helixCountD[asymId] = helixCountD[asymId] + 1 if asymId in helixCountD else 0 + instHelixD.setdefault(asymId, []).append(hId) + # + # --------- + # betaSheetCount = len(sheetRangeD) + # + for sId, sL in sheetRangeD.items(): + strandsPerBetaSheetD[sId] = len(sL) + for (asymId, begSeqId, endSeqId) in sL: + sheetCoverageD.setdefault(asymId, []).extend(range(begSeqId, endSeqId + 1)) + sheetStrandLengthD.setdefault(asymId, []).append(abs(begSeqId - endSeqId) + 1) + sheetStrandCountD[asymId] = sheetStrandCountD[asymId] + 1 if asymId in sheetStrandCountD else 0 + instSheetD.setdefault(asymId, []).append(sId) + # + instSheetRangeD = {} + for sId, sL in sheetRangeD.items(): + aD = {} + for (asymId, begSeqId, endSeqId) in sL: + aD.setdefault(asymId, []).append((begSeqId, endSeqId)) + instSheetRangeD[sId] = aD + # + # --------- + senseTypeD = {} + for sheetId, sL in sheetSenseD.items(): + if not sL: + continue + usL = list(set(sL)) + if len(usL) == 1: + senseTypeD[sheetId] = usL[0] + else: + senseTypeD[sheetId] = "mixed" + # --------- + # + for asymId, filteredType in instancePolymerTypeD.items(): + logger.debug("%s processing %s type %r", dataContainer.getName(), asymId, filteredType) + if filteredType != "Protein": + continue + entityId = instEntityD[asymId] + entityLen = epLengthD[entityId] + entityS = set(range(1, entityLen + 1)) + eLen = len(entityS) + # + helixS = set(helixCoverageD[asymId]) + sheetS = set(sheetCoverageD[asymId]) + commonS = helixS & 
sheetS + if commonS: + logger.debug("%s asymId %s overlapping secondary structure assignments for monomers %r", dataContainer.getName(), asymId, commonS) + # continue + + hLen = len(helixS) if asymId in helixCoverageD else 0 + sLen = len(sheetS) if asymId in sheetCoverageD else 0 + # + unassignedS = entityS - helixS if hLen else entityS + unassignedS = unassignedS - sheetS if sLen else unassignedS + tLen = len(unassignedS) + # + # if eLen != hLen + sLen + tLen: + # logger.warning("%s overlapping secondary structure assignments for asymId %s", dataContainer.getName(), asymId) + # continue + # + unassignedCoverageD[asymId] = list(unassignedS) + helixFracD[asymId] = float(hLen) / float(eLen) + sheetStrandFracD[asymId] = float(sLen) / float(eLen) + unassignedFracD[asymId] = float(tLen) / float(eLen) + # + unassignedRangeD[asymId] = list(self.__toRangeList(unassignedS)) + unassignedCountD[asymId] = len(unassignedRangeD[asymId]) + unassignedLengthD[asymId] = [abs(i - j) + 1 for (i, j) in unassignedRangeD[asymId]] + # + # ------ + sIdL = instSheetD[asymId] + # + instSheetSenseD[asymId] = [senseTypeD[sId] for sId in sIdL if sId in senseTypeD] + sheetFullStrandCountD[asymId] = [strandsPerBetaSheetD[sId] for sId in sIdL if sId in strandsPerBetaSheetD] + # + + # ------ + ssTypeL = ["_"] * eLen + if hLen: + for idx in helixCoverageD[asymId]: + ssTypeL[idx - 1] = "H" + if sLen: + for idx in sheetCoverageD[asymId]: + ssTypeL[idx - 1] = "S" + if tLen: + for idx in unassignedCoverageD[asymId]: + ssTypeL[idx - 1] = "_" + # + featureMonomerSequenceD[asymId] = "".join(ssTypeL) + featureSequenceD[asymId] = "".join([t[0] for t in itertools.groupby(ssTypeL)]) + # --------- + + rD = { + "helixCountD": helixCountD, + "sheetStrandCountD": sheetStrandCountD, + "unassignedCountD": unassignedCountD, + "helixLengthD": helixLengthD, + "sheetStrandLengthD": sheetStrandLengthD, + "unassignedLengthD": unassignedLengthD, + "helixFracD": helixFracD, + "sheetStrandFracD": sheetStrandFracD, + "unassignedFracD": unassignedFracD, + "sheetSenseD": instSheetSenseD, + "sheetFullStrandCountD": sheetFullStrandCountD, + "featureMonomerSequenceD": featureMonomerSequenceD, + "featureSequenceD": featureSequenceD, + # + "unassignedRangeD": unassignedRangeD, + "helixRangeD": helixRangeD, + "instHelixD": instHelixD, + # "sheetRangeD": sheetRangeD, + "instSheetRangeD": instSheetRangeD, + "instSheetD": instSheetD, + "senseTypeD": senseTypeD, + "cisPeptideD": cisPeptideD, + } + # self.__secondaryStructD = rD + # self.__setEntryCache(dataContainer.getName()) + except Exception as e: + logger.exception("Failing for %s with %s", dataContainer.getName(), str(e)) + # + return rD + + # Connection related + def getInstanceConnectionCounts(self, dataContainer): + """Return a dictionary instance connection counts. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {: #count, ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchInstanceConnections(dataContainer) + return wD["instConnectCountD"] if "instConnectCountD" in wD else {} + + def getInstanceConnections(self, dataContainer): + """Return a list of instance connections. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + list: [{"connect_type": , "connect_target_label_comp_id": , ... },...] 
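+                  For example (illustrative values only):
+                  [{"connect_type": "disulfide bridge",
+                    "connect_target_label_comp_id": "CYS",
+                    "connect_partner_label_comp_id": "CYS",
+                    "dist_value": "2.02", ...}, ...]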
+ + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchInstanceConnections(dataContainer) + return wD["instConnectL"] if "instConnectL" in wD else {} + + def getBoundNonpolymersComponentIds(self, dataContainer): + """Return a list of bound non-polymers in the entry. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {: NonpolymerBoundEntity("targetCompId", "connectType", "partnerCompId", "partnerEntityId", "partnerEntityType"), } + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchInstanceConnections(dataContainer) + return wD["boundNonpolymerComponentIdL"] if "boundNonpolymerComponentIdL" in wD else {} + + def getBoundNonpolymersByEntity(self, dataContainer): + """Return a dictonary of bound non-polymers by entity. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {: NonpolymerBoundEntity("targetCompId", "connectType", "partnerCompId", "partnerEntityId", "partnerEntityType"), } + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchInstanceConnections(dataContainer) + return wD["boundNonpolymerEntityD"] if "boundNonpolymerEntityD" in wD else {} + + def getBoundNonpolymersByInstance(self, dataContainer): + """Return a dictonary of bound non-polymers by instance. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {: NonpolymerBoundInstance("targetCompId", "connectType", "partnerCompId", "partnerAsymId", "partnerEntityType", "bondDistance", "bondOrder"), } + + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchInstanceConnections(dataContainer) + return wD["boundNonpolymerInstanceD"] if "boundNonpolymerInstanceD" in wD else {} + + def __fetchInstanceConnections(self, dataContainer): + wD = self.__instanceConnectionCache.get(dataContainer.getName()) + if not wD: + wD = self.__getInstanceConnections(dataContainer) + self.__instanceConnectionCache.set(dataContainer.getName(), wD) + return wD + + def __getInstanceConnections(self, dataContainer): + """Get instance connections (e.g., intermolecular bonds and non-primary connectivity) + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: instConnectCountD{: count, ... } + + For instance, the following are calculated: + {Get counting information about intermolecular linkages. + covale . + disulf . + hydrog . + metalc + + loop_ + _struct_asym.id + _struct_asym.pdbx_blank_PDB_chainid_flag + _struct_asym.pdbx_modified + _struct_asym.entity_id + _struct_asym.details + A N N 1 ? + B N N 1 ? 
+ # + _struct_biol.id 1 + # + loop_ + _struct_conn.id + _struct_conn.conn_type_id + _struct_conn.pdbx_leaving_atom_flag + _struct_conn.pdbx_PDB_id + _struct_conn.ptnr1_label_asym_id + _struct_conn.ptnr1_label_comp_id + _struct_conn.ptnr1_label_seq_id + _struct_conn.ptnr1_label_atom_id + _struct_conn.pdbx_ptnr1_label_alt_id + _struct_conn.pdbx_ptnr1_PDB_ins_code + _struct_conn.pdbx_ptnr1_standard_comp_id + _struct_conn.ptnr1_symmetry + _struct_conn.ptnr2_label_asym_id + _struct_conn.ptnr2_label_comp_id + _struct_conn.ptnr2_label_seq_id + _struct_conn.ptnr2_label_atom_id + _struct_conn.pdbx_ptnr2_label_alt_id + _struct_conn.pdbx_ptnr2_PDB_ins_code + _struct_conn.ptnr1_auth_asym_id + _struct_conn.ptnr1_auth_comp_id + _struct_conn.ptnr1_auth_seq_id + _struct_conn.ptnr2_auth_asym_id + _struct_conn.ptnr2_auth_comp_id + _struct_conn.ptnr2_auth_seq_id + _struct_conn.ptnr2_symmetry + _struct_conn.pdbx_ptnr3_label_atom_id + _struct_conn.pdbx_ptnr3_label_seq_id + _struct_conn.pdbx_ptnr3_label_comp_id + _struct_conn.pdbx_ptnr3_label_asym_id + _struct_conn.pdbx_ptnr3_label_alt_id + _struct_conn.pdbx_ptnr3_PDB_ins_code + _struct_conn.details + _struct_conn.pdbx_dist_value + _struct_conn.pdbx_value_order + disulf1 disulf ? ? A CYS 31 SG ? ? ? 1_555 B CYS 31 SG ? ? A CYS 31 B CYS 31 1_555 ? ? ? ? ? ? ? 1.997 ? + covale1 covale ? ? A VAL 8 C ? ? ? 1_555 A DPR 9 N ? ? A VAL 8 A DPR 9 1_555 ? ? ? ? ? ? ? 1.360 ? + covale2 covale ? ? A DPR 9 C ? ? ? 1_555 A GLY 10 N ? ? A DPR 9 A GLY 10 1_555 ? ? ? ? ? ? ? 1.324 ? + # + """ + iAttMapD = { + "id": "id", + "connect_type": "conn_type_id", + "connect_target_label_comp_id": "ptnr1_label_comp_id", + "connect_target_label_asym_id": "ptnr1_label_asym_id", + "connect_target_label_seq_id": "ptnr1_label_seq_id", + "connect_target_label_atom_id": "ptnr1_label_atom_id", + "connect_target_label_alt_id": "pdbx_ptnr1_label_alt_id", + "connect_target_symmetry": "ptnr1_symmetry", + # + "connect_partner_label_comp_id": "ptnr2_label_comp_id", + "connect_partner_label_asym_id": "ptnr2_label_asym_id", + "connect_partner_label_seq_id": "ptnr2_label_seq_id", + "connect_partner_label_atom_id": "ptnr2_label_atom_id", + "connect_partner_label_alt_id": "pdbx_ptnr2_label_alt_id", + "connect_partner_symmetry": "ptnr2_symmetry", + "value_order": "pdbx_value_order", + "dist_value": "pdbx_dist_value", + "description": "details", + "role": "pdbx_role", + } + jAttMapD = { + "id": "id", + "connect_type": "conn_type_id", + "connect_target_label_comp_id": "ptnr2_label_comp_id", + "connect_target_label_asym_id": "ptnr2_label_asym_id", + "connect_target_label_seq_id": "ptnr2_label_seq_id", + "connect_target_label_atom_id": "ptnr2_label_atom_id", + "connect_target_label_alt_id": "pdbx_ptnr2_label_alt_id", + "connect_target_symmetry": "ptnr2_symmetry", + # + "connect_partner_label_comp_id": "ptnr1_label_comp_id", + "connect_partner_label_asym_id": "ptnr1_label_asym_id", + "connect_partner_label_seq_id": "ptnr1_label_seq_id", + "connect_partner_label_atom_id": "ptnr1_label_atom_id", + "connect_partner_label_alt_id": "pdbx_ptnr1_label_alt_id", + "connect_partner_symmetry": "ptnr1_symmetry", + "value_order": "pdbx_value_order", + "dist_value": "pdbx_dist_value", + "description": "details", + "role": "pdbx_role", + } + typeMapD = { + "covale": "covalent bond", + "disulf": "disulfide bridge", + "hydrog": "hydrogen bond", + "metalc": "metal coordination", + "mismat": "mismatched base pairs", + "saltbr": "ionic interaction", + "modres": "covalent residue modification", + "covale_base": "covalent 
modification of a nucleotide base", + "covale_sugar": "covalent modification of a nucleotide sugar", + "covale_phosphate": "covalent modification of a nucleotide phosphate", + } + # + instConnectL = [] + instConnectCountD = {ky: 0 for ky in typeMapD} + boundNonpolymerEntityD = {} + boundNonpolymerInstanceD = {} + boundNonpolymerComponentIdL = [] + # + if dataContainer.exists("struct_conn"): + tObj = dataContainer.getObj("struct_conn") + for ii in range(tObj.getRowCount()): + bt = str(tObj.getValue("conn_type_id", ii)).strip().lower() + if bt not in instConnectCountD: + logger.error("Unsupported intermolecular bond type %r in %r", bt, dataContainer.getName()) + continue + instConnectCountD[bt] = instConnectCountD[bt] + 1 if bt in instConnectCountD else instConnectCountD[bt] + # + tD = OrderedDict() + for ky, atName in iAttMapD.items(): + if tObj.hasAttribute(atName): + val = tObj.getValue(atName, ii) if atName != "conn_type_id" else typeMapD[tObj.getValue(atName, ii).lower()] + tD[ky] = val + instConnectL.append(tD) + # Flip the bond sense so all target connections are accounted for + tD = OrderedDict() + for ky, atName in jAttMapD.items(): + if tObj.hasAttribute(atName): + val = tObj.getValue(atName, ii) if atName != "conn_type_id" else typeMapD[tObj.getValue(atName, ii).lower()] + tD[ky] = val + instConnectL.append(tD) + + boundNonpolymerEntityD, boundNonpolymerInstanceD, boundNonpolymerComponentIdL = self.__getBoundNonpolymers(dataContainer, instConnectL) + + return { + "instConnectL": instConnectL, + "instConnectCountD": instConnectCountD, + "boundNonpolymerEntityD": boundNonpolymerEntityD, + "boundNonpolymerInstanceD": boundNonpolymerInstanceD, + "boundNonpolymerComponentIdL": boundNonpolymerComponentIdL, + } + + def __getBoundNonpolymers(self, dataContainer, instConnectL): + """Get nonpolymer bound + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + bool: True for success or False otherwise + + Example: + """ + logger.debug("Starting with %r", dataContainer.getName()) + # + boundNonpolymerEntityD = {} + boundNonpolymerInstanceD = {} + boundNonpolymerComponentIdL = [] + try: + cDL = instConnectL + asymIdD = self.getInstanceEntityMap(dataContainer) + # asymAuthIdD = self.getAsymAuthIdMap(dataContainer) + eTypeD = self.getEntityTypes(dataContainer) + # + ts = set() + for cD in cDL: + tAsymId = cD["connect_target_label_asym_id"] + tEntityId = asymIdD[tAsymId] + if eTypeD[tEntityId] == "non-polymer" and cD["connect_type"] in ["covale", "covalent bond", "metalc", "metal coordination"]: + pAsymId = cD["connect_partner_label_asym_id"] + pEntityId = asymIdD[pAsymId] + pCompId = cD["connect_partner_label_comp_id"] + pSeqId = cD["connect_partner_label_seq_id"] + tCompId = cD["connect_target_label_comp_id"] + bondOrder = cD["value_order"] + bondDist = cD["dist_value"] + pType = eTypeD[pEntityId] + # + ts.add(tCompId) + boundNonpolymerInstanceD.setdefault(tAsymId, []).append( + NonpolymerBoundInstance(tCompId, cD["connect_type"], pCompId, pAsymId, pType, pSeqId, bondDist, bondOrder) + ) + boundNonpolymerEntityD.setdefault(tEntityId, []).append(NonpolymerBoundEntity(tCompId, cD["connect_type"], pCompId, pEntityId, pType)) + # + for asymId in boundNonpolymerInstanceD: + boundNonpolymerInstanceD[asymId] = sorted(set(boundNonpolymerInstanceD[asymId])) + for entityId in boundNonpolymerEntityD: + boundNonpolymerEntityD[entityId] = sorted(set(boundNonpolymerEntityD[entityId])) + boundNonpolymerComponentIdL = sorted(ts) + except Exception as e: + 
logger.exception("%s failing with %s", dataContainer.getName(), str(e)) + return boundNonpolymerEntityD, boundNonpolymerInstanceD, boundNonpolymerComponentIdL + + def getEntitySequenceFeatureCounts(self, dataContainer): + """Return a dictionary of sequence feature counts. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {: {'mutation': #, 'artifact': #, 'conflict': #, ... }, } + + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchSequenceFeatures(dataContainer) + return wD["seqFeatureCountsD"] if "seqFeatureCountsD" in wD else {} + + def getEntitySequenceMonomerFeatures(self, dataContainer): + """Return a dictionary of sequence monomer features. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {(entityId,seqId,compId,filteredFeature): {detail,detail}, .. } + + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchSequenceFeatures(dataContainer) + return wD["seqMonomerFeatureD"] if "seqMonomerFeatureD" in wD else {} + + def getEntitySequenceRangeFeatures(self, dataContainer): + """Return a dictionary of sequence range features. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {(entityId,benSeqId,endSeqId,filteredFeature): {detail,detail}, .. } + + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchSequenceFeatures(dataContainer) + return wD["seqRangeFeatureD"] if "seqRangeFeatureD" in wD else {} + + def getEntityReferenceAlignments(self, dataContainer): + """Return a dictionary of reference sequence alignments for each entity. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {entityId: {'dbName': , 'dbAccession': , 'authAsymId': , 'entitySeqIdBeg':, 'dbSeqIdBeg':, ... }, .. } + + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchReferenceSequenceDetails(dataContainer) + return wD["seqEntityAlignmentD"] if "seqEntityAlignmentD" in wD else {} + + def getEntityPolymerSequences(self, dataContainer): + """Return a dictionary of the sequences (one-letter-codes) for each polymer entity. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {entityId: {'sequence': ..., 'polymerType': ... , 'polymerTypeFiltered': ... }, ... } + + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchReferenceSequenceDetails(dataContainer) + return wD["entityPolymerSequenceD"] if "entityPolymerSequenceD" in wD else {} + + def getEntitySequenceReferenceCodes(self, dataContainer): + """Return a dictionary of reference database accession codes. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {entityId: {'dbName': , 'dbAccession': }, ... 
} + + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchReferenceSequenceDetails(dataContainer) + return wD["seqEntityRefDbD"] if "seqEntityRefDbD" in wD else {} + + def __fetchSequenceFeatures(self, dataContainer): + wD = self.__entitySequenceFeatureCache.get(dataContainer.getName()) + if not wD: + wD = self.__getSequenceFeatures(dataContainer) + self.__entitySequenceFeatureCache.set(dataContainer.getName(), wD) + return wD + + def __fetchReferenceSequenceDetails(self, dataContainer): + wD = self.__entityReferenceSequenceDetailsCache.get(dataContainer.getName()) + if not wD: + wD = self.__getReferenceSequenceDetails(dataContainer) + self.__entityReferenceSequenceDetailsCache.set(dataContainer.getName(), wD) + return wD + + def getDatabaseNameMap(self): + dbNameMapD = { + "UNP": "UniProt", + "GB": "GenBank", + "PDB": "PDB", + "EMBL": "EMBL", + "GENP": "GenBank", + "NDB": "NDB", + "NOR": "NORINE", + "PIR": "PIR", + "PRF": "PRF", + "REF": "RefSeq", + "TPG": "GenBank", + "TREMBL": "UniProt", + "SWS": "UniProt", + "SWALL": "UniProt", + } + return dbNameMapD + + def __getReferenceSequenceDetails(self, dataContainer): + """Get reference sequence and related alignment details. + + Args: + dataContainer (object): mmif.api.DataContainer object instance + + Returns: + dict : { + "seqEntityAlignmentD" : {entityId: [{'dbName': 'UNP' , 'dbAccession': 'P000000', ... }]} + "seqEntityRefDbD": {entityId: [{'dbName': 'UNP' , 'dbAccession': 'P000000'), }]}, + } + + Example source content: + + _struct_ref.id 1 + _struct_ref.db_name UNP + _struct_ref.db_code KSYK_HUMAN + _struct_ref.pdbx_db_accession P43405 + _struct_ref.entity_id 1 + _struct_ref.pdbx_seq_one_letter_code + ;ADPEEIRPKEVYLDRKLLTLEDKELGSGNFGTVKKGYYQMKKVVKTVAVKILKNEANDPALKDELLAEANVMQQLDNPYI + VRMIGICEAESWMLVMEMAELGPLNKYLQQNRHVKDKNIIELVHQVSMGMKYLEESNFVHRDLAARNVLLVTQHYAKISD + FGLSKALRADENYYKAQTHGKWPVKWYAPECINYYKFSSKSDVWSFGVLMWEAFSYGQKPYRGMKGSEVTAMLEKGERMG + CPAGCPREMYDLMNLCWTYDVENRPGFAAVELRLRNYYYDVVN + ; + _struct_ref.pdbx_align_begin 353 + _struct_ref.pdbx_db_isoform ? + # + _struct_ref_seq.align_id 1 + _struct_ref_seq.ref_id 1 + _struct_ref_seq.pdbx_PDB_id_code 1XBB + _struct_ref_seq.pdbx_strand_id A + _struct_ref_seq.seq_align_beg 1 + _struct_ref_seq.pdbx_seq_align_beg_ins_code ? + _struct_ref_seq.seq_align_end 283 + _struct_ref_seq.pdbx_seq_align_end_ins_code ? + _struct_ref_seq.pdbx_db_accession P43405 + _struct_ref_seq.db_align_beg 353 + _struct_ref_seq.pdbx_db_align_beg_ins_code ? + _struct_ref_seq.db_align_end 635 + _struct_ref_seq.pdbx_db_align_end_ins_code ? + _struct_ref_seq.pdbx_auth_seq_align_beg 353 + _struct_ref_seq.pdbx_auth_seq_align_end 635 + _struct_ref_seq.rcsb_entity_id 1 + # + loop_ + _struct_ref_seq_dif.align_id + _struct_ref_seq_dif.pdbx_pdb_id_code + _struct_ref_seq_dif.mon_id + _struct_ref_seq_dif.pdbx_pdb_strand_id + _struct_ref_seq_dif.seq_num + _struct_ref_seq_dif.pdbx_pdb_ins_code + _struct_ref_seq_dif.pdbx_seq_db_name + _struct_ref_seq_dif.pdbx_seq_db_accession_code + _struct_ref_seq_dif.db_mon_id + _struct_ref_seq_dif.pdbx_seq_db_seq_num + _struct_ref_seq_dif.details + _struct_ref_seq_dif.pdbx_auth_seq_num + _struct_ref_seq_dif.pdbx_ordinal + _struct_ref_seq_dif.rcsb_entity_id + 1 1XBB MET A 1 ? UNP P43405 ALA 353 'CLONING ARTIFACT' 353 1 1 + 1 1XBB ALA A 2 ? UNP P43405 ASP 354 'CLONING ARTIFACT' 354 2 1 + 1 1XBB LEU A 3 ? UNP P43405 PRO 355 'CLONING ARTIFACT' 355 3 1 + 1 1XBB GLU A 284 ? UNP P43405 ? ? 'CLONING ARTIFACT' 636 4 1 + 1 1XBB GLY A 285 ? UNP P43405 ? ? 
'CLONING ARTIFACT' 637 5 1 + 1 1XBB HIS A 286 ? UNP P43405 ? ? 'EXPRESSION TAG' 638 6 1 + 1 1XBB HIS A 287 ? UNP P43405 ? ? 'EXPRESSION TAG' 639 7 1 + 1 1XBB HIS A 288 ? UNP P43405 ? ? 'EXPRESSION TAG' 640 8 1 + 1 1XBB HIS A 289 ? UNP P43405 ? ? 'EXPRESSION TAG' 641 9 1 + 1 1XBB HIS A 290 ? UNP P43405 ? ? 'EXPRESSION TAG' 642 10 1 + 1 1XBB HIS A 291 ? UNP P43405 ? ? 'EXPRESSION TAG' 643 11 1 + # + # + loop_ + _struct_ref_seq_dif.align_id + _struct_ref_seq_dif.pdbx_pdb_id_code + _struct_ref_seq_dif.mon_id + _struct_ref_seq_dif.pdbx_pdb_strand_id + _struct_ref_seq_dif.seq_num + _struct_ref_seq_dif.pdbx_pdb_ins_code + _struct_ref_seq_dif.pdbx_seq_db_name + _struct_ref_seq_dif.pdbx_seq_db_accession_code + _struct_ref_seq_dif.db_mon_id + _struct_ref_seq_dif.pdbx_seq_db_seq_num + _struct_ref_seq_dif.details + _struct_ref_seq_dif.pdbx_auth_seq_num + _struct_ref_seq_dif.pdbx_ordinal + _struct_ref_seq_dif.rcsb_entity_id + 1 3RIJ TYR A 53 ? UNP Q5SHN1 PHE 54 'ENGINEERED MUTATION' 54 1 1 + 1 3RIJ GLY A 54 ? UNP Q5SHN1 VAL 55 'ENGINEERED MUTATION' 55 2 1 + 2 3RIJ ASP A 98 ? UNP Q5SHN1 ALA 99 'ENGINEERED MUTATION' 99 3 1 + 2 3RIJ ALA A 99 ? UNP Q5SHN1 ILE 100 'ENGINEERED MUTATION' 100 4 1 + 2 3RIJ LEU A 158 ? UNP Q5SHN1 ? ? INSERTION 159 5 1 + 2 3RIJ GLU A 159 ? UNP Q5SHN1 ? ? INSERTION 160 6 1 + 2 3RIJ HIS A 160 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 161 7 1 + 2 3RIJ HIS A 161 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 162 8 1 + 2 3RIJ HIS A 162 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 163 9 1 + 2 3RIJ HIS A 163 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 164 10 1 + 2 3RIJ HIS A 164 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 165 11 1 + 2 3RIJ HIS A 165 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 166 12 1 + 3 3RIJ TYR B 53 ? UNP Q5SHN1 PHE 54 'ENGINEERED MUTATION' 54 13 1 + 3 3RIJ GLY B 54 ? UNP Q5SHN1 VAL 55 'ENGINEERED MUTATION' 55 14 1 + 4 3RIJ ASP B 98 ? UNP Q5SHN1 ALA 99 'ENGINEERED MUTATION' 99 15 1 + 4 3RIJ ALA B 99 ? UNP Q5SHN1 ILE 100 'ENGINEERED MUTATION' 100 16 1 + 4 3RIJ LEU B 158 ? UNP Q5SHN1 ? ? INSERTION 159 17 1 + 4 3RIJ GLU B 159 ? UNP Q5SHN1 ? ? INSERTION 160 18 1 + 4 3RIJ HIS B 160 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 161 19 1 + 4 3RIJ HIS B 161 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 162 20 1 + 4 3RIJ HIS B 162 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 163 21 1 + 4 3RIJ HIS B 163 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 164 22 1 + 4 3RIJ HIS B 164 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 165 23 1 + 4 3RIJ HIS B 165 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 166 24 1 + 5 3RIJ TYR C 53 ? UNP Q5SHN1 PHE 54 'ENGINEERED MUTATION' 54 25 1 + 5 3RIJ GLY C 54 ? UNP Q5SHN1 VAL 55 'ENGINEERED MUTATION' 55 26 1 + 6 3RIJ ASP C 98 ? UNP Q5SHN1 ALA 99 'ENGINEERED MUTATION' 99 27 1 + 6 3RIJ ALA C 99 ? UNP Q5SHN1 ILE 100 'ENGINEERED MUTATION' 100 28 1 + 6 3RIJ LEU C 158 ? UNP Q5SHN1 ? ? INSERTION 159 29 1 + 6 3RIJ GLU C 159 ? UNP Q5SHN1 ? ? INSERTION 160 30 1 + 6 3RIJ HIS C 160 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 161 31 1 + 6 3RIJ HIS C 161 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 162 32 1 + 6 3RIJ HIS C 162 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 163 33 1 + 6 3RIJ HIS C 163 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 164 34 1 + 6 3RIJ HIS C 164 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 165 35 1 + 6 3RIJ HIS C 165 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 166 36 1 + 7 3RIJ TYR D 53 ? UNP Q5SHN1 PHE 54 'ENGINEERED MUTATION' 54 37 1 + 7 3RIJ GLY D 54 ? UNP Q5SHN1 VAL 55 'ENGINEERED MUTATION' 55 38 1 + 8 3RIJ ASP D 98 ? UNP Q5SHN1 ALA 99 'ENGINEERED MUTATION' 99 39 1 + 8 3RIJ ALA D 99 ? UNP Q5SHN1 ILE 100 'ENGINEERED MUTATION' 100 40 1 + 8 3RIJ LEU D 158 ? UNP Q5SHN1 ? ? INSERTION 159 41 1 + 8 3RIJ GLU D 159 ? UNP Q5SHN1 ? ? 
INSERTION 160 42 1 + 8 3RIJ HIS D 160 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 161 43 1 + 8 3RIJ HIS D 161 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 162 44 1 + 8 3RIJ HIS D 162 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 163 45 1 + 8 3RIJ HIS D 163 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 164 46 1 + 8 3RIJ HIS D 164 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 165 47 1 + 8 3RIJ HIS D 165 ? UNP Q5SHN1 ? ? 'EXPRESSION TAG' 166 48 1 + # + """ + logger.debug("Starting with %r", dataContainer.getName()) + self.__addStructRefSeqEntityIds(dataContainer) + # + # To exclude self references - + excludeRefDbList = ["PDB"] + rD = {"seqEntityAlignmentD": {}, "seqEntityRefDbD": {}, "entityPolymerSeqenceD": {}} + try: + # Exit if source categories are missing + if not (dataContainer.exists("struct_ref_seq") and dataContainer.exists("struct_ref") and dataContainer.exists("entity_poly")): + return rD + # ------- --------- ------- --------- ------- --------- ------- --------- ------- --------- + entityPolymerSequenceD = {} + if dataContainer.exists("entity_poly"): + epObj = dataContainer.getObj("entity_poly") + for ii in range(epObj.getRowCount()): + entityId = epObj.getValue("entity_id", ii) + pType = epObj.getValue("type", ii) + pTypeFiltered = self.filterEntityPolyType(pType) + if epObj.hasAttribute("pdbx_seq_one_letter_code_can"): + sampleSeq = self.__stripWhiteSpace(epObj.getValue("pdbx_seq_one_letter_code_can", ii)) + if sampleSeq and sampleSeq not in ["?", "."]: + entityPolymerSequenceD[entityId] = {"sequence": sampleSeq, "polymerType": pType, "polymerTypeFiltered": pTypeFiltered} + # + srObj = None + if dataContainer.exists("struct_ref"): + srObj = dataContainer.getObj("struct_ref") + # + srsObj = None + if dataContainer.exists("struct_ref_seq"): + srsObj = dataContainer.getObj("struct_ref_seq") + + # srsdObj = None + # if dataContainer.exists("struct_ref_seq_dif"): + # srsdObj = dataContainer.getObj("struct_ref_seq_dif") + + polymerEntityTypeD = self.getPolymerEntityFilteredTypes(dataContainer) + # Map alignId -> entityId + seqEntityRefDbD = {} + tupSeqEntityRefDbD = {} + alignEntityMapD = {} + # entity alignment details + seqEntityAlignmentD = {} + for ii in range(srObj.getRowCount()): + dbAccessionAlignS = set() + entityId = srObj.getValue("entity_id", ii) + refId = srObj.getValue("id", ii) + dbName = str(srObj.getValue("db_name", ii)).strip().upper() + # + if dbName in excludeRefDbList: + continue + # + if entityId not in polymerEntityTypeD: + logger.debug("%s skipping non-polymer entity %r in sequence reference", dataContainer.getName(), entityId) + continue + + if dbName in ["UNP"] and polymerEntityTypeD[entityId] != "Protein": + logger.debug("%s skipping inconsistent reference assignment for %s polymer type %s", dataContainer.getName(), dbName, polymerEntityTypeD[entityId]) + continue + # + tS = srObj.getValue("pdbx_db_accession", ii) + dbAccession = tS if tS and tS not in [".", "?"] else None + # + tS = srObj.getValue("pdbx_db_isoform", ii) + dbIsoform = tS if tS and tS not in [".", "?"] else None + # Look for a stray isoform + if dbName in ["UNP"] and dbAccession and "-" in dbAccession: + if not dbIsoform: + dbIsoform = dbAccession + ff = dbAccession.split("-") + dbAccession = ff[0] + + # + if dbIsoform and dbAccession not in dbIsoform: + logger.debug("entryId %r entityId %r accession %r isoform %r inconsistency", dataContainer.getName(), entityId, dbAccession, dbIsoform) + # --- + # Get indices for the target refId. 
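+                # struct_ref rows (one per entity/reference pairing, keyed by refId)
+                # join to struct_ref_seq alignment rows via ref_id; one reference
+                # may contribute several alignment segments (e.g. one per chain).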
+ iRowL = srsObj.selectIndices(refId, "ref_id") + logger.debug("entryId %r entityId %r refId %r rowList %r", dataContainer.getName(), entityId, refId, iRowL) + entitySeqIdBeg = entitySeqIdEnd = 0 + for iRow in iRowL: + try: + entitySeqIdBeg = srsObj.getValue("seq_align_beg", iRow) + entitySeqIdEnd = srsObj.getValue("seq_align_end", iRow) + entityAlignLength = int(entitySeqIdEnd) - int(entitySeqIdBeg) + 1 + except Exception: + entityAlignLength = 0 + # + if entityAlignLength <= 0: + logger.debug("%s entity %r skipping bad alignment seqBeg %r seqEnd %r", dataContainer.getName(), entityId, entitySeqIdBeg, entitySeqIdEnd) + continue + + alignId = srsObj.getValue("align_id", iRow) + alignEntityMapD[alignId] = entityId + # + authAsymId = srsObj.getValue("pdbx_strand_id", iRow) + dbSeqIdBeg = srsObj.getValue("db_align_beg", iRow) + dbSeqIdEnd = srsObj.getValue("db_align_end", iRow) + # ---- + try: + idbSeqIdBeg = int(dbSeqIdBeg) + if idbSeqIdBeg == 0: + idbSeqIdBeg = 1 + dbSeqIdBeg = str(idbSeqIdBeg) + idbSeqIdEnd = int(dbSeqIdEnd) + idbSeqIdEnd += 1 + dbSeqIdEnd = str(idbSeqIdEnd) + logger.debug("%s offset reference sequence database position", dataContainer.getName()) + except Exception: + pass + # ---- + # + tS = srsObj.getValue("pdbx_db_accession", iRow) + # use the parent pdbx_accession + dbAccessionAlign = tS if tS and tS not in [".", "?"] else dbAccession + # Look for a stray isoform + if dbName in ["UNP"] and dbAccessionAlign and "-" in dbAccessionAlign: + if not dbIsoform: + dbIsoform = dbAccessionAlign + ff = dbAccessionAlign.split("-") + dbAccessionAlign = ff[0] + + dbAccessionAlignS.add(dbAccessionAlign) + # + # + seqEntityAlignmentD.setdefault(entityId, []).append( + SeqAlign( + "PDB", + **{ + "authAsymId": authAsymId, + "entitySeqIdBeg": entitySeqIdBeg, + "entitySeqIdEnd": entitySeqIdEnd, + "dbSeqIdBeg": dbSeqIdBeg, + "dbSeqIdEnd": dbSeqIdEnd, + "dbName": dbName, + "dbAccession": dbAccessionAlign, + "dbIsoform": dbIsoform, + "entityAlignLength": entityAlignLength, + }, + ) + ) + # Check consistency + try: + if len(dbAccessionAlignS) == 1 and list(dbAccessionAlignS)[0] == dbAccession: + tupSeqEntityRefDbD.setdefault(entityId, []).append((dbName, dbAccession, dbIsoform)) + elif len(dbAccessionAlignS) == 1 and list(dbAccessionAlignS)[0]: + tupSeqEntityRefDbD.setdefault(entityId, []).append((dbName, list(dbAccessionAlignS)[0], None)) + elif dbAccession: + tupSeqEntityRefDbD.setdefault(entityId, []).append((dbName, dbAccession, dbIsoform)) + else: + logger.debug("%s entityId %r inconsistent reference sequence %r %r", dataContainer.getName(), entityId, dbAccession, dbAccessionAlignS) + except Exception: + logger.exception("%s entityId %r inconsistent reference sequence %r %r", dataContainer.getName(), entityId, dbAccession, dbAccessionAlignS) + + # ----- + dbMapD = self.getDatabaseNameMap() + for entityId, tupL in tupSeqEntityRefDbD.items(): + uTupL = list(OrderedDict({tup: True for tup in tupL}).keys()) + for tup in uTupL: + tS = dbMapD[tup[0]] if tup[0] in dbMapD else tup[0] + if tup[1]: + seqEntityRefDbD.setdefault(entityId, []).append({"dbName": tS, "dbAccession": tup[1], "dbIsoform": tup[2]}) + else: + logger.debug("%s %s skipping incomplete sequence reference assignment %r", dataContainer.getName(), entityId, tup) + + return { + "seqEntityAlignmentD": seqEntityAlignmentD, + "seqEntityRefDbD": seqEntityRefDbD, + "entityPolymerSequenceD": entityPolymerSequenceD, + } + except Exception as e: + logger.exception("%s failing with %s", dataContainer.getName(), str(e)) + return rD + + def 
__getSequenceFeatures(self, dataContainer): + """Get point and range sequence features. + + Args: + dataContainer (object): mmif.api.DataContainer object instance + + Returns: + dict : {"seqFeatureCountsD": {entityId: {"mutation": #, "conflict": # ... }, } + "seqMonomerFeatureD": {(entityId, seqId, compId, filteredFeature): set(feature,...), ...} + "seqRangeFeatureD" : {(entityId, str(beg), str(end), "artifact"): set(details)} + } + + """ + logger.debug("Starting with %r", dataContainer.getName()) + self.__addStructRefSeqEntityIds(dataContainer) + # + # To exclude self references - + # excludeRefDbList = ["PDB"] + rD = {"seqFeatureCountsD": {}, "seqMonomerFeatureD": {}, "seqRangeFeatureD": {}} + try: + # Exit if source categories are missing + if not (dataContainer.exists("struct_ref_seq") and dataContainer.exists("struct_ref")): + return rD + # ------- --------- ------- --------- ------- --------- ------- --------- ------- --------- + # srObj = None + # if dataContainer.exists("struct_ref"): + # srObj = dataContainer.getObj("struct_ref") + # + # srsObj = None + # if dataContainer.exists("struct_ref_seq"): + # srsObj = dataContainer.getObj("struct_ref_seq") + + srsdObj = None + if dataContainer.exists("struct_ref_seq_dif"): + srsdObj = dataContainer.getObj("struct_ref_seq_dif") + + # polymerEntityTypeD = self.getPolymerEntityFilteredTypes(dataContainer) + # + # ------- --------- ------- --------- ------- --------- ------- --------- ------- --------- + # (entityId, seqId, compId, filteredFeature) -> set{details, ...} + # + seqFeatureCountsD = {} + seqMonomerFeatureD = {} + seqRangeFeatureD = {} + entityArtifactD = {} + seqIdDetailsD = {} + if srsdObj: + for ii in range(srsdObj.getRowCount()): + # alignId = srsdObj.getValue("align_id", ii) + # + # entityId = alignEntityMapD[alignId] + entityId = srsdObj.getValueOrDefault("rcsb_entity_id", ii, defaultValue=None) + if not entityId: + continue + # + # authAsymId = srsdObj.getValue("pdbx_pdb_strand_id", ii) + # dbName = srsdObj.getValue("pdbx_seq_db_name", ii) + # + # Can't rely on alignId + # Keep difference records for self-referenced entity sequences. 
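+                    # The entity id is instead taken from rcsb_entity_id, which
+                    # __addStructRefSeqEntityIds() pre-populates in struct_ref_seq_dif; rows that
+                    # never matched a struct_ref_seq alignment keep the "?" placeholder, which
+                    # getValueOrDefault() maps to None, so such rows are skipped below.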
+                    # if alignId not in alignEntityMapD and dbName not in excludeRefDbList:
+                    #     logger.warning("%s inconsistent alignment ID %r in difference record %d", dataContainer.getName(), alignId, ii + 1)
+                    #     continue
+                    #
+                    seqId = srsdObj.getValueOrDefault("seq_num", ii, defaultValue=None)
+                    if not seqId:
+                        continue
+                    compId = srsdObj.getValue("mon_id", ii)
+                    #
+                    details = srsdObj.getValue("details", ii)
+                    filteredDetails = self.filterRefSequenceDif(details)
+                    if filteredDetails == "artifact":
+                        try:
+                            entityArtifactD.setdefault(entityId, []).append(int(seqId))
+                            seqIdDetailsD[int(seqId)] = details.lower()
+                        except Exception:
+                            logger.debug("Incomplete sequence difference for %r %r %r %r", dataContainer.getName(), entityId, seqId, details)
+                    else:
+                        seqMonomerFeatureD.setdefault((entityId, seqId, compId, filteredDetails), set()).add(details.lower())
+            #
+            # Consolidate the artifacts as ranges -
+            for entityId, sL in entityArtifactD.items():
+                # logger.debug("%s artifact ranges SL %r ranges %r", dataContainer.getName(), sL, list(self.__toRangeList(sL)))
+                srL = self.__toRangeList(sL)
+                for sr in srL:
+                    seqRangeFeatureD.setdefault((entityId, str(sr[0]), str(sr[1]), "artifact"), set()).update([seqIdDetailsD[sr[0]], seqIdDetailsD[sr[1]]])
+            # JDW
+            # logger.info("%s seqMonomerFeatureD %r ", dataContainer.getName(), seqMonomerFeatureD)
+            #
+            # Tabulate sequence monomer features by entity for the filtered cases -
+            for (entityId, _, _, fDetails), _ in seqMonomerFeatureD.items():
+                if entityId not in seqFeatureCountsD:
+                    seqFeatureCountsD[entityId] = {"mutation": 0, "artifact": 0, "insertion": 0, "deletion": 0, "conflict": 0, "other": 0}
+                seqFeatureCountsD[entityId][fDetails] += 1
+            #
+            #
+            # Tabulate sequence range features by entity for the filtered cases -
+            for (entityId, _, _, fDetails), _ in seqRangeFeatureD.items():
+                if entityId not in seqFeatureCountsD:
+                    seqFeatureCountsD[entityId] = {"mutation": 0, "artifact": 0, "insertion": 0, "deletion": 0, "conflict": 0, "other": 0}
+                seqFeatureCountsD[entityId][fDetails] += 1
+
+            return {
+                "seqFeatureCountsD": seqFeatureCountsD,
+                "seqMonomerFeatureD": seqMonomerFeatureD,
+                "seqRangeFeatureD": seqRangeFeatureD,
+            }
+        except Exception as e:
+            logger.exception("%s failing with %s", dataContainer.getName(), str(e))
+        return rD
+
+    def __addStructRefSeqEntityIds(self, dataContainer):
+        """Add entity ids to the struct_ref_seq and struct_ref_seq_dif category instances.
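+
+        Entity ids are copied from struct_ref to the matching struct_ref_seq rows (joined
+        on ref_id) and then propagated to struct_ref_seq_dif rows (joined on align_id).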
+ + Args: + dataContainer (object): mmif.api.DataContainer object instance + catName (str): Category name + atName (str): Attribute name + + Returns: + bool: True for success or False otherwise + + """ + catName = "struct_ref_seq" + try: + logger.debug("Starting with %r %r", dataContainer.getName(), catName) + # + if not (dataContainer.exists(catName) and dataContainer.exists("struct_ref")): + return False + # + atName = "rcsb_entity_id" + srsObj = dataContainer.getObj(catName) + if not srsObj.hasAttribute(atName): + srsObj.appendAttributeExtendRows(atName, defaultValue="?") + else: + # skip if attribute has already been added - + return True + # + srObj = dataContainer.getObj("struct_ref") + # + srsdObj = None + if dataContainer.exists("struct_ref_seq_dif"): + srsdObj = dataContainer.getObj("struct_ref_seq_dif") + if not srsdObj.hasAttribute(atName): + # srsdObj.appendAttribute(atName) + srsdObj.appendAttributeExtendRows(atName, defaultValue="?") + + for ii in range(srObj.getRowCount()): + entityId = srObj.getValue("entity_id", ii) + refId = srObj.getValue("id", ii) + # + # Get indices for the target refId. + iRowL = srsObj.selectIndices(refId, "ref_id") + for iRow in iRowL: + srsObj.setValue(entityId, "rcsb_entity_id", iRow) + alignId = srsObj.getValue("align_id", iRow) + # + if srsdObj: + jRowL = srsdObj.selectIndices(alignId, "align_id") + for jRow in jRowL: + srsdObj.setValue(entityId, "rcsb_entity_id", jRow) + + return True + except Exception as e: + logger.exception("%s %s failing with %s", dataContainer.getName(), catName, str(e)) + return False + + def filterRefSequenceDif(self, details): + filteredDetails = details + if details.upper() in [ + "ACETYLATION", + "CHROMOPHORE", + "VARIANT", + "MODIFIED RESIDUE", + "MODIFIED", + "ENGINEERED", + "ENGINEERED MUTATION", + "AMIDATION", + "FORMYLATION", + "ALLELIC VARIANT", + "AUTOPHOSPHORYLATION", + "BENZOYLATION", + "CHEMICAL MODIFICATION", + "CHEMICALLY MODIFIED", + "CHROMOPHOR, REM 999", + "CHROMOPHORE, REM 999", + "D-CONFIGURATION", + "ENGINEERED AND OXIDIZED CYS", + "ENGINEERED MUTANT", + "ENGINERED MUTATION", + "HYDROXYLATION", + "METHYLATED ASN", + "METHYLATION", + "MICROHETEROGENEITY", + "MODEIFED RESIDUE", + "MODIFICATION", + "MODIFIED AMINO ACID", + "MODIFIED CHROMOPHORE", + "MODIFIED GLN", + "MODIFIED RESIDUES", + "MUTATION", + "MYC EPITOPE", + "MYRISTOYLATED", + "MYRISTOYLATION", + "NATURAL VARIANT", + "NATURAL VARIANTS", + "OXIDIZED CY", + "OXIDIZED CYS", + "PHOSPHORYLATION", + "POLYMORPHIC VARIANT", + "PROPIONATION", + "SOMATIC VARIANT", + "SUBSTITUTION", + "TRNA EDITING", + "TRNA MODIFICATION", + "TRNA", + "VARIANT STRAIN", + "VARIANTS", + ]: + filteredDetails = "mutation" + elif details.upper() in [ + "LEADER SEQUENCE", + "INITIATING METHIONINE", + "INITIATOR METHIONINE", + "LINKER", + "EXPRESSION TAG", + "CLONING", + "CLONING ARTIFACT", + "C-TERM CLONING ARTIFA", + "C-TERMINAL HIS TAG", + "C-TERMINLA HIS-TAG", + "CLONING AETIFACT", + "CLONING ARATIFACT", + "CLONING ARTEFACT", + "CLONING ARTFIACT", + "CLONING ARTIACT", + "CLONING ARTIFACTS", + "CLONING ARTUFACT", + "CLONING ATIFACT", + "CLONING MUTATION", + "CLONING REMNANT", + "CLONING SITE RESIDUE", + "CLONNG ARTIFACT", + "CLONONG ARTIFACT", + "DETECTION TAG", + "ENGINEERED LINKER", + "EXPRESSION ARTIFACT", + "EXPRESSIOPN TAG", + "EXPRSSION TAG", + "FLAG TAG", + "GCN4 TAG", + "GPGS TAG", + "GST TAG", + "HIA TAG", + "HIS TAG", + "HIS-TAG", + "INITIAL METHIONINE", + "INITIATING MET", + "INITIATING METHIONIE", + "INITIATING MSE", + "INITIATING RESIDUE", + "INITIATOR 
N-FORMYL-MET", + "INTIATING METHIONINE", + "INTRACHAIN HIS TAG", + "LINKER INSERTION", + "LINKER PEPTIDE", + "LINKER RESIDUE", + "LINKER SEQUENCE", + "LYS TAG", + "MOD. RESIDUE/CLONING ARTIFACT", + "MYC TAG", + "N-TERMINAL EXTENSION", + "N-TERMINAL HIS TAG", + "PURIFICATION TAG", + "RANDOM MUTAGENESIS", + "RECOMBINANT HIS TAG", + "RESIDUAL LINKER", + "STREP-TAGII", + "T7 EPITOPE TAG", + "T7-TAG", + "TAG", + ]: + filteredDetails = "artifact" + elif details.upper() in ["INSERTION", "ENGINEERED INSERTION", "INSERTED", "INSERTION AT N-TERMINUS"]: + filteredDetails = "insertion" + elif details.upper() in ["DELETION", "CONFLICT/DELETION", "ENGINEERED DELETION"]: + filteredDetails = "deletion" + elif details.upper() in ["CONFLICT", "SEQUENCE CONFLICT", "SEQUENCE CONFLICT8"]: + filteredDetails = "conflict" + else: + logger.debug("Unanticipated sequence difference details %r", details) + filteredDetails = "other" + # + return filteredDetails + + def filterEntityPolyType(self, pType): + """Map input dictionary polymer type to simplified molecular type. + + Args: + pType (str): PDBx/mmCIF dictionary polymer type + + Returns: + str: simplified mappings + + Returns mappings: + 'Protein' 'polypeptide(D) or polypeptide(L)' + 'DNA' 'polydeoxyribonucleotide' + 'RNA' 'polyribonucleotide' + 'NA-hybrid' 'polydeoxyribonucleotide/polyribonucleotide hybrid' + 'Other' 'polysaccharide(D), polysaccharide(L), cyclic-pseudo-peptide, peptide nucleic acid, or other' + """ + polymerType = pType.lower() + if polymerType in ["polypeptide(d)", "polypeptide(l)"]: + rT = "Protein" + elif polymerType in ["polydeoxyribonucleotide"]: + rT = "DNA" + elif polymerType in ["polyribonucleotide"]: + rT = "RNA" + elif polymerType in ["polydeoxyribonucleotide/polyribonucleotide hybrid"]: + rT = "NA-hybrid" + else: + rT = "Other" + return rT + + def guessEntityPolyTypes(self, monomerL): + """Guess the polymer types to from the monomer list. + + Args: + monomerL (list): list of monomers (chemical component ids) + + Returns: + tuple: polymerType, filtered polymer Type. + + Returns mappings: + 'Protein' 'polypeptide(D) or polypeptide(L)' + 'DNA' 'polydeoxyribonucleotide' + 'RNA' 'polyribonucleotide' + 'NA-hybrid' 'polydeoxyribonucleotide/polyribonucleotide hybrid' + 'Other' 'polysaccharide(D), polysaccharide(L), cyclic-pseudo-peptide, peptide nucleic acid, or other' + """ + hasAA = hasDNA = hasRNA = False + pType = fpType = None + for monomer in monomerL: + if monomer in DictMethodCommonUtils.aaDict3: + hasAA = True + elif monomer in DictMethodCommonUtils.dnaDict3: + hasDNA = True + elif monomer in DictMethodCommonUtils.rnaDict3: + hasRNA = True + # + if hasAA and not hasDNA and not hasRNA: + pType = "polypeptide(d)" + elif hasDNA and not hasAA and not hasRNA: + pType = "polydeoxyribonucleotide" + elif hasRNA and not hasAA and not hasDNA: + pType = "polyribonucleotide" + elif not hasAA and hasDNA and hasRNA: + pType = "polydeoxyribonucleotide/polyribonucleotide hybrid" + + if pType: + fpType = self.filterEntityPolyType(pType) + else: + pType = None + fpType = "Other" + # + return pType, fpType + + def getPolymerComposition(self, polymerTypeList): + """Map in list of dictionary entity polymer/branched types to a composition string. + Input polymerTypeList contains entity_poly.type and pdbx_entity_branch.type values. 
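+
+        For example (illustrative, derived from the branching below):
+            ['polypeptide(L)', 'polyribonucleotide'] -> ('protein/NA', 'Protein/NA', 'Other', {'protein': 1, 'RNA': 1})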
+ + Args: + polymerTypeList (list): List of PDBx/mmCIF dictionary polymer/branched types + + Returns: + tuple: compClass, ptClass, naClass, cD + + compClass - simplified composition string + ptClass - subset class + naClass - nucleic acid subset class + cD (dict) - composition type counts + + Current polymer type list: + 'polypeptide(D)' + 'polypeptide(L)' + 'polydeoxyribonucleotide' + 'polyribonucleotide' + 'polysaccharide(D)' + 'polysaccharide(L)' + 'polydeoxyribonucleotide/polyribonucleotide hybrid' + 'cyclic-pseudo-peptide' + 'peptide nucleic acid' + 'other' + "other type pair (polymer type count = 2)" + "other composition (polymer type count >= 3)" + + Current branch type list: + 'oligosaccharide' + + Output composition classes: + + 'homomeric protein' 'single protein entity' + 'heteromeric protein' 'multiple protein entities' + 'DNA' 'DNA entity/entities only' + 'RNA' 'RNA entity/entities only' + 'NA-hybrid' 'DNA/RNA hybrid entity/entities only' + 'protein/NA' 'Both protein and nucleic acid polymer entities' + 'DNA/RNA' 'Both DNA and RNA polymer entities' + 'oligosaccharide' 'One of more oligosaccharide entities' + 'protein/oligosaccharide' 'Both protein and oligosaccharide entities' + 'NA/oligosaccharide' 'Both NA and oligosaccharide entities' + 'other' 'Neither an individual protein, nucleic acid polymer nor oligosaccharide entity' + 'other type pair' 'Other combinations of 2 polymer types' + 'other type composition' 'Other combinations of 3 or more polymer types' + + And selected types (ptClass)- + 'Protein (only)' 'protein entity/entities only' + 'Nucleic acid (only)' 'DNA, RNA or NA-hybrid entity/entities only' + 'Protein/NA' 'Both protein and nucleic acid (DNA, RNA, or NA-hybrid) polymer entities' + 'Other' 'Another polymer type composition' + + And selected NA types (naClass) - + 'DNA (only)' 'DNA entity/entities only' + 'RNA (only)' 'RNA entity/entities only' + 'NA-hybrid (only)' 'NA-hybrid entity/entities only' + 'DNA/RNA (only)' 'Both DNA and RNA polymer entities only' + 'Other' 'Another polymer type composition' + """ + + compClass = "other" + # get type counts + cD = {} + for polymerType in polymerTypeList: + if polymerType in ["polypeptide(D)", "polypeptide(L)"]: + cD["protein"] = cD["protein"] + 1 if "protein" in cD else 1 + elif polymerType in ["polydeoxyribonucleotide"]: + cD["DNA"] = cD["DNA"] + 1 if "DNA" in cD else 1 + elif polymerType in ["polyribonucleotide"]: + cD["RNA"] = cD["RNA"] + 1 if "RNA" in cD else 1 + elif polymerType in ["polydeoxyribonucleotide/polyribonucleotide hybrid"]: + cD["NA-hybrid"] = cD["NA-hybrid"] + 1 if "NA-hybrid" in cD else 1 + elif polymerType in ["oligosaccharide"]: + cD["oligosaccharide"] = cD["oligosaccharide"] + 1 if "oligosaccharide" in cD else 1 + else: + cD["other"] = cD["other"] + 1 if "other" in cD else 1 + # + if len(cD) == 1: + ky = list(cD.keys())[0] + if "protein" in cD: + if cD["protein"] == 1: + compClass = "homomeric protein" + else: + compClass = "heteromeric protein" + elif ky in ["DNA", "RNA", "NA-hybrid", "oligosaccharide", "other"]: + compClass = ky + elif len(cD) == 2: + if "protein" in cD: + if ("DNA" in cD) or ("RNA" in cD) or ("NA-hybrid" in cD): + compClass = "protein/NA" + elif "oligosaccharide" in cD: + compClass = "protein/oligosaccharide" + elif "DNA" in cD and "RNA" in cD: + compClass = "DNA/RNA" + elif "oligosaccharide" in cD and ("RNA" in cD or "DNA" in cD): + compClass = "NA/oligosaccharide" + else: + compClass = "other type pair" + elif len(cD) == 3: + if "DNA" in cD and "RNA" in cD and "NA-hybrid" in 
cD: + compClass = "DNA/RNA" + elif "oligosaccharide" in cD and all([j in ["oligosaccharide", "DNA", "RNA", "NA-hybrid"] for j in cD]): + compClass = "NA/oligosaccharide" + elif "protein" in cD and all([j in ["protein", "DNA", "RNA", "NA-hybrid"] for j in cD]): + compClass = "protein/NA" + elif "oligosaccharide" in cD and "protein" in cD and all([j in ["protein", "oligosaccharide", "DNA", "RNA", "NA-hybrid"] for j in cD]): + compClass = "protein/NA/oligosaccharide" + else: + compClass = "other type composition" + elif len(cD) >= 4: + if "oligosaccharide" in cD and all([j in ["oligosaccharide", "DNA", "RNA", "NA-hybrid"] for j in cD]): + compClass = "NA/oligosaccharide" + elif "protein" in cD and all([j in ["protein", "DNA", "RNA", "NA-hybrid"] for j in cD]): + compClass = "protein/NA" + elif "oligosaccharide" in cD and "protein" in cD and all([j in ["protein", "oligosaccharide", "DNA", "RNA", "NA-hybrid"] for j in cD]): + compClass = "protein/NA/oligosaccharide" + else: + compClass = "other type composition" + else: + compClass = "other type composition" + + # Subset type class -- + # + if compClass in ["homomeric protein", "heteromeric protein"]: + ptClass = "Protein (only)" + elif compClass in ["DNA", "RNA", "NA-hybrid", "DNA/RNA"]: + ptClass = "Nucleic acid (only)" + elif compClass in ["protein/NA"]: + ptClass = "Protein/NA" + # JDW + elif compClass in ["protein/oligosaccharide"]: + ptClass = "Protein/Oligosaccharide" + elif compClass in ["oligosaccharide"]: + ptClass = "Oligosaccharide (only)" + # elif compClass in ["protein/NA/oligosaccharide"]: + # ptClass = "Protein/NA/Oligosaccharide" + # JDW + else: + ptClass = "Other" + # + # NA subtype class --- + # + if compClass in ["DNA"]: + naClass = "DNA (only)" + elif compClass in ["RNA"]: + naClass = "RNA (only)" + elif compClass in ["NA-hybrid"]: + naClass = "NA-hybrid (only)" + elif compClass in ["DNA/RNA"]: + naClass = "DNA/RNA (only)" + else: + naClass = "Other" + # + return compClass, ptClass, naClass, cD + + def filterExperimentalMethod(self, methodL): + """Apply a standard filter to the input experimental method list returning a method count and + a simplified method name. 
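+
+        For example (illustrative):
+            ["SOLUTION NMR"]                             -> (1, "NMR")
+            ["X-RAY DIFFRACTION", "NEUTRON DIFFRACTION"] -> (2, "Multiple methods")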
+
+        Args:
+            methodL (list): List of dictionary compliant experimental method names
+
+        Returns:
+            tuple(int,str): methodCount, simpleMethodName
+
+        For example:
+            'X-ray'            'X-RAY DIFFRACTION, FIBER DIFFRACTION, or POWDER DIFFRACTION'
+            'NMR'              'SOLUTION NMR or SOLID-STATE NMR'
+            'EM'               'ELECTRON MICROSCOPY, ELECTRON CRYSTALLOGRAPHY, ELECTRON DIFFRACTION, CRYO-ELECTRON MICROSCOPY or ELECTRON TOMOGRAPHY'
+            'Neutron'          'NEUTRON DIFFRACTION'
+            'Multiple methods' 'Multiple experimental methods'
+            'Other'            'SOLUTION SCATTERING, EPR, THEORETICAL MODEL, INFRARED SPECTROSCOPY or FLUORESCENCE TRANSFER'
+        """
+        methodCount = len(methodL)
+        if methodCount > 1:
+            expMethod = "Multiple methods"
+        else:
+            #
+            mS = methodL[0].upper()
+            expMethod = "Other"
+            if mS in ["X-RAY DIFFRACTION", "FIBER DIFFRACTION", "POWDER DIFFRACTION"]:
+                expMethod = "X-ray"
+            elif mS in ["SOLUTION NMR", "SOLID-STATE NMR"]:
+                expMethod = "NMR"
+            elif mS in ["ELECTRON MICROSCOPY", "ELECTRON CRYSTALLOGRAPHY", "ELECTRON DIFFRACTION", "CRYO-ELECTRON MICROSCOPY", "ELECTRON TOMOGRAPHY"]:
+                expMethod = "EM"
+            elif mS in ["NEUTRON DIFFRACTION"]:
+                expMethod = "Neutron"
+            elif mS in ["SOLUTION SCATTERING", "EPR", "THEORETICAL MODEL", "INFRARED SPECTROSCOPY", "FLUORESCENCE TRANSFER"]:
+                expMethod = "Other"
+            else:
+                logger.error("Unexpected experimental method %r", mS)
+
+        return methodCount, expMethod
+
+    def hasMethodNMR(self, methodL):
+        """Return True if the input dictionary experimental method list contains an NMR experimental method.
+
+        Args:
+            methodL (list): List of dictionary experimental method names
+
+        Returns:
+            bool: True if the input contains NMR or False otherwise
+        """
+        ok = False
+        for method in methodL:
+            if method in ["SOLUTION NMR", "SOLID-STATE NMR"]:
+                return True
+        return ok
+
+    def __getTimeStamp(self):
+        utcnow = datetime.datetime.utcnow()
+        ts = utcnow.strftime("%Y-%m-%d:%H:%M:%S")
+        return ts
+
+    def __stripWhiteSpace(self, val):
+        """Remove all white space from the input value."""
+        if val is None:
+            return val
+        return self.__wsPattern.sub("", val)
+
+    def __toRangeList(self, iterable):
+        """Yield contiguous (begin, end) ranges from the input collection of integers."""
+        iterable = sorted(set(iterable))
+        for _, group in itertools.groupby(enumerate(iterable), lambda t: t[1] - t[0]):
+            group = list(group)
+            yield group[0][1], group[-1][1]
+
+    #
+    def getTargetSiteInfo(self, dataContainer):
+        """Return a dictionary of target site binding interactions using standard nomenclature.
+
+        Args:
+            dataContainer (object): mmcif.api.DataContainer object instance
+
+        Returns:
+            dict: {site_id: [{'asymId': , 'compId': , 'seqId': }, ...], ... }
+
+        """
+        if not dataContainer or not dataContainer.getName():
+            return {}
+        wD = self.__fetchInstanceSiteInfo(dataContainer)
+        return wD["targetSiteD"] if "targetSiteD" in wD else {}
+
+    def getLigandSiteInfo(self, dataContainer):
+        """Return a dictionary of ligand site binding interactions.
+
+        Args:
+            dataContainer (object): mmcif.api.DataContainer object instance
+
+        Returns:
+            dict: {site_id: {"evCode": software|author,
+                             "fromDetails": True|False,
+                             "isRaw": True|False,
+                             "entityType": polymer|non-polymer,
+                             "polymerLigand": {"asymId": ., "entityId": ., "begSeqId": ., "endSeqId":.
}, + "nonPolymerLigands": [{"asymId": ., "entityId": ., "compId": .}, ...], + "description": raw or generated text, + } + } + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchInstanceSiteInfo(dataContainer) + return wD["ligandSiteD"] if "ligandSiteD" in wD else {} + + def __fetchInstanceSiteInfo(self, dataContainer): + wD = self.__instanceSiteInfoCache.get(dataContainer.getName()) + if not wD: + wD = self.__getInstanceSiteInfo(dataContainer) + self.__instanceSiteInfoCache.set(dataContainer.getName(), wD) + return wD + + def __getInstanceSiteInfo(self, dataContainer): + """[summary] + + Args: + dataContainer (object): mmif.api.DataContainer object instance + + Returns: + dict : {"targetSiteD" = {: {}} + "ligandSiteD": {: {}} + } + + For example: + + loop_ + _struct_site.id + _struct_site.pdbx_evidence_code + _struct_site.pdbx_auth_asym_id + _struct_site.pdbx_auth_comp_id + _struct_site.pdbx_auth_seq_id + _struct_site.pdbx_auth_ins_code # never used + _struct_site.pdbx_num_residues + _struct_site.details + AC1 Software ? ? ? ? 7 'BINDING SITE FOR RESIDUE ADP A 105' + AC2 Software ? ? ? ? 16 'BINDING SITE FOR RESIDUE ADP B 101' + AC3 Software ? ? ? ? 6 'BINDING SITE FOR RESIDUE MG B 66' + AC4 Software ? ? ? ? 13 'BINDING SITE FOR RESIDUE ADP C 102' + AC5 Software ? ? ? ? 16 'BINDING SITE FOR RESIDUE ADP E 103' + AC6 Software ? ? ? ? 10 'BINDING SITE FOR RESIDUE ADP F 104' + AC7 Software ? ? ? ? 6 'BINDING SITE FOR RESIDUE MG K 9' + # + loop_ + _struct_site_gen.id + _struct_site_gen.site_id + _struct_site_gen.pdbx_num_res + _struct_site_gen.label_comp_id + _struct_site_gen.label_asym_id + _struct_site_gen.label_seq_id + _struct_site_gen.pdbx_auth_ins_code + _struct_site_gen.auth_comp_id + _struct_site_gen.auth_asym_id + _struct_site_gen.auth_seq_id + _struct_site_gen.label_atom_id + _struct_site_gen.label_alt_id + _struct_site_gen.symmetry + _struct_site_gen.details + 1 AC1 7 TYR A 25 ? TYR A 25 . ? 1_555 ? + 2 AC1 7 GLY A 29 ? GLY A 29 . ? 1_555 ? + 3 AC1 7 THR A 61 ? THR A 61 . ? 1_555 ? + 4 AC1 7 VAL A 63 ? VAL A 63 . ? 1_555 ? + 5 AC1 7 ILE B 30 ? ILE B 30 . ? 1_555 ? + 6 AC1 7 LEU B 32 ? LEU B 32 . ? 1_555 ? + 7 AC1 7 GLN B 52 ? GLN B 52 . ? 1_555 ? + 8 AC2 16 TYR B 25 ? TYR B 25 . ? 1_555 ? + 9 AC2 16 LEU B 26 ? LEU B 26 . ? 1_555 ? + 10 AC2 16 GLY B 29 ? GLY B 29 . ? 1_555 ? + 11 AC2 16 LYS B 31 ? LYS B 31 . ? 1_555 ? + 12 AC2 16 SER B 60 ? SER B 60 . ? 1_555 ? + 13 AC2 16 THR B 61 ? THR B 61 . ? 1_555 ? + 14 AC2 16 HOH P . ? HOH B 113 . ? 1_555 ? + 15 AC2 16 HOH P . ? HOH B 116 . ? 1_555 ? + 16 AC2 16 HOH P . ? HOH B 201 . ? 1_555 ? + 17 AC2 16 HOH P . ? HOH B 241 . ? 1_555 ? + 18 AC2 16 LEU C 26 ? LEU C 26 . ? 1_555 ? + 19 AC2 16 ASN C 28 ? ASN C 28 . ? 1_555 ? + 20 AC2 16 ILE C 30 ? ILE C 30 . ? 1_555 ? + 21 AC2 16 LEU C 32 ? LEU C 32 . ? 1_555 ? + 22 AC2 16 ARG F 16 ? ARG F 16 . ? 1_565 ? + 23 AC2 16 ARG F 17 ? ARG F 17 . ? 1_565 ? 
+ """ + logger.debug("Starting with %r", dataContainer.getName()) + # + rD = {"targetSiteD": {}, "ligandSiteD": {}} + try: + # Exit if source categories are missing + if not (dataContainer.exists("struct_site") and dataContainer.exists("struct_site_gen")): + return rD + # ------- --------- ------- --------- ------- --------- ------- --------- ------- --------- + ssObj = None + if dataContainer.exists("struct_site"): + ssObj = dataContainer.getObj("struct_site") + # + ssgObj = None + if dataContainer.exists("struct_site_gen"): + ssgObj = dataContainer.getObj("struct_site_gen") + + # + ligandSiteD = {} + for ii in range(ssObj.getRowCount()): + ligL = [] + evCode = str(ssObj.getValue("pdbx_evidence_code", ii)).lower() + if evCode not in ["software", "author"]: + continue + sId = ssObj.getValue("id", ii) + authAsymId = ssObj.getValueOrDefault("pdbx_auth_asym_id", ii, defaultValue=None) + compId = ssObj.getValueOrDefault("pdbx_auth_comp_id", ii, defaultValue=None) + authSeqId = ssObj.getValueOrDefault("pdbx_auth_seq_id", ii, defaultValue=None) + ssDetails = ssObj.getValueOrDefault("details", ii, defaultValue=None) + fromDetails = False + if authAsymId: + ligL.append((authAsymId, compId, authSeqId, ssDetails)) + else: + fromDetails = True + if evCode == "software": + ligL = self.__parseStructSiteLigandDetails(ssDetails) + elif evCode == "author": + ligL.append((None, None, None, ssDetails)) + # + ligandSiteD[sId] = self.__transStructSiteLigandDetails(dataContainer, ligL, evCode=evCode, fromDetails=fromDetails) + # + + targetSiteD = {} + instTypeD = self.getInstanceTypes(dataContainer) + for ii in range(ssgObj.getRowCount()): + sId = ssgObj.getValue("site_id", ii) + asymId = ssgObj.getValueOrDefault("label_asym_id", ii, defaultValue=None) + compId = ssgObj.getValueOrDefault("label_comp_id", ii, defaultValue=None) + seqId = ssgObj.getValueOrDefault("label_seq_id", ii, defaultValue=None) + # + if asymId and compId and seqId and asymId in instTypeD and instTypeD[asymId] == "polymer": + targetSiteD.setdefault(sId, []).append({"asymId": asymId, "compId": compId, "seqId": seqId}) + # + return {"targetSiteD": targetSiteD, "ligandSiteD": ligandSiteD} + except Exception as e: + logger.exception("%s failing with %s", dataContainer.getName(), str(e)) + return rD + + def __transStructSiteLigandDetails(self, dataContainer, ligL, evCode="software", fromDetails=True): + """Convert struct_site ligand details to standard nomenclature. + + Args: + dataContainer (object): mmif.api.DataContainer object instance + ligL (list): list of raw ligand details in author nomenclature + evCode (str): string (software|author) + fromDetails (bool, optional): details parsed from descriptive text. Defaults to True. + + Returns: + dict: {"evCode": software|author, + "fromDetails": True|False, + "isRaw": True|False, + "entityType": polymer|non-polymer, + "polymerLigand": {"asymId": ., "entityId": ., "begSeqId": ., "endSeqId":. 
}, + "nonPolymerLigands": [{"asymId": ., "entityId": ., "compId": .}, ...], + "description": raw or generated text, + "siteLabel": replacement for data site id, + } + + """ + rD = { + "evCode": evCode, + "fromDetails": fromDetails, + "isRaw": True, + "entityType": None, + "polymerLigand": None, + "nonPolymerLigands": None, + "description": None, + "siteLabel": None, + } + npAuthAsymD = self.getNonPolymerIdMap(dataContainer) + pAuthAsymD = self.getPolymerIdMap(dataContainer) + asymAuthIdD = self.getAsymAuthIdMap(dataContainer) + asymIdPolymerRangesD = self.getInstancePolymerRanges(dataContainer) + iTypeD = self.getInstanceTypes(dataContainer) + asymAuthIdD = self.getAsymAuthIdMap(dataContainer) + # Note that this is a non-unique index inversion + authAsymD = {v: k for k, v in asymAuthIdD.items()} + instEntityD = self.getInstanceEntityMap(dataContainer) + evS = "Software generated" if evCode == "software" else "Author provided" + # + if len(ligL) == 1: + authAsymId, compId, authSeqId, ssDetails = ligL[0] + # + if not authAsymId: + rD["description"] = ssDetails + rD["isRaw"] = True + elif not authSeqId: + # An unqualified authAsymId - + asymId = authAsymD[authAsymId] if authAsymId in authAsymD else None + entityId = instEntityD[asymId] if asymId in instEntityD else None + if entityId and asymId and asymId in iTypeD and iTypeD[asymId] == "polymer" and asymId in asymIdPolymerRangesD: + # insert the full residue range - + rD["entityType"] = iTypeD[asymId] + begSeqId = asymIdPolymerRangesD[asymId]["begSeqId"] + endSeqId = asymIdPolymerRangesD[asymId]["endSeqId"] + tD = {"asymId": asymId, "entityId": instEntityD[asymId], "begSeqId": begSeqId, "endSeqId": endSeqId} + rD["description"] = "%s binding site for entity %s (%s-%s) instance %s chain %s" % (evS, entityId, begSeqId, endSeqId, asymId, authAsymId) + rD["polymerLigand"] = tD + rD["siteLabel"] = "chain %s" % authAsymId + elif (authAsymId, authSeqId) in npAuthAsymD: + # single non-polymer-ligand - + asymId = npAuthAsymD[(authAsymId, authSeqId)]["asym_id"] + rD["entityType"] = iTypeD[asymId] + entityId = instEntityD[asymId] + tD = {"asymId": asymId, "entityId": instEntityD[asymId], "compId": compId} + rD["nonPolymerLigands"] = [tD] + rD["description"] = "%s binding site for ligand entity %s component %s instance %s chain %s" % (evS, entityId, compId, asymId, authAsymId) + rD["siteLabel"] = "ligand %s" % compId + elif (authAsymId, authSeqId, None) in pAuthAsymD: + # single monomer ligand - an odd case + asymId = pAuthAsymD[(authAsymId, authSeqId, None)]["asym_id"] + entityId = pAuthAsymD[(authAsymId, authSeqId, None)]["entity_id"] + seqId = pAuthAsymD[(authAsymId, authSeqId, None)]["seq_id"] + rD["entityType"] = iTypeD[asymId] + tD = {"asymId": asymId, "entityId": entityId, "begSeqId": seqId, "endSeqId": seqId} + rD["description"] = "%s binding site for entity %s instance %s chainId %s (%s)" % (evS, entityId, asymId, authAsymId, authSeqId) + rD["polymerLigand"] = tD + rD["siteLabel"] = "chain %s" % authAsymId + else: + logger.debug("%s untranslated single ligand details %r", dataContainer.getName(), ligL) + logger.debug("npAuthAsymD %r", npAuthAsymD) + rD["description"] = ssDetails + rD["isRaw"] = True + # + elif len(ligL) == 2: + authAsymIdA, compIdA, authSeqIdA, ssDetailsA = ligL[0] + authAsymIdB, compIdB, authSeqIdB, _ = ligL[1] + # + # is np + if (authAsymIdA, authSeqIdA) in npAuthAsymD and (authAsymIdB, authSeqIdB) in npAuthAsymD: + asymIdA = npAuthAsymD[(authAsymIdA, authSeqIdA)]["asym_id"] + entityIdA = npAuthAsymD[(authAsymIdA, 
authSeqIdA)]["entity_id"] + asymIdB = npAuthAsymD[(authAsymIdB, authSeqIdB)]["asym_id"] + entityIdB = npAuthAsymD[(authAsymIdB, authSeqIdB)]["entity_id"] + tDA = {"asymId": asymIdA, "entityId": entityIdA, "compId": compIdA} + tDB = {"asymId": asymIdB, "entityId": entityIdB, "compId": compIdB} + rD["nonPolymerLigands"] = [tDA, tDB] + rD["entityType"] = iTypeD[asymIdA] + rD["description"] = "%s binding site for ligands: entity %s component %s instance %s chain %s and entity %s component %s instance %s chain %s" % ( + evS, + entityIdA, + compIdA, + asymIdA, + authAsymIdA, + entityIdB, + compIdB, + asymIdB, + authAsymIdB, + ) + rD["siteLabel"] = "ligands %s/%s" % (compIdA, compIdB) + elif (authAsymIdA, authSeqIdA, None) in pAuthAsymD and (authAsymIdB, authSeqIdB, None) in pAuthAsymD and authAsymIdA == authAsymIdB: + asymIdA = pAuthAsymD[(authAsymIdA, authSeqIdA, None)]["asym_id"] + entityIdA = pAuthAsymD[(authAsymIdA, authSeqIdA, None)]["entity_id"] + asymIdB = pAuthAsymD[(authAsymIdB, authSeqIdB, None)]["asym_id"] + entityIdB = pAuthAsymD[(authAsymIdB, authSeqIdB, None)]["entity_id"] + begSeqId = pAuthAsymD[(authAsymIdA, authSeqIdA, None)]["seq_id"] + endSeqId = pAuthAsymD[(authAsymIdB, authSeqIdB, None)]["seq_id"] + tD = {"asymId": asymIdA, "entityId": instEntityD[asymIdA], "begSeqId": begSeqId, "endSeqId": endSeqId} + rD["entityType"] = iTypeD[asymIdA] + rD["description"] = "%s binding site for entity %s instance %s chain %s and entity %s instance %s chain %s" % ( + evS, + entityIdA, + asymIdA, + authAsymIdA, + entityIdB, + asymIdB, + authAsymIdB, + ) + rD["polymerLigand"] = tD + rD["siteLabel"] = "chains %s/%s" % (authAsymIdA, authAsymIdB) + else: + logger.debug("%s untranslated ligand details %r", dataContainer.getName(), ligL) + rD["description"] = ssDetailsA + rD["isRaw"] = True + else: + logger.error("%s unexpected ligand expression %r", dataContainer.getName(), ligL) + return rD + + def __parseStructSiteLigandDetails(self, ssDetails): + """Parse the input site description text and returning structured details + where possible. + + Args: + ssDetails (str): struct_site.details text + + Returns: + list: [(authAsymId, compId, authSeqId, ssDetails), ... 
] + + """ + retL = [] + # + try: + if not ssDetails: + retL.append((None, None, None, None)) + return retL + prefixL = [ + "BINDING SITE FOR RESIDUE ", + "binding site for residue ", + "Binding site for Ligand ", + "binding site for Ligand ", + "Binding site for Mono-Saccharide ", + "BINDING SITE FOR MONO-SACCHARIDE ", + "binding site for Mono-Saccharide ", + "binding site for Poly-Saccharide ", + "binding site for nucleotide ", + ] + for prefix in prefixL: + tup = ssDetails.partition(prefix) + if tup[1] == prefix: + ff = tup[2].split(" ") + # binding site for Ligand residues POL d 4 through N7P d 1 bound to THR b 1 + if ff[0] == "residues" and len(ff) > 8 and ff[4].lower() == "through": + compIdA = ff[1] + authAsymIdA = ff[2] + authSeqIdA = ff[3] + retL.append((authAsymIdA, compIdA, authSeqIdA, ssDetails)) + # + compIdB = ff[5] + authAsymIdB = ff[6] + authSeqIdB = ff[7] + retL.append((authAsymIdB, compIdB, authSeqIdB, ssDetails)) + return retL + elif len(ff) == 2: + compId = ff[0] + authAsymId = ff[1][0] + authSeqId = ff[1][1:] + retL.append((authAsymId, compId, authSeqId, ssDetails)) + return retL + elif len(ff) == 3: + compId = ff[0] + authAsymId = ff[1] + authSeqId = ff[2] + retL.append((authAsymId, compId, authSeqId, ssDetails)) + return retL + + # + # Binding site for residues GCD A 900 and NGA A 901 + # Binding site for residues FUC A1118 and BGC A1119' + prefixL = [ + "Binding site for residues ", + "binding site for residues ", + "BINDING SITE FOR DI-SACCHARIDE ", + "Binding site for Di-Saccharide ", + "binding site for Di-Saccharide ", + "binding site for Di-peptide ", + "Binding site for Di-peptide ", + "binding site for Di-nucleotide ", + ] + for prefix in prefixL: + tup = ssDetails.partition(prefix) + if tup[1] == prefix: + ff = tup[2].split(" ") + if len(ff) == 5: + compIdA = ff[0] + authAsymIdA = ff[1][0] + authSeqIdA = ff[1][1:] + compIdB = ff[3] + authAsymIdB = ff[4][0] + authSeqIdB = ff[4][1:] + elif len(ff) == 7: + compIdA = ff[0] + authAsymIdA = ff[1] + authSeqIdA = ff[2] + compIdB = ff[4] + authAsymIdB = ff[5] + authSeqIdB = ff[6] + else: + compIdA = authAsymIdA = authSeqIdA = compIdB = authAsymIdB = authSeqIdB = None + + retL.append((authAsymIdA, compIdA, authSeqIdA, ssDetails)) + retL.append((authAsymIdB, compIdB, authSeqIdB, ssDetails)) + return retL + # + # BINDING SITE FOR LINKED RESIDUES A 1519 A 1520 A 1521 A 1522 A 1523 A 1524 A 1525 + # BINDING SITE FOR LINKED RESIDUES A 801 to 802 + prefixL = ["BINDING SITE FOR LINKED RESIDUES "] + for prefix in prefixL: + tup = ssDetails.partition(prefix) + if tup[1] == prefix: + ff = tup[2].split(" ") + if len(ff) == 2: + # BINDING SITE FOR LINKED RESIDUES A 502-507 + try: + tff = ff[1].split("-") + authAsymIdA = ff[0] + authSeqIdA = tff[0] + authSeqIdB = tff[1] + except Exception: + continue + if len(ff) == 4 and ff[2].lower() == "to": + authAsymIdA = ff[0] + authSeqIdA = ff[1] + authSeqIdB = ff[3] + elif len(ff) == 4 and ff[2].lower() != "to": + authAsymIdA = ff[0] + authSeqIdA = ff[1] + authSeqIdB = ff[3] + elif len(ff) > 4: + authAsymIdA = ff[0] + authSeqIdA = ff[1] + authSeqIdB = ff[-1] + else: + continue + retL.append((authAsymIdA, None, authSeqIdA, ssDetails)) + retL.append((authAsymIdA, None, authSeqIdB, ssDetails)) + return retL + + # + # + prefixL = ["BINDING SITE FOR CHAIN ", "binding site for chain "] + for prefix in prefixL: + tup = ssDetails.partition(prefix) + if tup[1] == prefix: + ff = tup[2].split(" ") + authAsymId = ff[0] + retL.append((authAsymId, None, None, ssDetails)) + return retL + # punt - + 
+            retL.append((None, None, None, ssDetails))
+            return retL
+        except Exception as e:
+            logger.exception("Failing with %s for %r", str(e), ssDetails)
+        return [(None, None, None, ssDetails)]
+
+    def getUnobservedPolymerResidueInfo(self, dataContainer):
+        """Return a dictionary of unobserved regions of polymer instances.
+
+        Args:
+            dataContainer (object): mmcif.api.DataContainer object instance
+
+        Returns:
+            dict: {(modelId, asymId, occFlag): [seqId range list], ...}
+        """
+        if not dataContainer or not dataContainer.getName():
+            return {}
+        wD = self.__fetchUnobservedInfo(dataContainer)
+        return wD["polyResRng"] if "polyResRng" in wD else {}
+
+    def getUnobservedPolymerAtomInfo(self, dataContainer):
+        """Return a dictionary of polymer regions containing unobserved atoms.
+
+        Args:
+            dataContainer (object): mmcif.api.DataContainer object instance
+
+        Returns:
+            dict: {(modelId, asymId, occFlag): [seqId range list], ...}
+        """
+        if not dataContainer or not dataContainer.getName():
+            return {}
+        wD = self.__fetchUnobservedInfo(dataContainer)
+        return wD["polyAtomRng"] if "polyAtomRng" in wD else {}
+
+    def getUnobservedNonPolymerAtomInfo(self, dataContainer):
+        """Return a dictionary of nonpolymer instances containing unobserved atoms (std nomenclature).
+
+        Args:
+            dataContainer (object): mmcif.api.DataContainer object instance
+
+        Returns:
+            dict: {(modelId, compId, asymId, occFlag): [atomId, .. ], ...}
+
+        """
+        if not dataContainer or not dataContainer.getName():
+            return {}
+        wD = self.__fetchUnobservedInfo(dataContainer)
+        return wD["nonPolyMissingAtomD"] if "nonPolyMissingAtomD" in wD else {}
+
+    def getUnobservedNonPolymerAtomInfoAuth(self, dataContainer):
+        """Return a dictionary of nonpolymer instances containing unobserved atoms (auth nomenclature).
+
+        Args:
+            dataContainer (object): mmcif.api.DataContainer object instance
+
+        Returns:
+            dict: {(modelId, compId, authAsymId, authSeqId, occFlag): [atomId, .. ], ...}
+
+        """
+        if not dataContainer or not dataContainer.getName():
+            return {}
+        wD = self.__fetchUnobservedInfo(dataContainer)
+        return wD["nonPolyMissingAtomAuthD"] if "nonPolyMissingAtomAuthD" in wD else {}
+
+    def __fetchUnobservedInfo(self, dataContainer):
+        wD = self.__instanceUnobservedCache.get(dataContainer.getName())
+        if not wD:
+            wD = self.__getUnobserved(dataContainer)
+            self.__instanceUnobservedCache.set(dataContainer.getName(), wD)
+        return wD
+
+    def __getUnobserved(self, dataContainer):
+        """Internal method to extract unobserved and zero occupancy features.
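+
+        For the example rows below (illustrative), unobserved residues MET 1, ALA 2, and LYS 3
+        of instance A in model 1 consolidate to {("1", "A", False): [(1, 3)]} under "polyResRng"
+        (occupancy_flag 1, so the zero-occupancy flag is False).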
+ + Args: + dataContainer ([type]): [description] + + Returns: + {"polyResRng": {(modelId, asymId, occFlag): [seqId range list], ...}, + "polyAtomRng": {(modelId, asymId, occFlag): [seqId range list], ...}, + "nonPolyMissingAtomD": {(modelId, compId, asymId, zeroOccFlag): [atomId,...], }, + "nonPolyMissingAtomAuthD": {(modelId, compId, authAsymId, authSeqId, zeroOccFlag): [atomId,...], }, + } + + occFlag = 0 -> zero occupancy + Example: + + loop_ + _pdbx_unobs_or_zero_occ_atoms.id + _pdbx_unobs_or_zero_occ_atoms.PDB_model_num + _pdbx_unobs_or_zero_occ_atoms.polymer_flag + _pdbx_unobs_or_zero_occ_atoms.occupancy_flag + _pdbx_unobs_or_zero_occ_atoms.auth_asym_id + _pdbx_unobs_or_zero_occ_atoms.auth_comp_id + _pdbx_unobs_or_zero_occ_atoms.auth_seq_id + _pdbx_unobs_or_zero_occ_atoms.PDB_ins_code + _pdbx_unobs_or_zero_occ_atoms.auth_atom_id + _pdbx_unobs_or_zero_occ_atoms.label_alt_id + _pdbx_unobs_or_zero_occ_atoms.label_asym_id + _pdbx_unobs_or_zero_occ_atoms.label_comp_id + _pdbx_unobs_or_zero_occ_atoms.label_seq_id + _pdbx_unobs_or_zero_occ_atoms.label_atom_id + 1 1 Y 1 B ARG 17 ? NE ? B ARG 17 NE + 2 1 Y 1 B ARG 17 ? CZ ? B ARG 17 CZ + 3 1 Y 1 B ARG 17 ? NH1 ? B ARG 17 NH1 + + # + loop_ + _pdbx_unobs_or_zero_occ_residues.id + _pdbx_unobs_or_zero_occ_residues.PDB_model_num + _pdbx_unobs_or_zero_occ_residues.polymer_flag + _pdbx_unobs_or_zero_occ_residues.occupancy_flag + _pdbx_unobs_or_zero_occ_residues.auth_asym_id + _pdbx_unobs_or_zero_occ_residues.auth_comp_id + _pdbx_unobs_or_zero_occ_residues.auth_seq_id + _pdbx_unobs_or_zero_occ_residues.PDB_ins_code + _pdbx_unobs_or_zero_occ_residues.label_asym_id + _pdbx_unobs_or_zero_occ_residues.label_comp_id + _pdbx_unobs_or_zero_occ_residues.label_seq_id + 1 1 Y 1 A MET 1 ? A MET 1 + 2 1 Y 1 A ALA 2 ? A ALA 2 + 3 1 Y 1 A LYS 3 ? 
A LYS 3 + """ + logger.debug("Starting with %r", dataContainer.getName()) + # + rD = {} + try: + # Exit if source categories are missing + if not (dataContainer.exists("pdbx_unobs_or_zero_occ_residues") or dataContainer.exists("pdbx_unobs_or_zero_occ_atoms")): + return rD + # ------- --------- ------- --------- ------- --------- ------- --------- ------- --------- + resObj = None + if dataContainer.exists("pdbx_unobs_or_zero_occ_residues"): + resObj = dataContainer.getObj("pdbx_unobs_or_zero_occ_residues") + # + atomObj = None + if dataContainer.exists("pdbx_unobs_or_zero_occ_atoms"): + atomObj = dataContainer.getObj("pdbx_unobs_or_zero_occ_atoms") + # + polyResRngD = {} + if resObj: + for ii in range(resObj.getRowCount()): + modelId = resObj.getValueOrDefault("PDB_model_num", ii, defaultValue=None) + pFlag = resObj.getValueOrDefault("polymer_flag", ii, defaultValue=None) + if pFlag == "Y": + occFlag = resObj.getValueOrDefault("occupancy_flag", ii, defaultValue=None) + zeroOccFlag = int(occFlag) == 0 + asymId = resObj.getValueOrDefault("label_asym_id", ii, defaultValue=None) + # authAsymId = resObj.getValueOrDefault("auth_asym_id", ii, defaultValue=None) + seqId = resObj.getValueOrDefault("label_seq_id", ii, defaultValue=None) + if seqId: + polyResRngD.setdefault((modelId, asymId, zeroOccFlag), []).append(int(seqId)) + # + for tup in polyResRngD: + polyResRngD[tup] = list(self.__toRangeList(polyResRngD[tup])) + logger.debug("polyResRngD %r", polyResRngD) + # + polyAtomRngD = {} + nonPolyMissingAtomD = {} + nonPolyMissingAtomAuthD = {} + if atomObj: + for ii in range(atomObj.getRowCount()): + modelId = atomObj.getValueOrDefault("PDB_model_num", ii, defaultValue=None) + pFlag = atomObj.getValueOrDefault("polymer_flag", ii, defaultValue=None) + occFlag = atomObj.getValueOrDefault("occupancy_flag", ii, defaultValue=None) + zeroOccFlag = occFlag and int(occFlag) == 0 + asymId = atomObj.getValueOrDefault("label_asym_id", ii, defaultValue=None) + if pFlag == "Y": + # authAsymId = resObj.getValueOrDefault("auth_asym_id", ii, defaultValue=None) + seqId = atomObj.getValueOrDefault("label_seq_id", ii, defaultValue=None) + if seqId: + polyAtomRngD.setdefault((modelId, asymId, zeroOccFlag), []).append(int(seqId)) + else: + authAsymId = atomObj.getValueOrDefault("auth_asym_id", ii, defaultValue=None) + authSeqId = atomObj.getValueOrDefault("auth_seq_id", ii, defaultValue=None) + atomId = atomObj.getValueOrDefault("label_atom_id", ii, defaultValue=None) + compId = atomObj.getValueOrDefault("label_comp_id", ii, defaultValue=None) + nonPolyMissingAtomD.setdefault((modelId, compId, asymId, zeroOccFlag), []).append(atomId) + nonPolyMissingAtomAuthD.setdefault((modelId, compId, authAsymId, authSeqId, zeroOccFlag), []).append(atomId) + # + for tup in polyAtomRngD: + polyAtomRngD[tup] = list(self.__toRangeList(polyAtomRngD[tup])) + logger.debug("polyAtomRngD %r", polyAtomRngD) + # + rD = {"polyResRng": polyResRngD, "polyAtomRng": polyAtomRngD, "nonPolyMissingAtomD": nonPolyMissingAtomD, "nonPolyMissingAtomAuthD": nonPolyMissingAtomAuthD} + except Exception as e: + logger.exception("%s failing with %s", dataContainer.getName(), str(e)) + return rD + + def getInstanceModelOutlierInfo(self, dataContainer): + """Return a dictionary of polymer model outliers. 
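+
+        Results are computed once per entry and memoized via __fetchInstanceModelOutliers() below.
+        Note that, as implemented, keys carry a third boolean element (True when the outlier is
+        tied to a label_seq_id position) and values are lists of OutlierValue tuples.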
+ + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {(modelId, asymId): (seqId,compId), ...} + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchInstanceModelOutliers(dataContainer) + return wD["instanceModelOutlierD"] if "instanceModelOutlierD" in wD else {} + + def getInstanceNonpolymerValidationInfo(self, dataContainer): + """Return a dictionary of nonpolymer validation details. + + Args: + dataContainer (object): mmcif.api.mmif.api.DataContainer object instance + + Returns: + dict: {(modelId, asymId): NonpolymerValidationInstance(rsr, rsrCc, bondsRmsZ, anglesRmsZ, missingAtomCount)} + + """ + if not dataContainer or not dataContainer.getName(): + return {} + wD = self.__fetchInstanceModelOutliers(dataContainer) + return wD["instanceModelValidationD"] if "instanceModelValidationD" in wD else {} + + def __fetchInstanceModelOutliers(self, dataContainer): + wD = self.__modelOutliersCache.get(dataContainer.getName()) + if not wD: + wD = self.__getInstanceModelOutliers(dataContainer) + self.__modelOutliersCache.set(dataContainer.getName(), wD) + return wD + + def __getInstanceModelOutliers(self, dataContainer): + """Internal method to assemble model outliers details. + + Args: + dataContainer ([type]): [description] + + Returns: + {"instanceModelOutlierD": {(modelId, asymId): [(compId, seqId, "BOND_OUTLIER", optional_description), ...}} + # + loop_ + _pdbx_vrpt_instance_results.ordinal + _pdbx_vrpt_instance_results.entity_id + _pdbx_vrpt_instance_results.auth_asym_id + _pdbx_vrpt_instance_results.label_asym_id + _pdbx_vrpt_instance_results.label_comp_id + _pdbx_vrpt_instance_results.auth_seq_id + _pdbx_vrpt_instance_results.label_seq_id + _pdbx_vrpt_instance_results.PDB_ins_code + _pdbx_vrpt_instance_results.label_alt_id + _pdbx_vrpt_instance_results.PDB_model_num + _pdbx_vrpt_instance_results.num_H_reduce + _pdbx_vrpt_instance_results.cis_peptide + _pdbx_vrpt_instance_results.natoms_eds + _pdbx_vrpt_instance_results.RSR + _pdbx_vrpt_instance_results.RSRCC + _pdbx_vrpt_instance_results.RSRZ + _pdbx_vrpt_instance_results.OWAB + _pdbx_vrpt_instance_results.average_occupancy + _pdbx_vrpt_instance_results.ramachandran_class + _pdbx_vrpt_instance_results.rotamer_class + _pdbx_vrpt_instance_results.phi + _pdbx_vrpt_instance_results.psi + _pdbx_vrpt_instance_results.mogul_angles_RMSZ + _pdbx_vrpt_instance_results.mogul_bonds_RMSZ + _pdbx_vrpt_instance_results.mogul_RMSZ_num_angles + _pdbx_vrpt_instance_results.mogul_RMSZ_num_bonds + # ... + 302 1 A A TYR 340 343 ? ? 1 9 ? 12 0.108 0.943 0.117 71.350 1.000 Favored m-85 -111.8 6.4 ? ? ? ? + 303 1 A A LYS 341 344 ? ? 1 13 ? 9 0.120 0.955 -0.380 67.860 1.000 Favored mttt -73.3 139.6 ? ? ? ? + 304 1 A A ILE 342 345 ? ? 1 11 ? 8 0.147 0.964 0.799 76.030 1.000 Favored pt -140.0 171.7 ? ? ? ? + 305 1 A A ASN 343 346 ? ? 1 6 ? 8 0.182 0.948 1.114 82.730 1.000 Favored m-80 52.8 49.6 ? ? ? ? + 306 1 A A GLN 344 347 ? ? 1 2 ? 5 0.193 0.807 1.002 97.730 1.000 ? ? ? ? ? ? ? ? + # ... + 307 2 A B PEG 401 . ? A 1 10 ? 14 0.154 0.914 ? 36.150 1.000 ? ? ? ? 0.76 0.64 5 6 + 308 2 A B PEG 401 . ? B 1 10 ? 14 0.154 0.914 ? 36.150 1.000 ? ? ? ? 0.97 0.68 5 6 + 309 3 A C HYO 402 . ? ? 1 ? ? 21 0.108 0.947 ? 35.530 1.000 ? ? ? ? 2.18 4.96 32 23 + 310 4 A D NI 403 . ? ? 1 ? ? 1 0.096 0.999 ? 28.080 1.000 ? ? ? ? ? ? ? ? + 311 5 A E OGA 404 . ? ? 1 3 ? 10 0.104 0.976 ? 30.510 1.000 ? ? ? ? 1.87 3.23 4 3 + 312 6 A F EDO 405 . ? ? 1 6 ? 4 0.097 0.941 ? 42.000 1.000 ? ? ? ? 
0.32 0.80 2 3 + 313 6 A G EDO 406 . ? ? 1 6 ? 4 0.252 0.797 ? 57.320 1.000 ? ? ? ? 0.73 0.61 2 3 + 314 7 A H SR 407 . ? ? 1 ? ? 1 0.143 1.000 ? 30.560 0.840 ? ? ? ? ? ? ? ? + 315 8 A I UNX 408 . ? ? 1 ? ? 1 0.321 0.940 ? 41.340 1.000 ? ? ? ? ? ? ? ? + 316 8 A J UNX 409 . ? ? 1 ? ? 1 0.611 0.922 ? 61.040 1.000 ? ? ? ? ? ? ? ? + # ... + """ + logger.debug("Starting with %r", dataContainer.getName()) + # + rD = {} + try: + # Exit if no source categories are present + if not ( + dataContainer.exists("pdbx_vrpt_instance_results") + or dataContainer.exists("pdbx_vrpt_bond_outliers") + or dataContainer.exists("pdbx_vrpt_angle_outliers") + or dataContainer.exists("pdbx_vrpt_mogul_bond_outliers") + or dataContainer.exists("pdbx_vrpt_mogul_angle_outliers") + ): + return rD + # ------- --------- ------- --------- ------- --------- ------- --------- ------- --------- + nonPolyMissingAtomD = self.getUnobservedNonPolymerAtomInfo(dataContainer) + # + instanceModelOutlierD = {} + instanceModelValidationD = {} + vObj = None + if dataContainer.exists("pdbx_vrpt_bond_outliers"): + vObj = dataContainer.getObj("pdbx_vrpt_bond_outliers") + if vObj: + for ii in range(vObj.getRowCount()): + seqId = vObj.getValueOrDefault("label_seq_id", ii, defaultValue=None) + if seqId: + modelId = vObj.getValueOrDefault("PDB_model_num", ii, defaultValue=None) + asymId = vObj.getValueOrDefault("label_asym_id", ii, defaultValue=None) + compId = vObj.getValueOrDefault("label_comp_id", ii, defaultValue=None) + # + atomI = vObj.getValueOrDefault("atom0", ii, defaultValue=None) + atomJ = vObj.getValueOrDefault("atom1", ii, defaultValue=None) + obsDist = vObj.getValueOrDefault("obs", ii, defaultValue=None) + zVal = vObj.getValueOrDefault("Z", ii, defaultValue=None) + tS = "%s-%s dist=%s Z=%s" % (atomI, atomJ, obsDist, zVal) + # + instanceModelOutlierD.setdefault((modelId, asymId, True), []).append( + OutlierValue( + compId, + int(seqId), + "BOND_OUTLIER", + tS, + ) + ) + # + logger.debug("length instanceModelOutlierD %d", len(instanceModelOutlierD)) + # ---- + vObj = None + if dataContainer.exists("pdbx_vrpt_angle_outliers"): + vObj = dataContainer.getObj("pdbx_vrpt_angle_outliers") + if vObj: + for ii in range(vObj.getRowCount()): + seqId = vObj.getValueOrDefault("label_seq_id", ii, defaultValue=None) + if seqId: + modelId = vObj.getValueOrDefault("PDB_model_num", ii, defaultValue=None) + asymId = vObj.getValueOrDefault("label_asym_id", ii, defaultValue=None) + compId = vObj.getValueOrDefault("label_comp_id", ii, defaultValue=None) + # + atomI = vObj.getValueOrDefault("atom0", ii, defaultValue=None) + atomJ = vObj.getValueOrDefault("atom1", ii, defaultValue=None) + atomK = vObj.getValueOrDefault("atom2", ii, defaultValue=None) + obsDist = vObj.getValueOrDefault("obs", ii, defaultValue=None) + zVal = vObj.getValueOrDefault("Z", ii, defaultValue=None) + tS = "%s-%s-%s angle=%s Z=%s" % (atomI, atomJ, atomK, obsDist, zVal) + # + instanceModelOutlierD.setdefault((modelId, asymId, True), []).append( + OutlierValue( + compId, + int(seqId), + "ANGLE_OUTLIER", + tS, + ) + ) + # + logger.debug("length instanceModelOutlierD %d", len(instanceModelOutlierD)) + # ---- + vObj = None + if dataContainer.exists("pdbx_vrpt_mogul_bond_outliers"): + vObj = dataContainer.getObj("pdbx_vrpt_mogul_bond_outliers") + if vObj: + for ii in range(vObj.getRowCount()): + seqId = vObj.getValueOrDefault("label_seq_id", ii, defaultValue=None) + + modelId = vObj.getValueOrDefault("PDB_model_num", ii, defaultValue=None) + asymId = vObj.getValueOrDefault("label_asym_id", 
ii, defaultValue=None) + compId = vObj.getValueOrDefault("label_comp_id", ii, defaultValue=None) + # + atoms = vObj.getValueOrDefault("atoms", ii, defaultValue=None) + obsDist = vObj.getValueOrDefault("obsval", ii, defaultValue=None) + meanValue = vObj.getValueOrDefault("mean", ii, defaultValue=None) + zVal = vObj.getValueOrDefault("Zscore", ii, defaultValue=None) + tS = "%s angle=%s Z=%s" % (atoms, obsDist, zVal) + # OutlierValue = collections.namedtuple("OutlierValue", "compId, seqId, outlierType, description, reported, reference, uncertaintyValue, uncertaintyType") + if seqId: + instanceModelOutlierD.setdefault((modelId, asymId, True), []).append( + OutlierValue( + compId, + int(seqId), + "MOGUL_BOND_OUTLIER", + tS, + ) + ) + else: + instanceModelOutlierD.setdefault((modelId, asymId, False), []).append( + OutlierValue(compId, None, "MOGUL_BOND_OUTLIER", tS, obsDist, meanValue, zVal, "Z-Score") + ) + # + logger.debug("length instanceModelOutlierD %d", len(instanceModelOutlierD)) + + vObj = None + if dataContainer.exists("pdbx_vrpt_mogul_angle_outliers"): + vObj = dataContainer.getObj("pdbx_vrpt_mogul_angle_outliers") + if vObj: + for ii in range(vObj.getRowCount()): + seqId = vObj.getValueOrDefault("label_seq_id", ii, defaultValue=None) + + modelId = vObj.getValueOrDefault("PDB_model_num", ii, defaultValue=None) + asymId = vObj.getValueOrDefault("label_asym_id", ii, defaultValue=None) + compId = vObj.getValueOrDefault("label_comp_id", ii, defaultValue=None) + # + atoms = vObj.getValueOrDefault("atoms", ii, defaultValue=None) + obsDist = vObj.getValueOrDefault("obsval", ii, defaultValue=None) + meanValue = vObj.getValueOrDefault("mean", ii, defaultValue=None) + zVal = vObj.getValueOrDefault("Zscore", ii, defaultValue=None) + tS = "%s angle=%s Z=%s" % (atoms, obsDist, zVal) + if seqId: + instanceModelOutlierD.setdefault((modelId, asymId, True), []).append( + OutlierValue( + compId, + int(seqId), + "MOGUL_ANGLE_OUTLIER", + tS, + ) + ) + else: + instanceModelOutlierD.setdefault((modelId, asymId, False), []).append( + OutlierValue(compId, None, "MOGUL_ANGLE_OUTLIER", tS, obsDist, meanValue, zVal, "Z-Score") + ) + logger.debug("length instanceModelOutlierD %d", len(instanceModelOutlierD)) + # + # + vObj = None + if dataContainer.exists("pdbx_vrpt_instance_results"): + vObj = dataContainer.getObj("pdbx_vrpt_instance_results") + + if vObj: + logger.debug("Row count for %s: %d", vObj.getName(), vObj.getRowCount()) + for ii in range(vObj.getRowCount()): + seqId = vObj.getValueOrDefault("label_seq_id", ii, defaultValue=None) + modelId = vObj.getValueOrDefault("PDB_model_num", ii, defaultValue=None) + asymId = vObj.getValueOrDefault("label_asym_id", ii, defaultValue=None) + compId = vObj.getValueOrDefault("label_comp_id", ii, defaultValue=None) + # + rotamerClass = vObj.getValueOrDefault("rotamer_class", ii, defaultValue=None) + ramaClass = vObj.getValueOrDefault("ramachandran_class", ii, defaultValue=None) + rsr = vObj.getValueOrDefault("RSR", ii, defaultValue=None) + rsrZ = vObj.getValueOrDefault("RSRZ", ii, defaultValue=None) + rsrCc = vObj.getValueOrDefault("RSRCC", ii, defaultValue=None) + # + anglesRmsZ = vObj.getValueOrDefault("mogul_angles_RMSZ", ii, defaultValue=None) + bondsRmsZ = vObj.getValueOrDefault("mogul_bonds_RMSZ", ii, defaultValue=None) + # + if seqId: + if rotamerClass and rotamerClass.upper() == "OUTLIER": + instanceModelOutlierD.setdefault((modelId, asymId, True), []).append( + OutlierValue( + compId, + int(seqId), + "ROTAMER_OUTLIER", + None, + ) + ) + if ramaClass and 
ramaClass.upper() == "OUTLIER": + instanceModelOutlierD.setdefault((modelId, asymId, True), []).append( + OutlierValue( + compId, + int(seqId), + "RAMACHANDRAN_OUTLIER", + None, + ) + ) + if rsrZ and float(rsrZ) > 2.0: + tS = "%s > 2.0" % rsrZ + instanceModelOutlierD.setdefault((modelId, asymId, True), []).append( + OutlierValue( + compId, + int(seqId), + "RSRZ_OUTLIER", + tS, + ) + ) + if rsrCc and float(rsrCc) < 0.650: + tS = "RSCC < 0.65" + instanceModelOutlierD.setdefault((modelId, asymId, True), []).append( + OutlierValue( + compId, + int(seqId), + "RSCC_OUTLIER", + tS, + ) + ) + else: + if rsrZ and float(rsrZ) > 2.0: + tS = "%s > 2.0" % rsrZ + instanceModelOutlierD.setdefault((modelId, asymId, False), []).append(OutlierValue(compId, None, "RSRZ_OUTLIER", tS, rsr, None, rsrZ, "Z-Score")) + if rsrCc and float(rsrCc) < 0.650: + tS = "RSCC < 0.65" + instanceModelOutlierD.setdefault((modelId, asymId, False), []).append(OutlierValue(compId, None, "RSCC_OUTLIER", tS, rsrCc)) + # NonpolymerValidationFields = ("rsr", "rscc", "mogul_bonds_rmsz", "mogul_angles_rmsz", "heavy_atom_count", "modeled_heavy_atom_count") + # "nonPolyMissingAtomD": {(modelId, compId, asymId, zeroOccFlag): [atomId,...], }, + missingAtomCount = len(nonPolyMissingAtomD[(modelId, compId, asymId, 0)]) if (modelId, compId, asymId, 0) in nonPolyMissingAtomD else 0 + missingAtomCount += len(nonPolyMissingAtomD[(modelId, compId, asymId, 1)]) if (modelId, compId, asymId, 1) in nonPolyMissingAtomD else 0 + instanceModelValidationD[(modelId, asymId, compId)] = NonpolymerValidationInstance( + float(rsr) if rsr else None, + float(rsrCc) if rsrCc else None, + float(bondsRmsZ) if bondsRmsZ else None, + float(anglesRmsZ) if anglesRmsZ else None, + missingAtomCount, + ) + if missingAtomCount > 0: + logger.debug("%s %s missing atom count %d", dataContainer.getName(), compId, missingAtomCount) + # + logger.debug("instanceModelOutlierD %r", instanceModelOutlierD) + logger.debug("instanceModelValidationD %r", instanceModelValidationD) + + rD = {"instanceModelOutlierD": instanceModelOutlierD, "instanceModelValidationD": instanceModelValidationD} + except Exception as e: + logger.exception("%s failing with %s", dataContainer.getName(), str(e)) + return rD diff --git a/rcsb/utils/dictionary/DictMethodEntityHelper.py b/rcsb/utils/dictionary/DictMethodEntityHelper.py new file mode 100644 index 0000000..8bc9eaf --- /dev/null +++ b/rcsb/utils/dictionary/DictMethodEntityHelper.py @@ -0,0 +1,1960 @@ +## +# File: DictMethodEntityHelper.py +# Author: J. Westbrook +# Date: 16-Jul-2019 +# Version: 0.001 Initial version +# +## +""" +Helper class implements methods supporting entity-level item and category methods in the RCSB dictionary extension. 
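+ +A minimal usage sketch (illustrative only; assumes a configured +DictMethodResourceProvider instance ``rP`` and a dataContainer holding a parsed PDBx/mmCIF entry): + + from rcsb.utils.dictionary.DictMethodEntityHelper import DictMethodEntityHelper + + helper = DictMethodEntityHelper(resourceProvider=rP) + # populate rcsb_entity_container_identifiers for each entity + helper.buildContainerEntityIds(dataContainer, "rcsb_entity_container_identifiers") + # add polymer reference sequence alignments (PDB and, if configured, SIFTS) + helper.addPolymerEntityReferenceAlignments(dataContainer, "rcsb_polymer_entity_align")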
+ +""" +__docformat__ = "restructuredtext en" +__author__ = "John Westbrook" +__email__ = "jwest@rcsb.rutgers.edu" +__license__ = "Apache 2.0" + +# pylint: disable=too-many-lines + +import functools +import itertools +import logging +import re + +from collections import defaultdict, OrderedDict + +from mmcif.api.DataCategory import DataCategory +from rcsb.utils.seq.SeqAlign import splitSeqAlignObjList + +logger = logging.getLogger(__name__) + + +def cmpElements(lhs, rhs): + return 0 if (lhs[-1].isdigit() or lhs[-1] in ["R", "S"]) and rhs[0].isdigit() else -1 + + +class DictMethodEntityHelper(object): + """Helper class implements methods supporting entity-level item and category methods in the RCSB dictionary extension.""" + + def __init__(self, **kwargs): + """ + Args: + resourceProvider: (obj) instance of DictMethodResourceProvider() + + """ + # + self._raiseExceptions = kwargs.get("raiseExceptions", False) + self.__wsPattern = re.compile(r"\s+", flags=re.UNICODE | re.MULTILINE) + self.__reNonDigit = re.compile(r"[^\d]+") + # + rP = kwargs.get("resourceProvider") + self.__commonU = rP.getResource("DictMethodCommonUtils instance") if rP else None + self.__dApi = rP.getResource("Dictionary API instance (pdbx_core)") if rP else None + # + self.__useSiftsAlign = rP.getReferenceSequenceAlignmentOpt() == "SIFTS" + # logger.info("SIFTS alignment option %r", self.__useSiftsAlign) + # + self.__ssP = None + if self.__useSiftsAlign: + self.__ssP = rP.getResource("SiftsSummaryProvider instance") if rP else None + self.__useSiftsAlign = False if not self.__ssP else self.__useSiftsAlign + # + self.__ccP = rP.getResource("ChemCompProvider instance") if rP else None + + # + logger.debug("Dictionary entity method helper init") + + def __processSiftsAlignments(self, dataContainer): + # + tObj = dataContainer.getObj("entry") + entryId = tObj.getValue("id", 0) + # + asymIdD = self.__commonU.getInstanceEntityMap(dataContainer) + asymAuthIdD = self.__commonU.getAsymAuthIdMap(dataContainer) + instTypeD = self.__commonU.getInstanceTypes(dataContainer) + siftsEntityAlignD = {} + # + # Process sifts alignments - + siftsAlignD = {} + for asymId, authAsymId in asymAuthIdD.items(): + if instTypeD[asymId] not in ["polymer", "branched"]: + continue + entityId = asymIdD[asymId] + # accumulate the sifts alignments by entity. 
+ # siftsAlignD.setdefault((entryId, entityId), []).extend([SeqAlign("SIFTS", **sa) for sa in self.__ssP.getIdentifiers(entryId, authAsymId, idType="UNPAL")]) + siftsAlignD.setdefault((entryId, entityId), []).extend(self.__ssP.getSeqAlignObjList(entryId, authAsymId)) + for (entryId, entityId), seqAlignObjL in siftsAlignD.items(): + if seqAlignObjL: + # re-group alignments by common accession + alRefD = {} + for seqAlignObj in seqAlignObjL: + alRefD.setdefault((seqAlignObj.getDbName(), seqAlignObj.getDbAccession(), seqAlignObj.getDbIsoform()), []).append(seqAlignObj) + # + # Get the longest overlapping entity region of each ref alignment - + for (dbName, dbAcc, dbIsoform), aL in alRefD.items(): + alGrpD = splitSeqAlignObjList(aL) + logger.debug("SIFTS -> entryId %s entityId %s dbName %r dbAcc %r dbIsoform %r alGrpD %r", entryId, entityId, dbName, dbAcc, dbIsoform, alGrpD) + for _, grpAlignL in alGrpD.items(): + + lenL = [seqAlignObj.getEntityAlignLength() for seqAlignObj in grpAlignL] + idxMax = lenL.index(max(lenL)) + siftsEntityAlignD.setdefault((entryId, entityId, "SIFTS"), {}).setdefault((dbName, dbAcc, dbIsoform), []).append(grpAlignL[idxMax]) + # + logger.debug("PROCESSED SIFTS -> %r", siftsEntityAlignD) + return siftsEntityAlignD + + def __processPdbAlignments(self, dataContainer): + # + tObj = dataContainer.getObj("entry") + entryId = tObj.getValue("id", 0) + # + entityRefAlignmentD = self.__commonU.getEntityReferenceAlignments(dataContainer) + pdbEntityAlignD = {} + # --- PDB alignments - + for entityId, seqAlignObjL in entityRefAlignmentD.items(): + # seqAlignObjL = [SeqAlign("PDB", **sa) for sa in entityAlignL] + if seqAlignObjL: + alRefD = {} + for seqAlignObj in seqAlignObjL: + alRefD.setdefault((seqAlignObj.getDbName(), seqAlignObj.getDbAccession(), seqAlignObj.getDbIsoform()), []).append(seqAlignObj) + for (dbName, dbAcc, dbIsoform), aL in alRefD.items(): + alGrpD = splitSeqAlignObjList(aL) + logger.debug("PDB -> entryId %s entityId %s dbName %r dbAcc %r dbIsoform %r alGrpD %r", entryId, entityId, dbName, dbAcc, dbIsoform, alGrpD) + for _, grpAlignL in alGrpD.items(): + # get the longest overlapping entity region of each ref seq - + lenL = [seqAlignObj.getEntityAlignLength() for seqAlignObj in grpAlignL] + idxMax = lenL.index(max(lenL)) + try: + tLen = grpAlignL[idxMax].getEntityAlignLength() + if tLen and tLen > 0: + pdbEntityAlignD.setdefault((entryId, entityId, "PDB"), {}).setdefault((dbName, dbAcc, dbIsoform), []).append(grpAlignL[idxMax]) + else: + logger.warning("Skipping %s inconsistent alignment for entity %r %r", entryId, entityId, seqAlignObjL) + except Exception: + pass + # + logger.debug("PROCESSED PDB -> %r", pdbEntityAlignD) + return pdbEntityAlignD + + def addPolymerEntityReferenceAlignments(self, dataContainer, catName, **kwargs): + """Build category rcsb_polymer_entity_align with consolidated reference sequence alignments for each polymer entity. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + Example: + _rcsb_polymer_entity_align.ordinal + _rcsb_polymer_entity_align.entry_id + _rcsb_polymer_entity_align.entity_id + # + _rcsb_polymer_entity_align.reference_database_name + _rcsb_polymer_entity_align.reference_database_accession + _rcsb_polymer_entity_align.provenance_source + # + _rcsb_polymer_entity_align.aligned_regions_ref_beg_seq_id + _rcsb_polymer_entity_align.aligned_regions_entity_beg_seq_id + _rcsb_polymer_entity_align.aligned_regions_length + # + """ + dbNameMapD = self.__commonU.getDatabaseNameMap() + logger.debug("Starting %s catName %s kwargs %r", dataContainer.getName(),
catName, kwargs) + try: + if not (dataContainer.exists("entry") and dataContainer.exists("entity")): + return False + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + # + cObj = dataContainer.getObj(catName) + # + pdbEntityAlignD = self.__processPdbAlignments(dataContainer) + # + if self.__useSiftsAlign: + siftsEntityAlignD = self.__processSiftsAlignments(dataContainer) + logger.debug("siftsEntityAlignD %d", len(siftsEntityAlignD)) + # + for (entryId, entityId, provSource), refD in siftsEntityAlignD.items(): + if (entryId, entityId, "PDB") in pdbEntityAlignD: + del pdbEntityAlignD[(entryId, entityId, "PDB")] + pdbEntityAlignD.update({(entryId, entityId, provSource): refD}) + # + # --- + + iRow = cObj.getRowCount() + for (entryId, entityId, provSource), refD in pdbEntityAlignD.items(): + # + for (dbName, dbAcc, dbIsoform), saoL in refD.items(): + # + if dbName not in dbNameMapD: + logger.error("Skipping unsupported reference database %r for entry %s entity %s", dbName, entryId, entityId) + continue + # + cObj.setValue(iRow + 1, "ordinal", iRow) + cObj.setValue(entryId, "entry_id", iRow) + cObj.setValue(entityId, "entity_id", iRow) + # + dispDbName = dbNameMapD[dbName] + cObj.setValue(dispDbName, "reference_database_name", iRow) + cObj.setValue(dbAcc, "reference_database_accession", iRow) + if dbIsoform: + cObj.setValue(dbIsoform, "reference_database_isoform", iRow) + cObj.setValue(provSource, "provenance_source", iRow) + # + cObj.setValue(",".join([str(sao.getDbSeqIdBeg()) for sao in saoL]), "aligned_regions_ref_beg_seq_id", iRow) + cObj.setValue(",".join([str(sao.getEntitySeqIdBeg()) for sao in saoL]), "aligned_regions_entity_beg_seq_id", iRow) + cObj.setValue(",".join([str(sao.getEntityAlignLength()) for sao in saoL]), "aligned_regions_length", iRow) + iRow += 1 + + return True + except Exception as e: + logger.exception("For %s %s failing with %s", dataContainer.getName(), catName, str(e)) + return False + + # + + def buildContainerEntityIds(self, dataContainer, catName, **kwargs): + """Load the input category with rcsb_entity_container_identifiers content. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + For example, build: + + loop_ + _rcsb_entity_container_identifiers.entry_id + _rcsb_entity_container_identifiers.entity_id + # + _rcsb_entity_container_identifiers.asym_ids + _rcsb_entity_container_identifiers.auth_asym_ids + # + _rcsb_entity_container_identifiers.nonpolymer_comp_id + _rcsb_entity_container_identifiers.chem_comp_monomers + + _rcsb_entity_container_identifiers.prd_id + ...
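+ + For instance (illustrative values only): + + _rcsb_entity_container_identifiers.entry_id 1ABC + _rcsb_entity_container_identifiers.entity_id 1 + _rcsb_entity_container_identifiers.asym_ids A,B + _rcsb_entity_container_identifiers.auth_asym_ids A,B + _rcsb_entity_container_identifiers.chem_comp_monomers ALA,GLY,MSE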
+ """ + logger.debug("Starting catName %s kwargs %r", catName, kwargs) + try: + if not (dataContainer.exists("entry") and dataContainer.exists("entity")): + return False + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + # + cObj = dataContainer.getObj(catName) + # + tObj = dataContainer.getObj("entry") + entryId = tObj.getValue("id", 0) + cObj.setValue(entryId, "entry_id", 0) + # + tObj = dataContainer.getObj("entity") + entityIdL = tObj.getAttributeValueList("id") + seqEntityRefDbD = self.__commonU.getEntitySequenceReferenceCodes(dataContainer) + # + entityTypeUniqueIds = self.__commonU.getEntityTypeUniqueIds(dataContainer) + entityPolymerModMonomerIds = self.__commonU.getPolymerEntityModifiedMonomers(dataContainer) + # ------- + eTypeD = self.__commonU.getEntityTypes(dataContainer) + aObj = dataContainer.getObj("struct_asym") + if not aObj.hasAttribute("rcsb_entity_type"): + aObj.appendAttribute("rcsb_entity_type") + for ii in range(aObj.getRowCount()): + entityId = aObj.getValue("entity_id", ii) + aObj.setValue(eTypeD[entityId], "rcsb_entity_type", ii) + # --------- + ii = 0 + for entityId in entityIdL: + cObj.setValue(entryId, "entry_id", ii) + cObj.setValue(entityId, "entity_id", ii) + cObj.setValue(entryId + "_" + entityId, "rcsb_id", ii) + eType = tObj.getValue("type", ii) + asymIdL = [] + authAsymIdL = [] + ccMonomerL = [] + ccLigandL = [] + modPolymerMonomerL = entityPolymerModMonomerIds[entityId] if entityId in entityPolymerModMonomerIds else [] + # + refSeqIdD = {"dbName": [], "dbAccession": [], "provSource": [], "dbIsoform": []} + + asymIdL = entityTypeUniqueIds[eType][entityId]["asymIds"] if eType in entityTypeUniqueIds else [] + authAsymIdL = entityTypeUniqueIds[eType][entityId]["authAsymIds"] if eType in entityTypeUniqueIds else [] + ccMonomerL = entityTypeUniqueIds[eType][entityId]["ccIds"] if eType in entityTypeUniqueIds else [] + + if eType in ["polymer", "non-polymer", "branched"] and not asymIdL: + logger.warning("%s inconsistent molecular system (no instances) for %r entity %s", entryId, eType, entityId) + # + if eType == "polymer": + + if self.__useSiftsAlign: + dbIdL = [] + for authAsymId in authAsymIdL: + dbIdL.extend(self.__ssP.getIdentifiers(entryId, authAsymId, idType="UNPID")) + # If SIFTS is defined + if dbIdL: + for dbId in sorted(set(dbIdL)): + refSeqIdD["dbName"].append("UniProt") + refSeqIdD["provSource"].append("SIFTS") + refSeqIdD["dbAccession"].append(dbId) + refSeqIdD["dbIsoform"].append("?") + # else fallback to PDB + elif entityId in seqEntityRefDbD: + for dbD in seqEntityRefDbD[entityId]: + refSeqIdD["dbName"].append(dbD["dbName"]) + refSeqIdD["provSource"].append("PDB") + refSeqIdD["dbAccession"].append(dbD["dbAccession"]) + # + if dbD["dbIsoform"]: + refSeqIdD["dbIsoform"].append(dbD["dbIsoform"]) + else: + refSeqIdD["dbIsoform"].append("?") + + else: + if entityId in seqEntityRefDbD: + for dbD in seqEntityRefDbD[entityId]: + refSeqIdD["dbName"].append(dbD["dbName"]) + refSeqIdD["provSource"].append("PDB") + refSeqIdD["dbAccession"].append(dbD["dbAccession"]) + # + if dbD["dbIsoform"]: + refSeqIdD["dbIsoform"].append(dbD["dbIsoform"]) + else: + refSeqIdD["dbIsoform"].append("?") + + # + # logger.info("refSeqIdD %r %r %r", entryId, entityId, refSeqIdD) + + if asymIdL: + cObj.setValue(",".join(sorted(set(asymIdL))).strip(), "asym_ids", ii) + if authAsymIdL: + cObj.setValue(",".join(sorted(set(authAsymIdL))).strip(), "auth_asym_ids", ii) + if ccMonomerL 
and eType in ["branched", "polymer"]: + cObj.setValue(",".join(sorted(set(ccMonomerL))).strip(), "chem_comp_monomers", ii) + else: + cObj.setValue("?", "chem_comp_monomers", ii) + # + if modPolymerMonomerL: + cObj.setValue(",".join(sorted(set(modPolymerMonomerL))).strip(), "chem_comp_nstd_monomers", ii) + else: + cObj.setValue("?", "chem_comp_nstd_monomers", ii) + # + if eType in ["non-polymer"] and ccMonomerL: + cObj.setValue(",".join(sorted(set(ccLigandL))).strip(), "nonpolymer_comp_id", ii) + else: + cObj.setValue("?", "nonpolymer_comp_id", ii) + # + if refSeqIdD["dbName"]: + cObj.setValue(",".join(refSeqIdD["dbName"]).strip(), "reference_sequence_identifiers_database_name", ii) + cObj.setValue(",".join(refSeqIdD["dbAccession"]).strip(), "reference_sequence_identifiers_database_accession", ii) + cObj.setValue(",".join(refSeqIdD["provSource"]).strip(), "reference_sequence_identifiers_provenance_source", ii) + cObj.setValue(",".join(refSeqIdD["dbIsoform"]).strip(), "reference_sequence_identifiers_database_isoform", ii) + # + + ii += 1 + _ = self.__addEntityCompIds(dataContainer) + _ = self.__addBirdEntityIds(dataContainer) + + return True + except Exception as e: + logger.exception("For %s %s failing with %s", dataContainer.getName(), catName, str(e)) + return False + + def __salvageMissingTaxonomy(self, dataContainer, **kwargs): + """Add missing taxonomy identifiers using scientific name as a surogate. + + Args: + dataContainer (obj): data container object + + Returns: + bool: True for success or False otherwise + """ + # + ok = False + try: + rP = kwargs.get("resourceProvider") + taxU = rP.getResource("TaxonomyProvider instance") if rP else None + # "pdbx_gene_src_scientific_name" "pdbx_gene_src_ncbi_taxonomy_id" + for catName, atSn, atTaxId in [ + ("entity_src_gen", "pdbx_gene_src_scientific_name", "pdbx_gene_src_ncbi_taxonomy_id"), + ("entity_src_gen", "pdbx_host_org_scientific_name", "pdbx_host_org_ncbi_taxonomy_id"), + ("entity_src_nat", "pdbx_organism_scientific", "pdbx_ncbi_taxonomy_id"), + ("entity_src_syn", "organism_scientific", "ncbi_taxonomy_id"), + ]: + if dataContainer.exists(catName): + sObj = dataContainer.getObj(catName) + for ii in range(sObj.getRowCount()): + taxId = sObj.getValueOrDefault(atTaxId, ii, defaultValue=None) + # + if taxId: + continue + sn = sObj.getValueOrDefault(atSn, ii, defaultValue=None) + if sn: + taxId = taxU.getTaxId(sn) + if taxId: + if not sObj.hasAttribute(atTaxId): + sObj.appendAttribute(atTaxId) + logger.debug("%s salvaged taxId %r using %r", dataContainer.getName(), taxId, sn) + sObj.setValue(str(taxId), atTaxId, ii) + else: + logger.warning("%s taxId salvage fails for scientific name %s", dataContainer.getName(), sn) + ok = True + except Exception as e: + logger.exception("Failing for %r with %s", dataContainer.getName(), str(e)) + + return ok + + def filterSourceOrganismDetails(self, dataContainer, catName, **kwargs): + """Load new categories rcsb_entity_source_organism and rcsb_entity_host_organism + and add related source flags in the entity category. 
+ + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + For instance, select relevant source and host organism details from + primary data categories and load the following. + + Build: + loop_ + _rcsb_entity_source_organism.entity_id + _rcsb_entity_source_organism.pdbx_src_id + _rcsb_entity_source_organism.source_type + _rcsb_entity_source_organism.scientific_name + _rcsb_entity_source_organism.common_name + _rcsb_entity_source_organism.ncbi_taxonomy_id + _rcsb_entity_source_organism.provenance_source + _rcsb_entity_source_organism.beg_seq_num + _rcsb_entity_source_organism.end_seq_num + _rcsb_entity_source_organism.taxonomy_lineage_id + _rcsb_entity_source_organism.taxonomy_lineage_name + _rcsb_entity_source_organism.taxonomy_lineage_depth + 1 1 natural 'Homo sapiens' human 9606 'PDB Primary Data' 1 202 . . . + # ... abbreviated + + + loop_ + _rcsb_entity_host_organism.entity_id + _rcsb_entity_host_organism.pdbx_src_id + _rcsb_entity_host_organism.scientific_name + _rcsb_entity_host_organism.common_name + _rcsb_entity_host_organism.ncbi_taxonomy_id + _rcsb_entity_host_organism.provenance_source + _rcsb_entity_host_organism.beg_seq_num + _rcsb_entity_host_organism.end_seq_num + _rcsb_entity_host_organism.taxonomy_lineage_id + _rcsb_entity_host_organism.taxonomy_lineage_name + _rcsb_entity_host_organism.taxonomy_lineage_depth + 1 1 'Escherichia coli' 'E. coli' 562 'PDB Primary Data' 1 102 . . . + # ... abbreviated + + And two related items - + + _entity.rcsb_multiple_source_flag + _entity.rcsb_source_part_count + + """ + # + hostCatName = "rcsb_entity_host_organism" + try: + logger.debug("Starting with %r %r", dataContainer.getName(), catName) + if catName == hostCatName: + logger.debug("Skipping method for %r %r", dataContainer.getName(), catName) + return True + # + # if there is no source information then exit + if not (dataContainer.exists("entity_src_gen") or dataContainer.exists("entity_src_nat") or dataContainer.exists("pdbx_entity_src_syn")): + return False + # + # Try to supply missing taxIds + self.__salvageMissingTaxonomy(dataContainer, **kwargs) + # + # Create the new target category + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + # + if not dataContainer.exists(hostCatName): + dataContainer.append(DataCategory(hostCatName, attributeNameList=self.__dApi.getAttributeNameList(hostCatName))) + # + rP = kwargs.get("resourceProvider") + taxU = rP.getResource("TaxonomyProvider instance") if rP else None + # + cObj = dataContainer.getObj(catName) + hObj = dataContainer.getObj(hostCatName) + # + s1Obj = dataContainer.getObj("entity_src_gen") + atHTupL = [ + ("entity_id", "entity_id"), + ("pdbx_host_org_scientific_name", "scientific_name"), + ("pdbx_host_org_common_name", "common_name"), + ("pdbx_host_org_ncbi_taxonomy_id", "ncbi_taxonomy_id"), + ("pdbx_src_id", "pdbx_src_id"), + ("pdbx_beg_seq_num", "beg_seq_num"), + ("pdbx_end_seq_num", "end_seq_num"), + ] + atHSL, atHL = self.__getAttribList(s1Obj, atHTupL) + # + at1TupL = [ + ("entity_id", "entity_id"), + ("pdbx_gene_src_scientific_name", "scientific_name"), + ("gene_src_common_name", "common_name"), + ("pdbx_gene_src_ncbi_taxonomy_id", "ncbi_taxonomy_id"), + ("pdbx_src_id", "pdbx_src_id"), + ("pdbx_beg_seq_num", "beg_seq_num"), + ("pdbx_end_seq_num", "end_seq_num"), + ("pdbx_gene_src_gene", "rcsb_gene_name_value"), + ] + at1SL, at1L =
self.__getAttribList(s1Obj, at1TupL) + # + s2Obj = dataContainer.getObj("entity_src_nat") + at2TupL = [ + ("entity_id", "entity_id"), + ("pdbx_organism_scientific", "scientific_name"), + ("nat_common_name", "common_name"), + ("pdbx_ncbi_taxonomy_id", "ncbi_taxonomy_id"), + ("pdbx_src_id", "pdbx_src_id"), + ("pdbx_beg_seq_num", "beg_seq_num"), + ("pdbx_end_seq_num", "end_seq_num"), + ] + at2SL, at2L = self.__getAttribList(s2Obj, at2TupL) + # + s3Obj = dataContainer.getObj("pdbx_entity_src_syn") + at3TupL = [ + ("entity_id", "entity_id"), + ("organism_scientific", "scientific_name"), + ("organism_common_name", "common_name"), + ("ncbi_taxonomy_id", "ncbi_taxonomy_id"), + ("pdbx_src_id", "pdbx_src_id"), + ("beg_seq_num", "beg_seq_num"), + ("end_seq_num", "end_seq_num"), + ] + at3SL, at3L = self.__getAttribList(s3Obj, at3TupL) + # + eObj = dataContainer.getObj("entity") + entityIdL = eObj.getAttributeValueList("id") + provSource = "PDB Primary Data" + # + partCountD = {} + srcL = [] + hostL = [] + for entityId in entityIdL: + partCountD[entityId] = 0 + eL = [] + tf = False + if s1Obj: + sType = "genetically engineered" + vL = s1Obj.selectValueListWhere(at1SL, entityId, "entity_id") + if vL: + for v in vL: + eL.append((entityId, sType, at1L, v)) + logger.debug("%r entity %r - %r", sType, entityId, vL) + partCountD[entityId] = len(eL) + srcL.extend(eL) + tf = True + # + vL = s1Obj.selectValueListWhere(atHSL, entityId, "entity_id") + if vL: + for v in vL: + hostL.append((entityId, sType, atHL, v)) + logger.debug("%r entity %r - %r", sType, entityId, vL) + if tf: + continue + + if s2Obj: + sType = "natural" + vL = s2Obj.selectValueListWhere(at2SL, entityId, "entity_id") + if vL: + for v in vL: + eL.append((entityId, sType, at2L, v)) + logger.debug("%r entity %r - %r", sType, entityId, vL) + partCountD[entityId] = len(eL) + srcL.extend(eL) + continue + + if s3Obj: + sType = "synthetic" + vL = s3Obj.selectValueListWhere(at3SL, entityId, "entity_id") + if vL: + for v in vL: + eL.append((entityId, sType, at3L, v)) + logger.debug("%r entity %r - %r", sType, entityId, vL) + partCountD[entityId] = len(eL) + srcL.extend(eL) + continue + + iRow = 0 + entryTaxIdD = defaultdict(int) + entityTaxIdD = {} + for (entityId, sType, atL, tv) in srcL: + ii = atL.index("ncbi_taxonomy_id") if "ncbi_taxonomy_id" in atL else -1 + if ii > 0 and len(tv[ii].split(",")) > 1: + tvL = self.__normalizeCsvToList(dataContainer.getName(), tv) + ii = atL.index("pdbx_src_id") if "pdbx_src_id" in atL else -1 + for jj, row in enumerate(tvL, 1): + row[ii] = str(jj) + partCountD[entityId] = len(tvL) + else: + tvL = [tv] + for v in tvL: + cObj.setValue(sType, "source_type", iRow) + cObj.setValue(provSource, "provenance_source", iRow) + for ii, at in enumerate(atL): + # add check for missing values here + if at in ["rcsb_gene_name_value"] and v[ii] and v[ii] not in [".", "?"]: + tgL = v[ii].split(",") + fgL = self.__filterCaseDuplicates(tgL) + cObj.setValue(";".join(fgL), at, iRow) + cObj.setValue(";".join([provSource for jj in range(len(tgL))]), "rcsb_gene_name_provenance_source", iRow) + else: + cObj.setValue(v[ii], at, iRow) + # if at == 'ncbi_taxonomy_id' and v[ii] and v[ii] not in ['.', '?'] and v[ii].isdigit(): + if at == "ncbi_taxonomy_id" and v[ii] and v[ii] not in [".", "?"]: + taxId = int(self.__reNonDigit.sub("", v[ii])) + taxId = taxU.getMergedTaxId(taxId) + cObj.setValue(str(taxId), "ncbi_taxonomy_id", iRow) + entryTaxIdD[taxId] += 1 + entityTaxIdD.setdefault(entityId, set()).add(taxId) + # + sn = 
taxU.getScientificName(taxId) + if sn: + cObj.setValue(sn, "ncbi_scientific_name", iRow) + # + psn = taxU.getParentScientificName(taxId) + if psn: + cObj.setValue(psn, "ncbi_parent_scientific_name", iRow) + # + cnL = taxU.getCommonNames(taxId) + if cnL: + fcnL = self.__filterCaseDuplicates(cnL) + cObj.setValue(";".join(list(OrderedDict.fromkeys(fcnL))), "ncbi_common_names", iRow) + # Add lineage - + linL = taxU.getLineageWithNames(taxId) + if linL is not None: + cObj.setValue(";".join([str(tup[0]) for tup in OrderedDict.fromkeys(linL)]), "taxonomy_lineage_depth", iRow) + cObj.setValue(";".join([str(tup[1]) for tup in OrderedDict.fromkeys(linL)]), "taxonomy_lineage_id", iRow) + cObj.setValue(";".join([str(tup[2]) for tup in OrderedDict.fromkeys(linL)]), "taxonomy_lineage_name", iRow) + else: + logger.warning("%s taxId %r lineage %r", dataContainer.getName(), taxId, linL) + + logger.debug("%r entity %r - UPDATED %r %r", sType, entityId, atL, v) + iRow += 1 + # + iRow = 0 + for (entityId, sType, atL, tv) in hostL: + ii = atL.index("ncbi_taxonomy_id") if "ncbi_taxonomy_id" in atL else -1 + if ii > 0 and len(tv[ii].split(",")) > 1: + tvL = self.__normalizeCsvToList(dataContainer.getName(), tv) + ii = atL.index("pdbx_src_id") if "pdbx_src_id" in atL else -1 + for jj, row in enumerate(tvL, 1): + row[ii] = str(jj) + # partCountD[entityId] = len(tvL) + else: + tvL = [tv] + for v in tvL: + hObj.setValue(provSource, "provenance_source", iRow) + for ii, at in enumerate(atL): + hObj.setValue(v[ii], at, iRow) + # if at == 'ncbi_taxonomy_id' and v[ii] and v[ii] not in ['.', '?'] and v[ii].isdigit(): + if at == "ncbi_taxonomy_id" and v[ii] and v[ii] not in [".", "?"]: + taxId = int(self.__reNonDigit.sub("", v[ii])) + taxId = taxU.getMergedTaxId(taxId) + hObj.setValue(str(taxId), "ncbi_taxonomy_id", iRow) + sn = taxU.getScientificName(taxId) + if sn: + hObj.setValue(sn, "ncbi_scientific_name", iRow) + # + psn = taxU.getParentScientificName(taxId) + if psn: + hObj.setValue(psn, "ncbi_parent_scientific_name", iRow) + # + cnL = taxU.getCommonNames(taxId) + if cnL: + hObj.setValue(";".join(sorted(set(cnL))), "ncbi_common_names", iRow) + # Add lineage - + linL = taxU.getLineageWithNames(taxId) + if linL is not None: + hObj.setValue(";".join([str(tup[0]) for tup in OrderedDict.fromkeys(linL)]), "taxonomy_lineage_depth", iRow) + hObj.setValue(";".join([str(tup[1]) for tup in OrderedDict.fromkeys(linL)]), "taxonomy_lineage_id", iRow) + hObj.setValue(";".join([str(tup[2]) for tup in OrderedDict.fromkeys(linL)]), "taxonomy_lineage_name", iRow) + else: + logger.warning("%s taxId %r lineage %r", dataContainer.getName(), taxId, linL) + logger.debug("%r entity %r - UPDATED %r %r", sType, entityId, atL, v) + iRow += 1 + # ------------------------------------------------------------------------- + # ------------------------------------------------------------------------- + # Update entity attributes + # _entity.rcsb_multiple_source_flag + # _entity.rcsb_source_part_count + for atName in ["rcsb_source_part_count", "rcsb_multiple_source_flag", "rcsb_source_taxonomy_count"]: + if not eObj.hasAttribute(atName): + eObj.appendAttribute(atName) + # + taxCountTotal = 0 + for ii in range(eObj.getRowCount()): + entityId = eObj.getValue("id", ii) + cFlag = "Y" if partCountD[entityId] > 1 else "N" + eObj.setValue(partCountD[entityId], "rcsb_source_part_count", ii) + eObj.setValue(cFlag, "rcsb_multiple_source_flag", ii) + taxCount = len(entityTaxIdD[entityId]) if entityId in entityTaxIdD else 0 + eObj.setValue(taxCount, 
"rcsb_source_taxonomy_count", ii) + taxCountTotal += taxCount + + logger.debug("Entities with taxonomies %d entry total taxonomy count is %d", len(entryTaxIdD), taxCountTotal) + if dataContainer.exists("rcsb_entry_info"): + eiObj = dataContainer.getObj("rcsb_entry_info") + eiObj.setValue(taxCountTotal, "polymer_entity_taxonomy_count", 0) + # + return True + except Exception as e: + logger.exception("In %s for %s failing with %s", dataContainer.getName(), catName, str(e)) + return False + + def __addEntityCompIds(self, dataContainer): + """Add entity_id and BIRD codes to selected categories. + + Args: + dataContainer (object): mmif.api.DataContainer object instance + + Returns: + bool: True for success or False otherwise + + For example, update/add identifiers: + + loop_ + _pdbx_entity_nonpoly.entity_id + _pdbx_entity_nonpoly.name + _pdbx_entity_nonpoly.comp_id + + """ + try: + eD = {} + if dataContainer.exists("pdbx_entity_nonpoly"): + npObj = dataContainer.getObj("pdbx_entity_nonpoly") + for ii in range(npObj.getRowCount()): + entityId = npObj.getValue("entity_id", ii) + compId = npObj.getValue("comp_id", ii) + eD[entityId] = compId + + if dataContainer.exists("rcsb_entity_container_identifiers"): + pObj = dataContainer.getObj("rcsb_entity_container_identifiers") + if not pObj.hasAttribute("nonpolymer_comp_id"): + pObj.appendAttribute("nonpolymer_comp_id") + for ii in range(pObj.getRowCount()): + entityId = pObj.getValue("entity_id", ii) + compId = eD[entityId] if entityId in eD else "?" + pObj.setValue(compId, "nonpolymer_comp_id", ii) + # + return True + except Exception as e: + logger.exception("%s failing with %s", dataContainer.getName(), str(e)) + return False + + def __addBirdEntityIds(self, dataContainer): + """Add entity_id and BIRD codes to selected categories. 
+ + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + bool: True for success or False otherwise + + For example, update/add identifiers: + + loop_ + _pdbx_molecule.instance_id + _pdbx_molecule.prd_id + _pdbx_molecule.asym_id + + loop_ + _pdbx_entity_nonpoly.entity_id + _pdbx_entity_nonpoly.name + _pdbx_entity_nonpoly.comp_id + + with: + + _pdbx_molecule.rcsb_entity_id + _pdbx_molecule.rcsb_comp_id + + _pdbx_entity_nonpoly.rcsb_prd_id + _entity_poly.rcsb_prd_id + + _rcsb_entity_container_identifiers.prd_id + + """ + catName = "pdbx_molecule" + atName = "rcsb_entity_id" + try: + + logger.debug("Starting catName %s atName %s", catName, atName) + if catName != "pdbx_molecule" and atName != "rcsb_entity_id": + return False + # + if not (dataContainer.exists(catName) and dataContainer.exists("struct_asym")): + return False + # + cObj = dataContainer.getObj(catName) + if not cObj.hasAttribute(atName): + cObj.appendAttribute(atName) + # + if not cObj.hasAttribute("rcsb_comp_id"): + cObj.appendAttribute("rcsb_comp_id") + # + aD = {} + aObj = dataContainer.getObj("struct_asym") + for ii in range(aObj.getRowCount()): + entityId = aObj.getValue("entity_id", ii) + asymId = aObj.getValue("id", ii) + aD[asymId] = entityId + # + eD = {} + if dataContainer.exists("pdbx_entity_nonpoly"): + npObj = dataContainer.getObj("pdbx_entity_nonpoly") + for ii in range(npObj.getRowCount()): + entityId = npObj.getValue("entity_id", ii) + compId = npObj.getValue("comp_id", ii) + eD[entityId] = compId + # + # + prdD = {} + for ii in range(cObj.getRowCount()): + asymId = cObj.getValue("asym_id", ii) + prdId = cObj.getValue("prd_id", ii) + if asymId in aD: + entityId = aD[asymId] + prdD[entityId] = prdId + cObj.setValue(entityId, atName, ii) + compId = eD[entityId] if entityId in eD else "." + cObj.setValue(compId, "rcsb_comp_id", ii) + else: + logger.error("%s missing entityId for asymId %s", dataContainer.getName(), asymId) + # + if prdD and dataContainer.exists("pdbx_entity_nonpoly"): + npObj = dataContainer.getObj("pdbx_entity_nonpoly") + if not npObj.hasAttribute("rcsb_prd_id"): + npObj.appendAttribute("rcsb_prd_id") + for ii in range(npObj.getRowCount()): + entityId = npObj.getValue("entity_id", ii) + prdId = prdD[entityId] if entityId in prdD else "." + npObj.setValue(prdId, "rcsb_prd_id", ii) + # + if prdD and dataContainer.exists("entity_poly"): + pObj = dataContainer.getObj("entity_poly") + if not pObj.hasAttribute("rcsb_prd_id"): + pObj.appendAttribute("rcsb_prd_id") + for ii in range(pObj.getRowCount()): + entityId = pObj.getValue("entity_id", ii) + prdId = prdD[entityId] if entityId in prdD else "." + pObj.setValue(prdId, "rcsb_prd_id", ii) + # + # + if prdD and dataContainer.exists("rcsb_entity_container_identifiers"): + pObj = dataContainer.getObj("rcsb_entity_container_identifiers") + if not pObj.hasAttribute("prd_id"): + pObj.appendAttribute("prd_id") + if not pObj.hasAttribute("nonpolymer_comp_id"): + pObj.appendAttribute("nonpolymer_comp_id") + for ii in range(pObj.getRowCount()): + entityId = pObj.getValue("entity_id", ii) + prdId = prdD[entityId] if entityId in prdD else "?" + pObj.setValue(prdId, "prd_id", ii) + compId = eD[entityId] if entityId in eD else "?"
+ pObj.setValue(compId, "nonpolymer_comp_id", ii) + + # + # + return True + except Exception as e: + logger.exception("%s %s %s failing with %s", dataContainer.getName(), catName, atName, str(e)) + return False + + def addStructRefSeqEntityIds(self, dataContainer, catName, **kwargs): + """Add entity ids to categories struct_ref_seq and struct_ref_seq_dif. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + """ + try: + logger.debug("Starting with %r %r %r", dataContainer.getName(), catName, kwargs) + if catName != "struct_ref_seq": + return False + # + if not (dataContainer.exists(catName) and dataContainer.exists("struct_ref")): + return False + # + atName = "rcsb_entity_id" + srsObj = dataContainer.getObj(catName) + if not srsObj.hasAttribute(atName): + # srsObj.appendAttribute(atName) + srsObj.appendAttributeExtendRows(atName, defaultValue="?") + # + srObj = dataContainer.getObj("struct_ref") + # + srsdObj = None + if dataContainer.exists("struct_ref_seq_dif"): + srsdObj = dataContainer.getObj("struct_ref_seq_dif") + if not srsdObj.hasAttribute(atName): + # srsdObj.appendAttribute(atName) + srsdObj.appendAttributeExtendRows(atName, defaultValue="?") + + for ii in range(srObj.getRowCount()): + entityId = srObj.getValue("entity_id", ii) + refId = srObj.getValue("id", ii) + # + # Get indices for the target refId. + iRowL = srsObj.selectIndices(refId, "ref_id") + for iRow in iRowL: + srsObj.setValue(entityId, "rcsb_entity_id", iRow) + alignId = srsObj.getValue("align_id", iRow) + # + if srsdObj: + jRowL = srsdObj.selectIndices(alignId, "align_id") + for jRow in jRowL: + srsdObj.setValue(entityId, "rcsb_entity_id", jRow) + + return True + except Exception as e: + logger.exception("%s %s failing with %s", dataContainer.getName(), catName, str(e)) + return False + + def buildEntityPolyInfo(self, dataContainer, catName, **kwargs): + """Build category rcsb_entity_poly_info and supplement category entity_poly. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + For example: + loop_ + _rcsb_entity_poly_info.ordinal_id + _rcsb_entity_poly_info.entry_id + _rcsb_entity_poly_info.entity_id + _rcsb_entity_poly_info.comp_id + _rcsb_entity_poly_info.is_modified + _rcsb_entity_poly_info.is_heterogeneous + _rcsb_entity_poly_info.entity_sequence_length + _rcsb_entity_poly_info.chem_comp_count + + 1 1ABC 1 MSE Y N 100 1 + 2 1ABC 1 TRP N N 100 4 + # ... abbreviated ...
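+ + The category also carries _rcsb_entity_poly_info.chem_comp_polymer_fraction, computed as chem_comp_count divided by the enumerated entity sequence length (e.g., an illustrative MSE count of 1 in a 100-residue entity gives 0.01000).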
+ + """ + logger.debug("Starting with %r %r %r", dataContainer.getName(), catName, kwargs) + try: + # Exit if source categories are missing + if not (dataContainer.exists("entity_poly") and dataContainer.exists("entry")): + return False + # + # Create the new target category + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + cObj = dataContainer.getObj(catName) + # + cN = "rcsb_entity_monomer_container_identifiers" + if not dataContainer.exists(cN): + dataContainer.append(DataCategory(cN, attributeNameList=self.__dApi.getAttributeNameList(cN))) + idObj = dataContainer.getObj(cN) + + # + epObj = dataContainer.getObj("entity_poly") + for atName in [ + "rcsb_mutation_count", + "rcsb_artifact_monomer_count", + "rcsb_conflict_count", + "rcsb_insertion_count", + "rcsb_deletion_count", + "rcsb_sample_sequence_length", + "rcsb_non_std_monomer_count", + "rcsb_non_std_monomers", + ]: + if not epObj.hasAttribute(atName): + epObj.appendAttribute(atName) + + # + eObj = dataContainer.getObj("entry") + entryId = eObj.getValue("id", 0) + # ------- --------- ------- --------- ------- --------- ------- --------- ------- --------- + seqDifD = self.__commonU.getEntitySequenceFeatureCounts(dataContainer) + eD = self.__commonU.getPolymerEntityMonomerCounts(dataContainer) + elD = self.__commonU.getPolymerEntityLengthsEnumerated(dataContainer) + modMonD = self.__commonU.getPolymerEntityModifiedMonomers(dataContainer) + # + monDict3 = self.__commonU.monDict3 + ii = 0 + for entityId, cD in eD.items(): + for compId, chemCompCount in cD.items(): + modFlag = "N" if compId in monDict3 else "Y" + cObj.setValue(ii + 1, "ordinal_id", ii) + cObj.setValue(entryId, "entry_id", ii) + cObj.setValue(entityId, "entity_id", ii) + cObj.setValue(compId, "comp_id", ii) + cObj.setValue(chemCompCount, "chem_comp_count", ii) + cObj.setValue(round(float(chemCompCount) / float(elD[entityId]), 5), "chem_comp_polymer_fraction", ii) + cObj.setValue(modFlag, "is_modified", ii) + # + idObj.setValue(ii + 1, "ordinal_id", ii) + idObj.setValue(entryId, "entry_id", ii) + idObj.setValue(entityId, "entity_id", ii) + idObj.setValue(compId, "comp_id", ii) + ii += 1 + # + for ii in range(epObj.getRowCount()): + entityId = epObj.getValue("entity_id", ii) + mutations = seqDifD[entityId]["mutation"] if entityId in seqDifD else 0 + conflicts = seqDifD[entityId]["conflict"] if entityId in seqDifD else 0 + insertions = seqDifD[entityId]["insertion"] if entityId in seqDifD else 0 + deletions = seqDifD[entityId]["deletion"] if entityId in seqDifD else 0 + artifacts = seqDifD[entityId]["artifact"] if entityId in seqDifD else 0 + seqLen = elD[entityId] if entityId in elD else None + epObj.setValue(mutations, "rcsb_mutation_count", ii) + epObj.setValue(artifacts, "rcsb_artifact_monomer_count", ii) + epObj.setValue(conflicts, "rcsb_conflict_count", ii) + epObj.setValue(insertions, "rcsb_insertion_count", ii) + epObj.setValue(deletions, "rcsb_deletion_count", ii) + if seqLen is not None: + epObj.setValue(seqLen, "rcsb_sample_sequence_length", ii) + # + numMod = len(modMonD[entityId]) + uModL = ",".join(modMonD[entityId]) if numMod else "?" 
+ epObj.setValue(numMod, "rcsb_non_std_monomer_count", ii) + epObj.setValue(uModL, "rcsb_non_std_monomers", ii) + + return True + except Exception as e: + logger.exception("%s %s failing with %s", dataContainer.getName(), catName, str(e)) + return False + + def addBranchedEntityComponentCounts(self, dataContainer, catName, atName, **kwargs): + """Add the total number of branched components in the branched entity. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): target category name + atName (str): target attribute name + + Returns: + bool: True for success or False otherwise + """ + try: + logger.debug("Starting with %r %r %r %r", dataContainer.getName(), catName, atName, kwargs) + if not (dataContainer.exists("pdbx_entity_branch") and dataContainer.exists("pdbx_entity_branch_list")): + return False + # + ebObj = dataContainer.getObj("pdbx_entity_branch") + eblObj = dataContainer.getObj("pdbx_entity_branch_list") + # + if not ebObj.hasAttribute(atName): + ebObj.appendAttribute(atName) + + for ii in range(ebObj.getRowCount()): + entityId = ebObj.getValue("entity_id", ii) + tL = eblObj.selectValuesWhere("entity_id", entityId, "entity_id") + ebObj.setValue(len(tL), atName, ii) + + return True + except Exception as e: + logger.exception("For %s %s failing with %s", catName, atName, str(e)) + return False + + def addEntityMisc(self, dataContainer, catName, atName, **kwargs): + """Add consolidated enzyme classification and macromolecule names to the entity category. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + For instance, add: + + _entity.rcsb_macromolecular_names_combined <<< Dictionary target + + _entity.rcsb_ec_lineage_name + _entity.rcsb_ec_lineage_id + _entity.rcsb_ec_lineage_depth + + """ + try: + if not (dataContainer.exists("entry") and dataContainer.exists("entity")): + return False + # + if catName == "entity" and atName in ["rcsb_ec_lineage_name", "rcsb_ec_lineage_id", "rcsb_ec_lineage_depth"]: + return True + # + eObj = dataContainer.getObj("entity") + atList = [ + "rcsb_ec_lineage_depth", + "rcsb_ec_lineage_id", + "rcsb_ec_lineage_name", + "rcsb_macromolecular_names_combined_name", + "rcsb_macromolecular_names_combined_provenance_source", + "rcsb_macromolecular_names_combined_provenance_code", + "rcsb_enzyme_class_combined_ec", + "rcsb_enzyme_class_combined_provenance_source", + "rcsb_enzyme_class_combined_depth", + ] + for at in atList: + if not eObj.hasAttribute(at): + eObj.appendAttribute(at) + + hasEc = eObj.hasAttribute("pdbx_ec") + # + rP = kwargs.get("resourceProvider") + ecU = None + if hasEc: + ecU = rP.getResource("EnzymeProvider instance") if rP else None + # + ncObj = None + if dataContainer.exists("entity_name_com"): + ncObj = dataContainer.getObj("entity_name_com") + # get any BIRD assigned names - + birdFeatureD = self.__getBirdFeatures(dataContainer) + birdNameD = {} + for (entityId, _, _, filteredFeature), fName in birdFeatureD.items(): + if filteredFeature == "BIRD_MOLECULE_NAME" and fName: + birdNameD.setdefault(entityId, []).append(fName) + + for ii in range(eObj.getRowCount()): + entityId = eObj.getValue("id", ii) + entityType = eObj.getValue("type", ii) + # + eObj.setValue("?", "rcsb_ec_lineage_depth", ii) + eObj.setValue("?", "rcsb_ec_lineage_id", ii) + eObj.setValue("?", "rcsb_ec_lineage_name", ii) + eObj.setValue("?", "rcsb_macromolecular_names_combined_name", ii) + eObj.setValue("?",
"rcsb_macromolecular_names_combined_provenance_source", ii) + eObj.setValue("?", "rcsb_macromolecular_names_combined_provenance_code", ii) + eObj.setValue("?", "rcsb_enzyme_class_combined_ec", ii) + eObj.setValue("?", "rcsb_enzyme_class_combined_provenance_source", ii) + eObj.setValue("?", "rcsb_enzyme_class_combined_depth", ii) + # + if entityType not in ["polymer", "branched"]: + continue + # + # -------------------------------------------------------------------------- + # PDB assigned names + nameL = [] + sourceL = [] + provCodeL = [] + nmL = str(eObj.getValue("pdbx_description", ii)).split(",") + nmL = self.__cleanupCsv(nmL) + nmL = [tV.strip() for tV in nmL if len(tV) > 3] + nmLookUpD = {} + for nm in nmL: + if nm.upper() in nmLookUpD: + continue + nmLookUpD[nm.upper()] = True + nameL.append(nm) + sourceL.append("PDB Preferred Name") + provCodeL.append("ECO:0000304") + # + # PDB common names/synonyms + logger.debug("%s ii %d nmL %r", dataContainer.getName(), ii, nmL) + # + if ncObj: + ncL = [] + tL = ncObj.selectValuesWhere("name", entityId, "entity_id") + logger.debug("%s ii %d tL %r", dataContainer.getName(), ii, tL) + for tV in tL: + tff = tV.split(",") + ncL.extend(tff) + ncL = self.__cleanupCsv(ncL) + ncL = [tV.strip() for tV in ncL if len(tV) > 3] + for nc in ncL: + if nc.upper() in nmLookUpD: + continue + nmLookUpD[nc.upper()] = True + nameL.append(nc) + sourceL.append("PDB Synonym") + provCodeL.append("ECO:0000303") + logger.debug("%s ii %d ncL %r", dataContainer.getName(), ii, ncL) + # + if entityId in birdNameD: + for nm in birdNameD[entityId]: + if nm.upper() in nmLookUpD: + continue + nmLookUpD[nm.upper()] = True + nameL.append(nm) + sourceL.append("PDB BIRD Name") + provCodeL.append("ECO:0000303") + # + if nameL: + eObj.setValue(";".join(nameL), "rcsb_macromolecular_names_combined_name", ii) + eObj.setValue(";".join(sourceL), "rcsb_macromolecular_names_combined_provenance_source", ii) + eObj.setValue(";".join(provCodeL), "rcsb_macromolecular_names_combined_provenance_code", ii) + + # -------------------------------------------------------------------------- + linL = [] + ecIdUpdL = [] + ecDepthUpdL = [] + ecV = eObj.getValueOrDefault("pdbx_ec", ii, defaultValue=None) + if ecV: + ecIdL = ecV.split(",") if ecV else [] + if ecIdL: + ecIdL = list(OrderedDict.fromkeys(ecIdL)) + for tId in ecIdL: + ecId = ecU.normalize(tId) + if not ecU.exists(ecId): + continue + # tL = ecU.getLineage(ecId) if ecId and len(ecId) > 7 else None + tL = ecU.getLineage(ecId) + if tL: + linL.extend(tL) + ecIdUpdL.append(ecId) + ecDepthUpdL.append(str(ecId.count(".") + 1)) + + if linL: + eObj.setValue(";".join([str(tup[0]) for tup in linL]), "rcsb_ec_lineage_depth", ii) + eObj.setValue(";".join([str(tup[1]) for tup in linL]), "rcsb_ec_lineage_id", ii) + eObj.setValue(";".join([tup[2] for tup in linL]), "rcsb_ec_lineage_name", ii) + if ecIdUpdL: + eObj.setValue(",".join(ecIdUpdL), "pdbx_ec", ii) + eObj.setValue(";".join(ecIdUpdL), "rcsb_enzyme_class_combined_ec", ii) + eObj.setValue(";".join(ecDepthUpdL), "rcsb_enzyme_class_combined_depth", ii) + eObj.setValue(";".join(["PDB Primary Data" for _ in ecIdUpdL]), "rcsb_enzyme_class_combined_provenance_source", ii) + else: + eObj.setValue("?", "pdbx_ec", ii) + eObj.setValue("?", "rcsb_enzyme_class_combined_ec", ii) + eObj.setValue("?", "rcsb_enzyme_class_combined_provenance_source", ii) + eObj.setValue("?", "rcsb_enzyme_class_combined_depth", ii) + if ecIdL: + logger.debug("%s obsolete or undefined EC class detected %r", dataContainer.getName(), ecV) + 
return True + except Exception as e: + logger.exception("For %s %s failing with %s", catName, atName, str(e)) + return False + + def __cleanupCsv(self, tL): + """Ad hoc cleanup function for comma separated lists with embedded punctuation""" + rL = [] + try: + key_paths = functools.cmp_to_key(cmpElements) + groups = [",".join(grp) for key, grp in itertools.groupby(tL, key_paths)] + rL = list(OrderedDict.fromkeys(groups)) + except Exception: + pass + return rL + + def __filterCaseDuplicates(self, inpSL): + oL = [] + try: + lookUpD = {} + for inpS in inpSL: + if inpS.upper() in lookUpD: + continue + lookUpD[inpS.upper()] = True + oL.append(inpS) + except Exception: + return inpSL + + return oL + + def __getAttribList(self, sObj, atTupL): + atL = [] + atSL = [] + if sObj: + for (atS, at) in atTupL: + if sObj.hasAttribute(atS): + atL.append(at) + atSL.append(atS) + return atSL, atL + + def __normalizeCsvToList(self, entryId, colL, separator=","): + """Normalize a row containing some character delimited fields. + + Expand a list of uneven lists into a uniform list of lists. + Only two list lengths are logically supported: one and a second + maximum length. + + returns: list of expanded rows or the original input. + + """ + tcL = [] + countL = [] + for col in colL: + cL = [t.strip() for t in col.split(separator)] + tcL.append(cL) + countL.append(len(cL)) + # + tL = list(OrderedDict.fromkeys(countL)) + if len(tL) == 1 and tL[0] == 1: + return [colL] + # + orig = False + # Report pathological cases ... + if orig: + if (len(tL) > 2) or (tL[0] != 1 and len(tL) == 2): + logger.error("%s integrated source data inconsistent %r colL", entryId, colL) + return [colL] + # + # Expand the columns with uniform length + # + + if orig: + icL = [] + maxL = tL[1] + for tc in tcL: + if len(tc) == 1: + tc = tc * maxL + icL.append(tc) + else: + icL = [] + maxL = tL[1] + for tc in tcL: + if len(tc) == 1: + tc = tc * maxL + if len(tc) < maxL: + for _ in range(maxL - len(tc)): + tc.append("?") + icL.append(tc[:maxL]) + + # + logger.debug("%s icL %r", entryId, icL) + # Convert back to a row list + # + iRow = 0 + rL = [] + for iRow in range(maxL): + row = [] + for ic in icL: + row.append(ic[iRow]) + rL.append(row) + + return rL + + def __stripWhiteSpace(self, val): + """Remove all white space from the input value.""" + if val is None: + return val + return self.__wsPattern.sub("", val) + + # + def __getTargetComponentFeatures(self, dataContainer): + """Get targeted components - + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + dict: {(entityId, compId, "SUBJECT_OF_INVESTIGATION"): True} + + """ + tcD = {} + try: + if not dataContainer.exists("pdbx_entity_nonpoly"): + return tcD + ccTargets = self.__commonU.getTargetComponents(dataContainer) + if dataContainer.exists("pdbx_entity_nonpoly"): + npObj = dataContainer.getObj("pdbx_entity_nonpoly") + for ii in range(npObj.getRowCount()): + entityId = npObj.getValue("entity_id", ii) + compId = npObj.getValue("comp_id", ii) + if compId in ccTargets: + tcD[(entityId, compId, "SUBJECT_OF_INVESTIGATION")] = True + except Exception as e: + logger.exception("Failing for %s with %s", dataContainer.getName(), str(e)) + return tcD + + # + def __getBirdFeatures(self, dataContainer): + """Get type, class, and name BIRD annotations - + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + dict: {(entityId, compId, prdId, "BIRD_MOLECULE_TYPE|BIRD_MOLECULE_CLASS|BIRD_MOLECULE_NAME"): value} + + + Example: + _pdbx_molecule_features.prd_id PRD_002214 + _pdbx_molecule_features.name 'N-[(5-METHYLISOXAZOL-3-Y ...'
+ _pdbx_molecule_features.type Peptide-like + _pdbx_molecule_features.class Inhibitor + _pdbx_molecule_features.details ? + # + _pdbx_molecule.instance_id 1 + _pdbx_molecule.prd_id PRD_002214 + _pdbx_molecule.asym_id B + _pdbx_molecule.rcsb_entity_id 2 + _pdbx_molecule.rcsb_comp_id . + # + + """ + bD = {} + if not (dataContainer.exists("pdbx_molecule_features") and dataContainer.exists("pdbx_molecule")): + return bD + + try: + asymIdEntityIdD = self.__commonU.getInstanceEntityMap(dataContainer) + eD = {} + if dataContainer.exists("pdbx_entity_nonpoly"): + npObj = dataContainer.getObj("pdbx_entity_nonpoly") + for ii in range(npObj.getRowCount()): + entityId = npObj.getValue("entity_id", ii) + compId = npObj.getValue("comp_id", ii) + eD[entityId] = compId + + pfObj = dataContainer.getObj("pdbx_molecule_features") + pfD = {} + for ii in range(pfObj.getRowCount()): + prdId = pfObj.getValue("prd_id", ii) + prdType = pfObj.getValueOrDefault("type", ii, defaultValue=None) + prdClass = pfObj.getValueOrDefault("class", ii, defaultValue=None) + prdName = pfObj.getValueOrDefault("name", ii, defaultValue=None) + pfD[prdId] = (prdType, prdClass, prdName) + + pObj = dataContainer.getObj("pdbx_molecule") + bD = {} + for ii in range(pObj.getRowCount()): + asymId = pObj.getValue("asym_id", ii) + prdId = pObj.getValue("prd_id", ii) + entityId = asymIdEntityIdD[asymId] + compId = eD[entityId] if entityId in eD else None + if pfD[prdId][0]: + bD[(entityId, compId, prdId, "BIRD_MOLECULE_TYPE")] = pfD[prdId][0] + if pfD[prdId][1]: + bD[(entityId, compId, prdId, "BIRD_MOLECULE_CLASS")] = pfD[prdId][1] + if pfD[prdId][2]: + bD[(entityId, compId, prdId, "BIRD_MOLECULE_NAME")] = pfD[prdId][2] + + except Exception as e: + logger.exception("Failing for %s with %s", dataContainer.getName(), str(e)) + return bD + + def __getEntityFeatureTypes(self, eType): + eTupL = [] + if eType == "polymer": + eTupL = self.__dApi.getEnumListWithDetail("rcsb_polymer_entity_feature_summary", "type") + elif eType == "non-polymer": + eTupL = self.__dApi.getEnumListWithDetail("rcsb_nonpolymer_entity_feature_summary", "type") + elif eType == "branched": + eTupL = self.__dApi.getEnumListWithDetail("rcsb_branched_entity_feature_summary", "type") + # + fTypeL = sorted([tup[0] for tup in eTupL]) + return fTypeL + + def buildEntityFeatureSummary(self, dataContainer, catName, **kwargs): + """Build category rcsb_entity_feature_summary (UPDATED) + + Example: + + loop_ + _rcsb_entity_feature_summary.ordinal + _rcsb_entity_feature_summary.entry_id + _rcsb_entity_feature_summary.entity_id + _rcsb_entity_feature_summary.type + _rcsb_entity_feature_summary.count + _rcsb_entity_feature_summary.coverage + # ...
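+ + For instance (illustrative values only): + + 1 1ABC 1 mutation 2 0.02000 + 2 1ABC 1 artifact 1 0.01000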
+ """ + logger.debug("Starting with %r %r %r", dataContainer.getName(), catName, kwargs) + try: + if catName != "rcsb_entity_feature_summary": + return False + if not dataContainer.exists("rcsb_entity_feature") and not dataContainer.exists("entry"): + return False + + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + # + eObj = dataContainer.getObj("entry") + entryId = eObj.getValue("id", 0) + # + sObj = dataContainer.getObj(catName) + fObj = dataContainer.getObj("rcsb_entity_feature") + # + entityPolymerLengthD = self.__commonU.getPolymerEntityLengthsEnumerated(dataContainer) + eTypeD = self.__commonU.getEntityTypes(dataContainer) + + fCountD = OrderedDict() + fMonomerCountD = OrderedDict() + for ii in range(fObj.getRowCount()): + entityId = fObj.getValue("entity_id", ii) + # + fType = fObj.getValue("type", ii) + fId = fObj.getValue("feature_id", ii) + fCountD.setdefault(entityId, {}).setdefault(fType, set()).add(fId) + + # + tbegS = fObj.getValueOrDefault("feature_positions_beg_seq_id", ii, defaultValue=None) + tendS = fObj.getValueOrDefault("feature_positions_end_seq_id", ii, defaultValue=None) + if fObj.hasAttribute("feature_positions_beg_seq_id") and tbegS is not None and fObj.hasAttribute("feature_positions_end_seq_id") and tendS is not None: + begSeqIdL = str(fObj.getValue("feature_positions_beg_seq_id", ii)).split(";") + endSeqIdL = str(fObj.getValue("feature_positions_end_seq_id", ii)).split(";") + monCount = 0 + for begSeqId, endSeqId in zip(begSeqIdL, endSeqIdL): + monCount += abs(int(endSeqId) - int(begSeqId) + 1) + fMonomerCountD.setdefault(entityId, {}).setdefault(fType, []).append(monCount) + elif fObj.hasAttribute("feature_positions_beg_seq_id") and tbegS: + seqIdL = str(fObj.getValue("feature_positions_beg_seq_id", ii)).split(";") + fMonomerCountD.setdefault(entityId, {}).setdefault(fType, []).append(len(seqIdL)) + # + ii = 0 + for entityId, eType in eTypeD.items(): + fTypes = self.__getEntityFeatureTypes(eType) + for fType in fTypes: + sObj.setValue(ii + 1, "ordinal", ii) + sObj.setValue(entryId, "entry_id", ii) + sObj.setValue(entityId, "entity_id", ii) + sObj.setValue(fType, "type", ii) + + minL = maxL = None + fracC = 0.0 + fCount = 0 + if entityId in fCountD and fType in fCountD[entityId]: + fCount = len(fCountD[entityId][fType]) + + if entityId in fMonomerCountD and fType in fMonomerCountD[entityId] and entityId in entityPolymerLengthD: + fracC = float(sum(fMonomerCountD[entityId][fType])) / float(entityPolymerLengthD[entityId]) + # + if fType in ["artifact"] and entityId in fMonomerCountD and fType in fMonomerCountD[entityId]: + minL = min(fMonomerCountD[entityId][fType]) + maxL = max(fMonomerCountD[entityId][fType]) + + sObj.setValue(round(fracC, 5), "coverage", ii) + sObj.setValue(fCount, "count", ii) + if minL is not None: + sObj.setValue(minL, "minimum_length", ii) + sObj.setValue(maxL, "maximum_length", ii) + # + ii += 1 + except Exception as e: + logger.exception("Failing with %s", str(e)) + return True + + def buildEntityFeatures(self, dataContainer, catName, **kwargs): + """Build category rcsb_entity_feature ... 
+ + Example: + loop_ + _rcsb_entity_feature.ordinal + _rcsb_entity_feature.entry_id + _rcsb_entity_feature.entity_id + _rcsb_entity_feature.feature_id + _rcsb_entity_feature.type + _rcsb_entity_feature.name + _rcsb_entity_feature.description + _rcsb_entity_feature.reference_scheme + _rcsb_entity_feature.provenance_source + _rcsb_entity_feature.assignment_version + _rcsb_entity_feature.feature_positions_beg_seq_id + _rcsb_entity_feature.feature_positions_end_seq_id + _rcsb_entity_feature.feature_positions_value + + """ + logger.debug("Starting with %r %r %r", dataContainer.getName(), catName, kwargs) + try: + if catName != "rcsb_entity_feature": + return False + # Exit if source categories are missing + if not dataContainer.exists("entry"): + return False + # + # Create the new target category + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + cObj = dataContainer.getObj(catName) + # + # rP = kwargs.get("resourceProvider") + + eObj = dataContainer.getObj("entry") + entryId = eObj.getValue("id", 0) + # + # --------------- + ii = cObj.getRowCount() + jj = 1 + # + targetFeatureD = self.__getTargetComponentFeatures(dataContainer) + for (entityId, compId, filteredFeature) in targetFeatureD: + cObj.setValue(ii + 1, "ordinal", ii) + cObj.setValue(entryId, "entry_id", ii) + cObj.setValue(entityId, "entity_id", ii) + cObj.setValue(compId, "comp_id", ii) + cObj.setValue(filteredFeature, "type", ii) + cObj.setValue("entity_feature_%d" % jj, "feature_id", ii) + details = "Ligand targeted in this investigation" + cObj.setValue(details, "description", ii) + cObj.setValue(compId, "name", ii) + cObj.setValue("PDB", "provenance_source", ii) + cObj.setValue("V1.0", "assignment_version", ii) + # + jj += 1 + ii += 1 + # + # BIRD type and class + skipBird = True + if not skipBird: + birdFeatureD = self.__getBirdFeatures(dataContainer) + for (entityId, compId, prdId, filteredFeature), fName in birdFeatureD.items(): + cObj.setValue(ii + 1, "ordinal", ii) + cObj.setValue(entryId, "entry_id", ii) + cObj.setValue(entityId, "entity_id", ii) + cObj.setValue(compId, "comp_id", ii) + cObj.setValue(filteredFeature, "type", ii) + cObj.setValue("entity_feature_%d" % jj, "feature_id", ii) + if compId: + details = "Non-polymer BIRD %s chemical component %s" % (prdId, compId) + else: + details = "Polymer BIRD %s entity %s" % (prdId, entityId) + cObj.setValue(details, "description", ii) + # + cObj.setValue(fName, "name", ii) + cObj.setValue("PDB", "provenance_source", ii) + cObj.setValue("V1.0", "assignment_version", ii) + # + jj += 1 + ii += 1 + # + # Monomer modifications + jj = 1 + modMonomerFeatures = self.__commonU.getPolymerModifiedMonomerFeatures(dataContainer) + for (entityId, seqId, compId, filteredFeature) in modMonomerFeatures: + parentCompId = self.__ccP.getParentComponent(compId) + + cObj.setValue(ii + 1, "ordinal", ii) + cObj.setValue(entryId, "entry_id", ii) + cObj.setValue(entityId, "entity_id", ii) + cObj.setValue(filteredFeature, "type", ii) + cObj.setValue("monomer_feature_%d" % jj, "feature_id", ii) + if parentCompId: + details = "Parent monomer %s" % parentCompId + cObj.setValue(details, "name", ii) + # + cObj.setValue(compId, "feature_positions_beg_comp_id", ii) + cObj.setValue(seqId, "feature_positions_beg_seq_id", ii) + # + cObj.setValue("PDB entity", "reference_scheme", ii) + cObj.setValue("PDB", "provenance_source", ii) + cObj.setValue("V1.0", "assignment_version", ii) + # + jj += 1 + ii += 1 + # + # + 
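# seqMonomerFeatures maps (entityId, seqId, compId, feature type) -> {details, ...}; only "mutation" features are materialized below (illustrative key: ("1", 42, "ALA", "mutation")). +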
+            seqMonomerFeatures = self.__commonU.getEntitySequenceMonomerFeatures(dataContainer)
+            for (entityId, seqId, compId, filteredFeature), sDetails in seqMonomerFeatures.items():
+                if filteredFeature not in ["mutation"]:
+                    continue
+                cObj.setValue(ii + 1, "ordinal", ii)
+                cObj.setValue(entryId, "entry_id", ii)
+                cObj.setValue(entityId, "entity_id", ii)
+                cObj.setValue(filteredFeature, "type", ii)
+                cObj.setValue("monomer_feature_%d" % jj, "feature_id", ii)
+                details = ",".join(list(sDetails))
+                cObj.setValue(details, "name", ii)
+                #
+                cObj.setValue(compId, "feature_positions_beg_comp_id", ii)
+                cObj.setValue(seqId, "feature_positions_beg_seq_id", ii)
+                #
+                cObj.setValue("PDB entity", "reference_scheme", ii)
+                cObj.setValue("PDB", "provenance_source", ii)
+                cObj.setValue("V1.0", "assignment_version", ii)
+                #
+                jj += 1
+                ii += 1
+            #
+            jj = 1
+            seqRangeFeatures = self.__commonU.getEntitySequenceRangeFeatures(dataContainer)
+            for (entityId, begSeqId, endSeqId, filteredFeature), sDetails in seqRangeFeatures.items():
+                if filteredFeature not in ["artifact"]:
+                    continue
+                cObj.setValue(ii + 1, "ordinal", ii)
+                cObj.setValue(entryId, "entry_id", ii)
+                cObj.setValue(entityId, "entity_id", ii)
+                cObj.setValue(filteredFeature, "type", ii)
+                cObj.setValue("range_feature_%d" % jj, "feature_id", ii)
+                details = ",".join(list(sDetails))
+                cObj.setValue(details, "name", ii)
+                #
+                cObj.setValue(begSeqId, "feature_positions_beg_seq_id", ii)
+                cObj.setValue(endSeqId, "feature_positions_end_seq_id", ii)
+                #
+                cObj.setValue("PDB entity", "reference_scheme", ii)
+                cObj.setValue("PDB", "provenance_source", ii)
+                cObj.setValue("V1.0", "assignment_version", ii)
+                #
+                jj += 1
+                ii += 1
+            return True
+        except Exception as e:
+            logger.exception("%s %s failing with %s", dataContainer.getName(), catName, str(e))
+        return False
+
+    def addTypedEntityCategories(self, dataContainer, blockName, **kwargs):
+        """Slice common entity categories into type specific entity categories.
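+
+        A minimal usage sketch (the instance name `helper` is illustrative):
+
+            ok = helper.addTypedEntityCategories(dataContainer, dataContainer.getName())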
+
+        Args:
+            dataContainer (object): mmcif.api.DataContainer object instance
+            blockName (str): Block name
+
+        Returns:
+            bool: True for success or False otherwise
+
+        """
+        logger.debug("Starting with %r %r %r", dataContainer.getName(), blockName, kwargs)
+        try:
+            if not (dataContainer.exists("entry") and dataContainer.exists("entity")):
+                return False
+            if dataContainer.exists("rcsb_polymer_entity") or dataContainer.exists("rcsb_nonpolymer_entity") or dataContainer.exists("rcsb_branched_entity"):
+                return True
+            # -----
+            categoryMapD = {
+                "polymer": [
+                    ("entity", "rcsb_polymer_entity", "id"),
+                    ("entity_keywords", "rcsb_polymer_entity_keywords", "entity_id"),
+                    ("entity_name_com", "rcsb_polymer_entity_name_com", "entity_id"),
+                    ("entity_name_sys", "rcsb_polymer_entity_name_sys", "entity_id"),
+                    ("rcsb_entity_container_identifiers", "rcsb_polymer_entity_container_identifiers", "entity_id"),
+                    ("rcsb_entity_instance_container_identifiers", "rcsb_polymer_entity_instance_container_identifiers", "entity_id"),
+                ],
+                "non-polymer": [
+                    ("entity", "rcsb_nonpolymer_entity", "id"),
+                    ("entity_keywords", "rcsb_nonpolymer_entity_keywords", "entity_id"),
+                    ("entity_name_com", "rcsb_nonpolymer_entity_name_com", "entity_id"),
+                    ("entity_name_sys", "rcsb_nonpolymer_entity_name_sys", "entity_id"),
+                    ("rcsb_entity_container_identifiers", "rcsb_nonpolymer_entity_container_identifiers", "entity_id"),
+                    ("rcsb_entity_instance_container_identifiers", "rcsb_nonpolymer_entity_instance_container_identifiers", "entity_id"),
+                ],
+                "branched": [
+                    ("entity", "rcsb_branched_entity", "id"),
+                    ("entity_keywords", "rcsb_branched_entity_keywords", "entity_id"),
+                    ("entity_name_com", "rcsb_branched_entity_name_com", "entity_id"),
+                    ("entity_name_sys", "rcsb_branched_entity_name_sys", "entity_id"),
+                    ("rcsb_entity_container_identifiers", "rcsb_branched_entity_container_identifiers", "entity_id"),
+                    ("rcsb_entity_instance_container_identifiers", "rcsb_branched_entity_instance_container_identifiers", "entity_id"),
+                ],
+            }
+            ok = self.__sliceCategoriesByEntityType(dataContainer, categoryMapD)
+            return ok
+        except Exception as e:
+            logger.exception("%s for %s failing with %s", dataContainer.getName(), blockName, str(e))
+        return False
+
+    def addTypedEntityFeatureCategories(self, dataContainer, blockName, **kwargs):
+        """Slice common entity feature categories into type specific entity feature categories.
+
+        Args:
+            dataContainer (object): mmcif.api.DataContainer object instance
+            blockName (str): Block name
+
+        Returns:
+            bool: True for success or False otherwise
+
+        """
+        logger.debug("Starting with %r %r %r", dataContainer.getName(), blockName, kwargs)
+        try:
+            if not (dataContainer.exists("entry") and dataContainer.exists("entity")):
+                return False
+            if (
+                dataContainer.exists("rcsb_polymer_entity_feature")
+                or dataContainer.exists("rcsb_nonpolymer_entity_feature")
+                or dataContainer.exists("rcsb_branched_entity_feature")
+            ):
+                return True
+            # -----
+            categoryMapD = {
+                "polymer": [
+                    ("rcsb_entity_feature", "rcsb_polymer_entity_feature", "entity_id"),
+                    ("rcsb_entity_feature_summary", "rcsb_polymer_entity_feature_summary", "entity_id"),
+                    ("rcsb_entity_instance_feature", "rcsb_polymer_instance_feature", "entity_id"),
+                    ("rcsb_entity_instance_feature_summary", "rcsb_polymer_instance_feature_summary", "entity_id"),
+                    ("rcsb_entity_instance_validation_feature", "rcsb_polymer_instance_feature", "entity_id"),
+                    ("rcsb_entity_instance_validation_feature_summary", "rcsb_polymer_instance_feature_summary", "entity_id"),
+                    ("rcsb_struct_conn", "rcsb_polymer_struct_conn", "entity_id"),
+                    ("rcsb_entity_annotation", "rcsb_polymer_entity_annotation", "entity_id"),
+                    ("rcsb_entity_instance_annotation", "rcsb_polymer_instance_annotation", "entity_id"),
+                ],
+                "non-polymer": [
+                    ("rcsb_entity_feature", "rcsb_nonpolymer_entity_feature", "entity_id"),
+                    ("rcsb_entity_feature_summary", "rcsb_nonpolymer_entity_feature_summary", "entity_id"),
+                    ("rcsb_entity_instance_feature", "rcsb_nonpolymer_instance_feature", "entity_id"),
+                    ("rcsb_entity_instance_feature_summary", "rcsb_nonpolymer_instance_feature_summary", "entity_id"),
+                    ("rcsb_entity_instance_validation_feature", "rcsb_nonpolymer_instance_feature", "entity_id"),
+                    ("rcsb_entity_instance_validation_feature_summary", "rcsb_nonpolymer_instance_feature_summary", "entity_id"),
+                    ("rcsb_struct_conn", "rcsb_nonpolymer_struct_conn", "entity_id"),
+                    ("rcsb_entity_annotation", "rcsb_nonpolymer_entity_annotation", "entity_id"),
+                    ("rcsb_entity_instance_annotation", "rcsb_nonpolymer_instance_annotation", "entity_id"),
+                ],
+                "branched": [
+                    ("rcsb_entity_feature", "rcsb_branched_entity_feature", "entity_id"),
+                    ("rcsb_entity_feature_summary", "rcsb_branched_entity_feature_summary", "entity_id"),
+                    ("rcsb_entity_instance_feature", "rcsb_branched_instance_feature", "entity_id"),
+                    ("rcsb_entity_instance_feature_summary", "rcsb_branched_instance_feature_summary", "entity_id"),
+                    ("rcsb_entity_instance_validation_feature", "rcsb_branched_instance_feature", "entity_id"),
+                    ("rcsb_entity_instance_validation_feature_summary", "rcsb_branched_instance_feature_summary", "entity_id"),
+                    ("rcsb_struct_conn", "rcsb_branched_struct_conn", "entity_id"),
+                    ("rcsb_entity_annotation", "rcsb_branched_entity_annotation", "entity_id"),
+                    ("rcsb_entity_instance_annotation", "rcsb_branched_instance_annotation", "entity_id"),
+                ],
+            }
+            ok = self.__sliceCategoriesByEntityType(dataContainer, categoryMapD)
+            return ok
+        except Exception as e:
+            logger.exception("%s for %s failing with %s", dataContainer.getName(), blockName, str(e))
+        return False
+
+    def __sliceCategoriesByEntityType(self, dataContainer, categoryMapD):
+        """Slice common entity categories into type specific entity categories.
+
+        Args:
+            dataContainer (object): mmcif.api.DataContainer object instance
+            categoryMapD (dict): {<entity type>: [(<source category>, <target category>, <entity id key>), ...], ...}
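+
+                An illustrative fragment (not the full map used above):
+                {"polymer": [("entity", "rcsb_polymer_entity", "id")]} copies rows
+                of "entity" describing polymer entities into "rcsb_polymer_entity".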
+
+        Returns:
+            bool: True for success or False otherwise
+
+        """
+        logger.debug("Starting with %r", dataContainer.getName())
+        try:
+            if not (dataContainer.exists("entry") and dataContainer.exists("entity")):
+                return False
+            eObj = dataContainer.getObj("entity")
+            eCount = eObj.getRowCount()
+            eTypeD = {eObj.getValue("id", ii): eObj.getValue("type", ii) for ii in range(eCount) if eObj.getValue("type", ii)}
+            eTypes = list(set(eTypeD.values()))
+            logger.debug("%s entity types %r map %r", dataContainer.getName(), eTypes, eTypeD)
+            for eType, catTupL in categoryMapD.items():
+                if eType in eTypes:
+                    # create new categories as needed
+                    for srcCatN, dstCatN, entityIdKey in catTupL:
+                        if dataContainer.exists(srcCatN):
+                            if not dataContainer.exists(dstCatN):
+                                dataContainer.append(DataCategory(dstCatN, attributeNameList=self.__dApi.getAttributeNameList(dstCatN)))
+                            srcObj = dataContainer.getObj(srcCatN)
+                            dstObj = dataContainer.getObj(dstCatN)
+                            jj = dstObj.getRowCount()
+                            for ii in range(srcObj.getRowCount()):
+                                entityId = srcObj.getValue(entityIdKey, ii)
+                                logger.debug("%s srcCatN %s row %d key %r entityId %r", dataContainer.getName(), srcCatN, ii, entityIdKey, entityId)
+                                if eTypeD[entityId] != eType:
+                                    continue
+                                for dstAtName in dstObj.getAttributeList():
+                                    srcAtName = entityIdKey if dstAtName == "entity_id" else dstAtName
+                                    logger.debug(
+                                        "%s entityId %r srcCatN %r srcAtName %s dstCatN %s dstAtName %s", dataContainer.getName(), entityId, srcCatN, srcAtName, dstCatN, dstAtName
+                                    )
+                                    if srcObj.hasAttribute(srcAtName):
+                                        tS = srcObj.getValue(srcAtName, ii)
+                                        logger.debug("%s entityId %r srcCatN %r srcAtName %s value %s", dataContainer.getName(), entityId, srcCatN, srcAtName, tS)
+                                        if srcAtName in ["formula_weight"]:
+                                            # dalton to kiloDalton
+                                            try:
+                                                tV = float(tS) / 1000.0
+                                                tS = "%.3f" % tV
+                                            except Exception:
+                                                tS = "?"
+                                        if dstAtName in ["ordinal"]:
+                                            tS = jj + 1
+                                        _ = dstObj.setValue(tS, dstAtName, jj)
+                                    else:
+                                        logger.debug("Missing srcCatN %s srcAtName %s", srcCatN, srcAtName)
+                                        _ = dstObj.setValue("?", dstAtName, jj)
+                                jj += 1
+            return True
+        except Exception as e:
+            logger.exception("%s failing with %s", dataContainer.getName(), str(e))
+        return False
+
+    #
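+    # Note on the slice helper above (values illustrative): "formula_weight" is
+    # copied with a Da to kDa conversion, so "18012.3" becomes "18.012", and
+    # unparsable values fall back to "?".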
+    def buildEntityAnnotations(self, dataContainer, catName, **kwargs):
+        """Build category rcsb_entity_annotation ...
+
+        Example:
+            loop_
+            _rcsb_entity_annotation.ordinal
+            _rcsb_entity_annotation.entry_id
+            _rcsb_entity_annotation.entity_id
+            _rcsb_entity_annotation.annotation_id
+            _rcsb_entity_annotation.type
+            _rcsb_entity_annotation.name
+            _rcsb_entity_annotation.description
+            _rcsb_entity_annotation.annotation_lineage_id
+            _rcsb_entity_annotation.annotation_lineage_name
+            _rcsb_entity_annotation.annotation_lineage_depth
+            _rcsb_entity_annotation.provenance_source
+            _rcsb_entity_annotation.assignment_version
+
+        """
+        logger.debug("Starting with %r %r %r", dataContainer.getName(), catName, kwargs)
+        try:
+            if catName != "rcsb_entity_annotation":
+                return False
+            # Exit if source categories are missing
+            if not dataContainer.exists("entry"):
+                return False
+            #
+            # Create the new target category
+            if not dataContainer.exists(catName):
+                dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName)))
+            cObj = dataContainer.getObj(catName)
+            #
+            eObj = dataContainer.getObj("entry")
+            entryId = eObj.getValue("id", 0)
+            #
+            # ---------------
+            ii = cObj.getRowCount()
+            jj = 1
+            #
+            targetFeatureD = self.__getTargetComponentFeatures(dataContainer)
+            #
+            for (entityId, compId, filteredFeature) in targetFeatureD:
+                cObj.setValue(ii + 1, "ordinal", ii)
+                cObj.setValue(entryId, "entry_id", ii)
+                cObj.setValue(entityId, "entity_id", ii)
+                cObj.setValue(compId, "comp_id", ii)
+                cObj.setValue(filteredFeature, "type", ii)
+                cObj.setValue("entity_annotation_%d" % jj, "annotation_id", ii)
+                details = "Ligand targeted in this investigation"
+                cObj.setValue(details, "description", ii)
+                cObj.setValue(compId, "name", ii)
+                cObj.setValue("PDB", "provenance_source", ii)
+                cObj.setValue("V1.0", "assignment_version", ii)
+                #
+                jj += 1
+                ii += 1
+            #
+            skipBird = True
+            if not skipBird:
+                # BIRD type and class
+                birdFeatureD = self.__getBirdFeatures(dataContainer)
+                for (entityId, compId, prdId, filteredFeature), fName in birdFeatureD.items():
+                    cObj.setValue(ii + 1, "ordinal", ii)
+                    cObj.setValue(entryId, "entry_id", ii)
+                    cObj.setValue(entityId, "entity_id", ii)
+                    cObj.setValue(compId, "comp_id", ii)
+                    cObj.setValue(filteredFeature, "type", ii)
+                    cObj.setValue("entity_annotation_%d" % jj, "annotation_id", ii)
+                    if compId:
+                        details = "Non-polymer BIRD %s chemical component %s" % (prdId, compId)
+                    else:
+                        details = "Polymer BIRD %s entity %s" % (prdId, entityId)
+                    cObj.setValue(details, "description", ii)
+                    #
+                    cObj.setValue(fName, "name", ii)
+                    cObj.setValue("PDB", "provenance_source", ii)
+                    cObj.setValue("V1.0", "assignment_version", ii)
+                    #
+                    jj += 1
+                    ii += 1
+            return True
+        except Exception as e:
+            logger.exception("%s %s failing with %s", dataContainer.getName(), catName, str(e))
+        return False
diff --git a/rcsb/utils/dictionary/DictMethodEntityInstanceHelper.py b/rcsb/utils/dictionary/DictMethodEntityInstanceHelper.py
new file mode 100644
index 0000000..6b67f3c
--- /dev/null
+++ b/rcsb/utils/dictionary/DictMethodEntityInstanceHelper.py
@@ -0,0 +1,1776 @@
+##
+# File:    DictMethodEntityInstanceHelper.py
+# Author:  J. Westbrook
+# Date:    16-Jul-2019
+# Version: 0.001 Initial version
+#
+##
+"""
+This helper class implements methods supporting entity-instance-level functions in the RCSB dictionary extension.
+
+"""
+__docformat__ = "restructuredtext en"
+__author__ = "John Westbrook"
+__email__ = "jwest@rcsb.rutgers.edu"
+__license__ = "Apache 2.0"
+
+# pylint: disable=too-many-lines
+
+import logging
+import re
+import time
+from collections import OrderedDict
+
+from mmcif.api.DataCategory import DataCategory
+
+logger = logging.getLogger(__name__)
+
+
+class DictMethodEntityInstanceHelper(object):
+    """This helper class implements methods supporting entity-instance-level functions in the RCSB dictionary extension."""
+
+    def __init__(self, **kwargs):
+        """
+        Args:
+            resourceProvider: (obj) instance of DictMethodResourceProvider()
+            raiseExceptions: (bool, optional) flag to raise rather than handle exceptions
+
+        """
+        #
+        self._raiseExceptions = kwargs.get("raiseExceptions", False)
+        self.__wsPattern = re.compile(r"\s+", flags=re.UNICODE | re.MULTILINE)
+        self.__reNonDigit = re.compile(r"[^\d]+")
+        #
+        rP = kwargs.get("resourceProvider")
+        self.__commonU = rP.getResource("DictMethodCommonUtils instance") if rP else None
+        self.__dApi = rP.getResource("Dictionary API instance (pdbx_core)") if rP else None
+        self.__ccP = rP.getResource("ChemCompProvider instance") if rP else None
+        self.__rlsP = rP.getResource("RcsbLigandScoreProvider instance") if rP else None
+        #
+        logger.debug("Dictionary entity-instance level method helper init")
+
+    def buildContainerEntityInstanceIds(self, dataContainer, catName, **kwargs):
+        """
+        Build:
+
+        loop_
+        _rcsb_entity_instance_container_identifiers.entry_id
+        _rcsb_entity_instance_container_identifiers.entity_id
+        _rcsb_entity_instance_container_identifiers.entity_type
+        _rcsb_entity_instance_container_identifiers.asym_id
+        _rcsb_entity_instance_container_identifiers.auth_asym_id
+        _rcsb_entity_instance_container_identifiers.comp_id
+        _rcsb_entity_instance_container_identifiers.auth_seq_id
+        ...
+
+        """
+        logger.debug("Starting catName %s kwargs %r", catName, kwargs)
+        try:
+            if not (dataContainer.exists("entry") and dataContainer.exists("entity")):
+                return False
+            if not dataContainer.exists(catName):
+                dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName)))
+            #
+            cObj = dataContainer.getObj(catName)
+            asymD = self.__commonU.getInstanceIdMap(dataContainer)
+            npAuthAsymD = self.__commonU.getNonPolymerIdMap(dataContainer)
+            brAuthAsymD = self.__commonU.getBranchedIdMap(dataContainer)
+            seqIdMapAsymD = self.__commonU.getAuthToSeqIdMap(dataContainer)
+            #
+            for ii, asymId in enumerate(sorted(asymD)):
+                for k, v in asymD[asymId].items():
+                    cObj.setValue(v, k, ii)
+                v = ",".join(seqIdMapAsymD[asymId]) if asymId in seqIdMapAsymD else "?"
+                cObj.setValue(v, "auth_to_entity_poly_seq_mapping", ii)
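+            # (A sketch of one identifier row from getInstanceIdMap(), values
+            # illustrative: {"entry_id": "4ABC", "entity_id": "1",
+            # "entity_type": "polymer", "asym_id": "A", "auth_asym_id": "A"}.)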
+ cObj.setValue(v, "auth_to_entity_poly_seq_mapping", ii) + + ok = self.__addPdbxValidateAsymIds(dataContainer, asymD, npAuthAsymD, brAuthAsymD) + return ok + except Exception as e: + logger.exception("For %s failing with %s", catName, str(e)) + return False + + def __addPdbxValidateAsymIds(self, dataContainer, asymMapD, npAuthAsymMapD, brAuthAsymMapD): + """Internal method to insert Asym_id's into the following categories: + + _pdbx_validate_close_contact.rcsb_label_asym_id_1 + _pdbx_validate_close_contact.rcsb_label_asym_id_2 + _pdbx_validate_symm_contact.rcsb_label_asym_id_1 + _pdbx_validate_symm_contact.rcsb_label_asym_id_2 + _pdbx_validate_rmsd_bond.rcsb_label_asym_id_1 + _pdbx_validate_rmsd_bond.rcsb_label_asym_id_2 + _pdbx_validate_rmsd_angle.rcsb_label_asym_id_1 + _pdbx_validate_rmsd_angle.rcsb_label_asym_id_2 + _pdbx_validate_rmsd_angle.rcsb_label_asym_id_3 + _pdbx_validate_torsion.rcsb_label_asym_id + _pdbx_validate_peptide_omega.rcsb_label_asym_id_1 + _pdbx_validate_peptide_omega.rcsb_label_asym_id_2 + _pdbx_validate_chiral.rcsb_label_asym_id + _pdbx_validate_planes.rcsb_label_asym_id + _pdbx_validate_planes_atom.rcsb_label_asym_id + _pdbx_validate_main_chain_plane.rcsb_label_asym_id + _pdbx_validate_polymer_linkage.rcsb_label_asym_id_1 + _pdbx_validate_polymer_linkage.rcsb_label_asym_id_2 + """ + # + mD = { + "pdbx_validate_close_contact": [("auth_asym_id_1", "auth_seq_id_1", "rcsb_label_asym_id_1"), ("auth_asym_id_2", "auth_seq_id_2", "rcsb_label_asym_id_2")], + "pdbx_validate_symm_contact": [("auth_asym_id_1", "auth_seq_id_1", "rcsb_label_asym_id_1"), ("auth_asym_id_2", "auth_seq_id_2", "rcsb_label_asym_id_2")], + "pdbx_validate_rmsd_bond": [("auth_asym_id_1", "auth_seq_id_1", "rcsb_label_asym_id_1"), ("auth_asym_id_2", "auth_seq_id_2", "rcsb_label_asym_id_2")], + "pdbx_validate_rmsd_angle": [ + ("auth_asym_id_1", "auth_seq_id_1", "rcsb_label_asym_id_1"), + ("auth_asym_id_2", "auth_seq_id_2", "rcsb_label_asym_id_2"), + ("auth_asym_id_3", "auth_seq_id_3", "rcsb_label_asym_id_3"), + ], + "pdbx_validate_torsion": [("auth_asym_id", "auth_seq_id", "rcsb_label_asym_id")], + "pdbx_validate_peptide_omega": [("auth_asym_id_1", "auth_seq_id_1", "rcsb_label_asym_id_1"), ("auth_asym_id_2", "auth_seq_id_2", "rcsb_label_asym_id_2")], + "pdbx_validate_chiral": [("auth_asym_id", "auth_seq_id", "rcsb_label_asym_id")], + "pdbx_validate_planes": [("auth_asym_id", "auth_seq_id", "rcsb_label_asym_id")], + "pdbx_validate_planes_atom": [("auth_asym_id", "auth_seq_id", "rcsb_label_asym_id")], + "pdbx_validate_main_chain_plane": [("auth_asym_id", "auth_seq_id", "rcsb_label_asym_id")], + "pdbx_validate_polymer_linkage": [("auth_asym_id_1", "auth_seq_id_1", "rcsb_label_asym_id_1"), ("auth_asym_id_2", "auth_seq_id_2", "rcsb_label_asym_id_2")], + "pdbx_distant_solvent_atoms": [("auth_asym_id", "auth_seq_id", "rcsb_label_asym_id")], + } + # -- JDW + # polymer lookup + authAsymD = {} + for asymId, dD in asymMapD.items(): + if dD["entity_type"].lower() in ["polymer", "branched"]: + authAsymD[(dD["auth_asym_id"], "?")] = asymId + # + # non-polymer lookup + # + logger.debug("%s authAsymD %r", dataContainer.getName(), authAsymD) + for (authAsymId, seqId), dD in npAuthAsymMapD.items(): + if dD["entity_type"].lower() not in ["polymer", "branched"]: + authAsymD[(authAsymId, seqId)] = dD["asym_id"] + # + # branched lookup + logger.debug("%s authAsymD %r", dataContainer.getName(), authAsymD) + for (authAsymId, seqId), dD in brAuthAsymMapD.items(): + if dD["entity_type"].lower() in ["branched"]: + 
+        for (authAsymId, seqId), dD in brAuthAsymMapD.items():
+            if dD["entity_type"].lower() in ["branched"]:
+                authAsymD[(authAsymId, seqId)] = dD["asym_id"]
+        #
+        #
+        for catName, mTupL in mD.items():
+            if not dataContainer.exists(catName):
+                continue
+            cObj = dataContainer.getObj(catName)
+            for ii in range(cObj.getRowCount()):
+                for mTup in mTupL:
+                    try:
+                        authVal = cObj.getValue(mTup[0], ii)
+                    except Exception:
+                        authVal = "?"
+                    try:
+                        authSeqId = cObj.getValue(mTup[1], ii)
+                    except Exception:
+                        authSeqId = "?"
+
+                    # authVal = cObj.getValue(mTup[0], ii)
+                    # authSeqId = cObj.getValue(mTup[1], ii)
+                    #
+                    # logger.debug("%s %4d authAsymId %r authSeqId %r" % (catName, ii, authVal, authSeqId))
+                    #
+                    if (authVal, authSeqId) in authAsymD:
+                        if not cObj.hasAttribute(mTup[2]):
+                            cObj.appendAttribute(mTup[2])
+                        cObj.setValue(authAsymD[(authVal, authSeqId)], mTup[2], ii)
+                    elif (authVal, "?") in authAsymD:
+                        if not cObj.hasAttribute(mTup[2]):
+                            cObj.appendAttribute(mTup[2])
+                        cObj.setValue(authAsymD[(authVal, "?")], mTup[2], ii)
+                    else:
+                        if authVal not in ["."]:
+                            logger.warning("%s %s missing mapping auth asymId %s authSeqId %r", dataContainer.getName(), catName, authVal, authSeqId)
+                        if not cObj.hasAttribute(mTup[2]):
+                            cObj.appendAttribute(mTup[2])
+                        cObj.setValue("?", mTup[2], ii)
+
+        return True
+
+    def __initializeInstanceFeatureType(self, dataContainer, asymId, fCountD, countType="set"):
+        instTypeD = self.__commonU.getInstanceTypes(dataContainer)
+        eTupL = []
+        eType = instTypeD[asymId]
+        if eType == "polymer":
+            eTupL = self.__dApi.getEnumListWithDetail("rcsb_polymer_instance_feature_summary", "type")
+        elif eType in ["non-polymer", "water"]:
+            eTupL = self.__dApi.getEnumListWithDetail("rcsb_nonpolymer_instance_feature_summary", "type")
+        elif eType == "branched":
+            eTupL = self.__dApi.getEnumListWithDetail("rcsb_branched_instance_feature_summary", "type")
+        else:
+            logger.error("%r asymId %r eType %r", dataContainer.getName(), asymId, eType)
+        #
+        fTypeL = sorted([tup[0] for tup in eTupL])
+        #
+        for fType in fTypeL:
+            if countType == "set":
+                fCountD.setdefault(asymId, {}).setdefault(fType, set())
+            else:
+                fCountD.setdefault(asymId, {}).setdefault(fType, [])
+        #
+        return fCountD
+
+    # ---- JDW
+    def buildEntityInstanceFeatureSummaryPrev(self, dataContainer, catName, **kwargs):
+        """Build category rcsb_entity_instance_feature_summary (UPDATED)
+
+        Example:
+
+        loop_
+        _rcsb_entity_instance_feature_summary.ordinal
+        _rcsb_entity_instance_feature_summary.entry_id
+        _rcsb_entity_instance_feature_summary.entity_id
+        _rcsb_entity_instance_feature_summary.asym_id
+        _rcsb_entity_instance_feature_summary.auth_asym_id
+        #
+        _rcsb_entity_instance_feature_summary.type
+        _rcsb_entity_instance_feature_summary.count
+        _rcsb_entity_instance_feature_summary.coverage
+        # ...
+        """
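+        # Coverage sketch for the summary rows built below: coverage = (sum of
+        # feature monomer counts) / (enumerated polymer length); e.g. ranges 1-10
+        # and 21-25 on a 100-residue entity give (10 + 5) / 100 = 0.15.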
+ """ + logger.debug("Starting with %r %r %r", dataContainer.getName(), catName, kwargs) + try: + if catName != "rcsb_entity_instance_feature_summary": + return False + if not dataContainer.exists("rcsb_entity_instance_feature") and not dataContainer.exists("entry"): + return False + + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + # + eObj = dataContainer.getObj("entry") + entryId = eObj.getValue("id", 0) + # + sObj = dataContainer.getObj(catName) + fObj = dataContainer.getObj("rcsb_entity_instance_feature") + # + instEntityD = self.__commonU.getInstanceEntityMap(dataContainer) + entityPolymerLengthD = self.__commonU.getPolymerEntityLengthsEnumerated(dataContainer) + # typeList = self.__dApi.getEnumList("rcsb_entity_instance_feature_summary", "type", sortFlag=True) + asymAuthD = self.__commonU.getAsymAuthIdMap(dataContainer) + instIdMapD = self.__commonU.getInstanceIdMap(dataContainer) + + fCountD = OrderedDict() + fMonomerCountD = OrderedDict() + for ii in range(fObj.getRowCount()): + asymId = fObj.getValue("asym_id", ii) + # ---- initialize counts + # fCountD = self.__initializeInstanceFeatureType(dataContainer, asymId, fCountD, countType="set") + # fMonomerCountD = self.__initializeInstanceFeatureType(dataContainer, asymId, fMonomerCountD, countType="list") + # ---- + fType = fObj.getValue("type", ii) + fId = fObj.getValue("feature_id", ii) + fCountD.setdefault(asymId, {}).setdefault(fType, set()).add(fId) + # + tbegS = fObj.getValueOrDefault("feature_positions_beg_seq_id", ii, defaultValue=None) + tendS = fObj.getValueOrDefault("feature_positions_end_seq_id", ii, defaultValue=None) + if fObj.hasAttribute("feature_positions_beg_seq_id") and tbegS is not None and fObj.hasAttribute("feature_positions_end_seq_id") and tendS is not None: + begSeqIdL = str(fObj.getValue("feature_positions_beg_seq_id", ii)).split(";") + endSeqIdL = str(fObj.getValue("feature_positions_end_seq_id", ii)).split(";") + monCount = 0 + for begSeqId, endSeqId in zip(begSeqIdL, endSeqIdL): + try: + monCount += abs(int(endSeqId) - int(begSeqId) + 1) + except Exception: + logger.warning( + "%s fType %r fId %r bad sequence begSeqIdL %r endSeqIdL %r tbegS %r tendS %r", + dataContainer.getName(), + fType, + fId, + begSeqIdL, + endSeqIdL, + tbegS, + tendS, + ) + + fMonomerCountD.setdefault(asymId, {}).setdefault(fType, []).append(monCount) + elif fObj.hasAttribute("feature_positions_beg_seq_id") and tbegS: + seqIdL = str(fObj.getValue("feature_positions_beg_seq_id", ii)).split(";") + fMonomerCountD.setdefault(asymId, {}).setdefault(fType, []).append(len(seqIdL)) + # + logger.debug("%s fCountD %r", entryId, fCountD) + # + + ii = 0 + for asymId, fTypeD in fCountD.items(): + entityId = instEntityD[asymId] + authAsymId = asymAuthD[asymId] + for fType, fS in fTypeD.items(): + sObj.setValue(ii + 1, "ordinal", ii) + sObj.setValue(entryId, "entry_id", ii) + sObj.setValue(entityId, "entity_id", ii) + sObj.setValue(asymId, "asym_id", ii) + sObj.setValue(authAsymId, "auth_asym_id", ii) + # add comp + if asymId in instIdMapD and "comp_id" in instIdMapD[asymId] and instIdMapD[asymId]["comp_id"]: + sObj.setValue(instIdMapD[asymId]["comp_id"], "comp_id", ii) + sObj.setValue(fType, "type", ii) + # + if fType.startswith("UNOBSERVED") and asymId in fMonomerCountD and fType in fMonomerCountD[asymId]: + fCount = sum(fMonomerCountD[asymId][fType]) + else: + fCount = len(fS) + sObj.setValue(fCount, "count", ii) + fracC = 0.0 + if asymId in 
+                    if asymId in fMonomerCountD and fType in fMonomerCountD[asymId] and entityId in entityPolymerLengthD:
+                        fracC = float(sum(fMonomerCountD[asymId][fType])) / float(entityPolymerLengthD[entityId])
+                    sObj.setValue(round(fracC, 5), "coverage", ii)
+                    if (
+                        fType in ["CATH", "SCOP", "HELIX_P", "SHEET", "UNASSIGNED_SEC_STRUCT", "UNOBSERVED_RESIDUE_XYZ", "ZERO_OCCUPANCY_RESIDUE_XYZ"]
+                        and asymId in fMonomerCountD
+                        and fType in fMonomerCountD[asymId]
+                    ):
+                        minL = min(fMonomerCountD[asymId][fType]) if fMonomerCountD[asymId][fType] else 0
+                        maxL = max(fMonomerCountD[asymId][fType]) if fMonomerCountD[asymId][fType] else 0
+                        sObj.setValue(minL, "minimum_length", ii)
+                        sObj.setValue(maxL, "maximum_length", ii)
+                    ii += 1
+        except Exception as e:
+            logger.exception("Failing for %s with %s", dataContainer.getName(), str(e))
+        return True
+
+    # ---- JDW
+
+    def buildEntityInstanceFeatures(self, dataContainer, catName, **kwargs):
+        """Build category rcsb_entity_instance_feature ...
+
+        Example:
+            loop_
+            _rcsb_entity_instance_feature.ordinal
+            _rcsb_entity_instance_feature.entry_id
+            _rcsb_entity_instance_feature.entity_id
+            _rcsb_entity_instance_feature.asym_id
+            _rcsb_entity_instance_feature.auth_asym_id
+            _rcsb_entity_instance_feature.feature_id
+            _rcsb_entity_instance_feature.type
+            _rcsb_entity_instance_feature.name
+            _rcsb_entity_instance_feature.description
+            _rcsb_entity_instance_feature.reference_scheme
+            _rcsb_entity_instance_feature.provenance_source
+            _rcsb_entity_instance_feature.assignment_version
+            _rcsb_entity_instance_feature.feature_positions_beg_seq_id
+            _rcsb_entity_instance_feature.feature_positions_end_seq_id
+            _rcsb_entity_instance_feature.feature_positions_value
+
+        """
+        doLineage = False
+        logger.debug("Starting with %r %r %r", dataContainer.getName(), catName, kwargs)
+        try:
+            if catName != "rcsb_entity_instance_feature":
+                return False
+            # Exit if source categories are missing
+            if not dataContainer.exists("entry"):
+                return False
+            #
+            # Create the new target category
+            if not dataContainer.exists(catName):
+                dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName)))
+            cObj = dataContainer.getObj(catName)
+            #
+            rP = kwargs.get("resourceProvider")
+
+            eObj = dataContainer.getObj("entry")
+            entryId = eObj.getValue("id", 0)
+            #
+            asymIdD = self.__commonU.getInstanceEntityMap(dataContainer)
+            asymAuthIdD = self.__commonU.getAsymAuthIdMap(dataContainer)
+            asymIdRangesD = self.__commonU.getInstancePolymerRanges(dataContainer)
+            pAuthAsymD = self.__commonU.getPolymerIdMap(dataContainer)
+            instTypeD = self.__commonU.getInstanceTypes(dataContainer)
+            # ---------------
+            # Add CATH assignments
+            cathU = rP.getResource("CathProvider instance") if rP else None
+            ii = cObj.getRowCount()
+            #
+            for asymId, authAsymId in asymAuthIdD.items():
+                if instTypeD[asymId] not in ["polymer", "branched"]:
+                    continue
+                entityId = asymIdD[asymId]
+                dL = cathU.getCathResidueRanges(entryId.lower(), authAsymId)
+                logger.debug("%s asymId %s authAsymId %s dL %r", entryId, asymId, authAsymId, dL)
+                vL = cathU.getCathVersions(entryId.lower(), authAsymId)
+                for (cathId, domId, tId, authSeqBeg, authSeqEnd) in dL:
+                    begSeqId = pAuthAsymD[(authAsymId, authSeqBeg, None)]["seq_id"] if (authAsymId, authSeqBeg, None) in pAuthAsymD else None
+                    endSeqId = pAuthAsymD[(authAsymId, authSeqEnd, None)]["seq_id"] if (authAsymId, authSeqEnd, None) in pAuthAsymD else None
+                    if not (begSeqId and endSeqId):
+                        # take the full chain
+                        begSeqId = asymIdRangesD[asymId]["begSeqId"] if asymId in asymIdRangesD else None
+                        endSeqId = asymIdRangesD[asymId]["endSeqId"] if asymId in asymIdRangesD else None
+                        if not (begSeqId and endSeqId):
+                            logger.info(
+                                "%s CATH cathId %r domId %r tId %r asymId %r authAsymId %r authSeqBeg %r authSeqEnd %r",
+                                entryId,
+                                cathId,
+                                domId,
+                                tId,
+                                asymId,
+                                authAsymId,
+                                authSeqBeg,
+                                authSeqEnd,
+                            )
+                            continue
+
+                    cObj.setValue(ii + 1, "ordinal", ii)
+                    cObj.setValue(entryId, "entry_id", ii)
+                    cObj.setValue(entityId, "entity_id", ii)
+                    cObj.setValue(asymId, "asym_id", ii)
+                    cObj.setValue(authAsymId, "auth_asym_id", ii)
+                    cObj.setValue("CATH", "type", ii)
+                    #
+                    cObj.setValue(str(cathId), "feature_id", ii)
+                    # cObj.setValue(str(domId), "feature_id", ii)
+                    # cObj.setValue(cathId, "name", ii)
+                    cObj.setValue(cathU.getCathName(cathId), "name", ii)
+                    #
+                    if doLineage:
+                        cObj.setValue(";".join(cathU.getNameLineage(cathId)), "annotation_lineage_name", ii)
+                        idLinL = cathU.getIdLineage(cathId)
+                        cObj.setValue(";".join(idLinL), "annotation_lineage_id", ii)
+                        cObj.setValue(";".join([str(jj) for jj in range(1, len(idLinL) + 1)]), "annotation_lineage_depth", ii)
+                    #
+                    #
+                    cObj.setValue(begSeqId, "feature_positions_beg_seq_id", ii)
+                    cObj.setValue(endSeqId, "feature_positions_end_seq_id", ii)
+                    #
+                    cObj.setValue("PDB entity", "reference_scheme", ii)
+                    cObj.setValue("CATH", "provenance_source", ii)
+                    cObj.setValue(vL[0], "assignment_version", ii)
+                    #
+                    ii += 1
+            # ------------
+            # Add SCOP assignments
+            oldCode = False
+            scopU = rP.getResource("ScopProvider instance") if rP else None
+            for asymId, authAsymId in asymAuthIdD.items():
+                if instTypeD[asymId] not in ["polymer", "branched"]:
+                    continue
+                entityId = asymIdD[asymId]
+                dL = scopU.getScopResidueRanges(entryId.lower(), authAsymId)
+                version = scopU.getScopVersion()
+                for (sunId, domId, sccs, tId, authSeqBeg, authSeqEnd) in dL:
+                    begSeqId = pAuthAsymD[(authAsymId, authSeqBeg, None)]["seq_id"] if (authAsymId, authSeqBeg, None) in pAuthAsymD else None
+                    endSeqId = pAuthAsymD[(authAsymId, authSeqEnd, None)]["seq_id"] if (authAsymId, authSeqEnd, None) in pAuthAsymD else None
+                    # logger.info("%s (first) begSeqId %r endSeqId %r", entryId, begSeqId, endSeqId)
+                    if not (begSeqId and endSeqId):
+                        # try another full range
+                        # begSeqId = asymIdRangesD[asymId]["begAuthSeqId"] if asymId in asymIdRangesD and "begAuthSeqId" in asymIdRangesD[asymId] else None
+                        # endSeqId = asymIdRangesD[asymId]["endAuthSeqId"] if asymId in asymIdRangesD and "endAuthSeqId" in asymIdRangesD[asymId] else None
+                        begSeqId = asymIdRangesD[asymId]["begSeqId"] if asymId in asymIdRangesD else None
+                        endSeqId = asymIdRangesD[asymId]["endSeqId"] if asymId in asymIdRangesD else None
+                        # logger.info("%s (altd) begSeqId %r endSeqId %r", entryId, begSeqId, endSeqId)
+                        if not (begSeqId and endSeqId):
+                            logger.debug(
+                                "%s unqualified SCOP sunId %r domId %r sccs %r asymId %r authAsymId %r authSeqBeg %r authSeqEnd %r",
+                                entryId,
+                                sunId,
+                                domId,
+                                sccs,
+                                asymId,
+                                authAsymId,
+                                authSeqBeg,
+                                authSeqEnd,
+                            )
+                            continue
+
+                    cObj.setValue(ii + 1, "ordinal", ii)
+                    cObj.setValue(entryId, "entry_id", ii)
+                    cObj.setValue(entityId, "entity_id", ii)
+                    cObj.setValue(asymId, "asym_id", ii)
+                    cObj.setValue(authAsymId, "auth_asym_id", ii)
+                    cObj.setValue("SCOP", "type", ii)
+                    #
+                    # cObj.setValue(str(sunId), "domain_id", ii)
+                    cObj.setValue(domId, "feature_id", ii)
+                    cObj.setValue(scopU.getScopName(sunId), "name", ii)
+                    #
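+                    # (Lineage fields are emitted as parallel ";"-joined lists:
+                    # annotation_lineage_id, _name and _depth carry one token per
+                    # level, with depth simply 1..len(lineage).)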
+                    if doLineage:
+                        tL = [t if t is not None else "" for t in scopU.getNameLineage(sunId)]
+                        cObj.setValue(";".join(tL), "annotation_lineage_name", ii)
+                        idLinL = scopU.getIdLineage(sunId)
+                        cObj.setValue(";".join([str(t) for t in idLinL]), "annotation_lineage_id", ii)
+                        cObj.setValue(";".join([str(jj) for jj in range(1, len(idLinL) + 1)]), "annotation_lineage_depth", ii)
+                    #
+                    cObj.setValue(begSeqId, "feature_positions_beg_seq_id", ii)
+                    cObj.setValue(endSeqId, "feature_positions_end_seq_id", ii)
+                    if oldCode:
+                        if begSeqId is not None and endSeqId is not None:
+                            if begSeqId == 0:
+                                begSeqId += 1
+                                endSeqId += 1
+                            cObj.setValue(begSeqId, "feature_positions_beg_seq_id", ii)
+                            cObj.setValue(endSeqId, "feature_positions_end_seq_id", ii)
+                        else:
+                            tSeqBeg = asymIdRangesD[asymId]["begAuthSeqId"] if asymId in asymIdRangesD and "begAuthSeqId" in asymIdRangesD[asymId] else None
+                            cObj.setValue(tSeqBeg, "feature_positions_beg_seq_id", ii)
+                            tSeqEnd = asymIdRangesD[asymId]["endAuthSeqId"] if asymId in asymIdRangesD and "endAuthSeqId" in asymIdRangesD[asymId] else None
+                            cObj.setValue(tSeqEnd, "feature_positions_end_seq_id", ii)
+                    #
+                    cObj.setValue("PDB entity", "reference_scheme", ii)
+                    cObj.setValue("SCOPe", "provenance_source", ii)
+                    cObj.setValue(version, "assignment_version", ii)
+                    #
+                    ii += 1
+            # ------------
+            # Add sheet features
+            instSheetRangeD = self.__commonU.getProtSheetFeatures(dataContainer)
+            sheetSenseD = self.__commonU.getProtSheetSense(dataContainer)
+            for sId, sD in instSheetRangeD.items():
+                for asymId, rTupL in sD.items():
+                    entityId = asymIdD[asymId]
+                    authAsymId = asymAuthIdD[asymId]
+                    cObj.setValue(ii + 1, "ordinal", ii)
+                    cObj.setValue(entryId, "entry_id", ii)
+                    cObj.setValue(entityId, "entity_id", ii)
+                    cObj.setValue(asymId, "asym_id", ii)
+                    cObj.setValue(authAsymId, "auth_asym_id", ii)
+                    cObj.setValue("SHEET", "type", ii)
+                    #
+                    cObj.setValue(str(sId), "feature_id", ii)
+                    cObj.setValue("sheet", "name", ii)
+                    if sId in sheetSenseD:
+                        cObj.setValue(sheetSenseD[sId] + " sense sheet", "description", ii)
+                    #
+                    tSeqId = ";".join([str(rTup[0]) for rTup in rTupL])
+                    cObj.setValue(tSeqId, "feature_positions_beg_seq_id", ii)
+                    tSeqId = ";".join([str(rTup[1]) for rTup in rTupL])
+                    cObj.setValue(tSeqId, "feature_positions_end_seq_id", ii)
+                    #
+                    cObj.setValue("PDB entity", "reference_scheme", ii)
+                    cObj.setValue("PROMOTIF", "provenance_source", ii)
+                    cObj.setValue("V1.0", "assignment_version", ii)
+                    #
+                    ii += 1
+            # ------------------
+            # Helix features
+            helixRangeD = self.__commonU.getProtHelixFeatures(dataContainer)
+            for hId, hL in helixRangeD.items():
+                for (asymId, begSeqId, endSeqId) in hL:
+                    entityId = asymIdD[asymId]
+                    authAsymId = asymAuthIdD[asymId]
+                    cObj.setValue(ii + 1, "ordinal", ii)
+                    cObj.setValue(entryId, "entry_id", ii)
+                    cObj.setValue(entityId, "entity_id", ii)
+                    cObj.setValue(asymId, "asym_id", ii)
+                    cObj.setValue(authAsymId, "auth_asym_id", ii)
+                    cObj.setValue("HELIX_P", "type", ii)
+                    #
+                    cObj.setValue(str(hId), "feature_id", ii)
+                    cObj.setValue("helix", "name", ii)
+                    #
+                    cObj.setValue(begSeqId, "feature_positions_beg_seq_id", ii)
+                    cObj.setValue(endSeqId, "feature_positions_end_seq_id", ii)
+                    #
+                    cObj.setValue("PDB entity", "reference_scheme", ii)
+                    cObj.setValue("PROMOTIF", "provenance_source", ii)
+                    cObj.setValue("V1.0", "assignment_version", ii)
+                    #
+                    ii += 1
+            #
+            # ------------------
+            # Unassigned SS features
+            unassignedRangeD = self.__commonU.getProtUnassignedSecStructFeatures(dataContainer)
+            for asymId, rTupL in unassignedRangeD.items():
+                if not rTupL:
+                    continue
+                entityId = asymIdD[asymId]
+                authAsymId = asymAuthIdD[asymId]
+                cObj.setValue(ii + 1, "ordinal", ii)
+                cObj.setValue(entryId, "entry_id", ii)
+                cObj.setValue(entityId, "entity_id", ii)
+                cObj.setValue(asymId, "asym_id", ii)
+                cObj.setValue(authAsymId, "auth_asym_id", ii)
+                cObj.setValue("UNASSIGNED_SEC_STRUCT", "type", ii)
+                #
+                cObj.setValue(str(1), "feature_id", ii)
+                cObj.setValue("unassigned secondary structure", "name", ii)
+                #
+                cObj.setValue(";".join([str(rTup[0]) for rTup in rTupL]), "feature_positions_beg_seq_id", ii)
+                cObj.setValue(";".join([str(rTup[1]) for rTup in rTupL]), "feature_positions_end_seq_id", ii)
+                #
+                cObj.setValue("PDB entity", "reference_scheme", ii)
+                cObj.setValue("PROMOTIF", "provenance_source", ii)
+                cObj.setValue("V1.0", "assignment_version", ii)
+                #
+                ii += 1
+            #
+            cisPeptideD = self.__commonU.getCisPeptides(dataContainer)
+            for cId, cL in cisPeptideD.items():
+                for (asymId, begSeqId, endSeqId, modelId, omegaAngle) in cL:
+                    entityId = asymIdD[asymId]
+                    authAsymId = asymAuthIdD[asymId]
+                    cObj.setValue(ii + 1, "ordinal", ii)
+                    cObj.setValue(entryId, "entry_id", ii)
+                    cObj.setValue(entityId, "entity_id", ii)
+                    cObj.setValue(asymId, "asym_id", ii)
+                    cObj.setValue(authAsymId, "auth_asym_id", ii)
+                    cObj.setValue("CIS-PEPTIDE", "type", ii)
+                    cObj.setValue(str(cId), "feature_id", ii)
+                    cObj.setValue("cis-peptide", "name", ii)
+                    #
+                    cObj.setValue(begSeqId, "feature_positions_beg_seq_id", ii)
+                    cObj.setValue(endSeqId, "feature_positions_end_seq_id", ii)
+                    #
+                    cObj.setValue("PDB entity", "reference_scheme", ii)
+                    cObj.setValue("PDB", "provenance_source", ii)
+                    cObj.setValue("V1.0", "assignment_version", ii)
+                    tS = "cis-peptide bond in model %d with omega angle %.2f" % (modelId, omegaAngle)
+                    cObj.setValue(tS, "description", ii)
+                    #
+                    ii += 1
+            #
+            targetSiteD = self.__commonU.getTargetSiteInfo(dataContainer)
+            ligandSiteD = self.__commonU.getLigandSiteInfo(dataContainer)
+            for tId, tL in targetSiteD.items():
+                aD = OrderedDict()
+                for tD in tL:
+                    aD.setdefault(tD["asymId"], []).append((tD["compId"], tD["seqId"]))
+                for asymId, aL in aD.items():
+                    entityId = asymIdD[asymId]
+                    authAsymId = asymAuthIdD[asymId]
+                    cObj.setValue(ii + 1, "ordinal", ii)
+                    cObj.setValue(entryId, "entry_id", ii)
+                    cObj.setValue(entityId, "entity_id", ii)
+                    cObj.setValue(asymId, "asym_id", ii)
+                    cObj.setValue(authAsymId, "auth_asym_id", ii)
+                    cObj.setValue("BINDING_SITE", "type", ii)
+                    cObj.setValue(str(tId), "feature_id", ii)
+                    cObj.setValue("binding_site", "name", ii)
+                    #
+                    cObj.setValue(";".join([tup[0] for tup in aL]), "feature_positions_beg_comp_id", ii)
+                    cObj.setValue(";".join([tup[1] for tup in aL]), "feature_positions_beg_seq_id", ii)
+                    #
+                    cObj.setValue("PDB entity", "reference_scheme", ii)
+                    cObj.setValue("PDB", "provenance_source", ii)
+                    cObj.setValue("V1.0", "assignment_version", ii)
+                    if tId in ligandSiteD:
+                        cObj.setValue(ligandSiteD[tId]["description"], "description", ii)
+                        if ligandSiteD[tId]["siteLabel"]:
+                            cObj.setValue(ligandSiteD[tId]["siteLabel"], "name", ii)
+                    #
+                    ii += 1
+            #
+            unObsPolyResRngD = self.__commonU.getUnobservedPolymerResidueInfo(dataContainer)
+            for (modelId, asymId, zeroOccFlag), rTupL in unObsPolyResRngD.items():
+                entityId = asymIdD[asymId]
+                authAsymId = asymAuthIdD[asymId]
+                cObj.setValue(ii + 1, "ordinal", ii)
+                cObj.setValue(entryId, "entry_id", ii)
+                cObj.setValue(entityId, "entity_id", ii)
+                cObj.setValue(asymId, "asym_id", ii)
+                cObj.setValue(authAsymId, "auth_asym_id", ii)
+                #
+                if zeroOccFlag:
+                    cObj.setValue("ZERO_OCCUPANCY_RESIDUE_XYZ", "type", ii)
+                    tS = "All atom coordinates for this residue are reported with zero-occupancy in model %s" % modelId
"description", ii) + cObj.setValue("residue coordinates with zero occupancy", "name", ii) + else: + cObj.setValue("UNOBSERVED_RESIDUE_XYZ", "type", ii) + tS = "No coordinates for this residue are reported in model %s" % modelId + cObj.setValue(tS, "description", ii) + cObj.setValue("unmodeled residue", "name", ii) + # + cObj.setValue(str(1), "feature_id", ii) + # + cObj.setValue(";".join([str(rTup[0]) for rTup in rTupL]), "feature_positions_beg_seq_id", ii) + cObj.setValue(";".join([str(rTup[1]) for rTup in rTupL]), "feature_positions_end_seq_id", ii) + # + cObj.setValue("PDB entity", "reference_scheme", ii) + cObj.setValue("PDB", "provenance_source", ii) + cObj.setValue("V1.0", "assignment_version", ii) + # + ii += 1 + + unObsPolyAtomRngD = self.__commonU.getUnobservedPolymerAtomInfo(dataContainer) + for (modelId, asymId, zeroOccFlag), rTupL in unObsPolyAtomRngD.items(): + entityId = asymIdD[asymId] + authAsymId = asymAuthIdD[asymId] + cObj.setValue(ii + 1, "ordinal", ii) + cObj.setValue(entryId, "entry_id", ii) + cObj.setValue(entityId, "entity_id", ii) + cObj.setValue(asymId, "asym_id", ii) + cObj.setValue(authAsymId, "auth_asym_id", ii) + # + if zeroOccFlag: + cObj.setValue("ZERO_OCCUPANCY_ATOM_XYZ", "type", ii) + tS = "Some atom coordinates in this residue are reported with zero-occupancy in model %s" % modelId + cObj.setValue(tS, "description", ii) + cObj.setValue("atom coordinates with zero occupancy", "name", ii) + else: + cObj.setValue("UNOBSERVED_ATOM_XYZ", "type", ii) + tS = "Some atom coordinates in this residue are not reported in model %s" % modelId + cObj.setValue(tS, "description", ii) + cObj.setValue("partially modeled residue", "name", ii) + # + cObj.setValue(str(1), "feature_id", ii) + # + cObj.setValue(";".join([str(rTup[0]) for rTup in rTupL]), "feature_positions_beg_seq_id", ii) + cObj.setValue(";".join([str(rTup[1]) for rTup in rTupL]), "feature_positions_end_seq_id", ii) + # + cObj.setValue("PDB entity", "reference_scheme", ii) + cObj.setValue("PDB", "provenance_source", ii) + cObj.setValue("V1.0", "assignment_version", ii) + # + ii += 1 + + npbD = self.__commonU.getBoundNonpolymersByInstance(dataContainer) + jj = 1 + for asymId, rTupL in npbD.items(): + for rTup in rTupL: + if rTup.connectType in ["covalent bond"]: + fType = "HAS_COVALENT_LINKAGE" + fId = "COVALENT_LINKAGE_%d" % jj + + elif rTup.connectType in ["metal coordination"]: + fType = "HAS_METAL_COORDINATION_LINKAGE" + fId = "METAL_COORDINATION_LINKAGE_%d" % jj + else: + continue + + entityId = asymIdD[asymId] + authAsymId = asymAuthIdD[asymId] + cObj.setValue(ii + 1, "ordinal", ii) + cObj.setValue(entryId, "entry_id", ii) + cObj.setValue(entityId, "entity_id", ii) + cObj.setValue(asymId, "asym_id", ii) + cObj.setValue(authAsymId, "auth_asym_id", ii) + cObj.setValue(rTup.targetCompId, "comp_id", ii) + cObj.setValue(fId, "feature_id", ii) + cObj.setValue(fType, "type", ii) + # + # ("targetCompId", "connectType", "partnerCompId", "partnerAsymId", "partnerEntityType", "bondDistance", "bondOrder") + cObj.setValue( + ";".join( + ["%s has %s with %s instance %s in model 1" % (rTup.targetCompId, rTup.connectType, rTup.partnerEntityType, rTup.partnerAsymId) for rTup in rTupL] + ), + "feature_value_details", + ii, + ) + cObj.setValue(";".join([rTup.partnerCompId if rTup.partnerCompId else "?" for rTup in rTupL]), "feature_value_comp_id", ii) + cObj.setValue(";".join([rTup.bondDistance if rTup.bondDistance else "?" for rTup in rTupL]), "feature_value_reported", ii) + cObj.setValue(";".join(["?" 
+                    cObj.setValue(";".join(["?" for rTup in rTupL]), "feature_value_reference", ii)
+                    cObj.setValue(";".join(["?" for rTup in rTupL]), "feature_value_uncertainty_estimate", ii)
+                    cObj.setValue(";".join(["?" for rTup in rTupL]), "feature_value_uncertainty_estimate_type", ii)
+
+                    cObj.setValue("PDB", "provenance_source", ii)
+                    cObj.setValue("V1.0", "assignment_version", ii)
+                    #
+                    ii += 1
+                    jj += 1
+
+            return True
+        except Exception as e:
+            logger.exception("%s %s failing with %s", dataContainer.getName(), catName, str(e))
+        return False
+
+    def addProtSecStructInfo(self, dataContainer, catName, **kwargs):
+        """
+        Add category rcsb_prot_sec_struct_info.
+
+        """
+        try:
+            logger.debug("Starting with %r %r %r", dataContainer.getName(), catName, kwargs)
+            # Exit if source categories are missing
+            if not dataContainer.exists("entry") and not (dataContainer.exists("struct_conf") or dataContainer.exists("struct_sheet_range")):
+                return False
+            #
+            # Create the new target category rcsb_prot_sec_struct_info
+            if not dataContainer.exists(catName):
+                dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName)))
+            sD = self.__commonU.getProtSecStructFeatures(dataContainer)
+            # catName = rcsb_prot_sec_struct_info
+            cObj = dataContainer.getObj(catName)
+            #
+            xObj = dataContainer.getObj("entry")
+            entryId = xObj.getValue("id", 0)
+            #
+            for ii, asymId in enumerate(sD["helixCountD"]):
+                cObj.setValue(entryId, "entry_id", ii)
+                cObj.setValue(asymId, "label_asym_id", ii)
+                #
+                cObj.setValue(sD["helixCountD"][asymId], "helix_count", ii)
+                cObj.setValue(sD["sheetStrandCountD"][asymId], "beta_strand_count", ii)
+                cObj.setValue(sD["unassignedCountD"][asymId], "unassigned_count", ii)
+                #
+                cObj.setValue(",".join([str(t) for t in sD["helixLengthD"][asymId]]), "helix_length", ii)
+                cObj.setValue(",".join([str(t) for t in sD["sheetStrandLengthD"][asymId]]), "beta_strand_length", ii)
+                cObj.setValue(",".join([str(t) for t in sD["unassignedLengthD"][asymId]]), "unassigned_length", ii)
+
+                cObj.setValue("%.2f" % (100.0 * sD["helixFracD"][asymId]), "helix_coverage_percent", ii)
+                cObj.setValue("%.2f" % (100.0 * sD["sheetStrandFracD"][asymId]), "beta_strand_coverage_percent", ii)
+                cObj.setValue("%.2f" % (100.0 * sD["unassignedFracD"][asymId]), "unassigned_coverage_percent", ii)
+
+                cObj.setValue(",".join(sD["sheetSenseD"][asymId]), "beta_sheet_sense", ii)
+                cObj.setValue(",".join([str(t) for t in sD["sheetFullStrandCountD"][asymId]]), "beta_sheet_strand_count", ii)
+
+                cObj.setValue(sD["featureMonomerSequenceD"][asymId], "feature_monomer_sequence", ii)
+                cObj.setValue(sD["featureSequenceD"][asymId], "feature_sequence", ii)
+
+            return True
+        except Exception as e:
+            logger.exception("For %s %r failing with %s", dataContainer.getName(), catName, str(e))
+        return False
+
+    def addConnectionDetails(self, dataContainer, catName, **kwargs):
+        """Build rcsb_struct_conn category -
+
+        Args:
+            dataContainer (object): mmcif.api.DataContainer object instance
+            catName (str): category name
+
+        Returns:
+            bool: True for success or False otherwise
+
+        Example:
+            loop_
+            _rcsb_struct_conn.ordinal_id
+            _rcsb_struct_conn.id
+            _rcsb_struct_conn.conn_type
+            _rcsb_struct_conn.connect_target_label_comp_id
+            _rcsb_struct_conn.connect_target_label_asym_id
+            _rcsb_struct_conn.connect_target_label_seq_id
+            _rcsb_struct_conn.connect_target_label_atom_id
+            _rcsb_struct_conn.connect_target_label_alt_id
+            _rcsb_struct_conn.connect_target_auth_asym_id
+            _rcsb_struct_conn.connect_target_auth_seq_id
+            _rcsb_struct_conn.connect_target_symmetry
+            _rcsb_struct_conn.connect_partner_label_comp_id
+            _rcsb_struct_conn.connect_partner_label_asym_id
+            _rcsb_struct_conn.connect_partner_label_seq_id
+            _rcsb_struct_conn.connect_partner_label_atom_id
+            _rcsb_struct_conn.connect_partner_label_alt_id
+            _rcsb_struct_conn.connect_partner_symmetry
+            _rcsb_struct_conn.details
+
+            # - - - - data truncated for brevity - - - -
+        """
+        try:
+            logger.debug("Starting with %r %r %r", dataContainer.getName(), catName, kwargs)
+            # Exit if source categories are missing
+            if not dataContainer.exists("entry") and not dataContainer.exists("struct_conn"):
+                return False
+            #
+            # Create the new target category rcsb_struct_conn
+            if not dataContainer.exists(catName):
+                dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName)))
+            cDL = self.__commonU.getInstanceConnections(dataContainer)
+            asymIdD = self.__commonU.getInstanceEntityMap(dataContainer)
+            asymAuthIdD = self.__commonU.getAsymAuthIdMap(dataContainer)
+            #
+            # catName = rcsb_struct_conn
+            cObj = dataContainer.getObj(catName)
+            #
+            xObj = dataContainer.getObj("entry")
+            entryId = xObj.getValue("id", 0)
+            #
+            for ii, cD in enumerate(cDL):
+                asymId = cD["connect_target_label_asym_id"]
+                entityId = asymIdD[asymId]
+                authAsymId = asymAuthIdD[asymId] if asymId in asymAuthIdD else None
+                cObj.setValue(ii + 1, "ordinal_id", ii)
+                cObj.setValue(entryId, "entry_id", ii)
+                cObj.setValue(asymId, "asym_id", ii)
+                cObj.setValue(entityId, "entity_id", ii)
+                if authAsymId:
+                    cObj.setValue(authAsymId, "auth_asym_id", ii)
+                else:
+                    logger.error("Missing mapping for %s asymId %s to authAsymId", entryId, asymId)
+                for ky, val in cD.items():
+                    cObj.setValue(val, ky, ii)
+            #
+            return True
+        except Exception as e:
+            logger.exception("For %s %r failing with %s", dataContainer.getName(), catName, str(e))
+        return False
+
+    def __stripWhiteSpace(self, val):
+        """Remove all white space from the input value."""
+        if val is None:
+            return val
+        return self.__wsPattern.sub("", val)
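+
+    # A quick sketch of the helper above (illustrative): __stripWhiteSpace(" ZN  2 ")
+    # returns "ZN2", and None passes through unchanged.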
+
+    def buildInstanceValidationFeatures(self, dataContainer, catName, **kwargs):
+        """Build category rcsb_entity_instance_validation_feature ...
+
+        Example:
+            loop_
+            _rcsb_entity_instance_validation_feature.ordinal
+            _rcsb_entity_instance_validation_feature.entry_id
+            _rcsb_entity_instance_validation_feature.entity_id
+            _rcsb_entity_instance_validation_feature.asym_id
+            _rcsb_entity_instance_validation_feature.auth_asym_id
+            _rcsb_entity_instance_validation_feature.feature_id
+            _rcsb_entity_instance_validation_feature.type
+            _rcsb_entity_instance_validation_feature.name
+            _rcsb_entity_instance_validation_feature.description
+            _rcsb_entity_instance_validation_feature.annotation_lineage_id
+            _rcsb_entity_instance_validation_feature.annotation_lineage_name
+            _rcsb_entity_instance_validation_feature.annotation_lineage_depth
+            _rcsb_entity_instance_validation_feature.reference_scheme
+            _rcsb_entity_instance_validation_feature.provenance_source
+            _rcsb_entity_instance_validation_feature.assignment_version
+            _rcsb_entity_instance_validation_feature.feature_positions_beg_seq_id
+            _rcsb_entity_instance_validation_feature.feature_positions_end_seq_id
+            _rcsb_entity_instance_validation_feature.feature_positions_beg_comp_id
+            #
+            _rcsb_entity_instance_validation_feature.feature_value_comp_id
+            _rcsb_entity_instance_validation_feature.feature_value_reported
+            _rcsb_entity_instance_validation_feature.feature_value_reference
+            _rcsb_entity_instance_validation_feature.feature_value_uncertainty_estimate
+            _rcsb_entity_instance_validation_feature.feature_value_uncertainty_estimate_type
+            _rcsb_entity_instance_validation_feature.feature_value_details
+
+        """
+        logger.debug("Starting with %r %r %r", dataContainer.getName(), catName, kwargs)
+        typeMapD = {
+            "ROTAMER_OUTLIER": "Molprobity rotamer outlier",
+            "RAMACHANDRAN_OUTLIER": "Molprobity Ramachandran outlier",
+            "RSRZ_OUTLIER": "Real space R-value Z score > 2",
+            "RSCC_OUTLIER": "Real space density correlation value < 0.65",
+            "MOGUL_BOND_OUTLIER": "Mogul bond distance outlier",
+            "MOGUL_ANGLE_OUTLIER": "Mogul bond angle outlier",
+            "BOND_OUTLIER": "Molprobity bond distance outlier",
+            "ANGLE_OUTLIER": "Molprobity bond angle outlier",
+        }
+        try:
+            if catName != "rcsb_entity_instance_validation_feature":
+                return False
+            # Exit if source categories are missing
+            if not dataContainer.exists("entry"):
+                return False
+            #
+            eObj = dataContainer.getObj("entry")
+            entryId = eObj.getValue("id", 0)
+            #
+            # Create the new target category
+            if not dataContainer.exists(catName):
+                dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName)))
+            cObj = dataContainer.getObj(catName)
+            ii = cObj.getRowCount()
+            #
+            asymIdD = self.__commonU.getInstanceEntityMap(dataContainer)
+            asymAuthIdD = self.__commonU.getAsymAuthIdMap(dataContainer)
+            #
+            instanceModelOutlierD = self.__commonU.getInstanceModelOutlierInfo(dataContainer)
+            #
+            # ("OutlierValue", "compId, seqId, outlierType, description, reported, reference, uncertaintyValue, uncertaintyType")
+            #
+            logger.debug("Length instanceModelOutlierD %d", len(instanceModelOutlierD))
+            #
+            # (modelId, asymId), []).append((compId, int(seqId), "RSCC_OUTLIER", tS)
+            for (modelId, asymId, hasSeq), pTupL in instanceModelOutlierD.items():
+                fTypeL = sorted(set([pTup.outlierType for pTup in pTupL]))
+                jj = 1
+                for fType in fTypeL:
+                    if (asymId not in asymIdD) or (asymId not in asymAuthIdD):
+                        continue
+                    entityId = asymIdD[asymId]
+                    authAsymId = asymAuthIdD[asymId]
+                    #
+                    cObj.setValue(ii + 1, "ordinal", ii)
+                    cObj.setValue(entryId, "entry_id", ii)
+                    cObj.setValue(entityId, "entity_id", ii)
+                    cObj.setValue(asymId, "asym_id", ii)
+                    cObj.setValue(authAsymId, "auth_asym_id", ii)
+
+                    #
+                    cObj.setValue(fType, "type", ii)
+                    tN = typeMapD[fType] if fType in typeMapD else fType
+                    cObj.setValue(tN, "name", ii)
+                    #
+                    tFn = "%s_%d" % (fType, jj)
+                    cObj.setValue(tFn, "feature_id", ii)
+                    #
+                    if hasSeq:
+                        descriptionS = tN + " in instance %s model %s" % (asymId, modelId)
+                        cObj.setValue(";".join([pTup.compId for pTup in pTupL if pTup.outlierType == fType]), "feature_positions_beg_comp_id", ii)
+                        cObj.setValue(";".join([str(pTup.seqId) for pTup in pTupL if pTup.outlierType == fType]), "feature_positions_beg_seq_id", ii)
+
+                    else:
+                        cObj.setValue(pTupL[0].compId, "comp_id", ii)
+                        descriptionS = tN + " in %s instance %s model %s" % (pTupL[0].compId, asymId, modelId)
+                        cObj.setValue(";".join([pTup.compId if pTup.compId else "?" for pTup in pTupL if pTup.outlierType == fType]), "feature_value_comp_id", ii)
+                        cObj.setValue(";".join([pTup.description if pTup.description else "?" for pTup in pTupL if pTup.outlierType == fType]), "feature_value_details", ii)
+                        cObj.setValue(";".join([pTup.reported if pTup.reported else "?" for pTup in pTupL if pTup.outlierType == fType]), "feature_value_reported", ii)
+                        cObj.setValue(";".join([pTup.reference if pTup.reference else "?" for pTup in pTupL if pTup.outlierType == fType]), "feature_value_reference", ii)
+                        cObj.setValue(
+                            ";".join([pTup.uncertaintyValue if pTup.uncertaintyValue else "?" for pTup in pTupL if pTup.outlierType == fType]),
+                            "feature_value_uncertainty_estimate",
+                            ii,
+                        )
+                        cObj.setValue(
+                            ";".join([pTup.uncertaintyType if pTup.uncertaintyType else "?" for pTup in pTupL if pTup.outlierType == fType]),
+                            "feature_value_uncertainty_estimate_type",
+                            ii,
+                        )
+                    cObj.setValue("PDB entity", "reference_scheme", ii)
+                    cObj.setValue(descriptionS, "description", ii)
+                    cObj.setValue("PDB", "provenance_source", ii)
+                    cObj.setValue("V1.0", "assignment_version", ii)
+                    #
+                    jj += 1
+                    ii += 1
+            #
+            ##
+            return True
+        except Exception as e:
+            logger.exception("For %s %r failing with %s", dataContainer.getName(), catName, str(e))
+        return False
+
+    # --- JDW
+    def buildInstanceValidationFeatureSummaryPrev(self, dataContainer, catName, **kwargs):
+        """Build category rcsb_entity_instance_validation_feature_summary
+
+        Example:
+
+        loop_
+        _rcsb_entity_instance_validation_feature_summary.ordinal
+        _rcsb_entity_instance_validation_feature_summary.entry_id
+        _rcsb_entity_instance_validation_feature_summary.entity_id
+        _rcsb_entity_instance_validation_feature_summary.asym_id
+        _rcsb_entity_instance_validation_feature_summary.auth_asym_id
+        #
+        _rcsb_entity_instance_validation_feature_summary.type
+        _rcsb_entity_instance_validation_feature_summary.count
+        _rcsb_entity_instance_validation_feature_summary.coverage
+        # ...
+        """
+        logger.debug("Starting with %r %r %r", dataContainer.getName(), catName, kwargs)
+        try:
+            if catName != "rcsb_entity_instance_validation_feature_summary":
+                return False
+            if not dataContainer.exists("rcsb_entity_instance_validation_feature") and not dataContainer.exists("entry"):
+                return False
+
+            if not dataContainer.exists(catName):
+                dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName)))
+            #
+            eObj = dataContainer.getObj("entry")
+            entryId = eObj.getValue("id", 0)
+            #
+            sObj = dataContainer.getObj(catName)
+            fObj = dataContainer.getObj("rcsb_entity_instance_validation_feature")
+            #
+            instIdMapD = self.__commonU.getInstanceIdMap(dataContainer)
+            instEntityD = self.__commonU.getInstanceEntityMap(dataContainer)
+            entityPolymerLengthD = self.__commonU.getPolymerEntityLengthsEnumerated(dataContainer)
+            asymAuthD = self.__commonU.getAsymAuthIdMap(dataContainer)
+
+            fCountD = OrderedDict()
+            fMonomerCountD = OrderedDict()
+            fInstanceCountD = OrderedDict()
+            for ii in range(fObj.getRowCount()):
+                asymId = fObj.getValue("asym_id", ii)
+                # ---- initialize counts
+                # fCountD = self.__initializeInstanceValidationFeatureType(dataContainer, asymId, fCountD, countType="set")
+                # fMonomerCountD = self.__initializeInstanceValidationFeatureType(dataContainer, asymId, fMonomerCountD, countType="list")
+                # fInstanceCountD = self.__initializeInstanceValidationFeatureType(dataContainer, asymId, fInstanceCountD, countType="list")
+                # ----
+                fType = fObj.getValue("type", ii)
+                fId = fObj.getValue("feature_id", ii)
+                fCountD.setdefault(asymId, {}).setdefault(fType, set()).add(fId)
+                #
+                tbegS = fObj.getValueOrDefault("feature_positions_beg_seq_id", ii, defaultValue=None)
+                tendS = fObj.getValueOrDefault("feature_positions_end_seq_id", ii, defaultValue=None)
+                if fObj.hasAttribute("feature_positions_beg_seq_id") and tbegS is not None and fObj.hasAttribute("feature_positions_end_seq_id") and tendS is not None:
+                    begSeqIdL = str(fObj.getValue("feature_positions_beg_seq_id", ii)).split(";")
+                    endSeqIdL = str(fObj.getValue("feature_positions_end_seq_id", ii)).split(";")
+                    monCount = 0
+                    for begSeqId, endSeqId in zip(begSeqIdL, endSeqIdL):
+                        try:
+                            monCount += abs(int(endSeqId) - int(begSeqId) + 1)
+                        except Exception:
+                            logger.warning(
+                                "In %s fType %r fId %r bad sequence range begSeqIdL %r endSeqIdL %r tbegS %r tendS %r",
+                                dataContainer.getName(),
+                                fType,
+                                fId,
+                                begSeqIdL,
+                                endSeqIdL,
+                                tbegS,
+                                tendS,
+                            )
+                    fMonomerCountD.setdefault(asymId, {}).setdefault(fType, []).append(monCount)
+                elif fObj.hasAttribute("feature_positions_beg_seq_id") and tbegS:
+                    seqIdL = str(fObj.getValue("feature_positions_beg_seq_id", ii)).split(";")
+                    fMonomerCountD.setdefault(asymId, {}).setdefault(fType, []).append(len(seqIdL))
+
+                tS = fObj.getValueOrDefault("feature_value_details", ii, defaultValue=None)
+                if fObj.hasAttribute("feature_value_details") and tS is not None:
+                    dL = str(fObj.getValue("feature_value_details", ii)).split(";")
+                    fInstanceCountD.setdefault(asymId, {}).setdefault(fType, []).append(len(dL))
+            #
+            # logger.debug("%s fCountD %r", entryId, fCountD)
+            #
+            ii = 0
+            for asymId, fTypeD in fCountD.items():
+                entityId = instEntityD[asymId]
+                authAsymId = asymAuthD[asymId]
+                for fType, fS in fTypeD.items():
+                    #
+                    sObj.setValue(ii + 1, "ordinal", ii)
+                    sObj.setValue(entryId, "entry_id", ii)
+                    sObj.setValue(entityId, "entity_id", ii)
+                    sObj.setValue(asymId, "asym_id", ii)
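+                    # comp_id below is populated only when the instance maps to a
+                    # single chemical component (e.g. a nonpolymer ligand); polymer
+                    # instances leave it unset.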
instIdMapD[asymId]["comp_id"]: + sObj.setValue(instIdMapD[asymId]["comp_id"], "comp_id", ii) + sObj.setValue(authAsymId, "auth_asym_id", ii) + sObj.setValue(fType, "type", ii) + fracC = 0.0 + # + if asymId in fMonomerCountD and fType in fMonomerCountD[asymId] and fMonomerCountD[asymId][fType]: + fCount = sum(fMonomerCountD[asymId][fType]) + if asymId in fMonomerCountD and fType in fMonomerCountD[asymId] and entityId in entityPolymerLengthD: + fracC = float(sum(fMonomerCountD[asymId][fType])) / float(entityPolymerLengthD[entityId]) + elif asymId in fInstanceCountD and fType in fInstanceCountD[asymId] and fInstanceCountD[asymId][fType]: + fCount = sum(fInstanceCountD[asymId][fType]) + else: + fCount = len(fS) + # + sObj.setValue(fCount, "count", ii) + sObj.setValue(round(fracC, 5), "coverage", ii) + # + ii += 1 + + except Exception as e: + logger.exception("Failing with %s", str(e)) + return True + + def __initializeInstanceValidationFeatureType(self, dataContainer, asymId, fCountD, countType="set"): + instTypeD = self.__commonU.getInstanceTypes(dataContainer) + eType = instTypeD[asymId] + eTupL = [] + # rcsb_entity_instance_validation_feature_summary.type + if eType == "polymer": + eTupL = self.__dApi.getEnumListWithDetail("rcsb_entity_instance_validation_feature_summary", "type") + elif eType in ["non-polymer", "water"]: + eTupL = self.__dApi.getEnumListWithDetail("rcsb_entity_instance_validation_feature_summary", "type") + elif eType == "branched": + eTupL = self.__dApi.getEnumListWithDetail("rcsb_entity_instance_validation_feature_summary", "type") + else: + logger.error("%r asymId %r eType %r", dataContainer.getName(), asymId, eType) + # + fTypeL = sorted([tup[0] for tup in eTupL]) + # + for fType in fTypeL: + if countType == "set": + fCountD.setdefault(asymId, {}).setdefault(fType, set()) + else: + fCountD.setdefault(asymId, {}).setdefault(fType, []) + # + return fCountD + + # --- JDW + def __getInstanceFeatureTypes(self, eType): + # + vTupL = self.__dApi.getEnumListWithDetail("rcsb_entity_instance_validation_feature_summary", "type") + if eType == "polymer": + eTupL = self.__dApi.getEnumListWithDetail("rcsb_polymer_instance_feature_summary", "type") + elif eType in ["non-polymer", "water"]: + eTupL = self.__dApi.getEnumListWithDetail("rcsb_nonpolymer_instance_feature_summary", "type") + elif eType == "branched": + eTupL = self.__dApi.getEnumListWithDetail("rcsb_branched_instance_feature_summary", "type") + else: + logger.error("Unexpected eType %r -- no feature types provided", eType) + eTupL = [] + # Distinct elements in the instance specific categories. (remove validation types) + vTypeL = sorted([tup[0] for tup in vTupL]) + iTypeL = sorted([tup[0] for tup in eTupL]) + fTypeL = sorted(set(iTypeL) - set(vTypeL)) + return fTypeL + + def __getInstanceValidationFeatureTypes(self, eType): + # + vTupL = self.__dApi.getEnumListWithDetail("rcsb_entity_instance_validation_feature_summary", "type") + if eType == "polymer": + eTupL = self.__dApi.getEnumListWithDetail("rcsb_polymer_instance_feature_summary", "type") + elif eType in ["non-polymer", "water"]: + eTupL = self.__dApi.getEnumListWithDetail("rcsb_nonpolymer_instance_feature_summary", "type") + elif eType == "branched": + eTupL = self.__dApi.getEnumListWithDetail("rcsb_branched_instance_feature_summary", "type") + else: + logger.error("Unexpected eType %r -- no feature types provided", eType) + eTupL = [] + # Common elements in the instance specific categories. 
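+ # Illustrative (hypothetical) values: vTypeL = ["BOND_OUTLIER", "RSRZ_OUTLIER"] with iTypeL = ["CATH", "RSRZ_OUTLIER"] would yield fTypeL = ["RSRZ_OUTLIER"] below.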
+ vTypeL = sorted([tup[0] for tup in vTupL]) + iTypeL = sorted([tup[0] for tup in eTupL]) + fTypeL = sorted(set(vTypeL).intersection(iTypeL)) + return fTypeL + + # --- JDW + def buildEntityInstanceFeatureSummary(self, dataContainer, catName, **kwargs): + """Build category rcsb_entity_instance_feature_summary (UPDATED) + + Example: + + loop_ + _rcsb_entity_instance_feature_summary.ordinal + _rcsb_entity_instance_feature_summary.entry_id + _rcsb_entity_instance_feature_summary.entity_id + _rcsb_entity_instance_feature_summary.asym_id + _rcsb_entity_instance_feature_summary.auth_asym_id + # + _rcsb_entity_instance_feature_summary.type + _rcsb_entity_instance_feature_summary.count + _rcsb_entity_instance_feature_summary.coverage + # ... + """ + logger.debug("Starting with %r %r %r", dataContainer.getName(), catName, kwargs) + try: + if catName != "rcsb_entity_instance_feature_summary": + return False + if not dataContainer.exists("rcsb_entity_instance_feature") and not dataContainer.exists("entry"): + return False + + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + # + eObj = dataContainer.getObj("entry") + entryId = eObj.getValue("id", 0) + # + sObj = dataContainer.getObj(catName) + fObj = dataContainer.getObj("rcsb_entity_instance_feature") + # + instEntityD = self.__commonU.getInstanceEntityMap(dataContainer) + entityPolymerLengthD = self.__commonU.getPolymerEntityLengthsEnumerated(dataContainer) + # typeList = self.__dApi.getEnumList("rcsb_entity_instance_feature_summary", "type", sortFlag=True) + asymAuthD = self.__commonU.getAsymAuthIdMap(dataContainer) + instIdMapD = self.__commonU.getInstanceIdMap(dataContainer) + instTypeD = self.__commonU.getInstanceTypes(dataContainer) + # + fCountD = OrderedDict() + fValuesD = OrderedDict() + fMonomerCountD = OrderedDict() + for ii in range(fObj.getRowCount()): + asymId = fObj.getValue("asym_id", ii) + # ---- initialize counts + # fCountD = self.__initializeInstanceFeatureType(dataContainer, asymId, fCountD, countType="set") + # fMonomerCountD = self.__initializeInstanceFeatureType(dataContainer, asymId, fMonomerCountD, countType="list") + # ---- + fType = fObj.getValue("type", ii) + fId = fObj.getValue("feature_id", ii) + fCountD.setdefault(asymId, {}).setdefault(fType, set()).add(fId) + # + tbegS = fObj.getValueOrDefault("feature_positions_beg_seq_id", ii, defaultValue=None) + tendS = fObj.getValueOrDefault("feature_positions_end_seq_id", ii, defaultValue=None) + if fObj.hasAttribute("feature_positions_beg_seq_id") and tbegS is not None and fObj.hasAttribute("feature_positions_end_seq_id") and tendS is not None: + begSeqIdL = str(fObj.getValue("feature_positions_beg_seq_id", ii)).split(";") + endSeqIdL = str(fObj.getValue("feature_positions_end_seq_id", ii)).split(";") + monCount = 0 + for begSeqId, endSeqId in zip(begSeqIdL, endSeqIdL): + try: + monCount += abs(int(endSeqId) - int(begSeqId) + 1) + except Exception: + logger.warning( + "%s fType %r fId %r bad sequence begSeqIdL %r endSeqIdL %r tbegS %r tendS %r", + dataContainer.getName(), + fType, + fId, + begSeqIdL, + endSeqIdL, + tbegS, + tendS, + ) + + fMonomerCountD.setdefault(asymId, {}).setdefault(fType, []).append(monCount) + elif fObj.hasAttribute("feature_positions_beg_seq_id") and tbegS: + seqIdL = str(fObj.getValue("feature_positions_beg_seq_id", ii)).split(";") + fMonomerCountD.setdefault(asymId, {}).setdefault(fType, []).append(len(seqIdL)) + # JDW + elif 
fObj.hasAttribute("feature_value_reported"): + tValue = fObj.getValueOrDefault("feature_value_reported", ii, defaultValue=None) + if tValue: + try: + tvL = [float(t) for t in tValue.split(";")] + fValuesD.setdefault(asymId, {}).setdefault(fType, []).extend(tvL) + except Exception: + pass + + # + logger.debug("%s fCountD %r", entryId, fCountD) + # + + ii = 0 + for asymId, entityId in instEntityD.items(): + eType = instTypeD[asymId] + authAsymId = asymAuthD[asymId] + fTypeL = self.__getInstanceFeatureTypes(eType) + # All entity type specific features + for fType in fTypeL: + sObj.setValue(ii + 1, "ordinal", ii) + sObj.setValue(entryId, "entry_id", ii) + sObj.setValue(entityId, "entity_id", ii) + sObj.setValue(asymId, "asym_id", ii) + sObj.setValue(authAsymId, "auth_asym_id", ii) + # add comp + if asymId in instIdMapD and "comp_id" in instIdMapD[asymId] and instIdMapD[asymId]["comp_id"]: + sObj.setValue(instIdMapD[asymId]["comp_id"], "comp_id", ii) + sObj.setValue(fType, "type", ii) + fracC = 0.0 + minL = maxL = 0 + if asymId in fMonomerCountD and fType in fMonomerCountD[asymId]: + if fType.startswith("UNOBSERVED"): + fCount = sum(fMonomerCountD[asymId][fType]) + else: + fCount = len(fCountD[asymId][fType]) + + if entityId in entityPolymerLengthD: + fracC = float(sum(fMonomerCountD[asymId][fType])) / float(entityPolymerLengthD[entityId]) + + if fType in ["CATH", "SCOP", "HELIX_P", "SHEET", "UNASSIGNED_SEC_STRUCT", "UNOBSERVED_RESIDUE_XYZ", "ZERO_OCCUPANCY_RESIDUE_XYZ"]: + minL = min(fMonomerCountD[asymId][fType]) + maxL = max(fMonomerCountD[asymId][fType]) + + elif asymId in fCountD and fType in fCountD[asymId] and fCountD[asymId][fType]: + fCount = len(fCountD[asymId][fType]) + else: + fCount = 0 + # + minV = maxV = 0 + if asymId in fValuesD and fType in fValuesD[asymId]: + if fType in ["HAS_COVALENT_LINKAGE", "HAS_METAL_COORDINATION_LINKAGE"]: + try: + minV = min(fValuesD[asymId][fType]) + maxV = max(fValuesD[asymId][fType]) + except Exception: + pass + + sObj.setValue(fCount, "count", ii) + sObj.setValue(round(fracC, 5), "coverage", ii) + if minL is not None: + sObj.setValue(minL, "minimum_length", ii) + sObj.setValue(maxL, "maximum_length", ii) + if minV is not None: + sObj.setValue(minV, "minimum_value", ii) + sObj.setValue(maxV, "maximum_value", ii) + # + ii += 1 + except Exception as e: + logger.exception("Failing for %s with %s", dataContainer.getName(), str(e)) + return True + + def buildInstanceValidationFeatureSummary(self, dataContainer, catName, **kwargs): + """Build category rcsb_entity_instance_validation_feature_summary + + Example: + + loop_ + _rcsb_entity_instance_validation_feature_summary.ordinal + _rcsb_entity_instance_validation_feature_summary.entry_id + _rcsb_entity_instance_validation_feature_summary.entity_id + _rcsb_entity_instance_validation_feature_summary.asym_id + _rcsb_entity_instance_validation_feature_summary.auth_asym_id + _rcsb_entity_instance_validation_feature_summary.type + _rcsb_entity_instance_validation_feature_summary.count + _rcsb_entity_instance_validation_feature_summary.coverage + # ... 
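+ + Counts are accumulated at several granularities (monomer ranges, value details, and distinct feature ids) as described in the implementation below.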
+ """ + logger.debug("Starting with %r %r %r", dataContainer.getName(), catName, kwargs) + try: + if catName != "rcsb_entity_instance_validation_feature_summary": + return False + if not dataContainer.exists("rcsb_entity_instance_validation_feature") and not dataContainer.exists("entry"): + return False + + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + # + eObj = dataContainer.getObj("entry") + entryId = eObj.getValue("id", 0) + # + sObj = dataContainer.getObj(catName) + fObj = dataContainer.getObj("rcsb_entity_instance_validation_feature") + # + instIdMapD = self.__commonU.getInstanceIdMap(dataContainer) + instEntityD = self.__commonU.getInstanceEntityMap(dataContainer) + entityPolymerLengthD = self.__commonU.getPolymerEntityLengthsEnumerated(dataContainer) + asymAuthD = self.__commonU.getAsymAuthIdMap(dataContainer) + instTypeD = self.__commonU.getInstanceTypes(dataContainer) + + fCountD = OrderedDict() + fMonomerCountD = OrderedDict() + fInstanceCountD = OrderedDict() + for ii in range(fObj.getRowCount()): + asymId = fObj.getValue("asym_id", ii) + fType = fObj.getValue("type", ii) + fId = fObj.getValue("feature_id", ii) + fCountD.setdefault(asymId, {}).setdefault(fType, set()).add(fId) + # + tbegS = fObj.getValueOrDefault("feature_positions_beg_seq_id", ii, defaultValue=None) + tendS = fObj.getValueOrDefault("feature_positions_end_seq_id", ii, defaultValue=None) + if fObj.hasAttribute("feature_positions_beg_seq_id") and tbegS is not None and fObj.hasAttribute("feature_positions_end_seq_id") and tendS is not None: + begSeqIdL = str(fObj.getValue("feature_positions_beg_seq_id", ii)).split(";") + endSeqIdL = str(fObj.getValue("feature_positions_end_seq_id", ii)).split(";") + monCount = 0 + for begSeqId, endSeqId in zip(begSeqIdL, endSeqIdL): + try: + monCount += abs(int(endSeqId) - int(begSeqId) + 1) + except Exception: + logger.warning( + "In %s fType %r fId %r bad sequence range begSeqIdL %r endSeqIdL %r tbegS %r tendS %r", + dataContainer.getName(), + fType, + fId, + begSeqIdL, + endSeqIdL, + tbegS, + tendS, + ) + fMonomerCountD.setdefault(asymId, {}).setdefault(fType, []).append(monCount) + elif fObj.hasAttribute("feature_positions_beg_seq_id") and tbegS: + seqIdL = str(fObj.getValue("feature_positions_beg_seq_id", ii)).split(";") + fMonomerCountD.setdefault(asymId, {}).setdefault(fType, []).append(len(seqIdL)) + + tS = fObj.getValueOrDefault("feature_value_details", ii, defaultValue=None) + if fObj.hasAttribute("feature_value_details") and tS is not None: + dL = str(fObj.getValue("feature_value_details", ii)).split(";") + fInstanceCountD.setdefault(asymId, {}).setdefault(fType, []).append(len(dL)) + # + ii = 0 + # Summarize all instances - + for asymId, entityId in instEntityD.items(): + eType = instTypeD[asymId] + authAsymId = asymAuthD[asymId] + fTypeL = self.__getInstanceValidationFeatureTypes(eType) + # All entity type specific features + for fType in fTypeL: + # + sObj.setValue(ii + 1, "ordinal", ii) + sObj.setValue(entryId, "entry_id", ii) + sObj.setValue(entityId, "entity_id", ii) + sObj.setValue(asymId, "asym_id", ii) + if asymId in instIdMapD and "comp_id" in instIdMapD[asymId] and instIdMapD[asymId]["comp_id"]: + sObj.setValue(instIdMapD[asymId]["comp_id"], "comp_id", ii) + sObj.setValue(authAsymId, "auth_asym_id", ii) + sObj.setValue(fType, "type", ii) + # + # Sum features of different granularities + # + fracC = 0.0 + if asymId in fMonomerCountD and fType in 
fMonomerCountD[asymId] and fMonomerCountD[asymId][fType]: + fCount = sum(fMonomerCountD[asymId][fType]) + if asymId in fMonomerCountD and fType in fMonomerCountD[asymId] and entityId in entityPolymerLengthD: + fracC = float(sum(fMonomerCountD[asymId][fType])) / float(entityPolymerLengthD[entityId]) + elif asymId in fInstanceCountD and fType in fInstanceCountD[asymId] and fInstanceCountD[asymId][fType]: + fCount = sum(fInstanceCountD[asymId][fType]) + elif asymId in fCountD and fType in fCountD[asymId] and fCountD[asymId][fType]: + fCount = len(fCountD[asymId][fType]) + else: + # default zero value + fCount = 0 + # + sObj.setValue(fCount, "count", ii) + sObj.setValue(round(fracC, 5), "coverage", ii) + # + ii += 1 + + except Exception as e: + logger.exception("Failing with %s", str(e)) + return True + + # + def buildEntityInstanceAnnotations(self, dataContainer, catName, **kwargs): + """Build category rcsb_entity_instance_annotation ... + + Example: + loop_ + _rcsb_entity_instance_annotation.ordinal + _rcsb_entity_instance_annotation.entry_id + _rcsb_entity_instance_annotation.entity_id + _rcsb_entity_instance_annotation.asym_id + _rcsb_entity_instance_annotation.auth_asym_id + _rcsb_entity_instance_annotation.annotation_id + _rcsb_entity_instance_annotation.type + _rcsb_entity_instance_annotation.name + _rcsb_entity_instance_annotation.description + _rcsb_entity_instance_annotation.annotation_lineage_id + _rcsb_entity_instance_annotation.annotation_lineage_name + _rcsb_entity_instance_annotation.annotation_lineage_depth + _rcsb_entity_instance_annotation.reference_scheme + _rcsb_entity_instance_annotation.provenance_source + _rcsb_entity_instance_annotation.assignment_version + + """ + logger.debug("Starting with %r %r %r", dataContainer.getName(), catName, kwargs) + try: + if catName != "rcsb_entity_instance_annotation": + return False + # Exit if source categories are missing + if not dataContainer.exists("entry"): + return False + # + # Create the new target category + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + cObj = dataContainer.getObj(catName) + # + rP = kwargs.get("resourceProvider") + + eObj = dataContainer.getObj("entry") + entryId = eObj.getValue("id", 0) + # + asymIdD = self.__commonU.getInstanceEntityMap(dataContainer) + asymAuthIdD = self.__commonU.getAsymAuthIdMap(dataContainer) + # asymIdRangesD = self.__commonU.getInstancePolymerRanges(dataContainer) + # pAuthAsymD = self.__commonU.getPolymerIdMap(dataContainer) + instTypeD = self.__commonU.getInstanceTypes(dataContainer) + # --------------- + # Add CATH assignments + cathU = rP.getResource("CathProvider instance") if rP else None + ii = cObj.getRowCount() + # + for asymId, authAsymId in asymAuthIdD.items(): + if instTypeD[asymId] not in ["polymer", "branched"]: + continue + entityId = asymIdD[asymId] + dL = cathU.getCathResidueRanges(entryId.lower(), authAsymId) + logger.debug("%s asymId %s authAsymId %s dL %r", entryId, asymId, authAsymId, dL) + vL = cathU.getCathVersions(entryId.lower(), authAsymId) + for (cathId, domId, _, _, _) in dL: + cObj.setValue(ii + 1, "ordinal", ii) + cObj.setValue(entryId, "entry_id", ii) + cObj.setValue(entityId, "entity_id", ii) + cObj.setValue(asymId, "asym_id", ii) + cObj.setValue(authAsymId, "auth_asym_id", ii) + cObj.setValue("CATH", "type", ii) + # + cObj.setValue(str(cathId), "annotation_id", ii) + # cObj.setValue(str(domId), "annotation_id", ii) + # cObj.setValue(cathId, "name", ii) 
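+ # Store the human-readable CATH domain name; lineage ids and names are serialized below as semicolon-delimited lists with matching 1-based depth values.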
+ cObj.setValue(cathU.getCathName(cathId), "name", ii) + # + cObj.setValue(";".join(cathU.getNameLineage(cathId)), "annotation_lineage_name", ii) + idLinL = cathU.getIdLineage(cathId) + cObj.setValue(";".join(idLinL), "annotation_lineage_id", ii) + cObj.setValue(";".join([str(jj) for jj in range(1, len(idLinL) + 1)]), "annotation_lineage_depth", ii) + # + cObj.setValue("CATH", "provenance_source", ii) + cObj.setValue(vL[0], "assignment_version", ii) + # + ii += 1 + # ------------ + # Add SCOP assignments + scopU = rP.getResource("ScopProvider instance") if rP else None + for asymId, authAsymId in asymAuthIdD.items(): + if instTypeD[asymId] not in ["polymer", "branched"]: + continue + entityId = asymIdD[asymId] + dL = scopU.getScopResidueRanges(entryId.lower(), authAsymId) + version = scopU.getScopVersion() + for (sunId, domId, _, _, _, _) in dL: + cObj.setValue(ii + 1, "ordinal", ii) + cObj.setValue(entryId, "entry_id", ii) + cObj.setValue(entityId, "entity_id", ii) + cObj.setValue(asymId, "asym_id", ii) + cObj.setValue(authAsymId, "auth_asym_id", ii) + cObj.setValue("SCOP", "type", ii) + # + # cObj.setValue(str(sunId), "domain_id", ii) + cObj.setValue(domId, "annotation_id", ii) + cObj.setValue(scopU.getScopName(sunId), "name", ii) + # + tL = [t if t is not None else "" for t in scopU.getNameLineage(sunId)] + cObj.setValue(";".join(tL), "annotation_lineage_name", ii) + idLinL = scopU.getIdLineage(sunId) + cObj.setValue(";".join([str(t) for t in idLinL]), "annotation_lineage_id", ii) + cObj.setValue(";".join([str(jj) for jj in range(1, len(idLinL) + 1)]), "annotation_lineage_depth", ii) + # + cObj.setValue("SCOPe", "provenance_source", ii) + cObj.setValue(version, "assignment_version", ii) + # + ii += 1 + # ------------ + # Add covalent attachment property + npbD = self.__commonU.getBoundNonpolymersByInstance(dataContainer) + jj = 1 + for asymId, rTupL in npbD.items(): + for rTup in rTupL: + if rTup.connectType in ["covalent bond"]: + fType = "HAS_COVALENT_LINKAGE" + fId = "COVALENT_LINKAGE_%d" % jj + + elif rTup.connectType in ["metal coordination"]: + fType = "HAS_METAL_COORDINATION_LINKAGE" + fId = "METAL_COORDINATION_LINKAGE_%d" % jj + else: + continue + + entityId = asymIdD[asymId] + authAsymId = asymAuthIdD[asymId] + cObj.setValue(ii + 1, "ordinal", ii) + cObj.setValue(entryId, "entry_id", ii) + cObj.setValue(entityId, "entity_id", ii) + cObj.setValue(asymId, "asym_id", ii) + cObj.setValue(authAsymId, "auth_asym_id", ii) + cObj.setValue(rTup.targetCompId, "comp_id", ii) + cObj.setValue(fId, "annotation_id", ii) + cObj.setValue(fType, "type", ii) + # + # ("targetCompId", "connectType", "partnerCompId", "partnerAsymId", "partnerEntityType", "bondDistance", "bondOrder") + cObj.setValue( + "%s has %s with %s instance %s in model 1" % (rTup.targetCompId, rTup.connectType, rTup.partnerEntityType, rTup.partnerAsymId), + "description", + ii, + ) + + cObj.setValue("PDB", "provenance_source", ii) + cObj.setValue("V1.0", "assignment_version", ii) + # + ii += 1 + jj += 1 + + return True + except Exception as e: + logger.exception("%s %s failing with %s", dataContainer.getName(), catName, str(e)) + return False + + def buildInstanceValidationScores(self, dataContainer, catName, **kwargs): + """Build category rcsb_nonpolymer_instance_validation_score ... 
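+ + Scores standardize the density-fit (RSR, RSCC) and geometry (Mogul bond/angle RMSZ) measures against reference statistics; see __calculateFitScore() and __calculateGeometryScore() below.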
+ + Example: + loop_ + _rcsb_nonpolymer_instance_validation_score.ordinal + _rcsb_nonpolymer_instance_validation_score.entry_id + _rcsb_nonpolymer_instance_validation_score.entity_id + _rcsb_nonpolymer_instance_validation_score.asym_id + _rcsb_nonpolymer_instance_validation_score.auth_asym_id + _rcsb_nonpolymer_instance_validation_score.comp_id + _rcsb_nonpolymer_instance_validation_score.model_id + _rcsb_nonpolymer_instance_validation_score.type + _rcsb_nonpolymer_instance_validation_score.mogul_angles_RMSZ + _rcsb_nonpolymer_instance_validation_score.mogul_bonds_RMSZ + _rcsb_nonpolymer_instance_validation_score.RSR + _rcsb_nonpolymer_instance_validation_score.RSCC + _rcsb_nonpolymer_instance_validation_score.score_model_fit + _rcsb_nonpolymer_instance_validation_score.score_model_geometry + _rcsb_nonpolymer_instance_validation_score.ranking_model_fit + _rcsb_nonpolymer_instance_validation_score.ranking_model_geometry + _rcsb_nonpolymer_instance_validation_score.is_best_instance + _rcsb_nonpolymer_instance_validation_score.is_subject_of_investigation + # + """ + logger.debug("Starting with %s %r %r", dataContainer.getName(), catName, kwargs) + startTime = time.time() + try: + if catName != "rcsb_nonpolymer_instance_validation_score": + return False + if not dataContainer.exists("entry"): + return False + # + eObj = dataContainer.getObj("entry") + entryId = eObj.getValue("id", 0) + # + # Create the new target category + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + cObj = dataContainer.getObj(catName) + ii = cObj.getRowCount() + # + asymIdD = self.__commonU.getInstanceEntityMap(dataContainer) + asymAuthIdD = self.__commonU.getAsymAuthIdMap(dataContainer) + # + instanceModelValidationD = self.__commonU.getInstanceNonpolymerValidationInfo(dataContainer) + # + # NonpolymerValidationFields = ("rsr", "rscc", "mogul_bonds_rmsz", "mogul_angles_rmsz", "missing_heavy_atom_count") + # + logger.debug("Length instanceModelValidationD %d", len(instanceModelValidationD)) + # + ccTargets = self.__commonU.getTargetComponents(dataContainer) + # + meanD, stdD, loadingD = self.__rlsP.getParameterStatistics() + excludeList = self.__rlsP.getLigandExcludeList() + rankD = {} + scoreD = {} + # calculate scores and ranks and track the best fit-ranked instance for each component + for (modelId, asymId, compId), vTup in instanceModelValidationD.items(): + if (asymId not in asymIdD) or (asymId not in asymAuthIdD): + continue + numHeavyAtoms = self.__ccP.getAtomCountHeavy(compId) + if not numHeavyAtoms: + continue + completeness = float(numHeavyAtoms - vTup.missing_heavy_atom_count) / float(numHeavyAtoms) + logger.debug("compId %s numHeavyAtoms %d completeness %0.2f", compId, numHeavyAtoms, completeness) + # + fitScore, fitRanking = self.__calculateFitScore(vTup.rsr, vTup.rscc, completeness, meanD, stdD, loadingD) + geoScore, geoRanking = self.__calculateGeometryScore(vTup.mogul_bonds_rmsz, vTup.mogul_angles_rmsz, meanD, stdD, loadingD) + # + rankD[compId] = (fitRanking, asymId) if (compId not in rankD or fitRanking > rankD[compId][0]) else rankD[compId] + + scoreD[(modelId, asymId, compId)] = (fitScore, fitRanking, geoScore, geoRanking) + # + for (modelId, asymId, compId), vTup in instanceModelValidationD.items(): + if (modelId, asymId, compId) not in scoreD: + continue + # + entityId = asymIdD[asymId] + authAsymId = asymAuthIdD[asymId] + # + cObj.setValue(ii + 1, "ordinal", ii) + cObj.setValue(modelId, "model_id", ii) + cObj.setValue(entryId, "entry_id", ii) + 
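# Populate the identifier items first, then the raw validation measures, then the derived scores and rankings from scoreD (fitScore, fitRanking, geoScore, geoRanking). +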
cObj.setValue(entityId, "entity_id", ii) + cObj.setValue(asymId, "asym_id", ii) + cObj.setValue(authAsymId, "auth_asym_id", ii) + cObj.setValue(compId, "comp_id", ii) + cObj.setValue("RCSB_LIGAND_QUALITY_2021", "type", ii) + # + cObj.setValue(vTup.rsr, "RSR", ii) + cObj.setValue(vTup.rscc, "RSCC", ii) + cObj.setValue(vTup.mogul_angles_rmsz, "mogul_angles_RMSZ", ii) + cObj.setValue(vTup.mogul_bonds_rmsz, "mogul_bonds_RMSZ", ii) + # + sTup = scoreD[(modelId, asymId, compId)] + cObj.setValue(sTup[0], "score_model_fit", ii) + cObj.setValue(sTup[1], "ranking_model_fit", ii) + cObj.setValue(sTup[2], "score_model_geometry", ii) + cObj.setValue(sTup[3], "ranking_model_geometry", ii) + isBest = "Y" if rankD[compId][1] == asymId else "N" + cObj.setValue(isBest, "is_best_instance", ii) + # + isTarget = "N" + if compId in ccTargets: + isTarget = "Y" + elif compId in excludeList: + isTarget = "N" + elif self.__ccP.getFormulaWeight(compId) and self.__ccP.getFormulaWeight(compId) > 150.0: + isTarget = "Y" + cObj.setValue(isTarget, "is_subject_of_investigation", ii) + # + ii += 1 + # + ## + endTime = time.time() + logger.debug("Completed at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime) + return True + except Exception as e: + logger.exception("For %s %r failing with %s", dataContainer.getName(), catName, str(e)) + return False + + def __calculateFitScore(self, rsr, rscc, completeness, meanD, stdD, loadingD): + fitScore = None + fitRanking = 0.0 + try: + if rsr and rscc: + if completeness < 1.0: + rsr = rsr + 0.08235 * (1.0 - completeness) + rscc = rscc - 0.09652 * (1.0 - completeness) + fitScore = ((rsr - meanD["rsr"]) / stdD["rsr"]) * loadingD["rsr"] + ((rscc - meanD["rscc"]) / stdD["rscc"]) * loadingD["rscc"] + fitRanking = self.__rlsP.getFitScoreRanking(fitScore) + except Exception as e: + logger.exception("Failing for rsr %r rscc %r with %s", rsr, rscc, str(e)) + return fitScore, fitRanking + + def __calculateGeometryScore(self, bondsRmsZ, anglesRmsZ, meanD, stdD, loadingD): + geoScore = None + geoRanking = 0.0 + try: + if bondsRmsZ and anglesRmsZ: + geoScore = ((bondsRmsZ - meanD["mogul_bonds_rmsz"]) / stdD["mogul_bonds_rmsz"]) * loadingD["mogul_bonds_rmsz"] + ( + (anglesRmsZ - meanD["mogul_angles_rmsz"]) / stdD["mogul_angles_rmsz"] + ) * loadingD["mogul_angles_rmsz"] + geoRanking = self.__rlsP.getGeometryScoreRanking(geoScore) + except Exception as e: + logger.exception("Failing for bondsRmsZ %r anglesRmsZ %r with %r", bondsRmsZ, anglesRmsZ, str(e)) + + return geoScore, geoRanking diff --git a/rcsb/utils/dictionary/DictMethodEntryHelper.py b/rcsb/utils/dictionary/DictMethodEntryHelper.py new file mode 100644 index 0000000..51c362b --- /dev/null +++ b/rcsb/utils/dictionary/DictMethodEntryHelper.py @@ -0,0 +1,1214 @@ +## +# File: DictMethodEntryHelper.py (DictMethodRunnerHelper.py) +# Author: J. Westbrook +# Date: 18-Aug-2018 +# Version: 0.001 Initial version +# +# +# Updates: +# 4-Sep-2018 jdw add methods to construct entry and entity identifier categories. 
+# 10-Sep-2018 jdw add method for citation author aggregation +# 22-Sep-2018 jdw add method assignAssemblyCandidates() +# 27-Oct-2018 jdw add method consolidateAccessionDetails() +# 30-Oct-2018 jdw add category methods addChemCompRelated(), addChemCompInfo(), +# addChemCompDescriptor() +# 10-Nov-2018 jdw add addChemCompSynonyms(), addChemCompTargets(), filterBlockByMethod() +# 12-Nov-2018 jdw add InChIKey matching in addChemCompRelated() +# 15-Nov-2018 jdw add handling for antibody misrepresentation of multisource organisms +# 28-Nov-2018 jdw relax constraints on the production of rcsb_entry_info +# 1-Dec-2018 jdw add ncbi source and host organism info +# 11-Dec-2018 jdw add addStructRefSeqEntityIds and buildEntityPolySeq +# 10-Jan-2019 jdw better handle initialization in filterBlockByMethod() +# 11-Jan-2019 jdw revise classification in assignAssemblyCandidates() +# 16-Feb-2019 jdw add buildContainerEntityInstanceIds() +# 19-Feb-2019 jdw add internal method __addPdbxValidateAsymIds() to add cardinal identifiers to +# pdbx_validate_* categories +# 28-Feb-2019 jdw change criteria for adding rcsb_chem_comp_container_identifiers to work with ion definitions +# 11-Mar-2019 jdw replace taxonomy file handling with calls to TaxonomyUtils() +# 11-Mar-2019 jdw add EC lineage using EnzymeDatabaseUtils() +# 17-Mar-2019 jdw add support for entity subcategory rcsb_macromolecular_names_combined +# 23-Mar-2019 jdw change criteria chem_comp collection criteria to _chem_comp.pdbx_release_status +# 25-Mar-2019 jdw remap merged taxons and adjust exception handling for taxonomy lineage generation +# 7-Apr-2019 jdw add CathClassificationUtils and sequence difference type counts +# 25-Apr-2019 jdw For source and host organism add ncbi_parent_scientific_name +# add rcsb_entry_info.deposited_modeled_polymer_monomer_count and +# rcsb_entry_info.deposited_unmodeled_polymer_monomer_count, +# 1-May-2019 jdw add support for _rcsb_entry_info.deposited_polymer_monomer_count, +# _rcsb_entry_info.polymer_entity_count_protein, +# _rcsb_entry_info.polymer_entity_count_nucleic_acid, +# _rcsb_entry_info.polymer_entity_count_nucleic_acid_hybrid, +# _rcsb_entry_info.polymer_entity_count_DNA, +# _rcsb_entry_info.polymer_entity_count_RNA, +# _rcsb_entry_info.nonpolymer_ligand_entity_count +# _rcsb_entry_info.selected_polymer_entity_types +# _rcsb_entry_info.polymer_entity_taxonomy_count +# _rcsb_entry_info.assembly_count +# add categories rcsb_entity_instance_domain_scop and rcsb_entity_instance_domain_cath +# 4-May-2019 jdw extend content in categories rcsb_entity_instance_domain_scop and rcsb_entity_instance_domain_cath +# 13-May-2019 jdw add rcsb_entry_info.deposited_polymer_entity_instance_count and deposited_nonpolymer_entity_instance_count +# add entity_poly.rcsb_non_std_monomer_count and rcsb_non_std_monomers +# 15-May-2019 jdw add _rcsb_entry_info.na_polymer_entity_types; update enumerations for _rcsb_entry_info.selected_polymer_entity_types +# 19-May-2019 jdw add method __getStructConfInfo() +# 21-May-2019 jdw handle odd ordering of records in struct_ref_seq_dif. +# 25-Nov-2019 jdw add method normalizeCitationJournalAbbrev() and dependencies +# +## +""" +Helper class implements entry-level method references in the RCSB dictionary extension. + +All data accessors and structures here refer to dictionary category and attribute names. 
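+ + A minimal usage sketch (illustrative only; assumes a configured DictMethodResourceProvider): + + rP = DictMethodResourceProvider(cfgOb) # cfgOb is a hypothetical configuration object + helper = DictMethodEntryHelper(resourceProvider=rP) + ok = helper.buildContainerEntryIds(dataContainer, "rcsb_entry_container_identifiers")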
+ +""" +__docformat__ = "restructuredtext en" +__author__ = "John Westbrook" +__email__ = "jwest@rcsb.rutgers.edu" +__license__ = "Apache 2.0" + +# pylint: disable=too-many-lines + +import logging +from string import capwords + +from mmcif.api.DataCategory import DataCategory + +logger = logging.getLogger(__name__) + + +def cmpElements(lhs, rhs): + return 0 if (lhs[-1].isdigit() or lhs[-1] in ["R", "S"]) and rhs[0].isdigit() else -1 + + +class DictMethodEntryHelper(object): + """Helper class implements entry-level method references in the RCSB dictionary extension.""" + + def __init__(self, **kwargs): + """ + Args: + resourceProvider: (obj) instance of DictMethodResourceProvider() + + """ + # + logger.debug("Dictionary entry method helper init with kwargs %r", kwargs) + self._raiseExceptions = kwargs.get("raiseExceptions", False) + # + rP = kwargs.get("resourceProvider") + self.__commonU = rP.getResource("DictMethodCommonUtils instance") if rP else None + self.__dApi = rP.getResource("Dictionary API instance (pdbx_core)") if rP else None + # + self.__crP = rP.getResource("CitationReferenceProvider instance") if rP else None + self.__jtaP = rP.getResource("JournalTitleAbbreviationProvider instance") if rP else None + # + # logger.debug("Dictionary entry method helper init") + + def echo(self, msg): + logger.info(msg) + + def deferredItemMethod(self, dataContainer, catName, atName, **kwargs): + """Placeholder for an item method.""" + _ = kwargs + logger.debug("Called deferred item method %r %r for %r", catName, atName, dataContainer.getName()) + return True + + def deferredCategoryMethod(self, dataContainer, catName, **kwargs): + """Placeholder for a category method.""" + _ = kwargs + logger.debug("Called deferred category method %r for %r", catName, dataContainer.getName()) + return True + + def setDatablockId(self, dataContainer, catName, atName, **kwargs): + """Item-level method to set the value of the input item to the current container name. + + Args: + dataContainer (object): mmif.api.DataContainer object instance + catName (str): Category name + atName (str): Attribute name + + Returns: + bool: True for success or False otherwise + """ + logger.debug("Starting catName %s atName %s kwargs %r", catName, atName, kwargs) + try: + val = dataContainer.getName() + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=[atName])) + # + cObj = dataContainer.getObj(catName) + if not cObj.hasAttribute(atName): + cObj.appendAttribute(atName) + # + rc = cObj.getRowCount() + numRows = rc if rc else 1 + for ii in range(numRows): + cObj.setValue(val, atName, ii) + return True + except Exception as e: + logger.exception("Failing with %s", str(e)) + return False + + def setLoadDateTime(self, dataContainer, catName, atName, **kwargs): + """Set the value of the input data item with container load date. 
+ + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + atName (str): Attribute name + + Returns: + bool: True for success or False otherwise + """ + logger.debug("Starting catName %s atName %s kwargs %r", catName, atName, kwargs) + try: + val = dataContainer.getProp("load_date") + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=[atName])) + # + cObj = dataContainer.getObj(catName) + if not cObj.hasAttribute(atName): + cObj.appendAttribute(atName) + # + rc = cObj.getRowCount() + numRows = rc if rc else 1 + for ii in range(numRows): + cObj.setValue(val, atName, ii) + return True + except Exception as e: + logger.exception("Failing with %s", str(e)) + return False + + def setLocator(self, dataContainer, catName, atName, **kwargs): + """Set the value of the input data item with container locator path. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + atName (str): Attribute name + + Returns: + bool: True for success or False otherwise + """ + logger.debug("Starting catName %s atName %s kwargs %r", catName, atName, kwargs) + try: + val = dataContainer.getProp("locator") + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=[atName])) + # + cObj = dataContainer.getObj(catName) + if not cObj.hasAttribute(atName): + cObj.appendAttribute(atName) + # + rc = cObj.getRowCount() + numRows = rc if rc else 1 + for ii in range(numRows): + cObj.setValue(val, atName, ii) + return True + except Exception as e: + logger.exception("Failing with %s", str(e)) + return False + + def setRowIndex(self, dataContainer, catName, atName, **kwargs): + """Set the values of the input data item with the category row index. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + atName (str): Attribute name + + Returns: + bool: True for success or False otherwise + """ + logger.debug("Starting catName %s atName %s kwargs %r", catName, atName, kwargs) + try: + if not dataContainer.exists(catName): + # exit if there is no category to index + return False + # + cObj = dataContainer.getObj(catName) + if not cObj.hasAttribute(atName): + cObj.appendAttribute(atName) + # + rc = cObj.getRowCount() + numRows = rc if rc else 1 + for ii, iRow in enumerate(range(numRows), 1): + # Note - we set the integer value as a string - + cObj.setValue(str(ii), atName, iRow) + return True + except Exception as e: + logger.exception("Failing with %s", str(e)) + return False + + def aggregateCitationOrcidIdentifiers(self, dataContainer, catName, atName, **kwargs): + """Set the value of the input data item with the list of ORCID identifiers for the citation authors. 
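+ + Values are aggregated from citation_author.identifier_ORCID as a comma-separated list, with "?" recorded when no identifiers are present.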
+ + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + atName (str): Attribute name + + Returns: + bool: True for success or False otherwise + """ + logger.debug("Starting catName %s atName %s kwargs %r", catName, atName, kwargs) + try: + if not dataContainer.exists(catName) or not dataContainer.exists("citation_author"): + return False + # + cObj = dataContainer.getObj(catName) + if not cObj.hasAttribute(atName): + cObj.appendAttribute(atName) + citIdL = cObj.getAttributeValueList("id") + # + tObj = dataContainer.getObj("citation_author") + # + + citIdL = list(set(citIdL)) + tD = {} + for ii, citId in enumerate(citIdL): + if tObj.hasAttribute("identifier_ORCID"): + tD[citId] = tObj.selectValuesWhere("identifier_ORCID", citId, "citation_id") + else: + tD[citId] = [] + for ii in range(cObj.getRowCount()): + citId = cObj.getValue("id", ii) + if tD[citId]: + cObj.setValue(",".join(tD[citId]), atName, ii) + else: + cObj.setValue("?", atName, ii) + return True + except Exception as e: + logger.exception("Failing for %r with %s", dataContainer.getName(), str(e)) + return False + + def aggregateCitationAuthors(self, dataContainer, catName, atName, **kwargs): + """Set the value of the input data item with list of citation authors. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + atName (str): Attribute name + + Returns: + bool: True for success or False otherwise + """ + logger.debug("Starting catName %s atName %s kwargs %r", catName, atName, kwargs) + try: + if not dataContainer.exists(catName) or not dataContainer.exists("citation_author"): + return False + # + cObj = dataContainer.getObj(catName) + if not cObj.hasAttribute(atName): + cObj.appendAttribute(atName) + citIdL = cObj.getAttributeValueList("id") + # + tObj = dataContainer.getObj("citation_author") + # + citIdL = list(set(citIdL)) + tD = {} + for ii, citId in enumerate(citIdL): + tD[citId] = tObj.selectValuesWhere("name", citId, "citation_id") + for ii in range(cObj.getRowCount()): + citId = cObj.getValue("id", ii) + cObj.setValue("|".join(tD[citId]), atName, ii) + return True + except Exception as e: + logger.exception("Failing for %r with %s", dataContainer.getName(), str(e)) + return False + + def normalizeCitationJournalAbbrev(self, dataContainer, catName, atName, **kwargs): + """Normalize the citation journal abbreviation using Medline and CrossRef reference data. 
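+ + Abbreviation resolution and fallback logic is implemented in __updateJournalAbbreviation().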
+ + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + atName (str): Attribute name + + Returns: + bool: True for success or False otherwise + """ + logger.debug("Starting catName %s atName %s kwargs %r", catName, atName, kwargs) + revAbbrev = None + try: + if not dataContainer.exists(catName): + return False + # + cObj = dataContainer.getObj(catName) + if not cObj.hasAttribute("journal_abbrev") or not cObj.hasAttribute("id") or not cObj.hasAttribute("journal_id_ISSN"): + return False + # + if not cObj.hasAttribute(atName): + cObj.appendAttribute(atName) + # + rcsbId = dataContainer.getName() + for ii in range(cObj.getRowCount()): + # citId = cObj.getValue("id", ii) + issn = cObj.getValueOrDefault("journal_id_ISSN", ii, defaultValue=None) + curAbbrev = cObj.getValueOrDefault("journal_abbrev", ii, defaultValue=None) + if curAbbrev: + revAbbrev = self.__updateJournalAbbreviation(rcsbId, issn, curAbbrev) + revAbbrev = revAbbrev if revAbbrev else curAbbrev + # + logger.debug("%s journal abbreviation issn %r current %r normalized %r", rcsbId, issn, curAbbrev, revAbbrev) + cObj.setValue(revAbbrev, atName, ii) + return True + except Exception as e: + logger.exception("Failing for %r with %s", dataContainer.getName(), str(e)) + return False + + def __updateJournalAbbreviation(self, rcsbId, issn, curAbbrev): + revAbbrev = None + try: + if issn: + medlineAbbrev = self.__crP.getMedlineJournalAbbreviation(issn) + # medlineIsoAbbrev = self.__crP.getMedlineJournalIsoAbbreviation(issn) + crIssn = issn.replace("-", "") + crTitle = self.__crP.getCrossRefJournalTitle(crIssn) + # + revAbbrev = medlineAbbrev + if not medlineAbbrev and not crTitle: + logger.debug("%s: missing information for issn %r curAbbrev %r", rcsbId, issn, curAbbrev) + revAbbrev = capwords(curAbbrev.replace(".", " ")) + elif not medlineAbbrev: + revAbbrev = self.__jtaP.getJournalAbbreviation(crTitle, usePunctuation=False) + else: + if curAbbrev.upper() in ["TO BE PUBLISHED", "IN PREPARATION"]: + revAbbrev = "To be published" + elif curAbbrev.upper().startswith("THESIS"): + revAbbrev = "Thesis" + else: + revAbbrev = capwords(curAbbrev.replace(".", " ")) + logger.debug("%r: missing issn and non-standard abbrev for %r", rcsbId, curAbbrev) + + if not curAbbrev: + logger.info("%r: missing issn and journal abbrev", rcsbId) + # + logger.debug("%s: revised: %r current: %r", rcsbId, revAbbrev, curAbbrev) + except Exception as e: + logger.exception("Failing on %r %r %r with %r", rcsbId, issn, curAbbrev, str(e)) + + return revAbbrev + + def assignPrimaryCitation(self, dataContainer, catName, atName, **kwargs): + """Assign the primary citation flag ("Y" for the citation with id PRIMARY, "N" otherwise). 
+ + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + atName (str): Attribute name + + Returns: + bool: True for success or False otherwise + """ + logger.debug("Starting catName %s atName %s kwargs %r", catName, atName, kwargs) + try: + if not dataContainer.exists(catName): + return False + # + cObj = dataContainer.getObj(catName) + if not cObj.hasAttribute(atName): + cObj.appendAttribute(atName) + # + for ii in range(cObj.getRowCount()): + citId = cObj.getValue("id", ii) + if citId.upper() == "PRIMARY": + cObj.setValue("Y", atName, ii) + else: + cObj.setValue("N", atName, ii) + return True + except Exception as e: + logger.exception("Failing for %r with %s", dataContainer.getName(), str(e)) + return False + + def __getEmdbIdentifiers(self, dataContainer): + """Extract EMDB cross-references from the database_2 and pdbx_database_related categories. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + + Returns: + (dict, dict): associated EMDB identifiers and other related EMDB identifiers, each mapped to a content type + + # + loop_ + _database_2.database_id + _database_2.database_code + PDB 6QUY + WWPDB D_1292100913 + EMDB EMD-4644 + # + loop_ + _pdbx_database_related.db_name + _pdbx_database_related.details + _pdbx_database_related.db_id + _pdbx_database_related.content_type + EMDB 'HsCKK (human CAMSAP1) decorated 13pf taxol-GDP microtubule (asymmetric unit)' EMD-4643 'other EM volume' + PDB 'HsCKK (human CAMSAP1) decorated 13pf taxol-GDP microtubule (asymmetric unit)' 6QUS unspecified + EMDB 'NgCKK (N.Gruberi CKK) decorated 13pf taxol-GDP microtubule' EMD-4644 'associated EM volume' + # + """ + emdbIdD = {} + emdbIdAltD = {} + if dataContainer.exists("database_2"): + dbObj = dataContainer.getObj("database_2") + for ii in range(dbObj.getRowCount()): + dbId = dbObj.getValue("database_id", ii) + dbCode = dbObj.getValue("database_code", ii) + if dbId.upper() == "EMDB": + emdbIdD[dbCode] = "associated EM volume" + + if dataContainer.exists("pdbx_database_related"): + drObj = dataContainer.getObj("pdbx_database_related") + for ii in range(drObj.getRowCount()): + dbCode = drObj.getValue("db_id", ii) + dbName = drObj.getValue("db_name", ii) + contentType = drObj.getValue("content_type", ii) + if dbName.upper() == "EMDB" and contentType.upper() == "ASSOCIATED EM VOLUME" and dbCode not in emdbIdD: + emdbIdD[dbCode] = "associated EM volume" + elif dbName.upper() == "EMDB" and contentType.upper() != "ASSOCIATED EM VOLUME" and dbCode not in emdbIdAltD: + emdbIdAltD[dbCode] = contentType + return emdbIdD, emdbIdAltD + + def buildContainerEntryIds(self, dataContainer, catName, **kwargs): + """Load the input category with rcsb_entry_container_identifiers content. + + Args: + dataContainer (object): mmcif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + For example: + + loop_ + _rcsb_entry_container_identifiers.entry_id + _rcsb_entry_container_identifiers.entity_ids + _rcsb_entry_container_identifiers.polymer_entity_ids + _rcsb_entry_container_identifiers.non-polymer_entity_ids + _rcsb_entry_container_identifiers.assembly_ids + _rcsb_entry_container_identifiers.rcsb_id + ... 
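+ + Also populated: branched_entity_ids, emdb_ids, related_emdb_ids, and model_ids.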
+ + """ + logger.debug("Starting catName %s kwargs %r", catName, kwargs) + try: + if not dataContainer.exists("entry"): + return False + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + # + cObj = dataContainer.getObj(catName) + + tObj = dataContainer.getObj("entry") + entryId = tObj.getValue("id", 0) + cObj.setValue(entryId, "entry_id", 0) + cObj.setValue(entryId, "rcsb_id", 0) + + # + tObj = dataContainer.getObj("entity") + entityIdL = tObj.getAttributeValueList("id") + cObj.setValue(",".join(entityIdL), "entity_ids", 0) + # + # + tIdL = tObj.selectValuesWhere("id", "polymer", "type") + tV = ",".join(tIdL) if tIdL else "?" + cObj.setValue(tV, "polymer_entity_ids", 0) + + tIdL = tObj.selectValuesWhere("id", "non-polymer", "type") + tV = ",".join(tIdL) if tIdL else "?" + cObj.setValue(tV, "non-polymer_entity_ids", 0) + # + tIdL = tObj.selectValuesWhere("id", "branched", "type") + tV = ",".join(tIdL) if tIdL else "?" + cObj.setValue(tV, "branched_entity_ids", 0) + # + # tIdL = tObj.selectValuesWhere("id", "water", "type") + # tV = ",".join(tIdL) if tIdL else "?" + # cObj.setValue(tV, "water_entity_ids", 0) + # + tObj = dataContainer.getObj("pdbx_struct_assembly") + assemblyIdL = tObj.getAttributeValueList("id") if tObj else [] + tV = ",".join(assemblyIdL) if assemblyIdL else "?" + cObj.setValue(tV, "assembly_ids", 0) + # + # + emdbIdD, emdbIdAltD = self.__getEmdbIdentifiers(dataContainer) + tV = ",".join([tId for tId in emdbIdD]) if emdbIdD else "?" + cObj.setValue(tV, "emdb_ids", 0) + tV = ",".join([tId for tId in emdbIdAltD]) if emdbIdAltD else "?" + cObj.setValue(tV, "related_emdb_ids", 0) + # + modelIdList = self.__commonU.getModelIdList(dataContainer) + tV = ",".join([str(tId) for tId in modelIdList]) if modelIdList else "?" + cObj.setValue(tV, "model_ids", 0) + # + return True + except Exception as e: + logger.exception("For %s failing with %s", catName, str(e)) + return False + + def consolidateAccessionDetails(self, dataContainer, catName, **kwargs): + """Consolidate accession details into the rcsb_accession_info category. Also include + a flag for the availability of any supporting experimental data. + + Args: + dataContainer (object): mmif.api.DataContainer object instance + catName (str): Category name + + Returns: + bool: True for success or False otherwise + + For example: + For example - + _rcsb_accession_info.entry_id 1ABC + _rcsb_accession_info.status_code REL + _rcsb_accession_info.deposit_date 2018-01-11 + _rcsb_accession_info.initial_release_date 2018-03-23 + _rcsb_accession_info.major_revision 1 + _rcsb_accession_info.minor_revision 2 + _rcsb_accession_info.revision_date 2018-10-25 + + + Taking data values from: + + _pdbx_database_status.entry_id 3OQP + _pdbx_database_status.deposit_site RCSB + _pdbx_database_status.process_site RCSB + _pdbx_database_status.recvd_initial_deposition_date 2010-09-03 + _pdbx_database_status.status_code REL + _pdbx_database_status.status_code_sf REL + _pdbx_database_status.status_code_mr ? + _pdbx_database_status.status_code_cs ? + _pdbx_database_status.pdb_format_compatible Y + _pdbx_database_status.methods_development_category ? 
+ _pdbx_database_status.SG_entry Y + # + loop_ + _pdbx_audit_revision_history.ordinal + _pdbx_audit_revision_history.data_content_type + _pdbx_audit_revision_history.major_revision + _pdbx_audit_revision_history.minor_revision + _pdbx_audit_revision_history.revision_date + 1 'Structure model' 1 0 2010-10-13 + 2 'Structure model' 1 1 2011-07-13 + 3 'Structure model' 1 2 2011-07-20 + 4 'Structure model' 1 3 2014-11-12 + 5 'Structure model' 1 4 2017-10-25 + # + + # - For EM and SAS - + _pdbx_database_related.db_name EMDB + _pdbx_database_related.details + 'pseudo-atomic model of the RNA polymerase lambda-based antitermination complex solved by cryo-EM' + _pdbx_database_related.db_id EMD-3561 + _pdbx_database_related.content_type 'associated EM volume' + """ + ## + try: + logger.debug("Starting with %r %r %r", dataContainer.getName(), catName, kwargs) + # + # If there is incomplete accession information then exit + if not (dataContainer.exists("pdbx_database_status") or dataContainer.exists("pdbx_audit_revision_history")): + return False + # Create the new target category + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + + cObj = dataContainer.getObj(catName) + # + tObj = dataContainer.getObj("pdbx_database_status") + entryId = tObj.getValue("entry_id", 0) + statusCode = tObj.getValue("status_code", 0) + depositDate = tObj.getValue("recvd_initial_deposition_date", 0) + # + cObj.setValue(entryId, "entry_id", 0) + cObj.setValue(statusCode, "status_code", 0) + cObj.setValue(depositDate, "deposit_date", 0) + # cObj.setValue(depositDate[:4], "deposit_year", 0) + # + # -- Experimental data availability -- + # + expDataRelFlag = "N" + statusSf = tObj.getValueOrDefault("status_code_sf", 0, defaultValue=None) + statusMr = tObj.getValueOrDefault("status_code_mr", 0, defaultValue=None) + statusCs = tObj.getValueOrDefault("status_code_cs", 0, defaultValue=None) + # + if statusSf == "REL" or statusMr == "REL" or statusCs == "REL": + expDataRelFlag = "Y" + else: + if dataContainer.exists("pdbx_database_related"): + rObj = dataContainer.getObj("pdbx_database_related") + ctL = rObj.getAttributeValueList("content_type") + if "associated EM volume" in ctL or "associated SAS data" in ctL: + expDataRelFlag = "Y" + # + cObj.setValue(expDataRelFlag, "has_released_experimental_data", 0) + # + tObj = dataContainer.getObj("pdbx_audit_revision_history") + nRows = tObj.getRowCount() + # Assuming the default sorting order from the release module - + releaseDate = tObj.getValue("revision_date", 0) + minorRevision = tObj.getValue("minor_revision", nRows - 1) + majorRevision = tObj.getValue("major_revision", nRows - 1) + revisionDate = tObj.getValue("revision_date", nRows - 1) + cObj.setValue(releaseDate, "initial_release_date", 0) + # cObj.setValue(releaseDate[:4], "initial_release_year", 0) + cObj.setValue(minorRevision, "minor_revision", 0) + cObj.setValue(majorRevision, "major_revision", 0) + cObj.setValue(revisionDate, "revision_date", 0) + # + return True + except Exception as e: + logger.exception("In %s for %s failing with %s", dataContainer.getName(), catName, str(e)) + return False + + def addEntryInfo(self, dataContainer, catName, **kwargs): + """ + Add _rcsb_entry_info, for example: + _rcsb_entry_info.entry_id 1ABC + _rcsb_entry_info.polymer_composition 'heteromeric protein' + _rcsb_entry_info.experimental_method 'multiple methods' + _rcsb_entry_info.experimental_method_count 2 + 
_rcsb_entry_info.polymer_entity_count 2 + _rcsb_entry_info.entity_count 2 + _rcsb_entry_info.nonpolymer_entity_count 2 + _rcsb_entry_info.branched_entity_count 0 + _rcsb_entry_info.software_programs_combined 'Phenix;RefMac' + .... + + Also add the related field: + + _entity_poly.rcsb_entity_polymer_type + + 'Protein' 'polypeptide(D) or polypeptide(L)' + 'DNA' 'polydeoxyribonucleotide' + 'RNA' 'polyribonucleotide' + 'NA-hybrid' 'polydeoxyribonucleotide/polyribonucleotide hybrid' + 'Other' 'polysaccharide(D), polysaccharide(L), cyclic-pseudo-peptide, peptide nucleic acid, or other' + # + _rcsb_entry_info.deposited_polymer_monomer_count + 'polymer_entity_count_protein', + 'polymer_entity_count_nucleic_acid', + 'polymer_entity_count_nucleic_acid_hybrid', + 'polymer_entity_count_DNA', + 'polymer_entity_count_RNA', + + """ + try: + logger.debug("Starting with %r %r %r", dataContainer.getName(), catName, kwargs) + # Exit if source categories are missing + if not (dataContainer.exists("exptl") and dataContainer.exists("entity")): + return False + # + # Create the new target category rcsb_entry_info + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + # -------------------------------------------------------------------------------------------------------- + # catName = rcsb_entry_info + cObj = dataContainer.getObj(catName) + # + # -------------------------------------------------------------------------------------------------------- + # Filter experimental methods + # + xObj = dataContainer.getObj("exptl") + entryId = xObj.getValue("entry_id", 0) + methodL = xObj.getAttributeValueList("method") + methodCount, expMethod = self.__commonU.filterExperimentalMethod(methodL) + cObj.setValue(entryId, "entry_id", 0) + cObj.setValue(expMethod, "experimental_method", 0) + cObj.setValue(methodCount, "experimental_method_count", 0) + # + # -------------------------------------------------------------------------------------------------------- + # Experimental resolution - + # + resL = self.__filterExperimentalResolution(dataContainer) + if resL: + cObj.setValue(",".join(resL), "resolution_combined", 0) + # + # --------------------------------------------------------------------------------------------------------- + # Consolidate software details - + # + swNameL = [] + if dataContainer.exists("software"): + swObj = dataContainer.getObj("software") + swNameL.extend(swObj.getAttributeUniqueValueList("name")) + if dataContainer.exists("pdbx_nmr_software"): + swObj = dataContainer.getObj("pdbx_nmr_software") + swNameL.extend(swObj.getAttributeUniqueValueList("name")) + if dataContainer.exists("em_software"): + swObj = dataContainer.getObj("em_software") + swNameL.extend(swObj.getAttributeUniqueValueList("name")) + if swNameL: + swNameD = {swName.upper().strip(): True for swName in swNameL if swName not in [".", "?"]} + swNameL = sorted(swNameD.keys()) + cObj.setValue(";".join(swNameL), "software_programs_combined", 0) + # --------------------------------------------------------------------------------------------------------- + # ENTITY FEATURES + # + # entity and polymer entity counts - + ## + eObj = dataContainer.getObj("entity") + eTypeL = eObj.getAttributeValueList("type") + # + numPolymers = 0 + numNonPolymers = 0 + numBranched = 0 + numSolvent = 0 + for eType in eTypeL: + if eType == "polymer": + numPolymers += 1 + elif eType == "non-polymer": + numNonPolymers += 1 + elif eType == "branched": + numBranched += 1 
+ elif eType == "water": + numSolvent += 1 + else: + logger.error("Unexpected entity type for %s %s", dataContainer.getName(), eType) + totalEntities = numPolymers + numNonPolymers + numBranched + numSolvent + # + # Simplified entity polymer type: 'Protein', 'DNA', 'RNA', 'NA-hybrid', or 'Other' + pTypeL = [] + if dataContainer.exists("entity_poly"): + epObj = dataContainer.getObj("entity_poly") + pTypeL = epObj.getAttributeValueList("type") + # + atName = "rcsb_entity_polymer_type" + if not epObj.hasAttribute(atName): + epObj.appendAttribute(atName) + for ii in range(epObj.getRowCount()): + epObj.setValue(self.__commonU.filterEntityPolyType(pTypeL[ii]), atName, ii) + # + # Add any branched entity types to the type list - + if dataContainer.exists("pdbx_entity_branch"): + ebObj = dataContainer.getObj("pdbx_entity_branch") + pTypeL.extend(ebObj.getAttributeValueList("type")) + # + polymerCompClass, ptClass, naClass, eptD = self.__commonU.getPolymerComposition(pTypeL) + if eptD and len(eptD) > 2: + logger.debug("%s entity type count=%d class=%s typeD %r", dataContainer.getName(), len(eptD), polymerCompClass, eptD) + # + cObj.setValue(polymerCompClass, "polymer_composition", 0) + cObj.setValue(ptClass, "selected_polymer_entity_types", 0) + cObj.setValue(naClass, "na_polymer_entity_types", 0) + cObj.setValue(numPolymers, "polymer_entity_count", 0) + cObj.setValue(numNonPolymers, "nonpolymer_entity_count", 0) + cObj.setValue(numBranched, "branched_entity_count", 0) + cObj.setValue(numSolvent, "solvent_entity_count", 0) + cObj.setValue(totalEntities, "entity_count", 0) + # + num = eptD["protein"] if "protein" in eptD else 0 + cObj.setValue(num, "polymer_entity_count_protein", 0) + # + num = eptD["NA-hybrid"] if "NA-hybrid" in eptD else 0 + cObj.setValue(num, "polymer_entity_count_nucleic_acid_hybrid", 0) + # + numDNA = eptD["DNA"] if "DNA" in eptD else 0 + cObj.setValue(numDNA, "polymer_entity_count_DNA", 0) + # + numRNA = eptD["RNA"] if "RNA" in eptD else 0 + cObj.setValue(numRNA, "polymer_entity_count_RNA", 0) + cObj.setValue(numDNA + numRNA, "polymer_entity_count_nucleic_acid", 0) + # + # --------------------------------------------------------------------------------------------------------- + # INSTANCE FEATURES + # + ## + repModelL = ["1"] + if self.__commonU.hasMethodNMR(methodL): + repModelL = self.__getRepresentativeModels(dataContainer) + logger.debug("Representative model list %r", repModelL) + # + instanceTypeCountD = self.__commonU.getInstanceTypeCounts(dataContainer) + cObj.setValue(instanceTypeCountD["polymer"], "deposited_polymer_entity_instance_count", 0) + cObj.setValue(instanceTypeCountD["non-polymer"], "deposited_nonpolymer_entity_instance_count", 0) + + # + # Various atom counts - + # + repModelId = repModelL[0] + numHeavyAtomsModel, numHydrogenAtomsModel, numAtomsTotal, numModelsTotal = self.__commonU.getDepositedAtomCounts(dataContainer, modelId=repModelId) + # + logger.debug("numAtomsTotal %d numHeavyAtomsModel %d numModelsTotal %d", numAtomsTotal, numHeavyAtomsModel, numModelsTotal) + logger.debug("entity type atom counts %r", self.__commonU.getEntityTypeHeavyAtomCounts(dataContainer, modelId=repModelId)) + logger.debug("instance atom counts %r", self.__commonU.getEntityTypeHeavyAtomCounts(dataContainer, modelId=repModelId)) + # + + if numHeavyAtomsModel > 0: + cObj.setValue(numHeavyAtomsModel, "deposited_atom_count", 0) + cObj.setValue(numModelsTotal, "deposited_model_count", 0) + cObj.setValue(numHydrogenAtomsModel, "deposited_hydrogen_atom_count", 0) + tCD = 
self.__commonU.getEntityTypeHeavyAtomCounts(dataContainer, modelId=repModelId) + wCount = tCD["water"] if tCD and "water" in tCD else 0 + cObj.setValue(wCount, "deposited_solvent_atom_count", 0) + # + # --------------------------------------------------------------------------------------------------------- + # Deposited monomer/residue instance counts + # + # Get modeled and unmodeled residue counts + # + modeledCount, unModeledCount = self.__commonU.getDepositedMonomerCounts(dataContainer, modelId=repModelId) + cObj.setValue(modeledCount, "deposited_modeled_polymer_monomer_count", 0) + cObj.setValue(unModeledCount, "deposited_unmodeled_polymer_monomer_count", 0) + cObj.setValue(modeledCount + unModeledCount, "deposited_polymer_monomer_count", 0) + # + # --------------------------------------------------------------------------------------------------------- + # Counts of intermolecular bonds/linkages + # + # + bCountsD = self.__commonU.getInstanceConnectionCounts(dataContainer) + cObj.setValue(bCountsD["disulf"], "disulfide_bond_count", 0) + cObj.setValue(bCountsD["metalc"], "inter_mol_metalic_bond_count", 0) + cObj.setValue(bCountsD["covale"], "inter_mol_covalent_bond_count", 0) + # + cisPeptideD = self.__commonU.getCisPeptides(dataContainer) + cObj.setValue(len(cisPeptideD), "cis_peptide_count", 0) + # + # This is reset in anothor method - filterSourceOrganismDetails() + cObj.setValue(None, "polymer_entity_taxonomy_count", 0) + # + fw = self.__commonU.getFormulaWeightNonSolvent(dataContainer) + cObj.setValue(str(round(fw, 2)), "molecular_weight", 0) + # + # nonpolymer_bound_components + # + bcL = self.__commonU.getBoundNonpolymersComponentIds(dataContainer) + if bcL: + cObj.setValue(";".join(bcL), "nonpolymer_bound_components", 0) + # + # polymer_molecular_weight_minimum + # polymer_molecular_weight_maximum + # nonpolymer_molecular_weight_minimum + # nonpolymer_molecular_weight_maximum + # branched_molecular_weight_minimum + # branched_molecular_weight_maximum + # + fwBoundD = self.__commonU.getEntityFormulaWeightBounds(dataContainer) + if "polymer" in fwBoundD and fwBoundD["polymer"]["min"] and fwBoundD["polymer"]["max"]: + cObj.setValue(str(round(fwBoundD["polymer"]["min"], 2)), "polymer_molecular_weight_minimum", 0) + cObj.setValue(str(round(fwBoundD["polymer"]["max"], 2)), "polymer_molecular_weight_maximum", 0) + if "non-polymer" in fwBoundD and fwBoundD["non-polymer"]["min"] and fwBoundD["non-polymer"]["max"]: + cObj.setValue(str(round(fwBoundD["non-polymer"]["min"], 2)), "nonpolymer_molecular_weight_minimum", 0) + cObj.setValue(str(round(fwBoundD["non-polymer"]["max"], 2)), "nonpolymer_molecular_weight_maximum", 0) + if "branched" in fwBoundD and fwBoundD["branched"]["min"] and fwBoundD["branched"]["max"]: + cObj.setValue(str(round(fwBoundD["branched"]["min"], 2)), "branched_molecular_weight_minimum", 0) + cObj.setValue(str(round(fwBoundD["branched"]["max"], 2)), "branched_molecular_weight_maximum", 0) + # + # polymer_monomer_count_maximum + # polymer_monomer_count_minimum + # + polymerLengthBounds = self.__commonU.getEntityPolymerLengthBounds(dataContainer) + if polymerLengthBounds: + cObj.setValue(str(polymerLengthBounds[0]), "polymer_monomer_count_minimum", 0) + cObj.setValue(str(polymerLengthBounds[1]), "polymer_monomer_count_maximum", 0) + # + # --------------------------------------------------------------------------------------------------------- + # Consolidate diffraction wavelength details - + wL = [] + try: + if dataContainer.exists("diffrn_radiation_wavelength"): + 
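+            # Wavelength values may be recorded in diffrn_radiation_wavelength, diffrn_radiation,
+            # or diffrn_source; pdbx_wavelength_list entries may be comma-separated strings.
+            #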
+            wL = []
+            try:
+                if dataContainer.exists("diffrn_radiation_wavelength"):
+                    swObj = dataContainer.getObj("diffrn_radiation_wavelength")
+                    wL.extend(swObj.getAttributeUniqueValueList("wavelength"))
+                if dataContainer.exists("diffrn_radiation"):
+                    swObj = dataContainer.getObj("diffrn_radiation")
+                    if swObj.hasAttribute("pdbx_wavelength"):
+                        wL.extend(swObj.getAttributeUniqueValueList("pdbx_wavelength"))
+                    if swObj.hasAttribute("pdbx_wavelength_list"):
+                        tL = []
+                        for tS in swObj.getAttributeUniqueValueList("pdbx_wavelength_list"):
+                            tL.extend(tS.split(","))
+                        if tL:
+                            wL.extend(tL)
+                if dataContainer.exists("diffrn_source"):
+                    swObj = dataContainer.getObj("diffrn_source")
+                    if swObj.hasAttribute("pdbx_wavelength"):
+                        wL.extend(swObj.getAttributeUniqueValueList("pdbx_wavelength"))
+                    if swObj.hasAttribute("pdbx_wavelength_list"):
+                        tL = []
+                        for tS in swObj.getAttributeUniqueValueList("pdbx_wavelength_list"):
+                            tL.extend(tS.split(","))
+                        if tL:
+                            wL.extend(tL)
+                fL = []
+                for wS in wL:
+                    try:
+                        fL.append(float(wS))
+                    except Exception:
+                        pass
+                if fL:
+                    cObj.setValue("%.4f" % min(fL), "diffrn_radiation_wavelength_minimum", 0)
+                    cObj.setValue("%.4f" % max(fL), "diffrn_radiation_wavelength_maximum", 0)
+
+            except Exception as e:
+                logger.exception("%s failing wavelength processing with %s", entryId, str(e))
+            #
+            # JDW
+            self.__updateReflnsResolution(dataContainer)
+            return True
+        except Exception as e:
+            logger.exception("For %s %r failing with %s", dataContainer.getName(), catName, str(e))
+        #
+        return False
+
+    def filterBlockByMethod(self, dataContainer, blockName, **kwargs):
+        """Filter empty placeholder data categories by experimental method."""
+        logger.debug("Starting with %r blockName %r kwargs %r", dataContainer.getName(), blockName, kwargs)
+        try:
+            if not dataContainer.exists("exptl"):
+                return False
+            #
+            xObj = dataContainer.getObj("exptl")
+            methodL = xObj.getAttributeValueList("method")
+            objNameL = []
+            # Test for a diffraction method in the case of multiple methods
+            if len(methodL) > 1:
+                isXtal = False
+                for method in methodL:
+                    if method in ["X-RAY DIFFRACTION", "FIBER DIFFRACTION", "POWDER DIFFRACTION", "ELECTRON CRYSTALLOGRAPHY", "NEUTRON DIFFRACTION", "ELECTRON DIFFRACTION"]:
+                        isXtal = True
+                        break
+                if not isXtal:
+                    objNameL = ["cell", "symmetry", "refine", "refine_hist", "software", "diffrn", "diffrn_radiation"]
+            else:
+                #
+                mS = methodL[0].upper()
+                if mS in ["X-RAY DIFFRACTION", "FIBER DIFFRACTION", "POWDER DIFFRACTION", "ELECTRON CRYSTALLOGRAPHY", "NEUTRON DIFFRACTION", "ELECTRON DIFFRACTION"]:
+                    objNameL = []
+                elif mS in ["SOLUTION NMR", "SOLID-STATE NMR"]:
+                    objNameL = ["cell", "symmetry", "refine", "refine_hist", "software", "diffrn", "diffrn_radiation"]
+                elif mS in ["ELECTRON MICROSCOPY", "CRYO-ELECTRON MICROSCOPY"]:
+                    objNameL = ["cell", "symmetry", "refine", "refine_hist", "software", "diffrn", "diffrn_radiation"]
+                elif mS in ["SOLUTION SCATTERING", "EPR", "THEORETICAL MODEL", "INFRARED SPECTROSCOPY", "FLUORESCENCE TRANSFER"]:
+                    objNameL = ["cell", "symmetry", "refine", "refine_hist", "software", "diffrn", "diffrn_radiation"]
+                else:
+                    logger.error("%s Unexpected method %r", dataContainer.getName(), mS)
+            #
+            for objName in objNameL:
+                dataContainer.remove(objName)
+            return True
+        except Exception as e:
+            logger.exception("For %s failing with %s", dataContainer.getName(), str(e))
+        return False
+
+    def filterEnumerations(self, dataContainer, catName, atName, **kwargs):
+        """Standardize the item value to conform to enumeration specifications."""
+        logger.debug("Starting with %r %r %r %r", dataContainer.getName(), atName, catName, kwargs)
+        subD = {("pdbx_reference_molecule", "class"): [("Anti-tumor", "Antitumor")]}
+        try:
+            if not dataContainer.exists(catName):
+                return False
+            #
+            cObj = dataContainer.getObj(catName)
+            if not cObj.hasAttribute(atName):
+                return False
+            #
+            subL = subD[(catName, atName)] if (catName, atName) in subD else []
+            #
+            for ii in range(cObj.getRowCount()):
+                tV = cObj.getValue(atName, ii)
+                if tV and tV not in [".", "?"]:
+                    for sub in subL:
+                        if sub[0] in tV:
+                            tV = tV.replace(sub[0], sub[1])
+                            cObj.setValue(tV, atName, ii)
+            return True
+        except Exception as e:
+            logger.exception("%s %s %s failing with %s", dataContainer.getName(), catName, atName, str(e))
+        return False
+
+    def __getRepresentativeModels(self, dataContainer):
+        """Return the list of representative models.
+
+        Example:
+            #
+            _pdbx_nmr_ensemble.entry_id                                    5TM0
+            _pdbx_nmr_ensemble.conformers_calculated_total_number          15
+            _pdbx_nmr_ensemble.conformers_submitted_total_number           15
+            _pdbx_nmr_ensemble.conformer_selection_criteria                'all calculated structures submitted'
+            _pdbx_nmr_ensemble.representative_conformer                    ?
+            _pdbx_nmr_ensemble.average_constraints_per_residue             ?
+            _pdbx_nmr_ensemble.average_constraint_violations_per_residue   ?
+            _pdbx_nmr_ensemble.maximum_distance_constraint_violation       ?
+            _pdbx_nmr_ensemble.average_distance_constraint_violation       ?
+            _pdbx_nmr_ensemble.maximum_upper_distance_constraint_violation ?
+            _pdbx_nmr_ensemble.maximum_lower_distance_constraint_violation ?
+            _pdbx_nmr_ensemble.distance_constraint_violation_method        ?
+            _pdbx_nmr_ensemble.maximum_torsion_angle_constraint_violation  ?
+            _pdbx_nmr_ensemble.average_torsion_angle_constraint_violation  ?
+            _pdbx_nmr_ensemble.torsion_angle_constraint_violation_method   ?
+            #
+            _pdbx_nmr_representative.entry_id            5TM0
+            _pdbx_nmr_representative.conformer_id        1
+            _pdbx_nmr_representative.selection_criteria  'fewest violations'
+        """
+        repModelL = []
+        if dataContainer.exists("pdbx_nmr_representative"):
+            tObj = dataContainer.getObj("pdbx_nmr_representative")
+            if tObj.hasAttribute("conformer_id"):
+                for ii in range(tObj.getRowCount()):
+                    nn = tObj.getValue("conformer_id", ii)
+                    if nn is not None and nn.isdigit():
+                        repModelL.append(nn)
+
+        if dataContainer.exists("pdbx_nmr_ensemble"):
+            tObj = dataContainer.getObj("pdbx_nmr_ensemble")
+            if tObj.hasAttribute("representative_conformer"):
+                nn = tObj.getValue("representative_conformer", 0)
+                if nn is not None and nn and nn.isdigit():
+                    repModelL.append(nn)
+        #
+        repModelL = list(set(repModelL))
+        if not repModelL:
+            logger.debug("Missing representative model data for %s using 1", dataContainer.getName())
+            repModelL = ["1"]
+
+        return repModelL
+
+    def __filterExperimentalResolution(self, dataContainer):
+        """Collect resolution estimates from method specific sources."""
+        rL = []
+        if dataContainer.exists("refine"):
+            tObj = dataContainer.getObj("refine")
+            if tObj.hasAttribute("ls_d_res_high"):
+                for ii in range(tObj.getRowCount()):
+                    rv = tObj.getValue("ls_d_res_high", ii)
+                    if self.__commonU.isFloat(rv):
+                        rL.append(rv)
+
+        if dataContainer.exists("em_3d_reconstruction"):
+            tObj = dataContainer.getObj("em_3d_reconstruction")
+            if tObj.hasAttribute("resolution"):
+                for ii in range(tObj.getRowCount()):
+                    rv = tObj.getValue("resolution", ii)
+                    if self.__commonU.isFloat(rv):
+                        rL.append(rv)
+        return rL
+
+    def addCategoryPrimaryCitation(self, dataContainer, blockName, **kwargs):
+        """Add the rcsb_primary_citation category as a copy of the citation category
+        with rcsb extensions.
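+
+        Only the citation row with id 'primary' is copied, and only for the
+        attributes defined for rcsb_primary_citation in the current dictionary.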
+ """ + catName = None + try: + logger.debug("Starting with %r %r %r", dataContainer.getName(), blockName, kwargs) + # Exit if source categories are missing + if not dataContainer.exists("citation"): + return False + cObj = dataContainer.getObj("citation") + catName = "rcsb_primary_citation" + # + if not dataContainer.exists(catName): + dataContainer.append(DataCategory(catName, attributeNameList=self.__dApi.getAttributeNameList(catName))) + # -------------------------------------------------------------------------------------------------------- + rObj = dataContainer.getObj(catName) + atNameList = self.__dApi.getAttributeNameList(catName) + logger.debug("Category %s dict attributes %r", catName, atNameList) + # + for ii in range(cObj.getRowCount()): + pv = cObj.getValue("id", ii) + if pv.upper() == "PRIMARY": + for atName in atNameList: + if cObj.hasAttribute(atName): + rObj.setValue(cObj.getValue(atName, ii), atName, 0) + + return True + except Exception as e: + logger.exception("%s %s failing with %s", dataContainer.getName(), catName, str(e)) + return False + + def __updateReflnsResolution(self, dataContainer): + """Find a plausable data collection diffraction high resolution limit from one of the following sources. + # + _rcsb_entry_info.diffrn_resolution_high_value + _rcsb_entry_info.diffrn_resolution_high_provenance_source + + Update category 'reflns' with any missing resolution extrema data using limits in category reflns_shell. + + _reflns.entry_id 2DCG + _reflns.d_resolution_high 0.900 + _reflns.pdbx_diffrn_id 1 + _reflns.pdbx_ordinal 1 + + + _refine.entry_id 2DCG + _refine.ls_number_reflns_obs 15000 + _refine.ls_number_reflns_all ? + _refine.pdbx_ls_sigma_I 2.000 + _refine.ls_d_res_low ? + _refine.ls_d_res_high 0.900 + _refine.pdbx_refine_id 'X-RAY DIFFRACTION' + _refine.pdbx_diffrn_id 1 + + _reflns_shell.d_res_high 1.18 + _reflns_shell.d_res_low 1.25 + _reflns_shell.pdbx_ordinal 1 + _reflns_shell.pdbx_diffrn_id 1 + # + + """ + try: + logger.debug("Starting with %r", dataContainer.getName()) + # + if not dataContainer.exists("exptl") or not dataContainer.exists("rcsb_entry_info"): + return False + # -------------------------------------------------------------------------------------------------------- + # Only applicable to X-ray + # + xObj = dataContainer.getObj("exptl") + methodL = xObj.getAttributeValueList("method") + _, expMethod = self.__commonU.filterExperimentalMethod(methodL) + if expMethod not in ["X-ray", "Neutron", "Multiple methods"]: + return False + # + resValue = resProvSource = None + # + # Here are the various cases - + if dataContainer.exists("reflns"): + rObj = dataContainer.getObj("reflns") + if rObj.hasAttribute("d_resolution_high"): + rvL = rObj.getAttributeValueList("d_resolution_high") + fvL = [float(rv) for rv in rvL if self.__commonU.isFloat(rv)] + if fvL: + resValue = round(min(fvL), 2) + resProvSource = "Depositor assigned" + + if not resValue and dataContainer.exists("reflns_shell"): + rObj = dataContainer.getObj("reflns_shell") + if rObj.hasAttribute("d_res_high"): + rvL = rObj.getAttributeValueList("d_res_high") + fvL = [float(rv) for rv in rvL if self.__commonU.isFloat(rv)] + if fvL: + resValue = round(min(fvL), 2) + resProvSource = "From the high resolution shell" + + if not resValue and dataContainer.exists("refine"): + + rObj = dataContainer.getObj("refine") + if rObj.hasAttribute("ls_d_res_high"): + fvL = [] + for ii in range(rObj.getRowCount()): + rId = rObj.getValue("pdbx_refine_id", ii) + if rId in ["X-RAY DIFFRACTION", "NEUTRON 
DIFFRACTION", "FIBER DIFFRACTION"]: + rv = rObj.getValue("ls_d_res_high", ii) + if self.__commonU.isFloat(rv): + fvL.append(float(rv)) + if fvL: + resValue = round(min(fvL), 2) + resProvSource = "From refinement resolution cutoff" + # + if not resValue: + logger.debug("No source of data collection resolution available for %r", dataContainer.getName()) + else: + logger.debug("Data collection diffraction limit %r PS %r", resValue, resProvSource) + + if resValue: + eObj = dataContainer.getObj("rcsb_entry_info") + for atName in ["diffrn_resolution_high_value", "diffrn_resolution_high_provenance_source"]: + if not eObj.hasAttribute(atName): + eObj.appendAttribute(atName) + eObj.setValue(resValue, "diffrn_resolution_high_value", 0) + eObj.setValue(resProvSource, "diffrn_resolution_high_provenance_source", 0) + # -------------------------------------------------------------------------------------------------------- + return True + except Exception as e: + logger.exception("%s failing with %s", dataContainer.getName(), str(e)) + return False diff --git a/rcsb/utils/dictionary/DictMethodResourceProvider.py b/rcsb/utils/dictionary/DictMethodResourceProvider.py new file mode 100644 index 0000000..ec4cf74 --- /dev/null +++ b/rcsb/utils/dictionary/DictMethodResourceProvider.py @@ -0,0 +1,393 @@ +## +# File: DictMethodResourceProvider.py +# Author: J. Westbrook +# Date: 3-Jun-2019 +# Version: 0.001 Initial version +# +# +# Updates: +# 17-Jul-2019 jdw add resource for common utilities and dictionary api +# 7-Aug-2019 jdw use dictionary locator map +# 13-Aug-2019 jdw return class instances in all cases. Add cache management support. +# 9-Sep-2019 jdw add AtcProvider() and SiftsSummaryProvider() +# 25-Nov-2019 jdw add CitationReferenceProvider(), ChemCompProvider() and JournalTitleAbbreviationProvider()'s +# 16-Feb-2020 jdw add support for configuration of development resources +# 19-Mar-2020 jdw add ResidProvider() and send cachePath directly to all modules in rcsb.utils.chemref. +# 29-Jul-2020 jdw add PubChemProvider() from rcsb.utils.chemref. +# 30-Jul-2020 jdw add PharosProvider() from rcsb.utils.chemref. +# 29-Oct-2020 jdw add method getReferenceSequenceAlignmentOpt() +## +## +""" +Resource provider for DictMethodHelper tools. 
+ +""" +__docformat__ = "restructuredtext en" +__author__ = "John Westbrook" +__email__ = "jwest@rcsb.rutgers.edu" +__license__ = "Apache 2.0" + +import logging +import os +import platform +import resource +import time + +from rcsb.utils.dictionary.DictionaryApiProviderWrapper import DictionaryApiProviderWrapper +from rcsb.utils.dictionary.DictMethodCommonUtils import DictMethodCommonUtils +from rcsb.utils.chemref.AtcProvider import AtcProvider +from rcsb.utils.chemref.ChemCompModelProvider import ChemCompModelProvider +from rcsb.utils.chemref.ChemCompProvider import ChemCompProvider +from rcsb.utils.chemref.DrugBankProvider import DrugBankProvider +from rcsb.utils.chemref.PsiModProvider import PsiModProvider +from rcsb.utils.chemref.PharosProvider import PharosProvider +from rcsb.utils.chemref.PubChemProvider import PubChemProvider +from rcsb.utils.chemref.RcsbLigandScoreProvider import RcsbLigandScoreProvider +from rcsb.utils.chemref.ResidProvider import ResidProvider +from rcsb.utils.citation.CitationReferenceProvider import CitationReferenceProvider +from rcsb.utils.citation.JournalTitleAbbreviationProvider import JournalTitleAbbreviationProvider +from rcsb.utils.ec.EnzymeDatabaseProvider import EnzymeDatabaseProvider +from rcsb.utils.io.SingletonClass import SingletonClass +from rcsb.utils.seq.SiftsSummaryProvider import SiftsSummaryProvider +from rcsb.utils.struct.CathClassificationProvider import CathClassificationProvider +from rcsb.utils.struct.ScopClassificationProvider import ScopClassificationProvider +from rcsb.utils.taxonomy.TaxonomyProvider import TaxonomyProvider +from rcsb.utils.validation.ValidationReportProvider import ValidationReportProvider + +logger = logging.getLogger(__name__) + + +class DictMethodResourceProvider(SingletonClass): + """Resource provider for DictMethodHelper tools.""" + + def __init__(self, cfgOb, **kwargs): + """Resource provider for dictionary method runner. 
+
+        Arguments:
+            cfgOb {object} -- instance of ConfigUtils class
+
+        Keyword arguments:
+            configName {string} -- configuration section name (default: default section name)
+            cachePath {str} -- path used for temporary file management (default: '.')
+
+        """
+        self.__cfgOb = cfgOb
+
+        self.__configName = kwargs.get("configName", self.__cfgOb.getDefaultSectionName())
+        self.__cachePath = kwargs.get("cachePath", ".")
+        #
+        self.__taxU = None
+        self.__ecU = None
+        self.__scopU = None
+        self.__cathU = None
+        self.__dbU = None
+        self.__residU = None
+        self.__psimodU = None
+        self.__ccU = None
+        self.__ccmU = None
+        self.__commonU = None
+        self.__dApiW = None
+        self.__atcP = None
+        # self.__siftsAbbreviated = kwargs.get("siftsAbbreviated", "PROD")
+        self.__siftsAbbreviated = kwargs.get("siftsAbbreviated", "TEST")
+        self.__ssP = None
+        self.__vrptP = None
+        self.__crP = None
+        self.__jtaP = None
+        self.__pcP = None
+        self.__phP = None
+        self.__rlsP = None
+        #
+        # self.__wsPattern = re.compile(r"\s+", flags=re.UNICODE | re.MULTILINE)
+        # self.__re_non_digit = re.compile(r"[^\d]+")
+        #
+        self.__resourcesD = {
+            "SiftsSummaryProvider instance": self.__fetchSiftsSummaryProvider,
+            "Dictionary API instance (pdbx_core)": self.__fetchDictionaryApi,
+            "TaxonomyProvider instance": self.__fetchTaxonomyProvider,
+            "ScopProvider instance": self.__fetchScopProvider,
+            "CathProvider instance": self.__fetchCathProvider,
+            "EnzymeProvider instance": self.__fetchEnzymeProvider,
+            "DrugBankProvider instance": self.__fetchDrugBankProvider,
+            "ResidProvider instance": self.__fetchResidProvider,
+            "PsiModProvider instance": self.__fetchPsiModProvider,
+            "ChemCompModelProvider instance": self.__fetchChemCompModelProvider,
+            "ChemCompProvider instance": self.__fetchChemCompProvider,
+            "AtcProvider instance": self.__fetchAtcProvider,
+            "DictMethodCommonUtils instance": self.__fetchCommonUtils,
+            "ValidationProvider instance": self.__fetchValidationProvider,
+            "CitationReferenceProvider instance": self.__fetchCitationReferenceProvider,
+            "JournalTitleAbbreviationProvider instance": self.__fetchJournalTitleAbbreviationProvider,
+            "PubChemProvider instance": self.__fetchPubChemProvider,
+            "PharosProvider instance": self.__fetchPharosProvider,
+            "RcsbLigandScoreProvider instance": self.__fetchRcsbLigandScoreProvider,
+        }
+        logger.debug("Dictionary resource provider init completed")
+        #
+
+    def echo(self, msg):
+        logger.info(msg)
+
+    def getReferenceSequenceAlignmentOpt(self):
+        return self.__cfgOb.get("REFERENCE_SEQUENCE_ALIGNMETS", sectionName=self.__configName, default="SIFTS")
+
+    def getResource(self, resourceName, default=None, useCache=True, **kwargs):
+        """Return the named input resource or the default value.
+
+        Arguments:
+            resourceName {str} -- resource name
+            useCache (bool, optional): use the current cache. Defaults to True.
+
+        Keyword Arguments:
+            default {obj} -- default return value for missing resources (default: {None})
+
+        Returns:
+            [obj] -- resource object
+        """
+        logger.debug("Requesting resource %r", resourceName)
+        if resourceName in self.__resourcesD:
+            return self.__resourcesD[resourceName](self.__cfgOb, self.__configName, self.__cachePath, useCache=useCache, **kwargs)
+        else:
+            logger.error("Request for unsupported resource %r returning %r", resourceName, default)
+        #
+        return default
+
+    def cacheResources(self, useCache=False, **kwargs):
+        """Update and optionally clear all resource caches.
+
+        Args:
+            useCache (bool, optional): use the current cache. Defaults to False.
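+
+            Note: with useCache=False each resource cache is rebuilt from its
+            source (logged as REBUILDING rather than CHECKING).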
+
+        Returns:
+            bool: True for success or False otherwise
+        """
+        ret = True
+        tName = "CHECKING" if useCache else "REBUILDING"
+        logger.info("Begin %s cache for %d resources", tName, len(self.__resourcesD))
+        #
+        for resourceName in self.__resourcesD:
+            startTime = time.time()
+            logger.debug("Caching resources for %r", resourceName)
+            tU = self.__resourcesD[resourceName](self.__cfgOb, self.__configName, self.__cachePath, useCache=useCache, **kwargs)
+            ok = tU.testCache()
+            if not ok:
+                logger.error("%s %s fails", tName, resourceName)
+            ret = ret and ok
+            if not ret:
+                logger.info("%s resource %r step status %r cumulative status %r", tName, resourceName, ok, ret)
+            self.__resourceUsageReport(startTime)
+        #
+        logger.info("Completed %s %d resources with status %r", tName, len(self.__resourcesD), ret)
+        return ret
+
+    def __resourceUsageReport(self, startTime):
+        unitS = "MB" if platform.system() == "Darwin" else "GB"
+        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+        # logger.info("Maximum total resident memory size %.3f %s", rusageMax / 10 ** 6, unitS)
+        endTime = time.time()
+        logger.info(
+            "Step completed at %s (%.4f secs/%.3f %s)",
+            time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
+            endTime - startTime,
+            rusageMax / 10 ** 6,
+            unitS,
+        )
+
+    def __fetchCitationReferenceProvider(self, cfgOb, configName, cachePath, useCache=True, **kwargs):
+        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
+        if not self.__crP:
+            cachePath = os.path.join(cachePath, cfgOb.get("CITATION_REFERENCE_CACHE_DIR", sectionName=configName))
+            self.__crP = CitationReferenceProvider(cachePath=cachePath, useCache=useCache, **kwargs)
+        return self.__crP
+
+    def __fetchJournalTitleAbbreviationProvider(self, cfgOb, configName, cachePath, useCache=True, **kwargs):
+        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
+        if not self.__jtaP:
+            cachePath = os.path.join(cachePath, cfgOb.get("CITATION_REFERENCE_CACHE_DIR", sectionName=configName))
+            self.__jtaP = JournalTitleAbbreviationProvider(cachePath=cachePath, useCache=useCache, **kwargs)
+        return self.__jtaP
+
+    def __fetchTaxonomyProvider(self, cfgOb, configName, cachePath, useCache=True, **kwargs):
+        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
+        if not self.__taxU:
+            taxonomyDataPath = os.path.join(cachePath, cfgOb.get("NCBI_TAXONOMY_CACHE_DIR", sectionName=configName))
+            self.__taxU = TaxonomyProvider(taxDirPath=taxonomyDataPath, useCache=useCache, **kwargs)
+        return self.__taxU
+
+    def __fetchScopProvider(self, cfgOb, configName, cachePath, useCache=True, **kwargs):
+        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
+        if not self.__scopU:
+            structDomainDataPath = os.path.join(cachePath, cfgOb.get("STRUCT_DOMAIN_CLASSIFICATION_CACHE_DIR", sectionName=configName))
+            self.__scopU = ScopClassificationProvider(scopDirPath=structDomainDataPath, useCache=useCache, **kwargs)
+        return self.__scopU
+
+    def __fetchCathProvider(self, cfgOb, configName, cachePath, useCache=True, **kwargs):
+        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
+        if not self.__cathU:
+            structDomainDataPath = os.path.join(cachePath, cfgOb.get("STRUCT_DOMAIN_CLASSIFICATION_CACHE_DIR", sectionName=configName))
+            self.__cathU = CathClassificationProvider(cathDirPath=structDomainDataPath, useCache=useCache, **kwargs)
+        return self.__cathU
+
+    def __fetchEnzymeProvider(self, cfgOb, configName, cachePath, useCache=True, **kwargs):
+        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
+        if not self.__ecU:
+            enzymeDataPath = os.path.join(cachePath, cfgOb.get("ENZYME_CLASSIFICATION_CACHE_DIR", sectionName=configName))
+            self.__ecU = EnzymeDatabaseProvider(enzymeDirPath=enzymeDataPath, useCache=useCache, **kwargs)
+        return self.__ecU
+
+    #
+    def __fetchDrugBankProvider(self, cfgOb, configName, cachePath, useCache=True, **kwargs):
+        _ = cfgOb
+        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
+        if not self.__dbU:
+            # If a mock DrugBank URL is provided add this as an argument.
+            mockUrlTarget = cfgOb.getPath("DRUGBANK_MOCK_URL_TARGET", sectionName=configName)
+            if mockUrlTarget:
+                kwargs["urlTarget"] = mockUrlTarget
+                logger.info("Using mock DrugBank source file %r", mockUrlTarget)
+            un = cfgOb.get("_DRUGBANK_AUTH_USERNAME", sectionName=configName)
+            pw = cfgOb.get("_DRUGBANK_AUTH_PASSWORD", sectionName=configName)
+            self.__dbU = DrugBankProvider(cachePath=cachePath, useCache=useCache, username=un, password=pw, **kwargs)
+        return self.__dbU
+
+    #
+    def __fetchResidProvider(self, cfgOb, configName, cachePath, useCache=True, **kwargs):
+        _ = cfgOb
+        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
+        if not self.__residU:
+            # dbDataPath = os.path.join(cachePath, cfgOb.get("RESID_CACHE_DIR", sectionName=configName))
+            self.__residU = ResidProvider(cachePath=cachePath, useCache=useCache, **kwargs)
+        return self.__residU
+
+    def __fetchPsiModProvider(self, cfgOb, configName, cachePath, useCache=True, **kwargs):
+        _ = cfgOb
+        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
+        if not self.__psimodU:
+            self.__psimodU = PsiModProvider(cachePath=cachePath, useCache=useCache, **kwargs)
+        return self.__psimodU
+
+    def __fetchChemCompModelProvider(self, cfgOb, configName, cachePath, useCache=True, **kwargs):
+        _ = cfgOb
+        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
+        if not self.__ccmU:
+            # dirPath = os.path.join(cachePath, cfgOb.get("CHEM_COMP_CACHE_DIR", sectionName=configName))
+            self.__ccmU = ChemCompModelProvider(cachePath=cachePath, useCache=useCache, **kwargs)
+        return self.__ccmU
+
+    def __fetchChemCompProvider(self, cfgOb, configName, cachePath, useCache=True, **kwargs):
+        _ = cfgOb
+        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
+        if not self.__ccU:
+            # dirPath = os.path.join(cachePath, cfgOb.get("CHEM_COMP_CACHE_DIR", sectionName=configName))
+            self.__ccU = ChemCompProvider(cachePath=cachePath, useCache=useCache, **kwargs)
+        return self.__ccU
+
+    def __fetchAtcProvider(self, cfgOb, configName, cachePath, useCache=True, **kwargs):
+        _ = cfgOb
+        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
+        if not self.__atcP:
+            # dirPath = os.path.join(cachePath, cfgOb.get("ATC_CACHE_DIR", sectionName=configName))
+            self.__atcP = AtcProvider(cachePath=cachePath, useCache=useCache, **kwargs)
+        return self.__atcP
+
+    def __fetchSiftsSummaryProvider(self, cfgOb, configName, cachePath, useCache=True, **kwargs):
+        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
+        if not self.__ssP:
+            srcDirPath = os.path.join(cachePath, cfgOb.getPath("SIFTS_SUMMARY_DATA_PATH", sectionName=configName))
+            cacheDirPath = os.path.join(cachePath, cfgOb.get("SIFTS_SUMMARY_CACHE_DIR", sectionName=configName))
+            logger.debug("ssP %r %r", srcDirPath, cacheDirPath)
+            self.__ssP = SiftsSummaryProvider(srcDirPath=srcDirPath, cacheDirPath=cacheDirPath, useCache=useCache, abbreviated=self.__siftsAbbreviated, **kwargs)
+            logger.debug("ssP entry count %d", self.__ssP.getEntryCount())
+        return self.__ssP
+
+    def __fetchValidationProvider(self, cfgOb, configName, cachePath, useCache=True, **kwargs):
+        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
+        if not self.__vrptP:
+            urlTarget = cfgOb.get("VRPT_DICT_MAPPING_LOCATOR", sectionName=configName)
+            dirPath = os.path.join(cachePath, cfgOb.get("DICTIONARY_CACHE_DIR", sectionName=configName))
+            self.__vrptP = ValidationReportProvider(dirPath=dirPath, urlTarget=urlTarget, useCache=useCache)
+        #
+        return self.__vrptP
+
+    def __fetchCommonUtils(self, cfgOb, configName, cachePath, useCache=None, **kwargs):
+        logger.debug("configName %s cachePath %r kwargs %r", configName, cachePath, kwargs)
+        _ = cfgOb
+        _ = useCache
+        if not self.__commonU:
+            self.__commonU = DictMethodCommonUtils(**kwargs)
+        return self.__commonU
+
+    def __fetchDictionaryApi(self, cfgOb, configName, cachePath, useCache=None, **kwargs):
+        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
+        schemaName = kwargs.get("schemaName", "pdbx_core")
+        self.__dApiW = DictionaryApiProviderWrapper(cfgOb, cachePath, useCache=useCache)
+        dictApi = self.__dApiW.getApiByName(schemaName)
+        # numRev = dictApi.getDictionaryRevisionCount()
+        return dictApi
+
+    def __fetchPubChemProvider(self, cfgOb, configName, cachePath, useCache=True, **kwargs):
+        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
+        if not self.__pcP:
+            #
+            try:
+                minCount = 0
+                userName = cfgOb.get("_STASH_AUTH_USERNAME", sectionName=configName)
+                password = cfgOb.get("_STASH_AUTH_PASSWORD", sectionName=configName)
+                basePath = cfgOb.get("_STASH_SERVER_BASE_PATH", sectionName=configName)
+                url = cfgOb.get("STASH_SERVER_URL", sectionName=configName)
+                urlFallBack = cfgOb.get("STASH_SERVER_FALLBACK_URL", sectionName=configName)
+                #
+                pcP = PubChemProvider(cachePath=cachePath, useCache=useCache)
+                ok = pcP.fromStash(url, basePath, userName=userName, password=password)
+                ok = pcP.reload()
+                ok = pcP.testCache(minCount=10)
+                if not ok:
+                    ok = pcP.fromStash(urlFallBack, basePath, userName=userName, password=password)
+                    ok = pcP.testCache(minCount=minCount)
+                #
+                if pcP:
+                    self.__pcP = pcP
+                    riD = pcP.getIdentifiers()
+                    logger.info("Fetched PubChem mapping dictionary (%d)", len(riD))
+            except Exception as e:
+                logger.exception("Failing with %s", str(e))
+        #
+        return self.__pcP
+
+    def __fetchPharosProvider(self, cfgOb, configName, cachePath, useCache=True, **kwargs):
+        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
+        if not self.__phP:
+            # --
+            try:
+                minCount = 0
+                userName = cfgOb.get("_STASH_AUTH_USERNAME", sectionName=configName)
+                password = cfgOb.get("_STASH_AUTH_PASSWORD", sectionName=configName)
+                basePath = cfgOb.get("_STASH_SERVER_BASE_PATH", sectionName=configName)
+                url = cfgOb.get("STASH_SERVER_URL", sectionName=configName)
+                urlFallBack = cfgOb.get("STASH_SERVER_FALLBACK_URL", sectionName=configName)
+                #
+                phP = PharosProvider(cachePath=cachePath, useCache=useCache)
+                ok = phP.fromStash(url, basePath, userName=userName, password=password)
+                ok = phP.reload()
+                ok = phP.testCache(minCount=10)
+                if not ok:
+                    ok = phP.fromStash(urlFallBack, basePath, userName=userName, password=password)
+                    ok = phP.testCache(minCount=minCount)
+                #
+                if phP:
+                    self.__phP = phP
+                    riD = phP.getIdentifiers()
+                    logger.info("Fetched Pharos ChEMBL identifiers (%d)", len(riD))
+            except Exception as e:
+                logger.warning("Failing with %s", str(e))
+        #
+        return self.__phP
+
+    def __fetchRcsbLigandScoreProvider(self, cfgOb, configName, cachePath, useCache=None, **kwargs):
+        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
+        _ = cfgOb
+        if not self.__rlsP:
+            self.__rlsP = RcsbLigandScoreProvider(cachePath=cachePath, useCache=useCache)
+        return self.__rlsP
diff --git a/rcsb/utils/dictionary/DictMethodRunner.py b/rcsb/utils/dictionary/DictMethodRunner.py
new file mode 100644
index 0000000..64d411f
--- /dev/null
+++ b/rcsb/utils/dictionary/DictMethodRunner.py
@@ -0,0 +1,227 @@
+##
+# File:    DictMethodRunner.py
+# Author:  J. Westbrook
+# Date:    18-Aug-2018
+# Version: 0.001 Initial version
+#
+# Updates:
+#  12-Nov-2018 jdw Run block methods after category and attribute methods.
+#   5-Jun-2019 jdw Refactor and generalize and remove dependencies on rcsb.db package
+#  17-Jul-2019 jdw Propagate kwargs to __getModuleInstance()
+#
+##
+"""
+Manage the invocation of dictionary methods implemented in helper classes.
+
+"""
+__docformat__ = "restructuredtext en"
+__author__ = "John Westbrook"
+__email__ = "jwest@rcsb.rutgers.edu"
+__license__ = "Apache 2.0"
+
+import logging
+import sys
+from operator import itemgetter
+
+logger = logging.getLogger(__name__)
+
+
+class DictMethodRunner(object):
+    """Manage the invocation of dictionary methods implemented as class methods."""
+
+    def __init__(self, dictionaryApi, modulePathMap=None, **kwargs):
+        """Manage invocation of dictionary methods referenced in external modules.
+
+        Arguments:
+            dictionaryApi {object} -- instance of DictionaryApi() for dictionary with target method definitions
+
+        Keyword Arguments:
+            modulePathMap {dict str} -- mapping between dictionary module path and execution path (default: {None})
+            cacheModuleFlag {bool} -- flag to cache module instances (default: True)
+            implementationSource {str} -- method implementation source (default: 'reference')
+            methodCodes (list str) -- filter methods by codes (default: ['calculation'])
+        """
+        self.__dApi = dictionaryApi
+        self.__modulePathMap = modulePathMap if modulePathMap else {}
+        self.__cacheModuleFlag = kwargs.get("cacheModuleFlag", True)
+        methodCodes = kwargs.get("methodCodes", ["calculation"])
+        implementationSource = kwargs.get("implementationCodes", "reference")
+        #
+        self.__kwargs = kwargs
+        #
+        # Preserve and reuse the module instances if caching is enabled
+        self.__moduleCache = {}
+        #
+        self.__methodD = self.__getMethodInfo(implementationSource=implementationSource, methodCodes=methodCodes)
+        logger.debug("Method index %r", self.__methodD.items())
+
+    def __getMethodInfo(self, implementationSource="reference", methodCodes=None):
+        """Get method implementation with the input implementation source."""
+        methodCodes = methodCodes if methodCodes else ["calculation"]
+        methodD = {}
+        try:
+            methodIndex = self.__dApi.getMethodIndex()
+            for _, mrL in methodIndex.items():
+                for mr in mrL:
+                    mId = mr.getId()
+                    catName = mr.getCategoryName()
+                    atName = mr.getAttributeName()
+                    mType = mr.getType()
+                    if (catName, atName) not in methodD:
+                        methodD[(catName, atName)] = []
+                    methDef = self.__dApi.getMethod(mId)
+                    logger.debug("Category %s attribute %s mId %r type %r methDef %r", catName, atName, mId, mType, methDef)
+                    mSource = methDef.getImplementationSource()
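+                    # Retain only method definitions matching the requested implementation
+                    # source and method codes.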
+                    mCode = methDef.getCode()
+                    if mSource == implementationSource and mCode in methodCodes:
+                        mPriority = methDef.getPriority()
+                        mLang = methDef.getLanguage()
+                        mImplement = methDef.getImplementation()
+                        dD = {"METHOD_LANGUAGE": mLang, "METHOD_IMPLEMENT": mImplement, "METHOD_TYPE": mType, "METHOD_CODE": mCode, "METHOD_PRIORITY": mPriority}
+                        methodD[(catName, atName)].append(dD)
+            #
+        except Exception as e:
+            logger.exception("Failing with %s", str(e))
+        #
+        logger.debug("Method dictionary %r", methodD)
+        return methodD
+
+    def __invokeAttributeMethod(self, methodPath, dataContainer, catName, atName, **kwargs):
+        """Invoke the input attribute method."""
+        ok = False
+        try:
+            modulePath, methodName = self.__methodPathSplit(methodPath)
+            mObj = self.__getModuleInstance(modulePath, **kwargs)
+            theMeth = getattr(mObj, methodName, None)
+            ok = theMeth(dataContainer, catName, atName, **kwargs)
+        except Exception as e:
+            logger.exception("Failed invoking attribute %s %s method %r with %s", catName, atName, methodPath, str(e))
+        return ok
+
+    def __invokeCategoryMethod(self, methodPath, dataContainer, catName, **kwargs):
+        """Invoke the input category method."""
+        ok = False
+        try:
+            modulePath, methodName = self.__methodPathSplit(methodPath)
+            mObj = self.__getModuleInstance(modulePath, **kwargs)
+            theMeth = getattr(mObj, methodName, None)
+            ok = theMeth(dataContainer, catName, **kwargs)
+        except Exception as e:
+            logger.exception("Failed invoking category %s method %r with %s", catName, methodPath, str(e))
+        return ok
+
+    def __invokeDatablockMethod(self, methodPath, dataContainer, blockName, **kwargs):
+        """Invoke the input data block method."""
+        ok = False
+        try:
+            modulePath, methodName = self.__methodPathSplit(methodPath)
+            mObj = self.__getModuleInstance(modulePath, **kwargs)
+            theMeth = getattr(mObj, methodName, None)
+            ok = theMeth(dataContainer, blockName, **kwargs)
+        except Exception as e:
+            logger.exception("Failed invoking block %s method %r with %s", blockName, methodPath, str(e))
+        return ok
+
+    def apply(self, dataContainer):
+        """Apply category, attribute and block level dictionary methods on the input data container."""
+        kwargs = self.__kwargs
+        mTupL = self.__getCategoryMethods()
+        logger.debug("Category methods %r", mTupL)
+        for catName, _, methodPath, _ in mTupL:
+            self.__invokeCategoryMethod(methodPath, dataContainer, catName, **kwargs)
+
+        mTupL = self.__getAttributeMethods()
+        logger.debug("Attribute methods %r", mTupL)
+        for catName, atName, methodPath, _ in mTupL:
+            self.__invokeAttributeMethod(methodPath, dataContainer, catName, atName, **kwargs)
+
+        mTupL = self.__getDatablockMethods()
+        logger.debug("Datablock methods %r", mTupL)
+        for blockName, _, methodPath, _ in mTupL:
+            self.__invokeDatablockMethod(methodPath, dataContainer, blockName, **kwargs)
+
+        return True
+
+    def __getDatablockMethods(self):
+        mL = []
+        try:
+            for (dictName, _), mDL in self.__methodD.items():
+                for mD in mDL:
+                    if mD["METHOD_TYPE"].lower() == "datablock":
+                        methodPath = mD["METHOD_IMPLEMENT"]
+                        mL.append((dictName, None, methodPath, mD["METHOD_PRIORITY"]))
+            mL = sorted(mL, key=itemgetter(3))
+            return mL
+        except Exception as e:
+            logger.exception("Failing dictName %s with %s", dictName, str(e))
+        return mL
+
+    def __getCategoryMethods(self):
+        mL = []
+        try:
+            for (catName, _), mDL in self.__methodD.items():
+                for mD in mDL:
+                    if mD["METHOD_TYPE"].lower() == "category":
+                        methodPath = mD["METHOD_IMPLEMENT"]
+                        mL.append((catName, None, methodPath, mD["METHOD_PRIORITY"]))
+            mL = sorted(mL, key=itemgetter(3))
+            return mL
+        except Exception as e:
+            logger.exception("Failing catName %r with %s", catName, str(e))
+        return mL
+
+    def __getAttributeMethods(self):
+        mL = []
+        try:
+            for (catName, atName), mDL in self.__methodD.items():
+                for mD in mDL:
+                    if mD["METHOD_TYPE"].lower() == "attribute":
+                        methodPath = mD["METHOD_IMPLEMENT"]
+                        mL.append((catName, atName, methodPath, mD["METHOD_PRIORITY"]))
+            mL = sorted(mL, key=itemgetter(3))
+            return mL
+        except Exception as e:
+            logger.exception("Failing catName %s atName %s with %s", catName, atName, str(e))
+        return mL
+
+    def __methodPathSplit(self, methodPath):
+        """Extract the module path and the method name from the input path, optionally
+        remapping the module path.
+
+        Arguments:
+            methodPath {str} -- implementation path from dictionary method definition
+
+        Returns:
+            {tuple str} -- module path, method name
+        """
+        try:
+            # Strip off any leading path of the module from the method path.
+            mpL = str(methodPath).split(".")
+            methodName = mpL[-1]
+            tp = ".".join(mpL[:-1])
+            modulePath = self.__modulePathMap[tp] if tp in self.__modulePathMap else tp
+            return modulePath, methodName
+        except Exception as e:
+            logger.error("Failing for method path %r with %s", methodPath, str(e))
+        return None, None
+
+    def __getModuleInstance(self, modulePath, **kwargs):
+        #
+        if self.__cacheModuleFlag and modulePath in self.__moduleCache:
+            return self.__moduleCache[modulePath]
+        #
+        mObj = None
+        try:
+            aMod = __import__(modulePath, globals(), locals(), [""])
+            sys.modules[modulePath] = aMod
+            #
+            # Strip off any leading path to the module before we instantiate the module object.
+            mpL = str(modulePath).split(".")
+            moduleName = mpL[-1]
+            #
+            mObj = getattr(aMod, moduleName)(**kwargs)
+            self.__moduleCache[modulePath] = mObj
+
+        except Exception as e:
+            logger.error("Failing to instance helper %r with %s", modulePath, str(e))
+        return mObj
diff --git a/rcsb/utils/dictionary/DictionaryApiProvider.py b/rcsb/utils/dictionary/DictionaryApiProvider.py
new file mode 100644
index 0000000..253bdb5
--- /dev/null
+++ b/rcsb/utils/dictionary/DictionaryApiProvider.py
@@ -0,0 +1,113 @@
+##
+# File:    DictionaryApiProvider.py
+# Author:  J. Westbrook
+# Date:    3-Jun-2019
+# Version: 0.001 Initial version
+#
+# Updates:
+#  14-Aug-2019 jdw adding remote dictionary fetch and caching logic.
+##
+"""
+Resource provider for dictionary API.
+
+"""
+__docformat__ = "restructuredtext en"
+__author__ = "John Westbrook"
+__email__ = "jwest@rcsb.rutgers.edu"
+__license__ = "Apache 2.0"
+
+import logging
+import os
+
+from mmcif.api.DictionaryApi import DictionaryApi
+from rcsb.utils.io.FileUtil import FileUtil
+from rcsb.utils.io.MarshalUtil import MarshalUtil
+from rcsb.utils.io.SingletonClass import SingletonClass
+
+logger = logging.getLogger(__name__)
+
+
+class DictionaryApiProvider(SingletonClass):
+    """Resource provider for dictionary APIs."""
+
+    def __init__(self, dirPath, useCache=True):
+        """Resource provider for dictionary APIs.
+
+        Args:
+            dirPath (str): path to the directory containing cache files
+            useCache (bool, optional): flag to use cached files. Defaults to True.
+        """
+        self.__apiMap = {}
+        self.__dirPath = dirPath
+        self.__useCache = useCache
+        #
+        self.__fileU = FileUtil(workPath=self.__dirPath)
+        logger.debug("Leaving constructor")
+
+    def __reload(self, dictLocators, dirPath, useCache=True):
+        """Reload the local cache of dictionary resources and return a dictionary API instance.
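+
+        When useCache is False, any previously cached copies of the named
+        dictionaries are removed and fetched again.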
+
+        Args:
+            dictLocators (list, str): list of locators for dictionary resource files
+            dirPath (str): path to the directory containing cache files
+            useCache (bool, optional): flag to use cached files. Defaults to True.
+
+        Returns:
+            (object): instance of dictionary API
+        """
+        #
+        # verify the existence of the cache directory ...
+        self.__fileU.mkdir(dirPath)
+        if not useCache:
+            for dictLocator in dictLocators:
+                try:
+                    fn = self.__fileU.getFileName(dictLocator)
+                    os.remove(os.path.join(dirPath, fn))
+                except Exception:
+                    pass
+        #
+        ret = True
+        for dictLocator in dictLocators:
+            cacheFilePath = os.path.join(dirPath, self.__fileU.getFileName(dictLocator))
+            if useCache and self.__fileU.exists(cacheFilePath):
+                # nothing to do
+                continue
+            logger.debug("Fetching url %s caching in %s", dictLocator, cacheFilePath)
+            ok = self.__fileU.get(dictLocator, cacheFilePath)
+            ret = ret and ok
+        return ret
+
+    def getApi(self, dictLocators, **kwargs):
+        """Return a dictionary API object for the input dictionaries.
+
+        Arguments:
+            dictLocators {list str} -- list of dictionary locator paths
+
+        Returns:
+            [object] -- returns DictionaryApi() object for input dictionaries
+        """
+        dictFileNames = [self.__fileU.getFileName(dictLocator) for dictLocator in dictLocators]
+        dictTup = tuple(dictFileNames)
+        dApi = self.__apiMap[dictTup] if dictTup in self.__apiMap else self.__getApi(dictLocators, **kwargs)
+        self.__apiMap[dictTup] = dApi
+        return dApi
+
+    def __getApi(self, dictLocators, **kwargs):
+        """Return a dictionary API instance for the input dictionary locator list."""
+        consolidate = kwargs.get("consolidate", True)
+        replaceDefinition = kwargs.get("replaceDefinitions", True)
+        verbose = kwargs.get("verbose", True)
+        #
+        ok = self.__reload(dictLocators, self.__dirPath, useCache=self.__useCache)
+        #
+        dApi = None
+        if ok:
+            mU = MarshalUtil()
+            containerList = []
+            for dictLocator in dictLocators:
+                cacheFilePath = os.path.join(self.__dirPath, self.__fileU.getFileName(dictLocator))
+                containerList.extend(mU.doImport(cacheFilePath, fmt="mmcif-dict"))
+            #
+            dApi = DictionaryApi(containerList=containerList, consolidate=consolidate, replaceDefinition=replaceDefinition, verbose=verbose)
+        return dApi
diff --git a/rcsb/utils/dictionary/DictionaryApiProviderWrapper.py b/rcsb/utils/dictionary/DictionaryApiProviderWrapper.py
new file mode 100644
index 0000000..068ce1a
--- /dev/null
+++ b/rcsb/utils/dictionary/DictionaryApiProviderWrapper.py
@@ -0,0 +1,74 @@
+##
+# File:    DictionaryApiProviderWrapper.py
+# Author:  J. Westbrook
+# Date:    18-Aug-2019
+# Version: 0.001 Initial version
+#
+# Updates:
+#
+##
+"""
+Wrapper for the dictionary API provider.
+
+"""
+__docformat__ = "restructuredtext en"
+__author__ = "John Westbrook"
+__email__ = "jwest@rcsb.rutgers.edu"
+__license__ = "Apache 2.0"
+
+import logging
+import os.path
+
+from rcsb.utils.dictionary.DictionaryApiProvider import DictionaryApiProvider
+from rcsb.utils.io.SingletonClass import SingletonClass
+
+logger = logging.getLogger(__name__)
+
+
+class DictionaryApiProviderWrapper(SingletonClass):
+    """Wrapper for the dictionary API provider."""
+
+    def __init__(self, cfgOb, cachePath, useCache=True, **kwargs):
+        """Wrapper for the dictionary API provider.
+
+        Args:
+            cfgOb (object): ConfigInfo() object instance
+            cachePath (str): top path to contain the dictionary cache directory
+            useCache (bool, optional): flag to use cached files. Defaults to True.
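+
+        Dictionary locators for each database schema are resolved through the
+        DICT_LOCATOR_CONFIG_MAP of the content configuration section.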
+ + """ + self.__cfgOb = cfgOb + self.__configName = self.__cfgOb.getDefaultSectionName() + self.__contentInfoConfigName = "content_info_helper_configuration" + self.__dictLocatorMap = self.__cfgOb.get("DICT_LOCATOR_CONFIG_MAP", sectionName=self.__contentInfoConfigName) + dirPath = os.path.join(cachePath, self.__cfgOb.get("DICTIONARY_CACHE_DIR", sectionName=self.__configName)) + self.__dP = DictionaryApiProvider(dirPath, useCache=useCache, **kwargs) + logger.debug("Leaving constructor") + + def getApiByLocators(self, dictLocators, **kwargs): + """Return a dictionary API object for the input dictionary locator list. + + Args: + dictLocators (list str): list of dictionary locators + + Returns: + (object): Instance of DictionaryApi() + """ + return self.__dP.getApi(dictLocators, **kwargs) + + def getApiByName(self, databaseName, **kwargs): + """Return a dictionary API object for the input schema name. + + Args: + databaseName (str): database schema name + + Returns: + (object): Instance of DictionaryApi() + """ + if databaseName not in self.__dictLocatorMap: + logger.error("Missing dictionary locator configuration for database schema %s", databaseName) + dictLocators = [] + else: + dictLocators = [self.__cfgOb.getPath(configLocator, sectionName=self.__configName) for configLocator in self.__dictLocatorMap[databaseName]] + # + return self.__dP.getApi(dictLocators, **kwargs) diff --git a/rcsb/utils/dictionary/__init__.py b/rcsb/utils/dictionary/__init__.py new file mode 100644 index 0000000..7a42a5d --- /dev/null +++ b/rcsb/utils/dictionary/__init__.py @@ -0,0 +1,5 @@ +__docformat__ = "restructuredtext en" +__author__ = "John Westbrook" +__email__ = "john.westbrook@rcsb.org" +__license__ = "Apache 2.0" +__version__ = "0.11" diff --git a/rcsb/utils/tests-dictionary/__init__.py b/rcsb/utils/tests-dictionary/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rcsb/utils/tests-dictionary/testDictMethodRunner.py b/rcsb/utils/tests-dictionary/testDictMethodRunner.py new file mode 100644 index 0000000..151db78 --- /dev/null +++ b/rcsb/utils/tests-dictionary/testDictMethodRunner.py @@ -0,0 +1,129 @@ +# File: DictMethodRunnerTests.py +# Author: J. Westbrook +# Date: 18-Aug-2018 +# Version: 0.001 +# +# Update: +# 12-Nov-2018 jdw add chemical component and bird chemical component tests +# 5-Jun-2019 jdw revise for new method runner api +# 16-Jul-2019 jdw remove schema processing. +## +""" +Tests for applying dictionary methods defined as references to helper plugin methods . 
+ +""" + +__docformat__ = "restructuredtext en" +__author__ = "John Westbrook" +__email__ = "jwest@rcsb.rutgers.edu" +__license__ = "Apache 2.0" + +import logging +import os +import time +import unittest + +from mmcif.api.DictMethodRunner import DictMethodRunner +from rcsb.utils.dictionary.DictionaryApiProviderWrapper import DictionaryApiProviderWrapper +from rcsb.utils.dictionary.DictMethodResourceProvider import DictMethodResourceProvider +from rcsb.utils.repository.RepositoryProvider import RepositoryProvider +from rcsb.utils.config.ConfigUtil import ConfigUtil +from rcsb.utils.io.MarshalUtil import MarshalUtil + +logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s") +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +HERE = os.path.abspath(os.path.dirname(__file__)) +TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE))) + + +class DictMethodRunnerTests(unittest.TestCase): + def setUp(self): + self.__numProc = 2 + self.__fileLimit = 200 + mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") + self.__cachePath = os.path.join(TOPDIR, "CACHE") + configPath = os.path.join(mockTopPath, "config", "dbload-setup-example.yml") + configName = "site_info_configuration" + self.__configName = configName + self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath) + self.__mU = MarshalUtil(workPath=self.__cachePath) + self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath) + # + self.__testCaseList = [ + {"contentType": "pdbx_core", "mockLength": 50, "mergeContent": ["vrpt"]}, + {"contentType": "bird_chem_comp_core", "mockLength": 17, "mergeContent": None}, + ] + # + self.__modulePathMap = self.__cfgOb.get("DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName) + # + self.__startTime = time.time() + logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) + + def tearDown(self): + endTime = time.time() + logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) + + def __runContentType(self, contentType, mockLength, mergeContent): + """Read and process test fixture data files from the input content type.""" + try: + dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=True) + dictApi = dP.getApiByName(contentType) + rP = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, siftsAbbreviated="TEST") + dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP) + locatorObjList = self.__rpP.getLocatorObjList(contentType=contentType, mergeContentTypes=mergeContent) + containerList = self.__rpP.getContainerList(locatorObjList) + # + logger.debug("Length of locator list %d\n", len(locatorObjList)) + self.assertGreaterEqual(len(locatorObjList), mockLength) + for container in containerList: + cName = container.getName() + logger.debug("Processing container %s", cName) + dmh.apply(container) + savePath = os.path.join(HERE, "test-output", cName + "-with-method.cif") + self.__mU.doExport(savePath, [container], fmt="mmcif") + + except Exception as e: + logger.exception("Failing with %s", str(e)) + self.fail() + + def testMethodRunner(self): + """Test method runner for multiple content types.""" + for tD in self.__testCaseList: + self.__runContentType(tD["contentType"], 
tD["mockLength"], tD["mergeContent"]) + + def testMethodRunnerSetup(self): + """Test the setup methods for method runner class""" + try: + dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=True) + dictApi = dP.getApiByName("pdbx") + rP = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, siftsAbbreviated="TEST") + dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP) + ok = dmh is not None + self.assertTrue(ok) + + except Exception as e: + logger.exception("Failing with %s", str(e)) + self.fail() + + +def dictMethodRunnerSuite(): + suiteSelect = unittest.TestSuite() + suiteSelect.addTest(DictMethodRunnerTests("testMethodRunner")) + return suiteSelect + + +def dictMethodRunnerSetupSuite(): + suiteSelect = unittest.TestSuite() + suiteSelect.addTest(DictMethodRunnerTests("testMethodRunnerSetup")) + return suiteSelect + + +if __name__ == "__main__": + + mySuite = dictMethodRunnerSetupSuite() + unittest.TextTestRunner(verbosity=2).run(mySuite) + + mySuite = dictMethodRunnerSuite() + unittest.TextTestRunner(verbosity=2).run(mySuite) diff --git a/rcsb/utils/tests-dictionary/testDictionaryApiProvider.py b/rcsb/utils/tests-dictionary/testDictionaryApiProvider.py new file mode 100644 index 0000000..03997cb --- /dev/null +++ b/rcsb/utils/tests-dictionary/testDictionaryApiProvider.py @@ -0,0 +1,87 @@ +## +# File: testDictionaryApiProvider.py +# Author: J. Westbrook +# Date: 15-Aug-2019 +# Version: 0.001 +# +# Update: + +## +""" +Tests for dictionary API provider and cache. + +""" + +__docformat__ = "restructuredtext en" +__author__ = "John Westbrook" +__email__ = "jwest@rcsb.rutgers.edu" +__license__ = "Apache 2.0" + +import logging +import os +import time +import unittest + +from rcsb.utils.dictionary.DictionaryApiProvider import DictionaryApiProvider +from rcsb.utils.config.ConfigUtil import ConfigUtil + +logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s") +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +HERE = os.path.abspath(os.path.dirname(__file__)) +TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE))) + + +class DictionaryProviderTests(unittest.TestCase): + def setUp(self): + mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") + self.__cachePath = os.path.join(TOPDIR, "CACHE") + self.__dirPath = os.path.join(self.__cachePath, "dictionaries") + configPath = os.path.join(mockTopPath, "config", "dbload-setup-example.yml") + configName = "site_info_configuration" + self.__configName = configName + self.__contentInfoConfigName = "content_info_helper_configuration" + self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath) + dictLocatorMap = self.__cfgOb.get("DICT_LOCATOR_CONFIG_MAP", sectionName=self.__contentInfoConfigName) + schemaName = "pdbx_core" + self.__dictLocators = [self.__cfgOb.getPath(configLocator, sectionName=self.__configName) for configLocator in dictLocatorMap[schemaName]] + # + self.__startTime = time.time() + logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) + + def tearDown(self): + endTime = time.time() + logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) + + def testResourceCache(self): + """Test case - generate and check dictonary artifact and api caches""" + try: + logger.debug("Dictionary 
locators %r", self.__dictLocators) + dp = DictionaryApiProvider(dirPath=self.__dirPath, useCache=False) + dApi = dp.getApi(self.__dictLocators) + ok = dApi.testCache() + self.assertTrue(ok) + title = dApi.getDictionaryTitle() + logger.debug("Title %r", title) + self.assertEqual(title, "mmcif_pdbx.dic,rcsb_mmcif_ext.dic,vrpt_mmcif_ext.dic") + # revL = dApi.getDictionaryHistory() + numRev = dApi.getDictionaryRevisionCount() + logger.debug("Number of dictionary revisions (numRev) %r", numRev) + self.assertGreater(numRev, 220) + # + except Exception as e: + logger.exception("Failing with %s", str(e)) + self.fail() + + +def dictionaryProviderSuite(): + suiteSelect = unittest.TestSuite() + suiteSelect.addTest(DictionaryProviderTests("testResourceCache")) + return suiteSelect + + +if __name__ == "__main__": + + mySuite = dictionaryProviderSuite() + unittest.TextTestRunner(verbosity=2).run(mySuite) diff --git a/rcsb/utils/tests-dictionary/testDictionaryApiProviderWrapper.py b/rcsb/utils/tests-dictionary/testDictionaryApiProviderWrapper.py new file mode 100644 index 0000000..2758c27 --- /dev/null +++ b/rcsb/utils/tests-dictionary/testDictionaryApiProviderWrapper.py @@ -0,0 +1,106 @@ +## +# File: testDictionaryApiProviderWrapper.py +# Author: J. Westbrook +# Date: 15-Aug-2019 +# Version: 0.001 +# +# Update: + +## +""" +Tests for dictionary API provider wrapper. + +""" + +__docformat__ = "restructuredtext en" +__author__ = "John Westbrook" +__email__ = "jwest@rcsb.rutgers.edu" +__license__ = "Apache 2.0" + +import logging +import os +import time +import unittest + +from rcsb.utils.dictionary.DictionaryApiProviderWrapper import DictionaryApiProviderWrapper +from rcsb.utils.config.ConfigUtil import ConfigUtil + +logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s") +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +HERE = os.path.abspath(os.path.dirname(__file__)) +TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE))) + + +class DictionaryProviderTests(unittest.TestCase): + def setUp(self): + mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") + self.__cachePath = os.path.join(TOPDIR, "CACHE") + configPath = os.path.join(mockTopPath, "config", "dbload-setup-example.yml") + configName = "site_info_configuration" + self.__configName = configName + self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath) + # + self.__contentInfoConfigName = "content_info_helper_configuration" + dictLocatorMap = self.__cfgOb.get("DICT_LOCATOR_CONFIG_MAP", sectionName=self.__contentInfoConfigName) + self.__databaseName = "pdbx_core" + self.__dictLocators = [self.__cfgOb.getPath(configLocator, sectionName=self.__configName) for configLocator in dictLocatorMap[self.__databaseName]] + # + self.__startTime = time.time() + logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) + + def tearDown(self): + endTime = time.time() + logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) + + def testWrapperByName(self): + """Test case - get dictionary API by schema name""" + try: + dp = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=False) + dApi = dp.getApiByName(self.__databaseName) + ok = dApi.testCache() + self.assertTrue(ok) + title = dApi.getDictionaryTitle() + logger.debug("Title %r", title) + self.assertEqual(title, 
"mmcif_pdbx.dic,rcsb_mmcif_ext.dic,vrpt_mmcif_ext.dic") + # revL = dApi.getDictionaryHistory() + numRev = dApi.getDictionaryRevisionCount() + logger.debug("Number of dictionary revisions (numRev) %r", numRev) + self.assertGreater(numRev, 220) + # + except Exception as e: + logger.exception("Failing with %s", str(e)) + self.fail() + + def testWrapperByLocators(self): + """Test case - get dictionary API by locator list""" + try: + dp = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=False) + dApi = dp.getApiByLocators(self.__dictLocators) + ok = dApi.testCache() + self.assertTrue(ok) + title = dApi.getDictionaryTitle() + logger.debug("Title %r", title) + self.assertEqual(title, "mmcif_pdbx.dic,rcsb_mmcif_ext.dic,vrpt_mmcif_ext.dic") + # revL = dApi.getDictionaryHistory() + numRev = dApi.getDictionaryRevisionCount() + logger.debug("Number of dictionary revisions (numRev) %r", numRev) + self.assertGreater(numRev, 220) + # + except Exception as e: + logger.exception("Failing with %s", str(e)) + self.fail() + + +def dictionaryProviderSuite(): + suiteSelect = unittest.TestSuite() + suiteSelect.addTest(DictionaryProviderTests("testWrapperByName")) + suiteSelect.addTest(DictionaryProviderTests("testWrapperByLocators")) + return suiteSelect + + +if __name__ == "__main__": + + mySuite = dictionaryProviderSuite() + unittest.TextTestRunner(verbosity=2).run(mySuite) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..bc657bb --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +mmcif >= 0.57 +rcsb.utils.config >= 0.35 +rcsb.utils.io >= 0.97 +rcsb.utils.multiproc >= 0.17 +rcsb.utils.chemref >= 0.68 +rcsb.utils.citation >= 0.15 +rcsb.utils.validation >= 0.22 +rcsb.utils.ec >= 0.21 +rcsb.utils.taxonomy >= 0.32 +rcsb.utils.seq >= 0.43 +rcsb.utils.struct >= 0.26 diff --git a/setup.cfg b/setup.cfg new file mode 100755 index 0000000..53bb53d --- /dev/null +++ b/setup.cfg @@ -0,0 +1,7 @@ +[bdist_wheel] +# use py2.py3 tag for pure-python dist: +universal=1 + +[metadata] +description-file = README.md + diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..f900941 --- /dev/null +++ b/setup.py @@ -0,0 +1,71 @@ +# File: setup.py +# Date: 14-Feb-2021 +# +# Update: +# +import re + +from setuptools import find_packages +from setuptools import setup + +packages = [] +thisPackage = "rcsb.utils.dictionary" + +with open("rcsb/utils/dictionary/__init__.py", "r") as fd: + version = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', fd.read(), re.MULTILINE).group(1) + +if not version: + raise RuntimeError("Cannot find version information") + +setup( + name=thisPackage, + version=version, + description="RCSB Python Dictionary Utility Classes", + long_description="See: README.md", + author="John Westbrook", + author_email="john.westbrook@rcsb.org", + url="https://github.com/rcsb/py-rcsb_utils_dictionary", + # + license="Apache 2.0", + classifiers=( + "Development Status :: 3 - Alpha", + # 'Development Status :: 5 - Production/Stable', + "Intended Audience :: Developers", + "Natural Language :: English", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + ), + entry_points={}, + # + install_requires=[ + "mmcif >= 0.57", + "rcsb.utils.io >= 0.97", + "rcsb.utils.config >= 0.35", + "rcsb.utils.multiproc >= 0.17", + "rcsb.utils.validation >= 0.22", + "rcsb.utils.chemref >= 0.68", + "rcsb.utils.citation >= 0.15", + "rcsb.utils.ec >= 
+        "rcsb.utils.taxonomy >= 0.32",
+        "rcsb.utils.seq >= 0.43",
+        "rcsb.utils.struct >= 0.26",
+    ],
+    packages=find_packages(exclude=["rcsb.utils.tests-dictionary", "rcsb.utils.tests-*", "tests.*"]),
+    package_data={
+        # If any package contains *.md or *.rst ... files, include them:
+        "": ["*.md", "*.rst", "*.txt", "*.cfg"]
+    },
+    #
+    # These basic tests require no database services -
+    test_suite="rcsb.utils.tests-dictionary",
+    tests_require=["tox"],
+    #
+    # Not configured ...
+    extras_require={"dev": ["check-manifest"], "test": ["coverage"]},
+    # Added for the Sphinx documentation build -
+    command_options={"build_sphinx": {"project": ("setup.py", thisPackage), "version": ("setup.py", version), "release": ("setup.py", version)}},
+    # This setting for namespace package support -
+    zip_safe=False,
+)
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..341716b
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,170 @@
+# File: tox.ini (Templated version)
+#
+[local_settings]
+# Project-specific local settings
+test_pattern = "test*.py"
+#
+# Source paths (unquoted, space-separated list of files/directories) for linting and format checks
+source_paths = rcsb/utils/dictionary rcsb/utils/tests-dictionary setup.py
+#
+# Start directory path for test discovery
+# Each path must reference a valid directory that is searchable by Python (i.e. contains __init__.py)
+# ** It is harmless to point to paths containing no tests.
+#
+test_path_1 = "rcsb/utils/tests-dictionary"
+# These are placeholders - valid source directories without test files
+test_path_2 = "rcsb/utils/dictionary"
+test_path_3 = "rcsb/utils/dictionary"
+test_path_4 = "rcsb/utils/dictionary"
+#
+# Comma-separated list of directories for which test coverage will be evaluated
+coverage_source_paths = "rcsb/utils/dictionary,rcsb/utils/tests-dictionary"
+coverage_exclude_paths = "rcsb/utils/__init__.py"
+coverage_cutoff = 50
+#
+#
+## --------------------------------------------------------------------------
+## ----------   No project-specific settings beyond this point  --------------
+# Updated:  3-Jul-2019 jdw Cleanup
+#           8-Jul-2019 jdw Disable flake8 plugin (pydocstyle compat issue)
+#          12-Jul-2019 jdw Add placeholders for up to four test paths
+#          13-Jul-2019 jdw Add coverage exclusion
+##
+[tox]
+# The complete list of supported test environments to set up and invoke
+envlist = format_pep8-{py39}, lint_pylint-{py39}, format_black-{py39}, py{39}
+#
+minversion = 3.7.0
+skip_missing_interpreters = true
+skipsdist = True
+
+[testenv]
+passenv = CONFIG_SUPPORT_TOKEN_ENV OE_LICENSE
+whitelist_externals = echo
+deps = echo
+commands =
+    echo "Starting default tests in testenv"
+
+
+[testenv:py39]
+description = 'Run unit tests (unittest runner) using {envpython}'
+whitelist_externals = echo
+skip_install = True
+sitepackages = True
+recreate = True
+alwayscopy = True
+usedevelop = True
+platform =
+    macos: darwin
+    linux: linux
+basepython =
+    py39: python3.9
+
+deps = echo
+    -r requirements.txt
+commands =
+    echo "Starting {envname} with {envpython}"
+    {envpython} -V
+    {envpython} -m unittest discover -v --start-directory {[local_settings]test_path_1} --pattern "{[local_settings]test_pattern}"
+    {envpython} -m unittest discover -v --start-directory {[local_settings]test_path_2} --pattern "{[local_settings]test_pattern}"
+    {envpython} -m unittest discover -v --start-directory {[local_settings]test_path_3} --pattern "{[local_settings]test_pattern}"
+    {envpython} -m unittest discover -v --start-directory {[local_settings]test_path_4} --pattern "{[local_settings]test_pattern}"
+    echo "Completed {envname} with {envpython}"

+#
+[testenv:format_pep8-py39]
+description = 'Run selected PEP8 compliance checks (flake8)'
+platform =
+    macos: darwin
+    linux: linux
+whitelist_externals =
+    echo
+    flake8
+basepython = py39: python3.9
+sitepackages = True
+deps =
+    echo
+    flake8
+    # This plugin is no longer compatible with the latest pydocstyle -
+    # flake8-docstrings>=0.2.7
+    flake8-import-order>=0.9
+    -r requirements.txt
+commands =
+    echo "Starting {envname}"
+    # Exceptions: D for docstrings, I for import order and formatting, E203 slice whitespace and W503 line breaks before binary operators (both incompatible with black)
+    flake8 --max-line-length=185 --ignore=D,I,E203,W503 {[local_settings]source_paths}
+    echo "Completed {envname}"

+#
+[testenv:lint_pylint-py39]
+description = 'Run linting compliance checks (pylint)'
+platform =
+    macos: darwin
+    linux: linux
+whitelist_externals =
+    echo
+    pylint
+basepython = py39: python3.9
+sitepackages = True
+deps =
+    echo
+    pylint
+    -r requirements.txt
+commands =
+    echo "Starting {envname}"
+    pylint --disable=R,C --reports=n --rcfile={toxinidir}/pylintrc {[local_settings]source_paths}
+    echo "Completed {envname}"

+#
+[testenv:format_black-py39]
+description = 'Run format compliance checks (black)'
+platform =
+    macos: darwin
+    linux: linux
+whitelist_externals =
+    echo
+    black
+basepython = py39: python3.9
+sitepackages = True
+deps =
+    echo
+    black>=20.8b
+    -r requirements.txt
+    # isort>=4.3.20
+commands =
+    echo "Starting {envname}"
+    black --check --line-length 180 {[local_settings]source_paths}
+    # isort -rc rcsb/utils --check-only
+    echo "Completed {envname}"

+#
+[testenv:test_coverage-py39]
+description = 'Run test coverage analysis'
+platform =
+    macos: darwin
+    linux: linux
+whitelist_externals =
+    echo
+    coverage
+basepython = py39: python3.9
+recreate = true
+alwayscopy = true
+usedevelop = true
+deps =
+    echo
+    coverage
+    -r requirements.txt

+commands =
+    echo "Starting {envname}"
+    coverage erase
+    coverage run --parallel-mode --omit="{[local_settings]coverage_exclude_paths}" --source="{[local_settings]coverage_source_paths}" -m unittest discover -v --start-directory {[local_settings]test_path_1} --pattern "{[local_settings]test_pattern}"
+    coverage run --parallel-mode --omit="{[local_settings]coverage_exclude_paths}" --source="{[local_settings]coverage_source_paths}" -m unittest discover -v --start-directory {[local_settings]test_path_2} --pattern "{[local_settings]test_pattern}"
+    coverage run --parallel-mode --omit="{[local_settings]coverage_exclude_paths}" --source="{[local_settings]coverage_source_paths}" -m unittest discover -v --start-directory {[local_settings]test_path_3} --pattern "{[local_settings]test_pattern}"
+    coverage run --parallel-mode --omit="{[local_settings]coverage_exclude_paths}" --source="{[local_settings]coverage_source_paths}" -m unittest discover -v --start-directory {[local_settings]test_path_4} --pattern "{[local_settings]test_pattern}"
+    echo " ------- Consolidating {envname} data ----------"
+    coverage combine
+    echo " ------- Building {envname} reports ----------"
+    coverage report --fail-under={[local_settings]coverage_cutoff}
+    - coverage xml
+    echo "Completed {envname}"
\ No newline at end of file