diff --git a/docs/test_results/3.5-full-data.yml b/docs/test_results/3.5-full-data.yml
new file mode 100644
index 00000000..666e5e86
--- /dev/null
+++ b/docs/test_results/3.5-full-data.yml
@@ -0,0 +1,33 @@
+Concepts:
+ Precision: 0.5435908753019885
+ Process Method Duration: '0:00:00.027807'
+ Recall: 0.6921164772727273
+ Remote Call Duration: '0:00:00.031274'
+TnT Pos Tagger:
+ Accuracy: 0.9349333907344957
+ Process Method Duration: '0:00:00.007598'
+ Remote Call Duration: '0:00:00.009994'
+biomedicus-deepen:
+ F1: 0.912258064516129
+ Gold Standard: 2010 i2b2-VA
+ Per-Document Mean Pipeline Duration: '0:00:00.377380'
+ Precision: 0.8830141548709409
+ Recall: 0.9435053380782918
+biomedicus-dependencies:
+ Corpus: MiPACQ converted to UD from PTB test set
+ LAS: 0.5327625056331681
+ Process Method Duration: '0:00:00.502909'
+ Remote Call Duration: '0:00:00.503824'
+ UAS: 0.6661559260928346
+biomedicus-modification:
+ F1: 0.7100757788380578
+ Gold Standard: 2010 i2b2-VA
+ Per-Document Mean Pipeline Duration: '0:00:00.013226'
+ Precision: 0.9619771863117871
+ Recall: 0.5627224199288257
+biomedicus-negex:
+ F1: 0.8706162076481078
+ Gold Standard: 2010 i2b2-VA
+ Per-Document Mean Pipeline Duration: '0:00:00.009332'
+ Precision: 0.7849231868524473
+ Recall: 0.9773131672597865
diff --git a/docs/test_results/3.5-open-data.yml b/docs/test_results/3.5-open-data.yml
new file mode 100644
index 00000000..ce34ca72
--- /dev/null
+++ b/docs/test_results/3.5-open-data.yml
@@ -0,0 +1,33 @@
+Concepts:
+ Precision: 0.5265714242883056
+ Process Method Duration: '0:00:00.014707'
+ Recall: 0.6747159090909091
+ Remote Call Duration: '0:00:00.018182'
+TnT Pos Tagger:
+ Accuracy: 0.9349333907344957
+ Process Method Duration: '0:00:00.007532'
+ Remote Call Duration: '0:00:00.010117'
+biomedicus-deepen:
+ F1: 0.9104573759931286
+ Gold Standard: 2010 i2b2-VA
+ Per-Document Mean Pipeline Duration: '0:00:00.115112'
+ Precision: 0.8800332088003321
+ Recall: 0.9430604982206405
+biomedicus-dependencies:
+ Corpus: MiPACQ converted to UD from PTB test set
+ LAS: 0.5475739822742978
+ Process Method Duration: '0:00:00.101438'
+ Remote Call Duration: '0:00:00.102343'
+ UAS: 0.683340844224125
+biomedicus-modification:
+ F1: 0.7100757788380578
+ Gold Standard: 2010 i2b2-VA
+ Per-Document Mean Pipeline Duration: '0:00:00.014181'
+ Precision: 0.9619771863117871
+ Recall: 0.5627224199288257
+biomedicus-negex:
+ F1: 0.8706162076481078
+ Gold Standard: 2010 i2b2-VA
+ Per-Document Mean Pipeline Duration: '0:00:00.009531'
+ Precision: 0.7849231868524473
+ Recall: 0.9773131672597865
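Both result files share the same shape: a mapping from component name to its metrics, with durations serialized as quoted strings. A minimal sketch for eyeballing the numeric differences between the full and open bundles, assuming PyYAML and repository-relative paths:

```python
import yaml  # PyYAML, assumed available

with open('docs/test_results/3.5-full-data.yml') as f:
    full = yaml.safe_load(f)
with open('docs/test_results/3.5-open-data.yml') as f:
    open_data = yaml.safe_load(f)

# Durations are quoted strings; only the float-valued metrics are compared.
for component, metrics in sorted(full.items()):
    for name, value in sorted(metrics.items()):
        other = open_data.get(component, {}).get(name)
        if isinstance(value, float) and isinstance(other, float):
            print(f'{component} / {name}: full={value:.4f} '
                  f'open={other:.4f} delta={value - other:+.4f}')
```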
diff --git a/java/src/main/java/edu/umn/biomedicus/concepts/ConceptDictionaryBuilder.java b/java/src/main/java/edu/umn/biomedicus/concepts/ConceptDictionaryBuilder.java
index 5cfaa079..73069090 100644
--- a/java/src/main/java/edu/umn/biomedicus/concepts/ConceptDictionaryBuilder.java
+++ b/java/src/main/java/edu/umn/biomedicus/concepts/ConceptDictionaryBuilder.java
@@ -1,5 +1,5 @@
/*
- * Copyright 2019 Regents of the University of Minnesota.
+ * Copyright (c) Regents of the University of Minnesota.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -41,7 +41,8 @@
/**
* Builds the concepts dictionary.
*
- * Usage: java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder [umls installation] \
+ * Usage: java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder [umls
+ * installation] \
* [tuis-of-interest file] [banned-ttys file] [outputPath]
*/
public class ConceptDictionaryBuilder {
@@ -51,31 +52,25 @@ public class ConceptDictionaryBuilder {
private static final Pattern SPACE_SPLITTER = Pattern.compile(" ");
- @Argument(required = true, metaVar = "PATH/TO/UMLS", handler = PathOptionHandler.class,
- usage = "Path to UMLS installation")
+ @Argument(required = true, metaVar = "PATH/TO/UMLS", handler = PathOptionHandler.class, usage = "Path to UMLS installation")
private Path umlsPath;
- @Argument(index = 1, metaVar = "PATH/TO/TUIS", handler = PathOptionHandler.class,
- usage = "Path to TUIs of interest")
+ @Argument(index = 1, metaVar = "PATH/TO/TUIS", handler = PathOptionHandler.class, usage = "Path to TUIs of interest")
private Path tuisOfInterestFile;
- @Argument(index = 2, metaVar = "PATH/TO/BANNED_TTYS", handler = PathOptionHandler.class,
- usage = "Banned TTYs file")
+ @Argument(index = 2, metaVar = "PATH/TO/BANNED_TTYS", handler = PathOptionHandler.class, usage = "Banned TTYs file")
private Path bannedTtysFile;
@Argument(index = 3, metaVar = "OUTPUT_PATH", usage = "Path to write db out to.")
private Path dbPath;
- @Option(name = "--filtered-suis", handler = PathOptionHandler.class,
- usage = "A path to a file containing SUIs to filter out.")
+ @Option(name = "--filtered-suis", handler = PathOptionHandler.class, usage = "A path to a file containing SUIs to filter out.")
private Path filteredSuisPath = null;
- @Option(name = "--filtered-cuis", handler = PathOptionHandler.class,
- usage = "A path to a file containing CUIs to filter out.")
+ @Option(name = "--filtered-cuis", handler = PathOptionHandler.class, usage = "A path to a file containing CUIs to filter out.")
private Path filteredCuisPath = null;
- @Option(name = "--filtered-sui-cuis", handler = PathOptionHandler.class,
- usage = "A path to a file containing SUI-CUI combinations to filter")
+ @Option(name = "--filtered-sui-cuis", handler = PathOptionHandler.class, usage = "A path to a file containing SUI-CUI combinations to filter")
private Path filteredSuiCuisPath;
@Option(name = "--filtered-tuis", usage = "A path to a file containing TUIs to filter out.")
@@ -89,7 +84,9 @@ public static void main(String[] args) {
builder.doWork();
} catch (CmdLineException e) {
System.err.println(e.getLocalizedMessage());
- System.err.println("java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder" + parser.printExample(OptionHandlerFilter.ALL) + " PATH/TO/UMLS PATH/TO/TUIS PATH/TO/BANNED_TTYS OUTPUT_PATH");
+ System.err.println(
+ "java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder" + parser.printExample(OptionHandlerFilter.ALL)
+ + " PATH/TO/UMLS PATH/TO/TUIS PATH/TO/BANNED_TTYS OUTPUT_PATH");
parser.printUsage(System.err);
} catch (IOException e) {
e.printStackTrace();
@@ -208,7 +205,7 @@ private void doWork() throws IOException {
SuiCui sc = new SuiCui(sui, cui);
if (filteredCuis.contains(cui) || filteredTuis.contains(tui)
|| filteredSuiCuis.contains(sc) || filteredSuis
- .contains(sui)) {
+ .contains(sui)) {
continue;
}
@@ -227,7 +224,7 @@ private void doWork() throws IOException {
options.setCreateIfMissing(true);
options.prepareForBulkLoad();
try (RocksDB phrases = RocksDB.open(options, dbPath.resolve("phrases").toString());
- RocksDB lowercase = RocksDB.open(options, dbPath.resolve("lowercase").toString())) {
+ RocksDB lowercase = RocksDB.open(options, dbPath.resolve("lowercase").toString())) {
int wrote = 0;
for (Entry<String, List<SuiCuiTui>> entry : phrasesMap.entrySet()) {
List<SuiCuiTui> suiCuiTuis = entry.getValue();
@@ -308,16 +305,17 @@ private void doWork() throws IOException {
}
int wrote = 0;
- try (Options options = new Options();
- RocksDB normsDb = RocksDB.open(options, dbPath.resolve("norms").toString())) {
+ try (Options options = new Options()) {
options.setCreateIfMissing(true);
options.prepareForBulkLoad();
- for (Entry<String, List<SuiCuiTui>> entry : map.entrySet()) {
- List<SuiCuiTui> suiCuiTuis = entry.getValue();
- byte[] suiCuiTuiBytes = getBytes(suiCuiTuis);
- normsDb.put(entry.getKey().getBytes(), suiCuiTuiBytes);
- if (++wrote % 10_000 == 0) {
- System.out.println("Wrote " + wrote + " of " + map.size() + " norm term bags.");
+ try (RocksDB normsDb = RocksDB.open(options, dbPath.resolve("norms").toString())) {
+ for (Entry<String, List<SuiCuiTui>> entry : map.entrySet()) {
+ List<SuiCuiTui> suiCuiTuis = entry.getValue();
+ byte[] suiCuiTuiBytes = getBytes(suiCuiTuis);
+ normsDb.put(entry.getKey().getBytes(), suiCuiTuiBytes);
+ if (++wrote % 10_000 == 0) {
+ System.out.println("Wrote " + wrote + " of " + map.size() + " norm term bags.");
+ }
}
}
} catch (RocksDBException e) {
@@ -396,7 +394,8 @@ public int hashCode() {
@Override
public int compareTo(@NotNull SuiCui o) {
int compare = Integer.compare(sui.identifier(), o.sui.identifier());
- if (compare != 0) return compare;
+ if (compare != 0)
+ return compare;
return Integer.compare(cui.identifier(), o.cui.identifier());
}
}
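The substantive fix in the last Java hunk is ordering: `setCreateIfMissing` and `prepareForBulkLoad` are now applied before `RocksDB.open`, whereas the old try-with-resources opened the norms database ahead of its own configuration. The same pitfall, reduced to a Python sketch with a hypothetical `KeyValueStore` that, like RocksDB, reads its options once at open time:

```python
class KeyValueStore:
    """Hypothetical stand-in: options are consumed once, at open time,
    so mutating them after construction has no effect on the handle."""

    def __init__(self, path: str, options: dict):
        self.path = path
        self.create_if_missing = options.get('create_if_missing', False)
        self.bulk_load = options.get('bulk_load', False)


options = {}
# Wrong order: opening first means the flags set below are never seen.
# db = KeyValueStore('norms', options)

# Right order: configure fully, then open.
options['create_if_missing'] = True
options['bulk_load'] = True
db = KeyValueStore('norms', options)
assert db.create_if_missing and db.bulk_load
```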
diff --git a/java/src/main/resources/edu/umn/biomedicus/defaultConfig.yml b/java/src/main/resources/edu/umn/biomedicus/defaultConfig.yml
index a0795dbe..8c4787cd 100644
--- a/java/src/main/resources/edu/umn/biomedicus/defaultConfig.yml
+++ b/java/src/main/resources/edu/umn/biomedicus/defaultConfig.yml
@@ -32,7 +32,3 @@ tnt:
beam.threshold: 2.0
sections:
headersFile: ${BIOMEDICUS_DATA}/sections/header_patterns.txt
-data:
- # THE CERTIFICATE IS SIGNED FOR ATHENA, USE ATHENA URL
- data_url: https://athena.ahc.umn.edu/downloads/open/biomedicus-3.0b9-standard-data.zip
- version: 3.0b9
diff --git a/python/biomedicus/data_version.py b/python/biomedicus/data_version.py
index e6aa174c..eee25eb5 100644
--- a/python/biomedicus/data_version.py
+++ b/python/biomedicus/data_version.py
@@ -1,4 +1,4 @@
-# Copyright 2022 Regents of the University of Minnesota.
+# Copyright (c) Regents of the University of Minnesota.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,5 +13,5 @@
# limitations under the License.
"""The expected data version and url to download data."""
-DATA_VERSION = "2023-10-31"
-DATA_URL = "https://athena.ahc.umn.edu/downloads/open/biomedicus-open-data-2023-10-31.zip"
+DATA_VERSION = "2025-01-27"
+DATA_URL = "https://athena.ahc.umn.edu/downloads/open/biomedicus-open-data-2025-01-27.zip"
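These two constants drive the data-download step. The bare mechanics look roughly like the standard-library sketch below; the target directory is an assumption of this sketch, and the project's own download command remains the supported path:

```python
import urllib.request
import zipfile
from pathlib import Path

from biomedicus.data_version import DATA_URL, DATA_VERSION

target = Path.home() / '.biomedicus' / 'data'   # assumed install location
target.mkdir(parents=True, exist_ok=True)
archive = target / f'biomedicus-open-data-{DATA_VERSION}.zip'

urllib.request.urlretrieve(DATA_URL, archive)   # fetch the bundle
with zipfile.ZipFile(archive) as zf:
    zf.extractall(target)                       # unpack beside the archive
```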
diff --git a/python/biomedicus/dependencies/stanza_parser.py b/python/biomedicus/dependencies/stanza_parser.py
index 6e6397cc..f1a24603 100644
--- a/python/biomedicus/dependencies/stanza_parser.py
+++ b/python/biomedicus/dependencies/stanza_parser.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Regents of the University of Minnesota.
+# Copyright (c) Regents of the University of Minnesota.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
import numpy as np
import stanza
+import torch
from mtap import Document, DocumentProcessor, processor, run_processor, GenericLabel
from mtap.descriptors import labels, label_property
@@ -43,7 +44,7 @@ def stanza_deps_and_upos_tags(sentence, stanza_sentence):
for dep_id in range(len(stanza_dependencies) + 1):
if graph[head_id, dep_id] > 0:
dep, deprel = dep_map[dep_id]
- token_begin, token_end = sentence[dep_id - 1]
+ token_begin, token_end = sentence[dep_id - 1].location
dep_label = GenericLabel(token_begin, token_end, head=head_dep_label,
deprel=deprel)
dep_label.reference_cache['dependents'] = []
@@ -55,78 +56,105 @@ def stanza_deps_and_upos_tags(sentence, stanza_sentence):
if len(dependencies) == len(stanza_dependencies) - 1:
raise ValueError("Unexpected number of dependencies")
for word in stanza_sentence.words:
- token_begin, token_end = sentence[word.id - 1]
+ token_begin, token_end = sentence[word.id - 1].location
sentence_upos_tags.append(GenericLabel(token_begin, token_end, tag=word.upos))
return sentence_deps, sentence_upos_tags
@processor(
- 'biomedicus-dependencies',
- human_name="BioMedICUS Stanza Dependency Parser",
- description="Calls out to the Stanford Stanza framework for dependency parsing.",
+ 'biomedicus-selective-dependencies',
+ human_name="BioMedICUS Stanza Selective Dependency Parser",
+ description="Calls out to the Stanford Stanza framework for dependency parsing "
+ "on an appropriate subset of sentences.",
inputs=[
labels(name='sentences', reference='biomedicus-sentences/sentences'),
labels(name='pos_tags', reference='biomedicus-tnt-tagger/pos_tags'),
+ labels(
+ name='umls_terms',
+ reference='biomedicus-concepts/umls_terms',
+ name_from_parameter='terms_index',
+ optional=True
+ ),
+ labels(
+ "negation_triggers",
+ reference='biomedicus-negex-triggers',
+ optional=True
+ )
],
outputs=[
- labels(name='dependencies',
- description="The dependent words.",
- properties=[
- label_property(
- 'deprel',
- description="The dependency relation",
- data_type='str'
- ),
- label_property(
- 'head',
- description="The head of this label or null if its the root.",
- nullable=True,
- data_type='ref:dependencies'
- ),
- label_property(
- 'dependents',
- description="The dependents of ths dependent.",
- data_type='list[ref:dependencies]'
- )
- ]),
- labels(name='upos_tags',
- description="Universal Part-of-speech tags",
- properties=[
- label_property(
- 'tag',
- description="The Universal Part-of-Speech tag",
- data_type='str'
- )
- ])
+ labels(
+ name='dependencies',
+ description="The dependent words.",
+ properties=[
+ label_property(
+ 'deprel',
+ description="The dependency relation",
+ data_type='str'
+ ),
+ label_property(
+ 'head',
+ description="The head of this label or null if it's the root.",
+ nullable=True,
+ data_type='ref:dependencies'
+ ),
+ label_property(
+ 'dependents',
+ description="The dependents of this dependent.",
+ data_type='list[ref:dependencies]'
+ )
+ ]
+ ),
+ labels(
+ name='upos_tags',
+ description="Universal Part-of-speech tags",
+ properties=[
+ label_property(
+ 'tag',
+ description="The Universal Part-of-Speech tag",
+ data_type='str'
+ )
+ ]
+ )
],
additional_data={
- 'entry_point': __name__
+ 'entry_point': __name__,
}
)
class StanzaParser(DocumentProcessor):
-
- def __init__(self):
- stanza.download('en')
+ def __init__(self, selective=False):
self.nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse',
- tokenize_pretokenized=True)
+ tokenize_pretokenized=True, verbose=False)
+ self.selective = selective
+
+ def __reduce__(self):
+ return StanzaParser, (self.selective,)  # preserve the selective flag when pickled
def process_document(self,
document: Document,
params: Dict[str, Any]):
- sentences = document.labels['sentences']
pos_tags = document.labels['pos_tags']
- sentence_tokens = []
- for sentence in sentences:
- tokens = [(pt.start_index, pt.end_index) for pt in pos_tags.inside(sentence)]
- sentence_tokens.append(tokens)
-
- stanza_doc = self.nlp([[document.text[a:b] for a, b in sentence] for sentence in sentence_tokens])
-
all_deps = []
all_upos_tags = []
- for stanza_sentence, sentence in zip(stanza_doc.sentences, sentence_tokens):
- sentence_deps, sentence_upos_tags = stanza_deps_and_upos_tags(sentence, stanza_sentence)
+ selective = self.selective or params.get('selective', False)
+ if selective:
+ terms_index_name = params.get('terms_index', 'umls_terms')
+ negation_triggers = document.labels['negation_triggers']
+ terms = document.labels[terms_index_name]
+
+ def include(sentence):
+ if selective and (len(terms.inside(sentence)) == 0 or len(negation_triggers.inside(sentence)) == 0):
+ return False
+ return True
+
+ sentences = [sent for sent in document.labels['sentences'] if include(sent)]
+
+ with torch.no_grad():
+ stanza_doc = self.nlp([[tag.text for tag in pos_tags.inside(sent)] for sent in sentences])
+
+ for sentence, stanza_sent in zip(sentences, stanza_doc.sentences):
+ sentence_tags = pos_tags.inside(sentence)
+ sentence_deps, sentence_upos_tags = stanza_deps_and_upos_tags(sentence_tags, stanza_sent)
all_deps.extend(sentence_deps)
all_upos_tags.extend(sentence_upos_tags)
diff --git a/python/biomedicus/dependencies/stanza_selective_parser.py b/python/biomedicus/dependencies/stanza_selective_parser.py
index 458a6bba..3eee94cc 100644
--- a/python/biomedicus/dependencies/stanza_selective_parser.py
+++ b/python/biomedicus/dependencies/stanza_selective_parser.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Regents of the University of Minnesota.
+# Copyright (c) Regents of the University of Minnesota.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -20,125 +20,22 @@
from mtap.processing import DocumentProcessor
from mtap.descriptors import labels, label_property
-from biomedicus.dependencies.stanza_parser import stanza_deps_and_upos_tags
-
-
-@processor(
- 'biomedicus-selective-dependencies',
- human_name="BioMedICUS Stanza Selective Dependency Parser",
- description="Calls out to the Stanford Stanza framework for dependency parsing"
- "on a appropriate subset of sentences.",
- inputs=[
- labels(name='sentences', reference='biomedicus-sentences/sentences'),
- labels(name='pos_tags', reference='biomedicus-tnt-tagger/pos_tags'),
- labels(
- name='umls_terms',
- reference='biomedicus-concepts/umls_terms',
- name_from_parameter='terms_index'
- ),
- labels(
- "negation_triggers",
- reference='biomedicus-negex-triggers'
- )
- ],
- outputs=[
- labels(
- name='dependencies',
- description="The dependent words.",
- properties=[
- label_property(
- 'deprel',
- description="The dependency relation",
- data_type='str'
- ),
- label_property(
- 'head',
- description="The head of this label or null if its the root.",
- nullable=True,
- data_type='ref:dependencies'
- ),
- label_property(
- 'dependents',
- description="The dependents of ths dependent.",
- data_type='list[ref:dependencies]'
- )
- ]
- ),
- labels(
- name='upos_tags',
- description="Universal Part-of-speech tags",
- properties=[
- label_property(
- 'tag',
- description="The Universal Part-of-Speech tag",
- data_type='str'
- )
- ]
- )
- ],
- additional_data={
- 'entry_point': __name__,
- }
-)
-class StanzaSelectiveParser(DocumentProcessor):
- def __init__(self):
- self.nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse',
- tokenize_pretokenized=True, verbose=False)
-
- def __reduce__(self):
- return StanzaSelectiveParser, ()
-
- def process_document(self,
- document: Document,
- params: Dict[str, Any]):
- pos_tags = document.labels['pos_tags']
- terms_index_name = params.get('terms_index', 'umls_terms')
- terms = document.labels[terms_index_name]
- negation_triggers = document.labels['negation_triggers']
-
- all_deps = []
- all_upos_tags = []
- sentences = []
- sentence_texts = []
- for sentence in document.labels['sentences']:
- tokens = [(pt.start_index, pt.end_index) for pt in pos_tags.inside(sentence)]
- if len(terms.inside(sentence)) == 0 or len(negation_triggers.inside(sentence)) == 0:
- continue
- sentences.append(tokens)
- sentence_texts.append(sentence.text)
-
- with torch.no_grad():
- stanza_doc = self.nlp([[document.text[a:b] for a, b in sentence] for sentence in sentences])
- for (sentence, stanza_sentence) in zip(sentences, stanza_doc.sentences):
- sentence_deps, sentence_upos_tags = stanza_deps_and_upos_tags(sentence, stanza_sentence)
- all_deps.extend(sentence_deps)
- all_upos_tags.extend(sentence_upos_tags)
-
- document.add_labels('dependencies', all_deps)
- document.add_labels('upos_tags', all_upos_tags)
+from biomedicus.dependencies.stanza_parser import StanzaParser
def main(args=None):
parser = ArgumentParser(parents=[processor_parser()])
parser.add_argument('--offline', action='store_true')
- parser.add_argument(
- '--mp', action='store_true',
- help="Whether to use the multiprocessing pool based processor server."
- )
- parser.add_argument(
- '--mp-start-method', default='forkserver', choices=['forkserver', 'spawn'],
- help="The multiprocessing start method to use"
- )
options = parser.parse_args(args)
if not options.offline:
stanza.download('en')
- processor = StanzaSelectiveParser()
+ processor = StanzaParser(selective=True)
mp_context = None
if options.mp:
mp_context = torch.multiprocessing.get_context(options.mp_start_method)
- run_processor(processor, options=options, mp=options.mp, mp_context=mp_context)
+ run_processor(processor, options=options, mp_context=mp_context)
if __name__ == '__main__':
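With the class merged away, this module shrinks to a thin entry point that constructs `StanzaParser(selective=True)`. The `__reduce__` override on the parser is what keeps the multiprocessing path above workable: stanza pipelines hold native resources that don't pickle, so the reduction rebuilds the processor from its constructor arguments instead. A minimal illustration of the pattern:

```python
import pickle


class Heavy:
    """Stand-in for a processor wrapping an unpicklable resource."""

    def __init__(self, selective=False):
        self.selective = selective
        self.resource = open(__file__)   # file handles don't pickle

    def __reduce__(self):
        # Rebuild from constructor args instead of pickling instance state.
        return Heavy, (self.selective,)


clone = pickle.loads(pickle.dumps(Heavy(selective=True)))
assert clone.selective   # the flag survives the round trip
```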
diff --git a/python/biomedicus/negation/deepen.py b/python/biomedicus/negation/deepen.py
index 695b1ae5..4d3c9564 100644
--- a/python/biomedicus/negation/deepen.py
+++ b/python/biomedicus/negation/deepen.py
@@ -202,22 +202,7 @@ def process_document(self, document: Document, params: Dict[str, Any]):
def main(args=None):
- parser = ArgumentParser(add_help=True, parents=[processor_parser()])
- parser.add_argument(
- '--mp', action='store_true',
- help="Whether to use the multiprocessing pool based processor server."
- )
- parser.add_argument(
- '--mp-start-method', default='forkserver', choices=['forkserver', 'spawn'],
- help="The multiprocessing start method to use"
- )
- options = parser.parse_args(args)
- mp_context = None
- if options.mp:
- import multiprocessing as mp
- mp_context = mp.get_context(options.mp_start_method)
-
- run_processor(DeepenProcessor(), options=options, mp=options.mp, mp_context=mp_context)
+ run_processor(DeepenProcessor())
if __name__ == '__main__':
diff --git a/python/biomedicus/sentences/bi_lstm.py b/python/biomedicus/sentences/bi_lstm.py
index ffe0afb3..17e10961 100644
--- a/python/biomedicus/sentences/bi_lstm.py
+++ b/python/biomedicus/sentences/bi_lstm.py
@@ -440,7 +440,7 @@ class Hparams:
model.to(device=device)
logger.info('Loading model weights from: {}'.format(conf.model_file))
with conf.model_file.open('rb') as f:
- state_dict = torch.load(f)
+ state_dict = torch.load(f, weights_only=True)
model.load_state_dict(state_dict)
model.eval()
if conf.mp:
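`weights_only=True` restricts `torch.load` to tensors and other allow-listed types, so a tampered checkpoint cannot execute arbitrary code during unpickling; it is available in recent PyTorch releases and became the default in 2.6. A self-contained sketch of the loading pattern (the linear layer stands in for the sentence model):

```python
import torch

model = torch.nn.Linear(4, 2)                 # stand-in for the BiLSTM
torch.save(model.state_dict(), 'weights.pt')

# weights_only=True refuses to unpickle arbitrary objects, closing the
# code-execution hole that full pickle loading leaves open.
state_dict = torch.load('weights.pt', weights_only=True, map_location='cpu')
model.load_state_dict(state_dict)
model.eval()
```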