Commit 0cdd78e
Updated to data version 2025-01-27
benknoll-umn committed Jan 28, 2025
1 parent a94f1e1 commit 0cdd78e
Showing 9 changed files with 178 additions and 207 deletions.
33 changes: 33 additions & 0 deletions docs/test_results/3.5-full-data.yml
@@ -0,0 +1,33 @@
Concepts:
Precision: 0.5435908753019885
Process Method Duration: '0:00:00.027807'
Recall: 0.6921164772727273
Remote Call Duration: '0:00:00.031274'
TnT Pos Tagger:
Accuracy: 0.9349333907344957
Process Method Duration: '0:00:00.007598'
Remote Call Duration: '0:00:00.009994'
biomedicus-deepen:
F1: 0.912258064516129
Gold Standard: 2010 i2b2-VA
Per-Document Mean Pipeline Duration: '0:00:00.377380'
Precision: 0.8830141548709409
Recall: 0.9435053380782918
biomedicus-dependencies:
Corpus: MiPACQ converted to UD from PTB test set
LAS: 0.5327625056331681
Process Method Duration: '0:00:00.502909'
Remote Call Duration: '0:00:00.503824'
UAS: 0.6661559260928346
biomedicus-modification:
F1: 0.7100757788380578
Gold Standard: 2010 i2b2-VA
Per-Document Mean Pipeline Duration: '0:00:00.013226'
Precision: 0.9619771863117871
Recall: 0.5627224199288257
biomedicus-negex:
F1: 0.8706162076481078
Gold Standard: 2010 i2b2-VA
Per-Document Mean Pipeline Duration: '0:00:00.009332'
Precision: 0.7849231868524473
Recall: 0.9773131672597865
33 changes: 33 additions & 0 deletions docs/test_results/3.5-open-data.yml
@@ -0,0 +1,33 @@
Concepts:
Precision: 0.5265714242883056
Process Method Duration: '0:00:00.014707'
Recall: 0.6747159090909091
Remote Call Duration: '0:00:00.018182'
TnT Pos Tagger:
Accuracy: 0.9349333907344957
Process Method Duration: '0:00:00.007532'
Remote Call Duration: '0:00:00.010117'
biomedicus-deepen:
F1: 0.9104573759931286
Gold Standard: 2010 i2b2-VA
Per-Document Mean Pipeline Duration: '0:00:00.115112'
Precision: 0.8800332088003321
Recall: 0.9430604982206405
biomedicus-dependencies:
Corpus: MiPACQ converted to UD from PTB test set
LAS: 0.5475739822742978
Process Method Duration: '0:00:00.101438'
Remote Call Duration: '0:00:00.102343'
UAS: 0.683340844224125
biomedicus-modification:
F1: 0.7100757788380578
Gold Standard: 2010 i2b2-VA
Per-Document Mean Pipeline Duration: '0:00:00.014181'
Precision: 0.9619771863117871
Recall: 0.5627224199288257
biomedicus-negex:
F1: 0.8706162076481078
Gold Standard: 2010 i2b2-VA
Per-Document Mean Pipeline Duration: '0:00:00.009531'
Precision: 0.7849231868524473
Recall: 0.9773131672597865
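
For reference, each F1 reported above is the harmonic mean of the corresponding precision and recall. A quick illustrative check in Python, using the biomedicus-deepen values from 3.5-full-data.yml:

def f1(precision: float, recall: float) -> float:
    """Harmonic mean of precision and recall."""
    return 2 * precision * recall / (precision + recall)

# biomedicus-deepen (full data): reproduces the reported F1
print(f1(0.8830141548709409, 0.9435053380782918))  # -> 0.912258064516129
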
java/src/main/java/edu/umn/biomedicus/concepts/ConceptDictionaryBuilder.java
@@ -1,5 +1,5 @@
/*
* Copyright 2019 Regents of the University of Minnesota.
* Copyright (c) Regents of the University of Minnesota.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -41,7 +41,8 @@
/**
* Builds the concepts dictionary.
* <p>
* Usage: java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder [umls installation] \
* Usage: java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder [umls
* installation] \
* [tuis-of-interest file] [banned-ttys file] [outputPath]
*/
public class ConceptDictionaryBuilder {
@@ -51,31 +52,25 @@ public class ConceptDictionaryBuilder {

private static final Pattern SPACE_SPLITTER = Pattern.compile(" ");

@Argument(required = true, metaVar = "PATH/TO/UMLS", handler = PathOptionHandler.class,
usage = "Path to UMLS installation")
@Argument(required = true, metaVar = "PATH/TO/UMLS", handler = PathOptionHandler.class, usage = "Path to UMLS installation")
private Path umlsPath;

@Argument(index = 1, metaVar = "PATH/TO/TUIS", handler = PathOptionHandler.class,
usage = "Path to TUIs of interest")
@Argument(index = 1, metaVar = "PATH/TO/TUIS", handler = PathOptionHandler.class, usage = "Path to TUIs of interest")
private Path tuisOfInterestFile;

@Argument(index = 2, metaVar = "PATH/TO/BANNED_TTYS", handler = PathOptionHandler.class,
usage = "Banned TTYs file")
@Argument(index = 2, metaVar = "PATH/TO/BANNED_TTYS", handler = PathOptionHandler.class, usage = "Banned TTYs file")
private Path bannedTtysFile;

@Argument(index = 3, metaVar = "OUTPUT_PATH", usage = "Path to write db out to.")
private Path dbPath;

@Option(name = "--filtered-suis", handler = PathOptionHandler.class,
usage = "A path to a file containing SUIs to filter out.")
@Option(name = "--filtered-suis", handler = PathOptionHandler.class, usage = "A path to a file containing SUIs to filter out.")
private Path filteredSuisPath = null;

@Option(name = "--filtered-cuis", handler = PathOptionHandler.class,
usage = "A path to a file containing CUIs to filter out.")
@Option(name = "--filtered-cuis", handler = PathOptionHandler.class, usage = "A path to a file containing CUIs to filter out.")
private Path filteredCuisPath = null;

@Option(name = "--filtered-sui-cuis", handler = PathOptionHandler.class,
usage = "A path to a file containing SUI-CUI combinations to filter")
@Option(name = "--filtered-sui-cuis", handler = PathOptionHandler.class, usage = "A path to a file containing SUI-CUI combinations to filter")
private Path filteredSuiCuisPath;

@Option(name = "--filtered-tuis", usage = "A path to a file containing TUIs to filter out.")
@@ -89,7 +84,9 @@ public static void main(String[] args) {
builder.doWork();
} catch (CmdLineException e) {
System.err.println(e.getLocalizedMessage());
System.err.println("java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder" + parser.printExample(OptionHandlerFilter.ALL) + " PATH/TO/UMLS PATH/TO/TUIS PATH/TO/BANNED_TTYS OUTPUT_PATH");
System.err.println(
"java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder" + parser.printExample(OptionHandlerFilter.ALL)
+ " PATH/TO/UMLS PATH/TO/TUIS PATH/TO/BANNED_TTYS OUTPUT_PATH");
parser.printUsage(System.err);
} catch (IOException e) {
e.printStackTrace();
@@ -208,7 +205,7 @@ private void doWork() throws IOException {
SuiCui sc = new SuiCui(sui, cui);
if (filteredCuis.contains(cui) || filteredTuis.contains(tui)
|| filteredSuiCuis.contains(sc) || filteredSuis
.contains(sui)) {
.contains(sui)) {
continue;
}

@@ -227,7 +224,7 @@ private void doWork() throws IOException {
options.setCreateIfMissing(true);
options.prepareForBulkLoad();
try (RocksDB phrases = RocksDB.open(options, dbPath.resolve("phrases").toString());
RocksDB lowercase = RocksDB.open(options, dbPath.resolve("lowercase").toString())) {
RocksDB lowercase = RocksDB.open(options, dbPath.resolve("lowercase").toString())) {
int wrote = 0;
for (Entry<String, List<ConceptRow>> entry : phrasesMap.entrySet()) {
List<ConceptRow> suiCuiTuis = entry.getValue();
@@ -308,16 +305,17 @@ private void doWork() throws IOException {
}

int wrote = 0;
try (Options options = new Options();
RocksDB normsDb = RocksDB.open(options, dbPath.resolve("norms").toString())) {
try (Options options = new Options()) {
options.setCreateIfMissing(true);
options.prepareForBulkLoad();
for (Entry<String, List<ConceptRow>> entry : map.entrySet()) {
List<ConceptRow> suiCuiTuis = entry.getValue();
byte[] suiCuiTuiBytes = getBytes(suiCuiTuis);
normsDb.put(entry.getKey().getBytes(), suiCuiTuiBytes);
if (++wrote % 10_000 == 0) {
System.out.println("Wrote " + wrote + " of " + map.size() + " norm term bags.");
try (RocksDB normsDb = RocksDB.open(options, dbPath.resolve("norms").toString())) {
for (Entry<String, List<ConceptRow>> entry : map.entrySet()) {
List<ConceptRow> suiCuiTuis = entry.getValue();
byte[] suiCuiTuiBytes = getBytes(suiCuiTuis);
normsDb.put(entry.getKey().getBytes(), suiCuiTuiBytes);
if (++wrote % 10_000 == 0) {
System.out.println("Wrote " + wrote + " of " + map.size() + " norm term bags.");
}
}
}
} catch (RocksDBException e) {
@@ -396,7 +394,8 @@ public int hashCode() {
@Override
public int compareTo(@NotNull SuiCui o) {
int compare = Integer.compare(sui.identifier(), o.sui.identifier());
if (compare != 0) return compare;
if (compare != 0)
return compare;
return Integer.compare(cui.identifier(), o.cui.identifier());
}
}
4 changes: 0 additions & 4 deletions java/src/main/resources/edu/umn/biomedicus/defaultConfig.yml
@@ -32,7 +32,3 @@ tnt:
beam.threshold: 2.0
sections:
headersFile: ${BIOMEDICUS_DATA}/sections/header_patterns.txt
data:
# THE CERTIFICATE IS SIGNED FOR ATHENA, USE ATHENA URL
data_url: https://athena.ahc.umn.edu/downloads/open/biomedicus-3.0b9-standard-data.zip
version: 3.0b9
6 changes: 3 additions & 3 deletions python/biomedicus/data_version.py
@@ -1,4 +1,4 @@
# Copyright 2022 Regents of the University of Minnesota.
# Copyright (c) Regents of the University of Minnesota.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,5 +13,5 @@
# limitations under the License.
"""The expected data version and url to download data."""

DATA_VERSION = "2023-10-31"
DATA_URL = "https://athena.ahc.umn.edu/downloads/open/biomedicus-open-data-2023-10-31.zip"
DATA_VERSION = "2025-01-27"
DATA_URL = "https://athena.ahc.umn.edu/downloads/open/biomedicus-open-data-2025-01-27.zip"
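
As a hedged sketch (not the project's actual download logic), a consumer might use these constants to fetch and unpack the pinned data release; the function name and directory layout here are assumptions:

import urllib.request
import zipfile
from pathlib import Path

from biomedicus.data_version import DATA_URL, DATA_VERSION


def download_data(target_dir: str) -> Path:
    """Download the archive for DATA_VERSION and extract it under target_dir."""
    base = Path(target_dir)
    base.mkdir(parents=True, exist_ok=True)
    archive = base / f"biomedicus-open-data-{DATA_VERSION}.zip"
    urllib.request.urlretrieve(DATA_URL, archive)  # fetch the release zip
    extracted = base / DATA_VERSION
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(extracted)  # unpack next to the archive
    return extracted
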
128 changes: 78 additions & 50 deletions python/biomedicus/dependencies/stanza_parser.py
@@ -1,4 +1,4 @@
# Copyright 2020 Regents of the University of Minnesota.
# Copyright (c) Regents of the University of Minnesota.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@

import numpy as np
import stanza
import torch
from mtap import Document, DocumentProcessor, processor, run_processor, GenericLabel
from mtap.descriptors import labels, label_property

@@ -43,7 +44,7 @@ def stanza_deps_and_upos_tags(sentence, stanza_sentence):
for dep_id in range(len(stanza_dependencies) + 1):
if graph[head_id, dep_id] > 0:
dep, deprel = dep_map[dep_id]
token_begin, token_end = sentence[dep_id - 1]
token_begin, token_end = sentence[dep_id - 1].location
dep_label = GenericLabel(token_begin, token_end, head=head_dep_label,
deprel=deprel)
dep_label.reference_cache['dependents'] = []
@@ -55,78 +56,105 @@
if len(dependencies) == len(stanza_dependencies) - 1:
raise ValueError("Unexpected number of dependencies")
for word in stanza_sentence.words:
token_begin, token_end = sentence[word.id - 1]
token_begin, token_end = sentence[word.id - 1].location
sentence_upos_tags.append(GenericLabel(token_begin, token_end, tag=word.upos))
return sentence_deps, sentence_upos_tags


@processor(
'biomedicus-dependencies',
human_name="BioMedICUS Stanza Dependency Parser",
description="Calls out to the Stanford Stanza framework for dependency parsing.",
'biomedicus-selective-dependencies',
human_name="BioMedICUS Stanza Selective Dependency Parser",
description="Calls out to the Stanford Stanza framework for dependency parsing"
"on a appropriate subset of sentences.",
inputs=[
labels(name='sentences', reference='biomedicus-sentences/sentences'),
labels(name='pos_tags', reference='biomedicus-tnt-tagger/pos_tags'),
labels(
name='umls_terms',
reference='biomedicus-concepts/umls_terms',
name_from_parameter='terms_index',
optional=True
),
labels(
"negation_triggers",
reference='biomedicus-negex-triggers',
optional=True
)
],
outputs=[
labels(name='dependencies',
description="The dependent words.",
properties=[
label_property(
'deprel',
description="The dependency relation",
data_type='str'
),
label_property(
'head',
description="The head of this label or null if its the root.",
nullable=True,
data_type='ref:dependencies'
),
label_property(
'dependents',
description="The dependents of ths dependent.",
data_type='list[ref:dependencies]'
)
]),
labels(name='upos_tags',
description="Universal Part-of-speech tags",
properties=[
label_property(
'tag',
description="The Universal Part-of-Speech tag",
data_type='str'
)
])
labels(
name='dependencies',
description="The dependent words.",
properties=[
label_property(
'deprel',
description="The dependency relation",
data_type='str'
),
label_property(
'head',
description="The head of this label or null if its the root.",
nullable=True,
data_type='ref:dependencies'
),
label_property(
'dependents',
description="The dependents of ths dependent.",
data_type='list[ref:dependencies]'
)
]
),
labels(
name='upos_tags',
description="Universal Part-of-speech tags",
properties=[
label_property(
'tag',
description="The Universal Part-of-Speech tag",
data_type='str'
)
]
)
],
additional_data={
'entry_point': __name__
'entry_point': __name__,
}
)
class StanzaParser(DocumentProcessor):

def __init__(self):
stanza.download('en')
def __init__(self, selective=False):
self.nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse',
tokenize_pretokenized=True)
tokenize_pretokenized=True, verbose=False)
self.selective = selective

def __reduce__(self):
return StanzaParser, ()

def process_document(self,
document: Document,
params: Dict[str, Any]):
sentences = document.labels['sentences']
pos_tags = document.labels['pos_tags']

sentence_tokens = []
for sentence in sentences:
tokens = [(pt.start_index, pt.end_index) for pt in pos_tags.inside(sentence)]
sentence_tokens.append(tokens)

stanza_doc = self.nlp([[document.text[a:b] for a, b in sentence] for sentence in sentence_tokens])

all_deps = []
all_upos_tags = []
for stanza_sentence, sentence in zip(stanza_doc.sentences, sentence_tokens):
sentence_deps, sentence_upos_tags = stanza_deps_and_upos_tags(sentence, stanza_sentence)
selective = self.selective or params.get('selective', False)
if selective:
terms_index_name = params.get('terms_index', 'umls_terms')
negation_triggers = document.labels['negation_triggers']
terms = document.labels[terms_index_name]

def include(sentence):
if selective and (len(terms.inside(sentence)) == 0 or len(negation_triggers.inside(sentence)) == 0):
return False
return True

sentences = [sent for sent in document.labels['sentences'] if include(sent)]

with torch.no_grad():
stanza_doc = self.nlp([[tag.text for tag in pos_tags.inside(sent)] for sent in sentences])

for sentence, stanza_sent in zip(sentences, stanza_doc.sentences):
sentence_tags = pos_tags.inside(sentence)
sentence_deps, sentence_upos_tags = stanza_deps_and_upos_tags(sentence_tags, stanza_sent)
all_deps.extend(sentence_deps)
all_upos_tags.extend(sentence_upos_tags)

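A minimal hosting sketch for the processor above, assuming the module is importable as biomedicus.dependencies.stanza_parser (the path shown in the diff header); run_processor is the same mtap helper imported at the top of the file:

from mtap import run_processor

from biomedicus.dependencies.stanza_parser import StanzaParser

if __name__ == '__main__':
    # selective=True restricts parsing to sentences containing both a UMLS
    # term and a negation trigger, per the include() predicate above.
    run_processor(StanzaParser(selective=True))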