Commit 0cdd78e
Updated to data version 2025-01-27
benknoll-umn committed Jan 28, 2025
1 parent a94f1e1 commit 0cdd78e
Showing 9 changed files with 178 additions and 207 deletions.
33 changes: 33 additions & 0 deletions docs/test_results/3.5-full-data.yml
@@ -0,0 +1,33 @@
Concepts:
Precision: 0.5435908753019885
Process Method Duration: '0:00:00.027807'
Recall: 0.6921164772727273
Remote Call Duration: '0:00:00.031274'
TnT Pos Tagger:
Accuracy: 0.9349333907344957
Process Method Duration: '0:00:00.007598'
Remote Call Duration: '0:00:00.009994'
biomedicus-deepen:
F1: 0.912258064516129
Gold Standard: 2010 i2b2-VA
Per-Document Mean Pipeline Duration: '0:00:00.377380'
Precision: 0.8830141548709409
Recall: 0.9435053380782918
biomedicus-dependencies:
Corpus: MiPACQ converted to UD from PTB test set
LAS: 0.5327625056331681
Process Method Duration: '0:00:00.502909'
Remote Call Duration: '0:00:00.503824'
UAS: 0.6661559260928346
biomedicus-modification:
F1: 0.7100757788380578
Gold Standard: 2010 i2b2-VA
Per-Document Mean Pipeline Duration: '0:00:00.013226'
Precision: 0.9619771863117871
Recall: 0.5627224199288257
biomedicus-negex:
F1: 0.8706162076481078
Gold Standard: 2010 i2b2-VA
Per-Document Mean Pipeline Duration: '0:00:00.009332'
Precision: 0.7849231868524473
Recall: 0.9773131672597865
33 changes: 33 additions & 0 deletions docs/test_results/3.5-open-data.yml
@@ -0,0 +1,33 @@
Concepts:
Precision: 0.5265714242883056
Process Method Duration: '0:00:00.014707'
Recall: 0.6747159090909091
Remote Call Duration: '0:00:00.018182'
TnT Pos Tagger:
Accuracy: 0.9349333907344957
Process Method Duration: '0:00:00.007532'
Remote Call Duration: '0:00:00.010117'
biomedicus-deepen:
F1: 0.9104573759931286
Gold Standard: 2010 i2b2-VA
Per-Document Mean Pipeline Duration: '0:00:00.115112'
Precision: 0.8800332088003321
Recall: 0.9430604982206405
biomedicus-dependencies:
Corpus: MiPACQ converted to UD from PTB test set
LAS: 0.5475739822742978
Process Method Duration: '0:00:00.101438'
Remote Call Duration: '0:00:00.102343'
UAS: 0.683340844224125
biomedicus-modification:
F1: 0.7100757788380578
Gold Standard: 2010 i2b2-VA
Per-Document Mean Pipeline Duration: '0:00:00.014181'
Precision: 0.9619771863117871
Recall: 0.5627224199288257
biomedicus-negex:
F1: 0.8706162076481078
Gold Standard: 2010 i2b2-VA
Per-Document Mean Pipeline Duration: '0:00:00.009531'
Precision: 0.7849231868524473
Recall: 0.9773131672597865
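
For reference, each F1 reported above is the harmonic mean of the corresponding precision and recall. A quick illustrative check in Python, using the biomedicus-deepen values from 3.5-full-data.yml:

def f1(precision: float, recall: float) -> float:
    """Harmonic mean of precision and recall."""
    return 2 * precision * recall / (precision + recall)

# biomedicus-deepen (full data): reproduces the reported F1
print(f1(0.8830141548709409, 0.9435053380782918))  # -> 0.912258064516129
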
java/src/main/java/edu/umn/biomedicus/concepts/ConceptDictionaryBuilder.java
@@ -1,5 +1,5 @@
/*
* Copyright 2019 Regents of the University of Minnesota.
* Copyright (c) Regents of the University of Minnesota.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -41,7 +41,8 @@
/**
* Builds the concepts dictionary.
* <p>
* Usage: java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder [umls installation] \
* Usage: java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder [umls
* installation] \
* [tuis-of-interest file] [banned-ttys file] [outputPath]
*/
public class ConceptDictionaryBuilder {
@@ -51,31 +52,25 @@ public class ConceptDictionaryBuilder {

private static final Pattern SPACE_SPLITTER = Pattern.compile(" ");

@Argument(required = true, metaVar = "PATH/TO/UMLS", handler = PathOptionHandler.class,
usage = "Path to UMLS installation")
@Argument(required = true, metaVar = "PATH/TO/UMLS", handler = PathOptionHandler.class, usage = "Path to UMLS installation")
private Path umlsPath;

@Argument(index = 1, metaVar = "PATH/TO/TUIS", handler = PathOptionHandler.class,
usage = "Path to TUIs of interest")
@Argument(index = 1, metaVar = "PATH/TO/TUIS", handler = PathOptionHandler.class, usage = "Path to TUIs of interest")
private Path tuisOfInterestFile;

@Argument(index = 2, metaVar = "PATH/TO/BANNED_TTYS", handler = PathOptionHandler.class,
usage = "Banned TTYs file")
@Argument(index = 2, metaVar = "PATH/TO/BANNED_TTYS", handler = PathOptionHandler.class, usage = "Banned TTYs file")
private Path bannedTtysFile;

@Argument(index = 3, metaVar = "OUTPUT_PATH", usage = "Path to write db out to.")
private Path dbPath;

@Option(name = "--filtered-suis", handler = PathOptionHandler.class,
usage = "A path to a file containing SUIs to filter out.")
@Option(name = "--filtered-suis", handler = PathOptionHandler.class, usage = "A path to a file containing SUIs to filter out.")
private Path filteredSuisPath = null;

@Option(name = "--filtered-cuis", handler = PathOptionHandler.class,
usage = "A path to a file containing CUIs to filter out.")
@Option(name = "--filtered-cuis", handler = PathOptionHandler.class, usage = "A path to a file containing CUIs to filter out.")
private Path filteredCuisPath = null;

@Option(name = "--filtered-sui-cuis", handler = PathOptionHandler.class,
usage = "A path to a file containing SUI-CUI combinations to filter")
@Option(name = "--filtered-sui-cuis", handler = PathOptionHandler.class, usage = "A path to a file containing SUI-CUI combinations to filter")
private Path filteredSuiCuisPath;

@Option(name = "--filtered-tuis", usage = "A path to a file containing TUIs to filter out.")
@@ -89,7 +84,9 @@ public static void main(String[] args) {
builder.doWork();
} catch (CmdLineException e) {
System.err.println(e.getLocalizedMessage());
System.err.println("java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder" + parser.printExample(OptionHandlerFilter.ALL) + " PATH/TO/UMLS PATH/TO/TUIS PATH/TO/BANNED_TTYS OUTPUT_PATH");
System.err.println(
"java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder" + parser.printExample(OptionHandlerFilter.ALL)
+ " PATH/TO/UMLS PATH/TO/TUIS PATH/TO/BANNED_TTYS OUTPUT_PATH");
parser.printUsage(System.err);
} catch (IOException e) {
e.printStackTrace();
@@ -208,7 +205,7 @@ private void doWork() throws IOException {
SuiCui sc = new SuiCui(sui, cui);
if (filteredCuis.contains(cui) || filteredTuis.contains(tui)
|| filteredSuiCuis.contains(sc) || filteredSuis
.contains(sui)) {
.contains(sui)) {
continue;
}

@@ -227,7 +224,7 @@ private void doWork() throws IOException {
options.setCreateIfMissing(true);
options.prepareForBulkLoad();
try (RocksDB phrases = RocksDB.open(options, dbPath.resolve("phrases").toString());
RocksDB lowercase = RocksDB.open(options, dbPath.resolve("lowercase").toString())) {
RocksDB lowercase = RocksDB.open(options, dbPath.resolve("lowercase").toString())) {
int wrote = 0;
for (Entry<String, List<ConceptRow>> entry : phrasesMap.entrySet()) {
List<ConceptRow> suiCuiTuis = entry.getValue();
@@ -308,16 +305,17 @@ private void doWork() throws IOException {
}

int wrote = 0;
try (Options options = new Options();
RocksDB normsDb = RocksDB.open(options, dbPath.resolve("norms").toString())) {
try (Options options = new Options()) {
options.setCreateIfMissing(true);
options.prepareForBulkLoad();
for (Entry<String, List<ConceptRow>> entry : map.entrySet()) {
List<ConceptRow> suiCuiTuis = entry.getValue();
byte[] suiCuiTuiBytes = getBytes(suiCuiTuis);
normsDb.put(entry.getKey().getBytes(), suiCuiTuiBytes);
if (++wrote % 10_000 == 0) {
System.out.println("Wrote " + wrote + " of " + map.size() + " norm term bags.");
try (RocksDB normsDb = RocksDB.open(options, dbPath.resolve("norms").toString())) {
for (Entry<String, List<ConceptRow>> entry : map.entrySet()) {
List<ConceptRow> suiCuiTuis = entry.getValue();
byte[] suiCuiTuiBytes = getBytes(suiCuiTuis);
normsDb.put(entry.getKey().getBytes(), suiCuiTuiBytes);
if (++wrote % 10_000 == 0) {
System.out.println("Wrote " + wrote + " of " + map.size() + " norm term bags.");
}
}
}
} catch (RocksDBException e) {
@@ -396,7 +394,8 @@ public int hashCode() {
@Override
public int compareTo(@NotNull SuiCui o) {
int compare = Integer.compare(sui.identifier(), o.sui.identifier());
if (compare != 0) return compare;
if (compare != 0)
return compare;
return Integer.compare(cui.identifier(), o.cui.identifier());
}
}
4 changes: 0 additions & 4 deletions java/src/main/resources/edu/umn/biomedicus/defaultConfig.yml
@@ -32,7 +32,3 @@ tnt:
beam.threshold: 2.0
sections:
headersFile: ${BIOMEDICUS_DATA}/sections/header_patterns.txt
data:
# THE CERTIFICATE IS SIGNED FOR ATHENA, USE ATHENA URL
data_url: https://athena.ahc.umn.edu/downloads/open/biomedicus-3.0b9-standard-data.zip
version: 3.0b9
6 changes: 3 additions & 3 deletions python/biomedicus/data_version.py
@@ -1,4 +1,4 @@
# Copyright 2022 Regents of the University of Minnesota.
# Copyright (c) Regents of the University of Minnesota.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,5 +13,5 @@
# limitations under the License.
"""The expected data version and url to download data."""

DATA_VERSION = "2023-10-31"
DATA_URL = "https://athena.ahc.umn.edu/downloads/open/biomedicus-open-data-2023-10-31.zip"
DATA_VERSION = "2025-01-27"
DATA_URL = "https://athena.ahc.umn.edu/downloads/open/biomedicus-open-data-2025-01-27.zip"
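
As a hedged sketch (not the project's actual download logic), a consumer might use these constants to fetch and unpack the pinned data release; the function name and directory layout here are assumptions:

import urllib.request
import zipfile
from pathlib import Path

from biomedicus.data_version import DATA_URL, DATA_VERSION


def download_data(target_dir: str) -> Path:
    """Download the archive for DATA_VERSION and extract it under target_dir."""
    base = Path(target_dir)
    base.mkdir(parents=True, exist_ok=True)
    archive = base / f"biomedicus-open-data-{DATA_VERSION}.zip"
    urllib.request.urlretrieve(DATA_URL, archive)  # fetch the release zip
    extracted = base / DATA_VERSION
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(extracted)  # unpack next to the archive
    return extracted
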
128 changes: 78 additions & 50 deletions python/biomedicus/dependencies/stanza_parser.py
@@ -1,4 +1,4 @@
# Copyright 2020 Regents of the University of Minnesota.
# Copyright (c) Regents of the University of Minnesota.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@

import numpy as np
import stanza
import torch
from mtap import Document, DocumentProcessor, processor, run_processor, GenericLabel
from mtap.descriptors import labels, label_property

@@ -43,7 +44,7 @@ def stanza_deps_and_upos_tags(sentence, stanza_sentence):
for dep_id in range(len(stanza_dependencies) + 1):
if graph[head_id, dep_id] > 0:
dep, deprel = dep_map[dep_id]
token_begin, token_end = sentence[dep_id - 1]
token_begin, token_end = sentence[dep_id - 1].location
dep_label = GenericLabel(token_begin, token_end, head=head_dep_label,
deprel=deprel)
dep_label.reference_cache['dependents'] = []
@@ -55,78 +56,105 @@
if len(dependencies) == len(stanza_dependencies) - 1:
raise ValueError("Unexpected number of dependencies")
for word in stanza_sentence.words:
token_begin, token_end = sentence[word.id - 1]
token_begin, token_end = sentence[word.id - 1].location
sentence_upos_tags.append(GenericLabel(token_begin, token_end, tag=word.upos))
return sentence_deps, sentence_upos_tags


@processor(
'biomedicus-dependencies',
human_name="BioMedICUS Stanza Dependency Parser",
description="Calls out to the Stanford Stanza framework for dependency parsing.",
'biomedicus-selective-dependencies',
human_name="BioMedICUS Stanza Selective Dependency Parser",
description="Calls out to the Stanford Stanza framework for dependency parsing"
"on a appropriate subset of sentences.",
inputs=[
labels(name='sentences', reference='biomedicus-sentences/sentences'),
labels(name='pos_tags', reference='biomedicus-tnt-tagger/pos_tags'),
labels(
name='umls_terms',
reference='biomedicus-concepts/umls_terms',
name_from_parameter='terms_index',
optional=True
),
labels(
"negation_triggers",
reference='biomedicus-negex-triggers',
optional=True
)
],
outputs=[
labels(name='dependencies',
description="The dependent words.",
properties=[
label_property(
'deprel',
description="The dependency relation",
data_type='str'
),
label_property(
'head',
description="The head of this label or null if its the root.",
nullable=True,
data_type='ref:dependencies'
),
label_property(
'dependents',
description="The dependents of ths dependent.",
data_type='list[ref:dependencies]'
)
]),
labels(name='upos_tags',
description="Universal Part-of-speech tags",
properties=[
label_property(
'tag',
description="The Universal Part-of-Speech tag",
data_type='str'
)
])
labels(
name='dependencies',
description="The dependent words.",
properties=[
label_property(
'deprel',
description="The dependency relation",
data_type='str'
),
label_property(
'head',
description="The head of this label or null if its the root.",
nullable=True,
data_type='ref:dependencies'
),
label_property(
'dependents',
description="The dependents of ths dependent.",
data_type='list[ref:dependencies]'
)
]
),
labels(
name='upos_tags',
description="Universal Part-of-speech tags",
properties=[
label_property(
'tag',
description="The Universal Part-of-Speech tag",
data_type='str'
)
]
)
],
additional_data={
'entry_point': __name__
'entry_point': __name__,
}
)
class StanzaParser(DocumentProcessor):

def __init__(self):
stanza.download('en')
def __init__(self, selective=False):
self.nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse',
tokenize_pretokenized=True)
tokenize_pretokenized=True, verbose=False)
self.selective = selective

def __reduce__(self):
return StanzaParser, ()

def process_document(self,
document: Document,
params: Dict[str, Any]):
sentences = document.labels['sentences']
pos_tags = document.labels['pos_tags']

sentence_tokens = []
for sentence in sentences:
tokens = [(pt.start_index, pt.end_index) for pt in pos_tags.inside(sentence)]
sentence_tokens.append(tokens)

stanza_doc = self.nlp([[document.text[a:b] for a, b in sentence] for sentence in sentence_tokens])

all_deps = []
all_upos_tags = []
for stanza_sentence, sentence in zip(stanza_doc.sentences, sentence_tokens):
sentence_deps, sentence_upos_tags = stanza_deps_and_upos_tags(sentence, stanza_sentence)
selective = self.selective or params.get('selective', False)
if selective:
terms_index_name = params.get('terms_index', 'umls_terms')
negation_triggers = document.labels['negation_triggers']
terms = document.labels[terms_index_name]

def include(sentence):
if selective and (len(terms.inside(sentence)) == 0 or len(negation_triggers.inside(sentence)) == 0):
return False
return True

sentences = [sent for sent in document.labels['sentences'] if include(sent)]

with torch.no_grad():
stanza_doc = self.nlp([[tag.text for tag in pos_tags.inside(sent)] for sent in sentences])

for sentence, stanza_sent in zip(sentences, stanza_doc.sentences):
sentence_tags = pos_tags.inside(sentence)
sentence_deps, sentence_upos_tags = stanza_deps_and_upos_tags(sentence_tags, stanza_sent)
all_deps.extend(sentence_deps)
all_upos_tags.extend(sentence_upos_tags)

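A minimal hosting sketch for the processor above, assuming the module is importable as biomedicus.dependencies.stanza_parser (the path shown in the diff header); run_processor is the same mtap helper imported at the top of the file:

from mtap import run_processor

from biomedicus.dependencies.stanza_parser import StanzaParser

if __name__ == '__main__':
    # selective=True restricts parsing to sentences containing both a UMLS
    # term and a negation trigger, per the include() predicate above.
    run_processor(StanzaParser(selective=True))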