Updated to data version 2025-01-27 #450

Merged · 1 commit · Jan 28, 2025

33 changes: 33 additions & 0 deletions docs/test_results/3.5-full-data.yml
@@ -0,0 +1,33 @@
Concepts:
Precision: 0.5435908753019885
Process Method Duration: '0:00:00.027807'
Recall: 0.6921164772727273
Remote Call Duration: '0:00:00.031274'
TnT Pos Tagger:
Accuracy: 0.9349333907344957
Process Method Duration: '0:00:00.007598'
Remote Call Duration: '0:00:00.009994'
biomedicus-deepen:
F1: 0.912258064516129
Gold Standard: 2010 i2b2-VA
Per-Document Mean Pipeline Duration: '0:00:00.377380'
Precision: 0.8830141548709409
Recall: 0.9435053380782918
biomedicus-dependencies:
Corpus: MiPACQ converted to UD from PTB test set
LAS: 0.5327625056331681
Process Method Duration: '0:00:00.502909'
Remote Call Duration: '0:00:00.503824'
UAS: 0.6661559260928346
biomedicus-modification:
F1: 0.7100757788380578
Gold Standard: 2010 i2b2-VA
Per-Document Mean Pipeline Duration: '0:00:00.013226'
Precision: 0.9619771863117871
Recall: 0.5627224199288257
biomedicus-negex:
F1: 0.8706162076481078
Gold Standard: 2010 i2b2-VA
Per-Document Mean Pipeline Duration: '0:00:00.009332'
Precision: 0.7849231868524473
Recall: 0.9773131672597865
33 changes: 33 additions & 0 deletions docs/test_results/3.5-open-data.yml
@@ -0,0 +1,33 @@
Concepts:
Precision: 0.5265714242883056
Process Method Duration: '0:00:00.014707'
Recall: 0.6747159090909091
Remote Call Duration: '0:00:00.018182'
TnT Pos Tagger:
Accuracy: 0.9349333907344957
Process Method Duration: '0:00:00.007532'
Remote Call Duration: '0:00:00.010117'
biomedicus-deepen:
F1: 0.9104573759931286
Gold Standard: 2010 i2b2-VA
Per-Document Mean Pipeline Duration: '0:00:00.115112'
Precision: 0.8800332088003321
Recall: 0.9430604982206405
biomedicus-dependencies:
Corpus: MiPACQ converted to UD from PTB test set
LAS: 0.5475739822742978
Process Method Duration: '0:00:00.101438'
Remote Call Duration: '0:00:00.102343'
UAS: 0.683340844224125
biomedicus-modification:
F1: 0.7100757788380578
Gold Standard: 2010 i2b2-VA
Per-Document Mean Pipeline Duration: '0:00:00.014181'
Precision: 0.9619771863117871
Recall: 0.5627224199288257
biomedicus-negex:
F1: 0.8706162076481078
Gold Standard: 2010 i2b2-VA
Per-Document Mean Pipeline Duration: '0:00:00.009531'
Precision: 0.7849231868524473
Recall: 0.9773131672597865
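The two result files above share a schema, so the full-data and open-data runs can be compared programmatically. A minimal comparison sketch, assuming PyYAML is installed and the repository root is the working directory; the `flatten` helper is illustrative and not part of BioMedICUS:

```python
# Compare metrics between the full-data and open-data test results.
# Assumes PyYAML (pip install pyyaml); paths are the files added in this PR.
import yaml


def flatten(tree, prefix=""):
    """Flatten nested {component: {metric: value}} dicts into dotted keys."""
    flat = {}
    for key, value in tree.items():
        name = f"{prefix}{key}"
        if isinstance(value, dict):
            flat.update(flatten(value, name + "."))
        else:
            flat[name] = value
    return flat


with open("docs/test_results/3.5-full-data.yml") as f:
    full = flatten(yaml.safe_load(f))
with open("docs/test_results/3.5-open-data.yml") as f:
    open_data = flatten(yaml.safe_load(f))

# Durations are strings like '0:00:00.027807'; compare only numeric metrics.
for key in sorted(full.keys() & open_data.keys()):
    if isinstance(full[key], float) and isinstance(open_data[key], float):
        delta = full[key] - open_data[key]
        print(f"{key}: full={full[key]:.4f} open={open_data[key]:.4f} delta={delta:+.4f}")
```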
java/src/main/java/edu/umn/biomedicus/concepts/ConceptDictionaryBuilder.java
@@ -1,5 +1,5 @@
/*
- * Copyright 2019 Regents of the University of Minnesota.
+ * Copyright (c) Regents of the University of Minnesota.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -41,7 +41,8 @@
/**
* Builds the concepts dictionary.
* <p>
- * Usage: java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder [umls installation] \
+ * Usage: java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder [umls
+ * installation] \
* [tuis-of-interest file] [banned-ttys file] [outputPath]
*/
public class ConceptDictionaryBuilder {
@@ -51,31 +52,25 @@ public class ConceptDictionaryBuilder {

private static final Pattern SPACE_SPLITTER = Pattern.compile(" ");

-  @Argument(required = true, metaVar = "PATH/TO/UMLS", handler = PathOptionHandler.class,
-      usage = "Path to UMLS installation")
+  @Argument(required = true, metaVar = "PATH/TO/UMLS", handler = PathOptionHandler.class, usage = "Path to UMLS installation")
private Path umlsPath;

-  @Argument(index = 1, metaVar = "PATH/TO/TUIS", handler = PathOptionHandler.class,
-      usage = "Path to TUIs of interest")
+  @Argument(index = 1, metaVar = "PATH/TO/TUIS", handler = PathOptionHandler.class, usage = "Path to TUIs of interest")
private Path tuisOfInterestFile;

-  @Argument(index = 2, metaVar = "PATH/TO/BANNED_TTYS", handler = PathOptionHandler.class,
-      usage = "Banned TTYs file")
+  @Argument(index = 2, metaVar = "PATH/TO/BANNED_TTYS", handler = PathOptionHandler.class, usage = "Banned TTYs file")
private Path bannedTtysFile;

@Argument(index = 3, metaVar = "OUTPUT_PATH", usage = "Path to write db out to.")
private Path dbPath;

@Option(name = "--filtered-suis", handler = PathOptionHandler.class,
usage = "A path to a file containing SUIs to filter out.")
@Option(name = "--filtered-suis", handler = PathOptionHandler.class, usage = "A path to a file containing SUIs to filter out.")
private Path filteredSuisPath = null;

@Option(name = "--filtered-cuis", handler = PathOptionHandler.class,
usage = "A path to a file containing CUIs to filter out.")
@Option(name = "--filtered-cuis", handler = PathOptionHandler.class, usage = "A path to a file containing CUIs to filter out.")
private Path filteredCuisPath = null;

@Option(name = "--filtered-sui-cuis", handler = PathOptionHandler.class,
usage = "A path to a file containing SUI-CUI combinations to filter")
@Option(name = "--filtered-sui-cuis", handler = PathOptionHandler.class, usage = "A path to a file containing SUI-CUI combinations to filter")
private Path filteredSuiCuisPath;

@Option(name = "--filtered-tuis", usage = "A path to a file containing TUIs to filter out.")
@@ -89,7 +84,9 @@ public static void main(String[] args) {
builder.doWork();
} catch (CmdLineException e) {
System.err.println(e.getLocalizedMessage());
System.err.println("java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder" + parser.printExample(OptionHandlerFilter.ALL) + " PATH/TO/UMLS PATH/TO/TUIS PATH/TO/BANNED_TTYS OUTPUT_PATH");
System.err.println(
"java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder" + parser.printExample(OptionHandlerFilter.ALL)
+ " PATH/TO/UMLS PATH/TO/TUIS PATH/TO/BANNED_TTYS OUTPUT_PATH");
parser.printUsage(System.err);
} catch (IOException e) {
e.printStackTrace();
@@ -208,7 +205,7 @@ private void doWork() throws IOException {
SuiCui sc = new SuiCui(sui, cui);
if (filteredCuis.contains(cui) || filteredTuis.contains(tui)
|| filteredSuiCuis.contains(sc) || filteredSuis
-              .contains(sui)) {
+          .contains(sui)) {
continue;
}

@@ -227,7 +224,7 @@ private void doWork() throws IOException {
options.setCreateIfMissing(true);
options.prepareForBulkLoad();
try (RocksDB phrases = RocksDB.open(options, dbPath.resolve("phrases").toString());
-        RocksDB lowercase = RocksDB.open(options, dbPath.resolve("lowercase").toString())) {
+         RocksDB lowercase = RocksDB.open(options, dbPath.resolve("lowercase").toString())) {
int wrote = 0;
for (Entry<String, List<ConceptRow>> entry : phrasesMap.entrySet()) {
List<ConceptRow> suiCuiTuis = entry.getValue();
@@ -308,16 +305,17 @@ private void doWork() throws IOException {
}

int wrote = 0;
-      try (Options options = new Options();
-          RocksDB normsDb = RocksDB.open(options, dbPath.resolve("norms").toString())) {
+      try (Options options = new Options()) {
+        options.setCreateIfMissing(true);
+        options.prepareForBulkLoad();
-        for (Entry<String, List<ConceptRow>> entry : map.entrySet()) {
-          List<ConceptRow> suiCuiTuis = entry.getValue();
-          byte[] suiCuiTuiBytes = getBytes(suiCuiTuis);
-          normsDb.put(entry.getKey().getBytes(), suiCuiTuiBytes);
-          if (++wrote % 10_000 == 0) {
-            System.out.println("Wrote " + wrote + " of " + map.size() + " norm term bags.");
+        try (RocksDB normsDb = RocksDB.open(options, dbPath.resolve("norms").toString())) {
+          for (Entry<String, List<ConceptRow>> entry : map.entrySet()) {
+            List<ConceptRow> suiCuiTuis = entry.getValue();
+            byte[] suiCuiTuiBytes = getBytes(suiCuiTuis);
+            normsDb.put(entry.getKey().getBytes(), suiCuiTuiBytes);
+            if (++wrote % 10_000 == 0) {
+              System.out.println("Wrote " + wrote + " of " + map.size() + " norm term bags.");
+            }
          }
        }
} catch (RocksDBException e) {
@@ -396,7 +394,8 @@ public int hashCode() {
@Override
public int compareTo(@NotNull SuiCui o) {
int compare = Integer.compare(sui.identifier(), o.sui.identifier());
-      if (compare != 0) return compare;
+      if (compare != 0)
+        return compare;
return Integer.compare(cui.identifier(), o.cui.identifier());
}
}
4 changes: 0 additions & 4 deletions java/src/main/resources/edu/umn/biomedicus/defaultConfig.yml
@@ -32,7 +32,3 @@ tnt:
beam.threshold: 2.0
sections:
headersFile: ${BIOMEDICUS_DATA}/sections/header_patterns.txt
-data:
-  # THE CERTIFICATE IS SIGNED FOR ATHENA, USE ATHENA URL
-  data_url: https://athena.ahc.umn.edu/downloads/open/biomedicus-3.0b9-standard-data.zip
-  version: 3.0b9
6 changes: 3 additions & 3 deletions python/biomedicus/data_version.py
@@ -1,4 +1,4 @@
-# Copyright 2022 Regents of the University of Minnesota.
+# Copyright (c) Regents of the University of Minnesota.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,5 +13,5 @@
# limitations under the License.
"""The expected data version and url to download data."""

DATA_VERSION = "2023-10-31"
DATA_URL = "https://athena.ahc.umn.edu/downloads/open/biomedicus-open-data-2023-10-31.zip"
DATA_VERSION = "2025-01-27"
DATA_URL = "https://athena.ahc.umn.edu/downloads/open/biomedicus-open-data-2025-01-27.zip"
128 changes: 78 additions & 50 deletions python/biomedicus/dependencies/stanza_parser.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Regents of the University of Minnesota.
+# Copyright (c) Regents of the University of Minnesota.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@

import numpy as np
import stanza
+import torch
from mtap import Document, DocumentProcessor, processor, run_processor, GenericLabel
from mtap.descriptors import labels, label_property

@@ -43,7 +44,7 @@ def stanza_deps_and_upos_tags(sentence, stanza_sentence):
for dep_id in range(len(stanza_dependencies) + 1):
if graph[head_id, dep_id] > 0:
dep, deprel = dep_map[dep_id]
-                token_begin, token_end = sentence[dep_id - 1]
+                token_begin, token_end = sentence[dep_id - 1].location
dep_label = GenericLabel(token_begin, token_end, head=head_dep_label,
deprel=deprel)
dep_label.reference_cache['dependents'] = []
@@ -55,78 +56,105 @@
if len(dependencies) == len(stanza_dependencies) - 1:
raise ValueError("Unexpected number of dependencies")
for word in stanza_sentence.words:
-        token_begin, token_end = sentence[word.id - 1]
+        token_begin, token_end = sentence[word.id - 1].location
sentence_upos_tags.append(GenericLabel(token_begin, token_end, tag=word.upos))
return sentence_deps, sentence_upos_tags


@processor(
-    'biomedicus-dependencies',
-    human_name="BioMedICUS Stanza Dependency Parser",
-    description="Calls out to the Stanford Stanza framework for dependency parsing.",
+    'biomedicus-selective-dependencies',
+    human_name="BioMedICUS Stanza Selective Dependency Parser",
+    description="Calls out to the Stanford Stanza framework for dependency parsing "
+                "on an appropriate subset of sentences.",
inputs=[
labels(name='sentences', reference='biomedicus-sentences/sentences'),
labels(name='pos_tags', reference='biomedicus-tnt-tagger/pos_tags'),
+        labels(
+            name='umls_terms',
+            reference='biomedicus-concepts/umls_terms',
+            name_from_parameter='terms_index',
+            optional=True
+        ),
+        labels(
+            "negation_triggers",
+            reference='biomedicus-negex-triggers',
+            optional=True
+        )
],
outputs=[
-        labels(name='dependencies',
-               description="The dependent words.",
-               properties=[
-                   label_property(
-                       'deprel',
-                       description="The dependency relation",
-                       data_type='str'
-                   ),
-                   label_property(
-                       'head',
-                       description="The head of this label or null if its the root.",
-                       nullable=True,
-                       data_type='ref:dependencies'
-                   ),
-                   label_property(
-                       'dependents',
-                       description="The dependents of ths dependent.",
-                       data_type='list[ref:dependencies]'
-                   )
-               ]),
-        labels(name='upos_tags',
-               description="Universal Part-of-speech tags",
-               properties=[
-                   label_property(
-                       'tag',
-                       description="The Universal Part-of-Speech tag",
-                       data_type='str'
-                   )
-               ])
+        labels(
+            name='dependencies',
+            description="The dependent words.",
+            properties=[
+                label_property(
+                    'deprel',
+                    description="The dependency relation",
+                    data_type='str'
+                ),
+                label_property(
+                    'head',
+                    description="The head of this label or null if it's the root.",
+                    nullable=True,
+                    data_type='ref:dependencies'
+                ),
+                label_property(
+                    'dependents',
+                    description="The dependents of this dependent.",
+                    data_type='list[ref:dependencies]'
+                )
+            ]
+        ),
+        labels(
+            name='upos_tags',
+            description="Universal Part-of-speech tags",
+            properties=[
+                label_property(
+                    'tag',
+                    description="The Universal Part-of-Speech tag",
+                    data_type='str'
+                )
+            ]
+        )
],
additional_data={
-        'entry_point': __name__
+        'entry_point': __name__,
}
)
class StanzaParser(DocumentProcessor):

-    def __init__(self):
-        stanza.download('en')
+    def __init__(self, selective=False):
         self.nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse',
-                                   tokenize_pretokenized=True)
+                                   tokenize_pretokenized=True, verbose=False)
+        self.selective = selective

def __reduce__(self):
return StanzaParser, ()

def process_document(self,
document: Document,
params: Dict[str, Any]):
-        sentences = document.labels['sentences']
         pos_tags = document.labels['pos_tags']

-        sentence_tokens = []
-        for sentence in sentences:
-            tokens = [(pt.start_index, pt.end_index) for pt in pos_tags.inside(sentence)]
-            sentence_tokens.append(tokens)
-
-        stanza_doc = self.nlp([[document.text[a:b] for a, b in sentence] for sentence in sentence_tokens])

         all_deps = []
         all_upos_tags = []
-        for stanza_sentence, sentence in zip(stanza_doc.sentences, sentence_tokens):
-            sentence_deps, sentence_upos_tags = stanza_deps_and_upos_tags(sentence, stanza_sentence)
+        selective = self.selective or params.get('selective', False)
+        if selective:
+            terms_index_name = params.get('terms_index', 'umls_terms')
+            negation_triggers = document.labels['negation_triggers']
+            terms = document.labels[terms_index_name]
+
+        def include(sentence):
+            if selective and (len(terms.inside(sentence)) == 0 or len(negation_triggers.inside(sentence)) == 0):
+                return False
+            return True
+
+        sentences = [sent for sent in document.labels['sentences'] if include(sent)]
+
+        with torch.no_grad():
+            stanza_doc = self.nlp([[tag.text for tag in pos_tags.inside(sent)] for sent in sentences])
+
+        for sentence, stanza_sent in zip(sentences, stanza_doc.sentences):
+            sentence_tags = pos_tags.inside(sentence)
+            sentence_deps, sentence_upos_tags = stanza_deps_and_upos_tags(sentence_tags, stanza_sent)
all_deps.extend(sentence_deps)
all_upos_tags.extend(sentence_upos_tags)

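
For context on running the renamed processor: `run_processor` is already imported at the top of `stanza_parser.py`, so a deployment entry point might look like the sketch below. The `selective=True` flag and the per-document `params` mentioned in the comment come from the diff; hosting it this way is an assumption about a typical mtap entry point, not code from this PR.

```python
# Hypothetical entry point for the selective dependency parser.
from mtap import run_processor

from biomedicus.dependencies.stanza_parser import StanzaParser

if __name__ == '__main__':
    # selective=True restricts parsing to sentences that contain both UMLS
    # terms and negation triggers; callers can instead pass per-document
    # params such as {'selective': True, 'terms_index': 'umls_terms'}.
    run_processor(StanzaParser(selective=True))
```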