diff --git a/docs/test_results/3.5-full-data.yml b/docs/test_results/3.5-full-data.yml
new file mode 100644
index 00000000..666e5e86
--- /dev/null
+++ b/docs/test_results/3.5-full-data.yml
@@ -0,0 +1,33 @@
+Concepts:
+  Precision: 0.5435908753019885
+  Process Method Duration: '0:00:00.027807'
+  Recall: 0.6921164772727273
+  Remote Call Duration: '0:00:00.031274'
+TnT Pos Tagger:
+  Accuracy: 0.9349333907344957
+  Process Method Duration: '0:00:00.007598'
+  Remote Call Duration: '0:00:00.009994'
+biomedicus-deepen:
+  F1: 0.912258064516129
+  Gold Standard: 2010 i2b2-VA
+  Per-Document Mean Pipeline Duration: '0:00:00.377380'
+  Precision: 0.8830141548709409
+  Recall: 0.9435053380782918
+biomedicus-dependencies:
+  Corpus: MiPACQ converted to UD from PTB test set
+  LAS: 0.5327625056331681
+  Process Method Duration: '0:00:00.502909'
+  Remote Call Duration: '0:00:00.503824'
+  UAS: 0.6661559260928346
+biomedicus-modification:
+  F1: 0.7100757788380578
+  Gold Standard: 2010 i2b2-VA
+  Per-Document Mean Pipeline Duration: '0:00:00.013226'
+  Precision: 0.9619771863117871
+  Recall: 0.5627224199288257
+biomedicus-negex:
+  F1: 0.8706162076481078
+  Gold Standard: 2010 i2b2-VA
+  Per-Document Mean Pipeline Duration: '0:00:00.009332'
+  Precision: 0.7849231868524473
+  Recall: 0.9773131672597865
diff --git a/docs/test_results/3.5-open-data.yml b/docs/test_results/3.5-open-data.yml
new file mode 100644
index 00000000..ce34ca72
--- /dev/null
+++ b/docs/test_results/3.5-open-data.yml
@@ -0,0 +1,33 @@
+Concepts:
+  Precision: 0.5265714242883056
+  Process Method Duration: '0:00:00.014707'
+  Recall: 0.6747159090909091
+  Remote Call Duration: '0:00:00.018182'
+TnT Pos Tagger:
+  Accuracy: 0.9349333907344957
+  Process Method Duration: '0:00:00.007532'
+  Remote Call Duration: '0:00:00.010117'
+biomedicus-deepen:
+  F1: 0.9104573759931286
+  Gold Standard: 2010 i2b2-VA
+  Per-Document Mean Pipeline Duration: '0:00:00.115112'
+  Precision: 0.8800332088003321
+  Recall: 0.9430604982206405
+biomedicus-dependencies:
+  Corpus: MiPACQ converted to UD from PTB test set
+  LAS: 0.5475739822742978
+  Process Method Duration: '0:00:00.101438'
+  Remote Call Duration: '0:00:00.102343'
+  UAS: 0.683340844224125
+biomedicus-modification:
+  F1: 0.7100757788380578
+  Gold Standard: 2010 i2b2-VA
+  Per-Document Mean Pipeline Duration: '0:00:00.014181'
+  Precision: 0.9619771863117871
+  Recall: 0.5627224199288257
+biomedicus-negex:
+  F1: 0.8706162076481078
+  Gold Standard: 2010 i2b2-VA
+  Per-Document Mean Pipeline Duration: '0:00:00.009531'
+  Precision: 0.7849231868524473
+  Recall: 0.9773131672597865
diff --git a/java/src/main/java/edu/umn/biomedicus/concepts/ConceptDictionaryBuilder.java b/java/src/main/java/edu/umn/biomedicus/concepts/ConceptDictionaryBuilder.java
index 5cfaa079..73069090 100644
--- a/java/src/main/java/edu/umn/biomedicus/concepts/ConceptDictionaryBuilder.java
+++ b/java/src/main/java/edu/umn/biomedicus/concepts/ConceptDictionaryBuilder.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2019 Regents of the University of Minnesota.
+ * Copyright (c) Regents of the University of Minnesota.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -41,7 +41,8 @@
 /**
  * Builds the concepts dictionary.
  * <p>
- * Usage: java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder [umls installation] \
+ * Usage: java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder [umls
+ * installation] \
  * [tuis-of-interest file] [banned-ttys file] [outputPath]
  */
 public class ConceptDictionaryBuilder {
@@ -51,31 +52,25 @@ public class ConceptDictionaryBuilder {

   private static final Pattern SPACE_SPLITTER = Pattern.compile(" ");

-  @Argument(required = true, metaVar = "PATH/TO/UMLS", handler = PathOptionHandler.class,
-      usage = "Path to UMLS installation")
+  @Argument(required = true, metaVar = "PATH/TO/UMLS", handler = PathOptionHandler.class, usage = "Path to UMLS installation")
   private Path umlsPath;

-  @Argument(index = 1, metaVar = "PATH/TO/TUIS", handler = PathOptionHandler.class,
-      usage = "Path to TUIs of interest")
+  @Argument(index = 1, metaVar = "PATH/TO/TUIS", handler = PathOptionHandler.class, usage = "Path to TUIs of interest")
   private Path tuisOfInterestFile;

-  @Argument(index = 2, metaVar = "PATH/TO/BANNED_TTYS", handler = PathOptionHandler.class,
-      usage = "Banned TTYs file")
+  @Argument(index = 2, metaVar = "PATH/TO/BANNED_TTYS", handler = PathOptionHandler.class, usage = "Banned TTYs file")
   private Path bannedTtysFile;

   @Argument(index = 3, metaVar = "OUTPUT_PATH", usage = "Path to write db out to.")
   private Path dbPath;

-  @Option(name = "--filtered-suis", handler = PathOptionHandler.class,
-      usage = "A path to a file containing SUIs to filter out.")
+  @Option(name = "--filtered-suis", handler = PathOptionHandler.class, usage = "A path to a file containing SUIs to filter out.")
   private Path filteredSuisPath = null;

-  @Option(name = "--filtered-cuis", handler = PathOptionHandler.class,
-      usage = "A path to a file containing CUIs to filter out.")
+  @Option(name = "--filtered-cuis", handler = PathOptionHandler.class, usage = "A path to a file containing CUIs to filter out.")
   private Path filteredCuisPath = null;

-  @Option(name = "--filtered-sui-cuis", handler = PathOptionHandler.class,
-      usage = "A path to a file containing SUI-CUI combinations to filter")
+  @Option(name = "--filtered-sui-cuis", handler = PathOptionHandler.class, usage = "A path to a file containing SUI-CUI combinations to filter")
   private Path filteredSuiCuisPath;

   @Option(name = "--filtered-tuis", usage = "A path to a file containing TUIs to filter out.")
@@ -89,7 +84,9 @@ public static void main(String[] args) {
       builder.doWork();
     } catch (CmdLineException e) {
       System.err.println(e.getLocalizedMessage());
-      System.err.println("java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder" + parser.printExample(OptionHandlerFilter.ALL) + " PATH/TO/UMLS PATH/TO/TUIS PATH/TO/BANNED_TTYS OUTPUT_PATH");
+      System.err.println(
+          "java edu.umn.biomedicus.concepts.ConceptDictionaryBuilder" + parser.printExample(OptionHandlerFilter.ALL)
+              + " PATH/TO/UMLS PATH/TO/TUIS PATH/TO/BANNED_TTYS OUTPUT_PATH");
       parser.printUsage(System.err);
     } catch (IOException e) {
       e.printStackTrace();
@@ -208,7 +205,7 @@ private void doWork() throws IOException {
         SuiCui sc = new SuiCui(sui, cui);
         if (filteredCuis.contains(cui) || filteredTuis.contains(tui)
             || filteredSuiCuis.contains(sc) || filteredSuis
-            .contains(sui)) {
+                .contains(sui)) {
           continue;
         }

@@ -227,7 +224,7 @@ private void doWork() throws IOException {
      options.setCreateIfMissing(true);
       options.prepareForBulkLoad();
       try (RocksDB phrases = RocksDB.open(options, dbPath.resolve("phrases").toString());
-          RocksDB lowercase = RocksDB.open(options, dbPath.resolve("lowercase").toString())) {
+           RocksDB lowercase = RocksDB.open(options, dbPath.resolve("lowercase").toString())) {
         int wrote = 0;
         for (Entry<String, List<SuiCuiTui>> entry : phrasesMap.entrySet()) {
           List<SuiCuiTui> suiCuiTuis = entry.getValue();
@@ -308,16 +305,17 @@ private void doWork() throws IOException {
     }

     int wrote = 0;
-    try (Options options = new Options();
-        RocksDB normsDb = RocksDB.open(options, dbPath.resolve("norms").toString())) {
+    try (Options options = new Options()) {
       options.setCreateIfMissing(true);
       options.prepareForBulkLoad();
-      for (Entry<String, List<SuiCuiTui>> entry : map.entrySet()) {
-        List<SuiCuiTui> suiCuiTuis = entry.getValue();
-        byte[] suiCuiTuiBytes = getBytes(suiCuiTuis);
-        normsDb.put(entry.getKey().getBytes(), suiCuiTuiBytes);
-        if (++wrote % 10_000 == 0) {
-          System.out.println("Wrote " + wrote + " of " + map.size() + " norm term bags.");
+      try (RocksDB normsDb = RocksDB.open(options, dbPath.resolve("norms").toString())) {
+        for (Entry<String, List<SuiCuiTui>> entry : map.entrySet()) {
+          List<SuiCuiTui> suiCuiTuis = entry.getValue();
+          byte[] suiCuiTuiBytes = getBytes(suiCuiTuis);
+          normsDb.put(entry.getKey().getBytes(), suiCuiTuiBytes);
+          if (++wrote % 10_000 == 0) {
+            System.out.println("Wrote " + wrote + " of " + map.size() + " norm term bags.");
+          }
         }
       }
     } catch (RocksDBException e) {
@@ -396,7 +394,8 @@ public int hashCode() {
     @Override
     public int compareTo(@NotNull SuiCui o) {
       int compare = Integer.compare(sui.identifier(), o.sui.identifier());
-      if (compare != 0) return compare;
+      if (compare != 0)
+        return compare;
       return Integer.compare(cui.identifier(), o.cui.identifier());
     }
   }
diff --git a/java/src/main/resources/edu/umn/biomedicus/defaultConfig.yml b/java/src/main/resources/edu/umn/biomedicus/defaultConfig.yml
index a0795dbe..8c4787cd 100644
--- a/java/src/main/resources/edu/umn/biomedicus/defaultConfig.yml
+++ b/java/src/main/resources/edu/umn/biomedicus/defaultConfig.yml
@@ -32,7 +32,3 @@ tnt:
     beam.threshold: 2.0
 sections:
   headersFile: ${BIOMEDICUS_DATA}/sections/header_patterns.txt
-data:
-  # THE CERTIFICATE IS SIGNED FOR ATHENA, USE ATHENA URL
-  data_url: https://athena.ahc.umn.edu/downloads/open/biomedicus-3.0b9-standard-data.zip
-  version: 3.0b9
diff --git a/python/biomedicus/data_version.py b/python/biomedicus/data_version.py
index e6aa174c..eee25eb5 100644
--- a/python/biomedicus/data_version.py
+++ b/python/biomedicus/data_version.py
@@ -1,4 +1,4 @@
-# Copyright 2022 Regents of the University of Minnesota.
+# Copyright (c) Regents of the University of Minnesota.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,5 +13,5 @@
 # limitations under the License.
 """The expected data version and url to download data."""

-DATA_VERSION = "2023-10-31"
-DATA_URL = "https://athena.ahc.umn.edu/downloads/open/biomedicus-open-data-2023-10-31.zip"
+DATA_VERSION = "2025-01-27"
+DATA_URL = "https://athena.ahc.umn.edu/downloads/open/biomedicus-open-data-2025-01-27.zip"
diff --git a/python/biomedicus/dependencies/stanza_parser.py b/python/biomedicus/dependencies/stanza_parser.py
index 6e6397cc..f1a24603 100644
--- a/python/biomedicus/dependencies/stanza_parser.py
+++ b/python/biomedicus/dependencies/stanza_parser.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Regents of the University of Minnesota.
+# Copyright (c) Regents of the University of Minnesota.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
 import numpy as np
 import stanza
+import torch
 from mtap import Document, DocumentProcessor, processor, run_processor, GenericLabel
 from mtap.descriptors import labels, label_property

@@ -43,7 +44,7 @@ def stanza_deps_and_upos_tags(sentence, stanza_sentence):
     for dep_id in range(len(stanza_dependencies) + 1):
         if graph[head_id, dep_id] > 0:
             dep, deprel = dep_map[dep_id]
-            token_begin, token_end = sentence[dep_id - 1]
+            token_begin, token_end = sentence[dep_id - 1].location
             dep_label = GenericLabel(token_begin, token_end, head=head_dep_label,
                                      deprel=deprel)
             dep_label.reference_cache['dependents'] = []
@@ -55,78 +56,105 @@ def stanza_deps_and_upos_tags(sentence, stanza_sentence):
     if len(dependencies) == len(stanza_dependencies) - 1:
         raise ValueError("Unexpected number of dependencies")
     for word in stanza_sentence.words:
-        token_begin, token_end = sentence[word.id - 1]
+        token_begin, token_end = sentence[word.id - 1].location
         sentence_upos_tags.append(GenericLabel(token_begin, token_end, tag=word.upos))
     return sentence_deps, sentence_upos_tags


 @processor(
-    'biomedicus-dependencies',
-    human_name="BioMedICUS Stanza Dependency Parser",
-    description="Calls out to the Stanford Stanza framework for dependency parsing.",
+    'biomedicus-selective-dependencies',
+    human_name="BioMedICUS Stanza Selective Dependency Parser",
+    description="Calls out to the Stanford Stanza framework for dependency parsing "
+                "on an appropriate subset of sentences.",
     inputs=[
         labels(name='sentences', reference='biomedicus-sentences/sentences'),
         labels(name='pos_tags', reference='biomedicus-tnt-tagger/pos_tags'),
+        labels(
+            name='umls_terms',
+            reference='biomedicus-concepts/umls_terms',
+            name_from_parameter='terms_index',
+            optional=True
+        ),
+        labels(
+            "negation_triggers",
+            reference='biomedicus-negex-triggers',
+            optional=True
+        )
     ],
     outputs=[
-        labels(name='dependencies',
-               description="The dependent words.",
-               properties=[
-                   label_property(
-                       'deprel',
-                       description="The dependency relation",
-                       data_type='str'
-                   ),
-                   label_property(
-                       'head',
-                       description="The head of this label or null if its the root.",
-                       nullable=True,
-                       data_type='ref:dependencies'
-                   ),
-                   label_property(
-                       'dependents',
-                       description="The dependents of ths dependent.",
-                       data_type='list[ref:dependencies]'
-                   )
-               ]),
-        labels(name='upos_tags',
-               description="Universal Part-of-speech tags",
-               properties=[
-                   label_property(
-                       'tag',
-                       description="The Universal Part-of-Speech tag",
-                       data_type='str'
-                   )
-               ])
+        labels(
+            name='dependencies',
+            description="The dependent words.",
+            properties=[
+                label_property(
+                    'deprel',
+                    description="The dependency relation",
+                    data_type='str'
+                ),
+                label_property(
+                    'head',
+                    description="The head of this label or null if it's the root.",
+                    nullable=True,
+                    data_type='ref:dependencies'
+                ),
+                label_property(
+                    'dependents',
+                    description="The dependents of this dependent.",
+                    data_type='list[ref:dependencies]'
+                )
+            ]
+        ),
+        labels(
+            name='upos_tags',
+            description="Universal Part-of-speech tags",
+            properties=[
+                label_property(
+                    'tag',
+                    description="The Universal Part-of-Speech tag",
+                    data_type='str'
+                )
+            ]
+        )
     ],
     additional_data={
-        'entry_point': __name__
+        'entry_point': __name__,
     }
 )
 class StanzaParser(DocumentProcessor):
-
-    def __init__(self):
-        stanza.download('en')
+    def __init__(self, selective=False):
         self.nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse',
-                                   tokenize_pretokenized=True)
+                                   tokenize_pretokenized=True, verbose=False)
+        self.selective = selective
+
+    def __reduce__(self):
+        return StanzaParser, ()

     def process_document(self, document: Document, params: Dict[str, Any]):
-        sentences = document.labels['sentences']
         pos_tags = document.labels['pos_tags']
-        sentence_tokens = []
-        for sentence in sentences:
-            tokens = [(pt.start_index, pt.end_index) for pt in pos_tags.inside(sentence)]
-            sentence_tokens.append(tokens)
-
-        stanza_doc = self.nlp([[document.text[a:b] for a, b in sentence] for sentence in sentence_tokens])
-
         all_deps = []
         all_upos_tags = []
-
-        for stanza_sentence, sentence in zip(stanza_doc.sentences, sentence_tokens):
-            sentence_deps, sentence_upos_tags = stanza_deps_and_upos_tags(sentence, stanza_sentence)
+
+        selective = self.selective or params.get('selective', False)
+        if selective:
+            terms_index_name = params.get('terms_index', 'umls_terms')
+            negation_triggers = document.labels['negation_triggers']
+            terms = document.labels[terms_index_name]
+
+        def include(sentence):
+            if selective and (len(terms.inside(sentence)) == 0 or len(negation_triggers.inside(sentence)) == 0):
+                return False
+            return True
+
+        sentences = [sent for sent in document.labels['sentences'] if include(sent)]
+
+        with torch.no_grad():
+            stanza_doc = self.nlp([[tag.text for tag in pos_tags.inside(sent)] for sent in sentences])
+
+        for sentence, stanza_sent in zip(sentences, stanza_doc.sentences):
+            sentence_tags = pos_tags.inside(sentence)
+            sentence_deps, sentence_upos_tags = stanza_deps_and_upos_tags(sentence_tags, stanza_sent)
             all_deps.extend(sentence_deps)
             all_upos_tags.extend(sentence_upos_tags)
diff --git a/python/biomedicus/dependencies/stanza_selective_parser.py b/python/biomedicus/dependencies/stanza_selective_parser.py
index 458a6bba..3eee94cc 100644
--- a/python/biomedicus/dependencies/stanza_selective_parser.py
+++ b/python/biomedicus/dependencies/stanza_selective_parser.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Regents of the University of Minnesota.
+# Copyright (c) Regents of the University of Minnesota.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -20,125 +20,22 @@
 from mtap.processing import DocumentProcessor
 from mtap.descriptors import labels, label_property

-from biomedicus.dependencies.stanza_parser import stanza_deps_and_upos_tags
-
-
-@processor(
-    'biomedicus-selective-dependencies',
-    human_name="BioMedICUS Stanza Selective Dependency Parser",
-    description="Calls out to the Stanford Stanza framework for dependency parsing"
-                "on a appropriate subset of sentences.",
-    inputs=[
-        labels(name='sentences', reference='biomedicus-sentences/sentences'),
-        labels(name='pos_tags', reference='biomedicus-tnt-tagger/pos_tags'),
-        labels(
-            name='umls_terms',
-            reference='biomedicus-concepts/umls_terms',
-            name_from_parameter='terms_index'
-        ),
-        labels(
-            "negation_triggers",
-            reference='biomedicus-negex-triggers'
-        )
-    ],
-    outputs=[
-        labels(
-            name='dependencies',
-            description="The dependent words.",
-            properties=[
-                label_property(
-                    'deprel',
-                    description="The dependency relation",
-                    data_type='str'
-                ),
-                label_property(
-                    'head',
-                    description="The head of this label or null if its the root.",
-                    nullable=True,
-                    data_type='ref:dependencies'
-                ),
-                label_property(
-                    'dependents',
-                    description="The dependents of ths dependent.",
-                    data_type='list[ref:dependencies]'
-                )
-            ]
-        ),
-        labels(
-            name='upos_tags',
-            description="Universal Part-of-speech tags",
-            properties=[
-                label_property(
-                    'tag',
-                    description="The Universal Part-of-Speech tag",
-                    data_type='str'
-                )
-            ]
-        )
-    ],
-    additional_data={
-        'entry_point': __name__,
-    }
-)
-class StanzaSelectiveParser(DocumentProcessor):
-    def __init__(self):
-        self.nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse',
-                                   tokenize_pretokenized=True, verbose=False)
-
-    def __reduce__(self):
-        return StanzaSelectiveParser, ()
-
-    def process_document(self,
-                         document: Document,
-                         params: Dict[str, Any]):
-        pos_tags = document.labels['pos_tags']
-        terms_index_name = params.get('terms_index', 'umls_terms')
-        terms = document.labels[terms_index_name]
-        negation_triggers = document.labels['negation_triggers']
-
-        all_deps = []
-        all_upos_tags = []
-        sentences = []
-        sentence_texts = []
-        for sentence in document.labels['sentences']:
-            tokens = [(pt.start_index, pt.end_index) for pt in pos_tags.inside(sentence)]
-            if len(terms.inside(sentence)) == 0 or len(negation_triggers.inside(sentence)) == 0:
-                continue
-            sentences.append(tokens)
-            sentence_texts.append(sentence.text)
-
-        with torch.no_grad():
-            stanza_doc = self.nlp([[document.text[a:b] for a, b in sentence] for sentence in sentences])
-        for (sentence, stanza_sentence) in zip(sentences, stanza_doc.sentences):
-            sentence_deps, sentence_upos_tags = stanza_deps_and_upos_tags(sentence, stanza_sentence)
-            all_deps.extend(sentence_deps)
-            all_upos_tags.extend(sentence_upos_tags)
-
-        document.add_labels('dependencies', all_deps)
-        document.add_labels('upos_tags', all_upos_tags)
+from biomedicus.dependencies.stanza_parser import StanzaParser


 def main(args=None):
     parser = ArgumentParser(parents=[processor_parser()])
     parser.add_argument('--offline', action='store_true')
-    parser.add_argument(
-        '--mp', action='store_true',
-        help="Whether to use the multiprocessing pool based processor server."
-    )
-    parser.add_argument(
-        '--mp-start-method', default='forkserver', choices=['forkserver', 'spawn'],
-        help="The multiprocessing start method to use"
-    )
     options = parser.parse_args(args)
     if not options.offline:
         stanza.download('en')

-    processor = StanzaSelectiveParser()
+    processor = StanzaParser(selective=True)
     mp_context = None
     if options.mp:
         mp_context = torch.multiprocessing.get_context(options.mp_start_method)
-    run_processor(processor, options=options, mp=options.mp, mp_context=mp_context)
+    run_processor(processor, options=options, mp_context=mp_context)


 if __name__ == '__main__':
diff --git a/python/biomedicus/negation/deepen.py b/python/biomedicus/negation/deepen.py
index 695b1ae5..4d3c9564 100644
--- a/python/biomedicus/negation/deepen.py
+++ b/python/biomedicus/negation/deepen.py
@@ -202,22 +202,7 @@ def process_document(self, document: Document, params: Dict[str, Any]):


 def main(args=None):
-    parser = ArgumentParser(add_help=True, parents=[processor_parser()])
-    parser.add_argument(
-        '--mp', action='store_true',
-        help="Whether to use the multiprocessing pool based processor server."
-    )
-    parser.add_argument(
-        '--mp-start-method', default='forkserver', choices=['forkserver', 'spawn'],
-        help="The multiprocessing start method to use"
-    )
-    options = parser.parse_args(args)
-    mp_context = None
-    if options.mp:
-        import multiprocessing as mp
-        mp_context = mp.get_context(options.mp_start_method)
-
-    run_processor(DeepenProcessor(), options=options, mp=options.mp, mp_context=mp_context)
+    run_processor(DeepenProcessor())


 if __name__ == '__main__':
diff --git a/python/biomedicus/sentences/bi_lstm.py b/python/biomedicus/sentences/bi_lstm.py
index ffe0afb3..17e10961 100644
--- a/python/biomedicus/sentences/bi_lstm.py
+++ b/python/biomedicus/sentences/bi_lstm.py
@@ -440,7 +440,7 @@ class Hparams:
     model.to(device=device)
     logger.info('Loading model weights from: {}'.format(conf.model_file))
     with conf.model_file.open('rb') as f:
-        state_dict = torch.load(f)
+        state_dict = torch.load(f, weights_only=True)
     model.load_state_dict(state_dict)
     model.eval()
     if conf.mp:
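
Usage sketch (illustrative, not part of the patch): after this change, stanza_selective_parser.py no longer defines its own StanzaSelectiveParser class; selective parsing is just StanzaParser constructed with selective=True, registered under the processor name 'biomedicus-selective-dependencies'. A minimal way to host it, mirroring what stanza_selective_parser.main() now does and assuming a standard MTAP deployment reachable by the processor:

    # Illustrative only; mirrors stanza_selective_parser.main() after this patch.
    import stanza
    from mtap import run_processor

    from biomedicus.dependencies.stanza_parser import StanzaParser

    if __name__ == '__main__':
        stanza.download('en')  # the real entry point skips this when --offline is passed
        run_processor(StanzaParser(selective=True))

In selective mode the processor parses only sentences containing both a UMLS term and a negation trigger, which is why the umls_terms and negation_triggers inputs are declared optional=True on the merged processor: they are required only when selective parsing is requested.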