Skip to content

Commit

Permalink
Attach Java to Python dists, Deployment / Run Scripts (#33)
Browse files Browse the repository at this point in the history
Various functionality for deployment / distribution.

Updated bi_lstm sentence detector.
  • Loading branch information
benknoll-umn authored Dec 18, 2019
1 parent 986097e commit 6f3ae83
Show file tree
Hide file tree
Showing 21 changed files with 757 additions and 110 deletions.
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,38 @@
The BioMedical Information Collection and Understanding System (BioMedICUS) is a system for large-scale text analysis and processing of biomedical and clinical reports. The system is being developed by the Natural Language Processing and Information Extraction Program at the University of Minnesota Institute for Health Informatics.

This is a collaborative project that aims to serve biomedical and clinical researchers, allowing for customization with different texts.

More information about BioMedICUS can be found on our [website](https://nlpie.github.io/biomedicus).

## Prerequisites

- [Python 3.5 or later](https://www.python.org/)
- [Java JDK 8.0 or later](https://adoptopenjdk.net/index.html). Note that you will need to have the ["java" command on your "$PATH"](https://www.java.com/en/download/help/path.xml).

## Installation

```bash
pip install biomedicus\[torch]
```

## Deploying the default BioMedICUS Pipeline

The following command runs a script that will start up all of the BioMedICUS services for processing clinical notes:

```bash
biomedicus deploy --download-data
```

## Processing a directory of text files using BioMedICUS

After deploying BioMedICUS, you can process a directory of documents using the following command:

```bash
biomedicus run /path/to/input_dir /path/to/output_dir
```

This will process the documents in the directory using BioMedICUS and save the results as JSON-serialized MTAP Events to the output directory.

## Contact

BioMedICUS is developed by the [NLP/IE Group](https://healthinformatics.umn.edu/research/nlpie-group) at the University of Minnesota Institute for Health Informatics. You can contact us at [[email protected]](mailto:[email protected]).
55 changes: 47 additions & 8 deletions java/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

plugins {
id 'com.github.johnrengelman.shadow' version '5.2.0'
id 'java'
}

Expand All @@ -26,7 +27,43 @@ def gitVersion = { ->
commandLine 'git', 'describe', '--tags', '--dirty'
standardOutput = stdout
}
return stdout.toString().replaceFirst('^v', "").trim()
def pattern = ~/v([0-9]+)(?:.([0-9]+))?(?:.([0-9]+))?(?:-(alpha|beta|pre|rc).([0-9]+))?(?:-([0-9]+)-([a-g0-9]+))?(-dirty)?/
def newVersion = stdout.toString().trim().replaceFirst(pattern) { _,major,minor,patch,pre,preVersion,com,hash,dirty ->
def incremented = false
def result = ''
if (com != null || hash != null || dirty != null) {
result = '-SNAPSHOT'
} else {
incremented = true
}
if (pre != null && preVersion != null) {
if (!incremented) {
preVersion = (preVersion as int) + 1
incremented = true
}
result = "-${pre}${preVersion}${result}"
}
if (patch != null) {
if (!incremented) {
patch = (patch as int) + 1
incremented = true
}
result = ".${patch}${result}"
}
if (minor != null) {
if (!incremented) {
minor = (minor as int) + 1
incremented = true
}
result = ".${minor}${result}"
}
if (!incremented) {
major = (major as int) + 1
}
result = "${major}${result}"
return result
}
return newVersion
}

version gitVersion()
Expand Down Expand Up @@ -73,15 +110,13 @@ test {
useJUnitPlatform()
}

task fatJar(type: Jar) {
shadowJar {
manifest {
attributes 'Implementation-Title': 'NLP-NEWT',
'Implementation-Version': version,
'Main-Class': 'edu.umn.nlpie.mtap.MTAP'
attributes 'Implementation-Title': 'BioMedICUS',
'Description': 'A system for large-scale text analysis and processing of biomedical and clinical reports.',
'Implementation-Version': archiveVersion
}
setArchiveBaseName project.name + '-all'
from { configurations.runtimeClasspath.collect { it.isDirectory() ? it : zipTree(it) } }
with jar
mergeServiceFiles()
}

task execute(type: JavaExec) {
Expand All @@ -93,3 +128,7 @@ task conceptsUtility(type: JavaExec) {
classpath = sourceSets.main.runtimeClasspath
main 'edu.umn.biomedicus.concepts.ConceptsUtility'
}

// Writes the resolved project version, followed by a newline, to
// <buildDir>/version.txt.
// NOTE(review): presumably consumed by the Python packaging step to pick up
// the Java artifact version -- confirm against the caller.
task writeVersion() {
    // The write lives in doLast so it happens only when this task is actually
    // executed. Code placed directly in the task body runs during Gradle's
    // configuration phase, i.e. on every invocation of any task.
    doLast {
        // The build directory does not exist on a fresh checkout or after
        // `clean`; assigning File.text does not create parent directories.
        buildDir.mkdirs()
        new File(buildDir, "version.txt").text = "$version\n"
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -188,15 +188,19 @@ protected void process(@NotNull Document document,
for (TokenResult token : Tokenizer.tokenize(sentenceText)) {
int startIndex = sentence.getStartIndex() + token.getStartIndex();
int endIndex = sentence.getStartIndex() + token.getEndIndex();
tokens.add(GenericLabel.withSpan(startIndex, endIndex).build());
tokens.add(GenericLabel.withSpan(startIndex, endIndex).withDocument(document).build());
}
if (tokens.size() > 0) {
GenericLabel lastToken = tokens.remove(tokens.size() - 1);
if (lastToken.getEndIndex() - lastToken.getStartIndex() > 1) {
CharSequence tokenText = lastToken.getText();
if (Arrays.asList('!', '?', '.').contains(tokenText.charAt(tokenText.length() - 1))) {
tokens.add(GenericLabel.withSpan(lastToken.getStartIndex(), lastToken.getEndIndex() - 1).build());
tokens.add(GenericLabel.withSpan(lastToken.getEndIndex() - 1, lastToken.getEndIndex()).build());
tokens.add(
GenericLabel.withSpan(lastToken.getStartIndex(), lastToken.getEndIndex() - 1)
.withDocument(document).build());
tokens.add(
GenericLabel.withSpan(lastToken.getEndIndex() - 1, lastToken.getEndIndex())
.withDocument(document).build());
} else {
tokens.add(lastToken);
}
Expand Down
6 changes: 4 additions & 2 deletions java/src/main/resources/edu/umn/biomedicus/defaultConfig.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ normalization:
db: ${BIOMEDICUS_DATA}/normalization
inMemory: no
sentences:
hparamsFile: ${BIOMEDICUS_DATA}/sentences/1575655743.708996.yml
modelFile: ${BIOMEDICUS_DATA}/sentences/1575655743.708996.pt
hparamsFile: ${BIOMEDICUS_DATA}/sentences/1576567107.010098.yml
modelFile: ${BIOMEDICUS_DATA}/sentences/1576567107.010098.pt
wordEmbeddings: ${BIOMEDICUS_DATA}/sentences/mimic100.vec
charsFile: ${BIOMEDICUS_DATA}/sentences/chars.txt
tnt:
Expand All @@ -29,3 +29,5 @@ tnt:
db: ${BIOMEDICUS_DATA}/tnt/words
metadata: ${BIOMEDICUS_DATA}/tnt/wordMetadata.yml
beam.threshold: 2.0
data:
data_url: https://github.com/nlpie/biomedicus3/releases/download/v3.0-beta.0/biomedicus-3.0-beta0-umls-data.zip
9 changes: 9 additions & 0 deletions python/biomedicus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from biomedicus import version

__version__ = version.version


def biomedicus_jar():
    """Return, as a string, the path of ``biomedicus-all.jar`` located in the
    same directory as this module file.

    The path is only computed from ``__file__``; the function does not check
    that the jar actually exists.
    """
    import pathlib

    package_dir = pathlib.Path(__file__).parent
    jar_path = package_dir.joinpath('biomedicus-all.jar')
    return str(jar_path)
16 changes: 16 additions & 0 deletions python/biomedicus/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright 2019 Regents of the University of Minnesota.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from biomedicus.cli import main

main()
36 changes: 36 additions & 0 deletions python/biomedicus/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright 2019 Regents of the University of Minnesota.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def main(args=None):
    """Entry point of the ``biomedicus`` command line interface.

    Builds an argument parser with two sub-commands, ``deploy`` and ``run``,
    parses ``args`` and calls the handler registered for the selected
    sub-command with the parsed namespace. If no sub-command is selected, the
    parser's help text is printed instead.

    :param args: Argument list forwarded to ``ArgumentParser.parse_args``.
        With the default ``None``, argparse falls back to ``sys.argv[1:]``.
    """
    from argparse import ArgumentParser
    from biomedicus.deployment.deploy_biomedicus import deployment_parser, deploy
    from biomedicus.pipeline.default_pipeline import default_pipeline_parser, run_default_pipeline

    def show_help(_):
        # Fallback handler used when no sub-command was given.
        parser.print_help()

    parser = ArgumentParser()
    parser.set_defaults(f=show_help)
    commands = parser.add_subparsers()

    # Each sub-command stores its handler under ``f``, overriding the fallback.
    commands.add_parser(
        'deploy',
        parents=[deployment_parser()],
        help='Deploys the default biomedicus pipeline.',
    ).set_defaults(f=deploy)

    commands.add_parser(
        'run',
        parents=[default_pipeline_parser()],
        help="Runs the default biomedicus pipeline on files in a directory.",
    ).set_defaults(f=run_default_pipeline)

    namespace = parser.parse_args(args)
    handler = namespace.f
    # Remove the handler so the namespace passed on holds only parsed options.
    del namespace.f
    handler(namespace)
6 changes: 4 additions & 2 deletions python/biomedicus/defaultConfig.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ normalization:
db: ${BIOMEDICUS_DATA}/normalization
inMemory: no
sentences:
hparamsFile: ${BIOMEDICUS_DATA}/sentences/1575655743.708996.yml
modelFile: ${BIOMEDICUS_DATA}/sentences/1575655743.708996.pt
hparamsFile: ${BIOMEDICUS_DATA}/sentences/1576567107.010098.yml
modelFile: ${BIOMEDICUS_DATA}/sentences/1576567107.010098.pt
wordEmbeddings: ${BIOMEDICUS_DATA}/sentences/mimic100.vec
charsFile: ${BIOMEDICUS_DATA}/sentences/chars.txt
tnt:
Expand All @@ -29,3 +29,5 @@ tnt:
db: ${BIOMEDICUS_DATA}/tnt/words
metadata: ${BIOMEDICUS_DATA}/tnt/wordMetadata.yml
beam.threshold: 2.0
data:
data_url: https://github.com/nlpie/biomedicus3/releases/download/v3.0-beta.0/biomedicus-3.0-beta0-umls-data.zip
Empty file.
Loading

0 comments on commit 6f3ae83

Please sign in to comment.