Skip to content

Commit

Permalink
Updated to fix concept detection by normalized bag of words (#59)
Browse files Browse the repository at this point in the history
Also fixed an issue with deployment test not exiting normally.
  • Loading branch information
benknoll-umn authored May 18, 2020
1 parent 9c4c9e2 commit 74e237e
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 37 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import static edu.umn.biomedicus.common.pos.PartOfSpeech.*;

Expand Down Expand Up @@ -94,18 +96,6 @@ public class DictionaryConceptDetector extends DocumentProcessor {

private static final Set<PartOfSpeech> TRIVIAL_POS = buildTrivialPos();

private static final int SPAN_SIZE = 5;

private final ConceptDictionary conceptDictionary;

@Nullable
private final NormalizerModel normalizerModel;

DictionaryConceptDetector(ConceptDictionary conceptDictionary, @Nullable NormalizerModel normalizerModel) {
this.conceptDictionary = conceptDictionary;
this.normalizerModel = normalizerModel;
}

private static Set<PartOfSpeech> buildTrivialPos() {
Set<PartOfSpeech> builder = new HashSet<>();
Collections.addAll(builder,
Expand All @@ -126,6 +116,28 @@ private static Set<PartOfSpeech> buildTrivialPos() {
return Collections.unmodifiableSet(builder);
}

private static final Set<String> STOPWORDS = buildStopwords();

/**
 * Builds the stopword set that backs {@code STOPWORDS}.
 *
 * @return an unmodifiable set containing the fixed list of stopwords
 */
private static Set<String> buildStopwords() {
  // Collected into a HashSet first so membership checks are hash-based, then wrapped read-only.
  Set<String> stopwords = new HashSet<>(Arrays.asList(
      "a", "of", "and", "with", "for", "nos", "to", "in", "by", "on", "the"));
  return Collections.unmodifiableSet(stopwords);
}

private static final Pattern PUNCT = Pattern.compile("[\\p{Punct}]+");

private static final int SPAN_SIZE = 5;

private final ConceptDictionary conceptDictionary;

@Nullable
private final NormalizerModel normalizerModel;

/**
 * Creates a detector that stores the given concept dictionary and optional normalizer model.
 *
 * @param conceptDictionary the dictionary this detector keeps for its concept lookups
 * @param normalizerModel the normalizer model to keep; may be {@code null}
 */
DictionaryConceptDetector(ConceptDictionary conceptDictionary, @Nullable NormalizerModel normalizerModel) {
this.conceptDictionary = conceptDictionary;
this.normalizerModel = normalizerModel;
}

public static @NotNull DictionaryConceptDetector createConceptDetector(
@NotNull ConceptsOptions conceptsOptions
) throws IOException, RocksDBException {
Expand Down Expand Up @@ -200,6 +212,14 @@ public static class ConceptsOptions extends ProcessorServer.Builder {
)
private boolean inMemory;

@Option(
name = "--check-norm-forms",
metaVar = "BOOL",
handler = ExplicitBooleanOptionHandler.class,
usage = "Whether to check normalized bags of words for concepts"
)
private boolean checkNormForms;

@Option(
name = "--normalize-locally",
metaVar = "BOOL",
Expand Down Expand Up @@ -354,23 +374,26 @@ public void run() {
int editedEnd = editedSentenceTokens.get(from + subsetSize - 1).getEndIndex();
String editedSubstring = editedSentenceText.substring(editedBegin, editedEnd);
if (checkPhrase(entire, editedSubstring, subsetSize == 1, .1)) {
continue;
}
continue;
}
}

if (windowSubset.size() <= 1) {
continue;
}

List<String> windowNorms = new ArrayList<>(sentenceNorms.subList(from, to));
windowNorms.sort(Comparator.naturalOrder());
StringBuilder queryStringBuilder = new StringBuilder();
for (String windowNorm : windowNorms) {
queryStringBuilder.append(windowNorm);
}
List<ConceptRow> normsCUI = conceptDictionary.forNorms(queryStringBuilder.toString());
if (normsCUI != null) {
labelTerm(entire, normsCUI, .3);
String newNorm = sentenceNorms.get(from + subsetSize - 1);
if (!STOPWORDS.contains(newNorm) && !PUNCT.matcher(newNorm).matches()) {
List<String> windowNorms = new ArrayList<>(sentenceNorms.subList(from, from + subsetSize));
windowNorms.sort(Comparator.naturalOrder());
windowNorms = windowNorms.stream().filter(x -> !STOPWORDS.contains(x))
.filter(x -> !PUNCT.matcher(x).matches())
.collect(Collectors.toList());
String queryString = String.join(" ", windowNorms);
List<ConceptRow> normsCUI = conceptDictionary.forNorms(queryString);
if (normsCUI != null) {
labelTerm(entire, normsCUI, .3);
}
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions python/biomedicus/deployment/deploy_biomedicus.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,8 +146,8 @@ def deploy(conf):
sleep(60 * 60 * 24)
except KeyboardInterrupt:
print("Shutting down all processors")
for p in processes:
p.wait()
for listener in process_listeners:
listener.join(timeout=1)

print("Done shutting down all processors")

Expand Down
2 changes: 1 addition & 1 deletion python/biomedicus/pipeline/default_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,5 +191,5 @@ def source():
doc = e.create_document('plaintext', txt)
yield doc

default_pipeline.pipeline.run_multithread(source(), total=total)
default_pipeline.pipeline.run_multithread(source(), total=total, n_threads=conf.threads)
default_pipeline.pipeline.print_times()
18 changes: 10 additions & 8 deletions python/tests/dependencies/test_dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,17 @@ def fixture_dependencies_service(events_service, processor_watcher, processor_ti
try:
existing_address = os.environ['DEPENDENCIES_ADDRESS']
yield existing_address
return
except KeyError:
port = str(find_free_port())
address = '127.0.0.1:' + port
p = subprocess.Popen(['python', '-m', 'biomedicus.dependencies.stanza_parser',
'-p', port,
'--events', events_service],
start_new_session=True, stdin=subprocess.PIPE,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
yield from processor_watcher(address, p, timeout=processor_timeout)
pass
port = str(find_free_port())
address = '127.0.0.1:' + port
p = subprocess.Popen(['python', '-m', 'biomedicus.dependencies.stanza_parser',
'-p', port,
'--events', events_service],
start_new_session=True, stdin=subprocess.PIPE,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
yield from processor_watcher(address, p, timeout=processor_timeout)


def uas_equal(x, y):
Expand Down
7 changes: 4 additions & 3 deletions python/tests/deployment/test_deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import signal
import threading
from pathlib import Path
Expand All @@ -23,7 +24,7 @@

@pytest.fixture(name='deploy_all')
def fixture_deploy_all():
p = Popen(['python', '-m', 'biomedicus', 'deploy'], stdout=PIPE, stderr=STDOUT)
p = Popen(['python', '-m', 'biomedicus', 'deploy'], start_new_session=True, stdout=PIPE, stderr=STDOUT)

e = threading.Event()

Expand All @@ -39,15 +40,15 @@ def listen(p, e):
listener.start()
e.wait()
yield p
p.send_signal(signal.SIGINT)
os.killpg(p.pid, signal.SIGINT)
listener.join()


def test_deploy_run(deploy_all):
print("testing deployment")
with TemporaryDirectory() as tmpdir:
code = call(['python', '-m', 'biomedicus', 'run', str(Path(__file__).parent / 'in'),
tmpdir])
tmpdir],)
assert code == 0
with YamlSerializer.file_to_event(Path(tmpdir) / '97_204.txt.json') as event:
document = event.documents['plaintext']
Expand Down

0 comments on commit 74e237e

Please sign in to comment.