Skip to content

Commit

Permalink
Merge branch 'main-pipeline' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
jteijema authored Nov 5, 2021
2 parents d428538 + 7a1b351 commit 9811aaa
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
Experimental repository aimed at using transformers (such as CovidBERT) and Deep Learning techniques to retrieve and visualize semantic clusters underlying the CORD-19 database.

![Alt Text](/docs/cord19_semantic_clusters.gif)

83 changes: 83 additions & 0 deletions asreviewcontrib/semantic_clustering/semantic_clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright 2021 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# import ASReview
from tqdm import tqdm
from asreview.data import ASReviewData

# import numpy
import numpy as np

# import transformer autotokenizer and automodel
from transformers import AutoTokenizer, AutoModel

# disable transformer warning
from transformers import logging
logging.set_verbosity_error()

# import tqdm


def SemanticClustering(asreview_data_object):
    """Compute SciBERT embeddings for the abstracts in an ASReview dataset.

    NOTE(review): work in progress — the dimensionality-reduction /
    clustering steps are not implemented yet, so the pipeline currently
    stops after generating one embedding per abstract and printing the
    first embedding for inspection.

    Arguments
    ---------
    asreview_data_object: asreview.data.ASReviewData
        Dataset object whose ``df`` attribute holds at least a ``title``
        and an ``abstract`` column.
    """
    # load title/abstract data, dropping records without an abstract
    print("Loading data...")
    data = load_data(asreview_data_object)

    # TODO: leftover test truncation — remove once the full pipeline
    # can handle the complete dataset.
    data = data.iloc[:10, :]

    # SciBERT: BERT pretrained on scientific text, a good fit for
    # embedding paper abstracts.
    print("Loading scibert transformer...")
    transformer = 'allenai/scibert_scivocab_uncased'

    # load transformer and tokenizer (downloads weights on first use)
    print("Loading tokenizer and model...")
    tokenizer = AutoTokenizer.from_pretrained(transformer)
    model = AutoModel.from_pretrained(transformer)

    # tokenize each abstract individually; padding='longest' is a no-op
    # for a single sequence but harmless.
    print("Tokenizing abstracts...")
    data['tokenized'] = data['abstract'].apply(
        lambda abstract: tokenizer.encode_plus(
            abstract,
            padding='longest',
            add_special_tokens=False,
            return_tensors="pt"))

    # run the model per abstract; indexing the output with [-1] takes the
    # last element of the model output tuple (the pooled representation
    # when output_hidden_states=False) — presumably intended as the
    # document embedding; TODO confirm against the transformers API.
    print("Generating embeddings...")
    data['embeddings'] = data['tokenized'].apply(
        lambda tokens: model(**tokens, output_hidden_states=False)[-1])

    # debug output: show the first embedding as a numpy array
    print(data['embeddings'][0].detach().numpy())


def load_data(asreview_data_object):
    """Extract title/abstract columns, keeping only rows with an abstract.

    Rows whose abstract is missing (NaN) or an empty string are removed,
    and the index is renumbered from zero.
    """
    frame = asreview_data_object.df[['title', 'abstract']].copy()

    # keep only rows whose abstract is a non-empty value
    non_empty = frame['abstract'] != ''
    frame = frame[non_empty].dropna(subset=['abstract'])

    return frame.reset_index(drop=True)


if __name__ == "__main__":
    # Demo entry point: run the clustering pipeline on a public example
    # dataset from the ASReview systematic-review-datasets repository.
    dataset_url = "https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/datasets/van_de_Schoot_2017/output/van_de_Schoot_2017.csv"
    SemanticClustering(ASReviewData.from_file(dataset_url))
Binary file added docs/semantic_clusters.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 9811aaa

Please sign in to comment.