diff --git a/README.md b/README.md
index cad6773..50f456c 100644
--- a/README.md
+++ b/README.md
@@ -2,3 +2,4 @@
 Experimental repository aimed at using transformers (such as CovidBERT) and Deep Learning techniques to retrieve and visualize semantic clusters underlying the CORD-19 database.
 
 ![Alt Text](/docs/cord19_semantic_clusters.gif)
+
diff --git a/asreviewcontrib/semantic_clustering/semantic_clustering.py b/asreviewcontrib/semantic_clustering/semantic_clustering.py
new file mode 100644
index 0000000..97d49af
--- /dev/null
+++ b/asreviewcontrib/semantic_clustering/semantic_clustering.py
@@ -0,0 +1,85 @@
+# Copyright 2021 The ASReview Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# import ASReview
+from asreview.data import ASReviewData
+
+# import numpy
+import numpy as np
+
+# import transformer autotokenizer and automodel
+from transformers import AutoTokenizer, AutoModel
+
+# disable transformer warnings
+from transformers import logging
+logging.set_verbosity_error()
+
+
+def SemanticClustering(asreview_data_object):
+
+    # load data
+    print("Loading data...")
+    data = load_data(asreview_data_object)
+
+    # cut data down to the first 10 rows for testing
+    data = data.iloc[:10, :]
+
+    # select the scibert transformer
+    print("Loading scibert transformer...")
+    transformer = 'allenai/scibert_scivocab_uncased'
+
+    # load transformer and tokenizer
+    print("Loading tokenizer and model...")
+    tokenizer = AutoTokenizer.from_pretrained(transformer)
+    model = AutoModel.from_pretrained(transformer)
+
+    # tokenize abstracts and add them to the dataframe; truncate to the
+    # model's 512-token limit so long abstracts do not crash the model
+    print("Tokenizing abstracts...")
+    data['tokenized'] = data['abstract'].apply(lambda x: tokenizer.encode_plus(
+        x,
+        add_special_tokens=True,
+        truncation=True,
+        max_length=512,
+        return_tensors="pt"))
+
+    # generate embeddings; the last element of the model output is the
+    # pooled [CLS] representation, one fixed-size vector per abstract
+    print("Generating embeddings...")
+    data['embeddings'] = data['tokenized'].apply(
+        lambda x: model(**x, output_hidden_states=False)[-1])
+
+    # dimensionality reduction is not hooked up yet
+    # from dim_reduct import run_pca
+    # n_components = .98
+    # pca = run_pca(data['embeddings'], n_components)
+
+    # inspect the first embedding
+    print(data['embeddings'][0].detach().numpy())
+
+
+def load_data(asreview_data_object):
+
+    # extract title and abstract, drop empty abstracts and reset index
+    data = asreview_data_object.df[['title', 'abstract']].copy()
+    data['abstract'] = data['abstract'].replace('', np.nan)
+    data.dropna(subset=['abstract'], inplace=True)
+    data = data.reset_index(drop=True)
+
+    return data
+
+
+if __name__ == "__main__":
+    filepath = "https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/datasets/van_de_Schoot_2017/output/van_de_Schoot_2017.csv"
+    SemanticClustering(ASReviewData.from_file(filepath))
diff --git a/docs/semantic_clusters.gif b/docs/semantic_clusters.gif
new file mode 100644
index 0000000..b28701b
Binary files /dev/null and b/docs/semantic_clusters.gif differ
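
Note on the embedding step: a minimal sketch of how a single abstract becomes a fixed-size vector, under the same assumptions as the new module (the 'allenai/scibert_scivocab_uncased' checkpoint); the example abstract string is invented for illustration.

import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')

# invented example abstract, for illustration only
abstract = "Posterior predictive checks are a tool for Bayesian model evaluation."

# tokenize with special tokens and truncate to the 512-token limit
inputs = tokenizer.encode_plus(
    abstract,
    add_special_tokens=True,
    truncation=True,
    max_length=512,
    return_tensors="pt")

# no gradients are needed for pure inference
with torch.no_grad():
    outputs = model(**inputs)

# pooler_output is the transformed [CLS] token: shape (1, 768)
embedding = outputs.pooler_output
print(embedding.shape)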
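
The commented-out run_pca call refers to a dim_reduct module that is not part of this diff, so the following is a hypothetical sketch of what it might do, using scikit-learn (an assumption, not the actual module). With 0 < n_components < 1, scikit-learn keeps as many components as are needed to explain that fraction of the variance (98% here), which matches the n_components = .98 placeholder above.

import numpy as np
from sklearn.decomposition import PCA

def run_pca(embeddings, n_components):
    # stack the (1, 768) pooler outputs into one (n_docs, 768) matrix
    matrix = np.vstack([e.detach().numpy() for e in embeddings])
    # a float n_components keeps enough components to explain
    # that fraction of the total variance
    pca = PCA(n_components=n_components)
    return pca.fit_transform(matrix)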