diff --git a/README.md b/README.md
index 50f456c..cad6773 100644
--- a/README.md
+++ b/README.md
@@ -2,4 +2,3 @@
 Experimental repository aimed at using transformers (such as CovidBERT) and Deep Learning techniques to retrieve and visualize semantic clusters underlying the CORD-19 database.
 
 ![Alt Text](/docs/cord19_semantic_clusters.gif)
-
diff --git a/asreviewcontrib/semantic_clustering/semantic_clustering.py b/asreviewcontrib/semantic_clustering/semantic_clustering.py
deleted file mode 100644
index 97d49af..0000000
--- a/asreviewcontrib/semantic_clustering/semantic_clustering.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright 2021 The ASReview Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# import ASReview
-from tqdm import tqdm
-from asreview.data import ASReviewData
-
-# import numpy
-import numpy as np
-
-# import transformer autotokenizer and automodel
-from transformers import AutoTokenizer, AutoModel
-
-# disable transformer warning
-from transformers import logging
-logging.set_verbosity_error()
-
-# import tqdm
-
-
-def SemanticClustering(asreview_data_object):
-
-    # load data
-    print("Loading data...")
-    data = load_data(asreview_data_object)
-
-    # cut data for testing
-    data = data.iloc[:10, :]
-
-    # load scibert transformer
-    print("Loading scibert transformer...")
-    transformer = 'allenai/scibert_scivocab_uncased'
-
-    # load transformer and tokenizer
-    print("Loading tokenizer and model...")
-    tokenizer = AutoTokenizer.from_pretrained(transformer)
-    model = AutoModel.from_pretrained(transformer)
-
-    # tokenize abstracts and add to data
-    print("Tokenizing abstracts...")
-    data['tokenized'] = data['abstract'].apply(lambda x: tokenizer.encode_plus(
-        x,
-        padding='longest',
-        add_special_tokens=False,
-        return_tensors="pt"))
-
-    # generate embeddings
-    print("Generating embeddings...")
-    data['embeddings'] = data['tokenized'].apply(
-        lambda x: model(**x, output_hidden_states=False)[-1])
-
-    from dim_reduct import run_pca
-    n_components = .98
-    #pca = run_pca(data['embeddings'], n_components)
-
-    print(data['embeddings'][0].detach().numpy())
-
-
-def load_data(asreview_data_object):
-
-    # extract title and abstract, drop empty abstracts and reset index
-    data = asreview_data_object.df[['title', 'abstract']].copy()
-    data['abstract'] = data['abstract'].replace('', np.nan, inplace=False)
-    data.dropna(subset=['abstract'], inplace=True)
-    data = data.reset_index(drop=True)
-
-    return data
-
-
-if __name__ == "__main__":
-    filepath = "https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/datasets/van_de_Schoot_2017/output/van_de_Schoot_2017.csv"
-    SemanticClustering(ASReviewData.from_file(filepath))
diff --git a/docs/semantic_clusters.gif b/docs/semantic_clusters.gif
deleted file mode 100644
index b28701b..0000000
Binary files a/docs/semantic_clusters.gif and /dev/null differ
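
The deleted semantic_clustering.py above embedded abstracts with SciBERT
(allenai/scibert_scivocab_uncased) and took the model's pooled output as one
vector per abstract. For reference, a minimal standalone sketch of that
embedding step, assuming torch and transformers are installed; the example
abstracts are placeholders, not data from the prototype's input dataset:

    import torch
    from transformers import AutoModel, AutoTokenizer

    MODEL = "allenai/scibert_scivocab_uncased"
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    model = AutoModel.from_pretrained(MODEL)
    model.eval()

    # Placeholder abstracts; the prototype read these from an ASReviewData object.
    abstracts = [
        "Posttraumatic stress disorder trajectories after traumatic injury.",
        "Using transformer embeddings to cluster scientific abstracts.",
    ]

    with torch.no_grad():
        # Pad to the longest abstract and truncate at the 512-token model limit.
        inputs = tokenizer(abstracts, padding=True, truncation=True,
                           return_tensors="pt")
        # The prototype's `model(**x)[-1]` is the pooler output: one
        # 768-dimensional vector per abstract.
        embeddings = model(**inputs).pooler_output

    print(embeddings.shape)  # torch.Size([2, 768])

Unlike the prototype's per-abstract encode_plus(..., add_special_tokens=False)
calls, this sketch batches the abstracts, keeps the [CLS]/[SEP] tokens that
BERT's pooler expects, and truncates sequences longer than the model limit,
which the deleted script did not do.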