diff --git a/asreviewcontrib/semantic_clustering/build.py b/asreviewcontrib/semantic_clustering/build.py
deleted file mode 100644
index 08ebe92..0000000
--- a/asreviewcontrib/semantic_clustering/build.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# imports
-# System stuff
-import os
-import sys
-import json
-import pickle
-from shutil import rmtree
-
-# Numerical / data imports
-import numpy as np
-import pandas as pd
-
-# Torch-y stuff
-import torch
-
-# Transformers
-from transformers import AutoTokenizer, AutoModelWithLMHead#, AutoModelForMaskedLM
-from transformers import BertTokenizer, BertModel
-from sentence_transformers import SentenceTransformer, models
-
-# Own functions
-from load_data import load_from_parses, load_from_json, load_dataframe
-
-def generate_embeddings(model, tokenizer, df, use_covidbert=False):
-    """Function that generates (CovidBERT) embeddings
-    Args:
-        model: The (transformer) model to be used, e.g. CovidBERT
-        tokenizer: Tokenizer corresponding to the model used
-        df: DataFrame containing parsed data from the CORD-19 document parses
-        use_covidbert: (bool) To set whether we use covidbert or regular BERT
-    Returns:
-        embeddings: Contextualized embeddings from the specified model
-    """
-
-    # Path structure
-    if not os.path.exists("data"):
-        os.makedirs("data")
-    embs_path = os.path.join("data","embs")
-    if not os.path.exists(embs_path):
-        os.makedirs(embs_path)
-
-    # Only use ones without missing abstracts
-    # (Effectively circumvented using titles instead while building abstracts)
-    miss_abs = df[df['Abstract'].isnull()]
-    no_miss_abs = df.drop(miss_abs.index)
-
-    for i, abstract in enumerate(no_miss_abs['Abstract']):
-
-        # Only do it for first 2000 for testing purposes
-        if i > 1999:
-            break
-
-        # Get cord uid and title for article
-        cord_uid = no_miss_abs.iloc[i,0]
-        title = no_miss_abs.iloc[i,1]
-
-        if i % 10 == 0:
-            print(f"Abstract: {i:7d}, cord_uid {cord_uid}")
-
-        # In case we want to use CovidBERT
-        if use_covidbert:
-
-            """"Add preprocessing for tokens instead of split"""
-            abstract = abstract.split(" ")
-            outputs = model.encode(abstract)
-
-        # Use Regular BERT instead
-        else:
-
-            # Use (BERT Tokenizer and get outputs tuple
-            tokenized = tokenizer.encode(abstract, return_tensors="pt")
-            outputs = model(tokenized)
-
-            # Retrieve last hidden states and CLS token
-            #last_hidden_states = outputs[0]
-            cls_token = outputs[1]
-
-        # Write single CLS token to file to prevent RAM build-up
-        # Cast to np if true
-        to_numpy = True
-        if to_numpy:
-            cls_token = cls_token.detach().numpy()
-        embs_file = os.path.join("data","embs", str(cord_uid)+".pickle")
-        with open(embs_file, "wb+") as file:
-            pickle.dump(cls_token, file)
-
-    print("Did I encode all abstracts and save pickle?")
-
-def load_model(use_covidbert=False):
-    """Function that loads and returns the CovidBERT model"""
-
-    # # Load CovidBERT
-    # if use_covidbert:
-    #     print("Loading model...")
-    #     model = AutoModelForMaskedLM.from_pretrained("deepset/covid_bert_base")
-    #     print("Loading tokenizer...")
-    #     tokenizer = AutoTokenizer.from_pretrained("deepset/covid_bert_base")
-    #     print("Finished loading the model successfully!")
-
-    #model = SentenceTransformer(model_path)
-
-    # #Load CovidBERT
-    # if use_covidbert:
-    #     print("Loading model...")
-    #     model = AutoModelWithLMHead.from_pretrained("manueltonneau/clinicalcovid-bert-nli")
-    #     print("Loading tokenizer...")
-    #     print("\n")
-    #     tokenizer = AutoTokenizer.from_pretrained("manueltonneau/clinicalcovid-bert-nli")
-    #     print("\n")
-    #     print("Finished loading the model successfully!")
-
-    #     # Save the model to model path
-    #     model_path = os.path.join("models","clinicalcovid")
-    #     if not os.path.exists(model_path):
-    #         os.makedirs(model_path)
-    #     model.save_pretrained(model_path)
-    #     tokenizer.save_pretrained(model_path)
-
-    #     model = SentenceTransformer(model_path)
-
-    # Load CovidBERT
-    if use_covidbert:
-        print("Loading model...")
-        model = AutoModelWithLMHead.from_pretrained("gsarti/covidbert-nli")
-        print("Loading tokenizer...")
-        print("\n")
-        tokenizer = AutoTokenizer.from_pretrained("gsarti/covidbert-nli")
-        print("\n")
-        print("Finished loading the model successfully!")
-
-        # Save the model to model path
-        model_path = os.path.join("models","gsarticovid")
-        if not os.path.exists(model_path):
-            os.makedirs(model_path)
-        model.save_pretrained(model_path)
-        tokenizer.save_pretrained(model_path)
-        print(f"Successfully saved model to {model_path}")
-
-        print("Loading Sentence Transformer now!")
-        word_embedding_model = models.BERT(
-            model_path,
-            # max_seq_length=args.max_seq_length,
-            # do_lower_case=args.do_lower_case
-        )
-        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
-                                       pooling_mode_mean_tokens=True,
-                                       pooling_mode_cls_token=False,
-                                       pooling_mode_max_tokens=False)
-        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
-        rmtree(model_path)
-        model.save(model_path)
-        print("Finished building Sentence Transformer!")
-
-    # Load regular BERT
-    else:
-        print("Loading BERT")
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertModel.from_pretrained('bert-base-uncased')
-        print("Finished loading BERT")
-
-    return model, tokenizer
-
-if __name__ == "__main__":
-
-    # First check if we have the right folder structure
-    if not os.path.exists("data"):
-        os.makedirs("data")
-
-    # Whether we use CovidBERT or normal BERT
-    use_covidbert = False
-
-    # Load model and tokenizer
-    model, tokenizer = load_model(use_covidbert=use_covidbert)
-
-    # Use bulky loader if we don't have the cord19.json yet
-    cord19_json_path = os.path.join("data", "cord19.json")
-    if not os.path.exists(cord19_json_path):
-        load_from_parses()
-
-    # Load the file from the created json
-    data = load_from_json(cord19_json_path)
-
-    # Load dataframe if csv exists, otherwise create it
-    df = load_dataframe(data)
-
-    # If embeddings don't exist, create them
-    embs_path = os.path.join("data","embs")
-    if not os.path.exists(embs_path):
-        os.makedirs(embs_path)
-
-    if len(os.listdir(embs_path)) == 0:
-        generate_embeddings(model, tokenizer, df, use_covidbert=use_covidbert)
\ No newline at end of file
diff --git a/asreviewcontrib/semantic_clustering/load_data.py b/asreviewcontrib/semantic_clustering/load_data.py
deleted file mode 100644
index c93094e..0000000
--- a/asreviewcontrib/semantic_clustering/load_data.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# System / os stuff
-import csv
-import os
-import json
-
-# Collections
-from collections import defaultdict
-import pandas as pd
-
-def load_dataframe(data):
-    """Function that creates a DataFrame from the json if there is no csv containing it"""
-
-    # Either csv already exists and we can simply load it
-    df_path = os.path.join("data","cord19_df.csv")
-    if os.path.exists(df_path):
-        print("Loading DataFrame...")
-        df = pd.read_csv(df_path)
-
-    # Or..
-    else:
-        print("Creating DataFrame...")
-        # Create a DataFrame instead
-        d = []
-
-        # JSON data contains dictionaries in a list for each entry
-        for _, (_,val) in enumerate(data.items()):
-            val_dict = val[0]
-            cord_uid = val_dict['cord_uid']
-            title = val_dict['title']
-            abstract = val_dict['abstract']
-            intro = val_dict['introduction']
-
-            d.append((cord_uid,title,abstract,intro))
-
-        # Turn list of tuples into DataFrame and write it to a CSV
-        df = pd.DataFrame(d, columns=('cord_uid','Title', 'Abstract', 'Introduction'))
-        df.to_csv(df_path, index=False)
-
-    return df
-
-def load_from_json(cord19_json_path):
-    """Function that loads the data from a saved cord19.json file"""
-    if os.path.exists(cord19_json_path):
-        with open(cord19_json_path) as json_file:
-            data = json.load(json_file)
-    else:
-        raise ValueError("The provided path does not exist! Please use load_from_parses() to create the json file.")
-    return data
-
-def load_from_parses():
-    """Function to load the CORD-19 dataset from the provided JSONS"""
-
-    cord_uid_to_text = defaultdict(list)
-
-    # open the file
-    if not os.path.exists("data"):
-        os.makedirs("data")
-    metadata = os.path.join("data", "metadata.csv")
-
-    with open(metadata) as f_in:
-        reader = csv.DictReader(f_in)
-        for i, row in enumerate(reader):
-
-            # access some metadata
-            cord_uid = row['cord_uid']
-            title = row['title']
-            abstract = row['abstract']
-            #authors = row['authors'].split('; ')
-
-            # Abstracts are quite big, so cut them
-            abstract = abstract.split(" ")
-            if len(abstract) > 200:
-                abstract = abstract[:200]
-            abstract = " ".join(abstract)
-
-            # # If we don't have an abstract, use title
-            # if len(abstract) < 5:
-            #     abstract = title
-
-            # access the full text (if available) for Intro
-            introduction = []
-            if row['pdf_json_files']:
-                for json_path in row['pdf_json_files'].split('; '):
-
-                    # Data is saved in "data" folder, so navigate there instead
-                    json_path = os.path.join("data", json_path)
-                    with open(json_path) as f_json:
-                        full_text_dict = json.load(f_json)
-
-                        # grab introduction section from *some* version of the full text
-                        for paragraph_dict in full_text_dict['body_text']:
-                            paragraph_text = paragraph_dict['text']
-                            section_name = paragraph_dict['section']
-                            if 'intro' in section_name.lower():
-                                introduction.append(paragraph_text)
-
-                        # stop searching other copies of full text if already got introduction
-                        if introduction:
-                            break
-            if i % 100 == 0:
-                print(f"At row {i} now!")
-
-            # save for later usage
-            cord_uid_to_text[cord_uid].append({
-                'cord_uid': cord_uid,
-                'title': title,
-                'abstract': abstract,
-                'introduction': introduction
-            })
-
-    print(type(cord_uid_to_text))
-
-    # Save the full CORD dataset as one json file
-    data_path = os.path.join("data","cord19.json")
-    with open(data_path, 'w') as fp:
-        json.dump(cord_uid_to_text, fp)
diff --git a/asreviewcontrib/semantic_clustering/semantic_clustering.py b/asreviewcontrib/semantic_clustering/semantic_clustering.py
index f9913aa..97d49af 100644
--- a/asreviewcontrib/semantic_clustering/semantic_clustering.py
+++ b/asreviewcontrib/semantic_clustering/semantic_clustering.py
@@ -26,7 +26,7 @@ from transformers import logging
 
 logging.set_verbosity_error()
 
-#import tqdm
+# import tqdm
 
 
 def SemanticClustering(asreview_data_object):
@@ -49,13 +49,22 @@ def SemanticClustering(asreview_data_object):
 
     # tokenize abstracts and add to data
     print("Tokenizing abstracts...")
-    data['tokenized'] = data['abstract'].apply(lambda x: tokenizer.encode(
+    data['tokenized'] = data['abstract'].apply(lambda x: tokenizer.encode_plus(
         x,
         padding='longest',
-        add_special_tokens=True,
+        add_special_tokens=False,
         return_tensors="pt"))
 
-    print(data)
+    # generate embeddings
+    print("Generating embeddings...")
+    data['embeddings'] = data['tokenized'].apply(
+        lambda x: model(**x, output_hidden_states=False)[-1])
+
+    from dim_reduct import run_pca
+    n_components = .98
+    #pca = run_pca(data['embeddings'], n_components)
+
+    print(data['embeddings'][0].detach().numpy())
 
 
 def load_data(asreview_data_object):
diff --git a/asreviewcontrib/semantic_clustering/update_df.py b/asreviewcontrib/semantic_clustering/update_df.py
deleted file mode 100644
index c925def..0000000
--- a/asreviewcontrib/semantic_clustering/update_df.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# imports
-
-# System
-import os
-
-# data
-import numpy as np
-import pandas as pd
-
-# Self
-from load_data import load_from_json, load_dataframe, load_from_parses
-
-############################# READ ME #############################
-##### This is a file with several functions that was used to ######
-##### recombine a saved dataFrame with titles and abstracts. ######
-###################################################################
-
-def update_titles():
-    """Function to add titles to the kmeans df"""
-
-    # Use bulky loader if we don't have the cord19.json yet
-    cord19_json_path = os.path.join("data", "cord19.json")
-    if not os.path.exists(cord19_json_path):
-        load_from_parses()
-
-    # Load the file from the created json
-    data = load_from_json(cord19_json_path)
-
-    # Load dataframe if csv exists, otherwise create it
-    df = load_dataframe(data)
-    df = df.iloc[:2000,:-1]
-
-    # Load other df
-    kmeans_df_path = os.path.join("data","dataframes","kmeans_df.csv")
-    kmeans_df = pd.read_csv(kmeans_df_path)
-
-    # Retrieve titles
-    titles = []
-    for i, cord_uid in enumerate(kmeans_df['cord_uid']):
-        for df_uid in df.cord_uid.values:
-            if cord_uid == df_uid:
-                title = df[df.cord_uid == df_uid].Title.values.tolist()
-                titles.append(title)
-
-    flatten = [title[0] for title in titles]
-    kmeans_df['Title'] = np.array(flatten)
-    print(kmeans_df.head())
-    print(kmeans_df.columns)
-
-    kmeans_df.to_csv(kmeans_df_path,index=None)
-
-
-def update_abstracts():
-    """Function to add abstracts to the kmeans df"""
-
-    # Use bulky loader if we don't have the cord19.json yet
-    cord19_json_path = os.path.join("data", "cord19.json")
-    if not os.path.exists(cord19_json_path):
-        load_from_parses()
-
-    # Load the file from the created json
-    data = load_from_json(cord19_json_path)
-
-    # Load dataframe if csv exists, otherwise create it
-    df = load_dataframe(data)
-    df = df.iloc[:2000,:-1]
-
-    print(df.head())
-    print(df.columns)
-    print(df.shape)
-    print("\n\n")
-
-    exit()
-
-    # Load other df
-    kmeans_df_path = os.path.join("data","dataframes","kmeans_df.csv")
-    kmeans_df = pd.read_csv(kmeans_df_path)
-
-    print(kmeans_df.head())
-    print(kmeans_df.columns)
-    print(kmeans_df.shape)
-
-    # print()
-    # print(kmeans_df[kmeans_df['cord_uid'] == "6c6cw80p"])
-    # print(kmeans_df[kmeans_df['cord_uid'] == "ug7v899j"])
-
-    abstracts = []
-    for i,cord_uid in enumerate(kmeans_df['cord_uid']):
-        for df_uid in df.cord_uid.values:
-            if cord_uid == df_uid:
-                abstract = df[df.cord_uid == df_uid].Abstract.values.tolist()
-                abstracts.append(abstract)
-    #print(abstracts[:10])
-
-    flatten = [absy[0] for absy in abstracts]
-    kmeans_df['Abstract'] = np.array(flatten)
-    print(kmeans_df.head())
-    print(kmeans_df.columns)
-    print(kmeans_df[kmeans_df.cord_uid == "ug7v899j"].Abstract)
-
-    kmeans_df.to_csv(kmeans_df_path,index=None)
-
-if __name__ == "__main__":
-    #update_abstracts()
-    #update_titles()
-    pass
\ No newline at end of file
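Note on the new embedding step in semantic_clustering.py: the hunk above tokenizes each abstract with encode_plus, keeps the pooled transformer output as the document embedding, and hints at a PCA reduction via the still commented-out run_pca helper from dim_reduct. A minimal standalone sketch of that flow follows; it assumes bert-base-uncased as the encoder (as in the deleted build.py), uses scikit-learn's PCA as a stand-in for run_pca, and feeds placeholder abstracts instead of an ASReview data object.

# Minimal sketch of the embedding + PCA flow introduced in this diff.
# Assumptions: bert-base-uncased as the encoder, sklearn PCA standing in for
# the run_pca helper from dim_reduct, placeholder abstracts as input.
import numpy as np
import torch
from sklearn.decomposition import PCA
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

abstracts = [
    "A placeholder abstract about coronavirus transmission.",
    "A placeholder abstract about screening prioritisation.",
    "A placeholder abstract about clustering scientific papers.",
]

embeddings = []
with torch.no_grad():
    for text in abstracts:
        # mirrors the diff: encode_plus without special tokens, PyTorch tensors
        inputs = tokenizer.encode_plus(
            text,
            padding='longest',
            add_special_tokens=False,
            return_tensors="pt")
        # the last element of the model output is the pooled representation
        pooled = model(**inputs, output_hidden_states=False)[-1]
        embeddings.append(pooled.squeeze(0).numpy())

embeddings = np.stack(embeddings)  # shape: (n_docs, hidden_size)

# keep enough components to explain ~98% of the variance (n_components = .98)
pca = PCA(n_components=0.98)
reduced = pca.fit_transform(embeddings)
print(reduced.shape)

The diff itself only imports run_pca and leaves the call commented out, so the PCA step here is illustrative rather than the plugin's actual reduction code.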