From e417a09548950bf3880f1448404706aa6d97d261 Mon Sep 17 00:00:00 2001 From: Jelle Teijema Date: Fri, 5 Nov 2021 09:45:30 +0100 Subject: [PATCH 1/5] Structure environment as ASReview plugin Does not yet include any changes to code --- README.md | 2 +- .../semantic_clustering/build.py | 0 .../semantic_clustering/clustering.py | 0 .../semantic_clustering/dim_reduct.py | 0 .../semantic_clustering/inspect_data.ipynb | 0 .../semantic_clustering/interactive.py | 0 .../semantic_clustering/load_data.py | 0 .../semantic_clustering/update_df.py | 0 semantic_clusters.gif => docs/semantic_clusters.gif | Bin 9 files changed, 1 insertion(+), 1 deletion(-) rename build.py => asreviewcontrib/semantic_clustering/build.py (100%) rename clustering.py => asreviewcontrib/semantic_clustering/clustering.py (100%) rename dim_reduct.py => asreviewcontrib/semantic_clustering/dim_reduct.py (100%) rename inspect_data.ipynb => asreviewcontrib/semantic_clustering/inspect_data.ipynb (100%) rename interactive.py => asreviewcontrib/semantic_clustering/interactive.py (100%) rename load_data.py => asreviewcontrib/semantic_clustering/load_data.py (100%) rename update_df.py => asreviewcontrib/semantic_clustering/update_df.py (100%) rename semantic_clusters.gif => docs/semantic_clusters.gif (100%) diff --git a/README.md b/README.md index 2b78f71..57dcdd3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ # Semantic Clusters Experimental repository aimed at using transformers (such as CovidBERT) and Deep Learning techniques to retrieve and visualize semantic clusters underlying the CORD-19 database. -![Alt Text](https://github.com/asreview/semantic-clusters/blob/master/semantic_clusters.gif) \ No newline at end of file +![Alt Text](https://github.com/asreview/semantic-clusters/blob/master/docs/semantic_clusters.gif) \ No newline at end of file diff --git a/build.py b/asreviewcontrib/semantic_clustering/build.py similarity index 100% rename from build.py rename to asreviewcontrib/semantic_clustering/build.py diff --git a/clustering.py b/asreviewcontrib/semantic_clustering/clustering.py similarity index 100% rename from clustering.py rename to asreviewcontrib/semantic_clustering/clustering.py diff --git a/dim_reduct.py b/asreviewcontrib/semantic_clustering/dim_reduct.py similarity index 100% rename from dim_reduct.py rename to asreviewcontrib/semantic_clustering/dim_reduct.py diff --git a/inspect_data.ipynb b/asreviewcontrib/semantic_clustering/inspect_data.ipynb similarity index 100% rename from inspect_data.ipynb rename to asreviewcontrib/semantic_clustering/inspect_data.ipynb diff --git a/interactive.py b/asreviewcontrib/semantic_clustering/interactive.py similarity index 100% rename from interactive.py rename to asreviewcontrib/semantic_clustering/interactive.py diff --git a/load_data.py b/asreviewcontrib/semantic_clustering/load_data.py similarity index 100% rename from load_data.py rename to asreviewcontrib/semantic_clustering/load_data.py diff --git a/update_df.py b/asreviewcontrib/semantic_clustering/update_df.py similarity index 100% rename from update_df.py rename to asreviewcontrib/semantic_clustering/update_df.py diff --git a/semantic_clusters.gif b/docs/semantic_clusters.gif similarity index 100% rename from semantic_clusters.gif rename to docs/semantic_clusters.gif From 585d3abc7a44a5c180da0ef76918823ebf5cc1fb Mon Sep 17 00:00:00 2001 From: Jelle Date: Fri, 5 Nov 2021 09:47:17 +0100 Subject: [PATCH 2/5] Fix image with relative link instead of direct. 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 57dcdd3..5b2cd0c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ # Semantic Clusters Experimental repository aimed at using transformers (such as CovidBERT) and Deep Learning techniques to retrieve and visualize semantic clusters underlying the CORD-19 database. -![Alt Text](https://github.com/asreview/semantic-clusters/blob/master/docs/semantic_clusters.gif) \ No newline at end of file +![Alt Text](/docs/semantic_clusters.gif) From ccdfdf7066f01a3e5583f8f2ebe1fa45777deeac Mon Sep 17 00:00:00 2001 From: Jelle Teijema Date: Fri, 5 Nov 2021 10:00:11 +0100 Subject: [PATCH 3/5] Create a main pipeline file and remove inspect_data --- .../semantic_clustering/inspect_data.ipynb | 182 ------------------ .../semantic_clustering.py | 33 ++++ 2 files changed, 33 insertions(+), 182 deletions(-) delete mode 100644 asreviewcontrib/semantic_clustering/inspect_data.ipynb create mode 100644 asreviewcontrib/semantic_clustering/semantic_clustering.py diff --git a/asreviewcontrib/semantic_clustering/inspect_data.ipynb b/asreviewcontrib/semantic_clustering/inspect_data.ipynb deleted file mode 100644 index 2339564..0000000 --- a/asreviewcontrib/semantic_clustering/inspect_data.ipynb +++ /dev/null @@ -1,182 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Inspect Data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is a small-scale notebook dedicated to investigating the optimal way to read, save and write the CORD-19 dataset for reusability." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# imports\n", - "# Numerical / data munging\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "# system stuff\n", - "import os\n", - "import sys\n", - "import json" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Inspect JSON" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cord19_json_path = os.path.join(\"data\", \"cord19.json\")\n", - "with open(cord19_json_path) as json_file:\n", - " data = json.load(json_file)\n", - " print(type(data))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data['ug7v899j'][0]['title']\n", - "len(data['ug7v899j'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "data['ug7v899j'][0].keys()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Apparently lots of abstracts may be missing from some dictionaries\n", - "for i, (key,val) in enumerate(data.items()):\n", - " if len(val) > 1 and not val[0]['abstract']:\n", - "# print(i, val)\n", - "# print(\"\\n\\n\\n\")\n", - " for v in val:\n", - " print(v)\n", - " print(\"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Inspect DataFrame" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_path = os.path.join(\"data\",\"cord19_df.csv\")\n", - "df = pd.read_csv(df_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": 
{}, - "outputs": [], - "source": [ - "print(len(df))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "miss_abs = df[df['Abstract'].isnull()]\n", - "no_miss_abs = df.drop(miss_abs.index)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "no_miss_abs[no_miss_abs['Abstract'].isnull()]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "type(miss_abs.index)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.6.9 64-bit", - "language": "python", - "name": "python36964bit26fe9501e8bb4cb6b5e5f9775ab83204" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/asreviewcontrib/semantic_clustering/semantic_clustering.py b/asreviewcontrib/semantic_clustering/semantic_clustering.py new file mode 100644 index 0000000..18bf9ae --- /dev/null +++ b/asreviewcontrib/semantic_clustering/semantic_clustering.py @@ -0,0 +1,33 @@ +# Copyright 2021 The ASReview Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# import +from asreview.data import ASReviewData + + +class SemanticClustering(): + def __init__(self, data: ASReviewData): + self.data = data + + # create ASReview data object + + +def load_data(ASReviewDataObject): + + data = ASReviewDataObject.df[['title', 'abstract']].copy() + data['abstract'] = data['abstract'].replace('', np.nan, inplace=False) + data.dropna(subset=['abstract'], inplace=True) + data = data.reset_index(drop=True) + + return data From 292f093f1eeed6b9e8910b4bf3f2a7e1c9c56a77 Mon Sep 17 00:00:00 2001 From: Jelle Teijema Date: Fri, 5 Nov 2021 11:35:40 +0100 Subject: [PATCH 4/5] Add code up until tokenization --- .../semantic_clustering.py | 55 ++++++++++++++++--- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/asreviewcontrib/semantic_clustering/semantic_clustering.py b/asreviewcontrib/semantic_clustering/semantic_clustering.py index 18bf9ae..f9913aa 100644 --- a/asreviewcontrib/semantic_clustering/semantic_clustering.py +++ b/asreviewcontrib/semantic_clustering/semantic_clustering.py @@ -12,22 +12,63 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# import +# import ASReview +from tqdm import tqdm from asreview.data import ASReviewData +# import numpy +import numpy as np -class SemanticClustering(): - def __init__(self, data: ASReviewData): - self.data = data +# import transformer autotokenizer and automodel +from transformers import AutoTokenizer, AutoModel - # create ASReview data object +# disable transformer warning +from transformers import logging +logging.set_verbosity_error() +#import tqdm -def load_data(ASReviewDataObject): - data = ASReviewDataObject.df[['title', 'abstract']].copy() +def SemanticClustering(asreview_data_object): + + # load data + print("Loading data...") + data = load_data(asreview_data_object) + + # cut data for testing + data = data.iloc[:10, :] + + # load scibert transformer + print("Loading scibert transformer...") + transformer = 'allenai/scibert_scivocab_uncased' + + # load transformer and tokenizer + print("Loading tokenizer and model...") + tokenizer = AutoTokenizer.from_pretrained(transformer) + model = AutoModel.from_pretrained(transformer) + + # tokenize abstracts and add to data + print("Tokenizing abstracts...") + data['tokenized'] = data['abstract'].apply(lambda x: tokenizer.encode( + x, + padding='longest', + add_special_tokens=True, + return_tensors="pt")) + + print(data) + + +def load_data(asreview_data_object): + + # extract title and abstract, drop empty abstracts and reset index + data = asreview_data_object.df[['title', 'abstract']].copy() data['abstract'] = data['abstract'].replace('', np.nan, inplace=False) data.dropna(subset=['abstract'], inplace=True) data = data.reset_index(drop=True) return data + + +if __name__ == "__main__": + filepath = "https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/datasets/van_de_Schoot_2017/output/van_de_Schoot_2017.csv" + SemanticClustering(ASReviewData.from_file(filepath)) From 7a1b3518a854001b72f7a54b6aa086f337da4681 Mon Sep 17 00:00:00 2001 From: Jelle Teijema Date: Fri, 5 Nov 2021 12:12:01 +0100 Subject: [PATCH 5/5] Remove unused files --- asreviewcontrib/semantic_clustering/build.py | 192 ------------------ .../semantic_clustering/load_data.py | 116 ----------- .../semantic_clustering.py | 17 +- .../semantic_clustering/update_df.py | 106 ---------- 4 files changed, 13 insertions(+), 418 deletions(-) delete mode 100644 asreviewcontrib/semantic_clustering/build.py delete mode 100644 asreviewcontrib/semantic_clustering/load_data.py delete mode 100644 asreviewcontrib/semantic_clustering/update_df.py diff --git a/asreviewcontrib/semantic_clustering/build.py b/asreviewcontrib/semantic_clustering/build.py deleted file mode 100644 index 08ebe92..0000000 --- a/asreviewcontrib/semantic_clustering/build.py +++ /dev/null @@ -1,192 +0,0 @@ -# imports -# System stuff -import os -import sys -import json -import pickle -from shutil import rmtree - -# Numerical / data imports -import numpy as np -import pandas as pd - -# Torch-y stuff -import torch - -# Transformers -from transformers import AutoTokenizer, AutoModelWithLMHead#, AutoModelForMaskedLM -from transformers import BertTokenizer, BertModel -from sentence_transformers import SentenceTransformer, models - -# Own functions -from load_data import load_from_parses, load_from_json, load_dataframe - -def generate_embeddings(model, tokenizer, df, use_covidbert=False): - """Function that generates (CovidBERT) embeddings - Args: - model: The (transformer) model to be used, e.g. 
CovidBERT - tokenizer: Tokenizer corresponding to the model used - df: DataFrame containing parsed data from the CORD-19 document parses - use_covidbert: (bool) To set whether we use covidbert or regular BERT - Returns: - embeddings: Contextualized embeddings from the specified model - """ - - # Path structure - if not os.path.exists("data"): - os.makedirs("data") - embs_path = os.path.join("data","embs") - if not os.path.exists(embs_path): - os.makedirs(embs_path) - - # Only use ones without missing abstracts - # (Effectively circumvented using titles instead while building abstracts) - miss_abs = df[df['Abstract'].isnull()] - no_miss_abs = df.drop(miss_abs.index) - - for i, abstract in enumerate(no_miss_abs['Abstract']): - - # Only do it for first 2000 for testing purposes - if i > 1999: - break - - # Get cord uid and title for article - cord_uid = no_miss_abs.iloc[i,0] - title = no_miss_abs.iloc[i,1] - - if i % 10 == 0: - print(f"Abstract: {i:7d}, cord_uid {cord_uid}") - - # In case we want to use CovidBERT - if use_covidbert: - - """"Add preprocessing for tokens instead of split""" - abstract = abstract.split(" ") - outputs = model.encode(abstract) - - # Use Regular BERT instead - else: - - # Use (BERT Tokenizer and get outputs tuple - tokenized = tokenizer.encode(abstract, return_tensors="pt") - outputs = model(tokenized) - - # Retrieve last hidden states and CLS token - #last_hidden_states = outputs[0] - cls_token = outputs[1] - - # Write single CLS token to file to prevent RAM build-up - # Cast to np if true - to_numpy = True - if to_numpy: - cls_token = cls_token.detach().numpy() - embs_file = os.path.join("data","embs", str(cord_uid)+".pickle") - with open(embs_file, "wb+") as file: - pickle.dump(cls_token, file) - - print("Did I encode all abstracts and save pickle?") - -def load_model(use_covidbert=False): - """Function that loads and returns the CovidBERT model""" - - # # Load CovidBERT - # if use_covidbert: - # print("Loading model...") - # model = AutoModelForMaskedLM.from_pretrained("deepset/covid_bert_base") - # print("Loading tokenizer...") - # tokenizer = AutoTokenizer.from_pretrained("deepset/covid_bert_base") - # print("Finished loading the model successfully!") - - #model = SentenceTransformer(model_path) - - # #Load CovidBERT - # if use_covidbert: - # print("Loading model...") - # model = AutoModelWithLMHead.from_pretrained("manueltonneau/clinicalcovid-bert-nli") - # print("Loading tokenizer...") - # print("\n") - # tokenizer = AutoTokenizer.from_pretrained("manueltonneau/clinicalcovid-bert-nli") - # print("\n") - # print("Finished loading the model successfully!") - - # # Save the model to model path - # model_path = os.path.join("models","clinicalcovid") - # if not os.path.exists(model_path): - # os.makedirs(model_path) - # model.save_pretrained(model_path) - # tokenizer.save_pretrained(model_path) - - # model = SentenceTransformer(model_path) - - # Load CovidBERT - if use_covidbert: - print("Loading model...") - model = AutoModelWithLMHead.from_pretrained("gsarti/covidbert-nli") - print("Loading tokenizer...") - print("\n") - tokenizer = AutoTokenizer.from_pretrained("gsarti/covidbert-nli") - print("\n") - print("Finished loading the model successfully!") - - # Save the model to model path - model_path = os.path.join("models","gsarticovid") - if not os.path.exists(model_path): - os.makedirs(model_path) - model.save_pretrained(model_path) - tokenizer.save_pretrained(model_path) - print(f"Successfully saved model to {model_path}") - - print("Loading Sentence 
Transformer now!") - word_embedding_model = models.BERT( - model_path, - # max_seq_length=args.max_seq_length, - # do_lower_case=args.do_lower_case - ) - pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), - pooling_mode_mean_tokens=True, - pooling_mode_cls_token=False, - pooling_mode_max_tokens=False) - model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) - rmtree(model_path) - model.save(model_path) - print("Finished building Sentence Transformer!") - - # Load regular BERT - else: - print("Loading BERT") - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertModel.from_pretrained('bert-base-uncased') - print("Finished loading BERT") - - return model, tokenizer - -if __name__ == "__main__": - - # First check if we have the right folder structure - if not os.path.exists("data"): - os.makedirs("data") - - # Whether we use CovidBERT or normal BERT - use_covidbert = False - - # Load model and tokenizer - model, tokenizer = load_model(use_covidbert=use_covidbert) - - # Use bulky loader if we don't have the cord19.json yet - cord19_json_path = os.path.join("data", "cord19.json") - if not os.path.exists(cord19_json_path): - load_from_parses() - - # Load the file from the created json - data = load_from_json(cord19_json_path) - - # Load dataframe if csv exists, otherwise create it - df = load_dataframe(data) - - # If embeddings don't exist, create them - embs_path = os.path.join("data","embs") - if not os.path.exists(embs_path): - os.makedirs(embs_path) - - if len(os.listdir(embs_path)) == 0: - generate_embeddings(model, tokenizer, df, use_covidbert=use_covidbert) \ No newline at end of file diff --git a/asreviewcontrib/semantic_clustering/load_data.py b/asreviewcontrib/semantic_clustering/load_data.py deleted file mode 100644 index c93094e..0000000 --- a/asreviewcontrib/semantic_clustering/load_data.py +++ /dev/null @@ -1,116 +0,0 @@ -# System / os stuff -import csv -import os -import json - -# Collections -from collections import defaultdict -import pandas as pd - -def load_dataframe(data): - """Function that creates a DataFrame from the json if there is no csv containing it""" - - # Either csv already exists and we can simply load it - df_path = os.path.join("data","cord19_df.csv") - if os.path.exists(df_path): - print("Loading DataFrame...") - df = pd.read_csv(df_path) - - # Or.. - else: - print("Creating DataFrame...") - # Create a DataFrame instead - d = [] - - # JSON data contains dictionaries in a list for each entry - for _, (_,val) in enumerate(data.items()): - val_dict = val[0] - cord_uid = val_dict['cord_uid'] - title = val_dict['title'] - abstract = val_dict['abstract'] - intro = val_dict['introduction'] - - d.append((cord_uid,title,abstract,intro)) - - # Turn list of tuples into DataFrame and write it to a CSV - df = pd.DataFrame(d, columns=('cord_uid','Title', 'Abstract', 'Introduction')) - df.to_csv(df_path, index=False) - - return df - -def load_from_json(cord19_json_path): - """Function that loads the data from a saved cord19.json file""" - if os.path.exists(cord19_json_path): - with open(cord19_json_path) as json_file: - data = json.load(json_file) - else: - raise ValueError("The provided path does not exist! 
Please use load_from_parses() to create the json file.") - return data - -def load_from_parses(): - """Function to load the CORD-19 dataset from the provided JSONS""" - - cord_uid_to_text = defaultdict(list) - - # open the file - if not os.path.exists("data"): - os.makedirs("data") - metadata = os.path.join("data", "metadata.csv") - - with open(metadata) as f_in: - reader = csv.DictReader(f_in) - for i, row in enumerate(reader): - - # access some metadata - cord_uid = row['cord_uid'] - title = row['title'] - abstract = row['abstract'] - #authors = row['authors'].split('; ') - - # Abstracts are quite big, so cut them - abstract = abstract.split(" ") - if len(abstract) > 200: - abstract = abstract[:200] - abstract = " ".join(abstract) - - # # If we don't have an abstract, use title - # if len(abstract) < 5: - # abstract = title - - # access the full text (if available) for Intro - introduction = [] - if row['pdf_json_files']: - for json_path in row['pdf_json_files'].split('; '): - - # Data is saved in "data" folder, so navigate there instead - json_path = os.path.join("data", json_path) - with open(json_path) as f_json: - full_text_dict = json.load(f_json) - - # grab introduction section from *some* version of the full text - for paragraph_dict in full_text_dict['body_text']: - paragraph_text = paragraph_dict['text'] - section_name = paragraph_dict['section'] - if 'intro' in section_name.lower(): - introduction.append(paragraph_text) - - # stop searching other copies of full text if already got introduction - if introduction: - break - if i % 100 == 0: - print(f"At row {i} now!") - - # save for later usage - cord_uid_to_text[cord_uid].append({ - 'cord_uid': cord_uid, - 'title': title, - 'abstract': abstract, - 'introduction': introduction - }) - - print(type(cord_uid_to_text)) - - # Save the full CORD dataset as one json file - data_path = os.path.join("data","cord19.json") - with open(data_path, 'w') as fp: - json.dump(cord_uid_to_text, fp) diff --git a/asreviewcontrib/semantic_clustering/semantic_clustering.py b/asreviewcontrib/semantic_clustering/semantic_clustering.py index f9913aa..97d49af 100644 --- a/asreviewcontrib/semantic_clustering/semantic_clustering.py +++ b/asreviewcontrib/semantic_clustering/semantic_clustering.py @@ -26,7 +26,7 @@ from transformers import logging logging.set_verbosity_error() -#import tqdm +# import tqdm def SemanticClustering(asreview_data_object): @@ -49,13 +49,22 @@ def SemanticClustering(asreview_data_object): # tokenize abstracts and add to data print("Tokenizing abstracts...") - data['tokenized'] = data['abstract'].apply(lambda x: tokenizer.encode( + data['tokenized'] = data['abstract'].apply(lambda x: tokenizer.encode_plus( x, padding='longest', - add_special_tokens=True, + add_special_tokens=False, return_tensors="pt")) - print(data) + # generate embeddings + print("Generating embeddings...") + data['embeddings'] = data['tokenized'].apply( + lambda x: model(**x, output_hidden_states=False)[-1]) + + from dim_reduct import run_pca + n_components = .98 + #pca = run_pca(data['embeddings'], n_components) + + print(data['embeddings'][0].detach().numpy()) def load_data(asreview_data_object): diff --git a/asreviewcontrib/semantic_clustering/update_df.py b/asreviewcontrib/semantic_clustering/update_df.py deleted file mode 100644 index c925def..0000000 --- a/asreviewcontrib/semantic_clustering/update_df.py +++ /dev/null @@ -1,106 +0,0 @@ -# imports - -# System -import os - -# data -import numpy as np -import pandas as pd - -# Self -from load_data import 
load_from_json, load_dataframe, load_from_parses - -############################# READ ME ############################# -##### This is a file with several functions that was used to ###### -##### recombine a saved dataFrame with titles and abstracts. ###### -################################################################### - -def update_titles(): - """Function to add titles to the kmeans df""" - - # Use bulky loader if we don't have the cord19.json yet - cord19_json_path = os.path.join("data", "cord19.json") - if not os.path.exists(cord19_json_path): - load_from_parses() - - # Load the file from the created json - data = load_from_json(cord19_json_path) - - # Load dataframe if csv exists, otherwise create it - df = load_dataframe(data) - df = df.iloc[:2000,:-1] - - # Load other df - kmeans_df_path = os.path.join("data","dataframes","kmeans_df.csv") - kmeans_df = pd.read_csv(kmeans_df_path) - - # Retrieve titles - titles = [] - for i, cord_uid in enumerate(kmeans_df['cord_uid']): - for df_uid in df.cord_uid.values: - if cord_uid == df_uid: - title = df[df.cord_uid == df_uid].Title.values.tolist() - titles.append(title) - - flatten = [title[0] for title in titles] - kmeans_df['Title'] = np.array(flatten) - print(kmeans_df.head()) - print(kmeans_df.columns) - - kmeans_df.to_csv(kmeans_df_path,index=None) - - -def update_abstracts(): - """Function to add abstracts to the kmeans df""" - - # Use bulky loader if we don't have the cord19.json yet - cord19_json_path = os.path.join("data", "cord19.json") - if not os.path.exists(cord19_json_path): - load_from_parses() - - # Load the file from the created json - data = load_from_json(cord19_json_path) - - # Load dataframe if csv exists, otherwise create it - df = load_dataframe(data) - df = df.iloc[:2000,:-1] - - print(df.head()) - print(df.columns) - print(df.shape) - print("\n\n") - - exit() - - # Load other df - kmeans_df_path = os.path.join("data","dataframes","kmeans_df.csv") - kmeans_df = pd.read_csv(kmeans_df_path) - - print(kmeans_df.head()) - print(kmeans_df.columns) - print(kmeans_df.shape) - - # print() - # print(kmeans_df[kmeans_df['cord_uid'] == "6c6cw80p"]) - # print(kmeans_df[kmeans_df['cord_uid'] == "ug7v899j"]) - - abstracts = [] - for i,cord_uid in enumerate(kmeans_df['cord_uid']): - for df_uid in df.cord_uid.values: - if cord_uid == df_uid: - abstract = df[df.cord_uid == df_uid].Abstract.values.tolist() - abstracts.append(abstract) - #print(abstracts[:10]) - - flatten = [absy[0] for absy in abstracts] - kmeans_df['Abstract'] = np.array(flatten) - print(kmeans_df.head()) - print(kmeans_df.columns) - print(kmeans_df[kmeans_df.cord_uid == "ug7v899j"].Abstract) - - kmeans_df.to_csv(kmeans_df_path,index=None) - -if __name__ == "__main__": - #update_abstracts() - #update_titles() - pass \ No newline at end of file
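A note on the plugin layout introduced in PATCH 1/5: the series moves the modules under `asreviewcontrib/semantic_clustering/` but no `setup.py` appears in these patches. The sketch below is only a guess at the packaging glue that usually accompanies this layout for an ASReview extension; the package name, the entry-point group string, the `entrypoint:SemClusEntryPoint` target, and the dependency list are all assumptions and not part of this series.

```python
# setup.py -- hypothetical packaging sketch for the new asreviewcontrib layout.
# The entry-point group and the SemClusEntryPoint target are assumptions; check
# the ASReview extension documentation for the exact registration mechanism.
from setuptools import setup, find_namespace_packages

setup(
    name="asreview-semantic-clustering",
    version="0.1.0",
    # asreviewcontrib is a namespace package shared with other ASReview extensions
    packages=find_namespace_packages(include=["asreviewcontrib.*"]),
    install_requires=["asreview", "transformers", "torch", "numpy", "pandas"],
    entry_points={
        "asreview.entry_points": [
            "semantic_clustering = "
            "asreviewcontrib.semantic_clustering.entrypoint:SemClusEntryPoint",
        ]
    },
)
```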
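PATCH 4/5 tokenizes each abstract with the `allenai/scibert_scivocab_uncased` tokenizer, and PATCH 5/5 then feeds each tokenized abstract through the model and keeps the last element of the output tuple as its embedding. For readers following along outside the plugin, here is a minimal standalone sketch of that step using the same checkpoint; the `embed_abstracts` name and the explicit truncation to 512 tokens are illustrative choices, not code from the series.

```python
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

CHECKPOINT = "allenai/scibert_scivocab_uncased"  # same model as the patches

def embed_abstracts(abstracts):
    """Return one vector per abstract (illustrative helper, not in the series)."""
    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
    model = AutoModel.from_pretrained(CHECKPOINT)
    model.eval()

    vectors = []
    for text in abstracts:
        # Truncate to BERT's 512-token limit; padding is moot for a single text.
        inputs = tokenizer(text, truncation=True, max_length=512,
                           return_tensors="pt")
        with torch.no_grad():
            output = model(**inputs)
        # output.pooler_output is the processed [CLS] vector, the same tensor the
        # patch reaches with `model(**x, output_hidden_states=False)[-1]`.
        vectors.append(output.pooler_output.squeeze(0).numpy())
    return np.stack(vectors)
```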
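PATCH 5/5 stops just short of dimensionality reduction: it imports `run_pca` from the repository's `dim_reduct` module, sets `n_components = .98`, and leaves the actual call commented out, while `dim_reduct.py` itself is not shown in this series. As rough orientation only, a helper with that signature might look like the scikit-learn sketch below; the PCA-then-KMeans flow and the cluster count are assumptions suggested by the k-means artifacts referenced in the removed `update_df.py`, not code from the repository.

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

def run_pca(embeddings, n_components=0.98):
    """Hypothetical stand-in for dim_reduct.run_pca.

    With 0 < n_components < 1, scikit-learn keeps as many principal components
    as are needed to explain that fraction of the total variance.
    """
    matrix = np.asarray(embeddings, dtype=np.float64)
    return PCA(n_components=n_components).fit_transform(matrix)

# Example: reduce the SciBERT vectors, then group them into clusters.
# `vectors` stands in for the array returned by embed_abstracts() above.
vectors = np.random.default_rng(0).normal(size=(50, 768))  # placeholder data
reduced = run_pca(vectors)
labels = KMeans(n_clusters=5, random_state=42, n_init=10).fit_predict(reduced)
print(reduced.shape, np.bincount(labels))
```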