From e417a09548950bf3880f1448404706aa6d97d261 Mon Sep 17 00:00:00 2001 From: Jelle Teijema Date: Fri, 5 Nov 2021 09:45:30 +0100 Subject: [PATCH 1/5] Structure environment as ASReview plugin Does not yet include any changes to code --- README.md | 2 +- .../semantic_clustering/build.py | 0 .../semantic_clustering/clustering.py | 0 .../semantic_clustering/dim_reduct.py | 0 .../semantic_clustering/inspect_data.ipynb | 0 .../semantic_clustering/interactive.py | 0 .../semantic_clustering/load_data.py | 0 .../semantic_clustering/update_df.py | 0 semantic_clusters.gif => docs/semantic_clusters.gif | Bin 9 files changed, 1 insertion(+), 1 deletion(-) rename build.py => asreviewcontrib/semantic_clustering/build.py (100%) rename clustering.py => asreviewcontrib/semantic_clustering/clustering.py (100%) rename dim_reduct.py => asreviewcontrib/semantic_clustering/dim_reduct.py (100%) rename inspect_data.ipynb => asreviewcontrib/semantic_clustering/inspect_data.ipynb (100%) rename interactive.py => asreviewcontrib/semantic_clustering/interactive.py (100%) rename load_data.py => asreviewcontrib/semantic_clustering/load_data.py (100%) rename update_df.py => asreviewcontrib/semantic_clustering/update_df.py (100%) rename semantic_clusters.gif => docs/semantic_clusters.gif (100%) diff --git a/README.md b/README.md index 2b78f71..57dcdd3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ # Semantic Clusters Experimental repository aimed at using transformers (such as CovidBERT) and Deep Learning techniques to retrieve and visualize semantic clusters underlying the CORD-19 database. -![Alt Text](https://github.com/asreview/semantic-clusters/blob/master/semantic_clusters.gif) \ No newline at end of file +![Alt Text](https://github.com/asreview/semantic-clusters/blob/master/docs/semantic_clusters.gif) \ No newline at end of file diff --git a/build.py b/asreviewcontrib/semantic_clustering/build.py similarity index 100% rename from build.py rename to asreviewcontrib/semantic_clustering/build.py diff --git a/clustering.py b/asreviewcontrib/semantic_clustering/clustering.py similarity index 100% rename from clustering.py rename to asreviewcontrib/semantic_clustering/clustering.py diff --git a/dim_reduct.py b/asreviewcontrib/semantic_clustering/dim_reduct.py similarity index 100% rename from dim_reduct.py rename to asreviewcontrib/semantic_clustering/dim_reduct.py diff --git a/inspect_data.ipynb b/asreviewcontrib/semantic_clustering/inspect_data.ipynb similarity index 100% rename from inspect_data.ipynb rename to asreviewcontrib/semantic_clustering/inspect_data.ipynb diff --git a/interactive.py b/asreviewcontrib/semantic_clustering/interactive.py similarity index 100% rename from interactive.py rename to asreviewcontrib/semantic_clustering/interactive.py diff --git a/load_data.py b/asreviewcontrib/semantic_clustering/load_data.py similarity index 100% rename from load_data.py rename to asreviewcontrib/semantic_clustering/load_data.py diff --git a/update_df.py b/asreviewcontrib/semantic_clustering/update_df.py similarity index 100% rename from update_df.py rename to asreviewcontrib/semantic_clustering/update_df.py diff --git a/semantic_clusters.gif b/docs/semantic_clusters.gif similarity index 100% rename from semantic_clusters.gif rename to docs/semantic_clusters.gif From 585d3abc7a44a5c180da0ef76918823ebf5cc1fb Mon Sep 17 00:00:00 2001 From: Jelle Date: Fri, 5 Nov 2021 09:47:17 +0100 Subject: [PATCH 2/5] Fix image with relative link instead of direct. 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 57dcdd3..5b2cd0c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ # Semantic Clusters Experimental repository aimed at using transformers (such as CovidBERT) and Deep Learning techniques to retrieve and visualize semantic clusters underlying the CORD-19 database. -![Alt Text](https://github.com/asreview/semantic-clusters/blob/master/docs/semantic_clusters.gif) \ No newline at end of file +![Alt Text](/docs/semantic_clusters.gif) From ccdfdf7066f01a3e5583f8f2ebe1fa45777deeac Mon Sep 17 00:00:00 2001 From: Jelle Teijema Date: Fri, 5 Nov 2021 10:00:11 +0100 Subject: [PATCH 3/5] Create a main pipeline file and remove inspect_data --- .../semantic_clustering/inspect_data.ipynb | 182 ------------------ .../semantic_clustering.py | 33 ++++ 2 files changed, 33 insertions(+), 182 deletions(-) delete mode 100644 asreviewcontrib/semantic_clustering/inspect_data.ipynb create mode 100644 asreviewcontrib/semantic_clustering/semantic_clustering.py diff --git a/asreviewcontrib/semantic_clustering/inspect_data.ipynb b/asreviewcontrib/semantic_clustering/inspect_data.ipynb deleted file mode 100644 index 2339564..0000000 --- a/asreviewcontrib/semantic_clustering/inspect_data.ipynb +++ /dev/null @@ -1,182 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Inspect Data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is a small-scale notebook dedicated to investigating the optimal way to read, save and write the CORD-19 dataset for reusability." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# imports\n", - "# Numerical / data munging\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "# system stuff\n", - "import os\n", - "import sys\n", - "import json" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Inspect JSON" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cord19_json_path = os.path.join(\"data\", \"cord19.json\")\n", - "with open(cord19_json_path) as json_file:\n", - " data = json.load(json_file)\n", - " print(type(data))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data['ug7v899j'][0]['title']\n", - "len(data['ug7v899j'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "data['ug7v899j'][0].keys()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Apparently lots of abstracts may be missing from some dictionaries\n", - "for i, (key,val) in enumerate(data.items()):\n", - " if len(val) > 1 and not val[0]['abstract']:\n", - "# print(i, val)\n", - "# print(\"\\n\\n\\n\")\n", - " for v in val:\n", - " print(v)\n", - " print(\"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Inspect DataFrame" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_path = os.path.join(\"data\",\"cord19_df.csv\")\n", - "df = pd.read_csv(df_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": 
{}, - "outputs": [], - "source": [ - "print(len(df))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "miss_abs = df[df['Abstract'].isnull()]\n", - "no_miss_abs = df.drop(miss_abs.index)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "no_miss_abs[no_miss_abs['Abstract'].isnull()]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "type(miss_abs.index)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.6.9 64-bit", - "language": "python", - "name": "python36964bit26fe9501e8bb4cb6b5e5f9775ab83204" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/asreviewcontrib/semantic_clustering/semantic_clustering.py b/asreviewcontrib/semantic_clustering/semantic_clustering.py new file mode 100644 index 0000000..18bf9ae --- /dev/null +++ b/asreviewcontrib/semantic_clustering/semantic_clustering.py @@ -0,0 +1,33 @@ +# Copyright 2021 The ASReview Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# import +from asreview.data import ASReviewData + + +class SemanticClustering(): + def __init__(self, data: ASReviewData): + self.data = data + + # create ASReview data object + + +def load_data(ASReviewDataObject): + + data = ASReviewDataObject.df[['title', 'abstract']].copy() + data['abstract'] = data['abstract'].replace('', np.nan, inplace=False) + data.dropna(subset=['abstract'], inplace=True) + data = data.reset_index(drop=True) + + return data From 292f093f1eeed6b9e8910b4bf3f2a7e1c9c56a77 Mon Sep 17 00:00:00 2001 From: Jelle Teijema Date: Fri, 5 Nov 2021 11:35:40 +0100 Subject: [PATCH 4/5] Add code up until tokenization --- .../semantic_clustering.py | 55 ++++++++++++++++--- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/asreviewcontrib/semantic_clustering/semantic_clustering.py b/asreviewcontrib/semantic_clustering/semantic_clustering.py index 18bf9ae..f9913aa 100644 --- a/asreviewcontrib/semantic_clustering/semantic_clustering.py +++ b/asreviewcontrib/semantic_clustering/semantic_clustering.py @@ -12,22 +12,63 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# import +# import ASReview +from tqdm import tqdm from asreview.data import ASReviewData +# import numpy +import numpy as np -class SemanticClustering(): - def __init__(self, data: ASReviewData): - self.data = data +# import transformer autotokenizer and automodel +from transformers import AutoTokenizer, AutoModel - # create ASReview data object +# disable transformer warning +from transformers import logging +logging.set_verbosity_error() +#import tqdm -def load_data(ASReviewDataObject): - data = ASReviewDataObject.df[['title', 'abstract']].copy() +def SemanticClustering(asreview_data_object): + + # load data + print("Loading data...") + data = load_data(asreview_data_object) + + # cut data for testing + data = data.iloc[:10, :] + + # load scibert transformer + print("Loading scibert transformer...") + transformer = 'allenai/scibert_scivocab_uncased' + + # load transformer and tokenizer + print("Loading tokenizer and model...") + tokenizer = AutoTokenizer.from_pretrained(transformer) + model = AutoModel.from_pretrained(transformer) + + # tokenize abstracts and add to data + print("Tokenizing abstracts...") + data['tokenized'] = data['abstract'].apply(lambda x: tokenizer.encode( + x, + padding='longest', + add_special_tokens=True, + return_tensors="pt")) + + print(data) + + +def load_data(asreview_data_object): + + # extract title and abstract, drop empty abstracts and reset index + data = asreview_data_object.df[['title', 'abstract']].copy() data['abstract'] = data['abstract'].replace('', np.nan, inplace=False) data.dropna(subset=['abstract'], inplace=True) data = data.reset_index(drop=True) return data + + +if __name__ == "__main__": + filepath = "https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/datasets/van_de_Schoot_2017/output/van_de_Schoot_2017.csv" + SemanticClustering(ASReviewData.from_file(filepath)) From 7a1b3518a854001b72f7a54b6aa086f337da4681 Mon Sep 17 00:00:00 2001 From: Jelle Teijema Date: Fri, 5 Nov 2021 12:12:01 +0100 Subject: [PATCH 5/5] Remove unused files --- asreviewcontrib/semantic_clustering/build.py | 192 ------------------ .../semantic_clustering/load_data.py | 116 ----------- .../semantic_clustering.py | 17 +- .../semantic_clustering/update_df.py | 106 ---------- 4 files changed, 13 insertions(+), 418 deletions(-) delete mode 100644 asreviewcontrib/semantic_clustering/build.py delete mode 100644 asreviewcontrib/semantic_clustering/load_data.py delete mode 100644 asreviewcontrib/semantic_clustering/update_df.py diff --git a/asreviewcontrib/semantic_clustering/build.py b/asreviewcontrib/semantic_clustering/build.py deleted file mode 100644 index 08ebe92..0000000 --- a/asreviewcontrib/semantic_clustering/build.py +++ /dev/null @@ -1,192 +0,0 @@ -# imports -# System stuff -import os -import sys -import json -import pickle -from shutil import rmtree - -# Numerical / data imports -import numpy as np -import pandas as pd - -# Torch-y stuff -import torch - -# Transformers -from transformers import AutoTokenizer, AutoModelWithLMHead#, AutoModelForMaskedLM -from transformers import BertTokenizer, BertModel -from sentence_transformers import SentenceTransformer, models - -# Own functions -from load_data import load_from_parses, load_from_json, load_dataframe - -def generate_embeddings(model, tokenizer, df, use_covidbert=False): - """Function that generates (CovidBERT) embeddings - Args: - model: The (transformer) model to be used, e.g. 
CovidBERT - tokenizer: Tokenizer corresponding to the model used - df: DataFrame containing parsed data from the CORD-19 document parses - use_covidbert: (bool) To set whether we use covidbert or regular BERT - Returns: - embeddings: Contextualized embeddings from the specified model - """ - - # Path structure - if not os.path.exists("data"): - os.makedirs("data") - embs_path = os.path.join("data","embs") - if not os.path.exists(embs_path): - os.makedirs(embs_path) - - # Only use ones without missing abstracts - # (Effectively circumvented using titles instead while building abstracts) - miss_abs = df[df['Abstract'].isnull()] - no_miss_abs = df.drop(miss_abs.index) - - for i, abstract in enumerate(no_miss_abs['Abstract']): - - # Only do it for first 2000 for testing purposes - if i > 1999: - break - - # Get cord uid and title for article - cord_uid = no_miss_abs.iloc[i,0] - title = no_miss_abs.iloc[i,1] - - if i % 10 == 0: - print(f"Abstract: {i:7d}, cord_uid {cord_uid}") - - # In case we want to use CovidBERT - if use_covidbert: - - """"Add preprocessing for tokens instead of split""" - abstract = abstract.split(" ") - outputs = model.encode(abstract) - - # Use Regular BERT instead - else: - - # Use (BERT Tokenizer and get outputs tuple - tokenized = tokenizer.encode(abstract, return_tensors="pt") - outputs = model(tokenized) - - # Retrieve last hidden states and CLS token - #last_hidden_states = outputs[0] - cls_token = outputs[1] - - # Write single CLS token to file to prevent RAM build-up - # Cast to np if true - to_numpy = True - if to_numpy: - cls_token = cls_token.detach().numpy() - embs_file = os.path.join("data","embs", str(cord_uid)+".pickle") - with open(embs_file, "wb+") as file: - pickle.dump(cls_token, file) - - print("Did I encode all abstracts and save pickle?") - -def load_model(use_covidbert=False): - """Function that loads and returns the CovidBERT model""" - - # # Load CovidBERT - # if use_covidbert: - # print("Loading model...") - # model = AutoModelForMaskedLM.from_pretrained("deepset/covid_bert_base") - # print("Loading tokenizer...") - # tokenizer = AutoTokenizer.from_pretrained("deepset/covid_bert_base") - # print("Finished loading the model successfully!") - - #model = SentenceTransformer(model_path) - - # #Load CovidBERT - # if use_covidbert: - # print("Loading model...") - # model = AutoModelWithLMHead.from_pretrained("manueltonneau/clinicalcovid-bert-nli") - # print("Loading tokenizer...") - # print("\n") - # tokenizer = AutoTokenizer.from_pretrained("manueltonneau/clinicalcovid-bert-nli") - # print("\n") - # print("Finished loading the model successfully!") - - # # Save the model to model path - # model_path = os.path.join("models","clinicalcovid") - # if not os.path.exists(model_path): - # os.makedirs(model_path) - # model.save_pretrained(model_path) - # tokenizer.save_pretrained(model_path) - - # model = SentenceTransformer(model_path) - - # Load CovidBERT - if use_covidbert: - print("Loading model...") - model = AutoModelWithLMHead.from_pretrained("gsarti/covidbert-nli") - print("Loading tokenizer...") - print("\n") - tokenizer = AutoTokenizer.from_pretrained("gsarti/covidbert-nli") - print("\n") - print("Finished loading the model successfully!") - - # Save the model to model path - model_path = os.path.join("models","gsarticovid") - if not os.path.exists(model_path): - os.makedirs(model_path) - model.save_pretrained(model_path) - tokenizer.save_pretrained(model_path) - print(f"Successfully saved model to {model_path}") - - print("Loading Sentence 
Transformer now!") - word_embedding_model = models.BERT( - model_path, - # max_seq_length=args.max_seq_length, - # do_lower_case=args.do_lower_case - ) - pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), - pooling_mode_mean_tokens=True, - pooling_mode_cls_token=False, - pooling_mode_max_tokens=False) - model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) - rmtree(model_path) - model.save(model_path) - print("Finished building Sentence Transformer!") - - # Load regular BERT - else: - print("Loading BERT") - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertModel.from_pretrained('bert-base-uncased') - print("Finished loading BERT") - - return model, tokenizer - -if __name__ == "__main__": - - # First check if we have the right folder structure - if not os.path.exists("data"): - os.makedirs("data") - - # Whether we use CovidBERT or normal BERT - use_covidbert = False - - # Load model and tokenizer - model, tokenizer = load_model(use_covidbert=use_covidbert) - - # Use bulky loader if we don't have the cord19.json yet - cord19_json_path = os.path.join("data", "cord19.json") - if not os.path.exists(cord19_json_path): - load_from_parses() - - # Load the file from the created json - data = load_from_json(cord19_json_path) - - # Load dataframe if csv exists, otherwise create it - df = load_dataframe(data) - - # If embeddings don't exist, create them - embs_path = os.path.join("data","embs") - if not os.path.exists(embs_path): - os.makedirs(embs_path) - - if len(os.listdir(embs_path)) == 0: - generate_embeddings(model, tokenizer, df, use_covidbert=use_covidbert) \ No newline at end of file diff --git a/asreviewcontrib/semantic_clustering/load_data.py b/asreviewcontrib/semantic_clustering/load_data.py deleted file mode 100644 index c93094e..0000000 --- a/asreviewcontrib/semantic_clustering/load_data.py +++ /dev/null @@ -1,116 +0,0 @@ -# System / os stuff -import csv -import os -import json - -# Collections -from collections import defaultdict -import pandas as pd - -def load_dataframe(data): - """Function that creates a DataFrame from the json if there is no csv containing it""" - - # Either csv already exists and we can simply load it - df_path = os.path.join("data","cord19_df.csv") - if os.path.exists(df_path): - print("Loading DataFrame...") - df = pd.read_csv(df_path) - - # Or.. - else: - print("Creating DataFrame...") - # Create a DataFrame instead - d = [] - - # JSON data contains dictionaries in a list for each entry - for _, (_,val) in enumerate(data.items()): - val_dict = val[0] - cord_uid = val_dict['cord_uid'] - title = val_dict['title'] - abstract = val_dict['abstract'] - intro = val_dict['introduction'] - - d.append((cord_uid,title,abstract,intro)) - - # Turn list of tuples into DataFrame and write it to a CSV - df = pd.DataFrame(d, columns=('cord_uid','Title', 'Abstract', 'Introduction')) - df.to_csv(df_path, index=False) - - return df - -def load_from_json(cord19_json_path): - """Function that loads the data from a saved cord19.json file""" - if os.path.exists(cord19_json_path): - with open(cord19_json_path) as json_file: - data = json.load(json_file) - else: - raise ValueError("The provided path does not exist! 
Please use load_from_parses() to create the json file.") - return data - -def load_from_parses(): - """Function to load the CORD-19 dataset from the provided JSONS""" - - cord_uid_to_text = defaultdict(list) - - # open the file - if not os.path.exists("data"): - os.makedirs("data") - metadata = os.path.join("data", "metadata.csv") - - with open(metadata) as f_in: - reader = csv.DictReader(f_in) - for i, row in enumerate(reader): - - # access some metadata - cord_uid = row['cord_uid'] - title = row['title'] - abstract = row['abstract'] - #authors = row['authors'].split('; ') - - # Abstracts are quite big, so cut them - abstract = abstract.split(" ") - if len(abstract) > 200: - abstract = abstract[:200] - abstract = " ".join(abstract) - - # # If we don't have an abstract, use title - # if len(abstract) < 5: - # abstract = title - - # access the full text (if available) for Intro - introduction = [] - if row['pdf_json_files']: - for json_path in row['pdf_json_files'].split('; '): - - # Data is saved in "data" folder, so navigate there instead - json_path = os.path.join("data", json_path) - with open(json_path) as f_json: - full_text_dict = json.load(f_json) - - # grab introduction section from *some* version of the full text - for paragraph_dict in full_text_dict['body_text']: - paragraph_text = paragraph_dict['text'] - section_name = paragraph_dict['section'] - if 'intro' in section_name.lower(): - introduction.append(paragraph_text) - - # stop searching other copies of full text if already got introduction - if introduction: - break - if i % 100 == 0: - print(f"At row {i} now!") - - # save for later usage - cord_uid_to_text[cord_uid].append({ - 'cord_uid': cord_uid, - 'title': title, - 'abstract': abstract, - 'introduction': introduction - }) - - print(type(cord_uid_to_text)) - - # Save the full CORD dataset as one json file - data_path = os.path.join("data","cord19.json") - with open(data_path, 'w') as fp: - json.dump(cord_uid_to_text, fp) diff --git a/asreviewcontrib/semantic_clustering/semantic_clustering.py b/asreviewcontrib/semantic_clustering/semantic_clustering.py index f9913aa..97d49af 100644 --- a/asreviewcontrib/semantic_clustering/semantic_clustering.py +++ b/asreviewcontrib/semantic_clustering/semantic_clustering.py @@ -26,7 +26,7 @@ from transformers import logging logging.set_verbosity_error() -#import tqdm +# import tqdm def SemanticClustering(asreview_data_object): @@ -49,13 +49,22 @@ def SemanticClustering(asreview_data_object): # tokenize abstracts and add to data print("Tokenizing abstracts...") - data['tokenized'] = data['abstract'].apply(lambda x: tokenizer.encode( + data['tokenized'] = data['abstract'].apply(lambda x: tokenizer.encode_plus( x, padding='longest', - add_special_tokens=True, + add_special_tokens=False, return_tensors="pt")) - print(data) + # generate embeddings + print("Generating embeddings...") + data['embeddings'] = data['tokenized'].apply( + lambda x: model(**x, output_hidden_states=False)[-1]) + + from dim_reduct import run_pca + n_components = .98 + #pca = run_pca(data['embeddings'], n_components) + + print(data['embeddings'][0].detach().numpy()) def load_data(asreview_data_object): diff --git a/asreviewcontrib/semantic_clustering/update_df.py b/asreviewcontrib/semantic_clustering/update_df.py deleted file mode 100644 index c925def..0000000 --- a/asreviewcontrib/semantic_clustering/update_df.py +++ /dev/null @@ -1,106 +0,0 @@ -# imports - -# System -import os - -# data -import numpy as np -import pandas as pd - -# Self -from load_data import 
load_from_json, load_dataframe, load_from_parses - -############################# READ ME ############################# -##### This is a file with several functions that was used to ###### -##### recombine a saved dataFrame with titles and abstracts. ###### -################################################################### - -def update_titles(): - """Function to add titles to the kmeans df""" - - # Use bulky loader if we don't have the cord19.json yet - cord19_json_path = os.path.join("data", "cord19.json") - if not os.path.exists(cord19_json_path): - load_from_parses() - - # Load the file from the created json - data = load_from_json(cord19_json_path) - - # Load dataframe if csv exists, otherwise create it - df = load_dataframe(data) - df = df.iloc[:2000,:-1] - - # Load other df - kmeans_df_path = os.path.join("data","dataframes","kmeans_df.csv") - kmeans_df = pd.read_csv(kmeans_df_path) - - # Retrieve titles - titles = [] - for i, cord_uid in enumerate(kmeans_df['cord_uid']): - for df_uid in df.cord_uid.values: - if cord_uid == df_uid: - title = df[df.cord_uid == df_uid].Title.values.tolist() - titles.append(title) - - flatten = [title[0] for title in titles] - kmeans_df['Title'] = np.array(flatten) - print(kmeans_df.head()) - print(kmeans_df.columns) - - kmeans_df.to_csv(kmeans_df_path,index=None) - - -def update_abstracts(): - """Function to add abstracts to the kmeans df""" - - # Use bulky loader if we don't have the cord19.json yet - cord19_json_path = os.path.join("data", "cord19.json") - if not os.path.exists(cord19_json_path): - load_from_parses() - - # Load the file from the created json - data = load_from_json(cord19_json_path) - - # Load dataframe if csv exists, otherwise create it - df = load_dataframe(data) - df = df.iloc[:2000,:-1] - - print(df.head()) - print(df.columns) - print(df.shape) - print("\n\n") - - exit() - - # Load other df - kmeans_df_path = os.path.join("data","dataframes","kmeans_df.csv") - kmeans_df = pd.read_csv(kmeans_df_path) - - print(kmeans_df.head()) - print(kmeans_df.columns) - print(kmeans_df.shape) - - # print() - # print(kmeans_df[kmeans_df['cord_uid'] == "6c6cw80p"]) - # print(kmeans_df[kmeans_df['cord_uid'] == "ug7v899j"]) - - abstracts = [] - for i,cord_uid in enumerate(kmeans_df['cord_uid']): - for df_uid in df.cord_uid.values: - if cord_uid == df_uid: - abstract = df[df.cord_uid == df_uid].Abstract.values.tolist() - abstracts.append(abstract) - #print(abstracts[:10]) - - flatten = [absy[0] for absy in abstracts] - kmeans_df['Abstract'] = np.array(flatten) - print(kmeans_df.head()) - print(kmeans_df.columns) - print(kmeans_df[kmeans_df.cord_uid == "ug7v899j"].Abstract) - - kmeans_df.to_csv(kmeans_df_path,index=None) - -if __name__ == "__main__": - #update_abstracts() - #update_titles() - pass \ No newline at end of file
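A note on the plugin layout introduced in PATCH 1/5: the series moves the modules under `asreviewcontrib/semantic_clustering/` but no `setup.py` appears in these patches. The sketch below is only a guess at the packaging glue that usually accompanies this layout for an ASReview extension; the package name, the entry-point group string, the `entrypoint:SemClusEntryPoint` target, and the dependency list are all assumptions and not part of this series.

```python
# setup.py -- hypothetical packaging sketch for the new asreviewcontrib layout.
# The entry-point group and the SemClusEntryPoint target are assumptions; check
# the ASReview extension documentation for the exact registration mechanism.
from setuptools import setup, find_namespace_packages

setup(
    name="asreview-semantic-clustering",
    version="0.1.0",
    # asreviewcontrib is a namespace package shared with other ASReview extensions
    packages=find_namespace_packages(include=["asreviewcontrib.*"]),
    install_requires=["asreview", "transformers", "torch", "numpy", "pandas"],
    entry_points={
        "asreview.entry_points": [
            "semantic_clustering = "
            "asreviewcontrib.semantic_clustering.entrypoint:SemClusEntryPoint",
        ]
    },
)
```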
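PATCH 4/5 tokenizes each abstract with the `allenai/scibert_scivocab_uncased` tokenizer, and PATCH 5/5 then feeds each tokenized abstract through the model and keeps the last element of the output tuple as its embedding. For readers following along outside the plugin, here is a minimal standalone sketch of that step using the same checkpoint; the `embed_abstracts` name and the explicit truncation to 512 tokens are illustrative choices, not code from the series.

```python
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

CHECKPOINT = "allenai/scibert_scivocab_uncased"  # same model as the patches

def embed_abstracts(abstracts):
    """Return one vector per abstract (illustrative helper, not in the series)."""
    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
    model = AutoModel.from_pretrained(CHECKPOINT)
    model.eval()

    vectors = []
    for text in abstracts:
        # Truncate to BERT's 512-token limit; padding is moot for a single text.
        inputs = tokenizer(text, truncation=True, max_length=512,
                           return_tensors="pt")
        with torch.no_grad():
            output = model(**inputs)
        # output.pooler_output is the processed [CLS] vector, the same tensor the
        # patch reaches with `model(**x, output_hidden_states=False)[-1]`.
        vectors.append(output.pooler_output.squeeze(0).numpy())
    return np.stack(vectors)
```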
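PATCH 5/5 stops just short of dimensionality reduction: it imports `run_pca` from the repository's `dim_reduct` module, sets `n_components = .98`, and leaves the actual call commented out, while `dim_reduct.py` itself is not shown in this series. As rough orientation only, a helper with that signature might look like the scikit-learn sketch below; the PCA-then-KMeans flow and the cluster count are assumptions suggested by the k-means artifacts referenced in the removed `update_df.py`, not code from the repository.

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

def run_pca(embeddings, n_components=0.98):
    """Hypothetical stand-in for dim_reduct.run_pca.

    With 0 < n_components < 1, scikit-learn keeps as many principal components
    as are needed to explain that fraction of the total variance.
    """
    matrix = np.asarray(embeddings, dtype=np.float64)
    return PCA(n_components=n_components).fit_transform(matrix)

# Example: reduce the SciBERT vectors, then group them into clusters.
# `vectors` stands in for the array returned by embed_abstracts() above.
vectors = np.random.default_rng(0).normal(size=(50, 768))  # placeholder data
reduced = run_pca(vectors)
labels = KMeans(n_clusters=5, random_state=42, n_init=10).fit_predict(reduced)
print(reduced.shape, np.bincount(labels))
```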