diff --git a/asreviewcontrib/semantic_clustering/build.py b/asreviewcontrib/semantic_clustering/build.py
deleted file mode 100644
index 08ebe92..0000000
--- a/asreviewcontrib/semantic_clustering/build.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# imports
-# System stuff
-import os
-import sys
-import json
-import pickle
-from shutil import rmtree
-
-# Numerical / data imports
-import numpy as np
-import pandas as pd
-
-# Torch-y stuff
-import torch
-
-# Transformers
-from transformers import AutoTokenizer, AutoModelWithLMHead#, AutoModelForMaskedLM
-from transformers import BertTokenizer, BertModel
-from sentence_transformers import SentenceTransformer, models
-
-# Own functions
-from load_data import load_from_parses, load_from_json, load_dataframe
-
-def generate_embeddings(model, tokenizer, df, use_covidbert=False):
-    """Function that generates (CovidBERT) embeddings
-    Args:
-        model: The (transformer) model to be used, e.g. CovidBERT
-        tokenizer: Tokenizer corresponding to the model used
-        df: DataFrame containing parsed data from the CORD-19 document parses
-        use_covidbert: (bool) To set whether we use covidbert or regular BERT
-    Returns:
-        embeddings: Contextualized embeddings from the specified model
-    """
-
-    # Path structure
-    if not os.path.exists("data"):
-        os.makedirs("data")
-    embs_path = os.path.join("data","embs")
-    if not os.path.exists(embs_path):
-        os.makedirs(embs_path)
-
-    # Only use ones without missing abstracts
-    # (Effectively circumvented using titles instead while building abstracts)
-    miss_abs = df[df['Abstract'].isnull()]
-    no_miss_abs = df.drop(miss_abs.index)
-
-    for i, abstract in enumerate(no_miss_abs['Abstract']):
-
-        # Only do it for first 2000 for testing purposes
-        if i > 1999:
-            break
-
-        # Get cord uid and title for article
-        cord_uid = no_miss_abs.iloc[i,0]
-        title = no_miss_abs.iloc[i,1]
-
-        if i % 10 == 0:
-            print(f"Abstract: {i:7d}, cord_uid {cord_uid}")
-
-        # In case we want to use CovidBERT
-        if use_covidbert:
-
-            """"Add preprocessing for tokens instead of split"""
-            abstract = abstract.split(" ")
-            outputs = model.encode(abstract)
-
-        # Use Regular BERT instead
-        else:
-
-            # Use (BERT Tokenizer and get outputs tuple
-            tokenized = tokenizer.encode(abstract, return_tensors="pt")
-            outputs = model(tokenized)
-
-            # Retrieve last hidden states and CLS token
-            #last_hidden_states = outputs[0]
-            cls_token = outputs[1]
-
-        # Write single CLS token to file to prevent RAM build-up
-        # Cast to np if true
-        to_numpy = True
-        if to_numpy:
-            cls_token = cls_token.detach().numpy()
-        embs_file = os.path.join("data","embs", str(cord_uid)+".pickle")
-        with open(embs_file, "wb+") as file:
-            pickle.dump(cls_token, file)
-
-    print("Did I encode all abstracts and save pickle?")
-
-def load_model(use_covidbert=False):
-    """Function that loads and returns the CovidBERT model"""
-
-    # # Load CovidBERT
-    # if use_covidbert:
-    #     print("Loading model...")
-    #     model = AutoModelForMaskedLM.from_pretrained("deepset/covid_bert_base")
-    #     print("Loading tokenizer...")
-    #     tokenizer = AutoTokenizer.from_pretrained("deepset/covid_bert_base")
-    #     print("Finished loading the model successfully!")
-
-    #model = SentenceTransformer(model_path)
-
-    # #Load CovidBERT
-    # if use_covidbert:
-    #     print("Loading model...")
-    #     model = AutoModelWithLMHead.from_pretrained("manueltonneau/clinicalcovid-bert-nli")
-    #     print("Loading tokenizer...")
-    #     print("\n")
-    #     tokenizer = AutoTokenizer.from_pretrained("manueltonneau/clinicalcovid-bert-nli")
-    #     print("\n")
-    #     print("Finished loading the model successfully!")
-
-    #     # Save the model to model path
-    #     model_path = os.path.join("models","clinicalcovid")
-    #     if not os.path.exists(model_path):
-    #         os.makedirs(model_path)
-    #     model.save_pretrained(model_path)
-    #     tokenizer.save_pretrained(model_path)
-
-    #     model = SentenceTransformer(model_path)
-
-    # Load CovidBERT
-    if use_covidbert:
-        print("Loading model...")
-        model = AutoModelWithLMHead.from_pretrained("gsarti/covidbert-nli")
-        print("Loading tokenizer...")
-        print("\n")
-        tokenizer = AutoTokenizer.from_pretrained("gsarti/covidbert-nli")
-        print("\n")
-        print("Finished loading the model successfully!")
-
-        # Save the model to model path
-        model_path = os.path.join("models","gsarticovid")
-        if not os.path.exists(model_path):
-            os.makedirs(model_path)
-        model.save_pretrained(model_path)
-        tokenizer.save_pretrained(model_path)
-        print(f"Successfully saved model to {model_path}")
-
-        print("Loading Sentence Transformer now!")
-        word_embedding_model = models.BERT(
-            model_path,
-            # max_seq_length=args.max_seq_length,
-            # do_lower_case=args.do_lower_case
-        )
-        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
-                                       pooling_mode_mean_tokens=True,
-                                       pooling_mode_cls_token=False,
-                                       pooling_mode_max_tokens=False)
-        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
-        rmtree(model_path)
-        model.save(model_path)
-        print("Finished building Sentence Transformer!")
-
-    # Load regular BERT
-    else:
-        print("Loading BERT")
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertModel.from_pretrained('bert-base-uncased')
-        print("Finished loading BERT")
-
-    return model, tokenizer
-
-if __name__ == "__main__":
-
-    # First check if we have the right folder structure
-    if not os.path.exists("data"):
-        os.makedirs("data")
-
-    # Whether we use CovidBERT or normal BERT
-    use_covidbert = False
-
-    # Load model and tokenizer
-    model, tokenizer = load_model(use_covidbert=use_covidbert)
-
-    # Use bulky loader if we don't have the cord19.json yet
-    cord19_json_path = os.path.join("data", "cord19.json")
-    if not os.path.exists(cord19_json_path):
-        load_from_parses()
-
-    # Load the file from the created json
-    data = load_from_json(cord19_json_path)
-
-    # Load dataframe if csv exists, otherwise create it
-    df = load_dataframe(data)
-
-    # If embeddings don't exist, create them
-    embs_path = os.path.join("data","embs")
-    if not os.path.exists(embs_path):
-        os.makedirs(embs_path)
-
-    if len(os.listdir(embs_path)) == 0:
-        generate_embeddings(model, tokenizer, df, use_covidbert=use_covidbert)
\ No newline at end of file
diff --git a/asreviewcontrib/semantic_clustering/load_data.py b/asreviewcontrib/semantic_clustering/load_data.py
deleted file mode 100644
index c93094e..0000000
--- a/asreviewcontrib/semantic_clustering/load_data.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# System / os stuff
-import csv
-import os
-import json
-
-# Collections
-from collections import defaultdict
-import pandas as pd
-
-def load_dataframe(data):
-    """Function that creates a DataFrame from the json if there is no csv containing it"""
-
-    # Either csv already exists and we can simply load it
-    df_path = os.path.join("data","cord19_df.csv")
-    if os.path.exists(df_path):
-        print("Loading DataFrame...")
-        df = pd.read_csv(df_path)
-
-    # Or..
-    else:
-        print("Creating DataFrame...")
-        # Create a DataFrame instead
-        d = []
-
-        # JSON data contains dictionaries in a list for each entry
-        for _, (_,val) in enumerate(data.items()):
-            val_dict = val[0]
-            cord_uid = val_dict['cord_uid']
-            title = val_dict['title']
-            abstract = val_dict['abstract']
-            intro = val_dict['introduction']
-
-            d.append((cord_uid,title,abstract,intro))
-
-        # Turn list of tuples into DataFrame and write it to a CSV
-        df = pd.DataFrame(d, columns=('cord_uid','Title', 'Abstract', 'Introduction'))
-        df.to_csv(df_path, index=False)
-
-    return df
-
-def load_from_json(cord19_json_path):
-    """Function that loads the data from a saved cord19.json file"""
-    if os.path.exists(cord19_json_path):
-        with open(cord19_json_path) as json_file:
-            data = json.load(json_file)
-    else:
-        raise ValueError("The provided path does not exist! Please use load_from_parses() to create the json file.")
-    return data
-
-def load_from_parses():
-    """Function to load the CORD-19 dataset from the provided JSONS"""
-
-    cord_uid_to_text = defaultdict(list)
-
-    # open the file
-    if not os.path.exists("data"):
-        os.makedirs("data")
-    metadata = os.path.join("data", "metadata.csv")
-
-    with open(metadata) as f_in:
-        reader = csv.DictReader(f_in)
-        for i, row in enumerate(reader):
-
-            # access some metadata
-            cord_uid = row['cord_uid']
-            title = row['title']
-            abstract = row['abstract']
-            #authors = row['authors'].split('; ')
-
-            # Abstracts are quite big, so cut them
-            abstract = abstract.split(" ")
-            if len(abstract) > 200:
-                abstract = abstract[:200]
-            abstract = " ".join(abstract)
-
-            # # If we don't have an abstract, use title
-            # if len(abstract) < 5:
-            #     abstract = title
-
-            # access the full text (if available) for Intro
-            introduction = []
-            if row['pdf_json_files']:
-                for json_path in row['pdf_json_files'].split('; '):
-
-                    # Data is saved in "data" folder, so navigate there instead
-                    json_path = os.path.join("data", json_path)
-                    with open(json_path) as f_json:
-                        full_text_dict = json.load(f_json)
-
-                        # grab introduction section from *some* version of the full text
-                        for paragraph_dict in full_text_dict['body_text']:
-                            paragraph_text = paragraph_dict['text']
-                            section_name = paragraph_dict['section']
-                            if 'intro' in section_name.lower():
-                                introduction.append(paragraph_text)
-
-                        # stop searching other copies of full text if already got introduction
-                        if introduction:
-                            break
-            if i % 100 == 0:
-                print(f"At row {i} now!")
-
-            # save for later usage
-            cord_uid_to_text[cord_uid].append({
-                'cord_uid': cord_uid,
-                'title': title,
-                'abstract': abstract,
-                'introduction': introduction
-            })
-
-    print(type(cord_uid_to_text))
-
-    # Save the full CORD dataset as one json file
-    data_path = os.path.join("data","cord19.json")
-    with open(data_path, 'w') as fp:
-        json.dump(cord_uid_to_text, fp)
diff --git a/asreviewcontrib/semantic_clustering/semantic_clustering.py b/asreviewcontrib/semantic_clustering/semantic_clustering.py
index f9913aa..97d49af 100644
--- a/asreviewcontrib/semantic_clustering/semantic_clustering.py
+++ b/asreviewcontrib/semantic_clustering/semantic_clustering.py
@@ -26,7 +26,7 @@ from transformers import logging
 
 logging.set_verbosity_error()
 
-#import tqdm
+# import tqdm
 
 
 def SemanticClustering(asreview_data_object):
@@ -49,13 +49,22 @@ def SemanticClustering(asreview_data_object):
 
     # tokenize abstracts and add to data
     print("Tokenizing abstracts...")
-    data['tokenized'] = data['abstract'].apply(lambda x: tokenizer.encode(
+    data['tokenized'] = data['abstract'].apply(lambda x: tokenizer.encode_plus(
         x,
         padding='longest',
-        add_special_tokens=True,
+        add_special_tokens=False,
         return_tensors="pt"))
 
-    print(data)
+    # generate embeddings
+    print("Generating embeddings...")
+    data['embeddings'] = data['tokenized'].apply(
+        lambda x: model(**x, output_hidden_states=False)[-1])
+
+    from dim_reduct import run_pca
+    n_components = .98
+    #pca = run_pca(data['embeddings'], n_components)
+
+    print(data['embeddings'][0].detach().numpy())
 
 
 def load_data(asreview_data_object):
diff --git a/asreviewcontrib/semantic_clustering/update_df.py b/asreviewcontrib/semantic_clustering/update_df.py
deleted file mode 100644
index c925def..0000000
--- a/asreviewcontrib/semantic_clustering/update_df.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# imports
-
-# System
-import os
-
-# data
-import numpy as np
-import pandas as pd
-
-# Self
-from load_data import load_from_json, load_dataframe, load_from_parses
-
-############################# READ ME #############################
-##### This is a file with several functions that was used to ######
-##### recombine a saved dataFrame with titles and abstracts. ######
-###################################################################
-
-def update_titles():
-    """Function to add titles to the kmeans df"""
-
-    # Use bulky loader if we don't have the cord19.json yet
-    cord19_json_path = os.path.join("data", "cord19.json")
-    if not os.path.exists(cord19_json_path):
-        load_from_parses()
-
-    # Load the file from the created json
-    data = load_from_json(cord19_json_path)
-
-    # Load dataframe if csv exists, otherwise create it
-    df = load_dataframe(data)
-    df = df.iloc[:2000,:-1]
-
-    # Load other df
-    kmeans_df_path = os.path.join("data","dataframes","kmeans_df.csv")
-    kmeans_df = pd.read_csv(kmeans_df_path)
-
-    # Retrieve titles
-    titles = []
-    for i, cord_uid in enumerate(kmeans_df['cord_uid']):
-        for df_uid in df.cord_uid.values:
-            if cord_uid == df_uid:
-                title = df[df.cord_uid == df_uid].Title.values.tolist()
-                titles.append(title)
-
-    flatten = [title[0] for title in titles]
-    kmeans_df['Title'] = np.array(flatten)
-    print(kmeans_df.head())
-    print(kmeans_df.columns)
-
-    kmeans_df.to_csv(kmeans_df_path,index=None)
-
-
-def update_abstracts():
-    """Function to add abstracts to the kmeans df"""
-
-    # Use bulky loader if we don't have the cord19.json yet
-    cord19_json_path = os.path.join("data", "cord19.json")
-    if not os.path.exists(cord19_json_path):
-        load_from_parses()
-
-    # Load the file from the created json
-    data = load_from_json(cord19_json_path)
-
-    # Load dataframe if csv exists, otherwise create it
-    df = load_dataframe(data)
-    df = df.iloc[:2000,:-1]
-
-    print(df.head())
-    print(df.columns)
-    print(df.shape)
-    print("\n\n")
-
-    exit()
-
-    # Load other df
-    kmeans_df_path = os.path.join("data","dataframes","kmeans_df.csv")
-    kmeans_df = pd.read_csv(kmeans_df_path)
-
-    print(kmeans_df.head())
-    print(kmeans_df.columns)
-    print(kmeans_df.shape)
-
-    # print()
-    # print(kmeans_df[kmeans_df['cord_uid'] == "6c6cw80p"])
-    # print(kmeans_df[kmeans_df['cord_uid'] == "ug7v899j"])
-
-    abstracts = []
-    for i,cord_uid in enumerate(kmeans_df['cord_uid']):
-        for df_uid in df.cord_uid.values:
-            if cord_uid == df_uid:
-                abstract = df[df.cord_uid == df_uid].Abstract.values.tolist()
-                abstracts.append(abstract)
-    #print(abstracts[:10])
-
-    flatten = [absy[0] for absy in abstracts]
-    kmeans_df['Abstract'] = np.array(flatten)
-    print(kmeans_df.head())
-    print(kmeans_df.columns)
-    print(kmeans_df[kmeans_df.cord_uid == "ug7v899j"].Abstract)
-
-    kmeans_df.to_csv(kmeans_df_path,index=None)
-
-if __name__ == "__main__":
-    #update_abstracts()
-    #update_titles()
-    pass
\ No newline at end of file
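Note on the new embedding step in semantic_clustering.py: the hunk above tokenizes each abstract with encode_plus, keeps the pooled transformer output as the document embedding, and hints at a PCA reduction via the still commented-out run_pca helper from dim_reduct. A minimal standalone sketch of that flow follows; it assumes bert-base-uncased as the encoder (as in the deleted build.py), uses scikit-learn's PCA as a stand-in for run_pca, and feeds placeholder abstracts instead of an ASReview data object.

# Minimal sketch of the embedding + PCA flow introduced in this diff.
# Assumptions: bert-base-uncased as the encoder, sklearn PCA standing in for
# the run_pca helper from dim_reduct, placeholder abstracts as input.
import numpy as np
import torch
from sklearn.decomposition import PCA
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

abstracts = [
    "A placeholder abstract about coronavirus transmission.",
    "A placeholder abstract about screening prioritisation.",
    "A placeholder abstract about clustering scientific papers.",
]

embeddings = []
with torch.no_grad():
    for text in abstracts:
        # mirrors the diff: encode_plus without special tokens, PyTorch tensors
        inputs = tokenizer.encode_plus(
            text,
            padding='longest',
            add_special_tokens=False,
            return_tensors="pt")
        # the last element of the model output is the pooled representation
        pooled = model(**inputs, output_hidden_states=False)[-1]
        embeddings.append(pooled.squeeze(0).numpy())

embeddings = np.stack(embeddings)  # shape: (n_docs, hidden_size)

# keep enough components to explain ~98% of the variance (n_components = .98)
pca = PCA(n_components=0.98)
reduced = pca.fit_transform(embeddings)
print(reduced.shape)

The diff itself only imports run_pca and leaves the call commented out, so the PCA step here is illustrative rather than the plugin's actual reduction code.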