Align with interactive app (#23)
* Revert "Revert "Add the functionality of semantic clusters into one streamlined main file" (#16)"

This reverts commit ff861f2.

* Delete semantic_clusters.gif

* Remove unused files

* Update semantic clustering with PCA

* Update semantic clustering with t-SNE

* Add k_means clustering and finish main pipeline

* Delete temporary_test_file.ipynb

* Update README.md

* Add automatic cluster calculation

* Create a file for interactive to use, adjust interactive and add a command line interface

Big commit, but all connected

* Solve issues with command line interface

* Update semantic clustering with pep8

* Fix optimal clusters feature

* Move entrypoint to main.py

* Add automatic removal of previous data file

* Correct licenses

* Squashed commit of the following:

commit f6383ca
Author: Jelle Teijema <[email protected]>
Date:   Wed Nov 10 10:26:20 2021 +0100

    Cluster the imports

commit d89df8f
Author: Jelle Teijema <[email protected]>
Date:   Wed Nov 10 10:20:25 2021 +0100

    Add an explanation of chosen defaults

commit 12c8644
Merge: 926a1b9 5cdc705
Author: Jelle <[email protected]>
Date:   Tue Nov 9 11:05:01 2021 +0100

    Merge branch 'master' into add-automatic-cluster-calculations

* Update asreviewcontrib/semantic_clustering/__init__.py

Co-authored-by: Jonathan de Bruin <[email protected]>

* Update asreviewcontrib/semantic_clustering/main.py

Co-authored-by: Jonathan de Bruin <[email protected]>

* Formatting of the imports

Co-authored-by: Jonathan de Bruin <[email protected]>
jteijema and J535D165 authored Nov 12, 2021
1 parent 1094184 commit 97a1d1e
Showing 4 changed files with 123 additions and 95 deletions.
2 changes: 2 additions & 0 deletions asreviewcontrib/semantic_clustering/__init__.py
@@ -0,0 +1,2 @@
from asreviewcontrib.semantic_clustering.semantic_clustering import SemanticClustering
from asreviewcontrib.semantic_clustering.interactive import run_app
84 changes: 31 additions & 53 deletions asreviewcontrib/semantic_clustering/interactive.py
@@ -4,146 +4,124 @@
import os

# Data
import numpy as np
import pandas as pd
import pandas as pd

# Dash-y
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Output, Input
from dash import dcc
from dash import html
from dash.dependencies import Input

# Plotly
import plotly.graph_objs as go
import plotly.express as px


def run_app():
"""Function to be called to run the full Dash App"""

# Load DataFrame with clusters
df_path = os.path.join("data","dataframes","kmeans_df.csv")
df_path = os.path.join("data", "kmeans_df.csv")
df = pd.read_csv(df_path)

# Read as STR for discrete colormap
df['cluster_id'] = df['cluster_id'].astype(str)

# Show main figure
#fig = px.scatter(df, x="x", y="y", color="cluster_id", color_discrete_map=px.colors.sequential.Viridis)
fig = px.scatter(df, x="x", y="y", color="cluster_id", color_discrete_sequence=px.colors.qualitative.T10)
fig = px.scatter(df, x="x", y="y", color="cluster_id",
color_discrete_sequence=px.colors.qualitative.Set1)
fig.update_layout(dragmode="pan")
fig.update_layout(xaxis=dict(showticklabels=False, title=""),
fig.update_layout(xaxis=dict(showticklabels=False, title=""),
yaxis=dict(showticklabels=False, ticks="", title=""))
config = dict({'scrollZoom': True, 'displayModeBar':False, 'displaylogo':False})
config = dict(
{'scrollZoom': True, 'displayModeBar': False, 'displaylogo': False})

# Initialize app and do lay-out
app = dash.Dash()
app.layout = html.Div([

# banner div
html.Div([
html.H2("CORD-19: Visualizing Semantic Clusters"),
html.H2("Visualizing Semantic Clusters"),
], className="banner"),

# external css div
html.Div([

# Main semantic cluster graph
html.Div([
dcc.Graph(figure=fig,id="cluster-div",config=config)
], className = "six columns"),
dcc.Graph(figure=fig, id="cluster-div", config=config)
], className="six columns"),

# Div for abstract window
html.Div([
html.H3("Test Title", id="paper-title"),
html.Div([
html.H3("Test title", id="paper-title"),
dcc.Textarea(
readOnly=True,
placeholder='Enter a value...',
value='This is a TextArea component',
style={'width': '100%','height':'300px'},
style={'width': '100%', 'height': '300px'},
id="abstract-div"
)
], className = "six columns"),
)
], className="six columns"),

], className="row"),
])

# Allow global css - use chriddyp's time-tested external css
app.css.config.serve_locally = False
app.css.append_css({
"external_url":"https://codepen.io/chriddyp/pen/bWLwgP.css"
"external_url": "https://codepen.io/chriddyp/pen/bWLwgP.css"
})

############################
##### DEFINE CALLBACKS #####
# ### DEFINE CALLBACKS ### #
############################

# Callback to refresh Abstract window
# Callback to refresh abstract window
@app.callback(dash.dependencies.Output("abstract-div", "value"),
[Input('cluster-div', 'hoverData')])
[Input('cluster-div', 'hoverData')])
def update_abstract(hoverData):

# Fetch df
nonlocal df

# Update graph with hoverData
if hoverData != None:
if hoverData is not None:
hover_dict = hoverData['points'][0]
abstract_idx = hover_dict['pointIndex']

# Set variable for abstract window update
abstract = df['Abstract'].iloc[abstract_idx]
abstract = df['abstract'].iloc[abstract_idx]

# Set hoverData to None again to prevent issues with graph update
hoverData = None
else:
#cord_uid = df['cord_uid'].iloc[0]
abstract = df['Abstract'].iloc[0]
abstract = df['abstract'].iloc[0]

return abstract

# Callback to refresh article title
@app.callback(dash.dependencies.Output("paper-title", "children"),
[Input('cluster-div', 'hoverData')])
[Input('cluster-div', 'hoverData')])
def update_title(hoverData):

# Fetch df
nonlocal df

# Update graph with hoverData
if hoverData != None:
if hoverData is not None:
hover_dict = hoverData['points'][0]
title_idx = hover_dict['pointIndex']

# Set variable for paper title update
title = df['Title'].iloc[title_idx]
title = df['title'].iloc[title_idx]

# Set hoverData to None again to prevent issues with graph update
hoverData = None
else:
title = df['Title'].iloc[0]
title = df['title'].iloc[0]

return title

# Run the application
app.run_server(debug=True)

if __name__ == "__main__":
run_app()

"""Goes into update_abstract function"""
# # Make temp chart - we just want to change title for now
# temp_chart = go.Scatter(
# x = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],
# y = [1,4,9,16,25,36,49,64,81,100,121,144,169,196,225],
# name="abstract-window",
# )

# temp_chart = dcc
# temp_chart = [temp_chart]

# # Change layout so we can change title
# abstract_layout = dict(
# title=f"{cord_uid}"
# )

# abstract_fig = dict(data=temp_chart, layout=abstract_layout)
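The reworked interactive.py follows a simple pattern: load the k-means dataframe written by the pipeline, draw the cluster scatter plot, and refresh the abstract pane from the graph's hoverData. Below is a condensed, illustrative sketch of that pattern. The data/kmeans_df.csv path and the x, y, cluster_id, title and abstract columns come from this commit; the layout and styling details are simplified and are not the shipped app.

import os

import pandas as pd
import plotly.express as px
import dash
from dash import dcc, html
from dash.dependencies import Input, Output

# Load the dataframe written by the clustering pipeline
df = pd.read_csv(os.path.join("data", "kmeans_df.csv"))
df["cluster_id"] = df["cluster_id"].astype(str)  # string labels give a discrete colormap

fig = px.scatter(df, x="x", y="y", color="cluster_id",
                 color_discrete_sequence=px.colors.qualitative.Set1)

app = dash.Dash(__name__)
app.layout = html.Div([
    dcc.Graph(figure=fig, id="cluster-div"),
    dcc.Textarea(id="abstract-div", readOnly=True, style={"width": "100%"}),
])

@app.callback(Output("abstract-div", "value"),
              [Input("cluster-div", "hoverData")])
def update_abstract(hoverData):
    # hoverData carries the index of the hovered point; fall back to the first row
    idx = hoverData["points"][0]["pointIndex"] if hoverData else 0
    return df["abstract"].iloc[idx]

if __name__ == "__main__":
    app.run_server(debug=True)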
52 changes: 52 additions & 0 deletions asreviewcontrib/semantic_clustering/main.py
@@ -0,0 +1,52 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Path: asreviewcontrib\semantic_clustering\main.py

# Environment imports
import sys
import getopt

from asreview.data import ASReviewData

from asreviewcontrib.semantic_clustering.interactive import run_app
from asreviewcontrib.semantic_clustering.semantic_clustering import SemanticClustering


def main(argv):
filepath = ""

try:
opts, args = getopt.getopt(
argv, "htf:a", ["help", "testfile", "filepath=", "app"])
except getopt.GetoptError:
print('Please use the following format:')
print('test.py -f <filepath>')
print('test.py --testfile')
print('test.py --app')
sys.exit(2)
for opt, arg in opts:
if opt in ("-h", "--help"):
print('test.py -f <filepath> or --testfile')
sys.exit()
elif opt in ("-f", "--filepath"):
filepath = arg
elif opt in ("-t", "--testfile"):
filepath = "https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/datasets/van_de_Schoot_2017/output/van_de_Schoot_2017.csv"
elif opt in ("-a", "--app"):
run_app()
sys.exit(1)
print('Running from file: ', filepath)

# check if arguments are empty
if filepath == "":
print('Please use the following format:')
print('test.py -f <filepath>')
print('test.py --testfile')
print('test.py --app')
sys.exit(2)

SemanticClustering(ASReviewData.from_file(filepath))


if __name__ == "__main__":
main(sys.argv[1:])
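The same pipeline can also be driven without the getopt front end. A minimal programmatic sketch, assuming the extension and its dependencies are installed; the CSV filename is a placeholder.

from asreview.data import ASReviewData
from asreviewcontrib.semantic_clustering import SemanticClustering, run_app

# Equivalent to "main.py -f my_dataset.csv": cluster the records and
# write data/kmeans_df.csv for the dashboard.
SemanticClustering(ASReviewData.from_file("my_dataset.csv"))

# Equivalent to "main.py --app": start the interactive Dash dashboard.
run_app()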
80 changes: 38 additions & 42 deletions asreviewcontrib/semantic_clustering/semantic_clustering.py
@@ -1,39 +1,22 @@
# Copyright 2021 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Path: asreviewcontrib\semantic_clustering\semantic_clustering.py

# Environment imports
import os
from tqdm import tqdm

# Calculation imports
import numpy as np
from sklearn.cluster import KMeans
from numpy.linalg import norm
import numpy as np

# Transformer imports
from transformers import AutoTokenizer, AutoModel
from transformers import logging

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

# Local imports
from dim_reduct import run_pca
from dim_reduct import t_sne
from clustering import run_KMeans
from asreview.data import ASReviewData

# Setting environment
logging.set_verbosity_error()
@@ -43,13 +26,18 @@

def SemanticClustering(asreview_data_object):

# if data folder exists, delete it
if os.path.exists("data"):
print("data folder exists, deleting...")
os.system("del /F /Q data")

# load data
print("Loading data...")
data = load_data(asreview_data_object)
data = _load_data(asreview_data_object)

# since processing the data can take a long time, for now the data is cut
# down to decrease test duration. This will be removed in future versions
data = data.iloc[:30, :]
# data = data.iloc[:30, :]

# load scibert transformer
print("Loading scibert transformer...")
@@ -90,7 +78,7 @@ def SemanticClustering(asreview_data_object):

# calculate optimal number of clusters
print("Calculating optimal number of clusters...")
n_clusters = calc_optimal_n_clusters(tsne)
n_clusters = _calc_optimal_n_clusters(tsne)
print("Optimal number of clusters: ", n_clusters)

# run k-means. n_init is set to 10, this indicated the amount of restarts
@@ -100,31 +88,45 @@

# visualize clusters
print("Visualizing clusters...")
tsne_data = [tsne[:, 0], tsne[:, 1]]
visualize_clusters(tsne_data, labels)
_visualize_clusters(tsne, labels)

# create file for use in interactive dashboard
_create_file(data, tsne, labels)

# Create functional dataframe and store to file for use in interactive
def _create_file(data, coords, labels):
data['x'] = coords[:, 0]
data['y'] = coords[:, 1]
data['cluster_id'] = labels

if not os.path.exists("data"):
os.makedirs("data")

kmeans_df_path = os.path.join("data", "kmeans_df.csv")
data.to_csv(kmeans_df_path, index=None)


# Calculate the optimal amount of clusters. It checks the inertia for 1 to 25
# clusters, and picks the optimal inertia based on an elbow graph and some cool
# trigonometry.
def calc_optimal_n_clusters(features):
def _calc_optimal_n_clusters(features):

Sum_of_squared_distances = []
sum_of_squared_distances = []

K = range(1, 25)
for k in K:
km = KMeans(n_clusters=k)
km = km.fit(features)
Sum_of_squared_distances.append(km.inertia_)
sum_of_squared_distances.append(km.inertia_)

max = 0
clusters = 1

for i in K:
p1 = np.asarray((Sum_of_squared_distances[0], 1))
p1 = np.asarray((sum_of_squared_distances[0], 1))
p2 = np.asarray(
(Sum_of_squared_distances[-1], (len(Sum_of_squared_distances)+1)))
p3 = np.asarray((Sum_of_squared_distances[i-1], i))
(sum_of_squared_distances[-1], (len(sum_of_squared_distances)+1)))
p3 = np.asarray((sum_of_squared_distances[i-1], i))

m = np.cross(p2-p1, p3-p1)/norm(p2-p1)

@@ -134,15 +136,14 @@ def calc_optimal_n_clusters(features):

return clusters


def visualize_clusters(data, labels):
def _visualize_clusters(tsne, labels):
fig, ax = plt.subplots()
ax.set_title("semantic clustering")
ax.set_xlabel("t-SNE Component 1")
ax.set_ylabel("t-SNE Component 2")

x = data[0]
y = data[1]
x = tsne[:, 0]
y = tsne[:, 1]

# Do actual plotting and save image
ax.scatter(x, y, c=labels, cmap="Set3")
@@ -153,7 +154,7 @@ def visualize_clusters(data, labels):
fig.savefig(img_path)


def load_data(asreview_data_object):
def _load_data(asreview_data_object):

# extract title and abstract, drop empty abstracts and reset index
data = asreview_data_object.df[['title', 'abstract']].copy()
@@ -162,8 +163,3 @@ def load_data(asreview_data_object):
data = data.reset_index(drop=True)

return data


if __name__ == "__main__":
filepath = "https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/datasets/van_de_Schoot_2017/output/van_de_Schoot_2017.csv"
SemanticClustering(ASReviewData.from_file(filepath))
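For reference, the renamed _calc_optimal_n_clusters picks the elbow of the k-means inertia curve geometrically: each candidate k is scored by the perpendicular distance from the point (inertia_k, k) to the straight line joining the first and last points of the curve, and the k with the largest distance wins. A self-contained sketch of that heuristic, using make_blobs data as a stand-in for the real t-SNE coordinates:

import numpy as np
from numpy.linalg import norm
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Placeholder features; in the plugin these are the 2-D t-SNE coordinates.
features, _ = make_blobs(n_samples=300, centers=4, random_state=0)

# Inertia (sum of squared distances) for k = 1..24, as in the plugin.
K = range(1, 25)
inertias = [KMeans(n_clusters=k, n_init=10).fit(features).inertia_ for k in K]

# Elbow: the point on the curve farthest from the line between its endpoints.
p1 = np.asarray((inertias[0], 1))
p2 = np.asarray((inertias[-1], len(inertias)))
best_k, best_dist = 1, 0.0
for i in K:
    p3 = np.asarray((inertias[i - 1], i))
    dist = abs(np.cross(p2 - p1, p3 - p1)) / norm(p2 - p1)
    if dist > best_dist:
        best_k, best_dist = i, dist

print("optimal number of clusters:", best_k)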
