diff --git a/asreviewcontrib/semantic_clustering/__init__.py b/asreviewcontrib/semantic_clustering/__init__.py
new file mode 100644
index 0000000..e0d6e9d
--- /dev/null
+++ b/asreviewcontrib/semantic_clustering/__init__.py
@@ -0,0 +1,2 @@
+from asreviewcontrib.semantic_clustering.semantic_clustering import SemanticClustering
+from asreviewcontrib.semantic_clustering.interactive import run_app
diff --git a/asreviewcontrib/semantic_clustering/interactive.py b/asreviewcontrib/semantic_clustering/interactive.py
index 53acfd9..3fee2ce 100644
--- a/asreviewcontrib/semantic_clustering/interactive.py
+++ b/asreviewcontrib/semantic_clustering/interactive.py
@@ -4,36 +4,36 @@ import os
 
 # Data
-import numpy as np
-import pandas as pd
+import pandas as pd
 
 # Dash-y
 import dash
-import dash_core_components as dcc
-import dash_html_components as html
-from dash.dependencies import Output, Input
+from dash import dcc
+from dash import html
+from dash.dependencies import Input
 
 # Plotly
-import plotly.graph_objs as go
 import plotly.express as px
 
+
 def run_app():
     """Function to be called to run the full Dash App"""
-    
+
     # Load DataFrame with clusters
-    df_path = os.path.join("data","dataframes","kmeans_df.csv")
+    df_path = os.path.join("data", "kmeans_df.csv")
     df = pd.read_csv(df_path)
 
     # Read as STR for discrete colormap
     df['cluster_id'] = df['cluster_id'].astype(str)
 
     # Show main figure
-    #fig = px.scatter(df, x="x", y="y", color="cluster_id", color_discrete_map=px.colors.sequential.Viridis)
-    fig = px.scatter(df, x="x", y="y", color="cluster_id", color_discrete_sequence=px.colors.qualitative.T10)
+    fig = px.scatter(df, x="x", y="y", color="cluster_id",
+                     color_discrete_sequence=px.colors.qualitative.Set1)
     fig.update_layout(dragmode="pan")
-    fig.update_layout(xaxis=dict(showticklabels=False, title=""), 
+    fig.update_layout(xaxis=dict(showticklabels=False, title=""),
                       yaxis=dict(showticklabels=False, ticks="", title=""))
-    config = dict({'scrollZoom': True, 'displayModeBar':False, 'displaylogo':False})
+    config = dict(
+        {'scrollZoom': True, 'displayModeBar': False, 'displaylogo': False})
 
     # Initialize app and do lay-out
     app = dash.Dash()
@@ -41,7 +41,7 @@ def run_app():
 
         # banner div
         html.Div([
-            html.H2("CORD-19: Visualizing Semantic Clusters"),
+            html.H2("Visualizing Semantic Clusters"),
         ], className="banner"),
 
         # external css div
@@ -49,20 +49,20 @@
 
             # Main semantic cluster graph
             html.Div([
-                dcc.Graph(figure=fig,id="cluster-div",config=config)
-            ], className = "six columns"),
+                dcc.Graph(figure=fig, id="cluster-div", config=config)
+            ], className="six columns"),
 
             # Div for abstract window
-            html.Div([ 
-                html.H3("Test Title", id="paper-title"),
+            html.Div([
+                html.H3("Test title", id="paper-title"),
                 dcc.Textarea(
                     readOnly=True,
                     placeholder='Enter a value...',
                     value='This is a TextArea component',
-                    style={'width': '100%','height':'300px'},
+                    style={'width': '100%', 'height': '300px'},
                     id="abstract-div"
-                ) 
+                )
-            ], className = "six columns"),
+            ], className="six columns"),
 
         ], className="row"),
     ])
@@ -70,80 +70,58 @@ def run_app():
 
     # Allow global css - use chriddyp's time-tested external css
     app.css.config.serve_locally = False
     app.css.append_css({
-        "external_url":"https://codepen.io/chriddyp/pen/bWLwgP.css"
+        "external_url": "https://codepen.io/chriddyp/pen/bWLwgP.css"
     })
 
     ############################
-    ##### DEFINE CALLBACKS #####
+    # ### DEFINE CALLBACKS ### #
    ############################
 
-    # Callback to refresh Abstract window
+    # Callback to refresh abstract window
    @app.callback(dash.dependencies.Output("abstract-div", "value"),
-                 [Input('cluster-div', 'hoverData')])
+                  [Input('cluster-div', 'hoverData')])
    def update_abstract(hoverData):
 
        # Fetch df
        nonlocal df
 
        # Update graph with hoverData
-        if hoverData != None:
+        if hoverData is not None:
            hover_dict = hoverData['points'][0]
            abstract_idx = hover_dict['pointIndex']
 
            # Set variable for abstract window update
-            abstract = df['Abstract'].iloc[abstract_idx]
+            abstract = df['abstract'].iloc[abstract_idx]
 
            # Set hoverData to None again to prevent issues with graph update
            hoverData = None
        else:
-            #cord_uid = df['cord_uid'].iloc[0]
-            abstract = df['Abstract'].iloc[0]
+            abstract = df['abstract'].iloc[0]
 
        return abstract
 
    # Callback to refresh article title
    @app.callback(dash.dependencies.Output("paper-title", "children"),
-                 [Input('cluster-div', 'hoverData')])
+                  [Input('cluster-div', 'hoverData')])
    def update_title(hoverData):
 
        # Fetch df
        nonlocal df
 
        # Update graph with hoverData
-        if hoverData != None:
+        if hoverData is not None:
            hover_dict = hoverData['points'][0]
            title_idx = hover_dict['pointIndex']
 
            # Set variable for paper title update
-            title = df['Title'].iloc[title_idx]
+            title = df['title'].iloc[title_idx]
 
            # Set hoverData to None again to prevent issues with graph update
            hoverData = None
        else:
-            title = df['Title'].iloc[0]
+            title = df['title'].iloc[0]
 
        return title
 
    # Run the application
    app.run_server(debug=True)
-
-if __name__ == "__main__":
-    run_app()
-
-"""Goes into update_abstract function"""
-# # Make temp chart - we just want to change title for now
-# temp_chart = go.Scatter(
-#     x = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],
-#     y = [1,4,9,16,25,36,49,64,81,100,121,144,169,196,225],
-#     name="abstract-window",
-# )
-
-# temp_chart = dcc
-# temp_chart = [temp_chart]
-
-# # Change layout so we can change title
-# abstract_layout = dict(
-#     title=f"{cord_uid}"
-# )
-
-# abstract_fig = dict(data=temp_chart, layout=abstract_layout)
\ No newline at end of file
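With this change the dashboard reads `data/kmeans_df.csv` instead of `data/dataframes/kmeans_df.csv`, and the callbacks index the lower-cased `title` and `abstract` columns. A minimal sketch of the data contract the app now expects, standalone and outside Dash (it assumes the CSV has already been written by `SemanticClustering`):

```python
import os

import pandas as pd
import plotly.express as px

# The file written by _create_file(): one row per record with at least
# x, y, cluster_id, title and abstract columns.
df = pd.read_csv(os.path.join("data", "kmeans_df.csv"))

# cluster_id is treated as a string so the colormap stays discrete,
# exactly as run_app() does before building the figure.
df["cluster_id"] = df["cluster_id"].astype(str)

fig = px.scatter(df, x="x", y="y", color="cluster_id",
                 color_discrete_sequence=px.colors.qualitative.Set1,
                 hover_data=["title"])
fig.show()
```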
diff --git a/asreviewcontrib/semantic_clustering/main.py b/asreviewcontrib/semantic_clustering/main.py
new file mode 100644
index 0000000..d4f8262
--- /dev/null
+++ b/asreviewcontrib/semantic_clustering/main.py
@@ -0,0 +1,52 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Path: asreviewcontrib\semantic_clustering\main.py
+
+# Environment imports
+import sys
+import getopt
+
+from asreview.data import ASReviewData
+
+from asreviewcontrib.semantic_clustering.interactive import run_app
+from asreviewcontrib.semantic_clustering.semantic_clustering import SemanticClustering
+
+
+def main(argv):
+    filepath = ""
+
+    try:
+        opts, args = getopt.getopt(
+            argv, "htf:a", ["help", "testfile", "filepath=", "app"])
+    except getopt.GetoptError:
+        print('Please use the following format:')
+        print('test.py -f <filepath>')
+        print('test.py --testfile')
+        print('test.py --app')
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt in ("-h", "--help"):
+            print('test.py -f <filepath> or --testfile')
+            sys.exit()
+        elif opt in ("-f", "--filepath"):
+            filepath = arg
+        elif opt in ("-t", "--testfile"):
+            filepath = "https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/datasets/van_de_Schoot_2017/output/van_de_Schoot_2017.csv"
+        elif opt in ("-a", "--app"):
+            run_app()
+            sys.exit(1)
+    print('Running from file: ', filepath)
+
+    # check if arguments are empty
+    if filepath == "":
+        print('Please use the following format:')
+        print('test.py -f <filepath>')
+        print('test.py --testfile')
+        print('test.py --app')
+        sys.exit(2)
+
+    SemanticClustering(ASReviewData.from_file(filepath))
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
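main.py is the new command-line entry point: `-f/--filepath` runs the clustering on a dataset, `-t/--testfile` points it at the van de Schoot test dataset, and `-a/--app` serves the dashboard. The same two steps can also be driven from Python; a short sketch using only names imported by main.py (the URL is the `--testfile` dataset):

```python
from asreview.data import ASReviewData

from asreviewcontrib.semantic_clustering.semantic_clustering import SemanticClustering
from asreviewcontrib.semantic_clustering.interactive import run_app

# Equivalent of `main.py -f <filepath>`: embed, cluster and write data/kmeans_df.csv.
filepath = ("https://raw.githubusercontent.com/asreview/systematic-review-datasets/"
            "master/datasets/van_de_Schoot_2017/output/van_de_Schoot_2017.csv")
SemanticClustering(ASReviewData.from_file(filepath))

# Equivalent of `main.py --app`: serve the Dash dashboard on the clustered data.
run_app()
```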
__name__ == "__main__": + main(sys.argv[1:]) diff --git a/asreviewcontrib/semantic_clustering/semantic_clustering.py b/asreviewcontrib/semantic_clustering/semantic_clustering.py index 413fba2..176b2bd 100644 --- a/asreviewcontrib/semantic_clustering/semantic_clustering.py +++ b/asreviewcontrib/semantic_clustering/semantic_clustering.py @@ -1,39 +1,22 @@ -# Copyright 2021 The ASReview Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +#!/usr/bin/python +# -*- coding: utf-8 -*- +# Path: asreviewcontrib\semantic_clustering\semantic_clustering.py # Environment imports import os from tqdm import tqdm - -# Calculation imports +import numpy as np from sklearn.cluster import KMeans from numpy.linalg import norm -import numpy as np -# Transformer imports from transformers import AutoTokenizer, AutoModel from transformers import logging - -# Visualization imports import matplotlib.pyplot as plt import seaborn as sns -# Local imports from dim_reduct import run_pca from dim_reduct import t_sne from clustering import run_KMeans -from asreview.data import ASReviewData # Setting environment logging.set_verbosity_error() @@ -43,13 +26,18 @@ def SemanticClustering(asreview_data_object): + # if data folder exists, delete it + if os.path.exists("data"): + print("data folder exists, deleting...") + os.system("del /F /Q data") + # load data print("Loading data...") - data = load_data(asreview_data_object) + data = _load_data(asreview_data_object) # since processing the data can take a long time, for now the data is cut # down to decrease test duration. This will be removed in future versions - data = data.iloc[:30, :] + # data = data.iloc[:30, :] # load scibert transformer print("Loading scibert transformer...") @@ -90,7 +78,7 @@ def SemanticClustering(asreview_data_object): # calculate optimal number of clusters print("Calculating optimal number of clusters...") - n_clusters = calc_optimal_n_clusters(tsne) + n_clusters = _calc_optimal_n_clusters(tsne) print("Optimal number of clusters: ", n_clusters) # run k-means. n_init is set to 10, this indicated the amount of restarts @@ -100,31 +88,45 @@ def SemanticClustering(asreview_data_object): # visualize clusters print("Visualizing clusters...") - tsne_data = [tsne[:, 0], tsne[:, 1]] - visualize_clusters(tsne_data, labels) + _visualize_clusters(tsne, labels) + + # create file for use in interactive dashboard + _create_file(data, tsne, labels) + +# Create functional dataframe and store to file for use in interactive +def _create_file(data, coords, labels): + data['x'] = coords[:, 0] + data['y'] = coords[:, 1] + data['cluster_id'] = labels + + if not os.path.exists("data"): + os.makedirs("data") + + kmeans_df_path = os.path.join("data", "kmeans_df.csv") + data.to_csv(kmeans_df_path, index=None) # Calculate the optimal amount of clusters. It checks the inertia for 1 to 25 # clusters, and picks the optimal inertia based on an elbow graph and some cool # trigonometry. 
@@ -90,7 +78,7 @@ def SemanticClustering(asreview_data_object):
 
     # calculate optimal number of clusters
     print("Calculating optimal number of clusters...")
-    n_clusters = calc_optimal_n_clusters(tsne)
+    n_clusters = _calc_optimal_n_clusters(tsne)
     print("Optimal number of clusters: ", n_clusters)
 
     # run k-means. n_init is set to 10, this indicated the amount of restarts
@@ -100,31 +88,45 @@
 
     # visualize clusters
     print("Visualizing clusters...")
-    tsne_data = [tsne[:, 0], tsne[:, 1]]
-    visualize_clusters(tsne_data, labels)
+    _visualize_clusters(tsne, labels)
+
+    # create file for use in interactive dashboard
+    _create_file(data, tsne, labels)
+
+# Create functional dataframe and store to file for use in interactive
+def _create_file(data, coords, labels):
+    data['x'] = coords[:, 0]
+    data['y'] = coords[:, 1]
+    data['cluster_id'] = labels
+
+    if not os.path.exists("data"):
+        os.makedirs("data")
+
+    kmeans_df_path = os.path.join("data", "kmeans_df.csv")
+    data.to_csv(kmeans_df_path, index=None)
 
 
 # Calculate the optimal amount of clusters. It checks the inertia for 1 to 25
 # clusters, and picks the optimal inertia based on an elbow graph and some cool
 # trigonometry.
-def calc_optimal_n_clusters(features):
+def _calc_optimal_n_clusters(features):
 
-    Sum_of_squared_distances = []
+    sum_of_squared_distances = []
     K = range(1, 25)
     for k in K:
         km = KMeans(n_clusters=k)
         km = km.fit(features)
-        Sum_of_squared_distances.append(km.inertia_)
+        sum_of_squared_distances.append(km.inertia_)
 
     max = 0
     clusters = 1
 
     for i in K:
-        p1 = np.asarray((Sum_of_squared_distances[0], 1))
+        p1 = np.asarray((sum_of_squared_distances[0], 1))
         p2 = np.asarray(
-            (Sum_of_squared_distances[-1], (len(Sum_of_squared_distances)+1)))
-        p3 = np.asarray((Sum_of_squared_distances[i-1], i))
+            (sum_of_squared_distances[-1], (len(sum_of_squared_distances)+1)))
+        p3 = np.asarray((sum_of_squared_distances[i-1], i))
 
         m = np.cross(p2-p1, p3-p1)/norm(p2-p1)
 
@@ -134,15 +136,14 @@
     return clusters
 
-
-def visualize_clusters(data, labels):
+def _visualize_clusters(tsne, labels):
 
     fig, ax = plt.subplots()
     ax.set_title("semantic clustering")
     ax.set_xlabel("t-SNE Component 1")
     ax.set_ylabel("t-SNE Component 2")
 
-    x = data[0]
-    y = data[1]
+    x = tsne[:, 0]
+    y = tsne[:, 1]
 
     # Do actual plotting and save image
     ax.scatter(x, y, c=labels, cmap="Set3")
@@ -153,7 +154,7 @@
     fig.savefig(img_path)
 
 
-def load_data(asreview_data_object):
+def _load_data(asreview_data_object):
 
     # extract title and abstract, drop empty abstracts and reset index
     data = asreview_data_object.df[['title', 'abstract']].copy()
@@ -162,8 +163,3 @@
     data = data.reset_index(drop=True)
 
     return data
-
-
-if __name__ == "__main__":
-    filepath = "https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/datasets/van_de_Schoot_2017/output/van_de_Schoot_2017.csv"
-    SemanticClustering(ASReviewData.from_file(filepath))
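The "cool trigonometry" in `_calc_optimal_n_clusters` is an elbow heuristic: for every candidate k it measures how far the point `(inertia_k, k)` lies from the straight line joining the first and last points of the inertia curve, and keeps the k with the largest perpendicular distance. A self-contained sketch of that idea (the function name, `n_init` value, and exact endpoint choice are illustrative, not taken from the patch):

```python
import numpy as np
from numpy.linalg import norm
from sklearn.cluster import KMeans


def elbow_k(features, max_k=24):
    """Pick the number of clusters at the 'elbow' of the inertia curve."""
    inertias = [KMeans(n_clusters=k, n_init=10).fit(features).inertia_
                for k in range(1, max_k + 1)]

    # Straight line through the first and last (inertia, k) points.
    p1 = np.asarray((inertias[0], 1))
    p2 = np.asarray((inertias[-1], max_k))

    best_k, best_dist = 1, 0.0
    for k in range(1, max_k + 1):
        p3 = np.asarray((inertias[k - 1], k))
        # |cross product| / |p2 - p1| is the perpendicular distance from p3
        # to the line through p1 and p2; the largest distance marks the elbow.
        dist = abs(np.cross(p2 - p1, p3 - p1)) / norm(p2 - p1)
        if dist > best_dist:
            best_k, best_dist = k, dist
    return best_k
```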