Align with interactive app (#23)
* Revert "Revert "Add the functionality of semantic clusters into one streamlined main file" (#16)"

This reverts commit ff861f2.

* Delete semantic_clusters.gif

* Remove unused files

* Update semantic clustering with PCA

* Update semantic clustering with t-SNE

* Add k_means clustering and finish main pipeline

* Delete temporary_test_file.ipynb

* Update README.md

* Add automatic cluster calculation

* Create a file for interactive to use, adjust interactive and add a command line interface

Big commit, but all connected

* Solve issues with command line interface

* Update semantic clustering with pep8

* Fix optimal clusters feature

* Move entrypoint to main.py

* Add automatic removal of previous data file

* Correct licenses

* Squashed commit of the following:

commit f6383ca
Author: Jelle Teijema <[email protected]>
Date:   Wed Nov 10 10:26:20 2021 +0100

    Cluster the imports

commit d89df8f
Author: Jelle Teijema <[email protected]>
Date:   Wed Nov 10 10:20:25 2021 +0100

    Add an explanation of chosen defaults

commit 12c8644
Merge: 926a1b9 5cdc705
Author: Jelle <[email protected]>
Date:   Tue Nov 9 11:05:01 2021 +0100

    Merge branch 'master' into add-automatic-cluster-calculations

* Update asreviewcontrib/semantic_clustering/__init__.py

Co-authored-by: Jonathan de Bruin <[email protected]>

* Update asreviewcontrib/semantic_clustering/main.py

Co-authored-by: Jonathan de Bruin <[email protected]>

* Formatting of the imports

Co-authored-by: Jonathan de Bruin <[email protected]>
jteijema and J535D165 authored Nov 12, 2021
1 parent 1094184 commit 97a1d1e
Showing 4 changed files with 123 additions and 95 deletions.
2 changes: 2 additions & 0 deletions asreviewcontrib/semantic_clustering/__init__.py
@@ -0,0 +1,2 @@
from asreviewcontrib.semantic_clustering.semantic_clustering import SemanticClustering
from asreviewcontrib.semantic_clustering.interactive import run_app
84 changes: 31 additions & 53 deletions asreviewcontrib/semantic_clustering/interactive.py
@@ -4,146 +4,124 @@
import os

# Data
import numpy as np
import pandas as pd
import pandas as pd

# Dash-y
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Output, Input
from dash import dcc
from dash import html
from dash.dependencies import Input

# Plotly
import plotly.graph_objs as go
import plotly.express as px


def run_app():
"""Function to be called to run the full Dash App"""

# Load DataFrame with clusters
df_path = os.path.join("data","dataframes","kmeans_df.csv")
df_path = os.path.join("data", "kmeans_df.csv")
df = pd.read_csv(df_path)

# Read as STR for discrete colormap
df['cluster_id'] = df['cluster_id'].astype(str)

# Show main figure
#fig = px.scatter(df, x="x", y="y", color="cluster_id", color_discrete_map=px.colors.sequential.Viridis)
fig = px.scatter(df, x="x", y="y", color="cluster_id", color_discrete_sequence=px.colors.qualitative.T10)
fig = px.scatter(df, x="x", y="y", color="cluster_id",
color_discrete_sequence=px.colors.qualitative.Set1)
fig.update_layout(dragmode="pan")
fig.update_layout(xaxis=dict(showticklabels=False, title=""),
fig.update_layout(xaxis=dict(showticklabels=False, title=""),
yaxis=dict(showticklabels=False, ticks="", title=""))
config = dict({'scrollZoom': True, 'displayModeBar':False, 'displaylogo':False})
config = dict(
{'scrollZoom': True, 'displayModeBar': False, 'displaylogo': False})

# Initialize app and do lay-out
app = dash.Dash()
app.layout = html.Div([

# banner div
html.Div([
html.H2("CORD-19: Visualizing Semantic Clusters"),
html.H2("Visualizing Semantic Clusters"),
], className="banner"),

# external css div
html.Div([

# Main semantic cluster graph
html.Div([
dcc.Graph(figure=fig,id="cluster-div",config=config)
], className = "six columns"),
dcc.Graph(figure=fig, id="cluster-div", config=config)
], className="six columns"),

# Div for abstract window
html.Div([
html.H3("Test Title", id="paper-title"),
html.Div([
html.H3("Test title", id="paper-title"),
dcc.Textarea(
readOnly=True,
placeholder='Enter a value...',
value='This is a TextArea component',
style={'width': '100%','height':'300px'},
style={'width': '100%', 'height': '300px'},
id="abstract-div"
)
], className = "six columns"),
)
], className="six columns"),

], className="row"),
])

# Allow global css - use chriddyp's time-tested external css
app.css.config.serve_locally = False
app.css.append_css({
"external_url":"https://codepen.io/chriddyp/pen/bWLwgP.css"
"external_url": "https://codepen.io/chriddyp/pen/bWLwgP.css"
})

############################
##### DEFINE CALLBACKS #####
# ### DEFINE CALLBACKS ### #
############################

# Callback to refresh Abstract window
# Callback to refresh abstract window
@app.callback(dash.dependencies.Output("abstract-div", "value"),
[Input('cluster-div', 'hoverData')])
[Input('cluster-div', 'hoverData')])
def update_abstract(hoverData):

# Fetch df
nonlocal df

# Update graph with hoverData
if hoverData != None:
if hoverData is not None:
hover_dict = hoverData['points'][0]
abstract_idx = hover_dict['pointIndex']

# Set variable for abstract window update
abstract = df['Abstract'].iloc[abstract_idx]
abstract = df['abstract'].iloc[abstract_idx]

# Set hoverData to None again to prevent issues with graph update
hoverData = None
else:
#cord_uid = df['cord_uid'].iloc[0]
abstract = df['Abstract'].iloc[0]
abstract = df['abstract'].iloc[0]

return abstract

# Callback to refresh article title
@app.callback(dash.dependencies.Output("paper-title", "children"),
[Input('cluster-div', 'hoverData')])
[Input('cluster-div', 'hoverData')])
def update_title(hoverData):

# Fetch df
nonlocal df

# Update graph with hoverData
if hoverData != None:
if hoverData is not None:
hover_dict = hoverData['points'][0]
title_idx = hover_dict['pointIndex']

# Set variable for paper title update
title = df['Title'].iloc[title_idx]
title = df['title'].iloc[title_idx]

# Set hoverData to None again to prevent issues with graph update
hoverData = None
else:
title = df['Title'].iloc[0]
title = df['title'].iloc[0]

return title

# Run the application
app.run_server(debug=True)

if __name__ == "__main__":
run_app()

"""Goes into update_abstract function"""
# # Make temp chart - we just want to change title for now
# temp_chart = go.Scatter(
# x = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],
# y = [1,4,9,16,25,36,49,64,81,100,121,144,169,196,225],
# name="abstract-window",
# )

# temp_chart = dcc
# temp_chart = [temp_chart]

# # Change layout so we can change title
# abstract_layout = dict(
# title=f"{cord_uid}"
# )

# abstract_fig = dict(data=temp_chart, layout=abstract_layout)
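The reworked interactive.py follows a simple pattern: load the k-means dataframe written by the pipeline, draw the cluster scatter plot, and refresh the abstract pane from the graph's hoverData. Below is a condensed, illustrative sketch of that pattern. The data/kmeans_df.csv path and the x, y, cluster_id, title and abstract columns come from this commit; the layout and styling details are simplified and are not the shipped app.

import os

import pandas as pd
import plotly.express as px
import dash
from dash import dcc, html
from dash.dependencies import Input, Output

# Load the dataframe written by the clustering pipeline
df = pd.read_csv(os.path.join("data", "kmeans_df.csv"))
df["cluster_id"] = df["cluster_id"].astype(str)  # string labels give a discrete colormap

fig = px.scatter(df, x="x", y="y", color="cluster_id",
                 color_discrete_sequence=px.colors.qualitative.Set1)

app = dash.Dash(__name__)
app.layout = html.Div([
    dcc.Graph(figure=fig, id="cluster-div"),
    dcc.Textarea(id="abstract-div", readOnly=True, style={"width": "100%"}),
])

@app.callback(Output("abstract-div", "value"),
              [Input("cluster-div", "hoverData")])
def update_abstract(hoverData):
    # hoverData carries the index of the hovered point; fall back to the first row
    idx = hoverData["points"][0]["pointIndex"] if hoverData else 0
    return df["abstract"].iloc[idx]

if __name__ == "__main__":
    app.run_server(debug=True)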
52 changes: 52 additions & 0 deletions asreviewcontrib/semantic_clustering/main.py
@@ -0,0 +1,52 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Path: asreviewcontrib\semantic_clustering\main.py

# Environment imports
import sys
import getopt

from asreview.data import ASReviewData

from asreviewcontrib.semantic_clustering.interactive import run_app
from asreviewcontrib.semantic_clustering.semantic_clustering import SemanticClustering


def main(argv):
filepath = ""

try:
opts, args = getopt.getopt(
argv, "htf:a", ["help", "testfile", "filepath=", "app"])
except getopt.GetoptError:
print('Please use the following format:')
print('test.py -f <filepath>')
print('test.py --testfile')
print('test.py --app')
sys.exit(2)
for opt, arg in opts:
if opt in ("-h", "--help"):
print('test.py -f <filepath> or --testfile')
sys.exit()
elif opt in ("-f", "--filepath"):
filepath = arg
elif opt in ("-t", "--testfile"):
filepath = "https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/datasets/van_de_Schoot_2017/output/van_de_Schoot_2017.csv"
elif opt in ("-a", "--app"):
run_app()
sys.exit(1)
print('Running from file: ', filepath)

# check if arguments are empty
if filepath == "":
print('Please use the following format:')
print('test.py -f <filepath>')
print('test.py --testfile')
print('test.py --app')
sys.exit(2)

SemanticClustering(ASReviewData.from_file(filepath))


if __name__ == "__main__":
main(sys.argv[1:])
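The same pipeline can also be driven without the getopt front end. A minimal programmatic sketch, assuming the extension and its dependencies are installed; the CSV filename is a placeholder.

from asreview.data import ASReviewData
from asreviewcontrib.semantic_clustering import SemanticClustering, run_app

# Equivalent to "main.py -f my_dataset.csv": cluster the records and
# write data/kmeans_df.csv for the dashboard.
SemanticClustering(ASReviewData.from_file("my_dataset.csv"))

# Equivalent to "main.py --app": start the interactive Dash dashboard.
run_app()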
80 changes: 38 additions & 42 deletions asreviewcontrib/semantic_clustering/semantic_clustering.py
@@ -1,39 +1,22 @@
# Copyright 2021 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Path: asreviewcontrib\semantic_clustering\semantic_clustering.py

# Environment imports
import os
from tqdm import tqdm

# Calculation imports
import numpy as np
from sklearn.cluster import KMeans
from numpy.linalg import norm
import numpy as np

# Transformer imports
from transformers import AutoTokenizer, AutoModel
from transformers import logging

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

# Local imports
from dim_reduct import run_pca
from dim_reduct import t_sne
from clustering import run_KMeans
from asreview.data import ASReviewData

# Setting environment
logging.set_verbosity_error()
@@ -43,13 +26,18 @@

def SemanticClustering(asreview_data_object):

# if data folder exists, delete it
if os.path.exists("data"):
print("data folder exists, deleting...")
os.system("del /F /Q data")

# load data
print("Loading data...")
data = load_data(asreview_data_object)
data = _load_data(asreview_data_object)

# since processing the data can take a long time, for now the data is cut
# down to decrease test duration. This will be removed in future versions
data = data.iloc[:30, :]
# data = data.iloc[:30, :]

# load scibert transformer
print("Loading scibert transformer...")
@@ -90,7 +78,7 @@ def SemanticClustering(asreview_data_object):

# calculate optimal number of clusters
print("Calculating optimal number of clusters...")
n_clusters = calc_optimal_n_clusters(tsne)
n_clusters = _calc_optimal_n_clusters(tsne)
print("Optimal number of clusters: ", n_clusters)

# run k-means. n_init is set to 10, this indicated the amount of restarts
@@ -100,31 +88,45 @@

# visualize clusters
print("Visualizing clusters...")
tsne_data = [tsne[:, 0], tsne[:, 1]]
visualize_clusters(tsne_data, labels)
_visualize_clusters(tsne, labels)

# create file for use in interactive dashboard
_create_file(data, tsne, labels)

# Create functional dataframe and store to file for use in interactive
def _create_file(data, coords, labels):
data['x'] = coords[:, 0]
data['y'] = coords[:, 1]
data['cluster_id'] = labels

if not os.path.exists("data"):
os.makedirs("data")

kmeans_df_path = os.path.join("data", "kmeans_df.csv")
data.to_csv(kmeans_df_path, index=None)


# Calculate the optimal amount of clusters. It checks the inertia for 1 to 25
# clusters, and picks the optimal inertia based on an elbow graph and some cool
# trigonometry.
def calc_optimal_n_clusters(features):
def _calc_optimal_n_clusters(features):

Sum_of_squared_distances = []
sum_of_squared_distances = []

K = range(1, 25)
for k in K:
km = KMeans(n_clusters=k)
km = km.fit(features)
Sum_of_squared_distances.append(km.inertia_)
sum_of_squared_distances.append(km.inertia_)

max = 0
clusters = 1

for i in K:
p1 = np.asarray((Sum_of_squared_distances[0], 1))
p1 = np.asarray((sum_of_squared_distances[0], 1))
p2 = np.asarray(
(Sum_of_squared_distances[-1], (len(Sum_of_squared_distances)+1)))
p3 = np.asarray((Sum_of_squared_distances[i-1], i))
(sum_of_squared_distances[-1], (len(sum_of_squared_distances)+1)))
p3 = np.asarray((sum_of_squared_distances[i-1], i))

m = np.cross(p2-p1, p3-p1)/norm(p2-p1)

@@ -134,15 +136,14 @@ def calc_optimal_n_clusters(features):

return clusters


def visualize_clusters(data, labels):
def _visualize_clusters(tsne, labels):
fig, ax = plt.subplots()
ax.set_title("semantic clustering")
ax.set_xlabel("t-SNE Component 1")
ax.set_ylabel("t-SNE Component 2")

x = data[0]
y = data[1]
x = tsne[:, 0]
y = tsne[:, 1]

# Do actual plotting and save image
ax.scatter(x, y, c=labels, cmap="Set3")
@@ -153,7 +154,7 @@ def visualize_clusters(data, labels):
fig.savefig(img_path)


def load_data(asreview_data_object):
def _load_data(asreview_data_object):

# extract title and abstract, drop empty abstracts and reset index
data = asreview_data_object.df[['title', 'abstract']].copy()
@@ -162,8 +163,3 @@ def load_data(asreview_data_object):
data = data.reset_index(drop=True)

return data


if __name__ == "__main__":
filepath = "https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/datasets/van_de_Schoot_2017/output/van_de_Schoot_2017.csv"
SemanticClustering(ASReviewData.from_file(filepath))
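For reference, the renamed _calc_optimal_n_clusters picks the elbow of the k-means inertia curve geometrically: each candidate k is scored by the perpendicular distance from the point (inertia_k, k) to the straight line joining the first and last points of the curve, and the k with the largest distance wins. A self-contained sketch of that heuristic, using make_blobs data as a stand-in for the real t-SNE coordinates:

import numpy as np
from numpy.linalg import norm
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Placeholder features; in the plugin these are the 2-D t-SNE coordinates.
features, _ = make_blobs(n_samples=300, centers=4, random_state=0)

# Inertia (sum of squared distances) for k = 1..24, as in the plugin.
K = range(1, 25)
inertias = [KMeans(n_clusters=k, n_init=10).fit(features).inertia_ for k in K]

# Elbow: the point on the curve farthest from the line between its endpoints.
p1 = np.asarray((inertias[0], 1))
p2 = np.asarray((inertias[-1], len(inertias)))
best_k, best_dist = 1, 0.0
for i in K:
    p3 = np.asarray((inertias[i - 1], i))
    dist = abs(np.cross(p2 - p1, p3 - p1)) / norm(p2 - p1)
    if dist > best_dist:
        best_k, best_dist = i, dist

print("optimal number of clusters:", best_k)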
