Add automatic cluster calculations (#18)
jteijema authored Nov 11, 2021
1 parent 5cdc705 commit 5b26bbd
Showing 1 changed file with 60 additions and 10 deletions.
asreviewcontrib/semantic_clustering/semantic_clustering.py: 70 changes (60 additions, 10 deletions)
@@ -12,18 +12,30 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# imports
# Environment imports
import os
from tqdm import tqdm
from dim_reduct import run_pca
from dim_reduct import t_sne
from clustering import run_KMeans
from asreview.data import ASReviewData

# Calculation imports
from sklearn.cluster import KMeans
from numpy.linalg import norm
import numpy as np

# Transformer imports
from transformers import AutoTokenizer, AutoModel
from transformers import logging

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

# Local imports
from dim_reduct import run_pca
from dim_reduct import t_sne
from clustering import run_KMeans
from asreview.data import ASReviewData

# Setting environment
logging.set_verbosity_error()
sns.set()
tqdm.pandas()
@@ -35,8 +47,9 @@ def SemanticClustering(asreview_data_object):
print("Loading data...")
data = load_data(asreview_data_object)

# cut data for testing
data = data.iloc[:300, :]
# since processing the data can take a long time, for now the data is cut
# down to decrease test duration. This will be removed in future versions
data = data.iloc[:30, :]

    # load scibert transformer
    print("Loading scibert transformer...")
@@ -55,7 +68,7 @@
        add_special_tokens=False,
        truncation=True,
        max_length=512,
        padding='max_length',
        # padding='max_length',
        return_tensors='pt'))
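The lines above feed each abstract through the SciBERT tokenizer. As a self-contained sketch of that call, assuming the checkpoint is the Hugging Face model allenai/scibert_scivocab_uncased (the exact model id is not shown in this hunk), truncation at max_length=512 keeps every abstract within BERT's input limit:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
abstract = "Active learning can reduce the screening burden in systematic reviews."
tokens = tokenizer(abstract,
                   add_special_tokens=False,
                   truncation=True,
                   max_length=512,
                   return_tensors='pt')
print(tokens['input_ids'].shape)  # torch.Size([1, n_tokens]) with n_tokens <= 512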

    # generate embeddings and format correctly
@@ -75,16 +88,53 @@
print("Running t-SNE...")
tsne = t_sne(pca, n_iter=1000)
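run_pca and t_sne are helpers from the local dim_reduct module and their bodies are not part of this diff. Assuming they wrap scikit-learn's PCA and TSNE, the reduction step amounts to roughly the following; the 0.98 variance target and the array sizes are illustrative only:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

embeddings = np.random.rand(100, 768)                   # stand-in for the SciBERT embeddings
pca = PCA(n_components=0.98).fit_transform(embeddings)  # keep ~98% of the variance
tsne = TSNE(n_components=2).fit_transform(pca)          # defaults to 1000 iterations, as in the call above
print(tsne.shape)                                       # (100, 2)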

    # run k-means
    # calculate optimal number of clusters
    print("Calculating optimal number of clusters...")
    n_clusters = calc_optimal_n_clusters(tsne)
    print("Optimal number of clusters: ", n_clusters)

    # run k-means. n_init is set to 10; this indicates the number of random
    # restarts for the KMeans algorithm. 10 is the sklearn default.
    print("Running k-means...")
    labels = run_KMeans(tsne, 10, 10)
    labels = run_KMeans(tsne, n_clusters, 10)
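run_KMeans comes from the local clustering module and is likewise not shown here. Under the assumption that it is a thin wrapper around scikit-learn's KMeans, the call above corresponds to something like:

from sklearn.cluster import KMeans

def run_kmeans_sketch(features, n_clusters, n_init):
    # n_init restarts KMeans with different centroid seeds and keeps the best
    # run by inertia; 10 matches the scikit-learn default noted above.
    return KMeans(n_clusters=n_clusters, n_init=n_init).fit_predict(features)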

    # visualize clusters
    print("Visualizing clusters...")
    tsne_data = [tsne[:, 0], tsne[:, 1]]
    visualize_clusters(tsne_data, labels)


# Calculate the optimal number of clusters. It computes the KMeans inertia for
# 1 to 24 clusters and picks the elbow of the resulting curve: the k whose
# (inertia, k) point lies furthest from the straight line between the curve's
# first and last points.
def calc_optimal_n_clusters(features):

    Sum_of_squared_distances = []

    K = range(1, 25)
    for k in K:
        km = KMeans(n_clusters=k)
        km = km.fit(features)
        Sum_of_squared_distances.append(km.inertia_)

    max = 0
    clusters = 1

    for i in K:
        p1 = np.asarray((Sum_of_squared_distances[0], 1))
        p2 = np.asarray(
            (Sum_of_squared_distances[-1], (len(Sum_of_squared_distances)+1)))
        p3 = np.asarray((Sum_of_squared_distances[i-1], i))

        m = np.cross(p2-p1, p3-p1)/norm(p2-p1)

        if m > max:
            max = m
            clusters = i

    return clusters
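A small worked example of the elbow heuristic implemented above: each candidate k is mapped to the point (inertia, k), and the selected k is the one whose point lies furthest from the straight line between the first and last points of the inertia curve. The inertia values below are made up for illustration:

import numpy as np
from numpy.linalg import norm

inertias = [1000, 400, 150, 120, 110, 105, 102, 100]   # synthetic elbow at k = 3
p1 = np.asarray((inertias[0], 1))
p2 = np.asarray((inertias[-1], len(inertias)))
best_k, best_dist = 1, 0
for k in range(1, len(inertias) + 1):
    p3 = np.asarray((inertias[k - 1], k))
    dist = abs(np.cross(p2 - p1, p3 - p1)) / norm(p2 - p1)
    if dist > best_dist:
        best_k, best_dist = k, dist
print(best_k)  # 3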


def visualize_clusters(data, labels):
    fig, ax = plt.subplots()
    ax.set_title("semantic clustering")
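The remainder of visualize_clusters is collapsed in this diff view. Purely as a stand-in, not the commit's actual code, a scatter plot of the two t-SNE dimensions coloured by cluster label could look like:

import matplotlib.pyplot as plt
import seaborn as sns

def visualize_clusters_sketch(data, labels):
    fig, ax = plt.subplots()
    ax.set_title("semantic clustering")
    sns.scatterplot(x=data[0], y=data[1], hue=labels, ax=ax)
    plt.show()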
