Add automatic cluster calculations (#18)
jteijema authored Nov 11, 2021
1 parent 5cdc705 commit 5b26bbd
Showing 1 changed file with 60 additions and 10 deletions.
asreviewcontrib/semantic_clustering/semantic_clustering.py: 70 changes (60 additions, 10 deletions)
@@ -12,18 +12,30 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# imports
# Environment imports
import os
from tqdm import tqdm
from dim_reduct import run_pca
from dim_reduct import t_sne
from clustering import run_KMeans
from asreview.data import ASReviewData

# Calculation imports
from sklearn.cluster import KMeans
from numpy.linalg import norm
import numpy as np

# Transformer imports
from transformers import AutoTokenizer, AutoModel
from transformers import logging

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

# Local imports
from dim_reduct import run_pca
from dim_reduct import t_sne
from clustering import run_KMeans
from asreview.data import ASReviewData

# Setting environment
logging.set_verbosity_error()
sns.set()
tqdm.pandas()
@@ -35,8 +47,9 @@ def SemanticClustering(asreview_data_object):
print("Loading data...")
data = load_data(asreview_data_object)

# cut data for testing
data = data.iloc[:300, :]
# since processing the data can take a long time, for now the data is cut
# down to decrease test duration. This will be removed in future versions
data = data.iloc[:30, :]

    # load scibert transformer
    print("Loading scibert transformer...")
@@ -55,7 +68,7 @@
        add_special_tokens=False,
        truncation=True,
        max_length=512,
        padding='max_length',
        # padding='max_length',
        return_tensors='pt'))
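The lines above feed each abstract through the SciBERT tokenizer. As a self-contained sketch of that call, assuming the checkpoint is the Hugging Face model allenai/scibert_scivocab_uncased (the exact model id is not shown in this hunk), truncation at max_length=512 keeps every abstract within BERT's input limit:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
abstract = "Active learning can reduce the screening burden in systematic reviews."
tokens = tokenizer(abstract,
                   add_special_tokens=False,
                   truncation=True,
                   max_length=512,
                   return_tensors='pt')
print(tokens['input_ids'].shape)  # torch.Size([1, n_tokens]) with n_tokens <= 512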

    # generate embeddings and format correctly
@@ -75,16 +88,53 @@
print("Running t-SNE...")
tsne = t_sne(pca, n_iter=1000)
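run_pca and t_sne are helpers from the local dim_reduct module and their bodies are not part of this diff. Assuming they wrap scikit-learn's PCA and TSNE, the reduction step amounts to roughly the following; the 0.98 variance target and the array sizes are illustrative only:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

embeddings = np.random.rand(100, 768)                   # stand-in for the SciBERT embeddings
pca = PCA(n_components=0.98).fit_transform(embeddings)  # keep ~98% of the variance
tsne = TSNE(n_components=2).fit_transform(pca)          # defaults to 1000 iterations, as in the call above
print(tsne.shape)                                       # (100, 2)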

    # run k-means
    # calculate optimal number of clusters
    print("Calculating optimal number of clusters...")
    n_clusters = calc_optimal_n_clusters(tsne)
    print("Optimal number of clusters: ", n_clusters)

    # run k-means. n_init is set to 10; this indicates the number of random
    # restarts for the KMeans algorithm. 10 is the sklearn default.
    print("Running k-means...")
    labels = run_KMeans(tsne, 10, 10)
    labels = run_KMeans(tsne, n_clusters, 10)
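run_KMeans comes from the local clustering module and is likewise not shown here. Under the assumption that it is a thin wrapper around scikit-learn's KMeans, the call above corresponds to something like:

from sklearn.cluster import KMeans

def run_kmeans_sketch(features, n_clusters, n_init):
    # n_init restarts KMeans with different centroid seeds and keeps the best
    # run by inertia; 10 matches the scikit-learn default noted above.
    return KMeans(n_clusters=n_clusters, n_init=n_init).fit_predict(features)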

    # visualize clusters
    print("Visualizing clusters...")
    tsne_data = [tsne[:, 0], tsne[:, 1]]
    visualize_clusters(tsne_data, labels)


# Calculate the optimal number of clusters. It computes the KMeans inertia for
# 1 to 24 clusters and picks the elbow of the resulting curve: the k whose
# (inertia, k) point lies furthest from the straight line between the curve's
# first and last points.
def calc_optimal_n_clusters(features):

    Sum_of_squared_distances = []

    K = range(1, 25)
    for k in K:
        km = KMeans(n_clusters=k)
        km = km.fit(features)
        Sum_of_squared_distances.append(km.inertia_)

    max = 0
    clusters = 1

    for i in K:
        p1 = np.asarray((Sum_of_squared_distances[0], 1))
        p2 = np.asarray(
            (Sum_of_squared_distances[-1], (len(Sum_of_squared_distances)+1)))
        p3 = np.asarray((Sum_of_squared_distances[i-1], i))

        m = np.cross(p2-p1, p3-p1)/norm(p2-p1)

        if m > max:
            max = m
            clusters = i

    return clusters
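A small worked example of the elbow heuristic implemented above: each candidate k is mapped to the point (inertia, k), and the selected k is the one whose point lies furthest from the straight line between the first and last points of the inertia curve. The inertia values below are made up for illustration:

import numpy as np
from numpy.linalg import norm

inertias = [1000, 400, 150, 120, 110, 105, 102, 100]   # synthetic elbow at k = 3
p1 = np.asarray((inertias[0], 1))
p2 = np.asarray((inertias[-1], len(inertias)))
best_k, best_dist = 1, 0
for k in range(1, len(inertias) + 1):
    p3 = np.asarray((inertias[k - 1], k))
    dist = abs(np.cross(p2 - p1, p3 - p1)) / norm(p2 - p1)
    if dist > best_dist:
        best_k, best_dist = k, dist
print(best_k)  # 3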


def visualize_clusters(data, labels):
    fig, ax = plt.subplots()
    ax.set_title("semantic clustering")
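The remainder of visualize_clusters is collapsed in this diff view. Purely as a stand-in, not the commit's actual code, a scatter plot of the two t-SNE dimensions coloured by cluster label could look like:

import matplotlib.pyplot as plt
import seaborn as sns

def visualize_clusters_sketch(data, labels):
    fig, ax = plt.subplots()
    ax.set_title("semantic clustering")
    sns.scatterplot(x=data[0], y=data[1], hue=labels, ax=ax)
    plt.show()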
