Skip to content

Commit

Permalink
Make subcommand for semantic clusters algorithm (#24)
Browse files Browse the repository at this point in the history
Co-authored-by: Jonathan de Bruin <[email protected]>
  • Loading branch information
jteijema and J535D165 authored Nov 16, 2021
1 parent 1d2b111 commit 980d55f
Show file tree
Hide file tree
Showing 8 changed files with 195 additions and 49 deletions.
40 changes: 40 additions & 0 deletions .github/workflows/ci-workflow.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# CI for the semantic clustering extension: lint with flake8 and verify that
# the package installs next to the latest ASReview core.
name: test-suite
on: [push, pull_request]
jobs:
  lint-python:
    name: lint-python
    runs-on: ubuntu-latest
    steps:
    # Pin to the same tagged release as the other job instead of the moving
    # `master` ref.
    - uses: actions/checkout@v2
    - uses: actions/setup-python@v1
      with:
        python-version: '3.8'
        architecture: 'x64'
    - name: Install flake8
      run: |
        pip install flake8
    - name: Lint python with flake8
      run: |
        flake8 . --max-complexity=10 --statistics
  test-master:
    name: test-asreview-latest
    runs-on: ubuntu-latest
    steps:
    # Check out this extension and ASReview core side by side.
    - uses: actions/checkout@v2
      with:
        path: asr-semantic-clustering
    - uses: actions/checkout@v2
      with:
        repository: asreview/asreview
        path: asr-core
    - uses: actions/setup-python@v1
      with:
        python-version: '3.8'
        architecture: 'x64'
    # NOTE(review): pytest is installed here but no step in this workflow
    # invokes it; this job currently only proves the packages install.
    - name: Install packages
      run: |
        pip install pytest
        # Quote the requirement: unquoted, the shell treats `>=41.0.0` as an
        # output redirection to a file named `=41.0.0`.
        pip install --upgrade "setuptools>=41.0.0"
        pip install ./asr-core[all]
        pip install ./asr-semantic-clustering
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ __pycache__/
# C extensions
*.so

# vscode
.vscode/

# Distribution / packaging
.Python
build/
Expand Down
2 changes: 0 additions & 2 deletions asreviewcontrib/semantic_clustering/__init__.py

This file was deleted.

2 changes: 1 addition & 1 deletion asreviewcontrib/semantic_clustering/interactive.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,4 +119,4 @@ def update_title(hoverData):
return title

# Run the application
app.run_server(debug=True)
app.run_server(debug=False)
106 changes: 67 additions & 39 deletions asreviewcontrib/semantic_clustering/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,51 +2,79 @@
# -*- coding: utf-8 -*-
# Path: asreviewcontrib\semantic_clustering\main.py

# Environment imports
import argparse
import sys
import getopt

from asreview.data import ASReviewData
import webbrowser

from asreview.data import ASReviewData
from asreview.entry_points import BaseEntryPoint
from asreviewcontrib.semantic_clustering.interactive import run_app
from asreviewcontrib.semantic_clustering.semantic_clustering import SemanticClustering


def main(argv):
filepath = ""

try:
opts, args = getopt.getopt(
argv, "htf:a", ["help", "testfile", "filepath=", "app"])
except getopt.GetoptError:
print('Please use the following format:')
print('test.py -f <filepath>')
print('test.py --testfile')
print('test.py --app')
sys.exit(2)
for opt, arg in opts:
if opt in ("-h", "--help"):
print('test.py -f <filepath> or --testfile')
sys.exit()
elif opt in ("-f", "--filepath"):
filepath = arg
elif opt in ("-t", "--testfile"):
filepath = "https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/datasets/van_de_Schoot_2017/output/van_de_Schoot_2017.csv"
elif opt in ("-a", "--app"):
from asreviewcontrib.semantic_clustering.semantic_clustering import SemanticClustering # noqa: E501


class SemClusEntryPoint(BaseEntryPoint):
    """ASReview entry point for the ``semantic_clustering`` subcommand.

    Registered under the ``asreview.entry_points`` group, so it is invoked as
    ``asreview semantic_clustering <options>``.
    """

    description = "Semantic clustering tools for ASReview."
    extension_name = "semantic_clustering"

    def __init__(self):
        self.version = "0.1"

    def execute(self, argv):
        """Run the action selected on the command line.

        Args:
            argv: Command-line arguments following the subcommand name.

        The options are mutually exclusive:
        ``-f/--filepath`` clusters the given dataset, ``-t/--testfile``
        clusters a bundled example dataset, and ``-a/--app`` opens the
        interactive dashboard in the browser and serves it.

        Returns normally on success so the process exits with status 0;
        the previous trailing ``sys.exit(1)`` reported a failure to the
        shell even when the command succeeded.
        """
        args = _parse_arguments(
            version=f"{self.extension_name}: {self.version}", argv=argv)

        if args.filepath:
            data = ASReviewData.from_file(args.filepath)
            SemanticClustering(data)

        elif args.testfile:
            data = ASReviewData.from_file("https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/datasets/van_de_Schoot_2017/output/van_de_Schoot_2017.csv")  # noqa: E501
            SemanticClustering(data)

        elif args.app:
            url = "http://127.0.0.1:8050/"

            # Open the browser first: run_app() is called last because it
            # starts the dashboard server.
            # NOTE(review): presumably run_app() blocks until the server
            # stops - confirm in interactive.py.
            webbrowser.open(url, new=2, autoraise=True)

            run_app()


# check if arguments are empty
if filepath == "":
print('Please use the following format:')
print('test.py -f <filepath>')
print('test.py --testfile')
print('test.py --app')
sys.exit(2)
# argument parser
def _parse_arguments(version="Unknown", argv=None):
parser = argparse.ArgumentParser(prog='asreview semantic_clustering')
group = parser.add_mutually_exclusive_group()

SemanticClustering(ASReviewData.from_file(filepath))
group.add_argument(
"-f",
"--filepath",
help="path to the file to be processed",
type=str,
default="",
)
group.add_argument(
"-t",
"--testfile",
help="use a test file instead of providing a file",
action="store_true",
)
group.add_argument(
"-a",
"--app",
help="run the app",
action="store_true",
)
group.add_argument(
"-v",
"--version",
action="version",
version="%(prog)s " + version,
)

# Exit if no arguments are given
if len(argv) == 0:
parser.print_help(sys.stderr)
sys.exit(1)

if __name__ == "__main__":
main(sys.argv[1:])
return parser.parse_args(argv)
16 changes: 9 additions & 7 deletions asreviewcontrib/semantic_clustering/semantic_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@
import os
from tqdm import tqdm
import numpy as np

from sklearn.cluster import KMeans
from numpy.linalg import norm

from transformers import AutoTokenizer, AutoModel
from transformers import logging
import matplotlib.pyplot as plt
import seaborn as sns

from dim_reduct import run_pca
from dim_reduct import t_sne
from clustering import run_KMeans
from asreviewcontrib.semantic_clustering.dim_reduct import run_pca
from asreviewcontrib.semantic_clustering.dim_reduct import t_sne
from asreviewcontrib.semantic_clustering.clustering import run_KMeans

# Setting environment
logging.set_verbosity_error()
Expand Down Expand Up @@ -92,6 +92,7 @@ def SemanticClustering(asreview_data_object):
# create file for use in interactive dashboard
_create_file(data, tsne, labels)


# Create functional dataframe and store to file for use in interactive
def _create_file(data, coords, labels):
data['x'] = coords[:, 0]
Expand Down Expand Up @@ -124,17 +125,18 @@ def _calc_optimal_n_clusters(features):
for i in K:
p1 = np.asarray((sum_of_squared_distances[0], 1))
p2 = np.asarray(
(sum_of_squared_distances[-1], (len(sum_of_squared_distances)+1)))
p3 = np.asarray((sum_of_squared_distances[i-1], i))
(sum_of_squared_distances[-1], (len(sum_of_squared_distances) + 1)))
p3 = np.asarray((sum_of_squared_distances[i - 1], i))

m = np.cross(p2-p1, p3-p1)/norm(p2-p1)
m = np.cross(p2 - p1, p3 - p1) / norm(p2 - p1)

if m > max:
max = m
clusters = i

return clusters


def _visualize_clusters(tsne, labels):
fig, ax = plt.subplots()
ax.set_title("semantic clustering")
Expand Down
7 changes: 7 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[flake8]
max-line-length = 80
# E402: module level import not at top of file.
# Keep this comment on its own line: an inline "# ..." inside the option
# value is parsed as part of the value and rejected by flake8 >= 6.
ignore = E402
exclude =
    clustering.py,
    dim_reduct.py
68 changes: 68 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# based on https://github.com/pypa/sampleproject
# MIT License

# Always prefer setuptools over distutils
from setuptools import setup, find_namespace_packages
from os import path
from io import open

here = path.abspath(path.dirname(__file__))

# Get the long description from the README file
with open(path.join(here, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

setup(
    name='asreview-semantic-clustering',
    description='Semantic clustering tool for the ASReview project',
    version='0.1',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/asreview/semantic-clusters',
    author='Utrecht University',
    author_email='[email protected]',
    classifiers=[
        # How mature is this project? Common values are
        #   3 - Alpha
        #   4 - Beta
        #   5 - Production/Stable
        'Development Status :: 3 - Alpha',

        # Pick your license as you wish
        'License :: OSI Approved :: Apache Software License',

        # The package uses f-strings, so Python 3.5 cannot run it; 3.8 is
        # the version exercised by the CI workflow.
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
    ],
    keywords='asreview extension semantic clustering clusters visualization',
    # asreviewcontrib is a namespace package shared with other extensions.
    packages=find_namespace_packages(include=['asreviewcontrib.*']),
    python_requires='>=3.6',
    install_requires=[
        "numpy",
        "matplotlib",
        "asreview",
        "dash",
        "plotly",
        # "sklearn" is only a deprecated alias on PyPI; the real
        # distribution name is "scikit-learn".
        "scikit-learn",
        "transformers",
        "seaborn",
        "torch",
        # Imported by semantic_clustering.py for progress bars.
        "tqdm",
    ],

    extras_require={
    },

    # Registers the `asreview semantic_clustering` subcommand.
    entry_points={
        "asreview.entry_points": [
            "semantic_clustering = asreviewcontrib.semantic_clustering.main:SemClusEntryPoint",  # noqa: E501
        ]
    },

    project_urls={
        'Bug Reports':
            "https://github.com/asreview/semantic-clusters/issues",
        'Source':
            "https://github.com/asreview/semantic-clusters",
    },
)

0 comments on commit 980d55f

Please sign in to comment.