From 980d55f5f81cf365a78c47280e55a4d9c9ba0aed Mon Sep 17 00:00:00 2001
From: Jelle
Date: Tue, 16 Nov 2021 09:29:06 +0100
Subject: [PATCH] Make subcommand for semantic clusters algorithm (#24)

Co-authored-by: Jonathan de Bruin
---
Review notes (ignored by `git am`, not part of the commit message):
 * CI: the setuptools requirement is quoted so the shell does not treat
   ">=41.0.0" as a redirection; checkout is pinned to v2 in both jobs.
 * main.py: the subcommand exits with status 0 after a successful run and
   tolerates argv=None; one-line docstrings added.
 * setup.cfg: no inline comment inside the flake8 "ignore" value.
 * setup.py: depends on scikit-learn (not the "sklearn" placeholder) and on
   tqdm instead of listing numpy twice; classifiers match f-string usage.
 * The "index" blob ids below predate these amendments.

 .github/workflows/ci-workflow.yml             |  40 +++++++
 .gitignore                                    |   3 +
 .../semantic_clustering/__init__.py           |   2 -
 .../semantic_clustering/interactive.py        |   2 +-
 asreviewcontrib/semantic_clustering/main.py   | 108 +++++++++++-------
 .../semantic_clustering.py                    |  16 +--
 setup.cfg                                     |   7 ++
 setup.py                                      |  68 +++++++++++
 8 files changed, 197 insertions(+), 49 deletions(-)
 create mode 100644 .github/workflows/ci-workflow.yml
 delete mode 100644 asreviewcontrib/semantic_clustering/__init__.py
 create mode 100644 setup.cfg
 create mode 100644 setup.py

diff --git a/.github/workflows/ci-workflow.yml b/.github/workflows/ci-workflow.yml
new file mode 100644
index 0000000..6bae321
--- /dev/null
+++ b/.github/workflows/ci-workflow.yml
@@ -0,0 +1,40 @@
+name: test-suite
+on: [push, pull_request]
+jobs:
+  lint-python:
+    name: lint-python
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - uses: actions/setup-python@v1
+      with:
+        python-version: '3.8'
+        architecture: 'x64'
+    - name: Install flake8
+      run: |
+        pip install flake8
+    - name: Lint python with flake8
+      run: |
+        flake8 . --max-complexity=10 --statistics
+
+  test-master:
+    name: test-asreview-latest
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        path: asr-semantic-clustering
+    - uses: actions/checkout@v2
+      with:
+        repository: asreview/asreview
+        path: asr-core
+    - uses: actions/setup-python@v1
+      with:
+        python-version: '3.8'
+        architecture: 'x64'
+    - name: Install packages
+      run: |
+        pip install pytest
+        pip install --upgrade "setuptools>=41.0.0"
+        pip install ./asr-core[all]
+        pip install ./asr-semantic-clustering
diff --git a/.gitignore b/.gitignore
index 518bdbf..523f994 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,6 +28,9 @@ __pycache__/
 # C extensions
 *.so
 
+# vscode
+.vscode/
+
 # Distribution / packaging
 .Python
 build/
diff --git a/asreviewcontrib/semantic_clustering/__init__.py b/asreviewcontrib/semantic_clustering/__init__.py
deleted file mode 100644
index e0d6e9d..0000000
--- a/asreviewcontrib/semantic_clustering/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from asreviewcontrib.semantic_clustering.semantic_clustering import SemanticClustering
-from asreviewcontrib.semantic_clustering.interactive import run_app
diff --git a/asreviewcontrib/semantic_clustering/interactive.py b/asreviewcontrib/semantic_clustering/interactive.py
index 686d458..c805837 100644
--- a/asreviewcontrib/semantic_clustering/interactive.py
+++ b/asreviewcontrib/semantic_clustering/interactive.py
@@ -119,4 +119,4 @@ def update_title(hoverData):
         return title
 
     # Run the application
-    app.run_server(debug=True)
+    app.run_server(debug=False)
diff --git a/asreviewcontrib/semantic_clustering/main.py b/asreviewcontrib/semantic_clustering/main.py
index d4f8262..febe210 100644
--- a/asreviewcontrib/semantic_clustering/main.py
+++ b/asreviewcontrib/semantic_clustering/main.py
@@ -2,51 +2,81 @@
 # -*- coding: utf-8 -*-
 # Path: asreviewcontrib\semantic_clustering\main.py
 
-# Environment imports
+import argparse
 import sys
-import getopt
-from asreview.data import ASReviewData
+import webbrowser
 
+from asreview.data import ASReviewData
+from asreview.entry_points import BaseEntryPoint
 
 from asreviewcontrib.semantic_clustering.interactive import run_app
-from asreviewcontrib.semantic_clustering.semantic_clustering import SemanticClustering
-
-
-def main(argv):
-    filepath = ""
-
-    try:
-        opts, args = getopt.getopt(
-            argv, "htf:a", ["help", "testfile", "filepath=", "app"])
-    except getopt.GetoptError:
-        print('Please use the following format:')
-        print('test.py -f ')
-        print('test.py --testfile')
-        print('test.py --app')
-        sys.exit(2)
-    for opt, arg in opts:
-        if opt in ("-h", "--help"):
-            print('test.py -f or --testfile')
-            sys.exit()
-        elif opt in ("-f", "--filepath"):
-            filepath = arg
-        elif opt in ("-t", "--testfile"):
-            filepath = "https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/datasets/van_de_Schoot_2017/output/van_de_Schoot_2017.csv"
-        elif opt in ("-a", "--app"):
+from asreviewcontrib.semantic_clustering.semantic_clustering import SemanticClustering  # noqa: E501
+
+
+class SemClusEntryPoint(BaseEntryPoint):
+    """ASReview entry point for the ``semantic_clustering`` subcommand."""
+    description = "Semantic clustering tools for ASReview."
+    extension_name = "semantic_clustering"
+
+    def __init__(self):
+        self.version = "0.1"
+
+    def execute(self, argv):
+        """Run the action selected in argv, then exit with status 0."""
+        args = _parse_arguments(
+            version=f"{self.extension_name}: {self.version}", argv=argv)
+
+        if args.filepath:
+            data = ASReviewData.from_file(args.filepath)
+            SemanticClustering(data)
+
+        elif args.testfile:
+            data = ASReviewData.from_file("https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/datasets/van_de_Schoot_2017/output/van_de_Schoot_2017.csv")  # noqa: E501
+            SemanticClustering(data)
+
+        elif args.app:
+            url = "http://127.0.0.1:8050/"
+
+            webbrowser.open(url, new=2, autoraise=True)
+
             run_app()
-            sys.exit(1)
-    print('Running from file: ', filepath)
+        sys.exit(0)
+
 
-    # check if arguments are empty
-    if filepath == "":
-        print('Please use the following format:')
-        print('test.py -f ')
-        print('test.py --testfile')
-        print('test.py --app')
-        sys.exit(2)
+def _parse_arguments(version="Unknown", argv=None):
+    """Parse argv; print the help text and exit(1) when it is empty."""
+    parser = argparse.ArgumentParser(prog='asreview semantic_clustering')
+    group = parser.add_mutually_exclusive_group()
 
-    SemanticClustering(ASReviewData.from_file(filepath))
+    group.add_argument(
+        "-f",
+        "--filepath",
+        help="path to the file to be processed",
+        type=str,
+        default="",
+    )
+    group.add_argument(
+        "-t",
+        "--testfile",
+        help="use a test file instead of providing a file",
+        action="store_true",
+    )
+    group.add_argument(
+        "-a",
+        "--app",
+        help="run the app",
+        action="store_true",
+    )
+    group.add_argument(
+        "-v",
+        "--version",
+        action="version",
+        version="%(prog)s " + version,
+    )
 
+    # Exit if no arguments are given
+    if not argv:
+        parser.print_help(sys.stderr)
+        sys.exit(1)
 
-if __name__ == "__main__":
-    main(sys.argv[1:])
+    return parser.parse_args(argv)
diff --git a/asreviewcontrib/semantic_clustering/semantic_clustering.py b/asreviewcontrib/semantic_clustering/semantic_clustering.py
index 27c496b..b8d4bfa 100644
--- a/asreviewcontrib/semantic_clustering/semantic_clustering.py
+++ b/asreviewcontrib/semantic_clustering/semantic_clustering.py
@@ -5,17 +5,17 @@
 import os
 from tqdm import tqdm
 import numpy as np
+
 from sklearn.cluster import KMeans
 from numpy.linalg import norm
-
 from transformers import AutoTokenizer, AutoModel
 from transformers import logging
 import matplotlib.pyplot as plt
 import seaborn as sns
 
-from dim_reduct import run_pca
-from dim_reduct import t_sne
-from clustering import run_KMeans
+from asreviewcontrib.semantic_clustering.dim_reduct import run_pca
+from asreviewcontrib.semantic_clustering.dim_reduct import t_sne
+from asreviewcontrib.semantic_clustering.clustering import run_KMeans
 
 # Setting environment
 logging.set_verbosity_error()
@@ -92,6 +92,7 @@ def SemanticClustering(asreview_data_object):
     # create file for use in interactive dashboard
     _create_file(data, tsne, labels)
 
+
 # Create functional dataframe and store to file for use in interactive
 def _create_file(data, coords, labels):
     data['x'] = coords[:, 0]
@@ -124,10 +125,10 @@ def _calc_optimal_n_clusters(features):
     for i in K:
         p1 = np.asarray((sum_of_squared_distances[0], 1))
         p2 = np.asarray(
-            (sum_of_squared_distances[-1], (len(sum_of_squared_distances)+1)))
-        p3 = np.asarray((sum_of_squared_distances[i-1], i))
+            (sum_of_squared_distances[-1], (len(sum_of_squared_distances) + 1)))
+        p3 = np.asarray((sum_of_squared_distances[i - 1], i))
 
-        m = np.cross(p2-p1, p3-p1)/norm(p2-p1)
+        m = np.cross(p2 - p1, p3 - p1) / norm(p2 - p1)
 
         if m > max:
             max = m
@@ -135,6 +136,7 @@ def _calc_optimal_n_clusters(features):
 
     return clusters
 
+
 def _visualize_clusters(tsne, labels):
     fig, ax = plt.subplots()
     ax.set_title("semantic clustering")
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..791bc8c
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,7 @@
+[flake8]
+max-line-length = 80
+# E402: module level import not at top of file
+ignore = E402
+exclude =
+    clustering.py,
+    dim_reduct.py
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..eb4c837
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,68 @@
+# based on https://github.com/pypa/sampleproject
+# MIT License
+
+# Always prefer setuptools over distutils
+from setuptools import setup, find_namespace_packages
+from os import path
+from io import open
+
+here = path.abspath(path.dirname(__file__))
+
+# Get the long description from the README file
+with open(path.join(here, 'README.md'), encoding='utf-8') as f:
+    long_description = f.read()
+
+setup(
+    name='asreview-semantic-clustering',
+    description='Semantic clustering tool for the ASReview project',
+    version='0.1',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    url='https://github.com/asreview/semantic-clusters',
+    author='Utrecht University',
+    author_email='asreview@uu.nl',
+    classifiers=[
+        # How mature is this project? Common values are
+        #   3 - Alpha
+        #   4 - Beta
+        #   5 - Production/Stable
+        'Development Status :: 3 - Alpha',
+
+        # Pick your license as you wish
+        'License :: OSI Approved :: Apache Software License',
+
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
+    ],
+    keywords='asreview extension semantic clustering clusters visualization',
+    packages=find_namespace_packages(include=['asreviewcontrib.*']),
+    install_requires=[
+        "numpy",
+        "matplotlib",
+        "asreview",
+        "dash",
+        "plotly",
+        "scikit-learn",
+        "transformers",
+        "tqdm",
+        "seaborn",
+        "torch",
+    ],
+
+    extras_require={
+    },
+
+    entry_points={
+        "asreview.entry_points": [
+            "semantic_clustering = asreviewcontrib.semantic_clustering.main:SemClusEntryPoint",  # noqa: E501
+        ]
+    },
+
+    project_urls={
+        'Bug Reports':
+            "https://github.com/asreview/semantic-clusters/issues",
+        'Source':
+            "https://github.com/asreview/semantic-clusters",
+    },
+)