From 58a49b2ad6d43497cd4c789e7d40564eea3675cc Mon Sep 17 00:00:00 2001
From: Jonathan de Bruin
Date: Thu, 2 Nov 2023 07:53:41 +0100
Subject: [PATCH] Fix SYNERGY visible in Oracle mode setup (#1556)

---
 asreview/datasets.py                          | 10 +++++++-
 asreview/entry_points/simulate.py             | 23 ++-----------------
 asreview/webapp/api/projects.py               |  4 +++-
 .../webapp/tests/test_api/test_projects.py    |  2 +-
 docs/source/reference.rst                     |  2 +-
 docs/source/simulation_cli.rst                | 12 ++++------
 docs/source/simulation_overview.rst           |  2 +-
 setup.py                                      |  2 +-
 tests/test_data.py                            |  5 -----
 tests/test_datasets.py                        | 15 ---------------
 tests/test_models.sh                          | 10 ++++----
 tests/test_simulate.py                        |  7 -------
 12 files changed, 28 insertions(+), 66 deletions(-)

diff --git a/asreview/datasets.py b/asreview/datasets.py
index 25c25ac0b..514d618f2 100644
--- a/asreview/datasets.py
+++ b/asreview/datasets.py
@@ -15,6 +15,7 @@
 import json
 import socket
 import tempfile
+import warnings
 from abc import ABC
 from abc import abstractmethod
 from pathlib import Path
@@ -737,7 +738,6 @@ def __init__(self):
 
 class BenchmarkDataGroup(BaseDataGroup):
     """Datasets available in the benchmark platform.
-
     Deprecated
     """
 
@@ -745,6 +745,14 @@ class BenchmarkDataGroup(BaseDataGroup):
     description = "DEPRECATED: Datasets available in the online benchmark platform"
 
     def __init__(self):
+
+        warnings.warn(
+            "The use of 'benchmark' datasets is deprecated, "
+            "use SYNERGY dataset instead. For more information, see "
+            "https://github.com/asreview/synergy-dataset.",
+            category=UserWarning
+        )
+
         meta_file = "https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/index_v1.json"  # noqa
         datasets = _download_from_metadata(meta_file)
 
diff --git a/asreview/entry_points/simulate.py b/asreview/entry_points/simulate.py
index c4036acd6..7817b9de6 100644
--- a/asreview/entry_points/simulate.py
+++ b/asreview/entry_points/simulate.py
@@ -42,26 +42,6 @@
 from asreview.utils import get_random_state
 
 
-def _get_dataset_path_from_args(args_dataset):
-    """Remove 'benchmark:' from the dataset name and add .csv suffix.
-
-    Parameters
-    ----------
-    args_dataset : str
-        Name of the dataset.
-
-    Returns
-    -------
-    str
-        Dataset name without 'benchmark:' if it started with that,
-        and with .csv suffix.
-    """
-    if args_dataset.startswith("benchmark:"):
-        args_dataset = args_dataset[10:]
-
-    return Path(args_dataset).with_suffix(".csv").name
-
-
 def _set_log_verbosity(verbose):
     if verbose == 0:
         logging.getLogger().setLevel(logging.WARNING)
@@ -135,7 +115,8 @@ def execute(self, argv):  # noqa
             )
 
             # Add the dataset to the project file.
-            dataset_path = _get_dataset_path_from_args(args.dataset)
+            dataset_path = Path(
+                args.dataset.replace(":", "-")).with_suffix(".csv").name
             as_data.to_file(Path(fp_tmp_simulation, "data", dataset_path))
 
             # Update the project.json.
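The helper removed from simulate.py above only stripped a "benchmark:" prefix, so a "synergy:" ID would have kept its colon in the stored file name. The new inline expression keeps the group prefix and only makes the name filesystem-safe. A minimal sketch of the resulting mapping (the dataset_filename helper below is illustrative, not part of the code base):

    from pathlib import Path

    def dataset_filename(dataset):
        # Same expression as the patched entry point: swap the group separator
        # for a dash and force a .csv suffix for the copy stored in the project.
        return Path(dataset.replace(":", "-")).with_suffix(".csv").name

    print(dataset_filename("synergy:van_de_schoot_2018"))  # synergy-van_de_schoot_2018.csv
    print(dataset_filename("benchmark:test"))              # benchmark-test.csv (old helper gave test.csv)
    print(dataset_filename("my_data.ris"))                 # my_data.csv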
diff --git a/asreview/webapp/api/projects.py b/asreview/webapp/api/projects.py
index e8ecea811..4cb901017 100644
--- a/asreview/webapp/api/projects.py
+++ b/asreview/webapp/api/projects.py
@@ -305,7 +305,9 @@ def api_demo_data_project():  # noqa: F401
     if subset == "plugin":
         try:
             result_datasets = manager.list(
-                exclude=["builtin", "benchmark", "benchmark-nature"]
+                exclude=[
+                    "builtin", "synergy", "benchmark", "benchmark-nature"
+                ]
             )
 
         except Exception as err:
diff --git a/asreview/webapp/tests/test_api/test_projects.py b/asreview/webapp/tests/test_api/test_projects.py
index d1a9859b6..474b1f456 100644
--- a/asreview/webapp/tests/test_api/test_projects.py
+++ b/asreview/webapp/tests/test_api/test_projects.py
@@ -17,7 +17,7 @@
 # NOTE: I don't see a plugin that can be used for testing
 # purposes
 UPLOAD_DATA = [
-    {"benchmark": "benchmark:Hall_2012"},
+    {"benchmark": "synergy:van_der_Valk_2021"},
     {
         "url": "https://raw.githubusercontent.com/asreview/"
         + "asreview/master/tests/demo_data/generic_labels.csv"
diff --git a/docs/source/reference.rst b/docs/source/reference.rst
index 72bd61ecf..d96caaee8 100644
--- a/docs/source/reference.rst
+++ b/docs/source/reference.rst
@@ -58,7 +58,7 @@ Available datasets
 .. autosummary::
    :toctree: generated/
 
-   asreview.datasets.BenchmarkDataGroup
+   asreview.datasets.SynergyDataGroup
    asreview.datasets.NaturePublicationDataGroup
 
 Dataset managers
diff --git a/docs/source/simulation_cli.rst b/docs/source/simulation_cli.rst
index 81d130b02..4e67fc463 100644
--- a/docs/source/simulation_cli.rst
+++ b/docs/source/simulation_cli.rst
@@ -62,23 +62,21 @@ Dataset
 
 .. option:: dataset
 
-    Required. File path or URL to the dataset or one of the benchmark datasets.
+    Required. File path or URL to the dataset or one of the SYNERGY datasets.
 
-You can also use one of the :ref:`benchmark-datasets ` (see `index.csv
-`_
-for dataset IDs). Use the following command and replace ``DATASET_ID`` by the
+You can also use one of the :ref:`SYNERGY dataset `. Use the following command and replace ``DATASET_ID`` by the
 dataset ID.
 
 .. code:: bash
 
-    asreview simulate benchmark:DATASET_ID
+    asreview simulate synergy:DATASET_ID
 
 For example:
 
 .. code:: bash
 
-    asreview simulate benchmark:van_de_Schoot_2017 -s myreview.asreview
+    asreview simulate synergy:van_de_schoot_2018 -s myreview.asreview
 
 
 Active learning
diff --git a/docs/source/simulation_overview.rst b/docs/source/simulation_overview.rst
index 23641648f..39046caa3 100644
--- a/docs/source/simulation_overview.rst
+++ b/docs/source/simulation_overview.rst
@@ -41,7 +41,7 @@ inspection
 Datasets for simulation
 -----------------------
 
-Simulations require :ref:`fully labeled datasets ` (labels: ``0`` = irrelevant, ``1`` = relevant). Such a dataset can be the result of an earlier study. ASReview offers also fully labeled datasets via the `benchmark platform `_. These datasets are available via the user interface in the *Data* step of the setup and in the command line with the prefix `benchmark:` (e.g. `benchmark:van_de_schoot_2017`).
+Simulations require :ref:`fully labeled datasets ` (labels: ``0`` = irrelevant, ``1`` = relevant). Such a dataset can be the result of an earlier study. ASReview offers also fully labeled datasets via the `SYNERGY dataset `_. These datasets are available via the user interface in the *Data* step of the setup and in the command line with the prefix `synergy:` (e.g. `synergy:van_de_schoot_2018`).
 
 .. tip::
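The SYNERGY IDs referenced above (synergy:van_der_Valk_2021 in the web-app test, synergy:van_de_schoot_2018 in the docs) resolve through the same dataset manager that api_demo_data_project() calls. A small sketch, assuming network access to the SYNERGY index and that found datasets expose title and filepath as the removed benchmark test relied on:

    from asreview.datasets import DatasetManager

    manager = DatasetManager()

    # Mirror the exclude list added in asreview/webapp/api/projects.py to keep
    # only plugin-provided dataset groups.
    plugin_groups = manager.list(
        exclude=["builtin", "synergy", "benchmark", "benchmark-nature"]
    )

    # Resolve one SYNERGY ID, analogous to the removed benchmark test.
    dataset = manager.find("synergy:van_der_Valk_2021")
    print(dataset.title, dataset.filepath)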
diff --git a/setup.py b/setup.py
index 988e513be..72c75dc84 100644
--- a/setup.py
+++ b/setup.py
@@ -199,9 +199,9 @@ def get_cmdclass():
             ".xlsx = asreview.io:ExcelWriter",
         ],
         "asreview.datasets": [
-            "benchmark = asreview.datasets:BenchmarkDataGroup",
             "benchmark-nature = asreview.datasets:NaturePublicationDataGroup",
             "synergy = asreview.datasets:SynergyDataGroup",
+            "benchmark = asreview.datasets:BenchmarkDataGroup",
         ],
         "asreview.models.classifiers": [
             "svm = asreview.models.classifiers:SVMClassifier",
diff --git a/tests/test_data.py b/tests/test_data.py
index 42a94a008..8065481c1 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -39,11 +39,6 @@ def test_fuzzy_finder(keywords, paper_id):
 @mark.parametrize(
     "data_name",
     [
-        # datasets from the datasets repo
-        "benchmark:van_de_Schoot_2017",
-        "benchmark:Hall_2012",
-        "benchmark:Cohen_2006_ACEInhibitors",
-        "benchmark:Bos_2018",
         # datasets from the Van de Schoot et al. paper
         # https://github.com/asreview/paper-asreview/blob/master/index_v1.json
         "benchmark-nature:van_de_Schoot_2017",
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 8f99f1d98..3839cb8c7 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -6,21 +6,6 @@
 from asreview.datasets import NaturePublicationDataGroup
 
 
-@pytest.mark.parametrize(
-    "data_id",
-    [
-        "benchmark:van_de_Schoot_2017",
-        "benchmark:Hall_2012",
-        "benchmark:Cohen_2006_ACEInhibitors",
-        "benchmark:Bos_2018",
-    ],
-)
-def test_datasets(data_id):
-    data = DatasetManager().find(data_id)
-    assert data.filepath.startswith("https://raw.githubusercontent.com/asreview/")
-    assert data.title is not None
-
-
 def test_group():
     group_nature = NaturePublicationDataGroup()
 
diff --git a/tests/test_models.sh b/tests/test_models.sh
index ac9c1cf9b..affef6504 100644
--- a/tests/test_models.sh
+++ b/tests/test_models.sh
@@ -1,12 +1,12 @@
-DATASET="van_de_schoot_2017"
+DATASET="van_de_schoot_2018"
 
 QUERY_STRATEGIES=('max_random' 'max_uncertainty' 'max' 'uncertainty' 'random')
 # ('max_random' 'max_uncertainty' 'max' 'uncertainty' 'random' 'cluster')
 
 for qs in "${QUERY_STRATEGIES[@]}"
 do
-    asreview simulate benchmark:${DATASET} -q $qs --seed 535 --init_seed 535 -s ${DATASET}_${qs}.asreview
+    asreview simulate synergy:${DATASET} -q $qs --seed 535 --init_seed 535 -s ${DATASET}_${qs}.asreview
     asreview plot recall ${DATASET}_${qs}.asreview -o ${DATASET}_${qs}_recall.png
 done
 
@@ -16,7 +16,7 @@ BALANCE_STRATEGIES=('double' 'simple' 'undersample')
 
 for bs in "${BALANCE_STRATEGIES[@]}"
 do
-    asreview simulate benchmark:${DATASET} -q $bs --seed 535 --init_seed 535 -s ${DATASET}_${bs}.asreview
+    asreview simulate synergy:${DATASET} -q $bs --seed 535 --init_seed 535 -s ${DATASET}_${bs}.asreview
     asreview plot recall ${DATASET}_${bs}.asreview -o ${DATASET}_${bs}_recall.png
 done
 
@@ -27,7 +27,7 @@ MODELS=('logistic' 'nb' 'rf' 'svm')
 
 for m in "${MODELS[@]}"
 do
-    asreview simulate benchmark:${DATASET} -q $m --seed 535 --init_seed 535 -s ${DATASET}_${m}.asreview
+    asreview simulate synergy:${DATASET} -q $m --seed 535 --init_seed 535 -s ${DATASET}_${m}.asreview
     asreview plot recall ${DATASET}_${m}.asreview -o ${DATASET}_${m}_recall.png
 done
 
@@ -38,7 +38,7 @@ FEATURE_STRATEGIES=('tfidf')
 
 for fs in "${FEATURE_STRATEGIES[@]}"
 do
-    asreview simulate benchmark:${DATASET} -q $fs --seed 535 --init_seed 535 -s ${DATASET}_${fs}.asreview
+    asreview simulate synergy:${DATASET} -q $fs --seed 535 --init_seed 535 -s ${DATASET}_${fs}.asreview
     asreview plot recall ${DATASET}_${fs}.asreview -o ${DATASET}_${fs}_recall.png
 done
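The shell loops above drive the CLI directly; the same run can be started from Python through the simulate entry point, as tests/test_simulate.py does. A rough sketch of one query-strategy iteration (flags copied from the script; downloading synergy:van_de_schoot_2018 needs internet access and the run takes a while):

    from asreview.entry_points.simulate import SimulateEntryPoint

    # One iteration of the QUERY_STRATEGIES loop in tests/test_models.sh.
    argv = [
        "synergy:van_de_schoot_2018",
        "-q", "max",
        "--seed", "535",
        "--init_seed", "535",
        "-s", "van_de_schoot_2018_max.asreview",
    ]
    SimulateEntryPoint().execute(argv)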
diff --git a/tests/test_simulate.py b/tests/test_simulate.py
index 4af3332c6..b6864bf6a 100644
--- a/tests/test_simulate.py
+++ b/tests/test_simulate.py
@@ -4,7 +4,6 @@
 import pytest
 
 from asreview.entry_points.simulate import SimulateEntryPoint
-from asreview.entry_points.simulate import _get_dataset_path_from_args
 from asreview.entry_points.simulate import _simulate_parser
 from asreview.project import ASReviewProject
 from asreview.project import ProjectExistsError
@@ -337,9 +336,3 @@ def test_is_partial_simulation(tmpdir):
     entry_point.execute(argv)
 
     assert _is_partial_simulation(args)  # noqa
-
-
-def test_get_dataset_path_from_args():
-    assert _get_dataset_path_from_args("test") == "test.csv"
-    assert _get_dataset_path_from_args("test.ris") == "test.csv"
-    assert _get_dataset_path_from_args("benchmark:test") == "test.csv"
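The benchmark entry point stays registered in setup.py, so existing references keep resolving, but constructing the group now warns. A quick check that the warning added in asreview/datasets.py fires (instantiation still downloads the old index, so this needs internet access):

    import warnings

    from asreview.datasets import BenchmarkDataGroup

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        BenchmarkDataGroup()  # emits the UserWarning pointing to SYNERGY

    assert any("deprecated" in str(w.message) for w in caught)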