Fix SYNERGY visible in Oracle mode setup (asreview#1556)

PeterLombaers · Nov 2, 2023 · 58a49b2 · 58a49b2
1 parent 0444692
commit 58a49b2
Showing 12 changed files with 28 additions and 66 deletions.
diff --git a/asreview/datasets.py b/asreview/datasets.py
@@ -15,6 +15,7 @@
 import json
 import socket
 import tempfile
+import warnings
 from abc import ABC
 from abc import abstractmethod
 from pathlib import Path
@@ -737,14 +738,21 @@ def __init__(self):
 
 class BenchmarkDataGroup(BaseDataGroup):
     """Datasets available in the benchmark platform.
-
     Deprecated
     """
 
     group_id = "benchmark"
     description = "DEPRECATED: Datasets available in the online benchmark platform"
 
     def __init__(self):
+
+        warnings.warn(
+            "The use of 'benchmark' datasets is deprecated, "
+            "use SYNERGY dataset instead. For more information, see "
+            "https://github.com/asreview/synergy-dataset.",
+            category=UserWarning
+        )
+
         meta_file = "https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/index_v1.json"  # noqa
         datasets = _download_from_metadata(meta_file)
 

diff --git a/asreview/entry_points/simulate.py b/asreview/entry_points/simulate.py
@@ -42,26 +42,6 @@
 from asreview.utils import get_random_state
 
 
-def _get_dataset_path_from_args(args_dataset):
-    """Remove 'benchmark:' from the dataset name and add .csv suffix.
-
-    Parameters
-    ----------
-    args_dataset : str
-        Name of the dataset.
-
-    Returns
-    -------
-    str
-        Dataset name without 'benchmark:' if it started with that,
-        and with .csv suffix.
-    """
-    if args_dataset.startswith("benchmark:"):
-        args_dataset = args_dataset[10:]
-
-    return Path(args_dataset).with_suffix(".csv").name
-
-
 def _set_log_verbosity(verbose):
     if verbose == 0:
         logging.getLogger().setLevel(logging.WARNING)
@@ -135,7 +115,8 @@ def execute(self, argv):  # noqa
             )
 
             # Add the dataset to the project file.
-            dataset_path = _get_dataset_path_from_args(args.dataset)
+            dataset_path = Path(
+                args.dataset.replace(":", "-")).with_suffix(".csv").name
 
             as_data.to_file(Path(fp_tmp_simulation, "data", dataset_path))
             # Update the project.json.

diff --git a/asreview/webapp/api/projects.py b/asreview/webapp/api/projects.py
@@ -305,7 +305,9 @@ def api_demo_data_project():  # noqa: F401
     if subset == "plugin":
         try:
             result_datasets = manager.list(
-                exclude=["builtin", "benchmark", "benchmark-nature"]
+                exclude=[
+                    "builtin", "synergy", "benchmark", "benchmark-nature"
+                ]
             )
 
         except Exception as err:

diff --git a/asreview/webapp/tests/test_api/test_projects.py b/asreview/webapp/tests/test_api/test_projects.py
@@ -17,7 +17,7 @@
 # NOTE: I don't see a plugin that can be used for testing
 # purposes
 UPLOAD_DATA = [
-    {"benchmark": "benchmark:Hall_2012"},
+    {"benchmark": "synergy:van_der_Valk_2021"},
     {
         "url": "https://raw.githubusercontent.com/asreview/"
         + "asreview/master/tests/demo_data/generic_labels.csv"

diff --git a/docs/source/reference.rst b/docs/source/reference.rst
@@ -58,7 +58,7 @@ Available datasets
 .. autosummary::
    :toctree: generated/
 
-   asreview.datasets.BenchmarkDataGroup
+   asreview.datasets.SynergyDataGroup
    asreview.datasets.NaturePublicationDataGroup
 
 Dataset managers

diff --git a/docs/source/simulation_cli.rst b/docs/source/simulation_cli.rst
@@ -62,23 +62,21 @@ Dataset
 
 .. option:: dataset
 
-    Required. File path or URL to the dataset or one of the benchmark datasets.
+    Required. File path or URL to the dataset or one of the SYNERGY datasets.
 
-You can also use one of the :ref:`benchmark-datasets <data_labeled:fully
-labeled data>` (see `index.csv
-<https://github.com/asreview/systematic-review-datasets/blob/master/index.csv>`_
-for dataset IDs). Use the following command and replace ``DATASET_ID`` by the
+You can also use one of the :ref:`SYNERGY datasets <data_labeled:fully
+labeled data>`. Use the following command and replace ``DATASET_ID`` with the
 dataset ID.
 
 .. code:: bash
 
-    asreview simulate benchmark:DATASET_ID
+    asreview simulate synergy:DATASET_ID
 
 For example:
 
 .. code:: bash
 
-    asreview simulate benchmark:van_de_Schoot_2017 -s myreview.asreview
+    asreview simulate synergy:van_de_schoot_2018 -s myreview.asreview
 
 
 Active learning

diff --git a/docs/source/simulation_overview.rst b/docs/source/simulation_overview.rst
@@ -41,7 +41,7 @@ inspection
 Datasets for simulation
 -----------------------
 
-Simulations require :ref:`fully labeled datasets <data_labeled:fully labeled data>` (labels: ``0`` = irrelevant, ``1`` = relevant). Such a dataset can be the result of an earlier study. ASReview offers also fully labeled datasets via the `benchmark platform <https://github.com/asreview/systematic-review-datasets>`_. These datasets are available via the user interface in the *Data* step of the setup and in the command line with the prefix `benchmark:` (e.g. `benchmark:van_de_schoot_2017`).
+Simulations require :ref:`fully labeled datasets <data_labeled:fully labeled data>` (labels: ``0`` = irrelevant, ``1`` = relevant). Such a dataset can be the result of an earlier study. ASReview also offers fully labeled datasets via the `SYNERGY dataset <https://github.com/asreview/synergy-dataset>`_. These datasets are available via the user interface in the *Data* step of the setup and on the command line with the prefix ``synergy:`` (e.g. ``synergy:van_de_schoot_2018``).
 
 .. tip::
 

diff --git a/setup.py b/setup.py
@@ -199,9 +199,9 @@ def get_cmdclass():
             ".xlsx = asreview.io:ExcelWriter",
         ],
         "asreview.datasets": [
-            "benchmark = asreview.datasets:BenchmarkDataGroup",
             "benchmark-nature = asreview.datasets:NaturePublicationDataGroup",
             "synergy = asreview.datasets:SynergyDataGroup",
+            "benchmark = asreview.datasets:BenchmarkDataGroup",
         ],
         "asreview.models.classifiers": [
             "svm = asreview.models.classifiers:SVMClassifier",

diff --git a/tests/test_data.py b/tests/test_data.py
@@ -39,11 +39,6 @@ def test_fuzzy_finder(keywords, paper_id):
 @mark.parametrize(
     "data_name",
     [
-        # datasets from the datasets repo
-        "benchmark:van_de_Schoot_2017",
-        "benchmark:Hall_2012",
-        "benchmark:Cohen_2006_ACEInhibitors",
-        "benchmark:Bos_2018",
         # datasets from the Van de Schoot et al. paper
         # https://github.com/asreview/paper-asreview/blob/master/index_v1.json
         "benchmark-nature:van_de_Schoot_2017",

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
@@ -6,21 +6,6 @@
 from asreview.datasets import NaturePublicationDataGroup
 
 
-@pytest.mark.parametrize(
-    "data_id",
-    [
-        "benchmark:van_de_Schoot_2017",
-        "benchmark:Hall_2012",
-        "benchmark:Cohen_2006_ACEInhibitors",
-        "benchmark:Bos_2018",
-    ],
-)
-def test_datasets(data_id):
-    data = DatasetManager().find(data_id)
-    assert data.filepath.startswith("https://raw.githubusercontent.com/asreview/")
-    assert data.title is not None
-
-
 def test_group():
     group_nature = NaturePublicationDataGroup()
 

diff --git a/tests/test_models.sh b/tests/test_models.sh
@@ -1,12 +1,12 @@
 
-DATASET="van_de_schoot_2017"
+DATASET="van_de_schoot_2018"
 
 QUERY_STRATEGIES=('max_random' 'max_uncertainty' 'max' 'uncertainty' 'random')
 # ('max_random' 'max_uncertainty' 'max' 'uncertainty' 'random' 'cluster')
 
 for qs in "${QUERY_STRATEGIES[@]}"
 do
-  asreview simulate benchmark:${DATASET} -q $qs --seed 535 --init_seed 535 -s ${DATASET}_${qs}.asreview
+  asreview simulate synergy:${DATASET} -q $qs --seed 535 --init_seed 535 -s ${DATASET}_${qs}.asreview
   asreview plot recall ${DATASET}_${qs}.asreview -o ${DATASET}_${qs}_recall.png
 done
 
@@ -16,7 +16,7 @@ BALANCE_STRATEGIES=('double' 'simple' 'undersample')
 
 for bs in "${BALANCE_STRATEGIES[@]}"
 do
-  asreview simulate benchmark:${DATASET} -q $bs --seed 535 --init_seed 535 -s ${DATASET}_${bs}.asreview
+  asreview simulate synergy:${DATASET} -q $bs --seed 535 --init_seed 535 -s ${DATASET}_${bs}.asreview
   asreview plot recall ${DATASET}_${bs}.asreview -o ${DATASET}_${bs}_recall.png
 done
 
@@ -27,7 +27,7 @@ MODELS=('logistic' 'nb' 'rf' 'svm')
 
 for m in "${MODELS[@]}"
 do
-  asreview simulate benchmark:${DATASET} -q $m --seed 535 --init_seed 535 -s ${DATASET}_${m}.asreview
+  asreview simulate synergy:${DATASET} -q $m --seed 535 --init_seed 535 -s ${DATASET}_${m}.asreview
   asreview plot recall ${DATASET}_${m}.asreview -o ${DATASET}_${m}_recall.png
 done
 
@@ -38,7 +38,7 @@ FEATURE_STRATEGIES=('tfidf')
 
 for fs in "${FEATURE_STRATEGIES[@]}"
 do
-  asreview simulate benchmark:${DATASET} -q $fs --seed 535 --init_seed 535 -s ${DATASET}_${fs}.asreview
+  asreview simulate synergy:${DATASET} -q $fs --seed 535 --init_seed 535 -s ${DATASET}_${fs}.asreview
   asreview plot recall ${DATASET}_${fs}.asreview -o ${DATASET}_${fs}_recall.png
 done
 

diff --git a/tests/test_simulate.py b/tests/test_simulate.py
@@ -4,7 +4,6 @@
 import pytest
 
 from asreview.entry_points.simulate import SimulateEntryPoint
-from asreview.entry_points.simulate import _get_dataset_path_from_args
 from asreview.entry_points.simulate import _simulate_parser
 from asreview.project import ASReviewProject
 from asreview.project import ProjectExistsError
@@ -337,9 +336,3 @@ def test_is_partial_simulation(tmpdir):
     entry_point.execute(argv)
 
     assert _is_partial_simulation(args)  # noqa
-
-
-def test_get_dataset_path_from_args():
-    assert _get_dataset_path_from_args("test") == "test.csv"
-    assert _get_dataset_path_from_args("test.ris") == "test.csv"
-    assert _get_dataset_path_from_args("benchmark:test") == "test.csv"