From 8b7ee225839fb3ed28bccd007fd12c4eda28df65 Mon Sep 17 00:00:00 2001
From: Yifan Mai <yifan@cs.stanford.edu>
Date: Wed, 22 Jan 2025 12:00:07 -0800
Subject: [PATCH 1/7] Add legal opinion sentiment classification scenario

---
 .../run_specs/enterprise_run_specs.py         | 37 ++++++++-
 ...inion_sentiment_classification_scenario.py | 75 +++++++++++++++++++
 2 files changed, 111 insertions(+), 1 deletion(-)
 create mode 100644 src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py

diff --git a/src/helm/benchmark/run_specs/enterprise_run_specs.py b/src/helm/benchmark/run_specs/enterprise_run_specs.py
index 8e22bc56dbb..6f8a4b448d1 100644
--- a/src/helm/benchmark/run_specs/enterprise_run_specs.py
+++ b/src/helm/benchmark/run_specs/enterprise_run_specs.py
@@ -1,5 +1,6 @@
 """Run spec functions for HELM Enterprise scenarios."""
 
+from typing import List, Optional
 from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
 from helm.benchmark.adaptation.common_adapter_specs import (
     get_generation_adapter_spec,
@@ -11,10 +12,22 @@
     get_exact_match_metric_specs,
     get_f1_metric_specs,
 )
+from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 
 
+def get_weighted_classification_metric_specs(
+    delimiter: Optional[str] = None, average: str = "weighted", class_defs: Optional[List[str]] = None
+) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric",
+            args={"delimiter": delimiter, "average": average, "class_defs": class_defs},
+        )
+    ]
+
+
 # Finance
 
 
@@ -44,7 +57,7 @@ def get_news_headline_spec(category: str) -> RunSpec:
 
 
 @run_spec_function("legal_contract_summarization")
-def get_legal_contract_spec() -> RunSpec:
+def get_legal_contract_summarization_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.legal_contract_summarization_scenario.LegalContractSummarizationScenario",
         args={},
@@ -67,6 +80,28 @@ def get_legal_contract_spec() -> RunSpec:
     )
 
 
+@run_spec_function("legal_opinion_sentiment_classification")
+def get_legal_opinion_sentiment_classification_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.legal_opinion_sentiment_classification_scenario.LegalOpinionSentimentClassificationScenario",
+        args={},
+    )
+
+    instructions = "Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative."  # noqa
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instructions,
+        output_noun="Label",
+    )
+
+    return RunSpec(
+        name="legal_opinion",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_weighted_classification_metric_specs(),
+        groups=["legal_opinion"],
+    )
+
+
 @run_spec_function("casehold")
 def get_casehold_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.casehold_scenario.CaseHOLDScenario", args={})
diff --git a/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py
new file mode 100644
index 00000000000..6171a188008
--- /dev/null
+++ b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py
@@ -0,0 +1,75 @@
+import os
+from typing import List
+
+import pandas as pd
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+
+
+class LegalOpinionSentimentClassificationScenario(Scenario):
+    """
+        TODO: Fill this in
+
+        Example prompt:
+    Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative.
+    {Sentence}
+    Label: {positive/neutral/negative}
+
+    """
+
+    # Scenario metadata
+
+    name = "legal_opinion"
+    description = "Predicting the sentiment of the legal text in the positive, negative, or neutral."
+    tags = ["classification", "sentiment analysis", "legal"]
+
+    SENTIMENT_CLASSES = ["positive", "negative", "neutral"]
+    SPLIT_TO_URL = {
+        TRAIN_SPLIT: "https://osf.io/download/hfn62/train.xlsx",
+        TEST_SPLIT: "https://osf.io/download/hfn62/test.xlsx",
+    }
+
+    def create_instances(self, df: pd.DataFrame, split: str) -> List[Instance]:
+        instances: List[Instance] = []
+        assert split in [TRAIN_SPLIT, TEST_SPLIT]
+        if split == TRAIN_SPLIT:
+            phrase_column_name = "Phrase"
+            label_column_name = "Label"
+        else:
+            phrase_column_name = "sentence"
+            label_column_name = "label"
+        for row in df.itertuples():
+            phrase = getattr(row, phrase_column_name)
+            label_index = int(getattr(row, label_column_name))
+            label = LegalOpinionSentimentClassificationScenario.SENTIMENT_CLASSES[label_index]
+            instance = Instance(
+                input=Input(text=phrase), references=[Reference(Output(text=label), tags=[CORRECT_TAG])], split=split
+            )
+            instances.append(instance)
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        self.data_dir = os.path.join(output_path, "data")
+        data_dir = self.data_dir
+        ensure_directory_exists(data_dir)
+        instances: List[Instance] = []
+        for split, url in LegalOpinionSentimentClassificationScenario.SPLIT_TO_URL.items():
+            file_name = url.split("/")[-1]
+            file_path = os.path.join(data_dir, file_name)
+            ensure_file_downloaded(
+                source_url=url,
+                target_path=os.path.join(data_dir, file_name),
+            )
+            df = pd.read_excel(file_path)
+            instances.extend(self.create_instances(df, split))
+        return instances

From 7f0ea93aeea321e96c92050ecbdb4804c9e070b2 Mon Sep 17 00:00:00 2001
From: Yifan Mai <yifan@cs.stanford.edu>
Date: Wed, 22 Jan 2025 15:16:51 -0800
Subject: [PATCH 2/7] Fix schema

---
 .../run_specs/enterprise_run_specs.py         | 19 ++-----------------
 ...inion_sentiment_classification_scenario.py |  6 ++++--
 .../benchmark/static/schema_enterprise.yaml   | 18 ++++++++++++++++++
 3 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/src/helm/benchmark/run_specs/enterprise_run_specs.py b/src/helm/benchmark/run_specs/enterprise_run_specs.py
index 6f8a4b448d1..0182eb79678 100644
--- a/src/helm/benchmark/run_specs/enterprise_run_specs.py
+++ b/src/helm/benchmark/run_specs/enterprise_run_specs.py
@@ -1,6 +1,5 @@
 """Run spec functions for HELM Enterprise scenarios."""
 
-from typing import List, Optional
 from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
 from helm.benchmark.adaptation.common_adapter_specs import (
     get_generation_adapter_spec,
@@ -17,20 +16,6 @@
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 
 
-def get_weighted_classification_metric_specs(
-    delimiter: Optional[str] = None, average: str = "weighted", class_defs: Optional[List[str]] = None
-) -> List[MetricSpec]:
-    return [
-        MetricSpec(
-            class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric",
-            args={"delimiter": delimiter, "average": average, "class_defs": class_defs},
-        )
-    ]
-
-
-# Finance
-
-
 @run_spec_function("gold_commodity_news")
 def get_news_headline_spec(category: str) -> RunSpec:
     from helm.benchmark.scenarios.gold_commodity_news_scenario import GoldCommodityNewsScenario
@@ -84,7 +69,6 @@ def get_legal_contract_summarization_spec() -> RunSpec:
 def get_legal_opinion_sentiment_classification_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.legal_opinion_sentiment_classification_scenario.LegalOpinionSentimentClassificationScenario",
-        args={},
     )
 
     instructions = "Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative."  # noqa
@@ -97,7 +81,8 @@ def get_legal_opinion_sentiment_classification_spec() -> RunSpec:
         name="legal_opinion",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs() + get_weighted_classification_metric_specs(),
+        # TODO: Switch to using weighted F1
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
         groups=["legal_opinion"],
     )
 
diff --git a/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py
index 6171a188008..3c00e655cc1 100644
--- a/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py
+++ b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py
@@ -18,9 +18,11 @@
 
 class LegalOpinionSentimentClassificationScenario(Scenario):
     """
-        TODO: Fill this in
+    A legal opinion sentiment classificaiton task based on the paper
+    Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting
+    [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf).
 
-        Example prompt:
+    Example prompt:
     Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative.
     {Sentence}
     Label: {positive/neutral/negative}
diff --git a/src/helm/benchmark/static/schema_enterprise.yaml b/src/helm/benchmark/static/schema_enterprise.yaml
index 7eadad2b4c4..81077e90517 100644
--- a/src/helm/benchmark/static/schema_enterprise.yaml
+++ b/src/helm/benchmark/static/schema_enterprise.yaml
@@ -116,6 +116,8 @@ run_groups:
     category: All scenarios
     subgroups:
       - legal_contract_summarization
+      - legal_opinion_sentiment_classification
+      - casehold
 
   - name: climate_scenarios
     display_name: Climate Scenarios
@@ -182,6 +184,22 @@ run_groups:
       when: before 2021
       language: English
 
+  - name: legal_opinion_sentiment_classification
+    display_name: Legal Opinion Sentiment Classification
+    description: A legal opinion sentiment classificaiton task based on the paper Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: sentiment analysis
+      what: United States legal opinion texts
+      who: United States courts
+      when: Before 2020
+      language: English
+
   - name: sumosum
     display_name: SUMO Web Claims Summarization
     description: A summarization benchmark based on the climate subset of the SUMO dataset ([Mishra et al., 2020](https://aclanthology.org/2020.wnut-1.12/)).

From a74b31cd49065aa6f4d642618b60ea1654d49e03 Mon Sep 17 00:00:00 2001
From: Yifan Mai <yifan@cs.stanford.edu>
Date: Wed, 22 Jan 2025 15:23:36 -0800
Subject: [PATCH 3/7] Fix run spec name/group and dataset download URLs

---
 src/helm/benchmark/run_specs/enterprise_run_specs.py        | 4 ++--
 .../legal_opinion_sentiment_classification_scenario.py      | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/helm/benchmark/run_specs/enterprise_run_specs.py b/src/helm/benchmark/run_specs/enterprise_run_specs.py
index 0182eb79678..4d6b7179987 100644
--- a/src/helm/benchmark/run_specs/enterprise_run_specs.py
+++ b/src/helm/benchmark/run_specs/enterprise_run_specs.py
@@ -78,12 +78,12 @@ def get_legal_opinion_sentiment_classification_spec() -> RunSpec:
     )
 
     return RunSpec(
-        name="legal_opinion",
+        name="legal_opinion_sentiment_classification",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         # TODO: Switch to using weighted F1
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["legal_opinion"],
+        groups=["legal_opinion_sentiment_classification"],
     )
 
 
diff --git a/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py
index 3c00e655cc1..d07f5ec6aa1 100644
--- a/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py
+++ b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py
@@ -37,8 +37,8 @@ class LegalOpinionSentimentClassificationScenario(Scenario):
 
     SENTIMENT_CLASSES = ["positive", "negative", "neutral"]
     SPLIT_TO_URL = {
-        TRAIN_SPLIT: "https://osf.io/download/hfn62/train.xlsx",
-        TEST_SPLIT: "https://osf.io/download/hfn62/test.xlsx",
+        TRAIN_SPLIT: "https://osf.io/download/hfn62/",
+        TEST_SPLIT: "https://osf.io/download/q4adh/",
     }
 
     def create_instances(self, df: pd.DataFrame, split: str) -> List[Instance]:
@@ -66,7 +66,7 @@ def get_instances(self, output_path: str) -> List[Instance]:
         ensure_directory_exists(data_dir)
         instances: List[Instance] = []
         for split, url in LegalOpinionSentimentClassificationScenario.SPLIT_TO_URL.items():
-            file_name = url.split("/")[-1]
+            file_name = f"{split.lower()}.xlsx"
             file_path = os.path.join(data_dir, file_name)
             ensure_file_downloaded(
                 source_url=url,

From cc144939f259e1a42feceede9b44344a4c69e8bc Mon Sep 17 00:00:00 2001
From: Yifan Mai <yifan@cs.stanford.edu>
Date: Wed, 22 Jan 2025 15:25:48 -0800
Subject: [PATCH 4/7] Lint

---
 src/helm/benchmark/run_specs/enterprise_run_specs.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/helm/benchmark/run_specs/enterprise_run_specs.py b/src/helm/benchmark/run_specs/enterprise_run_specs.py
index 4d6b7179987..d7f19c2b9ca 100644
--- a/src/helm/benchmark/run_specs/enterprise_run_specs.py
+++ b/src/helm/benchmark/run_specs/enterprise_run_specs.py
@@ -11,7 +11,6 @@
     get_exact_match_metric_specs,
     get_f1_metric_specs,
 )
-from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 
@@ -68,10 +67,10 @@ def get_legal_contract_summarization_spec() -> RunSpec:
 @run_spec_function("legal_opinion_sentiment_classification")
 def get_legal_opinion_sentiment_classification_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.legal_opinion_sentiment_classification_scenario.LegalOpinionSentimentClassificationScenario",
+        class_name="helm.benchmark.scenarios.legal_opinion_sentiment_classification_scenario.LegalOpinionSentimentClassificationScenario",  # noqa: E501
     )
 
-    instructions = "Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative."  # noqa
+    instructions = "Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative."  # noqa: E501
     adapter_spec = get_generation_adapter_spec(
         instructions=instructions,
         output_noun="Label",

From ba8153876bace198d879b5e8a94a56b3e187abf8 Mon Sep 17 00:00:00 2001
From: Yifan Mai <yifan@cs.stanford.edu>
Date: Wed, 22 Jan 2025 15:29:31 -0800
Subject: [PATCH 5/7] Fix typo and add credit

Co-authored-by: Ryo Kawahara <ryokawa@jp.ibm.com>
Co-authored-by: Mikio Takeuchi <mtake@jp.ibm.com>
---
 .../legal_opinion_sentiment_classification_scenario.py          | 2 +-
 src/helm/benchmark/static/schema_enterprise.yaml                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py
index d07f5ec6aa1..98d99212866 100644
--- a/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py
+++ b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py
@@ -18,7 +18,7 @@
 
 class LegalOpinionSentimentClassificationScenario(Scenario):
     """
-    A legal opinion sentiment classificaiton task based on the paper
+    A legal opinion sentiment classification task based on the paper
     Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting
     [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf).
 
diff --git a/src/helm/benchmark/static/schema_enterprise.yaml b/src/helm/benchmark/static/schema_enterprise.yaml
index 81077e90517..4ff593dd02e 100644
--- a/src/helm/benchmark/static/schema_enterprise.yaml
+++ b/src/helm/benchmark/static/schema_enterprise.yaml
@@ -186,7 +186,7 @@ run_groups:
 
   - name: legal_opinion_sentiment_classification
     display_name: Legal Opinion Sentiment Classification
-    description: A legal opinion sentiment classificaiton task based on the paper Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf).
+    description: A legal opinion sentiment classification task based on the paper Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf).
     metric_groups:
       - accuracy
       - general_information

From 3087f50242ed63ffe7457d8ab18e21905e6b7c57 Mon Sep 17 00:00:00 2001
From: Yifan Mai <yifan@cs.stanford.edu>
Date: Fri, 24 Jan 2025 11:20:40 -0800
Subject: [PATCH 6/7] Switch to using weighted F1, precision and recall

---
 src/helm/benchmark/run_specs/enterprise_run_specs.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/helm/benchmark/run_specs/enterprise_run_specs.py b/src/helm/benchmark/run_specs/enterprise_run_specs.py
index 8e0f4b65386..006e72cc8df 100644
--- a/src/helm/benchmark/run_specs/enterprise_run_specs.py
+++ b/src/helm/benchmark/run_specs/enterprise_run_specs.py
@@ -19,7 +19,7 @@ def _get_weighted_classification_metric_specs(labels: List[str]) -> List[MetricS
     return [
         MetricSpec(
             class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric",
-            args={"averages": ["weighted"], "labels": labels},
+            args={"averages": ["weighted"], "scores": ["f1", "precision", "recall"], "labels": labels},
         )
     ]
 
@@ -44,7 +44,7 @@ def get_news_headline_spec(category: str) -> RunSpec:
         name=f"gold_commodity_news:category={category}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(labels=["Yes", "No"]),
+        metric_specs=get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(labels=["yes", "no"]),
         groups=["gold_commodity_news"],
     )
 
@@ -92,8 +92,7 @@ def get_legal_opinion_sentiment_classification_spec() -> RunSpec:
         name="legal_opinion_sentiment_classification",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        # TODO: Switch to using weighted F1
-        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
+        metric_specs=get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(labels=["positive", "neutral", "negative"]),
         groups=["legal_opinion_sentiment_classification"],
     )
 

From a240370bad69c071018951b25e7a6fdca4de9855 Mon Sep 17 00:00:00 2001
From: Yifan Mai <yifan@cs.stanford.edu>
Date: Fri, 24 Jan 2025 11:24:09 -0800
Subject: [PATCH 7/7] Wrap long metric_specs line to fix lint

---
 src/helm/benchmark/run_specs/enterprise_run_specs.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/helm/benchmark/run_specs/enterprise_run_specs.py b/src/helm/benchmark/run_specs/enterprise_run_specs.py
index 006e72cc8df..9dd01b24432 100644
--- a/src/helm/benchmark/run_specs/enterprise_run_specs.py
+++ b/src/helm/benchmark/run_specs/enterprise_run_specs.py
@@ -92,7 +92,8 @@ def get_legal_opinion_sentiment_classification_spec() -> RunSpec:
         name="legal_opinion_sentiment_classification",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(labels=["positive", "neutral", "negative"]),
+        metric_specs=get_exact_match_metric_specs()
+        + _get_weighted_classification_metric_specs(labels=["positive", "neutral", "negative"]),
         groups=["legal_opinion_sentiment_classification"],
     )