Add Legal Opinion Sentiment Classification scenario (#3286)
Co-authored-by: Ryo Kawahara <[email protected]>
Co-authored-by: Mikio Takeuchi <[email protected]>
3 people authored Jan 29, 2025
1 parent 92e3ee1 commit 59dcfb1
Showing 3 changed files with 119 additions and 3 deletions.
28 changes: 25 additions & 3 deletions src/helm/benchmark/run_specs/enterprise_run_specs.py
@@ -19,7 +19,7 @@ def _get_weighted_classification_metric_specs(labels: List[str]) -> List[MetricSpec]:
     return [
         MetricSpec(
             class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric",
-            args={"averages": ["weighted"], "labels": labels},
+            args={"averages": ["weighted"], "scores": ["f1", "precision", "recall"], "labels": labels},
         )
     ]
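For orientation, here is a minimal sketch of what the newly listed scores plausibly correspond to, assuming the metric follows scikit-learn's weighted-averaging semantics; ClassificationMetric's internals are not part of this diff, so treat the labels, predictions, and calls below as illustrative rather than as HELM's actual implementation.

# Illustration only: weighted f1/precision/recall over string labels, assuming
# sklearn-style semantics; not the actual ClassificationMetric implementation.
from sklearn.metrics import f1_score, precision_score, recall_score

y_true = ["positive", "neutral", "negative", "neutral"]  # made-up gold labels
y_pred = ["positive", "neutral", "neutral", "neutral"]   # made-up model outputs
labels = ["positive", "neutral", "negative"]

for score_name, score_fn in [("f1", f1_score), ("precision", precision_score), ("recall", recall_score)]:
    value = score_fn(y_true, y_pred, labels=labels, average="weighted", zero_division=0)
    print(f"weighted {score_name}: {value:.3f}")

The lowercase labels in the next hunk ("yes"/"no" rather than "Yes"/"No") are presumably part of the same cleanup, since label strings only count toward these scores when they line up exactly with the normalized model outputs.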

@@ -44,7 +44,7 @@ def get_news_headline_spec(category: str) -> RunSpec:
         name=f"gold_commodity_news:category={category}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(labels=["Yes", "No"]),
+        metric_specs=get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(labels=["yes", "no"]),
         groups=["gold_commodity_news"],
     )

@@ -53,7 +53,7 @@ def get_news_headline_spec(category: str) -> RunSpec:
 
 
 @run_spec_function("legal_contract_summarization")
-def get_legal_contract_spec() -> RunSpec:
+def get_legal_contract_summarization_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.legal_contract_summarization_scenario.LegalContractSummarizationScenario",
         args={},
@@ -76,6 +76,28 @@ def get_legal_contract_spec() -> RunSpec:
     )
 
 
+@run_spec_function("legal_opinion_sentiment_classification")
+def get_legal_opinion_sentiment_classification_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.legal_opinion_sentiment_classification_scenario.LegalOpinionSentimentClassificationScenario",  # noqa: E501
+    )
+
+    instructions = "Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative."  # noqa: E501
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instructions,
+        output_noun="Label",
+    )
+
+    return RunSpec(
+        name="legal_opinion_sentiment_classification",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs()
+        + _get_weighted_classification_metric_specs(labels=["positive", "neutral", "negative"]),
+        groups=["legal_opinion_sentiment_classification"],
+    )
+
+
 @run_spec_function("casehold")
 def get_casehold_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.casehold_scenario.CaseHOLDScenario", args={})
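Putting the new run spec together: given the instructions and output_noun="Label" above, and the example prompt in the scenario docstring added below, each request should look roughly like the following. The exact adapter formatting, including how in-context training examples are prepended, is an assumption here rather than something shown in this diff.

# Rough shape of a single prompt under this adapter configuration; the sentence is
# hypothetical, and the real adapter also prepends in-context training examples.
example_prompt = (
    "Classify the sentences into one of the 3 sentiment categories. "
    "Possible labels: positive, neutral, negative.\n"
    "\n"
    "The court affirmed the judgment of the district court.\n"
    "Label:"
)
print(example_prompt)

The model is expected to complete the prompt with one of positive, neutral, or negative, which is what the exact-match and weighted classification metrics then score.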
77 changes: 77 additions & 0 deletions src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py
@@ -0,0 +1,77 @@
import os
from typing import List

import pandas as pd

from helm.benchmark.scenarios.scenario import (
    Scenario,
    Instance,
    Reference,
    TRAIN_SPLIT,
    TEST_SPLIT,
    CORRECT_TAG,
    Input,
    Output,
)
from helm.common.general import ensure_file_downloaded, ensure_directory_exists


class LegalOpinionSentimentClassificationScenario(Scenario):
"""
A legal opinion sentiment classification task based on the paper
Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting
[(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf).
Example prompt:
Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative.
{Sentence}
Label: {positive/neutral/negative}
"""

# Names of the tasks we support

name = "legal_opinion"
description = "Predicting the sentiment of the legal text in the positive, negative, or neutral."
tags = ["classification", "sentiment analysis", "legal"]

SENTIMENT_CLASSES = ["positive", "negative", "neutral"]
SPLIT_TO_URL = {
TRAIN_SPLIT: "https://osf.io/download/hfn62/",
TEST_SPLIT: "https://osf.io/download/q4adh/",
}

    def create_instances(self, df: pd.DataFrame, split: str) -> List[Instance]:
        instances: List[Instance] = []
        assert split in [TRAIN_SPLIT, TEST_SPLIT]
        if split == TRAIN_SPLIT:
            phrase_column_name = "Phrase"
            label_column_name = "Label"
        else:
            phrase_column_name = "sentence"
            label_column_name = "label"
        for row in df.itertuples():
            phrase = getattr(row, phrase_column_name)
            label_index = int(getattr(row, label_column_name))
            label = LegalOpinionSentimentClassificationScenario.SENTIMENT_CLASSES[label_index]
            instance = Instance(
                input=Input(text=phrase), references=[Reference(Output(text=label), tags=[CORRECT_TAG])], split=split
            )
            instances.append(instance)
        return instances

    def get_instances(self, output_path: str) -> List[Instance]:
        self.data_dir = os.path.join(output_path, "data")
        data_dir = self.data_dir
        ensure_directory_exists(data_dir)
        instances: List[Instance] = []
        for split, url in LegalOpinionSentimentClassificationScenario.SPLIT_TO_URL.items():
            file_name = f"{split.lower()}.xlsx"
            file_path = os.path.join(data_dir, file_name)
            ensure_file_downloaded(
                source_url=url,
                target_path=file_path,
            )
            df = pd.read_excel(file_path)
            instances.extend(self.create_instances(df, split))
        return instances
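A quick way to sanity-check the row-to-instance mapping above: the test split uses lowercase sentence/label columns, and the integer label indexes into SENTIMENT_CLASSES = ["positive", "negative", "neutral"]. The DataFrame below is made up for illustration; the real data comes from the OSF spreadsheets downloaded in get_instances.

# Hypothetical smoke test with toy data; not part of the committed code or the real dataset.
import pandas as pd

from helm.benchmark.scenarios.legal_opinion_sentiment_classification_scenario import (
    LegalOpinionSentimentClassificationScenario,
)
from helm.benchmark.scenarios.scenario import CORRECT_TAG, TEST_SPLIT

toy_df = pd.DataFrame(
    {
        "sentence": ["The appeal is dismissed.", "The court praised counsel's diligence."],
        "label": [2, 0],  # 2 -> "neutral", 0 -> "positive" under the SENTIMENT_CLASSES ordering
    }
)

scenario = LegalOpinionSentimentClassificationScenario()
for instance in scenario.create_instances(toy_df, TEST_SPLIT):
    gold = [ref.output.text for ref in instance.references if CORRECT_TAG in ref.tags]
    print(instance.input.text, "->", gold)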
17 changes: 17 additions & 0 deletions src/helm/benchmark/static/schema_enterprise.yaml
@@ -121,6 +121,7 @@ run_groups:
     subgroups:
       - legal_contract_summarization
       - casehold
+      - legal_opinion_sentiment_classification
 
   - name: climate_scenarios
     display_name: Climate Scenarios
@@ -187,6 +188,22 @@ run_groups:
       when: before 2021
       language: English
 
+  - name: legal_opinion_sentiment_classification
+    display_name: Legal Opinion Sentiment Classification
+    description: A legal opinion sentiment classification task based on the paper Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: sentiment analysis
+      what: United States legal opinion texts
+      who: United States courts
+      when: Before 2020
+      language: English
+
   - name: sumosum
     display_name: SUMO Web Claims Summarization
     description: A summarization benchmark based on the climate subset of the SUMO dataset ([Mishra et al., 2020](https://aclanthology.org/2020.wnut-1.12/)).
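As a final, illustrative consistency check (assuming the registered run spec function can be imported and called directly, which this diff does not demonstrate): the groups value returned by the run spec must match the run group name declared in schema_enterprise.yaml for results to be aggregated under the new Legal Opinion Sentiment Classification group.

# Illustrative only: confirm that the RunSpec's group matches the schema run group name.
from helm.benchmark.run_specs.enterprise_run_specs import get_legal_opinion_sentiment_classification_spec

run_spec = get_legal_opinion_sentiment_classification_spec()
print(run_spec.name)    # legal_opinion_sentiment_classification
print(run_spec.groups)  # ['legal_opinion_sentiment_classification'], matching the schema entry above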
