From 8b7ee225839fb3ed28bccd007fd12c4eda28df65 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Wed, 22 Jan 2025 12:00:07 -0800 Subject: [PATCH 1/7] legal opinion --- .../run_specs/enterprise_run_specs.py | 37 ++++++++- ...inion_sentiment_classification_scenario.py | 75 +++++++++++++++++++ 2 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py diff --git a/src/helm/benchmark/run_specs/enterprise_run_specs.py b/src/helm/benchmark/run_specs/enterprise_run_specs.py index 8e22bc56dbb..6f8a4b448d1 100644 --- a/src/helm/benchmark/run_specs/enterprise_run_specs.py +++ b/src/helm/benchmark/run_specs/enterprise_run_specs.py @@ -1,5 +1,6 @@ """Run spec functions for HELM Enterprise scenarios.""" +from typing import List, Optional from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT from helm.benchmark.adaptation.common_adapter_specs import ( get_generation_adapter_spec, @@ -11,10 +12,22 @@ get_exact_match_metric_specs, get_f1_metric_specs, ) +from helm.benchmark.metrics.metric import MetricSpec from helm.benchmark.run_spec import RunSpec, run_spec_function from helm.benchmark.scenarios.scenario import ScenarioSpec +def get_weighted_classification_metric_specs( + delimiter: Optional[str] = None, average: str = "weighted", class_defs: Optional[List[str]] = None +) -> List[MetricSpec]: + return [ + MetricSpec( + class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric", + args={"delimiter": delimiter, "average": average, "class_defs": class_defs}, + ) + ] + + # Finance @@ -44,7 +57,7 @@ def get_news_headline_spec(category: str) -> RunSpec: @run_spec_function("legal_contract_summarization") -def get_legal_contract_spec() -> RunSpec: +def get_legal_contract_summarization_spec() -> RunSpec: scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.legal_contract_summarization_scenario.LegalContractSummarizationScenario", args={}, @@ -67,6 +80,28 @@ def get_legal_contract_spec() -> RunSpec: ) +@run_spec_function("legal_opinion_sentiment_classification") +def get_legal_opinion_sentiment_classification_spec() -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.legal_opinion_sentiment_classification_scenario.LegalOpinionSentimentClassificationScenario", + args={}, + ) + + instructions = "Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative." # noqa + adapter_spec = get_generation_adapter_spec( + instructions=instructions, + output_noun="Label", + ) + + return RunSpec( + name="legal_opinion", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs() + get_weighted_classification_metric_specs(), + groups=["legal_opinion"], + ) + + @run_spec_function("casehold") def get_casehold_spec() -> RunSpec: scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.casehold_scenario.CaseHOLDScenario", args={}) diff --git a/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py new file mode 100644 index 00000000000..6171a188008 --- /dev/null +++ b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py @@ -0,0 +1,75 @@ +import os +from typing import List + +import pandas as pd + +from helm.benchmark.scenarios.scenario import ( + Scenario, + Instance, + Reference, + TRAIN_SPLIT, + TEST_SPLIT, + CORRECT_TAG, + Input, + Output, +) +from helm.common.general import ensure_file_downloaded, ensure_directory_exists + + +class LegalOpinionSentimentClassificationScenario(Scenario): + """ + TODO: Fill this in + + Example prompt: + Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative. + {Sentence} + Label: {positive/neutral/negative} + + """ + + # Names of the tasks we support + + name = "legal_opinion" + description = "Predicting the sentiment of the legal text in the positive, negative, or neutral." + tags = ["classification", "sentiment analysis", "legal"] + + SENTIMENT_CLASSES = ["positive", "negative", "neutral"] + SPLIT_TO_URL = { + TRAIN_SPLIT: "https://osf.io/download/hfn62/train.xlsx", + TEST_SPLIT: "https://osf.io/download/hfn62/test.xlsx", + } + + def create_instances(self, df: pd.DataFrame, split: str) -> List[Instance]: + instances: List[Instance] = [] + assert split in [TRAIN_SPLIT, TEST_SPLIT] + if split == TRAIN_SPLIT: + phrase_column_name = "Phrase" + label_column_name = "Label" + else: + phrase_column_name = "sentence" + label_column_name = "label" + for row in df.itertuples(): + phrase = getattr(row, phrase_column_name) + label_index = int(getattr(row, label_column_name)) + label = LegalOpinionSentimentClassificationScenario.SENTIMENT_CLASSES[label_index] + instance = Instance( + input=Input(text=phrase), references=[Reference(Output(text=label), tags=[CORRECT_TAG])], split=split + ) + instances.append(instance) + return instances + + def get_instances(self, output_path: str) -> List[Instance]: + self.data_dir = os.path.join(output_path, "data") + data_dir = self.data_dir + ensure_directory_exists(data_dir) + instances: List[Instance] = [] + for split, url in LegalOpinionSentimentClassificationScenario.SPLIT_TO_URL.items(): + file_name = url.split("/")[-1] + file_path = os.path.join(data_dir, file_name) + ensure_file_downloaded( + source_url=url, + target_path=os.path.join(data_dir, file_name), + ) + df = pd.read_excel(file_path) + instances.extend(self.create_instances(df, split)) + return instances From 7f0ea93aeea321e96c92050ecbdb4804c9e070b2 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Wed, 22 Jan 2025 15:16:51 -0800 Subject: [PATCH 2/7] Fix schema --- .../run_specs/enterprise_run_specs.py | 19 ++----------------- ...inion_sentiment_classification_scenario.py | 6 ++++-- .../benchmark/static/schema_enterprise.yaml | 18 ++++++++++++++++++ 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/src/helm/benchmark/run_specs/enterprise_run_specs.py b/src/helm/benchmark/run_specs/enterprise_run_specs.py index 6f8a4b448d1..0182eb79678 100644 --- a/src/helm/benchmark/run_specs/enterprise_run_specs.py +++ b/src/helm/benchmark/run_specs/enterprise_run_specs.py @@ -1,6 +1,5 @@ """Run spec functions for HELM Enterprise scenarios.""" -from typing import List, Optional from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT from helm.benchmark.adaptation.common_adapter_specs import ( get_generation_adapter_spec, @@ -17,20 +16,6 @@ from helm.benchmark.scenarios.scenario import ScenarioSpec -def get_weighted_classification_metric_specs( - delimiter: Optional[str] = None, average: str = "weighted", class_defs: Optional[List[str]] = None -) -> List[MetricSpec]: - return [ - MetricSpec( - class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric", - args={"delimiter": delimiter, "average": average, "class_defs": class_defs}, - ) - ] - - -# Finance - - @run_spec_function("gold_commodity_news") def get_news_headline_spec(category: str) -> RunSpec: from helm.benchmark.scenarios.gold_commodity_news_scenario import GoldCommodityNewsScenario @@ -84,7 +69,6 @@ def get_legal_contract_summarization_spec() -> RunSpec: def get_legal_opinion_sentiment_classification_spec() -> RunSpec: scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.legal_opinion_sentiment_classification_scenario.LegalOpinionSentimentClassificationScenario", - args={}, ) instructions = "Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative." # noqa @@ -97,7 +81,8 @@ def get_legal_opinion_sentiment_classification_spec() -> RunSpec: name="legal_opinion", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs() + get_weighted_classification_metric_specs(), + # TODO: Switch to using weighted F1 + metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(), groups=["legal_opinion"], ) diff --git a/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py index 6171a188008..3c00e655cc1 100644 --- a/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +++ b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py @@ -18,9 +18,11 @@ class LegalOpinionSentimentClassificationScenario(Scenario): """ - TODO: Fill this in + A legal opinion sentiment classificaiton task based on the paper + Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting + [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf). - Example prompt: + Example prompt: Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative. {Sentence} Label: {positive/neutral/negative} diff --git a/src/helm/benchmark/static/schema_enterprise.yaml b/src/helm/benchmark/static/schema_enterprise.yaml index 7eadad2b4c4..81077e90517 100644 --- a/src/helm/benchmark/static/schema_enterprise.yaml +++ b/src/helm/benchmark/static/schema_enterprise.yaml @@ -116,6 +116,8 @@ run_groups: category: All scenarios subgroups: - legal_contract_summarization + - legal_opinion_sentiment_classification + - casehold - name: climate_scenarios display_name: Climate Scenarios @@ -182,6 +184,22 @@ run_groups: when: before 2021 language: English + - name: legal_opinion_sentiment_classification + display_name: Legal Opinion Sentiment Classification + description: A legal opinion sentiment classificaiton task based on the paper Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf). + metric_groups: + - accuracy + - general_information + environment: + main_name: quasi_exact_match + main_split: test + taxonomy: + task: sentiment analysis + what: United States legal opinion texts + who: United States courts + when: Before 2020 + language: English + - name: sumosum display_name: SUMO Web Claims Summarization description: A summarization benchmark based on the climate subset of the SUMO dataset ([Mishra et al., 2020](https://aclanthology.org/2020.wnut-1.12/)). From a74b31cd49065aa6f4d642618b60ea1654d49e03 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Wed, 22 Jan 2025 15:23:36 -0800 Subject: [PATCH 3/7] Fixes --- src/helm/benchmark/run_specs/enterprise_run_specs.py | 4 ++-- .../legal_opinion_sentiment_classification_scenario.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/helm/benchmark/run_specs/enterprise_run_specs.py b/src/helm/benchmark/run_specs/enterprise_run_specs.py index 0182eb79678..4d6b7179987 100644 --- a/src/helm/benchmark/run_specs/enterprise_run_specs.py +++ b/src/helm/benchmark/run_specs/enterprise_run_specs.py @@ -78,12 +78,12 @@ def get_legal_opinion_sentiment_classification_spec() -> RunSpec: ) return RunSpec( - name="legal_opinion", + name="legal_opinion_sentiment_classification", scenario_spec=scenario_spec, adapter_spec=adapter_spec, # TODO: Switch to using weighted F1 metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(), - groups=["legal_opinion"], + groups=["legal_opinion_sentiment_classification"], ) diff --git a/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py index 3c00e655cc1..d07f5ec6aa1 100644 --- a/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +++ b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py @@ -37,8 +37,8 @@ class LegalOpinionSentimentClassificationScenario(Scenario): SENTIMENT_CLASSES = ["positive", "negative", "neutral"] SPLIT_TO_URL = { - TRAIN_SPLIT: "https://osf.io/download/hfn62/train.xlsx", - TEST_SPLIT: "https://osf.io/download/hfn62/test.xlsx", + TRAIN_SPLIT: "https://osf.io/download/hfn62/", + TEST_SPLIT: "https://osf.io/download/q4adh/", } def create_instances(self, df: pd.DataFrame, split: str) -> List[Instance]: @@ -66,7 +66,7 @@ def get_instances(self, output_path: str) -> List[Instance]: ensure_directory_exists(data_dir) instances: List[Instance] = [] for split, url in LegalOpinionSentimentClassificationScenario.SPLIT_TO_URL.items(): - file_name = url.split("/")[-1] + file_name = f"{split.lower()}.xlsx" file_path = os.path.join(data_dir, file_name) ensure_file_downloaded( source_url=url, From cc144939f259e1a42feceede9b44344a4c69e8bc Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Wed, 22 Jan 2025 15:25:48 -0800 Subject: [PATCH 4/7] Lint --- src/helm/benchmark/run_specs/enterprise_run_specs.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/helm/benchmark/run_specs/enterprise_run_specs.py b/src/helm/benchmark/run_specs/enterprise_run_specs.py index 4d6b7179987..d7f19c2b9ca 100644 --- a/src/helm/benchmark/run_specs/enterprise_run_specs.py +++ b/src/helm/benchmark/run_specs/enterprise_run_specs.py @@ -11,7 +11,6 @@ get_exact_match_metric_specs, get_f1_metric_specs, ) -from helm.benchmark.metrics.metric import MetricSpec from helm.benchmark.run_spec import RunSpec, run_spec_function from helm.benchmark.scenarios.scenario import ScenarioSpec @@ -68,10 +67,10 @@ def get_legal_contract_summarization_spec() -> RunSpec: @run_spec_function("legal_opinion_sentiment_classification") def get_legal_opinion_sentiment_classification_spec() -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.legal_opinion_sentiment_classification_scenario.LegalOpinionSentimentClassificationScenario", + class_name="helm.benchmark.scenarios.legal_opinion_sentiment_classification_scenario.LegalOpinionSentimentClassificationScenario", # noqa: E501 ) - instructions = "Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative." # noqa + instructions = "Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative." # noqa: E501 adapter_spec = get_generation_adapter_spec( instructions=instructions, output_noun="Label", From ba8153876bace198d879b5e8a94a56b3e187abf8 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Wed, 22 Jan 2025 15:29:31 -0800 Subject: [PATCH 5/7] Fix typo and add credit Co-authored-by: Ryo Kawahara Co-authored-by: Mikio Takeuchi --- .../legal_opinion_sentiment_classification_scenario.py | 2 +- src/helm/benchmark/static/schema_enterprise.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py index d07f5ec6aa1..98d99212866 100644 --- a/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +++ b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py @@ -18,7 +18,7 @@ class LegalOpinionSentimentClassificationScenario(Scenario): """ - A legal opinion sentiment classificaiton task based on the paper + A legal opinion sentiment classification task based on the paper Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf). diff --git a/src/helm/benchmark/static/schema_enterprise.yaml b/src/helm/benchmark/static/schema_enterprise.yaml index 81077e90517..4ff593dd02e 100644 --- a/src/helm/benchmark/static/schema_enterprise.yaml +++ b/src/helm/benchmark/static/schema_enterprise.yaml @@ -186,7 +186,7 @@ run_groups: - name: legal_opinion_sentiment_classification display_name: Legal Opinion Sentiment Classification - description: A legal opinion sentiment classificaiton task based on the paper Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf). + description: A legal opinion sentiment classification task based on the paper Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf). metric_groups: - accuracy - general_information From 3087f50242ed63ffe7457d8ab18e21905e6b7c57 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Fri, 24 Jan 2025 11:20:40 -0800 Subject: [PATCH 6/7] Switch to using weighted F1, precision and recall --- src/helm/benchmark/run_specs/enterprise_run_specs.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/helm/benchmark/run_specs/enterprise_run_specs.py b/src/helm/benchmark/run_specs/enterprise_run_specs.py index 8e0f4b65386..006e72cc8df 100644 --- a/src/helm/benchmark/run_specs/enterprise_run_specs.py +++ b/src/helm/benchmark/run_specs/enterprise_run_specs.py @@ -19,7 +19,7 @@ def _get_weighted_classification_metric_specs(labels: List[str]) -> List[MetricS return [ MetricSpec( class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric", - args={"averages": ["weighted"], "labels": labels}, + args={"averages": ["weighted"], "scores": ["f1", "precision", "recall"], "labels": labels}, ) ] @@ -44,7 +44,7 @@ def get_news_headline_spec(category: str) -> RunSpec: name=f"gold_commodity_news:category={category}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(labels=["Yes", "No"]), + metric_specs=get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(labels=["yes", "no"]), groups=["gold_commodity_news"], ) @@ -92,8 +92,7 @@ def get_legal_opinion_sentiment_classification_spec() -> RunSpec: name="legal_opinion_sentiment_classification", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - # TODO: Switch to using weighted F1 - metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(), + metric_specs=get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(labels=["positive", "neutral", "negative"]), groups=["legal_opinion_sentiment_classification"], ) From a240370bad69c071018951b25e7a6fdca4de9855 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Fri, 24 Jan 2025 11:24:09 -0800 Subject: [PATCH 7/7] Fixes --- src/helm/benchmark/run_specs/enterprise_run_specs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/helm/benchmark/run_specs/enterprise_run_specs.py b/src/helm/benchmark/run_specs/enterprise_run_specs.py index 006e72cc8df..9dd01b24432 100644 --- a/src/helm/benchmark/run_specs/enterprise_run_specs.py +++ b/src/helm/benchmark/run_specs/enterprise_run_specs.py @@ -92,7 +92,8 @@ def get_legal_opinion_sentiment_classification_spec() -> RunSpec: name="legal_opinion_sentiment_classification", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(labels=["positive", "neutral", "negative"]), + metric_specs=get_exact_match_metric_specs() + + _get_weighted_classification_metric_specs(labels=["positive", "neutral", "negative"]), groups=["legal_opinion_sentiment_classification"], )