diff --git a/src/helm/benchmark/run_specs/enterprise_run_specs.py b/src/helm/benchmark/run_specs/enterprise_run_specs.py
index 0bba76e097..9dd01b2443 100644
--- a/src/helm/benchmark/run_specs/enterprise_run_specs.py
+++ b/src/helm/benchmark/run_specs/enterprise_run_specs.py
@@ -19,7 +19,7 @@ def _get_weighted_classification_metric_specs(labels: List[str]) -> List[MetricS
     return [
         MetricSpec(
             class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric",
-            args={"averages": ["weighted"], "labels": labels},
+            args={"averages": ["weighted"], "scores": ["f1", "precision", "recall"], "labels": labels},
         )
     ]
 
@@ -44,7 +44,7 @@ def get_news_headline_spec(category: str) -> RunSpec:
         name=f"gold_commodity_news:category={category}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(labels=["Yes", "No"]),
+        metric_specs=get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(labels=["yes", "no"]),
         groups=["gold_commodity_news"],
     )
 
@@ -53,7 +53,7 @@ def get_news_headline_spec(category: str) -> RunSpec:
 
 
 @run_spec_function("legal_contract_summarization")
-def get_legal_contract_spec() -> RunSpec:
+def get_legal_contract_summarization_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.legal_contract_summarization_scenario.LegalContractSummarizationScenario",
         args={},
@@ -76,6 +76,28 @@ def get_legal_contract_spec() -> RunSpec:
     )
 
 
+@run_spec_function("legal_opinion_sentiment_classification")
+def get_legal_opinion_sentiment_classification_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.legal_opinion_sentiment_classification_scenario.LegalOpinionSentimentClassificationScenario",  # noqa: E501
+    )
+
+    instructions = "Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative."  # noqa: E501
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instructions,
+        output_noun="Label",
+    )
+
+    return RunSpec(
+        name="legal_opinion_sentiment_classification",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs()
+        + _get_weighted_classification_metric_specs(labels=["positive", "neutral", "negative"]),
+        groups=["legal_opinion_sentiment_classification"],
+    )
+
+
 @run_spec_function("casehold")
 def get_casehold_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.casehold_scenario.CaseHOLDScenario", args={})
diff --git a/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py
new file mode 100644
index 0000000000..98d9921286
--- /dev/null
+++ b/src/helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py
@@ -0,0 +1,77 @@
+import os
+from typing import List
+
+import pandas as pd
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+
+
+class LegalOpinionSentimentClassificationScenario(Scenario):
+    """
+    A legal opinion sentiment classification task based on the paper
+    Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting
+    [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf).
+
+    Example prompt:
+    Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative.
+    {Sentence}
+    Label: {positive/neutral/negative}
+
+    """
+
+    # Scenario metadata
+
+    name = "legal_opinion"
+    description = "Predicting the sentiment of legal text as positive, negative, or neutral."
+    tags = ["classification", "sentiment analysis", "legal"]
+
+    SENTIMENT_CLASSES = ["positive", "negative", "neutral"]
+    SPLIT_TO_URL = {
+        TRAIN_SPLIT: "https://osf.io/download/hfn62/",
+        TEST_SPLIT: "https://osf.io/download/q4adh/",
+    }
+
+    def create_instances(self, df: pd.DataFrame, split: str) -> List[Instance]:
+        instances: List[Instance] = []
+        assert split in [TRAIN_SPLIT, TEST_SPLIT]
+        # The train and test spreadsheets use different column names.
+        if split == TRAIN_SPLIT:
+            phrase_column_name = "Phrase"
+            label_column_name = "Label"
+        else:
+            phrase_column_name = "sentence"
+            label_column_name = "label"
+        for row in df.itertuples():
+            phrase = getattr(row, phrase_column_name)
+            label_index = int(getattr(row, label_column_name))
+            label = LegalOpinionSentimentClassificationScenario.SENTIMENT_CLASSES[label_index]
+            instance = Instance(
+                input=Input(text=phrase), references=[Reference(Output(text=label), tags=[CORRECT_TAG])], split=split
+            )
+            instances.append(instance)
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_dir = os.path.join(output_path, "data")
+        self.data_dir = data_dir
+        ensure_directory_exists(data_dir)
+        instances: List[Instance] = []
+        for split, url in LegalOpinionSentimentClassificationScenario.SPLIT_TO_URL.items():
+            file_name = f"{split.lower()}.xlsx"
+            file_path = os.path.join(data_dir, file_name)
+            ensure_file_downloaded(
+                source_url=url,
+                target_path=file_path,
+            )
+            df = pd.read_excel(file_path)
+            instances.extend(self.create_instances(df, split))
+        return instances
diff --git a/src/helm/benchmark/static/schema_enterprise.yaml b/src/helm/benchmark/static/schema_enterprise.yaml
index 2b9ebdbdb8..53313e5c4d 100644
--- a/src/helm/benchmark/static/schema_enterprise.yaml
+++ b/src/helm/benchmark/static/schema_enterprise.yaml
@@ -121,6 +121,7 @@ run_groups:
     subgroups:
       - legal_contract_summarization
       - casehold
+      - legal_opinion_sentiment_classification
 
   - name: climate_scenarios
     display_name: Climate Scenarios
@@ -187,6 +188,22 @@
       when: before 2021
       language: English
 
+  - name: legal_opinion_sentiment_classification
+    display_name: Legal Opinion Sentiment Classification
+    description: A legal opinion sentiment classification task based on the paper Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: quasi_exact_match
+      main_split: test
+    taxonomy:
+      task: sentiment analysis
+      what: United States legal opinion texts
+      who: United States courts
+      when: before 2020
+      language: English
+
   - name: sumosum
     display_name: SUMO Web Claims Summarization
     description: A summarization benchmark based on the climate subset of the SUMO dataset ([Mishra et al., 2020](https://aclanthology.org/2020.wnut-1.12/)).
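
For reviewers: a minimal local sanity check of the label mapping in `create_instances`, runnable without downloading the OSF spreadsheets. This is a sketch, not part of the change; the two-row DataFrame and its sentences are made up here, and the column names follow the train-split convention (`Phrase`/`Label`) used in the scenario above, with integer labels indexing into `SENTIMENT_CLASSES`.

```python
# Hypothetical sanity check for LegalOpinionSentimentClassificationScenario.create_instances.
# The sample data below is invented for illustration only.
import pandas as pd

from helm.benchmark.scenarios.legal_opinion_sentiment_classification_scenario import (
    LegalOpinionSentimentClassificationScenario,
)
from helm.benchmark.scenarios.scenario import TRAIN_SPLIT

scenario = LegalOpinionSentimentClassificationScenario()
df = pd.DataFrame(
    {
        "Phrase": ["The court affirmed the defendant's good faith.", "The motion was denied."],
        "Label": [0, 1],  # indices into SENTIMENT_CLASSES: 0 -> "positive", 1 -> "negative"
    }
)
instances = scenario.create_instances(df, TRAIN_SPLIT)
assert instances[0].references[0].output.text == "positive"
assert instances[1].references[0].output.text == "negative"
print(f"Created {len(instances)} instances")
```

To exercise the full pipeline end to end, an invocation along the lines of `helm-run --run-entries legal_opinion_sentiment_classification --suite test --max-eval-instances 10` should work, though the exact flag names depend on the HELM version in use.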