From a1ddda41addcbd0eac3f04557d00e5ba238cf72a Mon Sep 17 00:00:00 2001
From: "clementine@huggingface.co" <clementine@huggingface.co>
Date: Wed, 31 Jan 2024 18:16:48 +0000
Subject: [PATCH] add system for lighteval format

---
 .../tasks/tasks_prompt_formatting.py          | 53 +++++++++++++++------
 src/lighteval/tasks/tasks_table.jsonl         | 18 ++++++++
 tasks_examples/bbh.txt                        | 45 ++++++++---------
 3 files changed, 77 insertions(+), 39 deletions(-)

diff --git a/src/lighteval/tasks/tasks_prompt_formatting.py b/src/lighteval/tasks/tasks_prompt_formatting.py
index bf6f41db5..8c2c2b47a 100644
--- a/src/lighteval/tasks/tasks_prompt_formatting.py
+++ b/src/lighteval/tasks/tasks_prompt_formatting.py
@@ -1,6 +1,7 @@
 import ast
 import json
 import random
+import numpy as np
 import re
 import string
 
@@ -115,21 +116,49 @@ def process_path(path: str) -> str:
         story.append(text)
     return queries
 
 
+def bbh_harness(line, task_name: str = None):
+    # Drop keys whose value is None so the `.get` defaults below apply.
+    line = {k: v for k, v in line.items() if v is not None}
+
+    task_prefix = line.get("task_prefix", "")
+    example_input_prefix = line.get("example_input_prefix", "\nQ: ")
+    query = f"{task_prefix}{example_input_prefix}{line['input']}"
+
+    rng = np.random.RandomState(seed=42)
+    choice_prefix = line.get("choice_prefix", "\n choice: ")
+    append_choices = bool(line.get("append_choices_to_input", True))
+    # Default to the dataset order and gold index so both stay defined when
+    # the choices are not appended to the input.
+    permuted_choices = list(line["choices"])
+    correct_index = line["target_idx"]
+    if append_choices:
+        permuted_choices = list(rng.permutation(sorted(line["choices"])))
+        query = f"{query}{choice_prefix}{choice_prefix.join(permuted_choices)}"
+        gold_item = line["choices"][line["target_idx"]]
+        correct_index = permuted_choices.index(gold_item)
+
+    example_output_prefix = line.get("example_output_prefix", "\nA: ")
+    query = f"{query}{example_output_prefix}"
+
+    return Doc(
+        task_name=task_name,
+        query=query,
+        choices=permuted_choices,
+        gold_index=correct_index,
+        target_for_fewshot_sorting=permuted_choices,
+        instruction=line.get("task_prefix", None),
+    )
+
+
 def bbh_lighteval(line, task_name: str = None):
-    query = "", ""
-    if "task_prefix" in line:
-        query = line["task_prefix"]
-    if "example_input_prefix" in line:
-        query += line["example_input_prefix"]
-    else:
-        query += "Q: "
+    line = {k: v for k, v in line.items() if v is not None}
+
+    query = line.get("task_prefix", "")
+    query += line.get("example_input_prefix", "\nQuestion: ")
     query += line["input"]
-    if "example_output_prefix" in line:
-        query += line["example_output_prefix"]
-    else:
-        query += "Choices:"
+    query += line.get("choice_prefix", "\n Choices: ")
     query += "".join([f"\n{key}. {choice}" for key, choice in zip(LETTER_INDICES, line["choices"])])
-    query += "\nAnswer: "
+    query += line.get("example_output_prefix", "\nAnswer: ")
     return Doc(
         task_name=task_name,
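[Illustration, not part of the patch: a minimal usage sketch of the new
bbh_harness formatter. The sample row below is made up; its field names follow
the line.get calls in the function above.]

    from lighteval.tasks.tasks_prompt_formatting import bbh_harness

    # Hypothetical BBH-style row; real rows come from the lighteval/bbh dataset.
    sample = {
        "input": 'Is the following sentence plausible? "Lionel Messi scored a goal."',
        "choices": ["no", "yes"],
        "target_idx": 1,
    }
    doc = bbh_harness(sample, task_name="bigbench:sports_understanding")
    # doc.query has the shape "\nQ: <input>\n choice: ...\n choice: ...\nA: ",
    # with the choices in the seed-42 shuffled order; doc.gold_index tracks
    # where "yes" landed after that shuffle.
    print(doc.query, doc.gold_index)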
diff --git a/src/lighteval/tasks/tasks_table.jsonl b/src/lighteval/tasks/tasks_table.jsonl
index 628764611..9ca6ed46c 100644
--- a/src/lighteval/tasks/tasks_table.jsonl
+++ b/src/lighteval/tasks/tasks_table.jsonl
@@ -86,6 +86,24 @@
 {"name":"bigbench:tracking_shuffled_objects_five_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
 {"name":"bigbench:tracking_shuffled_objects_seven_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
 {"name":"bigbench:tracking_shuffled_objects_three_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
+{"name":"bigbench:causal_judgment","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"causal_judgement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
+{"name":"bigbench:date_understanding","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"date_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
+{"name":"bigbench:disambiguation_qa","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"disambiguation_qa","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
+{"name":"bigbench:geometric_shapes","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"geometric_shapes","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
+{"name":"bigbench:logical_deduction_five_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
+{"name":"bigbench:logical_deduction_seven_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
+{"name":"bigbench:logical_deduction_three_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
+{"name":"bigbench:movie_recommendation","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"movie_recommendation","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
+{"name":"bigbench:navigate","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"navigate","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
+{"name":"bigbench:reasoning_about_colored_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
+{"name":"bigbench:ruin_names","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"ruin_names","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
+{"name":"bigbench:salient_translation_error_detection","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"salient_translation_error_detection","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
+{"name":"bigbench:snarks","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"snarks","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
+{"name":"bigbench:sports_understanding","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"sports_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
+{"name":"bigbench:temporal_sequences","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"temporal_sequences","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
+{"name":"bigbench:tracking_shuffled_objects_five_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
+{"name":"bigbench:tracking_shuffled_objects_seven_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
+{"name":"bigbench:tracking_shuffled_objects_three_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
 {"name":"bigbench:auto_debugging","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"auto_debugging","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
 {"name":"bigbench:bbq_lite_json:age_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-age_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
 {"name":"bigbench:bbq_lite_json:age_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-age_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
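[Illustration, not part of the patch: each new row above is what the runner
resolves when a task line references it. Assuming, per lighteval's task list
convention, that the pipe-separated fields in tasks_examples/bbh.txt are suite,
task name, few-shot count, and few-shot truncation flag, the correspondence
looks like this; the row is trimmed to the fields used here.]

    import json

    # One (trimmed) row from tasks_table.jsonl above.
    row = json.loads(
        '{"name":"bigbench:navigate","suite":["harness"],'
        '"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh",'
        '"hf_subset":"navigate"}'
    )
    # Build the matching task list line with 0 few-shot examples, no truncation.
    task_line = f"{row['suite'][0]}|{row['name']}|0|0"
    assert task_line == "harness|bigbench:navigate|0|0"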
diff --git a/tasks_examples/bbh.txt b/tasks_examples/bbh.txt
index 924e1e64e..e8f6b6967 100644
--- a/tasks_examples/bbh.txt
+++ b/tasks_examples/bbh.txt
@@ -1,30 +1,3 @@
-harness|bbh:boolean_expressions|0|0
-harness|bbh:causal_judgment|0|0
-harness|bbh:date_understanding|0|0
-harness|bbh:disambiguation_qa|0|0
-harness|bbh:dyck_languages|0|0
-harness|bbh:formal_fallacies|0|0
-harness|bbh:geometric_shapes|0|0
-harness|bbh:hyperbaton|0|0
-harness|bbh:logical_deduction_five_objects|0|0
-harness|bbh:logical_deduction_seven_objects|0|0
-harness|bbh:logical_deduction_three_objects|0|0
-harness|bbh:movie_recommendation|0|0
-harness|bbh:multistep_arithmetic_two|0|0
-harness|bbh:navigate|0|0
-harness|bbh:object_counting|0|0
-harness|bbh:penguins_in_a_table|0|0
-harness|bbh:reasoning_about_colored_objects|0|0
-harness|bbh:ruin_names|0|0
-harness|bbh:salient_translation_error_detection|0|0
-harness|bbh:snarks|0|0
-harness|bbh:sports_understanding|0|0
-harness|bbh:temporal_sequences|0|0
-harness|bbh:tracking_shuffled_objects_five_objects|0|0
-harness|bbh:tracking_shuffled_objects_seven_objects|0|0
-harness|bbh:tracking_shuffled_objects_three_objects|0|0
-harness|bbh:web_of_lies|0|0
-harness|bbh:word_sorting|0|0
 lighteval|bigbench:causal_judgment|0|0
 lighteval|bigbench:date_understanding|0|0
 lighteval|bigbench:disambiguation_qa|0|0
@@ -43,3 +16,21 @@ lighteval|bigbench:temporal_sequences|0|0
 lighteval|bigbench:tracking_shuffled_objects_five_objects|0|0
 lighteval|bigbench:tracking_shuffled_objects_seven_objects|0|0
 lighteval|bigbench:tracking_shuffled_objects_three_objects|0|0
+harness|bigbench:causal_judgment|0|0
+harness|bigbench:date_understanding|0|0
+harness|bigbench:disambiguation_qa|0|0
+harness|bigbench:geometric_shapes|0|0
+harness|bigbench:logical_deduction_five_objects|0|0
+harness|bigbench:logical_deduction_seven_objects|0|0
+harness|bigbench:logical_deduction_three_objects|0|0
+harness|bigbench:movie_recommendation|0|0
+harness|bigbench:navigate|0|0
+harness|bigbench:reasoning_about_colored_objects|0|0
+harness|bigbench:ruin_names|0|0
+harness|bigbench:salient_translation_error_detection|0|0
+harness|bigbench:snarks|0|0
+harness|bigbench:sports_understanding|0|0
+harness|bigbench:temporal_sequences|0|0
+harness|bigbench:tracking_shuffled_objects_five_objects|0|0
+harness|bigbench:tracking_shuffled_objects_seven_objects|0|0
+harness|bigbench:tracking_shuffled_objects_three_objects|0|0
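[Illustration, not part of the patch: a quick consistency check one could run
after applying it. check_bbh_tasks is a hypothetical helper; it relies only on
the two file formats shown above, one JSON object per line in tasks_table.jsonl
and suite|task|few_shot|truncation lines in bbh.txt.]

    import json

    def check_bbh_tasks(table_path="src/lighteval/tasks/tasks_table.jsonl",
                        list_path="tasks_examples/bbh.txt"):
        # Collect every suite|name pair registered in the task table.
        registered = set()
        with open(table_path) as f:
            for raw in f:
                if not raw.strip():
                    continue
                row = json.loads(raw)
                for suite in row["suite"]:
                    registered.add(f"{suite}|{row['name']}")
        # Every line of the task list must point at a registered pair.
        with open(list_path) as f:
            for raw in f:
                if not raw.strip():
                    continue
                suite, name, _few_shot, _truncate = raw.strip().split("|")
                assert f"{suite}|{name}" in registered, f"unregistered task: {name}"

    check_bbh_tasks()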