Add multiple annotators to Omni-MATH and rename shared modules #3291

Merged · merged 2 commits · Feb 5, 2025
8 changes: 4 additions & 4 deletions src/helm/benchmark/annotation/model_as_judge.py
@@ -13,7 +13,7 @@ def __init__(self, response_text: str, **kwargs):


 @dataclass
-class _AnnotatorModelInfo:
+class AnnotatorModelInfo:
     model_name: str
     model_deployment: str

@@ -26,9 +26,9 @@ def score_with_reasoning_with_gpt_and_llama(

     Score using GPT-4o and Llama 3.1 for safety scenarios in HELM Safety."""
     # TODO: Make this configurable
-    SHORT_NAME_TO_MODEL_INFO: Dict[str, _AnnotatorModelInfo] = {
-        "gpt": _AnnotatorModelInfo(model_name="openai/gpt-4o-2024-05-13", model_deployment="openai/gpt-4o-2024-05-13"),
-        "llama": _AnnotatorModelInfo(
+    SHORT_NAME_TO_MODEL_INFO: Dict[str, AnnotatorModelInfo] = {
+        "gpt": AnnotatorModelInfo(model_name="openai/gpt-4o-2024-05-13", model_deployment="openai/gpt-4o-2024-05-13"),
+        "llama": AnnotatorModelInfo(
             model_name="meta/llama-3.1-405b-instruct-turbo", model_deployment="together/llama-3.1-405b-instruct-turbo"
         ),
     }
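For context on the rename above: dropping the leading underscore turns _AnnotatorModelInfo into a public name, so other annotator modules can import it without referencing a private symbol. A minimal sketch of that usage, assuming a HELM installation (the judge model shown is the one from this diff):

```python
# Sketch only: construct the now-public dataclass from another module.
from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo

judge = AnnotatorModelInfo(
    model_name="openai/gpt-4o-2024-05-13",
    model_deployment="openai/gpt-4o-2024-05-13",
)
print(judge.model_name, judge.model_deployment)
```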
65 changes: 45 additions & 20 deletions src/helm/benchmark/annotation/omni_math_annotator.py
@@ -1,8 +1,9 @@
-from typing import Any
+from typing import Any, Dict
 from importlib.resources import files

 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
 from helm.clients.auto_client import AutoClient
 from helm.common.request import Request

@@ -46,27 +47,51 @@ def annotate(self, request_state: RequestState) -> Any:
         if not model_output_text.strip():
             return {"prompt_text": annotator_prompt, "correctness": 0.0}

-        annotator_request = Request(
-            model="openai/gpt-4o-2024-05-13",
-            model_deployment="openai/gpt-4o-2024-05-13",
-            prompt=annotator_prompt,
-            temperature=0.0,
-            max_tokens=1000,
-        )
-        annotator_response = self._auto_client.make_request(annotator_request)
-        if not annotator_response.success:
-            raise Exception(f"Annotation request failed: {annotator_response.error}")
-        assert len(annotator_response.completions) == 1
-        annotator_response_text = annotator_response.completions[0].text
+        SHORT_NAME_TO_MODEL_INFO: Dict[str, AnnotatorModelInfo] = {
+            "gpt": AnnotatorModelInfo(
+                model_name="openai/gpt-4o-2024-05-13", model_deployment="openai/gpt-4o-2024-05-13"
+            ),
+            "llama": AnnotatorModelInfo(
+                model_name="meta/llama-3.1-405b-instruct-turbo",
+                model_deployment="together/llama-3.1-405b-instruct-turbo",
+            ),
+            "claude": AnnotatorModelInfo(
+                model_name="anthropic/claude-3-5-sonnet-20241022",
+                model_deployment="anthropic/claude-3-5-sonnet-20241022",
+            ),
+        }
+        all_student_final_answers = []
+        all_equivalence_judgements = []
+        all_justifications = []
+        for annotator_model in SHORT_NAME_TO_MODEL_INFO:
+            annotator_model_info = SHORT_NAME_TO_MODEL_INFO[annotator_model]
+            annotator_request = Request(
+                model=annotator_model_info.model_name,
+                model_deployment=annotator_model_info.model_deployment,
+                prompt=annotator_prompt,
+                temperature=0.0,
+                max_tokens=1000,
+            )
+            annotator_response = self._auto_client.make_request(annotator_request)
+            if not annotator_response.success:
+                raise Exception(f"Annotation request failed: {annotator_response.error}")
+            assert len(annotator_response.completions) == 1
+            annotator_response_text = annotator_response.completions[0].text

+            info = parse_report(annotator_response_text)

-        info = parse_report(annotator_response_text)
+            equivalence_judgement = info.get("Equivalence Judgement", "")
+            student_final_answer = info.get("Student Final Answer", "")
+            justification = info.get("Justification", "").strip().removesuffix("=== report over ===").strip()
+            if equivalence_judgement == "":
+                continue  # skip this annotator if there is no equivalence judgement parsed

Comment on lines +91 to +92

Collaborator: Don't skip.

-        equivalence_judgement = info.get("Equivalence Judgement", "")
-        student_final_answer = info.get("Student Final Answer", "")
-        justification = info.get("Justification", "").strip().removesuffix("=== report over ===").strip()
+            all_student_final_answers.append(student_final_answer)
+            all_equivalence_judgements.append(equivalence_judgement)
+            all_justifications.append(justification)

         return {
-            "student_final_answer": student_final_answer,
-            "equivalence_judgement": equivalence_judgement,
-            "justification": justification,
+            "student_final_answer": all_student_final_answers,
+            "equivalence_judgement": all_equivalence_judgements,
+            "justification": all_justifications,
         }
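With this change, annotate collects one entry per judge whose report could be parsed, so each key in the returned annotation maps to a list rather than a single value. A rough sketch of what a downstream consumer sees (the values below are invented; only the shape comes from the diff):

```python
# Illustrative annotation from the three-judge setup above; the three lists
# are parallel, with one entry per judge whose report parsed successfully.
annotation = {
    "student_final_answer": ["42", "42", "41"],
    "equivalence_judgement": ["TRUE", "TRUE", "FALSE"],
    "justification": ["matches", "matches", "off by one"],
}

# Re-pair the entries per judge, e.g. to inspect disagreements.
for answer, judgement, justification in zip(
    annotation["student_final_answer"],
    annotation["equivalence_judgement"],
    annotation["justification"],
):
    print(judgement, answer, justification)
```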
10 changes: 5 additions & 5 deletions src/helm/benchmark/annotation/wildbench_annotator.py
@@ -5,7 +5,7 @@

 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
-from helm.benchmark.annotation.model_as_judge import _AnnotatorModelInfo
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
 from helm.clients.auto_client import AutoClient
 from helm.common.request import Request

@@ -41,15 +41,15 @@ def annotate(self, request_state: RequestState) -> Any:
.replace("{$checklist}", "\n".join(request_state.instance.extra_data["checklist"]))
)

-        SHORT_NAME_TO_MODEL_INFO: Dict[str, _AnnotatorModelInfo] = {
-            "gpt": _AnnotatorModelInfo(
+        SHORT_NAME_TO_MODEL_INFO: Dict[str, AnnotatorModelInfo] = {
+            "gpt": AnnotatorModelInfo(
                 model_name="openai/gpt-4o-2024-05-13", model_deployment="openai/gpt-4o-2024-05-13"
             ),
-            "llama": _AnnotatorModelInfo(
+            "llama": AnnotatorModelInfo(
                 model_name="meta/llama-3.1-405b-instruct-turbo",
                 model_deployment="together/llama-3.1-405b-instruct-turbo",
             ),
-            "claude": _AnnotatorModelInfo(
+            "claude": AnnotatorModelInfo(
                 model_name="anthropic/claude-3-5-sonnet-20241022",
                 model_deployment="anthropic/claude-3-5-sonnet-20241022",
             ),
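The WildBench annotator now shares the same three-judge table as the Omni-MATH one. Its request loop is outside this hunk, but the shared pattern presumably looks roughly like the sketch below; the query_judges wrapper and its name are illustrative, while the Request fields and client calls are the ones that appear in the diffs:

```python
# Sketch of the per-judge request loop used by the model-as-judge annotators.
from typing import Dict, List

from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
from helm.common.request import Request


def query_judges(auto_client, prompt: str, judges: Dict[str, AnnotatorModelInfo]) -> List[str]:
    """Send the same prompt to every judge and return the raw completion texts.

    auto_client is duck-typed: anything with a make_request(Request) method
    returning an object with .success, .error, and .completions works here.
    """
    completions: List[str] = []
    for model_info in judges.values():
        request = Request(
            model=model_info.model_name,
            model_deployment=model_info.model_deployment,
            prompt=prompt,
            temperature=0.0,
            max_tokens=1000,
        )
        response = auto_client.make_request(request)
        if not response.success:
            raise Exception(f"Annotation request failed: {response.error}")
        completions.append(response.completions[0].text)
    return completions
```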
6 changes: 5 additions & 1 deletion src/helm/benchmark/metrics/omni_math_metrics.py
@@ -19,7 +19,11 @@ def evaluate_generation(
         eval_cache_path: str,
     ) -> List[Stat]:
         assert request_state.annotations
-        score = request_state.annotations["omni_math"]["equivalence_judgement"].strip().upper() == "TRUE"
+        all_judgements = request_state.annotations["omni_math"]["equivalence_judgement"]
+        if len(all_judgements) == 0:
+            raise ValueError("Could not compute Omni-MATH accuracy because all annotators failed.")
+        judgement_bools = [judgement.strip().upper() == "TRUE" for judgement in all_judgements]
+        score = sum(judgement_bools) / len(judgement_bools)
Collaborator: It's not valid to sum an array of bools, right? You need to cast them to int first.

Contributor Author: I think bool is a subclass of int in Python, so it actually works fine..? If that introduces too much ambiguity I can go with explicit casting for sure.

>>> sum([True, False])
1
>>> issubclass(bool, int)
True

         return [
             Stat(MetricName("omni_math_accuracy")).add(score),
         ]
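On the question in the thread above: bool is a subclass of int in Python, so sum() over a list of bools counts the True entries, and the score computed here is the fraction of judges whose judgement parsed as TRUE. A small self-contained check of that scoring rule (the judgement strings are illustrative):

```python
# Sketch of the scoring rule from this hunk, outside of HELM.
all_judgements = ["TRUE", " true ", "FALSE"]
judgement_bools = [judgement.strip().upper() == "TRUE" for judgement in all_judgements]
# bool subclasses int, so sum() counts the True entries directly.
score = sum(judgement_bools) / len(judgement_bools)
assert abs(score - 2 / 3) < 1e-9
```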