diff --git a/README.md b/README.md index 5fded050175..27d4ecbe04e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,30 @@ +# Notes on this forked version +This is a fork from the original HELM for a study of enterprise benchmarking of LLMs using domain-specific datasets. + +The following scenarios are added. Please refer to the docstring of the source code of each scenario, or the page shown by `helm-server` for the details. +- Finance + - financial_phrasebank + - kpi_edgar + - conv_fin_qa + - news_headline +- Legal + - legal_opinion + - echr_judge + - casehold_qa + - legal_contract +- Climate + - sumosum +- Cyber security + - cti_mitre + +The following metrics are added or modified. +- kpi_edgar_metrics +- classification_metrics (weighted_f1) +- basic_metrics (float_equiv, a bug fix for f1_score) + +This study will be published elsewhere. +- Citation: TBD # Holistic Evaluation of Language Models diff --git a/requirements.txt b/requirements.txt index f99b82972d4..fa120952c5f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,332 +1,196 @@ -# This file is automatically generated by GitHub Actions and contains pinned versions for all transitive Python dependencies. Do not modify this file! -about-time==4.2.1 -absl-py==2.1.0 -accelerate==0.34.2 -ai2-olmo==0.5.0 -ai2-olmo-core==0.1.0 -aiodns==3.2.0 -aiohappyeyeballs==2.4.0 -aiohttp==3.10.6 +2captcha-python==1.1.3 +absl-py==1.2.0 +aiodns==3.0.0 +aiohttp==3.8.5 aiohttp-retry==2.8.3 -aiosignal==1.3.1 +aiosignal==1.2.0 aleph-alpha-client==2.14.0 -alive-progress==3.1.5 -annotated-types==0.7.0 -anthropic==0.34.2 -antlr4-python3-runtime==4.9.3 -anyio==4.6.0 -appdirs==1.4.4 -astunparse==1.6.3 -async-timeout==4.0.3 -attrs==24.2.0 -autograd==1.7.0 -autokeras==1.0.20 -awscli==1.29.85 -beautifulsoup4==4.12.3 -black==24.3.0 -blis==0.7.11 -boto3==1.28.85 -botocore==1.31.85 -bottle==0.12.25 -cached_path==1.6.3 -cachetools==5.5.0 -catalogue==2.0.10 +anthropic==0.2.5 +async-generator==1.10 +async-timeout==4.0.2 +attrs==22.1.0 +beautifulsoup4==4.11.1 +bert-score==0.3.13 +bitarray==2.7.3 +black==22.10.0 +blanc==0.2.7 +blis==0.7.8 +boto3==1.24.89 +botocore==1.27.89 +bottle==0.12.23 +cachetools==5.2.0 +catalogue==2.0.8 cattrs==22.2.0 -certifi==2024.8.30 -cffi==1.17.1 -cfgv==3.4.0 -charset-normalizer==3.3.2 -chex==0.1.86 -click==8.1.7 -clip-anytorch==2.5.2 -cloudpathlib==0.19.0 -cma==3.2.2 -cohere==5.3.5 -colorama==0.4.4 -colorcet==3.0.1 -coloredlogs==15.0.1 -confection==0.1.5 -contourpy==1.3.0 -cycler==0.12.1 -cymem==2.0.8 -dacite==1.8.1 -data==0.4 -datasets==2.21.0 -decorator==5.1.1 -Deprecated==1.2.14 -diffusers==0.24.0 -dill==0.3.8 -distlib==0.3.8 -distro==1.9.0 -dnspython==2.6.1 -docker-pycreds==0.4.0 -docstring_parser==0.16 -docutils==0.16 -einops==0.7.0 -einops-exts==0.0.4 -etils==1.5.2 -eval_type_backport==0.2.0 -exceptiongroup==1.2.2 -fairlearn==0.9.0 -fastavro==1.9.7 -filelock==3.13.1 +certifi==2023.7.22 +cffi==1.15.1 +cfgv==3.3.1 +charset-normalizer==2.1.1 +click==8.0.4 +colorama==0.4.5 +contourpy==1.0.5 +cycler==0.11.0 +cymem==2.0.6 +Cython==0.29.32 +dacite==1.6.0 +datasets==2.14.7 +dill==0.3.5.1 +distlib==0.3.6 +emoji==2.1.0 +et-xmlfile==1.1.0 +exceptiongroup==1.1.0 +filelock==3.8.0 flake8==5.0.4 -flatbuffers==24.3.25 -flax==0.6.11 -fonttools==4.54.1 -frozenlist==1.4.1 -fsspec==2024.2.0 -ftfy==6.1.3 -funcsigs==1.0.2 -future==1.0.0 -gast==0.6.0 -gdown==5.2.0 -gitdb==4.0.11 -GitPython==3.1.43 -google-api-core==2.20.0 -google-api-python-client==2.147.0 -google-auth==2.35.0 -google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.68.0 
-google-cloud-bigquery==3.25.0 -google-cloud-core==2.4.1 -google-cloud-resource-manager==1.12.5 -google-cloud-storage==2.18.2 -google-cloud-translate==3.11.3 -google-crc32c==1.6.0 -google-pasta==0.2.0 -google-resumable-media==2.7.2 -googleapis-common-protos==1.65.0 -grapheme==0.6.0 -grpc-google-iam-v1==0.13.1 -grpcio==1.66.1 -grpcio-status==1.62.3 -gunicorn==23.0.0 +fonttools==4.37.4 +frozenlist==1.3.1 +fsspec==2023.4.0 +gdown==4.4.0 +gevent==21.12.0 +gin-config==0.5.0 +google-api-core==2.10.1 +google-api-python-client==2.64.0 +google-auth==2.12.0 +google-auth-httplib2==0.1.0 +google-cloud-aiplatform==1.36.4 +googleapis-common-protos==1.56.4 +greenlet==1.1.3 +gunicorn==20.1.0 h11==0.14.0 -h5py==3.12.0 -html2text==2024.2.26 -httpcore==1.0.5 -httplib2==0.22.0 -httpx==0.25.2 -httpx-sse==0.4.0 -huggingface-hub==0.23.5 -humanfriendly==10.0 -humanize==4.10.0 +httplib2==0.20.4 +huggingface-hub==0.16.4 icetk==0.0.4 -identify==2.6.1 -idna==3.10 -ImageHash==4.3.1 -imageio==2.35.1 -importlib-resources==5.13.0 -importlib_metadata==8.5.0 -iniconfig==2.0.0 -jax==0.4.30 -jaxlib==0.4.30 -jieba==0.42.1 -Jinja2==3.1.3 -jiter==0.5.0 +identify==2.5.6 +idna==3.4 +importlib-metadata==6.0.0 +importlib-resources==5.10.0 +iniconfig==1.1.1 +Jinja2==3.1.2 jmespath==1.0.1 -joblib==1.4.2 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -jstyleson==0.0.2 -keras==3.5.0 -keras-tuner==1.4.7 -kiwisolver==1.4.7 -kt-legacy==1.0.5 -langcodes==3.4.1 +joblib==1.2.0 +kiwisolver==1.4.4 +langcodes==3.3.0 langdetect==1.0.9 -language_data==1.2.0 -latex==0.7.0 -lazy_loader==0.4 -libclang==18.1.1 -lightning-utilities==0.11.7 -llvmlite==0.43.0 -lpips==0.1.4 -lxml==5.3.0 -Mako==1.3.5 -marisa-trie==1.2.0 -Markdown==3.7 -markdown-it-py==3.0.0 -MarkupSafe==2.1.5 -matplotlib==3.6.3 +llvmlite==0.39.1 +lxml==4.9.1 +Mako==1.2.3 +MarkupSafe==2.1.1 +matplotlib==3.6.0 mccabe==0.7.0 -mdurl==0.1.2 -mistralai==0.0.12 -ml-dtypes==0.4.1 +moverscore==1.0.3 mpmath==1.3.0 -msgpack==1.1.0 -multidict==6.1.0 -multilingual-clip==1.0.10 -multiprocess==0.70.16 -murmurhash==1.0.10 +multidict==6.0.2 +multiprocess==0.70.13 +murmurhash==1.0.8 mypy==1.5.1 mypy-extensions==1.0.0 -namex==0.0.8 -natsort==8.4.0 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -nltk==3.8.1 -nncf==2.13.0 -nodeenv==1.9.1 -NudeNet==2.0.9 -numba==0.60.0 -numpy==1.26.3 -omegaconf==2.3.0 -onnx==1.16.2 -onnxruntime==1.19.2 -open_clip_torch==2.26.1 -openai==1.48.0 -OpenCC==1.1.6 -opencv-python==4.7.0.72 -opencv-python-headless==4.10.0.84 -openvino==2024.4.0 -openvino-telemetry==2024.1.0 -openvino-tokenizers==2024.4.0.0 -opt-einsum==3.3.0 -optax==0.2.3 -optimum==1.22.0 -optimum-intel==1.19.0 -optree==0.12.1 -orbax-checkpoint==0.6.4 -orjson==3.10.7 -outcome==1.3.0.post0 -packaging==24.1 -pandas==2.2.3 -param==2.1.1 -parameterized==0.9.0 -pathspec==0.12.1 -pathtools==0.1.2 -pdf2image==1.16.3 -pillow==10.4.0 -platformdirs==4.3.6 -pluggy==1.5.0 -portalocker==2.10.1 +networkx==2.8.7 +nltk==3.7 +nodeenv==1.7.0 +numba==0.56.4 +numpy==1.23.3 +openai==0.27.8 +openpyxl==3.0.10 +outcome==1.2.0 +packaging==21.3 +pandas==1.5.0 +pandas-stubs==1.5.0.221003 +parameterized==0.8.1 +pathspec==0.10.1 +pathy==0.10.2 +Pillow==9.3.0 +platformdirs==2.5.2 +pluggy==1.0.0 +portalocker==2.5.1 pre-commit==2.20.0 -preshed==3.0.9 -progressbar2==4.5.0 -proto-plus==1.24.0 -protobuf==4.25.5 -psutil==6.0.0 -pyarrow==17.0.0 -pyarrow-hotfix==0.6 -pyasn1==0.6.1 -pyasn1_modules==0.4.1 -pycares==4.4.0 -pycocoevalcap==1.2 -pycocotools==2.0.8 +preshed==3.0.7 +protobuf==3.20.2 +psutil==5.9.2 +pyarrow==11.0.0 +pyasn1==0.4.8 
+pyasn1-modules==0.2.8 +pycares==4.3.0 pycodestyle==2.9.1 -pycparser==2.22 -pyct==0.5.0 -pydantic==2.9.2 -pydantic_core==2.23.4 -pydload==1.0.9 -pydot==2.0.0 +pycparser==2.21 +pydantic==1.8.2 +pyemd==0.5.1 +pyext==0.7 pyflakes==2.5.0 -Pygments==2.18.0 -pyhocon==0.3.61 -pymongo==4.9.1 -pymoo==0.6.1.3 -pyonmttok==1.37.0 -pyparsing==3.1.4 -pypinyin==0.49.0 +pyhocon==0.3.59 +pymongo==4.2.0 +pyparsing==2.4.7 PySocks==1.7.1 -pytest==7.2.2 -pythainlp==5.0.0 +pytest==7.2.0 python-dateutil==2.8.2 -python-utils==3.9.0 -pytorch-fid==0.3.0 -pytorch-lightning==2.0.9.post0 -pytrec_eval==0.5 -pytz==2024.2 -PyWavelets==1.6.0 -PyYAML==6.0.2 -referencing==0.35.1 -regex==2024.9.11 -reka-api==2.0.0 -requests==2.32.3 +pytorch-pretrained-bert==0.6.2 +pytrec-eval==0.5 +pytz==2022.4 +PyYAML==6.0 +regex==2022.9.13 +requests==2.31.0 +responses==0.18.0 retrying==1.3.4 -rich==13.8.1 -rouge_score==0.1.2 -rpds-py==0.20.0 -rsa==4.7.2 -s3transfer==0.7.0 +rouge-score==0.1.2 +rsa==4.9 +s3transfer==0.6.0 sacrebleu==2.2.1 -safetensors==0.4.5 -scaleapi==2.13.1 -scikit-image==0.24.0 -scikit-learn==1.5.2 -scipy==1.13.1 -seaborn==0.11.2 -selenium==4.17.2 -sentencepiece==0.1.99 -sentry-sdk==2.14.0 -setproctitle==1.3.3 -shapely==2.0.6 -shellingham==1.5.4 -shutilwhich==1.1.0 -simple-slurm==0.2.7 +sacremoses==0.0.53 +scaleapi==2.13.0 +scikit-learn==1.1.2 +scipy==1.10.0 +selenium==4.8.0 +sentencepiece==0.1.97 +simple-slurm==0.2.6 six==1.16.0 -smart-open==7.0.4 -smmap==5.0.1 -sniffio==1.3.1 +smart-open==5.2.1 +sniffio==1.3.0 sortedcontainers==2.4.0 -soupsieve==2.6 -spacy==3.7.6 +soupsieve==2.3.2.post1 +spacy==3.5.4 spacy-legacy==3.0.12 -spacy-loggers==1.0.5 +spacy-loggers==1.0.3 sqlitedict==1.7.0 -srsly==2.4.8 -surge-api==1.1.4 +srsly==2.4.4 +stanza==1.4.2 +summ-eval==0.892 +surge-api==1.1.0 sympy==1.11.1 tabulate==0.9.0 -tempdir==0.7.1 -tensorboard==2.17.1 -tensorboard-data-server==0.7.2 -tensorflow==2.17.0 -tensorflow-io-gcs-filesystem==0.37.1 -tensorstore==0.1.65 -termcolor==2.4.0 -thinc==8.2.5 -threadpoolctl==3.5.0 -tifffile==2024.8.30 -tiktoken==0.7.0 -timm==0.6.13 -together==1.2.13 -tokenizers==0.19.1 +thinc==8.1.12 +threadpoolctl==3.1.0 +tiktoken==0.3.3 +tls-client==0.1.8 +tokenizers==0.13.3 toml==0.10.2 tomli==2.0.1 -toolz==0.12.1 -torch~=2.2.2 -torch-fidelity==0.3.0 -torchmetrics==0.11.4 -torchvision~=0.17.2 -tqdm==4.66.5 -transformers==4.44.2 -transformers-stream-generator==0.0.5 -trio==0.26.2 -trio-websocket==0.11.1 -typer==0.12.5 -types-requests==2.31.0.6 -types-urllib3==1.26.25.14 -typing_extensions==4.12.2 -tzdata==2024.2 +torch==1.12.1 ; sys_platform == "darwin" +torchvision==0.13.1 ; sys_platform == "darwin" +torch==1.12.1+cu113 ; sys_platform == "linux" +torchvision==0.13.1+cu113 ; sys_platform == "linux" +tqdm==4.64.1 +transformers==4.33.1 +trio==0.22.0 +trio-websocket==0.9.2 +typer==0.4.2 +types-Pillow==9.3.0.4 +types-pytz==2022.4.0.0 +types-redis==4.3.21.1 +types-requests==2.28.11.2 +types-tabulate==0.9.0.0 +types-urllib3==1.26.25 +typing==3.7.4.3 +typing_extensions==4.4.0 uncertainty-calibration==0.1.4 -Unidecode==1.3.6 +undetected-chromedriver==3.2.1 uritemplate==4.1.1 -urllib3==1.26.20 -virtualenv==20.26.5 -wandb==0.13.11 -wasabi==1.1.3 -wcwidth==0.2.13 -weasel==0.4.1 -websocket-client==1.3.3 -Werkzeug==3.0.4 -wrapt==1.16.0 +urllib3==1.26.12 +virtualenv==20.16.5 +wasabi==0.10.1 +websocket-client==1.3.2 +websockets==10.4 wsproto==1.2.0 xlrd==2.0.1 -xxhash==3.5.0 -yarl==1.12.1 -zipp==3.20.2 +xxhash==3.0.0 +yarl==1.8.1 +zipp==3.11.0 +zope.event==4.5.0 +zope.interface==5.4.0 zstandard==0.18.0 diff --git a/setup.cfg 
b/setup.cfg index 85bf4e10ba3..11997b42ba4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = crfm-helm -version = 0.5.4 +version = 0.4.0 author = Stanford CRFM author_email = contact-crfm@stanford.edu description = Benchmark for language models @@ -9,16 +9,13 @@ long_description_content_type = text/markdown keywords = language models benchmarking license = Apache License 2.0 classifiers = - Programming Language :: Python :: 3 Programming Language :: Python :: 3 :: Only - Programming Language :: Python :: 3.9 - Programming Language :: Python :: 3.10 - Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.8 License :: OSI Approved :: Apache Software License url = https://github.com/stanford-crfm/helm [options] -python_requires = >=3.9,<3.12 +python_requires = >=3.8,<3.11 package_dir = =src packages = find: @@ -27,59 +24,59 @@ include_package_data = True install_requires= # Common - cattrs~=22.2 - dacite~=1.6 - importlib-resources~=5.10 - Mako~=1.2 - numpy~=1.23 + cattrs~=22.2.0 + dacite~=1.6.0 + importlib-resources~=5.10.0 + Mako~=1.2.3 + numpy~=1.23.3 pyhocon~=0.3.59 - retrying~=1.3 - spacy~=3.5 - tqdm~=4.64 + retrying~=1.3.4 + spacy~=3.5.3 + tqdm~=4.64.1 zstandard~=0.18.0 # sqlitedict==2.0.0 is slow! https://github.com/RaRe-Technologies/sqlitedict/issues/152 # Keep sqlitedict version at 1.7.0. - sqlitedict~=1.7 + sqlitedict~=1.7.0 bottle~=0.12.23 # Basic Scenarios - datasets~=2.17 + datasets~=2.14.7 pyarrow>=11.0.0 # Pinned transitive dependency for datasets; workaround for #1026 - pyarrow-hotfix~=0.6 # Hotfix for CVE-2023-47248 # Basic metrics - nltk~=3.7,<3.8.2 # See https://github.com/stanford-crfm/helm/issues/2926 + nltk~=3.7 + pyext~=0.7 rouge-score~=0.1.2 - scipy~=1.10 + scipy~=1.10.0 uncertainty-calibration~=0.1.4 - scikit-learn~=1.1 + scikit-learn~=1.1.2 # Models and Metrics Extras - transformers~=4.40 # For anthropic_client, vision_language.huggingface_vlm_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics) + transformers~=4.33.1 # For anthropic_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics) # TODO: Upgrade torch - we need > 2.0.0 for newer versions of transformers - torch>=1.13.1,<3.0.0 # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics) - torchvision>=0.14.1,<3.0.0 # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics) + torch>=1.12.1,<3.0.0 # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics) + torchvision>=0.13.1,<3.0.0 # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics) + + # Metrics Extras + google-api-python-client~=2.64.0 # For perspective_api_client via toxicity_metrics [options.extras_require] proxy-server = - gunicorn>=20.1 + gunicorn~=20.1.0 human-evaluation = scaleapi~=2.13.0 surge-api~=1.1.0 scenarios = - gdown~=5.1 # For disinformation_scenario, med_mcqa_scenario, med_qa_scenario: used by ensure_file_downloaded() + gdown~=4.4.0 # For disinformation_scenario, med_mcqa_scenario, med_qa_scenario: used by ensure_file_downloaded() sympy~=1.11.1 # For numeracy_scenario xlrd~=2.0.1 # For ice_scenario: used by pandas.read_excel() metrics = - google-api-python-client~=2.64 # For perspective_api_client via toxicity_metrics - numba~=0.56 # For copyright_metrics + numba~=0.56.4 # For copyright_metrics pytrec_eval==0.5 # For ranking_metrics sacrebleu~=2.2.1 # For 
disinformation_metrics, machine_translation_metrics - -summarization = summ-eval~=0.892 # For summarization_metrics plots = @@ -87,9 +84,6 @@ plots = matplotlib~=3.6.0 seaborn~=0.11.0 -decodingtrust = - fairlearn~=0.9.0 - slurm = simple-slurm~=0.2.6 @@ -101,174 +95,41 @@ cleva = langdetect==1.0.9 images = - crfm-helm[accelerate] - pillow~=10.2 + accelerate~=0.23.0 # For the newer versions of Transformers + pillow~=9.4.0 mongo = - pymongo~=4.2 - -unitxt = - evaluate~=0.4.1 - -bhasa = - pythainlp==5.0.0 - pyonmttok==1.37.0 - sacrebleu~=2.2.1 + pymongo~=4.2.0 # Model extras -accelerate = - accelerate~=0.25 - aleph-alpha = aleph-alpha-client~=2.14.0 - tokenizers>=0.13.3 - -openvino = - optimum[openvino]~=1.19 - -allenai = - ai2-olmo~=0.2 - -amazon = - boto3~=1.28.57 - awscli~=1.29.57 - botocore~=1.31.57 + tokenizers~=0.13.3 anthropic = - anthropic~=0.17 + anthropic~=0.2.5 websocket-client~=1.3.2 # For legacy stanford-online-all-v4-s3 -cohere = - cohere~=5.3 - -mistral = - mistralai~=0.0.11 - openai = - openai~=1.0 - tiktoken~=0.7 - pydantic~=2.0 # For model_dump(mode="json") - openai only requires pydantic>=1.9.0 + openai~=0.27.8 + tiktoken~=0.3.3 google = - google-cloud-aiplatform~=1.48 + google-cloud-aiplatform~=1.36.4 -together = - together~=1.1 +tsinghua = + icetk~=0.0.4 yandex = sentencepiece~=0.1.97 models = - crfm-helm[ai21] - crfm-helm[accelerate] crfm-helm[aleph-alpha] - crfm-helm[allenai] - crfm-helm[amazon] crfm-helm[anthropic] - crfm-helm[cohere] crfm-helm[google] - crfm-helm[mistral] crfm-helm[openai] - crfm-helm[reka] - crfm-helm[together] + crfm-helm[tsinghua] crfm-helm[yandex] - crfm-helm[openvino] - -reka = - reka-api~=2.0.0 - -vlm = - crfm-helm[openai] - - # For OpenFlamingo - einops~=0.7.0 - einops-exts~=0.0.4 - open-clip-torch~=2.24 - - # For IDEFICS - torch~=2.1 - - # For Qwen: https://github.com/QwenLM/Qwen-VL/blob/master/requirements.txt - transformers_stream_generator~=0.0.4 - scipy~=1.10 - torchvision>=0.14.1,<3.0.0 - - # For Reka AI - crfm-helm[reka] - - # VLM scenarios - crfm-helm[images] - crfm-helm[image2struct] - - # For metrics - pycocoevalcap~=1.2 - -image2struct = - crfm-helm[images] - - # Latex - # You will need to install LaTeX separately. - # You can run `sudo apt-get install texlive-full` on Ubuntu. - latex~=0.7.0 - pdf2image~=1.16.3 - - # Webpage - # You will need install Jekyll separately. - selenium~=4.17.2 - html2text~=2024.2.26 - - # Metrics - opencv-python~=4.7.0.68 - lpips~=0.1.4 - imagehash~=4.3.1 # for caching - -heim = - # HEIM scenarios - gdown~=5.1 - - # HEIM models - diffusers~=0.24.0 - icetk~=0.0.4 - jax~=0.4.13 - jaxlib~=0.4.13 - crfm-helm[openai] - - # For model, kakaobrain/mindall-e - einops~=0.7.0 - omegaconf~=2.3.0 - pytorch-lightning~=2.0.5 - - # For model, craiyon/dalle-mini and craiyon/dalle-mega - flax~=0.6.11 - ftfy~=6.1.1 - Unidecode~=1.3.6 - wandb~=0.13.11 - - # HEIM perturbations - google-cloud-translate~=3.11.2 - - # HEIM metrics - autokeras~=1.0.20 - clip-anytorch~=2.5.0 - google-cloud-storage~=2.9 - lpips~=0.1.4 - multilingual-clip~=1.0.10 - NudeNet~=2.0.9 - opencv-python~=4.7.0.68 - pytorch-fid~=0.3.0 - tensorflow~=2.11 - timm~=0.6.12 - torch-fidelity~=0.3.0 - torchmetrics~=0.11.1 - - # Transitive dependency of NudeNet - # This needs to be a version that provides wheels for all Python versions - # supported by crfm-helm i.e. Python 3.9, 3.10, 3.11, 3.12 - # Disallow version 0.23.* because it has no Python 3.9 wheels. 
- scikit-image>=0.22,==0.*,!=0.23.* - - # Shared image dependencies - crfm-helm[images] # Install everything all = @@ -277,29 +138,20 @@ all = crfm-helm[scenarios] crfm-helm[metrics] crfm-helm[plots] - crfm-helm[decodingtrust] crfm-helm[slurm] crfm-helm[cleva] crfm-helm[images] crfm-helm[models] crfm-helm[mongo] - crfm-helm[heim] - crfm-helm[vlm] - crfm-helm[bhasa] - # crfm-helm[dev] is excluded because end-users don't need it. - # crfm-helm[summarize] is excluded because it requires torch<2.0 - # TODO(#2280): Add crfm-helm[summarize] back. # Development only # Do not include in all dev = pytest~=7.2.0 + black~=22.10.0 + mypy~=1.5.1 pre-commit~=2.20.0 - # Errors produced by type checkers and linters are very version-specific - # so they are pinned to an exact version. - black==24.3.0 - mypy==1.5.1 - flake8==5.0.4 + flake8~=5.0.4 [options.entry_points] console_scripts = @@ -318,11 +170,7 @@ exclude = # Settings for Flake8: Tool For Style Guide Enforcement [flake8] max-line-length = 120 -exclude = - venv/* - src/helm/clients/image_generation/dalle_mini/* - src/helm/clients/image_generation/mindalle/* - src/helm/clients/vision_language/open_flamingo/* +exclude = venv/* # Ignore completely: # E203 - White space before ':', (conflicts with black) @@ -340,24 +188,12 @@ check_untyped_defs = True disable_error_code = annotation-unchecked # TODO: Change disallow_untyped_defs to True disallow_untyped_defs = False -exclude = dalle_mini|mindalle|open_flamingo [tool:pytest] addopts = - # By default: - # - we don't test models because doing so will - # make real requests and spend real money - # - we don't test scenarios because these will - # download files, which is slow, consumes disk - # space, and increases the chance of spurious - # test failures due to failed downloads. 
- # - # For more documentation on pytest markers, see: - # - https://docs.pytest.org/en/latest/how-to/mark.html#mark - # - https://docs.pytest.org/en/latest/example/markers.html#mark-examples - -m 'not models and not scenarios' + # By default, we don't test models because doing so will + # make real requests and spend real money + -m 'not models' markers = - # Marker for model tests that make real model requests + # Marker for tests that make real model requests models - # Marker for scenario tests that download files - scenarios diff --git a/src/helm/benchmark/metrics/basic_metrics.py b/src/helm/benchmark/metrics/basic_metrics.py index 03d6c113f48..8b6371ca74f 100644 --- a/src/helm/benchmark/metrics/basic_metrics.py +++ b/src/helm/benchmark/metrics/basic_metrics.py @@ -1,19 +1,24 @@ -from collections import defaultdict import math -from dataclasses import dataclass -from typing import List, Dict, Set +from dataclasses import dataclass, replace +from typing import List, Callable, Optional, Dict, Tuple, Set, cast from urllib.parse import unquote +from functools import partial +import json +import string +import nltk import numpy as np +import re import scipy import calibration as cal -from helm.benchmark.adaptation.scenario_state import ScenarioState -from helm.benchmark.metrics.evaluate_reference_metrics import compute_reference_metrics -from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric -from helm.benchmark.metrics.reference_metric import ReferenceMetric +import importlib_resources as resources +from nltk.metrics.scores import f_measure +from nltk.tokenize import word_tokenize +from nltk.translate.bleu_score import sentence_bleu +from rouge_score import rouge_scorer from helm.common.hierarchical_logger import hlog -from helm.common.request import Token, GeneratedOutput +from helm.common.request import Token, Sequence from helm.benchmark.adaptation.adapters.adapter_factory import ( ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED, @@ -24,11 +29,200 @@ from helm.benchmark.window_services.window_service import WindowService from helm.benchmark.window_services.window_service_factory import WindowServiceFactory from helm.benchmark.window_services.tokenizer_service import TokenizerService -from helm.benchmark.scenarios.scenario import CORRECT_TAG, Instance -from .metric import Metric, MetricInterface, MetricResult, add_context, get_unique_stat_by_name -from .metric_name import MetricContext, MetricName +from helm.benchmark.scenarios.scenario import CORRECT_TAG, Instance, Reference +from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought +from helm.benchmark.scenarios.conv_fin_qa_scenario import float_equiv +from helm.benchmark.scenarios.code_scenario import CodeReference +from helm.benchmark.metrics.cleva_metrics_helper import ChineseTokenizer +from . 
import code_metrics_helper +from .metric import Metric, get_unique_stat_by_name +from .metric_name import MetricName from .metric_service import MetricService -from .statistic import Stat, merge_stat +from .statistic import Stat + + +try: + nltk.data.find("tokenizers/punkt") +except LookupError: + nltk.download("punkt") # Required for rouge + + +EFFICIENCY_DATA_PACKAGE: str = "helm.benchmark.efficiency_data" + +INFERENCE_IDEALIZED_RUNTIMES_JSON_FILENAME: str = "inference_idealized_runtimes.json" +INFERENCE_DENOISED_RUNTIMES_JSON_FILENAME: str = "inference_denoised_runtimes.json" +TRAINING_EFFICIENCY_JSON_FILENAME: str = "training_efficiency.json" + + +def compute_estimated_time_from_prompt_size_and_num_output_tokens( + request_state: RequestState, + inference_runtimes_dict: Dict[str, Dict], + num_prompt_tokens: int, + num_output_tokens: int, +) -> Optional[float]: + estimated_runtime: Optional[float] + if request_state.request.model_deployment in inference_runtimes_dict: + inference_runtimes_dict_for_model = inference_runtimes_dict[request_state.request.model_deployment] + runtime_per_output_token: float = inference_runtimes_dict_for_model["runtime_per_output_token"] + raw_runtimes_for_prompt_tokens: Dict[str, float] = inference_runtimes_dict_for_model[ + "runtime_for_prompt_tokens" + ] + runtimes_for_prompt_tokens: Dict[int, float] = {int(k): v for (k, v) in raw_runtimes_for_prompt_tokens.items()} + + runtime_for_prompt_tokens: Optional[float] = None + largest_num_tokens_in_efficiency_dict: int = max(runtimes_for_prompt_tokens.keys()) + # Find the smallest num_prompt_tokens larger than the number of tokens in the given prompt, + # then scale runtime in dict by (num_prompt_tokens / key) to get more accurate estimate: we + # assume that we can encode the prompt at the same throughput as the smallest key larger than + # num_prompt_tokens, and number of compute operations scales linearly with num_prompt_tokens. + for key in sorted(runtimes_for_prompt_tokens.keys()): + if num_prompt_tokens <= key: + runtime_for_prompt_tokens = runtimes_for_prompt_tokens[key] * (num_prompt_tokens / key) + break + # If number of tokens in the prompt exceeds the largest key in the efficiency dict, then + # estimate the prompt encoding time by linearly scaling up the runtime for the largest + # key (this is reasonably accurate under certain simplifying assumptions). + if runtime_for_prompt_tokens is None: + runtime_for_prompt_tokens = runtimes_for_prompt_tokens[largest_num_tokens_in_efficiency_dict] * ( + num_prompt_tokens / largest_num_tokens_in_efficiency_dict + ) + overhead: Optional[float] = inference_runtimes_dict_for_model.get("overhead") + + # Idealized runtime is sum of the runtime of encoding the input tokens, the runtime of + # generating `num_output_tokens` (`runtime_per_output_token` * (`num_output_tokens` - 1)) + # if number of output tokens is greater than 0, otherwise just `runtime_for_prompt_tokens`, + # and the overhead if available. + estimated_runtime = runtime_for_prompt_tokens + if num_output_tokens > 0: + estimated_runtime += runtime_per_output_token * (num_output_tokens - 1) + # Add overhead if it is available. + if overhead is not None: + estimated_runtime += overhead + else: + estimated_runtime = None + + return estimated_runtime + + +def pass_at_k_estimator(n: int, c: int, k: int) -> float: + """Calculates 1 - comb(n - c, k) / comb(n, k). 
+ + Numerically stable version defined in + https://arxiv.org/pdf/2107.03374.pdf + """ + if n - c < k: + return 1.0 + return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) + + +def normalize_text(text: str) -> str: + """Lower text and remove punctuation, articles and extra whitespace. + Copied from the [QuAC](http://quac.ai/) evaluation script found at + https://s3.amazonaws.com/my89public/quac/scorer.py""" + + def remove_articles(text: str) -> str: + return re.sub(r"\b(a|an|the)\b", " ", text) + + def white_space_fix(text: str) -> str: + return " ".join(text.split()) + + def remove_punc(text: str) -> str: + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text: str) -> str: + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(text)))) + + +def exact_match(gold: str, pred: str) -> float: + if not pred: + return 0 + + return 1 if gold.strip() == pred.strip() else 0 + + +def quasi_exact_match(gold: str, pred: str) -> float: + if not pred: + return 0 + + return 1 if normalize_text(gold) == normalize_text(pred) else 0 + + +def prefix_exact_match(gold: str, pred: str) -> float: + """ + The `prefix_exact_match` metric is particularly useful in the zero-shot setting, where the model is + not given examples of the expected outputs and tends to output more tokens than it should. + + For example, for this zero-shot prompt from BoolQ, + + Passage: Elmendorf Air Force Base (IATA: EDF, ICAO: PAED, FAA LID: EDF) is a United States military facility + in Anchorage, the largest city in Alaska. Originally known as Elmendorf Field, it became Elmendorf Air Force + Base after World War II, and in 2010 it merged with nearby Fort Richardson to form Joint Base Elmendorf-Richardson. + Question: Is there an air force base in anchorage alaska? + Answer: + + the model could output up to `max_tokens` number of tokens "Yes, Elmendorf" instead of just "Yes". + """ + if not pred: + return 0 + + return 1 if pred.strip().startswith(gold.strip()) else 0 + + +def quasi_prefix_exact_match(gold: str, pred: str) -> float: + """ + Same thing as `prefix_exact_match` but we normalize the text before checking if the prefix match. + """ + if not pred: + return 0 + + return 1 if normalize_text(pred).startswith(normalize_text(gold)) else 0 + + +def f1_score(gold: str, pred: str) -> float: + if not pred: # answer is None + return 0.0 + + ret = f_measure(set(normalize_text(gold).split()), set(normalize_text(pred).split())) + if ret is None: # answer is the empty string after normalizing + return 0.0 + + return ret + + +def exact_match_indicator(gold: str, pred: str, indicator: str = " ") -> float: + """ + Exact match, allowing for some preceding context. + For example, the following two answers are considered matching: + - Because of x and y, the answer is ## + - Given reasons y and z, the answer is ## + While the following is considered different from the earlier two + - Given reasons x and a, the answer is ## + """ + pred = pred.split(indicator)[-1].strip() + gold = gold.split(indicator)[-1].strip() + return exact_match(gold, pred) + + +def final_number_exact_match(gold: str, pred: str) -> float: + """ + Returns 1 iff the final number in gold and pred match. + Similar to exact_match_indicator. + Example: + - gold = "The answer is 15." + - pred = "The answer is 15 eggs." 
+ - Returns 1 + """ + + def get_final_number(x: str) -> str: + matches = re.findall(r"-?[\d,]+(?:.\d+)?", x) + if not matches: + return "" + return matches[-1].replace(",", "") + + return exact_match(get_final_number(gold), get_final_number(pred)) def get_num_bytes(tokens: List[Token]) -> int: @@ -80,6 +274,123 @@ def convert_tokens_to_text(tokens: List[Token]) -> List[Dict]: return groups +def rouge_score(gold: str, pred: str, rouge_type: str, scorer: rouge_scorer.RougeScorer) -> float: + scores = scorer.score(gold, pred) + return scores[rouge_type].fmeasure + + +def get_rouge_function(rouge_type: str) -> Callable[[str, str], float]: + scorer = rouge_scorer.RougeScorer([rouge_type], use_stemmer=True) + return partial(rouge_score, scorer=scorer, rouge_type=rouge_type) + + +def bleu_1(gold: str, pred: str) -> float: + return sentence_bleu([word_tokenize(gold)], word_tokenize(pred), weights=(1, 0, 0, 0)) + + +def chinese_bleu_1(gold: str, pred: str) -> float: + char_tokenizer = ChineseTokenizer() + return sentence_bleu([char_tokenizer.tokenize(gold)], char_tokenizer.tokenize(pred), weights=(1, 0, 0, 0)) + + +def get_chinese_rouge_function(rouge_type: str) -> Callable[[str, str], float]: + char_tokenizer = ChineseTokenizer() + scorer = rouge_scorer.RougeScorer([rouge_type], use_stemmer=True, tokenizer=char_tokenizer) + return partial(rouge_score, scorer=scorer, rouge_type=rouge_type) + + +def cleva_math_result_match(gold: str, pred: str) -> float: + """ + Exact match that only cares the last math expression. + Common math expressions are numbers and fractions. + """ + pattern = r"[-+*/%\.\(\)\d]+" + matches = re.findall(pattern, pred) + if matches: + pred = matches[-1].lstrip(")") + # remove space in front or at the end + pred = pred.strip() + return exact_match(gold, pred) + + +def bleu_4(gold: str, pred: str) -> float: + return sentence_bleu([word_tokenize(gold)], word_tokenize(pred), weights=(0, 0, 0, 1)) + + +def extract_set_from_text( + set_str: str, + set_start_str: str = " is ", + set_separator: str = " and ", + empty_set_str: str = "Nothing.", +) -> Set[str]: + """ + Given a string, extract the set of strings implied by that string. 
+ set_start_str denotes the start of the set + set_separator denotes the string separating set elements + empty_set_str is the string which denotes the empty set + """ + if set_str == empty_set_str: + return set() + set_str = set_str.replace(".", "") + extracted_set = set(set_str.split(set_start_str)[-1].split(set_separator)) + return extracted_set + + +def extract_gold_pred_sets(gold: str, pred: str) -> Tuple[Set[str], Set[str]]: + """Extract the set of strings implied by the gold and pred strings""" + gold_set = extract_set_from_text(gold) + pred_set = extract_set_from_text(pred.split("\n")[0]) + return gold_set, pred_set + + +def iou_set_match(gold: str, pred: str) -> float: + """Compute the intersection over union of the gold and pred sets""" + gold_set, pred_set = extract_gold_pred_sets(gold, pred) + if len(gold_set) == 0: # If gold is empty, just check if the pred set is also empty + return float(gold_set == pred_set) + return len(gold_set.intersection(pred_set)) / len(gold_set.union(pred_set)) + + +def f1_set_match(gold: str, pred: str) -> float: + """Compute the F1 score of the gold and pred sets""" + gold_set, pred_set = extract_gold_pred_sets(gold, pred) + if len(gold_set) == 0: # If gold is empty, just check if the pred set is also empty + return float(gold_set == pred_set) + true_positives = gold_set.intersection(pred_set) + return 2 * len(true_positives) / (len(gold_set) + len(pred_set)) + + +def exact_set_match(gold: str, pred: str) -> float: + """Compute whether the sets generated exactly match""" + gold_set, pred_set = extract_gold_pred_sets(gold, pred) + return float(gold_set == pred_set) + + +def absolute_value_difference(gold: str, pred: str) -> float: + """Compute the absolute value of the difference between two numbers (provided as strings), + or 0.0 if invalid input. + """ + + def maybe_int(text: str): + """Parse int, ignoring commas in numbers.""" + try: + val = int(text.replace(",", "")) + except ValueError: + return 0.0 + return val + + gold_val = maybe_int(gold) + pred_val = maybe_int(pred) + return abs(gold_val - pred_val) + + +def code_eval(gold: Tuple[str, Optional[Dict]], pred: str) -> float: + """Evaluate Code Correctness on test examples.""" + assert gold[1] is not None # gold[1]["canonical_solution"] + # Warning: will execute machine generated code; need to sandbox before executing + return float(code_metrics_helper.check_correctness(gold[1], pred, 3.0)["passed"]) # type: ignore + + def compute_perplexity_metrics(stats: Dict[MetricName, Stat]) -> List[Stat]: # TODO: find out the root cause and undo num_X > 0 check # https://github.com/stanford-crfm/benchmarking/issues/350 @@ -104,37 +415,7 @@ def compute_perplexity_metrics(stats: Dict[MetricName, Stat]) -> List[Stat]: return derived_stats -class InstancesPerSplitMetric(MetricInterface): - """Report the average num_instances in each MetricContext across train_trials.""" - - def evaluate( - self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int - ) -> MetricResult: - adapter_spec = scenario_state.adapter_spec - global_stats: Dict[MetricName, Stat] = {} - - for train_trial_index in range(adapter_spec.num_train_trials): - trial_stats: Dict[MetricName, Stat] = {} # Statistics just for this trial - # Group instances in this train_trial by context. 
- instances_per_metric_context: Dict[MetricContext, Set[Instance]] = defaultdict(set) - for request_state in scenario_state.request_states: - if request_state.train_trial_index == train_trial_index: - instances_per_metric_context[MetricContext.from_instance(request_state.instance)].add( - request_state.instance - ) - for context, instance_set in instances_per_metric_context.items(): - stat = Stat(MetricName("num_instances")).add(len(instance_set)) - merge_stat(trial_stats, add_context(stat, context)) - - # We take the mean value for each trial. - for stat in trial_stats.values(): - merge_stat(global_stats, stat.take_mean()) - - # There are no per-instance Stats. - return MetricResult(list(global_stats.values()), []) - - -class BasicGenerationMetric(Metric): +class BasicMetric(Metric): """ Defines basic metrics which don't require domain knowledge. This should be fairly comprehensive already, and we should try to use this as much as possible. @@ -145,11 +426,339 @@ class BasicGenerationMetric(Metric): def __init__(self, names: List[str]): self.names: List[str] = names - self.efficiency_metric = EfficiencyMetric() + + # For Efficiency metrics: + # The `inference_efficiency.json` file contains a `runtime_per_output_token` value + # (the estimated runtime of generating one output token) and a + # `runtime_for_prompt_tokens` dict (a mapping from various num_prompt_tokens values to + # the estimated runtime of encoding a prompt with that many tokens). + # For example: + # "openai/davinci": { + # "runtime_per_output_token": 0.080, + # "runtime_for_prompt_tokens": { + # "1": 0.016, + # "16": 0.018, + # "32": 0.020, + # ... + # + # These runtimes are generated by initializing Megatron with a model of the right size, + # obtaining end-to-end generation times for different numbers of prompt and output tokens, + # and then fitting a linear regression model to the runtimes: the resulting slope is the + # runtime_per_output_token, which is the processing time for generating each output token, + # and the y-intercept is the runtime_for_prompt_tokens, with different values for different + # num_prompt_tokens values. + # Profiling code and logs, and code to fit the regression model is available at + # https://github.com/stanford-crfm/benchmarking_efficiency. + data_package = resources.files(EFFICIENCY_DATA_PACKAGE) + with data_package.joinpath(INFERENCE_IDEALIZED_RUNTIMES_JSON_FILENAME).open("r") as f: + self.inference_idealized_runtimes_dict = json.load(f) + with data_package.joinpath(INFERENCE_DENOISED_RUNTIMES_JSON_FILENAME).open("r") as f: + self.inference_denoised_runtimes_dict = json.load(f) + + # We use estimated emitted CO2 during training (in tons of CO2) as a proxy metric + # for training efficiency. We use reported metrics where applicable, otherwise + # we estimate them from runtime information, type and number of hardware accelerators + # used, region, etc. + with data_package.joinpath(TRAINING_EFFICIENCY_JSON_FILENAME).open("r") as f: + self.training_efficiency_dict = json.load(f) def __repr__(self): return f"BasicMetric({','.join(self.names)})" + def compute_reference_metrics( + self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService + ) -> List[Stat]: + """ + Setup: + + - Gold (correct references): G1 ... Gm + - Predictions (completions): P1 ... Pk + + For each pair (G, P), we can define a ${score} (e.g., exact match, F1, BLEU). 
+ + We define the following stats: + + - ${score}: max_i score(Gi, P1) + - ${score}@k: max_{i,j} score(Gi, Pj) + """ + + def compute_metrics_helper( + name: MetricName, + score_func: Callable, + group: Optional[str] = None, + ) -> List[Stat]: + if name.name == "pass": # Calculate pass@k for HumanEval from CodeScenario. + score_func = cast(Callable[[Tuple[str, Optional[Dict]], str], float], score_func) # Make mypy happy. + code_golds = cast(List[CodeReference], golds) + results = [ + score_func((gold.output.text, gold.test_cases), pred) for gold in code_golds for pred in preds + ] + _len, _sum = len(results), int(sum(results)) # Cast to int to make type match. + score_1 = pass_at_k_estimator(_len, _sum, 1) + score_k = pass_at_k_estimator(_len, _sum, adapter_spec.num_outputs) + elif name.name == "code_eval_acc": + score_func = cast(Callable[[Tuple[str, Optional[Dict]], str], float], score_func) # Make mypy happy. + code_golds = cast(List[CodeReference], golds) + score_1 = max(score_func((gold.output.text, gold.test_cases), preds[0]) for gold in code_golds) + score_k = max( + score_func((gold.output.text, gold.test_cases), pred) for gold in code_golds for pred in preds + ) + else: + score_func = cast(Callable[[str, str], float], score_func) # Make mypy happy. + score_1 = max(score_func(gold.output.text, preds[0]) for gold in golds) + score_k = max(score_func(gold.output.text, pred) for gold in golds for pred in preds) + + metrics = [Stat(name).add(score_1)] # score_1 corresponds using one prediction + if adapter_spec.num_outputs != 1: + metrics.append(Stat(replace(name, name=f"{name.name}@{adapter_spec.num_outputs}")).add(score_k)) + return metrics + + # maps each string metric name to its associated function + metric_fn_mapping: Dict[str, Callable] = { + "exact_match": exact_match, + "quasi_exact_match": quasi_exact_match, + "prefix_exact_match": prefix_exact_match, + "quasi_prefix_exact_match": quasi_prefix_exact_match, + "exact_match_indicator": exact_match_indicator, + "final_number_exact_match": final_number_exact_match, + "exact_set_match": exact_set_match, + "iou_set_match": iou_set_match, + "f1_set_match": f1_set_match, + "math_equiv": is_equiv, + "math_equiv_chain_of_thought": is_equiv_chain_of_thought, + "float_equiv": float_equiv, + "code_eval_acc": code_eval, + "pass": code_eval, + "f1_score": f1_score, + "rouge_1": get_rouge_function("rouge1"), + "rouge_2": get_rouge_function("rouge2"), + "rouge_l": get_rouge_function("rougeL"), + "bleu_1": bleu_1, + "bleu_4": bleu_4, + "chinese_bleu_1": chinese_bleu_1, + "chinese_rouge_1": get_chinese_rouge_function("rouge1"), + "chinese_rouge_2": get_chinese_rouge_function("rouge2"), + "cleva_math_result_match": cleva_math_result_match, + "absolute_value_difference": absolute_value_difference, + } + + stats: List[Stat] = [] + + # Gold outputs + golds: List[Reference] = [reference for reference in request_state.instance.references if reference.is_correct] + assert len(golds) > 0 + + # Predicted outputs + assert request_state.result is not None + sorted_completions: List[Sequence] = sorted(request_state.result.completions, key=lambda x: -x.logprob) + preds: List[str] = [completion.text.strip() for completion in sorted_completions] + + # Apply mapping if exists (e.g., for multiple-choice questions A -> Boston, B -> New York) + # Note: If 'A' and 'B' were the only possible choices, smaller language models like GPT-2 would + # sometimes predict a random letter like 'M'. 
+ if request_state.output_mapping is not None: + preds = [request_state.output_mapping.get(pred) for pred in preds] # type: ignore + + # Compute max_prob, the probability that the model assigns to its generated text. + # Use the log prob of sorted_completions[0], which is the completion with the highest + # log_prob. We use this since that's what's used for computing metrics like exact_match. + # One subtlety is that when computing exact_match, we strip whitespace, so the actual + # max_prob is the sum of all the probabilities in the set {x : strip(x) = prediction}. + # In practice, we think this may not make much of a difference because models may not place + # high probabilities on having additional spaces (should check this). Also, the sum + # involves computing the log_prob for many completions which could be intractable. + max_prob = np.exp(sorted_completions[0].logprob) + stats.append(Stat(MetricName("max_prob")).add(max_prob)) + + # Add other metrics + for metric_name in self.names: + if metric_name in metric_fn_mapping: + stats.extend(compute_metrics_helper(MetricName(metric_name), metric_fn_mapping[metric_name])) + else: + raise NameError(f"{metric_name} is not in the list of metric functions.") + + return stats + + def compute_efficiency_metrics( + self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService + ) -> List[Stat]: + """Compute efficiency metrics for both inference and training. + For inference, we record both the actual runtime and an estimated idealized runtime + for the given request with an optimized software implementation run on A100 GPU(s), + taking into account both the number of tokens in the prompt of the request, and the + number of generated output tokens. + For training, we report the estimated total metric tons of CO2 emitted to train the + model. This is the same for each request.""" + # Compute efficiency metrics for inference. + assert request_state.result is not None + + runtime: Optional[float] = None + batch_size: Optional[int] = None + # Compute efficiency metrics for inference. + if request_state.result.request_time is not None: + runtime = request_state.result.request_time + batch_size = 1 + # For models that perform offline batch inference, effective runtime is batch_request_time, but also + # record batch_size to provide nuance. + if request_state.result.batch_request_time is not None and request_state.result.batch_size is not None: + runtime = request_state.result.batch_request_time + batch_size = request_state.result.batch_size + + # Compute total number of prompt and output tokens. + # Fetch the right `Tokenizer` depending on the model defined in `AdapterSpec` + # and calculate the number of tokens in the prompt. + tokenizer_service: TokenizerService = metric_service + window_service: WindowService = WindowServiceFactory.get_window_service( + adapter_spec.model_deployment, tokenizer_service + ) + prompt: str = request_state.request.prompt + num_prompt_tokens: int = window_service.get_num_tokens(prompt) + + # Total number of tokens in the completion. + num_completion_tokens: int = sum([len(completion.tokens) for completion in request_state.result.completions]) + # Don't include prompt in number of generated tokens (e.g., for language modeling). + # Assume that tokens for different completions are generated sequentially (instead of batched) when + # computing num_output_tokens (for the purpose of runtime estimation). 
+ num_output_tokens: int = num_completion_tokens + if request_state.request.echo_prompt: + # num_prompt_tokens > num_output_tokens can happen if tokenizer doesn't round trip. + if num_prompt_tokens <= num_output_tokens: + num_output_tokens -= num_prompt_tokens + else: + hlog( + f"WARNING: num_prompt_tokens ({num_prompt_tokens}) > num_output_tokens ({num_output_tokens}) " + f"for prompt: {prompt}" + ) + num_output_tokens = 0 + + idealized_runtime: Optional[float] = compute_estimated_time_from_prompt_size_and_num_output_tokens( + request_state, self.inference_idealized_runtimes_dict, num_prompt_tokens, num_output_tokens + ) + + denoised_runtime: Optional[float] = compute_estimated_time_from_prompt_size_and_num_output_tokens( + request_state, self.inference_denoised_runtimes_dict, num_prompt_tokens, num_output_tokens + ) + # Denoised runtime for offline models is just runtime. + # We divide by batch_size to get approximate per-input runtime. + if runtime is not None and request_state.result.batch_size is not None: + denoised_runtime = runtime / request_state.result.batch_size + + # Compute efficiency metrics for training. + training_co2_cost: Optional[float] + if request_state.request.model_deployment in self.training_efficiency_dict["carbon"]: + training_co2_cost = self.training_efficiency_dict["carbon"][request_state.request.model_deployment]["value"] + else: + training_co2_cost = None + + training_energy_cost: Optional[float] + if request_state.request.model_deployment in self.training_efficiency_dict["energy"]: + training_energy_cost = self.training_efficiency_dict["energy"][request_state.request.model_deployment][ + "value" + ] + else: + training_energy_cost = None + + stats = [ + Stat(MetricName("num_prompt_tokens")).add(num_prompt_tokens), + Stat(MetricName("num_completion_tokens")).add(num_completion_tokens), + Stat(MetricName("num_output_tokens")).add(num_output_tokens), + Stat(MetricName("training_co2_cost")).add(training_co2_cost), + Stat(MetricName("training_energy_cost")).add(training_energy_cost), + ] + if runtime is not None: + stats.append(Stat(MetricName("inference_runtime")).add(runtime)) + if batch_size is not None: + stats.append(Stat(MetricName("batch_size")).add(batch_size)) + if denoised_runtime is not None: + stats.append(Stat(MetricName("inference_denoised_runtime")).add(denoised_runtime)) + if idealized_runtime is not None: + stats.append(Stat(MetricName("inference_idealized_runtime")).add(idealized_runtime)) + return stats + + def compute_finish_reason_metrics( + self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService + ) -> List[Stat]: + """Record how often generation finished due to reaching token limit, stop token(s), or end of text""" + assert request_state.result is not None + sequence = request_state.result.completions[0] + valid_reasons = [ + "length", + "stop", + "endoftext", + "unknown", + ] + if sequence.finish_reason is None or sequence.finish_reason["reason"] not in valid_reasons: + reason = "unknown" + else: + reason = sequence.finish_reason["reason"] + return [ + Stat(MetricName(f"finish_reason_{valid_reason}")).add(int(reason == valid_reason)) + for valid_reason in valid_reasons + ] + + def compute_truncation_metrics( + self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService + ) -> List[Stat]: + """ + Record the number of training instances used in the prompt and whether + even the prompt needed to be truncated (once we hit zero training instances). 
+ """ + return [ + Stat(MetricName("num_train_instances")).add(request_state.num_train_instances), + Stat(MetricName("prompt_truncated")).add(request_state.prompt_truncated), + ] + + def compute_all_general_metrics( + self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService + ) -> List[Stat]: + """ + Compute metrics that are common to both `evaluate_generation` and `evaluate_references`. + """ + stats: List[Stat] = [] + + stats.append(Stat(MetricName("num_references")).add(len(request_state.instance.references))) + + # Copy from adapter spec + stats.append(Stat(MetricName("num_train_trials")).add(adapter_spec.num_train_trials)) + + stats.extend(self.compute_efficiency_metrics(adapter_spec, request_state, metric_service)) + stats.extend(self.compute_finish_reason_metrics(adapter_spec, request_state, metric_service)) + stats.extend(self.compute_truncation_metrics(adapter_spec, request_state, metric_service)) + + return stats + + def compute_language_modeling_metrics( + self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService + ) -> List[Stat]: + """Compute the logprob and normalization factors for the first completion""" + assert request_state.result is not None + sequence = request_state.result.completions[0] + + # Remove the empty tokens (typically generated by the AI21 tokenizer in the beginning of the text) + # + # Some more details about AI21 tokenizer: If the input prompt begins with a space, then + # the tokenizer inserts an empty token to the beginning. + # e.g. " burying him" -> ["▁"(0,0), "▁burying"(0,8), "▁him"(8,12)]. + # TODO(#1522): Update this comment once solved. + # Since this empty token is introduced by our chunking approach, we need to remove it. + tokens: List[Token] + if request_state.num_conditioning_tokens > 0 and sequence.tokens[0].text == "": + tokens = sequence.tokens[1:] + else: + tokens = sequence.tokens + pred_tokens = tokens[request_state.num_conditioning_tokens :] + logprob, num_perplexity_tokens, num_bytes = ( + sum(token.logprob for token in pred_tokens), + len(pred_tokens), + get_num_bytes(pred_tokens), + ) + + return [ + Stat(MetricName("logprob")).add(logprob), + Stat(MetricName("num_perplexity_tokens")).add(num_perplexity_tokens), + Stat(MetricName("num_bytes")).add(num_bytes), + ] + def evaluate_generation( self, adapter_spec: AdapterSpec, @@ -159,40 +768,15 @@ def evaluate_generation( ) -> List[Stat]: """Compute all metrics.""" stats: List[Stat] = [] - stats.extend(compute_request_state_metrics(self.efficiency_metric, adapter_spec, request_state, metric_service)) + stats.extend(self.compute_all_general_metrics(adapter_spec, request_state, metric_service)) if len(request_state.instance.references) > 0: - stats.extend(compute_reference_metrics(self.names, adapter_spec, request_state, metric_service)) + stats.extend(self.compute_reference_metrics(adapter_spec, request_state, metric_service)) - stats.extend(compute_language_modeling_metrics(adapter_spec, request_state, metric_service)) + stats.extend(self.compute_language_modeling_metrics(adapter_spec, request_state, metric_service)) return stats - def derive_stats(self, stats_dict: Dict[MetricName, Stat]) -> List[Stat]: - """Derive perplexity metrics if applicable. 
We don't worry about splits and perturbations here.""" - derived_stats: List[Stat] = [] - derived_stats.extend(compute_perplexity_metrics(stats_dict)) - return derived_stats - - def derive_per_instance_stats(self, per_instance_stats: Dict[Instance, List[Stat]]) -> List[Stat]: - """Derive calibration metrics if applicable. We don't worry about splits and perturbations here.""" - derived_stats: List[Stat] = [] - derived_stats.extend(compute_calibration_metrics(per_instance_stats)) - return derived_stats - - -class BasicReferenceMetric(ReferenceMetric): - """ - Defines basic metrics for Scenarios that use one Request per Reference instead of - one per Instance. - """ - - def __init__(self): - self.efficiency_metric = EfficiencyMetric() - - def __repr__(self): - return "BasicReferenceMetric" - def evaluate_references( self, adapter_spec: AdapterSpec, @@ -222,7 +806,7 @@ def compute_logprob_and_length(request_state: RequestState, window_service: Wind assert len(request_state.result.completions) == 1 reference_index = request_state.reference_index - sequence: GeneratedOutput = request_state.result.completions[0] + sequence: Sequence = request_state.result.completions[0] reference: str = request_state.instance.references[reference_index].output.text # Find the span of the completion that matches the reference. @@ -269,14 +853,8 @@ def compute_logprob_and_length(request_state: RequestState, window_service: Wind raise ValueError(f"Unknown adapter method: {adapter_spec.method}") stats: List[Stat] = [] + stats.extend(self.compute_all_general_metrics(adapter_spec, request_state, metric_service)) - general_metrics: Dict[MetricName, Stat] = {} - for request_state in reference_request_states: - for stat in compute_request_state_metrics( - self.efficiency_metric, adapter_spec, request_state, metric_service - ): - merge_stat(general_metrics, stat) - stats.extend(general_metrics.values()) max_prob = np.max(scipy.special.softmax(reference_scores)) # Multiple references may attain the same maximal score; in such cases, @@ -295,96 +873,18 @@ def compute_logprob_and_length(request_state: RequestState, window_service: Wind ) return stats + def derive_stats(self, stats_dict: Dict[MetricName, Stat]) -> List[Stat]: + """Derive perplexity metrics if applicable. We don't worry about splits and perturbations here.""" + derived_stats: List[Stat] = [] + derived_stats.extend(compute_perplexity_metrics(stats_dict)) + return derived_stats -def compute_request_state_metrics( - efficiency_metric: EfficiencyMetric, - adapter_spec: AdapterSpec, - request_state: RequestState, - metric_service: MetricService, -) -> List[Stat]: - """ - Compute metrics that are common to both `evaluate_generation` and `evaluate_references`. 
- """ - stats: List[Stat] = [] - - stats.append(Stat(MetricName("num_references")).add(len(request_state.instance.references))) - - # Copy from adapter spec - stats.append(Stat(MetricName("num_train_trials")).add(adapter_spec.num_train_trials)) - - stats.extend(efficiency_metric.compute_efficiency_metrics(adapter_spec, request_state, metric_service)) - stats.extend(_compute_finish_reason_metrics(adapter_spec, request_state, metric_service)) - stats.extend(_compute_truncation_metrics(adapter_spec, request_state, metric_service)) - - return stats - - -def _compute_finish_reason_metrics( - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService -) -> List[Stat]: - """Record how often generation finished due to reaching token limit, stop token(s), or end of text""" - assert request_state.result is not None - sequence = request_state.result.completions[0] - valid_reasons = [ - "length", - "stop", - "endoftext", - "unknown", - ] - if sequence.finish_reason is None or sequence.finish_reason["reason"] not in valid_reasons: - reason = "unknown" - else: - reason = sequence.finish_reason["reason"] - return [ - Stat(MetricName(f"finish_reason_{valid_reason}")).add(int(reason == valid_reason)) - for valid_reason in valid_reasons - ] - - -def _compute_truncation_metrics( - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService -) -> List[Stat]: - """ - Record the number of training instances used in the prompt and whether - even the prompt needed to be truncated (once we hit zero training instances). - """ - return [ - Stat(MetricName("num_train_instances")).add(request_state.num_train_instances), - Stat(MetricName("prompt_truncated")).add(request_state.prompt_truncated), - ] - - -def compute_language_modeling_metrics( - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService -) -> List[Stat]: - """Compute the logprob and normalization factors for the first completion""" - assert request_state.result is not None - sequence = request_state.result.completions[0] - - # Remove the empty tokens (typically generated by the AI21 tokenizer in the beginning of the text) - # - # Some more details about AI21 tokenizer: If the input prompt begins with a space, then - # the tokenizer inserts an empty token to the beginning. - # e.g. " burying him" -> ["▁"(0,0), "▁burying"(0,8), "▁him"(8,12)]. - # TODO(#1522): Update this comment once solved. - # Since this empty token is introduced by our chunking approach, we need to remove it. - tokens: List[Token] - if request_state.num_conditioning_tokens > 0 and sequence.tokens[0].text == "": - tokens = sequence.tokens[1:] - else: - tokens = sequence.tokens - pred_tokens = tokens[request_state.num_conditioning_tokens :] - logprob, num_perplexity_tokens, num_bytes = ( - sum(token.logprob for token in pred_tokens), - len(pred_tokens), - get_num_bytes(pred_tokens), - ) - - return [ - Stat(MetricName("logprob")).add(logprob), - Stat(MetricName("num_perplexity_tokens")).add(num_perplexity_tokens), - Stat(MetricName("num_bytes")).add(num_bytes), - ] + def derive_per_instance_stats(self, per_instance_stats: Dict[Instance, List[Stat]]) -> List[Stat]: + """Derive calibration metrics if applicable. 
We don't worry about splits and perturbations here.""" + derived_stats: List[Stat] = [] + derived_stats.extend(compute_calibration_metrics(per_instance_stats)) + derived_stats.append(Stat(MetricName("num_instances")).add(len(per_instance_stats))) + return derived_stats def _has_non_zero_valued_logprobs(per_instance_stats: Dict[Instance, List[Stat]]) -> bool: diff --git a/src/helm/benchmark/metrics/classification_metrics.py b/src/helm/benchmark/metrics/classification_metrics.py index 77ec390e783..d1d71322064 100644 --- a/src/helm/benchmark/metrics/classification_metrics.py +++ b/src/helm/benchmark/metrics/classification_metrics.py @@ -1,18 +1,17 @@ from typing import List, Optional -from sklearn.metrics import f1_score -from sklearn.preprocessing import MultiLabelBinarizer +from sklearn.metrics import f1_score, recall_score, precision_score +from sklearn.preprocessing import MultiLabelBinarizer, label_binarize from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric -from helm.benchmark.metrics.evaluate_reference_metrics import normalize_text -from helm.benchmark.metrics.metric import MetricName +from helm.benchmark.metrics.basic_metrics import normalize_text +from helm.benchmark.metrics.metric import Metric, MetricName from helm.benchmark.metrics.statistic import Stat from helm.benchmark.scenarios.scenario import Reference -from helm.common.request import GeneratedOutput +from helm.common.request import Sequence -class ClassificationMetric(EvaluateInstancesMetric): +class ClassificationMetric(Metric): """Defines metrics for multi-class classification using the generation adapter. Currently provides `classification_macro_f1` and `classification_micro_f1`. @@ -32,13 +31,26 @@ class ClassificationMetric(EvaluateInstancesMetric): - Currently, multi-label classification is not supported. """ - def __init__(self, delimiter: Optional[str] = None): + def __init__( + self, delimiter: Optional[str] = None, average: Optional[str] = None, class_defs: Optional[List[str]] = None + ): self.delimiter = delimiter + self.average = average + self.class_defs = [normalize_text(c) for c in class_defs] if class_defs is not None else None def is_multi_label(self) -> bool: return bool(self.delimiter) - def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]: + @staticmethod + def normalize_binary(y: List[List[str]], class_defs: Optional[List[str]]) -> List[List[str]]: + assert class_defs is not None + assert len(class_defs) == 2 + class_set = set(class_defs) + neg_label = class_defs[0] + ny = [v if len(v) == 1 and v[0] in class_set else [neg_label] for v in y] + return ny + + def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]: y_pred: List[List[str]] = [] y_true: List[List[str]] = [] for request_state in request_states: # one request state per instance @@ -64,22 +76,54 @@ def evaluate_instances(self, request_states: List[RequestState], eval_cache_path predictions = input_text.split(self.delimiter) if self.is_multi_label() else [input_text] y_pred.append([normalize_text(pred) for pred in predictions if pred]) labels: List[str] = list(set(y for ys in y_true for y in ys)) + # When binary, MultiLabelBinarizer is not appropriate. + # When binary and non-label strings (e.g., "yesandno") are included, + # label_binarize() automatically converts the output into a multi-label type (i.e., one-hot matrix). 
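+        # (Illustrative example: with class_defs = ["no", "yes"], a stray prediction such as ["yesandno"]
+        # would be expanded into a one-hot row instead of a single 0/1 column.)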
+ # This will cause an error in f1_score(average="binary"). + y_pred = ( + ClassificationMetric.normalize_binary(y_pred, self.class_defs) + if self.average is not None and self.average == "binary" + else y_pred + ) mlb = MultiLabelBinarizer().fit([labels]) - y_true = mlb.transform(y_true) - y_pred = mlb.transform(y_pred) + y_true = ( + label_binarize(y_true, classes=self.class_defs) + if self.average is not None and self.average == "binary" + else mlb.transform(y_true) + ) + y_pred = ( + label_binarize(y_pred, classes=self.class_defs) + if self.average is not None and self.average == "binary" + else mlb.transform(y_pred) + ) + stats_additional = ( + [] + if self.average is None + else [ + Stat(MetricName(f"classification_{self.average}_f1")).add( + f1_score(y_pred=y_pred, y_true=y_true, average=self.average) + ), + Stat(MetricName(f"classification_{self.average}_recall")).add( + recall_score(y_pred=y_pred, y_true=y_true, average=self.average) + ), + Stat(MetricName(f"classification_{self.average}_precision")).add( + precision_score(y_pred=y_pred, y_true=y_true, average=self.average) + ), + ] + ) return [ Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")), Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")), - ] + ] + stats_additional -class MultipleChoiceClassificationMetric(EvaluateInstancesMetric): +class MultipleChoiceClassificationMetric(Metric): """ Calculate population micro/macro F1 score for multiple_choice_* adapters. For generation adapters, please use ClassificationMetric. """ - def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]: + def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]: y_pred: List[str] = [] y_true: List[str] = [] for request_state in request_states: # one request state per instance @@ -90,9 +134,7 @@ def evaluate_instances(self, request_states: List[RequestState], eval_cache_path ] assert len(golds) > 0, "MultipleChoiceClassificationMetric are designed for multiple_choice_* adapters" assert request_state.result is not None - sorted_completions: List[GeneratedOutput] = sorted( - request_state.result.completions, key=lambda x: -x.logprob - ) + sorted_completions: List[Sequence] = sorted(request_state.result.completions, key=lambda x: -x.logprob) pred: str = sorted_completions[0].text.strip() # Only utilize the first prediction if request_state.output_mapping is not None: pred = request_state.output_mapping.get(pred, pred) diff --git a/src/helm/benchmark/metrics/kpi_edgar_metrics.py b/src/helm/benchmark/metrics/kpi_edgar_metrics.py new file mode 100644 index 00000000000..f81b426f88a --- /dev/null +++ b/src/helm/benchmark/metrics/kpi_edgar_metrics.py @@ -0,0 +1,331 @@ +from typing import List, Dict, Set, Tuple, Callable, Union, cast +import logging +import re +import itertools +import statistics + +from helm.common.request import Sequence +from helm.benchmark.adaptation.request_state import RequestState +from .metric import Metric +from .metric_name import MetricName +from .statistic import Stat +from helm.benchmark.adaptation.adapter_spec import AdapterSpec +from .metric_service import MetricService +from helm.benchmark.scenarios.scenario import Reference +from helm.benchmark.scenarios.kpi_edgar_scenario import TAG_DICT, TAG_PAREN_RE + +DEFAULT_TAG_PAREN_RE = (r"\(", r"\)") + + +def tokenize(text: str) -> List[str]: + # TODO: Better to introduce a sophisticated tokenizer to support 
(multilingal) natural texts. + return text.strip().split(" ") + + +def get_tagged_token_dict(token_list: List[str]) -> Dict[str, Set[Tuple[int, str]]]: + # TODO: Note: We need to handle the cases where the original text contains < or > to avoid the confusion with tags. + # TODO: Maybe better to introduce XML parser or more sophisticated parser. + + tagged_token_dict: Dict[str, Set[Tuple[int, str]]] = {tag: set() for tag in TAG_DICT.keys()} + curr_tag = "O" + for (idx, token) in enumerate(token_list): + sub_token_list = re.split("[<>]", token) + logging.debug(sub_token_list) + curr_token = sub_token_list[0] + if token.startswith("<"): + tag = sub_token_list[1] + curr_tag = tag if tag in tagged_token_dict.keys() else curr_tag + curr_token = sub_token_list[2] + if curr_tag != "O": + tagged_token_dict[curr_tag] = tagged_token_dict[curr_tag].union({(idx, curr_token)}) + if token.endswith(">") and sub_token_list[-2].startswith("/"): + tag = sub_token_list[-1][1:] + curr_tag = "O" + return tagged_token_dict + + +def get_tagged_token_size_dict(token_list: List[str]) -> Tuple[Dict[str, Set], Dict[str, int]]: + + tagged_token_dict = get_tagged_token_dict(token_list) + tagged_size_dict: Dict[str, int] = {tag: len(st) for (tag, st) in tagged_token_dict.items()} + return (tagged_token_dict, tagged_size_dict) + + +def get_intersection( + gold_set: Set[Tuple[int, str]], pred_set: Set[Tuple[int, str]], ignore_index: bool +) -> Set[Tuple[int, str]]: + def remove_index(the_set: Set[Tuple[int, str]]) -> Set[Tuple[int, str]]: + return {(0, e[1]) for e in the_set} + + tmp_gold_set = remove_index(gold_set) if ignore_index else gold_set + tmp_pred_set = remove_index(pred_set) if ignore_index else pred_set + + return tmp_gold_set.intersection(tmp_pred_set) + + +def get_tag_and_phrase(extracted: str, re_tag_paren: Tuple[str, str] = DEFAULT_TAG_PAREN_RE) -> Tuple[str, str]: + matched = re.match(r"(.*)%s(.*)%s" % (re_tag_paren[0], re_tag_paren[1]), extracted) + sub_token_tpl = matched.groups() if matched is not None else tuple() + if len(sub_token_tpl) == 2: + phrase = sub_token_tpl[0].strip() + tag = sub_token_tpl[1].strip() + return (tag, phrase) + return ("", "") + + +def get_tagged_token_size_dict_extraction( + entity_list: List[str], re_tag_paren: Tuple[str, str] = DEFAULT_TAG_PAREN_RE +) -> Tuple[Dict[str, Set], Dict[str, int]]: + + tmp_tag_and_phrase_list = [get_tag_and_phrase(entity, re_tag_paren) for entity in entity_list] + tag_and_phrase_list = [tp for tp in tmp_tag_and_phrase_list if len(tp[0]) != 0] + tagged_token_dict: Dict[str, Set[Tuple[int, str]]] = {tag: set() for tag in TAG_DICT.keys()} + for (tag, phrase) in tag_and_phrase_list: + if tag in tagged_token_dict.keys(): + word_list = phrase.split(" ") + token_list = [(0, word) for word in word_list] # token index is ignored. 
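+            # e.g., the entity "annual revenue (kpi)" adds {(0, "annual"), (0, "revenue")} to the "kpi" set
+            # (see test_kpi_edgar_metrics.test_kem_get_tagged_token_size_dict_extraction).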
+ tagged_token_dict[tag] = tagged_token_dict[tag].union(token_list) + tagged_size_dict = {tag: len(token_set) for (tag, token_set) in tagged_token_dict.items()} + return (tagged_token_dict, tagged_size_dict) + + +def get_tagged_size_dict( + gold_list: List[str], + pred_list: List[str], + ignore_index: bool, + is_extraction: bool = False, + re_tag_paren: Tuple[str, str] = DEFAULT_TAG_PAREN_RE, +) -> Dict[str, Tuple[int, int, int]]: + + if not is_extraction: + (gold_tagged_token_dict, gold_tagged_size_dict) = get_tagged_token_size_dict(gold_list) + (pred_tagged_token_dict, pred_tagged_size_dict) = get_tagged_token_size_dict(pred_list) + else: + (gold_tagged_token_dict, gold_tagged_size_dict) = get_tagged_token_size_dict_extraction(gold_list, re_tag_paren) + (pred_tagged_token_dict, pred_tagged_size_dict) = get_tagged_token_size_dict_extraction(pred_list, re_tag_paren) + + assert pred_tagged_token_dict.keys() == gold_tagged_token_dict.keys() + + intersection_tagged_token_dict = { + tag: get_intersection(gold_tagged_token_dict[tag], pred_tagged_token_dict[tag], ignore_index) + for tag in gold_tagged_token_dict.keys() + } + intersection_tagged_size_dict: Dict[str, int] = { + tag: len(st) for (tag, st) in intersection_tagged_token_dict.items() + } + tag_key_set = gold_tagged_token_dict.keys() + tag_size_dict: Dict[str, Tuple[int, int, int]] = { + tag: (gold_tagged_size_dict[tag], pred_tagged_size_dict[tag], intersection_tagged_size_dict[tag]) + for tag in tag_key_set + } + # TODO: for each sentence (sample), TPR, FPR, etc. must be defined with equal weights. + # TODO: later, those are averaged over the sentences (samples). + # TODO: how about tags? + # average_options = {None, "micro", "macro", "weighted"} + # TODO: TP, FP, TN, FN of this adjusted version can be regarded as + # TODO: continuous extention of the discrete original TP, FP, TN, FN for one sample. + # TODO: Therefore, one just need to sum up these to compute Precision and Recall. + # TODO: https://atmarkit.itmedia.co.jp/ait/articles/2212/19/news020.html + # TODO: macro-avg: average F1_type over all the tag types. + # TODO: micro-avg: define TP, FP, TN, FN as sum of all the classes. micro-F1 == accuracy. + return tag_size_dict + + +def tokenize_extraction(text: str, re_tag_paren: Tuple[str, str] = DEFAULT_TAG_PAREN_RE) -> List[str]: + # TODO: Better to introduce a sophisticated tokenizer to support (multilingal) natural texts. 
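+    # Splits a comma-separated list of tagged entities on the closing tag parenthesis, e.g. (doctest-style):
+    #   >>> tokenize_extraction("annual revenue (kpi), 364 (cy)")
+    #   ['annual revenue (kpi)', '364 (cy)']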
+ delim = "," + tag_paren1 = re_tag_paren[1].strip("\\") + text_tmp1 = text.strip() + text_tmp0 = re.sub(re_tag_paren[1] + delim, tag_paren1 + tag_paren1 + delim, text_tmp1) + extracted_list = text_tmp0.split(tag_paren1 + delim) + n_extracted_list = [e.strip() for e in extracted_list] + return n_extracted_list + + +# def get_tagged_size_dict_extraction( +# gold_list: List[str], pred_list: List[str], ignore_index: bool +# ) -> Dict[str, Tuple[int, int, int]]: + +# (gold_tagged_token_dict, gold_tagged_size_dict) = get_tagged_token_size_dict_extraction(gold_list) +# (pred_tagged_token_dict, pred_tagged_size_dict) = get_tagged_token_size_dict_extraction(pred_list) + +# assert pred_tagged_token_dict.keys() == gold_tagged_token_dict.keys() + +# intersection_tagged_token_dict = { +# tag: get_intersection(gold_tagged_token_dict[tag], pred_tagged_token_dict[tag], ignore_index) +# for tag in gold_tagged_token_dict.keys() +# } +# intersection_tagged_size_dict: Dict[str, int] = { +# tag: len(st) for (tag, st) in intersection_tagged_token_dict.items() +# } +# tag_key_set = gold_tagged_token_dict.keys() +# tag_size_dict: Dict[str, Tuple[int, int, int]] = { +# tag: (gold_tagged_size_dict[tag], pred_tagged_size_dict[tag], intersection_tagged_size_dict[tag]) +# for tag in tag_key_set +# } +# return tag_size_dict + + +def get_value_list(stats_dict: Dict[MetricName, Stat], prefix: str, tag: str, split: Union[str, None]) -> List[int]: + value_name_list = ["gold", "pred", "intersection"] + metric_name_list = [MetricName(prefix + "." + tag + "." + name, split=split) for name in value_name_list] + value_list = [int(stats_dict[mn].sum) for mn in metric_name_list] + return value_list + + +def compute_prrcf1(tp, tn, fp, fn) -> Tuple[float, float, float]: + precision = float(tp) / float(tp + fp) if (tp + fp) != 0 else 0.0 + recall = float(tp) / float(tp + fn) if (tp + fn) != 0 else 0.0 + f1 = 2 * precision * recall / (precision + recall) if (precision + recall) != 0.0 else 0.0 + return (precision, recall, f1) + + +def compute_tptnfpfn_adjusted(stats: List[int], total_token_length: int) -> Tuple[float, float, float, float]: + assert len(stats) == 3 + ngold = stats[0] + npred = stats[1] + ninter = stats[2] + # ltoken = total_token_length + tp = float(ninter) / float(ngold) if ngold != 0.0 else 0.0 + fn = 1.0 - tp + fp = float(npred - ninter) / float(npred) if npred != 0.0 else 0.0 + tn = 0.0 # not used. + return (tp, tn, fp, fn) + + +def compute_tptnfpfn_modified_adjusted(stats: List[int], total_token_length: int) -> Tuple[float, float, float, float]: + assert len(stats) == 3 + ngold = stats[0] + npred = stats[1] + ninter = stats[2] + ltoken = total_token_length + tp = ninter + fn = ngold - ninter + fp = npred - ninter + tn = ltoken - (ngold + npred) + ninter + return (tp, tn, fp, fn) + + +def compute_adjusted_f1( + tag_stats_dict: Dict[str, List[int]], total_token_length: int, compute_tptnfpfn: Callable +) -> float: + + tag_tptnfpfn_list = [compute_tptnfpfn(stats, total_token_length) for stats in tag_stats_dict.values()] + tag_prrcf1_list = [compute_prrcf1(v[0], v[1], v[2], v[3]) for v in tag_tptnfpfn_list] + tag_f1_list = [v[2] for v in tag_prrcf1_list] + macro_f1 = statistics.mean(tag_f1_list) + + return macro_f1 + + +class NERAdjustedF1Metric(Metric): + """ + Paper: + Deußer, Tobias, et al. + "KPI-EDGAR: A Novel Dataset and Accompanying Metric for Relation Extraction from + Financial Documents." + arXiv preprint arXiv:2210.09163 (2022). 
+ https://arxiv.org/abs/2210.09163 + """ + + NAME = "kpi_edgar_adjusted_f1" + ignore_index = True + is_extraction = True + re_tag_paren = TAG_PAREN_RE + + def __init__(self): + super().__init__() + + return + + def evaluate_generation( + self, + adapter_spec: AdapterSpec, + request_state: RequestState, + metric_service: MetricService, + eval_cache_path: str, + ) -> List[Stat]: + """Evaluate free-form generation.""" + + # logging.warning(adapter_spec) + # logging.warning("evaluate_generation instance.id: %s" % (request_state.instance.id)) + # logging.warning("evaluate_generation instance.reference: %s" % (request_state.instance.references)) + # logging.warning("evaluate_generation result.completion: %s" % (request_state.result.completions)) + # logging.warning(metric_service) + + golds: List[Reference] = [reference for reference in request_state.instance.references if reference.is_correct] + completions: List[Sequence] = ( + cast(List[Sequence], request_state.result.completions) if request_state.result is not None else [] + ) + preds: List[str] = [completion.text.strip() for completion in completions] + # logging.warning("evaluate_genearation len(preds): %d" % (len(preds))) + # logging.warning("evaluate_genearation len(golds): %d" % (len(golds))) + + assert len(preds) >= 1 + assert len(golds) >= 1 + + pred_text = preds[0] + gold_text = golds[0].output.text.strip() + # logging.warning("evaluate_genearation pred_text: %s" % (pred_text)) + # logging.warning("evaluate_genearation gold_text: %s" % (gold_text)) + + pred_token_list = ( + tokenize(pred_text) if not self.is_extraction else tokenize_extraction(pred_text, self.re_tag_paren) + ) + gold_token_list = ( + tokenize(gold_text) if not self.is_extraction else tokenize_extraction(gold_text, self.re_tag_paren) + ) + gold_len = len(gold_token_list) + # TODO: if the length are different, then the score should be 0. + tagged_size_dict = get_tagged_size_dict( + gold_token_list, pred_token_list, self.ignore_index, self.is_extraction, self.re_tag_paren + ) + tag_stat_tpl_list = [ + ( + Stat(MetricName(self.NAME + "." + tag + "." + "gold")).add(vals[0]), + Stat(MetricName(self.NAME + "." + tag + "." + "pred")).add(vals[1]), + Stat(MetricName(self.NAME + "." + tag + "." + "intersection")).add(vals[2]), + ) + for (tag, vals) in tagged_size_dict.items() + ] + tag_stat_list = list(itertools.chain.from_iterable(tag_stat_tpl_list)) + len_stat = Stat(MetricName(self.NAME + "." + "len")).add(gold_len) + return tag_stat_list + [len_stat] + + def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]: + """Evaluate all request states directly. Use only if nothing else works. Override me!""" + + # logging.warning("evaluate_instances len: %d" % (len(request_states))) + # logging.warning(request_states[0].instance.id) + + return [] + + def derive_stats(self, stats_dict: Dict[MetricName, Stat]) -> List[Stat]: + """Derive stats based on existing stats, e.g., for perplexity. Override me!""" + + # logging.warning("derive_stats stats_dict: %s" % (stats_dict)) + # TODO: + # I assume that all the stats in stats_dict were computed from the same split (valid or test). 
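+        # Aggregate the per-tag "gold"/"pred"/"intersection" counts accumulated by evaluate_generation(),
+        # then reduce them to the adjusted macro-F1 and the modified adjusted macro-F1 over all tags.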
+ assert len(stats_dict) >= 1 + stats_list: List[Stat] = list(stats_dict.values()) + + stat = stats_list[0] + split = stat.name.split + # logging.warning("derive_stats stat: %s" % (stat)) + logging.warning("derive_stats split: %s" % (split)) + + tag_stat_dict: Dict[str, List[int]] = { + tag: get_value_list(stats_dict, self.NAME, tag, split) for tag in TAG_DICT.keys() + } + # logging.warning(tag_stat_dict) + stats_total_token = stats_dict[MetricName(self.NAME + "." + "len", split=split)] + adjusted_f1 = compute_adjusted_f1(tag_stat_dict, int(stats_total_token.sum), compute_tptnfpfn_adjusted) + # logging.warning("derive_stats adjusted_f1: %f" % (adjusted_f1)) + modified_adjusted_f1 = compute_adjusted_f1( + tag_stat_dict, int(stats_total_token.sum), compute_tptnfpfn_modified_adjusted + ) + # logging.warning("derive_stats modified adjusted_f1: %f" % (modified_adjusted_f1)) + return [ + Stat(MetricName(self.NAME + "." + "macro", split=split)).add(adjusted_f1), + Stat(MetricName(self.NAME + "." + "modified_macro", split=split)).add(modified_adjusted_f1), + ] diff --git a/src/helm/benchmark/metrics/test_kpi_edgar_metrics.py b/src/helm/benchmark/metrics/test_kpi_edgar_metrics.py new file mode 100644 index 00000000000..694671c7944 --- /dev/null +++ b/src/helm/benchmark/metrics/test_kpi_edgar_metrics.py @@ -0,0 +1,248 @@ +import logging + +from . import kpi_edgar_metrics as kem +import helm.benchmark.scenarios.kpi_edgar_scenario as kes + + +def test_kem_tokenize(): + + text = "This is a test ." + exp_token_list = ["This", "is", "a", "test", "."] + token_list = kem.tokenize(text) + + logging.debug(token_list) + assert token_list == exp_token_list + + return + + +def test_kem_get_tagged_token_dict(): + + token_list = ["This", "is", "a", "test", "."] + exp_tagged_token_dict = {"kpi": {(1, "is"), (3, "test"), (2, "a")}, "cy": set(), "py": set(), "py1": set()} + + tagged_token_dict = kem.get_tagged_token_dict(token_list) + + logging.debug(tagged_token_dict) + assert tagged_token_dict == exp_tagged_token_dict + + return + + +def test_kem_get_tagged_token_dict2(): + + token_list = ["This", "is", "a", "test", "."] + exp_tagged_token_dict = {"kpi": {(1, "is")}, "cy": {(2, "a"), (3, "test")}, "py": set(), "py1": set()} + + tagged_token_dict = kem.get_tagged_token_dict(token_list) + + logging.debug(tagged_token_dict) + assert tagged_token_dict == exp_tagged_token_dict + + return + + +def test_kem_get_tagged_token_size_dict(): + + token_list = ["This", "is", "a", "test", "."] + exp_tagged_token_dict = {"kpi": {(1, "is")}, "cy": {(2, "a"), (3, "test")}, "py": set(), "py1": set()} + exp_tagged_size_dict = {"kpi": 1, "cy": 2, "py": 0, "py1": 0} + + (tagged_token_dict, tagged_size_dict) = kem.get_tagged_token_size_dict(token_list) + + logging.debug(tagged_token_dict) + logging.debug(tagged_size_dict) + + assert tagged_token_dict == exp_tagged_token_dict + assert tagged_size_dict == exp_tagged_size_dict + + return + + +def test_kem_get_intersection(): + gold_set = {(0, "This"), (1, "is"), (2, "a"), (3, "pen")} + pred_set = {(0, "That"), (1, "is"), (2, "a"), (3, "pen")} + exp_inter_set = {(1, "is"), (2, "a"), (3, "pen")} + inter_set = kem.get_intersection(gold_set, pred_set, False) + + logging.debug(inter_set) + assert inter_set == exp_inter_set + return + + +def test_kem_get_intersection_2(): + gold_set = {(0, "This"), (1, "is"), (2, "a"), (3, "pen")} + pred_set = {(0, "That"), (1, "is"), (2, "a"), (4, "pen")} + exp_inter_set = {(0, "is"), (0, "a"), (0, "pen")} + inter_set = kem.get_intersection(gold_set, pred_set, 
True) + + logging.debug(inter_set) + assert inter_set == exp_inter_set + return + + +def test_kem_get_tagged_size_dict(): + + gold_token_list = ["This", "is", "a", "test", "."] + pred_token_list = ["This", "is", "a", "test", "."] + # exp_tagged_token_dict = {"kpi": {(1, "is")}, "cy": {(2, "a"), (3, "test")}, "py": set(), "py1": set()} + # exp_tagged_size_dict = {"kpi": 1, "cy": 2, "py": 0, "py1": 0} + exp_intersection_size_dict = {"kpi": 1, "cy": 2, "py": 0, "py1": 0} + + tagged_size_dict = kem.get_tagged_size_dict(gold_token_list, pred_token_list, False) + + logging.debug(tagged_size_dict) + + assert tagged_size_dict["kpi"][2] == exp_intersection_size_dict["kpi"] + assert tagged_size_dict["kpi"][2] == exp_intersection_size_dict["kpi"] + + return + + +def test_kem_get_tag_and_phrase(): + extracted = "annual revenue (kpi)" + result = kem.get_tag_and_phrase(extracted) + logging.debug(result) + assert result[0] == "kpi" + assert result[1] == "annual revenue" + return + + +def test_kem_get_tag_and_phrase_2(): + extracted = "annual revenue - kpi)" + result = kem.get_tag_and_phrase(extracted) + logging.debug(result) + assert result[0] == "" + assert result[1] == "" + return + + +def test_kem_get_tag_and_phrase_3(): + extracted = "annual [which is, a yearly] revenue [kpi]" + result = kem.get_tag_and_phrase(extracted, kes.TAG_PAREN_RE) + logging.debug(result) + assert result[0] == "kpi" + assert result[1] == "annual [which is, a yearly] revenue" + return + + +def test_kem_tokenize_extraction(): + extracted = "annual revenue (kpi), 364 (cy)" + result = kem.tokenize_extraction(extracted) + logging.debug(result) + assert result[0] == "annual revenue (kpi)" + assert result[1] == "364 (cy)" + + +def test_kem_tokenize_extraction_2(): + extracted = "annual (which is a yearly) revenue [kpi], 9,364 [cy]" + result = kem.tokenize_extraction(extracted, kes.TAG_PAREN_RE) + logging.debug(result) + assert result[0] == "annual (which is a yearly) revenue [kpi]" + assert result[1] == "9,364 [cy]" + + +def test_kem_get_tagged_token_size_dict_extraction(): + entity_list = ["annual revenue (kpi)", "364 (cy)"] + result = kem.get_tagged_token_size_dict_extraction(entity_list) + logging.debug(result) + assert result[0]["kpi"] == {(0, "annual"), (0, "revenue")} + assert result[1]["cy"] == 1 + return + + +def test_kem_get_tagged_size_dict_2(): + + gold_token_list = ["annual revenue (kpi)", "364 (cy)"] + pred_token_list = ["annual revenue (kpi)", "364 (cy)"] + # exp_tagged_token_dict = {"kpi": {(1, "is")}, "cy": {(2, "a"), (3, "test")}, "py": set(), "py1": set()} + # exp_tagged_size_dict = {"kpi": 1, "cy": 2, "py": 0, "py1": 0} + exp_intersection_size_dict = {"kpi": 2, "cy": 1, "py": 0, "py1": 0} + + tagged_size_dict = kem.get_tagged_size_dict(gold_token_list, pred_token_list, False, True) + + logging.debug(tagged_size_dict) + + assert tagged_size_dict["kpi"][2] == exp_intersection_size_dict["kpi"] + assert tagged_size_dict["kpi"][2] == exp_intersection_size_dict["kpi"] + + return + + +def test_kem_compute_prrcf1(): + tp = 10 + tn = 60 + fp = 10 + fn = 10 + (pr, rc, f1) = kem.compute_prrcf1(tp, tn, fp, fn) + logging.debug((pr, rc, f1)) + assert (pr, rc, f1) == (0.5, 0.5, 0.5) + + +def test_kem_compute_prrcf1_2(): + tp = 0 + tn = 0 + fp = 0 + fn = 0 + (pr, rc, f1) = kem.compute_prrcf1(tp, tn, fp, fn) + logging.debug((pr, rc, f1)) + assert (pr, rc, f1) == (0, 0, 0) + + +def test_kem_compute_prrcf1_3(): + tp = 10 + tn = 40 + fp = 30 + fn = 10 + (pr, rc, f1) = kem.compute_prrcf1(tp, tn, fp, fn) + logging.debug((pr, rc, f1)) + 
assert (pr, rc, f1) == (0.25, 0.5, float(1) / float(3)) + + +def test_kem_compute_tptnfpfn_adjusted(): + total_token_length = 100 + stats = [20, 40, 10] + tp = 0.5 + tn = 0.0 # unused. + fp = 0.75 + fn = 0.5 + tptnfpfn = kem.compute_tptnfpfn_adjusted(stats, total_token_length) + logging.debug(tptnfpfn) + assert tptnfpfn == (tp, tn, fp, fn) + + +def test_kem_compute_tptnfpfn_adjusted_1(): + total_token_length = 100 + stats = [0, 0, 0] + tp = 0.0 + tn = 0.0 # unused. + fp = 0.0 + fn = 1.0 + tptnfpfn = kem.compute_tptnfpfn_adjusted(stats, total_token_length) + logging.debug(tptnfpfn) + assert tptnfpfn == (tp, tn, fp, fn) + + +def test_kem_compute_tptnfpfn_modified_adjusted(): + total_token_length = 100 + stats = [20, 40, 10] + tp = 10 + tn = 50 + fp = 30 + fn = 10 + tptnfpfn = kem.compute_tptnfpfn_modified_adjusted(stats, total_token_length) + logging.debug(tptnfpfn) + assert tptnfpfn == (tp, tn, fp, fn) + + +def test_kem_compute_adjusted_f1(): + + total_token_lengh = 100 + tag_stats_dict = {"pos": [20, 20, 10], "neg": [80, 80, 70]} + exp_macro_f1 = 0.6875 + + macro_f1 = kem.compute_adjusted_f1(tag_stats_dict, total_token_lengh, kem.compute_tptnfpfn_adjusted) + + logging.debug(macro_f1) + assert macro_f1 == exp_macro_f1 + return diff --git a/src/helm/benchmark/run_specs/classic_run_specs.py b/src/helm/benchmark/run_specs/classic_run_specs.py index bf692de749c..daf4cf7000a 100644 --- a/src/helm/benchmark/run_specs/classic_run_specs.py +++ b/src/helm/benchmark/run_specs/classic_run_specs.py @@ -1,52 +1,824 @@ -"""Run spec functions for the HELM Classic leaderboard. - -Website: https://crfm.stanford.edu/helm/classic/ - -If a run spec function is included in both the HELM Classic leaderboard and the -HELM Lite leaderboard, it will be included in the lite_run_specs module instead of this module. 
-This module also contains some scenarios that are currently not used on any HELM leaderboard.""" - -from typing import Any, Dict, List, Optional, Set - -from helm.benchmark.adaptation.adapter_spec import ( - ADAPT_GENERATION, +import dataclasses +import itertools +from functools import partial +from typing import Any, Callable, List, Dict, Optional, Set, TypeVar + +from helm.benchmark.model_deployment_registry import ALL_MODEL_DEPLOYMENTS, DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT +from helm.benchmark.scenarios.commonsense_scenario import ( + CommonSenseQAScenario, + HellaSwagScenario, + OpenBookQA, + PiqaScenario, + SiqaScenario, +) +from helm.common.hierarchical_logger import hlog, htrack +from helm.common.object_spec import ObjectSpec +from helm.benchmark.adaptation.adapters.adapter_factory import ( + ADAPT_LANGUAGE_MODELING, ADAPT_MULTIPLE_CHOICE_JOINT, ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, + ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED, + ADAPT_GENERATION, ADAPT_RANKING_BINARY, - AdapterSpec, ) from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter -from helm.benchmark.adaptation.common_adapter_specs import ( - get_completion_adapter_spec, - get_generation_adapter_spec, - get_language_modeling_adapter_spec, - get_multiple_choice_adapter_spec, - get_ranking_binary_adapter_spec, - get_summarization_adapter_spec, +from helm.benchmark.adaptation.adapter_spec import AdapterSpec +from .metrics.metric import MetricSpec +from .run_expander import ( + RUN_EXPANDERS, + GlobalPrefixRunExpander, + AnthropicRunExpander, + OpenAIRunExpander, + GoogleRunExpander, + StopRunExpander, + ChatMLRunExpander, + IncreaseTemperatureRunExpander, ) -from helm.benchmark.annotation.annotator import AnnotatorSpec -from helm.benchmark.metrics.common_metric_specs import ( - get_basic_metric_specs, - get_bias_metric_specs, - get_classification_metric_specs, - get_copyright_metric_specs, - get_disinformation_metric_specs, - get_exact_match_metric_specs, - get_f1_metric_specs, - get_generative_harms_metric_specs, - get_language_modeling_metric_specs, - get_numeracy_metric_specs, - get_open_ended_generation_metric_specs, - get_summarization_metric_specs, - get_basic_generation_metric_specs, - get_basic_reference_metric_specs, - get_generic_metric_specs, +from .runner import RunSpec, get_benchmark_output_path +from .scenarios.lex_glue_scenario import ( + get_lex_glue_max_train_instances, + get_lex_glue_instructions, + get_lex_glue_max_tokens, + get_lex_glue_task_type, ) -from helm.benchmark.metrics.metric import MetricSpec -from helm.benchmark.run_spec import RunSpec, run_spec_function -from helm.benchmark.runner import get_benchmark_output_path -from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path -from helm.common.hierarchical_logger import hlog, htrack +from .scenarios.scenario import ScenarioSpec, get_scenario_cache_path +from .scenarios.msmarco_scenario import MSMARCOScenario +from .scenarios.copyright_scenario import datatag2hash_code +from .scenarios.lextreme_scenario import ( + get_lextreme_instructions, + get_lextreme_max_train_instances, + get_lextreme_max_tokens, + TaskType, + get_lextreme_task_type, +) +from .scenarios.echr_judge_scenario import EchrJudgeScenario +from helm.benchmark.model_deployment_registry import ( + ModelDeployment, + get_model_deployment, +) +from helm.benchmark.model_metadata_registry import ( + ModelMetadata, + get_model_metadata, + ANTHROPIC_CLAUDE_1_MODEL_TAG, + ANTHROPIC_CLAUDE_2_MODEL_TAG, + GOOGLE_PALM_2_MODEL_TAG, + 
NO_NEWLINES_TAG, + NLG_PREFIX_TAG, + CHATML_MODEL_TAG, + OPENAI_CHATGPT_MODEL_TAG, + BUGGY_TEMP_0_TAG, +) +from helm.common.general import singleton + +INCLUDE_GENERATIVE_HARMS_METRICS = False + + +############################################################ +# Prototypical adapter specs + + +def format_instructions(instructions: str) -> str: + if len(instructions) > 0: + instructions += "\n" + return instructions + + +def get_multiple_choice_joint_adapter_spec( + instructions: str, + input_noun: Optional[str], + output_noun: str, + num_outputs: int = 5, + max_train_instances: int = 5, + max_tokens: int = 5, + sample_train: bool = True, + **kwargs, +) -> AdapterSpec: + """ + [instructions] + + [input_noun]: [input] + [reference_1] + ... + [reference_k] + [output_noun]: [output] + + [input_noun]: [input] + [reference_1] + ... + [reference_k] + [output_noun]: + """ + + return AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, + instructions=format_instructions(instructions), + input_prefix=f"{input_noun}: " if input_noun is not None else "", + input_suffix="\n" if input_noun is not None else "", + output_prefix=f"{output_noun}: ", + output_suffix="\n", + max_train_instances=max_train_instances, + num_outputs=num_outputs, + max_tokens=max_tokens, + temperature=0.0, + stop_sequences=["\n"], + sample_train=sample_train, + **kwargs, + ) + + +def get_multiple_choice_separate_adapter_spec(method: str, empty_input: bool = False) -> AdapterSpec: + """ + [input] [reference_i] + or + [reference_i] + """ + assert method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED} + + return AdapterSpec( + method=method, + instructions="", + input_prefix="", + input_suffix="", + output_prefix=" " if not empty_input else "", + output_suffix="", + # Separate is basically language modeling, so can't easily use in-context examples + max_train_instances=0, + num_outputs=1, + max_tokens=0, + temperature=0.0, + ) + + +def get_multiple_choice_adapter_spec( + method: str, + instructions: str, + input_noun: Optional[str], + output_noun: str, + max_train_instances: int = 5, + num_outputs: int = 5, + max_tokens: int = 1, + empty_input: bool = False, + sample_train: bool = True, + **kwargs, +): + """ + Toggle between joint and separate adapters. + """ + if method == ADAPT_MULTIPLE_CHOICE_JOINT: + return get_multiple_choice_joint_adapter_spec( + instructions, + input_noun, + output_noun, + max_train_instances=max_train_instances, + num_outputs=num_outputs, + max_tokens=max_tokens, + sample_train=sample_train, + **kwargs, + ) + elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}: + return get_multiple_choice_separate_adapter_spec(method, empty_input) + else: + raise ValueError(f"Invalid adaptation method: {method}") + + +def get_ranking_binary_adapter_spec( + instructions: str = "", + document_noun: str = "Passage", + query_noun: str = "Query", + output_prefix: str = "Does the passage answer the query?", + output_noun: str = "Answer", + max_train_instances: int = 4, + num_outputs: int = 1, + num_train_trials: int = 1, + temperature: float = 0.0, + max_tokens: int = 5, + **kwargs, +) -> AdapterSpec: + """ + [instructions] + + [object_noun]: [object] + [query_noun]: [query] + [prompt_noun]: [prompt_content] + [output_noun]: [output] + + ... 
+ + [object_noun]: [object] + [query_noun]: [query] + [prompt_noun]: [prompt_content] + [output_noun]: [output] + + [object_noun]: [object] + [query_noun]: [query] + [prompt_noun]: [prompt_content] + [output_noun]: [output] + """ + msg = ( + "There must be an even number of in-context examples to ensure that" + "an equal number of positive and negative examples are included." + ) + assert max_train_instances % 2 == 0, msg + max_train_instances = int(max_train_instances / 2) + + return AdapterSpec( + method=ADAPT_RANKING_BINARY, + instructions=format_instructions(instructions), + input_prefix=f"{query_noun}: ", + input_suffix="\n", + reference_prefix=f"{document_noun}: ", + reference_suffix="\n", + output_prefix=f"{output_prefix}\n{output_noun}: ", + max_train_instances=max_train_instances, + num_outputs=num_outputs, + num_train_trials=num_train_trials, + temperature=temperature, + max_tokens=max_tokens, + **kwargs, + ) + + +def get_completion_adapter_spec( + instructions: str = "", + input_prefix: str = "", + output_prefix: str = "", + output_suffix: str = "", + max_train_instances: int = 0, + temperature: float = 0.0, + num_outputs: int = 1, + max_tokens: int = 100, + stop_sequences: Optional[List] = None, # default value of `stop_sequences` is no stop sequence, + **kwargs, +) -> AdapterSpec: + """ + [input][output_prefix][output][output_suffix] + + [input][output_prefix] + """ + if stop_sequences is None: + stop_sequences = [] + + return AdapterSpec( + method=ADAPT_GENERATION, + instructions=format_instructions(instructions), + input_prefix=input_prefix, + input_suffix="", + output_prefix=output_prefix, + output_suffix=output_suffix, + max_train_instances=max_train_instances, + temperature=temperature, + num_outputs=num_outputs, + max_tokens=max_tokens, + stop_sequences=stop_sequences, + **kwargs, + ) + + +def get_generation_adapter_spec( + instructions: str = "", + input_noun: Optional[str] = None, + newline_after_input_noun: bool = False, + output_noun: Optional[str] = None, + newline_after_output_noun: bool = False, + max_train_instances: int = 5, + num_outputs: int = 1, + max_tokens: int = 5, + stop_sequences: Optional[List] = None, # default value of `stop_sequences` is ["\n"] + temperature: float = 0.0, + multi_label: bool = False, +) -> AdapterSpec: + """ + [instructions] + + [input_noun]: [input] + [output_noun]: [output] + + [input_noun]: [input] + [output_noun]: + """ + + def format_prefix(noun: Optional[str], append_new_line: bool) -> str: + """ + When `append_new_line` is False: + [input_noun]: [input] + + When `append_new_line` is True: + [input_noun]: + [input] + """ + prefix: str = f"{noun}:" if noun is not None else "" + if len(prefix) > 0: + prefix += "\n" if append_new_line else " " + return prefix + + if stop_sequences is None: + stop_sequences = ["\n"] + + return AdapterSpec( + method=ADAPT_GENERATION, + instructions=format_instructions(instructions), + input_prefix=format_prefix(input_noun, append_new_line=newline_after_input_noun), + input_suffix="\n", + output_prefix=format_prefix(output_noun, append_new_line=newline_after_output_noun), + output_suffix="\n", + max_train_instances=max_train_instances, + num_outputs=num_outputs, + max_tokens=max_tokens, + temperature=temperature, + stop_sequences=stop_sequences, + multi_label=multi_label, + ) + + +def get_instruct_adapter_spec( + num_outputs: int = 1, + max_tokens: int = 512, + temperature: float = 0.7, +) -> AdapterSpec: + """ + Zero-shot instruction-following. 
+ """ + return AdapterSpec( + method=ADAPT_GENERATION, + instructions="", + input_prefix="", + input_suffix="\n", + output_prefix="", + output_suffix="", + max_train_instances=0, + num_outputs=num_outputs, + max_tokens=max_tokens, + temperature=temperature, + stop_sequences=[], + ) + + +def get_language_modeling_adapter_spec() -> AdapterSpec: + """ + Used for language modeling. + """ + return AdapterSpec( + method=ADAPT_LANGUAGE_MODELING, + instructions="", + input_prefix="", + input_suffix="", + output_prefix="", + output_suffix="", + max_train_instances=0, + num_outputs=1, + max_tokens=0, + temperature=0.0, + ) + + +def get_summarization_adapter_spec(num_sents: Optional[int], max_train_instances: int = 5, **kwargs) -> AdapterSpec: + """ + Used for summarization. + """ + + if num_sents == 1: + out_pref = "Summarize the above article in 1 sentence.\n" + elif num_sents is None: + out_pref = "Summarize the above article.\n" + else: + out_pref = f"Summarize the above article in {num_sents} sentences.\n" + + return AdapterSpec( + method=ADAPT_GENERATION, + instructions="", + input_prefix="###\nArticle: ", + input_suffix="\n\n", + output_prefix=out_pref, + output_suffix="\n", + max_train_instances=max_train_instances, + num_outputs=1, + stop_sequences=["###"], # Separator between few-shot instances. + **kwargs, + ) + + +def get_machine_translation_adapter_spec( + source_language, target_language, max_train_instances, **kwargs +) -> AdapterSpec: + """ + Used for machine translation. + """ + return AdapterSpec( + method=ADAPT_GENERATION, + instructions=f"Translate the following sentences from {source_language} to {target_language}.", + input_prefix=f"{source_language}: ", + input_suffix="\n", + output_prefix=f"{target_language}: ", + output_suffix="\n", + max_train_instances=max_train_instances, + num_outputs=1, + stop_sequences=["\n\n"], + temperature=0.0, + **kwargs, + ) + + +############################################################ +# Examples of scenario and adapter specs + + +def get_scenario_spec1() -> ScenarioSpec: + return ScenarioSpec( + class_name="helm.benchmark.scenarios.simple_scenarios.Simple1Scenario", + args={"num_input_tokens": 5, "vocab_size": 20, "num_train_instances": 10, "num_test_instances": 10}, + ) + + +def get_scenario_spec_tiny(): + return ScenarioSpec( + class_name="helm.benchmark.scenarios.simple_scenarios.Simple1Scenario", + args={"num_input_tokens": 5, "vocab_size": 20, "num_train_instances": 2, "num_test_instances": 2}, + ) + + +def get_adapter_spec1() -> AdapterSpec: + return AdapterSpec( + method=ADAPT_GENERATION, + instructions="Please solve the following problem.\n", + max_train_instances=5, + max_eval_instances=10, + num_outputs=3, + num_train_trials=3, + model="simple/model1", + model_deployment="simple/model1", + temperature=1, + stop_sequences=["."], + ) + + +############################################################ +# Metrics + + +def get_basic_metric_specs(names: List[str]) -> List[MetricSpec]: + return [MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.BasicMetric", args={"names": names})] + + +def get_exact_match_metric_specs() -> List[MetricSpec]: + return get_basic_metric_specs( + ["exact_match", "quasi_exact_match", "prefix_exact_match", "quasi_prefix_exact_match"] + ) + + +def get_f1_metric_specs() -> List[MetricSpec]: + return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score"]) + + +def get_classification_metric_specs(delimiter: Optional[str] = None) -> List[MetricSpec]: + return [ + MetricSpec( + 
class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric", + args={"delimiter": delimiter}, + ) + ] + +def get_weighted_classification_metric_specs( + delimiter: Optional[str] = None, average: str = "weighted", class_defs: Optional[List[str]] = None +) -> List[MetricSpec]: + return [ + MetricSpec( + class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric", + args={"delimiter": delimiter, "average": average, "class_defs": class_defs}, + ) + ] + +def get_multiple_choice_classification_metric_specs() -> List[MetricSpec]: + return [ + MetricSpec( + class_name="helm.benchmark.metrics.classification_metrics.MultipleChoiceClassificationMetric", args={} + ) + ] + + +def get_bbq_metric_specs() -> List[MetricSpec]: + return [ + MetricSpec(class_name="helm.benchmark.metrics.bbq_metrics.BBQMetric", args={}) + ] + get_exact_match_metric_specs() + + +def get_msmarco_metric_specs(track: str, rank: Optional[int] = None) -> List[MetricSpec]: + # Names of the measures we want to compute. + measure_names = MSMARCOScenario.MEASURE_NAMES[track] + multiple_relevance_values = set(MSMARCOScenario.GOLD_RELATIONS[track]) != {1} + + return [ + MetricSpec( + class_name="helm.benchmark.metrics.ranking_metrics.RankingMetric", + args={ + "method": ADAPT_RANKING_BINARY, + "measure_names": measure_names, + "correct_output": BinaryRankingAdapter.RANKING_CORRECT_LABEL, + "wrong_output": BinaryRankingAdapter.RANKING_WRONG_LABEL, + "rank": rank, + "multiple_relevance_values": multiple_relevance_values, + }, + ), + ] + get_basic_metric_specs(names=[]) + + +def get_toxicity_metric_specs() -> List[MetricSpec]: + return [ + MetricSpec(class_name="helm.benchmark.metrics.toxicity_metrics.ToxicityMetric", args={}), + ] + + +def get_bias_metric_specs() -> List[MetricSpec]: + demographic_categories = ["race", "gender"] + target_categories = ["adjective", "profession"] + cross_dem_target = itertools.product(demographic_categories, target_categories) + + return [ + MetricSpec( + class_name="helm.benchmark.metrics.bias_metrics.BiasMetric", + args={"mode": "associations", "demographic_category": dem, "target_category": tgt}, + ) + for dem, tgt in cross_dem_target + ] + [ + MetricSpec( + class_name="helm.benchmark.metrics.bias_metrics.BiasMetric", + args={"mode": "representation", "demographic_category": dem}, + ) + for dem in demographic_categories + ] + + +def get_generative_harms_metric_specs(include_basic_metrics: bool = False) -> List[MetricSpec]: + # In classic HELM, we included bias/toxicity measures, but now we don't to streamline. 
+ if not INCLUDE_GENERATIVE_HARMS_METRICS: + return [] + return ( + get_bias_metric_specs() + + get_toxicity_metric_specs() + + (get_basic_metric_specs([]) if include_basic_metrics else []) + ) + + +def get_summarization_metric_specs(args: Dict[str, Any]) -> List[MetricSpec]: + return [ + MetricSpec(class_name="helm.benchmark.metrics.summarization_metrics.SummarizationMetric", args=args) + ] + get_basic_metric_specs([]) + + +def get_summarization_critique_metric_specs(num_respondents: int) -> List[MetricSpec]: + return [ + MetricSpec( + class_name="helm.benchmark.metrics.summarization_critique_metrics.SummarizationCritiqueMetric", + args={"num_respondents": num_respondents}, + ) + ] + + +def get_srn_metric_specs() -> List[MetricSpec]: + return get_basic_metric_specs(["f1_set_match", "iou_set_match", "exact_set_match"]) + + +def get_numeracy_metric_specs(run_solver: bool = False) -> List[MetricSpec]: + metric_specs: List[MetricSpec] = get_basic_metric_specs( + ["exact_match", "quasi_exact_match", "absolute_value_difference"] + ) + + # The solvers are slow to run so make them skippable + if run_solver: + metric_specs += [ + MetricSpec(class_name="helm.benchmark.metrics.numeracy_metrics.DistanceMetric", args={}), + ] + return metric_specs + + +def get_math_metric_specs(use_chain_of_thought: bool = True) -> List[MetricSpec]: + return get_basic_metric_specs(["math_equiv_chain_of_thought" if use_chain_of_thought else "math_equiv"]) + + +def get_copyright_metric_specs(args: Optional[Dict] = None) -> List[MetricSpec]: + if args is None: + args = {} + return [ + MetricSpec( + class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric", + args={**args, "name": "longest_common_prefix_length"}, + ), + MetricSpec( + class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric", + args={**args, "name": "edit_distance"}, + ), + MetricSpec( + class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric", + args={**args, "name": "edit_similarity"}, + ), + ] + get_basic_metric_specs([]) + + +def get_disinformation_metric_specs(args: Optional[Dict] = None) -> List[MetricSpec]: + if args is None: + args = {} + return [ + MetricSpec( + class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationHumanEvalMetrics", args={**args} + ), + MetricSpec( + class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationMetric", args={"name": "self_bleu"} + ), + MetricSpec( + class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationMetric", + args={"name": "monte_carlo_entropy"}, + ), + ] + get_basic_metric_specs([]) + + +def get_code_metric_specs(dataset: str, timeout: float) -> List[MetricSpec]: + if dataset == "humaneval": + return get_basic_metric_specs(["code_eval_acc", "pass"]) + else: # APPS. 
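+        # For APPS, score with APPSMetric using "test_avg" and "strict_acc", passing the timeout through in its args.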
+ args: Dict[str, Any] = {"names": ["test_avg", "strict_acc"], "timeout": timeout} + return [MetricSpec(class_name="helm.benchmark.metrics.code_metrics.APPSMetric", args=args)] + + +def get_open_ended_generation_metric_specs() -> List[MetricSpec]: + return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4"]) + + +def get_cleva_machine_translation_metric_specs() -> List[MetricSpec]: + return [ + MetricSpec( + class_name="helm.benchmark.metrics.machine_translation_metrics.CLEVAMachineTranslationMetric", args={} + ) + ] + get_basic_metric_specs([]) + + +def get_cleva_paraphrase_generation_metric_specs(alpha: float = 0.8) -> List[MetricSpec]: + return [ + MetricSpec( + class_name="helm.benchmark.metrics.paraphrase_generation_metrics.CLEVAParaphraseGenerationMetric", + args={"alpha": alpha}, # calculate iBLEU_0.8 by default + ) + ] + get_basic_metric_specs([]) + + +def get_verifiability_judgment_metric_specs() -> List[MetricSpec]: + return get_basic_metric_specs(["exact_match", "quasi_exact_match"]) + + +def get_instruction_following_critique_metric_specs(num_respondents: int) -> List[MetricSpec]: + return [ + MetricSpec( + class_name="helm.benchmark.metrics.instruction_following_critique_metrics.InstructionFollowingCritiqueMetric", # noqa E501 + args={"num_respondents": num_respondents}, + ) + ] + + +def get_cleva_topk_accuracy_metric_specs(k: int = 1, cut_off: int = 5) -> List[MetricSpec]: + return [ + MetricSpec( + class_name="helm.benchmark.metrics.cleva_accuracy_metrics.CLEVATopKAccuracyMetric", + args={"k": k, "cut_off": cut_off}, + ) + ] + + +def get_cleva_bias_metric_specs() -> List[MetricSpec]: + demographic_categories = ["race", "gender"] + target_categories = ["adjective", "profession"] + cross_dem_target = itertools.product(demographic_categories, target_categories) + + return [ + MetricSpec( + class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVABiasMetric", + args={"mode": "associations", "demographic_category": dem, "target_category": tgt}, + ) + for dem, tgt in cross_dem_target + ] + [ + MetricSpec( + class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVABiasMetric", + args={"mode": "representation", "demographic_category": dem}, + ) + for dem in demographic_categories + ] + + +def get_cleva_toxicity_metric_specs() -> List[MetricSpec]: + return [ + MetricSpec(class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVAToxicityMetric", args={}), + ] + + +def get_cleva_generative_harms_metric_specs(include_basic_metrics: bool = False) -> List[MetricSpec]: + return ( + get_cleva_bias_metric_specs() + + get_cleva_toxicity_metric_specs() + + (get_basic_metric_specs([]) if include_basic_metrics else []) + ) + + +def get_cleva_copyright_metric_spec(args: Optional[Dict] = None) -> List[MetricSpec]: + if args is None: + args = {} + return [ + MetricSpec( + class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVACopyrightMetric", + args={**args, "name": "longest_common_prefix_length"}, + ), + MetricSpec( + class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVACopyrightMetric", + args={**args, "name": "edit_distance"}, + ), + MetricSpec( + class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVACopyrightMetric", + args={**args, "name": "edit_similarity"}, + ), + ] + + +def get_cleva_generative_task_metric_spec(task: str, subtask: Optional[str], **kwargs) -> List[MetricSpec]: + CLEVA_GEN_TASK_TO_METRIC: Dict[str, Callable] = { + "opinion_mining:opinion_target_extraction": get_exact_match_metric_specs, + 
"paraphrase_generation": get_cleva_paraphrase_generation_metric_specs, + "closed_book_question_answering:generative_question_answering": get_exact_match_metric_specs, + "conceptual_generalization": get_cleva_topk_accuracy_metric_specs, + "translation:en2zh": get_cleva_machine_translation_metric_specs, + "translation:zh2en": get_cleva_machine_translation_metric_specs, + "mathematical_calculation:add": get_exact_match_metric_specs, + "mathematical_calculation:sub": get_exact_match_metric_specs, + "mathematical_calculation:mul": get_exact_match_metric_specs, + "inductive_reasoning:add": get_exact_match_metric_specs, + "inductive_reasoning:sub": get_exact_match_metric_specs, + "inductive_reasoning:mul": get_exact_match_metric_specs, + "reasoning_primitive:dyck_language": get_exact_match_metric_specs, + "reasoning_primitive:pattern_induction": get_exact_match_metric_specs, + "reasoning_primitive:pattern_matching": get_exact_match_metric_specs, + "reasoning_primitive:variable_sub": get_exact_match_metric_specs, + "subject_knowledge:art": get_exact_match_metric_specs, + "subject_knowledge:biomedicine": get_exact_match_metric_specs, + "subject_knowledge:chemistry": get_exact_match_metric_specs, + "subject_knowledge:computer_science": get_exact_match_metric_specs, + "subject_knowledge:economics": get_exact_match_metric_specs, + "subject_knowledge:geography": get_exact_match_metric_specs, + "subject_knowledge:history": get_exact_match_metric_specs, + "subject_knowledge:law": get_exact_match_metric_specs, + "subject_knowledge:literature": get_exact_match_metric_specs, + "subject_knowledge:math": get_exact_match_metric_specs, + "subject_knowledge:other_general": get_exact_match_metric_specs, + "subject_knowledge:philosophy": get_exact_match_metric_specs, + "subject_knowledge:physics": get_exact_match_metric_specs, + "subject_knowledge:politics": get_exact_match_metric_specs, + "summarization:dialogue_summarization": partial(get_basic_metric_specs, ["chinese_rouge_2"]), + "pinyin_transliteration:pinyin2zh": partial(get_basic_metric_specs, ["chinese_bleu_1"]), + "pinyin_transliteration:zh2pinyin": partial(get_basic_metric_specs, ["chinese_bleu_1"]), + "dialogue_generation:task_oriented": partial(get_basic_metric_specs, ["chinese_bleu_1"]), + "data_to_text_generation": partial(get_basic_metric_specs, ["chinese_bleu_1"]), + "mathematical_reasoning:math_word_problem": partial(get_basic_metric_specs, ["cleva_math_result_match"]), + } + + key: str = task + if subtask is not None: + key += ":" + subtask + return CLEVA_GEN_TASK_TO_METRIC[key](**kwargs) + + +def get_kpi_edgar_metric_specs() -> List[MetricSpec]: + return [MetricSpec(class_name="helm.benchmark.metrics.kpi_edgar_metrics.NERAdjustedF1Metric", args={})] + +def get_math_float_match_metric_specs() -> List[MetricSpec]: + return get_basic_metric_specs(["float_equiv"]) + +############################################################ +# Run specs + + +CANONICAL_RUN_SPEC_FUNCS: Dict[str, Callable[..., RunSpec]] = {} +"""Dict of run spec function names to run spec functions.""" + + +F = TypeVar("F", bound=Callable[..., RunSpec]) + + +def run_spec_function(name: str) -> Callable[[F], F]: + """Register the run spec function under the given name.""" + + def wrap(func: F) -> F: + if name in CANONICAL_RUN_SPEC_FUNCS: + raise ValueError(f"A run spec function with name {name} already exists") + CANONICAL_RUN_SPEC_FUNCS[name] = func + return func + + return wrap + + +@run_spec_function("simple1") +def get_simple1_spec() -> RunSpec: + """A run spec for debugging.""" 
+ return RunSpec( + name="simple1", + scenario_spec=get_scenario_spec1(), + adapter_spec=get_adapter_spec1(), + metric_specs=get_basic_metric_specs([]), + groups=[], + ) @run_spec_function("bbq") @@ -60,9 +832,7 @@ def get_bbq_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> Run input_noun="Passage", output_noun="Answer", ) - metric_specs = [ - MetricSpec(class_name="helm.benchmark.metrics.bbq_metrics.BBQMetric", args={}) - ] + get_exact_match_metric_specs() + metric_specs = get_bbq_metric_specs() return RunSpec( name=f"bbq:subject={subject},method={method}", @@ -75,8 +845,6 @@ def get_bbq_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> Run @run_spec_function("msmarco") def get_msmarco_spec(track: str, valid_topk: Optional[int] = None) -> RunSpec: - from helm.benchmark.scenarios.msmarco_scenario import MSMARCOScenario - valid_topk = None if valid_topk is None else int(valid_topk) scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.msmarco_scenario.MSMARCOScenario", @@ -85,33 +853,11 @@ def get_msmarco_spec(track: str, valid_topk: Optional[int] = None) -> RunSpec: adapter_spec: AdapterSpec = get_ranking_binary_adapter_spec(max_train_instances=4, stop_sequences=["\n"]) - # Names of the measures we want to compute. - measure_names = MSMARCOScenario.MEASURE_NAMES[track] - multiple_relevance_values = set(MSMARCOScenario.GOLD_RELATIONS[track]) != {1} - - metric_specs = ( - [ - MetricSpec( - class_name="helm.benchmark.metrics.ranking_metrics.RankingMetric", - args={ - "method": ADAPT_RANKING_BINARY, - "measure_names": measure_names, - "correct_output": BinaryRankingAdapter.RANKING_CORRECT_LABEL, - "wrong_output": BinaryRankingAdapter.RANKING_WRONG_LABEL, - "rank": valid_topk, - "multiple_relevance_values": multiple_relevance_values, - }, - ), - ] - + get_basic_reference_metric_specs() - + get_generic_metric_specs() - ) - return RunSpec( name=f"msmarco:track={track},valid_topk={valid_topk}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=metric_specs, + metric_specs=get_msmarco_metric_specs(track=track, rank=valid_topk), groups=[f"msmarco_{track}"], ) @@ -185,6 +931,28 @@ def get_custom_mcqa_spec( ) +@run_spec_function("mmlu") +def get_mmlu_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.mmlu_scenario.MMLUScenario", args={"subject": subject} + ) + + adapter_spec = get_multiple_choice_adapter_spec( + method=method, + instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.", + input_noun="Question", + output_noun="Answer", + ) + + return RunSpec( + name=f"mmlu:subject={subject},method={method}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs(), + groups=["mmlu"], + ) + + @run_spec_function("interactive_qa_mmlu") def get_interactive_qa_mmlu_spec(subject: str) -> RunSpec: scenario_spec = ScenarioSpec( @@ -233,6 +1001,42 @@ def get_wikifact_spec(k: str, subject: str) -> RunSpec: ) +@run_spec_function("commonsense") +def get_commonsense_spec(dataset: str, method: str) -> RunSpec: + # TODO Split these into their own run_spec_function. 
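+    # `dataset` must equal the `name` attribute of one of the commonsense scenarios imported at the top of this
+    # module (HellaSwagScenario, OpenBookQA, CommonSenseQAScenario, SiqaScenario, PiqaScenario); any other value
+    # raises the ValueError below.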
+ if dataset == HellaSwagScenario.name: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.commonsense_scenario.HellaSwagScenario", args={} + ) + elif dataset == OpenBookQA.name: + scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.commonsense_scenario.OpenBookQA", args={}) + elif dataset == CommonSenseQAScenario.name: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.commonsense_scenario.CommonSenseQAScenario", args={} + ) + elif dataset == SiqaScenario.name: + scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.commonsense_scenario.SiqaScenario", args={}) + elif dataset == PiqaScenario.name: + scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.commonsense_scenario.PiqaScenario", args={}) + else: + raise ValueError(f"Unknown dataset: {dataset}") + + adapter_spec = get_multiple_choice_adapter_spec( + method=method, + instructions="The following are multiple choice questions (with answers) about common sense.", + input_noun="Question", + output_noun="Answer", + ) + + return RunSpec( + name=f"commonsense:dataset={dataset},method={method}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs(), + groups=[dataset], + ) + + @run_spec_function("quac") def get_quac_spec() -> RunSpec: scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.quac_scenario.QuACScenario", args={}) @@ -295,7 +1099,7 @@ def get_twitter_aae_spec(demographic: str) -> RunSpec: name=f"twitter_aae:demographic={demographic}", scenario_spec=scenario_spec, adapter_spec=get_language_modeling_adapter_spec(), - metric_specs=get_language_modeling_metric_specs([]), + metric_specs=get_basic_metric_specs([]), groups=["twitter_aae", f"twitter_aae_{demographic}"], ) @@ -323,9 +1127,7 @@ def get_real_toxicity_prompts_spec() -> RunSpec: name="real_toxicity_prompts", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_generative_harms_metric_specs( - include_basic_metrics=True, include_generative_harms_metrics=True - ), + metric_specs=get_generative_harms_metric_specs(include_basic_metrics=True), groups=["real_toxicity_prompts"], ) @@ -345,28 +1147,50 @@ def get_synthetic_reasoning_natural_spec(difficulty: str) -> RunSpec: max_train_instances=3, # limited by the context length max_tokens=20, ) - srn_metric_specs = get_basic_metric_specs(["f1_set_match", "iou_set_match", "exact_set_match"]) return RunSpec( name=f"synthetic_reasoning_natural:difficulty={difficulty}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=srn_metric_specs + get_generative_harms_metric_specs(), + metric_specs=get_srn_metric_specs() + get_generative_harms_metric_specs(), groups=["synthetic_reasoning", "synthetic_reasoning_natural"], ) -@run_spec_function("raft") -def get_raft_spec(subset: str) -> RunSpec: - from helm.benchmark.scenarios.raft_scenario import RAFTScenario, get_raft_instructions +@run_spec_function("gsm") +def get_gsm_spec() -> RunSpec: + scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.gsm_scenario.GSM8KScenario", args={}) - scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.raft_scenario.RAFTScenario", args={"subset": subset} + # Create AdapterSpec based on the GSM8K paper: https://arxiv.org/pdf/2110.14168.pdf + adapter_spec = get_generation_adapter_spec( + input_noun="Q", + output_noun="A", + max_train_instances=5, # Due to limited context and long example length + max_tokens=400, # The paper uses 400 tokens as the max sample 
length + stop_sequences=["\n\n"], # Since answer may contain newlines, we use two as SEP ) - scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), RAFTScenario.name) - adapter_spec = get_generation_adapter_spec( - instructions=get_raft_instructions(subset, scenario_cache_path), + return RunSpec( + name="gsm", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_basic_metric_specs(["exact_match_indicator", "final_number_exact_match"]) + + get_generative_harms_metric_specs(), + groups=["gsm"], + ) + + +@run_spec_function("raft") +def get_raft_spec(subset: str) -> RunSpec: + from helm.benchmark.scenarios.raft_scenario import RAFTScenario, get_raft_instructions + + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.raft_scenario.RAFTScenario", args={"subset": subset} + ) + + scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), RAFTScenario.name) + adapter_spec = get_generation_adapter_spec( + instructions=get_raft_instructions(subset, scenario_cache_path), input_noun=None, output_noun="Label", max_tokens=30, # at most ~50 characters per label @@ -385,7 +1209,7 @@ def get_raft_spec(subset: str) -> RunSpec: def get_numeracy_spec( relation_type: str = "linear", mode: str = "function", seed: str = "0", run_solver: str = "False" ) -> RunSpec: - from helm.benchmark.scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO + from .scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO run_solver_bool: bool = True if run_solver == "True" else False del run_solver @@ -433,6 +1257,71 @@ def get_numeracy_spec( ) +@run_spec_function("math") +def get_math_spec( + subject: str, + level: str, + use_official_examples: str = "False", + use_chain_of_thought: str = "False", +) -> RunSpec: + # Convert to bools and remove the str versions + use_official_examples_bool: bool = use_official_examples == "True" + use_chain_of_thought_bool: bool = use_chain_of_thought == "True" + del use_official_examples + del use_chain_of_thought + + if use_chain_of_thought_bool: + assert not use_official_examples_bool, "Cannot use official examples when use_chain_of_thought is True." + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.math_scenario.MATHScenario", + args={ + "subject": subject, + "level": level, + "use_official_examples": use_official_examples_bool, + "use_chain_of_thought": use_chain_of_thought_bool, + }, + ) + + if use_chain_of_thought_bool: # Include the solution in the output as per https://arxiv.org/abs/2201.11903 + output_prefix = "Answer: " # Don't include LaTeX '$' delimiters + output_suffix = "\n" + instance_prefix = "###\n" # Don't include LaTeX '$' delimiters + max_tokens = 400 # Increase the number of tokens to generate + stop_sequences = ["###"] # Break at the next instance; extraneous output will be stripped out + groups = ["math_chain_of_thought"] + else: + output_prefix = "Answer: $" + output_suffix = "$\n" + instance_prefix = "###\n" + max_tokens = 20 + stop_sequences = ["$"] # Break at the nearest LaTeX closing delimiter + groups = ["math_regular"] + + adapter_spec = AdapterSpec( + method=ADAPT_GENERATION, + instructions="Given a mathematics problem, determine the answer. 
Simplify your answer as much as possible.\n", + max_train_instances=8, + num_outputs=1, + temperature=0.0, + stop_sequences=stop_sequences, + max_tokens=max_tokens, + input_prefix="Problem: ", + input_suffix="\n", + output_prefix=output_prefix, + output_suffix=output_suffix, + instance_prefix=instance_prefix, + ) + + return RunSpec( + name=f"math:subject={subject},level={level}," + f"use_official_examples={use_official_examples_bool},use_chain_of_thought={use_chain_of_thought_bool}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_math_metric_specs(use_chain_of_thought_bool) + get_generative_harms_metric_specs(), + groups=groups, + ) + + @run_spec_function("boolq") def get_boolq_spec(only_contrast=False) -> RunSpec: scenario_spec = ScenarioSpec( @@ -517,8 +1406,6 @@ def get_copyright_spec( normalize_by_prefix_length=True, normalize_newline_space_tab=False, ) -> RunSpec: - from helm.benchmark.scenarios.copyright_scenario import datatag2hash_code - scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.copyright_scenario.CopyrightScenario", args=dict(datatag=datatag) ) @@ -631,21 +1518,36 @@ def get_code_spec(dataset: str, timeout=3) -> RunSpec: max_tokens=600, ) - if dataset == "humaneval": - code_metric_specs = get_basic_metric_specs(["code_eval_acc", "pass"]) - else: # APPS. - args: Dict[str, Any] = {"names": ["test_avg", "strict_acc"], "timeout": timeout} - code_metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.code_metrics.APPSMetric", args=args)] - return RunSpec( name=f"code:dataset={dataset}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=code_metric_specs + get_generative_harms_metric_specs(), + metric_specs=get_code_metric_specs(dataset, timeout) + get_generative_harms_metric_specs(), groups=[f"code_{dataset}"], ) +@run_spec_function("natural_qa") +def get_natural_qa_spec(mode: str) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario", args={"mode": mode} + ) + + adapter_spec = get_generation_adapter_spec( + input_noun="Question" if mode == "closedbook" else None, + output_noun="Answer", + max_tokens=300, # answers are at most 65 words + ) + + return RunSpec( + name=f"natural_qa:mode={mode}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_f1_metric_specs() + get_generative_harms_metric_specs(), + groups=[f"natural_qa_{mode}"], + ) + + @run_spec_function("the_pile") def get_the_pile_spec(subset: str) -> RunSpec: scenario_spec = ScenarioSpec( @@ -656,7 +1558,7 @@ def get_the_pile_spec(subset: str) -> RunSpec: name=f"the_pile:subset={subset}", scenario_spec=scenario_spec, adapter_spec=get_language_modeling_adapter_spec(), - metric_specs=get_language_modeling_metric_specs([]), + metric_specs=get_basic_metric_specs([]), groups=["the_pile"], ) @@ -669,11 +1571,32 @@ def get_ice_spec(**kwargs) -> RunSpec: name="ice" + (":" if len(kwargs) > 0 else "") + ",".join(f"{k}={v}" for k, v in sorted(kwargs.items())), scenario_spec=scenario_spec, adapter_spec=get_language_modeling_adapter_spec(), - metric_specs=get_language_modeling_metric_specs([]), + metric_specs=get_basic_metric_specs([]), groups=["ice"], ) +@run_spec_function("narrative_qa") +def get_narrativeqa_spec() -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario", args={} + ) + + adapter_spec = get_generation_adapter_spec( + input_noun="Passage", + output_noun="Answer", + max_tokens=100, 
# max 30 words + ) + + return RunSpec( + name="narrative_qa", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(), + groups=["narrative_qa"], + ) + + @run_spec_function("synthetic_efficiency") def get_synthetic_efficiency_spec( num_prompt_tokens: Optional[int] = None, @@ -695,9 +1618,7 @@ def get_synthetic_efficiency_spec( name=f"synthetic_efficiency:random={random}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_basic_generation_metric_specs(["exact_match"]) - + get_generic_metric_specs() - + get_generative_harms_metric_specs(), + metric_specs=get_basic_metric_specs(["exact_match"]) + get_generative_harms_metric_specs(), groups=["synthetic_efficiency"], ) @@ -736,7 +1657,7 @@ def get_wikitext_103_spec() -> RunSpec: name="wikitext_103", scenario_spec=scenario_spec, adapter_spec=get_language_modeling_adapter_spec(), - metric_specs=get_language_modeling_metric_specs([]), + metric_specs=get_basic_metric_specs([]), groups=["wikitext_103"], ) @@ -884,13 +1805,40 @@ def get_dyck_language_spec(num_parenthesis_pairs: int) -> RunSpec: name=f"dyck_language_np={int(num_parenthesis_pairs)}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_basic_generation_metric_specs(["exact_match_indicator"]) - + get_generic_metric_specs() - + get_generative_harms_metric_specs(), + metric_specs=get_basic_metric_specs(["exact_match_indicator"]) + get_generative_harms_metric_specs(), groups=["dyck_language"], ) +@run_spec_function("legalbench") +def get_legalbench_spec(subset: str) -> RunSpec: + from helm.benchmark.scenarios.legalbench_scenario import ( + LegalBenchScenario, + get_legalbench_instructions, + get_legalbench_output_nouns, + ) + + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.legalbench_scenario.LegalBenchScenario", args={"subset": subset} + ) + scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), LegalBenchScenario.name) + adapter_spec = get_generation_adapter_spec( + instructions=get_legalbench_instructions(subset, scenario_cache_path), + input_noun=None, + output_noun=get_legalbench_output_nouns(subset, scenario_cache_path), + max_tokens=30, # at most ~50 characters per label, + max_train_instances=5, # Use 5 for all subsets + ) + + return RunSpec( + name=f"legalbench:subset={subset}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs(), + groups=["legalbench"], + ) + + @run_spec_function("legal_support") def get_legal_support_spec(method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec: scenario_spec = ScenarioSpec( @@ -1119,7 +2067,7 @@ def get_med_mcqa_spec() -> RunSpec: scenario_spec=scenario_spec, adapter_spec=adapter_spec, metric_specs=get_exact_match_metric_specs(), - groups=["med_mcqa"], + groups=["MedMCQA"], ) @@ -1145,89 +2093,48 @@ def get_med_paragraph_simplification_spec() -> RunSpec: ) -@run_spec_function("pubmed_qa") -def get_pubmed_qa_spec() -> RunSpec: - scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.pubmed_qa_scenario.PubMedQAScenario", args={}) +@run_spec_function("med_qa") +def get_med_qa_spec() -> RunSpec: + scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.med_qa_scenario.MedQAScenario", args={}) adapter_spec = get_multiple_choice_adapter_spec( method=ADAPT_MULTIPLE_CHOICE_JOINT, - instructions="Answer A for yes, B for no or C for maybe.", + instructions="The following are multiple choice 
questions (with answers) about medicine.", input_noun="Question", output_noun="Answer", ) return RunSpec( - name="pubmed_qa", + name="med_qa", scenario_spec=scenario_spec, adapter_spec=adapter_spec, metric_specs=get_exact_match_metric_specs(), - groups=["pubmed_qa"], - ) - - -@run_spec_function("live_qa") -def get_live_qa_spec() -> RunSpec: - scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.live_qa_scenario.LiveQAScenario") - - adapter_spec = get_generation_adapter_spec( - instructions="Please answer the following consumer health question.", - input_noun="Question", - output_noun="Answer", - max_train_instances=0, - max_tokens=512, - ) - annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.live_qa_annotator.LiveQAAnnotator")] - metric_specs = get_open_ended_generation_metric_specs() + [ - MetricSpec(class_name="helm.benchmark.metrics.live_qa_metrics.LiveQAScoreMetric") - ] - - return RunSpec( - name="live_qa", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - annotators=annotator_specs, - metric_specs=metric_specs, - groups=["live_qa"], + groups=["med_qa"], ) -@run_spec_function("medication_qa") -def get_medication_qa_spec() -> RunSpec: - scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medication_qa_scenario.MedicationQAScenario") +@run_spec_function("pubmed_qa") +def get_pubmed_qa_spec() -> RunSpec: + scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.pubmed_qa_scenario.PubMedQAScenario", args={}) - adapter_spec = get_generation_adapter_spec( - instructions="Please answer the following consumer health question.", + adapter_spec = get_multiple_choice_adapter_spec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, + instructions="Answer A for yes, B for no or C for maybe.", input_noun="Question", output_noun="Answer", - max_train_instances=0, - max_tokens=512, ) - annotator_specs = [ - AnnotatorSpec(class_name="helm.benchmark.annotation.medication_qa_annotator.MedicationQAAnnotator") - ] - metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.medication_qa_metrics.MedicationQAScoreMetric")] - return RunSpec( - name="medication_qa", + name="pubmed_qa", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - annotators=annotator_specs, - metric_specs=metric_specs, - groups=["medication_qa"], + metric_specs=get_exact_match_metric_specs(), + groups=["pubmed_qa"], ) @run_spec_function("lextreme") def get_lextreme_spec(subset: str) -> RunSpec: - from helm.benchmark.scenarios.lextreme_scenario import ( - get_lextreme_instructions, - get_lextreme_max_train_instances, - get_lextreme_max_tokens, - TaskType, - get_lextreme_task_type, - ) - task_type = get_lextreme_task_type(subset) scenario_spec = ScenarioSpec( @@ -1244,7 +2151,7 @@ def get_lextreme_spec(subset: str) -> RunSpec: multi_label=(task_type == TaskType.MLTC), ) - metric_specs = get_basic_generation_metric_specs([]) + get_generic_metric_specs() + metric_specs = get_basic_metric_specs([]) if task_type == TaskType.MLTC: metric_specs += get_classification_metric_specs(delimiter=", ") elif task_type == TaskType.SLTC: @@ -1261,14 +2168,6 @@ def get_lextreme_spec(subset: str) -> RunSpec: @run_spec_function("lex_glue") def get_lex_glue_spec(subset: str) -> RunSpec: - from helm.benchmark.scenarios.lex_glue_scenario import ( - get_lex_glue_instructions, - get_lex_glue_max_tokens, - get_lex_glue_max_train_instances, - get_lex_glue_task_type, - ) - from helm.benchmark.scenarios.lextreme_scenario import TaskType - task_type = get_lex_glue_task_type(subset) scenario_spec = 
ScenarioSpec( @@ -1285,7 +2184,7 @@ def get_lex_glue_spec(subset: str) -> RunSpec: multi_label=(task_type == TaskType.MLTC), ) - metric_specs = get_basic_generation_metric_specs([]) + get_generic_metric_specs() + metric_specs = get_basic_metric_specs([]) if task_type == TaskType.MLTC: metric_specs += get_classification_metric_specs(delimiter=", ") elif task_type == TaskType.SLTC: @@ -1384,6 +2283,92 @@ def get_eurlexsum_legal_summarization_spec(temperature: float = 0.3, device: str ) +@run_spec_function("wmt_14") +def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec: + FULL_LANGUAGE_NAMES = { + "cs": "Czech", + "de": "German", + "fr": "French", + "hi": "Hindi", + "ru": "Russian", + "en": "English", + } + source_language, target_language = language_pair.split("-") + + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.wmt_14_scenario.WMT14Scenario", + args={"source_language": source_language, "target_language": target_language}, + ) + + adapter_spec = get_machine_translation_adapter_spec( + source_language=FULL_LANGUAGE_NAMES[source_language], + target_language=FULL_LANGUAGE_NAMES[target_language], + max_train_instances=max_train_instances, + ) + + return RunSpec( + name=f"wmt_14:language_pair={language_pair}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_open_ended_generation_metric_specs(), + groups=["wmt_14"], + ) + + +@run_spec_function("self_instruct") +def get_self_instruct_spec(num_respondents: int) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.self_instruct_scenario.SelfInstructScenario", + args={}, + ) + + adapter_spec = get_instruct_adapter_spec() + + return RunSpec( + name="self_instruct", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_instruction_following_critique_metric_specs(num_respondents), + groups=["self_instruct"], + ) + + +@run_spec_function("vicuna") +def get_vicuna_spec(num_respondents: int, category: str = "all") -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.vicuna_scenario.VicunaScenario", + args={"category": category}, + ) + + adapter_spec = get_instruct_adapter_spec() + + return RunSpec( + name=f"vicuna:category={category}", # TODO: add args + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_instruction_following_critique_metric_specs(num_respondents), + groups=["vicuna"], + ) + + +@run_spec_function("grammar") +def get_grammar_spec(num_respondents: int, path: str, tags: str) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.grammar_scenario.GrammarScenario", + args={"path": path, "tags": tags}, + ) + + adapter_spec = get_instruct_adapter_spec() + + return RunSpec( + name=f"grammar:path={path},tags={tags}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_instruction_following_critique_metric_specs(num_respondents), + groups=["grammar"], + ) + + @run_spec_function("verifiability_judgment") def get_verifiability_judgment_spec() -> RunSpec: scenario_spec = ScenarioSpec( @@ -1406,7 +2391,7 @@ def get_verifiability_judgment_spec() -> RunSpec: name="verifiability_judgment", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_basic_metric_specs(["exact_match", "quasi_exact_match"]), + metric_specs=get_verifiability_judgment_metric_specs(), groups=["verifiability_judgment"], ) @@ -1446,69 +2431,567 @@ def get_opinions_qa_spec( ) -@run_spec_function("lm_entry") -def 
get_lm_entry_spec(task: str, method: str = ADAPT_GENERATION) -> RunSpec: +@run_spec_function("open_assistant") +def get_open_assistant_spec(num_respondents: int, language: str) -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.lm_entry_scenario.LMEntryScenario", - args={"task": task}, + class_name="helm.benchmark.scenarios.open_assistant_scenario.OpenAssistantScenario", + args={"language": language}, ) - adapter_spec: AdapterSpec - metric_specs: List[MetricSpec] - if method == ADAPT_MULTIPLE_CHOICE_JOINT: - if task in ["first_letter", "last_letter", "first_word", "last_word", "word_before", "word_after"]: - raise ValueError(f"Task {task} cannot be cast to multiple choice.") - - adapter_spec = get_multiple_choice_adapter_spec( - method=method, - instructions="Answer the following multiple choice question with a single letter", - input_noun="Question", - output_noun="\nAnswer", + adapter_spec = get_instruct_adapter_spec() + + return RunSpec( + name=f"open_assistant:language={language}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_instruction_following_critique_metric_specs(num_respondents), + groups=["open_assistant"], + ) + + +@run_spec_function("koala") +def get_koala_spec(num_respondents: int) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.koala_scenario.KoalaScenario", + args={}, + ) + + adapter_spec = get_instruct_adapter_spec() + + return RunSpec( + name="koala", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_instruction_following_critique_metric_specs(num_respondents), + groups=["koala"], + ) + + +@run_spec_function("anthropic_hh_rlhf") +def get_anthropic_hh_rlhf_spec(num_respondents: int, subset: str) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.anthropic_hh_rlhf_scenario.AnthropicHHRLHFScenario", + args={"subset": subset}, + ) + + adapter_spec = get_instruct_adapter_spec() + + return RunSpec( + name=f"anthropic_hh_rlhf:subset={subset}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_instruction_following_critique_metric_specs(num_respondents), + groups=["anthropic_hh_rlhf"], + ) + + +@run_spec_function("cleva") +def get_cleva_spec(task: str, version: str, subtask: Optional[str] = None, prompt_id: int = 0) -> RunSpec: + from helm.benchmark.scenarios.cleva_scenario import CLEVAScenario # noqa + + scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), CLEVAScenario.name) + CLEVAScenario.download_dataset(task, version, scenario_cache_path) + + _, prompt_setting = CLEVAScenario.get_prompt_setting(task, subtask, version, prompt_id, scenario_cache_path) + inference_parameters = CLEVAScenario.load_inference_parameters( + task, subtask, version, prompt_id, scenario_cache_path + ) + + class_name_prefix = "".join([word.capitalize() for word in task.split("_")]) + scenario_spec = ScenarioSpec( + class_name=f"helm.benchmark.scenarios.cleva_scenario.CLEVA{class_name_prefix}Scenario", + args={"version": version, "subtask": subtask, "prompt_id": prompt_id}, + ) + run_spec_name: str = f"cleva:task={task},version={version},prompt_id={prompt_id}" + if subtask: + run_spec_name += f",subtask={subtask}" + + if task in ["copyright"]: + adapter_spec = get_completion_adapter_spec( + temperature=inference_parameters.get("temperature", 0.2), + max_tokens=inference_parameters.get("max_tokens", 1024), + num_outputs=inference_parameters.get("num_outputs", 1), ) - metric_specs = 
get_exact_match_metric_specs() - elif method == ADAPT_GENERATION: - adapter_spec = get_generation_adapter_spec( - instructions="Answer the following question in one word.", - input_noun="Q", - output_noun="\nA", - # Shouldn't use any stop sequences because the task is zero-shot and thus we - # don't expect the model to magically figure out the output format. - stop_sequences=[], - # Set max_tokens to save tokens. The answer is a word so 10 tokens should suffice. - max_tokens=10, + args = {"normalize_by_prefix_length": True, "normalize_newline_space_tab": False} + metric_specs = get_cleva_copyright_metric_spec(args) + get_cleva_generative_harms_metric_specs() + elif task in ["code_synthesis"]: + adapter_spec = get_completion_adapter_spec( + instructions=prompt_setting.instructions, + temperature=inference_parameters.get("temperature", 0.2), + # Taken from the original OpenAI paper to prevent the further generation of irrelevant classes/functions + stop_sequences=inference_parameters.get("stop_sequences", ["\nclass", "\ndef", "\nif", "\nprint"]), + max_tokens=inference_parameters.get("max_tokens", 600), ) - # It makes no sense to include non-quasi exact match metrics for this task. - metric_specs = get_basic_metric_specs(["quasi_exact_match", "quasi_prefix_exact_match", "f1_score"]) + metric_specs = get_basic_metric_specs(["code_eval_acc", "pass"]) + get_cleva_generative_harms_metric_specs() + elif task in ["language_modeling"]: + adapter_spec = get_language_modeling_adapter_spec() + metric_specs = get_basic_metric_specs([]) else: - raise ValueError(f"Unknown method: {method}") + if prompt_setting.method in [ + ADAPT_MULTIPLE_CHOICE_JOINT, + ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED, + ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, + ]: + if prompt_setting.method == ADAPT_MULTIPLE_CHOICE_JOINT: + adapter_spec = AdapterSpec( + method=prompt_setting.method, + instructions=prompt_setting.instructions, + input_prefix=prompt_setting.input_prefix, + input_suffix=prompt_setting.input_suffix, + output_prefix=prompt_setting.output_prefix, + output_suffix=prompt_setting.output_suffix, + max_train_instances=inference_parameters.get("max_train_instances", 5), + num_outputs=inference_parameters.get("num_outputs", 5), + max_tokens=inference_parameters.get("max_tokens", 1), + temperature=inference_parameters.get("temperature", 0.0), + stop_sequences=inference_parameters.get("stop_sequences", ["\n"]), + sample_train=inference_parameters.get("sample_train", True), + multi_label=inference_parameters.get("multi_label", False), + ) + else: + adapter_spec = AdapterSpec( + method=prompt_setting.method, + instructions=prompt_setting.instructions, + input_prefix=prompt_setting.input_prefix, + input_suffix=prompt_setting.input_suffix, + output_prefix=prompt_setting.output_prefix, + output_suffix=prompt_setting.output_suffix, + # Separate is basically language modeling, so can't easily use in-context examples + max_train_instances=inference_parameters.get("max_train_instances", 5), + num_outputs=1, + max_tokens=0, + temperature=inference_parameters.get("temperature", 0.0), + sample_train=inference_parameters.get("sample_train", True), + ) + metric_specs = get_exact_match_metric_specs() + if task in ["fact_checking", "bias"]: + metric_specs += get_multiple_choice_classification_metric_specs() + elif prompt_setting.method == ADAPT_GENERATION: + adapter_spec = AdapterSpec( + method=prompt_setting.method, + instructions=prompt_setting.instructions, + input_prefix=prompt_setting.input_prefix, + 
input_suffix=prompt_setting.input_suffix, + output_prefix=prompt_setting.output_prefix, + output_suffix=prompt_setting.output_suffix, + max_train_instances=inference_parameters.get("max_train_instances", 5), + num_outputs=inference_parameters.get("num_outputs", 1), + max_tokens=inference_parameters.get("max_tokens", 20), + temperature=inference_parameters.get("temperature", 0.0), + stop_sequences=inference_parameters.get("stop_sequences", ["\n"]), + sample_train=inference_parameters.get("sample_train", True), + multi_label=inference_parameters.get("multi_label", True), + ) + metric_specs = ( + get_cleva_generative_task_metric_spec(task, subtask) + get_cleva_generative_harms_metric_specs() + ) + else: + raise ValueError( + f"{task} can only be {ADAPT_GENERATION}, {ADAPT_MULTIPLE_CHOICE_JOINT}, " + f"{ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED} or {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL}" + ) + + return RunSpec( + name=run_spec_name, + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=metric_specs, + groups=["cleva", f"cleva_{task}"], + ) + +@run_spec_function("financial_phrasebank") +def get_financial_phrasebank_spec(subset: str = "sentences_50agree") -> RunSpec: + from .scenarios import financial_phrasebank_scenario + + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.financial_phrasebank_scenario.FinancialPhrasebankScenario", + args={"subset": subset}, + ) + + adapter_spec = get_generation_adapter_spec( + instructions=financial_phrasebank_scenario.get_instructions(), + input_noun=None, + output_noun="Label", + max_tokens=30, # at most ~50 characters per label + ) + + return RunSpec( + name=f"financial_phrasebank:subset={subset}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs() + get_weighted_classification_metric_specs(), + groups=["financial_phrasebank"], + ) + +@run_spec_function("news_headline") +def get_news_headline_spec(category: str) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.news_headline_scenario.NewsHeadlineScenario", + args={"category": category}, + ) + + adapter_spec = get_generation_adapter_spec(input_noun="Passage", output_noun="Answer") + + return RunSpec( + name=f"news_headline:category={category}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(), + groups=["news_headline"], + ) + +@run_spec_function("kpi_edgar") +def get_kpi_edgar_spec() -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.kpi_edgar_scenario.KPIEDGARScenario", + args={}, + ) + + adapter_spec = get_generation_adapter_spec( + input_noun=None, output_noun="Answer", max_tokens=100, max_train_instances=20 + ) + + return RunSpec( + name="kpi_edgar", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_f1_metric_specs() + get_kpi_edgar_metric_specs(), + groups=["kpi_edgar"], + ) + +@run_spec_function("conv_fin_qa") +def get_conv_fin_qa_spec() -> RunSpec: + scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.conv_fin_qa_scenario.ConvFinQAScenario", args={}) + + adapter_spec = get_generation_adapter_spec(input_noun="Passage", output_noun="Answer") + + return RunSpec( + name="conv_fin_qa", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_math_float_match_metric_specs(), + groups=["conv_fin_qa"], + ) + +@run_spec_function("legal_opinion") +def get_legal_opinion_spec() -> RunSpec: + 
scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.legal_opinion_scenario.LegalOpinionScenario", args={} + ) + + instructions = "Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative." # noqa + adapter_spec = get_generation_adapter_spec( + instructions=instructions, + output_noun="Label", + ) + + return RunSpec( + name="legal_opinion", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs() + get_weighted_classification_metric_specs(), + groups=["legal_opinion"], + ) + +# A different implementation (binary classification) of lex_glue_fixed:subset=ecthr_a +@run_spec_function("echr_judge") +def get_echr_judge_spec() -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.echr_judge_scenario.EchrJudgeScenario", args={"doc_max_length": 600} + ) + + adapter_spec = get_generation_adapter_spec( + # instructions=EchrJudgeScenario.PROMPT_INST, # simple intsruction + instructions=EchrJudgeScenario.PROMPT_INST_WITH_EX, # instruction with trivial examples + input_noun=EchrJudgeScenario.PROMPT_INPUT, + output_noun=EchrJudgeScenario.PROMPT_OUTPUT, + max_tokens=1, + ) + + return RunSpec( + name="echr_judge", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs() + get_weighted_classification_metric_specs(), + groups=["echr_judge"], + ) + +# A different implementation of lex_glue_fixed:subset=case_hold +@run_spec_function("casehold_qa") +def get_casehold_qa_spec() -> RunSpec: + scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.casehold_qa_scenario.CaseHOLDQAScenario", args={}) + + method = ADAPT_MULTIPLE_CHOICE_JOINT + adapter_spec = get_multiple_choice_adapter_spec( + method=method, + instructions="Give a letter answer among A, B, C, D, or E.", + input_noun="Passage", + output_noun="Answer", + max_train_instances=2, + ) + + metric_specs = get_f1_metric_specs() return RunSpec( - name=f"lm_entry:task={task},method={method}", + name="casehold_qa", scenario_spec=scenario_spec, adapter_spec=adapter_spec, metric_specs=metric_specs, - groups=["lm_entry"], + groups=["CaseHOLDQA"], ) +@run_spec_function("legal_contract") +def get_legal_contract_spec() -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.legal_contract_scenario.LegalContractScenario", + args={}, + ) -@run_spec_function("thai_exam") -def get_thai_exam_spec(exam: str = "onet", method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec: + adapter_spec = get_generation_adapter_spec( + output_noun="Summary", + max_tokens=100, # <=1536 (Limited by BAM) + stop_sequences=["\n\n"], # workaround for the first \n char with gpt-neox-20b + ) + + return RunSpec( + name="legal_contract", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_basic_metric_specs(["rouge_1", "rouge_2", "rouge_l"]), + groups=["legal_contract"], + ) + +@run_spec_function("sumosum") +def get_sumosum_spec() -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.thai_exam_scenario.ThaiExamScenario", args={"exam": exam} + class_name="helm.benchmark.scenarios.sumosum_scenario.SUMOSumScenario", + args={ + # "sampling_min_length": 100, + # "sampling_max_length": 700, + # "doc_max_length": 3700, + }, + ) + + instructions = "Generate the title of the following article." 
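+    # With the generation adapter configured below, each request is rendered roughly as
+    # (illustrative sketch, exact formatting depends on the adapter defaults):
+    #   Generate the title of the following article.
+    #   <article text>
+    #   Title:
+    # and the completion is scored against the reference title with the ROUGE metrics below.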
+ adapter_spec = get_generation_adapter_spec( + instructions=instructions, + output_noun="Title", + max_train_instances=0, + max_tokens=100, # <=1536 (Limited by BAM) + stop_sequences=["\n\n"], # workaround for the first \n char with gpt-neox-20b + ) + + # NOTE doc_max_length(3700 words) + max_tokens(100 tokens) <= max_request_length(4096 tokens) + # see EncoderDecoderWindowService.fits_within_context_window + + return RunSpec( + name="sumosum", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_basic_metric_specs(["rouge_1", "rouge_2", "rouge_l"]), + groups=["sumosum"], + ) + +@run_spec_function("cti_mitre") +def get_cti_mitre_spec(num_options: int = 10, seed: int = 42, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.cti_mitre_scenario.CtiMitreScenario", + args={ + "num_options": num_options, + "seed": seed, + }, ) adapter_spec = get_multiple_choice_adapter_spec( method=method, - instructions="The following are multiple choice questions (with answers).", - input_noun="Question", + instructions="Answer the possible security attacks in each of the following situations from each of the options below.", # noqa + input_noun="Situation", output_noun="Answer", - max_train_instances=5, + max_train_instances=10, ) return RunSpec( - name=f"thai_exam:exam={exam},method={method}", + name=f"cti_mitre:num_options={num_options},seed={seed},method={method}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs(), - groups=["thai_exam", f"thai_exam_{exam}"], + metric_specs=get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score"]), + groups=["cti_mitre"], ) + + +############################################################ + + +def get_default_model_deployment_for_model( + model_name: str, warn_arg_deprecated: bool = False, ignore_deprecated: bool = False +) -> Optional[str]: + """Returns a valid model deployment name corresponding to the given model arg. + This is used as a backwards compatibility layer for model names that are now moved to model deployments. + Example: "anthropic/claude-v1.3" => "anthropic/claude-v1.3" + Example: "meta/llama-7b" => "together/llama-7b" + + The process to find a model deployment name is as follows: + 1. If there is a model deployment with the same name as the model arg, use it. + 2. If there is at least one deployment for the model, use the first one that is available. + 3. If there are no deployments for the model, returns None. + + This function will also try to find a model deployment name that is not deprecated. + If there are no non-deprecated deployments, it will return the first deployment (even if it's deprecated). + If ignore_deprecated is True, this function will return None if the model deployment is deprecated. + + If warn_arg_deprecated is True, this function will print a warning if the model deployment name is not the same + as the model arg. This is to remind the user that the model name is deprecated and should be replaced with + the model deployment name (in their config). + + Args: + model_arg: The model arg to convert to a model deployment name. + warn_arg_deprecated: Whether to print a warning if the model deployment name is not the same as the model arg. + ignore_deprecated: Whether to return None if the model deployment is deprecated. + """ + + # If there is a model deployment with the same name as the model arg, use it. 
+ if model_name in DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT: + deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_name] + if deployment.deprecated and ignore_deprecated: + if warn_arg_deprecated: + hlog(f"WARNING: Model deployment {model_name} is deprecated") + return None + return deployment.name + + # If there is at least one deployment for the model, use the first one that is available. + available_deployments: List[ModelDeployment] = [ + deployment for deployment in ALL_MODEL_DEPLOYMENTS if deployment.model_name == model_name + ] + if len(available_deployments) > 0: + available_deployment_names: List[str] = [deployment.name for deployment in available_deployments] + if warn_arg_deprecated: + hlog("WARNING: Model name is deprecated. Please use the model deployment name instead.") + hlog(f"Available model deployments for model {model_name}: {available_deployment_names}") + + # Additionally, if there is a non-deprecated deployment, use it. + non_deprecated_deployments: List[ModelDeployment] = [ + deployment for deployment in available_deployments if not deployment.deprecated + ] + if len(non_deprecated_deployments) > 0: + chosen_deployment = non_deprecated_deployments[0] + # There are no non-deprecated deployments, so there are two options: + # 1. If we can return an empty string, return it. (no model deployment is available) + # 2. If we can't return an empty string, return the first deployment (even if it's deprecated). + elif ignore_deprecated: + return None + else: + chosen_deployment = available_deployments[0] + if warn_arg_deprecated: + hlog(f"WARNING: All model deployments for model {model_name} are deprecated.") + if warn_arg_deprecated: + hlog( + f"Choosing {chosen_deployment.name} (the first one) as " + f"the default model deployment for model {model_name}" + ) + hlog("If you want to use a different model deployment, please specify it explicitly.") + return chosen_deployment.name + + # Some models are added but have no deployments yet. + # In this case, we return None. + return None + + +def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]: + """ + Takes a specification (name, args) and returns a list of `RunSpec`s. + """ + # Note that we are abusing `spec` a bit because the name is not actually a class name. 
+ name = spec.class_name + args = spec.args + + if name not in CANONICAL_RUN_SPEC_FUNCS: + raise ValueError(f"Unknown run spec name: {name}") + + # Peel off the run expanders (e.g., model) + expanders = [RUN_EXPANDERS[key](value) for key, value in args.items() if key in RUN_EXPANDERS] # type: ignore + args = dict((key, value) for key, value in args.items() if key not in RUN_EXPANDERS) + + # Get the canonical run specs + run_specs = [CANONICAL_RUN_SPEC_FUNCS[name](**args)] + + # Apply expanders + for expander in expanders: + run_specs = [ + child_run_spec for parent_run_spec in run_specs for child_run_spec in expander.expand(parent_run_spec) + ] + + def alter_run_spec(run_spec: RunSpec) -> RunSpec: + if not run_spec.adapter_spec.model and not run_spec.adapter_spec.model_deployment: + raise ValueError("At least one of model_deployment and model must be specified") + elif not run_spec.adapter_spec.model and run_spec.adapter_spec.model_deployment: + # Infer model from model deployment + default_model_name = get_model_deployment(run_spec.adapter_spec.model_deployment).model_name + if not default_model_name: + default_model_name = run_spec.adapter_spec.model_deployment + run_spec = dataclasses.replace( + run_spec, + adapter_spec=dataclasses.replace(run_spec.adapter_spec, model=default_model_name), + ) + elif run_spec.adapter_spec.model and not run_spec.adapter_spec.model_deployment: + # Infer model deployment from model + default_model_deployment = get_default_model_deployment_for_model(run_spec.adapter_spec.model) + if not default_model_deployment: + raise ValueError( + f"Unknown model or no default model deployment found for model {run_spec.adapter_spec.model}" + ) + run_spec = dataclasses.replace( + run_spec, + adapter_spec=dataclasses.replace(run_spec.adapter_spec, model_deployment=default_model_deployment), + ) + + # Both model and model_deployment should now be filled + assert run_spec.adapter_spec.model_deployment + assert run_spec.adapter_spec.model + + model: ModelMetadata = get_model_metadata(run_spec.adapter_spec.model) + deployment: ModelDeployment = get_model_deployment(run_spec.adapter_spec.model_deployment) + if run_spec.adapter_spec.model != deployment.model_name: + raise ValueError( + f"Invalid RunSpec: selected model deployment '{run_spec.adapter_spec.model_deployment}'" + f"for model '{run_spec.adapter_spec.model}' but the model deployment is " + f"for a different model '{deployment.model_name}'" + ) + # For models that strip newlines, when we're generating, we need to set + # the delimiter to be '###' so we stop properly. 
+ if NO_NEWLINES_TAG in model.tags and run_spec.adapter_spec.method in ( + ADAPT_GENERATION, + ADAPT_MULTIPLE_CHOICE_JOINT, + ): + stop_expander = StopRunExpander(value="hash") + run_spec = singleton(stop_expander.expand(run_spec)) + + if NLG_PREFIX_TAG in model.tags: + global_prefix_expander = GlobalPrefixRunExpander(value="nlg") + run_spec = singleton(global_prefix_expander.expand(run_spec)) + + if CHATML_MODEL_TAG in model.tags: + chatml_expander = ChatMLRunExpander() + run_spec = singleton(chatml_expander.expand(run_spec)) + + # Anthropic prompts + if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags or ANTHROPIC_CLAUDE_2_MODEL_TAG in model.tags: + run_spec = singleton(AnthropicRunExpander().expand(run_spec)) + + # OpenAI prompts + if OPENAI_CHATGPT_MODEL_TAG in model.tags: + run_spec = singleton(OpenAIRunExpander().expand(run_spec)) + + # Google prompts + if GOOGLE_PALM_2_MODEL_TAG in model.tags: + run_spec = singleton(GoogleRunExpander().expand(run_spec)) + + # For multiple choice + if BUGGY_TEMP_0_TAG in model.tags and run_spec.adapter_spec.temperature == 0: + increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4) + run_spec = singleton(increase_temperature_expander.expand(run_spec)) + + return run_spec + + run_specs = [alter_run_spec(run_spec) for run_spec in run_specs] + + return run_specs diff --git a/src/helm/benchmark/scenarios/casehold_qa_scenario.py b/src/helm/benchmark/scenarios/casehold_qa_scenario.py new file mode 100644 index 00000000000..5e818a7d2e5 --- /dev/null +++ b/src/helm/benchmark/scenarios/casehold_qa_scenario.py @@ -0,0 +1,109 @@ +import json +import os +import os.path +import shutil +import datasets +from typing import List, Dict, Any, cast + +from helm.common.general import ensure_directory_exists +from helm.benchmark.scenarios.scenario import ( + Scenario, + Instance, + Reference, + TRAIN_SPLIT, + VALID_SPLIT, + TEST_SPLIT, + ALL_SPLITS, + CORRECT_TAG, + PassageQuestionInput, + Output, +) + + +def download_dataset(data_path: str, splits: List[str]): + if False not in [os.path.exists(f"{data_path}/{split}.jsonl") for split in splits]: + return + + # https://huggingface.co/docs/datasets/index + ds_names: List[str] = datasets.list_datasets() + # https://huggingface.co/casehold + # https://huggingface.co/datasets/casehold/casehold + ds_name = "casehold/casehold" + if ds_name not in ds_names: + raise Exception(f"{ds_name} not included in datasets") + casehold: datasets.DatasetDict = cast(datasets.DatasetDict, datasets.load_dataset(ds_name)) + + for split in splits: + casehold[split].to_json(f"{data_path}/{split}.jsonl") + + # **WORK-AROUND** + # since "test.jsonl" includes no label info., we use "validation.jsonl" as a substitute. + if os.path.exists(f"{data_path}/test.jsonl"): + os.remove(f"{data_path}/test.jsonl") + shutil.copy(f"{data_path}/validation.jsonl", f"{data_path}/test.jsonl") + + +class CaseHOLDQAScenario(Scenario): + """ + CaseHOLD QA + CaseHOLD is a multiple choice question answering task derived from legal citations in judicial rulings. + CaseHOLD consists of ~53,000 questions, mined from the Harvard Law Library case law corpus. + + Dataset repository + https://huggingface.co/datasets/casehold/casehold + Publication + "When Does Pretraining Help? Assessing Self-Supervised Learning for Law and the CaseHOLD Dataset" + ICAIL, 2021 + https://reglab.stanford.edu/data/casehold-benchmark/ + https://arxiv.org/abs/2104.08671 + + Data content + The citing context from the judicial decision serves as the prompt for the question. 
+ The answer choices are holding statements derived from citations following text in a legal decision. + There are five answer choices for each citing text. + The correct answer is the holding statement that corresponds to the citing text. + The four incorrect answers are other holding statements. + + """ + + name = "casehold_qa" + description = "CaseHOLD QA" + tags = ["question_answering", "legal"] + + splits_dict = {TRAIN_SPLIT: "train", VALID_SPLIT: "validation", TEST_SPLIT: "test"} + + def __init__(self, splits: List[str] = ALL_SPLITS): + super().__init__() + self.splits = splits + + def get_instances(self, output_path: str) -> List[Instance]: + data_path: str = os.path.join(output_path, "data") + ensure_directory_exists(data_path) + download_dataset(data_path, list(self.splits_dict.values())) + + def to_instance(line: str, split: str) -> Instance: + case: Dict[str, Any] = json.loads(line) + example_id: int = case["example_id"] + context: str = case["citing_prompt"] + question: str = "holding statement" + holdings: List[str] = [case[f"holding_{i}"] for i in range(5)] + label: str = case["label"] + instance: Instance = Instance( + input=PassageQuestionInput(passage=context, question=question), + references=[ + Reference(Output(text=holdings[i]), tags=([CORRECT_TAG] if label == str(i) else [])) + for i in range(5) + ], + split=split, + id=str(example_id), + ) + return instance + + instances: List[Instance] = [] + # TRAIN, VALID + for split in self.splits: + with open(f"{data_path}/{self.splits_dict[split]}.jsonl", mode="r") as f: + for line in f.readlines(): + instances.append(to_instance(line, split)) + + return instances diff --git a/src/helm/benchmark/scenarios/conv_fin_qa_scenario.py b/src/helm/benchmark/scenarios/conv_fin_qa_scenario.py new file mode 100644 index 00000000000..64038ba3d93 --- /dev/null +++ b/src/helm/benchmark/scenarios/conv_fin_qa_scenario.py @@ -0,0 +1,188 @@ +import json +import os +from typing import Dict, List, Tuple, Any, Optional +import re + +from helm.common.general import ensure_file_downloaded, ensure_directory_exists +from .scenario import ( + Scenario, + Instance, + Reference, + TRAIN_SPLIT, + VALID_SPLIT, + CORRECT_TAG, + PassageQuestionInput, + Output, +) + + +def _strip_string(str: str) -> Any: + # from https://stackoverflow.com/a/4703508 + numeric_const_pattern = r"[-+]?(?:(?:\d*\.\d+)|(?:\d+\.?))(?:[Ee][+-]?\d+)?" + match = re.search(numeric_const_pattern, str) + if match: + try: + return float(str[match.start() : match.end()]) + except Exception: + return None + return None + + +def float_equiv(str1: Optional[str], str2: Optional[str], eps: float = 1e-6) -> float: + """ + extract the first numbers in the two strings and compare them + """ + if str1 is None and str2 is None: + print("WARNING: Both None") + return 1.0 + if str1 is None or str2 is None: + return 0.0 + + try: + ss1 = _strip_string(str1) + ss2 = _strip_string(str2) + print(f"{str1}: ({ss1}) == {str2}: ({ss2})? {float(abs(ss1 - ss2) < eps)}") + + if ss1 is None or ss2 is None: + return 0.0 + return float(abs(ss1 - ss2) < eps) + except Exception: + return float(str1 == str2) + + +class ConvFinQAScenario(Scenario): + """ ConvFinQA Financial Conversations (Numerical Reasoning) + + Description: + ConvFinQA - Exploring the Chain of Numerical Reasoning in Conversational Finance Question Answering. + + Prompt: +Passage: Table: +{Table} +Text: +Questions: Question: {Question}? The answer is {Answer} +{Question}? The answer is {Answer} +{Question}? The answer is {Answer} +{Question}? 
The answer is +Answer: + + Data source: + https://github.com/czyssrs/ConvFinQA + + Reference: + Zhiyu Chen, Shiyang Li, Charese Smiley, Zhiqiang Ma, Sameena Shah, and William Yang Wang. 2022. + ConvFinQA: Exploring the Chain of Numerical Reasoning in Conversational Finance Question Answering. + In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, + pages 6279–6292, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics. + https://aclanthology.org/2022.emnlp-main.421 + + """ # noqa + + """ Information on this class""" + name = "conv_fin_qa" + description = "Conversitional Finance QA" + tags = ["question_answering", "finance"] + + """ Class variables """ + # Dataset file name + DATASET_DOWNLOAD_URL: str = "https://github.com/czyssrs/ConvFinQA/raw/main/data.zip" + DATASET_FILE_NAME = "ConvFinQA" + + def __init__(self): + super().__init__() + + def download_dataset(self, output_path: str): + """Downloads the con_fin_qa dataset.""" + + # Download the raw data + data_dir = os.path.join(output_path, "data") + ensure_directory_exists(data_dir) + ensure_file_downloaded( + source_url=self.DATASET_DOWNLOAD_URL, + target_path=os.path.join(data_dir, self.DATASET_FILE_NAME), + unpack=True, + unpack_type="unzip", + ) + + def get_table_text(self, table: List[List[str]]) -> str: + """table in the format of List of columns""" + return "~".join(["|".join(col) for col in table]) + + def make_pseudo_markdown_table(self, array, line_sep="\n"): + markdown = str("|") + + for e in array[0]: + to_add = " " + str(e) + str(" |") + markdown += to_add + markdown += line_sep + + for entry in array[1:]: + markdown += str("| ") + for e in entry: + to_add = str(e) + str(" | ") + markdown += to_add + markdown += line_sep + + return markdown + + def get_instance_dict(self, dic, sep: str = "\n") -> Dict[str, Any]: + linearized_table = self.make_pseudo_markdown_table(dic["table"], line_sep=sep) + + if "gold_ind" in dic["annotation"]: + facts = dic["annotation"]["gold_ind"] + elif "gold_inds" in dic["annotation"]: + facts = dic["annotation"]["gold_inds"] + else: + facts = {} + + text = "" + for fact_type, fact in facts.items(): + if "text" in fact_type: + text += fact + context = "" + for ind, q in enumerate(dic["annotation"]["cur_dial"]): + if ind < len(dic["annotation"]["cur_dial"]) - 1: + context += q + " The answer is " + str(dic["annotation"]["exe_ans_list"][ind]) + " " + sep + else: + context += q + " The answer is " + doc = f"Table: {sep}{linearized_table}{sep}Text: {text}{sep}Questions: " + answer = str(dic["annotation"]["exe_ans"]) + return { + "input": PassageQuestionInput(passage="".join(doc), question=context, separator=" "), + "references": [Reference(Output(text=answer), tags=[CORRECT_TAG])], + } + + def load_dataset(self, output_path: str) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + """Loads the dataset downloaded in download_dataset().""" + folder_path = os.path.join(output_path, "data", self.DATASET_FILE_NAME) + train_data = [] + dev_data = [] + + with open(os.path.join(folder_path, "train_turn.json"), encoding="utf-8") as f: + train_raw_data = json.load(f) + + for problem in train_raw_data: + train_data.append(self.get_instance_dict(problem)) + + with open(os.path.join(folder_path, "dev_turn.json"), encoding="utf-8") as f: + dev_raw_data = json.load(f) + + for problem in dev_raw_data: + dev_data.append(self.get_instance_dict(problem)) + + return train_data, dev_data + + def get_instances(self, output_path: str) -> List[Instance]: + 
"""Returns the instances for this scenario.""" + # Body of the function + self.download_dataset(output_path) + train_data, dev_data = self.load_dataset(output_path) + train_k = 5 + train_instances = [ + Instance(input=d["input"], references=d["references"], split=TRAIN_SPLIT) for d in train_data[:train_k] + ] + valid_instances = [ + Instance(input=d["input"], references=d["references"], split=VALID_SPLIT) for d in dev_data[:1000] + ] + print("length of validate:", len(valid_instances)) + return train_instances + valid_instances diff --git a/src/helm/benchmark/scenarios/cti_mitre_scenario.py b/src/helm/benchmark/scenarios/cti_mitre_scenario.py new file mode 100644 index 00000000000..36083f54e10 --- /dev/null +++ b/src/helm/benchmark/scenarios/cti_mitre_scenario.py @@ -0,0 +1,304 @@ +import os +import json +import random +from typing import List, Dict +import pandas as pd +from pandas import DataFrame +from helm.common.general import ensure_file_downloaded, ensure_directory_exists +from .scenario import ( + Scenario, + Instance, + Reference, + TRAIN_SPLIT, + TEST_SPLIT, + CORRECT_TAG, + Input, + Output, +) + + +class CtiMitreScenario(Scenario): + """ + Original Task: + - The original task is to classify the description of the situation regarding the system + into the security threats in that situation. + - The classification categories are the approximately 200 categories of attack techniques + in the enterprise as defined by MITRE ATT&CK v10. + + Implemented Task: + - Since classification into so many classes is difficult to handle in a generative language model + such as GPT itself, we implement this task as a multiple-choice task. + - Each choice is the name of the attack technique category into which the description is classified. + - The number of options is determined by the parameter (num_options). + - The minimum number of options is 2 and the maximum is 199, the number of all categories of + attack methods defined in MITRE ATT&CK v10. + - From the 199 choices, num_options choices, including the correct answer and a default case, + are randomly selected and used. + - If num_options is not specified, all 199 category names will be used as choices. + + Data: + - dataset.csv + - Target dataset + - https://github.com/dessertlab/cti-to-mitre-with-nlp/raw/main/data/dataset.csv + - This data is of the form [sentence, label_tec, label_subtec, tec_name] + - sentence: the description + - label_tec: label for attack technique category + - label_subtec: label for attack technique subcategory + - tec_name : name(simple description) for attack technique subcategory + - Note: we need to extract name for attack technique category + from enterprise-attack.json + + - enterprise-attack.json + - https://github.com/mitre/cti/archive/refs/tags/ATT&CK-v10.1.zip + - /mitre_v10/enterprise-attack/enterprise-attack.json + - This data contains relation from attack technique name to attack technique label + - we can extract attack technique category name for label_tec using this json data. + + + Prompt: (k is specified by num_options) + ----------------------- + Answer the possible security attacks in each of the following situations from each of the options below. + [instruction] + + Situation: [in context examples] + A. + B. + ... + Y. + Z. Others + Answer: + + ... (Examples are output as long as the length allows) ... + + Situation: [target question] + A. + B. + ... + Y. + Z. 
+    Answer:
+    -----------------------
+
+    Example prompt (num_options = 5):
+    -----------------------
+    Answer the possible security attacks in each of the following situations from each of the options below.
+
+    Situation: ZxShell can launch a reverse command shell.
+    A. Command and Scripting Interpreter
+    B. System Shutdown/Reboot
+    C. Exfiltration Over C2 Channel
+    D. Direct Volume Access
+    E. Others
+    Answer: A
+
+    ....(Omitted)...
+
+    Situation: APC injection is a method of executing arbitrary code in the address space.
+    A. Event Triggered Execution
+    B. Process Injection
+    C. Non-Application Layer Protocol
+    D. Escape to Host
+    E. Others
+    Answer: B
+
+    Situation: Timestomping may be used along with file name Masquerading to hide malware and tools.
+    A. Search Victim-Owned Websites
+    B. Internal Spearphishing
+    C. Application Layer Protocol
+    D. Indicator Removal on Host
+    E. Others
+    Answer:
+    -----------------------
+
+    Reference:
+    V. Orbinato, M. Barbaraci, R. Natella, and D. Cotroneo,
+    “Automatic Mapping of Unstructured Cyber Threat Intelligence: An Experimental Study,”
+    in Proceedings of the 33rd IEEE International Symposium on Software Reliability Engineering (ISSRE), 2022.
+    https://ieeexplore.ieee.org/abstract/document/9978947
+
+    """
+
+    # Scenario name, description, and tags
+    name = "cti_mitre"
+    description = "Classification of security threat descriptions into MITRE ATT&CK attack technique categories"
+    tags = ["classification", "MITRE ATT&CK", "cyber_security"]
+
+    # Ratio used to split the dataset into train and test data
+    train_ratio = 0.7
+
+    # Default number of options: MITRE ATT&CK v10.1 defines 199 attack technique categories
+    MAX_NUM_OPTIONS = 199
+
+    # Text of the default "Others" option
+    OTHERS_OPTION = "Others"
+
+    # Methods
+
+    def __init__(self, num_options=None, seed=None):
+        """
+        num_options: int, number of choices in the multiple-choice task
+        seed: int, seed for the random module; if specified, it is passed to random.seed()
+        """
+        super().__init__()
+        # Dataset URL and file name
+        self.dataset_all_url = "https://github.com/dessertlab/cti-to-mitre-with-nlp/raw/main/data/dataset.csv"
+        self.dataset_all_name = "dataset.csv"
+        # MITRE ATT&CK (v10) URL and paths
+        self.mitre_att_ck_v10_url = "https://github.com/mitre/cti/archive/refs/tags/ATT&CK-v10.1.zip"
+        self.mitre_dir = "mitre_v10"
+        self.enterprise_attack_dir = "enterprise-attack"
+        self.enterprise_attack_json = "enterprise-attack.json"
+        # Number of options: if num_options is not specified or out of range, use MAX_NUM_OPTIONS
+        if num_options is not None and 0 < num_options <= CtiMitreScenario.MAX_NUM_OPTIONS:
+            self.num_options = num_options
+        else:
+            self.num_options = CtiMitreScenario.MAX_NUM_OPTIONS
+        # Seed the random module
+        random.seed(seed)
+        self.rand = random
+
+    def download_dataset(self):
+        """Downloads dataset.csv."""
+        data_dir = self.data_dir
+        ensure_directory_exists(data_dir)
+        ensure_file_downloaded(
+            source_url=self.dataset_all_url,
+            target_path=os.path.join(data_dir, self.dataset_all_name),
+        )
+
+    def download_MITRE_info(self):
+        """Downloads the zip file containing enterprise-attack.json."""
+        data_dir = self.data_dir
+        ensure_directory_exists(data_dir)
+        ensure_file_downloaded(
+            source_url=self.mitre_att_ck_v10_url,
+            target_path=os.path.join(data_dir, self.mitre_dir),
+            unpack=True,
+            unpack_type="unzip",
+        )
+
+    @staticmethod
+    def make_label_category_name_dict(jdata) -> Dict[str, str]:
+        """
+        Builds the mapping from label_tec (attack technique category label) to the attack
+        technique category name.
+        - jdata is the parsed JSON object of enterprise-attack.json
+        """
+        label_cname: Dict[str, str] = {}
+        if jdata is None:
+            return label_cname
+        for obj in jdata["objects"]:
+            if obj["type"] == "attack-pattern":
+                if "x_mitre_is_subtechnique" in obj and not obj["x_mitre_is_subtechnique"]:
+                    extrefs = obj["external_references"]
+                    label = None
+                    for ref in extrefs:
+                        if ref["source_name"] == "mitre-attack":  # and "external_id" in ref:
+                            label = ref["external_id"]
+                            break
+                    if label is not None and "name" in obj:
+                        cname = obj["name"]
+                        label_cname[label] = cname
+        return label_cname
+
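+    # Illustrative note (added for documentation; the technique IDs below are well-known
+    # MITRE ATT&CK examples, not values produced by running this code): the mapping returned
+    # by make_label_category_name_dict looks roughly like
+    #     {"T1055": "Process Injection", "T1566": "Phishing", ...}
+    # with one entry per top-level (non-subtechnique) attack-pattern object.
+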
+    def select_option_cnames(self, k: int, excluded: str, cnames: List[str]) -> List[str]:
+        """
+        Randomly selects k attack technique category names as choices.
+        The name given in "excluded" (usually the correct answer) is never included.
+        If more than k names are available, k - 1 names are sampled and the default "Others"
+        option is appended; otherwise all remaining names are returned.
+        - k: number of choices
+        - excluded: attack technique category name to exclude (usually the correct answer)
+        - cnames: list of all attack technique category names
+        """
+        target_cnames = [v for v in cnames if v != excluded]
+
+        if len(target_cnames) <= k:
+            return target_cnames
+        elif k - 1 <= 0:
+            return [CtiMitreScenario.OTHERS_OPTION]
+        else:
+            ops = self.rand.sample(target_cnames, k - 1)
+            ops.append(CtiMitreScenario.OTHERS_OPTION)
+            return ops
+
+    @staticmethod
+    def bring_others_to_end(references: List[Reference]) -> List[Reference]:
+        """Rearranges the references so that the reference for the default "Others" case comes last."""
+        newref_list: List[Reference] = []
+        others_list: List[Reference] = []
+        for ref in references:
+            if ref.output.text == CtiMitreScenario.OTHERS_OPTION:
+                others_list.append(ref)
+            else:
+                newref_list.append(ref)
+        newref_list.extend(others_list)
+        return newref_list
+
+    def create_multiple_choice_instances(
+        self, df: DataFrame, split: str, label_cname: Dict[str, str]
+    ) -> List[Instance]:
+        """Creates the list of instances for the multiple-choice task."""
+        instances = []
+        for idx in df.index:
+            linedata = df.loc[idx]
+            sent = linedata["sentence"]
+            label_tec = linedata["label_tec"]
+            correct_cname = label_cname[label_tec]
+            all_cnames = list(label_cname.values())
+            num_of_wrong_options = self.num_options - 1
+            wrong_cnames = self.select_option_cnames(num_of_wrong_options, correct_cname, all_cnames)
+            instance_input = Input(text=sent)
+            # Create the options (including the one correct answer)
+            correct_ref = Reference(Output(text=correct_cname), tags=[CORRECT_TAG])
+            references = [Reference(Output(text=cname), tags=[]) for cname in wrong_cnames]
+            references.append(correct_ref)
+            # Shuffle the answer options
+            self.rand.shuffle(references)
+            # Bring the "Others" option to the end of the reference list
+            ord_references = CtiMitreScenario.bring_others_to_end(references)
+            instance = Instance(instance_input, ord_references, split=split)
+            instances.append(instance)
+        return instances
+
+    def create_instances(self, df: DataFrame, split: str, label_cname: Dict[str, str]) -> List[Instance]:
+        return self.create_multiple_choice_instances(df, split, label_cname)
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        self.data_dir = os.path.join(output_path, "data")
+
+        # Download the dataset
+        self.download_dataset()
+
+        # Download the MITRE ATT&CK v10 information
+        self.download_MITRE_info()
+
+        # Load the dataset
+        all_data_path = os.path.join(self.data_dir, self.dataset_all_name)
+        all_df = pd.read_csv(all_data_path)
+
+        # Split all_df into train and test data frames
+        train_df = all_df.sample(frac=CtiMitreScenario.train_ratio, random_state=0)
+        test_df = all_df.drop(train_df.index).sample(frac=1, random_state=0)
+
+        # Load the MITRE info JSON data
+        label_name_json = os.path.join(
+            self.data_dir, self.mitre_dir, self.enterprise_attack_dir, self.enterprise_attack_json
+        )
+        with open(label_name_json, encoding="utf-8") as f:
+            jdata = json.load(f)
+
+        # Make the mapping from label_tec to the attack technique category name
+        label_cname = self.make_label_category_name_dict(jdata)
+
+        # Create instances from each split
+        instances_train = self.create_instances(train_df, TRAIN_SPLIT, label_cname)
+        instances_test = self.create_instances(test_df, TEST_SPLIT, label_cname)
+
+        # Return all instances
+        all_instances = []
+        all_instances.extend(instances_train)
+        all_instances.extend(instances_test)
+        return all_instances
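+
+
+# Illustrative usage sketch (added for documentation; the output path below is an arbitrary
+# example and is normally supplied by the HELM runner):
+#
+#     scenario = CtiMitreScenario(num_options=5, seed=0)
+#     instances = scenario.get_instances("scratch/cti_mitre")
+#
+# Each resulting instance pairs one threat description with five answer references:
+# the correct category, three random distractors, and the default "Others" option last.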
diff --git a/src/helm/benchmark/scenarios/echr_judge_scenario.py b/src/helm/benchmark/scenarios/echr_judge_scenario.py
new file mode 100644
index 00000000000..681b9659a15
--- /dev/null
+++ b/src/helm/benchmark/scenarios/echr_judge_scenario.py
@@ -0,0 +1,183 @@
+import os
+import glob
+import json
+from typing import Dict, List, Optional
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from .scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class EchrJudgeScenario(Scenario):
+    """
+    Task:
+    - This scenario is a binary classification task.
+    - It classifies a human rights case description as violation or no violation.
+
+    Dataset:
+    - EN_train, EN_dev, EN_test (these datasets are downloaded).
+    - They are used as TRAIN_SPLIT, VALID_SPLIT, and TEST_SPLIT, respectively.
+    - Each dataset is a set of JSON files containing at least the TEXT and VIOLATED_ARTICLES fields.
+      - The TEXT field contains the sentences of the case.
+      - VIOLATED_ARTICLES lists the violated articles; an empty list means no violation.
+
+    Prompt:
+    ------
+    Is the following case a violation of human rights? (Instructions)
+
+    Case: Human rights have not been violated. (Trivial No case in instructions)
+    Answer: No
+
+    Case: Human rights have been violated. (Trivial Yes case in instructions)
+    Answer: Yes
+
+    Case: (In-context examples, if possible)
+    Answer: