diff --git a/README.md b/README.md index 5fded050175..27d4ecbe04e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,30 @@ +# Notes on this forked version +This is a fork from the original HELM for a study of enterprise benchmarking of LLMs using domain-specific datasets. + +The following scenarios are added. Please refer to the docstring of the source code of each scenario, or the page shown by `helm-server` for the details. +- Finance + - financial_phrasebank + - kpi_edgar + - conv_fin_qa + - news_headline +- Legal + - legal_opinion + - echr_judge + - casehold_qa + - legal_contract +- Climate + - sumosum +- Cyber security + - cti_mitre + +The following metrics are added or modified. +- kpi_edgar_metrics +- classification_metrics (weighted_f1) +- basic_metrics (float_equiv, a bug fix for f1_score) + +This study will be published elsewhere. +- Citation: TBD # Holistic Evaluation of Language Models diff --git a/requirements.txt b/requirements.txt index f99b82972d4..fa120952c5f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,332 +1,196 @@ -# This file is automatically generated by GitHub Actions and contains pinned versions for all transitive Python dependencies. Do not modify this file! -about-time==4.2.1 -absl-py==2.1.0 -accelerate==0.34.2 -ai2-olmo==0.5.0 -ai2-olmo-core==0.1.0 -aiodns==3.2.0 -aiohappyeyeballs==2.4.0 -aiohttp==3.10.6 +2captcha-python==1.1.3 +absl-py==1.2.0 +aiodns==3.0.0 +aiohttp==3.8.5 aiohttp-retry==2.8.3 -aiosignal==1.3.1 +aiosignal==1.2.0 aleph-alpha-client==2.14.0 -alive-progress==3.1.5 -annotated-types==0.7.0 -anthropic==0.34.2 -antlr4-python3-runtime==4.9.3 -anyio==4.6.0 -appdirs==1.4.4 -astunparse==1.6.3 -async-timeout==4.0.3 -attrs==24.2.0 -autograd==1.7.0 -autokeras==1.0.20 -awscli==1.29.85 -beautifulsoup4==4.12.3 -black==24.3.0 -blis==0.7.11 -boto3==1.28.85 -botocore==1.31.85 -bottle==0.12.25 -cached_path==1.6.3 -cachetools==5.5.0 -catalogue==2.0.10 +anthropic==0.2.5 +async-generator==1.10 +async-timeout==4.0.2 +attrs==22.1.0 +beautifulsoup4==4.11.1 +bert-score==0.3.13 +bitarray==2.7.3 +black==22.10.0 +blanc==0.2.7 +blis==0.7.8 +boto3==1.24.89 +botocore==1.27.89 +bottle==0.12.23 +cachetools==5.2.0 +catalogue==2.0.8 cattrs==22.2.0 -certifi==2024.8.30 -cffi==1.17.1 -cfgv==3.4.0 -charset-normalizer==3.3.2 -chex==0.1.86 -click==8.1.7 -clip-anytorch==2.5.2 -cloudpathlib==0.19.0 -cma==3.2.2 -cohere==5.3.5 -colorama==0.4.4 -colorcet==3.0.1 -coloredlogs==15.0.1 -confection==0.1.5 -contourpy==1.3.0 -cycler==0.12.1 -cymem==2.0.8 -dacite==1.8.1 -data==0.4 -datasets==2.21.0 -decorator==5.1.1 -Deprecated==1.2.14 -diffusers==0.24.0 -dill==0.3.8 -distlib==0.3.8 -distro==1.9.0 -dnspython==2.6.1 -docker-pycreds==0.4.0 -docstring_parser==0.16 -docutils==0.16 -einops==0.7.0 -einops-exts==0.0.4 -etils==1.5.2 -eval_type_backport==0.2.0 -exceptiongroup==1.2.2 -fairlearn==0.9.0 -fastavro==1.9.7 -filelock==3.13.1 +certifi==2023.7.22 +cffi==1.15.1 +cfgv==3.3.1 +charset-normalizer==2.1.1 +click==8.0.4 +colorama==0.4.5 +contourpy==1.0.5 +cycler==0.11.0 +cymem==2.0.6 +Cython==0.29.32 +dacite==1.6.0 +datasets==2.14.7 +dill==0.3.5.1 +distlib==0.3.6 +emoji==2.1.0 +et-xmlfile==1.1.0 +exceptiongroup==1.1.0 +filelock==3.8.0 flake8==5.0.4 -flatbuffers==24.3.25 -flax==0.6.11 -fonttools==4.54.1 -frozenlist==1.4.1 -fsspec==2024.2.0 -ftfy==6.1.3 -funcsigs==1.0.2 -future==1.0.0 -gast==0.6.0 -gdown==5.2.0 -gitdb==4.0.11 -GitPython==3.1.43 -google-api-core==2.20.0 -google-api-python-client==2.147.0 -google-auth==2.35.0 -google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.68.0 
-google-cloud-bigquery==3.25.0 -google-cloud-core==2.4.1 -google-cloud-resource-manager==1.12.5 -google-cloud-storage==2.18.2 -google-cloud-translate==3.11.3 -google-crc32c==1.6.0 -google-pasta==0.2.0 -google-resumable-media==2.7.2 -googleapis-common-protos==1.65.0 -grapheme==0.6.0 -grpc-google-iam-v1==0.13.1 -grpcio==1.66.1 -grpcio-status==1.62.3 -gunicorn==23.0.0 +fonttools==4.37.4 +frozenlist==1.3.1 +fsspec==2023.4.0 +gdown==4.4.0 +gevent==21.12.0 +gin-config==0.5.0 +google-api-core==2.10.1 +google-api-python-client==2.64.0 +google-auth==2.12.0 +google-auth-httplib2==0.1.0 +google-cloud-aiplatform==1.36.4 +googleapis-common-protos==1.56.4 +greenlet==1.1.3 +gunicorn==20.1.0 h11==0.14.0 -h5py==3.12.0 -html2text==2024.2.26 -httpcore==1.0.5 -httplib2==0.22.0 -httpx==0.25.2 -httpx-sse==0.4.0 -huggingface-hub==0.23.5 -humanfriendly==10.0 -humanize==4.10.0 +httplib2==0.20.4 +huggingface-hub==0.16.4 icetk==0.0.4 -identify==2.6.1 -idna==3.10 -ImageHash==4.3.1 -imageio==2.35.1 -importlib-resources==5.13.0 -importlib_metadata==8.5.0 -iniconfig==2.0.0 -jax==0.4.30 -jaxlib==0.4.30 -jieba==0.42.1 -Jinja2==3.1.3 -jiter==0.5.0 +identify==2.5.6 +idna==3.4 +importlib-metadata==6.0.0 +importlib-resources==5.10.0 +iniconfig==1.1.1 +Jinja2==3.1.2 jmespath==1.0.1 -joblib==1.4.2 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -jstyleson==0.0.2 -keras==3.5.0 -keras-tuner==1.4.7 -kiwisolver==1.4.7 -kt-legacy==1.0.5 -langcodes==3.4.1 +joblib==1.2.0 +kiwisolver==1.4.4 +langcodes==3.3.0 langdetect==1.0.9 -language_data==1.2.0 -latex==0.7.0 -lazy_loader==0.4 -libclang==18.1.1 -lightning-utilities==0.11.7 -llvmlite==0.43.0 -lpips==0.1.4 -lxml==5.3.0 -Mako==1.3.5 -marisa-trie==1.2.0 -Markdown==3.7 -markdown-it-py==3.0.0 -MarkupSafe==2.1.5 -matplotlib==3.6.3 +llvmlite==0.39.1 +lxml==4.9.1 +Mako==1.2.3 +MarkupSafe==2.1.1 +matplotlib==3.6.0 mccabe==0.7.0 -mdurl==0.1.2 -mistralai==0.0.12 -ml-dtypes==0.4.1 +moverscore==1.0.3 mpmath==1.3.0 -msgpack==1.1.0 -multidict==6.1.0 -multilingual-clip==1.0.10 -multiprocess==0.70.16 -murmurhash==1.0.10 +multidict==6.0.2 +multiprocess==0.70.13 +murmurhash==1.0.8 mypy==1.5.1 mypy-extensions==1.0.0 -namex==0.0.8 -natsort==8.4.0 -nest-asyncio==1.6.0 -networkx==3.2.1 -ninja==1.11.1.1 -nltk==3.8.1 -nncf==2.13.0 -nodeenv==1.9.1 -NudeNet==2.0.9 -numba==0.60.0 -numpy==1.26.3 -omegaconf==2.3.0 -onnx==1.16.2 -onnxruntime==1.19.2 -open_clip_torch==2.26.1 -openai==1.48.0 -OpenCC==1.1.6 -opencv-python==4.7.0.72 -opencv-python-headless==4.10.0.84 -openvino==2024.4.0 -openvino-telemetry==2024.1.0 -openvino-tokenizers==2024.4.0.0 -opt-einsum==3.3.0 -optax==0.2.3 -optimum==1.22.0 -optimum-intel==1.19.0 -optree==0.12.1 -orbax-checkpoint==0.6.4 -orjson==3.10.7 -outcome==1.3.0.post0 -packaging==24.1 -pandas==2.2.3 -param==2.1.1 -parameterized==0.9.0 -pathspec==0.12.1 -pathtools==0.1.2 -pdf2image==1.16.3 -pillow==10.4.0 -platformdirs==4.3.6 -pluggy==1.5.0 -portalocker==2.10.1 +networkx==2.8.7 +nltk==3.7 +nodeenv==1.7.0 +numba==0.56.4 +numpy==1.23.3 +openai==0.27.8 +openpyxl==3.0.10 +outcome==1.2.0 +packaging==21.3 +pandas==1.5.0 +pandas-stubs==1.5.0.221003 +parameterized==0.8.1 +pathspec==0.10.1 +pathy==0.10.2 +Pillow==9.3.0 +platformdirs==2.5.2 +pluggy==1.0.0 +portalocker==2.5.1 pre-commit==2.20.0 -preshed==3.0.9 -progressbar2==4.5.0 -proto-plus==1.24.0 -protobuf==4.25.5 -psutil==6.0.0 -pyarrow==17.0.0 -pyarrow-hotfix==0.6 -pyasn1==0.6.1 -pyasn1_modules==0.4.1 -pycares==4.4.0 -pycocoevalcap==1.2 -pycocotools==2.0.8 +preshed==3.0.7 +protobuf==3.20.2 +psutil==5.9.2 +pyarrow==11.0.0 +pyasn1==0.4.8 
+pyasn1-modules==0.2.8 +pycares==4.3.0 pycodestyle==2.9.1 -pycparser==2.22 -pyct==0.5.0 -pydantic==2.9.2 -pydantic_core==2.23.4 -pydload==1.0.9 -pydot==2.0.0 +pycparser==2.21 +pydantic==1.8.2 +pyemd==0.5.1 +pyext==0.7 pyflakes==2.5.0 -Pygments==2.18.0 -pyhocon==0.3.61 -pymongo==4.9.1 -pymoo==0.6.1.3 -pyonmttok==1.37.0 -pyparsing==3.1.4 -pypinyin==0.49.0 +pyhocon==0.3.59 +pymongo==4.2.0 +pyparsing==2.4.7 PySocks==1.7.1 -pytest==7.2.2 -pythainlp==5.0.0 +pytest==7.2.0 python-dateutil==2.8.2 -python-utils==3.9.0 -pytorch-fid==0.3.0 -pytorch-lightning==2.0.9.post0 -pytrec_eval==0.5 -pytz==2024.2 -PyWavelets==1.6.0 -PyYAML==6.0.2 -referencing==0.35.1 -regex==2024.9.11 -reka-api==2.0.0 -requests==2.32.3 +pytorch-pretrained-bert==0.6.2 +pytrec-eval==0.5 +pytz==2022.4 +PyYAML==6.0 +regex==2022.9.13 +requests==2.31.0 +responses==0.18.0 retrying==1.3.4 -rich==13.8.1 -rouge_score==0.1.2 -rpds-py==0.20.0 -rsa==4.7.2 -s3transfer==0.7.0 +rouge-score==0.1.2 +rsa==4.9 +s3transfer==0.6.0 sacrebleu==2.2.1 -safetensors==0.4.5 -scaleapi==2.13.1 -scikit-image==0.24.0 -scikit-learn==1.5.2 -scipy==1.13.1 -seaborn==0.11.2 -selenium==4.17.2 -sentencepiece==0.1.99 -sentry-sdk==2.14.0 -setproctitle==1.3.3 -shapely==2.0.6 -shellingham==1.5.4 -shutilwhich==1.1.0 -simple-slurm==0.2.7 +sacremoses==0.0.53 +scaleapi==2.13.0 +scikit-learn==1.1.2 +scipy==1.10.0 +selenium==4.8.0 +sentencepiece==0.1.97 +simple-slurm==0.2.6 six==1.16.0 -smart-open==7.0.4 -smmap==5.0.1 -sniffio==1.3.1 +smart-open==5.2.1 +sniffio==1.3.0 sortedcontainers==2.4.0 -soupsieve==2.6 -spacy==3.7.6 +soupsieve==2.3.2.post1 +spacy==3.5.4 spacy-legacy==3.0.12 -spacy-loggers==1.0.5 +spacy-loggers==1.0.3 sqlitedict==1.7.0 -srsly==2.4.8 -surge-api==1.1.4 +srsly==2.4.4 +stanza==1.4.2 +summ-eval==0.892 +surge-api==1.1.0 sympy==1.11.1 tabulate==0.9.0 -tempdir==0.7.1 -tensorboard==2.17.1 -tensorboard-data-server==0.7.2 -tensorflow==2.17.0 -tensorflow-io-gcs-filesystem==0.37.1 -tensorstore==0.1.65 -termcolor==2.4.0 -thinc==8.2.5 -threadpoolctl==3.5.0 -tifffile==2024.8.30 -tiktoken==0.7.0 -timm==0.6.13 -together==1.2.13 -tokenizers==0.19.1 +thinc==8.1.12 +threadpoolctl==3.1.0 +tiktoken==0.3.3 +tls-client==0.1.8 +tokenizers==0.13.3 toml==0.10.2 tomli==2.0.1 -toolz==0.12.1 -torch~=2.2.2 -torch-fidelity==0.3.0 -torchmetrics==0.11.4 -torchvision~=0.17.2 -tqdm==4.66.5 -transformers==4.44.2 -transformers-stream-generator==0.0.5 -trio==0.26.2 -trio-websocket==0.11.1 -typer==0.12.5 -types-requests==2.31.0.6 -types-urllib3==1.26.25.14 -typing_extensions==4.12.2 -tzdata==2024.2 +torch==1.12.1 ; sys_platform == "darwin" +torchvision==0.13.1 ; sys_platform == "darwin" +torch==1.12.1+cu113 ; sys_platform == "linux" +torchvision==0.13.1+cu113 ; sys_platform == "linux" +tqdm==4.64.1 +transformers==4.33.1 +trio==0.22.0 +trio-websocket==0.9.2 +typer==0.4.2 +types-Pillow==9.3.0.4 +types-pytz==2022.4.0.0 +types-redis==4.3.21.1 +types-requests==2.28.11.2 +types-tabulate==0.9.0.0 +types-urllib3==1.26.25 +typing==3.7.4.3 +typing_extensions==4.4.0 uncertainty-calibration==0.1.4 -Unidecode==1.3.6 +undetected-chromedriver==3.2.1 uritemplate==4.1.1 -urllib3==1.26.20 -virtualenv==20.26.5 -wandb==0.13.11 -wasabi==1.1.3 -wcwidth==0.2.13 -weasel==0.4.1 -websocket-client==1.3.3 -Werkzeug==3.0.4 -wrapt==1.16.0 +urllib3==1.26.12 +virtualenv==20.16.5 +wasabi==0.10.1 +websocket-client==1.3.2 +websockets==10.4 wsproto==1.2.0 xlrd==2.0.1 -xxhash==3.5.0 -yarl==1.12.1 -zipp==3.20.2 +xxhash==3.0.0 +yarl==1.8.1 +zipp==3.11.0 +zope.event==4.5.0 +zope.interface==5.4.0 zstandard==0.18.0 diff --git a/setup.cfg 
b/setup.cfg index 85bf4e10ba3..11997b42ba4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = crfm-helm -version = 0.5.4 +version = 0.4.0 author = Stanford CRFM author_email = contact-crfm@stanford.edu description = Benchmark for language models @@ -9,16 +9,13 @@ long_description_content_type = text/markdown keywords = language models benchmarking license = Apache License 2.0 classifiers = - Programming Language :: Python :: 3 Programming Language :: Python :: 3 :: Only - Programming Language :: Python :: 3.9 - Programming Language :: Python :: 3.10 - Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.8 License :: OSI Approved :: Apache Software License url = https://github.com/stanford-crfm/helm [options] -python_requires = >=3.9,<3.12 +python_requires = >=3.8,<3.11 package_dir = =src packages = find: @@ -27,59 +24,59 @@ include_package_data = True install_requires= # Common - cattrs~=22.2 - dacite~=1.6 - importlib-resources~=5.10 - Mako~=1.2 - numpy~=1.23 + cattrs~=22.2.0 + dacite~=1.6.0 + importlib-resources~=5.10.0 + Mako~=1.2.3 + numpy~=1.23.3 pyhocon~=0.3.59 - retrying~=1.3 - spacy~=3.5 - tqdm~=4.64 + retrying~=1.3.4 + spacy~=3.5.3 + tqdm~=4.64.1 zstandard~=0.18.0 # sqlitedict==2.0.0 is slow! https://github.com/RaRe-Technologies/sqlitedict/issues/152 # Keep sqlitedict version at 1.7.0. - sqlitedict~=1.7 + sqlitedict~=1.7.0 bottle~=0.12.23 # Basic Scenarios - datasets~=2.17 + datasets~=2.14.7 pyarrow>=11.0.0 # Pinned transitive dependency for datasets; workaround for #1026 - pyarrow-hotfix~=0.6 # Hotfix for CVE-2023-47248 # Basic metrics - nltk~=3.7,<3.8.2 # See https://github.com/stanford-crfm/helm/issues/2926 + nltk~=3.7 + pyext~=0.7 rouge-score~=0.1.2 - scipy~=1.10 + scipy~=1.10.0 uncertainty-calibration~=0.1.4 - scikit-learn~=1.1 + scikit-learn~=1.1.2 # Models and Metrics Extras - transformers~=4.40 # For anthropic_client, vision_language.huggingface_vlm_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics) + transformers~=4.33.1 # For anthropic_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics) # TODO: Upgrade torch - we need > 2.0.0 for newer versions of transformers - torch>=1.13.1,<3.0.0 # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics) - torchvision>=0.14.1,<3.0.0 # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics) + torch>=1.12.1,<3.0.0 # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics) + torchvision>=0.13.1,<3.0.0 # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics) + + # Metrics Extras + google-api-python-client~=2.64.0 # For perspective_api_client via toxicity_metrics [options.extras_require] proxy-server = - gunicorn>=20.1 + gunicorn~=20.1.0 human-evaluation = scaleapi~=2.13.0 surge-api~=1.1.0 scenarios = - gdown~=5.1 # For disinformation_scenario, med_mcqa_scenario, med_qa_scenario: used by ensure_file_downloaded() + gdown~=4.4.0 # For disinformation_scenario, med_mcqa_scenario, med_qa_scenario: used by ensure_file_downloaded() sympy~=1.11.1 # For numeracy_scenario xlrd~=2.0.1 # For ice_scenario: used by pandas.read_excel() metrics = - google-api-python-client~=2.64 # For perspective_api_client via toxicity_metrics - numba~=0.56 # For copyright_metrics + numba~=0.56.4 # For copyright_metrics pytrec_eval==0.5 # For ranking_metrics sacrebleu~=2.2.1 # For 
disinformation_metrics, machine_translation_metrics - -summarization = summ-eval~=0.892 # For summarization_metrics plots = @@ -87,9 +84,6 @@ plots = matplotlib~=3.6.0 seaborn~=0.11.0 -decodingtrust = - fairlearn~=0.9.0 - slurm = simple-slurm~=0.2.6 @@ -101,174 +95,41 @@ cleva = langdetect==1.0.9 images = - crfm-helm[accelerate] - pillow~=10.2 + accelerate~=0.23.0 # For the newer versions of Transformers + pillow~=9.4.0 mongo = - pymongo~=4.2 - -unitxt = - evaluate~=0.4.1 - -bhasa = - pythainlp==5.0.0 - pyonmttok==1.37.0 - sacrebleu~=2.2.1 + pymongo~=4.2.0 # Model extras -accelerate = - accelerate~=0.25 - aleph-alpha = aleph-alpha-client~=2.14.0 - tokenizers>=0.13.3 - -openvino = - optimum[openvino]~=1.19 - -allenai = - ai2-olmo~=0.2 - -amazon = - boto3~=1.28.57 - awscli~=1.29.57 - botocore~=1.31.57 + tokenizers~=0.13.3 anthropic = - anthropic~=0.17 + anthropic~=0.2.5 websocket-client~=1.3.2 # For legacy stanford-online-all-v4-s3 -cohere = - cohere~=5.3 - -mistral = - mistralai~=0.0.11 - openai = - openai~=1.0 - tiktoken~=0.7 - pydantic~=2.0 # For model_dump(mode="json") - openai only requires pydantic>=1.9.0 + openai~=0.27.8 + tiktoken~=0.3.3 google = - google-cloud-aiplatform~=1.48 + google-cloud-aiplatform~=1.36.4 -together = - together~=1.1 +tsinghua = + icetk~=0.0.4 yandex = sentencepiece~=0.1.97 models = - crfm-helm[ai21] - crfm-helm[accelerate] crfm-helm[aleph-alpha] - crfm-helm[allenai] - crfm-helm[amazon] crfm-helm[anthropic] - crfm-helm[cohere] crfm-helm[google] - crfm-helm[mistral] crfm-helm[openai] - crfm-helm[reka] - crfm-helm[together] + crfm-helm[tsinghua] crfm-helm[yandex] - crfm-helm[openvino] - -reka = - reka-api~=2.0.0 - -vlm = - crfm-helm[openai] - - # For OpenFlamingo - einops~=0.7.0 - einops-exts~=0.0.4 - open-clip-torch~=2.24 - - # For IDEFICS - torch~=2.1 - - # For Qwen: https://github.com/QwenLM/Qwen-VL/blob/master/requirements.txt - transformers_stream_generator~=0.0.4 - scipy~=1.10 - torchvision>=0.14.1,<3.0.0 - - # For Reka AI - crfm-helm[reka] - - # VLM scenarios - crfm-helm[images] - crfm-helm[image2struct] - - # For metrics - pycocoevalcap~=1.2 - -image2struct = - crfm-helm[images] - - # Latex - # You will need to install LaTeX separately. - # You can run `sudo apt-get install texlive-full` on Ubuntu. - latex~=0.7.0 - pdf2image~=1.16.3 - - # Webpage - # You will need install Jekyll separately. - selenium~=4.17.2 - html2text~=2024.2.26 - - # Metrics - opencv-python~=4.7.0.68 - lpips~=0.1.4 - imagehash~=4.3.1 # for caching - -heim = - # HEIM scenarios - gdown~=5.1 - - # HEIM models - diffusers~=0.24.0 - icetk~=0.0.4 - jax~=0.4.13 - jaxlib~=0.4.13 - crfm-helm[openai] - - # For model, kakaobrain/mindall-e - einops~=0.7.0 - omegaconf~=2.3.0 - pytorch-lightning~=2.0.5 - - # For model, craiyon/dalle-mini and craiyon/dalle-mega - flax~=0.6.11 - ftfy~=6.1.1 - Unidecode~=1.3.6 - wandb~=0.13.11 - - # HEIM perturbations - google-cloud-translate~=3.11.2 - - # HEIM metrics - autokeras~=1.0.20 - clip-anytorch~=2.5.0 - google-cloud-storage~=2.9 - lpips~=0.1.4 - multilingual-clip~=1.0.10 - NudeNet~=2.0.9 - opencv-python~=4.7.0.68 - pytorch-fid~=0.3.0 - tensorflow~=2.11 - timm~=0.6.12 - torch-fidelity~=0.3.0 - torchmetrics~=0.11.1 - - # Transitive dependency of NudeNet - # This needs to be a version that provides wheels for all Python versions - # supported by crfm-helm i.e. Python 3.9, 3.10, 3.11, 3.12 - # Disallow version 0.23.* because it has no Python 3.9 wheels. 
- scikit-image>=0.22,==0.*,!=0.23.* - - # Shared image dependencies - crfm-helm[images] # Install everything all = @@ -277,29 +138,20 @@ all = crfm-helm[scenarios] crfm-helm[metrics] crfm-helm[plots] - crfm-helm[decodingtrust] crfm-helm[slurm] crfm-helm[cleva] crfm-helm[images] crfm-helm[models] crfm-helm[mongo] - crfm-helm[heim] - crfm-helm[vlm] - crfm-helm[bhasa] - # crfm-helm[dev] is excluded because end-users don't need it. - # crfm-helm[summarize] is excluded because it requires torch<2.0 - # TODO(#2280): Add crfm-helm[summarize] back. # Development only # Do not include in all dev = pytest~=7.2.0 + black~=22.10.0 + mypy~=1.5.1 pre-commit~=2.20.0 - # Errors produced by type checkers and linters are very version-specific - # so they are pinned to an exact version. - black==24.3.0 - mypy==1.5.1 - flake8==5.0.4 + flake8~=5.0.4 [options.entry_points] console_scripts = @@ -318,11 +170,7 @@ exclude = # Settings for Flake8: Tool For Style Guide Enforcement [flake8] max-line-length = 120 -exclude = - venv/* - src/helm/clients/image_generation/dalle_mini/* - src/helm/clients/image_generation/mindalle/* - src/helm/clients/vision_language/open_flamingo/* +exclude = venv/* # Ignore completely: # E203 - White space before ':', (conflicts with black) @@ -340,24 +188,12 @@ check_untyped_defs = True disable_error_code = annotation-unchecked # TODO: Change disallow_untyped_defs to True disallow_untyped_defs = False -exclude = dalle_mini|mindalle|open_flamingo [tool:pytest] addopts = - # By default: - # - we don't test models because doing so will - # make real requests and spend real money - # - we don't test scenarios because these will - # download files, which is slow, consumes disk - # space, and increases the chance of spurious - # test failures due to failed downloads. 
- # - # For more documentation on pytest markers, see: - # - https://docs.pytest.org/en/latest/how-to/mark.html#mark - # - https://docs.pytest.org/en/latest/example/markers.html#mark-examples - -m 'not models and not scenarios' + # By default, we don't test models because doing so will + # make real requests and spend real money + -m 'not models' markers = - # Marker for model tests that make real model requests + # Marker for tests that make real model requests models - # Marker for scenario tests that download files - scenarios diff --git a/src/helm/benchmark/metrics/basic_metrics.py b/src/helm/benchmark/metrics/basic_metrics.py index 03d6c113f48..8b6371ca74f 100644 --- a/src/helm/benchmark/metrics/basic_metrics.py +++ b/src/helm/benchmark/metrics/basic_metrics.py @@ -1,19 +1,24 @@ -from collections import defaultdict import math -from dataclasses import dataclass -from typing import List, Dict, Set +from dataclasses import dataclass, replace +from typing import List, Callable, Optional, Dict, Tuple, Set, cast from urllib.parse import unquote +from functools import partial +import json +import string +import nltk import numpy as np +import re import scipy import calibration as cal -from helm.benchmark.adaptation.scenario_state import ScenarioState -from helm.benchmark.metrics.evaluate_reference_metrics import compute_reference_metrics -from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric -from helm.benchmark.metrics.reference_metric import ReferenceMetric +import importlib_resources as resources +from nltk.metrics.scores import f_measure +from nltk.tokenize import word_tokenize +from nltk.translate.bleu_score import sentence_bleu +from rouge_score import rouge_scorer from helm.common.hierarchical_logger import hlog -from helm.common.request import Token, GeneratedOutput +from helm.common.request import Token, Sequence from helm.benchmark.adaptation.adapters.adapter_factory import ( ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED, @@ -24,11 +29,200 @@ from helm.benchmark.window_services.window_service import WindowService from helm.benchmark.window_services.window_service_factory import WindowServiceFactory from helm.benchmark.window_services.tokenizer_service import TokenizerService -from helm.benchmark.scenarios.scenario import CORRECT_TAG, Instance -from .metric import Metric, MetricInterface, MetricResult, add_context, get_unique_stat_by_name -from .metric_name import MetricContext, MetricName +from helm.benchmark.scenarios.scenario import CORRECT_TAG, Instance, Reference +from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought +from helm.benchmark.scenarios.conv_fin_qa_scenario import float_equiv +from helm.benchmark.scenarios.code_scenario import CodeReference +from helm.benchmark.metrics.cleva_metrics_helper import ChineseTokenizer +from . 
import code_metrics_helper +from .metric import Metric, get_unique_stat_by_name +from .metric_name import MetricName from .metric_service import MetricService -from .statistic import Stat, merge_stat +from .statistic import Stat + + +try: + nltk.data.find("tokenizers/punkt") +except LookupError: + nltk.download("punkt") # Required for rouge + + +EFFICIENCY_DATA_PACKAGE: str = "helm.benchmark.efficiency_data" + +INFERENCE_IDEALIZED_RUNTIMES_JSON_FILENAME: str = "inference_idealized_runtimes.json" +INFERENCE_DENOISED_RUNTIMES_JSON_FILENAME: str = "inference_denoised_runtimes.json" +TRAINING_EFFICIENCY_JSON_FILENAME: str = "training_efficiency.json" + + +def compute_estimated_time_from_prompt_size_and_num_output_tokens( + request_state: RequestState, + inference_runtimes_dict: Dict[str, Dict], + num_prompt_tokens: int, + num_output_tokens: int, +) -> Optional[float]: + estimated_runtime: Optional[float] + if request_state.request.model_deployment in inference_runtimes_dict: + inference_runtimes_dict_for_model = inference_runtimes_dict[request_state.request.model_deployment] + runtime_per_output_token: float = inference_runtimes_dict_for_model["runtime_per_output_token"] + raw_runtimes_for_prompt_tokens: Dict[str, float] = inference_runtimes_dict_for_model[ + "runtime_for_prompt_tokens" + ] + runtimes_for_prompt_tokens: Dict[int, float] = {int(k): v for (k, v) in raw_runtimes_for_prompt_tokens.items()} + + runtime_for_prompt_tokens: Optional[float] = None + largest_num_tokens_in_efficiency_dict: int = max(runtimes_for_prompt_tokens.keys()) + # Find the smallest num_prompt_tokens larger than the number of tokens in the given prompt, + # then scale runtime in dict by (num_prompt_tokens / key) to get more accurate estimate: we + # assume that we can encode the prompt at the same throughput as the smallest key larger than + # num_prompt_tokens, and number of compute operations scales linearly with num_prompt_tokens. + for key in sorted(runtimes_for_prompt_tokens.keys()): + if num_prompt_tokens <= key: + runtime_for_prompt_tokens = runtimes_for_prompt_tokens[key] * (num_prompt_tokens / key) + break + # If number of tokens in the prompt exceeds the largest key in the efficiency dict, then + # estimate the prompt encoding time by linearly scaling up the runtime for the largest + # key (this is reasonably accurate under certain simplifying assumptions). + if runtime_for_prompt_tokens is None: + runtime_for_prompt_tokens = runtimes_for_prompt_tokens[largest_num_tokens_in_efficiency_dict] * ( + num_prompt_tokens / largest_num_tokens_in_efficiency_dict + ) + overhead: Optional[float] = inference_runtimes_dict_for_model.get("overhead") + + # Idealized runtime is sum of the runtime of encoding the input tokens, the runtime of + # generating `num_output_tokens` (`runtime_per_output_token` * (`num_output_tokens` - 1)) + # if number of output tokens is greater than 0, otherwise just `runtime_for_prompt_tokens`, + # and the overhead if available. + estimated_runtime = runtime_for_prompt_tokens + if num_output_tokens > 0: + estimated_runtime += runtime_per_output_token * (num_output_tokens - 1) + # Add overhead if it is available. + if overhead is not None: + estimated_runtime += overhead + else: + estimated_runtime = None + + return estimated_runtime + + +def pass_at_k_estimator(n: int, c: int, k: int) -> float: + """Calculates 1 - comb(n - c, k) / comb(n, k). 
+ + Numerically stable version defined in + https://arxiv.org/pdf/2107.03374.pdf + """ + if n - c < k: + return 1.0 + return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) + + +def normalize_text(text: str) -> str: + """Lower text and remove punctuation, articles and extra whitespace. + Copied from the [QuAC](http://quac.ai/) evaluation script found at + https://s3.amazonaws.com/my89public/quac/scorer.py""" + + def remove_articles(text: str) -> str: + return re.sub(r"\b(a|an|the)\b", " ", text) + + def white_space_fix(text: str) -> str: + return " ".join(text.split()) + + def remove_punc(text: str) -> str: + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text: str) -> str: + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(text)))) + + +def exact_match(gold: str, pred: str) -> float: + if not pred: + return 0 + + return 1 if gold.strip() == pred.strip() else 0 + + +def quasi_exact_match(gold: str, pred: str) -> float: + if not pred: + return 0 + + return 1 if normalize_text(gold) == normalize_text(pred) else 0 + + +def prefix_exact_match(gold: str, pred: str) -> float: + """ + The `prefix_exact_match` metric is particularly useful in the zero-shot setting, where the model is + not given examples of the expected outputs and tends to output more tokens than it should. + + For example, for this zero-shot prompt from BoolQ, + + Passage: Elmendorf Air Force Base (IATA: EDF, ICAO: PAED, FAA LID: EDF) is a United States military facility + in Anchorage, the largest city in Alaska. Originally known as Elmendorf Field, it became Elmendorf Air Force + Base after World War II, and in 2010 it merged with nearby Fort Richardson to form Joint Base Elmendorf-Richardson. + Question: Is there an air force base in anchorage alaska? + Answer: + + the model could output up to `max_tokens` number of tokens "Yes, Elmendorf" instead of just "Yes". + """ + if not pred: + return 0 + + return 1 if pred.strip().startswith(gold.strip()) else 0 + + +def quasi_prefix_exact_match(gold: str, pred: str) -> float: + """ + Same thing as `prefix_exact_match` but we normalize the text before checking if the prefix match. + """ + if not pred: + return 0 + + return 1 if normalize_text(pred).startswith(normalize_text(gold)) else 0 + + +def f1_score(gold: str, pred: str) -> float: + if not pred: # answer is None + return 0.0 + + ret = f_measure(set(normalize_text(gold).split()), set(normalize_text(pred).split())) + if ret is None: # answer is the empty string after normalizing + return 0.0 + + return ret + + +def exact_match_indicator(gold: str, pred: str, indicator: str = " ") -> float: + """ + Exact match, allowing for some preceding context. + For example, the following two answers are considered matching: + - Because of x and y, the answer is ## + - Given reasons y and z, the answer is ## + While the following is considered different from the earlier two + - Given reasons x and a, the answer is ## + """ + pred = pred.split(indicator)[-1].strip() + gold = gold.split(indicator)[-1].strip() + return exact_match(gold, pred) + + +def final_number_exact_match(gold: str, pred: str) -> float: + """ + Returns 1 iff the final number in gold and pred match. + Similar to exact_match_indicator. + Example: + - gold = "The answer is 15." + - pred = "The answer is 15 eggs." 
+ - Returns 1 + """ + + def get_final_number(x: str) -> str: + matches = re.findall(r"-?[\d,]+(?:.\d+)?", x) + if not matches: + return "" + return matches[-1].replace(",", "") + + return exact_match(get_final_number(gold), get_final_number(pred)) def get_num_bytes(tokens: List[Token]) -> int: @@ -80,6 +274,123 @@ def convert_tokens_to_text(tokens: List[Token]) -> List[Dict]: return groups +def rouge_score(gold: str, pred: str, rouge_type: str, scorer: rouge_scorer.RougeScorer) -> float: + scores = scorer.score(gold, pred) + return scores[rouge_type].fmeasure + + +def get_rouge_function(rouge_type: str) -> Callable[[str, str], float]: + scorer = rouge_scorer.RougeScorer([rouge_type], use_stemmer=True) + return partial(rouge_score, scorer=scorer, rouge_type=rouge_type) + + +def bleu_1(gold: str, pred: str) -> float: + return sentence_bleu([word_tokenize(gold)], word_tokenize(pred), weights=(1, 0, 0, 0)) + + +def chinese_bleu_1(gold: str, pred: str) -> float: + char_tokenizer = ChineseTokenizer() + return sentence_bleu([char_tokenizer.tokenize(gold)], char_tokenizer.tokenize(pred), weights=(1, 0, 0, 0)) + + +def get_chinese_rouge_function(rouge_type: str) -> Callable[[str, str], float]: + char_tokenizer = ChineseTokenizer() + scorer = rouge_scorer.RougeScorer([rouge_type], use_stemmer=True, tokenizer=char_tokenizer) + return partial(rouge_score, scorer=scorer, rouge_type=rouge_type) + + +def cleva_math_result_match(gold: str, pred: str) -> float: + """ + Exact match that only cares the last math expression. + Common math expressions are numbers and fractions. + """ + pattern = r"[-+*/%\.\(\)\d]+" + matches = re.findall(pattern, pred) + if matches: + pred = matches[-1].lstrip(")") + # remove space in front or at the end + pred = pred.strip() + return exact_match(gold, pred) + + +def bleu_4(gold: str, pred: str) -> float: + return sentence_bleu([word_tokenize(gold)], word_tokenize(pred), weights=(0, 0, 0, 1)) + + +def extract_set_from_text( + set_str: str, + set_start_str: str = " is ", + set_separator: str = " and ", + empty_set_str: str = "Nothing.", +) -> Set[str]: + """ + Given a string, extract the set of strings implied by that string. 
+ set_start_str denotes the start of the set + set_separator denotes the string separating set elements + empty_set_str is the string which denotes the empty set + """ + if set_str == empty_set_str: + return set() + set_str = set_str.replace(".", "") + extracted_set = set(set_str.split(set_start_str)[-1].split(set_separator)) + return extracted_set + + +def extract_gold_pred_sets(gold: str, pred: str) -> Tuple[Set[str], Set[str]]: + """Extract the set of strings implied by the gold and pred strings""" + gold_set = extract_set_from_text(gold) + pred_set = extract_set_from_text(pred.split("\n")[0]) + return gold_set, pred_set + + +def iou_set_match(gold: str, pred: str) -> float: + """Compute the intersection over union of the gold and pred sets""" + gold_set, pred_set = extract_gold_pred_sets(gold, pred) + if len(gold_set) == 0: # If gold is empty, just check if the pred set is also empty + return float(gold_set == pred_set) + return len(gold_set.intersection(pred_set)) / len(gold_set.union(pred_set)) + + +def f1_set_match(gold: str, pred: str) -> float: + """Compute the F1 score of the gold and pred sets""" + gold_set, pred_set = extract_gold_pred_sets(gold, pred) + if len(gold_set) == 0: # If gold is empty, just check if the pred set is also empty + return float(gold_set == pred_set) + true_positives = gold_set.intersection(pred_set) + return 2 * len(true_positives) / (len(gold_set) + len(pred_set)) + + +def exact_set_match(gold: str, pred: str) -> float: + """Compute whether the sets generated exactly match""" + gold_set, pred_set = extract_gold_pred_sets(gold, pred) + return float(gold_set == pred_set) + + +def absolute_value_difference(gold: str, pred: str) -> float: + """Compute the absolute value of the difference between two numbers (provided as strings), + or 0.0 if invalid input. + """ + + def maybe_int(text: str): + """Parse int, ignoring commas in numbers.""" + try: + val = int(text.replace(",", "")) + except ValueError: + return 0.0 + return val + + gold_val = maybe_int(gold) + pred_val = maybe_int(pred) + return abs(gold_val - pred_val) + + +def code_eval(gold: Tuple[str, Optional[Dict]], pred: str) -> float: + """Evaluate Code Correctness on test examples.""" + assert gold[1] is not None # gold[1]["canonical_solution"] + # Warning: will execute machine generated code; need to sandbox before executing + return float(code_metrics_helper.check_correctness(gold[1], pred, 3.0)["passed"]) # type: ignore + + def compute_perplexity_metrics(stats: Dict[MetricName, Stat]) -> List[Stat]: # TODO: find out the root cause and undo num_X > 0 check # https://github.com/stanford-crfm/benchmarking/issues/350 @@ -104,37 +415,7 @@ def compute_perplexity_metrics(stats: Dict[MetricName, Stat]) -> List[Stat]: return derived_stats -class InstancesPerSplitMetric(MetricInterface): - """Report the average num_instances in each MetricContext across train_trials.""" - - def evaluate( - self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int - ) -> MetricResult: - adapter_spec = scenario_state.adapter_spec - global_stats: Dict[MetricName, Stat] = {} - - for train_trial_index in range(adapter_spec.num_train_trials): - trial_stats: Dict[MetricName, Stat] = {} # Statistics just for this trial - # Group instances in this train_trial by context. 
- instances_per_metric_context: Dict[MetricContext, Set[Instance]] = defaultdict(set) - for request_state in scenario_state.request_states: - if request_state.train_trial_index == train_trial_index: - instances_per_metric_context[MetricContext.from_instance(request_state.instance)].add( - request_state.instance - ) - for context, instance_set in instances_per_metric_context.items(): - stat = Stat(MetricName("num_instances")).add(len(instance_set)) - merge_stat(trial_stats, add_context(stat, context)) - - # We take the mean value for each trial. - for stat in trial_stats.values(): - merge_stat(global_stats, stat.take_mean()) - - # There are no per-instance Stats. - return MetricResult(list(global_stats.values()), []) - - -class BasicGenerationMetric(Metric): +class BasicMetric(Metric): """ Defines basic metrics which don't require domain knowledge. This should be fairly comprehensive already, and we should try to use this as much as possible. @@ -145,11 +426,339 @@ class BasicGenerationMetric(Metric): def __init__(self, names: List[str]): self.names: List[str] = names - self.efficiency_metric = EfficiencyMetric() + + # For Efficiency metrics: + # The `inference_efficiency.json` file contains a `runtime_per_output_token` value + # (the estimated runtime of generating one output token) and a + # `runtime_for_prompt_tokens` dict (a mapping from various num_prompt_tokens values to + # the estimated runtime of encoding a prompt with that many tokens). + # For example: + # "openai/davinci": { + # "runtime_per_output_token": 0.080, + # "runtime_for_prompt_tokens": { + # "1": 0.016, + # "16": 0.018, + # "32": 0.020, + # ... + # + # These runtimes are generated by initializing Megatron with a model of the right size, + # obtaining end-to-end generation times for different numbers of prompt and output tokens, + # and then fitting a linear regression model to the runtimes: the resulting slope is the + # runtime_per_output_token, which is the processing time for generating each output token, + # and the y-intercept is the runtime_for_prompt_tokens, with different values for different + # num_prompt_tokens values. + # Profiling code and logs, and code to fit the regression model is available at + # https://github.com/stanford-crfm/benchmarking_efficiency. + data_package = resources.files(EFFICIENCY_DATA_PACKAGE) + with data_package.joinpath(INFERENCE_IDEALIZED_RUNTIMES_JSON_FILENAME).open("r") as f: + self.inference_idealized_runtimes_dict = json.load(f) + with data_package.joinpath(INFERENCE_DENOISED_RUNTIMES_JSON_FILENAME).open("r") as f: + self.inference_denoised_runtimes_dict = json.load(f) + + # We use estimated emitted CO2 during training (in tons of CO2) as a proxy metric + # for training efficiency. We use reported metrics where applicable, otherwise + # we estimate them from runtime information, type and number of hardware accelerators + # used, region, etc. + with data_package.joinpath(TRAINING_EFFICIENCY_JSON_FILENAME).open("r") as f: + self.training_efficiency_dict = json.load(f) def __repr__(self): return f"BasicMetric({','.join(self.names)})" + def compute_reference_metrics( + self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService + ) -> List[Stat]: + """ + Setup: + + - Gold (correct references): G1 ... Gm + - Predictions (completions): P1 ... Pk + + For each pair (G, P), we can define a ${score} (e.g., exact match, F1, BLEU). 
+ + We define the following stats: + + - ${score}: max_i score(Gi, P1) + - ${score}@k: max_{i,j} score(Gi, Pj) + """ + + def compute_metrics_helper( + name: MetricName, + score_func: Callable, + group: Optional[str] = None, + ) -> List[Stat]: + if name.name == "pass": # Calculate pass@k for HumanEval from CodeScenario. + score_func = cast(Callable[[Tuple[str, Optional[Dict]], str], float], score_func) # Make mypy happy. + code_golds = cast(List[CodeReference], golds) + results = [ + score_func((gold.output.text, gold.test_cases), pred) for gold in code_golds for pred in preds + ] + _len, _sum = len(results), int(sum(results)) # Cast to int to make type match. + score_1 = pass_at_k_estimator(_len, _sum, 1) + score_k = pass_at_k_estimator(_len, _sum, adapter_spec.num_outputs) + elif name.name == "code_eval_acc": + score_func = cast(Callable[[Tuple[str, Optional[Dict]], str], float], score_func) # Make mypy happy. + code_golds = cast(List[CodeReference], golds) + score_1 = max(score_func((gold.output.text, gold.test_cases), preds[0]) for gold in code_golds) + score_k = max( + score_func((gold.output.text, gold.test_cases), pred) for gold in code_golds for pred in preds + ) + else: + score_func = cast(Callable[[str, str], float], score_func) # Make mypy happy. + score_1 = max(score_func(gold.output.text, preds[0]) for gold in golds) + score_k = max(score_func(gold.output.text, pred) for gold in golds for pred in preds) + + metrics = [Stat(name).add(score_1)] # score_1 corresponds using one prediction + if adapter_spec.num_outputs != 1: + metrics.append(Stat(replace(name, name=f"{name.name}@{adapter_spec.num_outputs}")).add(score_k)) + return metrics + + # maps each string metric name to its associated function + metric_fn_mapping: Dict[str, Callable] = { + "exact_match": exact_match, + "quasi_exact_match": quasi_exact_match, + "prefix_exact_match": prefix_exact_match, + "quasi_prefix_exact_match": quasi_prefix_exact_match, + "exact_match_indicator": exact_match_indicator, + "final_number_exact_match": final_number_exact_match, + "exact_set_match": exact_set_match, + "iou_set_match": iou_set_match, + "f1_set_match": f1_set_match, + "math_equiv": is_equiv, + "math_equiv_chain_of_thought": is_equiv_chain_of_thought, + "float_equiv": float_equiv, + "code_eval_acc": code_eval, + "pass": code_eval, + "f1_score": f1_score, + "rouge_1": get_rouge_function("rouge1"), + "rouge_2": get_rouge_function("rouge2"), + "rouge_l": get_rouge_function("rougeL"), + "bleu_1": bleu_1, + "bleu_4": bleu_4, + "chinese_bleu_1": chinese_bleu_1, + "chinese_rouge_1": get_chinese_rouge_function("rouge1"), + "chinese_rouge_2": get_chinese_rouge_function("rouge2"), + "cleva_math_result_match": cleva_math_result_match, + "absolute_value_difference": absolute_value_difference, + } + + stats: List[Stat] = [] + + # Gold outputs + golds: List[Reference] = [reference for reference in request_state.instance.references if reference.is_correct] + assert len(golds) > 0 + + # Predicted outputs + assert request_state.result is not None + sorted_completions: List[Sequence] = sorted(request_state.result.completions, key=lambda x: -x.logprob) + preds: List[str] = [completion.text.strip() for completion in sorted_completions] + + # Apply mapping if exists (e.g., for multiple-choice questions A -> Boston, B -> New York) + # Note: If 'A' and 'B' were the only possible choices, smaller language models like GPT-2 would + # sometimes predict a random letter like 'M'. 
+ if request_state.output_mapping is not None: + preds = [request_state.output_mapping.get(pred) for pred in preds] # type: ignore + + # Compute max_prob, the probability that the model assigns to its generated text. + # Use the log prob of sorted_completions[0], which is the completion with the highest + # log_prob. We use this since that's what's used for computing metrics like exact_match. + # One subtlety is that when computing exact_match, we strip whitespace, so the actual + # max_prob is the sum of all the probabilities in the set {x : strip(x) = prediction}. + # In practice, we think this may not make much of a difference because models may not place + # high probabilities on having additional spaces (should check this). Also, the sum + # involves computing the log_prob for many completions which could be intractable. + max_prob = np.exp(sorted_completions[0].logprob) + stats.append(Stat(MetricName("max_prob")).add(max_prob)) + + # Add other metrics + for metric_name in self.names: + if metric_name in metric_fn_mapping: + stats.extend(compute_metrics_helper(MetricName(metric_name), metric_fn_mapping[metric_name])) + else: + raise NameError(f"{metric_name} is not in the list of metric functions.") + + return stats + + def compute_efficiency_metrics( + self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService + ) -> List[Stat]: + """Compute efficiency metrics for both inference and training. + For inference, we record both the actual runtime and an estimated idealized runtime + for the given request with an optimized software implementation run on A100 GPU(s), + taking into account both the number of tokens in the prompt of the request, and the + number of generated output tokens. + For training, we report the estimated total metric tons of CO2 emitted to train the + model. This is the same for each request.""" + # Compute efficiency metrics for inference. + assert request_state.result is not None + + runtime: Optional[float] = None + batch_size: Optional[int] = None + # Compute efficiency metrics for inference. + if request_state.result.request_time is not None: + runtime = request_state.result.request_time + batch_size = 1 + # For models that perform offline batch inference, effective runtime is batch_request_time, but also + # record batch_size to provide nuance. + if request_state.result.batch_request_time is not None and request_state.result.batch_size is not None: + runtime = request_state.result.batch_request_time + batch_size = request_state.result.batch_size + + # Compute total number of prompt and output tokens. + # Fetch the right `Tokenizer` depending on the model defined in `AdapterSpec` + # and calculate the number of tokens in the prompt. + tokenizer_service: TokenizerService = metric_service + window_service: WindowService = WindowServiceFactory.get_window_service( + adapter_spec.model_deployment, tokenizer_service + ) + prompt: str = request_state.request.prompt + num_prompt_tokens: int = window_service.get_num_tokens(prompt) + + # Total number of tokens in the completion. + num_completion_tokens: int = sum([len(completion.tokens) for completion in request_state.result.completions]) + # Don't include prompt in number of generated tokens (e.g., for language modeling). + # Assume that tokens for different completions are generated sequentially (instead of batched) when + # computing num_output_tokens (for the purpose of runtime estimation). 
+ num_output_tokens: int = num_completion_tokens + if request_state.request.echo_prompt: + # num_prompt_tokens > num_output_tokens can happen if tokenizer doesn't round trip. + if num_prompt_tokens <= num_output_tokens: + num_output_tokens -= num_prompt_tokens + else: + hlog( + f"WARNING: num_prompt_tokens ({num_prompt_tokens}) > num_output_tokens ({num_output_tokens}) " + f"for prompt: {prompt}" + ) + num_output_tokens = 0 + + idealized_runtime: Optional[float] = compute_estimated_time_from_prompt_size_and_num_output_tokens( + request_state, self.inference_idealized_runtimes_dict, num_prompt_tokens, num_output_tokens + ) + + denoised_runtime: Optional[float] = compute_estimated_time_from_prompt_size_and_num_output_tokens( + request_state, self.inference_denoised_runtimes_dict, num_prompt_tokens, num_output_tokens + ) + # Denoised runtime for offline models is just runtime. + # We divide by batch_size to get approximate per-input runtime. + if runtime is not None and request_state.result.batch_size is not None: + denoised_runtime = runtime / request_state.result.batch_size + + # Compute efficiency metrics for training. + training_co2_cost: Optional[float] + if request_state.request.model_deployment in self.training_efficiency_dict["carbon"]: + training_co2_cost = self.training_efficiency_dict["carbon"][request_state.request.model_deployment]["value"] + else: + training_co2_cost = None + + training_energy_cost: Optional[float] + if request_state.request.model_deployment in self.training_efficiency_dict["energy"]: + training_energy_cost = self.training_efficiency_dict["energy"][request_state.request.model_deployment][ + "value" + ] + else: + training_energy_cost = None + + stats = [ + Stat(MetricName("num_prompt_tokens")).add(num_prompt_tokens), + Stat(MetricName("num_completion_tokens")).add(num_completion_tokens), + Stat(MetricName("num_output_tokens")).add(num_output_tokens), + Stat(MetricName("training_co2_cost")).add(training_co2_cost), + Stat(MetricName("training_energy_cost")).add(training_energy_cost), + ] + if runtime is not None: + stats.append(Stat(MetricName("inference_runtime")).add(runtime)) + if batch_size is not None: + stats.append(Stat(MetricName("batch_size")).add(batch_size)) + if denoised_runtime is not None: + stats.append(Stat(MetricName("inference_denoised_runtime")).add(denoised_runtime)) + if idealized_runtime is not None: + stats.append(Stat(MetricName("inference_idealized_runtime")).add(idealized_runtime)) + return stats + + def compute_finish_reason_metrics( + self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService + ) -> List[Stat]: + """Record how often generation finished due to reaching token limit, stop token(s), or end of text""" + assert request_state.result is not None + sequence = request_state.result.completions[0] + valid_reasons = [ + "length", + "stop", + "endoftext", + "unknown", + ] + if sequence.finish_reason is None or sequence.finish_reason["reason"] not in valid_reasons: + reason = "unknown" + else: + reason = sequence.finish_reason["reason"] + return [ + Stat(MetricName(f"finish_reason_{valid_reason}")).add(int(reason == valid_reason)) + for valid_reason in valid_reasons + ] + + def compute_truncation_metrics( + self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService + ) -> List[Stat]: + """ + Record the number of training instances used in the prompt and whether + even the prompt needed to be truncated (once we hit zero training instances). 
+ """ + return [ + Stat(MetricName("num_train_instances")).add(request_state.num_train_instances), + Stat(MetricName("prompt_truncated")).add(request_state.prompt_truncated), + ] + + def compute_all_general_metrics( + self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService + ) -> List[Stat]: + """ + Compute metrics that are common to both `evaluate_generation` and `evaluate_references`. + """ + stats: List[Stat] = [] + + stats.append(Stat(MetricName("num_references")).add(len(request_state.instance.references))) + + # Copy from adapter spec + stats.append(Stat(MetricName("num_train_trials")).add(adapter_spec.num_train_trials)) + + stats.extend(self.compute_efficiency_metrics(adapter_spec, request_state, metric_service)) + stats.extend(self.compute_finish_reason_metrics(adapter_spec, request_state, metric_service)) + stats.extend(self.compute_truncation_metrics(adapter_spec, request_state, metric_service)) + + return stats + + def compute_language_modeling_metrics( + self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService + ) -> List[Stat]: + """Compute the logprob and normalization factors for the first completion""" + assert request_state.result is not None + sequence = request_state.result.completions[0] + + # Remove the empty tokens (typically generated by the AI21 tokenizer in the beginning of the text) + # + # Some more details about AI21 tokenizer: If the input prompt begins with a space, then + # the tokenizer inserts an empty token to the beginning. + # e.g. " burying him" -> ["▁"(0,0), "▁burying"(0,8), "▁him"(8,12)]. + # TODO(#1522): Update this comment once solved. + # Since this empty token is introduced by our chunking approach, we need to remove it. + tokens: List[Token] + if request_state.num_conditioning_tokens > 0 and sequence.tokens[0].text == "": + tokens = sequence.tokens[1:] + else: + tokens = sequence.tokens + pred_tokens = tokens[request_state.num_conditioning_tokens :] + logprob, num_perplexity_tokens, num_bytes = ( + sum(token.logprob for token in pred_tokens), + len(pred_tokens), + get_num_bytes(pred_tokens), + ) + + return [ + Stat(MetricName("logprob")).add(logprob), + Stat(MetricName("num_perplexity_tokens")).add(num_perplexity_tokens), + Stat(MetricName("num_bytes")).add(num_bytes), + ] + def evaluate_generation( self, adapter_spec: AdapterSpec, @@ -159,40 +768,15 @@ def evaluate_generation( ) -> List[Stat]: """Compute all metrics.""" stats: List[Stat] = [] - stats.extend(compute_request_state_metrics(self.efficiency_metric, adapter_spec, request_state, metric_service)) + stats.extend(self.compute_all_general_metrics(adapter_spec, request_state, metric_service)) if len(request_state.instance.references) > 0: - stats.extend(compute_reference_metrics(self.names, adapter_spec, request_state, metric_service)) + stats.extend(self.compute_reference_metrics(adapter_spec, request_state, metric_service)) - stats.extend(compute_language_modeling_metrics(adapter_spec, request_state, metric_service)) + stats.extend(self.compute_language_modeling_metrics(adapter_spec, request_state, metric_service)) return stats - def derive_stats(self, stats_dict: Dict[MetricName, Stat]) -> List[Stat]: - """Derive perplexity metrics if applicable. 
We don't worry about splits and perturbations here.""" - derived_stats: List[Stat] = [] - derived_stats.extend(compute_perplexity_metrics(stats_dict)) - return derived_stats - - def derive_per_instance_stats(self, per_instance_stats: Dict[Instance, List[Stat]]) -> List[Stat]: - """Derive calibration metrics if applicable. We don't worry about splits and perturbations here.""" - derived_stats: List[Stat] = [] - derived_stats.extend(compute_calibration_metrics(per_instance_stats)) - return derived_stats - - -class BasicReferenceMetric(ReferenceMetric): - """ - Defines basic metrics for Scenarios that use one Request per Reference instead of - one per Instance. - """ - - def __init__(self): - self.efficiency_metric = EfficiencyMetric() - - def __repr__(self): - return "BasicReferenceMetric" - def evaluate_references( self, adapter_spec: AdapterSpec, @@ -222,7 +806,7 @@ def compute_logprob_and_length(request_state: RequestState, window_service: Wind assert len(request_state.result.completions) == 1 reference_index = request_state.reference_index - sequence: GeneratedOutput = request_state.result.completions[0] + sequence: Sequence = request_state.result.completions[0] reference: str = request_state.instance.references[reference_index].output.text # Find the span of the completion that matches the reference. @@ -269,14 +853,8 @@ def compute_logprob_and_length(request_state: RequestState, window_service: Wind raise ValueError(f"Unknown adapter method: {adapter_spec.method}") stats: List[Stat] = [] + stats.extend(self.compute_all_general_metrics(adapter_spec, request_state, metric_service)) - general_metrics: Dict[MetricName, Stat] = {} - for request_state in reference_request_states: - for stat in compute_request_state_metrics( - self.efficiency_metric, adapter_spec, request_state, metric_service - ): - merge_stat(general_metrics, stat) - stats.extend(general_metrics.values()) max_prob = np.max(scipy.special.softmax(reference_scores)) # Multiple references may attain the same maximal score; in such cases, @@ -295,96 +873,18 @@ def compute_logprob_and_length(request_state: RequestState, window_service: Wind ) return stats + def derive_stats(self, stats_dict: Dict[MetricName, Stat]) -> List[Stat]: + """Derive perplexity metrics if applicable. We don't worry about splits and perturbations here.""" + derived_stats: List[Stat] = [] + derived_stats.extend(compute_perplexity_metrics(stats_dict)) + return derived_stats -def compute_request_state_metrics( - efficiency_metric: EfficiencyMetric, - adapter_spec: AdapterSpec, - request_state: RequestState, - metric_service: MetricService, -) -> List[Stat]: - """ - Compute metrics that are common to both `evaluate_generation` and `evaluate_references`. 
- """ - stats: List[Stat] = [] - - stats.append(Stat(MetricName("num_references")).add(len(request_state.instance.references))) - - # Copy from adapter spec - stats.append(Stat(MetricName("num_train_trials")).add(adapter_spec.num_train_trials)) - - stats.extend(efficiency_metric.compute_efficiency_metrics(adapter_spec, request_state, metric_service)) - stats.extend(_compute_finish_reason_metrics(adapter_spec, request_state, metric_service)) - stats.extend(_compute_truncation_metrics(adapter_spec, request_state, metric_service)) - - return stats - - -def _compute_finish_reason_metrics( - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService -) -> List[Stat]: - """Record how often generation finished due to reaching token limit, stop token(s), or end of text""" - assert request_state.result is not None - sequence = request_state.result.completions[0] - valid_reasons = [ - "length", - "stop", - "endoftext", - "unknown", - ] - if sequence.finish_reason is None or sequence.finish_reason["reason"] not in valid_reasons: - reason = "unknown" - else: - reason = sequence.finish_reason["reason"] - return [ - Stat(MetricName(f"finish_reason_{valid_reason}")).add(int(reason == valid_reason)) - for valid_reason in valid_reasons - ] - - -def _compute_truncation_metrics( - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService -) -> List[Stat]: - """ - Record the number of training instances used in the prompt and whether - even the prompt needed to be truncated (once we hit zero training instances). - """ - return [ - Stat(MetricName("num_train_instances")).add(request_state.num_train_instances), - Stat(MetricName("prompt_truncated")).add(request_state.prompt_truncated), - ] - - -def compute_language_modeling_metrics( - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService -) -> List[Stat]: - """Compute the logprob and normalization factors for the first completion""" - assert request_state.result is not None - sequence = request_state.result.completions[0] - - # Remove the empty tokens (typically generated by the AI21 tokenizer in the beginning of the text) - # - # Some more details about AI21 tokenizer: If the input prompt begins with a space, then - # the tokenizer inserts an empty token to the beginning. - # e.g. " burying him" -> ["▁"(0,0), "▁burying"(0,8), "▁him"(8,12)]. - # TODO(#1522): Update this comment once solved. - # Since this empty token is introduced by our chunking approach, we need to remove it. - tokens: List[Token] - if request_state.num_conditioning_tokens > 0 and sequence.tokens[0].text == "": - tokens = sequence.tokens[1:] - else: - tokens = sequence.tokens - pred_tokens = tokens[request_state.num_conditioning_tokens :] - logprob, num_perplexity_tokens, num_bytes = ( - sum(token.logprob for token in pred_tokens), - len(pred_tokens), - get_num_bytes(pred_tokens), - ) - - return [ - Stat(MetricName("logprob")).add(logprob), - Stat(MetricName("num_perplexity_tokens")).add(num_perplexity_tokens), - Stat(MetricName("num_bytes")).add(num_bytes), - ] + def derive_per_instance_stats(self, per_instance_stats: Dict[Instance, List[Stat]]) -> List[Stat]: + """Derive calibration metrics if applicable. 
We don't worry about splits and perturbations here.""" + derived_stats: List[Stat] = [] + derived_stats.extend(compute_calibration_metrics(per_instance_stats)) + derived_stats.append(Stat(MetricName("num_instances")).add(len(per_instance_stats))) + return derived_stats def _has_non_zero_valued_logprobs(per_instance_stats: Dict[Instance, List[Stat]]) -> bool: diff --git a/src/helm/benchmark/metrics/classification_metrics.py b/src/helm/benchmark/metrics/classification_metrics.py index 77ec390e783..d1d71322064 100644 --- a/src/helm/benchmark/metrics/classification_metrics.py +++ b/src/helm/benchmark/metrics/classification_metrics.py @@ -1,18 +1,17 @@ from typing import List, Optional -from sklearn.metrics import f1_score -from sklearn.preprocessing import MultiLabelBinarizer +from sklearn.metrics import f1_score, recall_score, precision_score +from sklearn.preprocessing import MultiLabelBinarizer, label_binarize from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric -from helm.benchmark.metrics.evaluate_reference_metrics import normalize_text -from helm.benchmark.metrics.metric import MetricName +from helm.benchmark.metrics.basic_metrics import normalize_text +from helm.benchmark.metrics.metric import Metric, MetricName from helm.benchmark.metrics.statistic import Stat from helm.benchmark.scenarios.scenario import Reference -from helm.common.request import GeneratedOutput +from helm.common.request import Sequence -class ClassificationMetric(EvaluateInstancesMetric): +class ClassificationMetric(Metric): """Defines metrics for multi-class classification using the generation adapter. Currently provides `classification_macro_f1` and `classification_micro_f1`. @@ -32,13 +31,26 @@ class ClassificationMetric(EvaluateInstancesMetric): - Currently, multi-label classification is not supported. """ - def __init__(self, delimiter: Optional[str] = None): + def __init__( + self, delimiter: Optional[str] = None, average: Optional[str] = None, class_defs: Optional[List[str]] = None + ): self.delimiter = delimiter + self.average = average + self.class_defs = [normalize_text(c) for c in class_defs] if class_defs is not None else None def is_multi_label(self) -> bool: return bool(self.delimiter) - def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]: + @staticmethod + def normalize_binary(y: List[List[str]], class_defs: Optional[List[str]]) -> List[List[str]]: + assert class_defs is not None + assert len(class_defs) == 2 + class_set = set(class_defs) + neg_label = class_defs[0] + ny = [v if len(v) == 1 and v[0] in class_set else [neg_label] for v in y] + return ny + + def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]: y_pred: List[List[str]] = [] y_true: List[List[str]] = [] for request_state in request_states: # one request state per instance @@ -64,22 +76,54 @@ def evaluate_instances(self, request_states: List[RequestState], eval_cache_path predictions = input_text.split(self.delimiter) if self.is_multi_label() else [input_text] y_pred.append([normalize_text(pred) for pred in predictions if pred]) labels: List[str] = list(set(y for ys in y_true for y in ys)) + # When binary, MultiLabelBinarizer is not appropriate. + # When binary and non-label strings (e.g., "yesandno") are included, + # label_binarize() automatically converts the output into a multi-label type (i.e., one-hot matrix). 
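+        # (Illustrative example: with class_defs = ["no", "yes"], a stray prediction such as ["yesandno"]
+        # would be expanded into a one-hot row instead of a single 0/1 column.)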
+ # This will cause an error in f1_score(average="binary"). + y_pred = ( + ClassificationMetric.normalize_binary(y_pred, self.class_defs) + if self.average is not None and self.average == "binary" + else y_pred + ) mlb = MultiLabelBinarizer().fit([labels]) - y_true = mlb.transform(y_true) - y_pred = mlb.transform(y_pred) + y_true = ( + label_binarize(y_true, classes=self.class_defs) + if self.average is not None and self.average == "binary" + else mlb.transform(y_true) + ) + y_pred = ( + label_binarize(y_pred, classes=self.class_defs) + if self.average is not None and self.average == "binary" + else mlb.transform(y_pred) + ) + stats_additional = ( + [] + if self.average is None + else [ + Stat(MetricName(f"classification_{self.average}_f1")).add( + f1_score(y_pred=y_pred, y_true=y_true, average=self.average) + ), + Stat(MetricName(f"classification_{self.average}_recall")).add( + recall_score(y_pred=y_pred, y_true=y_true, average=self.average) + ), + Stat(MetricName(f"classification_{self.average}_precision")).add( + precision_score(y_pred=y_pred, y_true=y_true, average=self.average) + ), + ] + ) return [ Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")), Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")), - ] + ] + stats_additional -class MultipleChoiceClassificationMetric(EvaluateInstancesMetric): +class MultipleChoiceClassificationMetric(Metric): """ Calculate population micro/macro F1 score for multiple_choice_* adapters. For generation adapters, please use ClassificationMetric. """ - def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]: + def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]: y_pred: List[str] = [] y_true: List[str] = [] for request_state in request_states: # one request state per instance @@ -90,9 +134,7 @@ def evaluate_instances(self, request_states: List[RequestState], eval_cache_path ] assert len(golds) > 0, "MultipleChoiceClassificationMetric are designed for multiple_choice_* adapters" assert request_state.result is not None - sorted_completions: List[GeneratedOutput] = sorted( - request_state.result.completions, key=lambda x: -x.logprob - ) + sorted_completions: List[Sequence] = sorted(request_state.result.completions, key=lambda x: -x.logprob) pred: str = sorted_completions[0].text.strip() # Only utilize the first prediction if request_state.output_mapping is not None: pred = request_state.output_mapping.get(pred, pred) diff --git a/src/helm/benchmark/metrics/kpi_edgar_metrics.py b/src/helm/benchmark/metrics/kpi_edgar_metrics.py new file mode 100644 index 00000000000..f81b426f88a --- /dev/null +++ b/src/helm/benchmark/metrics/kpi_edgar_metrics.py @@ -0,0 +1,331 @@ +from typing import List, Dict, Set, Tuple, Callable, Union, cast +import logging +import re +import itertools +import statistics + +from helm.common.request import Sequence +from helm.benchmark.adaptation.request_state import RequestState +from .metric import Metric +from .metric_name import MetricName +from .statistic import Stat +from helm.benchmark.adaptation.adapter_spec import AdapterSpec +from .metric_service import MetricService +from helm.benchmark.scenarios.scenario import Reference +from helm.benchmark.scenarios.kpi_edgar_scenario import TAG_DICT, TAG_PAREN_RE + +DEFAULT_TAG_PAREN_RE = (r"\(", r"\)") + + +def tokenize(text: str) -> List[str]: + # TODO: Better to introduce a sophisticated tokenizer to support 
(multilingal) natural texts. + return text.strip().split(" ") + + +def get_tagged_token_dict(token_list: List[str]) -> Dict[str, Set[Tuple[int, str]]]: + # TODO: Note: We need to handle the cases where the original text contains < or > to avoid the confusion with tags. + # TODO: Maybe better to introduce XML parser or more sophisticated parser. + + tagged_token_dict: Dict[str, Set[Tuple[int, str]]] = {tag: set() for tag in TAG_DICT.keys()} + curr_tag = "O" + for (idx, token) in enumerate(token_list): + sub_token_list = re.split("[<>]", token) + logging.debug(sub_token_list) + curr_token = sub_token_list[0] + if token.startswith("<"): + tag = sub_token_list[1] + curr_tag = tag if tag in tagged_token_dict.keys() else curr_tag + curr_token = sub_token_list[2] + if curr_tag != "O": + tagged_token_dict[curr_tag] = tagged_token_dict[curr_tag].union({(idx, curr_token)}) + if token.endswith(">") and sub_token_list[-2].startswith("/"): + tag = sub_token_list[-1][1:] + curr_tag = "O" + return tagged_token_dict + + +def get_tagged_token_size_dict(token_list: List[str]) -> Tuple[Dict[str, Set], Dict[str, int]]: + + tagged_token_dict = get_tagged_token_dict(token_list) + tagged_size_dict: Dict[str, int] = {tag: len(st) for (tag, st) in tagged_token_dict.items()} + return (tagged_token_dict, tagged_size_dict) + + +def get_intersection( + gold_set: Set[Tuple[int, str]], pred_set: Set[Tuple[int, str]], ignore_index: bool +) -> Set[Tuple[int, str]]: + def remove_index(the_set: Set[Tuple[int, str]]) -> Set[Tuple[int, str]]: + return {(0, e[1]) for e in the_set} + + tmp_gold_set = remove_index(gold_set) if ignore_index else gold_set + tmp_pred_set = remove_index(pred_set) if ignore_index else pred_set + + return tmp_gold_set.intersection(tmp_pred_set) + + +def get_tag_and_phrase(extracted: str, re_tag_paren: Tuple[str, str] = DEFAULT_TAG_PAREN_RE) -> Tuple[str, str]: + matched = re.match(r"(.*)%s(.*)%s" % (re_tag_paren[0], re_tag_paren[1]), extracted) + sub_token_tpl = matched.groups() if matched is not None else tuple() + if len(sub_token_tpl) == 2: + phrase = sub_token_tpl[0].strip() + tag = sub_token_tpl[1].strip() + return (tag, phrase) + return ("", "") + + +def get_tagged_token_size_dict_extraction( + entity_list: List[str], re_tag_paren: Tuple[str, str] = DEFAULT_TAG_PAREN_RE +) -> Tuple[Dict[str, Set], Dict[str, int]]: + + tmp_tag_and_phrase_list = [get_tag_and_phrase(entity, re_tag_paren) for entity in entity_list] + tag_and_phrase_list = [tp for tp in tmp_tag_and_phrase_list if len(tp[0]) != 0] + tagged_token_dict: Dict[str, Set[Tuple[int, str]]] = {tag: set() for tag in TAG_DICT.keys()} + for (tag, phrase) in tag_and_phrase_list: + if tag in tagged_token_dict.keys(): + word_list = phrase.split(" ") + token_list = [(0, word) for word in word_list] # token index is ignored. 
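+            # e.g., the entity "annual revenue (kpi)" adds {(0, "annual"), (0, "revenue")} to the "kpi" set
+            # (see test_kpi_edgar_metrics.test_kem_get_tagged_token_size_dict_extraction).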
+ tagged_token_dict[tag] = tagged_token_dict[tag].union(token_list) + tagged_size_dict = {tag: len(token_set) for (tag, token_set) in tagged_token_dict.items()} + return (tagged_token_dict, tagged_size_dict) + + +def get_tagged_size_dict( + gold_list: List[str], + pred_list: List[str], + ignore_index: bool, + is_extraction: bool = False, + re_tag_paren: Tuple[str, str] = DEFAULT_TAG_PAREN_RE, +) -> Dict[str, Tuple[int, int, int]]: + + if not is_extraction: + (gold_tagged_token_dict, gold_tagged_size_dict) = get_tagged_token_size_dict(gold_list) + (pred_tagged_token_dict, pred_tagged_size_dict) = get_tagged_token_size_dict(pred_list) + else: + (gold_tagged_token_dict, gold_tagged_size_dict) = get_tagged_token_size_dict_extraction(gold_list, re_tag_paren) + (pred_tagged_token_dict, pred_tagged_size_dict) = get_tagged_token_size_dict_extraction(pred_list, re_tag_paren) + + assert pred_tagged_token_dict.keys() == gold_tagged_token_dict.keys() + + intersection_tagged_token_dict = { + tag: get_intersection(gold_tagged_token_dict[tag], pred_tagged_token_dict[tag], ignore_index) + for tag in gold_tagged_token_dict.keys() + } + intersection_tagged_size_dict: Dict[str, int] = { + tag: len(st) for (tag, st) in intersection_tagged_token_dict.items() + } + tag_key_set = gold_tagged_token_dict.keys() + tag_size_dict: Dict[str, Tuple[int, int, int]] = { + tag: (gold_tagged_size_dict[tag], pred_tagged_size_dict[tag], intersection_tagged_size_dict[tag]) + for tag in tag_key_set + } + # TODO: for each sentence (sample), TPR, FPR, etc. must be defined with equal weights. + # TODO: later, those are averaged over the sentences (samples). + # TODO: how about tags? + # average_options = {None, "micro", "macro", "weighted"} + # TODO: TP, FP, TN, FN of this adjusted version can be regarded as + # TODO: continuous extention of the discrete original TP, FP, TN, FN for one sample. + # TODO: Therefore, one just need to sum up these to compute Precision and Recall. + # TODO: https://atmarkit.itmedia.co.jp/ait/articles/2212/19/news020.html + # TODO: macro-avg: average F1_type over all the tag types. + # TODO: micro-avg: define TP, FP, TN, FN as sum of all the classes. micro-F1 == accuracy. + return tag_size_dict + + +def tokenize_extraction(text: str, re_tag_paren: Tuple[str, str] = DEFAULT_TAG_PAREN_RE) -> List[str]: + # TODO: Better to introduce a sophisticated tokenizer to support (multilingal) natural texts. 
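+    # Splits a comma-separated list of tagged entities on the closing tag parenthesis, e.g. (doctest-style):
+    #   >>> tokenize_extraction("annual revenue (kpi), 364 (cy)")
+    #   ['annual revenue (kpi)', '364 (cy)']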
+ delim = "," + tag_paren1 = re_tag_paren[1].strip("\\") + text_tmp1 = text.strip() + text_tmp0 = re.sub(re_tag_paren[1] + delim, tag_paren1 + tag_paren1 + delim, text_tmp1) + extracted_list = text_tmp0.split(tag_paren1 + delim) + n_extracted_list = [e.strip() for e in extracted_list] + return n_extracted_list + + +# def get_tagged_size_dict_extraction( +# gold_list: List[str], pred_list: List[str], ignore_index: bool +# ) -> Dict[str, Tuple[int, int, int]]: + +# (gold_tagged_token_dict, gold_tagged_size_dict) = get_tagged_token_size_dict_extraction(gold_list) +# (pred_tagged_token_dict, pred_tagged_size_dict) = get_tagged_token_size_dict_extraction(pred_list) + +# assert pred_tagged_token_dict.keys() == gold_tagged_token_dict.keys() + +# intersection_tagged_token_dict = { +# tag: get_intersection(gold_tagged_token_dict[tag], pred_tagged_token_dict[tag], ignore_index) +# for tag in gold_tagged_token_dict.keys() +# } +# intersection_tagged_size_dict: Dict[str, int] = { +# tag: len(st) for (tag, st) in intersection_tagged_token_dict.items() +# } +# tag_key_set = gold_tagged_token_dict.keys() +# tag_size_dict: Dict[str, Tuple[int, int, int]] = { +# tag: (gold_tagged_size_dict[tag], pred_tagged_size_dict[tag], intersection_tagged_size_dict[tag]) +# for tag in tag_key_set +# } +# return tag_size_dict + + +def get_value_list(stats_dict: Dict[MetricName, Stat], prefix: str, tag: str, split: Union[str, None]) -> List[int]: + value_name_list = ["gold", "pred", "intersection"] + metric_name_list = [MetricName(prefix + "." + tag + "." + name, split=split) for name in value_name_list] + value_list = [int(stats_dict[mn].sum) for mn in metric_name_list] + return value_list + + +def compute_prrcf1(tp, tn, fp, fn) -> Tuple[float, float, float]: + precision = float(tp) / float(tp + fp) if (tp + fp) != 0 else 0.0 + recall = float(tp) / float(tp + fn) if (tp + fn) != 0 else 0.0 + f1 = 2 * precision * recall / (precision + recall) if (precision + recall) != 0.0 else 0.0 + return (precision, recall, f1) + + +def compute_tptnfpfn_adjusted(stats: List[int], total_token_length: int) -> Tuple[float, float, float, float]: + assert len(stats) == 3 + ngold = stats[0] + npred = stats[1] + ninter = stats[2] + # ltoken = total_token_length + tp = float(ninter) / float(ngold) if ngold != 0.0 else 0.0 + fn = 1.0 - tp + fp = float(npred - ninter) / float(npred) if npred != 0.0 else 0.0 + tn = 0.0 # not used. + return (tp, tn, fp, fn) + + +def compute_tptnfpfn_modified_adjusted(stats: List[int], total_token_length: int) -> Tuple[float, float, float, float]: + assert len(stats) == 3 + ngold = stats[0] + npred = stats[1] + ninter = stats[2] + ltoken = total_token_length + tp = ninter + fn = ngold - ninter + fp = npred - ninter + tn = ltoken - (ngold + npred) + ninter + return (tp, tn, fp, fn) + + +def compute_adjusted_f1( + tag_stats_dict: Dict[str, List[int]], total_token_length: int, compute_tptnfpfn: Callable +) -> float: + + tag_tptnfpfn_list = [compute_tptnfpfn(stats, total_token_length) for stats in tag_stats_dict.values()] + tag_prrcf1_list = [compute_prrcf1(v[0], v[1], v[2], v[3]) for v in tag_tptnfpfn_list] + tag_f1_list = [v[2] for v in tag_prrcf1_list] + macro_f1 = statistics.mean(tag_f1_list) + + return macro_f1 + + +class NERAdjustedF1Metric(Metric): + """ + Paper: + Deußer, Tobias, et al. + "KPI-EDGAR: A Novel Dataset and Accompanying Metric for Relation Extraction from + Financial Documents." + arXiv preprint arXiv:2210.09163 (2022). 
+ https://arxiv.org/abs/2210.09163 + """ + + NAME = "kpi_edgar_adjusted_f1" + ignore_index = True + is_extraction = True + re_tag_paren = TAG_PAREN_RE + + def __init__(self): + super().__init__() + + return + + def evaluate_generation( + self, + adapter_spec: AdapterSpec, + request_state: RequestState, + metric_service: MetricService, + eval_cache_path: str, + ) -> List[Stat]: + """Evaluate free-form generation.""" + + # logging.warning(adapter_spec) + # logging.warning("evaluate_generation instance.id: %s" % (request_state.instance.id)) + # logging.warning("evaluate_generation instance.reference: %s" % (request_state.instance.references)) + # logging.warning("evaluate_generation result.completion: %s" % (request_state.result.completions)) + # logging.warning(metric_service) + + golds: List[Reference] = [reference for reference in request_state.instance.references if reference.is_correct] + completions: List[Sequence] = ( + cast(List[Sequence], request_state.result.completions) if request_state.result is not None else [] + ) + preds: List[str] = [completion.text.strip() for completion in completions] + # logging.warning("evaluate_genearation len(preds): %d" % (len(preds))) + # logging.warning("evaluate_genearation len(golds): %d" % (len(golds))) + + assert len(preds) >= 1 + assert len(golds) >= 1 + + pred_text = preds[0] + gold_text = golds[0].output.text.strip() + # logging.warning("evaluate_genearation pred_text: %s" % (pred_text)) + # logging.warning("evaluate_genearation gold_text: %s" % (gold_text)) + + pred_token_list = ( + tokenize(pred_text) if not self.is_extraction else tokenize_extraction(pred_text, self.re_tag_paren) + ) + gold_token_list = ( + tokenize(gold_text) if not self.is_extraction else tokenize_extraction(gold_text, self.re_tag_paren) + ) + gold_len = len(gold_token_list) + # TODO: if the length are different, then the score should be 0. + tagged_size_dict = get_tagged_size_dict( + gold_token_list, pred_token_list, self.ignore_index, self.is_extraction, self.re_tag_paren + ) + tag_stat_tpl_list = [ + ( + Stat(MetricName(self.NAME + "." + tag + "." + "gold")).add(vals[0]), + Stat(MetricName(self.NAME + "." + tag + "." + "pred")).add(vals[1]), + Stat(MetricName(self.NAME + "." + tag + "." + "intersection")).add(vals[2]), + ) + for (tag, vals) in tagged_size_dict.items() + ] + tag_stat_list = list(itertools.chain.from_iterable(tag_stat_tpl_list)) + len_stat = Stat(MetricName(self.NAME + "." + "len")).add(gold_len) + return tag_stat_list + [len_stat] + + def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]: + """Evaluate all request states directly. Use only if nothing else works. Override me!""" + + # logging.warning("evaluate_instances len: %d" % (len(request_states))) + # logging.warning(request_states[0].instance.id) + + return [] + + def derive_stats(self, stats_dict: Dict[MetricName, Stat]) -> List[Stat]: + """Derive stats based on existing stats, e.g., for perplexity. Override me!""" + + # logging.warning("derive_stats stats_dict: %s" % (stats_dict)) + # TODO: + # I assume that all the stats in stats_dict were computed from the same split (valid or test). 
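+        # Aggregate the per-tag "gold"/"pred"/"intersection" counts accumulated by evaluate_generation(),
+        # then reduce them to the adjusted macro-F1 and the modified adjusted macro-F1 over all tags.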
+ assert len(stats_dict) >= 1 + stats_list: List[Stat] = list(stats_dict.values()) + + stat = stats_list[0] + split = stat.name.split + # logging.warning("derive_stats stat: %s" % (stat)) + logging.warning("derive_stats split: %s" % (split)) + + tag_stat_dict: Dict[str, List[int]] = { + tag: get_value_list(stats_dict, self.NAME, tag, split) for tag in TAG_DICT.keys() + } + # logging.warning(tag_stat_dict) + stats_total_token = stats_dict[MetricName(self.NAME + "." + "len", split=split)] + adjusted_f1 = compute_adjusted_f1(tag_stat_dict, int(stats_total_token.sum), compute_tptnfpfn_adjusted) + # logging.warning("derive_stats adjusted_f1: %f" % (adjusted_f1)) + modified_adjusted_f1 = compute_adjusted_f1( + tag_stat_dict, int(stats_total_token.sum), compute_tptnfpfn_modified_adjusted + ) + # logging.warning("derive_stats modified adjusted_f1: %f" % (modified_adjusted_f1)) + return [ + Stat(MetricName(self.NAME + "." + "macro", split=split)).add(adjusted_f1), + Stat(MetricName(self.NAME + "." + "modified_macro", split=split)).add(modified_adjusted_f1), + ] diff --git a/src/helm/benchmark/metrics/test_kpi_edgar_metrics.py b/src/helm/benchmark/metrics/test_kpi_edgar_metrics.py new file mode 100644 index 00000000000..694671c7944 --- /dev/null +++ b/src/helm/benchmark/metrics/test_kpi_edgar_metrics.py @@ -0,0 +1,248 @@ +import logging + +from . import kpi_edgar_metrics as kem +import helm.benchmark.scenarios.kpi_edgar_scenario as kes + + +def test_kem_tokenize(): + + text = "This is a test ." + exp_token_list = ["This", "is", "a", "test", "."] + token_list = kem.tokenize(text) + + logging.debug(token_list) + assert token_list == exp_token_list + + return + + +def test_kem_get_tagged_token_dict(): + + token_list = ["This", "is", "a", "test", "."] + exp_tagged_token_dict = {"kpi": {(1, "is"), (3, "test"), (2, "a")}, "cy": set(), "py": set(), "py1": set()} + + tagged_token_dict = kem.get_tagged_token_dict(token_list) + + logging.debug(tagged_token_dict) + assert tagged_token_dict == exp_tagged_token_dict + + return + + +def test_kem_get_tagged_token_dict2(): + + token_list = ["This", "is", "a", "test", "."] + exp_tagged_token_dict = {"kpi": {(1, "is")}, "cy": {(2, "a"), (3, "test")}, "py": set(), "py1": set()} + + tagged_token_dict = kem.get_tagged_token_dict(token_list) + + logging.debug(tagged_token_dict) + assert tagged_token_dict == exp_tagged_token_dict + + return + + +def test_kem_get_tagged_token_size_dict(): + + token_list = ["This", "is", "a", "test", "."] + exp_tagged_token_dict = {"kpi": {(1, "is")}, "cy": {(2, "a"), (3, "test")}, "py": set(), "py1": set()} + exp_tagged_size_dict = {"kpi": 1, "cy": 2, "py": 0, "py1": 0} + + (tagged_token_dict, tagged_size_dict) = kem.get_tagged_token_size_dict(token_list) + + logging.debug(tagged_token_dict) + logging.debug(tagged_size_dict) + + assert tagged_token_dict == exp_tagged_token_dict + assert tagged_size_dict == exp_tagged_size_dict + + return + + +def test_kem_get_intersection(): + gold_set = {(0, "This"), (1, "is"), (2, "a"), (3, "pen")} + pred_set = {(0, "That"), (1, "is"), (2, "a"), (3, "pen")} + exp_inter_set = {(1, "is"), (2, "a"), (3, "pen")} + inter_set = kem.get_intersection(gold_set, pred_set, False) + + logging.debug(inter_set) + assert inter_set == exp_inter_set + return + + +def test_kem_get_intersection_2(): + gold_set = {(0, "This"), (1, "is"), (2, "a"), (3, "pen")} + pred_set = {(0, "That"), (1, "is"), (2, "a"), (4, "pen")} + exp_inter_set = {(0, "is"), (0, "a"), (0, "pen")} + inter_set = kem.get_intersection(gold_set, pred_set, 
True) + + logging.debug(inter_set) + assert inter_set == exp_inter_set + return + + +def test_kem_get_tagged_size_dict(): + + gold_token_list = ["This", "is", "a", "test", "."] + pred_token_list = ["This", "is", "a", "test", "."] + # exp_tagged_token_dict = {"kpi": {(1, "is")}, "cy": {(2, "a"), (3, "test")}, "py": set(), "py1": set()} + # exp_tagged_size_dict = {"kpi": 1, "cy": 2, "py": 0, "py1": 0} + exp_intersection_size_dict = {"kpi": 1, "cy": 2, "py": 0, "py1": 0} + + tagged_size_dict = kem.get_tagged_size_dict(gold_token_list, pred_token_list, False) + + logging.debug(tagged_size_dict) + + assert tagged_size_dict["kpi"][2] == exp_intersection_size_dict["kpi"] + assert tagged_size_dict["kpi"][2] == exp_intersection_size_dict["kpi"] + + return + + +def test_kem_get_tag_and_phrase(): + extracted = "annual revenue (kpi)" + result = kem.get_tag_and_phrase(extracted) + logging.debug(result) + assert result[0] == "kpi" + assert result[1] == "annual revenue" + return + + +def test_kem_get_tag_and_phrase_2(): + extracted = "annual revenue - kpi)" + result = kem.get_tag_and_phrase(extracted) + logging.debug(result) + assert result[0] == "" + assert result[1] == "" + return + + +def test_kem_get_tag_and_phrase_3(): + extracted = "annual [which is, a yearly] revenue [kpi]" + result = kem.get_tag_and_phrase(extracted, kes.TAG_PAREN_RE) + logging.debug(result) + assert result[0] == "kpi" + assert result[1] == "annual [which is, a yearly] revenue" + return + + +def test_kem_tokenize_extraction(): + extracted = "annual revenue (kpi), 364 (cy)" + result = kem.tokenize_extraction(extracted) + logging.debug(result) + assert result[0] == "annual revenue (kpi)" + assert result[1] == "364 (cy)" + + +def test_kem_tokenize_extraction_2(): + extracted = "annual (which is a yearly) revenue [kpi], 9,364 [cy]" + result = kem.tokenize_extraction(extracted, kes.TAG_PAREN_RE) + logging.debug(result) + assert result[0] == "annual (which is a yearly) revenue [kpi]" + assert result[1] == "9,364 [cy]" + + +def test_kem_get_tagged_token_size_dict_extraction(): + entity_list = ["annual revenue (kpi)", "364 (cy)"] + result = kem.get_tagged_token_size_dict_extraction(entity_list) + logging.debug(result) + assert result[0]["kpi"] == {(0, "annual"), (0, "revenue")} + assert result[1]["cy"] == 1 + return + + +def test_kem_get_tagged_size_dict_2(): + + gold_token_list = ["annual revenue (kpi)", "364 (cy)"] + pred_token_list = ["annual revenue (kpi)", "364 (cy)"] + # exp_tagged_token_dict = {"kpi": {(1, "is")}, "cy": {(2, "a"), (3, "test")}, "py": set(), "py1": set()} + # exp_tagged_size_dict = {"kpi": 1, "cy": 2, "py": 0, "py1": 0} + exp_intersection_size_dict = {"kpi": 2, "cy": 1, "py": 0, "py1": 0} + + tagged_size_dict = kem.get_tagged_size_dict(gold_token_list, pred_token_list, False, True) + + logging.debug(tagged_size_dict) + + assert tagged_size_dict["kpi"][2] == exp_intersection_size_dict["kpi"] + assert tagged_size_dict["kpi"][2] == exp_intersection_size_dict["kpi"] + + return + + +def test_kem_compute_prrcf1(): + tp = 10 + tn = 60 + fp = 10 + fn = 10 + (pr, rc, f1) = kem.compute_prrcf1(tp, tn, fp, fn) + logging.debug((pr, rc, f1)) + assert (pr, rc, f1) == (0.5, 0.5, 0.5) + + +def test_kem_compute_prrcf1_2(): + tp = 0 + tn = 0 + fp = 0 + fn = 0 + (pr, rc, f1) = kem.compute_prrcf1(tp, tn, fp, fn) + logging.debug((pr, rc, f1)) + assert (pr, rc, f1) == (0, 0, 0) + + +def test_kem_compute_prrcf1_3(): + tp = 10 + tn = 40 + fp = 30 + fn = 10 + (pr, rc, f1) = kem.compute_prrcf1(tp, tn, fp, fn) + logging.debug((pr, rc, f1)) + 
assert (pr, rc, f1) == (0.25, 0.5, float(1) / float(3)) + + +def test_kem_compute_tptnfpfn_adjusted(): + total_token_length = 100 + stats = [20, 40, 10] + tp = 0.5 + tn = 0.0 # unused. + fp = 0.75 + fn = 0.5 + tptnfpfn = kem.compute_tptnfpfn_adjusted(stats, total_token_length) + logging.debug(tptnfpfn) + assert tptnfpfn == (tp, tn, fp, fn) + + +def test_kem_compute_tptnfpfn_adjusted_1(): + total_token_length = 100 + stats = [0, 0, 0] + tp = 0.0 + tn = 0.0 # unused. + fp = 0.0 + fn = 1.0 + tptnfpfn = kem.compute_tptnfpfn_adjusted(stats, total_token_length) + logging.debug(tptnfpfn) + assert tptnfpfn == (tp, tn, fp, fn) + + +def test_kem_compute_tptnfpfn_modified_adjusted(): + total_token_length = 100 + stats = [20, 40, 10] + tp = 10 + tn = 50 + fp = 30 + fn = 10 + tptnfpfn = kem.compute_tptnfpfn_modified_adjusted(stats, total_token_length) + logging.debug(tptnfpfn) + assert tptnfpfn == (tp, tn, fp, fn) + + +def test_kem_compute_adjusted_f1(): + + total_token_lengh = 100 + tag_stats_dict = {"pos": [20, 20, 10], "neg": [80, 80, 70]} + exp_macro_f1 = 0.6875 + + macro_f1 = kem.compute_adjusted_f1(tag_stats_dict, total_token_lengh, kem.compute_tptnfpfn_adjusted) + + logging.debug(macro_f1) + assert macro_f1 == exp_macro_f1 + return diff --git a/src/helm/benchmark/run_specs/classic_run_specs.py b/src/helm/benchmark/run_specs/classic_run_specs.py index bf692de749c..daf4cf7000a 100644 --- a/src/helm/benchmark/run_specs/classic_run_specs.py +++ b/src/helm/benchmark/run_specs/classic_run_specs.py @@ -1,52 +1,824 @@ -"""Run spec functions for the HELM Classic leaderboard. - -Website: https://crfm.stanford.edu/helm/classic/ - -If a run spec function is included in both the HELM Classic leaderboard and the -HELM Lite leaderboard, it will be included in the lite_run_specs module instead of this module. 
-This module also contains some scenarios that are currently not used on any HELM leaderboard.""" - -from typing import Any, Dict, List, Optional, Set - -from helm.benchmark.adaptation.adapter_spec import ( - ADAPT_GENERATION, +import dataclasses +import itertools +from functools import partial +from typing import Any, Callable, List, Dict, Optional, Set, TypeVar + +from helm.benchmark.model_deployment_registry import ALL_MODEL_DEPLOYMENTS, DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT +from helm.benchmark.scenarios.commonsense_scenario import ( + CommonSenseQAScenario, + HellaSwagScenario, + OpenBookQA, + PiqaScenario, + SiqaScenario, +) +from helm.common.hierarchical_logger import hlog, htrack +from helm.common.object_spec import ObjectSpec +from helm.benchmark.adaptation.adapters.adapter_factory import ( + ADAPT_LANGUAGE_MODELING, ADAPT_MULTIPLE_CHOICE_JOINT, ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, + ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED, + ADAPT_GENERATION, ADAPT_RANKING_BINARY, - AdapterSpec, ) from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter -from helm.benchmark.adaptation.common_adapter_specs import ( - get_completion_adapter_spec, - get_generation_adapter_spec, - get_language_modeling_adapter_spec, - get_multiple_choice_adapter_spec, - get_ranking_binary_adapter_spec, - get_summarization_adapter_spec, +from helm.benchmark.adaptation.adapter_spec import AdapterSpec +from .metrics.metric import MetricSpec +from .run_expander import ( + RUN_EXPANDERS, + GlobalPrefixRunExpander, + AnthropicRunExpander, + OpenAIRunExpander, + GoogleRunExpander, + StopRunExpander, + ChatMLRunExpander, + IncreaseTemperatureRunExpander, ) -from helm.benchmark.annotation.annotator import AnnotatorSpec -from helm.benchmark.metrics.common_metric_specs import ( - get_basic_metric_specs, - get_bias_metric_specs, - get_classification_metric_specs, - get_copyright_metric_specs, - get_disinformation_metric_specs, - get_exact_match_metric_specs, - get_f1_metric_specs, - get_generative_harms_metric_specs, - get_language_modeling_metric_specs, - get_numeracy_metric_specs, - get_open_ended_generation_metric_specs, - get_summarization_metric_specs, - get_basic_generation_metric_specs, - get_basic_reference_metric_specs, - get_generic_metric_specs, +from .runner import RunSpec, get_benchmark_output_path +from .scenarios.lex_glue_scenario import ( + get_lex_glue_max_train_instances, + get_lex_glue_instructions, + get_lex_glue_max_tokens, + get_lex_glue_task_type, ) -from helm.benchmark.metrics.metric import MetricSpec -from helm.benchmark.run_spec import RunSpec, run_spec_function -from helm.benchmark.runner import get_benchmark_output_path -from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path -from helm.common.hierarchical_logger import hlog, htrack +from .scenarios.scenario import ScenarioSpec, get_scenario_cache_path +from .scenarios.msmarco_scenario import MSMARCOScenario +from .scenarios.copyright_scenario import datatag2hash_code +from .scenarios.lextreme_scenario import ( + get_lextreme_instructions, + get_lextreme_max_train_instances, + get_lextreme_max_tokens, + TaskType, + get_lextreme_task_type, +) +from .scenarios.echr_judge_scenario import EchrJudgeScenario +from helm.benchmark.model_deployment_registry import ( + ModelDeployment, + get_model_deployment, +) +from helm.benchmark.model_metadata_registry import ( + ModelMetadata, + get_model_metadata, + ANTHROPIC_CLAUDE_1_MODEL_TAG, + ANTHROPIC_CLAUDE_2_MODEL_TAG, + GOOGLE_PALM_2_MODEL_TAG, + 
NO_NEWLINES_TAG, + NLG_PREFIX_TAG, + CHATML_MODEL_TAG, + OPENAI_CHATGPT_MODEL_TAG, + BUGGY_TEMP_0_TAG, +) +from helm.common.general import singleton + +INCLUDE_GENERATIVE_HARMS_METRICS = False + + +############################################################ +# Prototypical adapter specs + + +def format_instructions(instructions: str) -> str: + if len(instructions) > 0: + instructions += "\n" + return instructions + + +def get_multiple_choice_joint_adapter_spec( + instructions: str, + input_noun: Optional[str], + output_noun: str, + num_outputs: int = 5, + max_train_instances: int = 5, + max_tokens: int = 5, + sample_train: bool = True, + **kwargs, +) -> AdapterSpec: + """ + [instructions] + + [input_noun]: [input] + [reference_1] + ... + [reference_k] + [output_noun]: [output] + + [input_noun]: [input] + [reference_1] + ... + [reference_k] + [output_noun]: + """ + + return AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, + instructions=format_instructions(instructions), + input_prefix=f"{input_noun}: " if input_noun is not None else "", + input_suffix="\n" if input_noun is not None else "", + output_prefix=f"{output_noun}: ", + output_suffix="\n", + max_train_instances=max_train_instances, + num_outputs=num_outputs, + max_tokens=max_tokens, + temperature=0.0, + stop_sequences=["\n"], + sample_train=sample_train, + **kwargs, + ) + + +def get_multiple_choice_separate_adapter_spec(method: str, empty_input: bool = False) -> AdapterSpec: + """ + [input] [reference_i] + or + [reference_i] + """ + assert method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED} + + return AdapterSpec( + method=method, + instructions="", + input_prefix="", + input_suffix="", + output_prefix=" " if not empty_input else "", + output_suffix="", + # Separate is basically language modeling, so can't easily use in-context examples + max_train_instances=0, + num_outputs=1, + max_tokens=0, + temperature=0.0, + ) + + +def get_multiple_choice_adapter_spec( + method: str, + instructions: str, + input_noun: Optional[str], + output_noun: str, + max_train_instances: int = 5, + num_outputs: int = 5, + max_tokens: int = 1, + empty_input: bool = False, + sample_train: bool = True, + **kwargs, +): + """ + Toggle between joint and separate adapters. + """ + if method == ADAPT_MULTIPLE_CHOICE_JOINT: + return get_multiple_choice_joint_adapter_spec( + instructions, + input_noun, + output_noun, + max_train_instances=max_train_instances, + num_outputs=num_outputs, + max_tokens=max_tokens, + sample_train=sample_train, + **kwargs, + ) + elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}: + return get_multiple_choice_separate_adapter_spec(method, empty_input) + else: + raise ValueError(f"Invalid adaptation method: {method}") + + +def get_ranking_binary_adapter_spec( + instructions: str = "", + document_noun: str = "Passage", + query_noun: str = "Query", + output_prefix: str = "Does the passage answer the query?", + output_noun: str = "Answer", + max_train_instances: int = 4, + num_outputs: int = 1, + num_train_trials: int = 1, + temperature: float = 0.0, + max_tokens: int = 5, + **kwargs, +) -> AdapterSpec: + """ + [instructions] + + [object_noun]: [object] + [query_noun]: [query] + [prompt_noun]: [prompt_content] + [output_noun]: [output] + + ... 
+ + [object_noun]: [object] + [query_noun]: [query] + [prompt_noun]: [prompt_content] + [output_noun]: [output] + + [object_noun]: [object] + [query_noun]: [query] + [prompt_noun]: [prompt_content] + [output_noun]: [output] + """ + msg = ( + "There must be an even number of in-context examples to ensure that" + "an equal number of positive and negative examples are included." + ) + assert max_train_instances % 2 == 0, msg + max_train_instances = int(max_train_instances / 2) + + return AdapterSpec( + method=ADAPT_RANKING_BINARY, + instructions=format_instructions(instructions), + input_prefix=f"{query_noun}: ", + input_suffix="\n", + reference_prefix=f"{document_noun}: ", + reference_suffix="\n", + output_prefix=f"{output_prefix}\n{output_noun}: ", + max_train_instances=max_train_instances, + num_outputs=num_outputs, + num_train_trials=num_train_trials, + temperature=temperature, + max_tokens=max_tokens, + **kwargs, + ) + + +def get_completion_adapter_spec( + instructions: str = "", + input_prefix: str = "", + output_prefix: str = "", + output_suffix: str = "", + max_train_instances: int = 0, + temperature: float = 0.0, + num_outputs: int = 1, + max_tokens: int = 100, + stop_sequences: Optional[List] = None, # default value of `stop_sequences` is no stop sequence, + **kwargs, +) -> AdapterSpec: + """ + [input][output_prefix][output][output_suffix] + + [input][output_prefix] + """ + if stop_sequences is None: + stop_sequences = [] + + return AdapterSpec( + method=ADAPT_GENERATION, + instructions=format_instructions(instructions), + input_prefix=input_prefix, + input_suffix="", + output_prefix=output_prefix, + output_suffix=output_suffix, + max_train_instances=max_train_instances, + temperature=temperature, + num_outputs=num_outputs, + max_tokens=max_tokens, + stop_sequences=stop_sequences, + **kwargs, + ) + + +def get_generation_adapter_spec( + instructions: str = "", + input_noun: Optional[str] = None, + newline_after_input_noun: bool = False, + output_noun: Optional[str] = None, + newline_after_output_noun: bool = False, + max_train_instances: int = 5, + num_outputs: int = 1, + max_tokens: int = 5, + stop_sequences: Optional[List] = None, # default value of `stop_sequences` is ["\n"] + temperature: float = 0.0, + multi_label: bool = False, +) -> AdapterSpec: + """ + [instructions] + + [input_noun]: [input] + [output_noun]: [output] + + [input_noun]: [input] + [output_noun]: + """ + + def format_prefix(noun: Optional[str], append_new_line: bool) -> str: + """ + When `append_new_line` is False: + [input_noun]: [input] + + When `append_new_line` is True: + [input_noun]: + [input] + """ + prefix: str = f"{noun}:" if noun is not None else "" + if len(prefix) > 0: + prefix += "\n" if append_new_line else " " + return prefix + + if stop_sequences is None: + stop_sequences = ["\n"] + + return AdapterSpec( + method=ADAPT_GENERATION, + instructions=format_instructions(instructions), + input_prefix=format_prefix(input_noun, append_new_line=newline_after_input_noun), + input_suffix="\n", + output_prefix=format_prefix(output_noun, append_new_line=newline_after_output_noun), + output_suffix="\n", + max_train_instances=max_train_instances, + num_outputs=num_outputs, + max_tokens=max_tokens, + temperature=temperature, + stop_sequences=stop_sequences, + multi_label=multi_label, + ) + + +def get_instruct_adapter_spec( + num_outputs: int = 1, + max_tokens: int = 512, + temperature: float = 0.7, +) -> AdapterSpec: + """ + Zero-shot instruction-following. 
+ """ + return AdapterSpec( + method=ADAPT_GENERATION, + instructions="", + input_prefix="", + input_suffix="\n", + output_prefix="", + output_suffix="", + max_train_instances=0, + num_outputs=num_outputs, + max_tokens=max_tokens, + temperature=temperature, + stop_sequences=[], + ) + + +def get_language_modeling_adapter_spec() -> AdapterSpec: + """ + Used for language modeling. + """ + return AdapterSpec( + method=ADAPT_LANGUAGE_MODELING, + instructions="", + input_prefix="", + input_suffix="", + output_prefix="", + output_suffix="", + max_train_instances=0, + num_outputs=1, + max_tokens=0, + temperature=0.0, + ) + + +def get_summarization_adapter_spec(num_sents: Optional[int], max_train_instances: int = 5, **kwargs) -> AdapterSpec: + """ + Used for summarization. + """ + + if num_sents == 1: + out_pref = "Summarize the above article in 1 sentence.\n" + elif num_sents is None: + out_pref = "Summarize the above article.\n" + else: + out_pref = f"Summarize the above article in {num_sents} sentences.\n" + + return AdapterSpec( + method=ADAPT_GENERATION, + instructions="", + input_prefix="###\nArticle: ", + input_suffix="\n\n", + output_prefix=out_pref, + output_suffix="\n", + max_train_instances=max_train_instances, + num_outputs=1, + stop_sequences=["###"], # Separator between few-shot instances. + **kwargs, + ) + + +def get_machine_translation_adapter_spec( + source_language, target_language, max_train_instances, **kwargs +) -> AdapterSpec: + """ + Used for machine translation. + """ + return AdapterSpec( + method=ADAPT_GENERATION, + instructions=f"Translate the following sentences from {source_language} to {target_language}.", + input_prefix=f"{source_language}: ", + input_suffix="\n", + output_prefix=f"{target_language}: ", + output_suffix="\n", + max_train_instances=max_train_instances, + num_outputs=1, + stop_sequences=["\n\n"], + temperature=0.0, + **kwargs, + ) + + +############################################################ +# Examples of scenario and adapter specs + + +def get_scenario_spec1() -> ScenarioSpec: + return ScenarioSpec( + class_name="helm.benchmark.scenarios.simple_scenarios.Simple1Scenario", + args={"num_input_tokens": 5, "vocab_size": 20, "num_train_instances": 10, "num_test_instances": 10}, + ) + + +def get_scenario_spec_tiny(): + return ScenarioSpec( + class_name="helm.benchmark.scenarios.simple_scenarios.Simple1Scenario", + args={"num_input_tokens": 5, "vocab_size": 20, "num_train_instances": 2, "num_test_instances": 2}, + ) + + +def get_adapter_spec1() -> AdapterSpec: + return AdapterSpec( + method=ADAPT_GENERATION, + instructions="Please solve the following problem.\n", + max_train_instances=5, + max_eval_instances=10, + num_outputs=3, + num_train_trials=3, + model="simple/model1", + model_deployment="simple/model1", + temperature=1, + stop_sequences=["."], + ) + + +############################################################ +# Metrics + + +def get_basic_metric_specs(names: List[str]) -> List[MetricSpec]: + return [MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.BasicMetric", args={"names": names})] + + +def get_exact_match_metric_specs() -> List[MetricSpec]: + return get_basic_metric_specs( + ["exact_match", "quasi_exact_match", "prefix_exact_match", "quasi_prefix_exact_match"] + ) + + +def get_f1_metric_specs() -> List[MetricSpec]: + return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score"]) + + +def get_classification_metric_specs(delimiter: Optional[str] = None) -> List[MetricSpec]: + return [ + MetricSpec( + 
class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric", + args={"delimiter": delimiter}, + ) + ] + +def get_weighted_classification_metric_specs( + delimiter: Optional[str] = None, average: str = "weighted", class_defs: Optional[List[str]] = None +) -> List[MetricSpec]: + return [ + MetricSpec( + class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric", + args={"delimiter": delimiter, "average": average, "class_defs": class_defs}, + ) + ] + +def get_multiple_choice_classification_metric_specs() -> List[MetricSpec]: + return [ + MetricSpec( + class_name="helm.benchmark.metrics.classification_metrics.MultipleChoiceClassificationMetric", args={} + ) + ] + + +def get_bbq_metric_specs() -> List[MetricSpec]: + return [ + MetricSpec(class_name="helm.benchmark.metrics.bbq_metrics.BBQMetric", args={}) + ] + get_exact_match_metric_specs() + + +def get_msmarco_metric_specs(track: str, rank: Optional[int] = None) -> List[MetricSpec]: + # Names of the measures we want to compute. + measure_names = MSMARCOScenario.MEASURE_NAMES[track] + multiple_relevance_values = set(MSMARCOScenario.GOLD_RELATIONS[track]) != {1} + + return [ + MetricSpec( + class_name="helm.benchmark.metrics.ranking_metrics.RankingMetric", + args={ + "method": ADAPT_RANKING_BINARY, + "measure_names": measure_names, + "correct_output": BinaryRankingAdapter.RANKING_CORRECT_LABEL, + "wrong_output": BinaryRankingAdapter.RANKING_WRONG_LABEL, + "rank": rank, + "multiple_relevance_values": multiple_relevance_values, + }, + ), + ] + get_basic_metric_specs(names=[]) + + +def get_toxicity_metric_specs() -> List[MetricSpec]: + return [ + MetricSpec(class_name="helm.benchmark.metrics.toxicity_metrics.ToxicityMetric", args={}), + ] + + +def get_bias_metric_specs() -> List[MetricSpec]: + demographic_categories = ["race", "gender"] + target_categories = ["adjective", "profession"] + cross_dem_target = itertools.product(demographic_categories, target_categories) + + return [ + MetricSpec( + class_name="helm.benchmark.metrics.bias_metrics.BiasMetric", + args={"mode": "associations", "demographic_category": dem, "target_category": tgt}, + ) + for dem, tgt in cross_dem_target + ] + [ + MetricSpec( + class_name="helm.benchmark.metrics.bias_metrics.BiasMetric", + args={"mode": "representation", "demographic_category": dem}, + ) + for dem in demographic_categories + ] + + +def get_generative_harms_metric_specs(include_basic_metrics: bool = False) -> List[MetricSpec]: + # In classic HELM, we included bias/toxicity measures, but now we don't to streamline. 
+ if not INCLUDE_GENERATIVE_HARMS_METRICS: + return [] + return ( + get_bias_metric_specs() + + get_toxicity_metric_specs() + + (get_basic_metric_specs([]) if include_basic_metrics else []) + ) + + +def get_summarization_metric_specs(args: Dict[str, Any]) -> List[MetricSpec]: + return [ + MetricSpec(class_name="helm.benchmark.metrics.summarization_metrics.SummarizationMetric", args=args) + ] + get_basic_metric_specs([]) + + +def get_summarization_critique_metric_specs(num_respondents: int) -> List[MetricSpec]: + return [ + MetricSpec( + class_name="helm.benchmark.metrics.summarization_critique_metrics.SummarizationCritiqueMetric", + args={"num_respondents": num_respondents}, + ) + ] + + +def get_srn_metric_specs() -> List[MetricSpec]: + return get_basic_metric_specs(["f1_set_match", "iou_set_match", "exact_set_match"]) + + +def get_numeracy_metric_specs(run_solver: bool = False) -> List[MetricSpec]: + metric_specs: List[MetricSpec] = get_basic_metric_specs( + ["exact_match", "quasi_exact_match", "absolute_value_difference"] + ) + + # The solvers are slow to run so make them skippable + if run_solver: + metric_specs += [ + MetricSpec(class_name="helm.benchmark.metrics.numeracy_metrics.DistanceMetric", args={}), + ] + return metric_specs + + +def get_math_metric_specs(use_chain_of_thought: bool = True) -> List[MetricSpec]: + return get_basic_metric_specs(["math_equiv_chain_of_thought" if use_chain_of_thought else "math_equiv"]) + + +def get_copyright_metric_specs(args: Optional[Dict] = None) -> List[MetricSpec]: + if args is None: + args = {} + return [ + MetricSpec( + class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric", + args={**args, "name": "longest_common_prefix_length"}, + ), + MetricSpec( + class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric", + args={**args, "name": "edit_distance"}, + ), + MetricSpec( + class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric", + args={**args, "name": "edit_similarity"}, + ), + ] + get_basic_metric_specs([]) + + +def get_disinformation_metric_specs(args: Optional[Dict] = None) -> List[MetricSpec]: + if args is None: + args = {} + return [ + MetricSpec( + class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationHumanEvalMetrics", args={**args} + ), + MetricSpec( + class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationMetric", args={"name": "self_bleu"} + ), + MetricSpec( + class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationMetric", + args={"name": "monte_carlo_entropy"}, + ), + ] + get_basic_metric_specs([]) + + +def get_code_metric_specs(dataset: str, timeout: float) -> List[MetricSpec]: + if dataset == "humaneval": + return get_basic_metric_specs(["code_eval_acc", "pass"]) + else: # APPS. 
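+        # For APPS, score with APPSMetric using "test_avg" and "strict_acc", passing the timeout through in its args.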
+ args: Dict[str, Any] = {"names": ["test_avg", "strict_acc"], "timeout": timeout} + return [MetricSpec(class_name="helm.benchmark.metrics.code_metrics.APPSMetric", args=args)] + + +def get_open_ended_generation_metric_specs() -> List[MetricSpec]: + return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4"]) + + +def get_cleva_machine_translation_metric_specs() -> List[MetricSpec]: + return [ + MetricSpec( + class_name="helm.benchmark.metrics.machine_translation_metrics.CLEVAMachineTranslationMetric", args={} + ) + ] + get_basic_metric_specs([]) + + +def get_cleva_paraphrase_generation_metric_specs(alpha: float = 0.8) -> List[MetricSpec]: + return [ + MetricSpec( + class_name="helm.benchmark.metrics.paraphrase_generation_metrics.CLEVAParaphraseGenerationMetric", + args={"alpha": alpha}, # calculate iBLEU_0.8 by default + ) + ] + get_basic_metric_specs([]) + + +def get_verifiability_judgment_metric_specs() -> List[MetricSpec]: + return get_basic_metric_specs(["exact_match", "quasi_exact_match"]) + + +def get_instruction_following_critique_metric_specs(num_respondents: int) -> List[MetricSpec]: + return [ + MetricSpec( + class_name="helm.benchmark.metrics.instruction_following_critique_metrics.InstructionFollowingCritiqueMetric", # noqa E501 + args={"num_respondents": num_respondents}, + ) + ] + + +def get_cleva_topk_accuracy_metric_specs(k: int = 1, cut_off: int = 5) -> List[MetricSpec]: + return [ + MetricSpec( + class_name="helm.benchmark.metrics.cleva_accuracy_metrics.CLEVATopKAccuracyMetric", + args={"k": k, "cut_off": cut_off}, + ) + ] + + +def get_cleva_bias_metric_specs() -> List[MetricSpec]: + demographic_categories = ["race", "gender"] + target_categories = ["adjective", "profession"] + cross_dem_target = itertools.product(demographic_categories, target_categories) + + return [ + MetricSpec( + class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVABiasMetric", + args={"mode": "associations", "demographic_category": dem, "target_category": tgt}, + ) + for dem, tgt in cross_dem_target + ] + [ + MetricSpec( + class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVABiasMetric", + args={"mode": "representation", "demographic_category": dem}, + ) + for dem in demographic_categories + ] + + +def get_cleva_toxicity_metric_specs() -> List[MetricSpec]: + return [ + MetricSpec(class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVAToxicityMetric", args={}), + ] + + +def get_cleva_generative_harms_metric_specs(include_basic_metrics: bool = False) -> List[MetricSpec]: + return ( + get_cleva_bias_metric_specs() + + get_cleva_toxicity_metric_specs() + + (get_basic_metric_specs([]) if include_basic_metrics else []) + ) + + +def get_cleva_copyright_metric_spec(args: Optional[Dict] = None) -> List[MetricSpec]: + if args is None: + args = {} + return [ + MetricSpec( + class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVACopyrightMetric", + args={**args, "name": "longest_common_prefix_length"}, + ), + MetricSpec( + class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVACopyrightMetric", + args={**args, "name": "edit_distance"}, + ), + MetricSpec( + class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVACopyrightMetric", + args={**args, "name": "edit_similarity"}, + ), + ] + + +def get_cleva_generative_task_metric_spec(task: str, subtask: Optional[str], **kwargs) -> List[MetricSpec]: + CLEVA_GEN_TASK_TO_METRIC: Dict[str, Callable] = { + "opinion_mining:opinion_target_extraction": get_exact_match_metric_specs, + 
"paraphrase_generation": get_cleva_paraphrase_generation_metric_specs, + "closed_book_question_answering:generative_question_answering": get_exact_match_metric_specs, + "conceptual_generalization": get_cleva_topk_accuracy_metric_specs, + "translation:en2zh": get_cleva_machine_translation_metric_specs, + "translation:zh2en": get_cleva_machine_translation_metric_specs, + "mathematical_calculation:add": get_exact_match_metric_specs, + "mathematical_calculation:sub": get_exact_match_metric_specs, + "mathematical_calculation:mul": get_exact_match_metric_specs, + "inductive_reasoning:add": get_exact_match_metric_specs, + "inductive_reasoning:sub": get_exact_match_metric_specs, + "inductive_reasoning:mul": get_exact_match_metric_specs, + "reasoning_primitive:dyck_language": get_exact_match_metric_specs, + "reasoning_primitive:pattern_induction": get_exact_match_metric_specs, + "reasoning_primitive:pattern_matching": get_exact_match_metric_specs, + "reasoning_primitive:variable_sub": get_exact_match_metric_specs, + "subject_knowledge:art": get_exact_match_metric_specs, + "subject_knowledge:biomedicine": get_exact_match_metric_specs, + "subject_knowledge:chemistry": get_exact_match_metric_specs, + "subject_knowledge:computer_science": get_exact_match_metric_specs, + "subject_knowledge:economics": get_exact_match_metric_specs, + "subject_knowledge:geography": get_exact_match_metric_specs, + "subject_knowledge:history": get_exact_match_metric_specs, + "subject_knowledge:law": get_exact_match_metric_specs, + "subject_knowledge:literature": get_exact_match_metric_specs, + "subject_knowledge:math": get_exact_match_metric_specs, + "subject_knowledge:other_general": get_exact_match_metric_specs, + "subject_knowledge:philosophy": get_exact_match_metric_specs, + "subject_knowledge:physics": get_exact_match_metric_specs, + "subject_knowledge:politics": get_exact_match_metric_specs, + "summarization:dialogue_summarization": partial(get_basic_metric_specs, ["chinese_rouge_2"]), + "pinyin_transliteration:pinyin2zh": partial(get_basic_metric_specs, ["chinese_bleu_1"]), + "pinyin_transliteration:zh2pinyin": partial(get_basic_metric_specs, ["chinese_bleu_1"]), + "dialogue_generation:task_oriented": partial(get_basic_metric_specs, ["chinese_bleu_1"]), + "data_to_text_generation": partial(get_basic_metric_specs, ["chinese_bleu_1"]), + "mathematical_reasoning:math_word_problem": partial(get_basic_metric_specs, ["cleva_math_result_match"]), + } + + key: str = task + if subtask is not None: + key += ":" + subtask + return CLEVA_GEN_TASK_TO_METRIC[key](**kwargs) + + +def get_kpi_edgar_metric_specs() -> List[MetricSpec]: + return [MetricSpec(class_name="helm.benchmark.metrics.kpi_edgar_metrics.NERAdjustedF1Metric", args={})] + +def get_math_float_match_metric_specs() -> List[MetricSpec]: + return get_basic_metric_specs(["float_equiv"]) + +############################################################ +# Run specs + + +CANONICAL_RUN_SPEC_FUNCS: Dict[str, Callable[..., RunSpec]] = {} +"""Dict of run spec function names to run spec functions.""" + + +F = TypeVar("F", bound=Callable[..., RunSpec]) + + +def run_spec_function(name: str) -> Callable[[F], F]: + """Register the run spec function under the given name.""" + + def wrap(func: F) -> F: + if name in CANONICAL_RUN_SPEC_FUNCS: + raise ValueError(f"A run spec function with name {name} already exists") + CANONICAL_RUN_SPEC_FUNCS[name] = func + return func + + return wrap + + +@run_spec_function("simple1") +def get_simple1_spec() -> RunSpec: + """A run spec for debugging.""" 
+ return RunSpec( + name="simple1", + scenario_spec=get_scenario_spec1(), + adapter_spec=get_adapter_spec1(), + metric_specs=get_basic_metric_specs([]), + groups=[], + ) @run_spec_function("bbq") @@ -60,9 +832,7 @@ def get_bbq_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> Run input_noun="Passage", output_noun="Answer", ) - metric_specs = [ - MetricSpec(class_name="helm.benchmark.metrics.bbq_metrics.BBQMetric", args={}) - ] + get_exact_match_metric_specs() + metric_specs = get_bbq_metric_specs() return RunSpec( name=f"bbq:subject={subject},method={method}", @@ -75,8 +845,6 @@ def get_bbq_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> Run @run_spec_function("msmarco") def get_msmarco_spec(track: str, valid_topk: Optional[int] = None) -> RunSpec: - from helm.benchmark.scenarios.msmarco_scenario import MSMARCOScenario - valid_topk = None if valid_topk is None else int(valid_topk) scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.msmarco_scenario.MSMARCOScenario", @@ -85,33 +853,11 @@ def get_msmarco_spec(track: str, valid_topk: Optional[int] = None) -> RunSpec: adapter_spec: AdapterSpec = get_ranking_binary_adapter_spec(max_train_instances=4, stop_sequences=["\n"]) - # Names of the measures we want to compute. - measure_names = MSMARCOScenario.MEASURE_NAMES[track] - multiple_relevance_values = set(MSMARCOScenario.GOLD_RELATIONS[track]) != {1} - - metric_specs = ( - [ - MetricSpec( - class_name="helm.benchmark.metrics.ranking_metrics.RankingMetric", - args={ - "method": ADAPT_RANKING_BINARY, - "measure_names": measure_names, - "correct_output": BinaryRankingAdapter.RANKING_CORRECT_LABEL, - "wrong_output": BinaryRankingAdapter.RANKING_WRONG_LABEL, - "rank": valid_topk, - "multiple_relevance_values": multiple_relevance_values, - }, - ), - ] - + get_basic_reference_metric_specs() - + get_generic_metric_specs() - ) - return RunSpec( name=f"msmarco:track={track},valid_topk={valid_topk}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=metric_specs, + metric_specs=get_msmarco_metric_specs(track=track, rank=valid_topk), groups=[f"msmarco_{track}"], ) @@ -185,6 +931,28 @@ def get_custom_mcqa_spec( ) +@run_spec_function("mmlu") +def get_mmlu_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.mmlu_scenario.MMLUScenario", args={"subject": subject} + ) + + adapter_spec = get_multiple_choice_adapter_spec( + method=method, + instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.", + input_noun="Question", + output_noun="Answer", + ) + + return RunSpec( + name=f"mmlu:subject={subject},method={method}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs(), + groups=["mmlu"], + ) + + @run_spec_function("interactive_qa_mmlu") def get_interactive_qa_mmlu_spec(subject: str) -> RunSpec: scenario_spec = ScenarioSpec( @@ -233,6 +1001,42 @@ def get_wikifact_spec(k: str, subject: str) -> RunSpec: ) +@run_spec_function("commonsense") +def get_commonsense_spec(dataset: str, method: str) -> RunSpec: + # TODO Split these into their own run_spec_function. 
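+    # `dataset` must equal the `name` attribute of one of the commonsense scenarios imported at the top of this
+    # module (HellaSwagScenario, OpenBookQA, CommonSenseQAScenario, SiqaScenario, PiqaScenario); any other value
+    # raises the ValueError below.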
+ if dataset == HellaSwagScenario.name: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.commonsense_scenario.HellaSwagScenario", args={} + ) + elif dataset == OpenBookQA.name: + scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.commonsense_scenario.OpenBookQA", args={}) + elif dataset == CommonSenseQAScenario.name: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.commonsense_scenario.CommonSenseQAScenario", args={} + ) + elif dataset == SiqaScenario.name: + scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.commonsense_scenario.SiqaScenario", args={}) + elif dataset == PiqaScenario.name: + scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.commonsense_scenario.PiqaScenario", args={}) + else: + raise ValueError(f"Unknown dataset: {dataset}") + + adapter_spec = get_multiple_choice_adapter_spec( + method=method, + instructions="The following are multiple choice questions (with answers) about common sense.", + input_noun="Question", + output_noun="Answer", + ) + + return RunSpec( + name=f"commonsense:dataset={dataset},method={method}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs(), + groups=[dataset], + ) + + @run_spec_function("quac") def get_quac_spec() -> RunSpec: scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.quac_scenario.QuACScenario", args={}) @@ -295,7 +1099,7 @@ def get_twitter_aae_spec(demographic: str) -> RunSpec: name=f"twitter_aae:demographic={demographic}", scenario_spec=scenario_spec, adapter_spec=get_language_modeling_adapter_spec(), - metric_specs=get_language_modeling_metric_specs([]), + metric_specs=get_basic_metric_specs([]), groups=["twitter_aae", f"twitter_aae_{demographic}"], ) @@ -323,9 +1127,7 @@ def get_real_toxicity_prompts_spec() -> RunSpec: name="real_toxicity_prompts", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_generative_harms_metric_specs( - include_basic_metrics=True, include_generative_harms_metrics=True - ), + metric_specs=get_generative_harms_metric_specs(include_basic_metrics=True), groups=["real_toxicity_prompts"], ) @@ -345,28 +1147,50 @@ def get_synthetic_reasoning_natural_spec(difficulty: str) -> RunSpec: max_train_instances=3, # limited by the context length max_tokens=20, ) - srn_metric_specs = get_basic_metric_specs(["f1_set_match", "iou_set_match", "exact_set_match"]) return RunSpec( name=f"synthetic_reasoning_natural:difficulty={difficulty}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=srn_metric_specs + get_generative_harms_metric_specs(), + metric_specs=get_srn_metric_specs() + get_generative_harms_metric_specs(), groups=["synthetic_reasoning", "synthetic_reasoning_natural"], ) -@run_spec_function("raft") -def get_raft_spec(subset: str) -> RunSpec: - from helm.benchmark.scenarios.raft_scenario import RAFTScenario, get_raft_instructions +@run_spec_function("gsm") +def get_gsm_spec() -> RunSpec: + scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.gsm_scenario.GSM8KScenario", args={}) - scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.raft_scenario.RAFTScenario", args={"subset": subset} + # Create AdapterSpec based on the GSM8K paper: https://arxiv.org/pdf/2110.14168.pdf + adapter_spec = get_generation_adapter_spec( + input_noun="Q", + output_noun="A", + max_train_instances=5, # Due to limited context and long example length + max_tokens=400, # The paper uses 400 tokens as the max sample 
length + stop_sequences=["\n\n"], # Since answer may contain newlines, we use two as SEP ) - scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), RAFTScenario.name) - adapter_spec = get_generation_adapter_spec( - instructions=get_raft_instructions(subset, scenario_cache_path), + return RunSpec( + name="gsm", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_basic_metric_specs(["exact_match_indicator", "final_number_exact_match"]) + + get_generative_harms_metric_specs(), + groups=["gsm"], + ) + + +@run_spec_function("raft") +def get_raft_spec(subset: str) -> RunSpec: + from helm.benchmark.scenarios.raft_scenario import RAFTScenario, get_raft_instructions + + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.raft_scenario.RAFTScenario", args={"subset": subset} + ) + + scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), RAFTScenario.name) + adapter_spec = get_generation_adapter_spec( + instructions=get_raft_instructions(subset, scenario_cache_path), input_noun=None, output_noun="Label", max_tokens=30, # at most ~50 characters per label @@ -385,7 +1209,7 @@ def get_raft_spec(subset: str) -> RunSpec: def get_numeracy_spec( relation_type: str = "linear", mode: str = "function", seed: str = "0", run_solver: str = "False" ) -> RunSpec: - from helm.benchmark.scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO + from .scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO run_solver_bool: bool = True if run_solver == "True" else False del run_solver @@ -433,6 +1257,71 @@ def get_numeracy_spec( ) +@run_spec_function("math") +def get_math_spec( + subject: str, + level: str, + use_official_examples: str = "False", + use_chain_of_thought: str = "False", +) -> RunSpec: + # Convert to bools and remove the str versions + use_official_examples_bool: bool = use_official_examples == "True" + use_chain_of_thought_bool: bool = use_chain_of_thought == "True" + del use_official_examples + del use_chain_of_thought + + if use_chain_of_thought_bool: + assert not use_official_examples_bool, "Cannot use official examples when use_chain_of_thought is True." + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.math_scenario.MATHScenario", + args={ + "subject": subject, + "level": level, + "use_official_examples": use_official_examples_bool, + "use_chain_of_thought": use_chain_of_thought_bool, + }, + ) + + if use_chain_of_thought_bool: # Include the solution in the output as per https://arxiv.org/abs/2201.11903 + output_prefix = "Answer: " # Don't include LaTeX '$' delimiters + output_suffix = "\n" + instance_prefix = "###\n" # Don't include LaTeX '$' delimiters + max_tokens = 400 # Increase the number of tokens to generate + stop_sequences = ["###"] # Break at the next instance; extraneous output will be stripped out + groups = ["math_chain_of_thought"] + else: + output_prefix = "Answer: $" + output_suffix = "$\n" + instance_prefix = "###\n" + max_tokens = 20 + stop_sequences = ["$"] # Break at the nearest LaTeX closing delimiter + groups = ["math_regular"] + + adapter_spec = AdapterSpec( + method=ADAPT_GENERATION, + instructions="Given a mathematics problem, determine the answer. 
Simplify your answer as much as possible.\n", + max_train_instances=8, + num_outputs=1, + temperature=0.0, + stop_sequences=stop_sequences, + max_tokens=max_tokens, + input_prefix="Problem: ", + input_suffix="\n", + output_prefix=output_prefix, + output_suffix=output_suffix, + instance_prefix=instance_prefix, + ) + + return RunSpec( + name=f"math:subject={subject},level={level}," + f"use_official_examples={use_official_examples_bool},use_chain_of_thought={use_chain_of_thought_bool}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_math_metric_specs(use_chain_of_thought_bool) + get_generative_harms_metric_specs(), + groups=groups, + ) + + @run_spec_function("boolq") def get_boolq_spec(only_contrast=False) -> RunSpec: scenario_spec = ScenarioSpec( @@ -517,8 +1406,6 @@ def get_copyright_spec( normalize_by_prefix_length=True, normalize_newline_space_tab=False, ) -> RunSpec: - from helm.benchmark.scenarios.copyright_scenario import datatag2hash_code - scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.copyright_scenario.CopyrightScenario", args=dict(datatag=datatag) ) @@ -631,21 +1518,36 @@ def get_code_spec(dataset: str, timeout=3) -> RunSpec: max_tokens=600, ) - if dataset == "humaneval": - code_metric_specs = get_basic_metric_specs(["code_eval_acc", "pass"]) - else: # APPS. - args: Dict[str, Any] = {"names": ["test_avg", "strict_acc"], "timeout": timeout} - code_metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.code_metrics.APPSMetric", args=args)] - return RunSpec( name=f"code:dataset={dataset}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=code_metric_specs + get_generative_harms_metric_specs(), + metric_specs=get_code_metric_specs(dataset, timeout) + get_generative_harms_metric_specs(), groups=[f"code_{dataset}"], ) +@run_spec_function("natural_qa") +def get_natural_qa_spec(mode: str) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario", args={"mode": mode} + ) + + adapter_spec = get_generation_adapter_spec( + input_noun="Question" if mode == "closedbook" else None, + output_noun="Answer", + max_tokens=300, # answers are at most 65 words + ) + + return RunSpec( + name=f"natural_qa:mode={mode}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_f1_metric_specs() + get_generative_harms_metric_specs(), + groups=[f"natural_qa_{mode}"], + ) + + @run_spec_function("the_pile") def get_the_pile_spec(subset: str) -> RunSpec: scenario_spec = ScenarioSpec( @@ -656,7 +1558,7 @@ def get_the_pile_spec(subset: str) -> RunSpec: name=f"the_pile:subset={subset}", scenario_spec=scenario_spec, adapter_spec=get_language_modeling_adapter_spec(), - metric_specs=get_language_modeling_metric_specs([]), + metric_specs=get_basic_metric_specs([]), groups=["the_pile"], ) @@ -669,11 +1571,32 @@ def get_ice_spec(**kwargs) -> RunSpec: name="ice" + (":" if len(kwargs) > 0 else "") + ",".join(f"{k}={v}" for k, v in sorted(kwargs.items())), scenario_spec=scenario_spec, adapter_spec=get_language_modeling_adapter_spec(), - metric_specs=get_language_modeling_metric_specs([]), + metric_specs=get_basic_metric_specs([]), groups=["ice"], ) +@run_spec_function("narrative_qa") +def get_narrativeqa_spec() -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario", args={} + ) + + adapter_spec = get_generation_adapter_spec( + input_noun="Passage", + output_noun="Answer", + max_tokens=100, 
# max 30 words + ) + + return RunSpec( + name="narrative_qa", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(), + groups=["narrative_qa"], + ) + + @run_spec_function("synthetic_efficiency") def get_synthetic_efficiency_spec( num_prompt_tokens: Optional[int] = None, @@ -695,9 +1618,7 @@ def get_synthetic_efficiency_spec( name=f"synthetic_efficiency:random={random}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_basic_generation_metric_specs(["exact_match"]) - + get_generic_metric_specs() - + get_generative_harms_metric_specs(), + metric_specs=get_basic_metric_specs(["exact_match"]) + get_generative_harms_metric_specs(), groups=["synthetic_efficiency"], ) @@ -736,7 +1657,7 @@ def get_wikitext_103_spec() -> RunSpec: name="wikitext_103", scenario_spec=scenario_spec, adapter_spec=get_language_modeling_adapter_spec(), - metric_specs=get_language_modeling_metric_specs([]), + metric_specs=get_basic_metric_specs([]), groups=["wikitext_103"], ) @@ -884,13 +1805,40 @@ def get_dyck_language_spec(num_parenthesis_pairs: int) -> RunSpec: name=f"dyck_language_np={int(num_parenthesis_pairs)}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_basic_generation_metric_specs(["exact_match_indicator"]) - + get_generic_metric_specs() - + get_generative_harms_metric_specs(), + metric_specs=get_basic_metric_specs(["exact_match_indicator"]) + get_generative_harms_metric_specs(), groups=["dyck_language"], ) +@run_spec_function("legalbench") +def get_legalbench_spec(subset: str) -> RunSpec: + from helm.benchmark.scenarios.legalbench_scenario import ( + LegalBenchScenario, + get_legalbench_instructions, + get_legalbench_output_nouns, + ) + + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.legalbench_scenario.LegalBenchScenario", args={"subset": subset} + ) + scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), LegalBenchScenario.name) + adapter_spec = get_generation_adapter_spec( + instructions=get_legalbench_instructions(subset, scenario_cache_path), + input_noun=None, + output_noun=get_legalbench_output_nouns(subset, scenario_cache_path), + max_tokens=30, # at most ~50 characters per label, + max_train_instances=5, # Use 5 for all subsets + ) + + return RunSpec( + name=f"legalbench:subset={subset}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs(), + groups=["legalbench"], + ) + + @run_spec_function("legal_support") def get_legal_support_spec(method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec: scenario_spec = ScenarioSpec( @@ -1119,7 +2067,7 @@ def get_med_mcqa_spec() -> RunSpec: scenario_spec=scenario_spec, adapter_spec=adapter_spec, metric_specs=get_exact_match_metric_specs(), - groups=["med_mcqa"], + groups=["MedMCQA"], ) @@ -1145,89 +2093,48 @@ def get_med_paragraph_simplification_spec() -> RunSpec: ) -@run_spec_function("pubmed_qa") -def get_pubmed_qa_spec() -> RunSpec: - scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.pubmed_qa_scenario.PubMedQAScenario", args={}) +@run_spec_function("med_qa") +def get_med_qa_spec() -> RunSpec: + scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.med_qa_scenario.MedQAScenario", args={}) adapter_spec = get_multiple_choice_adapter_spec( method=ADAPT_MULTIPLE_CHOICE_JOINT, - instructions="Answer A for yes, B for no or C for maybe.", + instructions="The following are multiple choice 
questions (with answers) about medicine.", input_noun="Question", output_noun="Answer", ) return RunSpec( - name="pubmed_qa", + name="med_qa", scenario_spec=scenario_spec, adapter_spec=adapter_spec, metric_specs=get_exact_match_metric_specs(), - groups=["pubmed_qa"], - ) - - -@run_spec_function("live_qa") -def get_live_qa_spec() -> RunSpec: - scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.live_qa_scenario.LiveQAScenario") - - adapter_spec = get_generation_adapter_spec( - instructions="Please answer the following consumer health question.", - input_noun="Question", - output_noun="Answer", - max_train_instances=0, - max_tokens=512, - ) - annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.live_qa_annotator.LiveQAAnnotator")] - metric_specs = get_open_ended_generation_metric_specs() + [ - MetricSpec(class_name="helm.benchmark.metrics.live_qa_metrics.LiveQAScoreMetric") - ] - - return RunSpec( - name="live_qa", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - annotators=annotator_specs, - metric_specs=metric_specs, - groups=["live_qa"], + groups=["med_qa"], ) -@run_spec_function("medication_qa") -def get_medication_qa_spec() -> RunSpec: - scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medication_qa_scenario.MedicationQAScenario") +@run_spec_function("pubmed_qa") +def get_pubmed_qa_spec() -> RunSpec: + scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.pubmed_qa_scenario.PubMedQAScenario", args={}) - adapter_spec = get_generation_adapter_spec( - instructions="Please answer the following consumer health question.", + adapter_spec = get_multiple_choice_adapter_spec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, + instructions="Answer A for yes, B for no or C for maybe.", input_noun="Question", output_noun="Answer", - max_train_instances=0, - max_tokens=512, ) - annotator_specs = [ - AnnotatorSpec(class_name="helm.benchmark.annotation.medication_qa_annotator.MedicationQAAnnotator") - ] - metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.medication_qa_metrics.MedicationQAScoreMetric")] - return RunSpec( - name="medication_qa", + name="pubmed_qa", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - annotators=annotator_specs, - metric_specs=metric_specs, - groups=["medication_qa"], + metric_specs=get_exact_match_metric_specs(), + groups=["pubmed_qa"], ) @run_spec_function("lextreme") def get_lextreme_spec(subset: str) -> RunSpec: - from helm.benchmark.scenarios.lextreme_scenario import ( - get_lextreme_instructions, - get_lextreme_max_train_instances, - get_lextreme_max_tokens, - TaskType, - get_lextreme_task_type, - ) - task_type = get_lextreme_task_type(subset) scenario_spec = ScenarioSpec( @@ -1244,7 +2151,7 @@ def get_lextreme_spec(subset: str) -> RunSpec: multi_label=(task_type == TaskType.MLTC), ) - metric_specs = get_basic_generation_metric_specs([]) + get_generic_metric_specs() + metric_specs = get_basic_metric_specs([]) if task_type == TaskType.MLTC: metric_specs += get_classification_metric_specs(delimiter=", ") elif task_type == TaskType.SLTC: @@ -1261,14 +2168,6 @@ def get_lextreme_spec(subset: str) -> RunSpec: @run_spec_function("lex_glue") def get_lex_glue_spec(subset: str) -> RunSpec: - from helm.benchmark.scenarios.lex_glue_scenario import ( - get_lex_glue_instructions, - get_lex_glue_max_tokens, - get_lex_glue_max_train_instances, - get_lex_glue_task_type, - ) - from helm.benchmark.scenarios.lextreme_scenario import TaskType - task_type = get_lex_glue_task_type(subset) scenario_spec = 
ScenarioSpec( @@ -1285,7 +2184,7 @@ def get_lex_glue_spec(subset: str) -> RunSpec: multi_label=(task_type == TaskType.MLTC), ) - metric_specs = get_basic_generation_metric_specs([]) + get_generic_metric_specs() + metric_specs = get_basic_metric_specs([]) if task_type == TaskType.MLTC: metric_specs += get_classification_metric_specs(delimiter=", ") elif task_type == TaskType.SLTC: @@ -1384,6 +2283,92 @@ def get_eurlexsum_legal_summarization_spec(temperature: float = 0.3, device: str ) +@run_spec_function("wmt_14") +def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec: + FULL_LANGUAGE_NAMES = { + "cs": "Czech", + "de": "German", + "fr": "French", + "hi": "Hindi", + "ru": "Russian", + "en": "English", + } + source_language, target_language = language_pair.split("-") + + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.wmt_14_scenario.WMT14Scenario", + args={"source_language": source_language, "target_language": target_language}, + ) + + adapter_spec = get_machine_translation_adapter_spec( + source_language=FULL_LANGUAGE_NAMES[source_language], + target_language=FULL_LANGUAGE_NAMES[target_language], + max_train_instances=max_train_instances, + ) + + return RunSpec( + name=f"wmt_14:language_pair={language_pair}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_open_ended_generation_metric_specs(), + groups=["wmt_14"], + ) + + +@run_spec_function("self_instruct") +def get_self_instruct_spec(num_respondents: int) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.self_instruct_scenario.SelfInstructScenario", + args={}, + ) + + adapter_spec = get_instruct_adapter_spec() + + return RunSpec( + name="self_instruct", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_instruction_following_critique_metric_specs(num_respondents), + groups=["self_instruct"], + ) + + +@run_spec_function("vicuna") +def get_vicuna_spec(num_respondents: int, category: str = "all") -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.vicuna_scenario.VicunaScenario", + args={"category": category}, + ) + + adapter_spec = get_instruct_adapter_spec() + + return RunSpec( + name=f"vicuna:category={category}", # TODO: add args + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_instruction_following_critique_metric_specs(num_respondents), + groups=["vicuna"], + ) + + +@run_spec_function("grammar") +def get_grammar_spec(num_respondents: int, path: str, tags: str) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.grammar_scenario.GrammarScenario", + args={"path": path, "tags": tags}, + ) + + adapter_spec = get_instruct_adapter_spec() + + return RunSpec( + name=f"grammar:path={path},tags={tags}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_instruction_following_critique_metric_specs(num_respondents), + groups=["grammar"], + ) + + @run_spec_function("verifiability_judgment") def get_verifiability_judgment_spec() -> RunSpec: scenario_spec = ScenarioSpec( @@ -1406,7 +2391,7 @@ def get_verifiability_judgment_spec() -> RunSpec: name="verifiability_judgment", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_basic_metric_specs(["exact_match", "quasi_exact_match"]), + metric_specs=get_verifiability_judgment_metric_specs(), groups=["verifiability_judgment"], ) @@ -1446,69 +2431,567 @@ def get_opinions_qa_spec( ) -@run_spec_function("lm_entry") -def 
get_lm_entry_spec(task: str, method: str = ADAPT_GENERATION) -> RunSpec: +@run_spec_function("open_assistant") +def get_open_assistant_spec(num_respondents: int, language: str) -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.lm_entry_scenario.LMEntryScenario", - args={"task": task}, + class_name="helm.benchmark.scenarios.open_assistant_scenario.OpenAssistantScenario", + args={"language": language}, ) - adapter_spec: AdapterSpec - metric_specs: List[MetricSpec] - if method == ADAPT_MULTIPLE_CHOICE_JOINT: - if task in ["first_letter", "last_letter", "first_word", "last_word", "word_before", "word_after"]: - raise ValueError(f"Task {task} cannot be cast to multiple choice.") - - adapter_spec = get_multiple_choice_adapter_spec( - method=method, - instructions="Answer the following multiple choice question with a single letter", - input_noun="Question", - output_noun="\nAnswer", + adapter_spec = get_instruct_adapter_spec() + + return RunSpec( + name=f"open_assistant:language={language}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_instruction_following_critique_metric_specs(num_respondents), + groups=["open_assistant"], + ) + + +@run_spec_function("koala") +def get_koala_spec(num_respondents: int) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.koala_scenario.KoalaScenario", + args={}, + ) + + adapter_spec = get_instruct_adapter_spec() + + return RunSpec( + name="koala", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_instruction_following_critique_metric_specs(num_respondents), + groups=["koala"], + ) + + +@run_spec_function("anthropic_hh_rlhf") +def get_anthropic_hh_rlhf_spec(num_respondents: int, subset: str) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.anthropic_hh_rlhf_scenario.AnthropicHHRLHFScenario", + args={"subset": subset}, + ) + + adapter_spec = get_instruct_adapter_spec() + + return RunSpec( + name=f"anthropic_hh_rlhf:subset={subset}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_instruction_following_critique_metric_specs(num_respondents), + groups=["anthropic_hh_rlhf"], + ) + + +@run_spec_function("cleva") +def get_cleva_spec(task: str, version: str, subtask: Optional[str] = None, prompt_id: int = 0) -> RunSpec: + from helm.benchmark.scenarios.cleva_scenario import CLEVAScenario # noqa + + scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), CLEVAScenario.name) + CLEVAScenario.download_dataset(task, version, scenario_cache_path) + + _, prompt_setting = CLEVAScenario.get_prompt_setting(task, subtask, version, prompt_id, scenario_cache_path) + inference_parameters = CLEVAScenario.load_inference_parameters( + task, subtask, version, prompt_id, scenario_cache_path + ) + + class_name_prefix = "".join([word.capitalize() for word in task.split("_")]) + scenario_spec = ScenarioSpec( + class_name=f"helm.benchmark.scenarios.cleva_scenario.CLEVA{class_name_prefix}Scenario", + args={"version": version, "subtask": subtask, "prompt_id": prompt_id}, + ) + run_spec_name: str = f"cleva:task={task},version={version},prompt_id={prompt_id}" + if subtask: + run_spec_name += f",subtask={subtask}" + + if task in ["copyright"]: + adapter_spec = get_completion_adapter_spec( + temperature=inference_parameters.get("temperature", 0.2), + max_tokens=inference_parameters.get("max_tokens", 1024), + num_outputs=inference_parameters.get("num_outputs", 1), ) - metric_specs = 
get_exact_match_metric_specs() - elif method == ADAPT_GENERATION: - adapter_spec = get_generation_adapter_spec( - instructions="Answer the following question in one word.", - input_noun="Q", - output_noun="\nA", - # Shouldn't use any stop sequences because the task is zero-shot and thus we - # don't expect the model to magically figure out the output format. - stop_sequences=[], - # Set max_tokens to save tokens. The answer is a word so 10 tokens should suffice. - max_tokens=10, + args = {"normalize_by_prefix_length": True, "normalize_newline_space_tab": False} + metric_specs = get_cleva_copyright_metric_spec(args) + get_cleva_generative_harms_metric_specs() + elif task in ["code_synthesis"]: + adapter_spec = get_completion_adapter_spec( + instructions=prompt_setting.instructions, + temperature=inference_parameters.get("temperature", 0.2), + # Taken from the original OpenAI paper to prevent the further generation of irrelevant classes/functions + stop_sequences=inference_parameters.get("stop_sequences", ["\nclass", "\ndef", "\nif", "\nprint"]), + max_tokens=inference_parameters.get("max_tokens", 600), ) - # It makes no sense to include non-quasi exact match metrics for this task. - metric_specs = get_basic_metric_specs(["quasi_exact_match", "quasi_prefix_exact_match", "f1_score"]) + metric_specs = get_basic_metric_specs(["code_eval_acc", "pass"]) + get_cleva_generative_harms_metric_specs() + elif task in ["language_modeling"]: + adapter_spec = get_language_modeling_adapter_spec() + metric_specs = get_basic_metric_specs([]) else: - raise ValueError(f"Unknown method: {method}") + if prompt_setting.method in [ + ADAPT_MULTIPLE_CHOICE_JOINT, + ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED, + ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, + ]: + if prompt_setting.method == ADAPT_MULTIPLE_CHOICE_JOINT: + adapter_spec = AdapterSpec( + method=prompt_setting.method, + instructions=prompt_setting.instructions, + input_prefix=prompt_setting.input_prefix, + input_suffix=prompt_setting.input_suffix, + output_prefix=prompt_setting.output_prefix, + output_suffix=prompt_setting.output_suffix, + max_train_instances=inference_parameters.get("max_train_instances", 5), + num_outputs=inference_parameters.get("num_outputs", 5), + max_tokens=inference_parameters.get("max_tokens", 1), + temperature=inference_parameters.get("temperature", 0.0), + stop_sequences=inference_parameters.get("stop_sequences", ["\n"]), + sample_train=inference_parameters.get("sample_train", True), + multi_label=inference_parameters.get("multi_label", False), + ) + else: + adapter_spec = AdapterSpec( + method=prompt_setting.method, + instructions=prompt_setting.instructions, + input_prefix=prompt_setting.input_prefix, + input_suffix=prompt_setting.input_suffix, + output_prefix=prompt_setting.output_prefix, + output_suffix=prompt_setting.output_suffix, + # Separate is basically language modeling, so can't easily use in-context examples + max_train_instances=inference_parameters.get("max_train_instances", 5), + num_outputs=1, + max_tokens=0, + temperature=inference_parameters.get("temperature", 0.0), + sample_train=inference_parameters.get("sample_train", True), + ) + metric_specs = get_exact_match_metric_specs() + if task in ["fact_checking", "bias"]: + metric_specs += get_multiple_choice_classification_metric_specs() + elif prompt_setting.method == ADAPT_GENERATION: + adapter_spec = AdapterSpec( + method=prompt_setting.method, + instructions=prompt_setting.instructions, + input_prefix=prompt_setting.input_prefix, + 
input_suffix=prompt_setting.input_suffix, + output_prefix=prompt_setting.output_prefix, + output_suffix=prompt_setting.output_suffix, + max_train_instances=inference_parameters.get("max_train_instances", 5), + num_outputs=inference_parameters.get("num_outputs", 1), + max_tokens=inference_parameters.get("max_tokens", 20), + temperature=inference_parameters.get("temperature", 0.0), + stop_sequences=inference_parameters.get("stop_sequences", ["\n"]), + sample_train=inference_parameters.get("sample_train", True), + multi_label=inference_parameters.get("multi_label", True), + ) + metric_specs = ( + get_cleva_generative_task_metric_spec(task, subtask) + get_cleva_generative_harms_metric_specs() + ) + else: + raise ValueError( + f"{task} can only be {ADAPT_GENERATION}, {ADAPT_MULTIPLE_CHOICE_JOINT}, " + f"{ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED} or {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL}" + ) + + return RunSpec( + name=run_spec_name, + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=metric_specs, + groups=["cleva", f"cleva_{task}"], + ) + +@run_spec_function("financial_phrasebank") +def get_financial_phrasebank_spec(subset: str = "sentences_50agree") -> RunSpec: + from .scenarios import financial_phrasebank_scenario + + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.financial_phrasebank_scenario.FinancialPhrasebankScenario", + args={"subset": subset}, + ) + + adapter_spec = get_generation_adapter_spec( + instructions=financial_phrasebank_scenario.get_instructions(), + input_noun=None, + output_noun="Label", + max_tokens=30, # at most ~50 characters per label + ) + + return RunSpec( + name=f"financial_phrasebank:subset={subset}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs() + get_weighted_classification_metric_specs(), + groups=["financial_phrasebank"], + ) + +@run_spec_function("news_headline") +def get_news_headline_spec(category: str) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.news_headline_scenario.NewsHeadlineScenario", + args={"category": category}, + ) + + adapter_spec = get_generation_adapter_spec(input_noun="Passage", output_noun="Answer") + + return RunSpec( + name=f"news_headline:category={category}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(), + groups=["news_headline"], + ) + +@run_spec_function("kpi_edgar") +def get_kpi_edgar_spec() -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.kpi_edgar_scenario.KPIEDGARScenario", + args={}, + ) + + adapter_spec = get_generation_adapter_spec( + input_noun=None, output_noun="Answer", max_tokens=100, max_train_instances=20 + ) + + return RunSpec( + name="kpi_edgar", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_f1_metric_specs() + get_kpi_edgar_metric_specs(), + groups=["kpi_edgar"], + ) + +@run_spec_function("conv_fin_qa") +def get_conv_fin_qa_spec() -> RunSpec: + scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.conv_fin_qa_scenario.ConvFinQAScenario", args={}) + + adapter_spec = get_generation_adapter_spec(input_noun="Passage", output_noun="Answer") + + return RunSpec( + name="conv_fin_qa", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_math_float_match_metric_specs(), + groups=["conv_fin_qa"], + ) + +@run_spec_function("legal_opinion") +def get_legal_opinion_spec() -> RunSpec: + 
scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.legal_opinion_scenario.LegalOpinionScenario", args={} + ) + + instructions = "Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative." # noqa + adapter_spec = get_generation_adapter_spec( + instructions=instructions, + output_noun="Label", + ) + + return RunSpec( + name="legal_opinion", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs() + get_weighted_classification_metric_specs(), + groups=["legal_opinion"], + ) + +# A different implementation (binary classification) of lex_glue_fixed:subset=ecthr_a +@run_spec_function("echr_judge") +def get_echr_judge_spec() -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.echr_judge_scenario.EchrJudgeScenario", args={"doc_max_length": 600} + ) + + adapter_spec = get_generation_adapter_spec( + # instructions=EchrJudgeScenario.PROMPT_INST, # simple intsruction + instructions=EchrJudgeScenario.PROMPT_INST_WITH_EX, # instruction with trivial examples + input_noun=EchrJudgeScenario.PROMPT_INPUT, + output_noun=EchrJudgeScenario.PROMPT_OUTPUT, + max_tokens=1, + ) + + return RunSpec( + name="echr_judge", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs() + get_weighted_classification_metric_specs(), + groups=["echr_judge"], + ) + +# A different implementation of lex_glue_fixed:subset=case_hold +@run_spec_function("casehold_qa") +def get_casehold_qa_spec() -> RunSpec: + scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.casehold_qa_scenario.CaseHOLDQAScenario", args={}) + + method = ADAPT_MULTIPLE_CHOICE_JOINT + adapter_spec = get_multiple_choice_adapter_spec( + method=method, + instructions="Give a letter answer among A, B, C, D, or E.", + input_noun="Passage", + output_noun="Answer", + max_train_instances=2, + ) + + metric_specs = get_f1_metric_specs() return RunSpec( - name=f"lm_entry:task={task},method={method}", + name="casehold_qa", scenario_spec=scenario_spec, adapter_spec=adapter_spec, metric_specs=metric_specs, - groups=["lm_entry"], + groups=["CaseHOLDQA"], ) +@run_spec_function("legal_contract") +def get_legal_contract_spec() -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.legal_contract_scenario.LegalContractScenario", + args={}, + ) -@run_spec_function("thai_exam") -def get_thai_exam_spec(exam: str = "onet", method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec: + adapter_spec = get_generation_adapter_spec( + output_noun="Summary", + max_tokens=100, # <=1536 (Limited by BAM) + stop_sequences=["\n\n"], # workaround for the first \n char with gpt-neox-20b + ) + + return RunSpec( + name="legal_contract", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_basic_metric_specs(["rouge_1", "rouge_2", "rouge_l"]), + groups=["legal_contract"], + ) + +@run_spec_function("sumosum") +def get_sumosum_spec() -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.thai_exam_scenario.ThaiExamScenario", args={"exam": exam} + class_name="helm.benchmark.scenarios.sumosum_scenario.SUMOSumScenario", + args={ + # "sampling_min_length": 100, + # "sampling_max_length": 700, + # "doc_max_length": 3700, + }, + ) + + instructions = "Generate the title of the following article." 
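+    # With the generation adapter configured below, each request is rendered roughly as
+    # (illustrative sketch, exact formatting depends on the adapter defaults):
+    #   Generate the title of the following article.
+    #   <article text>
+    #   Title:
+    # and the completion is scored against the reference title with the ROUGE metrics below.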
+ adapter_spec = get_generation_adapter_spec( + instructions=instructions, + output_noun="Title", + max_train_instances=0, + max_tokens=100, # <=1536 (Limited by BAM) + stop_sequences=["\n\n"], # workaround for the first \n char with gpt-neox-20b + ) + + # NOTE doc_max_length(3700 words) + max_tokens(100 tokens) <= max_request_length(4096 tokens) + # see EncoderDecoderWindowService.fits_within_context_window + + return RunSpec( + name="sumosum", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_basic_metric_specs(["rouge_1", "rouge_2", "rouge_l"]), + groups=["sumosum"], + ) + +@run_spec_function("cti_mitre") +def get_cti_mitre_spec(num_options: int = 10, seed: int = 42, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.cti_mitre_scenario.CtiMitreScenario", + args={ + "num_options": num_options, + "seed": seed, + }, ) adapter_spec = get_multiple_choice_adapter_spec( method=method, - instructions="The following are multiple choice questions (with answers).", - input_noun="Question", + instructions="Answer the possible security attacks in each of the following situations from each of the options below.", # noqa + input_noun="Situation", output_noun="Answer", - max_train_instances=5, + max_train_instances=10, ) return RunSpec( - name=f"thai_exam:exam={exam},method={method}", + name=f"cti_mitre:num_options={num_options},seed={seed},method={method}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs(), - groups=["thai_exam", f"thai_exam_{exam}"], + metric_specs=get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score"]), + groups=["cti_mitre"], ) + + +############################################################ + + +def get_default_model_deployment_for_model( + model_name: str, warn_arg_deprecated: bool = False, ignore_deprecated: bool = False +) -> Optional[str]: + """Returns a valid model deployment name corresponding to the given model arg. + This is used as a backwards compatibility layer for model names that are now moved to model deployments. + Example: "anthropic/claude-v1.3" => "anthropic/claude-v1.3" + Example: "meta/llama-7b" => "together/llama-7b" + + The process to find a model deployment name is as follows: + 1. If there is a model deployment with the same name as the model arg, use it. + 2. If there is at least one deployment for the model, use the first one that is available. + 3. If there are no deployments for the model, returns None. + + This function will also try to find a model deployment name that is not deprecated. + If there are no non-deprecated deployments, it will return the first deployment (even if it's deprecated). + If ignore_deprecated is True, this function will return None if the model deployment is deprecated. + + If warn_arg_deprecated is True, this function will print a warning if the model deployment name is not the same + as the model arg. This is to remind the user that the model name is deprecated and should be replaced with + the model deployment name (in their config). + + Args: + model_arg: The model arg to convert to a model deployment name. + warn_arg_deprecated: Whether to print a warning if the model deployment name is not the same as the model arg. + ignore_deprecated: Whether to return None if the model deployment is deprecated. + """ + + # If there is a model deployment with the same name as the model arg, use it. 
+ if model_name in DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT: + deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_name] + if deployment.deprecated and ignore_deprecated: + if warn_arg_deprecated: + hlog(f"WARNING: Model deployment {model_name} is deprecated") + return None + return deployment.name + + # If there is at least one deployment for the model, use the first one that is available. + available_deployments: List[ModelDeployment] = [ + deployment for deployment in ALL_MODEL_DEPLOYMENTS if deployment.model_name == model_name + ] + if len(available_deployments) > 0: + available_deployment_names: List[str] = [deployment.name for deployment in available_deployments] + if warn_arg_deprecated: + hlog("WARNING: Model name is deprecated. Please use the model deployment name instead.") + hlog(f"Available model deployments for model {model_name}: {available_deployment_names}") + + # Additionally, if there is a non-deprecated deployment, use it. + non_deprecated_deployments: List[ModelDeployment] = [ + deployment for deployment in available_deployments if not deployment.deprecated + ] + if len(non_deprecated_deployments) > 0: + chosen_deployment = non_deprecated_deployments[0] + # There are no non-deprecated deployments, so there are two options: + # 1. If we can return an empty string, return it. (no model deployment is available) + # 2. If we can't return an empty string, return the first deployment (even if it's deprecated). + elif ignore_deprecated: + return None + else: + chosen_deployment = available_deployments[0] + if warn_arg_deprecated: + hlog(f"WARNING: All model deployments for model {model_name} are deprecated.") + if warn_arg_deprecated: + hlog( + f"Choosing {chosen_deployment.name} (the first one) as " + f"the default model deployment for model {model_name}" + ) + hlog("If you want to use a different model deployment, please specify it explicitly.") + return chosen_deployment.name + + # Some models are added but have no deployments yet. + # In this case, we return None. + return None + + +def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]: + """ + Takes a specification (name, args) and returns a list of `RunSpec`s. + """ + # Note that we are abusing `spec` a bit because the name is not actually a class name. 
+ name = spec.class_name + args = spec.args + + if name not in CANONICAL_RUN_SPEC_FUNCS: + raise ValueError(f"Unknown run spec name: {name}") + + # Peel off the run expanders (e.g., model) + expanders = [RUN_EXPANDERS[key](value) for key, value in args.items() if key in RUN_EXPANDERS] # type: ignore + args = dict((key, value) for key, value in args.items() if key not in RUN_EXPANDERS) + + # Get the canonical run specs + run_specs = [CANONICAL_RUN_SPEC_FUNCS[name](**args)] + + # Apply expanders + for expander in expanders: + run_specs = [ + child_run_spec for parent_run_spec in run_specs for child_run_spec in expander.expand(parent_run_spec) + ] + + def alter_run_spec(run_spec: RunSpec) -> RunSpec: + if not run_spec.adapter_spec.model and not run_spec.adapter_spec.model_deployment: + raise ValueError("At least one of model_deployment and model must be specified") + elif not run_spec.adapter_spec.model and run_spec.adapter_spec.model_deployment: + # Infer model from model deployment + default_model_name = get_model_deployment(run_spec.adapter_spec.model_deployment).model_name + if not default_model_name: + default_model_name = run_spec.adapter_spec.model_deployment + run_spec = dataclasses.replace( + run_spec, + adapter_spec=dataclasses.replace(run_spec.adapter_spec, model=default_model_name), + ) + elif run_spec.adapter_spec.model and not run_spec.adapter_spec.model_deployment: + # Infer model deployment from model + default_model_deployment = get_default_model_deployment_for_model(run_spec.adapter_spec.model) + if not default_model_deployment: + raise ValueError( + f"Unknown model or no default model deployment found for model {run_spec.adapter_spec.model}" + ) + run_spec = dataclasses.replace( + run_spec, + adapter_spec=dataclasses.replace(run_spec.adapter_spec, model_deployment=default_model_deployment), + ) + + # Both model and model_deployment should now be filled + assert run_spec.adapter_spec.model_deployment + assert run_spec.adapter_spec.model + + model: ModelMetadata = get_model_metadata(run_spec.adapter_spec.model) + deployment: ModelDeployment = get_model_deployment(run_spec.adapter_spec.model_deployment) + if run_spec.adapter_spec.model != deployment.model_name: + raise ValueError( + f"Invalid RunSpec: selected model deployment '{run_spec.adapter_spec.model_deployment}'" + f"for model '{run_spec.adapter_spec.model}' but the model deployment is " + f"for a different model '{deployment.model_name}'" + ) + # For models that strip newlines, when we're generating, we need to set + # the delimiter to be '###' so we stop properly. 
+ if NO_NEWLINES_TAG in model.tags and run_spec.adapter_spec.method in ( + ADAPT_GENERATION, + ADAPT_MULTIPLE_CHOICE_JOINT, + ): + stop_expander = StopRunExpander(value="hash") + run_spec = singleton(stop_expander.expand(run_spec)) + + if NLG_PREFIX_TAG in model.tags: + global_prefix_expander = GlobalPrefixRunExpander(value="nlg") + run_spec = singleton(global_prefix_expander.expand(run_spec)) + + if CHATML_MODEL_TAG in model.tags: + chatml_expander = ChatMLRunExpander() + run_spec = singleton(chatml_expander.expand(run_spec)) + + # Anthropic prompts + if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags or ANTHROPIC_CLAUDE_2_MODEL_TAG in model.tags: + run_spec = singleton(AnthropicRunExpander().expand(run_spec)) + + # OpenAI prompts + if OPENAI_CHATGPT_MODEL_TAG in model.tags: + run_spec = singleton(OpenAIRunExpander().expand(run_spec)) + + # Google prompts + if GOOGLE_PALM_2_MODEL_TAG in model.tags: + run_spec = singleton(GoogleRunExpander().expand(run_spec)) + + # For multiple choice + if BUGGY_TEMP_0_TAG in model.tags and run_spec.adapter_spec.temperature == 0: + increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4) + run_spec = singleton(increase_temperature_expander.expand(run_spec)) + + return run_spec + + run_specs = [alter_run_spec(run_spec) for run_spec in run_specs] + + return run_specs diff --git a/src/helm/benchmark/scenarios/casehold_qa_scenario.py b/src/helm/benchmark/scenarios/casehold_qa_scenario.py new file mode 100644 index 00000000000..5e818a7d2e5 --- /dev/null +++ b/src/helm/benchmark/scenarios/casehold_qa_scenario.py @@ -0,0 +1,109 @@ +import json +import os +import os.path +import shutil +import datasets +from typing import List, Dict, Any, cast + +from helm.common.general import ensure_directory_exists +from helm.benchmark.scenarios.scenario import ( + Scenario, + Instance, + Reference, + TRAIN_SPLIT, + VALID_SPLIT, + TEST_SPLIT, + ALL_SPLITS, + CORRECT_TAG, + PassageQuestionInput, + Output, +) + + +def download_dataset(data_path: str, splits: List[str]): + if False not in [os.path.exists(f"{data_path}/{split}.jsonl") for split in splits]: + return + + # https://huggingface.co/docs/datasets/index + ds_names: List[str] = datasets.list_datasets() + # https://huggingface.co/casehold + # https://huggingface.co/datasets/casehold/casehold + ds_name = "casehold/casehold" + if ds_name not in ds_names: + raise Exception(f"{ds_name} not included in datasets") + casehold: datasets.DatasetDict = cast(datasets.DatasetDict, datasets.load_dataset(ds_name)) + + for split in splits: + casehold[split].to_json(f"{data_path}/{split}.jsonl") + + # **WORK-AROUND** + # since "test.jsonl" includes no label info., we use "validation.jsonl" as a substitute. + if os.path.exists(f"{data_path}/test.jsonl"): + os.remove(f"{data_path}/test.jsonl") + shutil.copy(f"{data_path}/validation.jsonl", f"{data_path}/test.jsonl") + + +class CaseHOLDQAScenario(Scenario): + """ + CaseHOLD QA + CaseHOLD is a multiple choice question answering task derived from legal citations in judicial rulings. + CaseHOLD consists of ~53,000 questions, mined from the Harvard Law Library case law corpus. + + Dataset repository + https://huggingface.co/datasets/casehold/casehold + Publication + "When Does Pretraining Help? Assessing Self-Supervised Learning for Law and the CaseHOLD Dataset" + ICAIL, 2021 + https://reglab.stanford.edu/data/casehold-benchmark/ + https://arxiv.org/abs/2104.08671 + + Data content + The citing context from the judicial decision serves as the prompt for the question. 
+ The answer choices are holding statements derived from citations following text in a legal decision. + There are five answer choices for each citing text. + The correct answer is the holding statement that corresponds to the citing text. + The four incorrect answers are other holding statements. + + """ + + name = "casehold_qa" + description = "CaseHOLD QA" + tags = ["question_answering", "legal"] + + splits_dict = {TRAIN_SPLIT: "train", VALID_SPLIT: "validation", TEST_SPLIT: "test"} + + def __init__(self, splits: List[str] = ALL_SPLITS): + super().__init__() + self.splits = splits + + def get_instances(self, output_path: str) -> List[Instance]: + data_path: str = os.path.join(output_path, "data") + ensure_directory_exists(data_path) + download_dataset(data_path, list(self.splits_dict.values())) + + def to_instance(line: str, split: str) -> Instance: + case: Dict[str, Any] = json.loads(line) + example_id: int = case["example_id"] + context: str = case["citing_prompt"] + question: str = "holding statement" + holdings: List[str] = [case[f"holding_{i}"] for i in range(5)] + label: str = case["label"] + instance: Instance = Instance( + input=PassageQuestionInput(passage=context, question=question), + references=[ + Reference(Output(text=holdings[i]), tags=([CORRECT_TAG] if label == str(i) else [])) + for i in range(5) + ], + split=split, + id=str(example_id), + ) + return instance + + instances: List[Instance] = [] + # TRAIN, VALID + for split in self.splits: + with open(f"{data_path}/{self.splits_dict[split]}.jsonl", mode="r") as f: + for line in f.readlines(): + instances.append(to_instance(line, split)) + + return instances diff --git a/src/helm/benchmark/scenarios/conv_fin_qa_scenario.py b/src/helm/benchmark/scenarios/conv_fin_qa_scenario.py new file mode 100644 index 00000000000..64038ba3d93 --- /dev/null +++ b/src/helm/benchmark/scenarios/conv_fin_qa_scenario.py @@ -0,0 +1,188 @@ +import json +import os +from typing import Dict, List, Tuple, Any, Optional +import re + +from helm.common.general import ensure_file_downloaded, ensure_directory_exists +from .scenario import ( + Scenario, + Instance, + Reference, + TRAIN_SPLIT, + VALID_SPLIT, + CORRECT_TAG, + PassageQuestionInput, + Output, +) + + +def _strip_string(str: str) -> Any: + # from https://stackoverflow.com/a/4703508 + numeric_const_pattern = r"[-+]?(?:(?:\d*\.\d+)|(?:\d+\.?))(?:[Ee][+-]?\d+)?" + match = re.search(numeric_const_pattern, str) + if match: + try: + return float(str[match.start() : match.end()]) + except Exception: + return None + return None + + +def float_equiv(str1: Optional[str], str2: Optional[str], eps: float = 1e-6) -> float: + """ + extract the first numbers in the two strings and compare them + """ + if str1 is None and str2 is None: + print("WARNING: Both None") + return 1.0 + if str1 is None or str2 is None: + return 0.0 + + try: + ss1 = _strip_string(str1) + ss2 = _strip_string(str2) + print(f"{str1}: ({ss1}) == {str2}: ({ss2})? {float(abs(ss1 - ss2) < eps)}") + + if ss1 is None or ss2 is None: + return 0.0 + return float(abs(ss1 - ss2) < eps) + except Exception: + return float(str1 == str2) + + +class ConvFinQAScenario(Scenario): + """ ConvFinQA Financial Conversations (Numerical Reasoning) + + Description: + ConvFinQA - Exploring the Chain of Numerical Reasoning in Conversational Finance Question Answering. + + Prompt: +Passage: Table: +{Table} +Text: +Questions: Question: {Question}? The answer is {Answer} +{Question}? The answer is {Answer} +{Question}? The answer is {Answer} +{Question}? 
The answer is +Answer: + + Data source: + https://github.com/czyssrs/ConvFinQA + + Reference: + Zhiyu Chen, Shiyang Li, Charese Smiley, Zhiqiang Ma, Sameena Shah, and William Yang Wang. 2022. + ConvFinQA: Exploring the Chain of Numerical Reasoning in Conversational Finance Question Answering. + In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, + pages 6279–6292, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics. + https://aclanthology.org/2022.emnlp-main.421 + + """ # noqa + + """ Information on this class""" + name = "conv_fin_qa" + description = "Conversitional Finance QA" + tags = ["question_answering", "finance"] + + """ Class variables """ + # Dataset file name + DATASET_DOWNLOAD_URL: str = "https://github.com/czyssrs/ConvFinQA/raw/main/data.zip" + DATASET_FILE_NAME = "ConvFinQA" + + def __init__(self): + super().__init__() + + def download_dataset(self, output_path: str): + """Downloads the con_fin_qa dataset.""" + + # Download the raw data + data_dir = os.path.join(output_path, "data") + ensure_directory_exists(data_dir) + ensure_file_downloaded( + source_url=self.DATASET_DOWNLOAD_URL, + target_path=os.path.join(data_dir, self.DATASET_FILE_NAME), + unpack=True, + unpack_type="unzip", + ) + + def get_table_text(self, table: List[List[str]]) -> str: + """table in the format of List of columns""" + return "~".join(["|".join(col) for col in table]) + + def make_pseudo_markdown_table(self, array, line_sep="\n"): + markdown = str("|") + + for e in array[0]: + to_add = " " + str(e) + str(" |") + markdown += to_add + markdown += line_sep + + for entry in array[1:]: + markdown += str("| ") + for e in entry: + to_add = str(e) + str(" | ") + markdown += to_add + markdown += line_sep + + return markdown + + def get_instance_dict(self, dic, sep: str = "\n") -> Dict[str, Any]: + linearized_table = self.make_pseudo_markdown_table(dic["table"], line_sep=sep) + + if "gold_ind" in dic["annotation"]: + facts = dic["annotation"]["gold_ind"] + elif "gold_inds" in dic["annotation"]: + facts = dic["annotation"]["gold_inds"] + else: + facts = {} + + text = "" + for fact_type, fact in facts.items(): + if "text" in fact_type: + text += fact + context = "" + for ind, q in enumerate(dic["annotation"]["cur_dial"]): + if ind < len(dic["annotation"]["cur_dial"]) - 1: + context += q + " The answer is " + str(dic["annotation"]["exe_ans_list"][ind]) + " " + sep + else: + context += q + " The answer is " + doc = f"Table: {sep}{linearized_table}{sep}Text: {text}{sep}Questions: " + answer = str(dic["annotation"]["exe_ans"]) + return { + "input": PassageQuestionInput(passage="".join(doc), question=context, separator=" "), + "references": [Reference(Output(text=answer), tags=[CORRECT_TAG])], + } + + def load_dataset(self, output_path: str) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + """Loads the dataset downloaded in download_dataset().""" + folder_path = os.path.join(output_path, "data", self.DATASET_FILE_NAME) + train_data = [] + dev_data = [] + + with open(os.path.join(folder_path, "train_turn.json"), encoding="utf-8") as f: + train_raw_data = json.load(f) + + for problem in train_raw_data: + train_data.append(self.get_instance_dict(problem)) + + with open(os.path.join(folder_path, "dev_turn.json"), encoding="utf-8") as f: + dev_raw_data = json.load(f) + + for problem in dev_raw_data: + dev_data.append(self.get_instance_dict(problem)) + + return train_data, dev_data + + def get_instances(self, output_path: str) -> List[Instance]: + 
"""Returns the instances for this scenario.""" + # Body of the function + self.download_dataset(output_path) + train_data, dev_data = self.load_dataset(output_path) + train_k = 5 + train_instances = [ + Instance(input=d["input"], references=d["references"], split=TRAIN_SPLIT) for d in train_data[:train_k] + ] + valid_instances = [ + Instance(input=d["input"], references=d["references"], split=VALID_SPLIT) for d in dev_data[:1000] + ] + print("length of validate:", len(valid_instances)) + return train_instances + valid_instances diff --git a/src/helm/benchmark/scenarios/cti_mitre_scenario.py b/src/helm/benchmark/scenarios/cti_mitre_scenario.py new file mode 100644 index 00000000000..36083f54e10 --- /dev/null +++ b/src/helm/benchmark/scenarios/cti_mitre_scenario.py @@ -0,0 +1,304 @@ +import os +import json +import random +from typing import List, Dict +import pandas as pd +from pandas import DataFrame +from helm.common.general import ensure_file_downloaded, ensure_directory_exists +from .scenario import ( + Scenario, + Instance, + Reference, + TRAIN_SPLIT, + TEST_SPLIT, + CORRECT_TAG, + Input, + Output, +) + + +class CtiMitreScenario(Scenario): + """ + Original Task: + - The original task is to classify the description of the situation regarding the system + into the security threats in that situation. + - The classification categories are the approximately 200 categories of attack techniques + in the enterprise as defined by MITRE ATT&CK v10. + + Implemented Task: + - Since classification into so many classes is difficult to handle in a generative language model + such as GPT itself, we implement this task as a multiple-choice task. + - Each choice is the name of the attack technique category into which the description is classified. + - The number of options is determined by the parameter (num_options). + - The minimum number of options is 2 and the maximum is 199, the number of all categories of + attack methods defined in MITRE ATT&CK v10. + - From the 199 choices, num_options choices, including the correct answer and a default case, + are randomly selected and used. + - If num_options is not specified, all 199 category names will be used as choices. + + Data: + - dataset.csv + - Target dataset + - https://github.com/dessertlab/cti-to-mitre-with-nlp/raw/main/data/dataset.csv + - This data is of the form [sentence, label_tec, label_subtec, tec_name] + - sentence: the description + - label_tec: label for attack technique category + - label_subtec: label for attack technique subcategory + - tec_name : name(simple description) for attack technique subcategory + - Note: we need to extract name for attack technique category + from enterprise-attack.json + + - enterprise-attack.json + - https://github.com/mitre/cti/archive/refs/tags/ATT&CK-v10.1.zip + - /mitre_v10/enterprise-attack/enterprise-attack.json + - This data contains relation from attack technique name to attack technique label + - we can extract attack technique category name for label_tec using this json data. + + + Prompt: (k is specified by num_options) + ----------------------- + Answer the possible security attacks in each of the following situations from each of the options below. + [instruction] + + Situation: [in context examples] + A. + B. + ... + Y. + Z. Others + Answer: + + ... (Examples are output as long as the length allows) ... + + Situation: [target question] + A. + B. + ... + Y. + Z. 
+    Answer:
+    -----------------------
+
+    Example prompt (num_options = 5):
+    -----------------------
+    Answer the possible security attacks in each of the following situations from each of the options below.
+
+    Situation: ZxShell can launch a reverse command shell.
+    A. Command and Scripting Interpreter
+    B. System Shutdown/Reboot
+    C. Exfiltration Over C2 Channel
+    D. Direct Volume Access
+    E. Others
+    Answer: A
+
+    ....(Omitted)...
+
+    Situation: APC injection is a method of executing arbitrary code in the address space.
+    A. Event Triggered Execution
+    B. Process Injection
+    C. Non-Application Layer Protocol
+    D. Escape to Host
+    E. Others
+    Answer: B
+
+    Situation: Timestomping may be used along with file name Masquerading to hide malware and tools.
+    A. Search Victim-Owned Websites
+    B. Internal Spearphishing
+    C. Application Layer Protocol
+    D. Indicator Removal on Host
+    E. Others
+    Answer:
+    -----------------------
+
+    Reference:
+    V. Orbinato, M. Barbaraci, R. Natella, and D. Cotroneo,
+    “Automatic Mapping of Unstructured Cyber Threat Intelligence: An Experimental Study,”
+    in Proceedings of the 33rd IEEE International Symposium on Software Reliability Engineering (ISSRE), 2022.
+    https://ieeexplore.ieee.org/abstract/document/9978947
+
+    """
+
+    # Scenario name, description, and tags
+    name = "cti_mitre"
+    description = "Classification of security threat descriptions into MITRE ATT&CK attack technique categories"
+    tags = ["classification", "MITRE ATT&CK", "cyber_security"]
+
+    # Ratio used to split the dataset into train and test data
+    train_ratio = 0.7
+
+    # Default number of options: MITRE ATT&CK v10.1 defines 199 attack technique categories
+    MAX_NUM_OPTIONS = 199
+
+    # Text of the default "Others" option
+    OTHERS_OPTION = "Others"
+
+    # Methods
+
+    def __init__(self, num_options=None, seed=None):
+        """
+        num_options: int, number of choices in the multiple-choice task
+        seed: int, seed for the random module; if specified, it is passed to random.seed()
+        """
+        super().__init__()
+        # Dataset URL and file name
+        self.dataset_all_url = "https://github.com/dessertlab/cti-to-mitre-with-nlp/raw/main/data/dataset.csv"
+        self.dataset_all_name = "dataset.csv"
+        # MITRE ATT&CK (v10) URL and paths
+        self.mitre_att_ck_v10_url = "https://github.com/mitre/cti/archive/refs/tags/ATT&CK-v10.1.zip"
+        self.mitre_dir = "mitre_v10"
+        self.enterprise_attack_dir = "enterprise-attack"
+        self.enterprise_attack_json = "enterprise-attack.json"
+        # Number of options: if num_options is not specified or out of range, use MAX_NUM_OPTIONS
+        if num_options is not None and 0 < num_options <= CtiMitreScenario.MAX_NUM_OPTIONS:
+            self.num_options = num_options
+        else:
+            self.num_options = CtiMitreScenario.MAX_NUM_OPTIONS
+        # Seed the random module
+        random.seed(seed)
+        self.rand = random
+
+    def download_dataset(self):
+        """Downloads dataset.csv."""
+        data_dir = self.data_dir
+        ensure_directory_exists(data_dir)
+        ensure_file_downloaded(
+            source_url=self.dataset_all_url,
+            target_path=os.path.join(data_dir, self.dataset_all_name),
+        )
+
+    def download_MITRE_info(self):
+        """Downloads the zip file containing enterprise-attack.json."""
+        data_dir = self.data_dir
+        ensure_directory_exists(data_dir)
+        ensure_file_downloaded(
+            source_url=self.mitre_att_ck_v10_url,
+            target_path=os.path.join(data_dir, self.mitre_dir),
+            unpack=True,
+            unpack_type="unzip",
+        )
+
+    @staticmethod
+    def make_label_category_name_dict(jdata) -> Dict[str, str]:
+        """
+        Builds the mapping from label_tec (attack technique category label) to the attack
+        technique category name.
+        - jdata is the parsed JSON object of enterprise-attack.json
+        """
+        label_cname: Dict[str, str] = {}
+        if jdata is None:
+            return label_cname
+        for obj in jdata["objects"]:
+            if obj["type"] == "attack-pattern":
+                if "x_mitre_is_subtechnique" in obj and not obj["x_mitre_is_subtechnique"]:
+                    extrefs = obj["external_references"]
+                    label = None
+                    for ref in extrefs:
+                        if ref["source_name"] == "mitre-attack":  # and "external_id" in ref:
+                            label = ref["external_id"]
+                            break
+                    if label is not None and "name" in obj:
+                        cname = obj["name"]
+                        label_cname[label] = cname
+        return label_cname
+
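+    # Illustrative note (added for documentation; the technique IDs below are well-known
+    # MITRE ATT&CK examples, not values produced by running this code): the mapping returned
+    # by make_label_category_name_dict looks roughly like
+    #     {"T1055": "Process Injection", "T1566": "Phishing", ...}
+    # with one entry per top-level (non-subtechnique) attack-pattern object.
+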
+    def select_option_cnames(self, k: int, excluded: str, cnames: List[str]) -> List[str]:
+        """
+        Randomly selects k attack technique category names as choices.
+        The name given in "excluded" (usually the correct answer) is never included.
+        If more than k names are available, k - 1 names are sampled and the default "Others"
+        option is appended; otherwise all remaining names are returned.
+        - k: number of choices
+        - excluded: attack technique category name to exclude (usually the correct answer)
+        - cnames: list of all attack technique category names
+        """
+        target_cnames = [v for v in cnames if v != excluded]
+
+        if len(target_cnames) <= k:
+            return target_cnames
+        elif k - 1 <= 0:
+            return [CtiMitreScenario.OTHERS_OPTION]
+        else:
+            ops = self.rand.sample(target_cnames, k - 1)
+            ops.append(CtiMitreScenario.OTHERS_OPTION)
+            return ops
+
+    @staticmethod
+    def bring_others_to_end(references: List[Reference]) -> List[Reference]:
+        """Rearranges the references so that the reference for the default "Others" case comes last."""
+        newref_list: List[Reference] = []
+        others_list: List[Reference] = []
+        for ref in references:
+            if ref.output.text == CtiMitreScenario.OTHERS_OPTION:
+                others_list.append(ref)
+            else:
+                newref_list.append(ref)
+        newref_list.extend(others_list)
+        return newref_list
+
+    def create_multiple_choice_instances(
+        self, df: DataFrame, split: str, label_cname: Dict[str, str]
+    ) -> List[Instance]:
+        """Creates the list of instances for the multiple-choice task."""
+        instances = []
+        for idx in df.index:
+            linedata = df.loc[idx]
+            sent = linedata["sentence"]
+            label_tec = linedata["label_tec"]
+            correct_cname = label_cname[label_tec]
+            all_cnames = list(label_cname.values())
+            num_of_wrong_options = self.num_options - 1
+            wrong_cnames = self.select_option_cnames(num_of_wrong_options, correct_cname, all_cnames)
+            instance_input = Input(text=sent)
+            # Create the options (including the one correct answer)
+            correct_ref = Reference(Output(text=correct_cname), tags=[CORRECT_TAG])
+            references = [Reference(Output(text=cname), tags=[]) for cname in wrong_cnames]
+            references.append(correct_ref)
+            # Shuffle the answer options
+            self.rand.shuffle(references)
+            # Bring the "Others" option to the end of the reference list
+            ord_references = CtiMitreScenario.bring_others_to_end(references)
+            instance = Instance(instance_input, ord_references, split=split)
+            instances.append(instance)
+        return instances
+
+    def create_instances(self, df: DataFrame, split: str, label_cname: Dict[str, str]) -> List[Instance]:
+        return self.create_multiple_choice_instances(df, split, label_cname)
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        self.data_dir = os.path.join(output_path, "data")
+
+        # Download the dataset
+        self.download_dataset()
+
+        # Download the MITRE ATT&CK v10 information
+        self.download_MITRE_info()
+
+        # Load the dataset
+        all_data_path = os.path.join(self.data_dir, self.dataset_all_name)
+        all_df = pd.read_csv(all_data_path)
+
+        # Split all_df into train and test data frames
+        train_df = all_df.sample(frac=CtiMitreScenario.train_ratio, random_state=0)
+        test_df = all_df.drop(train_df.index).sample(frac=1, random_state=0)
+
+        # Load the MITRE info JSON data
+        label_name_json = os.path.join(
+            self.data_dir, self.mitre_dir, self.enterprise_attack_dir, self.enterprise_attack_json
+        )
+        with open(label_name_json, encoding="utf-8") as f:
+            jdata = json.load(f)
+
+        # Make the mapping from label_tec to the attack technique category name
+        label_cname = self.make_label_category_name_dict(jdata)
+
+        # Create instances from each split
+        instances_train = self.create_instances(train_df, TRAIN_SPLIT, label_cname)
+        instances_test = self.create_instances(test_df, TEST_SPLIT, label_cname)
+
+        # Return all instances
+        all_instances = []
+        all_instances.extend(instances_train)
+        all_instances.extend(instances_test)
+        return all_instances
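+
+
+# Illustrative usage sketch (added for documentation; the output path below is an arbitrary
+# example and is normally supplied by the HELM runner):
+#
+#     scenario = CtiMitreScenario(num_options=5, seed=0)
+#     instances = scenario.get_instances("scratch/cti_mitre")
+#
+# Each resulting instance pairs one threat description with five answer references:
+# the correct category, three random distractors, and the default "Others" option last.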
diff --git a/src/helm/benchmark/scenarios/echr_judge_scenario.py b/src/helm/benchmark/scenarios/echr_judge_scenario.py
new file mode 100644
index 00000000000..681b9659a15
--- /dev/null
+++ b/src/helm/benchmark/scenarios/echr_judge_scenario.py
@@ -0,0 +1,183 @@
+import os
+import glob
+import json
+from typing import Dict, List, Optional
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from .scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class EchrJudgeScenario(Scenario):
+    """
+    Task:
+    - This scenario is a binary classification task.
+    - It classifies a human rights case description as violation or no violation.
+
+    Dataset:
+    - EN_train, EN_dev, EN_test (these datasets are downloaded).
+    - They are used as TRAIN_SPLIT, VALID_SPLIT, and TEST_SPLIT, respectively.
+    - Each dataset is a set of JSON files containing at least the TEXT and VIOLATED_ARTICLES fields.
+      - The TEXT field contains the sentences of the case.
+      - VIOLATED_ARTICLES lists the violated articles; an empty list means no violation.
+
+    Prompt:
+    ------
+    Is the following case a violation of human rights? (Instructions)
+
+    Case: Human rights have not been violated. (Trivial No case in instructions)
+    Answer: No
+
+    Case: Human rights have been violated. (Trivial Yes case in instructions)
+    Answer: Yes
+
+    Case: (In-context examples, if possible)
+    Answer: