diff --git a/.gitmodules b/.gitmodules
index ee43426..7871604 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,9 +1,9 @@
 [submodule "src/metametrics/metrics/GEMBA"]
-	path = metametrics/metrics/GEMBA
+	path = src/metametrics/metrics/GEMBA
 	url = https://github.com/MicrosoftTranslator/GEMBA.git
 [submodule "src/metametrics/metrics/BARTScore"]
-	path = metametrics/metrics/BARTScore
+	path = src/metametrics/metrics/BARTScore
 	url = https://github.com/neulab/BARTScore.git
 [submodule "src/metametrics/metrics/bleurt"]
-	path = metametrics/metrics/bleurt
+	path = src/metametrics/metrics/bleurt
 	url = https://github.com/google-research/bleurt.git
diff --git a/README.md b/README.md
index c76e059..9d55f4b 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,60 @@
-# MetaMetrics
+# MetaMetrics V0.0.1
+[MetaMetrics](https://github.com/meta-metrics/metametrics) is a tunable, easily extensible, and explainable metric for human-preference alignment in generation tasks.
+This repository is the open-source implementation of [MetaMetrics: Calibrating Metrics For Generation Tasks Using Human Preferences](https://arxiv.org/pdf/2410.02381).
-The repository is the open-source implementation for MetaMetrics: Calibrating Metrics For Generation Tasks Using Human Preferences https://arxiv.org/pdf/2410.02381.
-We will release the code soon.
+## Supported Tasks
+The current version supports the following tasks:
+1. Question Answering
+2. Machine Translation
+3. Image Captioning
+4. Text Summarization
+5. Reward Modeling
-## Contents
+You can clone and adapt the code to support other generation tasks.
-+ [Environment](#environment)
-+ [Setup Instruction](#setup-instruction)
+## Supported Metrics
+The current version supports the following metrics:
+1. BLEU
+2. BARTScore
+3. BERTScore
+4. BLEURT20
+5. chrF
+6. COMET
+7. MetricX
+8. METEOR
+9. ROUGE
+10. ROUGEWE
+11. SummaQA
+12. YiSi
+13. GEMBA_MQM
+14. ClipScore
+15. ArmoRM
-## Environment
+## Installation Guide
+Requires `Python 3.10+`.
+```
+PENDING
+```
-Python 3.10 or higher. Details of dependencies are in `setup.py`.
+## How To Use
+Example use case with the MetaMetrics library:
+```
+CODE PENDING
+```
-## Setup Instruction
+## How To Extend New Metrics
+Extending MetaMetrics to support other metrics is done by creating a subclass of `metametrics.metrics.base_metric.BaseMetric` (text-only metrics)
+or `metametrics.metrics.base_metric.VisionToTextBaseMetric` (vision-to-text metrics)
+and placing the file in `metametrics/src/metametrics/metrics/`.
-1. Run `pip install .` as it will automatically install required dependencies.
-2. Provide hugging-face token by setting as an environment variable in HF_TOKEN.
+
+The new metric must implement the following function:
+1. `NewMetric.score(self, predictions: List[str], references: Union[None, List[List[str]]]=None, sources: Union[None, List[str]]=None) -> List[float]`.
+
+
+Checklist To Integrate Custom Metrics:
+1. [ ] `metametrics/src/metametrics/metrics/__init__.py` | Import your new metric
+2. [ ] `metametrics/src/metametrics/metrics/__init__.py` | Extend the `__all__` variable
+3. [ ] `metametrics/src/metametrics/metametrics.py` | Import your new metric
+4. [ ] `metametrics/src/metametrics/metametrics.py` | Update the `MetaMetrics.normalization_config` variable
+5. 
[ ] `metametrics/src/metametrics/metametrics.py` | update function `MetaMetrics.get_metric()` \ No newline at end of file diff --git a/src/metametrics/__init__.py b/src/metametrics/__init__.py index fbda789..58f3ace 100644 --- a/src/metametrics/__init__.py +++ b/src/metametrics/__init__.py @@ -1,2 +1 @@ from .version import __version__ -from .metametrics import MetaMetrics diff --git a/src/metametrics/metametrics.py b/src/metametrics/metametrics.py deleted file mode 100644 index 627aed4..0000000 --- a/src/metametrics/metametrics.py +++ /dev/null @@ -1,253 +0,0 @@ -from bayes_opt import BayesianOptimization -from scipy import stats -from typing import List, Tuple -import numpy as np -import torch -import json -import os -import logging - -logging.basicConfig(level=logging.INFO) - -from metametrics.metrics import BLEUMetric -from metametrics.metrics import BARTScoreMetric -from metametrics.metrics import BERTScoreMetric -from metametrics.metrics import BLEURT20Metric -from metametrics.metrics import chrFMetric -from metametrics.metrics import COMETMetric -from metametrics.metrics import MetricXMetric -from metametrics.metrics import METEORMetric -from metametrics.metrics import ROUGEMetric -from metametrics.metrics import ROUGEWEMetric -from metametrics.metrics import SummaQAMetric -from metametrics.metrics import YiSiMetric -from metametrics.metrics import GEMBA_MQM - -from metametrics.metrics import ClipScoreMetric - - -class MetaMetrics: - """ - Args: - metrics_configs (List[Tuple[str, dict]]): a list of tuple of metric with the metric name and arguments. - weights (List[float]): a list of float weight assigned to each metric - cache_mode: bool - """ - def __init__(self, metrics_configs:List[Tuple[str, dict]], weights:List[float] = None, normalize:bool=False, cache_mode:bool=False): - self.metrics_configs = metrics_configs - self.metrics = [] - self.weights = weights - self.normalize = normalize - self.cache_mode = cache_mode - - if self.cache_mode: - for i in range(len(self.metrics_configs)): - metric_config = self.metrics_configs[i] - metric_name = metric_config[0] - metric_args = metric_config[1] - logging.info(f"[cache mode] initialize metric: {metric_name}") - metric = self.get_metric(metric_name, metric_args) - self.metrics.append(metric) - - if self.normalize: - logging.info(f"[normalize metric]") - self.normalization_config = { - # min, max, invert, clip - "bertscore": (-1.0, 1.0, False, False), - "yisi": (0.0, 1.0, False, False), - "bleurt": (0.0, 1.0, False, True), - "metricx": (0.0, 25.0, True, True), - "comet": (0.0, 1.0, False, True), - "xcomet-xl": (0.0, 1.0, False, True), - "xcomet-xxl": (0.0, 1.0, False, True), - "cometkiwi": (0.0, 1.0, False, True), - "cometkiwi-xl": (0.0, 1.0, False, True), - "cometkiwi-xxl": (0.0, 1.0, False, True), - "gemba_mqm": (-25.0, 0.0, False, False), - "bleu": (0.0, 1.0, False, False), - "chrf": (0.0, 100.0, False, False), - "clipscore": (0, 100.0, False, False), - "meteor": (0.0, 1.0, False, False), - "rouge": (0.0, 1.0, False, False), - "rougewe": (0.0, 1.0, False, False), - "summaqa": (0.0, 1.0, False, False), - "bartscore": (0.0, 1.0, False, False), - # "datastats": (0.0, 1.0, False, False), # TODO not sure, hence commented out - } - self.EPSILON = 1e-5 - - def get_metric(self, metric_name, metric_args): - logging.info(f"get metric: {metric_name}") - metric = None - if metric_name == "bleu": - metric = BLEUMetric(**metric_args) - elif metric_name == "bartscore": - metric = BARTScoreMetric(**metric_args) - elif metric_name == "bertscore": - 
metric = BERTScoreMetric(**metric_args) - elif metric_name == "bleurt": - metric = BLEURT20Metric(**metric_args) - elif metric_name == "chrf": - metric = chrFMetric(**metric_args) - elif metric_name == "comet": - metric = COMETMetric(comet_model="Unbabel/wmt22-comet-da", **metric_args) - elif metric_name == "xcomet-xxl": - metric = COMETMetric(comet_model="Unbabel/XCOMET-XXL", **metric_args) - elif metric_name == "xcomet-xl": - metric = COMETMetric(comet_model="Unbabel/XCOMET-XL", **metric_args) - elif metric_name == "cometkiwi": - metric = COMETMetric(comet_model="Unbabel/wmt22-cometkiwi-da", **metric_args) - elif metric_name == "cometkiwi-xl": - metric = COMETMetric(comet_model="Unbabel/wmt23-cometkiwi-da-xl", **metric_args) - elif metric_name == "cometkiwi-xxl": - metric = COMETMetric(comet_model="Unbabel/wmt23-cometkiwi-da-xxl", **metric_args) - elif metric_name == "metricx": - metric = MetricXMetric(**metric_args) - elif metric_name == "meteor": - metric = METEORMetric(**metric_args) - elif metric_name == "rouge": - metric = ROUGEMetric(**metric_args) - elif metric_name == "rougewe": - metric = ROUGEWEMetric(**metric_args) - elif metric_name == "summaqa": - metric = SummaQAMetric(**metric_args) - elif metric_name == "yisi": - metric = YiSiMetric(**metric_args) - elif metric_name =="gemba_mqm": - metric = GEMBA_MQM(**metric_args) - elif metric_name == "clipscore": - metric = ClipScoreMetric(**metric_args) - return metric - - def score_vl(self, image_sources:List[torch.Tensor], text_predictions:List[str], text_references:List[str], text_sources: List[str] = None) -> List[float]: - overall_metric_score = None - for i in range(len(self.metrics_configs)): - metric_config = self.metrics_configs[i] - metric_name = metric_config[0] - metric_args = metric_config[1] - - if self.cache_mode: - logging.info(f"[cache mode] get metric: {metric_name}") - metric = self.metrics[i] - else: - logging.info(f"initialize metric: {metric_name}") - metric = self.get_metric(metric_name, metric_args) - - metric_score = np.array(metric.score(image_sources, text_predictions, text_references, text_sources)) - - if self.normalize: - _min, _max, _invert, _clip = self.normalization_config[metric_name] - if _clip: - metric_score = np.clip(metric_score, _min, _max) - - if (_min - self.EPSILON <= metric_score).any() and (metric_score <= _max + self.EPSILON).any(): - metric_score = np.clip(metric_score, _min, _max) - - metric_score = (metric_score - _min) / (_max - _min) - if _invert: - metric_score = 1 - metric_score - - del metric # for efficiency - - if i == 0: - overall_metric_score = metric_score * self.weights[i] - else: - overall_metric_score += metric_score * self.weights[i] - return overall_metric_score - - def score(self, predictions:List[str], references:List[str], sources: List[str] = None) -> List[float]: - overall_metric_score = None - for i in range(len(self.metrics_configs)): - metric_config = self.metrics_configs[i] - metric_name = metric_config[0] - metric_args = metric_config[1] - - if self.cache_mode: - logging.info(f"[cache mode] get metric: {metric_name}") - metric = self.metrics[i] - else: - logging.info(f"initialize metric: {metric_name}") - metric = self.get_metric(metric_name, metric_args) - - metric_score = np.array(metric.score(predictions, references, sources)) - - if self.normalize: - _min, _max, _invert, _clip = self.normalization_config[metric_name] - if _clip: - metric_score = np.clip(metric_score, _min, _max) - - if (_min - self.EPSILON <= metric_score).any() and (metric_score <= _max + 
self.EPSILON).any(): - metric_score = np.clip(metric_score, _min, _max) - - metric_score = (metric_score - _min) / (_max - _min) - if _invert: - metric_score = 1 - metric_score - - del metric # for efficiency - - if i == 0: - overall_metric_score = metric_score * self.weights[i] - else: - overall_metric_score += metric_score * self.weights[i] - return overall_metric_score - - - def calibrate(self, method_name, sources, predictions, references, human_scores, optimizer_args, corr_metric="kendall", cache_key = None): - cache = {} - cache_file_path = 'meta-metrics_cache.json' - if cache_key is not None: - if not os.path.isfile(cache_file_path): - with open(cache_file_path, 'w+') as f: - json.dump(cache, f) - with open(cache_file_path, 'r') as f: - cache_file = json.load(f) - cache = cache_file - - if method_name == "GP": - def black_box_function(**kwargs): - metric_score = 0 - for i, (src, pred, ref, score) in enumerate(zip(sources, predictions, references, human_scores)): - key_name = cache_key[i][0] - for k in range(len(self.metrics_configs)): - metric_name = self.metrics_configs[k][0] - if str((key_name, metric_name)) not in cache: - metric_score = np.array(self.metrics[k].score(pred, ref, src)) - cache[str((key_name, metric_name))] = metric_score.tolist() - if cache_key is not None: - cache_file = cache - with open(cache_file_path, 'w') as f: - json.dump(cache_file, f) - logging.info(str((key_name, metric_name))) - metric_res = 0 - for k in range(len(self.metrics_configs)): - metric_name = self.metrics_configs[k][0] - metric_res += kwargs[metric_name] * np.array(cache[str((key_name, metric_name))]) - if corr_metric == "kendall": - kendall = stats.kendalltau(metric_res, score) - # logging.info(kendall.statistic) - metric_score += kendall.statistic - else: - pass - logging.info(metric_score.mean()) - return metric_score - - # Bounded region of parameter space - pbounds = {} - for i in range(len(self.metrics)): - pbounds[f"{self.metrics_configs[i][0]}"] = (0,1) - - optimizer = BayesianOptimization( - f=black_box_function, - pbounds=pbounds, - random_state=1, - ) - optimizer.maximize( - init_points=optimizer_args["init_points"], - n_iter=optimizer_args["n_iter"], - ) - self.weights = [] - for i in range(len(self.metrics_configs)): - metric_name = self.metrics_configs[i][0] - self.weights.append(optimizer.max["params"][metric_name]) - logging.info("weights:", self.weights) - return self.weights diff --git a/src/metametrics/metrics/BARTScore b/src/metametrics/metrics/BARTScore new file mode 160000 index 0000000..248f511 --- /dev/null +++ b/src/metametrics/metrics/BARTScore @@ -0,0 +1 @@ +Subproject commit 248f511cb34ae3753fc81f7d7a945de5bfe33458 diff --git a/src/metametrics/metrics/GEMBA b/src/metametrics/metrics/GEMBA new file mode 160000 index 0000000..68be552 --- /dev/null +++ b/src/metametrics/metrics/GEMBA @@ -0,0 +1 @@ +Subproject commit 68be552640e2180bc0b6e1c3963592126a59b43c diff --git a/src/metametrics/metrics/armoRM_metric.py b/src/metametrics/metrics/armoRM_metric.py new file mode 100644 index 0000000..f2fe788 --- /dev/null +++ b/src/metametrics/metrics/armoRM_metric.py @@ -0,0 +1,90 @@ +import pandas as pd +import json +from typing import Dict, List, Union +from meta_metrics.metrics.base_metric import BaseMetric +import torch +from transformers import AutoModelForSequenceClassification, AutoTokenizer +from tqdm import tqdm + +class ArmoRMMetric(BaseMetric): + class ArmoRMPipeline: + def __init__(self, model_id, device_map="auto", torch_dtype=torch.bfloat16, truncation=True, 
trust_remote_code=False, max_length=4096): + self.model = AutoModelForSequenceClassification.from_pretrained( + model_id, + device_map=device_map, + trust_remote_code=trust_remote_code, + torch_dtype=torch_dtype, + ) + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, + use_fast=True, + ) + self.truncation = truncation + self.device = self.model.device + self.max_length = max_length + self.attributes = [ + 'helpsteer-helpfulness','helpsteer-correctness','helpsteer-coherence', + 'helpsteer-complexity','helpsteer-verbosity','ultrafeedback-overall_score', + 'ultrafeedback-instruction_following', 'ultrafeedback-truthfulness', + 'ultrafeedback-honesty','ultrafeedback-helpfulness','beavertails-is_safe', + 'prometheus-score','argilla-overall_quality','argilla-judge_lm','code-complexity', + 'code-style','code-explanation','code-instruction-following','code-readability' + ] + self.attribute_dict = {k: v for (v, k) in enumerate(self.attributes)} + + def __call__(self, messages: List[Dict[str, str]], scoring_attribute: str) -> Dict[str, float]: + """ + messages: OpenAI chat messages to be scored + scoring_attribute: attribute for scoring, use depending on cases. + Note: no batching since due to length differences, the model will have to pad to the max length which is not efficient + Returns: a dictionary with the score between 0 and 1 + """ + + input_ids = self.tokenizer.apply_chat_template( + messages, + return_tensors="pt", + padding=True, + truncation=self.truncation, + max_length=self.max_length, + ).to(self.device) + with torch.no_grad(): + output = self.model(input_ids) + multi_obj_rewards = output.rewards.cpu().float() + gating_output = output.gating_output.cpu().float() + preference_score = output.score.cpu().float() + obj_transform = self.model.reward_transform_matrix.data.cpu().float() + multi_obj_coeffs = gating_output @ obj_transform.T + assert torch.isclose(torch.sum(multi_obj_rewards * multi_obj_coeffs, dim=1), preference_score, atol=1e-3) + return {scoring_attribute: multi_obj_rewards[0][self.attribute_dict[scoring_attribute]]} + + def __init__(self, sources=None, predictions=None, scoring_attribute=None): + # In the context of QA, sources are the question and predictions are the model output answer. 
+ self.rm = self.ArmoRMPipeline("RLHFlow/ArmoRM-Llama3-8B-v0.1", trust_remote_code=True) + self.sources = sources + self.predictions = predictions + self.scoring_attribute = scoring_attribute + if scoring_attribute == None: + self.scoring_attribute = self.rm.attributes[1] + + def score(self, predictions: List[str]=None, references: Union[None, List[List[str]]]=None, sources: Union[None, List[str]]=None) -> List[float]: + if self.predictions is None: + self.predictions = predictions + if self.sources is None: + self.sources = sources + df = pd.DataFrame({ + 'question': sources, + 'prediction': predictions + }) + df['id'] = range(len(predictions)) + scores = [] + for _, row in tqdm(df.iterrows(), total=len(df)): + try: + score = self.rm( + messages = [{"role": "user", "content": row['question']}, {"role": "assistant", "content": row['prediction']}], + scoring_attribute = self.scoring_attribute + ) + scores.append(score[self.scoring_attribute]) + except Exception as e: + print(f"An error occured: {e} || Setting default error value to -1") + scores.append(-1) + return scores diff --git a/src/metametrics/metrics/bleurt b/src/metametrics/metrics/bleurt new file mode 160000 index 0000000..cebe7e6 --- /dev/null +++ b/src/metametrics/metrics/bleurt @@ -0,0 +1 @@ +Subproject commit cebe7e6f996b40910cfaa520a63db47807e3bf5c diff --git a/src/metametrics/utils/logging.py b/src/metametrics/utils/logging.py index e2e680e..fd536e1 100644 --- a/src/metametrics/utils/logging.py +++ b/src/metametrics/utils/logging.py @@ -50,7 +50,7 @@ def _setup_logger() -> None: root_logger.setLevel(_determine_log_level()) root_logger.propagate = False -def create_logger(logger_name: Optional[str] = None) -> "logging.Logger": +def get_logger(logger_name: Optional[str] = None) -> "logging.Logger": """ Generates a logger with the provided name, or defaults to the root logger's name. """ diff --git a/src/metametrics/utils/validate.py b/src/metametrics/utils/validate.py index 5c75147..f8786c8 100644 --- a/src/metametrics/utils/validate.py +++ b/src/metametrics/utils/validate.py @@ -34,10 +34,12 @@ def validate_real(arg: float, valid_min: float = None, valid_max: float = None) Raises: ValueError: If the argument is not within the specified range. """ - if valid_min is not None and arg < valid_min: - raise ValueError(f"Invalid argument '{arg}'. Must be greater than or equal to {valid_min}.") - if valid_max is not None and arg > valid_max: - raise ValueError(f"Invalid argument '{arg}'. Must be less than or equal to {valid_max}.") + if valid_min is not None: + if arg < valid_min: + raise ValueError(f"Invalid argument '{arg}'. Must be greater than or equal to {valid_min}.") + if valid_max is not None: + if arg > valid_max: + raise ValueError(f"Invalid argument '{arg}'. Must be less than or equal to {valid_max}.") return arg @@ -56,10 +58,12 @@ def validate_int(arg: int, valid_min: int = None, valid_max: int = None) -> int: Raises: ValueError: If the argument is not within the specified range. """ - if valid_min is not None and arg < valid_min: - raise ValueError(f"Invalid argument '{arg}'. Must be greater than or equal to {valid_min}.") - if valid_max is not None and arg > valid_max: - raise ValueError(f"Invalid argument '{arg}'. Must be less than or equal to {valid_max}.") + if valid_min is not None: + if arg < valid_min: + raise ValueError(f"Invalid argument '{arg}'. Must be greater than or equal to {valid_min}.") + if valid_max is not None: + if arg > valid_max: + raise ValueError(f"Invalid argument '{arg}'. 
Must be less than or equal to {valid_max}.")
     return arg
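
A minimal sketch of a custom text-only metric, following the `BaseMetric` interface described in the README's "How To Extend New Metrics" section above. The class name `LengthRatioMetric` and its scoring rule are made up for illustration, and the import path assumes the package layout under `src/metametrics/`:

```python
from typing import List, Union

from metametrics.metrics.base_metric import BaseMetric  # assumed import path


class LengthRatioMetric(BaseMetric):
    """Toy metric: length ratio between a prediction and its closest reference."""

    def score(self, predictions: List[str],
              references: Union[None, List[List[str]]] = None,
              sources: Union[None, List[str]] = None) -> List[float]:
        scores = []
        for i, pred in enumerate(predictions):
            # Fall back to the prediction itself when no references are given.
            refs = references[i] if references and references[i] else [pred]
            # Pick the reference whose length is closest to the prediction.
            closest = min(refs, key=lambda ref: abs(len(ref) - len(pred)))
            longer = max(len(pred), len(closest), 1)
            scores.append(min(len(pred), len(closest)) / longer)
        return scores
```

After placing the file under `src/metametrics/metrics/`, the checklist above still applies: export the class from `metrics/__init__.py`, extend `__all__`, and register it wherever metric names are resolved.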
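The removed `MetaMetrics.score` rescaled every raw metric with a `(min, max, invert, clip)` tuple from `normalization_config` before combining the metrics with the learned weights. Below is a standalone sketch of that rescaling step; the three config entries are copied from the removed file, while the weights and raw scores are made-up placeholders:

```python
import numpy as np

# (min, max, invert, clip) entries, as in the removed normalization_config.
NORMALIZATION = {
    "metricx": (0.0, 25.0, True, True),   # lower raw MetricX is better, hence invert
    "bleu":    (0.0, 1.0, False, False),
    "chrf":    (0.0, 100.0, False, False),
}

def normalize(name: str, raw: np.ndarray) -> np.ndarray:
    lo, hi, invert, clip = NORMALIZATION[name]
    if clip:
        raw = np.clip(raw, lo, hi)
    scaled = (raw - lo) / (hi - lo)           # map into [0, 1]
    return 1.0 - scaled if invert else scaled

# Weighted combination over two examples (numbers are placeholders).
weights = {"metricx": 0.5, "bleu": 0.3, "chrf": 0.2}
raw_scores = {
    "metricx": np.array([3.2, 11.0]),
    "bleu":    np.array([0.41, 0.18]),
    "chrf":    np.array([55.0, 30.1]),
}
combined = sum(w * normalize(name, raw_scores[name]) for name, w in weights.items())
print(combined)  # one aggregated score per example
```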
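The removed `MetaMetrics.calibrate` tuned the metric weights with `bayes_opt`, maximizing the Kendall correlation between the weighted metric combination and human scores (with a JSON cache of per-metric scores). A minimal sketch of the same idea without the caching; the per-metric arrays and human scores below are random placeholders:

```python
import numpy as np
from bayes_opt import BayesianOptimization
from scipy import stats

rng = np.random.default_rng(0)
# Assumed inputs: already-computed per-metric scores and human scores for the same examples.
metric_scores = {"bleu": rng.random(100), "chrf": rng.random(100)}
human_scores = rng.random(100)

def objective(**weights):
    combined = sum(w * metric_scores[name] for name, w in weights.items())
    # Kendall tau between the weighted combination and the human judgements.
    return stats.kendalltau(combined, human_scores).statistic

optimizer = BayesianOptimization(
    f=objective,
    pbounds={name: (0, 1) for name in metric_scores},  # each weight searched in [0, 1]
    random_state=1,
)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max["params"])  # calibrated weights, one per metric
```

The removed code likewise bounded every weight to `(0, 1)` and read `init_points`/`n_iter` from `optimizer_args`; the values above are arbitrary.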
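The new `ArmoRMMetric` in `src/metametrics/metrics/armoRM_metric.py` scores (question, answer) pairs with the `RLHFlow/ArmoRM-Llama3-8B-v0.1` reward model along one of its HelpSteer/UltraFeedback-style attributes. A usage sketch; the import path is assumed from the file location, the example texts are made up, and running it requires downloading the 8B model (a GPU is strongly recommended):

```python
from metametrics.metrics.armoRM_metric import ArmoRMMetric  # assumed import path

# Loads RLHFlow/ArmoRM-Llama3-8B-v0.1 via transformers on construction.
metric = ArmoRMMetric(scoring_attribute="helpsteer-correctness")

sources = ["What is the capital of France?"]        # questions
predictions = ["The capital of France is Paris."]   # model answers
scores = metric.score(predictions=predictions, sources=sources)
print(scores)  # one reward per pair; the class falls back to -1 on errors
```

Omitting `scoring_attribute` defaults to `helpsteer-correctness`, the second entry of the pipeline's attribute list.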