diff --git a/.gitmodules b/.gitmodules
index ee43426..7871604 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,9 +1,9 @@
 [submodule "src/metametrics/metrics/GEMBA"]
-	path = metametrics/metrics/GEMBA
+	path = src/metametrics/metrics/GEMBA
 	url = https://github.com/MicrosoftTranslator/GEMBA.git
 [submodule "src/metametrics/metrics/BARTScore"]
-	path = metametrics/metrics/BARTScore
+	path = src/metametrics/metrics/BARTScore
 	url = https://github.com/neulab/BARTScore.git
 [submodule "src/metametrics/metrics/bleurt"]
-	path = metametrics/metrics/bleurt
+	path = src/metametrics/metrics/bleurt
 	url = https://github.com/google-research/bleurt.git
diff --git a/README.md b/README.md
index c76e059..9d55f4b 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,60 @@
-# MetaMetrics
+# MetaMetrics V0.0.1
+[MetaMetrics](https://github.com/meta-metrics/metametrics) is a tunable, easily extensible, and explainable metric for human-preference alignment in generation tasks.
+This repository is the open-source implementation of [MetaMetrics: Calibrating Metrics For Generation Tasks Using Human Preferences](https://arxiv.org/pdf/2410.02381).
-The repository is the open-source implementation for MetaMetrics: Calibrating Metrics For Generation Tasks Using Human Preferences https://arxiv.org/pdf/2410.02381.
-We will release the code soon.
+## Supported Tasks
+The current version supports the following tasks:
+1. Question Answering
+2. Machine Translation
+3. Image Captioning
+4. Text Summarization
+5. Reward Modeling
-## Contents
+You can clone and adapt the code to support other generation tasks.
-+ [Environment](#environment)
-+ [Setup Instruction](#setup-instruction)
+## Supported Metrics
+The current version supports the following metrics:
+1. BLEU
+2. BARTScore
+3. BERTScore
+4. BLEURT20
+5. chrF
+6. COMET
+7. MetricX
+8. METEOR
+9. ROUGE
+10. ROUGEWE
+11. SummaQA
+12. YiSi
+13. GEMBA_MQM
+14. ClipScore
+15. ArmoRM
-## Environment
+## Installation Guide
+Requires `Python 3.10+`.
+```
+PENDING
+```
-Python 3.10 or higher. Details of dependencies are in `setup.py`.
+## How To Use
+Example use case with the MetaMetrics library:
+```
+CODE PENDING
+```
-## Setup Instruction
+## How To Extend New Metrics
+Extending MetaMetrics to support other metrics is done by creating a subclass of `metametrics.metrics.base_metric.BaseMetric` (text-only metrics)
+or `metametrics.metrics.base_metric.VisionToTextBaseMetric` (vision-to-text metrics)
+and placing the file in `metametrics/src/metametrics/metrics/`.
-1. Run `pip install .` as it will automatically install required dependencies.
-2. Provide hugging-face token by setting as an environment variable in HF_TOKEN.
+
+The new metric must implement the following function:
+1. `NewMetric.score(self, predictions: List[str], references: Union[None, List[List[str]]]=None, sources: Union[None, List[str]]=None) -> List[float]`.
+
+
+Checklist To Integrate Custom Metrics:
+1. [ ] `metametrics/src/metametrics/metrics/__init__.py` | Import your new metric
+2. [ ] `metametrics/src/metametrics/metrics/__init__.py` | Extend the `__all__` variable
+3. [ ] `metametrics/src/metametrics/metametrics.py` | Import your new metric
+4. [ ] `metametrics/src/metametrics/metametrics.py` | Update the `MetaMetrics.normalization_config` variable
+5. 
[ ] `metametrics/src/metametrics/metametrics.py` | update function `MetaMetrics.get_metric()` \ No newline at end of file diff --git a/src/metametrics/__init__.py b/src/metametrics/__init__.py index fbda789..58f3ace 100644 --- a/src/metametrics/__init__.py +++ b/src/metametrics/__init__.py @@ -1,2 +1 @@ from .version import __version__ -from .metametrics import MetaMetrics diff --git a/src/metametrics/metametrics.py b/src/metametrics/metametrics.py deleted file mode 100644 index 627aed4..0000000 --- a/src/metametrics/metametrics.py +++ /dev/null @@ -1,253 +0,0 @@ -from bayes_opt import BayesianOptimization -from scipy import stats -from typing import List, Tuple -import numpy as np -import torch -import json -import os -import logging - -logging.basicConfig(level=logging.INFO) - -from metametrics.metrics import BLEUMetric -from metametrics.metrics import BARTScoreMetric -from metametrics.metrics import BERTScoreMetric -from metametrics.metrics import BLEURT20Metric -from metametrics.metrics import chrFMetric -from metametrics.metrics import COMETMetric -from metametrics.metrics import MetricXMetric -from metametrics.metrics import METEORMetric -from metametrics.metrics import ROUGEMetric -from metametrics.metrics import ROUGEWEMetric -from metametrics.metrics import SummaQAMetric -from metametrics.metrics import YiSiMetric -from metametrics.metrics import GEMBA_MQM - -from metametrics.metrics import ClipScoreMetric - - -class MetaMetrics: - """ - Args: - metrics_configs (List[Tuple[str, dict]]): a list of tuple of metric with the metric name and arguments. - weights (List[float]): a list of float weight assigned to each metric - cache_mode: bool - """ - def __init__(self, metrics_configs:List[Tuple[str, dict]], weights:List[float] = None, normalize:bool=False, cache_mode:bool=False): - self.metrics_configs = metrics_configs - self.metrics = [] - self.weights = weights - self.normalize = normalize - self.cache_mode = cache_mode - - if self.cache_mode: - for i in range(len(self.metrics_configs)): - metric_config = self.metrics_configs[i] - metric_name = metric_config[0] - metric_args = metric_config[1] - logging.info(f"[cache mode] initialize metric: {metric_name}") - metric = self.get_metric(metric_name, metric_args) - self.metrics.append(metric) - - if self.normalize: - logging.info(f"[normalize metric]") - self.normalization_config = { - # min, max, invert, clip - "bertscore": (-1.0, 1.0, False, False), - "yisi": (0.0, 1.0, False, False), - "bleurt": (0.0, 1.0, False, True), - "metricx": (0.0, 25.0, True, True), - "comet": (0.0, 1.0, False, True), - "xcomet-xl": (0.0, 1.0, False, True), - "xcomet-xxl": (0.0, 1.0, False, True), - "cometkiwi": (0.0, 1.0, False, True), - "cometkiwi-xl": (0.0, 1.0, False, True), - "cometkiwi-xxl": (0.0, 1.0, False, True), - "gemba_mqm": (-25.0, 0.0, False, False), - "bleu": (0.0, 1.0, False, False), - "chrf": (0.0, 100.0, False, False), - "clipscore": (0, 100.0, False, False), - "meteor": (0.0, 1.0, False, False), - "rouge": (0.0, 1.0, False, False), - "rougewe": (0.0, 1.0, False, False), - "summaqa": (0.0, 1.0, False, False), - "bartscore": (0.0, 1.0, False, False), - # "datastats": (0.0, 1.0, False, False), # TODO not sure, hence commented out - } - self.EPSILON = 1e-5 - - def get_metric(self, metric_name, metric_args): - logging.info(f"get metric: {metric_name}") - metric = None - if metric_name == "bleu": - metric = BLEUMetric(**metric_args) - elif metric_name == "bartscore": - metric = BARTScoreMetric(**metric_args) - elif metric_name == "bertscore": - 
metric = BERTScoreMetric(**metric_args) - elif metric_name == "bleurt": - metric = BLEURT20Metric(**metric_args) - elif metric_name == "chrf": - metric = chrFMetric(**metric_args) - elif metric_name == "comet": - metric = COMETMetric(comet_model="Unbabel/wmt22-comet-da", **metric_args) - elif metric_name == "xcomet-xxl": - metric = COMETMetric(comet_model="Unbabel/XCOMET-XXL", **metric_args) - elif metric_name == "xcomet-xl": - metric = COMETMetric(comet_model="Unbabel/XCOMET-XL", **metric_args) - elif metric_name == "cometkiwi": - metric = COMETMetric(comet_model="Unbabel/wmt22-cometkiwi-da", **metric_args) - elif metric_name == "cometkiwi-xl": - metric = COMETMetric(comet_model="Unbabel/wmt23-cometkiwi-da-xl", **metric_args) - elif metric_name == "cometkiwi-xxl": - metric = COMETMetric(comet_model="Unbabel/wmt23-cometkiwi-da-xxl", **metric_args) - elif metric_name == "metricx": - metric = MetricXMetric(**metric_args) - elif metric_name == "meteor": - metric = METEORMetric(**metric_args) - elif metric_name == "rouge": - metric = ROUGEMetric(**metric_args) - elif metric_name == "rougewe": - metric = ROUGEWEMetric(**metric_args) - elif metric_name == "summaqa": - metric = SummaQAMetric(**metric_args) - elif metric_name == "yisi": - metric = YiSiMetric(**metric_args) - elif metric_name =="gemba_mqm": - metric = GEMBA_MQM(**metric_args) - elif metric_name == "clipscore": - metric = ClipScoreMetric(**metric_args) - return metric - - def score_vl(self, image_sources:List[torch.Tensor], text_predictions:List[str], text_references:List[str], text_sources: List[str] = None) -> List[float]: - overall_metric_score = None - for i in range(len(self.metrics_configs)): - metric_config = self.metrics_configs[i] - metric_name = metric_config[0] - metric_args = metric_config[1] - - if self.cache_mode: - logging.info(f"[cache mode] get metric: {metric_name}") - metric = self.metrics[i] - else: - logging.info(f"initialize metric: {metric_name}") - metric = self.get_metric(metric_name, metric_args) - - metric_score = np.array(metric.score(image_sources, text_predictions, text_references, text_sources)) - - if self.normalize: - _min, _max, _invert, _clip = self.normalization_config[metric_name] - if _clip: - metric_score = np.clip(metric_score, _min, _max) - - if (_min - self.EPSILON <= metric_score).any() and (metric_score <= _max + self.EPSILON).any(): - metric_score = np.clip(metric_score, _min, _max) - - metric_score = (metric_score - _min) / (_max - _min) - if _invert: - metric_score = 1 - metric_score - - del metric # for efficiency - - if i == 0: - overall_metric_score = metric_score * self.weights[i] - else: - overall_metric_score += metric_score * self.weights[i] - return overall_metric_score - - def score(self, predictions:List[str], references:List[str], sources: List[str] = None) -> List[float]: - overall_metric_score = None - for i in range(len(self.metrics_configs)): - metric_config = self.metrics_configs[i] - metric_name = metric_config[0] - metric_args = metric_config[1] - - if self.cache_mode: - logging.info(f"[cache mode] get metric: {metric_name}") - metric = self.metrics[i] - else: - logging.info(f"initialize metric: {metric_name}") - metric = self.get_metric(metric_name, metric_args) - - metric_score = np.array(metric.score(predictions, references, sources)) - - if self.normalize: - _min, _max, _invert, _clip = self.normalization_config[metric_name] - if _clip: - metric_score = np.clip(metric_score, _min, _max) - - if (_min - self.EPSILON <= metric_score).any() and (metric_score <= _max + 
self.EPSILON).any(): - metric_score = np.clip(metric_score, _min, _max) - - metric_score = (metric_score - _min) / (_max - _min) - if _invert: - metric_score = 1 - metric_score - - del metric # for efficiency - - if i == 0: - overall_metric_score = metric_score * self.weights[i] - else: - overall_metric_score += metric_score * self.weights[i] - return overall_metric_score - - - def calibrate(self, method_name, sources, predictions, references, human_scores, optimizer_args, corr_metric="kendall", cache_key = None): - cache = {} - cache_file_path = 'meta-metrics_cache.json' - if cache_key is not None: - if not os.path.isfile(cache_file_path): - with open(cache_file_path, 'w+') as f: - json.dump(cache, f) - with open(cache_file_path, 'r') as f: - cache_file = json.load(f) - cache = cache_file - - if method_name == "GP": - def black_box_function(**kwargs): - metric_score = 0 - for i, (src, pred, ref, score) in enumerate(zip(sources, predictions, references, human_scores)): - key_name = cache_key[i][0] - for k in range(len(self.metrics_configs)): - metric_name = self.metrics_configs[k][0] - if str((key_name, metric_name)) not in cache: - metric_score = np.array(self.metrics[k].score(pred, ref, src)) - cache[str((key_name, metric_name))] = metric_score.tolist() - if cache_key is not None: - cache_file = cache - with open(cache_file_path, 'w') as f: - json.dump(cache_file, f) - logging.info(str((key_name, metric_name))) - metric_res = 0 - for k in range(len(self.metrics_configs)): - metric_name = self.metrics_configs[k][0] - metric_res += kwargs[metric_name] * np.array(cache[str((key_name, metric_name))]) - if corr_metric == "kendall": - kendall = stats.kendalltau(metric_res, score) - # logging.info(kendall.statistic) - metric_score += kendall.statistic - else: - pass - logging.info(metric_score.mean()) - return metric_score - - # Bounded region of parameter space - pbounds = {} - for i in range(len(self.metrics)): - pbounds[f"{self.metrics_configs[i][0]}"] = (0,1) - - optimizer = BayesianOptimization( - f=black_box_function, - pbounds=pbounds, - random_state=1, - ) - optimizer.maximize( - init_points=optimizer_args["init_points"], - n_iter=optimizer_args["n_iter"], - ) - self.weights = [] - for i in range(len(self.metrics_configs)): - metric_name = self.metrics_configs[i][0] - self.weights.append(optimizer.max["params"][metric_name]) - logging.info("weights:", self.weights) - return self.weights diff --git a/src/metametrics/metrics/BARTScore b/src/metametrics/metrics/BARTScore new file mode 160000 index 0000000..248f511 --- /dev/null +++ b/src/metametrics/metrics/BARTScore @@ -0,0 +1 @@ +Subproject commit 248f511cb34ae3753fc81f7d7a945de5bfe33458 diff --git a/src/metametrics/metrics/GEMBA b/src/metametrics/metrics/GEMBA new file mode 160000 index 0000000..68be552 --- /dev/null +++ b/src/metametrics/metrics/GEMBA @@ -0,0 +1 @@ +Subproject commit 68be552640e2180bc0b6e1c3963592126a59b43c diff --git a/src/metametrics/metrics/armoRM_metric.py b/src/metametrics/metrics/armoRM_metric.py new file mode 100644 index 0000000..f2fe788 --- /dev/null +++ b/src/metametrics/metrics/armoRM_metric.py @@ -0,0 +1,90 @@ +import pandas as pd +import json +from typing import Dict, List, Union +from meta_metrics.metrics.base_metric import BaseMetric +import torch +from transformers import AutoModelForSequenceClassification, AutoTokenizer +from tqdm import tqdm + +class ArmoRMMetric(BaseMetric): + class ArmoRMPipeline: + def __init__(self, model_id, device_map="auto", torch_dtype=torch.bfloat16, truncation=True, 
trust_remote_code=False, max_length=4096): + self.model = AutoModelForSequenceClassification.from_pretrained( + model_id, + device_map=device_map, + trust_remote_code=trust_remote_code, + torch_dtype=torch_dtype, + ) + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, + use_fast=True, + ) + self.truncation = truncation + self.device = self.model.device + self.max_length = max_length + self.attributes = [ + 'helpsteer-helpfulness','helpsteer-correctness','helpsteer-coherence', + 'helpsteer-complexity','helpsteer-verbosity','ultrafeedback-overall_score', + 'ultrafeedback-instruction_following', 'ultrafeedback-truthfulness', + 'ultrafeedback-honesty','ultrafeedback-helpfulness','beavertails-is_safe', + 'prometheus-score','argilla-overall_quality','argilla-judge_lm','code-complexity', + 'code-style','code-explanation','code-instruction-following','code-readability' + ] + self.attribute_dict = {k: v for (v, k) in enumerate(self.attributes)} + + def __call__(self, messages: List[Dict[str, str]], scoring_attribute: str) -> Dict[str, float]: + """ + messages: OpenAI chat messages to be scored + scoring_attribute: attribute for scoring, use depending on cases. + Note: no batching since due to length differences, the model will have to pad to the max length which is not efficient + Returns: a dictionary with the score between 0 and 1 + """ + + input_ids = self.tokenizer.apply_chat_template( + messages, + return_tensors="pt", + padding=True, + truncation=self.truncation, + max_length=self.max_length, + ).to(self.device) + with torch.no_grad(): + output = self.model(input_ids) + multi_obj_rewards = output.rewards.cpu().float() + gating_output = output.gating_output.cpu().float() + preference_score = output.score.cpu().float() + obj_transform = self.model.reward_transform_matrix.data.cpu().float() + multi_obj_coeffs = gating_output @ obj_transform.T + assert torch.isclose(torch.sum(multi_obj_rewards * multi_obj_coeffs, dim=1), preference_score, atol=1e-3) + return {scoring_attribute: multi_obj_rewards[0][self.attribute_dict[scoring_attribute]]} + + def __init__(self, sources=None, predictions=None, scoring_attribute=None): + # In the context of QA, sources are the question and predictions are the model output answer. 
+ self.rm = self.ArmoRMPipeline("RLHFlow/ArmoRM-Llama3-8B-v0.1", trust_remote_code=True) + self.sources = sources + self.predictions = predictions + self.scoring_attribute = scoring_attribute + if scoring_attribute == None: + self.scoring_attribute = self.rm.attributes[1] + + def score(self, predictions: List[str]=None, references: Union[None, List[List[str]]]=None, sources: Union[None, List[str]]=None) -> List[float]: + if self.predictions is None: + self.predictions = predictions + if self.sources is None: + self.sources = sources + df = pd.DataFrame({ + 'question': sources, + 'prediction': predictions + }) + df['id'] = range(len(predictions)) + scores = [] + for _, row in tqdm(df.iterrows(), total=len(df)): + try: + score = self.rm( + messages = [{"role": "user", "content": row['question']}, {"role": "assistant", "content": row['prediction']}], + scoring_attribute = self.scoring_attribute + ) + scores.append(score[self.scoring_attribute]) + except Exception as e: + print(f"An error occured: {e} || Setting default error value to -1") + scores.append(-1) + return scores diff --git a/src/metametrics/metrics/bleurt b/src/metametrics/metrics/bleurt new file mode 160000 index 0000000..cebe7e6 --- /dev/null +++ b/src/metametrics/metrics/bleurt @@ -0,0 +1 @@ +Subproject commit cebe7e6f996b40910cfaa520a63db47807e3bf5c diff --git a/src/metametrics/utils/logging.py b/src/metametrics/utils/logging.py index e2e680e..fd536e1 100644 --- a/src/metametrics/utils/logging.py +++ b/src/metametrics/utils/logging.py @@ -50,7 +50,7 @@ def _setup_logger() -> None: root_logger.setLevel(_determine_log_level()) root_logger.propagate = False -def create_logger(logger_name: Optional[str] = None) -> "logging.Logger": +def get_logger(logger_name: Optional[str] = None) -> "logging.Logger": """ Generates a logger with the provided name, or defaults to the root logger's name. """ diff --git a/src/metametrics/utils/validate.py b/src/metametrics/utils/validate.py index 5c75147..f8786c8 100644 --- a/src/metametrics/utils/validate.py +++ b/src/metametrics/utils/validate.py @@ -34,10 +34,12 @@ def validate_real(arg: float, valid_min: float = None, valid_max: float = None) Raises: ValueError: If the argument is not within the specified range. """ - if valid_min is not None and arg < valid_min: - raise ValueError(f"Invalid argument '{arg}'. Must be greater than or equal to {valid_min}.") - if valid_max is not None and arg > valid_max: - raise ValueError(f"Invalid argument '{arg}'. Must be less than or equal to {valid_max}.") + if valid_min is not None: + if arg < valid_min: + raise ValueError(f"Invalid argument '{arg}'. Must be greater than or equal to {valid_min}.") + if valid_max is not None: + if arg > valid_max: + raise ValueError(f"Invalid argument '{arg}'. Must be less than or equal to {valid_max}.") return arg @@ -56,10 +58,12 @@ def validate_int(arg: int, valid_min: int = None, valid_max: int = None) -> int: Raises: ValueError: If the argument is not within the specified range. """ - if valid_min is not None and arg < valid_min: - raise ValueError(f"Invalid argument '{arg}'. Must be greater than or equal to {valid_min}.") - if valid_max is not None and arg > valid_max: - raise ValueError(f"Invalid argument '{arg}'. Must be less than or equal to {valid_max}.") + if valid_min is not None: + if arg < valid_min: + raise ValueError(f"Invalid argument '{arg}'. Must be greater than or equal to {valid_min}.") + if valid_max is not None: + if arg > valid_max: + raise ValueError(f"Invalid argument '{arg}'. 
Must be less than or equal to {valid_max}.")
     return arg
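
A minimal sketch of a custom text-only metric, following the `BaseMetric` interface described in the README's "How To Extend New Metrics" section above. The class name `LengthRatioMetric` and its scoring rule are made up for illustration, and the import path assumes the package layout under `src/metametrics/`:

```python
from typing import List, Union

from metametrics.metrics.base_metric import BaseMetric  # assumed import path


class LengthRatioMetric(BaseMetric):
    """Toy metric: length ratio between a prediction and its closest reference."""

    def score(self, predictions: List[str],
              references: Union[None, List[List[str]]] = None,
              sources: Union[None, List[str]] = None) -> List[float]:
        scores = []
        for i, pred in enumerate(predictions):
            # Fall back to the prediction itself when no references are given.
            refs = references[i] if references and references[i] else [pred]
            # Pick the reference whose length is closest to the prediction.
            closest = min(refs, key=lambda ref: abs(len(ref) - len(pred)))
            longer = max(len(pred), len(closest), 1)
            scores.append(min(len(pred), len(closest)) / longer)
        return scores
```

After placing the file under `src/metametrics/metrics/`, the checklist above still applies: export the class from `metrics/__init__.py`, extend `__all__`, and register it wherever metric names are resolved.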
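The removed `MetaMetrics.score` rescaled every raw metric with a `(min, max, invert, clip)` tuple from `normalization_config` before combining the metrics with the learned weights. Below is a standalone sketch of that rescaling step; the three config entries are copied from the removed file, while the weights and raw scores are made-up placeholders:

```python
import numpy as np

# (min, max, invert, clip) entries, as in the removed normalization_config.
NORMALIZATION = {
    "metricx": (0.0, 25.0, True, True),   # lower raw MetricX is better, hence invert
    "bleu":    (0.0, 1.0, False, False),
    "chrf":    (0.0, 100.0, False, False),
}

def normalize(name: str, raw: np.ndarray) -> np.ndarray:
    lo, hi, invert, clip = NORMALIZATION[name]
    if clip:
        raw = np.clip(raw, lo, hi)
    scaled = (raw - lo) / (hi - lo)           # map into [0, 1]
    return 1.0 - scaled if invert else scaled

# Weighted combination over two examples (numbers are placeholders).
weights = {"metricx": 0.5, "bleu": 0.3, "chrf": 0.2}
raw_scores = {
    "metricx": np.array([3.2, 11.0]),
    "bleu":    np.array([0.41, 0.18]),
    "chrf":    np.array([55.0, 30.1]),
}
combined = sum(w * normalize(name, raw_scores[name]) for name, w in weights.items())
print(combined)  # one aggregated score per example
```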
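The removed `MetaMetrics.calibrate` tuned the metric weights with `bayes_opt`, maximizing the Kendall correlation between the weighted metric combination and human scores (with a JSON cache of per-metric scores). A minimal sketch of the same idea without the caching; the per-metric arrays and human scores below are random placeholders:

```python
import numpy as np
from bayes_opt import BayesianOptimization
from scipy import stats

rng = np.random.default_rng(0)
# Assumed inputs: already-computed per-metric scores and human scores for the same examples.
metric_scores = {"bleu": rng.random(100), "chrf": rng.random(100)}
human_scores = rng.random(100)

def objective(**weights):
    combined = sum(w * metric_scores[name] for name, w in weights.items())
    # Kendall tau between the weighted combination and the human judgements.
    return stats.kendalltau(combined, human_scores).statistic

optimizer = BayesianOptimization(
    f=objective,
    pbounds={name: (0, 1) for name in metric_scores},  # each weight searched in [0, 1]
    random_state=1,
)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max["params"])  # calibrated weights, one per metric
```

The removed code likewise bounded every weight to `(0, 1)` and read `init_points`/`n_iter` from `optimizer_args`; the values above are arbitrary.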
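The new `ArmoRMMetric` in `src/metametrics/metrics/armoRM_metric.py` scores (question, answer) pairs with the `RLHFlow/ArmoRM-Llama3-8B-v0.1` reward model along one of its HelpSteer/UltraFeedback-style attributes. A usage sketch; the import path is assumed from the file location, the example texts are made up, and running it requires downloading the 8B model (a GPU is strongly recommended):

```python
from metametrics.metrics.armoRM_metric import ArmoRMMetric  # assumed import path

# Loads RLHFlow/ArmoRM-Llama3-8B-v0.1 via transformers on construction.
metric = ArmoRMMetric(scoring_attribute="helpsteer-correctness")

sources = ["What is the capital of France?"]        # questions
predictions = ["The capital of France is Paris."]   # model answers
scores = metric.score(predictions=predictions, sources=sources)
print(scores)  # one reward per pair; the class falls back to -1 on errors
```

Omitting `scoring_attribute` defaults to `helpsteer-correctness`, the second entry of the pipeline's attribute list.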