No public description

PiperOrigin-RevId: 644455583
PAIR-code · Jun 27, 2024 · a519e0c · a519e0c
1 parent 6e3a410
commit a519e0c
Show file tree

Hide file tree

Showing 2 changed files with 120 additions and 76 deletions.
diff --git a/python/notebooks/run_scripts_for_llm_comparator.ipynb b/python/notebooks/run_scripts_for_llm_comparator.ipynb
@@ -8,40 +8,25 @@
       },
       "outputs": [],
       "source": [
-        "# !pip install llm-comparator"
+        "! pip install /content/llm_comparator-0.1-py3-none-any.whl"
       ]
     },
-    {
-      "metadata": {
-        "id": "QZlVpN83nJBv"
-      },
-      "cell_type": "code",
-      "source": [
-        "# Run this if using a google3 Colab Kernel, such as with\n",
-        "# blaze run //third_party/javascript/llm_comparator/python/src/llm_comparator:kernel\n",
-        "# Otherwise, import modules using the following cell.\n",
-        "from llm_comparator import model_helper\n",
-        "from llm_comparator import llm_judge_runner\n",
-        "from llm_comparator import rationale_bullet_generator\n",
-        "from llm_comparator import rationale_cluster_generator\n",
-        "import vertexai"
-      ],
-      "outputs": [],
-      "execution_count": null
-    },
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
-        "id": "82SyO0LFCLPy"
+        "id": "QZlVpN83nJBv"
       },
       "outputs": [],
       "source": [
+        "import vertexai\n",
+        "from google.colab import auth\n",
+        "\n",
+        "from llm_comparator import comparison\n",
         "from llm_comparator import model_helper\n",
         "from llm_comparator import llm_judge_runner\n",
         "from llm_comparator import rationale_bullet_generator\n",
-        "from llm_comparator import rationale_cluster_generator\n",
-        "import vertexai"
+        "from llm_comparator import rationale_cluster_generator"
       ]
     },
     {
@@ -53,19 +38,14 @@
       "outputs": [],
       "source": [
         "#@title Setup for using Vertex AI.\n",
-        "from google.colab import auth\n",
-        "\n",
         "auth.authenticate_user()\n",
         "\n",
         "PROJECT_ID = 'pair-experimental'  #@param {type: \"string\"}\n",
         "REGION = 'us-central1'  #@param {type: \"string\"}\n",
         "\n",
         "! gcloud config set project {PROJECT_ID}\n",
         "\n",
-        "vertexai.init(project=PROJECT_ID, location=REGION)\n",
-        "\n",
-        "generator = model_helper.VertexGenerationModelHelper()\n",
-        "embedder = model_helper.VertexEmbeddingModelHelper()"
+        "vertexai.init(project=PROJECT_ID, location=REGION)"
       ]
     },
     {
@@ -91,67 +71,55 @@
       },
       "outputs": [],
       "source": [
-        "# Run LLM judge.\n",
+        "#@title Initialize models used in the LLM Comparator evaluation.\n",
+        "generator = model_helper.VertexGenerationModelHelper()\n",
+        "embedder = model_helper.VertexEmbeddingModelHelper()\n",
         "judge = llm_judge_runner.LLMJudgeRunner(generator)\n",
-        "judge_outputs = judge.run(llm_judge_inputs, 4)\n",
-        "\n",
-        "# Generate bulleted summary of rationales.\n",
-        "bullet_generator = rationale_bullet_generator.RationaleBulletGenerator(\n",
-        "    generator)\n",
-        "bullet_generator_outputs = bullet_generator.run(judge_outputs)\n",
-        "\n",
-        "# Cluster the bulleted summary of rationales.\n",
+        "bulletizer = rationale_bullet_generator.RationaleBulletGenerator(generator)\n",
         "clusterer = rationale_cluster_generator.RationaleClusterGenerator(\n",
-        "    generator, embedder)\n",
-        "clusters, rationales_with_similarities = clusterer.run(\n",
-        "    bullet_generator_outputs, num_clusters=5\n",
-        ")\n",
-        "\n",
-        "# TODO: Create a wrapper class that includes both LLM judge and rationale summary (not implemented yet)."
+        "    generator, embedder\n",
+        ")"
       ]
     },
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
-        "id": "X33Th6Kiw_Ja"
+        "id": "QUU3V63vVbvS"
       },
       "outputs": [],
       "source": [
-        "#@title Prepare JSON for LLM Comparator\n",
-        "# TODO: Move to the pip package.\n",
-        "import json\n",
-        "\n",
-        "llm_comparator_data = {\n",
-        "    'metadata': {'custom_fields_schema': []},\n",
-        "    'models': [{'name': 'A'}, { 'name': 'B'}],\n",
-        "    'examples': [{\n",
-        "        'input_text': input['prompt'],\n",
-        "        'tags': [],\n",
-        "        'output_text_a': input['response_a'],\n",
-        "        'output_text_b': input['response_b'],\n",
-        "        'score': judge_output['score'],\n",
-        "        'individual_rater_scores': judge_output['individual_rater_scores'],\n",
-        "        'rationale_list': rationales_with_similarities_for_ex,\n",
-        "    } for input, judge_output, rationales_with_similarities_for_ex in zip(\n",
-        "        llm_judge_inputs, judge_outputs, rationales_with_similarities)],\n",
-        "    'rationale_clusters': clusters,\n",
-        "}\n",
-        "\n",
-        "with open('json_for_llm_comparator.json', 'w') as f:\n",
-        "  json.dump(llm_comparator_data, f)"
+        "#@title Run the LLM Comparator evaluation.\n",
+        "comparison_result = comparison.run(\n",
+        "    llm_judge_inputs,\n",
+        "    judge,\n",
+        "    bulletizer,\n",
+        "    clusterer,\n",
+        ")"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
       "metadata": {
-        "id": "g1mE-MKAwsA6"
+        "id": "hViMkxUGhnTA"
       },
+      "cell_type": "code",
+      "source": [
+        "file_path = 'json_for_llm_comparator.json' # @param {type: \"string\"}\n",
+        "comparison.write(comparison_result, file_path)"
+      ],
       "outputs": [],
+      "execution_count": null
+    },
+    {
+      "metadata": {
+        "id": "3_kgtTlOtI8s"
+      },
+      "cell_type": "code",
       "source": [
-        "!git clone https://github.com/PAIR-code/llm-comparator"
-      ]
+        "! git clone https://github.com/PAIR-code/llm-comparator"
+      ],
+      "outputs": [],
+      "execution_count": null
     },
     {
       "cell_type": "code",
@@ -162,12 +130,12 @@
       "outputs": [],
       "source": [
         "#@title For displaying LLM Comparator.\n",
-        "from IPython.display import Javascript\n",
+        "import IPython\n",
         "\n",
         "# TODO: Move to the pip package.\n",
-        "def show_llm_comparator(json_path, height=800, port=8888):\n",
-        "  get_ipython().system_raw(f'python3 -m http.server {port} \u0026')\n",
-        "  display(Javascript(\"\"\"\n",
+        "def show_llm_comparator(json_path, height=800, port=4321):\n",
+        "  IPython.get_ipython().system_raw(f'python3 -m http.server {port} \u0026')\n",
+        "  IPython.display.display(IPython.display.Javascript(\"\"\"\n",
         "  (async ()=\u003e{\n",
         "    fm = document.createElement('iframe')\n",
         "    fm.src = await google.colab.kernel.proxyPort(%s)\n",
@@ -190,12 +158,16 @@
       },
       "outputs": [],
       "source": [
-        "show_llm_comparator('json_for_llm_comparator.json')"
+        "show_llm_comparator(file_path, port=7676)"
       ]
     }
   ],
   "metadata": {
     "colab": {
+      "last_runtime": {
+        "build_target": "",
+        "kind": "local"
+      },
       "private_outputs": true,
       "provenance": [
         {

diff --git a/python/src/llm_comparator/comparison.py b/python/src/llm_comparator/comparison.py
@@ -0,0 +1,72 @@
+"""Primary entry point for running evaluations with LLM Comparator."""
+
+from collections.abc import Sequence
+import json
+
+from llm_comparator import llm_judge_runner
+from llm_comparator import rationale_bullet_generator
+from llm_comparator import rationale_cluster_generator
+from llm_comparator import types
+
+
+# TODO(llm-comparator): Provide convenience utilities for converting from, e.g.,
+# CSV/TSV to the dictionary format required by this function.
+def run(
+    inputs: Sequence[types.LLMJudgeInput],
+    judge: llm_judge_runner.LLMJudgeRunner,
+    bulletizer: rationale_bullet_generator.RationaleBulletGenerator,
+    clusterer: rationale_cluster_generator.RationaleClusterGenerator,
+    model_names: Sequence[str] = ('A', 'B'),
+) -> types.JsonDict:
+  """Runs a comparison with LLM Comparator.
+
+  LLM Comparator comparisons are run in three steps:
+
+  1. An LLM Judge is run on the inputs to produce a set of judgements.
+  2. A Rationale Bullet Generator is run on the judgements to produce a set of
+     rationale bullets.
+  3. The Rationale Cluster Generator is run on the rationale bullets to produce
+     a set of rationale clusters with similarity scores.
+
+  Args:
+    inputs: The inputs to the evaluation. Each item is read by its 'prompt',
+      'response_a', and 'response_b' keys.
+    judge: The LLM Judge to use.
+    bulletizer: The Rationale Bullet Generator to use.
+    clusterer: The Rationale Cluster Generator to use.
+    model_names: The names of the models as you would like them to appear in the
+      LLM Comparator web application.
+
+  Returns:
+    The evaluation results as a dictionary with 'metadata', 'models',
+    'examples', and 'rationale_clusters' keys. This function itself does not
+    write the result to a file.
+  """
+
+  judgements = judge.run(inputs)
+  bullets = bulletizer.run(judgements)
+  clusters, cluster_similarities = clusterer.run(bullets)
+
+  # zip() stops at the shortest sequence, so this presumes one judgement and
+  # one similarity entry per input, in the same order -- TODO confirm.
+  per_example_generator = zip(inputs, judgements, cluster_similarities)
+
+  return {
+      'metadata': {'custom_fields_schema': []},
+      'models': [{'name': name} for name in model_names],
+      'examples': [
+          {
+              'input_text': input['prompt'],
+              'tags': [],
+              'output_text_a': input['response_a'],
+              'output_text_b': input['response_b'],
+              'score': judgement['score'],
+              'individual_rater_scores': judgement['individual_rater_scores'],
+              'rationale_list': similarity,
+          }
+          # NOTE: `input` shadows the builtin, but only inside this
+          # comprehension.
+          for input, judgement, similarity in per_example_generator
+      ],
+      'rationale_clusters': clusters,
+  }
+
+
+def write(comparison_result: types.JsonDict, output_path: str) -> str:
+  """Writes a comparison result to a file as JSON.
+
+  Args:
+    comparison_result: The dictionary to serialize with json.dump.
+    output_path: The path of the file to write. The file is opened in 'w' mode,
+      so any existing content at that path is replaced.
+
+  Returns:
+    The output_path that was written to.
+  """
+  with open(output_path, 'w') as f:
+    json.dump(comparison_result, f)
+  return output_path