
Commit

No public description
PiperOrigin-RevId: 644455583
RyanMullins committed Jun 27, 2024
1 parent 6e3a410 commit a519e0c
Showing 2 changed files with 120 additions and 76 deletions.
124 changes: 48 additions & 76 deletions python/notebooks/run_scripts_for_llm_comparator.ipynb
@@ -8,40 +8,25 @@
},
"outputs": [],
"source": [
"# !pip install llm-comparator"
"! pip install /content/llm_comparator-0.1-py3-none-any.whl"
]
},
{
"metadata": {
"id": "QZlVpN83nJBv"
},
"cell_type": "code",
"source": [
"# Run this if using a google3 Colab Kernel, such as with\n",
"# blaze run //third_party/javascript/llm_comparator/python/src/llm_comparator:kernel\n",
"# Otherwise, import modules using the following cell.\n",
"from llm_comparator import model_helper\n",
"from llm_comparator import llm_judge_runner\n",
"from llm_comparator import rationale_bullet_generator\n",
"from llm_comparator import rationale_cluster_generator\n",
"import vertexai"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "82SyO0LFCLPy"
"id": "QZlVpN83nJBv"
},
"outputs": [],
"source": [
"import vertexai\n",
"from google.colab import auth\n",
"\n",
"from llm_comparator import comparison\n",
"from llm_comparator import model_helper\n",
"from llm_comparator import llm_judge_runner\n",
"from llm_comparator import rationale_bullet_generator\n",
"from llm_comparator import rationale_cluster_generator\n",
"import vertexai"
"from llm_comparator import rationale_cluster_generator"
]
},
{
@@ -53,19 +38,14 @@
"outputs": [],
"source": [
"#@title Setup for using Vertex AI.\n",
"from google.colab import auth\n",
"\n",
"auth.authenticate_user()\n",
"\n",
"PROJECT_ID = 'pair-experimental' #@param {type: \"string\"}\n",
"REGION = 'us-central1' #@param {type: \"string\"}\n",
"\n",
"! gcloud config set project {PROJECT_ID}\n",
"\n",
"vertexai.init(project=PROJECT_ID, location=REGION)\n",
"\n",
"generator = model_helper.VertexGenerationModelHelper()\n",
"embedder = model_helper.VertexEmbeddingModelHelper()"
"vertexai.init(project=PROJECT_ID, location=REGION)"
]
},
{
@@ -91,67 +71,55 @@
},
"outputs": [],
"source": [
"# Run LLM judge.\n",
"#@title Initialize models used in the LLM Comparator evaluation.\n",
"generator = model_helper.VertexGenerationModelHelper()\n",
"embedder = model_helper.VertexEmbeddingModelHelper()\n",
"judge = llm_judge_runner.LLMJudgeRunner(generator)\n",
"judge_outputs = judge.run(llm_judge_inputs, 4)\n",
"\n",
"# Generate bulleted summary of rationales.\n",
"bullet_generator = rationale_bullet_generator.RationaleBulletGenerator(\n",
" generator)\n",
"bullet_generator_outputs = bullet_generator.run(judge_outputs)\n",
"\n",
"# Cluster the bulleted summary of rationales.\n",
"bulletizer = rationale_bullet_generator.RationaleBulletGenerator(generator)\n",
"clusterer = rationale_cluster_generator.RationaleClusterGenerator(\n",
" generator, embedder)\n",
"clusters, rationales_with_similarities = clusterer.run(\n",
" bullet_generator_outputs, num_clusters=5\n",
")\n",
"\n",
"# TODO: Create a wrapper class that includes both LLM judge and rationale summary (not implemented yet)."
" generator, embedder\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "X33Th6Kiw_Ja"
"id": "QUU3V63vVbvS"
},
"outputs": [],
"source": [
"#@title Prepare JSON for LLM Comparator\n",
"# TODO: Move to the pip package.\n",
"import json\n",
"\n",
"llm_comparator_data = {\n",
" 'metadata': {'custom_fields_schema': []},\n",
" 'models': [{'name': 'A'}, { 'name': 'B'}],\n",
" 'examples': [{\n",
" 'input_text': input['prompt'],\n",
" 'tags': [],\n",
" 'output_text_a': input['response_a'],\n",
" 'output_text_b': input['response_b'],\n",
" 'score': judge_output['score'],\n",
" 'individual_rater_scores': judge_output['individual_rater_scores'],\n",
" 'rationale_list': rationales_with_similarities_for_ex,\n",
" } for input, judge_output, rationales_with_similarities_for_ex in zip(\n",
" llm_judge_inputs, judge_outputs, rationales_with_similarities)],\n",
" 'rationale_clusters': clusters,\n",
"}\n",
"\n",
"with open('json_for_llm_comparator.json', 'w') as f:\n",
" json.dump(llm_comparator_data, f)"
"#@title Run the LLM Comparator evauation.\n",
"comparison_result = comparison.run(\n",
" llm_judge_inputs,\n",
" judge,\n",
" bulletizer,\n",
" clusterer,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "g1mE-MKAwsA6"
"id": "hViMkxUGhnTA"
},
"cell_type": "code",
"source": [
"file_path = 'json_for_llm_comparator.json' # @param {type: \"string\"}\n",
"comparison.write(comparison_result, file_path)"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {
"id": "3_kgtTlOtI8s"
},
"cell_type": "code",
"source": [
"!git clone https://github.com/PAIR-code/llm-comparator"
]
"! git clone https://github.com/PAIR-code/llm-comparator"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
@@ -162,12 +130,12 @@
"outputs": [],
"source": [
"#@title For displaying LLM Comparator.\n",
"from IPython.display import Javascript\n",
"import IPython\n",
"\n",
"# TODO: Move to the pip package.\n",
"def show_llm_comparator(json_path, height=800, port=8888):\n",
" get_ipython().system_raw(f'python3 -m http.server {port} \u0026')\n",
" display(Javascript(\"\"\"\n",
"def show_llm_comparator(json_path, height=800, port=4321):\n",
" IPython.get_ipython().system_raw(f'python3 -m http.server {port} \u0026')\n",
" IPython.display.display(IPython.display.Javascript(\"\"\"\n",
" (async ()=\u003e{\n",
" fm = document.createElement('iframe')\n",
" fm.src = await google.colab.kernel.proxyPort(%s)\n",
@@ -190,12 +158,16 @@
},
"outputs": [],
"source": [
"show_llm_comparator('json_for_llm_comparator.json')"
"show_llm_comparator(file_path, port=7676)"
]
}
],
"metadata": {
"colab": {
"last_runtime": {
"build_target": "",
"kind": "local"
},
"private_outputs": true,
"provenance": [
{
72 changes: 72 additions & 0 deletions python/src/llm_comparator/comparison.py
@@ -0,0 +1,72 @@
"""Primary entry point for running evaluations with LLM Comparator."""

from collections.abc import Sequence
import json

from llm_comparator import llm_judge_runner
from llm_comparator import rationale_bullet_generator
from llm_comparator import rationale_cluster_generator
from llm_comparator import types


# TODO(llm-comparator): Provide convenience utilities for converting from, e.g.,
# CSV/TSV to the dictionary format required by this function.
def run(
inputs: Sequence[types.LLMJudgeInput],
judge: llm_judge_runner.LLMJudgeRunner,
bulletizer: rationale_bullet_generator.RationaleBulletGenerator,
clusterer: rationale_cluster_generator.RationaleClusterGenerator,
model_names: Sequence[str] = ('A', 'B'),
) -> types.JsonDict:
"""Runs a comparison with LLM Comparator.
LLM Comparator comparisons are run in three steps:
1. An LLM Judge is run on the inputs to produce a set of judgements.
2. A Rationale Bullet Generator is run on the judgements to produce a set of
rationale bullets.
3. The Rationale Cluster Generator is run on the rationale bullets to produce
a set of rationale clusters with similarity scores.
Args:
inputs: The inputs to the evaluation.
judge: The LLM Judge to use.
bulletizer: The Rationale Bullet Generator to use.
clusterer: The Rationale Cluster Generator to use.
model_names: The names of the models as you would like them to appear in the
LLM Comparator web application.
Returns:
The evaluation results as a JSON object, or the value of output_path if
provided and writing to that file was successful.
"""

judgements = judge.run(inputs)
bullets = bulletizer.run(judgements)
clusters, cluster_similarities = clusterer.run(bullets)

per_example_generator = zip(inputs, judgements, cluster_similarities)

return {
'metadata': {'custom_fields_schema': []},
'models': [{'name': name} for name in model_names],
'examples': [
{
'input_text': input['prompt'],
'tags': [],
'output_text_a': input['response_a'],
'output_text_b': input['response_b'],
'score': judgement['score'],
'individual_rater_scores': judgement['individual_rater_scores'],
'rationale_list': similarity,
}
for input, judgement, similarity in per_example_generator
],
'rationale_clusters': clusters,
}


def write(comparison_result: types.JsonDict, output_path: str) -> str:
with open(output_path, 'w') as f:
json.dump(comparison_result, f)
return output_path
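
For reference, the new comparison.run and comparison.write entry points are exercised end to end in the updated notebook cells above. The following minimal sketch condenses that flow into a single script; the Vertex AI project ID, region, and the example prompt/response strings are placeholder assumptions, while every class, function, and dictionary key comes from the notebook and from comparison.py as added in this commit.

import vertexai

from llm_comparator import comparison
from llm_comparator import llm_judge_runner
from llm_comparator import model_helper
from llm_comparator import rationale_bullet_generator
from llm_comparator import rationale_cluster_generator

# Placeholder project and region; substitute your own Vertex AI settings.
vertexai.init(project='my-gcp-project', location='us-central1')

# Each input pairs one prompt with the two responses being compared.
# These example strings are illustrative only.
llm_judge_inputs = [
    {
        'prompt': 'Summarize the plot of Hamlet in one sentence.',
        'response_a': 'A Danish prince avenges his father, and nearly everyone dies.',
        'response_b': 'Hamlet is a play by William Shakespeare.',
    },
]

# Model helpers and pipeline stages, constructed as in the notebook.
generator = model_helper.VertexGenerationModelHelper()
embedder = model_helper.VertexEmbeddingModelHelper()
judge = llm_judge_runner.LLMJudgeRunner(generator)
bulletizer = rationale_bullet_generator.RationaleBulletGenerator(generator)
clusterer = rationale_cluster_generator.RationaleClusterGenerator(
    generator, embedder)

# run() chains judge -> bulletizer -> clusterer and assembles the JSON dict
# expected by the LLM Comparator web application.
comparison_result = comparison.run(
    llm_judge_inputs, judge, bulletizer, clusterer)

# write() serializes the result to disk and returns the path it wrote.
file_path = comparison.write(comparison_result, 'json_for_llm_comparator.json')

From there, the written JSON file can be served and rendered with the show_llm_comparator helper defined in the notebook above.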
