add bootstrapping figs and code

WM-SEMERU · Jun 5, 2024 · 4b5e4c7 · 4b5e4c7
1 parent 3853d8f
commit 4b5e4c7
Show file tree

Hide file tree

Showing 17 changed files with 984 additions and 44 deletions.
diff --git a/README.md b/README.md
@@ -1 +1,28 @@
-# emergent-capabilities
+# emergent-capabilities
+
+## Primary Files
+- `bugs2fix.ipynb` generates the graphs for the Bugs2Fix code repair task.
+- `code2code-trans.ipynb` generates the graphs for the Code2Code code translation task.
+- `commit-message.ipynb` generates the graphs for the commit message generation task.
+
+- `pull-tests.ipynb` installs the datasets from BIG and various other places. (I'm pretty sure CodeXGLUE was not installed this way - the repository was simply cloned to `data/CodeXGLUE`.)
+- `trim-tokens.ipynb` (***TODO***) is intended to uniformly trim output lines so that every line is at most 500 tokens long (useful because various configurations were used during the testing process).
+
+- `bleu.py` is code adapted from CodeXGLUE which calculates the BLEU metric.
+- `metric.py` is a wrapper around the various metrics we used in this project.
+- `model_wrapper.py` is a wrapper around (specifically) the CodeGen extended family of models.
+- `render_output.py` is a wrapper around matplotlib suited for our use case.
+- `run_battery.py` is helper code which streamlines the testcase running process.
+- `timehelp.py` is helper code which is responsible for timing operations and formatting them.
+
+## Scaffolding Files
+
+- `accelerate-test.ipynb` is scaffolding code which became the basis for `model_wrapper.py`, testing GPU loading & caching of the codegen models.
+- `codexglue-test.ipynb` is a scratchpad for initial testing of various prompts.
+- `consecutive.ipynb` is proof-of-concept code that loads the models in sequence without leaking GPU memory.
+- `metric-progress.ipynb` investigates the change in metric according to number of test cases processed.
+- `softmax.ipynb` verifies the correspondence between softmax and logits.
+- `testing.ipynb` is used for miscellaneous testing, but primarily the parsing of multiple choice questions.
+- `verify-result.ipynb` is debugging code used to examine questionable input/output pairs and assess what caused them (in this case, a bug/false assumption in the generation code).
+- `wrapper-test.ipynb` is a simple testing file for making sure the model wrapper works correctly.
+- `test.py` is an old testing file.
diff --git a/bars3d.ipynb b/bars3d.ipynb
diff --git a/bugs2fix.ipynb b/bugs2fix.ipynb
@@ -15,21 +15,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
-    "from run_battery import BatteryRunner\n",
+    "from run_battery import BatteryRunner, BatteryConfigs\n",
     "\n",
-    "runner = BatteryRunner(\n",
-    "    case_count=100,\n",
-    "    meta_count=None,\n",
-    "    task=\"bugs2fix\",\n",
-    "    prompts=[\n",
-    "        \"// the buggy version of the code\\n{prompt}\\n// the fixed version of the code\\n\",\n",
-    "        \"// You are given a piece of buggy code. Your task is to fix the error, and generate the corrected code. Fix the following code:\\n{prompt}\\n\",\n",
-    "    ],\n",
-    "    battery_path=\"./data/CodeXGLUE/Code-Code/code-refinement/data/small\",\n",
-    "    questions_file=\"test.buggy-fixed.buggy\",\n",
-    "    truth_file=\"test.buggy-fixed.fixed\",\n",
-    ")"
+    "runner = BatteryRunner.of(BatteryConfigs.Bugs2Fix)"
    ]
   },
   {

diff --git a/figs/b2f-bootstrap-bleu.png b/figs/b2f-bootstrap-bleu.png
diff --git a/figs/b2f-bootstrap-codebleu.png b/figs/b2f-bootstrap-codebleu.png
diff --git a/figs/c2c-bootstrap-bleu.png b/figs/c2c-bootstrap-bleu.png
diff --git a/figs/c2c-bootstrap-codebleu.png b/figs/c2c-bootstrap-codebleu.png
diff --git a/metric-progress.ipynb b/metric-progress.ipynb
diff --git a/metric.py b/metric.py
@@ -14,8 +14,9 @@ def b_norm(ref, trans):
     return _bleu(ref, trans, smooth=True, lower=True)
 
 class Metric:
-    def __init__(self, name, *, grade_single=None, grade_multi=None, baseline=0.0):
+    def __init__(self, name, shortname, *, grade_single=None, grade_multi=None, baseline=0.0):
         self.name = name
+        self.shortname = shortname
         self.grade_single = grade_single
         self.grade_multi = grade_multi
         self.baseline = 0.0
@@ -35,25 +36,31 @@ def grade(self, answer_key, answers):
 
 ExactMatch = Metric(
     name="Accuracy% (Exact Match)",
+    shortname="em",
     grade_single = lambda truth, answer: truth == answer,
 )
 BLEU = Metric(
     name="BLEU",
+    shortname="bleu",
     grade_multi = _bleu,
 )
 CodeBLEUJava = Metric(
     name="CodeBLEU (Java)",
+    shortname="codebleu-java",
     grade_multi = partial(_codebleu, lang="java"),
 )
 CodeBLEUCSharp = Metric(
     name="CodeBLEU (C#)",
+    shortname="codebleu-cs",
     grade_multi = partial(_codebleu, lang="c_sharp"),
 )
 BMoses = Metric(
     name="B-Moses",
+    shortname="codebleu-bmoses",
     grade_multi = b_moses,
 )
 BNorm = Metric(
     name="B-Norm",
+    shortname="codebleu-bnorm",
     grade_multi = b_norm,
 )
diff --git a/output/bugs2fix/.ipynb_checkpoints/bootstrap-bleu-checkpoint.json b/output/bugs2fix/.ipynb_checkpoints/bootstrap-bleu-checkpoint.json
diff --git a/output/bugs2fix/bootstrap-bleu.json b/output/bugs2fix/bootstrap-bleu.json
diff --git a/output/bugs2fix/bootstrap-codebleu-java.json b/output/bugs2fix/bootstrap-codebleu-java.json
diff --git a/output/code2code-trans/bootstrap-bleu.json b/output/code2code-trans/bootstrap-bleu.json
diff --git a/output/code2code-trans/bootstrap-codebleu-java.json b/output/code2code-trans/bootstrap-codebleu-java.json
diff --git a/render_output.py b/render_output.py
@@ -1,11 +1,12 @@
 import matplotlib.pyplot as plt
+import matplotlib.colors as mcolors
 from statistics import median
 import numpy as np
 
 class OutputRenderer:
     def __init__(self, baseline=0.0, metric="(Unspecified metric)"):
         self.x_values = [0.35, 2.70, 6.10, 16.10]
-        self.box_color = "Pink"
+        #self.box_color = "Pink"
         self.baseline = baseline
         self.metric = metric
 
@@ -18,19 +19,52 @@ def set_lim(self, y_max=None):
         plt.ylim(0, y_max)
 
 
-    def draw_box(self, ax, ys):
+    def draw_box(self, ax, ys, box_color):
+        solid_color = mcolors.to_rgb(box_color)
+        # a black version of the given color
+        black_ratio = 0.2
+        black = (0.0, 0.0, 0.0)
+        edge_color = tuple(
+            (1 - black_ratio) * np.array(solid_color)
+            + black_ratio * np.array(black)
+        )
         bplot = ax.boxplot(
             ys,
             positions=self.x_values,
             widths=1,
             manage_ticks=False,
             patch_artist=True,
             zorder=5,
-            medianprops=dict(color="black"),
+            medianprops=dict(
+                color=edge_color,
+                linewidth=2
+            ),
+            whiskerprops=dict(
+                color=edge_color,
+                linewidth=2
+            ),
+            capprops=dict(
+                color=edge_color,
+                linewidth=2
+            ),
+            flierprops=dict(
+                markersize=5,
+                markeredgecolor=solid_color,
+                markerfacecolor=solid_color,
+                marker=".",
+                # the "x" marker is cursed, idky
+                # markeredgecolor=box_color,
+                # marker="x",
+                # linewidth=15,
+            ),
+            boxprops=dict(
+                color=edge_color,
+                linewidth=2
+            ),
         )
 
         for patch in bplot["boxes"]:
-            patch.set_facecolor(self.box_color)
+            patch.set_facecolor(box_color)
 
     def draw_line(self, ax, ys, label=None, color="b"):
         medians = [median(vals) for vals in ys]
@@ -65,11 +99,13 @@ def draw_random_annotation(self, y_max=None):
             color="orange",
             horizontalalignment="right",
         )
+
 
-    def meta_info(self):
+    def meta_info(self, title=None):
         plt.xlabel("Parameters (in billions)")
         plt.ylabel(self.metric)
-        plt.title("Model Performance")
+        plt.title(title or "Model Performance")
+
 
     def draw_bands(self, ax, ys, color="b"):
         q1 = [np.percentile(val, 25) for val in ys]
@@ -92,8 +128,9 @@ def draw_bands(self, ax, ys, color="b"):
             zorder=5,
             color=color,
         )
+
 
-    def render(self, ys, y_max=None, save=None):
+    def render(self, ys, y_max=None, save=None, title=None):
         y_lines = ys
         if not isinstance(y_lines, dict):
             y_lines = { "unnamed": y_lines }
@@ -111,7 +148,7 @@ def render(self, ys, y_max=None, save=None):
         ax1 = fig.add_subplot(111)
         plt.grid(True)
         self.set_lim(y_max=y_max)
-        self.meta_info()
+        self.meta_info(title=title)
 
         self.draw_random_annotation(y_max=y_max)
 
@@ -120,7 +157,9 @@ def render(self, ys, y_max=None, save=None):
             color = colors[idx % len(colors)]
             self.draw_bands(ax1, ys, color=color)
             self.draw_line(ax1, ys, label=key, color=color)
-            self.draw_box(ax1, ys)
+            box_color = mcolors.to_rgb(color)
+            box_color += (0.3, )
+            self.draw_box(ax1, ys, box_color)
 
         plt.legend()