Commit caab6bc (1 parent: 89b39d3)
Showing 7 changed files with 1,516 additions and 59 deletions.
@@ -0,0 +1,152 @@
# The following file is a modified version of https://github.com/microsoft/CodeXGLUE/blob/main/Code-Code/code-refinement/evaluator/bleu.py
# As per Apache 2.0, the license originally included with the code must be included here.
# The following modifications have been made:
# - Make _bleu deal with lists rather than files
# - Remove presumed legacy code for dealing with multiple files at a time
# - Abstract the notion of tokenization into the function tokenize_line
# - Clean up some spacing
# - Remove rounding from _bleu (round(100 * bleu_score, 2) ---> bleu_score)

# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Python implementation of BLEU and smooth-BLEU.

This module provides a Python implementation of BLEU and smooth-BLEU.
Smooth BLEU is computed following the method outlined in the paper:
Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
evaluation metrics for machine translation. COLING 2004.
"""

import collections
import math

def _get_ngrams(segment, max_order):
  """Extracts all n-grams up to a given maximum order from an input segment.

  Args:
    segment: text segment from which n-grams will be extracted.
    max_order: maximum length in tokens of the n-grams returned by this
        method.

  Returns:
    The Counter containing all n-grams up to max_order in segment
    with a count of how many times each n-gram occurred.
  """
  ngram_counts = collections.Counter()
  for order in range(1, max_order + 1):
    for i in range(0, len(segment) - order + 1):
      ngram = tuple(segment[i:i + order])
      ngram_counts[ngram] += 1
  return ngram_counts
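
# Illustrative example (added here for clarity; not part of the upstream file):
#   _get_ngrams(["a", "b", "a"], 2)
#   -> Counter({('a',): 2, ('b',): 1, ('a', 'b'): 1, ('b', 'a'): 1})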

def compute_bleu(reference_corpus, translation_corpus, max_order=4,
                 smooth=False):
  """Computes BLEU score of translated segments against one or more references.

  Args:
    reference_corpus: list of lists of references for each translation. Each
        reference should be tokenized into a list of tokens.
    translation_corpus: list of translations to score. Each translation
        should be tokenized into a list of tokens.
    max_order: Maximum n-gram order to use when computing BLEU score.
    smooth: Whether or not to apply Lin et al. 2004 smoothing.

  Returns:
    6-tuple of the BLEU score, the per-order n-gram precisions, the brevity
    penalty, the length ratio, the translation length, and the reference
    length.
  """
  matches_by_order = [0] * max_order
  possible_matches_by_order = [0] * max_order
  reference_length = 0
  translation_length = 0
  for (references, translation) in zip(reference_corpus,
                                       translation_corpus):
    reference_length += min(len(r) for r in references)
    translation_length += len(translation)

    merged_ref_ngram_counts = collections.Counter()
    for reference in references:
      merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
    translation_ngram_counts = _get_ngrams(translation, max_order)
    overlap = translation_ngram_counts & merged_ref_ngram_counts
    for ngram in overlap:
      matches_by_order[len(ngram) - 1] += overlap[ngram]
    for order in range(1, max_order + 1):
      possible_matches = len(translation) - order + 1
      if possible_matches > 0:
        possible_matches_by_order[order - 1] += possible_matches

  precisions = [0] * max_order
  for i in range(0, max_order):
    if smooth:
      precisions[i] = ((matches_by_order[i] + 1.) /
                       (possible_matches_by_order[i] + 1.))
    else:
      if possible_matches_by_order[i] > 0:
        precisions[i] = (float(matches_by_order[i]) /
                         possible_matches_by_order[i])
      else:
        precisions[i] = 0.0

  if min(precisions) > 0:
    p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
    geo_mean = math.exp(p_log_sum)
  else:
    geo_mean = 0

  ratio = float(translation_length) / reference_length

  if ratio > 1.0:
    bp = 1.
  else:
    bp = math.exp(1 - 1. / ratio)

  bleu = geo_mean * bp

  return (bleu, precisions, bp, ratio, translation_length, reference_length)
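
# Illustrative example (added here for clarity; not part of the upstream
# file): a translation that exactly matches its single reference scores a
# perfect BLEU:
#   bleu, precisions, bp, ratio, _, _ = compute_bleu(
#       [[["the", "cat", "sat", "down"]]], [["the", "cat", "sat", "down"]])
#   # bleu == 1.0, precisions == [1.0, 1.0, 1.0, 1.0], bp == 1.0, ratio == 1.0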

def tokenize_line(line):
  return line.strip().split()
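# e.g. tokenize_line("  the cat  sat \n") -> ["the", "cat", "sat"]
# (plain whitespace tokenization; no subword or punctuation handling).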

def _bleu(reference_lines, translation_lines, subword_option=None):
  """Corpus-level smoothed BLEU over pre-split reference/translation lines."""
  # subword_option appears unused in this version; it is retained from the
  # original file-based interface.
  max_order = 4
  smooth = True

  reference_text = [
      tokenize_line(line)
      for line in reference_lines
  ]
  per_segment_references = [
      [line]
      for line in reference_text
  ]

  translations = [
      tokenize_line(line)
      for line in translation_lines
  ]

  bleu_score, _, _, _, _, _ = compute_bleu(
      per_segment_references,
      translations,
      max_order,
      smooth
  )

  return bleu_score
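
# Minimal usage sketch (added for illustration; not part of the original
# commit). _bleu expects one reference line per translation line.
if __name__ == "__main__":
  refs = ["the cat sat on the mat"]
  hyps = ["the cat sat on a mat"]
  # Smoothed BLEU-4; rounding was removed upstream, so the score is in [0, 1].
  print(_bleu(refs, hyps))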