Major fixes
salvacarrion committed Jun 12, 2024
1 parent 0df0918 commit 76a20c7
Showing 11 changed files with 155 additions and 249 deletions.
24 changes: 23 additions & 1 deletion autonmt/bundle/utils.py
@@ -403,4 +403,26 @@ def basic_stats(tokens, prefix=""):
f"{prefix}percentile99.982_tokens": int(np.percentile(tokens, 99.982)), # TIER III
f"{prefix}percentile99.995_tokens": int(np.percentile(tokens, 99.995)), # TIER IV
}
return d
return d


def text2hex(text, return_str=False):
# There are multiple ways to do this: hex(ord(c)) vs. c.encode('utf-8').hex()
# Here, I chose the hexadecimal representation of Unicode code points because we deal with the Unicode
# values directly. In contrast, the hexadecimal representation of encoded bytes may split a single
# character into several byte values, which is undesirable here.
hex_values = [hex(ord(c)) for c in text]
res = ' '.join(hex_values) if return_str else hex_values
return res


def hex2text(hex_values, return_str=False):
# Converts each hexadecimal code point to its Unicode character.
if isinstance(hex_values, str):
hex_values = hex_values.split(' ')
elif isinstance(hex_values, list):
pass
else:
raise ValueError("hex_values must be a list of strings or a string")
text_values = [chr(int(c, 16)) for c in hex_values] # Raises ValueError if a hex value is not valid
res = ' '.join(text_values) if return_str else text_values
return res
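
For reference, a minimal round-trip with the two helpers added above might look like the sketch below (illustrative only; it assumes text2hex and hex2text are importable from autonmt.bundle.utils):

from autonmt.bundle.utils import text2hex, hex2text

text = "¡Hola!"

# One hex value per Unicode code point (not per UTF-8 byte)
hex_str = text2hex(text, return_str=True)    # "0xa1 0x48 0x6f 0x6c 0x61 0x21"

# hex2text returns characters; join the list form to recover the original string
chars = hex2text(hex_str, return_str=False)  # ['¡', 'H', 'o', 'l', 'a', '!']
assert "".join(chars) == text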
13 changes: 5 additions & 8 deletions autonmt/modules/seq2seq.py
@@ -151,19 +151,16 @@ def _step(self, batch, batch_idx, log_prefix):
return loss, outputs

def _compute_metrics(self, y_hat, y, x, metrics, log_prefix):
# Decode lines
# Decode lines (only during training)
# Since the reference lines are re-encoded, unknown tokens can appear. Therefore, for small vocabularies the scores can be strongly biased
hyp_lines = [self._trg_vocab.decode(list(x)) for x in y_hat.detach().cpu().numpy()]
ref_lines = [self._trg_vocab.decode(list(x)) for x in y.detach().cpu().numpy()]
src_lines = [self._src_vocab.decode(list(x)) for x in x.detach().cpu().numpy()]

# Full decoding
hyp_lines = decode_lines(hyp_lines, self._trg_vocab.lang, self._trg_vocab.subword_model, self._trg_vocab.pretok_flag,
self._trg_vocab.spm_model, remove_unk_hyphen=True)
ref_lines = decode_lines(ref_lines, self._trg_vocab.lang, self._trg_vocab.subword_model, self._trg_vocab.pretok_flag,
self._trg_vocab.spm_model, remove_unk_hyphen=True)
src_lines = decode_lines(src_lines, self._src_vocab.lang, self._src_vocab.subword_model, self._src_vocab.pretok_flag,
self._src_vocab.spm_model, remove_unk_hyphen=True)
# Full decoding (lines are stripped)
hyp_lines = decode_lines(hyp_lines, self._trg_vocab.lang, self._trg_vocab.subword_model, self._trg_vocab.pretok_flag, self._trg_vocab.spm_model)
ref_lines = decode_lines(ref_lines, self._trg_vocab.lang, self._trg_vocab.subword_model, self._trg_vocab.pretok_flag, self._trg_vocab.spm_model)
src_lines = decode_lines(src_lines, self._src_vocab.lang, self._src_vocab.subword_model, self._src_vocab.pretok_flag, self._src_vocab.spm_model)

# Compute metrics
scores = []
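The caveat in the comment above (scores can be strongly biased for small vocabularies) can be made concrete with a self-contained toy; this is not autonmt's Vocabulary API, only an illustration of why matching <unk> tokens inflates surface metrics:

# Out-of-vocabulary words in the *reference* become <unk> after encode/decode
vocab = {"the", "cat", "<unk>"}

def encode(words):
    return [w if w in vocab else "<unk>" for w in words]

ref = encode("the cat sat on the mat".split())          # ['the', 'cat', '<unk>', '<unk>', 'the', '<unk>']
hyp = ["the", "cat", "<unk>", "<unk>", "the", "<unk>"]  # output that merely emits <unk> in the right places

# Token-level accuracy against the re-encoded reference is perfect,
# even though the model never produced "sat", "on" or "mat".
accuracy = sum(h == r for h, r in zip(hyp, ref)) / len(ref)
print(accuracy)  # 1.0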
104 changes: 58 additions & 46 deletions autonmt/preprocessing/builder.py
@@ -148,9 +148,9 @@ def build(self, make_plots=False, force_overwrite=False, verbose=False):

# Encode preprocessing
self._encode_datasets(force_overwrite=force_overwrite)
self._export_vocab_frequencies(force_overwrite=force_overwrite)

# Compute stats
self._export_vocab_frequencies(force_overwrite=force_overwrite)
self._compute_stats(force_overwrite=force_overwrite, print_stats=verbose)

# Make plot
@@ -450,48 +450,62 @@ def _train_tokenizer(self, force_overwrite):
print(f"\t- Building vocabulary: {ds.id2(as_path=True)}")

# Ignore dataset but create directories (just in case... for plots or stats)
if ds.subword_model in {None, "none", "bytes"}:
if ds.subword_model in {None, "none"}:
continue

# Pretokenize (if needed - words)
self._pretokenize(ds, force_overwrite)

# Get train files
file_path_fn = ds.get_pretok_path if ds.pretok_flag else ds.get_splits_auto_path
src_train_path = file_path_fn(fname=f"{ds.train_name}.{src_lang}")
trg_train_path = file_path_fn(fname=f"{ds.train_name}.{trg_lang}")

# One or two models
if self.merge_vocabs: # One model
concat_train_path = os.path.join(tmp_path, f"{ds.train_name}.{src_lang}-{trg_lang}")

# Concat files
if force_overwrite or not os.path.exists(concat_train_path):
# Read files
lines = read_file_lines(src_train_path, autoclean=True)
lines += read_file_lines(trg_train_path, autoclean=True)

# Shuffle lines: Just in case because can spm_train load the first X lines of corpus by default
random.shuffle(lines)

# Save file
write_file_lines(lines=lines, filename=concat_train_path, insert_break_line=True)
files = [(concat_train_path, f"{src_lang}-{trg_lang}")]
else: # Two models
files = [(src_train_path, f"{src_lang}"), (trg_train_path, f"{trg_lang}")]

# Train models
for input_file, ext in files:
output_file = ds.get_vocab_file(lang=ext) # without extension
if force_overwrite or not os.path.exists(f"{output_file}.model"):
tokenizers.spm_train_file(input_file=input_file, model_prefix=output_file, subword_model=ds.subword_model,
vocab_size=ds.vocab_size, input_sentence_size=self.input_sentence_size,
character_coverage=self.character_coverage, split_digits=self.split_digits)
assert os.path.exists(f"{output_file}.model")

# Check vocabs
print(f"=> Checking existing vocabularies...")
ds.check_vocab_folder_consistency()
elif ds.subword_model in {"bytes"}: # Trick: bytes is a special case
# Generate vocab
tokens = [f'0x{byte:02x}' for byte in range(256)]
special_tokens = ["<unk>", "<s>", "</s>", "<pad>"] # "<unk>" is not strictly needed; added for consistency
tokens_str = [f"{tok}\t{0}" for tok in (special_tokens + tokens)] # SPM .vocab format: one "token<TAB>log_prob" pair per line

# Save vocab (if needed)
langs_ext = [f"{src_lang}-{trg_lang}"] if self.merge_vocabs else [src_lang, trg_lang]
for ext in langs_ext:
output_file = ds.get_vocab_file(lang=ext) # without extension
if force_overwrite or not os.path.exists(f"{output_file}.vocab"):
write_file_lines(tokens_str, filename=f"{output_file}.vocab", insert_break_line=True)

else: # words, bpe, unigram and chars
# Pretokenize (if needed - words)
self._pretokenize(ds, force_overwrite)

# Get train files
file_path_fn = ds.get_pretok_path if ds.pretok_flag else ds.get_splits_auto_path
src_train_path = file_path_fn(fname=f"{ds.train_name}.{src_lang}")
trg_train_path = file_path_fn(fname=f"{ds.train_name}.{trg_lang}")

# One or two models
if self.merge_vocabs: # One model
concat_train_path = os.path.join(tmp_path, f"{ds.train_name}.{src_lang}-{trg_lang}")

# Concat files
if force_overwrite or not os.path.exists(concat_train_path):
# Read files
lines = read_file_lines(src_train_path, autoclean=True)
lines += read_file_lines(trg_train_path, autoclean=True)

# Shuffle lines: just in case, since spm_train may load only the first X lines of the corpus by default
random.shuffle(lines)

# Save file
write_file_lines(lines=lines, filename=concat_train_path, insert_break_line=True)
files = [(concat_train_path, f"{src_lang}-{trg_lang}")]
else: # Two models
files = [(src_train_path, f"{src_lang}"), (trg_train_path, f"{trg_lang}")]

# Train models
for input_file, ext in files:
output_file = ds.get_vocab_file(lang=ext) # without extension
if force_overwrite or not os.path.exists(f"{output_file}.model"):
tokenizers.spm_train_file(input_file=input_file, model_prefix=output_file, subword_model=ds.subword_model,
vocab_size=ds.vocab_size, input_sentence_size=self.input_sentence_size,
character_coverage=self.character_coverage, split_digits=self.split_digits)
assert os.path.exists(f"{output_file}.model")

# Check vocabs
print(f"=> Checking existing vocabularies...")
ds.check_vocab_folder_consistency()

def _encode_datasets(self, force_overwrite):
print(f"=> Building datasets...")
@@ -512,11 +526,9 @@ def _encode_datasets(self, force_overwrite):
input_file = file_path_fn(fname=fname)
output_file = ds.get_encoded_path(fname)

# Select model
if self.merge_vocabs:
model_path = ds.get_vocab_file() + ".model"
else:
model_path = ds.get_vocab_file(lang=lang) + ".model"
# Select model (not used for bytes)
model_path = ds.get_vocab_file() if self.merge_vocabs else ds.get_vocab_file(lang=lang)
model_path += ".model" # Add extension

# Encode file
encode_file(input_file=input_file, output_file=output_file, model_vocab_path=model_path,
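For orientation, the bytes branch above writes a SentencePiece-style .vocab file by hand: special tokens first, then the 256 byte symbols, one "token<TAB>log_prob" pair per line. A standalone sketch of what gets written (the output file name here is illustrative):

# Standalone version of the byte-level vocabulary generation
tokens = [f"0x{byte:02x}" for byte in range(256)]
special_tokens = ["<unk>", "<s>", "</s>", "<pad>"]

with open("bytes.vocab", "w", encoding="utf8") as f:
    for tok in special_tokens + tokens:
        f.write(f"{tok}\t0\n")   # e.g. "<unk>\t0", "0x00\t0", ..., "0xff\t0"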
2 changes: 1 addition & 1 deletion autonmt/preprocessing/dataset.py
Expand Up @@ -40,7 +40,7 @@ def __init__(self, base_path, parent_ds, dataset_name, dataset_lang_pair, datase
# Dataset versions
self.subword_model = str(subword_model).lower() if subword_model else subword_model
self.vocab_size = str(vocab_size).lower() if vocab_size else vocab_size
self.pretok_flag = self.subword_model in {"word", "words"}
self.pretok_flag = self.subword_model in {"word"}
self.merge_vocabs = merge_vocabs

# Preprocessing
Expand Down
37 changes: 10 additions & 27 deletions autonmt/preprocessing/processors.py
@@ -149,15 +149,14 @@ def encode_file(input_file, output_file, model_vocab_path, subword_model, force_
if subword_model in {None, "none"}:
shutil.copyfile(input_file, output_file)

elif subword_model in {"bytes"}:
elif subword_model in {"bytes"}: # No vocab is needed (just bytes)
# Save file as UTF8 and make sure everything uses NFKC
lines = read_file_lines(input_file, autoclean=True)
lines = [NFKC().normalize_str(line) for line in lines]
lines = [" ".join([hex(x) for x in line.encode()]) for line in lines]
lines = [utils.text2hex(line, return_str=True) for line in lines]
write_file_lines(lines=lines, filename=output_file, insert_break_line=True)

else:

# Encode files
tokenizers.spm_encode_file(spm_model_path=model_vocab_path, input_file=input_file, output_file=output_file)

@@ -170,21 +169,14 @@ def decode_file(input_file, output_file, lang, subword_model, pretok_flag, model
if force_overwrite or not os.path.exists(output_file):

# Detokenize
if subword_model in {None, "none"}:
# Rename or copy files (tok==txt)
shutil.copyfile(input_file, output_file)

elif subword_model in {"bytes"}:
if subword_model in {None, "none", "bytes"}:
# Rename or copy files (tok==txt)
shutil.copyfile(input_file, output_file)

else:
# Decode files
# Decode files (Note: SPM leaves a '▁' at the beginning of the line; we'll remove it later)
tokenizers.spm_decode_file(model_vocab_path, input_file=input_file, output_file=output_file)

# Remove the hyphen of unknown words when needed
if remove_unk_hyphen:
replace_in_file('▁', ' ', output_file)
replace_in_file('▁', ' ', output_file)

# Detokenize with moses
if pretok_flag:
Expand All @@ -194,23 +186,14 @@ def decode_file(input_file, output_file, lang, subword_model, pretok_flag, model
assert os.path.exists(output_file)


def decode_lines(lines, lang, subword_model, pretok_flag, spm_model=None, remove_unk_hyphen=False):
def decode_lines(lines, lang, subword_model, pretok_flag, spm_model=None):
# Detokenize
if subword_model in {None, "none"}:
# Rename or copy files (tok==txt)
lines = lines

elif subword_model in {"bytes"}:
lines = lines
# Decode files
# lines = [utils.clean_file_line(bytes([int(x, base=16) for x in line.split(' ')])) for line in lines]
if subword_model in {None, "none", "bytes"}:
pass
else:
# Decode files
# Decode files (Note: Lines stripped because SPM leaves a '▁' at the beginning of the line)
lines = tokenizers._spm_decode(lines, spm_model)

# Remove the hyphen of unknown words when needed
if remove_unk_hyphen:
lines = [line.replace('▁', ' ') for line in lines]
lines = [line.replace('▁', ' ').strip() for line in lines]

# Detokenize with moses
if pretok_flag:
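To make the new bytes encode path concrete: each line is NFKC-normalized and then stored as space-separated code-point hex values, while decode_file now simply copies bytes files. A per-line sketch (unicodedata is used here for NFKC where the repo uses the tokenizers normalizer; the utils import path is assumed):

import unicodedata

from autonmt.bundle import utils  # provides text2hex (added in this commit)

line = "Dvořák!"
line = unicodedata.normalize("NFKC", line)        # NFKC normalization, as in encode_file
hex_line = utils.text2hex(line, return_str=True)  # "0x44 0x76 0x6f 0x159 0xe1 0x6b 0x21"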
4 changes: 0 additions & 4 deletions autonmt/preprocessing/tokenizers.py
@@ -37,10 +37,6 @@ def moses_detokenizer_file(input_file, output_file, lang):
utils.write_file_lines(lines=lines, filename=output_file, insert_break_line=True)

def spm_train_file(input_file, model_prefix, subword_model, vocab_size, input_sentence_size, character_coverage, split_digits):
# Normalize
if subword_model in {"word", "words"}:
subword_model = "word"

# Enable
byte_fallback = False
if "+bytes" in subword_model:
2 changes: 0 additions & 2 deletions autonmt/vocabularies/__init__.py
@@ -1,3 +1 @@
from autonmt.vocabularies.bytes_vocab import BytesVocabulary
from autonmt.vocabularies.whitespace_vocab import Vocabulary
from autonmt.vocabularies.whitespace_vocab_old import VocabularyOld
48 changes: 0 additions & 48 deletions autonmt/vocabularies/bytes_vocab.py

This file was deleted.
