Major fixes
salvacarrion committed Jun 12, 2024
1 parent 0df0918 commit 76a20c7
Showing 11 changed files with 155 additions and 249 deletions.
24 changes: 23 additions & 1 deletion autonmt/bundle/utils.py
@@ -403,4 +403,26 @@ def basic_stats(tokens, prefix=""):
f"{prefix}percentile99.982_tokens": int(np.percentile(tokens, 99.982)), # TIER III
f"{prefix}percentile99.995_tokens": int(np.percentile(tokens, 99.995)), # TIER IV
}
return d
return d


def text2hex(text, return_str=False):
# There are multiple ways to do this: hex(ord(c)) vs. c.encode('utf-8').hex()
# Here, I chose the hexadecimal representation of Unicode code points because we deal with the Unicode
# values directly. In contrast, the hexadecimal representation of encoded bytes may split a single
# character into several byte values, which is undesirable here.
hex_values = [hex(ord(c)) for c in text]
res = ' '.join(hex_values) if return_str else hex_values
return res


def hex2text(hex_values, return_str=False):
# Converts each hexadecimal code point to its Unicode character.
if isinstance(hex_values, str):
hex_values = hex_values.split(' ')
elif isinstance(hex_values, list):
pass
else:
raise ValueError("hex_values must be a list of strings or a string")
text_values = [chr(int(c, 16)) for c in hex_values] # Raises ValueError if a hex value is not valid
res = ' '.join(text_values) if return_str else text_values
return res
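
For reference, a minimal round-trip with the two helpers added above might look like the sketch below (illustrative only; it assumes text2hex and hex2text are importable from autonmt.bundle.utils):

from autonmt.bundle.utils import text2hex, hex2text

text = "¡Hola!"

# One hex value per Unicode code point (not per UTF-8 byte)
hex_str = text2hex(text, return_str=True)    # "0xa1 0x48 0x6f 0x6c 0x61 0x21"

# hex2text returns characters; join the list form to recover the original string
chars = hex2text(hex_str, return_str=False)  # ['¡', 'H', 'o', 'l', 'a', '!']
assert "".join(chars) == text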
13 changes: 5 additions & 8 deletions autonmt/modules/seq2seq.py
@@ -151,19 +151,16 @@ def _step(self, batch, batch_idx, log_prefix):
return loss, outputs

def _compute_metrics(self, y_hat, y, x, metrics, log_prefix):
# Decode lines
# Decode lines (only during training)
# Since the reference lines are re-encoded, unknown tokens can appear. Therefore, for small vocabularies the scores can be strongly biased
hyp_lines = [self._trg_vocab.decode(list(x)) for x in y_hat.detach().cpu().numpy()]
ref_lines = [self._trg_vocab.decode(list(x)) for x in y.detach().cpu().numpy()]
src_lines = [self._src_vocab.decode(list(x)) for x in x.detach().cpu().numpy()]

# Full decoding
hyp_lines = decode_lines(hyp_lines, self._trg_vocab.lang, self._trg_vocab.subword_model, self._trg_vocab.pretok_flag,
self._trg_vocab.spm_model, remove_unk_hyphen=True)
ref_lines = decode_lines(ref_lines, self._trg_vocab.lang, self._trg_vocab.subword_model, self._trg_vocab.pretok_flag,
self._trg_vocab.spm_model, remove_unk_hyphen=True)
src_lines = decode_lines(src_lines, self._src_vocab.lang, self._src_vocab.subword_model, self._src_vocab.pretok_flag,
self._src_vocab.spm_model, remove_unk_hyphen=True)
# Full decoding (lines are stripped)
hyp_lines = decode_lines(hyp_lines, self._trg_vocab.lang, self._trg_vocab.subword_model, self._trg_vocab.pretok_flag, self._trg_vocab.spm_model)
ref_lines = decode_lines(ref_lines, self._trg_vocab.lang, self._trg_vocab.subword_model, self._trg_vocab.pretok_flag, self._trg_vocab.spm_model)
src_lines = decode_lines(src_lines, self._src_vocab.lang, self._src_vocab.subword_model, self._src_vocab.pretok_flag, self._src_vocab.spm_model)

# Compute metrics
scores = []
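The caveat in the comment above (scores can be strongly biased for small vocabularies) can be made concrete with a self-contained toy; this is not autonmt's Vocabulary API, only an illustration of why matching <unk> tokens inflates surface metrics:

# Out-of-vocabulary words in the *reference* become <unk> after encode/decode
vocab = {"the", "cat", "<unk>"}

def encode(words):
    return [w if w in vocab else "<unk>" for w in words]

ref = encode("the cat sat on the mat".split())          # ['the', 'cat', '<unk>', '<unk>', 'the', '<unk>']
hyp = ["the", "cat", "<unk>", "<unk>", "the", "<unk>"]  # output that merely emits <unk> in the right places

# Token-level accuracy against the re-encoded reference is perfect,
# even though the model never produced "sat", "on" or "mat".
accuracy = sum(h == r for h, r in zip(hyp, ref)) / len(ref)
print(accuracy)  # 1.0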
104 changes: 58 additions & 46 deletions autonmt/preprocessing/builder.py
@@ -148,9 +148,9 @@ def build(self, make_plots=False, force_overwrite=False, verbose=False):

# Encode preprocessing
self._encode_datasets(force_overwrite=force_overwrite)
self._export_vocab_frequencies(force_overwrite=force_overwrite)

# Compute stats
self._export_vocab_frequencies(force_overwrite=force_overwrite)
self._compute_stats(force_overwrite=force_overwrite, print_stats=verbose)

# Make plot
@@ -450,48 +450,62 @@ def _train_tokenizer(self, force_overwrite):
print(f"\t- Building vocabulary: {ds.id2(as_path=True)}")

# Ignore dataset but create directories (just in case... for plots or stats)
if ds.subword_model in {None, "none", "bytes"}:
if ds.subword_model in {None, "none"}:
continue

# Pretokenize (if needed - words)
self._pretokenize(ds, force_overwrite)

# Get train files
file_path_fn = ds.get_pretok_path if ds.pretok_flag else ds.get_splits_auto_path
src_train_path = file_path_fn(fname=f"{ds.train_name}.{src_lang}")
trg_train_path = file_path_fn(fname=f"{ds.train_name}.{trg_lang}")

# One or two models
if self.merge_vocabs: # One model
concat_train_path = os.path.join(tmp_path, f"{ds.train_name}.{src_lang}-{trg_lang}")

# Concat files
if force_overwrite or not os.path.exists(concat_train_path):
# Read files
lines = read_file_lines(src_train_path, autoclean=True)
lines += read_file_lines(trg_train_path, autoclean=True)

# Shuffle lines: Just in case because can spm_train load the first X lines of corpus by default
random.shuffle(lines)

# Save file
write_file_lines(lines=lines, filename=concat_train_path, insert_break_line=True)
files = [(concat_train_path, f"{src_lang}-{trg_lang}")]
else: # Two models
files = [(src_train_path, f"{src_lang}"), (trg_train_path, f"{trg_lang}")]

# Train models
for input_file, ext in files:
output_file = ds.get_vocab_file(lang=ext) # without extension
if force_overwrite or not os.path.exists(f"{output_file}.model"):
tokenizers.spm_train_file(input_file=input_file, model_prefix=output_file, subword_model=ds.subword_model,
vocab_size=ds.vocab_size, input_sentence_size=self.input_sentence_size,
character_coverage=self.character_coverage, split_digits=self.split_digits)
assert os.path.exists(f"{output_file}.model")

# Check vocabs
print(f"=> Checking existing vocabularies...")
ds.check_vocab_folder_consistency()
elif ds.subword_model in {"bytes"}: # Trick: bytes is a special case
# Generate vocab
tokens = [f'0x{byte:02x}' for byte in range(256)]
special_tokens = ["<unk>", "<s>", "</s>", "<pad>"] # "<unk>" is not strictly needed; added for consistency
tokens_str = [f"{tok}\t{0}" for tok in (special_tokens + tokens)] # SPM .vocab format: one "token<TAB>log_prob" pair per line

# Save vocab (if needed)
langs_ext = [f"{src_lang}-{trg_lang}"] if self.merge_vocabs else [src_lang, trg_lang]
for ext in langs_ext:
output_file = ds.get_vocab_file(lang=ext) # without extension
if force_overwrite or not os.path.exists(f"{output_file}.vocab"):
write_file_lines(tokens_str, filename=f"{output_file}.vocab", insert_break_line=True)

else: # words, bpe, unigram and chars
# Pretokenize (if needed - words)
self._pretokenize(ds, force_overwrite)

# Get train files
file_path_fn = ds.get_pretok_path if ds.pretok_flag else ds.get_splits_auto_path
src_train_path = file_path_fn(fname=f"{ds.train_name}.{src_lang}")
trg_train_path = file_path_fn(fname=f"{ds.train_name}.{trg_lang}")

# One or two models
if self.merge_vocabs: # One model
concat_train_path = os.path.join(tmp_path, f"{ds.train_name}.{src_lang}-{trg_lang}")

# Concat files
if force_overwrite or not os.path.exists(concat_train_path):
# Read files
lines = read_file_lines(src_train_path, autoclean=True)
lines += read_file_lines(trg_train_path, autoclean=True)

# Shuffle lines: just in case, since spm_train may load only the first X lines of the corpus by default
random.shuffle(lines)

# Save file
write_file_lines(lines=lines, filename=concat_train_path, insert_break_line=True)
files = [(concat_train_path, f"{src_lang}-{trg_lang}")]
else: # Two models
files = [(src_train_path, f"{src_lang}"), (trg_train_path, f"{trg_lang}")]

# Train models
for input_file, ext in files:
output_file = ds.get_vocab_file(lang=ext) # without extension
if force_overwrite or not os.path.exists(f"{output_file}.model"):
tokenizers.spm_train_file(input_file=input_file, model_prefix=output_file, subword_model=ds.subword_model,
vocab_size=ds.vocab_size, input_sentence_size=self.input_sentence_size,
character_coverage=self.character_coverage, split_digits=self.split_digits)
assert os.path.exists(f"{output_file}.model")

# Check vocabs
print(f"=> Checking existing vocabularies...")
ds.check_vocab_folder_consistency()

def _encode_datasets(self, force_overwrite):
print(f"=> Building datasets...")
@@ -512,11 +526,9 @@ def _encode_datasets(self, force_overwrite):
input_file = file_path_fn(fname=fname)
output_file = ds.get_encoded_path(fname)

# Select model
if self.merge_vocabs:
model_path = ds.get_vocab_file() + ".model"
else:
model_path = ds.get_vocab_file(lang=lang) + ".model"
# Select model (not used for bytes)
model_path = ds.get_vocab_file() if self.merge_vocabs else ds.get_vocab_file(lang=lang)
model_path += ".model" # Add extension

# Encode file
encode_file(input_file=input_file, output_file=output_file, model_vocab_path=model_path,
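For orientation, the bytes branch above writes a SentencePiece-style .vocab file by hand: special tokens first, then the 256 byte symbols, one "token<TAB>log_prob" pair per line. A standalone sketch of what gets written (the output file name here is illustrative):

# Standalone version of the byte-level vocabulary generation
tokens = [f"0x{byte:02x}" for byte in range(256)]
special_tokens = ["<unk>", "<s>", "</s>", "<pad>"]

with open("bytes.vocab", "w", encoding="utf8") as f:
    for tok in special_tokens + tokens:
        f.write(f"{tok}\t0\n")   # e.g. "<unk>\t0", "0x00\t0", ..., "0xff\t0"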
2 changes: 1 addition & 1 deletion autonmt/preprocessing/dataset.py
Expand Up @@ -40,7 +40,7 @@ def __init__(self, base_path, parent_ds, dataset_name, dataset_lang_pair, datase
# Dataset versions
self.subword_model = str(subword_model).lower() if subword_model else subword_model
self.vocab_size = str(vocab_size).lower() if vocab_size else vocab_size
self.pretok_flag = self.subword_model in {"word", "words"}
self.pretok_flag = self.subword_model in {"word"}
self.merge_vocabs = merge_vocabs

# Preprocessing
Expand Down
37 changes: 10 additions & 27 deletions autonmt/preprocessing/processors.py
@@ -149,15 +149,14 @@ def encode_file(input_file, output_file, model_vocab_path, subword_model, force_
if subword_model in {None, "none"}:
shutil.copyfile(input_file, output_file)

elif subword_model in {"bytes"}:
elif subword_model in {"bytes"}: # No vocab is needed (just bytes)
# Save file as UTF8 and make sure everything uses NFKC
lines = read_file_lines(input_file, autoclean=True)
lines = [NFKC().normalize_str(line) for line in lines]
lines = [" ".join([hex(x) for x in line.encode()]) for line in lines]
lines = [utils.text2hex(line, return_str=True) for line in lines]
write_file_lines(lines=lines, filename=output_file, insert_break_line=True)

else:

# Encode files
tokenizers.spm_encode_file(spm_model_path=model_vocab_path, input_file=input_file, output_file=output_file)

@@ -170,21 +169,14 @@ def decode_file(input_file, output_file, lang, subword_model, pretok_flag, model
if force_overwrite or not os.path.exists(output_file):

# Detokenize
if subword_model in {None, "none"}:
# Rename or copy files (tok==txt)
shutil.copyfile(input_file, output_file)

elif subword_model in {"bytes"}:
if subword_model in {None, "none", "bytes"}:
# Rename or copy files (tok==txt)
shutil.copyfile(input_file, output_file)

else:
# Decode files
# Decode files (Note: SPM leaves a '▁' at the beginning of the line; we'll remove it later)
tokenizers.spm_decode_file(model_vocab_path, input_file=input_file, output_file=output_file)

# Remove the hyphen of unknown words when needed
if remove_unk_hyphen:
replace_in_file('▁', ' ', output_file)
replace_in_file('▁', ' ', output_file)

# Detokenize with moses
if pretok_flag:
Expand All @@ -194,23 +186,14 @@ def decode_file(input_file, output_file, lang, subword_model, pretok_flag, model
assert os.path.exists(output_file)


def decode_lines(lines, lang, subword_model, pretok_flag, spm_model=None, remove_unk_hyphen=False):
def decode_lines(lines, lang, subword_model, pretok_flag, spm_model=None):
# Detokenize
if subword_model in {None, "none"}:
# Rename or copy files (tok==txt)
lines = lines

elif subword_model in {"bytes"}:
lines = lines
# Decode files
# lines = [utils.clean_file_line(bytes([int(x, base=16) for x in line.split(' ')])) for line in lines]
if subword_model in {None, "none", "bytes"}:
pass
else:
# Decode files
# Decode files (Note: Lines stripped because SPM leaves a '▁' at the beginning of the line)
lines = tokenizers._spm_decode(lines, spm_model)

# Remove the hyphen of unknown words when needed
if remove_unk_hyphen:
lines = [line.replace('▁', ' ') for line in lines]
lines = [line.replace('▁', ' ').strip() for line in lines]

# Detokenize with moses
if pretok_flag:
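To make the new bytes encode path concrete: each line is NFKC-normalized and then stored as space-separated code-point hex values, while decode_file now simply copies bytes files. A per-line sketch (unicodedata is used here for NFKC where the repo uses the tokenizers normalizer; the utils import path is assumed):

import unicodedata

from autonmt.bundle import utils  # provides text2hex (added in this commit)

line = "Dvořák!"
line = unicodedata.normalize("NFKC", line)        # NFKC normalization, as in encode_file
hex_line = utils.text2hex(line, return_str=True)  # "0x44 0x76 0x6f 0x159 0xe1 0x6b 0x21"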
4 changes: 0 additions & 4 deletions autonmt/preprocessing/tokenizers.py
@@ -37,10 +37,6 @@ def moses_detokenizer_file(input_file, output_file, lang):
utils.write_file_lines(lines=lines, filename=output_file, insert_break_line=True)

def spm_train_file(input_file, model_prefix, subword_model, vocab_size, input_sentence_size, character_coverage, split_digits):
# Normalize
if subword_model in {"word", "words"}:
subword_model = "word"

# Enable
byte_fallback = False
if "+bytes" in subword_model:
2 changes: 0 additions & 2 deletions autonmt/vocabularies/__init__.py
@@ -1,3 +1 @@
from autonmt.vocabularies.bytes_vocab import BytesVocabulary
from autonmt.vocabularies.whitespace_vocab import Vocabulary
from autonmt.vocabularies.whitespace_vocab_old import VocabularyOld
48 changes: 0 additions & 48 deletions autonmt/vocabularies/bytes_vocab.py

This file was deleted.
