Major fixes
salvacarrion committed Jun 12, 2024
1 parent fccb674 commit 0df0918
Showing 14 changed files with 181 additions and 149 deletions.
23 changes: 23 additions & 0 deletions autonmt/bundle/utils.py
@@ -12,6 +12,7 @@
from collections import defaultdict
from pathlib import Path

import numpy as np
from tqdm import tqdm


@@ -381,3 +382,25 @@ def shuffle_in_order(list1, list2):
def count_file_lines(file_path):
num_lines = sum(1 for i in open(file_path, 'rb'))
return num_lines


def basic_stats(tokens, prefix=""):
# tokens is array of integers (number of tokens per sentence)
assert isinstance(tokens, np.ndarray)
d = {
f"{prefix}total_sentences": len(tokens),
f"{prefix}total_tokens": int(tokens.sum()),
f"{prefix}max_tokens": int(np.max(tokens)),
f"{prefix}min_tokens": int(np.min(tokens)),
f"{prefix}avg_tokens": float(np.average(tokens)),
f"{prefix}std_tokens": float(np.std(tokens)),
f"{prefix}percentile5_tokens": int(np.percentile(tokens, 5)),
f"{prefix}percentile50_tokens": int(np.percentile(tokens, 50)),
f"{prefix}percentile95_tokens": int(np.percentile(tokens, 95)),
f"{prefix}percentile99_tokens": int(np.percentile(tokens, 99)),
f"{prefix}percentile99.671_tokens": int(np.percentile(tokens, 99.671)), # TIER I
f"{prefix}percentile99.749_tokens": int(np.percentile(tokens, 99.749)), # TIER II
f"{prefix}percentile99.982_tokens": int(np.percentile(tokens, 99.982)), # TIER III
f"{prefix}percentile99.995_tokens": int(np.percentile(tokens, 99.995)), # TIER IV
}
return d
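
For reference, a minimal usage sketch of the new basic_stats helper (the token counts and the "src_" prefix below are made up for illustration):

import numpy as np
from autonmt.bundle.utils import basic_stats

# Hypothetical per-sentence token counts
tokens_per_sentence = np.array([12, 7, 31, 18, 25, 9, 44, 16])

stats = basic_stats(tokens_per_sentence, prefix="src_")
print(stats["src_total_sentences"])      # 8
print(stats["src_total_tokens"])         # 162
print(stats["src_percentile95_tokens"])  # 95th percentile, cast to int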
2 changes: 1 addition & 1 deletion autonmt/modules/models/transfomer.py
@@ -38,7 +38,7 @@ def __init__(self,
dim_feedforward=encoder_ffn_embed_dim,
dropout=dropout,
activation=activation_fn)
self.output_layer = nn.Linear(encoder_embed_dim, src_vocab_size)
self.output_layer = nn.Linear(encoder_embed_dim, trg_vocab_size)
self.input_dropout = nn.Dropout(dropout)

# Checks
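The one-line fix above matters because the output projection must emit one logit per target-vocabulary entry, not per source-vocabulary entry. A standalone shape check with toy sizes (not taken from the repository):

import torch
import torch.nn as nn

encoder_embed_dim, trg_vocab_size = 256, 8000        # hypothetical sizes
output_layer = nn.Linear(encoder_embed_dim, trg_vocab_size)

hidden = torch.randn(2, 10, encoder_embed_dim)       # (batch, seq_len, embed_dim)
logits = output_layer(hidden)
print(logits.shape)                                  # torch.Size([2, 10, 8000])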
33 changes: 16 additions & 17 deletions autonmt/preprocessing/builder.py
@@ -126,7 +126,7 @@ def get_train_ds(self):
def get_test_ds(self):
return self.get_ds(ignore_variants=True)

def build(self, make_plots=False, force_overwrite=False):
def build(self, make_plots=False, force_overwrite=False, verbose=False):
print(f"=> Building datasets...")
print(f"\t- base_path={self.base_path}")

@@ -151,7 +151,7 @@ def build(self, make_plots=False, force_overwrite=False):
self._export_vocab_frequencies(force_overwrite=force_overwrite)

# Compute stats
self._compute_stats(force_overwrite=force_overwrite)
self._compute_stats(force_overwrite=force_overwrite, print_stats=verbose)

# Make plot
if make_plots:
@@ -591,7 +591,7 @@ def _export_vocab_frequencies(self, force_overwrite, normalize_freq=False):
lines = [f"{pair[0]}\t{pair[1]}" for pair in vocab_frequencies]
write_file_lines(lines=lines, filename=vocab_path, insert_break_line=True)

def _compute_stats(self, force_overwrite):
def _compute_stats(self, force_overwrite, print_stats=True):
print(f"=> Computing stats... (base_path={self.base_path})")

# Walk through preprocessing
@@ -604,10 +604,17 @@ def _compute_stats(self, force_overwrite):
# Save file
savepath = ds.get_stats_path("stats.json")
if force_overwrite or not os.path.exists(savepath):
# Compute and save stats
# Compute stats
stats = ds.get_stats(count_unknowns=True)

# Save stats
save_json(stats, savepath=savepath)

# Print dictionary of stats (pretty)
if print_stats:
print(json.dumps(stats, indent=4))


def _plot_datasets(self, force_overwrite, save_figures=True, show_figures=False, add_dataset_title=True, vocab_top_k=None):
print(f"=> Plotting started... (base_path={self.base_path})")
print(f"- [WARNING]: Matplotlib might miss some images if the loop is too fast")
@@ -648,21 +655,13 @@ def _plot_datasets(self, force_overwrite, save_figures=True, show_figures=False,
tokens_per_sentence = utils.count_tokens_per_sentence(filename=ds.get_encoded_path(fname))
tokens_per_sentence = np.array(tokens_per_sentence)

# Compute data
row = {
"total_sentences": len(tokens_per_sentence),
"total_tokens": int(tokens_per_sentence.sum()),
"max_tokens": int(np.max(tokens_per_sentence)),
"min_tokens": int(np.min(tokens_per_sentence)),
"avg_tokens": float(np.average(tokens_per_sentence)),
"std_tokens": float(np.std(tokens_per_sentence)),
"percentile5_tokens": int(np.percentile(tokens_per_sentence, 5)),
"percentile50_tokens": int(np.percentile(tokens_per_sentence, 50)),
"percentile95_tokens": int(np.percentile(tokens_per_sentence, 95)),
# Compute data stats
stats_row = utils.basic_stats(tokens_per_sentence, prefix="")
stats_row.update({
"split": split_name,
"lang": split_lang,
}
split_stats[fname] = row
})
split_stats[fname] = stats_row

# Plot sentence length distribution (by tokens' length)
df = pd.DataFrame(tokens_per_sentence, columns=["frequency"])
19 changes: 3 additions & 16 deletions autonmt/preprocessing/dataset.py
@@ -40,7 +40,7 @@ def __init__(self, base_path, parent_ds, dataset_name, dataset_lang_pair, datase
# Dataset versions
self.subword_model = str(subword_model).lower() if subword_model else subword_model
self.vocab_size = str(vocab_size).lower() if vocab_size else vocab_size
self.pretok_flag = (self.subword_model == "word")
self.pretok_flag = self.subword_model in {"word", "words"}
self.merge_vocabs = merge_vocabs

# Preprocessing
@@ -205,19 +205,6 @@ def get_run_name(self, run_prefix):
return f"{run_prefix}_{self.subword_model}_{self.vocab_size}".lower()

def get_stats(self, splits=None, count_unknowns=False):
def basic_stats(tokens, prefix=""):
d = {
f"{prefix}total_sentences": len(tokens),
f"{prefix}total_tokens": int(tokens.sum()),
f"{prefix}max_tokens": int(np.max(tokens)),
f"{prefix}min_tokens": int(np.min(tokens)),
f"{prefix}avg_tokens": float(np.average(tokens)),
f"{prefix}std_tokens": float(np.std(tokens)),
f"{prefix}percentile5_tokens": int(np.percentile(tokens, 5)),
f"{prefix}percentile50_tokens": int(np.percentile(tokens, 50)),
f"{prefix}percentile95_tokens": int(np.percentile(tokens, 95)),
}
return d

if not splits:
splits = self.get_split_fnames() # Split names
@@ -236,7 +223,7 @@ def basic_stats(tokens, prefix=""):
"subword_model": self.subword_model,
"vocab_size": self.vocab_size,
}
row.update(basic_stats(tokens_per_sentence, prefix=""))
row.update(utils.basic_stats(tokens_per_sentence, prefix=""))

# Count unknowns
if count_unknowns and self.subword_model not in {None, "none", "bytes"}:
@@ -247,7 +234,7 @@ def basic_stats(tokens, prefix=""):
lines = utils.read_file_lines(self.get_encoded_path(fname), autoclean=True)
unknowns = [len(set(line.split(' ')).difference(vocab_keys)) for line in lines]
unknowns = np.array(unknowns)
row.update(basic_stats(unknowns, prefix="unknown_"))
row.update(utils.basic_stats(unknowns, prefix="unknown_"))

# Add stats
split_stats[fname] = row
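The unknown-token count kept in get_stats works by set difference against the vocabulary keys; a tiny standalone illustration with a toy vocabulary (values are made up):

import numpy as np

vocab_keys = {"the", "cat", "sat"}                   # hypothetical vocabulary
lines = ["the cat sat", "the dog ran"]

unknowns = [len(set(line.split(' ')).difference(vocab_keys)) for line in lines]
unknowns = np.array(unknowns)
print(unknowns)        # [0 2] -> "dog" and "ran" are out of vocabulary
print(unknowns.sum())  # 2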
11 changes: 4 additions & 7 deletions autonmt/preprocessing/processors.py
@@ -175,12 +175,8 @@ def decode_file(input_file, output_file, lang, subword_model, pretok_flag, model
shutil.copyfile(input_file, output_file)

elif subword_model in {"bytes"}:
# Decode files
lines = read_file_lines(input_file, autoclean=True)
lines = [clean_file_line(bytes([int(x, base=16) for x in line.split(' ')])) for line in lines]

# Write files
write_file_lines(lines=lines, filename=output_file, insert_break_line=True)
# Rename or copy files (tok==txt)
shutil.copyfile(input_file, output_file)

else:
# Decode files
@@ -205,8 +201,9 @@ def decode_lines(lines, lang, subword_model, pretok_flag, spm_model=None, remove
lines = lines

elif subword_model in {"bytes"}:
lines = lines
# Decode files
lines = [utils.clean_file_line(bytes([int(x, base=16) for x in line.split(' ')])) for line in lines]
# lines = [utils.clean_file_line(bytes([int(x, base=16) for x in line.split(' ')])) for line in lines]
else:
# Decode files
lines = tokenizers._spm_decode(lines, spm_model)
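For context, the bytes branch previously converted space-separated hex tokens back into text before writing; decode_file now just copies the file and the equivalent line in decode_lines is commented out. A standalone sketch of what that hex decoding did (the input string is made up):

line = "48 6f 6c 61"   # hypothetical hex-encoded line
decoded = bytes([int(x, base=16) for x in line.split(' ')]).decode("utf-8")
print(decoded)         # Hola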
4 changes: 4 additions & 0 deletions autonmt/preprocessing/tokenizers.py
@@ -37,6 +37,10 @@ def moses_detokenizer_file(input_file, output_file, lang):
utils.write_file_lines(lines=lines, filename=output_file, insert_break_line=True)

def spm_train_file(input_file, model_prefix, subword_model, vocab_size, input_sentence_size, character_coverage, split_digits):
# Normalize
if subword_model in {"word", "words"}:
subword_model = "word"

# Enable
byte_fallback = False
if "+bytes" in subword_model:
2 changes: 1 addition & 1 deletion autonmt/toolkits/autonmt.py
@@ -168,7 +168,7 @@ def _translate(self, data_path, output_path, src_lang, trg_lang, beam_width, max
checkpoint, num_workers, devices, accelerator,
force_overwrite, checkpoints_dir=None, filter_idx=0, **kwargs):
# Checkpoint
if checkpoint:
if checkpoint: # "best", "last", "filename", "path"
self.from_checkpoint = self.load_checkpoint(checkpoint)

# Set evaluation model
1 change: 1 addition & 0 deletions autonmt/vocabularies/__init__.py
@@ -1,2 +1,3 @@
from autonmt.vocabularies.bytes_vocab import BytesVocabulary
from autonmt.vocabularies.whitespace_vocab import Vocabulary
from autonmt.vocabularies.whitespace_vocab_old import VocabularyOld
7 changes: 3 additions & 4 deletions autonmt/vocabularies/base_vocab.py
@@ -13,14 +13,13 @@ def __init__(self, sos_id, eos_id, pad_id, sos_piece, eos_piece, pad_piece, lang
self.eos_piece = eos_piece
self.pad_piece = pad_piece

# Set special tokens
self.special_tokens = [(self.sos_piece, self.sos_id), (self.eos_piece, self.eos_id),
(self.pad_piece, self.pad_id)]

# Other
self.lang = lang
self.max_tokens = max_tokens

def special_tokens(self):
return [(self.sos_piece, self.sos_id), (self.eos_piece, self.eos_id), (self.pad_piece, self.pad_id)]

@abstractmethod
def encode(self, *args, **kwargs):
pass
5 changes: 1 addition & 4 deletions autonmt/vocabularies/bytes_vocab.py
@@ -7,14 +7,11 @@ def __init__(self, sos_id=256, eos_id=257, pad_id=258,
super().__init__(sos_id=sos_id, eos_id=eos_id, pad_id=pad_id,
sos_piece=sos_piece, eos_piece=eos_piece, pad_piece=pad_piece,
lang=lang, max_tokens=max_tokens)
# Set special tokens
self._offset = len(self.special_tokens)

# Other
self.hex_input = hex_input

def __len__(self):
return 256 + len(self.special_tokens)
return 256 + len(self.special_tokens())

def encode(self, text, add_special_tokens=True):
if self.hex_input:
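Net effect of the base_vocab/bytes_vocab changes: special_tokens is now a method instead of a list attribute set in __init__, and the byte vocabulary's length counts the three special tokens through that call. A rough sketch, assuming the constructor defaults shown above are sufficient (the lang value is arbitrary):

from autonmt.vocabularies import BytesVocabulary

vocab = BytesVocabulary(lang="en")   # keyword assumed from the base-class signature
print(len(vocab))                    # 256 byte ids + 3 special tokens = 259
print(vocab.special_tokens())        # [(sos_piece, 256), (eos_piece, 257), (pad_piece, 258)]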
Diffs for the remaining changed files were not loaded.