Fix multi-architecture pipeline (working)
salvacarrion committed Jul 2, 2024
1 parent eeff4cc commit 999677a
Showing 6 changed files with 84 additions and 71 deletions.
2 changes: 1 addition & 1 deletion autonmt/bundle/report.py
@@ -57,7 +57,7 @@ def scores2pandas(scores):

def summarize_scores(df_report, default_cols=None, ref_metric="bleu"):
if default_cols is None:
default_cols = ["train_dataset", "train__lang_pair", "test_dataset", "test__lang_pair", "vocab__subword_model", "vocab__size"]
default_cols = ["train_dataset", "train__lang_pair", "test_dataset", "test__lang_pair", "vocab__subword_model", "vocab__size", "model__architecture", "model__total_params"]

# Select columns
selected_cols = [c for c in df_report.columns.values if c in default_cols or ref_metric in c]
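A minimal sketch (not part of the commit) of what the widened default column set does in summarize_scores: known columns are kept alongside any column whose name contains the reference metric, so the report now surfaces the architecture name and parameter count per run. The example DataFrame and the metric column name are assumptions for illustration only.

import pandas as pd

# Hypothetical one-row report; column names follow the new defaults,
# the BLEU column name is an assumption.
df_report = pd.DataFrame([{
    "train_dataset": "multi30k", "train__lang_pair": "de-en",
    "test_dataset": "multi30k", "test__lang_pair": "de-en",
    "vocab__subword_model": "word", "vocab__size": 4000,
    "model__architecture": "AttentionRNN-GRU", "model__total_params": 12_345_678,
    "beam1__sacrebleu_bleu": 31.4,   # hypothetical metric column, kept because it contains "bleu"
    "train_max_epochs": 10,          # dropped: not a default column and no "bleu" in the name
}])

ref_metric = "bleu"
default_cols = ["train_dataset", "train__lang_pair", "test_dataset", "test__lang_pair",
                "vocab__subword_model", "vocab__size",
                "model__architecture", "model__total_params"]

# Same column-selection rule as in summarize_scores
selected_cols = [c for c in df_report.columns.values if c in default_cols or ref_metric in c]
print(df_report[selected_cols].to_string(index=False))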
31 changes: 15 additions & 16 deletions autonmt/modules/models/rnn.py
@@ -25,10 +25,11 @@ def __init__(self,
teacher_force_ratio=0.5,
padding_idx=None,
packed_sequence=False,
architecture="rnn",
base_rnn="rnn",
**kwargs):
super().__init__(src_vocab_size, trg_vocab_size, padding_idx, packed_sequence=packed_sequence,
architecture=architecture, **kwargs)
base_rnn=base_rnn, architecture=f"{self.__class__.__name__}-{base_rnn.upper()}", **kwargs)
self.base_rnn = base_rnn
self.encoder_embed_dim = encoder_embed_dim
self.decoder_embed_dim = decoder_embed_dim
self.encoder_hidden_dim = encoder_hidden_dim
@@ -49,7 +50,7 @@ def __init__(self,
self.output_layer = nn.Linear(decoder_hidden_dim, trg_vocab_size)

# RNN
base_rnn = self.get_base_rnn(self.architecture)
base_rnn = self.get_base_rnn(self.base_rnn)
if base_rnn is None:
self.encoder_rnn = None
self.decoder_rnn = None
@@ -71,18 +72,16 @@ def __init__(self,
assert encoder_n_layers == decoder_n_layers

@staticmethod
def get_base_rnn(architecture):
# Choose architecture
architecture = architecture.lower().strip()
if architecture == "rnn":
def get_base_rnn(base_rnn):
base_rnn = base_rnn.lower().strip()
if base_rnn == "rnn":
return nn.RNN
elif architecture == "lstm":
elif base_rnn == "lstm":
return nn.LSTM
elif architecture == "gru":
elif base_rnn == "gru":
return nn.GRU
else:
return None
# raise ValueError(f"Invalid architecture: {architecture}. Choose: 'rnn', 'lstm' or 'gru'")

def forward_encoder(self, x, x_len, **kwargs):
# Encode trg: (batch, length) => (batch, length, emb_dim)
@@ -150,9 +149,9 @@ def forward_enc_dec(self, x, x_len, y, y_len, **kwargs):


class ContextRNN(SimpleRNN):
def __init__(self, *args, architecture="gru", **kwargs):
super().__init__(*args, architecture=architecture, **kwargs)
base_rnn = self.get_base_rnn(self.architecture)
def __init__(self, *args, base_rnn="gru", **kwargs):
super().__init__(*args, base_rnn=base_rnn, **kwargs)
base_rnn = self.get_base_rnn(base_rnn=base_rnn)
self.encoder_rnn = base_rnn(input_size=self.encoder_embed_dim,
hidden_size=self.encoder_hidden_dim,
num_layers=self.encoder_n_layers,
@@ -210,9 +209,9 @@ def forward_decoder(self, y, y_len, states, **kwargs):


class AttentionRNN(SimpleRNN):
def __init__(self, *args, architecture="gru", **kwargs):
super().__init__(*args, architecture=architecture, **kwargs)
base_rnn = self.get_base_rnn(self.architecture)
def __init__(self, *args, base_rnn="gru", **kwargs):
super().__init__(*args, base_rnn=base_rnn, **kwargs)
base_rnn = self.get_base_rnn(self.base_rnn)
self.encoder_rnn = base_rnn(input_size=self.encoder_embed_dim,
hidden_size=self.encoder_hidden_dim,
num_layers=self.encoder_n_layers,
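A minimal sketch (not part of the commit) of the renamed knob in rnn.py: base_rnn now selects only the recurrent cell, while the reported architecture string is derived as "<ClassName>-<BASE_RNN>". The stand-in class below is hypothetical and only mirrors the naming logic from the diff.

import torch.nn as nn

def get_base_rnn(base_rnn):
    # Same mapping as SimpleRNN.get_base_rnn: unknown names return None instead of raising
    base_rnn = base_rnn.lower().strip()
    if base_rnn == "rnn":
        return nn.RNN
    elif base_rnn == "lstm":
        return nn.LSTM
    elif base_rnn == "gru":
        return nn.GRU
    return None

class AttentionRNNStub:
    # Stand-in for the real model: only reproduces the architecture naming
    def __init__(self, base_rnn="gru"):
        self.base_rnn = base_rnn
        self.architecture = f"{self.__class__.__name__}-{base_rnn.upper()}"

model = AttentionRNNStub(base_rnn="lstm")
print(model.architecture)            # AttentionRNNStub-LSTM
print(get_base_rnn(model.base_rnn))  # <class 'torch.nn.modules.rnn.LSTM'>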
11 changes: 9 additions & 2 deletions autonmt/modules/seq2seq.py
@@ -12,13 +12,13 @@

class LitSeq2Seq(pl.LightningModule):

def __init__(self, src_vocab_size, trg_vocab_size, padding_idx, packed_sequence=False, architecture="base", **kwargs):
def __init__(self, src_vocab_size, trg_vocab_size, padding_idx, packed_sequence=False, architecture=None, **kwargs):
super().__init__()
self.src_vocab_size = src_vocab_size
self.trg_vocab_size = trg_vocab_size
self.padding_idx = padding_idx
self.packed_sequence = packed_sequence # Use for RNNs and to "sort within batches"
self.architecture = architecture
self.architecture = architecture if architecture else self.__class__.__name__

# Hyperparams (PyTorch Lightning stuff)
self.strategy = None
@@ -45,6 +45,13 @@ def forward_decoder(self, y, y_len, states, **kwargs):
def forward_enc_dec(self, x, x_len, y, y_len, **kwargs):
pass

def count_parameters(self):
# Get model params
trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
no_trainable_params = sum(p.numel() for p in self.parameters() if not p.requires_grad)
total_params = trainable_params + no_trainable_params
return total_params, trainable_params, no_trainable_params

def configure_optimizers(self):
optim_fn = {
"adadelta": torch.optim.Adadelta,
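A minimal sketch (not part of the commit) of the new count_parameters helper applied to a toy module, showing the (total, trainable, non-trainable) split it returns. The toy Linear layer is an assumption for illustration.

import torch.nn as nn

def count_parameters(module):
    # Same logic as LitSeq2Seq.count_parameters
    trainable_params = sum(p.numel() for p in module.parameters() if p.requires_grad)
    no_trainable_params = sum(p.numel() for p in module.parameters() if not p.requires_grad)
    total_params = trainable_params + no_trainable_params
    return total_params, trainable_params, no_trainable_params

toy = nn.Linear(10, 2)          # 20 weights + 2 biases
toy.bias.requires_grad_(False)  # freeze one tensor to get a non-trainable count
print(count_parameters(toy))    # (22, 20, 2)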
2 changes: 1 addition & 1 deletion autonmt/search/greedy_search.py
@@ -33,7 +33,7 @@ def greedy_search(model, dataset, sos_id, eos_id, pad_id, batch_size, max_tokens
max_iter = 0
for i in range(1, max_gen_length):
max_iter = i
outputs_t, states = model.forward_decoder(y=y_pred[:, :i], state=states, x_pad_mask=x_pad_mask)
outputs_t, states = model.forward_decoder(y=y_pred[:, :i], y_len=None, states=states, x_pad_mask=x_pad_mask)
top1 = outputs_t[:, -1, :].argmax(1) # Get most probable next-word (logits)

# Update y_pred for next iteration
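A minimal sketch (not part of the commit) of why the call changed: forward_decoder is declared as forward_decoder(y, y_len, states, **kwargs), so the greedy loop must pass y_len and the keyword states rather than the old state. The DummyModel and tensor shapes below are illustrative only.

import torch

class DummyModel:
    # Stand-in exposing the forward_decoder(y, y_len, states, **kwargs) signature
    def __init__(self, vocab_size=32):
        self.vocab_size = vocab_size
    def forward_decoder(self, y, y_len, states, **kwargs):
        batch, length = y.shape
        return torch.randn(batch, length, self.vocab_size), states  # random logits, states unchanged

model = DummyModel()
max_gen_length = 10
y_pred = torch.zeros(4, max_gen_length, dtype=torch.long)  # (batch, max_gen_length), starts as id 0
states = None
for i in range(1, max_gen_length):
    outputs_t, states = model.forward_decoder(y=y_pred[:, :i], y_len=None, states=states, x_pad_mask=None)
    top1 = outputs_t[:, -1, :].argmax(1)  # most probable next token per sequence
    y_pred[:, i] = top1
print(y_pred)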
7 changes: 3 additions & 4 deletions autonmt/toolkits/base.py
@@ -475,8 +475,7 @@ def parse_metrics(self, eval_ds, beams, metrics, **kwargs):
vocab_size = f"{len(self.src_vocab)}"

# Get model params
trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
no_trainable_params = sum(p.numel() for p in self.model.parameters() if not p.requires_grad)
total_params, trainable_params, no_trainable_params = self.model.count_parameters()

# Report
report_dict = {
@@ -486,10 +485,10 @@ def parse_metrics(self, eval_ds, beams, metrics, **kwargs):
"eval_datetime": str(datetime.datetime.now()),

# Model
"model__architecture": self.model.__class__.__name__,
"model__architecture": self.model.architecture,
"model__trainable_params": trainable_params,
"model__no_trainable_params": no_trainable_params,
"model__total_params": trainable_params + no_trainable_params,
"model__total_params": total_params,
"model__dtype": str(self.model.dtype),

# Vocab
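A minimal sketch (not part of the commit) of the model fields in the report after this change: parse_metrics now takes the architecture string and parameter counts from the model itself. The StubModel and its numbers are hypothetical.

class StubModel:
    # Stand-in exposing the two members parse_metrics now relies on
    architecture = "AttentionRNN-GRU"
    def count_parameters(self):
        return 22, 20, 2  # (total, trainable, non-trainable), dummy values

model = StubModel()
total_params, trainable_params, no_trainable_params = model.count_parameters()
report_dict = {
    "model__architecture": model.architecture,
    "model__trainable_params": trainable_params,
    "model__no_trainable_params": no_trainable_params,
    "model__total_params": total_params,
}
print(report_dict)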
102 changes: 55 additions & 47 deletions examples/dev/0_test_custom_model.py
@@ -22,7 +22,7 @@
preprocess_predict_fn = lambda data, ds: preprocess_lines(data["lines"], normalize_fn=normalize_fn)

BASE_PATH = "/home/scarrion/datasets/translate" # Remote
BASE_PATH = "/Users/salvacarrion/Documents/Programming/datasets/translate" # Remote
# BASE_PATH = "/Users/salvacarrion/Documents/Programming/datasets/translate" # Remote

def main():
# Create preprocessing for training
@@ -59,52 +59,60 @@ def main():
# Train & Score a model for each dataset
scores = []
for i, train_ds in enumerate(tr_datasets, 1):
# Define max tokens (99.96% of the data)
if train_ds.subword_model == "bytes":
max_tokens_src, max_tokens_tgt = 539, 598
elif train_ds.subword_model == "char":
max_tokens_src, max_tokens_tgt = 540, 588
elif train_ds.subword_model == "bpe":
max_tokens_src, max_tokens_tgt = 106, 115
elif train_ds.subword_model == "word":
max_tokens_src, max_tokens_tgt = 99, 106
else:
raise ValueError(f"Unknown subword model: {train_ds.subword_model}")

for iters in [10]:
# Instantiate vocabs and model
src_vocab = Vocabulary(max_tokens=max_tokens_src).build_from_ds(ds=train_ds, lang=train_ds.src_lang)
trg_vocab = Vocabulary(max_tokens=max_tokens_tgt).build_from_ds(ds=train_ds, lang=train_ds.trg_lang)
model = Conv(src_vocab_size=len(src_vocab), trg_vocab_size=len(trg_vocab), padding_idx=src_vocab.pad_id)

# Define trainer
runs_dir = train_ds.get_runs_path(toolkit="autonmt")
run_prefix = f"{model.architecture}-{iters}ep__" + '_'.join(train_ds.id()[:2]).replace('/', '-')
run_name = train_ds.get_run_name(run_prefix=run_prefix) #+ f"__{int(time.time())}"
trainer = AutonmtTranslator(model=model, src_vocab=src_vocab, trg_vocab=trg_vocab,
runs_dir=runs_dir, run_name=run_name)

# Print info
print(f"=> Training model...")
print(f"\t- TRAINING ({i}/{len(tr_datasets)}): {str(train_ds)}")
print(f"\t- TESTING ({len(ts_datasets)}): {', '.join([str(x) for x in ts_datasets])}")
print(f"\t- MODEL PREFIX: {run_prefix}")

# Train model
wandb_params = None #dict(project="architecture", entity="salvacarrion", reinit=True)
trainer.fit(train_ds, max_epochs=iters, learning_rate=0.001, optimizer="adam", batch_size=128, seed=1234,
patience=10, num_workers=0, accelerator="auto", strategy="auto", save_best=True, save_last=True, print_samples=1,
wandb_params=wandb_params, use_bucketing=False)

# Test model
m_scores = trainer.predict(ts_datasets, metrics={"bleu"}, beams=[1], load_checkpoint="best",
preprocess_fn=preprocess_predict_fn, eval_mode="compatible", force_overwrite=True)
for ms in m_scores:
ms['train_dataset'] = train_ds.dataset_name
ms['vocab__merged'] = train_ds.merge_vocabs
ms['max_iters'] = str(iters)
ms['train_dataset'] = str(train_ds)
scores.append(m_scores)
for model_class in [SimpleRNN, ContextRNN, AttentionRNN, Conv, Transformer]:
# Define max tokens (99.96% of the data)
if train_ds.subword_model == "bytes":
max_tokens_src, max_tokens_tgt = 539, 598
elif train_ds.subword_model == "char":
max_tokens_src, max_tokens_tgt = 540, 588
elif train_ds.subword_model == "bpe":
max_tokens_src, max_tokens_tgt = 106, 115
elif train_ds.subword_model == "word":
max_tokens_src, max_tokens_tgt = 99, 106
else:
raise ValueError(f"Unknown subword model: {train_ds.subword_model}")

for iters in [1]:
# Instantiate vocabs and model
src_vocab = Vocabulary(max_tokens=max_tokens_src).build_from_ds(ds=train_ds, lang=train_ds.src_lang)
trg_vocab = Vocabulary(max_tokens=max_tokens_tgt).build_from_ds(ds=train_ds, lang=train_ds.trg_lang)
model = model_class(src_vocab_size=len(src_vocab), trg_vocab_size=len(trg_vocab), padding_idx=src_vocab.pad_id)

# Define trainer
runs_dir = train_ds.get_runs_path(toolkit="autonmt")
run_prefix = f"{model.architecture}-{iters}ep__" + '_'.join(train_ds.id()[:2]).replace('/', '-')
run_name = train_ds.get_run_name(run_prefix=run_prefix) #+ f"__{int(time.time())}"
trainer = AutonmtTranslator(model=model, src_vocab=src_vocab, trg_vocab=trg_vocab,
runs_dir=runs_dir, run_name=run_name)

# Print info
total_params, trainable_params, no_trainable_params = model.count_parameters()
print(f"=> Training model...")
print(f"\t- TRAINING ({i}/{len(tr_datasets)}): {str(train_ds)}")
print(f"\t- TESTING ({len(ts_datasets)}): {', '.join([str(x) for x in ts_datasets])}")
print(f"\t- MODEL PREFIX: {run_prefix}")
print(f"\t- MODEL ARCHITECTURE: {model.architecture}")
print(f"\t- MODEL PARAMETERS:")
print(f"\t\t- Total parameters: {total_params/10**6:,.2f}M")
print(f"\t\t- Trainable parameters: {trainable_params/10**6:,.2f}M")
print(f"\t\t- No trainable parameters: {no_trainable_params/10**6:,.2f}M")

# Train model
wandb_params = None #dict(project="architecture", entity="salvacarrion", reinit=True)
# trainer.fit(train_ds, max_epochs=iters, learning_rate=0.001, optimizer="adam", batch_size=128, seed=1234,
# patience=10, num_workers=0, accelerator="auto", strategy="auto", save_best=True, save_last=True, print_samples=1,
# wandb_params=wandb_params, use_bucketing=False)

# Test model
m_scores = trainer.predict(ts_datasets, metrics={"bleu"}, beams=[1], load_checkpoint="best",
preprocess_fn=preprocess_predict_fn, eval_mode="compatible", force_overwrite=True)
for ms in m_scores:
ms['train_dataset'] = train_ds.dataset_name
ms['vocab__merged'] = train_ds.merge_vocabs
ms['max_iters'] = str(iters)
ms['train_dataset'] = str(train_ds)
ms['architecture'] = model.architecture
scores.append(m_scores)

# Make report
output_path = os.path.join(BASE_PATH, f".outputs/autonmt/multi30k")
