# translate.py

import torch
import numpy as np
import math
from torch.nn.functional import log_softmax, softmax
from beam_search import Beam


def batch_translate_beam_search(img, model, beam_size=4, candidates=1, max_seq_length=128, bos_token=1, eos_token=2):
    """Run beam-search decoding independently for every image of a batch.

    Args:
        img: image batch tensor, documented by the original code as NxCxHxW.
        model: model exposing ``forward_visual_feature`` and a
            ``SequenceModeling`` sub-module with ``forward_encoder``,
            ``get_memory`` and the methods used by ``beamsearch``.
        beam_size: beam width forwarded to ``beamsearch``.
        candidates: number of hypotheses requested from the beam.
        max_seq_length: maximum number of decoding steps per image.
        bos_token: id of the begin-of-sequence token.
        eos_token: id of the end-of-sequence token.

    Returns:
        numpy.ndarray: one decoded token-id sequence per image. When every
        sequence has the same length this is a regular 2-D array; when the
        lengths differ it is a 1-D ``dtype=object`` array of Python lists.
    """
    # img: NxCxHxW
    model.eval()
    device = img.device

    with torch.no_grad():
        src = model.forward_visual_feature(img).transpose(0, 1)
        memories = model.SequenceModeling.forward_encoder(src).transpose(0, 1)

        # Iterate over the batch dimension of `img` (N per the contract above)
        # so exactly one sentence is decoded per input image, regardless of
        # how the transposed feature tensor happens to be laid out.
        sents = [
            beamsearch(
                model.SequenceModeling.get_memory(memories, i),
                model, device, beam_size, candidates, max_seq_length, bos_token, eos_token,
            )
            for i in range(img.size(0))
        ]

    # Beam search yields variable-length sequences. NumPy >= 1.24 refuses to
    # build an array from ragged nested lists unless dtype=object is given,
    # so only fall back to an object array when the lengths really differ.
    if len({len(sent) for sent in sents}) > 1:
        return np.array(sents, dtype=object)

    return np.asarray(sents)


def translate_beam_search(img, model, beam_size=4, candidates=1, max_seq_length=128, bos_token=1, eos_token=2):
    """Decode a single image with beam search.

    The image (documented by the original code as 1xCxHxW) is passed through
    ``model.forward_visual_feature``, the result is transposed on its first
    two axes and encoded with ``model.SequenceModeling.forward_encoder``;
    the encoder output is then handed to ``beamsearch``.

    Returns:
        Whatever ``beamsearch`` returns for the encoded image (a list of
        token ids).
    """
    model.eval()
    target_device = img.device

    with torch.no_grad():
        visual_features = model.forward_visual_feature(img)
        encoder_memory = model.SequenceModeling.forward_encoder(visual_features.transpose(0, 1))  # TxNxE
        return beamsearch(
            encoder_memory,
            model,
            target_device,
            beam_size,
            candidates,
            max_seq_length,
            bos_token,
            eos_token,
        )


def beamsearch(memory, model, device, beam_size=4, candidates=1, max_seq_length=128, bos_token=1, eos_token=2):
    """Decode one sequence from encoder memory using beam search.

    Args:
        memory: encoder output for a single sample (noted as Tx1xE by the
            original code); it is expanded to the beam width through
            ``model.SequenceModeling.expand_memory``.
        model: model whose ``SequenceModeling`` sub-module provides
            ``expand_memory`` and ``forward_decoder``.
        device: device the decoder input is moved to at every step.
        beam_size: number of beams kept alive.
        candidates: number of finished hypotheses collected from the beam.
        max_seq_length: maximum number of decoding steps.
        bos_token: id of the begin-of-sequence token.
        eos_token: id of the end-of-sequence token.

    Returns:
        list[int]: ``bos_token`` followed by the token ids of the first
        ranked hypothesis with its last element dropped (presumably the
        end-of-sequence token -- confirm against ``Beam.get_hypothesis``).
    """
    # memory: Tx1xE
    model.eval()

    beam = Beam(beam_size=beam_size, min_length=0, n_top=candidates, ranker=None, start_token_id=bos_token,
                end_token_id=eos_token)

    with torch.no_grad():
        memory = model.SequenceModeling.expand_memory(memory, beam_size)

        for _ in range(max_seq_length):
            tgt_inp = beam.get_current_state().transpose(0, 1).to(device)  # TxN
            decoder_outputs, memory = model.SequenceModeling.forward_decoder(tgt_inp, memory)

            # Only the distribution of the most recent position drives the beam.
            log_prob = log_softmax(decoder_outputs[:, -1, :].squeeze(0), dim=-1)
            beam.advance(log_prob.cpu())

            if beam.done():
                break

        # minimum=1 asks the beam for at least one finished hypothesis.
        _, ks = beam.sort_finished(minimum=1)
        hypothesises = [beam.get_hypothesis(times, k) for times, k in ks[:candidates]]

    # Only the first hypothesis is returned. Prefix it with the configured
    # start token instead of a hard-coded 1 so a custom `bos_token` is honoured.
    return [bos_token] + [int(i) for i in hypothesises[0][:-1]]


def translate(img, model, max_seq_length=128, bos_token=1, eos_token=2, start_real_token=5):
    """Greedy-decode a batch of images and score each decoded sequence.

    At every step the decoder is fed all tokens produced so far and the most
    probable next token is appended for every sample. Decoding stops once
    every sample has produced ``eos_token`` or the step budget is exhausted.
    Because the loop condition is ``steps <= max_seq_length``, up to
    ``max_seq_length + 1`` tokens are generated after the start token.

    Args:
        img: image batch tensor (documented by the original code as BxCxHxW).
        model: model exposing ``forward_visual_feature`` and a
            ``SequenceModeling`` sub-module with ``forward_encoder`` and
            ``forward_decoder``.
        max_seq_length: step budget for the decoding loop.
        bos_token: id of the begin-of-sequence token.
        eos_token: id of the end-of-sequence token.
        start_real_token: smallest token id that counts towards the
            confidence score; smaller ids are ignored in the score.

    Returns:
        tuple: ``(translated_sentence, char_probs)`` where
        ``translated_sentence`` is an integer array of shape
        (batch, generated_length) starting with ``bos_token`` in every row,
        and ``char_probs`` is a float array of shape (batch,) holding, for
        each row, the product of the probabilities of all generated tokens
        whose id is ``>= start_real_token``. Tokens a sample generates after
        its own ``eos_token`` (while other samples are still decoding) stay
        in the output and are included in that product.
    """
    model.eval()
    device = img.device
    batch_size = len(img)

    with torch.no_grad():
        src = model.forward_visual_feature(img).transpose(0, 1)
        memory = model.SequenceModeling.forward_encoder(src)

        # Both lists are time-major: one row per step, one column per sample.
        translated_sentence = [[bos_token] * batch_size]
        char_probs = [[1] * batch_size]

        # Running "has emitted EOS" flag per sample; avoids rebuilding and
        # rescanning the whole token matrix on every iteration.
        finished = np.full(batch_size, bos_token == eos_token, dtype=bool)

        steps = 0
        while steps <= max_seq_length and not finished.all():
            tgt_inp = torch.LongTensor(translated_sentence).to(device)
            output, memory = model.SequenceModeling.forward_decoder(tgt_inp, memory)

            # Only the last position is needed. Taking the max directly (rather
            # than top-5 then element 0) also works for vocabularies with fewer
            # than five entries.
            step_probs = softmax(output[:, -1, :], dim=-1).to('cpu')
            values, indices = torch.max(step_probs, dim=-1)

            indices = indices.tolist()
            char_probs.append(values.tolist())
            translated_sentence.append(indices)

            finished |= np.asarray(indices) == eos_token
            steps += 1

        translated_sentence = np.asarray(translated_sentence).T

        # Sequence confidence: product of the per-token probabilities of the
        # "real" tokens, accumulated in log space.
        char_probs = np.asarray(char_probs).T
        log_char_probs = np.sum(np.log(char_probs) * (translated_sentence >= start_real_token), axis=-1)
        char_probs = np.exp(log_char_probs)

    return translated_sentence, char_probs

# def resize(w, h, expected_height, image_min_width, image_max_width):
#     new_w = int(expected_height * float(w) / float(h))
#     round_to = 10
#     new_w = math.ceil(new_w / round_to) * round_to
#     new_w = max(new_w, image_min_width)
#     new_w = min(new_w, image_max_width)
#
#     return new_w, expected_height
#
#
# def process_image(image, image_height, image_min_width, image_max_width):
#     img = image.convert('RGB')
#
#     w, h = img.size
#     new_w, image_height = resize(w, h, image_height, image_min_width, image_max_width)
#
#     img = img.resize((new_w, image_height), Image.ANTIALIAS)
#
#     img = np.asarray(img).transpose(2, 0, 1)
#     img = img / 255
#     return img
#
#
# def process_input(image, image_height, image_min_width, image_max_width):
#     img = process_image(image, image_height, image_min_width, image_max_width)
#     img = img[np.newaxis, ...]
#     img = torch.FloatTensor(img)
#     return img
#
#
# def predict(filename, config):
#     img = Image.open(filename)
#     img = process_input(img)
#
#     img = img.to(config['device'])
#
#     model, vocab = build_model(config)
#     s = translate(img, model)[0].tolist()
#     s = vocab.decode(s)
#
#     return s