recognizer.py

from PIL import Image
import torch
from torch.utils.data import DataLoader
from torch.nn.functional import softmax

from utils import *

class Recognizer:

    def __init__(
        self,
        model_file,
        device = "cpu",
        ):
        weights = torch.load(model_file)
        self.model = weights["model"]
        self.converter = weights["converter"]
        self.opt = self.model.opt
        self.imgH = self.opt.imgH
        self.imgW = self.opt.imgW
        self.input_channel = self.opt.input_channel
        self.device = device

        _ = self.model.to(device)
        self.model.eval()

    def __call__(
        self,
        imgs
        ):
        results = []
        transform = NormalizePAD((self.input_channel, self.imgH, self.imgW))
        with torch.no_grad():
            for img in imgs:
                img = Image.fromarray(img).convert("L")
                w, h = img.size
               
                ratio = w / float(h)
                if math.ceil(self.imgH * ratio) > self.imgW:
                    resized_w = self.imgW
                else:
                    resized_w = math.ceil(self.imgH * ratio)

                img = img.resize((resized_w, self.imgH), Image.BICUBIC)
                img = transform(img)
                img = img.unsqueeze(0)
                img = img.to(self.device)

                text_for_pred = torch.LongTensor(1, w // 10 + 1).fill_(0).to(self.device)

                preds = self.model(img, text_for_pred)

                preds_size = [preds.size(1)]

                preds_prob = softmax(preds, dim=-1).squeeze().cpu().detach().numpy()

                values = preds_prob.max(axis=-1)
                indices = preds_prob.argmax(axis=-1)

                preds_str = self.converter.decode_greedy(indices.ravel(), preds_size)[0]
                confidence_score = custom_mean(values)

                results.append([preds_str, confidence_score])

        return results