Skip to content

Commit

Permalink
Merge pull request #57 from gnana70/develop
Browse files Browse the repository at this point in the history
document updation
  • Loading branch information
gnana70 authored Apr 2, 2024
2 parents ec84a40 + 1e4f94c commit 71a91db
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 22 deletions.
42 changes: 21 additions & 21 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
</p>
</div>

OCR Tamil can help you extract text from signboard, nameplates, storefronts etc., from Natural Scenes with high accuracy. This version of OCR is much more robust to tilted text compared to the Tesseract, Paddle OCR and Easy OCR as they are primarily built to work on the documents texts and not on natural scenes. This model is work in progress, feel free to contribute!!!
OCR Tamil can help you extract text from signboards, nameplates, storefronts, etc. in natural scenes with high accuracy. This version of OCR is much more robust to tilted text than Tesseract, Paddle OCR and Easy OCR, as they are primarily built to work on document text rather than natural scenes.

## Languages Supported 🔛
**➡️ English**
Expand Down Expand Up @@ -176,30 +176,31 @@ OCR module can be initialized by setting following parameters as per your requir

## Limitations⛔

1. Unable to read the text if they are present in rotated forms
1. Document text reading is not supported, as the library doesn't have the following capabilities:

**➡️Auto identification of Paragraph**

**➡️Orientation detection**

**➡️Skew correction**

**➡️Reading order prediction**

**➡️Document unwarping**

**➡️Optimal text detection for document text**

(**WORKAROUND**: Bring your own models for the above cases and use them with OCR Tamil for text recognition)


2. Unable to read text that is present in rotated form

<p align="left">
<img width="200" alt="teaser" src="https://github.com/gnana70/tamil_ocr/raw/main/test_images/9.jpg">
<img width="200" alt="teaser" src="https://github.com/gnana70/tamil_ocr/raw/main/test_images/8.jpg">
</p>

2. Currently supports Only English and Tamil Language

3. Document Text reading capability is limited. Auto identification of Paragraph, reading order are not supported along with Text detection model inability to detect and crop the Tamil text leads to accuracy decrease (**WORKAROUND** Can use your own text detection model along with OCR tamil text recognition model)
<p align="center">
<img width="200" alt="teaser" src="https://github.com/gnana70/tamil_ocr/raw/main/test_images/tamil_sentence.jpg">
</p>
<p align="center">
<span>Cropped Text from Text detection Model</span>
</p>
<p align="center">
<img width="200" alt="teaser" src="https://github.com/gnana70/tamil_ocr/raw/main/test_images/tamil_sentence_crop.jpg">
</p>
<p align="center">
Character **இ** missing due to text detection model error
</p>

**?**யற்கை மூலிகைகளில் இருந்து ஈர்த்தெடுக்கக்கப்பட்ட வீரிய உட்பொருட்களை உள்ளடக்கி எந்த இரசாயன சேர்க்கைகளும் இல்லாமல் உருவாக்கப்பட்ட இந்தியாவின் முதல் சித்த தயாரிப்பு
3. Currently supports only the Tamil language. I don't own the English model, as it is taken from the open-source implementation of PARSeq


## Acknowledgements 👏
Expand Down Expand Up @@ -244,5 +245,4 @@ Character **இ** missing due to text detection model error
year={2024},
url={https://github.com/gnana70/tamil_ocr}
}
```

```
99 changes: 98 additions & 1 deletion ocr_tamil/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@


class ParseqDataset(Dataset):
"""
Parseq Dataset loader
Args:
Dataset (list): List of Images
"""
def __init__(self, data, transform=None):
self.data = data
self.transform = transform
Expand All @@ -52,6 +59,15 @@ def __len__(self):


def download(url: str, dest_folder: str):
"""Download the model files from the server
Args:
url (str): file url
dest_folder (str): local folder path
Raises:
RuntimeError: _description_
"""
if not os.path.exists(dest_folder):
os.makedirs(dest_folder) # create folder if it does not exist

Expand Down Expand Up @@ -85,6 +101,8 @@ def download(url: str, dest_folder: str):
os.remove(file_path)

class OCR:
"""Tamil OCR class
"""
def __init__(self,detect=False,
tamil_model_path=None,
eng_model_path=None,
Expand All @@ -96,9 +114,27 @@ def __init__(self,detect=False,
low_text=0.3,
details=0,
lang=["tamil","english"],
mode = "full",
fp16=False,
recognize_thres = 0.85) -> None:
"""
Tamil OCR prediction initialization
Args:
detect (bool, optional): To enable the text detection. Defaults to False.
tamil_model_path (_type_, optional): Path for tamil text recognition model. Defaults to None.
eng_model_path (_type_, optional): Path for english text recognition model. Defaults to None.
detect_model_path (_type_, optional): Path for text detect model. Defaults to None.
enable_cuda (bool, optional): To enable or disable cuda. Defaults to True.
batch_size (int, optional): Prediction batch size for text recognition. Defaults to 8.
text_threshold (float, optional): Text detection threshold to classify text or not. Defaults to 0.5.
link_threshold (float, optional): To combine characters into words (distance). Defaults to 0.1.
low_text (float, optional): Helps in padding while cropping results from text detection. Defaults to 0.3.
details (int, optional): Output information controller. Defaults to 0.
lang (list, optional): Text recognize language. Defaults to ["tamil","english"].
fp16 (bool, optional): full precision vs half precision (experimental). Defaults to False.
recognize_thres (float, optional): Threshold to filter the texts based on prediction confidence (text recognition). Defaults to 0.85.
"""

if enable_cuda:
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
Expand Down Expand Up @@ -161,6 +197,11 @@ def __init__(self,detect=False,
self.craft_net = load_craftnet_model(cuda=False,weight_path=self.detect_model_path)

def get_transform(self):
"""Basic transform for prediction
Returns:
torch transforms: torch vision transformation
"""
transforms = []
transforms.extend([
T.Resize([ 32, 128 ], T.InterpolationMode.BICUBIC),
Expand All @@ -170,6 +211,9 @@ def get_transform(self):
return T.Compose(transforms)

def load_model(self):
"""
Load the required models into the memory
"""

self.img_transform = self.get_transform()
self.eng_character_set = """0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"""
Expand All @@ -194,6 +238,7 @@ def load_model(self):
# self.tamil_parseq = torch.load("ocr_tamil\model_weights\parseq_tamil_rotate.pt").to(self.device).eval()

def sort_bboxes(self,contours):

c = np.array(contours)
max_height = np.median(c[::, 3]) * 0.5

Expand All @@ -219,6 +264,15 @@ def sort_bboxes(self,contours):
return contours_sorted,line_info

def craft_detect(self,image,**kwargs):
"""Text detection predict
Args:
image (numpy array): image numpy array
Returns:
list: list of cropped numpy arrays for text detected
list: Bbox informations
"""
size = max(image.shape[0],image.shape[1],640)

# Reshaping to the nearest size
Expand Down Expand Up @@ -278,6 +332,16 @@ def craft_detect(self,image,**kwargs):
return exported_file_paths,updated_prediction_result

def decode_file_name(self,decode_text,text_char_confidence,special_sep_char="~"):
"""Maps the encoded text to tamil words
Args:
decode_text (string): text to decode
text_char_confidence (float): minimum text recognition threshold
special_sep_char (str, optional): separator for each character. Defaults to "~".
Returns:
string: decoded text
"""


indices = [x for x, v in enumerate(decode_text) if v == special_sep_char]
Expand Down Expand Up @@ -307,6 +371,14 @@ def decode_file_name(self,decode_text,text_char_confidence,special_sep_char="~")
return tamil_word

def read_image_input(self,image):
"""Reads the input image
Args:
image: Path, bytes and numpy array
Returns:
numpy array: image numpy array
"""
if type(image) == str:
img = cv2.imread(image)
# img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
Expand All @@ -328,6 +400,15 @@ def read_image_input(self,image):
return img

def text_recognize_batch(self,exported_regions):
"""Text recognition predictor
Args:
exported_regions (list): list of numpy array
Returns:
list: list of predicted text and confidence informations
"""


dataset = ParseqDataset(exported_regions, transform=self.img_transform)
dataloader = DataLoader(dataset, batch_size=self.batch_size)
Expand Down Expand Up @@ -405,6 +486,16 @@ def text_recognize_batch(self,exported_regions):
return text_list,conf_list

def output_formatter(self,text_list,conf_list,updated_prediction_result=None):
"""Output structure formatter
Args:
text_list (list): text information
conf_list (list): confidence information
updated_prediction_result (list, optional): bbox information. Defaults to None.
Returns:
list: output results
"""
final_result = []

if not self.details:
Expand All @@ -427,6 +518,12 @@ def output_formatter(self,text_list,conf_list,updated_prediction_result=None):

def predict(self,image):

""" Detect and recognize text informations
Returns:
List: extracted text information
"""

# To handle multiple images
if isinstance(image,list):
text_list = []
Expand Down

0 comments on commit 71a91db

Please sign in to comment.