Merge pull request #57 from gnana70/develop

document updation
gnana70 · Apr 2, 2024 · 71a91db · 71a91db
2 parents ec84a40 + 1e4f94c
commit 71a91db
Show file tree

Hide file tree

Showing 2 changed files with 119 additions and 22 deletions.
diff --git a/README.md b/README.md
@@ -23,7 +23,7 @@
   </p>
 </div>
 
- OCR Tamil can help you extract text from signboard, nameplates, storefronts etc., from Natural Scenes with high accuracy. This version of OCR is much more robust to tilted text compared to the Tesseract, Paddle OCR and Easy OCR as they are primarily built to work on the documents texts and not on natural scenes. This model is work in progress, feel free to contribute!!!
+ OCR Tamil can help you extract text from signboards, nameplates, storefronts, etc. in natural scenes with high accuracy. This version of OCR is much more robust to tilted text than Tesseract, Paddle OCR and Easy OCR, as they are primarily built to work on document text and not on natural scenes.
 
 ## Languages Supported 🔛
 **➡️ English**
@@ -176,30 +176,31 @@ OCR module can be initialized by setting following parameters as per your requir
 
 ## Limitations⛔
 
-1. Unable to read the text if they are present in rotated forms
+1. Document text reading capability is not supported, as the library doesn't have
+
+      **➡️Auto identification of Paragraph**
+
+      **➡️Orientation detection**
+
+      **➡️Skew correction**
+
+      **➡️Reading order prediction**
+
+      **➡️Document unwarping**
+
+      **➡️Optimal Text detection for Document text not available**  
+
+      (**WORKAROUND** Bring your own models for above cases and use with OCR tamil for text recognition)
+
+
+2. Unable to read text that is present in rotated form
 
 <p align="left">
 <img width="200" alt="teaser" src="https://github.com/gnana70/tamil_ocr/raw/main/test_images/9.jpg"> 
 <img width="200" alt="teaser" src="https://github.com/gnana70/tamil_ocr/raw/main/test_images/8.jpg">
 </p>
 
-2. Currently supports Only English and Tamil Language
-
-3. Document Text reading capability is limited. Auto identification of Paragraph, reading order are not supported along with Text detection model inability to detect and crop the Tamil text leads to accuracy decrease (**WORKAROUND** Can use your own text detection model along with OCR tamil text recognition model)
-<p align="center">
-<img width="200" alt="teaser" src="https://github.com/gnana70/tamil_ocr/raw/main/test_images/tamil_sentence.jpg">
-</p>
-<p align="center">
-<span>Cropped Text from Text detection Model</span>
-</p>
-<p align="center">
-<img width="200" alt="teaser" src="https://github.com/gnana70/tamil_ocr/raw/main/test_images/tamil_sentence_crop.jpg">
-</p>
-<p align="center">
-Character **இ** missing due to text detection model error 
-</p>
-
-**?**யற்கை மூலிகைகளில் இருந்து ஈர்த்தெடுக்கக்கப்பட்ட வீரிய உட்பொருட்களை உள்ளடக்கி எந்த இரசாயன சேர்க்கைகளும் இல்லாமல் உருவாக்கப்பட்ட இந்தியாவின் முதல் சித்த தயாரிப்பு 
+3. Currently supports only the Tamil language. I don't own the English model, as it's taken from the open-source implementation of PARSeq
 
 
 ## Acknowledgements 👏
@@ -244,5 +245,4 @@ Character **இ** missing due to text detection model error
   year={2024},
   url={https://github.com/gnana70/tamil_ocr}
 }
-```
-
+```
diff --git a/ocr_tamil/ocr.py b/ocr_tamil/ocr.py
@@ -33,6 +33,13 @@
 
 
 class ParseqDataset(Dataset):
+    """
+    
+    Parseq Dataset loader
+
+    Args:
+        Dataset (list): List of Images
+    """
     def __init__(self, data, transform=None):
         self.data = data
         self.transform = transform
@@ -52,6 +59,15 @@ def __len__(self):
 
 
 def download(url: str, dest_folder: str):
+    """Download the model files from the server
+
+    Args:
+        url (str): file url
+        dest_folder (str): local folder path
+
+    Raises:
+        RuntimeError: _description_
+    """
     if not os.path.exists(dest_folder):
         os.makedirs(dest_folder)  # create folder if it does not exist
 
@@ -85,6 +101,8 @@ def download(url: str, dest_folder: str):
             os.remove(file_path)
 
 class OCR:
+    """Tamil OCR class
+    """
     def __init__(self,detect=False,
                  tamil_model_path=None,
                  eng_model_path=None,
@@ -96,9 +114,27 @@ def __init__(self,detect=False,
                  low_text=0.3,
                  details=0,
                  lang=["tamil","english"],
-                 mode = "full",
                  fp16=False,
                  recognize_thres = 0.85) -> None:
+        """
+        
+        Tamil OCR prediction initialization
+
+        Args:
+            detect (bool, optional): To enable the text detection. Defaults to False.
+            tamil_model_path (_type_, optional): Path for tamil text recognition model. Defaults to None.
+            eng_model_path (_type_, optional): Path for english text recognition model. Defaults to None.
+            detect_model_path (_type_, optional): Path for text detect model. Defaults to None.
+            enable_cuda (bool, optional): To enable or disable cuda. Defaults to True.
+            batch_size (int, optional): Prediction batch size for text recognition. Defaults to 8.
+            text_threshold (float, optional): Text detection threshold to classify text or not. Defaults to 0.5.
+            link_threshold (float, optional): To combine characters into words (distance). Defaults to 0.1.
+            low_text (float, optional): Helps in padding while cropping results from text detection. Defaults to 0.3.
+            details (int, optional): Output information controller. Defaults to 0.
+            lang (list, optional): Text recognize language. Defaults to ["tamil","english"].
+            fp16 (bool, optional): full precision vs half precision (experimental). Defaults to False.
+            recognize_thres (float, optional): Threshold to filter the texts based on prediction confidence (text recognition). Defaults to 0.85.
+        """
 
         if enable_cuda:
             self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -161,6 +197,11 @@ def __init__(self,detect=False,
                 self.craft_net = load_craftnet_model(cuda=False,weight_path=self.detect_model_path)
 
     def get_transform(self):
+        """Basic transform for prediction
+
+        Returns:
+            torch transforms: torch vision transformation
+        """
         transforms = []
         transforms.extend([
             T.Resize([ 32, 128 ], T.InterpolationMode.BICUBIC),
@@ -170,6 +211,9 @@ def get_transform(self):
         return T.Compose(transforms)
 
     def load_model(self):
+        """
+        Load the required models into the memory
+        """
 
         self.img_transform = self.get_transform()
         self.eng_character_set = """0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"""
@@ -194,6 +238,7 @@ def load_model(self):
         # self.tamil_parseq = torch.load("ocr_tamil\model_weights\parseq_tamil_rotate.pt").to(self.device).eval()
 
     def sort_bboxes(self,contours):
+
         c = np.array(contours)
         max_height = np.median(c[::, 3]) * 0.5
 
@@ -219,6 +264,15 @@ def sort_bboxes(self,contours):
         return contours_sorted,line_info
 
     def craft_detect(self,image,**kwargs):
+        """Text detection predict
+
+        Args:
+            image (numpy array): image numpy array
+
+        Returns:
+            list: list of cropped numpy arrays for text detected
+            list: Bbox information
+        """
         size = max(image.shape[0],image.shape[1],640)
 
         # Reshaping to the nearest size
@@ -278,6 +332,16 @@ def craft_detect(self,image,**kwargs):
         return exported_file_paths,updated_prediction_result
 
     def decode_file_name(self,decode_text,text_char_confidence,special_sep_char="~"):
+        """Maps the encoded text to tamil words
+
+        Args:
+            decode_text (string): text to decode
+            text_char_confidence (float): minimum text recognition threshold
+            special_sep_char (str, optional): separator for each character. Defaults to "~".
+
+        Returns:
+            string: decoded text
+        """
 
 
         indices = [x for x, v in enumerate(decode_text) if v == special_sep_char]
@@ -307,6 +371,14 @@ def decode_file_name(self,decode_text,text_char_confidence,special_sep_char="~")
         return tamil_word
 
     def read_image_input(self,image):
+        """Reads the input image
+
+        Args:
+            image: Path, bytes and numpy array
+
+        Returns:
+            numpy array: image numpy array
+        """
         if type(image) == str:
             img = cv2.imread(image)
             # img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
@@ -328,6 +400,15 @@ def read_image_input(self,image):
         return img
 
     def text_recognize_batch(self,exported_regions):
+        """Text recognition predictor
+
+        Args:
+            exported_regions (list): list of numpy array
+
+        Returns:
+            list: list of predicted text and confidence information
+        """
+
 
         dataset = ParseqDataset(exported_regions, transform=self.img_transform)
         dataloader = DataLoader(dataset, batch_size=self.batch_size)
@@ -405,6 +486,16 @@ def text_recognize_batch(self,exported_regions):
         return text_list,conf_list
 
     def output_formatter(self,text_list,conf_list,updated_prediction_result=None):
+        """Output structure formatter
+
+        Args:
+            text_list (list): text information
+            conf_list (list): confidence information
+            updated_prediction_result (list, optional): bbox information. Defaults to None.
+
+        Returns:
+            list: output results
+        """
         final_result = []
 
         if not self.details:
@@ -427,6 +518,12 @@ def output_formatter(self,text_list,conf_list,updated_prediction_result=None):
 
     def predict(self,image):
 
+        """ Detect and recognize text information
+
+        Returns:
+            List: extracted text information
+        """
+
         # To handle multiple images
         if isinstance(image,list):
             text_list = []