Skip to content

Commit

Permalink
feat: Add character level confidence thresholds
Browse files Browse the repository at this point in the history
  • Loading branch information
plutasnyy committed Jan 6, 2025
1 parent 9e31ebc commit c0f2768
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 11 deletions.
42 changes: 42 additions & 0 deletions test_unstructured/partition/pdf_image/test_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pandas as pd
import pytest
import unstructured_pytesseract
from bs4 import BeautifulSoup, Tag
from pdf2image.exceptions import PDFPageCountError
from PIL import Image, UnidentifiedImageError
from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion
Expand Down Expand Up @@ -484,3 +485,44 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions):
# Check if the final layout contains both original elements and OCR-derived elements
assert all(element in final_layout for element in mock_out_layout)
assert any(element in final_layout for element in ocr_elements)


def test_extract_word_from_hocr():
    def _build_word_tag(glyphs: list[tuple[str, str, list[int]]]) -> Tag:
        """Assemble an hOCR `ocrx_word` span holding one `ocrx_cinfo` span per glyph."""
        word_tag = BeautifulSoup("<span class='ocrx_word'></span>", "html.parser").span
        for char, x_conf, bbox in glyphs:
            cinfo = BeautifulSoup(
                f"""
            <span class='ocrx_cinfo' title='x_bboxes {bbox[0]} {bbox[1]} {bbox[2]} {bbox[3]}; x_conf {x_conf}'>{char}</span>
            """,  # noqa : E501
                "html.parser",
            ).span
            word_tag.append(cinfo)
        return word_tag

    word_tag = _build_word_tag(
        [
            ("w", "99.0", [10, 10, 20, 20]),
            ("o", "98.5", [21, 9, 29, 20]),
            ("r", "97.5", [31, 10, 40, 21]),
            ("d", "96.0", [41, 11, 50, 22]),
            ("!", "50.0", [51, 10, 60, 20]),
            ("@", "45.0", [61, 10, 70, 20]),
        ]
    )

    # (threshold, expected text, expected bbox): raising the threshold drops
    # low-confidence characters and shrinks the merged bounding box accordingly.
    cases = [
        (0.0, "word!@", [10, 9, 70, 22]),
        (0.960, "word", [10, 9, 50, 22]),
        (0.990, "w", [10, 10, 20, 20]),
        (0.999, "", None),
    ]
    for threshold, expected_text, expected_bbox in cases:
        text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_tag, threshold)
        assert text == expected_text
        assert bbox == expected_bbox
2 changes: 1 addition & 1 deletion unstructured/partition/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int:
return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)

@property
def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> float:
    """Tesseract characters with confidence below this threshold are ignored.

    Read from the TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD environment
    variable as a probability in [0.0, 1.0]; the default 0.0 disables
    character-level filtering entirely.
    """
    # Fixes two inconsistencies: the env var read here previously kept the old
    # name "TESSERACT_CONFIDENCE_THRESHOLD" after the property was renamed, and
    # the return annotation said `int` although `_get_float` returns a float.
    return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0)

Expand Down
88 changes: 78 additions & 10 deletions unstructured/partition/utils/ocr_models/tesseract_ocr.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
from __future__ import annotations

import os
import re
from typing import TYPE_CHECKING, List

import cv2
import numpy as np
import pandas as pd
import unstructured_pytesseract
from bs4 import BeautifulSoup, Tag
from PIL import Image as PILImage
from unstructured_pytesseract import Output

from unstructured.logger import trace_logger
from unstructured.partition.utils.config import env_config
Expand Down Expand Up @@ -47,11 +48,10 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:

trace_logger.detail("Processing entire page OCR with tesseract...")
zoom = 1
ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
np.array(image),
ocr_df: pd.DataFrame = self.image_to_data_with_character_confidence_filter(
np.array(zoom_image(image, zoom)),
lang=self.language,
output_type=Output.DATAFRAME,
# config='--oem 3 --psm 6'
character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
)
ocr_df = ocr_df.dropna()

Expand All @@ -77,20 +77,88 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1),
max_zoom,
)
ocr_df = unstructured_pytesseract.image_to_data(
ocr_df = self.image_to_data_with_character_confidence_filter(
np.array(zoom_image(image, zoom)),
lang=self.language,
output_type=Output.DATAFRAME,
# config='--oem 3 --psm 6'
character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
)
ocr_df = ocr_df.dropna()
probabilities = ocr_df["conf"].div(100)
ocr_df = ocr_df[probabilities.ge(env_config.TESSERACT_CONFIDENCE_THRESHOLD)]
print("OCR FILTERING")
ocr_regions = self.parse_data(ocr_df, zoom=zoom)

return ocr_regions

def image_to_data_with_character_confidence_filter(
    self,
    image: np.ndarray,
    lang: str = "eng",
    config: str = "",
    character_confidence_threshold: float = 0.5,
) -> pd.DataFrame:
    """OCR `image` and return word-level results, dropping low-confidence characters.

    Runs Tesseract in hOCR mode with per-character boxes enabled, then rebuilds
    each word keeping only characters whose confidence is at or above
    `character_confidence_threshold` (a probability in [0.0, 1.0]).

    Args:
        image: image to OCR, as a numpy array.
        lang: Tesseract language code(s).
        config: extra Tesseract config flags, appended after `hocr_char_boxes`.
        character_confidence_threshold: characters below this are discarded.

    Returns:
        A DataFrame with columns left, top, width, height, text — one row per
        word that still has at least one surviving character.
    """
    # `image_to_pdf_or_hocr` returns the raw hOCR document, not a DataFrame
    # (the original `hocr: pd.DataFrame` annotation was wrong);
    # `hocr_char_boxes=1` makes Tesseract emit per-character ocrx_cinfo spans.
    hocr = unstructured_pytesseract.image_to_pdf_or_hocr(
        image,
        lang=lang,
        config="-c hocr_char_boxes=1 " + config,
        extension="hocr",
    )
    soup = BeautifulSoup(hocr, "html.parser")

    df_entries = []
    for word in soup.find_all("span", class_="ocrx_word"):
        text, bbox = self.extract_word_from_hocr(
            word=word, character_confidence_threshold=character_confidence_threshold
        )
        if text and bbox:
            left, top, right, bottom = bbox
            df_entries.append(
                {
                    "left": left,
                    "top": top,
                    "width": right - left,
                    "height": bottom - top,
                    "text": text,
                }
            )
    # Pass columns explicitly so a page with no surviving words still yields a
    # DataFrame with the expected schema (pd.DataFrame([]) has no columns at all).
    return pd.DataFrame(df_entries, columns=["left", "top", "width", "height", "text"])

@staticmethod
def extract_word_from_hocr(
word: Tag, character_confidence_threshold: float = 0.0
) -> tuple[str, list[int] | None]:
"""Extracts a word from an hOCR word tag, filtering out characters with low confidence."""
word_text = ""
word_bbox = None

character_spans = word.find_all("span", class_="ocrx_cinfo")
for character_span in character_spans:
char = character_span.text

char_title = character_span.get("title", "")
conf_match = re.search(r"x_conf (\d+\.\d+)", char_title)
bbox_match = re.search(r"x_bboxes (\d+) (\d+) (\d+) (\d+)", char_title)

if not (char and conf_match and bbox_match):
continue

character_probability = float(conf_match.group(1)) / 100
character_bbox = list(map(int, bbox_match.groups()))

if character_probability >= character_confidence_threshold:
word_text += char
if word_bbox is None:
word_bbox = character_bbox
else:
word_bbox = [
min(word_bbox[0], character_bbox[0]), # x1 - starts from 0
min(word_bbox[1], character_bbox[1]), # y1 - starts from 0
max(word_bbox[2], character_bbox[2]),
max(word_bbox[3], character_bbox[3]),
]
return word_text, word_bbox

@requires_dependencies("unstructured_inference")
def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutElement"]:
from unstructured.partition.pdf_image.inference_utils import (
Expand Down

0 comments on commit c0f2768

Please sign in to comment.