Skip to content

Commit

Permalink
feat: Add character level confidence thresholds
Browse files Browse the repository at this point in the history
  • Loading branch information
plutasnyy committed Jan 6, 2025
1 parent 9e31ebc commit c0f2768
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 11 deletions.
42 changes: 42 additions & 0 deletions test_unstructured/partition/pdf_image/test_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pandas as pd
import pytest
import unstructured_pytesseract
from bs4 import BeautifulSoup, Tag
from pdf2image.exceptions import PDFPageCountError
from PIL import Image, UnidentifiedImageError
from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion
Expand Down Expand Up @@ -484,3 +485,44 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions):
# Check if the final layout contains both original elements and OCR-derived elements
assert all(element in final_layout for element in mock_out_layout)
assert any(element in final_layout for element in ocr_elements)


def test_extract_word_from_hocr():
    def _build_word_tag(glyphs: list[tuple[str, str, list[int]]]) -> Tag:
        """Assemble an hOCR `ocrx_word` span holding one `ocrx_cinfo` span per glyph."""
        word_tag = BeautifulSoup("<span class='ocrx_word'></span>", "html.parser").span
        for char, x_conf, bbox in glyphs:
            cinfo = BeautifulSoup(
                f"""
            <span class='ocrx_cinfo' title='x_bboxes {bbox[0]} {bbox[1]} {bbox[2]} {bbox[3]}; x_conf {x_conf}'>{char}</span>
            """,  # noqa : E501
                "html.parser",
            ).span
            word_tag.append(cinfo)
        return word_tag

    word_tag = _build_word_tag(
        [
            ("w", "99.0", [10, 10, 20, 20]),
            ("o", "98.5", [21, 9, 29, 20]),
            ("r", "97.5", [31, 10, 40, 21]),
            ("d", "96.0", [41, 11, 50, 22]),
            ("!", "50.0", [51, 10, 60, 20]),
            ("@", "45.0", [61, 10, 70, 20]),
        ]
    )

    # (threshold, expected text, expected bbox): raising the threshold drops
    # low-confidence characters and shrinks the merged bounding box accordingly.
    cases = [
        (0.0, "word!@", [10, 9, 70, 22]),
        (0.960, "word", [10, 9, 50, 22]),
        (0.990, "w", [10, 10, 20, 20]),
        (0.999, "", None),
    ]
    for threshold, expected_text, expected_bbox in cases:
        text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_tag, threshold)
        assert text == expected_text
        assert bbox == expected_bbox
2 changes: 1 addition & 1 deletion unstructured/partition/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int:
return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)

@property
def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> float:
    """Tesseract characters with confidence below this threshold are ignored.

    Read from the TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD environment
    variable as a probability in [0.0, 1.0]; the default 0.0 disables
    character-level filtering entirely.
    """
    # Fixes two inconsistencies: the env var read here previously kept the old
    # name "TESSERACT_CONFIDENCE_THRESHOLD" after the property was renamed, and
    # the return annotation said `int` although `_get_float` returns a float.
    return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0)

Expand Down
88 changes: 78 additions & 10 deletions unstructured/partition/utils/ocr_models/tesseract_ocr.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
from __future__ import annotations

import os
import re
from typing import TYPE_CHECKING, List

import cv2
import numpy as np
import pandas as pd
import unstructured_pytesseract
from bs4 import BeautifulSoup, Tag
from PIL import Image as PILImage
from unstructured_pytesseract import Output

from unstructured.logger import trace_logger
from unstructured.partition.utils.config import env_config
Expand Down Expand Up @@ -47,11 +48,10 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:

trace_logger.detail("Processing entire page OCR with tesseract...")
zoom = 1
ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
np.array(image),
ocr_df: pd.DataFrame = self.image_to_data_with_character_confidence_filter(
np.array(zoom_image(image, zoom)),
lang=self.language,
output_type=Output.DATAFRAME,
# config='--oem 3 --psm 6'
character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
)
ocr_df = ocr_df.dropna()

Expand All @@ -77,20 +77,88 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1),
max_zoom,
)
ocr_df = unstructured_pytesseract.image_to_data(
ocr_df = self.image_to_data_with_character_confidence_filter(
np.array(zoom_image(image, zoom)),
lang=self.language,
output_type=Output.DATAFRAME,
# config='--oem 3 --psm 6'
character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
)
ocr_df = ocr_df.dropna()
probabilities = ocr_df["conf"].div(100)
ocr_df = ocr_df[probabilities.ge(env_config.TESSERACT_CONFIDENCE_THRESHOLD)]
print("OCR FILTERING")
ocr_regions = self.parse_data(ocr_df, zoom=zoom)

return ocr_regions

def image_to_data_with_character_confidence_filter(
    self,
    image: np.ndarray,
    lang: str = "eng",
    config: str = "",
    character_confidence_threshold: float = 0.5,
) -> pd.DataFrame:
    """OCR `image` and return word-level results, dropping low-confidence characters.

    Runs Tesseract in hOCR mode with per-character boxes enabled, then rebuilds
    each word keeping only characters whose confidence is at or above
    `character_confidence_threshold` (a probability in [0.0, 1.0]).

    Args:
        image: image to OCR, as a numpy array.
        lang: Tesseract language code(s).
        config: extra Tesseract config flags, appended after `hocr_char_boxes`.
        character_confidence_threshold: characters below this are discarded.

    Returns:
        A DataFrame with columns left, top, width, height, text — one row per
        word that still has at least one surviving character.
    """
    # `image_to_pdf_or_hocr` returns the raw hOCR document, not a DataFrame
    # (the original `hocr: pd.DataFrame` annotation was wrong);
    # `hocr_char_boxes=1` makes Tesseract emit per-character ocrx_cinfo spans.
    hocr = unstructured_pytesseract.image_to_pdf_or_hocr(
        image,
        lang=lang,
        config="-c hocr_char_boxes=1 " + config,
        extension="hocr",
    )
    soup = BeautifulSoup(hocr, "html.parser")

    df_entries = []
    for word in soup.find_all("span", class_="ocrx_word"):
        text, bbox = self.extract_word_from_hocr(
            word=word, character_confidence_threshold=character_confidence_threshold
        )
        if text and bbox:
            left, top, right, bottom = bbox
            df_entries.append(
                {
                    "left": left,
                    "top": top,
                    "width": right - left,
                    "height": bottom - top,
                    "text": text,
                }
            )
    # Pass columns explicitly so a page with no surviving words still yields a
    # DataFrame with the expected schema (pd.DataFrame([]) has no columns at all).
    return pd.DataFrame(df_entries, columns=["left", "top", "width", "height", "text"])

@staticmethod
def extract_word_from_hocr(
word: Tag, character_confidence_threshold: float = 0.0
) -> tuple[str, list[int] | None]:
"""Extracts a word from an hOCR word tag, filtering out characters with low confidence."""
word_text = ""
word_bbox = None

character_spans = word.find_all("span", class_="ocrx_cinfo")
for character_span in character_spans:
char = character_span.text

char_title = character_span.get("title", "")
conf_match = re.search(r"x_conf (\d+\.\d+)", char_title)
bbox_match = re.search(r"x_bboxes (\d+) (\d+) (\d+) (\d+)", char_title)

if not (char and conf_match and bbox_match):
continue

character_probability = float(conf_match.group(1)) / 100
character_bbox = list(map(int, bbox_match.groups()))

if character_probability >= character_confidence_threshold:
word_text += char
if word_bbox is None:
word_bbox = character_bbox
else:
word_bbox = [
min(word_bbox[0], character_bbox[0]), # x1 - starts from 0
min(word_bbox[1], character_bbox[1]), # y1 - starts from 0
max(word_bbox[2], character_bbox[2]),
max(word_bbox[3], character_bbox[3]),
]
return word_text, word_bbox

@requires_dependencies("unstructured_inference")
def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutElement"]:
from unstructured.partition.pdf_image.inference_utils import (
Expand Down

0 comments on commit c0f2768

Please sign in to comment.