From c1e9b8efa09ed2738af19d6738714251777df1da Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Wed, 8 Jan 2025 11:35:38 +0100 Subject: [PATCH] Use word bboxes instead of character bboxes --- .../partition/pdf_image/test_ocr.py | 81 ++++++++++++------- .../utils/ocr_models/tesseract_ocr.py | 42 +++++----- 2 files changed, 71 insertions(+), 52 deletions(-) diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py index 7b1454a189..fe04d82c55 100644 --- a/test_unstructured/partition/pdf_image/test_ocr.py +++ b/test_unstructured/partition/pdf_image/test_ocr.py @@ -487,42 +487,67 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions): assert any(element in final_layout for element in ocr_elements) -def test_extract_word_from_hocr(): - def _create_hocr_word_span(characters: list[tuple[str, str, list[int]]]) -> Tag: - word_span = BeautifulSoup("", "html.parser").span - for char, x_conf, bbox in characters: - char_span = BeautifulSoup( - f""" - {char} - """, # noqa : E501 - "html.parser", - ).span - word_span.append(char_span) - return word_span +def _create_hocr_word_span( + characters: list[tuple[str, str]], word_bbox: tuple[int, int, int, int] +) -> Tag: + word_span = BeautifulSoup( + f"", + "html.parser", + ).span + for char, x_conf in characters: + char_span = BeautifulSoup( + f""" + {char} + """, # noqa : E501 + "html.parser", + ).span + word_span.append(char_span) + return word_span + +def test_extract_word_from_hocr(): characters = [ - ("w", "99.0", [10, 10, 20, 20]), - ("o", "98.5", [21, 9, 29, 20]), - ("r", "97.5", [31, 10, 40, 21]), - ("d", "96.0", [41, 11, 50, 22]), - ("!", "50.0", [51, 10, 60, 20]), - ("@", "45.0", [61, 10, 70, 20]), + ("w", "99.0"), + ("o", "98.5"), + ("r", "97.5"), + ("d", "96.0"), + ("!", "50.0"), + ("@", "45.0"), ] + word_bbox = (10, 9, 70, 22) + word_span = _create_hocr_word_span(characters, word_bbox) - word_span = _create_hocr_word_span(characters) - - 
text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0) + text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0) assert text == "word!@" - assert bbox == [10, 9, 70, 22] - text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960) + text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960) assert text == "word" - assert bbox == [10, 9, 50, 22] - text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990) + text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990) assert text == "w" - assert bbox == [10, 10, 20, 20] - text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999) + text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999) assert text == "" - assert bbox is None + + +def test_hocr_to_dataframe(): + characters = [ + ("w", "99.0"), + ("o", "98.5"), + ("r", "97.5"), + ("d", "96.0"), + ("!", "50.0"), + ("@", "45.0"), + ] + word_bbox = (10, 9, 70, 22) + hocr = str(_create_hocr_word_span(characters, word_bbox)) + df = OCRAgentTesseract().hocr_to_dataframe(hocr=hocr, character_confidence_threshold=0.960) + + assert df.shape == (1, 5) + assert df["left"].iloc[0] == 10 + assert df["top"].iloc[0] == 9 + assert df["width"].iloc[0] == 60 + assert df["height"].iloc[0] == 13 + assert df["text"].iloc[0] == "word" diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index 36e38787fa..64ba58e073 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -107,15 +107,22 @@ def hocr_to_dataframe( self, hocr: str, character_confidence_threshold: float = 0.0 ) -> pd.DataFrame: soup = BeautifulSoup(hocr, "html.parser") - words = soup.find_all("span", class_="ocrx_word") + word_spans = soup.find_all("span", class_="ocrx_word") df_entries = [] - for word in words: - text, bbox = self.extract_word_from_hocr( - word=word, 
character_confidence_threshold=character_confidence_threshold + for word_span in word_spans: + word_title = word_span.get("title", "") + bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", word_title) + + # Note: word bbox is used instead of combining characters together due to tesseract + # bug that causes the character bboxes to be outside the word bbox, and they have 0 + # height or width when text is horizontal + text = self.extract_word_from_hocr( + word=word_span, character_confidence_threshold=character_confidence_threshold ) - if text and bbox: - left, top, right, bottom = bbox + if text and bbox_match: + word_bbox = list(map(int, bbox_match.groups())) + left, top, right, bottom = word_bbox df_entries.append( { "left": left, @@ -131,42 +138,29 @@ def hocr_to_dataframe( @staticmethod def extract_word_from_hocr( word: Tag, character_confidence_threshold: float = 0.0 - ) -> tuple[str, list[int] | None]: + ) -> str | None: """Extracts a word from an hOCR word tag, filtering out characters with low confidence.""" character_spans = word.find_all("span", class_="ocrx_cinfo") if len(character_spans) == 0: - return "", None + return None word_text = "" - word_bbox = None - for character_span in character_spans: char = character_span.text char_title = character_span.get("title", "") conf_match = re.search(r"x_conf (\d+\.\d+)", char_title) - bbox_match = re.search(r"x_bboxes (\d+) (\d+) (\d+) (\d+)", char_title) - if not (char and conf_match and bbox_match): + if not (char and conf_match): continue character_probability = float(conf_match.group(1)) / 100 - character_bbox = list(map(int, bbox_match.groups())) if character_probability >= character_confidence_threshold: word_text += char - if word_bbox is None: - word_bbox = character_bbox - else: - word_bbox = [ - min(word_bbox[0], character_bbox[0]), # x1 - starts from 0 - min(word_bbox[1], character_bbox[1]), # y1 - starts from 0 - max(word_bbox[2], character_bbox[2]), - max(word_bbox[3], character_bbox[3]), - ] - 
- return word_text, word_bbox + + return word_text @requires_dependencies("unstructured_inference") def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutElement"]: