From cee5440b43943a96a580faf226da73e27b586546 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Wed, 8 Jan 2025 16:46:31 +0100 Subject: [PATCH] Set default Tesseract character confidence threshold to 0.85 --- unstructured/partition/utils/config.py | 2 +- unstructured/partition/utils/ocr_models/tesseract_ocr.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index 291ae1b6a3..43489df74f 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -99,7 +99,7 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int: @property def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> int: """Tesseract predictions with confidence below this threshold are ignored""" - return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0) + return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.85) @property def GOOGLEVISION_API_ENDPOINT(self) -> str: diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index 8668dec1fc..58572d1a72 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -92,7 +92,7 @@ def image_to_data_with_character_confidence_filter( image: np.ndarray, lang: str = "eng", config: str = "", - character_confidence_threshold: float = 0.5, + character_confidence_threshold: float = 0.85, ) -> pd.DataFrame: hocr: str = unstructured_pytesseract.image_to_pdf_or_hocr( image, @@ -104,7 +104,7 @@ def image_to_data_with_character_confidence_filter( return ocr_df def hocr_to_dataframe( - self, hocr: str, character_confidence_threshold: float = 0.0 + self, hocr: str, character_confidence_threshold: float = 0.85 ) -> pd.DataFrame: soup = BeautifulSoup(hocr, "html.parser") word_spans = soup.find_all("span", class_="ocrx_word")