diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index 291ae1b6a3..43489df74f 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -99,7 +99,7 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int: @property def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> int: """Tesseract predictions with confidence below this threshold are ignored""" - return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0) + return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.85) @property def GOOGLEVISION_API_ENDPOINT(self) -> str: diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index 8668dec1fc..58572d1a72 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -92,7 +92,7 @@ def image_to_data_with_character_confidence_filter( image: np.ndarray, lang: str = "eng", config: str = "", - character_confidence_threshold: float = 0.5, + character_confidence_threshold: float = 0.85, ) -> pd.DataFrame: hocr: str = unstructured_pytesseract.image_to_pdf_or_hocr( image, @@ -104,7 +104,7 @@ def image_to_data_with_character_confidence_filter( return ocr_df def hocr_to_dataframe( - self, hocr: str, character_confidence_threshold: float = 0.0 + self, hocr: str, character_confidence_threshold: float = 0.85 ) -> pd.DataFrame: soup = BeautifulSoup(hocr, "html.parser") word_spans = soup.find_all("span", class_="ocrx_word")