Skip to content

Commit

Permalink
Set default threshold
Browse files Browse the repository at this point in the history
  • Loading branch information
plutasnyy committed Jan 8, 2025
1 parent 1611a61 commit cee5440
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 3 deletions.
2 changes: 1 addition & 1 deletion unstructured/partition/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int:
@property
def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> int:
"""Tesseract predictions with confidence below this threshold are ignored"""
- return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0)
+ return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.85)

@property
def GOOGLEVISION_API_ENDPOINT(self) -> str:
Expand Down
4 changes: 2 additions & 2 deletions unstructured/partition/utils/ocr_models/tesseract_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def image_to_data_with_character_confidence_filter(
image: np.ndarray,
lang: str = "eng",
config: str = "",
- character_confidence_threshold: float = 0.5,
+ character_confidence_threshold: float = 0.85,
) -> pd.DataFrame:
hocr: str = unstructured_pytesseract.image_to_pdf_or_hocr(
image,
Expand All @@ -104,7 +104,7 @@ def image_to_data_with_character_confidence_filter(
return ocr_df

def hocr_to_dataframe(
- self, hocr: str, character_confidence_threshold: float = 0.0
+ self, hocr: str, character_confidence_threshold: float = 0.85
) -> pd.DataFrame:
soup = BeautifulSoup(hocr, "html.parser")
word_spans = soup.find_all("span", class_="ocrx_word")
Expand Down

0 comments on commit cee5440

Please sign in to comment.