Skip to content

Commit

Permalink
fix: Correctly patch pdfminer to avoid unnecessarily and unsuccessful…
Browse files Browse the repository at this point in the history
…ly repairing PDFs with long content streams, causing needless and endless OCR (#3822)

Fixes: #3815 

Verified on my very large documents that it doesn't unnecessarily and
unsuccessfully "repair" them.

You may or may not wish to keep the version check in `patch_psparser`.
Since ~you're pinning the version of pdfminer.six and since it isn't
guaranteed that the bug in question will be fixed in the next
pdfminer.six release (but it is rather serious, so I should hope so),
then perhaps you just want to unconditionally patch it.~ it seems like
pinning of versions is only operative when running from Docker (good!)
so never mind! Keep that version check!

Also corrected an import so that if you do feel like using a newer
version of pdfminer.six, it won't break on you.

---------

Authored-by: David Huggins-Daines <[email protected]>
  • Loading branch information
dhdaines authored Jan 24, 2025
1 parent e230364 commit 9e5ff22
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 18 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
## 0.16.16-dev1
## 0.16.16-dev2

### Enhancements

### Features
- **Vectorize layout (inferred, extracted, and OCR) data structure** Using `np.ndarray` to store a group of layout elements or text regions instead of using a list of objects. This improves the memory efficiency and compute speed around layout merging and deduplication.

### Fixes
- **Correctly patch pdfminer to avoid PDF repair**. The patch applied to pdfminer's parser caused it to occasionally split tokens in content streams, throwing `PDFSyntaxError`. Repairing these PDFs sometimes failed (since they were not actually invalid) resulting in unnecessary OCR fallback.

* **Drop usage of ndjson dependency**

Expand Down
16 changes: 15 additions & 1 deletion test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1205,8 +1205,8 @@ def test_partition_pdf_with_fast_finds_headers_footers(
@pytest.mark.parametrize(
("filename", "expected_log"),
[
# This one is *actually* an invalid PDF document
("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."),
("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
],
)
def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog):
Expand All @@ -1215,6 +1215,20 @@ def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_lo
assert expected_log in caplog.text


@pytest.mark.parametrize(
("filename", "expected_log"),
[
# This one is *not* an invalid PDF document, make sure we
# don't try to "repair" it unnecessarily
("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
],
)
def test_properly_patch_pdfminer(filename, expected_log, caplog):
caplog.set_level(logging.INFO)
assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}"))
assert expected_log not in caplog.text


def assert_element_extraction(
elements: list[Element],
extract_image_block_types: list[str],
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.16-dev1" # pragma: no cover
__version__ = "0.16.16-dev2" # pragma: no cover
12 changes: 7 additions & 5 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

import numpy as np
import wrapt
from pdfminer import psparser
from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
from pdfminer.utils import open_filename
from pi_heif import register_heif_opener
Expand Down Expand Up @@ -96,15 +95,18 @@
PartitionStrategy,
)
from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements
from unstructured.patches.pdfminer import parse_keyword
from unstructured.patches.pdfminer import patch_psparser
from unstructured.utils import first, requires_dependencies

if TYPE_CHECKING:
pass

# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
# the bug: https://github.com/pdfminer/pdfminer.six/pull/885
psparser.PSBaseParser._parse_keyword = parse_keyword # type: ignore

# Correct a bug that was introduced by a previous patch to
# pdfminer.six, causing needless and unsuccessful repairing of PDFs
# which were not actually broken.
patch_psparser()


RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)

Expand Down
2 changes: 1 addition & 1 deletion unstructured/partition/pdf_image/pdfminer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PSSyntaxError
from pdfminer.psparser import PSSyntaxError

from unstructured.logger import logger
from unstructured.utils import requires_dependencies
Expand Down
70 changes: 61 additions & 9 deletions unstructured/patches/pdfminer.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,35 @@
from typing import Union
import functools
from typing import Tuple, Union

from pdfminer.psparser import END_KEYWORD, KWD, PSBaseParser, PSKeyword
import pdfminer
from pdfminer.psparser import (
END_KEYWORD,
KWD,
PSEOF,
PSBaseParser,
PSBaseParserToken,
PSKeyword,
log,
)

factory_seek = PSBaseParser.seek

def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
"""Patch for pdfminer method _parse_keyword of PSBaseParser. Changes are identical to the PR
https://github.com/pdfminer/pdfminer.six/pull/885."""

@functools.wraps(PSBaseParser.seek)
def seek(self: PSBaseParser, pos: int) -> None:
factory_seek(self, pos)
self.eof = False


@functools.wraps(PSBaseParser._parse_keyword)
def _parse_keyword(self, s: bytes, i: int) -> int:
m = END_KEYWORD.search(s, i)
if not m:
j = len(s)
self._curtoken += s[i:]
else:
if m:
j = m.start(0)
self._curtoken += s[i:j]
else:
self._curtoken += s[i:]
return len(s)
if self._curtoken == b"true":
token: Union[bool, PSKeyword] = True
elif self._curtoken == b"false":
Expand All @@ -22,3 +39,38 @@ def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
self._add_token(token)
self._parse1 = self._parse_main
return j


@functools.wraps(PSBaseParser.nexttoken)
def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
if self.eof:
# It's not really unexpected, come on now...
raise PSEOF("Unexpected EOF")
while not self._tokens:
try:
self.fillbuf()
self.charpos = self._parse1(self.buf, self.charpos)
except PSEOF:
# If we hit EOF in the middle of a token, try to parse
# it by tacking on whitespace, and delay raising PSEOF
# until next time around
self.charpos = self._parse1(b"\n", 0)
self.eof = True
# Oh, so there wasn't actually a token there? OK.
if not self._tokens:
raise
token = self._tokens.pop(0)
log.debug("nexttoken: %r", token)
return token


def patch_psparser():
"""Monkey-patch certain versions of pdfminer.six to avoid dropping
tokens at EOF (before 20231228) and splitting tokens at buffer
boundaries (20231228 and 20240706).
"""
# Presuming the bug will be fixed in the next release
if pdfminer.__version__ <= "20240706":
PSBaseParser.seek = seek
PSBaseParser._parse_keyword = _parse_keyword
PSBaseParser.nexttoken = nexttoken

0 comments on commit 9e5ff22

Please sign in to comment.