diff --git a/CHANGELOG.md b/CHANGELOG.md
index b8bc339c9d..ea2d19c906 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,9 @@
-## 0.11.4-dev0
+## 0.11.4-dev1
 
 ### Enhancements
 
+* **Refactor pdfminer code.** The pdfminer code is moved from `unstructured-inference` to `unstructured`.
+
 ### Features
 
 ### Fixes
@@ -23,8 +25,8 @@
 ## 0.11.1
 
 ### Enhancements
 
-* **Use `pikepdf` to repair invalid PDF structure** for PDFminer when we see error `PSSyntaxError` when PDFminer opens the document and creates the PDFminer pages object or processes a single PDF page.
+* **Use `pikepdf` to repair invalid PDF structure** for PDFminer when we see the error `PSSyntaxError` while PDFminer opens the document and creates the PDFminer pages object, or while it processes a single PDF page.
 * **Batch Source Connector support** For instances where it is more optimal to read content from a source connector in batches, a new batch ingest doc is added which creates multiple ingest docs after reading them in batches per process.
 
 ### Features
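For reference, the user-visible effect of the refactor entry above is the new `pdf_image` subpackage. A minimal sketch of the import-path migration (the example file name is borrowed from the test fixtures elsewhere in this diff; the old paths are shown only as comments):

```python
# Sketch of the import-path change introduced by this refactor.
# Before: from unstructured.partition.pdf import partition_pdf
# Before: from unstructured.partition.image import partition_image
from unstructured.partition.pdf_image.image import partition_image
from unstructured.partition.pdf_image.pdf import partition_pdf

# partition_pdf keeps its existing signature; only the module path moved.
elements = partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf")
```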
diff --git a/docs/requirements.txt b/docs/requirements.txt
index a1cb4e31be..129ce79f3d 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -36,7 +36,7 @@ idna==3.6
     #   requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==6.8.0
+importlib-metadata==6.9.0
     # via sphinx
 jinja2==3.1.2
     # via
diff --git a/examples/custom-layout-order/evaluate_natural_reading_order.py b/examples/custom-layout-order/evaluate_natural_reading_order.py
index ed4f6c09dd..165841957f 100644
--- a/examples/custom-layout-order/evaluate_natural_reading_order.py
+++ b/examples/custom-layout-order/evaluate_natural_reading_order.py
@@ -8,7 +8,7 @@
 from PIL import Image
 
 from unstructured.documents.elements import PageBreak
-from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.pdf_image.pdf import partition_pdf
 from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT
 from unstructured.partition.utils.xycut import (
     bbox2points,
diff --git a/examples/layout-analysis/visualization.py b/examples/layout-analysis/visualization.py
index a13bb930cb..3906ea0a1e 100644
--- a/examples/layout-analysis/visualization.py
+++ b/examples/layout-analysis/visualization.py
@@ -7,7 +7,7 @@
 from unstructured_inference.visualize import draw_bbox
 
 from unstructured.documents.elements import PageBreak
-from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.pdf_image.pdf import partition_pdf
 
 CUR_DIR = pathlib.Path(__file__).parent.resolve()
diff --git a/requirements/build.txt b/requirements/build.txt
index a1cb4e31be..129ce79f3d 100644
--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -36,7 +36,7 @@ idna==3.6
     #   requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==6.8.0
+importlib-metadata==6.9.0
     # via sphinx
 jinja2==3.1.2
     # via
diff --git a/requirements/dev.txt b/requirements/dev.txt
index 52e9747700..c5592adbd0 100644
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -91,7 +91,7 @@ idna==3.6
     #   anyio
     #   jsonschema
     #   requests
-importlib-metadata==6.8.0
+importlib-metadata==6.9.0
     # via
     #   build
     #   jupyter-client
@@ -138,7 +138,7 @@ jsonschema[format-nongpl]==4.20.0
     #   jupyter-events
     #   jupyterlab-server
     #   nbformat
-jsonschema-specifications==2023.11.1
+jsonschema-specifications==2023.11.2
     # via jsonschema
 jupyter==1.0.0
     # via -r dev.in
@@ -301,7 +301,7 @@ qtconsole==5.5.1
     # via jupyter
 qtpy==2.4.1
     # via qtconsole
-referencing==0.31.0
+referencing==0.31.1
     # via
     #   jsonschema
     #   jsonschema-specifications
@@ -319,7 +319,7 @@ rfc3986-validator==0.1.1
     # via
     #   jsonschema
     #   jupyter-events
-rpds-py==0.13.1
+rpds-py==0.13.2
     # via
     #   jsonschema
     #   referencing
@@ -354,7 +354,7 @@ tomli==2.0.1
     #   jupyterlab
     #   pip-tools
     #   pyproject-hooks
-tornado==6.3.3
+tornado==6.4
     # via
     #   ipykernel
     #   jupyter-client
@@ -395,7 +395,7 @@ urllib3==1.26.18
     #   -c constraints.in
     #   -c test.txt
     #   requests
-virtualenv==20.24.7
+virtualenv==20.25.0
     # via pre-commit
 wcwidth==0.2.12
     # via prompt-toolkit
diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt
index cf0b1f2c08..940336c7ca 100644
--- a/requirements/extra-markdown.txt
+++ b/requirements/extra-markdown.txt
@@ -4,7 +4,7 @@
 #
 # pip-compile --output-file=extra-markdown.txt extra-markdown.in
 #
-importlib-metadata==6.8.0
+importlib-metadata==6.9.0
     # via markdown
 markdown==3.5.1
     # via -r extra-markdown.in
diff --git a/requirements/extra-msg.txt b/requirements/extra-msg.txt
index 525809363e..0b4a6bfb15 100644
--- a/requirements/extra-msg.txt
+++ b/requirements/extra-msg.txt
@@ -6,5 +6,5 @@
 #
 msg-parser==1.2.0
     # via -r extra-msg.in
-olefile==0.46
+olefile==0.47
     # via msg-parser
diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt
index 4999f5d3b6..896e548897 100644
--- a/requirements/extra-paddleocr.txt
+++ b/requirements/extra-paddleocr.txt
@@ -59,7 +59,7 @@ imageio==2.33.0
     #   scikit-image
 imgaug==0.4.0
     # via unstructured-paddleocr
-importlib-metadata==6.8.0
+importlib-metadata==6.9.0
     # via flask
 importlib-resources==6.1.1
     # via matplotlib
diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in
index bcccc2aceb..4ccf33f804 100644
--- a/requirements/extra-pdf-image.in
+++ b/requirements/extra-pdf-image.in
@@ -8,7 +8,7 @@ pikepdf
 pypdf
 # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
 # when unstructured library is.
-unstructured-inference==0.7.15
+unstructured-inference==0.7.17
 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats
 # from one tesseract call
 unstructured.pytesseract>=0.3.12
diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt
index 9a5ae381a7..dc7f2ec7a3 100644
--- a/requirements/extra-pdf-image.txt
+++ b/requirements/extra-pdf-image.txt
@@ -250,7 +250,7 @@ typing-extensions==4.8.0
     #   torch
 tzdata==2023.3
     # via pandas
-unstructured-inference==0.7.15
+unstructured-inference==0.7.17
     # via -r extra-pdf-image.in
 unstructured-pytesseract==0.3.12
     # via
diff --git a/requirements/ingest/airtable.txt b/requirements/ingest/airtable.txt
index 0560b94e23..d9145af2be 100644
--- a/requirements/ingest/airtable.txt
+++ b/requirements/ingest/airtable.txt
@@ -19,7 +19,7 @@ idna==3.6
     #   requests
 inflection==0.5.1
     # via pyairtable
-pyairtable==2.2.0
+pyairtable==2.2.1
     # via -r ingest/airtable.in
 pydantic==1.10.13
     # via
diff --git a/requirements/ingest/azure.txt b/requirements/ingest/azure.txt
index 779d423308..c54b4e0aeb 100644
--- a/requirements/ingest/azure.txt
+++ b/requirements/ingest/azure.txt
@@ -76,9 +76,7 @@ portalocker==2.8.2
 pycparser==2.21
     # via cffi
 pyjwt[crypto]==2.8.0
-    # via
-    #   msal
-    #   pyjwt
+    # via msal
 requests==2.31.0
     # via
     #   -c ingest/../base.txt
diff --git a/requirements/ingest/box.txt b/requirements/ingest/box.txt
index aec2ef132f..506a27c5de 100644
--- a/requirements/ingest/box.txt
+++ b/requirements/ingest/box.txt
@@ -9,9 +9,7 @@ attrs==23.1.0
 boxfs==0.2.1
     # via -r ingest/box.in
 boxsdk[jwt]==3.9.2
-    # via
-    #   boxfs
-    #   boxsdk
+    # via boxfs
 certifi==2023.11.17
     # via
     #   -c ingest/../base.txt
diff --git a/requirements/ingest/confluence.txt b/requirements/ingest/confluence.txt
index 9121adfb5e..4d37f5b9e2 100644
--- a/requirements/ingest/confluence.txt
+++ b/requirements/ingest/confluence.txt
@@ -4,7 +4,7 @@
 #
 # pip-compile --output-file=ingest/confluence.txt ingest/confluence.in
 #
-atlassian-python-api==3.41.3
+atlassian-python-api==3.41.4
     # via -r ingest/confluence.in
 certifi==2023.11.17
     # via
diff --git a/requirements/ingest/embed-aws-bedrock.txt b/requirements/ingest/embed-aws-bedrock.txt
index 50e4c401ce..e155405f14 100644
--- a/requirements/ingest/embed-aws-bedrock.txt
+++ b/requirements/ingest/embed-aws-bedrock.txt
@@ -46,6 +46,8 @@ frozenlist==1.4.0
     # via
     #   aiohttp
     #   aiosignal
+greenlet==3.0.1
+    # via sqlalchemy
 idna==3.6
     # via
     #   -c ingest/../base.txt
@@ -62,11 +64,11 @@ jsonpatch==1.33
     #   langchain-core
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.341
+langchain==0.0.344
     # via -r ingest/embed-aws-bedrock.in
-langchain-core==0.0.6
+langchain-core==0.0.8
     # via langchain
-langsmith==0.0.67
+langsmith==0.0.68
     # via
     #   langchain
     #   langchain-core
diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt
index 650134007b..771a613391 100644
--- a/requirements/ingest/embed-huggingface.txt
+++ b/requirements/ingest/embed-huggingface.txt
@@ -51,6 +51,8 @@ fsspec==2023.9.1
     #   -c ingest/../constraints.in
     #   huggingface-hub
     #   torch
+greenlet==3.0.1
+    # via sqlalchemy
 huggingface==0.0.1
     # via -r ingest/embed-huggingface.in
 huggingface-hub==0.19.4
@@ -77,11 +79,11 @@ jsonpatch==1.33
     #   langchain-core
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.341
+langchain==0.0.344
     # via -r ingest/embed-huggingface.in
-langchain-core==0.0.6
+langchain-core==0.0.8
     # via langchain
-langsmith==0.0.67
+langsmith==0.0.68
     # via
     #   langchain
     #   langchain-core
diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt
index 6798574d0e..0486bc927d 100644
--- a/requirements/ingest/embed-openai.txt
+++ b/requirements/ingest/embed-openai.txt
@@ -43,6 +43,8 @@ frozenlist==1.4.0
     # via
     #   aiohttp
     #   aiosignal
+greenlet==3.0.1
+    # via sqlalchemy
 h11==0.14.0
     # via httpcore
 httpcore==1.0.2
@@ -62,11 +64,11 @@ jsonpatch==1.33
     #   langchain-core
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.341
+langchain==0.0.344
     # via -r ingest/embed-openai.in
-langchain-core==0.0.6
+langchain-core==0.0.8
     # via langchain
-langsmith==0.0.67
+langsmith==0.0.68
     # via
     #   langchain
     #   langchain-core
@@ -87,7 +89,7 @@ numpy==1.24.4
     #   -c ingest/../base.txt
     #   -c ingest/../constraints.in
     #   langchain
-openai==1.3.5
+openai==1.3.7
     # via -r ingest/embed-openai.in
 packaging==23.2
     # via
@@ -116,6 +118,7 @@ sniffio==1.3.0
     # via
     #   anyio
     #   httpx
+    #   openai
 sqlalchemy==2.0.23
     # via langchain
 tenacity==8.2.3
diff --git a/requirements/ingest/gcs.txt b/requirements/ingest/gcs.txt
index 8d800668af..171f91313f 100644
--- a/requirements/ingest/gcs.txt
+++ b/requirements/ingest/gcs.txt
@@ -46,7 +46,7 @@ google-api-core==2.14.0
     # via
     #   google-cloud-core
     #   google-cloud-storage
-google-auth==2.23.4
+google-auth==2.24.0
     # via
     #   gcsfs
     #   google-api-core
diff --git a/requirements/ingest/github.txt b/requirements/ingest/github.txt
index 736b8b9ee6..f39ad3cd32 100644
--- a/requirements/ingest/github.txt
+++ b/requirements/ingest/github.txt
@@ -30,9 +30,7 @@ pycparser==2.21
 pygithub==2.1.1
     # via -r ingest/github.in
 pyjwt[crypto]==2.8.0
-    # via
-    #   pygithub
-    #   pyjwt
+    # via pygithub
 pynacl==1.5.0
     # via pygithub
 python-dateutil==2.8.2
diff --git a/requirements/ingest/google-drive.txt b/requirements/ingest/google-drive.txt
index 882a3bcf42..49912a2083 100644
--- a/requirements/ingest/google-drive.txt
+++ b/requirements/ingest/google-drive.txt
@@ -17,9 +17,9 @@ charset-normalizer==3.3.2
     #   requests
 google-api-core==2.14.0
     # via google-api-python-client
-google-api-python-client==2.108.0
+google-api-python-client==2.109.0
     # via -r ingest/google-drive.in
-google-auth==2.23.4
+google-auth==2.24.0
     # via
     #   google-api-core
     #   google-api-python-client
diff --git a/requirements/ingest/hubspot.txt b/requirements/ingest/hubspot.txt
index 95b073e691..d67982d3d0 100644
--- a/requirements/ingest/hubspot.txt
+++ b/requirements/ingest/hubspot.txt
@@ -2,19 +2,19 @@
 # This file is autogenerated by pip-compile with Python 3.8
 # by the following command:
 #
-# pip-compile requirements/ingest-hubspot.in
+# pip-compile --output-file=ingest/hubspot.txt ingest/hubspot.in
 #
-certifi==2023.7.22
+certifi==2023.11.17
     # via hubspot-api-client
 hubspot-api-client==8.1.1
-    # via -r requirements/ingest-hubspot.in
+    # via -r ingest/hubspot.in
 python-dateutil==2.8.2
     # via hubspot-api-client
 six==1.16.0
     # via
     #   hubspot-api-client
     #   python-dateutil
-urllib3==1.26.17
+urllib3==2.1.0
     # via
-    #   -r requirements/ingest-hubspot.in
+    #   -r ingest/hubspot.in
     #   hubspot-api-client
diff --git a/requirements/ingest/jira.txt b/requirements/ingest/jira.txt
index 3ab059f559..0b77cb9945 100644
--- a/requirements/ingest/jira.txt
+++ b/requirements/ingest/jira.txt
@@ -4,7 +4,7 @@
 #
 # pip-compile --output-file=ingest/jira.txt ingest/jira.in
 #
-atlassian-python-api==3.41.3
+atlassian-python-api==3.41.4
     # via -r ingest/jira.in
 certifi==2023.11.17
     # via
diff --git a/requirements/ingest/mongodb.txt b/requirements/ingest/mongodb.txt
index e193ef4e0d..d395eb0abb 100644
--- a/requirements/ingest/mongodb.txt
+++ b/requirements/ingest/mongodb.txt
@@ -6,5 +6,5 @@
 #
 dnspython==2.4.2
     # via pymongo
-pymongo==4.6.0
+pymongo==4.6.1
     # via -r ingest/mongodb.in
diff --git a/requirements/ingest/onedrive.txt b/requirements/ingest/onedrive.txt
index 99930f62ca..155fdcb36a 100644
--- a/requirements/ingest/onedrive.txt
+++ b/requirements/ingest/onedrive.txt
@@ -40,9 +40,7 @@ office365-rest-python-client==2.4.2
 pycparser==2.21
     # via cffi
 pyjwt[crypto]==2.8.0
-    # via
-    #   msal
-    #   pyjwt
+    # via msal
 pytz==2023.3.post1
     # via office365-rest-python-client
 requests==2.31.0
diff --git a/requirements/ingest/outlook.txt b/requirements/ingest/outlook.txt
index becea4ed3a..03aa4ffdd0 100644
--- a/requirements/ingest/outlook.txt
+++ b/requirements/ingest/outlook.txt
@@ -34,9 +34,7 @@ office365-rest-python-client==2.4.2
 pycparser==2.21
     # via cffi
 pyjwt[crypto]==2.8.0
-    # via
-    #   msal
-    #   pyjwt
+    # via msal
 pytz==2023.3.post1
     # via office365-rest-python-client
 requests==2.31.0
diff --git a/requirements/ingest/pinecone.in b/requirements/ingest/pinecone.in
index 939f61e6d4..ebaedb531b 100644
--- a/requirements/ingest/pinecone.in
+++ b/requirements/ingest/pinecone.in
@@ -1,3 +1,3 @@
--c constraints.in
--c base.txt
+-c ../constraints.in
+-c ../base.txt
 pinecone-client
diff --git a/requirements/ingest/pinecone.txt b/requirements/ingest/pinecone.txt
index 19c30cdeff..ffa46e75ce 100644
--- a/requirements/ingest/pinecone.txt
+++ b/requirements/ingest/pinecone.txt
@@ -1,56 +1,58 @@
 #
-# This file is autogenerated by pip-compile with Python 3.10
+# This file is autogenerated by pip-compile with Python 3.8
 # by the following command:
 #
-# pip-compile requirements/ingest-pinecone.in
+# pip-compile --output-file=ingest/pinecone.txt ingest/pinecone.in
 #
-certifi==2023.7.22
+certifi==2023.11.17
     # via
-    #   -c requirements/base.txt
-    #   -c requirements/constraints.in
+    #   -c ingest/../base.txt
+    #   -c ingest/../constraints.in
     #   requests
-charset-normalizer==3.3.0
+charset-normalizer==3.3.2
     # via
-    #   -c requirements/base.txt
+    #   -c ingest/../base.txt
     #   requests
 dnspython==2.4.2
     # via pinecone-client
-idna==3.4
+idna==3.6
     # via
-    #   -c requirements/base.txt
+    #   -c ingest/../base.txt
     #   requests
 loguru==0.7.2
     # via pinecone-client
 numpy==1.24.4
     # via
-    #   -c requirements/base.txt
-    #   -c requirements/constraints.in
+    #   -c ingest/../base.txt
+    #   -c ingest/../constraints.in
     #   pinecone-client
 pinecone-client==2.2.4
-    # via -r requirements/ingest-pinecone.in
+    # via -r ingest/pinecone.in
 python-dateutil==2.8.2
-    # via pinecone-client
+    # via
+    #   -c ingest/../base.txt
+    #   pinecone-client
 pyyaml==6.0.1
     # via pinecone-client
 requests==2.31.0
     # via
-    #   -c requirements/base.txt
+    #   -c ingest/../base.txt
     #   pinecone-client
 six==1.16.0
     # via
-    #   -c requirements/base.txt
+    #   -c ingest/../base.txt
     #   python-dateutil
 tqdm==4.66.1
     # via
-    #   -c requirements/base.txt
+    #   -c ingest/../base.txt
     #   pinecone-client
 typing-extensions==4.8.0
     # via
-    #   -c requirements/base.txt
+    #   -c ingest/../base.txt
     #   pinecone-client
 urllib3==1.26.18
     # via
-    #   -c requirements/base.txt
-    #   -c requirements/constraints.in
+    #   -c ingest/../base.txt
+    #   -c ingest/../constraints.in
     #   pinecone-client
     #   requests
diff --git a/requirements/ingest/sharepoint.txt b/requirements/ingest/sharepoint.txt
index 82f58e365c..d49b89c227 100644
--- a/requirements/ingest/sharepoint.txt
+++ b/requirements/ingest/sharepoint.txt
@@ -34,9 +34,7 @@ office365-rest-python-client==2.4.2
 pycparser==2.21
     # via cffi
 pyjwt[crypto]==2.8.0
-    # via
-    #   msal
-    #   pyjwt
+    # via msal
 pytz==2023.3.post1
     # via office365-rest-python-client
 requests==2.31.0
diff --git a/setup.cfg b/setup.cfg
index 85c0e27436..5b463cff97 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -5,6 +5,7 @@ license_files = LICENSE.md
 max-line-length = 100
 exclude =
     .venv
+    unstructured-inference
 
 [tool:pytest]
 filterwarnings =
diff --git a/test_unstructured/partition/pdf_image/test_chipper.py b/test_unstructured/partition/pdf_image/test_chipper.py
index d625f97876..469362ade8 100644
--- a/test_unstructured/partition/pdf_image/test_chipper.py
+++ b/test_unstructured/partition/pdf_image/test_chipper.py
@@ -1,6 +1,6 @@
 import pytest
 
-from unstructured.partition import pdf
+from unstructured.partition.pdf_image import pdf
 from unstructured.partition.utils.constants import PartitionStrategy
diff --git a/test_unstructured/partition/pdf_image/test_image.py b/test_unstructured/partition/pdf_image/test_image.py
index d233c345a8..4b50f077c6 100644
--- a/test_unstructured/partition/pdf_image/test_image.py
+++ b/test_unstructured/partition/pdf_image/test_image.py
@@ -10,7 +10,7 @@
 from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
 from unstructured.chunking.title import chunk_by_title
 from unstructured.documents.elements import ElementType
-from unstructured.partition import image, ocr, pdf
+from unstructured.partition.pdf_image import image, ocr, pdf
 from unstructured.partition.utils.constants import (
     UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
     PartitionStrategy,
@@ -303,7 +303,7 @@ def test_partition_image_metadata_date(
 ):
     mocked_last_modification_date = "2029-07-05T09:24:28"
     mocker.patch(
-        "unstructured.partition.pdf.get_last_modified_date",
+        "unstructured.partition.pdf_image.pdf.get_last_modified_date",
         return_value=mocked_last_modification_date,
     )
     elements = image.partition_image(filename=filename)
@@ -317,7 +317,7 @@ def test_partition_image_with_hi_res_strategy_metadata_date(
 ):
     mocked_last_modification_date = "2029-07-05T09:24:28"
     mocker.patch(
-        "unstructured.partition.pdf.get_last_modified_date",
+        "unstructured.partition.pdf_image.pdf.get_last_modified_date",
         return_value=mocked_last_modification_date,
     )
     elements = image.partition_image(filename=filename, strategy=PartitionStrategy.HI_RES)
@@ -333,7 +333,7 @@ def test_partition_image_metadata_date_custom_metadata_date(
     expected_last_modification_date = "2009-07-05T09:24:28"
 
     mocker.patch(
-        "unstructured.partition.pdf.get_last_modified_date",
+        "unstructured.partition.pdf_image.pdf.get_last_modified_date",
         return_value=mocked_last_modification_date,
     )
     elements = image.partition_image(
@@ -352,7 +352,7 @@ def test_partition_image_with_hi_res_strategy_metadata_date_custom_metadata_date
     expected_last_modification_date = "2009-07-05T09:24:28"
 
     mocker.patch(
-        "unstructured.partition.pdf.get_last_modified_date",
+        "unstructured.partition.pdf_image.pdf.get_last_modified_date",
        return_value=mocked_last_modification_date,
     )
     elements = image.partition_image(
@@ -370,7 +370,7 @@ def test_partition_image_from_file_metadata_date(
 ):
     mocked_last_modification_date = "2029-07-05T09:24:28"
     mocker.patch(
-        "unstructured.partition.pdf.get_last_modified_date_from_file",
+        "unstructured.partition.pdf_image.pdf.get_last_modified_date_from_file",
         return_value=mocked_last_modification_date,
     )
     with open(filename, "rb") as f:
@@ -385,7 +385,7 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date(
 ):
     mocked_last_modification_date = "2029-07-05T09:24:28"
     mocker.patch(
-        "unstructured.partition.pdf.get_last_modified_date_from_file",
+        "unstructured.partition.pdf_image.pdf.get_last_modified_date_from_file",
         return_value=mocked_last_modification_date,
     )
@@ -403,7 +403,7 @@ def test_partition_image_from_file_metadata_date_custom_metadata_date(
     expected_last_modification_date = "2009-07-05T09:24:28"
 
     mocker.patch(
-        "unstructured.partition.pdf.get_last_modified_date_from_file",
+        "unstructured.partition.pdf_image.pdf.get_last_modified_date_from_file",
         return_value=mocked_last_modification_date,
     )
     with open(filename, "rb") as f:
@@ -423,7 +423,7 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_met
     expected_last_modification_date = "2009-07-05T09:24:28"
 
     mocker.patch(
-        "unstructured.partition.pdf.get_last_modified_date_from_file",
+        "unstructured.partition.pdf_image.pdf.get_last_modified_date_from_file",
         return_value=mocked_last_modification_date,
     )
     with open(filename, "rb") as f:
@@ -479,7 +479,7 @@ def test_partition_image_with_ocr_coordinates_are_not_nan_from_filename(
 def test_partition_image_formats_languages_for_tesseract():
     filename = "example-docs/jpn-vert.jpeg"
     with mock.patch(
-        "unstructured.partition.ocr.process_file_with_ocr",
+        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
     ) as mock_process_file_with_ocr:
         image.partition_image(
             filename=filename, strategy=PartitionStrategy.HI_RES, languages=["jpn_vert"]
@@ -592,7 +592,6 @@ def inference_results():
     page = layout.PageLayout(
         number=1,
         image=mock.MagicMock(format="JPEG"),
-        layout=layout.TextRegion.from_coords(0, 0, 600, 800, text="hello"),
     )
     page.elements = [layout.LayoutElement.from_coords(0, 0, 600, 800, text="hello")]
     doc = layout.DocumentLayout(pages=[page])
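The mock-path churn above follows from how `unittest.mock` resolves targets: a patch string must name the module where the attribute is looked up at call time, so moving `pdf.py` and `ocr.py` into `pdf_image` forces every patch target to move with them. A minimal, hypothetical illustration using pytest-mock's `mocker`, as the tests above do:

```python
# Hypothetical test sketch: the patch target must track the module's new home.
def test_patch_target_tracks_module_location(mocker):
    # The old target, "unstructured.partition.pdf.get_last_modified_date",
    # would now fail to resolve; only the pdf_image path works.
    mocker.patch(
        "unstructured.partition.pdf_image.pdf.get_last_modified_date",
        return_value="2029-07-05T09:24:28",
    )
```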
diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py
index 4d94cd6106..acfc22a3f9 100644
--- a/test_unstructured/partition/pdf_image/test_ocr.py
+++ b/test_unstructured/partition/pdf_image/test_ocr.py
@@ -13,8 +13,8 @@
 )
 
 from unstructured.documents.elements import ElementType
-from unstructured.partition import ocr
-from unstructured.partition.ocr import pad_element_bboxes
+from unstructured.partition.pdf_image import ocr
+from unstructured.partition.pdf_image.ocr import pad_element_bboxes
 from unstructured.partition.utils.constants import (
     OCR_AGENT_PADDLE,
     OCR_AGENT_TESSERACT,
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
index ed973dda96..efd4a9b3a2 100644
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -5,6 +5,7 @@
 from unittest import mock
 
 import pytest
+from pdf2image.exceptions import PDFPageCountError
 from PIL import Image
 from unstructured_inference.inference import layout
 
@@ -19,8 +20,9 @@
     Text,
     Title,
 )
-from unstructured.partition import ocr, pdf, strategies
-from unstructured.partition.pdf import get_uris_from_annots
+from unstructured.partition import strategies
+from unstructured.partition.pdf_image import ocr, pdf, pdfminer_processing
+from unstructured.partition.pdf_image.pdf import get_uris_from_annots
 from unstructured.partition.utils.constants import (
     UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
     PartitionStrategy,
@@ -109,6 +111,16 @@ def test_partition_pdf_local(monkeypatch, filename, file):
         "process_file_with_model",
         lambda *args, **kwargs: MockDocumentLayout(),
     )
+    monkeypatch.setattr(
+        pdfminer_processing,
+        "process_data_with_pdfminer",
+        lambda *args, **kwargs: MockDocumentLayout(),
+    )
+    monkeypatch.setattr(
+        pdfminer_processing,
+        "process_file_with_pdfminer",
+        lambda *args, **kwargs: MockDocumentLayout(),
+    )
     monkeypatch.setattr(
         ocr,
         "process_data_with_ocr",
@@ -116,7 +128,7 @@ def test_partition_pdf_local(monkeypatch, filename, file):
     )
     monkeypatch.setattr(
         ocr,
-        "process_data_with_ocr",
+        "process_file_with_ocr",
         lambda *args, **kwargs: MockDocumentLayout(),
     )
 
@@ -125,7 +137,7 @@ def test_partition_pdf_local(monkeypatch, filename, file):
 
 
 def test_partition_pdf_local_raises_with_no_filename():
-    with pytest.raises(FileNotFoundError):
+    with pytest.raises((FileNotFoundError, PDFPageCountError)):
         pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False)
 
 
@@ -391,7 +403,7 @@ def mock_exists(dep):
 def test_partition_pdf_uses_table_extraction():
     filename = example_doc_path("layout-parser-paper-fast.pdf")
     with mock.patch(
-        "unstructured.partition.ocr.process_file_with_ocr",
+        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
     ) as mock_process_file_with_model:
         pdf.partition_pdf(filename, infer_table_structure=True)
         assert mock_process_file_with_model.call_args[1]["infer_table_structure"]
@@ -633,7 +645,7 @@ def test_partition_pdf_metadata_date(
     )
 
     mocker.patch(
-        "unstructured.partition.pdf.get_the_last_modification_date_pdf_or_img",
+        "unstructured.partition.pdf_image.pdf.get_the_last_modification_date_pdf_or_img",
         return_value=mocked_last_modification_date,
     )
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
index 36a4ee5534..a364076f57 100644
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@@ -330,7 +330,7 @@ def test_auto_partition_pdf_from_filename(pass_metadata_filename, content_type,
 def test_auto_partition_pdf_uses_table_extraction():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
     with patch(
-        "unstructured.partition.ocr.process_file_with_ocr",
+        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
     ) as mock_process_file_with_model:
         partition(filename, pdf_infer_table_structure=True, strategy=PartitionStrategy.HI_RES)
         assert mock_process_file_with_model.call_args[1]["infer_table_structure"]
@@ -390,7 +390,7 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, requ
 def test_auto_partition_formats_languages_for_tesseract():
     filename = "example-docs/chi_sim_image.jpeg"
     with patch(
-        "unstructured.partition.ocr.process_file_with_ocr",
+        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
     ) as mock_process_file_with_ocr:
         partition(filename, strategy=PartitionStrategy.HI_RES, languages=["zh"])
         _, kwargs = mock_process_file_with_ocr.call_args_list[0]
@@ -692,7 +692,10 @@ def test_file_specific_produces_correct_filetype(filetype: FileType):
         extension if filetype not in FILETYPE_TO_MODULE else FILETYPE_TO_MODULE[filetype]
     )
     fun_name = "partition_" + filetype_module
-    module = import_module(f"unstructured.partition.{filetype_module}")  # noqa
+    if filetype_module in ["pdf", "image"]:
+        module = import_module(f"unstructured.partition.pdf_image.{filetype_module}")  # noqa
+    else:
+        module = import_module(f"unstructured.partition.{filetype_module}")  # noqa
     fun = eval(f"module.{fun_name}")
     for file in pathlib.Path("example-docs").iterdir():
         if file.is_file() and file.suffix == f".{extension}":
diff --git a/test_unstructured/partition/test_common.py b/test_unstructured/partition/test_common.py
index 2be9a93fd4..5e9652ff55 100644
--- a/test_unstructured/partition/test_common.py
+++ b/test_unstructured/partition/test_common.py
@@ -451,7 +451,6 @@ def test_document_to_element_list_handles_parent():
     page = PageLayout(
         number=1,
         image=MockImage(),
-        layout=None,
     )
     page.elements = [block1, block2]
     doc = DocumentLayout.from_pages([page])
@@ -477,7 +476,6 @@ def test_document_to_element_list_doesnt_sort_on_sort_method(sort_mode, call_cou
     page = PageLayout(
         number=1,
         image=MockImage(),
-        layout=None,
     )
     page.elements = [block1, block2]
     doc = DocumentLayout.from_pages([page])
diff --git a/test_unstructured/partition/test_strategies.py b/test_unstructured/partition/test_strategies.py
index b8891c3953..f9eab34995 100644
--- a/test_unstructured/partition/test_strategies.py
+++ b/test_unstructured/partition/test_strategies.py
@@ -2,7 +2,8 @@
 
 import pytest
 
-from unstructured.partition import pdf, strategies
+from unstructured.partition import strategies
+from unstructured.partition.pdf_image import pdf
 from unstructured.partition.utils.constants import PartitionStrategy
diff --git a/test_unstructured/partition/utils/test_processing_elements.py b/test_unstructured/partition/utils/test_processing_elements.py
index b7d4843077..78fb1b4996 100644
--- a/test_unstructured/partition/utils/test_processing_elements.py
+++ b/test_unstructured/partition/utils/test_processing_elements.py
@@ -1,8 +1,10 @@
 import pytest
-from unstructured_inference.constants import Source
+from PIL import Image
+from unstructured_inference.constants import Source as InferenceSource
 from unstructured_inference.inference.elements import Rectangle
 from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
 
+from unstructured.partition.utils.constants import Source
 from unstructured.partition.utils.processing_elements import clean_pdfminer_inner_elements
 
 # A set of elements with pdfminer elements inside tables
@@ -23,10 +25,10 @@
         bbox=Rectangle(0, 0, 100, 100),
         text="Table with inner elements",
         type="Table",
-        source=Source.YOLOX,
+        source=InferenceSource.YOLOX,
     ),
-    LayoutElement(bbox=Rectangle(50, 50, 70, 70), text="text1", source=Source.YOLOX),
-    LayoutElement(bbox=Rectangle(70, 70, 80, 80), text="text2", source=Source.YOLOX),
+    LayoutElement(bbox=Rectangle(50, 50, 70, 70), text="text1", source=InferenceSource.YOLOX),
+    LayoutElement(bbox=Rectangle(70, 70, 80, 80), text="text2", source=InferenceSource.YOLOX),
 ]
 # A set of elements with pdfminer elements inside tables and other
 # elements with source=Source.PDFMINER
@@ -36,7 +38,7 @@
         bbox=Rectangle(0, 0, 100, 100),
         text="Table1 with inner elements",
         type="Table",
-        source=Source.YOLOX,
+        source=InferenceSource.YOLOX,
     ),
     LayoutElement(bbox=Rectangle(50, 50, 70, 70), text="Inside table1"),
     LayoutElement(bbox=Rectangle(70, 70, 80, 80), text="Inside table1", source=Source.PDFMINER),
@@ -54,7 +56,7 @@
         bbox=Rectangle(0, 500, 100, 700),
         text="Table2 with inner elements",
         type="Table",
-        source=Source.YOLOX,
+        source=InferenceSource.YOLOX,
     ),
     LayoutElement(bbox=Rectangle(0, 510, 50, 300), text="Inside table2", source=Source.PDFMINER),
     LayoutElement(bbox=Rectangle(0, 550, 70, 400), text="Inside table2", source=Source.PDFMINER),
@@ -71,7 +73,7 @@
 )
 def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_document_length):
     # create a sample document with pdfminer elements inside tables
-    page = PageLayout(number=1, image=None, layout=elements)
+    page = PageLayout(number=1, image=Image.new("1", (1, 1)))
     page.elements = elements
     document_with_table = DocumentLayout(pages=[page])
    document = document_with_table
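The `Source as InferenceSource` alias above exists because two distinct `Source` enums now coexist: the inference library's (used for detection-model regions such as YOLOX) and the new local one, which gains a `PDFMINER` member in the `constants.py` hunk later in this diff. A minimal sketch of how the two are kept apart:

```python
# Both imports appear verbatim in the test diff above.
from unstructured_inference.constants import Source as InferenceSource

from unstructured.partition.utils.constants import Source

# Regions proposed by the detection model keep the inference-side enum...
model_tag = InferenceSource.YOLOX
# ...while regions extracted by pdfminer are tagged with the new local member.
pdfminer_tag = Source.PDFMINER
```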
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index dd12154992..510b9a48ec 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.11.4-dev0"  # pragma: no cover
+__version__ = "0.11.4-dev1"  # pragma: no cover
diff --git a/unstructured/metrics/table_structure.py b/unstructured/metrics/table_structure.py
index ff79114f91..90fca5a7d6 100644
--- a/unstructured/metrics/table_structure.py
+++ b/unstructured/metrics/table_structure.py
@@ -2,7 +2,7 @@
 import pandas as pd
 from PIL import Image
 
-from unstructured.partition.pdf import convert_pdf_to_images
+from unstructured.partition.pdf_image.pdf import convert_pdf_to_images
 from unstructured.utils import requires_dependencies
diff --git a/unstructured/nlp/partition.py b/unstructured/nlp/partition.py
index 881b006357..d362fec0e0 100644
--- a/unstructured/nlp/partition.py
+++ b/unstructured/nlp/partition.py
@@ -1,5 +1,5 @@
 # flake8: noqa
-from unstructured.partition.pdf import partition_pdf  # noqa
+from unstructured.partition.pdf_image.pdf import partition_pdf  # noqa
 from unstructured.partition.text_type import (  # noqa
     is_bulleted_text,
     is_possible_narrative_text,
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
index 16047d7965..eeca1480be 100644
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@@ -78,13 +78,13 @@
 pdf_imports = ["pdf2image", "pdfminer", "PIL"]
 if all(dependency_exists(dep) for dep in pdf_imports):
-    from unstructured.partition.pdf import partition_pdf
+    from unstructured.partition.pdf_image.pdf import partition_pdf
 
     PARTITION_WITH_EXTRAS_MAP["pdf"] = partition_pdf
 
 if dependency_exists("unstructured_inference"):
-    from unstructured.partition.image import partition_image
+    from unstructured.partition.pdf_image.image import partition_image
 
     PARTITION_WITH_EXTRAS_MAP["image"] = partition_image
diff --git a/unstructured/partition/pdf_image/__init__.py b/unstructured/partition/pdf_image/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/unstructured/partition/image.py b/unstructured/partition/pdf_image/image.py
similarity index 98%
rename from unstructured/partition/image.py
rename to unstructured/partition/pdf_image/image.py
index 4986ab26ff..eb0e1cb8d6 100644
--- a/unstructured/partition/image.py
+++ b/unstructured/partition/pdf_image/image.py
@@ -8,7 +8,7 @@
 from unstructured.partition.lang import (
     convert_old_ocr_languages_to_languages,
 )
-from unstructured.partition.pdf import partition_pdf_or_image
+from unstructured.partition.pdf_image.pdf import partition_pdf_or_image
 from unstructured.partition.utils.constants import PartitionStrategy
diff --git a/unstructured/partition/ocr.py b/unstructured/partition/pdf_image/ocr.py
similarity index 100%
rename from unstructured/partition/ocr.py
rename to unstructured/partition/pdf_image/ocr.py
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf_image/pdf.py
similarity index 90%
rename from unstructured/partition/pdf.py
rename to unstructured/partition/pdf_image/pdf.py
index ce566529b9..1d911fc8a4 100644
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf_image/pdf.py
@@ -2,7 +2,6 @@
 import io
 import os
 import re
-import tempfile
 import warnings
 from tempfile import SpooledTemporaryFile
 from typing import (
@@ -21,21 +20,15 @@
 import numpy as np
 import pdf2image
-import pikepdf
-import pypdf
 import wrapt
-from pdfminer.converter import PDFPageAggregator
+from pdfminer import psparser
 from pdfminer.layout import (
-    LAParams,
     LTChar,
     LTContainer,
     LTImage,
     LTItem,
     LTTextBox,
 )
-from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
-from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfparser import PSSyntaxError
 from pdfminer.pdftypes import PDFObjRef
 from pdfminer.utils import open_filename
 from PIL import Image as PILImage
@@ -77,9 +70,9 @@
     check_languages,
     prepare_languages_for_tesseract,
 )
-from unstructured.partition.ocr import (
-    get_layout_elements_from_ocr,
-    get_ocr_agent,
+from unstructured.partition.pdf_image.pdfminer_utils import (
+    open_pdfminer_pages_generator,
+    rect_to_bbox,
 )
 from unstructured.partition.strategies import determine_pdf_or_image_strategy, validate_strategy
 from unstructured.partition.text import element_from_text
@@ -96,11 +89,17 @@
     coord_has_valid_points,
     sort_page_elements,
 )
+from unstructured.patches.pdfminer import parse_keyword
 from unstructured.utils import requires_dependencies
 
 if TYPE_CHECKING:
     pass
 
+
+# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
+# the bug: https://github.com/pdfminer/pdfminer.six/pull/885
+psparser.PSBaseParser._parse_keyword = parse_keyword  # type: ignore
+
 RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)
@@ -353,10 +352,14 @@ def _partition_pdf_or_image_local(
         process_file_with_model,
     )
 
-    from unstructured.partition.ocr import (
+    from unstructured.partition.pdf_image.ocr import (
         process_data_with_ocr,
         process_file_with_ocr,
     )
+    from unstructured.partition.pdf_image.pdfminer_processing import (
+        process_data_with_pdfminer,
+        process_file_with_pdfminer,
+    )
 
     if languages is None:
         languages = ["eng"]
@@ -373,8 +376,7 @@ def _partition_pdf_or_image_local(
     )
 
     if file is None:
-        # NOTE(christine): out_layout = extracted_layout + inferred_layout
-        out_layout = process_file_with_model(
+        inferred_document_layout = process_file_with_model(
             filename,
             is_image=is_image,
             model_name=model_name,
@@ -382,13 +384,21 @@ def _partition_pdf_or_image_local(
             extract_images_in_pdf=extract_images_in_pdf,
             image_output_dir_path=image_output_dir_path,
         )
+
+        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+        merged_document_layout = process_file_with_pdfminer(
+            inferred_document_layout,
+            filename,
+            is_image,
+        )
+
         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
-            final_layout = out_layout
+            final_document_layout = merged_document_layout
         else:
-            final_layout = process_file_with_ocr(
+            final_document_layout = process_file_with_ocr(
                 filename,
-                out_layout,
+                merged_document_layout,
                 is_image=is_image,
                 infer_table_structure=infer_table_structure,
                 ocr_languages=ocr_languages,
@@ -396,7 +406,7 @@ def _partition_pdf_or_image_local(
                 pdf_image_dpi=pdf_image_dpi,
             )
     else:
-        out_layout = process_data_with_model(
+        inferred_document_layout = process_data_with_model(
             file,
             is_image=is_image,
             model_name=model_name,
@@ -404,15 +414,25 @@ def _partition_pdf_or_image_local(
             extract_images_in_pdf=extract_images_in_pdf,
             image_output_dir_path=image_output_dir_path,
         )
+        if hasattr(file, "seek"):
+            file.seek(0)
+
+        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+        merged_document_layout = process_data_with_pdfminer(
+            inferred_document_layout,
+            file,
+            is_image,
+        )
+
         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
-            final_layout = out_layout
+            final_document_layout = merged_document_layout
         else:
             if hasattr(file, "seek"):
                 file.seek(0)
-            final_layout = process_data_with_ocr(
+            final_document_layout = process_data_with_ocr(
                 file,
-                out_layout,
+                merged_document_layout,
                 is_image=is_image,
                 infer_table_structure=infer_table_structure,
                 ocr_languages=ocr_languages,
@@ -424,9 +444,9 @@ def _partition_pdf_or_image_local(
     if model_name == "chipper":
         kwargs["sort_mode"] = SORT_MODE_DONT
 
-    final_layout = clean_pdfminer_inner_elements(final_layout)
+    final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
     elements = document_to_element_list(
-        final_layout,
+        final_document_layout,
         sortable=True,
         include_page_breaks=include_page_breaks,
         last_modification_date=metadata_last_modified,
@@ -545,69 +565,6 @@ def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs):
     return wrapped(resources)
 
 
-def get_page_data(fp: BinaryIO, page_number: int):
-    """Find the binary data for a given page number from a PDF binary file."""
-    pdf_reader = pypdf.PdfReader(fp)
-    pdf_writer = pypdf.PdfWriter()
-    page = pdf_reader.pages[page_number]
-    pdf_writer.add_page(page)
-    page_data = io.BytesIO()
-    pdf_writer.write(page_data)
-    return page_data
-
-
-def _open_pdfminer_pages_generator(
-    fp: BinaryIO,
-):
-    """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs."""
-
-    rsrcmgr = PDFResourceManager()
-    laparams = LAParams()
-    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
-    interpreter = PDFPageInterpreter(rsrcmgr, device)
-    try:
-        i = 0
-        pages = PDFPage.get_pages(fp)
-        # Detect invalid dictionary construct for entire PDF
-        for page in pages:
-            try:
-                # Detect invalid dictionary construct for one page
-                interpreter.process_page(page)
-                page_layout = device.get_result()
-            except PSSyntaxError:
-                logger.info("Detected invalid dictionary construct for PDFminer")
-                logger.info(f"Repairing the PDF page {i+1} ...")
-                # find the error page from binary data fp
-                error_page_data = get_page_data(fp, page_number=i)
-                # repair the error page with pikepdf
-                with tempfile.NamedTemporaryFile() as tmp:
-                    with pikepdf.Pdf.open(error_page_data) as pdf:
-                        pdf.save(tmp.name)
-                    page = next(PDFPage.get_pages(open(tmp.name, "rb")))  # noqa: SIM115
-                    try:
-                        interpreter.process_page(page)
-                        page_layout = device.get_result()
-                    except Exception:
-                        logger.warning(
-                            f"PDFMiner failed to process PDF page {i+1} after repairing it."
-                        )
-                        break
-            i += 1
-            yield page, page_layout
-    except PSSyntaxError:
-        logger.info("Detected invalid dictionary construct for PDFminer")
-        logger.info("Repairing the PDF document ...")
-        # repair the entire doc with pikepdf
-        with tempfile.NamedTemporaryFile() as tmp:
-            with pikepdf.Pdf.open(fp) as pdf:
-                pdf.save(tmp.name)
-            pages = PDFPage.get_pages(open(tmp.name, "rb"))  # noqa: SIM115
-            for page in pages:
-                interpreter.process_page(page)
-                page_layout = device.get_result()
-                yield page, page_layout
-
-
 def _process_pdfminer_pages(
     fp: BinaryIO,
     filename: str,
@@ -620,7 +577,7 @@ def _process_pdfminer_pages(
     """Uses PDFMiner to split a document into pages and process them."""
     elements: List[Element] = []
 
-    for i, (page, page_layout) in enumerate(_open_pdfminer_pages_generator(fp)):
+    for i, (page, page_layout) in enumerate(open_pdfminer_pages_generator(fp)):
         width, height = page_layout.width, page_layout.height
 
         page_elements = []
@@ -842,6 +799,11 @@ def _partition_pdf_or_image_with_ocr_from_image(
 ) -> List[Element]:
     """Extract `unstructured` elements from an image using OCR and perform partitioning."""
 
+    from unstructured.partition.pdf_image.ocr import (
+        get_layout_elements_from_ocr,
+        get_ocr_agent,
+    )
+
     ocr_agent = get_ocr_agent()
     ocr_languages = prepare_languages_for_tesseract(languages)
@@ -1035,29 +997,6 @@ def try_resolve(annot: PDFObjRef):
     return annot
 
 
-def rect_to_bbox(
-    rect: Tuple[float, float, float, float],
-    height: float,
-) -> Tuple[float, float, float, float]:
-    """
-    Converts a PDF rectangle coordinates (x1, y1, x2, y2) to a bounding box in the specified
-    coordinate system where the vertical axis is measured from the top of the page.
-
-    Args:
-        rect (Tuple[float, float, float, float]): A tuple representing a PDF rectangle
-            coordinates (x1, y1, x2, y2).
-        height (float): The height of the page in the specified coordinate system.
-
-    Returns:
-        Tuple[float, float, float, float]: A tuple representing the bounding box coordinates
-            (x1, y1, x2, y2) with the y-coordinates adjusted to be measured from the top of the page.
-    """
-    x1, y2, x2, y1 = rect
-    y1 = height - y1
-    y2 = height - y2
-    return (x1, y1, x2, y2)
-
-
 def calculate_intersection_area(
     bbox1: Tuple[float, float, float, float],
     bbox2: Tuple[float, float, float, float],
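Taken together, the renames above make the `hi_res` flow in `_partition_pdf_or_image_local` read as three explicit stages. A condensed, hedged sketch of the file-path branch, using only function names that appear in this diff (the source module of `process_file_with_model` is assumed from context, and keyword arguments are trimmed):

```python
# Condensed sketch of the refactored hi_res pipeline (file-path branch only).
# Assumes process_file_with_model lives in unstructured_inference, as used by pdf.py.
from unstructured_inference.inference.layout import process_file_with_model

from unstructured.partition.pdf_image.ocr import process_file_with_ocr
from unstructured.partition.pdf_image.pdfminer_processing import process_file_with_pdfminer


def hi_res_layout_sketch(filename: str, model_name: str, is_image: bool = False):
    # Stage 1: the object-detection model proposes a layout.
    inferred_document_layout = process_file_with_model(
        filename, is_image=is_image, model_name=model_name
    )
    # Stage 2: pdfminer's extracted text layout is merged into the inferred layout.
    merged_document_layout = process_file_with_pdfminer(
        inferred_document_layout, filename, is_image
    )
    # Stage 3: OCR fills in remaining text -- except for chipper, which skips OCR.
    if model_name.startswith("chipper"):
        return merged_document_layout
    return process_file_with_ocr(filename, merged_document_layout, is_image=is_image)
```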
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
new file mode 100644
index 0000000000..523cd62be1
--- /dev/null
+++ b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -0,0 +1,131 @@
+from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
+
+from pdfminer.utils import open_filename
+from unstructured_inference.inference.elements import (
+    EmbeddedTextRegion,
+    ImageTextRegion,
+    TextRegion,
+)
+from unstructured_inference.inference.layoutelement import (
+    merge_inferred_layout_with_extracted_layout,
+)
+from unstructured_inference.inference.ordering import order_layout
+from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
+
+from unstructured.partition.pdf_image.pdfminer_utils import (
+    get_images_from_pdf_element,
+    open_pdfminer_pages_generator,
+    rect_to_bbox,
+)
+from unstructured.partition.utils.constants import Source
+
+if TYPE_CHECKING:
+    from unstructured_inference.inference.layout import DocumentLayout
+
+
+def process_file_with_pdfminer(
+    inferred_document_layout: "DocumentLayout",
+    filename: str = "",
+    is_image: bool = False,
+) -> "DocumentLayout":
+    with open_filename(filename, "rb") as fp:
+        fp = cast(BinaryIO, fp)
+        inferred_document_layout = process_data_with_pdfminer(
+            inferred_document_layout=inferred_document_layout,
+            file=fp,
+            is_image=is_image,
+        )
+        return inferred_document_layout
+
+
+def process_data_with_pdfminer(
+    inferred_document_layout: "DocumentLayout",
+    file: Optional[Union[bytes, BinaryIO]] = None,
+    is_image: bool = False,
+) -> "DocumentLayout":
+    if is_image:
+        for page in inferred_document_layout.pages:
+            for el in page.elements:
+                el.text = el.text or ""
+        return inferred_document_layout
+
+    extracted_layouts = get_regions_by_pdfminer(file)
+
+    inferred_pages = inferred_document_layout.pages
+    for i, (inferred_page, extracted_layout) in enumerate(zip(inferred_pages, extracted_layouts)):
+        inferred_layout = inferred_page.elements
+        image_metadata = inferred_page.image_metadata
+        w = image_metadata.get("width")
+        h = image_metadata.get("height")
+        image_size = (w, h)
+
+        threshold_kwargs = {}
+        # NOTE(Benjamin): With this, the thresholds are only changed for detectron2_mask_rcnn;
+        # in other cases the default values for the functions are used
+        if (
+            isinstance(inferred_page.detection_model, UnstructuredDetectronONNXModel)
+            and "R_50" not in inferred_page.detection_model.model_path
+        ):
+            threshold_kwargs = {"same_region_threshold": 0.5, "subregion_threshold": 0.5}
+
+        merged_layout = merge_inferred_layout_with_extracted_layout(
+            inferred_layout=inferred_layout,
+            extracted_layout=extracted_layout,
+            page_image_size=image_size,
+            **threshold_kwargs,
+        )
+
+        elements = inferred_page.get_elements_from_layout(
+            layout=cast(List[TextRegion], merged_layout),
+            pdf_objects=extracted_layout,
+        )
+
+        inferred_page.elements[:] = elements
+
+    return inferred_document_layout
+
+
+def get_regions_by_pdfminer(
+    fp: Optional[Union[bytes, BinaryIO]],
+    dpi: int = 200,
+) -> List[List[TextRegion]]:
+    """Loads the text and image objects from a PDF using pdfminer and rescales their bounding
+    boxes to match the DPI at which the page images are rendered."""
+
+    layouts = []
+    # Coefficient to rescale bounding box to be compatible with images
+    coef = dpi / 72
+    for i, (page, page_layout) in enumerate(open_pdfminer_pages_generator(fp)):
+        height = page_layout.height
+
+        layout: List["TextRegion"] = []
+        for obj in page_layout:
+            x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
+
+            if hasattr(obj, "get_text"):
+                _text = obj.get_text()
+                element_class = EmbeddedTextRegion  # type: ignore
+            else:
+                embedded_images = get_images_from_pdf_element(obj)
+                if len(embedded_images) > 0:
+                    _text = None
+                    element_class = ImageTextRegion  # type: ignore
+                else:
+                    continue
+
+            text_region = element_class.from_coords(
+                x1 * coef,
+                y1 * coef,
+                x2 * coef,
+                y2 * coef,
+                text=_text,
+                source=Source.PDFMINER,
+            )
+
+            if text_region.bbox is not None and text_region.bbox.area > 0:
+                layout.append(text_region)
+
+        layout = order_layout(layout)
+        layouts.append(layout)
+
+    return layouts
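`get_regions_by_pdfminer` rescales pdfminer's point-based coordinates (72 points per inch in PDF user space) onto the pixel grid of the rendered page images, hence `coef = dpi / 72`. A worked example at the default `dpi=200`:

```python
# pdfminer reports PDF user-space coordinates in points (1/72 inch).
# The inferred layout lives in pixels of the page image rendered at `dpi`.
dpi = 200
coef = dpi / 72  # ~2.778

# A one-inch square whose top-left sits one inch into the page...
x1, y1, x2, y2 = 72.0, 72.0, 144.0, 144.0
pixels = tuple(round(v * coef) for v in (x1, y1, x2, y2))
print(pixels)  # (200, 200, 400, 400) -- a 200 px square, as expected at 200 dpi
```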
diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py
new file mode 100644
index 0000000000..5c50bff88c
--- /dev/null
+++ b/unstructured/partition/pdf_image/pdfminer_utils.py
@@ -0,0 +1,128 @@
+import tempfile
+from typing import Any, BinaryIO, List, Tuple
+
+import pikepdf
+from pdfminer.converter import PDFPageAggregator
+from pdfminer.layout import LAParams, LTContainer, LTImage
+from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfparser import PSSyntaxError
+
+from unstructured.logger import logger
+from unstructured.partition.pdf_image.pypdf_utils import get_page_data
+
+
+def init_pdfminer():
+    rsrcmgr = PDFResourceManager()
+    laparams = LAParams()
+    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+    interpreter = PDFPageInterpreter(rsrcmgr, device)
+
+    return device, interpreter
+
+
+def get_images_from_pdf_element(layout_object: Any) -> List[LTImage]:
+    """
+    Recursively extracts LTImage objects from a PDF layout element.
+
+    This function takes a PDF layout element (could be LTImage or LTContainer) and recursively
+    extracts all LTImage objects contained within it.
+
+    Parameters:
+    - layout_object (Any): The PDF layout element to extract images from.
+
+    Returns:
+    - List[LTImage]: A list of LTImage objects extracted from the layout object.
+
+    Note:
+    - This function recursively traverses through the layout_object to find and accumulate all
+      LTImage objects.
+    - If the input layout_object is an LTImage, it will be included in the returned list.
+    - If the input layout_object is an LTContainer, the function will recursively search its
+      children for LTImage objects.
+    - If the input layout_object is neither LTImage nor LTContainer, an empty list will be
+      returned.
+    """
+
+    # recursively locate Image objects in layout_object
+    if isinstance(layout_object, LTImage):
+        return [layout_object]
+    if isinstance(layout_object, LTContainer):
+        img_list: List[LTImage] = []
+        for child in layout_object:
+            img_list = img_list + get_images_from_pdf_element(child)
+        return img_list
+    else:
+        return []
+
+
+def rect_to_bbox(
+    rect: Tuple[float, float, float, float],
+    height: float,
+) -> Tuple[float, float, float, float]:
+    """
+    Converts a PDF rectangle coordinates (x1, y1, x2, y2) to a bounding box in the specified
+    coordinate system where the vertical axis is measured from the top of the page.
+
+    Args:
+        rect (Tuple[float, float, float, float]): A tuple representing a PDF rectangle
+            coordinates (x1, y1, x2, y2).
+        height (float): The height of the page in the specified coordinate system.
+
+    Returns:
+        Tuple[float, float, float, float]: A tuple representing the bounding box coordinates
+            (x1, y1, x2, y2) with the y-coordinates adjusted to be measured from the top of
+            the page.
+    """
+    x1, y2, x2, y1 = rect
+    y1 = height - y1
+    y2 = height - y2
+    return (x1, y1, x2, y2)
+
+
+def open_pdfminer_pages_generator(
+    fp: BinaryIO,
+):
+    """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs."""
+
+    device, interpreter = init_pdfminer()
+    try:
+        i = 0
+        pages = PDFPage.get_pages(fp)
+        # Detect invalid dictionary construct for entire PDF
+        for page in pages:
+            try:
+                # Detect invalid dictionary construct for one page
+                interpreter.process_page(page)
+                page_layout = device.get_result()
+            except PSSyntaxError:
+                logger.info("Detected invalid dictionary construct for PDFminer")
+                logger.info(f"Repairing the PDF page {i+1} ...")
+                # find the error page from binary data fp
+                error_page_data = get_page_data(fp, page_number=i)
+                # repair the error page with pikepdf
+                with tempfile.NamedTemporaryFile() as tmp:
+                    with pikepdf.Pdf.open(error_page_data) as pdf:
+                        pdf.save(tmp.name)
+                    page = next(PDFPage.get_pages(open(tmp.name, "rb")))  # noqa: SIM115
+                    try:
+                        interpreter.process_page(page)
+                        page_layout = device.get_result()
+                    except Exception:
+                        logger.warning(
+                            f"PDFMiner failed to process PDF page {i+1} after repairing it."
+                        )
+                        break
+            i += 1
+            yield page, page_layout
+    except PSSyntaxError:
+        logger.info("Detected invalid dictionary construct for PDFminer")
+        logger.info("Repairing the PDF document ...")
+        # repair the entire doc with pikepdf
+        with tempfile.NamedTemporaryFile() as tmp:
+            with pikepdf.Pdf.open(fp) as pdf:
+                pdf.save(tmp.name)
+            pages = PDFPage.get_pages(open(tmp.name, "rb"))  # noqa: SIM115
+            for page in pages:
+                interpreter.process_page(page)
+                page_layout = device.get_result()
+                yield page, page_layout
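`rect_to_bbox` flips the vertical axis: PDF user space puts the origin at the bottom-left, while the layout code measures y from the top of the page. Note the deliberate unpacking order `x1, y2, x2, y1 = rect` -- the PDF rect's larger y-value is the visually higher edge, so the two y-values swap roles during the flip. A small worked example:

```python
# On an 800pt-tall page, a rect spanning y=700..750 in PDF space
# (near the top of the page) maps to y=50..100 measured from the top.
rect = (10.0, 700.0, 60.0, 750.0)  # (x1, y1, x2, y2) in PDF coordinates
height = 800.0

x1, y2, x2, y1 = rect               # y-values swap roles for the flip
y1, y2 = height - y1, height - y2   # y1=50.0 (top edge), y2=100.0 (bottom edge)
print((x1, y1, x2, y2))             # (10.0, 50.0, 60.0, 100.0)
```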
diff --git a/unstructured/partition/pdf_image/pypdf_utils.py b/unstructured/partition/pdf_image/pypdf_utils.py
new file mode 100644
index 0000000000..b03fe72856
--- /dev/null
+++ b/unstructured/partition/pdf_image/pypdf_utils.py
@@ -0,0 +1,15 @@
+import io
+from typing import BinaryIO
+
+import pypdf
+
+
+def get_page_data(fp: BinaryIO, page_number: int):
+    """Find the binary data for a given page number from a PDF binary file."""
+    pdf_reader = pypdf.PdfReader(fp)
+    pdf_writer = pypdf.PdfWriter()
+    page = pdf_reader.pages[page_number]
+    pdf_writer.add_page(page)
+    page_data = io.BytesIO()
+    pdf_writer.write(page_data)
+    return page_data
diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py
index 54b596048b..a88b555be0 100644
--- a/unstructured/partition/utils/constants.py
+++ b/unstructured/partition/utils/constants.py
@@ -3,6 +3,7 @@
 
 class Source(Enum):
+    PDFMINER = "pdfminer"
     OCR_TESSERACT = "ocr_tesseract"
     OCR_PADDLE = "ocr_paddle"
diff --git a/unstructured/partition/utils/processing_elements.py b/unstructured/partition/utils/processing_elements.py
index 95289a41ca..7fb76be42a 100644
--- a/unstructured/partition/utils/processing_elements.py
+++ b/unstructured/partition/utils/processing_elements.py
@@ -1,8 +1,9 @@
 from collections import defaultdict
 
-from unstructured_inference.constants import Source
 from unstructured_inference.inference.layout import DocumentLayout
 
+from unstructured.partition.utils.constants import Source
+
 
 def clean_pdfminer_inner_elements(document: DocumentLayout) -> DocumentLayout:
     """Clean pdfminer elements from inside tables and stores them in extra_info dictionary
diff --git a/unstructured/patches/__init__.py b/unstructured/patches/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/unstructured/patches/pdfminer.py b/unstructured/patches/pdfminer.py
new file mode 100644
index 0000000000..20b938d1ce
--- /dev/null
+++ b/unstructured/patches/pdfminer.py
@@ -0,0 +1,24 @@
+from typing import Union
+
+from pdfminer.psparser import END_KEYWORD, KWD, PSBaseParser, PSKeyword
+
+
+def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
+    """Patch for pdfminer method _parse_keyword of PSBaseParser. Changes are identical to the PR
+    https://github.com/pdfminer/pdfminer.six/pull/885."""
+    m = END_KEYWORD.search(s, i)
+    if not m:
+        j = len(s)
+        self._curtoken += s[i:]
+    else:
+        j = m.start(0)
+        self._curtoken += s[i:j]
+    if self._curtoken == b"true":
+        token: Union[bool, PSKeyword] = True
+    elif self._curtoken == b"false":
+        token = False
+    else:
+        token = KWD(self._curtoken)
+    self._add_token(token)
+    self._parse1 = self._parse_main
+    return j
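For context, `parse_keyword` is wired in at import time in `pdf_image/pdf.py` (see the `psparser.PSBaseParser._parse_keyword = parse_keyword` line earlier in this diff). The monkey-patch in miniature:

```python
# How the patch above is applied (mirrors the assignment in pdf_image/pdf.py).
from pdfminer import psparser

from unstructured.patches.pdfminer import parse_keyword

# Replace the buggy method on the class so every parser instance picks it up;
# the replacement mirrors pdfminer.six PR #885.
psparser.PSBaseParser._parse_keyword = parse_keyword  # type: ignore
```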