diff --git a/CHANGELOG.md b/CHANGELOG.md
index b8bc339c9d..ea2d19c906 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,9 @@
-## 0.11.4-dev0
+## 0.11.4-dev1
 
 ### Enhancements
 
+* **Refactor pdfminer code.** The pdfminer code is moved from `unstructured-inference` to `unstructured`.
+
 ### Features
 
 ### Fixes
@@ -23,8 +25,8 @@
 ## 0.11.1
 
 ### Enhancements
 
-* **Use `pikepdf` to repair invalid PDF structure** for PDFminer when we see error `PSSyntaxError` when PDFminer opens the document and creates the PDFminer pages object or processes a single PDF page.
+* **Use `pikepdf` to repair invalid PDF structure** for PDFminer when we see the error `PSSyntaxError` while PDFminer opens the document and creates the PDFminer pages object, or while it processes a single PDF page.
 * **Batch Source Connector support** For instances where it is more optimal to read content from a source connector in batches, a new batch ingest doc is added which creates multiple ingest docs after reading them in batches per process.
 
 ### Features
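For reference, the user-visible effect of the refactor entry above is the new `pdf_image` subpackage. A minimal sketch of the import-path migration (the example file name is borrowed from the test fixtures elsewhere in this diff; the old paths are shown only as comments):

```python
# Sketch of the import-path change introduced by this refactor.
# Before: from unstructured.partition.pdf import partition_pdf
# Before: from unstructured.partition.image import partition_image
from unstructured.partition.pdf_image.image import partition_image
from unstructured.partition.pdf_image.pdf import partition_pdf

# partition_pdf keeps its existing signature; only the module path moved.
elements = partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf")
```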
diff --git a/docs/requirements.txt b/docs/requirements.txt
index a1cb4e31be..129ce79f3d 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -36,7 +36,7 @@ idna==3.6
     #   requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==6.8.0
+importlib-metadata==6.9.0
     # via sphinx
 jinja2==3.1.2
     # via
diff --git a/examples/custom-layout-order/evaluate_natural_reading_order.py b/examples/custom-layout-order/evaluate_natural_reading_order.py
index ed4f6c09dd..165841957f 100644
--- a/examples/custom-layout-order/evaluate_natural_reading_order.py
+++ b/examples/custom-layout-order/evaluate_natural_reading_order.py
@@ -8,7 +8,7 @@
 from PIL import Image
 
 from unstructured.documents.elements import PageBreak
-from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.pdf_image.pdf import partition_pdf
 from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT
 from unstructured.partition.utils.xycut import (
     bbox2points,
diff --git a/examples/layout-analysis/visualization.py b/examples/layout-analysis/visualization.py
index a13bb930cb..3906ea0a1e 100644
--- a/examples/layout-analysis/visualization.py
+++ b/examples/layout-analysis/visualization.py
@@ -7,7 +7,7 @@
 from unstructured_inference.visualize import draw_bbox
 
 from unstructured.documents.elements import PageBreak
-from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.pdf_image.pdf import partition_pdf
 
 CUR_DIR = pathlib.Path(__file__).parent.resolve()
diff --git a/requirements/build.txt b/requirements/build.txt
index a1cb4e31be..129ce79f3d 100644
--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -36,7 +36,7 @@ idna==3.6
     #   requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==6.8.0
+importlib-metadata==6.9.0
     # via sphinx
 jinja2==3.1.2
     # via
diff --git a/requirements/dev.txt b/requirements/dev.txt
index 52e9747700..c5592adbd0 100644
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -91,7 +91,7 @@ idna==3.6
     #   anyio
     #   jsonschema
     #   requests
-importlib-metadata==6.8.0
+importlib-metadata==6.9.0
     # via
     #   build
     #   jupyter-client
@@ -138,7 +138,7 @@ jsonschema[format-nongpl]==4.20.0
     #   jupyter-events
     #   jupyterlab-server
     #   nbformat
-jsonschema-specifications==2023.11.1
+jsonschema-specifications==2023.11.2
     # via jsonschema
 jupyter==1.0.0
     # via -r dev.in
@@ -301,7 +301,7 @@ qtconsole==5.5.1
     # via jupyter
 qtpy==2.4.1
     # via qtconsole
-referencing==0.31.0
+referencing==0.31.1
     # via
     #   jsonschema
     #   jsonschema-specifications
@@ -319,7 +319,7 @@ rfc3986-validator==0.1.1
     # via
     #   jsonschema
     #   jupyter-events
-rpds-py==0.13.1
+rpds-py==0.13.2
     # via
     #   jsonschema
     #   referencing
@@ -354,7 +354,7 @@ tomli==2.0.1
     #   jupyterlab
     #   pip-tools
     #   pyproject-hooks
-tornado==6.3.3
+tornado==6.4
     # via
     #   ipykernel
     #   jupyter-client
@@ -395,7 +395,7 @@ urllib3==1.26.18
     #   -c constraints.in
     #   -c test.txt
     #   requests
-virtualenv==20.24.7
+virtualenv==20.25.0
     # via pre-commit
 wcwidth==0.2.12
     # via prompt-toolkit
diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt
index cf0b1f2c08..940336c7ca 100644
--- a/requirements/extra-markdown.txt
+++ b/requirements/extra-markdown.txt
@@ -4,7 +4,7 @@
 #
 # pip-compile --output-file=extra-markdown.txt extra-markdown.in
 #
-importlib-metadata==6.8.0
+importlib-metadata==6.9.0
     # via markdown
 markdown==3.5.1
     # via -r extra-markdown.in
diff --git a/requirements/extra-msg.txt b/requirements/extra-msg.txt
index 525809363e..0b4a6bfb15 100644
--- a/requirements/extra-msg.txt
+++ b/requirements/extra-msg.txt
@@ -6,5 +6,5 @@
 #
 msg-parser==1.2.0
     # via -r extra-msg.in
-olefile==0.46
+olefile==0.47
     # via msg-parser
diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt
index 4999f5d3b6..896e548897 100644
--- a/requirements/extra-paddleocr.txt
+++ b/requirements/extra-paddleocr.txt
@@ -59,7 +59,7 @@ imageio==2.33.0
     #   scikit-image
 imgaug==0.4.0
     # via unstructured-paddleocr
-importlib-metadata==6.8.0
+importlib-metadata==6.9.0
     # via flask
 importlib-resources==6.1.1
     # via matplotlib
diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in
index bcccc2aceb..4ccf33f804 100644
--- a/requirements/extra-pdf-image.in
+++ b/requirements/extra-pdf-image.in
@@ -8,7 +8,7 @@ pikepdf
 pypdf
 # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
 # when unstructured library is.
-unstructured-inference==0.7.15
+unstructured-inference==0.7.17
 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats
 # from one tesseract call
 unstructured.pytesseract>=0.3.12
diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt
index 9a5ae381a7..dc7f2ec7a3 100644
--- a/requirements/extra-pdf-image.txt
+++ b/requirements/extra-pdf-image.txt
@@ -250,7 +250,7 @@ typing-extensions==4.8.0
     #   torch
 tzdata==2023.3
     # via pandas
-unstructured-inference==0.7.15
+unstructured-inference==0.7.17
     # via -r extra-pdf-image.in
 unstructured-pytesseract==0.3.12
     # via
diff --git a/requirements/ingest/airtable.txt b/requirements/ingest/airtable.txt
index 0560b94e23..d9145af2be 100644
--- a/requirements/ingest/airtable.txt
+++ b/requirements/ingest/airtable.txt
@@ -19,7 +19,7 @@ idna==3.6
     #   requests
 inflection==0.5.1
     # via pyairtable
-pyairtable==2.2.0
+pyairtable==2.2.1
     # via -r ingest/airtable.in
 pydantic==1.10.13
     # via
diff --git a/requirements/ingest/azure.txt b/requirements/ingest/azure.txt
index 779d423308..c54b4e0aeb 100644
--- a/requirements/ingest/azure.txt
+++ b/requirements/ingest/azure.txt
@@ -76,9 +76,7 @@ portalocker==2.8.2
 pycparser==2.21
     # via cffi
 pyjwt[crypto]==2.8.0
-    # via
-    #   msal
-    #   pyjwt
+    # via msal
 requests==2.31.0
     # via
     #   -c ingest/../base.txt
diff --git a/requirements/ingest/box.txt b/requirements/ingest/box.txt
index aec2ef132f..506a27c5de 100644
--- a/requirements/ingest/box.txt
+++ b/requirements/ingest/box.txt
@@ -9,9 +9,7 @@ attrs==23.1.0
 boxfs==0.2.1
     # via -r ingest/box.in
 boxsdk[jwt]==3.9.2
-    # via
-    #   boxfs
-    #   boxsdk
+    # via boxfs
 certifi==2023.11.17
     # via
     #   -c ingest/../base.txt
diff --git a/requirements/ingest/confluence.txt b/requirements/ingest/confluence.txt
index 9121adfb5e..4d37f5b9e2 100644
--- a/requirements/ingest/confluence.txt
+++ b/requirements/ingest/confluence.txt
@@ -4,7 +4,7 @@
 #
 # pip-compile --output-file=ingest/confluence.txt ingest/confluence.in
 #
-atlassian-python-api==3.41.3
+atlassian-python-api==3.41.4
     # via -r ingest/confluence.in
 certifi==2023.11.17
     # via
diff --git a/requirements/ingest/embed-aws-bedrock.txt b/requirements/ingest/embed-aws-bedrock.txt
index 50e4c401ce..e155405f14 100644
--- a/requirements/ingest/embed-aws-bedrock.txt
+++ b/requirements/ingest/embed-aws-bedrock.txt
@@ -46,6 +46,8 @@ frozenlist==1.4.0
     # via
     #   aiohttp
     #   aiosignal
+greenlet==3.0.1
+    # via sqlalchemy
 idna==3.6
     # via
     #   -c ingest/../base.txt
@@ -62,11 +64,11 @@ jsonpatch==1.33
     #   langchain-core
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.341
+langchain==0.0.344
     # via -r ingest/embed-aws-bedrock.in
-langchain-core==0.0.6
+langchain-core==0.0.8
     # via langchain
-langsmith==0.0.67
+langsmith==0.0.68
     # via
     #   langchain
     #   langchain-core
diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt
index 650134007b..771a613391 100644
--- a/requirements/ingest/embed-huggingface.txt
+++ b/requirements/ingest/embed-huggingface.txt
@@ -51,6 +51,8 @@ fsspec==2023.9.1
     #   -c ingest/../constraints.in
     #   huggingface-hub
     #   torch
+greenlet==3.0.1
+    # via sqlalchemy
 huggingface==0.0.1
     # via -r ingest/embed-huggingface.in
 huggingface-hub==0.19.4
@@ -77,11 +79,11 @@ jsonpatch==1.33
     #   langchain-core
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.341
+langchain==0.0.344
     # via -r ingest/embed-huggingface.in
-langchain-core==0.0.6
+langchain-core==0.0.8
     # via langchain
-langsmith==0.0.67
+langsmith==0.0.68
     # via
     #   langchain
     #   langchain-core
diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt
index 6798574d0e..0486bc927d 100644
--- a/requirements/ingest/embed-openai.txt
+++ b/requirements/ingest/embed-openai.txt
@@ -43,6 +43,8 @@ frozenlist==1.4.0
     # via
     #   aiohttp
     #   aiosignal
+greenlet==3.0.1
+    # via sqlalchemy
 h11==0.14.0
     # via httpcore
 httpcore==1.0.2
@@ -62,11 +64,11 @@ jsonpatch==1.33
     #   langchain-core
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.341
+langchain==0.0.344
     # via -r ingest/embed-openai.in
-langchain-core==0.0.6
+langchain-core==0.0.8
     # via langchain
-langsmith==0.0.67
+langsmith==0.0.68
     # via
     #   langchain
     #   langchain-core
@@ -87,7 +89,7 @@ numpy==1.24.4
     #   -c ingest/../base.txt
     #   -c ingest/../constraints.in
     #   langchain
-openai==1.3.5
+openai==1.3.7
     # via -r ingest/embed-openai.in
 packaging==23.2
     # via
@@ -116,6 +118,7 @@ sniffio==1.3.0
     # via
     #   anyio
     #   httpx
+    #   openai
 sqlalchemy==2.0.23
     # via langchain
 tenacity==8.2.3
diff --git a/requirements/ingest/gcs.txt b/requirements/ingest/gcs.txt
index 8d800668af..171f91313f 100644
--- a/requirements/ingest/gcs.txt
+++ b/requirements/ingest/gcs.txt
@@ -46,7 +46,7 @@ google-api-core==2.14.0
     # via
     #   google-cloud-core
     #   google-cloud-storage
-google-auth==2.23.4
+google-auth==2.24.0
     # via
     #   gcsfs
     #   google-api-core
diff --git a/requirements/ingest/github.txt b/requirements/ingest/github.txt
index 736b8b9ee6..f39ad3cd32 100644
--- a/requirements/ingest/github.txt
+++ b/requirements/ingest/github.txt
@@ -30,9 +30,7 @@ pycparser==2.21
 pygithub==2.1.1
     # via -r ingest/github.in
 pyjwt[crypto]==2.8.0
-    # via
-    #   pygithub
-    #   pyjwt
+    # via pygithub
 pynacl==1.5.0
     # via pygithub
 python-dateutil==2.8.2
diff --git a/requirements/ingest/google-drive.txt b/requirements/ingest/google-drive.txt
index 882a3bcf42..49912a2083 100644
--- a/requirements/ingest/google-drive.txt
+++ b/requirements/ingest/google-drive.txt
@@ -17,9 +17,9 @@ charset-normalizer==3.3.2
     #   requests
 google-api-core==2.14.0
     # via google-api-python-client
-google-api-python-client==2.108.0
+google-api-python-client==2.109.0
     # via -r ingest/google-drive.in
-google-auth==2.23.4
+google-auth==2.24.0
     # via
     #   google-api-core
     #   google-api-python-client
diff --git a/requirements/ingest/hubspot.txt b/requirements/ingest/hubspot.txt
index 95b073e691..d67982d3d0 100644
--- a/requirements/ingest/hubspot.txt
+++ b/requirements/ingest/hubspot.txt
@@ -2,19 +2,19 @@
 # This file is autogenerated by pip-compile with Python 3.8
 # by the following command:
 #
-# pip-compile requirements/ingest-hubspot.in
+# pip-compile --output-file=ingest/hubspot.txt ingest/hubspot.in
 #
-certifi==2023.7.22
+certifi==2023.11.17
     # via hubspot-api-client
 hubspot-api-client==8.1.1
-    # via -r requirements/ingest-hubspot.in
+    # via -r ingest/hubspot.in
 python-dateutil==2.8.2
     # via hubspot-api-client
 six==1.16.0
     # via
     #   hubspot-api-client
     #   python-dateutil
-urllib3==1.26.17
+urllib3==2.1.0
     # via
-    #   -r requirements/ingest-hubspot.in
+    #   -r ingest/hubspot.in
     #   hubspot-api-client
diff --git a/requirements/ingest/jira.txt b/requirements/ingest/jira.txt
index 3ab059f559..0b77cb9945 100644
--- a/requirements/ingest/jira.txt
+++ b/requirements/ingest/jira.txt
@@ -4,7 +4,7 @@
 #
 # pip-compile --output-file=ingest/jira.txt ingest/jira.in
 #
-atlassian-python-api==3.41.3
+atlassian-python-api==3.41.4
     # via -r ingest/jira.in
 certifi==2023.11.17
     # via
diff --git a/requirements/ingest/mongodb.txt b/requirements/ingest/mongodb.txt
index e193ef4e0d..d395eb0abb 100644
--- a/requirements/ingest/mongodb.txt
+++ b/requirements/ingest/mongodb.txt
@@ -6,5 +6,5 @@
 #
 dnspython==2.4.2
     # via pymongo
-pymongo==4.6.0
+pymongo==4.6.1
     # via -r ingest/mongodb.in
diff --git a/requirements/ingest/onedrive.txt b/requirements/ingest/onedrive.txt
index 99930f62ca..155fdcb36a 100644
--- a/requirements/ingest/onedrive.txt
+++ b/requirements/ingest/onedrive.txt
@@ -40,9 +40,7 @@ office365-rest-python-client==2.4.2
 pycparser==2.21
     # via cffi
 pyjwt[crypto]==2.8.0
-    # via
-    #   msal
-    #   pyjwt
+    # via msal
 pytz==2023.3.post1
     # via office365-rest-python-client
 requests==2.31.0
diff --git a/requirements/ingest/outlook.txt b/requirements/ingest/outlook.txt
index becea4ed3a..03aa4ffdd0 100644
--- a/requirements/ingest/outlook.txt
+++ b/requirements/ingest/outlook.txt
@@ -34,9 +34,7 @@ office365-rest-python-client==2.4.2
 pycparser==2.21
     # via cffi
 pyjwt[crypto]==2.8.0
-    # via
-    #   msal
-    #   pyjwt
+    # via msal
 pytz==2023.3.post1
     # via office365-rest-python-client
 requests==2.31.0
diff --git a/requirements/ingest/pinecone.in b/requirements/ingest/pinecone.in
index 939f61e6d4..ebaedb531b 100644
--- a/requirements/ingest/pinecone.in
+++ b/requirements/ingest/pinecone.in
@@ -1,3 +1,3 @@
--c constraints.in
--c base.txt
+-c ../constraints.in
+-c ../base.txt
 pinecone-client
diff --git a/requirements/ingest/pinecone.txt b/requirements/ingest/pinecone.txt
index 19c30cdeff..ffa46e75ce 100644
--- a/requirements/ingest/pinecone.txt
+++ b/requirements/ingest/pinecone.txt
@@ -1,56 +1,58 @@
 #
-# This file is autogenerated by pip-compile with Python 3.10
+# This file is autogenerated by pip-compile with Python 3.8
 # by the following command:
 #
-# pip-compile requirements/ingest-pinecone.in
+# pip-compile --output-file=ingest/pinecone.txt ingest/pinecone.in
 #
-certifi==2023.7.22
+certifi==2023.11.17
     # via
-    #   -c requirements/base.txt
-    #   -c requirements/constraints.in
+    #   -c ingest/../base.txt
+    #   -c ingest/../constraints.in
     #   requests
-charset-normalizer==3.3.0
+charset-normalizer==3.3.2
     # via
-    #   -c requirements/base.txt
+    #   -c ingest/../base.txt
     #   requests
 dnspython==2.4.2
     # via pinecone-client
-idna==3.4
+idna==3.6
     # via
-    #   -c requirements/base.txt
+    #   -c ingest/../base.txt
     #   requests
 loguru==0.7.2
     # via pinecone-client
 numpy==1.24.4
     # via
-    #   -c requirements/base.txt
-    #   -c requirements/constraints.in
+    #   -c ingest/../base.txt
+    #   -c ingest/../constraints.in
     #   pinecone-client
 pinecone-client==2.2.4
-    # via -r requirements/ingest-pinecone.in
+    # via -r ingest/pinecone.in
 python-dateutil==2.8.2
-    # via pinecone-client
+    # via
+    #   -c ingest/../base.txt
+    #   pinecone-client
 pyyaml==6.0.1
     # via pinecone-client
 requests==2.31.0
     # via
-    #   -c requirements/base.txt
+    #   -c ingest/../base.txt
     #   pinecone-client
 six==1.16.0
     # via
-    #   -c requirements/base.txt
+    #   -c ingest/../base.txt
     #   python-dateutil
 tqdm==4.66.1
     # via
-    #   -c requirements/base.txt
+    #   -c ingest/../base.txt
     #   pinecone-client
 typing-extensions==4.8.0
     # via
-    #   -c requirements/base.txt
+    #   -c ingest/../base.txt
     #   pinecone-client
 urllib3==1.26.18
     # via
-    #   -c requirements/base.txt
-    #   -c requirements/constraints.in
+    #   -c ingest/../base.txt
+    #   -c ingest/../constraints.in
     #   pinecone-client
     #   requests
diff --git a/requirements/ingest/sharepoint.txt b/requirements/ingest/sharepoint.txt
index 82f58e365c..d49b89c227 100644
--- a/requirements/ingest/sharepoint.txt
+++ b/requirements/ingest/sharepoint.txt
@@ -34,9 +34,7 @@ office365-rest-python-client==2.4.2
 pycparser==2.21
     # via cffi
 pyjwt[crypto]==2.8.0
-    # via
-    #   msal
-    #   pyjwt
+    # via msal
 pytz==2023.3.post1
     # via office365-rest-python-client
 requests==2.31.0
diff --git a/setup.cfg b/setup.cfg
index 85c0e27436..5b463cff97 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -5,6 +5,7 @@ license_files = LICENSE.md
 max-line-length = 100
 exclude =
     .venv
+    unstructured-inference
 
 [tool:pytest]
 filterwarnings =
diff --git a/test_unstructured/partition/pdf_image/test_chipper.py b/test_unstructured/partition/pdf_image/test_chipper.py
index d625f97876..469362ade8 100644
--- a/test_unstructured/partition/pdf_image/test_chipper.py
+++ b/test_unstructured/partition/pdf_image/test_chipper.py
@@ -1,6 +1,6 @@
 import pytest
 
-from unstructured.partition import pdf
+from unstructured.partition.pdf_image import pdf
 from unstructured.partition.utils.constants import PartitionStrategy
diff --git a/test_unstructured/partition/pdf_image/test_image.py b/test_unstructured/partition/pdf_image/test_image.py
index d233c345a8..4b50f077c6 100644
--- a/test_unstructured/partition/pdf_image/test_image.py
+++ b/test_unstructured/partition/pdf_image/test_image.py
@@ -10,7 +10,7 @@
 from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
 from unstructured.chunking.title import chunk_by_title
 from unstructured.documents.elements import ElementType
-from unstructured.partition import image, ocr, pdf
+from unstructured.partition.pdf_image import image, ocr, pdf
 from unstructured.partition.utils.constants import (
     UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
     PartitionStrategy,
@@ -303,7 +303,7 @@ def test_partition_image_metadata_date(
 ):
     mocked_last_modification_date = "2029-07-05T09:24:28"
     mocker.patch(
-        "unstructured.partition.pdf.get_last_modified_date",
+        "unstructured.partition.pdf_image.pdf.get_last_modified_date",
         return_value=mocked_last_modification_date,
     )
     elements = image.partition_image(filename=filename)
@@ -317,7 +317,7 @@ def test_partition_image_with_hi_res_strategy_metadata_date(
 ):
     mocked_last_modification_date = "2029-07-05T09:24:28"
     mocker.patch(
-        "unstructured.partition.pdf.get_last_modified_date",
+        "unstructured.partition.pdf_image.pdf.get_last_modified_date",
         return_value=mocked_last_modification_date,
     )
     elements = image.partition_image(filename=filename, strategy=PartitionStrategy.HI_RES)
@@ -333,7 +333,7 @@ def test_partition_image_metadata_date_custom_metadata_date(
     expected_last_modification_date = "2009-07-05T09:24:28"
 
     mocker.patch(
-        "unstructured.partition.pdf.get_last_modified_date",
+        "unstructured.partition.pdf_image.pdf.get_last_modified_date",
         return_value=mocked_last_modification_date,
     )
     elements = image.partition_image(
@@ -352,7 +352,7 @@ def test_partition_image_with_hi_res_strategy_metadata_date_custom_metadata_date
     expected_last_modification_date = "2009-07-05T09:24:28"
 
     mocker.patch(
-        "unstructured.partition.pdf.get_last_modified_date",
+        "unstructured.partition.pdf_image.pdf.get_last_modified_date",
        return_value=mocked_last_modification_date,
     )
     elements = image.partition_image(
@@ -370,7 +370,7 @@ def test_partition_image_from_file_metadata_date(
 ):
     mocked_last_modification_date = "2029-07-05T09:24:28"
     mocker.patch(
-        "unstructured.partition.pdf.get_last_modified_date_from_file",
+        "unstructured.partition.pdf_image.pdf.get_last_modified_date_from_file",
         return_value=mocked_last_modification_date,
     )
     with open(filename, "rb") as f:
@@ -385,7 +385,7 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date(
 ):
     mocked_last_modification_date = "2029-07-05T09:24:28"
     mocker.patch(
-        "unstructured.partition.pdf.get_last_modified_date_from_file",
+        "unstructured.partition.pdf_image.pdf.get_last_modified_date_from_file",
         return_value=mocked_last_modification_date,
     )
@@ -403,7 +403,7 @@ def test_partition_image_from_file_metadata_date_custom_metadata_date(
     expected_last_modification_date = "2009-07-05T09:24:28"
 
     mocker.patch(
-        "unstructured.partition.pdf.get_last_modified_date_from_file",
+        "unstructured.partition.pdf_image.pdf.get_last_modified_date_from_file",
         return_value=mocked_last_modification_date,
     )
     with open(filename, "rb") as f:
@@ -423,7 +423,7 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_met
     expected_last_modification_date = "2009-07-05T09:24:28"
 
     mocker.patch(
-        "unstructured.partition.pdf.get_last_modified_date_from_file",
+        "unstructured.partition.pdf_image.pdf.get_last_modified_date_from_file",
         return_value=mocked_last_modification_date,
     )
     with open(filename, "rb") as f:
@@ -479,7 +479,7 @@ def test_partition_image_with_ocr_coordinates_are_not_nan_from_filename(
 def test_partition_image_formats_languages_for_tesseract():
     filename = "example-docs/jpn-vert.jpeg"
     with mock.patch(
-        "unstructured.partition.ocr.process_file_with_ocr",
+        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
     ) as mock_process_file_with_ocr:
         image.partition_image(
             filename=filename, strategy=PartitionStrategy.HI_RES, languages=["jpn_vert"]
@@ -592,7 +592,6 @@ def inference_results():
     page = layout.PageLayout(
         number=1,
         image=mock.MagicMock(format="JPEG"),
-        layout=layout.TextRegion.from_coords(0, 0, 600, 800, text="hello"),
     )
     page.elements = [layout.LayoutElement.from_coords(0, 0, 600, 800, text="hello")]
     doc = layout.DocumentLayout(pages=[page])
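The mock-path churn above follows from how `unittest.mock` resolves targets: a patch string must name the module where the attribute is looked up at call time, so moving `pdf.py` and `ocr.py` into `pdf_image` forces every patch target to move with them. A minimal, hypothetical illustration using pytest-mock's `mocker`, as the tests above do:

```python
# Hypothetical test sketch: the patch target must track the module's new home.
def test_patch_target_tracks_module_location(mocker):
    # The old target, "unstructured.partition.pdf.get_last_modified_date",
    # would now fail to resolve; only the pdf_image path works.
    mocker.patch(
        "unstructured.partition.pdf_image.pdf.get_last_modified_date",
        return_value="2029-07-05T09:24:28",
    )
```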
diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py
index 4d94cd6106..acfc22a3f9 100644
--- a/test_unstructured/partition/pdf_image/test_ocr.py
+++ b/test_unstructured/partition/pdf_image/test_ocr.py
@@ -13,8 +13,8 @@
 )
 
 from unstructured.documents.elements import ElementType
-from unstructured.partition import ocr
-from unstructured.partition.ocr import pad_element_bboxes
+from unstructured.partition.pdf_image import ocr
+from unstructured.partition.pdf_image.ocr import pad_element_bboxes
 from unstructured.partition.utils.constants import (
     OCR_AGENT_PADDLE,
     OCR_AGENT_TESSERACT,
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
index ed973dda96..efd4a9b3a2 100644
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -5,6 +5,7 @@
 from unittest import mock
 
 import pytest
+from pdf2image.exceptions import PDFPageCountError
 from PIL import Image
 from unstructured_inference.inference import layout
 
@@ -19,8 +20,9 @@
     Text,
     Title,
 )
-from unstructured.partition import ocr, pdf, strategies
-from unstructured.partition.pdf import get_uris_from_annots
+from unstructured.partition import strategies
+from unstructured.partition.pdf_image import ocr, pdf, pdfminer_processing
+from unstructured.partition.pdf_image.pdf import get_uris_from_annots
 from unstructured.partition.utils.constants import (
     UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
     PartitionStrategy,
@@ -109,6 +111,16 @@ def test_partition_pdf_local(monkeypatch, filename, file):
         "process_file_with_model",
         lambda *args, **kwargs: MockDocumentLayout(),
     )
+    monkeypatch.setattr(
+        pdfminer_processing,
+        "process_data_with_pdfminer",
+        lambda *args, **kwargs: MockDocumentLayout(),
+    )
+    monkeypatch.setattr(
+        pdfminer_processing,
+        "process_file_with_pdfminer",
+        lambda *args, **kwargs: MockDocumentLayout(),
+    )
     monkeypatch.setattr(
         ocr,
         "process_data_with_ocr",
@@ -116,7 +128,7 @@ def test_partition_pdf_local(monkeypatch, filename, file):
     )
     monkeypatch.setattr(
         ocr,
-        "process_data_with_ocr",
+        "process_file_with_ocr",
         lambda *args, **kwargs: MockDocumentLayout(),
     )
 
@@ -125,7 +137,7 @@ def test_partition_pdf_local(monkeypatch, filename, file):
 
 
 def test_partition_pdf_local_raises_with_no_filename():
-    with pytest.raises(FileNotFoundError):
+    with pytest.raises((FileNotFoundError, PDFPageCountError)):
         pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False)
 
 
@@ -391,7 +403,7 @@ def mock_exists(dep):
 def test_partition_pdf_uses_table_extraction():
     filename = example_doc_path("layout-parser-paper-fast.pdf")
     with mock.patch(
-        "unstructured.partition.ocr.process_file_with_ocr",
+        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
     ) as mock_process_file_with_model:
         pdf.partition_pdf(filename, infer_table_structure=True)
         assert mock_process_file_with_model.call_args[1]["infer_table_structure"]
@@ -633,7 +645,7 @@ def test_partition_pdf_metadata_date(
     )
 
     mocker.patch(
-        "unstructured.partition.pdf.get_the_last_modification_date_pdf_or_img",
+        "unstructured.partition.pdf_image.pdf.get_the_last_modification_date_pdf_or_img",
         return_value=mocked_last_modification_date,
     )
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
index 36a4ee5534..a364076f57 100644
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@@ -330,7 +330,7 @@ def test_auto_partition_pdf_from_filename(pass_metadata_filename, content_type,
 def test_auto_partition_pdf_uses_table_extraction():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
     with patch(
-        "unstructured.partition.ocr.process_file_with_ocr",
+        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
     ) as mock_process_file_with_model:
         partition(filename, pdf_infer_table_structure=True, strategy=PartitionStrategy.HI_RES)
         assert mock_process_file_with_model.call_args[1]["infer_table_structure"]
@@ -390,7 +390,7 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, requ
 def test_auto_partition_formats_languages_for_tesseract():
     filename = "example-docs/chi_sim_image.jpeg"
     with patch(
-        "unstructured.partition.ocr.process_file_with_ocr",
+        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
     ) as mock_process_file_with_ocr:
         partition(filename, strategy=PartitionStrategy.HI_RES, languages=["zh"])
         _, kwargs = mock_process_file_with_ocr.call_args_list[0]
@@ -692,7 +692,10 @@ def test_file_specific_produces_correct_filetype(filetype: FileType):
         extension if filetype not in FILETYPE_TO_MODULE else FILETYPE_TO_MODULE[filetype]
     )
     fun_name = "partition_" + filetype_module
-    module = import_module(f"unstructured.partition.{filetype_module}")  # noqa
+    if filetype_module in ["pdf", "image"]:
+        module = import_module(f"unstructured.partition.pdf_image.{filetype_module}")  # noqa
+    else:
+        module = import_module(f"unstructured.partition.{filetype_module}")  # noqa
     fun = eval(f"module.{fun_name}")
     for file in pathlib.Path("example-docs").iterdir():
         if file.is_file() and file.suffix == f".{extension}":
diff --git a/test_unstructured/partition/test_common.py b/test_unstructured/partition/test_common.py
index 2be9a93fd4..5e9652ff55 100644
--- a/test_unstructured/partition/test_common.py
+++ b/test_unstructured/partition/test_common.py
@@ -451,7 +451,6 @@ def test_document_to_element_list_handles_parent():
     page = PageLayout(
         number=1,
         image=MockImage(),
-        layout=None,
     )
     page.elements = [block1, block2]
     doc = DocumentLayout.from_pages([page])
@@ -477,7 +476,6 @@ def test_document_to_element_list_doesnt_sort_on_sort_method(sort_mode, call_cou
     page = PageLayout(
         number=1,
         image=MockImage(),
-        layout=None,
     )
     page.elements = [block1, block2]
     doc = DocumentLayout.from_pages([page])
diff --git a/test_unstructured/partition/test_strategies.py b/test_unstructured/partition/test_strategies.py
index b8891c3953..f9eab34995 100644
--- a/test_unstructured/partition/test_strategies.py
+++ b/test_unstructured/partition/test_strategies.py
@@ -2,7 +2,8 @@
 
 import pytest
 
-from unstructured.partition import pdf, strategies
+from unstructured.partition import strategies
+from unstructured.partition.pdf_image import pdf
 from unstructured.partition.utils.constants import PartitionStrategy
diff --git a/test_unstructured/partition/utils/test_processing_elements.py b/test_unstructured/partition/utils/test_processing_elements.py
index b7d4843077..78fb1b4996 100644
--- a/test_unstructured/partition/utils/test_processing_elements.py
+++ b/test_unstructured/partition/utils/test_processing_elements.py
@@ -1,8 +1,10 @@
 import pytest
-from unstructured_inference.constants import Source
+from PIL import Image
+from unstructured_inference.constants import Source as InferenceSource
 from unstructured_inference.inference.elements import Rectangle
 from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
 
+from unstructured.partition.utils.constants import Source
 from unstructured.partition.utils.processing_elements import clean_pdfminer_inner_elements
 
 # A set of elements with pdfminer elements inside tables
@@ -23,10 +25,10 @@
         bbox=Rectangle(0, 0, 100, 100),
         text="Table with inner elements",
         type="Table",
-        source=Source.YOLOX,
+        source=InferenceSource.YOLOX,
     ),
-    LayoutElement(bbox=Rectangle(50, 50, 70, 70), text="text1", source=Source.YOLOX),
-    LayoutElement(bbox=Rectangle(70, 70, 80, 80), text="text2", source=Source.YOLOX),
+    LayoutElement(bbox=Rectangle(50, 50, 70, 70), text="text1", source=InferenceSource.YOLOX),
+    LayoutElement(bbox=Rectangle(70, 70, 80, 80), text="text2", source=InferenceSource.YOLOX),
 ]
 # A set of elements with pdfminer elements inside tables and other
 # elements with source=Source.PDFMINER
@@ -36,7 +38,7 @@
         bbox=Rectangle(0, 0, 100, 100),
         text="Table1 with inner elements",
         type="Table",
-        source=Source.YOLOX,
+        source=InferenceSource.YOLOX,
     ),
     LayoutElement(bbox=Rectangle(50, 50, 70, 70), text="Inside table1"),
     LayoutElement(bbox=Rectangle(70, 70, 80, 80), text="Inside table1", source=Source.PDFMINER),
@@ -54,7 +56,7 @@
         bbox=Rectangle(0, 500, 100, 700),
         text="Table2 with inner elements",
         type="Table",
-        source=Source.YOLOX,
+        source=InferenceSource.YOLOX,
     ),
     LayoutElement(bbox=Rectangle(0, 510, 50, 300), text="Inside table2", source=Source.PDFMINER),
     LayoutElement(bbox=Rectangle(0, 550, 70, 400), text="Inside table2", source=Source.PDFMINER),
@@ -71,7 +73,7 @@
 )
 def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_document_length):
     # create a sample document with pdfminer elements inside tables
-    page = PageLayout(number=1, image=None, layout=elements)
+    page = PageLayout(number=1, image=Image.new("1", (1, 1)))
     page.elements = elements
     document_with_table = DocumentLayout(pages=[page])
    document = document_with_table
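The `Source as InferenceSource` alias above exists because two distinct `Source` enums now coexist: the inference library's (used for detection-model regions such as YOLOX) and the new local one, which gains a `PDFMINER` member in the `constants.py` hunk later in this diff. A minimal sketch of how the two are kept apart:

```python
# Both imports appear verbatim in the test diff above.
from unstructured_inference.constants import Source as InferenceSource

from unstructured.partition.utils.constants import Source

# Regions proposed by the detection model keep the inference-side enum...
model_tag = InferenceSource.YOLOX
# ...while regions extracted by pdfminer are tagged with the new local member.
pdfminer_tag = Source.PDFMINER
```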
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index dd12154992..510b9a48ec 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.11.4-dev0"  # pragma: no cover
+__version__ = "0.11.4-dev1"  # pragma: no cover
diff --git a/unstructured/metrics/table_structure.py b/unstructured/metrics/table_structure.py
index ff79114f91..90fca5a7d6 100644
--- a/unstructured/metrics/table_structure.py
+++ b/unstructured/metrics/table_structure.py
@@ -2,7 +2,7 @@
 import pandas as pd
 from PIL import Image
 
-from unstructured.partition.pdf import convert_pdf_to_images
+from unstructured.partition.pdf_image.pdf import convert_pdf_to_images
 from unstructured.utils import requires_dependencies
diff --git a/unstructured/nlp/partition.py b/unstructured/nlp/partition.py
index 881b006357..d362fec0e0 100644
--- a/unstructured/nlp/partition.py
+++ b/unstructured/nlp/partition.py
@@ -1,5 +1,5 @@
 # flake8: noqa
-from unstructured.partition.pdf import partition_pdf  # noqa
+from unstructured.partition.pdf_image.pdf import partition_pdf  # noqa
 from unstructured.partition.text_type import (  # noqa
     is_bulleted_text,
     is_possible_narrative_text,
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
index 16047d7965..eeca1480be 100644
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@@ -78,13 +78,13 @@
 pdf_imports = ["pdf2image", "pdfminer", "PIL"]
 if all(dependency_exists(dep) for dep in pdf_imports):
-    from unstructured.partition.pdf import partition_pdf
+    from unstructured.partition.pdf_image.pdf import partition_pdf
 
     PARTITION_WITH_EXTRAS_MAP["pdf"] = partition_pdf
 
 if dependency_exists("unstructured_inference"):
-    from unstructured.partition.image import partition_image
+    from unstructured.partition.pdf_image.image import partition_image
 
     PARTITION_WITH_EXTRAS_MAP["image"] = partition_image
diff --git a/unstructured/partition/pdf_image/__init__.py b/unstructured/partition/pdf_image/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/unstructured/partition/image.py b/unstructured/partition/pdf_image/image.py
similarity index 98%
rename from unstructured/partition/image.py
rename to unstructured/partition/pdf_image/image.py
index 4986ab26ff..eb0e1cb8d6 100644
--- a/unstructured/partition/image.py
+++ b/unstructured/partition/pdf_image/image.py
@@ -8,7 +8,7 @@
 from unstructured.partition.lang import (
     convert_old_ocr_languages_to_languages,
 )
-from unstructured.partition.pdf import partition_pdf_or_image
+from unstructured.partition.pdf_image.pdf import partition_pdf_or_image
 from unstructured.partition.utils.constants import PartitionStrategy
diff --git a/unstructured/partition/ocr.py b/unstructured/partition/pdf_image/ocr.py
similarity index 100%
rename from unstructured/partition/ocr.py
rename to unstructured/partition/pdf_image/ocr.py
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf_image/pdf.py
similarity index 90%
rename from unstructured/partition/pdf.py
rename to unstructured/partition/pdf_image/pdf.py
index ce566529b9..1d911fc8a4 100644
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf_image/pdf.py
@@ -2,7 +2,6 @@
 import io
 import os
 import re
-import tempfile
 import warnings
 from tempfile import SpooledTemporaryFile
 from typing import (
@@ -21,21 +20,15 @@
 import numpy as np
 import pdf2image
-import pikepdf
-import pypdf
 import wrapt
-from pdfminer.converter import PDFPageAggregator
+from pdfminer import psparser
 from pdfminer.layout import (
-    LAParams,
     LTChar,
     LTContainer,
     LTImage,
     LTItem,
     LTTextBox,
 )
-from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
-from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfparser import PSSyntaxError
 from pdfminer.pdftypes import PDFObjRef
 from pdfminer.utils import open_filename
 from PIL import Image as PILImage
@@ -77,9 +70,9 @@
     check_languages,
     prepare_languages_for_tesseract,
 )
-from unstructured.partition.ocr import (
-    get_layout_elements_from_ocr,
-    get_ocr_agent,
+from unstructured.partition.pdf_image.pdfminer_utils import (
+    open_pdfminer_pages_generator,
+    rect_to_bbox,
 )
 from unstructured.partition.strategies import determine_pdf_or_image_strategy, validate_strategy
 from unstructured.partition.text import element_from_text
@@ -96,11 +89,17 @@
     coord_has_valid_points,
     sort_page_elements,
 )
+from unstructured.patches.pdfminer import parse_keyword
 from unstructured.utils import requires_dependencies
 
 if TYPE_CHECKING:
     pass
 
+
+# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
+# the bug: https://github.com/pdfminer/pdfminer.six/pull/885
+psparser.PSBaseParser._parse_keyword = parse_keyword  # type: ignore
+
 RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)
@@ -353,10 +352,14 @@ def _partition_pdf_or_image_local(
         process_file_with_model,
     )
 
-    from unstructured.partition.ocr import (
+    from unstructured.partition.pdf_image.ocr import (
         process_data_with_ocr,
         process_file_with_ocr,
     )
+    from unstructured.partition.pdf_image.pdfminer_processing import (
+        process_data_with_pdfminer,
+        process_file_with_pdfminer,
+    )
 
     if languages is None:
         languages = ["eng"]
@@ -373,8 +376,7 @@ def _partition_pdf_or_image_local(
     )
 
     if file is None:
-        # NOTE(christine): out_layout = extracted_layout + inferred_layout
-        out_layout = process_file_with_model(
+        inferred_document_layout = process_file_with_model(
             filename,
             is_image=is_image,
             model_name=model_name,
@@ -382,13 +384,21 @@ def _partition_pdf_or_image_local(
             extract_images_in_pdf=extract_images_in_pdf,
             image_output_dir_path=image_output_dir_path,
         )
+
+        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+        merged_document_layout = process_file_with_pdfminer(
+            inferred_document_layout,
+            filename,
+            is_image,
+        )
+
         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
-            final_layout = out_layout
+            final_document_layout = merged_document_layout
         else:
-            final_layout = process_file_with_ocr(
+            final_document_layout = process_file_with_ocr(
                 filename,
-                out_layout,
+                merged_document_layout,
                 is_image=is_image,
                 infer_table_structure=infer_table_structure,
                 ocr_languages=ocr_languages,
@@ -396,7 +406,7 @@ def _partition_pdf_or_image_local(
                 pdf_image_dpi=pdf_image_dpi,
             )
     else:
-        out_layout = process_data_with_model(
+        inferred_document_layout = process_data_with_model(
             file,
             is_image=is_image,
             model_name=model_name,
@@ -404,15 +414,25 @@ def _partition_pdf_or_image_local(
             extract_images_in_pdf=extract_images_in_pdf,
             image_output_dir_path=image_output_dir_path,
         )
+        if hasattr(file, "seek"):
+            file.seek(0)
+
+        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+        merged_document_layout = process_data_with_pdfminer(
+            inferred_document_layout,
+            file,
+            is_image,
+        )
+
         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
-            final_layout = out_layout
+            final_document_layout = merged_document_layout
         else:
             if hasattr(file, "seek"):
                 file.seek(0)
-            final_layout = process_data_with_ocr(
+            final_document_layout = process_data_with_ocr(
                 file,
-                out_layout,
+                merged_document_layout,
                 is_image=is_image,
                 infer_table_structure=infer_table_structure,
                 ocr_languages=ocr_languages,
@@ -424,9 +444,9 @@ def _partition_pdf_or_image_local(
     if model_name == "chipper":
         kwargs["sort_mode"] = SORT_MODE_DONT
 
-    final_layout = clean_pdfminer_inner_elements(final_layout)
+    final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
     elements = document_to_element_list(
-        final_layout,
+        final_document_layout,
         sortable=True,
         include_page_breaks=include_page_breaks,
         last_modification_date=metadata_last_modified,
@@ -545,69 +565,6 @@ def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs):
     return wrapped(resources)
 
 
-def get_page_data(fp: BinaryIO, page_number: int):
-    """Find the binary data for a given page number from a PDF binary file."""
-    pdf_reader = pypdf.PdfReader(fp)
-    pdf_writer = pypdf.PdfWriter()
-    page = pdf_reader.pages[page_number]
-    pdf_writer.add_page(page)
-    page_data = io.BytesIO()
-    pdf_writer.write(page_data)
-    return page_data
-
-
-def _open_pdfminer_pages_generator(
-    fp: BinaryIO,
-):
-    """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs."""
-
-    rsrcmgr = PDFResourceManager()
-    laparams = LAParams()
-    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
-    interpreter = PDFPageInterpreter(rsrcmgr, device)
-    try:
-        i = 0
-        pages = PDFPage.get_pages(fp)
-        # Detect invalid dictionary construct for entire PDF
-        for page in pages:
-            try:
-                # Detect invalid dictionary construct for one page
-                interpreter.process_page(page)
-                page_layout = device.get_result()
-            except PSSyntaxError:
-                logger.info("Detected invalid dictionary construct for PDFminer")
-                logger.info(f"Repairing the PDF page {i+1} ...")
-                # find the error page from binary data fp
-                error_page_data = get_page_data(fp, page_number=i)
-                # repair the error page with pikepdf
-                with tempfile.NamedTemporaryFile() as tmp:
-                    with pikepdf.Pdf.open(error_page_data) as pdf:
-                        pdf.save(tmp.name)
-                    page = next(PDFPage.get_pages(open(tmp.name, "rb")))  # noqa: SIM115
-                    try:
-                        interpreter.process_page(page)
-                        page_layout = device.get_result()
-                    except Exception:
-                        logger.warning(
-                            f"PDFMiner failed to process PDF page {i+1} after repairing it."
-                        )
-                        break
-            i += 1
-            yield page, page_layout
-    except PSSyntaxError:
-        logger.info("Detected invalid dictionary construct for PDFminer")
-        logger.info("Repairing the PDF document ...")
-        # repair the entire doc with pikepdf
-        with tempfile.NamedTemporaryFile() as tmp:
-            with pikepdf.Pdf.open(fp) as pdf:
-                pdf.save(tmp.name)
-            pages = PDFPage.get_pages(open(tmp.name, "rb"))  # noqa: SIM115
-            for page in pages:
-                interpreter.process_page(page)
-                page_layout = device.get_result()
-                yield page, page_layout
-
-
 def _process_pdfminer_pages(
     fp: BinaryIO,
     filename: str,
@@ -620,7 +577,7 @@ def _process_pdfminer_pages(
     """Uses PDFMiner to split a document into pages and process them."""
     elements: List[Element] = []
 
-    for i, (page, page_layout) in enumerate(_open_pdfminer_pages_generator(fp)):
+    for i, (page, page_layout) in enumerate(open_pdfminer_pages_generator(fp)):
         width, height = page_layout.width, page_layout.height
 
         page_elements = []
@@ -842,6 +799,11 @@ def _partition_pdf_or_image_with_ocr_from_image(
 ) -> List[Element]:
     """Extract `unstructured` elements from an image using OCR and perform partitioning."""
 
+    from unstructured.partition.pdf_image.ocr import (
+        get_layout_elements_from_ocr,
+        get_ocr_agent,
+    )
+
     ocr_agent = get_ocr_agent()
     ocr_languages = prepare_languages_for_tesseract(languages)
@@ -1035,29 +997,6 @@ def try_resolve(annot: PDFObjRef):
     return annot
 
 
-def rect_to_bbox(
-    rect: Tuple[float, float, float, float],
-    height: float,
-) -> Tuple[float, float, float, float]:
-    """
-    Converts a PDF rectangle coordinates (x1, y1, x2, y2) to a bounding box in the specified
-    coordinate system where the vertical axis is measured from the top of the page.
-
-    Args:
-        rect (Tuple[float, float, float, float]): A tuple representing a PDF rectangle
-            coordinates (x1, y1, x2, y2).
-        height (float): The height of the page in the specified coordinate system.
-
-    Returns:
-        Tuple[float, float, float, float]: A tuple representing the bounding box coordinates
-            (x1, y1, x2, y2) with the y-coordinates adjusted to be measured from the top of the page.
-    """
-    x1, y2, x2, y1 = rect
-    y1 = height - y1
-    y2 = height - y2
-    return (x1, y1, x2, y2)
-
-
 def calculate_intersection_area(
     bbox1: Tuple[float, float, float, float],
     bbox2: Tuple[float, float, float, float],
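Taken together, the renames above make the `hi_res` flow in `_partition_pdf_or_image_local` read as three explicit stages. A condensed, hedged sketch of the file-path branch, using only function names that appear in this diff (the source module of `process_file_with_model` is assumed from context, and keyword arguments are trimmed):

```python
# Condensed sketch of the refactored hi_res pipeline (file-path branch only).
# Assumes process_file_with_model lives in unstructured_inference, as used by pdf.py.
from unstructured_inference.inference.layout import process_file_with_model

from unstructured.partition.pdf_image.ocr import process_file_with_ocr
from unstructured.partition.pdf_image.pdfminer_processing import process_file_with_pdfminer


def hi_res_layout_sketch(filename: str, model_name: str, is_image: bool = False):
    # Stage 1: the object-detection model proposes a layout.
    inferred_document_layout = process_file_with_model(
        filename, is_image=is_image, model_name=model_name
    )
    # Stage 2: pdfminer's extracted text layout is merged into the inferred layout.
    merged_document_layout = process_file_with_pdfminer(
        inferred_document_layout, filename, is_image
    )
    # Stage 3: OCR fills in remaining text -- except for chipper, which skips OCR.
    if model_name.startswith("chipper"):
        return merged_document_layout
    return process_file_with_ocr(filename, merged_document_layout, is_image=is_image)
```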
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
new file mode 100644
index 0000000000..523cd62be1
--- /dev/null
+++ b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -0,0 +1,131 @@
+from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
+
+from pdfminer.utils import open_filename
+from unstructured_inference.inference.elements import (
+    EmbeddedTextRegion,
+    ImageTextRegion,
+    TextRegion,
+)
+from unstructured_inference.inference.layoutelement import (
+    merge_inferred_layout_with_extracted_layout,
+)
+from unstructured_inference.inference.ordering import order_layout
+from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
+
+from unstructured.partition.pdf_image.pdfminer_utils import (
+    get_images_from_pdf_element,
+    open_pdfminer_pages_generator,
+    rect_to_bbox,
+)
+from unstructured.partition.utils.constants import Source
+
+if TYPE_CHECKING:
+    from unstructured_inference.inference.layout import DocumentLayout
+
+
+def process_file_with_pdfminer(
+    inferred_document_layout: "DocumentLayout",
+    filename: str = "",
+    is_image: bool = False,
+) -> "DocumentLayout":
+    with open_filename(filename, "rb") as fp:
+        fp = cast(BinaryIO, fp)
+        inferred_document_layout = process_data_with_pdfminer(
+            inferred_document_layout=inferred_document_layout,
+            file=fp,
+            is_image=is_image,
+        )
+        return inferred_document_layout
+
+
+def process_data_with_pdfminer(
+    inferred_document_layout: "DocumentLayout",
+    file: Optional[Union[bytes, BinaryIO]] = None,
+    is_image: bool = False,
+) -> "DocumentLayout":
+    if is_image:
+        for page in inferred_document_layout.pages:
+            for el in page.elements:
+                el.text = el.text or ""
+        return inferred_document_layout
+
+    extracted_layouts = get_regions_by_pdfminer(file)
+
+    inferred_pages = inferred_document_layout.pages
+    for i, (inferred_page, extracted_layout) in enumerate(zip(inferred_pages, extracted_layouts)):
+        inferred_layout = inferred_page.elements
+        image_metadata = inferred_page.image_metadata
+        w = image_metadata.get("width")
+        h = image_metadata.get("height")
+        image_size = (w, h)
+
+        threshold_kwargs = {}
+        # NOTE(Benjamin): With this, the thresholds are only changed for detectron2_mask_rcnn;
+        # in other cases the default values for the functions are used
+        if (
+            isinstance(inferred_page.detection_model, UnstructuredDetectronONNXModel)
+            and "R_50" not in inferred_page.detection_model.model_path
+        ):
+            threshold_kwargs = {"same_region_threshold": 0.5, "subregion_threshold": 0.5}
+
+        merged_layout = merge_inferred_layout_with_extracted_layout(
+            inferred_layout=inferred_layout,
+            extracted_layout=extracted_layout,
+            page_image_size=image_size,
+            **threshold_kwargs,
+        )
+
+        elements = inferred_page.get_elements_from_layout(
+            layout=cast(List[TextRegion], merged_layout),
+            pdf_objects=extracted_layout,
+        )
+
+        inferred_page.elements[:] = elements
+
+    return inferred_document_layout
+
+
+def get_regions_by_pdfminer(
+    fp: Optional[Union[bytes, BinaryIO]],
+    dpi: int = 200,
+) -> List[List[TextRegion]]:
+    """Loads the text and image objects from a PDF using pdfminer and rescales their bounding
+    boxes to match the DPI at which the page images are rendered."""
+
+    layouts = []
+    # Coefficient to rescale bounding box to be compatible with images
+    coef = dpi / 72
+    for i, (page, page_layout) in enumerate(open_pdfminer_pages_generator(fp)):
+        height = page_layout.height
+
+        layout: List["TextRegion"] = []
+        for obj in page_layout:
+            x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
+
+            if hasattr(obj, "get_text"):
+                _text = obj.get_text()
+                element_class = EmbeddedTextRegion  # type: ignore
+            else:
+                embedded_images = get_images_from_pdf_element(obj)
+                if len(embedded_images) > 0:
+                    _text = None
+                    element_class = ImageTextRegion  # type: ignore
+                else:
+                    continue
+
+            text_region = element_class.from_coords(
+                x1 * coef,
+                y1 * coef,
+                x2 * coef,
+                y2 * coef,
+                text=_text,
+                source=Source.PDFMINER,
+            )
+
+            if text_region.bbox is not None and text_region.bbox.area > 0:
+                layout.append(text_region)
+
+        layout = order_layout(layout)
+        layouts.append(layout)
+
+    return layouts
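`get_regions_by_pdfminer` rescales pdfminer's point-based coordinates (72 points per inch in PDF user space) onto the pixel grid of the rendered page images, hence `coef = dpi / 72`. A worked example at the default `dpi=200`:

```python
# pdfminer reports PDF user-space coordinates in points (1/72 inch).
# The inferred layout lives in pixels of the page image rendered at `dpi`.
dpi = 200
coef = dpi / 72  # ~2.778

# A one-inch square whose top-left sits one inch into the page...
x1, y1, x2, y2 = 72.0, 72.0, 144.0, 144.0
pixels = tuple(round(v * coef) for v in (x1, y1, x2, y2))
print(pixels)  # (200, 200, 400, 400) -- a 200 px square, as expected at 200 dpi
```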
diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py
new file mode 100644
index 0000000000..5c50bff88c
--- /dev/null
+++ b/unstructured/partition/pdf_image/pdfminer_utils.py
@@ -0,0 +1,128 @@
+import tempfile
+from typing import Any, BinaryIO, List, Tuple
+
+import pikepdf
+from pdfminer.converter import PDFPageAggregator
+from pdfminer.layout import LAParams, LTContainer, LTImage
+from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfparser import PSSyntaxError
+
+from unstructured.logger import logger
+from unstructured.partition.pdf_image.pypdf_utils import get_page_data
+
+
+def init_pdfminer():
+    rsrcmgr = PDFResourceManager()
+    laparams = LAParams()
+    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+    interpreter = PDFPageInterpreter(rsrcmgr, device)
+
+    return device, interpreter
+
+
+def get_images_from_pdf_element(layout_object: Any) -> List[LTImage]:
+    """
+    Recursively extracts LTImage objects from a PDF layout element.
+
+    This function takes a PDF layout element (could be LTImage or LTContainer) and recursively
+    extracts all LTImage objects contained within it.
+
+    Parameters:
+    - layout_object (Any): The PDF layout element to extract images from.
+
+    Returns:
+    - List[LTImage]: A list of LTImage objects extracted from the layout object.
+
+    Note:
+    - This function recursively traverses through the layout_object to find and accumulate all
+      LTImage objects.
+    - If the input layout_object is an LTImage, it will be included in the returned list.
+    - If the input layout_object is an LTContainer, the function will recursively search its
+      children for LTImage objects.
+    - If the input layout_object is neither LTImage nor LTContainer, an empty list will be
+      returned.
+    """
+
+    # recursively locate Image objects in layout_object
+    if isinstance(layout_object, LTImage):
+        return [layout_object]
+    if isinstance(layout_object, LTContainer):
+        img_list: List[LTImage] = []
+        for child in layout_object:
+            img_list = img_list + get_images_from_pdf_element(child)
+        return img_list
+    else:
+        return []
+
+
+def rect_to_bbox(
+    rect: Tuple[float, float, float, float],
+    height: float,
+) -> Tuple[float, float, float, float]:
+    """
+    Converts a PDF rectangle coordinates (x1, y1, x2, y2) to a bounding box in the specified
+    coordinate system where the vertical axis is measured from the top of the page.
+
+    Args:
+        rect (Tuple[float, float, float, float]): A tuple representing a PDF rectangle
+            coordinates (x1, y1, x2, y2).
+        height (float): The height of the page in the specified coordinate system.
+
+    Returns:
+        Tuple[float, float, float, float]: A tuple representing the bounding box coordinates
+            (x1, y1, x2, y2) with the y-coordinates adjusted to be measured from the top of
+            the page.
+    """
+    x1, y2, x2, y1 = rect
+    y1 = height - y1
+    y2 = height - y2
+    return (x1, y1, x2, y2)
+
+
+def open_pdfminer_pages_generator(
+    fp: BinaryIO,
+):
+    """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs."""
+
+    device, interpreter = init_pdfminer()
+    try:
+        i = 0
+        pages = PDFPage.get_pages(fp)
+        # Detect invalid dictionary construct for entire PDF
+        for page in pages:
+            try:
+                # Detect invalid dictionary construct for one page
+                interpreter.process_page(page)
+                page_layout = device.get_result()
+            except PSSyntaxError:
+                logger.info("Detected invalid dictionary construct for PDFminer")
+                logger.info(f"Repairing the PDF page {i+1} ...")
+                # find the error page from binary data fp
+                error_page_data = get_page_data(fp, page_number=i)
+                # repair the error page with pikepdf
+                with tempfile.NamedTemporaryFile() as tmp:
+                    with pikepdf.Pdf.open(error_page_data) as pdf:
+                        pdf.save(tmp.name)
+                    page = next(PDFPage.get_pages(open(tmp.name, "rb")))  # noqa: SIM115
+                    try:
+                        interpreter.process_page(page)
+                        page_layout = device.get_result()
+                    except Exception:
+                        logger.warning(
+                            f"PDFMiner failed to process PDF page {i+1} after repairing it."
+                        )
+                        break
+            i += 1
+            yield page, page_layout
+    except PSSyntaxError:
+        logger.info("Detected invalid dictionary construct for PDFminer")
+        logger.info("Repairing the PDF document ...")
+        # repair the entire doc with pikepdf
+        with tempfile.NamedTemporaryFile() as tmp:
+            with pikepdf.Pdf.open(fp) as pdf:
+                pdf.save(tmp.name)
+            pages = PDFPage.get_pages(open(tmp.name, "rb"))  # noqa: SIM115
+            for page in pages:
+                interpreter.process_page(page)
+                page_layout = device.get_result()
+                yield page, page_layout
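`rect_to_bbox` flips the vertical axis: PDF user space puts the origin at the bottom-left, while the layout code measures y from the top of the page. Note the deliberate unpacking order `x1, y2, x2, y1 = rect` -- the PDF rect's larger y-value is the visually higher edge, so the two y-values swap roles during the flip. A small worked example:

```python
# On an 800pt-tall page, a rect spanning y=700..750 in PDF space
# (near the top of the page) maps to y=50..100 measured from the top.
rect = (10.0, 700.0, 60.0, 750.0)  # (x1, y1, x2, y2) in PDF coordinates
height = 800.0

x1, y2, x2, y1 = rect               # y-values swap roles for the flip
y1, y2 = height - y1, height - y2   # y1=50.0 (top edge), y2=100.0 (bottom edge)
print((x1, y1, x2, y2))             # (10.0, 50.0, 60.0, 100.0)
```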
diff --git a/unstructured/partition/pdf_image/pypdf_utils.py b/unstructured/partition/pdf_image/pypdf_utils.py
new file mode 100644
index 0000000000..b03fe72856
--- /dev/null
+++ b/unstructured/partition/pdf_image/pypdf_utils.py
@@ -0,0 +1,15 @@
+import io
+from typing import BinaryIO
+
+import pypdf
+
+
+def get_page_data(fp: BinaryIO, page_number: int):
+    """Find the binary data for a given page number from a PDF binary file."""
+    pdf_reader = pypdf.PdfReader(fp)
+    pdf_writer = pypdf.PdfWriter()
+    page = pdf_reader.pages[page_number]
+    pdf_writer.add_page(page)
+    page_data = io.BytesIO()
+    pdf_writer.write(page_data)
+    return page_data
diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py
index 54b596048b..a88b555be0 100644
--- a/unstructured/partition/utils/constants.py
+++ b/unstructured/partition/utils/constants.py
@@ -3,6 +3,7 @@
 
 class Source(Enum):
+    PDFMINER = "pdfminer"
     OCR_TESSERACT = "ocr_tesseract"
     OCR_PADDLE = "ocr_paddle"
diff --git a/unstructured/partition/utils/processing_elements.py b/unstructured/partition/utils/processing_elements.py
index 95289a41ca..7fb76be42a 100644
--- a/unstructured/partition/utils/processing_elements.py
+++ b/unstructured/partition/utils/processing_elements.py
@@ -1,8 +1,9 @@
 from collections import defaultdict
 
-from unstructured_inference.constants import Source
 from unstructured_inference.inference.layout import DocumentLayout
 
+from unstructured.partition.utils.constants import Source
+
 
 def clean_pdfminer_inner_elements(document: DocumentLayout) -> DocumentLayout:
     """Clean pdfminer elements from inside tables and stores them in extra_info dictionary
diff --git a/unstructured/patches/__init__.py b/unstructured/patches/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/unstructured/patches/pdfminer.py b/unstructured/patches/pdfminer.py
new file mode 100644
index 0000000000..20b938d1ce
--- /dev/null
+++ b/unstructured/patches/pdfminer.py
@@ -0,0 +1,24 @@
+from typing import Union
+
+from pdfminer.psparser import END_KEYWORD, KWD, PSBaseParser, PSKeyword
+
+
+def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
+    """Patch for pdfminer method _parse_keyword of PSBaseParser. Changes are identical to the PR
+    https://github.com/pdfminer/pdfminer.six/pull/885."""
+    m = END_KEYWORD.search(s, i)
+    if not m:
+        j = len(s)
+        self._curtoken += s[i:]
+    else:
+        j = m.start(0)
+        self._curtoken += s[i:j]
+    if self._curtoken == b"true":
+        token: Union[bool, PSKeyword] = True
+    elif self._curtoken == b"false":
+        token = False
+    else:
+        token = KWD(self._curtoken)
+    self._add_token(token)
+    self._parse1 = self._parse_main
+    return j
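For context, `parse_keyword` is wired in at import time in `pdf_image/pdf.py` (see the `psparser.PSBaseParser._parse_keyword = parse_keyword` line earlier in this diff). The monkey-patch in miniature:

```python
# How the patch above is applied (mirrors the assignment in pdf_image/pdf.py).
from pdfminer import psparser

from unstructured.patches.pdfminer import parse_keyword

# Replace the buggy method on the class so every parser instance picks it up;
# the replacement mirrors pdfminer.six PR #885.
psparser.PSBaseParser._parse_keyword = parse_keyword  # type: ignore
```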