getomni-ai · pradhyumna85 · Sep 13, 2024 · Sep 13, 2024 · Sep 15, 2024 · Sep 18, 2024
diff --git a/README.md b/README.md
@@ -157,6 +157,7 @@ Refer to the [LiteLLM Documentation](https://docs.litellm.ai/docs/providers) for
 
 ```python
 from pyzerox import zerox
+
 import os
 import json
 import asyncio
@@ -203,7 +204,7 @@ file_path = 'path/to/vertex_ai_service_account.json'
 
 # Load the JSON file
 with open(file_path, 'r') as file:
-    vertex_credentials = json.load(file)
+  vertex_credentials = json.load(file)
 
 # Convert to JSON string
 vertex_credentials_json = json.dumps(vertex_credentials)
@@ -217,15 +218,23 @@ kwargs = {"vertex_credentials": vertex_credentials}
 
 # Define main async entrypoint
 async def main():
-    file_path = "https://omni-demo-data.s3.amazonaws.com/test/cs101.pdf" ## local filepath and file URL supported
+  file_path = "https://omni-demo-data.s3.amazonaws.com/test/cs101.pdf" ## local filepath and file URL supported
+
+  ## process only some pages or all
+  select_pages = None ## None for all, but could be int or list(int) page numbers (1 indexed)
 
-    ## process only some pages or all
-    select_pages = None ## None for all, but could be int or list(int) page numbers (1 indexed)
+  output_file_path = "output.md" ## filepath to save the consolidated output file (markdown by default). Pass None to skip saving any output file
+  page_separator = "\n\n" ## The separator to use between pages when writing the output to `output_file_path`
 
-    output_dir = "./output_test" ## directory to save the consolidated markdown file
-    result = await zerox(file_path=file_path, model=model, output_dir=output_dir,
-                        custom_system_prompt=custom_system_prompt,select_pages=select_pages, **kwargs)
-    return result
+  ## function to apply on model's text output (on each page). Function should take input as string and return output also as string.
+  ## By default uses Zerox's format_markdown function to format text as markdown
+  # post_process_function = lambda x: x.strip() ## To skip any post processing pass None, which would just keep the raw text output from the model.
+
+  result = await zerox(file_path = file_path, model = model, output_file_path = output_file_path,
+                      custom_system_prompt = custom_system_prompt, select_pages = select_pages,
+                      # post_process_function = post_process_function, 
+                      **kwargs)
+  return result
 
 
 # run the main function:
@@ -244,10 +253,12 @@ async def zerox(
     file_path: Optional[str] = "",
     maintain_format: bool = False,
     model: str = "gpt-4o-mini",
-    output_dir: Optional[str] = None,
+    output_file_path: Optional[str] = None,
+    page_separator: Optional[str] = None,
     temp_dir: Optional[str] = None,
     custom_system_prompt: Optional[str] = None,
     select_pages: Optional[Union[int, Iterable[int]]] = None,
+    post_process_function: Optional[Callable[[str], str]] = format_markdown,
     **kwargs
 ) -> ZeroxOutput:
   ...
@@ -266,22 +277,25 @@ Parameters
 - **model** (str, optional):
   The model to use for generating completions. Defaults to "gpt-4o-mini".
   Refer to LiteLLM Providers for the correct model name, as it may differ depending on the provider.
-- **output_dir** (Optional[str], optional):
-  The directory to save the markdown output. Defaults to None.
+- **output_file_path** (Optional[str], optional):
+  The path to save the markdown output (e.g., "output.md"). Any required directories will be created. Defaults to None.
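+- **page_separator** (Optional[str], optional):
+  The separator written after each page when saving the output to `output_file_path`. It can include a `{page_no}` placeholder, which is replaced with the page number. Defaults to None, in which case `"\n\n<=== Page {page_no} ===>\n\n"` is used.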
 - **temp_dir** (str, optional):
   The directory to store temporary files, defaults to some named folder in system's temp directory. If already exists, the contents will be deleted before zerox uses it.
 - **custom_system_prompt** (str, optional):
   The system prompt to use for the model, this overrides the default system prompt of zerox.Generally it is not required unless you want some specific behaviour. When set, it will raise a friendly warning. Defaults to None.
 - **select_pages** (Optional[Union[int, Iterable[int]]], optional):
   Pages to process, can be a single page number or an iterable of page numbers, Defaults to None
+- **post_process_function** (Optional[Callable[[str], str]], optional):
+  A function to post-process the text output from the model for each page. It should take a string as input and return a string as output. Defaults to Zerox's `format_markdown` function, which formats the output in markdown. Pass None to skip post-processing.
 - **kwargs** (dict, optional):
   Additional keyword arguments to pass to the litellm.completion method.
   Refer to the LiteLLM Documentation and Completion Input for details.
 
 Returns
-
 - ZeroxOutput:
-  Contains the markdown content generated by the model and also some metadata (refer below).
+  Contains the output content (markdown by default) generated by the model, along with some metadata (refer below).
 
 ### Example Output (Output from "azure/gpt-4o-mini"):
 

diff --git a/py_zerox/pyzerox/core/types.py b/py_zerox/pyzerox/core/types.py
@@ -1,4 +1,5 @@
-from typing import List, Optional, Dict, Any, Union, Iterable
+from typing import List, Optional, Dict, Any, Union, Iterable, Callable
+from ..processor import format_markdown
 from dataclasses import dataclass, field
 
 
@@ -12,11 +13,13 @@ class ZeroxArgs:
     cleanup: bool = True
     concurrency: int = 10
     maintain_format: bool = False
-    model: str = "gpt-4o-mini",
-    output_dir: Optional[str] = None
+    model: str = "gpt-4o-mini"
+    output_file_path: Optional[str] = None
+    page_separator: Optional[str] = None
     temp_dir: Optional[str] = None
     custom_system_prompt: Optional[str] = None
     select_pages: Optional[Union[int, Iterable[int]]] = None
+    post_process_function: Optional[Callable[[str], str]] = format_markdown
     kwargs: Dict[str, Any] = field(default_factory=dict)
 
 @dataclass

diff --git a/py_zerox/pyzerox/core/zerox.py b/py_zerox/pyzerox/core/zerox.py
@@ -2,7 +2,7 @@
 import aioshutil as async_shutil
 import tempfile
 import warnings
-from typing import List, Optional, Union, Iterable
+from typing import List, Optional, Union, Iterable, Callable
 from datetime import datetime
 import aiofiles
 import aiofiles.os as async_os
@@ -15,6 +15,7 @@
     process_page,
     process_pages_in_batches,
     create_selected_pages_pdf,
+    format_markdown,
 )
 from ..errors import FileUnavailable
 from ..constants.messages import Messages
@@ -28,14 +29,16 @@ async def zerox(
     file_path: Optional[str] = "",
     maintain_format: bool = False,
     model: str = "gpt-4o-mini",
-    output_dir: Optional[str] = None,
+    output_file_path: Optional[str] = None,
+    page_separator: Optional[str] = None,
     temp_dir: Optional[str] = None,
     custom_system_prompt: Optional[str] = None,
     select_pages: Optional[Union[int, Iterable[int]]] = None,
+    post_process_function: Optional[Callable[[str], str]] = format_markdown,
     **kwargs
 ) -> ZeroxOutput:
     """
-    API to perform OCR to markdown using Vision models.
+    API to perform OCR to markdown (default) using Vision models.
     Please setup the environment variables for the model and model provider before using this API. Refer: https://docs.litellm.ai/docs/providers
 
     :param cleanup: Whether to cleanup the temporary files after processing, defaults to True
@@ -48,24 +51,28 @@ async def zerox(
     :type maintain_format: bool, optional
     :param model: The model to use for generating completions, defaults to "gpt-4o-mini". Note - Refer: https://docs.litellm.ai/docs/providers to pass correct model name as according to provider it might be different from actual name.
     :type model: str, optional
-    :param output_dir: The directory to save the markdown output, defaults to None
-    :type output_dir: str, optional
+    :param output_file_path: The path to save the output file (Example "output.md"). Any required directories will be created, defaults to None
+    :type output_file_path: str, optional
     :param temp_dir: The directory to store temporary files, defaults to some named folder in system's temp directory. If already exists, the contents will be deleted for zerox uses it.
     :type temp_dir: str, optional
+    :param page_separator: The separator written after each page when saving the output to "output_file_path". It can include a {page_no} placeholder, which is replaced with the page number. When None, "\\n\\n<=== Page {page_no} ===>\\n\\n" is used, defaults to None
+    :type page_separator: str, optional
     :param custom_system_prompt: The system prompt to use for the model, this overrides the default system prompt of zerox. Generally it is not required unless you want some specific behaviour. When set, it will raise a friendly warning, defaults to None
     :type custom_system_prompt: str, optional
     :param select_pages: Pages to process, can be a single page number or an iterable of page numbers, defaults to None
     :type select_pages: int or Iterable[int], optional
+    :param post_process_function: A function to post-process the text output from the model for each page. It should take a string as input and return a string as output, defaults to the "format_markdown" function (zerox's default for markdown formatting). Pass None to skip any post-processing of the model's text output.
+    :type post_process_function: Callable[[str], str], optional
 
     :param kwargs: Additional keyword arguments to pass to the model.completion -> litellm.completion method. Refer: https://docs.litellm.ai/docs/providers and https://docs.litellm.ai/docs/completion/input
-    :return: The markdown content generated by the model.
+    :return: The content generated by the model after Zerox's postprocessing (if provided).
     """
 
 
     input_token_count = 0
     output_token_count = 0
     prior_page = ""
-    aggregated_markdown: List[str] = []
+    aggregated_output: List[str] = []
     start_time = datetime.now()
 
     # File Path Validators
@@ -84,13 +91,15 @@ async def zerox(
         warnings.warn(Messages.MAINTAIN_FORMAT_SELECTED_PAGES_WARNING)
 
     # If select_pages is a single integer, convert it to a list for consistency
-    if isinstance(select_pages, int):
-        select_pages = [select_pages]
-
-    # Sort the pages to maintain consistency
-    select_pages = sorted(select_pages)
+    if select_pages:
+        if isinstance(select_pages, int):
+            select_pages = [select_pages]
+        else:
+            # Sort the pages to maintain consistency
+            select_pages = sorted(list(select_pages))
 
-    # Ensure the output directory exists
+    # Ensure the directory for output_file_path exists
+    output_dir = os.path.dirname(output_file_path) if output_file_path else None
     if output_dir:
         await async_os.makedirs(output_dir, exist_ok=True)
 
@@ -138,10 +147,11 @@ async def zerox(
                     input_token_count,
                     output_token_count,
                     prior_page,
+                    post_process_function,
                 )
 
                 if result:
-                    aggregated_markdown.append(result)
+                    aggregated_output.append(result)
         else:
             results = await process_pages_in_batches(
                 images,
@@ -151,19 +161,30 @@ async def zerox(
                 input_token_count,
                 output_token_count,
                 prior_page,
+                post_process_function,
             )
 
-            aggregated_markdown = [result[0] for result in results if isinstance(result[0], str)]
+            aggregated_output = [result[0] for result in results if isinstance(result[0], str)]
 
             ## add token usage
             input_token_count += sum([result[1] for result in results])
             output_token_count += sum([result[2] for result in results])
 
-        # Write the aggregated markdown to a file
-        if output_dir:
-            result_file_path = os.path.join(output_dir, f"{file_name}.md")
-            async with aiofiles.open(result_file_path, "w") as f:
-                await f.write("\n\n".join(aggregated_markdown))
+        # Write the aggregated output to a file
+        if output_file_path:
+            if not page_separator and not isinstance(page_separator, str):
+                page_separator = "\n\n<=== Page {page_no} ===>\n\n"
+
+            async with aiofiles.open(output_file_path, "w") as f:
+                for i, page_content in enumerate(aggregated_output):
+                    await f.write(page_content)
+
+                    # Replace {page_no} with the actual page number in page_separator
+                    if "{page_no}" in page_separator:
+                        page_no_text = page_separator.format(page_no=(select_pages[i] if select_pages else i + 1))
+                        await f.write(f"{page_no_text}")
+                    else:
+                        await f.write(page_separator)
 
         # Cleanup the downloaded PDF file
         if cleanup and os.path.exists(temp_directory):
@@ -175,16 +196,16 @@ async def zerox(
 
         # Adjusting the formatted_pages logic to account for select_pages to output the correct page numbers
         if select_pages is not None:
-            # Map aggregated markdown to the selected pages
+            # Map aggregated_output to the selected pages
             formatted_pages = [
                         Page(content=content, page=select_pages[i], content_length=len(content))
-                        for i, content in enumerate(aggregated_markdown)
+                        for i, content in enumerate(aggregated_output)
                     ]
         else:
             # Default behavior when no select_pages is provided
             formatted_pages = [
                         Page(content=content, page=i + 1, content_length=len(content))
-                        for i, content in enumerate(aggregated_markdown)
+                        for i, content in enumerate(aggregated_output)
                     ]
 
         return ZeroxOutput(

diff --git a/py_zerox/pyzerox/processor/pdf.py b/py_zerox/pyzerox/processor/pdf.py
@@ -1,7 +1,7 @@
 import logging
 import os
 import asyncio
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Callable
 from pdf2image import convert_from_path
 
 # Package Imports
@@ -40,6 +40,7 @@ async def process_page(
     input_token_count: int = 0,
     output_token_count: int = 0,
     prior_page: str = "",
+    post_process_function: Optional[Callable[[str], str]] = format_markdown,
     semaphore: Optional[asyncio.Semaphore] = None,
 ) -> Tuple[str, int, int, str]:
     """Process a single page of a PDF"""
@@ -54,6 +55,7 @@ async def process_page(
                 input_token_count,
                 output_token_count,
                 prior_page,
+                post_process_function,
             )
 
     image_path = os.path.join(temp_directory, image)
@@ -66,12 +68,18 @@ async def process_page(
             prior_page=prior_page,
         )
 
-        formatted_markdown = format_markdown(completion.content)
+        ## post process the completion
+        if post_process_function:
+            output_text = post_process_function(completion.content)
+        else:
+            ## skip post processing
+            output_text = completion.content
+
         input_token_count += completion.input_tokens
         output_token_count += completion.output_tokens
-        prior_page = formatted_markdown
+        prior_page = output_text
 
-        return formatted_markdown, input_token_count, output_token_count, prior_page
+        return output_text, input_token_count, output_token_count, prior_page
 
     except Exception as error:
         logging.error(f"{Messages.FAILED_TO_PROCESS_IMAGE} Error:{error}")
@@ -86,6 +94,7 @@ async def process_pages_in_batches(
     input_token_count: int = 0,
     output_token_count: int = 0,
     prior_page: str = "",
+    post_process_function: Optional[Callable[[str], str]] = format_markdown,
 ):
     # Create a semaphore to limit the number of concurrent tasks
     semaphore = asyncio.Semaphore(concurrency)
@@ -99,6 +108,7 @@ async def process_pages_in_batches(
             input_token_count,
             output_token_count,
             prior_page,
+            post_process_function,
             semaphore,
         )
         for image in images

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "py-zerox"
-version = "0.0.5"
+version = "0.0.6"
 description = "ocr documents using vision models from all popular providers like OpenAI, Azure OpenAI, Anthropic, AWS Bedrock etc"
 authors = ["wizenheimer","pradhyumna85"]
 license = "MIT"

diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = py-zerox
-version = 0.0.5
+version = 0.0.6
 description = ocr documents using vision models from all popular providers like OpenAI, Azure OpenAI, Anthropic, AWS Bedrock etc
 long_description = file: README.md
 long_description_content_type = text/markdown

diff --git a/setup.py b/setup.py
@@ -19,7 +19,7 @@ def run(self):
     cmdclass={
         "install": InstallSystemDependencies,
     },
-    version="0.0.5",
+    version="0.0.6",
     packages=find_packages(where="py_zerox"),  # Specify the root folder
     package_dir={"": "py_zerox"},  # Map root directory
     include_package_data=True,