Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

Vik Paruchuri committed on Oct 25, 2024

Commit

a800c59

1 Parent(s): d440114

Flatten pdfs, fix page separators

Browse files

Files changed (8) hide show

README.md +1 -1
marker/pdf/extract_text.py +1 -1
marker/postprocessors/markdown.py +40 -20
marker/schema/merged.py +2 -1
marker/settings.py +2 -1
marker/tables/table.py +1 -0
poetry.lock +4 -4
pyproject.toml +2 -2

README.md CHANGED Viewed

@@ -212,7 +212,7 @@ Set `DEBUG=true` to save data to the `debug` subfolder in the marker root direct
 These settings can improve/change output quality:
 - `OCR_ALL_PAGES` will force OCR across the document.  Many PDFs have bad text embedded due to older OCR engines being used.
-- `PAGINATE_OUTPUT` will put a horizontal rule between pages.  Default: False.
 - `EXTRACT_IMAGES` will extract images and save separately.  Default: True.
 - `BAD_SPAN_TYPES` specifies layout blocks to remove from the markdown output.

 These settings can improve/change output quality:
 - `OCR_ALL_PAGES` will force OCR across the document.  Many PDFs have bad text embedded due to older OCR engines being used.
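+- `PAGINATE_OUTPUT` will put a horizontal rule at the start of every page, so consecutive pages are separated by one.  Default: False.  The horizontal rule will be `\n\n`, then `{PAGE_NUMBER}` (the page number wrapped in curly braces), then 48 single dashes `-`, then `\n\n`.  The part after the page number (the dashes and trailing newlines) can be configured via the `PAGE_SEPARATOR` setting.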
 - `EXTRACT_IMAGES` will extract images and save separately.  Default: True.
 - `BAD_SPAN_TYPES` specifies layout blocks to remove from the markdown output.

marker/pdf/extract_text.py CHANGED Viewed

@@ -92,7 +92,7 @@ def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Opt
     page_range = range(start_page, start_page + max_pages)
-    char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=False, workers=settings.PDFTEXT_CPU_WORKERS)
     marker_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(char_blocks)]
     return marker_blocks, toc

     page_range = range(start_page, start_page + max_pages)
+    char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=False, workers=settings.PDFTEXT_CPU_WORKERS, flatten_pdf=settings.FLATTEN_PDF)
     marker_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(char_blocks)]
     return marker_blocks, toc

marker/postprocessors/markdown.py CHANGED Viewed

@@ -64,11 +64,19 @@ def merge_spans(pages: List[Page]) -> List[List[MergedBlock]]:
             if len(block_lines) > 0:
                 page_blocks.append(MergedBlock(
                     lines=block_lines,
-                    pnum=block.pnum,
                     bbox=block.bbox,
                     block_type=block.block_type,
                     heading_level=block.heading_level
                 ))
         merged_blocks.append(page_blocks)
     return merged_blocks
@@ -139,9 +147,6 @@ def block_separator(prev_block: FullyMergedBlock, block: FullyMergedBlock):
     if prev_block.block_type == "Text":
         sep = "\n\n"
-    if prev_block.page_end:
-        sep = settings.PAGE_SEPARATOR
     return sep + block.text
@@ -152,8 +157,30 @@ def merge_lines(blocks: List[List[MergedBlock]], max_block_gap=15):
     block_text = ""
     block_type = ""
     prev_heading_level = None
     for idx, page in enumerate(blocks):
         for block in page:
             block_type = block.block_type
             if (block_type != prev_type and prev_type) or (block.heading_level != prev_heading_level and prev_heading_level):
@@ -161,13 +188,15 @@ def merge_lines(blocks: List[List[MergedBlock]], max_block_gap=15):
                     FullyMergedBlock(
                         text=block_surround(block_text, prev_type, prev_heading_level),
                         block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE,
-                        page_end=False
                     )
                 )
                 block_text = ""
             prev_type = block_type
             prev_heading_level = block.heading_level
             # Join lines in the block together properly
             for i, line in enumerate(block.lines):
                 line_height = line.bbox[3] - line.bbox[1]
@@ -181,28 +210,17 @@ def merge_lines(blocks: List[List[MergedBlock]], max_block_gap=15):
                 else:
                     block_text = line.text
-        # Force blocks to end at page boundaries
-        if settings.PAGINATE_OUTPUT:
-            text_blocks.append(
-                FullyMergedBlock(
-                    text=block_surround(block_text, prev_type, prev_heading_level),
-                    block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE,
-                    page_end=True
-                )
-            )
-            block_text = ""
     # Append the final block
     text_blocks.append(
         FullyMergedBlock(
             text=block_surround(block_text, prev_type, prev_heading_level),
             block_type=block_type if block_type else settings.DEFAULT_BLOCK_TYPE,
-            page_end=False
         )
     )
-    text_blocks = [block for block in text_blocks if block.text.strip()]
     return text_blocks
@@ -210,7 +228,9 @@ def get_full_text(text_blocks):
     full_text = ""
     prev_block = None
     for block in text_blocks:
-        if prev_block:
             full_text += block_separator(prev_block, block)
         else:
             full_text += block.text

             if len(block_lines) > 0:
                 page_blocks.append(MergedBlock(
                     lines=block_lines,
+                    pnum=page.pnum,
                     bbox=block.bbox,
                     block_type=block.block_type,
                     heading_level=block.heading_level
                 ))
+        if len(page_blocks) == 0:
+            page_blocks.append(MergedBlock(
+                lines=[],
+                pnum=page.pnum,
+                bbox=page.bbox,
+                block_type="Text",
+                heading_level=None
+            ))
         merged_blocks.append(page_blocks)
     return merged_blocks
     if prev_block.block_type == "Text":
         sep = "\n\n"
     return sep + block.text
     block_text = ""
     block_type = ""
     prev_heading_level = None
+    pnum = None
     for idx, page in enumerate(blocks):
+        # Insert pagination at every page boundary
+        if settings.PAGINATE_OUTPUT:
+            if block_text:
+                text_blocks.append(
+                    FullyMergedBlock(
+                        text=block_surround(block_text, prev_type, prev_heading_level),
+                        block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE,
+                        page_start=False,
+                        pnum=pnum
+                    )
+                )
+                block_text = ""
+            text_blocks.append(
+                FullyMergedBlock(
+                    text="",
+                    block_type="Text",
+                    page_start=True,
+                    pnum=page[0].pnum
+                )
+            )
         for block in page:
             block_type = block.block_type
             if (block_type != prev_type and prev_type) or (block.heading_level != prev_heading_level and prev_heading_level):
                     FullyMergedBlock(
                         text=block_surround(block_text, prev_type, prev_heading_level),
                         block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE,
+                        page_start=False,
+                        pnum=block.pnum
                     )
                 )
                 block_text = ""
             prev_type = block_type
             prev_heading_level = block.heading_level
+            pnum = block.pnum
             # Join lines in the block together properly
             for i, line in enumerate(block.lines):
                 line_height = line.bbox[3] - line.bbox[1]
                 else:
                     block_text = line.text
     # Append the final block
     text_blocks.append(
         FullyMergedBlock(
             text=block_surround(block_text, prev_type, prev_heading_level),
             block_type=block_type if block_type else settings.DEFAULT_BLOCK_TYPE,
+            page_start=False,
+            pnum=pnum
         )
     )
+    text_blocks = [block for block in text_blocks if (block.text.strip() or block.page_start)]
     return text_blocks
     full_text = ""
     prev_block = None
     for block in text_blocks:
+        if block.page_start:
+            full_text += "\n\n{" + str(block.pnum) + "}" + settings.PAGE_SEPARATOR
+        elif prev_block:
             full_text += block_separator(prev_block, block)
         else:
             full_text += block.text

marker/schema/merged.py CHANGED Viewed

@@ -25,4 +25,5 @@ class MergedBlock(BboxElement):
 class FullyMergedBlock(BaseModel):
     text: str
     block_type: str
-    page_end: bool

 class FullyMergedBlock(BaseModel):
     text: str
     block_type: str
+    page_start: bool
+    pnum: int | None

marker/settings.py CHANGED Viewed

@@ -14,6 +14,7 @@ class Settings(BaseSettings):
     EXTRACT_IMAGES: bool = True # Extract images from pdfs and save them
     PAGINATE_OUTPUT: bool = False # Paginate output markdown
     BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
     @computed_field
     @property
@@ -88,7 +89,7 @@ class Settings(BaseSettings):
     HEADING_DEFAULT_LEVEL: int = 2
     # Output
-    PAGE_SEPARATOR: str = "\n\n" + "-" * 48 + "\n\n"
     # Debug
     DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data")

     EXTRACT_IMAGES: bool = True # Extract images from pdfs and save them
     PAGINATE_OUTPUT: bool = False # Paginate output markdown
     BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    FLATTEN_PDF: bool = True # Pull form field values into the PDF before converting to markdown
     @computed_field
     @property
     HEADING_DEFAULT_LEVEL: int = 2
     # Output
+    PAGE_SEPARATOR: str = "-" * 48 + "\n\n"
     # Debug
     DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data")

marker/tables/table.py CHANGED Viewed

@@ -73,6 +73,7 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
         fname,
         doc_idxs,
         [hr for i, hr in enumerate(img_sizes) if i in table_idxs],
     )
     text_lines = []
     out_img_sizes = []

         fname,
         doc_idxs,
         [hr for i, hr in enumerate(img_sizes) if i in table_idxs],
+        # Add flatten pdf here
     )
     text_lines = []
     out_img_sizes = []

poetry.lock CHANGED Viewed

@@ -4175,13 +4175,13 @@ snowflake = ["snowflake-connector-python (>=2.8.0)", "snowflake-snowpark-python[
 [[package]]
 name = "surya-ocr"
-version = "0.6.10"
 description = "OCR, layout, reading order, and table recognition in 90+ languages"
 optional = false
 python-versions = "<4.0,>=3.10"
 files = [
-    {file = "surya_ocr-0.6.10-py3-none-any.whl", hash = "sha256:e22038d226d73bead781abda761a3813bacb1261f47996a00f5679686e86434e"},
-    {file = "surya_ocr-0.6.10.tar.gz", hash = "sha256:7867dd02242a67e8d632d3f1343c62eb16e2068a98e36612dce1ec40065ff5b5"},
 ]
 [package.dependencies]
@@ -5076,4 +5076,4 @@ propcache = ">=0.2.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "380f95b398ed6864345aa1ed6d5357bfef0045cfee6bf32450d71e6e05ec079f"

 [[package]]
 name = "surya-ocr"
+version = "0.6.11"
 description = "OCR, layout, reading order, and table recognition in 90+ languages"
 optional = false
 python-versions = "<4.0,>=3.10"
 files = [
+    {file = "surya_ocr-0.6.11-py3-none-any.whl", hash = "sha256:9b0c8638feda3f0f9db73a2ebceccdcbf5fdbb4cef0102b8d19837c455799347"},
+    {file = "surya_ocr-0.6.11.tar.gz", hash = "sha256:d1415fcceae30cd44b08e8012d810efef51538dcf8d902fad035189b6219ee48"},
 ]
 [package.dependencies]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
+content-hash = "e60697c44fdc30b1d5b48e5f4077ac4c65ff5844a49a4057f236dc6933a56dbb"

pyproject.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "0.3.8"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"
@@ -32,7 +32,7 @@ tabulate = "^0.9.0"
 ftfy = "^6.1.1"
 texify = "^0.2.0"
 rapidfuzz = "^3.8.1"
-surya-ocr = "^0.6.10"
 filetype = "^1.2.0"
 regex = "^2024.4.28"
 pdftext = "^0.3.17"

 [tool.poetry]
 name = "marker-pdf"
+version = "0.3.9"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"
 ftfy = "^6.1.1"
 texify = "^0.2.0"
 rapidfuzz = "^3.8.1"
+surya-ocr = "^0.6.11"
 filetype = "^1.2.0"
 regex = "^2024.4.28"
 pdftext = "^0.3.17"