Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

Vik Paruchuri committed on Nov 18, 2024

Commit

9ff9e66

1 Parent(s): cf0611c

Output images, clean up other output formats

Browse files

Files changed (20) hide show

.gitignore +1 -0
marker/v2/converters/pdf.py +16 -4
marker/v2/processors/__init__.py +2 -2
marker/v2/processors/equation.py +2 -2
marker/v2/processors/table.py +2 -2
marker/v2/renderers/__init__.py +1 -0
marker/v2/renderers/html.py +35 -8
marker/v2/renderers/markdown.py +20 -5
marker/v2/schema/blocks/base.py +25 -0
marker/v2/schema/blocks/equation.py +1 -1
marker/v2/schema/blocks/figure.py +1 -1
marker/v2/schema/blocks/form.py +9 -0
marker/v2/schema/blocks/pagefooter.py +5 -0
marker/v2/schema/blocks/pageheader.py +5 -0
marker/v2/schema/blocks/picture.py +1 -1
marker/v2/schema/blocks/text.py +0 -1
marker/v2/schema/blocks/toc.py +9 -0
marker/v2/schema/document.py +6 -1
marker/v2/schema/groups/list.py +1 -1
marker/v2/schema/text/line.py +2 -1

.gitignore CHANGED Viewed

@@ -10,6 +10,7 @@ report.json
 benchmark_data
 debug_data
 temp.md
 # Byte-compiled / optimized / DLL files
 __pycache__/

 benchmark_data
 debug_data
 temp.md
+temp
 # Byte-compiled / optimized / DLL files
 __pycache__/

marker/v2/converters/pdf.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import tempfile
 from typing import List, Optional
 import datasets
 from pydantic import BaseModel
@@ -43,9 +45,14 @@ class PdfConverter(BaseConverter):
         return renderer(document)
-if __name__ == "__main__":
     dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
-    idx = dataset['filename'].index('adversarial.pdf')
     with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
         temp_pdf.write(dataset['pdf'][idx])
@@ -54,7 +61,12 @@ if __name__ == "__main__":
         converter = PdfConverter()
         rendered = converter(temp_pdf.name)
-        with open("temp.md", "w+") as f:
-            f.write(rendered)

+import os
 import tempfile
 from typing import List, Optional
+import click
 import datasets
 from pydantic import BaseModel
         return renderer(document)
+@click.command()
+@click.option("--output", type=click.Path(exists=False), required=False, default="temp")
+@click.option("--fname", type=str, default="adversarial.pdf")
+def main(output: str, fname: str):
     dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
+    idx = dataset['filename'].index(fname)
+    out_filename = fname.rsplit(".", 1)[0] + ".md"
+    os.makedirs(output, exist_ok=True)
     with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
         temp_pdf.write(dataset['pdf'][idx])
         converter = PdfConverter()
         rendered = converter(temp_pdf.name)
+        with open(os.path.join(output, out_filename), "w+") as f:
+            f.write(rendered.markdown)
+        for img_name, img in rendered.images.items():
+            img.save(os.path.join(output, img_name))
+if __name__ == "__main__":
+    main()

marker/v2/processors/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Optional
 from pydantic import BaseModel
@@ -7,7 +7,7 @@ from marker.v2.util import assign_config
 class BaseProcessor:
-    block_type: str | None = None # What block type this processor is responsible for
     def __init__(self, config: Optional[BaseModel | dict] = None):
         assign_config(self, config)

+from typing import Optional, Tuple
 from pydantic import BaseModel
 class BaseProcessor:
+    block_types: Tuple[str] | None = None # What block types this processor is responsible for
     def __init__(self, config: Optional[BaseModel | dict] = None):
         assign_config(self, config)

marker/v2/processors/equation.py CHANGED Viewed

@@ -11,7 +11,7 @@ from texify.inference import batch_inference
 class EquationProcessor(BaseProcessor):
-    block_type = "Equation"
     model_max_length = 384
     batch_size = None
     token_buffer = 256
@@ -26,7 +26,7 @@ class EquationProcessor(BaseProcessor):
         for page in document.pages:
             for block in page.children:
-                if block.block_type != self.block_type:
                     continue
                 image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.lowres_image.size)
                 image = page.lowres_image.crop(image_poly.bbox).convert("RGB")

 class EquationProcessor(BaseProcessor):
+    block_types = ("Equation", )
     model_max_length = 384
     batch_size = None
     token_buffer = 256
         for page in document.pages:
             for block in page.children:
+                if block.block_type not in self.block_types:
                     continue
                 image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.lowres_image.size)
                 image = page.lowres_image.crop(image_poly.bbox).convert("RGB")

marker/v2/processors/table.py CHANGED Viewed

@@ -12,7 +12,7 @@ from marker.v2.schema.document import Document
 class TableProcessor(BaseProcessor):
-    block_type = BlockTypes.Table
     detect_boxes = False
     detector_batch_size = None
     table_rec_batch_size = None
@@ -31,7 +31,7 @@ class TableProcessor(BaseProcessor):
         table_data = []
         for page in document.pages:
             for block in page.children:
-                if block.block_type != self.block_type:
                     continue
                 image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size)

 class TableProcessor(BaseProcessor):
+    block_types = (BlockTypes.Table, BlockTypes.TableOfContents, BlockTypes.Form)
     detect_boxes = False
     detector_batch_size = None
     table_rec_batch_size = None
         table_data = []
         for page in document.pages:
             for block in page.children:
+                if block.block_type not in self.block_types:
                     continue
                 image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size)

marker/v2/renderers/__init__.py CHANGED Viewed

@@ -3,6 +3,7 @@ from typing import Optional
 from pydantic import BaseModel
 class BaseRenderer:
     block_type: str | None = None

 from pydantic import BaseModel
 class BaseRenderer:
     block_type: str | None = None

marker/v2/renderers/html.py CHANGED Viewed

@@ -1,33 +1,60 @@
 from bs4 import BeautifulSoup
 from marker.v2.renderers import BaseRenderer
 from marker.v2.schema import BlockTypes
 class HTMLRenderer(BaseRenderer):
     remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
     image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
     def extract_html(self, document, document_output):
         soup = BeautifulSoup(document_output.html, 'html.parser')
         content_refs = soup.find_all('content-ref')
-        ref_block_type = None
         for ref in content_refs:
             src = ref.get('src')
             for item in document_output.children:
                 if item.id == src:
-                    content = self.extract_html(document, item)
-                    ref_block_type = item.id.block_type
                     break
-            if ref_block_type in self.remove_blocks:
                 ref.replace_with('')
             else:
                 ref.replace_with(BeautifulSoup(f"<div>{content}</div>", 'html.parser'))
-        return str(soup)
-    def __call__(self, document):
         document_output = document.render()
-        full_html = self.extract_html(document, document_output)
-        return full_html

 from bs4 import BeautifulSoup
+from pydantic import BaseModel
 from marker.v2.renderers import BaseRenderer
 from marker.v2.schema import BlockTypes
+from marker.v2.schema.blocks import BlockId
+class HTMLOutput(BaseModel):
+    html: str
+    images: dict
 class HTMLRenderer(BaseRenderer):
     remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
     image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
+    def extract_image(self, document, image_id):
+        image_block = document.get_block(image_id)
+        page = document.get_page(image_block.page_id)
+        page_img = page.highres_image
+        image_box = image_block.polygon.rescale(page.polygon.size, page_img.size)
+        cropped = page_img.crop(image_box.bbox)
+        return cropped
     def extract_html(self, document, document_output):
         soup = BeautifulSoup(document_output.html, 'html.parser')
         content_refs = soup.find_all('content-ref')
+        ref_block_id = None
+        images = {}
         for ref in content_refs:
             src = ref.get('src')
+            sub_images = {}
             for item in document_output.children:
                 if item.id == src:
+                    content, sub_images = self.extract_html(document, item)
+                    ref_block_id: BlockId = item.id
                     break
+            if ref_block_id.block_type in self.remove_blocks:
                 ref.replace_with('')
+            elif ref_block_id.block_type in self.image_blocks:
+                image = self.extract_image(document, ref_block_id)
+                image_name = f"{ref_block_id.to_path()}.png"
+                images[image_name] = image
+                ref.replace_with(BeautifulSoup(f"<p><img src='{image_name}'></p>", 'html.parser'))
             else:
+                images.update(sub_images)
                 ref.replace_with(BeautifulSoup(f"<div>{content}</div>", 'html.parser'))
+        return str(soup), images
+    def __call__(self, document) -> HTMLOutput:
         document_output = document.render()
+        full_html, images = self.extract_html(document, document_output)
+        return HTMLOutput(
+            html=full_html,
+            images=images,
+        )

marker/v2/renderers/markdown.py CHANGED Viewed

@@ -1,17 +1,32 @@
-from markdownify import markdownify
 from marker.v2.renderers.html import HTMLRenderer
 class MarkdownRenderer(HTMLRenderer):
-    def __call__(self, document):
         document_output = document.render()
-        full_html = self.extract_html(document, document_output)
-        return markdownify(
-            full_html,
             heading_style="ATX",
             bullets="-",
             escape_misc=False,
             escape_underscores=False
         )

+from markdownify import markdownify, MarkdownConverter
+from pydantic import BaseModel
 from marker.v2.renderers.html import HTMLRenderer
+class Markdownify(MarkdownConverter):
+    pass
+class MarkdownOutput(BaseModel):
+    markdown: str
+    images: dict
 class MarkdownRenderer(HTMLRenderer):
+    def __call__(self, document) -> MarkdownOutput:
         document_output = document.render()
+        full_html, images = self.extract_html(document, document_output)
+        md_cls = Markdownify(
             heading_style="ATX",
             bullets="-",
             escape_misc=False,
             escape_underscores=False
         )
+        markdown = md_cls.convert(full_html)
+        return MarkdownOutput(
+            markdown=markdown,
+            images=images
+        )

marker/v2/schema/blocks/base.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from __future__ import annotations
 from typing import Optional, List, Any
 from pydantic import BaseModel, ConfigDict, field_validator
@@ -44,6 +45,28 @@ class BlockId(BaseModel):
             raise ValueError(f"Invalid block type: {v}")
         return v
 class Block(BaseModel):
     polygon: PolygonBox
@@ -105,6 +128,8 @@ class Block(BaseModel):
         template = ""
         for c in child_blocks:
             template += f"<content-ref src='{c.id}'></content-ref>"
         return template
     def render(self, document, parent_structure):

 from __future__ import annotations
 from typing import Optional, List, Any
+import re
 from pydantic import BaseModel, ConfigDict, field_validator
             raise ValueError(f"Invalid block type: {v}")
         return v
+    def to_path(self):
+        return str(self).replace('/', '_')
+def merge_consecutive_tags(html, tag):
+    """Merge adjacent ``</tag>...<tag>`` pairs into one continuous element.
+
+    A closing tag followed (optionally after whitespace) by an opening tag of
+    the same name is removed, keeping only the whitespace that sat between
+    them, e.g. ``<b>a</b> <b>b</b>`` becomes ``<b>a b</b>``.
+
+    Args:
+        html: HTML fragment to process; falsy values are returned unchanged.
+        tag: Bare tag name without angle brackets (for example ``'b'``).
+
+    Returns:
+        The fragment with every run of consecutive same-name tags merged.
+    """
+    if not html:
+        return html
+    def replace_with_space(match):
+        # Group 1 is the whitespace between the two tags ('' when they touch).
+        return match.group(1)
+    # The whitespace must be a capture group: the replacement callback reads
+    # it back, and a pattern without groups gives the callback nothing to read.
+    # The tag name is escaped so it is always matched literally.
+    escaped_tag = re.escape(tag)
+    pattern = fr'</{escaped_tag}>(\s*)<{escaped_tag}>'
+    # Repeat until stable, because removing one pair can create a new pair.
+    while True:
+        new_merged = re.sub(pattern, replace_with_space, html)
+        if new_merged == html:
+            break
+        html = new_merged
+    return html
 class Block(BaseModel):
     polygon: PolygonBox
         template = ""
         for c in child_blocks:
             template += f"<content-ref src='{c.id}'></content-ref>"
+        template = merge_consecutive_tags(template, 'b')
+        template = merge_consecutive_tags(template, 'i')
         return template
     def render(self, document, parent_structure):

marker/v2/schema/blocks/equation.py CHANGED Viewed

@@ -6,4 +6,4 @@ class Equation(Block):
     latex: str | None = None
     def assemble_html(self, child_blocks, parent_structure=None):
-        return f"<div class='math'>{self.latex}</div>"

     latex: str | None = None
     def assemble_html(self, child_blocks, parent_structure=None):
+        return f"<p><math>{self.latex}</math></p>"

marker/v2/schema/blocks/figure.py CHANGED Viewed

@@ -5,4 +5,4 @@ class Figure(Block):
     block_type: str = "Figure"
     def assemble_html(self, child_blocks, parent_structure):
-        return f"Image {self.block_id}"

     block_type: str = "Figure"
     def assemble_html(self, child_blocks, parent_structure):
+        return f"<p>Image {self.block_id}</p>"

marker/v2/schema/blocks/form.py CHANGED Viewed

@@ -1,5 +1,14 @@
 from marker.v2.schema.blocks import Block
 class Form(Block):
     block_type: str = "Form"

+from typing import List
+from tabled.formats import html_format
+from tabled.schema import SpanTableCell
 from marker.v2.schema.blocks import Block
 class Form(Block):
     block_type: str = "Form"
+    cells: List[SpanTableCell] | None = None
+    def assemble_html(self, child_blocks, parent_structure=None):
+        return html_format(self.cells)

marker/v2/schema/blocks/pagefooter.py CHANGED Viewed

@@ -3,3 +3,8 @@ from marker.v2.schema.blocks import Block
 class PageFooter(Block):
     block_type: str = "PageFooter"

 class PageFooter(Block):
     block_type: str = "PageFooter"
+    def assemble_html(self, child_blocks, parent_structure):
+        template = super().assemble_html(child_blocks, parent_structure)
+        template = template.replace("\n", " ")
+        return f"<p>{template}</p>"

marker/v2/schema/blocks/pageheader.py CHANGED Viewed

@@ -3,3 +3,8 @@ from marker.v2.schema.blocks import Block
 class PageHeader(Block):
     block_type: str = "PageHeader"

 class PageHeader(Block):
     block_type: str = "PageHeader"
+    def assemble_html(self, child_blocks, parent_structure):
+        template = super().assemble_html(child_blocks, parent_structure)
+        template = template.replace("\n", " ")
+        return f"<p>{template}</p>"

marker/v2/schema/blocks/picture.py CHANGED Viewed

@@ -5,4 +5,4 @@ class Picture(Block):
     block_type: str = "Picture"
     def assemble_html(self, child_blocks, parent_structure):
-        return f"Image {self.block_id}"

     block_type: str = "Picture"
     def assemble_html(self, child_blocks, parent_structure):
+        return f"<p>Image {self.block_id}</p>"

marker/v2/schema/blocks/text.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from marker.v2.schema.blocks import Block
 class Text(Block):
     block_type: str = "Text"

 from marker.v2.schema.blocks import Block
 class Text(Block):
     block_type: str = "Text"

marker/v2/schema/blocks/toc.py CHANGED Viewed

@@ -1,5 +1,14 @@
 from marker.v2.schema.blocks import Block
 class TableOfContents(Block):
     block_type: str = "TableOfContents"

+from typing import List
+from tabled.formats import html_format
+from tabled.schema import SpanTableCell
 from marker.v2.schema.blocks import Block
 class TableOfContents(Block):
     block_type: str = "TableOfContents"
+    cells: List[SpanTableCell] | None = None
+    def assemble_html(self, child_blocks, parent_structure=None):
+        return html_format(self.cells)

marker/v2/schema/document.py CHANGED Viewed

@@ -20,12 +20,17 @@ class Document(BaseModel):
     block_type: str = "Document"
     def get_block(self, block_id: BlockId):
-        page = [p for p in self.pages if p.page_id == block_id.page_id][0]
         block = page.get_block(block_id)
         if block:
             return block
         return None
     def assemble_html(self, child_blocks):
         template = ""
         for c in child_blocks:

     block_type: str = "Document"
     def get_block(self, block_id: BlockId):
+        page = self.get_page(block_id.page_id)
         block = page.get_block(block_id)
         if block:
             return block
         return None
+    def get_page(self, page_id):
+        page = self.pages[page_id]
+        assert page.page_id == page_id, "Mismatch between page_id and page index"
+        return page
     def assemble_html(self, child_blocks):
         template = ""
         for c in child_blocks:

marker/v2/schema/groups/list.py CHANGED Viewed

@@ -6,4 +6,4 @@ class ListGroup(Block):
     def assemble_html(self, child_blocks, parent_structure):
         template = super().assemble_html(child_blocks, parent_structure)
-        return f"<ul>{template}</ul>"

     def assemble_html(self, child_blocks, parent_structure):
         template = super().assemble_html(child_blocks, parent_structure)
+        return f"<p><ul>{template}</ul></p>"

marker/v2/schema/text/line.py CHANGED Viewed

@@ -25,7 +25,8 @@ def strip_trailing_hyphens(line_text, next_line_text, line_html) -> str:
     next_line_starts_lowercase = regex.match(rf"^\s?[{lowercase_letters}]", next_line_text)
     if hyphen_regex.match(line_text) and next_line_starts_lowercase:
-        return replace_last(line_html, rf'[{HYPHENS}]', "")
     return line_html

     next_line_starts_lowercase = regex.match(rf"^\s?[{lowercase_letters}]", next_line_text)
     if hyphen_regex.match(line_text) and next_line_starts_lowercase:
+        line_html = replace_last(line_html, rf'[{HYPHENS}]', "")
     return line_html