Spaces:

rt4u
/

marker

Sleeping

App Files Community

Vik Paruchuri committed on Aug 19

Commit

0c7adea

1 Parent(s): 4f69a6f

Add block ids to html renderer

Browse files

Files changed (6) hide show

marker/processors/llm/llm_table.py +1 -1
marker/renderers/__init__.py +4 -0
marker/renderers/html.py +40 -7
marker/schema/blocks/basetable.py +5 -3
marker/schema/blocks/tablecell.py +4 -0
tests/renderers/test_html_renderer.py +19 -0

marker/processors/llm/llm_table.py CHANGED Viewed

@@ -161,7 +161,7 @@ No corrections needed.
                 batch_bbox[3] = block_image.size[1]
             batch_image = block_image.crop(batch_bbox)
-            block_html = block.format_cells(document, [], batch_cells)
             batch_image = self.handle_image_rotation(batch_cells, batch_image)
             batch_parsed_cells = self.rewrite_single_chunk(
                 page, block, block_html, batch_cells, batch_image

                 batch_bbox[3] = block_image.size[1]
             batch_image = block_image.crop(batch_bbox)
+            block_html = block.format_cells(document, [], None, batch_cells)
             batch_image = self.handle_image_rotation(batch_cells, batch_image)
             batch_parsed_cells = self.rewrite_single_chunk(
                 page, block, block_html, batch_cells, batch_image

marker/renderers/__init__.py CHANGED Viewed

@@ -29,6 +29,9 @@ class BaseRenderer:
     keep_pagefooter_in_output: Annotated[
         bool, "Keep the page footer in the output HTML."
     ] = False
     def __init__(self, config: Optional[BaseModel | dict] = None):
         assign_config(self, config)
@@ -36,6 +39,7 @@ class BaseRenderer:
         self.block_config = {
             "keep_pageheader_in_output": self.keep_pageheader_in_output,
             "keep_pagefooter_in_output": self.keep_pagefooter_in_output,
         }
     def __call__(self, document):

     keep_pagefooter_in_output: Annotated[
         bool, "Keep the page footer in the output HTML."
     ] = False
+    add_block_ids: Annotated[bool, "Whether to add block IDs to the output HTML."] = (
+        False
+    )
     def __init__(self, config: Optional[BaseModel | dict] = None):
         assign_config(self, config)
         self.block_config = {
             "keep_pageheader_in_output": self.keep_pageheader_in_output,
             "keep_pagefooter_in_output": self.keep_pagefooter_in_output,
+            "add_block_ids": self.add_block_ids,
         }
     def __call__(self, document):

marker/renderers/html.py CHANGED Viewed

@@ -47,6 +47,37 @@ class HTMLRenderer(BaseRenderer):
         )
         return cropped
     def extract_html(self, document, document_output, level=0):
         soup = BeautifulSoup(document_output.html, "html.parser")
@@ -69,22 +100,24 @@ class HTMLRenderer(BaseRenderer):
                     image = self.extract_image(document, ref_block_id)
                     image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
                     images[image_name] = image
-                    ref.replace_with(
-                        BeautifulSoup(
-                            f"<p>{content}<img src='{image_name}'></p>", "html.parser"
-                        )
                     )
                 else:
                     # This will be the image description if using llm mode, or empty if not
-                    ref.replace_with(BeautifulSoup(f"{content}", "html.parser"))
             elif ref_block_id.block_type in self.page_blocks:
                 images.update(sub_images)
                 if self.paginate_output:
                     content = f"<div class='page' data-page-id='{ref_block_id.page_id}'>{content}</div>"
-                ref.replace_with(BeautifulSoup(f"{content}", "html.parser"))
             else:
                 images.update(sub_images)
-                ref.replace_with(BeautifulSoup(f"{content}", "html.parser"))
         output = str(soup)
         if level == 0:

         )
         return cropped
+    def insert_block_id(self, soup, block_id: BlockId):
+        """
+        Insert a block ID into the soup as a data attribute.
+        """
+        if block_id.block_type in [BlockTypes.Line, BlockTypes.Span]:
+            return soup
+        if self.add_block_ids:
+            # Find the outermost tag (first tag that isn't a NavigableString)
+            outermost_tag = None
+            for element in soup.contents:
+                if hasattr(element, "name") and element.name:
+                    outermost_tag = element
+                    break
+            # If we found an outermost tag, add the data-block-id attribute
+            if outermost_tag:
+                outermost_tag["data-block-id"] = str(block_id)
+            # If soup only contains text or no tags, wrap in a span
+            elif soup.contents:
+                wrapper = soup.new_tag("span")
+                wrapper["data-block-id"] = str(block_id)
+                contents = list(soup.contents)
+                for content in contents:
+                    content.extract()
+                    wrapper.append(content)
+                soup.append(wrapper)
+        return soup
     def extract_html(self, document, document_output, level=0):
         soup = BeautifulSoup(document_output.html, "html.parser")
                     image = self.extract_image(document, ref_block_id)
                     image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
                     images[image_name] = image
+                    element = BeautifulSoup(
+                        f"<p>{content}<img src='{image_name}'></p>", "html.parser"
                     )
+                    ref.replace_with(self.insert_block_id(element, ref_block_id))
                 else:
                     # This will be the image description if using llm mode, or empty if not
+                    element = BeautifulSoup(f"{content}", "html.parser")
+                    ref.replace_with(self.insert_block_id(element, ref_block_id))
             elif ref_block_id.block_type in self.page_blocks:
                 images.update(sub_images)
                 if self.paginate_output:
                     content = f"<div class='page' data-page-id='{ref_block_id.page_id}'>{content}</div>"
+                element = BeautifulSoup(f"{content}", "html.parser")
+                ref.replace_with(self.insert_block_id(element, ref_block_id))
             else:
                 images.update(sub_images)
+                element = BeautifulSoup(f"{content}", "html.parser")
+                ref.replace_with(self.insert_block_id(element, ref_block_id))
         output = str(soup)
         if level == 0:

marker/schema/blocks/basetable.py CHANGED Viewed

@@ -11,7 +11,7 @@ class BaseTable(Block):
     @staticmethod
     def format_cells(
-        document, child_blocks, child_cells: List[TableCell] | None = None
     ):
         if child_cells is None:
             child_cells: List[TableCell] = [
@@ -28,7 +28,9 @@ class BaseTable(Block):
             )
             html_repr += "<tr>"
             for cell in row_cells:
-                html_repr += cell.assemble_html(document, child_blocks, None, None)
             html_repr += "</tr>"
         html_repr += "</tbody></table>"
         return html_repr
@@ -56,7 +58,7 @@ class BaseTable(Block):
             return template + self.html
         elif len(child_blocks) > 0 and BlockTypes.TableCell in child_block_types:
             # Table processor
-            return template + self.format_cells(document, child_blocks)
         else:
             # Default text lines and spans
             return f"<p>{template}</p>"

     @staticmethod
     def format_cells(
+        document, child_blocks, block_config, child_cells: List[TableCell] | None = None
     ):
         if child_cells is None:
             child_cells: List[TableCell] = [
             )
             html_repr += "<tr>"
             for cell in row_cells:
+                html_repr += cell.assemble_html(
+                    document, child_blocks, None, block_config
+                )
             html_repr += "</tr>"
         html_repr += "</tbody></table>"
         return html_repr
             return template + self.html
         elif len(child_blocks) > 0 and BlockTypes.TableCell in child_block_types:
             # Table processor
+            return template + self.format_cells(document, child_blocks, block_config)
         else:
             # Default text lines and spans
             return f"<p>{template}</p>"

marker/schema/blocks/tablecell.py CHANGED Viewed

@@ -21,12 +21,16 @@ class TableCell(Block):
     def assemble_html(
         self, document, child_blocks, parent_structure=None, block_config=None
     ):
         tag_cls = "th" if self.is_header else "td"
         tag = f"<{tag_cls}"
         if self.rowspan > 1:
             tag += f" rowspan={self.rowspan}"
         if self.colspan > 1:
             tag += f" colspan={self.colspan}"
         if self.text_lines is None:
             self.text_lines = []
         text = "<br>".join(self.text_lines)

     def assemble_html(
         self, document, child_blocks, parent_structure=None, block_config=None
     ):
+        add_cell_id = block_config and block_config.get("add_block_ids", False)
         tag_cls = "th" if self.is_header else "td"
         tag = f"<{tag_cls}"
         if self.rowspan > 1:
             tag += f" rowspan={self.rowspan}"
         if self.colspan > 1:
             tag += f" colspan={self.colspan}"
+        if add_cell_id:
+            tag += f' data-block-id="{self.id}"'
         if self.text_lines is None:
             self.text_lines = []
         text = "<br>".join(self.text_lines)

tests/renderers/test_html_renderer.py ADDED Viewed

	@@ -0,0 +1,19 @@

+import pytest
+from marker.renderers.html import HTMLRenderer
+@pytest.mark.config(
+    {
+        "page_range": [0],
+        "disable_ocr": True,
+        "add_block_ids": True,
+        "paginate_output": True,
+    }
+)
+def test_html_renderer_block_ids(pdf_document, config):
+    renderer = HTMLRenderer(config)
+    html = renderer(pdf_document).html
+    # Verify some block IDs are present
+    assert "/page/0/Text/1" in html