Vik Paruchuri
committed on
Commit
·
0c7adea
1
Parent(s):
4f69a6f
Add block ids to html renderer
Browse files
marker/processors/llm/llm_table.py
CHANGED
|
@@ -161,7 +161,7 @@ No corrections needed.
|
|
| 161 |
batch_bbox[3] = block_image.size[1]
|
| 162 |
|
| 163 |
batch_image = block_image.crop(batch_bbox)
|
| 164 |
-
block_html = block.format_cells(document, [], batch_cells)
|
| 165 |
batch_image = self.handle_image_rotation(batch_cells, batch_image)
|
| 166 |
batch_parsed_cells = self.rewrite_single_chunk(
|
| 167 |
page, block, block_html, batch_cells, batch_image
|
|
|
|
| 161 |
batch_bbox[3] = block_image.size[1]
|
| 162 |
|
| 163 |
batch_image = block_image.crop(batch_bbox)
|
| 164 |
+
block_html = block.format_cells(document, [], None, batch_cells)
|
| 165 |
batch_image = self.handle_image_rotation(batch_cells, batch_image)
|
| 166 |
batch_parsed_cells = self.rewrite_single_chunk(
|
| 167 |
page, block, block_html, batch_cells, batch_image
|
marker/renderers/__init__.py
CHANGED
|
@@ -29,6 +29,9 @@ class BaseRenderer:
|
|
| 29 |
keep_pagefooter_in_output: Annotated[
|
| 30 |
bool, "Keep the page footer in the output HTML."
|
| 31 |
] = False
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
def __init__(self, config: Optional[BaseModel | dict] = None):
|
| 34 |
assign_config(self, config)
|
|
@@ -36,6 +39,7 @@ class BaseRenderer:
|
|
| 36 |
self.block_config = {
|
| 37 |
"keep_pageheader_in_output": self.keep_pageheader_in_output,
|
| 38 |
"keep_pagefooter_in_output": self.keep_pagefooter_in_output,
|
|
|
|
| 39 |
}
|
| 40 |
|
| 41 |
def __call__(self, document):
|
|
|
|
| 29 |
keep_pagefooter_in_output: Annotated[
|
| 30 |
bool, "Keep the page footer in the output HTML."
|
| 31 |
] = False
|
| 32 |
+
add_block_ids: Annotated[bool, "Whether to add block IDs to the output HTML."] = (
|
| 33 |
+
False
|
| 34 |
+
)
|
| 35 |
|
| 36 |
def __init__(self, config: Optional[BaseModel | dict] = None):
|
| 37 |
assign_config(self, config)
|
|
|
|
| 39 |
self.block_config = {
|
| 40 |
"keep_pageheader_in_output": self.keep_pageheader_in_output,
|
| 41 |
"keep_pagefooter_in_output": self.keep_pagefooter_in_output,
|
| 42 |
+
"add_block_ids": self.add_block_ids,
|
| 43 |
}
|
| 44 |
|
| 45 |
def __call__(self, document):
|
marker/renderers/html.py
CHANGED
|
@@ -47,6 +47,37 @@ class HTMLRenderer(BaseRenderer):
|
|
| 47 |
)
|
| 48 |
return cropped
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
def extract_html(self, document, document_output, level=0):
|
| 51 |
soup = BeautifulSoup(document_output.html, "html.parser")
|
| 52 |
|
|
@@ -69,22 +100,24 @@ class HTMLRenderer(BaseRenderer):
|
|
| 69 |
image = self.extract_image(document, ref_block_id)
|
| 70 |
image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
|
| 71 |
images[image_name] = image
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
f"<p>{content}<img src='{image_name}'></p>", "html.parser"
|
| 75 |
-
)
|
| 76 |
)
|
|
|
|
| 77 |
else:
|
| 78 |
# This will be the image description if using llm mode, or empty if not
|
| 79 |
-
|
|
|
|
| 80 |
elif ref_block_id.block_type in self.page_blocks:
|
| 81 |
images.update(sub_images)
|
| 82 |
if self.paginate_output:
|
| 83 |
content = f"<div class='page' data-page-id='{ref_block_id.page_id}'>{content}</div>"
|
| 84 |
-
|
|
|
|
| 85 |
else:
|
| 86 |
images.update(sub_images)
|
| 87 |
-
|
|
|
|
| 88 |
|
| 89 |
output = str(soup)
|
| 90 |
if level == 0:
|
|
|
|
| 47 |
)
|
| 48 |
return cropped
|
| 49 |
|
| 50 |
+
def insert_block_id(self, soup, block_id: BlockId):
|
| 51 |
+
"""
|
| 52 |
+
Insert a block ID into the soup as a data attribute.
|
| 53 |
+
"""
|
| 54 |
+
if block_id.block_type in [BlockTypes.Line, BlockTypes.Span]:
|
| 55 |
+
return soup
|
| 56 |
+
|
| 57 |
+
if self.add_block_ids:
|
| 58 |
+
# Find the outermost tag (first tag that isn't a NavigableString)
|
| 59 |
+
outermost_tag = None
|
| 60 |
+
for element in soup.contents:
|
| 61 |
+
if hasattr(element, "name") and element.name:
|
| 62 |
+
outermost_tag = element
|
| 63 |
+
break
|
| 64 |
+
|
| 65 |
+
# If we found an outermost tag, add the data-block-id attribute
|
| 66 |
+
if outermost_tag:
|
| 67 |
+
outermost_tag["data-block-id"] = str(block_id)
|
| 68 |
+
|
| 69 |
+
# If soup only contains text or no tags, wrap in a span
|
| 70 |
+
elif soup.contents:
|
| 71 |
+
wrapper = soup.new_tag("span")
|
| 72 |
+
wrapper["data-block-id"] = str(block_id)
|
| 73 |
+
|
| 74 |
+
contents = list(soup.contents)
|
| 75 |
+
for content in contents:
|
| 76 |
+
content.extract()
|
| 77 |
+
wrapper.append(content)
|
| 78 |
+
soup.append(wrapper)
|
| 79 |
+
return soup
|
| 80 |
+
|
| 81 |
def extract_html(self, document, document_output, level=0):
|
| 82 |
soup = BeautifulSoup(document_output.html, "html.parser")
|
| 83 |
|
|
|
|
| 100 |
image = self.extract_image(document, ref_block_id)
|
| 101 |
image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
|
| 102 |
images[image_name] = image
|
| 103 |
+
element = BeautifulSoup(
|
| 104 |
+
f"<p>{content}<img src='{image_name}'></p>", "html.parser"
|
|
|
|
|
|
|
| 105 |
)
|
| 106 |
+
ref.replace_with(self.insert_block_id(element, ref_block_id))
|
| 107 |
else:
|
| 108 |
# This will be the image description if using llm mode, or empty if not
|
| 109 |
+
element = BeautifulSoup(f"{content}", "html.parser")
|
| 110 |
+
ref.replace_with(self.insert_block_id(element, ref_block_id))
|
| 111 |
elif ref_block_id.block_type in self.page_blocks:
|
| 112 |
images.update(sub_images)
|
| 113 |
if self.paginate_output:
|
| 114 |
content = f"<div class='page' data-page-id='{ref_block_id.page_id}'>{content}</div>"
|
| 115 |
+
element = BeautifulSoup(f"{content}", "html.parser")
|
| 116 |
+
ref.replace_with(self.insert_block_id(element, ref_block_id))
|
| 117 |
else:
|
| 118 |
images.update(sub_images)
|
| 119 |
+
element = BeautifulSoup(f"{content}", "html.parser")
|
| 120 |
+
ref.replace_with(self.insert_block_id(element, ref_block_id))
|
| 121 |
|
| 122 |
output = str(soup)
|
| 123 |
if level == 0:
|
marker/schema/blocks/basetable.py
CHANGED
|
@@ -11,7 +11,7 @@ class BaseTable(Block):
|
|
| 11 |
|
| 12 |
@staticmethod
|
| 13 |
def format_cells(
|
| 14 |
-
document, child_blocks, child_cells: List[TableCell] | None = None
|
| 15 |
):
|
| 16 |
if child_cells is None:
|
| 17 |
child_cells: List[TableCell] = [
|
|
@@ -28,7 +28,9 @@ class BaseTable(Block):
|
|
| 28 |
)
|
| 29 |
html_repr += "<tr>"
|
| 30 |
for cell in row_cells:
|
| 31 |
-
html_repr += cell.assemble_html(
|
|
|
|
|
|
|
| 32 |
html_repr += "</tr>"
|
| 33 |
html_repr += "</tbody></table>"
|
| 34 |
return html_repr
|
|
@@ -56,7 +58,7 @@ class BaseTable(Block):
|
|
| 56 |
return template + self.html
|
| 57 |
elif len(child_blocks) > 0 and BlockTypes.TableCell in child_block_types:
|
| 58 |
# Table processor
|
| 59 |
-
return template + self.format_cells(document, child_blocks)
|
| 60 |
else:
|
| 61 |
# Default text lines and spans
|
| 62 |
return f"<p>{template}</p>"
|
|
|
|
| 11 |
|
| 12 |
@staticmethod
|
| 13 |
def format_cells(
|
| 14 |
+
document, child_blocks, block_config, child_cells: List[TableCell] | None = None
|
| 15 |
):
|
| 16 |
if child_cells is None:
|
| 17 |
child_cells: List[TableCell] = [
|
|
|
|
| 28 |
)
|
| 29 |
html_repr += "<tr>"
|
| 30 |
for cell in row_cells:
|
| 31 |
+
html_repr += cell.assemble_html(
|
| 32 |
+
document, child_blocks, None, block_config
|
| 33 |
+
)
|
| 34 |
html_repr += "</tr>"
|
| 35 |
html_repr += "</tbody></table>"
|
| 36 |
return html_repr
|
|
|
|
| 58 |
return template + self.html
|
| 59 |
elif len(child_blocks) > 0 and BlockTypes.TableCell in child_block_types:
|
| 60 |
# Table processor
|
| 61 |
+
return template + self.format_cells(document, child_blocks, block_config)
|
| 62 |
else:
|
| 63 |
# Default text lines and spans
|
| 64 |
return f"<p>{template}</p>"
|
marker/schema/blocks/tablecell.py
CHANGED
|
@@ -21,12 +21,16 @@ class TableCell(Block):
|
|
| 21 |
def assemble_html(
|
| 22 |
self, document, child_blocks, parent_structure=None, block_config=None
|
| 23 |
):
|
|
|
|
|
|
|
| 24 |
tag_cls = "th" if self.is_header else "td"
|
| 25 |
tag = f"<{tag_cls}"
|
| 26 |
if self.rowspan > 1:
|
| 27 |
tag += f" rowspan={self.rowspan}"
|
| 28 |
if self.colspan > 1:
|
| 29 |
tag += f" colspan={self.colspan}"
|
|
|
|
|
|
|
| 30 |
if self.text_lines is None:
|
| 31 |
self.text_lines = []
|
| 32 |
text = "<br>".join(self.text_lines)
|
|
|
|
| 21 |
def assemble_html(
|
| 22 |
self, document, child_blocks, parent_structure=None, block_config=None
|
| 23 |
):
|
| 24 |
+
add_cell_id = block_config and block_config.get("add_block_ids", False)
|
| 25 |
+
|
| 26 |
tag_cls = "th" if self.is_header else "td"
|
| 27 |
tag = f"<{tag_cls}"
|
| 28 |
if self.rowspan > 1:
|
| 29 |
tag += f" rowspan={self.rowspan}"
|
| 30 |
if self.colspan > 1:
|
| 31 |
tag += f" colspan={self.colspan}"
|
| 32 |
+
if add_cell_id:
|
| 33 |
+
tag += f' data-block-id="{self.id}"'
|
| 34 |
if self.text_lines is None:
|
| 35 |
self.text_lines = []
|
| 36 |
text = "<br>".join(self.text_lines)
|
tests/renderers/test_html_renderer.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
from marker.renderers.html import HTMLRenderer
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@pytest.mark.config(
|
| 7 |
+
{
|
| 8 |
+
"page_range": [0],
|
| 9 |
+
"disable_ocr": True,
|
| 10 |
+
"add_block_ids": True,
|
| 11 |
+
"paginate_output": True,
|
| 12 |
+
}
|
| 13 |
+
)
|
| 14 |
+
def test_html_renderer_block_ids(pdf_document, config):
|
| 15 |
+
renderer = HTMLRenderer(config)
|
| 16 |
+
html = renderer(pdf_document).html
|
| 17 |
+
|
| 18 |
+
# Verify some block IDs are present
|
| 19 |
+
assert "/page/0/Text/1" in html
|