Vik Paruchuri committed on
Commit
0c7adea
·
1 Parent(s): 4f69a6f

Add block ids to html renderer

Browse files
marker/processors/llm/llm_table.py CHANGED
@@ -161,7 +161,7 @@ No corrections needed.
161
  batch_bbox[3] = block_image.size[1]
162
 
163
  batch_image = block_image.crop(batch_bbox)
164
- block_html = block.format_cells(document, [], batch_cells)
165
  batch_image = self.handle_image_rotation(batch_cells, batch_image)
166
  batch_parsed_cells = self.rewrite_single_chunk(
167
  page, block, block_html, batch_cells, batch_image
 
161
  batch_bbox[3] = block_image.size[1]
162
 
163
  batch_image = block_image.crop(batch_bbox)
164
+ block_html = block.format_cells(document, [], None, batch_cells)
165
  batch_image = self.handle_image_rotation(batch_cells, batch_image)
166
  batch_parsed_cells = self.rewrite_single_chunk(
167
  page, block, block_html, batch_cells, batch_image
marker/renderers/__init__.py CHANGED
@@ -29,6 +29,9 @@ class BaseRenderer:
29
  keep_pagefooter_in_output: Annotated[
30
  bool, "Keep the page footer in the output HTML."
31
  ] = False
 
 
 
32
 
33
  def __init__(self, config: Optional[BaseModel | dict] = None):
34
  assign_config(self, config)
@@ -36,6 +39,7 @@ class BaseRenderer:
36
  self.block_config = {
37
  "keep_pageheader_in_output": self.keep_pageheader_in_output,
38
  "keep_pagefooter_in_output": self.keep_pagefooter_in_output,
 
39
  }
40
 
41
  def __call__(self, document):
 
29
  keep_pagefooter_in_output: Annotated[
30
  bool, "Keep the page footer in the output HTML."
31
  ] = False
32
+ add_block_ids: Annotated[bool, "Whether to add block IDs to the output HTML."] = (
33
+ False
34
+ )
35
 
36
  def __init__(self, config: Optional[BaseModel | dict] = None):
37
  assign_config(self, config)
 
39
  self.block_config = {
40
  "keep_pageheader_in_output": self.keep_pageheader_in_output,
41
  "keep_pagefooter_in_output": self.keep_pagefooter_in_output,
42
+ "add_block_ids": self.add_block_ids,
43
  }
44
 
45
  def __call__(self, document):
marker/renderers/html.py CHANGED
@@ -47,6 +47,37 @@ class HTMLRenderer(BaseRenderer):
47
  )
48
  return cropped
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def extract_html(self, document, document_output, level=0):
51
  soup = BeautifulSoup(document_output.html, "html.parser")
52
 
@@ -69,22 +100,24 @@ class HTMLRenderer(BaseRenderer):
69
  image = self.extract_image(document, ref_block_id)
70
  image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
71
  images[image_name] = image
72
- ref.replace_with(
73
- BeautifulSoup(
74
- f"<p>{content}<img src='{image_name}'></p>", "html.parser"
75
- )
76
  )
 
77
  else:
78
  # This will be the image description if using llm mode, or empty if not
79
- ref.replace_with(BeautifulSoup(f"{content}", "html.parser"))
 
80
  elif ref_block_id.block_type in self.page_blocks:
81
  images.update(sub_images)
82
  if self.paginate_output:
83
  content = f"<div class='page' data-page-id='{ref_block_id.page_id}'>{content}</div>"
84
- ref.replace_with(BeautifulSoup(f"{content}", "html.parser"))
 
85
  else:
86
  images.update(sub_images)
87
- ref.replace_with(BeautifulSoup(f"{content}", "html.parser"))
 
88
 
89
  output = str(soup)
90
  if level == 0:
 
47
  )
48
  return cropped
49
 
50
+ def insert_block_id(self, soup, block_id: BlockId):
51
+ """
52
+ Insert a block ID into the soup as a data attribute.
53
+ """
54
+ if block_id.block_type in [BlockTypes.Line, BlockTypes.Span]:
55
+ return soup
56
+
57
+ if self.add_block_ids:
58
+ # Find the outermost tag (first tag that isn't a NavigableString)
59
+ outermost_tag = None
60
+ for element in soup.contents:
61
+ if hasattr(element, "name") and element.name:
62
+ outermost_tag = element
63
+ break
64
+
65
+ # If we found an outermost tag, add the data-block-id attribute
66
+ if outermost_tag:
67
+ outermost_tag["data-block-id"] = str(block_id)
68
+
69
+ # If soup only contains text or no tags, wrap in a span
70
+ elif soup.contents:
71
+ wrapper = soup.new_tag("span")
72
+ wrapper["data-block-id"] = str(block_id)
73
+
74
+ contents = list(soup.contents)
75
+ for content in contents:
76
+ content.extract()
77
+ wrapper.append(content)
78
+ soup.append(wrapper)
79
+ return soup
80
+
81
  def extract_html(self, document, document_output, level=0):
82
  soup = BeautifulSoup(document_output.html, "html.parser")
83
 
 
100
  image = self.extract_image(document, ref_block_id)
101
  image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
102
  images[image_name] = image
103
+ element = BeautifulSoup(
104
+ f"<p>{content}<img src='{image_name}'></p>", "html.parser"
 
 
105
  )
106
+ ref.replace_with(self.insert_block_id(element, ref_block_id))
107
  else:
108
  # This will be the image description if using llm mode, or empty if not
109
+ element = BeautifulSoup(f"{content}", "html.parser")
110
+ ref.replace_with(self.insert_block_id(element, ref_block_id))
111
  elif ref_block_id.block_type in self.page_blocks:
112
  images.update(sub_images)
113
  if self.paginate_output:
114
  content = f"<div class='page' data-page-id='{ref_block_id.page_id}'>{content}</div>"
115
+ element = BeautifulSoup(f"{content}", "html.parser")
116
+ ref.replace_with(self.insert_block_id(element, ref_block_id))
117
  else:
118
  images.update(sub_images)
119
+ element = BeautifulSoup(f"{content}", "html.parser")
120
+ ref.replace_with(self.insert_block_id(element, ref_block_id))
121
 
122
  output = str(soup)
123
  if level == 0:
marker/schema/blocks/basetable.py CHANGED
@@ -11,7 +11,7 @@ class BaseTable(Block):
11
 
12
  @staticmethod
13
  def format_cells(
14
- document, child_blocks, child_cells: List[TableCell] | None = None
15
  ):
16
  if child_cells is None:
17
  child_cells: List[TableCell] = [
@@ -28,7 +28,9 @@ class BaseTable(Block):
28
  )
29
  html_repr += "<tr>"
30
  for cell in row_cells:
31
- html_repr += cell.assemble_html(document, child_blocks, None, None)
 
 
32
  html_repr += "</tr>"
33
  html_repr += "</tbody></table>"
34
  return html_repr
@@ -56,7 +58,7 @@ class BaseTable(Block):
56
  return template + self.html
57
  elif len(child_blocks) > 0 and BlockTypes.TableCell in child_block_types:
58
  # Table processor
59
- return template + self.format_cells(document, child_blocks)
60
  else:
61
  # Default text lines and spans
62
  return f"<p>{template}</p>"
 
11
 
12
  @staticmethod
13
  def format_cells(
14
+ document, child_blocks, block_config, child_cells: List[TableCell] | None = None
15
  ):
16
  if child_cells is None:
17
  child_cells: List[TableCell] = [
 
28
  )
29
  html_repr += "<tr>"
30
  for cell in row_cells:
31
+ html_repr += cell.assemble_html(
32
+ document, child_blocks, None, block_config
33
+ )
34
  html_repr += "</tr>"
35
  html_repr += "</tbody></table>"
36
  return html_repr
 
58
  return template + self.html
59
  elif len(child_blocks) > 0 and BlockTypes.TableCell in child_block_types:
60
  # Table processor
61
+ return template + self.format_cells(document, child_blocks, block_config)
62
  else:
63
  # Default text lines and spans
64
  return f"<p>{template}</p>"
marker/schema/blocks/tablecell.py CHANGED
@@ -21,12 +21,16 @@ class TableCell(Block):
21
  def assemble_html(
22
  self, document, child_blocks, parent_structure=None, block_config=None
23
  ):
 
 
24
  tag_cls = "th" if self.is_header else "td"
25
  tag = f"<{tag_cls}"
26
  if self.rowspan > 1:
27
  tag += f" rowspan={self.rowspan}"
28
  if self.colspan > 1:
29
  tag += f" colspan={self.colspan}"
 
 
30
  if self.text_lines is None:
31
  self.text_lines = []
32
  text = "<br>".join(self.text_lines)
 
21
  def assemble_html(
22
  self, document, child_blocks, parent_structure=None, block_config=None
23
  ):
24
+ add_cell_id = block_config and block_config.get("add_block_ids", False)
25
+
26
  tag_cls = "th" if self.is_header else "td"
27
  tag = f"<{tag_cls}"
28
  if self.rowspan > 1:
29
  tag += f" rowspan={self.rowspan}"
30
  if self.colspan > 1:
31
  tag += f" colspan={self.colspan}"
32
+ if add_cell_id:
33
+ tag += f' data-block-id="{self.id}"'
34
  if self.text_lines is None:
35
  self.text_lines = []
36
  text = "<br>".join(self.text_lines)
tests/renderers/test_html_renderer.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from marker.renderers.html import HTMLRenderer
4
+
5
+
6
+ @pytest.mark.config(
7
+ {
8
+ "page_range": [0],
9
+ "disable_ocr": True,
10
+ "add_block_ids": True,
11
+ "paginate_output": True,
12
+ }
13
+ )
14
+ def test_html_renderer_block_ids(pdf_document, config):
15
+ renderer = HTMLRenderer(config)
16
+ html = renderer(pdf_document).html
17
+
18
+ # Verify some block IDs are present
19
+ assert "/page/0/Text/1" in html