Vik Paruchuri committed on
Commit
9ff9e66
·
1 Parent(s): cf0611c

Output images, clean up other output formats

Browse files
.gitignore CHANGED
@@ -10,6 +10,7 @@ report.json
10
  benchmark_data
11
  debug_data
12
  temp.md
 
13
 
14
  # Byte-compiled / optimized / DLL files
15
  __pycache__/
 
10
  benchmark_data
11
  debug_data
12
  temp.md
13
+ temp
14
 
15
  # Byte-compiled / optimized / DLL files
16
  __pycache__/
marker/v2/converters/pdf.py CHANGED
@@ -1,6 +1,8 @@
 
1
  import tempfile
2
  from typing import List, Optional
3
 
 
4
  import datasets
5
  from pydantic import BaseModel
6
 
@@ -43,9 +45,14 @@ class PdfConverter(BaseConverter):
43
  return renderer(document)
44
 
45
 
46
- if __name__ == "__main__":
 
 
 
47
  dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
48
- idx = dataset['filename'].index('adversarial.pdf')
 
 
49
 
50
  with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
51
  temp_pdf.write(dataset['pdf'][idx])
@@ -54,7 +61,12 @@ if __name__ == "__main__":
54
  converter = PdfConverter()
55
  rendered = converter(temp_pdf.name)
56
 
57
- with open("temp.md", "w+") as f:
58
- f.write(rendered)
 
 
 
59
 
60
 
 
 
 
1
+ import os
2
  import tempfile
3
  from typing import List, Optional
4
 
5
+ import click
6
  import datasets
7
  from pydantic import BaseModel
8
 
 
45
  return renderer(document)
46
 
47
 
48
+ @click.command()
49
+ @click.option("--output", type=click.Path(exists=False), required=False, default="temp")
50
+ @click.option("--fname", type=str, default="adversarial.pdf")
51
+ def main(output: str, fname: str):
52
  dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
53
+ idx = dataset['filename'].index(fname)
54
+ out_filename = fname.rsplit(".", 1)[0] + ".md"
55
+ os.makedirs(output, exist_ok=True)
56
 
57
  with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
58
  temp_pdf.write(dataset['pdf'][idx])
 
61
  converter = PdfConverter()
62
  rendered = converter(temp_pdf.name)
63
 
64
+ with open(os.path.join(output, out_filename), "w+") as f:
65
+ f.write(rendered.markdown)
66
+
67
+ for img_name, img in rendered.images.items():
68
+ img.save(os.path.join(output, img_name))
69
 
70
 
71
+ if __name__ == "__main__":
72
+ main()
marker/v2/processors/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import Optional
2
 
3
  from pydantic import BaseModel
4
 
@@ -7,7 +7,7 @@ from marker.v2.util import assign_config
7
 
8
 
9
  class BaseProcessor:
10
- block_type: str | None = None # What block type this processor is responsible for
11
 
12
  def __init__(self, config: Optional[BaseModel | dict] = None):
13
  assign_config(self, config)
 
1
+ from typing import Optional, Tuple
2
 
3
  from pydantic import BaseModel
4
 
 
7
 
8
 
9
  class BaseProcessor:
10
+ block_types: Tuple[str] | None = None # What block types this processor is responsible for
11
 
12
  def __init__(self, config: Optional[BaseModel | dict] = None):
13
  assign_config(self, config)
marker/v2/processors/equation.py CHANGED
@@ -11,7 +11,7 @@ from texify.inference import batch_inference
11
 
12
 
13
  class EquationProcessor(BaseProcessor):
14
- block_type = "Equation"
15
  model_max_length = 384
16
  batch_size = None
17
  token_buffer = 256
@@ -26,7 +26,7 @@ class EquationProcessor(BaseProcessor):
26
 
27
  for page in document.pages:
28
  for block in page.children:
29
- if block.block_type != self.block_type:
30
  continue
31
  image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.lowres_image.size)
32
  image = page.lowres_image.crop(image_poly.bbox).convert("RGB")
 
11
 
12
 
13
  class EquationProcessor(BaseProcessor):
14
+ block_types = ("Equation", )
15
  model_max_length = 384
16
  batch_size = None
17
  token_buffer = 256
 
26
 
27
  for page in document.pages:
28
  for block in page.children:
29
+ if block.block_type not in self.block_types:
30
  continue
31
  image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.lowres_image.size)
32
  image = page.lowres_image.crop(image_poly.bbox).convert("RGB")
marker/v2/processors/table.py CHANGED
@@ -12,7 +12,7 @@ from marker.v2.schema.document import Document
12
 
13
 
14
  class TableProcessor(BaseProcessor):
15
- block_type = BlockTypes.Table
16
  detect_boxes = False
17
  detector_batch_size = None
18
  table_rec_batch_size = None
@@ -31,7 +31,7 @@ class TableProcessor(BaseProcessor):
31
  table_data = []
32
  for page in document.pages:
33
  for block in page.children:
34
- if block.block_type != self.block_type:
35
  continue
36
 
37
  image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size)
 
12
 
13
 
14
  class TableProcessor(BaseProcessor):
15
+ block_types = (BlockTypes.Table, BlockTypes.TableOfContents, BlockTypes.Form)
16
  detect_boxes = False
17
  detector_batch_size = None
18
  table_rec_batch_size = None
 
31
  table_data = []
32
  for page in document.pages:
33
  for block in page.children:
34
+ if block.block_type not in self.block_types:
35
  continue
36
 
37
  image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size)
marker/v2/renderers/__init__.py CHANGED
@@ -3,6 +3,7 @@ from typing import Optional
3
  from pydantic import BaseModel
4
 
5
 
 
6
  class BaseRenderer:
7
  block_type: str | None = None
8
 
 
3
  from pydantic import BaseModel
4
 
5
 
6
+
7
  class BaseRenderer:
8
  block_type: str | None = None
9
 
marker/v2/renderers/html.py CHANGED
@@ -1,33 +1,60 @@
1
  from bs4 import BeautifulSoup
 
 
2
  from marker.v2.renderers import BaseRenderer
3
  from marker.v2.schema import BlockTypes
 
 
 
 
 
 
4
 
5
 
6
  class HTMLRenderer(BaseRenderer):
7
  remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
8
  image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
9
 
 
 
 
 
 
 
 
 
10
  def extract_html(self, document, document_output):
11
  soup = BeautifulSoup(document_output.html, 'html.parser')
12
 
13
  content_refs = soup.find_all('content-ref')
14
- ref_block_type = None
 
15
  for ref in content_refs:
16
  src = ref.get('src')
 
17
  for item in document_output.children:
18
  if item.id == src:
19
- content = self.extract_html(document, item)
20
- ref_block_type = item.id.block_type
21
  break
22
 
23
- if ref_block_type in self.remove_blocks:
24
  ref.replace_with('')
 
 
 
 
 
25
  else:
 
26
  ref.replace_with(BeautifulSoup(f"<div>{content}</div>", 'html.parser'))
27
 
28
- return str(soup)
29
 
30
- def __call__(self, document):
31
  document_output = document.render()
32
- full_html = self.extract_html(document, document_output)
33
- return full_html
 
 
 
 
1
  from bs4 import BeautifulSoup
2
+ from pydantic import BaseModel
3
+
4
  from marker.v2.renderers import BaseRenderer
5
  from marker.v2.schema import BlockTypes
6
+ from marker.v2.schema.blocks import BlockId
7
+
8
+
9
+ class HTMLOutput(BaseModel):
10
+ html: str
11
+ images: dict
12
 
13
 
14
  class HTMLRenderer(BaseRenderer):
15
  remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
16
  image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
17
 
18
+ def extract_image(self, document, image_id):
19
+ image_block = document.get_block(image_id)
20
+ page = document.get_page(image_block.page_id)
21
+ page_img = page.highres_image
22
+ image_box = image_block.polygon.rescale(page.polygon.size, page_img.size)
23
+ cropped = page_img.crop(image_box.bbox)
24
+ return cropped
25
+
26
  def extract_html(self, document, document_output):
27
  soup = BeautifulSoup(document_output.html, 'html.parser')
28
 
29
  content_refs = soup.find_all('content-ref')
30
+ ref_block_id = None
31
+ images = {}
32
  for ref in content_refs:
33
  src = ref.get('src')
34
+ sub_images = {}
35
  for item in document_output.children:
36
  if item.id == src:
37
+ content, sub_images = self.extract_html(document, item)
38
+ ref_block_id: BlockId = item.id
39
  break
40
 
41
+ if ref_block_id.block_type in self.remove_blocks:
42
  ref.replace_with('')
43
+ elif ref_block_id.block_type in self.image_blocks:
44
+ image = self.extract_image(document, ref_block_id)
45
+ image_name = f"{ref_block_id.to_path()}.png"
46
+ images[image_name] = image
47
+ ref.replace_with(BeautifulSoup(f"<p><img src='{image_name}'></p>", 'html.parser'))
48
  else:
49
+ images.update(sub_images)
50
  ref.replace_with(BeautifulSoup(f"<div>{content}</div>", 'html.parser'))
51
 
52
+ return str(soup), images
53
 
54
+ def __call__(self, document) -> HTMLOutput:
55
  document_output = document.render()
56
+ full_html, images = self.extract_html(document, document_output)
57
+ return HTMLOutput(
58
+ html=full_html,
59
+ images=images,
60
+ )
marker/v2/renderers/markdown.py CHANGED
@@ -1,17 +1,32 @@
1
- from markdownify import markdownify
 
 
2
  from marker.v2.renderers.html import HTMLRenderer
3
 
4
 
 
 
 
 
 
 
 
 
 
5
  class MarkdownRenderer(HTMLRenderer):
6
- def __call__(self, document):
7
  document_output = document.render()
8
- full_html = self.extract_html(document, document_output)
9
- return markdownify(
10
- full_html,
11
  heading_style="ATX",
12
  bullets="-",
13
  escape_misc=False,
14
  escape_underscores=False
15
  )
 
 
 
 
 
16
 
17
 
 
1
+ from markdownify import markdownify, MarkdownConverter
2
+ from pydantic import BaseModel
3
+
4
  from marker.v2.renderers.html import HTMLRenderer
5
 
6
 
7
+ class Markdownify(MarkdownConverter):
8
+ pass
9
+
10
+
11
+ class MarkdownOutput(BaseModel):
12
+ markdown: str
13
+ images: dict
14
+
15
+
16
  class MarkdownRenderer(HTMLRenderer):
17
+ def __call__(self, document) -> MarkdownOutput:
18
  document_output = document.render()
19
+ full_html, images = self.extract_html(document, document_output)
20
+ md_cls = Markdownify(
 
21
  heading_style="ATX",
22
  bullets="-",
23
  escape_misc=False,
24
  escape_underscores=False
25
  )
26
+ markdown = md_cls.convert(full_html)
27
+ return MarkdownOutput(
28
+ markdown=markdown,
29
+ images=images
30
+ )
31
 
32
 
marker/v2/schema/blocks/base.py CHANGED
@@ -1,6 +1,7 @@
1
  from __future__ import annotations
2
 
3
  from typing import Optional, List, Any
 
4
 
5
  from pydantic import BaseModel, ConfigDict, field_validator
6
 
@@ -44,6 +45,28 @@ class BlockId(BaseModel):
44
  raise ValueError(f"Invalid block type: {v}")
45
  return v
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  class Block(BaseModel):
49
  polygon: PolygonBox
@@ -105,6 +128,8 @@ class Block(BaseModel):
105
  template = ""
106
  for c in child_blocks:
107
  template += f"<content-ref src='{c.id}'></content-ref>"
 
 
108
  return template
109
 
110
  def render(self, document, parent_structure):
 
1
  from __future__ import annotations
2
 
3
  from typing import Optional, List, Any
4
+ import re
5
 
6
  from pydantic import BaseModel, ConfigDict, field_validator
7
 
 
45
  raise ValueError(f"Invalid block type: {v}")
46
  return v
47
 
48
+ def to_path(self):
49
+ return str(self).replace('/', '_')
50
+
51
+
52
def merge_consecutive_tags(html, tag):
    """Merge adjacent ``<tag>...</tag><tag>...</tag>`` runs into one element.

    A closing tag immediately followed (optionally after whitespace) by an
    opening tag of the same name is removed, keeping only the whitespace that
    separated them. For example ``<b>a</b> <b>b</b>`` becomes ``<b>a b</b>``.

    Args:
        html: HTML fragment to process. Falsy values (``""``, ``None``) are
            returned unchanged.
        tag: Tag name without angle brackets, e.g. ``"b"`` or ``"i"``.

    Returns:
        The fragment with consecutive same-tag boundaries collapsed.
    """
    if not html:
        return html

    # The whitespace must be a capture group: the previous pattern had no
    # groups, so unpacking ``match.groups()`` raised ValueError on every match.
    boundary = re.compile(fr'</{re.escape(tag)}>(\s*)<{re.escape(tag)}>')

    def replace_with_space(match):
        # Keep only the whitespace that sat between the two tags.
        return match.group(1)

    # Removing one boundary can bring another closing/opening pair together
    # (e.g. ``</b></b><b><b>``), so repeat until nothing changes.
    while True:
        new_merged = boundary.sub(replace_with_space, html)
        if new_merged == html:
            break
        html = new_merged

    return html
69
+
70
 
71
  class Block(BaseModel):
72
  polygon: PolygonBox
 
128
  template = ""
129
  for c in child_blocks:
130
  template += f"<content-ref src='{c.id}'></content-ref>"
131
+ template = merge_consecutive_tags(template, 'b')
132
+ template = merge_consecutive_tags(template, 'i')
133
  return template
134
 
135
  def render(self, document, parent_structure):
marker/v2/schema/blocks/equation.py CHANGED
@@ -6,4 +6,4 @@ class Equation(Block):
6
  latex: str | None = None
7
 
8
  def assemble_html(self, child_blocks, parent_structure=None):
9
- return f"<div class='math'>{self.latex}</div>"
 
6
  latex: str | None = None
7
 
8
  def assemble_html(self, child_blocks, parent_structure=None):
9
+ return f"<p><math>{self.latex}</math></p>"
marker/v2/schema/blocks/figure.py CHANGED
@@ -5,4 +5,4 @@ class Figure(Block):
5
  block_type: str = "Figure"
6
 
7
  def assemble_html(self, child_blocks, parent_structure):
8
- return f"Image {self.block_id}"
 
5
  block_type: str = "Figure"
6
 
7
  def assemble_html(self, child_blocks, parent_structure):
8
+ return f"<p>Image {self.block_id}</p>"
marker/v2/schema/blocks/form.py CHANGED
@@ -1,5 +1,14 @@
 
 
 
 
 
1
  from marker.v2.schema.blocks import Block
2
 
3
 
4
  class Form(Block):
5
  block_type: str = "Form"
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from tabled.formats import html_format
4
+ from tabled.schema import SpanTableCell
5
+
6
  from marker.v2.schema.blocks import Block
7
 
8
 
9
  class Form(Block):
10
  block_type: str = "Form"
11
+ cells: List[SpanTableCell] | None = None
12
+
13
+ def assemble_html(self, child_blocks, parent_structure=None):
14
+ return html_format(self.cells)
marker/v2/schema/blocks/pagefooter.py CHANGED
@@ -3,3 +3,8 @@ from marker.v2.schema.blocks import Block
3
 
4
  class PageFooter(Block):
5
  block_type: str = "PageFooter"
 
 
 
 
 
 
3
 
4
  class PageFooter(Block):
5
  block_type: str = "PageFooter"
6
+
7
+ def assemble_html(self, child_blocks, parent_structure):
8
+ template = super().assemble_html(child_blocks, parent_structure)
9
+ template = template.replace("\n", " ")
10
+ return f"<p>{template}</p>"
marker/v2/schema/blocks/pageheader.py CHANGED
@@ -3,3 +3,8 @@ from marker.v2.schema.blocks import Block
3
 
4
  class PageHeader(Block):
5
  block_type: str = "PageHeader"
 
 
 
 
 
 
3
 
4
  class PageHeader(Block):
5
  block_type: str = "PageHeader"
6
+
7
+ def assemble_html(self, child_blocks, parent_structure):
8
+ template = super().assemble_html(child_blocks, parent_structure)
9
+ template = template.replace("\n", " ")
10
+ return f"<p>{template}</p>"
marker/v2/schema/blocks/picture.py CHANGED
@@ -5,4 +5,4 @@ class Picture(Block):
5
  block_type: str = "Picture"
6
 
7
  def assemble_html(self, child_blocks, parent_structure):
8
- return f"Image {self.block_id}"
 
5
  block_type: str = "Picture"
6
 
7
  def assemble_html(self, child_blocks, parent_structure):
8
+ return f"<p>Image {self.block_id}</p>"
marker/v2/schema/blocks/text.py CHANGED
@@ -1,6 +1,5 @@
1
  from marker.v2.schema.blocks import Block
2
 
3
-
4
  class Text(Block):
5
  block_type: str = "Text"
6
 
 
1
  from marker.v2.schema.blocks import Block
2
 
 
3
  class Text(Block):
4
  block_type: str = "Text"
5
 
marker/v2/schema/blocks/toc.py CHANGED
@@ -1,5 +1,14 @@
 
 
 
 
 
1
  from marker.v2.schema.blocks import Block
2
 
3
 
4
  class TableOfContents(Block):
5
  block_type: str = "TableOfContents"
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from tabled.formats import html_format
4
+ from tabled.schema import SpanTableCell
5
+
6
  from marker.v2.schema.blocks import Block
7
 
8
 
9
  class TableOfContents(Block):
10
  block_type: str = "TableOfContents"
11
+ cells: List[SpanTableCell] | None = None
12
+
13
+ def assemble_html(self, child_blocks, parent_structure=None):
14
+ return html_format(self.cells)
marker/v2/schema/document.py CHANGED
@@ -20,12 +20,17 @@ class Document(BaseModel):
20
  block_type: str = "Document"
21
 
22
  def get_block(self, block_id: BlockId):
23
- page = [p for p in self.pages if p.page_id == block_id.page_id][0]
24
  block = page.get_block(block_id)
25
  if block:
26
  return block
27
  return None
28
 
 
 
 
 
 
29
  def assemble_html(self, child_blocks):
30
  template = ""
31
  for c in child_blocks:
 
20
  block_type: str = "Document"
21
 
22
  def get_block(self, block_id: BlockId):
23
+ page = self.get_page(block_id.page_id)
24
  block = page.get_block(block_id)
25
  if block:
26
  return block
27
  return None
28
 
29
def get_page(self, page_id):
    """Return the page stored at index ``page_id`` in ``self.pages``.

    The page's own ``page_id`` must equal its index in ``self.pages``.

    Raises:
        IndexError: if ``page_id`` is outside ``self.pages``.
        AssertionError: if the page found at that index carries a different
            ``page_id``.
    """
    page = self.pages[page_id]
    # Raise explicitly rather than using ``assert``: assert statements are
    # removed under ``python -O``, which would let a mismatched page be
    # returned silently. The exception type is kept for compatibility.
    if page.page_id != page_id:
        raise AssertionError("Mismatch between page_id and page index")
    return page
33
+
34
  def assemble_html(self, child_blocks):
35
  template = ""
36
  for c in child_blocks:
marker/v2/schema/groups/list.py CHANGED
@@ -6,4 +6,4 @@ class ListGroup(Block):
6
 
7
  def assemble_html(self, child_blocks, parent_structure):
8
  template = super().assemble_html(child_blocks, parent_structure)
9
- return f"<ul>{template}</ul>"
 
6
 
7
  def assemble_html(self, child_blocks, parent_structure):
8
  template = super().assemble_html(child_blocks, parent_structure)
9
+ return f"<p><ul>{template}</ul></p>"
marker/v2/schema/text/line.py CHANGED
@@ -25,7 +25,8 @@ def strip_trailing_hyphens(line_text, next_line_text, line_html) -> str:
25
  next_line_starts_lowercase = regex.match(rf"^\s?[{lowercase_letters}]", next_line_text)
26
 
27
  if hyphen_regex.match(line_text) and next_line_starts_lowercase:
28
- return replace_last(line_html, rf'[{HYPHENS}]', "")
 
29
  return line_html
30
 
31
 
 
25
  next_line_starts_lowercase = regex.match(rf"^\s?[{lowercase_letters}]", next_line_text)
26
 
27
  if hyphen_regex.match(line_text) and next_line_starts_lowercase:
28
+ line_html = replace_last(line_html, rf'[{HYPHENS}]', "")
29
+
30
  return line_html
31
 
32