Vik Paruchuri
committed on
Commit
·
9ff9e66
1
Parent(s):
cf0611c
Output images, clean up other output formats
Browse files
- .gitignore +1 -0
- marker/v2/converters/pdf.py +16 -4
- marker/v2/processors/__init__.py +2 -2
- marker/v2/processors/equation.py +2 -2
- marker/v2/processors/table.py +2 -2
- marker/v2/renderers/__init__.py +1 -0
- marker/v2/renderers/html.py +35 -8
- marker/v2/renderers/markdown.py +20 -5
- marker/v2/schema/blocks/base.py +25 -0
- marker/v2/schema/blocks/equation.py +1 -1
- marker/v2/schema/blocks/figure.py +1 -1
- marker/v2/schema/blocks/form.py +9 -0
- marker/v2/schema/blocks/pagefooter.py +5 -0
- marker/v2/schema/blocks/pageheader.py +5 -0
- marker/v2/schema/blocks/picture.py +1 -1
- marker/v2/schema/blocks/text.py +0 -1
- marker/v2/schema/blocks/toc.py +9 -0
- marker/v2/schema/document.py +6 -1
- marker/v2/schema/groups/list.py +1 -1
- marker/v2/schema/text/line.py +2 -1
.gitignore
CHANGED
|
@@ -10,6 +10,7 @@ report.json
|
|
| 10 |
benchmark_data
|
| 11 |
debug_data
|
| 12 |
temp.md
|
|
|
|
| 13 |
|
| 14 |
# Byte-compiled / optimized / DLL files
|
| 15 |
__pycache__/
|
|
|
|
| 10 |
benchmark_data
|
| 11 |
debug_data
|
| 12 |
temp.md
|
| 13 |
+
temp
|
| 14 |
|
| 15 |
# Byte-compiled / optimized / DLL files
|
| 16 |
__pycache__/
|
marker/v2/converters/pdf.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
|
|
| 1 |
import tempfile
|
| 2 |
from typing import List, Optional
|
| 3 |
|
|
|
|
| 4 |
import datasets
|
| 5 |
from pydantic import BaseModel
|
| 6 |
|
|
@@ -43,9 +45,14 @@ class PdfConverter(BaseConverter):
|
|
| 43 |
return renderer(document)
|
| 44 |
|
| 45 |
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
| 47 |
dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
|
| 48 |
-
idx = dataset['filename'].index(
|
|
|
|
|
|
|
| 49 |
|
| 50 |
with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
|
| 51 |
temp_pdf.write(dataset['pdf'][idx])
|
|
@@ -54,7 +61,12 @@ if __name__ == "__main__":
|
|
| 54 |
converter = PdfConverter()
|
| 55 |
rendered = converter(temp_pdf.name)
|
| 56 |
|
| 57 |
-
with open(
|
| 58 |
-
f.write(rendered)
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
import tempfile
|
| 3 |
from typing import List, Optional
|
| 4 |
|
| 5 |
+
import click
|
| 6 |
import datasets
|
| 7 |
from pydantic import BaseModel
|
| 8 |
|
|
|
|
| 45 |
return renderer(document)
|
| 46 |
|
| 47 |
|
| 48 |
+
@click.command()
|
| 49 |
+
@click.option("--output", type=click.Path(exists=False), required=False, default="temp")
|
| 50 |
+
@click.option("--fname", type=str, default="adversarial.pdf")
|
| 51 |
+
def main(output: str, fname: str):
|
| 52 |
dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
|
| 53 |
+
idx = dataset['filename'].index(fname)
|
| 54 |
+
out_filename = fname.rsplit(".", 1)[0] + ".md"
|
| 55 |
+
os.makedirs(output, exist_ok=True)
|
| 56 |
|
| 57 |
with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
|
| 58 |
temp_pdf.write(dataset['pdf'][idx])
|
|
|
|
| 61 |
converter = PdfConverter()
|
| 62 |
rendered = converter(temp_pdf.name)
|
| 63 |
|
| 64 |
+
with open(os.path.join(output, out_filename), "w+") as f:
|
| 65 |
+
f.write(rendered.markdown)
|
| 66 |
+
|
| 67 |
+
for img_name, img in rendered.images.items():
|
| 68 |
+
img.save(os.path.join(output, img_name))
|
| 69 |
|
| 70 |
|
| 71 |
+
if __name__ == "__main__":
|
| 72 |
+
main()
|
marker/v2/processors/__init__.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from typing import Optional
|
| 2 |
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
|
@@ -7,7 +7,7 @@ from marker.v2.util import assign_config
|
|
| 7 |
|
| 8 |
|
| 9 |
class BaseProcessor:
|
| 10 |
-
|
| 11 |
|
| 12 |
def __init__(self, config: Optional[BaseModel | dict] = None):
|
| 13 |
assign_config(self, config)
|
|
|
|
| 1 |
+
from typing import Optional, Tuple
|
| 2 |
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
class BaseProcessor:
|
| 10 |
+
block_types: Tuple[str] | None = None # What block types this processor is responsible for
|
| 11 |
|
| 12 |
def __init__(self, config: Optional[BaseModel | dict] = None):
|
| 13 |
assign_config(self, config)
|
marker/v2/processors/equation.py
CHANGED
|
@@ -11,7 +11,7 @@ from texify.inference import batch_inference
|
|
| 11 |
|
| 12 |
|
| 13 |
class EquationProcessor(BaseProcessor):
|
| 14 |
-
|
| 15 |
model_max_length = 384
|
| 16 |
batch_size = None
|
| 17 |
token_buffer = 256
|
|
@@ -26,7 +26,7 @@ class EquationProcessor(BaseProcessor):
|
|
| 26 |
|
| 27 |
for page in document.pages:
|
| 28 |
for block in page.children:
|
| 29 |
-
if block.block_type
|
| 30 |
continue
|
| 31 |
image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.lowres_image.size)
|
| 32 |
image = page.lowres_image.crop(image_poly.bbox).convert("RGB")
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
class EquationProcessor(BaseProcessor):
|
| 14 |
+
block_types = ("Equation", )
|
| 15 |
model_max_length = 384
|
| 16 |
batch_size = None
|
| 17 |
token_buffer = 256
|
|
|
|
| 26 |
|
| 27 |
for page in document.pages:
|
| 28 |
for block in page.children:
|
| 29 |
+
if block.block_type not in self.block_types:
|
| 30 |
continue
|
| 31 |
image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.lowres_image.size)
|
| 32 |
image = page.lowres_image.crop(image_poly.bbox).convert("RGB")
|
marker/v2/processors/table.py
CHANGED
|
@@ -12,7 +12,7 @@ from marker.v2.schema.document import Document
|
|
| 12 |
|
| 13 |
|
| 14 |
class TableProcessor(BaseProcessor):
|
| 15 |
-
|
| 16 |
detect_boxes = False
|
| 17 |
detector_batch_size = None
|
| 18 |
table_rec_batch_size = None
|
|
@@ -31,7 +31,7 @@ class TableProcessor(BaseProcessor):
|
|
| 31 |
table_data = []
|
| 32 |
for page in document.pages:
|
| 33 |
for block in page.children:
|
| 34 |
-
if block.block_type
|
| 35 |
continue
|
| 36 |
|
| 37 |
image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size)
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
class TableProcessor(BaseProcessor):
|
| 15 |
+
block_types = (BlockTypes.Table, BlockTypes.TableOfContents, BlockTypes.Form)
|
| 16 |
detect_boxes = False
|
| 17 |
detector_batch_size = None
|
| 18 |
table_rec_batch_size = None
|
|
|
|
| 31 |
table_data = []
|
| 32 |
for page in document.pages:
|
| 33 |
for block in page.children:
|
| 34 |
+
if block.block_type not in self.block_types:
|
| 35 |
continue
|
| 36 |
|
| 37 |
image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size)
|
marker/v2/renderers/__init__.py
CHANGED
|
@@ -3,6 +3,7 @@ from typing import Optional
|
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
| 5 |
|
|
|
|
| 6 |
class BaseRenderer:
|
| 7 |
block_type: str | None = None
|
| 8 |
|
|
|
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
| 5 |
|
| 6 |
+
|
| 7 |
class BaseRenderer:
|
| 8 |
block_type: str | None = None
|
| 9 |
|
marker/v2/renderers/html.py
CHANGED
|
@@ -1,33 +1,60 @@
|
|
| 1 |
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
| 2 |
from marker.v2.renderers import BaseRenderer
|
| 3 |
from marker.v2.schema import BlockTypes
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
class HTMLRenderer(BaseRenderer):
|
| 7 |
remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
|
| 8 |
image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
def extract_html(self, document, document_output):
|
| 11 |
soup = BeautifulSoup(document_output.html, 'html.parser')
|
| 12 |
|
| 13 |
content_refs = soup.find_all('content-ref')
|
| 14 |
-
|
|
|
|
| 15 |
for ref in content_refs:
|
| 16 |
src = ref.get('src')
|
|
|
|
| 17 |
for item in document_output.children:
|
| 18 |
if item.id == src:
|
| 19 |
-
content = self.extract_html(document, item)
|
| 20 |
-
|
| 21 |
break
|
| 22 |
|
| 23 |
-
if
|
| 24 |
ref.replace_with('')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
else:
|
|
|
|
| 26 |
ref.replace_with(BeautifulSoup(f"<div>{content}</div>", 'html.parser'))
|
| 27 |
|
| 28 |
-
return str(soup)
|
| 29 |
|
| 30 |
-
def __call__(self, document):
|
| 31 |
document_output = document.render()
|
| 32 |
-
full_html = self.extract_html(document, document_output)
|
| 33 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from bs4 import BeautifulSoup
|
| 2 |
+
from pydantic import BaseModel
|
| 3 |
+
|
| 4 |
from marker.v2.renderers import BaseRenderer
|
| 5 |
from marker.v2.schema import BlockTypes
|
| 6 |
+
from marker.v2.schema.blocks import BlockId
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class HTMLOutput(BaseModel):
|
| 10 |
+
html: str
|
| 11 |
+
images: dict
|
| 12 |
|
| 13 |
|
| 14 |
class HTMLRenderer(BaseRenderer):
|
| 15 |
remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
|
| 16 |
image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
|
| 17 |
|
| 18 |
+
def extract_image(self, document, image_id):
|
| 19 |
+
image_block = document.get_block(image_id)
|
| 20 |
+
page = document.get_page(image_block.page_id)
|
| 21 |
+
page_img = page.highres_image
|
| 22 |
+
image_box = image_block.polygon.rescale(page.polygon.size, page_img.size)
|
| 23 |
+
cropped = page_img.crop(image_box.bbox)
|
| 24 |
+
return cropped
|
| 25 |
+
|
| 26 |
def extract_html(self, document, document_output):
|
| 27 |
soup = BeautifulSoup(document_output.html, 'html.parser')
|
| 28 |
|
| 29 |
content_refs = soup.find_all('content-ref')
|
| 30 |
+
ref_block_id = None
|
| 31 |
+
images = {}
|
| 32 |
for ref in content_refs:
|
| 33 |
src = ref.get('src')
|
| 34 |
+
sub_images = {}
|
| 35 |
for item in document_output.children:
|
| 36 |
if item.id == src:
|
| 37 |
+
content, sub_images = self.extract_html(document, item)
|
| 38 |
+
ref_block_id: BlockId = item.id
|
| 39 |
break
|
| 40 |
|
| 41 |
+
if ref_block_id.block_type in self.remove_blocks:
|
| 42 |
ref.replace_with('')
|
| 43 |
+
elif ref_block_id.block_type in self.image_blocks:
|
| 44 |
+
image = self.extract_image(document, ref_block_id)
|
| 45 |
+
image_name = f"{ref_block_id.to_path()}.png"
|
| 46 |
+
images[image_name] = image
|
| 47 |
+
ref.replace_with(BeautifulSoup(f"<p><img src='{image_name}'></p>", 'html.parser'))
|
| 48 |
else:
|
| 49 |
+
images.update(sub_images)
|
| 50 |
ref.replace_with(BeautifulSoup(f"<div>{content}</div>", 'html.parser'))
|
| 51 |
|
| 52 |
+
return str(soup), images
|
| 53 |
|
| 54 |
+
def __call__(self, document) -> HTMLOutput:
|
| 55 |
document_output = document.render()
|
| 56 |
+
full_html, images = self.extract_html(document, document_output)
|
| 57 |
+
return HTMLOutput(
|
| 58 |
+
html=full_html,
|
| 59 |
+
images=images,
|
| 60 |
+
)
|
marker/v2/renderers/markdown.py
CHANGED
|
@@ -1,17 +1,32 @@
|
|
| 1 |
-
from markdownify import markdownify
|
|
|
|
|
|
|
| 2 |
from marker.v2.renderers.html import HTMLRenderer
|
| 3 |
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
class MarkdownRenderer(HTMLRenderer):
|
| 6 |
-
def __call__(self, document):
|
| 7 |
document_output = document.render()
|
| 8 |
-
full_html = self.extract_html(document, document_output)
|
| 9 |
-
|
| 10 |
-
full_html,
|
| 11 |
heading_style="ATX",
|
| 12 |
bullets="-",
|
| 13 |
escape_misc=False,
|
| 14 |
escape_underscores=False
|
| 15 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
|
|
|
|
| 1 |
+
from markdownify import markdownify, MarkdownConverter
|
| 2 |
+
from pydantic import BaseModel
|
| 3 |
+
|
| 4 |
from marker.v2.renderers.html import HTMLRenderer
|
| 5 |
|
| 6 |
|
| 7 |
+
class Markdownify(MarkdownConverter):
|
| 8 |
+
pass
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class MarkdownOutput(BaseModel):
|
| 12 |
+
markdown: str
|
| 13 |
+
images: dict
|
| 14 |
+
|
| 15 |
+
|
| 16 |
class MarkdownRenderer(HTMLRenderer):
|
| 17 |
+
def __call__(self, document) -> MarkdownOutput:
|
| 18 |
document_output = document.render()
|
| 19 |
+
full_html, images = self.extract_html(document, document_output)
|
| 20 |
+
md_cls = Markdownify(
|
|
|
|
| 21 |
heading_style="ATX",
|
| 22 |
bullets="-",
|
| 23 |
escape_misc=False,
|
| 24 |
escape_underscores=False
|
| 25 |
)
|
| 26 |
+
markdown = md_cls.convert(full_html)
|
| 27 |
+
return MarkdownOutput(
|
| 28 |
+
markdown=markdown,
|
| 29 |
+
images=images
|
| 30 |
+
)
|
| 31 |
|
| 32 |
|
marker/v2/schema/blocks/base.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
from typing import Optional, List, Any
|
|
|
|
| 4 |
|
| 5 |
from pydantic import BaseModel, ConfigDict, field_validator
|
| 6 |
|
|
@@ -44,6 +45,28 @@ class BlockId(BaseModel):
|
|
| 44 |
raise ValueError(f"Invalid block type: {v}")
|
| 45 |
return v
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
class Block(BaseModel):
|
| 49 |
polygon: PolygonBox
|
|
@@ -105,6 +128,8 @@ class Block(BaseModel):
|
|
| 105 |
template = ""
|
| 106 |
for c in child_blocks:
|
| 107 |
template += f"<content-ref src='{c.id}'></content-ref>"
|
|
|
|
|
|
|
| 108 |
return template
|
| 109 |
|
| 110 |
def render(self, document, parent_structure):
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
from typing import Optional, List, Any
|
| 4 |
+
import re
|
| 5 |
|
| 6 |
from pydantic import BaseModel, ConfigDict, field_validator
|
| 7 |
|
|
|
|
| 45 |
raise ValueError(f"Invalid block type: {v}")
|
| 46 |
return v
|
| 47 |
|
| 48 |
+
def to_path(self):
|
| 49 |
+
return str(self).replace('/', '_')
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def merge_consecutive_tags(html, tag):
|
| 53 |
+
if not html:
|
| 54 |
+
return html
|
| 55 |
+
|
| 56 |
+
def replace_with_space(match):
|
| 57 |
+
closing_tag, whitespace, opening_tag = match.groups()
|
| 58 |
+
return whitespace if whitespace else ''
|
| 59 |
+
|
| 60 |
+
pattern = fr'</{tag}>\s*<{tag}>'
|
| 61 |
+
|
| 62 |
+
while True:
|
| 63 |
+
new_merged = re.sub(pattern, replace_with_space, html)
|
| 64 |
+
if new_merged == html:
|
| 65 |
+
break
|
| 66 |
+
html = new_merged
|
| 67 |
+
|
| 68 |
+
return html
|
| 69 |
+
|
| 70 |
|
| 71 |
class Block(BaseModel):
|
| 72 |
polygon: PolygonBox
|
|
|
|
| 128 |
template = ""
|
| 129 |
for c in child_blocks:
|
| 130 |
template += f"<content-ref src='{c.id}'></content-ref>"
|
| 131 |
+
template = merge_consecutive_tags(template, 'b')
|
| 132 |
+
template = merge_consecutive_tags(template, 'i')
|
| 133 |
return template
|
| 134 |
|
| 135 |
def render(self, document, parent_structure):
|
marker/v2/schema/blocks/equation.py
CHANGED
|
@@ -6,4 +6,4 @@ class Equation(Block):
|
|
| 6 |
latex: str | None = None
|
| 7 |
|
| 8 |
def assemble_html(self, child_blocks, parent_structure=None):
|
| 9 |
-
return f"<
|
|
|
|
| 6 |
latex: str | None = None
|
| 7 |
|
| 8 |
def assemble_html(self, child_blocks, parent_structure=None):
|
| 9 |
+
return f"<p><math>{self.latex}</math></p>"
|
marker/v2/schema/blocks/figure.py
CHANGED
|
@@ -5,4 +5,4 @@ class Figure(Block):
|
|
| 5 |
block_type: str = "Figure"
|
| 6 |
|
| 7 |
def assemble_html(self, child_blocks, parent_structure):
|
| 8 |
-
return f"Image {self.block_id}"
|
|
|
|
| 5 |
block_type: str = "Figure"
|
| 6 |
|
| 7 |
def assemble_html(self, child_blocks, parent_structure):
|
| 8 |
+
return f"<p>Image {self.block_id}</p>"
|
marker/v2/schema/blocks/form.py
CHANGED
|
@@ -1,5 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from marker.v2.schema.blocks import Block
|
| 2 |
|
| 3 |
|
| 4 |
class Form(Block):
|
| 5 |
block_type: str = "Form"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
|
| 3 |
+
from tabled.formats import html_format
|
| 4 |
+
from tabled.schema import SpanTableCell
|
| 5 |
+
|
| 6 |
from marker.v2.schema.blocks import Block
|
| 7 |
|
| 8 |
|
| 9 |
class Form(Block):
|
| 10 |
block_type: str = "Form"
|
| 11 |
+
cells: List[SpanTableCell] | None = None
|
| 12 |
+
|
| 13 |
+
def assemble_html(self, child_blocks, parent_structure=None):
|
| 14 |
+
return html_format(self.cells)
|
marker/v2/schema/blocks/pagefooter.py
CHANGED
|
@@ -3,3 +3,8 @@ from marker.v2.schema.blocks import Block
|
|
| 3 |
|
| 4 |
class PageFooter(Block):
|
| 5 |
block_type: str = "PageFooter"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
class PageFooter(Block):
|
| 5 |
block_type: str = "PageFooter"
|
| 6 |
+
|
| 7 |
+
def assemble_html(self, child_blocks, parent_structure):
|
| 8 |
+
template = super().assemble_html(child_blocks, parent_structure)
|
| 9 |
+
template = template.replace("\n", " ")
|
| 10 |
+
return f"<p>{template}</p>"
|
marker/v2/schema/blocks/pageheader.py
CHANGED
|
@@ -3,3 +3,8 @@ from marker.v2.schema.blocks import Block
|
|
| 3 |
|
| 4 |
class PageHeader(Block):
|
| 5 |
block_type: str = "PageHeader"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
class PageHeader(Block):
|
| 5 |
block_type: str = "PageHeader"
|
| 6 |
+
|
| 7 |
+
def assemble_html(self, child_blocks, parent_structure):
|
| 8 |
+
template = super().assemble_html(child_blocks, parent_structure)
|
| 9 |
+
template = template.replace("\n", " ")
|
| 10 |
+
return f"<p>{template}</p>"
|
marker/v2/schema/blocks/picture.py
CHANGED
|
@@ -5,4 +5,4 @@ class Picture(Block):
|
|
| 5 |
block_type: str = "Picture"
|
| 6 |
|
| 7 |
def assemble_html(self, child_blocks, parent_structure):
|
| 8 |
-
return f"Image {self.block_id}"
|
|
|
|
| 5 |
block_type: str = "Picture"
|
| 6 |
|
| 7 |
def assemble_html(self, child_blocks, parent_structure):
|
| 8 |
+
return f"<p>Image {self.block_id}</p>"
|
marker/v2/schema/blocks/text.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
from marker.v2.schema.blocks import Block
|
| 2 |
|
| 3 |
-
|
| 4 |
class Text(Block):
|
| 5 |
block_type: str = "Text"
|
| 6 |
|
|
|
|
| 1 |
from marker.v2.schema.blocks import Block
|
| 2 |
|
|
|
|
| 3 |
class Text(Block):
|
| 4 |
block_type: str = "Text"
|
| 5 |
|
marker/v2/schema/blocks/toc.py
CHANGED
|
@@ -1,5 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from marker.v2.schema.blocks import Block
|
| 2 |
|
| 3 |
|
| 4 |
class TableOfContents(Block):
|
| 5 |
block_type: str = "TableOfContents"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
|
| 3 |
+
from tabled.formats import html_format
|
| 4 |
+
from tabled.schema import SpanTableCell
|
| 5 |
+
|
| 6 |
from marker.v2.schema.blocks import Block
|
| 7 |
|
| 8 |
|
| 9 |
class TableOfContents(Block):
|
| 10 |
block_type: str = "TableOfContents"
|
| 11 |
+
cells: List[SpanTableCell] | None = None
|
| 12 |
+
|
| 13 |
+
def assemble_html(self, child_blocks, parent_structure=None):
|
| 14 |
+
return html_format(self.cells)
|
marker/v2/schema/document.py
CHANGED
|
@@ -20,12 +20,17 @@ class Document(BaseModel):
|
|
| 20 |
block_type: str = "Document"
|
| 21 |
|
| 22 |
def get_block(self, block_id: BlockId):
|
| 23 |
-
page =
|
| 24 |
block = page.get_block(block_id)
|
| 25 |
if block:
|
| 26 |
return block
|
| 27 |
return None
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
def assemble_html(self, child_blocks):
|
| 30 |
template = ""
|
| 31 |
for c in child_blocks:
|
|
|
|
| 20 |
block_type: str = "Document"
|
| 21 |
|
| 22 |
def get_block(self, block_id: BlockId):
|
| 23 |
+
page = self.get_page(block_id.page_id)
|
| 24 |
block = page.get_block(block_id)
|
| 25 |
if block:
|
| 26 |
return block
|
| 27 |
return None
|
| 28 |
|
| 29 |
+
def get_page(self, page_id):
|
| 30 |
+
page = self.pages[page_id]
|
| 31 |
+
assert page.page_id == page_id, "Mismatch between page_id and page index"
|
| 32 |
+
return page
|
| 33 |
+
|
| 34 |
def assemble_html(self, child_blocks):
|
| 35 |
template = ""
|
| 36 |
for c in child_blocks:
|
marker/v2/schema/groups/list.py
CHANGED
|
@@ -6,4 +6,4 @@ class ListGroup(Block):
|
|
| 6 |
|
| 7 |
def assemble_html(self, child_blocks, parent_structure):
|
| 8 |
template = super().assemble_html(child_blocks, parent_structure)
|
| 9 |
-
return f"<ul>{template}</ul>"
|
|
|
|
| 6 |
|
| 7 |
def assemble_html(self, child_blocks, parent_structure):
|
| 8 |
template = super().assemble_html(child_blocks, parent_structure)
|
| 9 |
+
return f"<p><ul>{template}</ul></p>"
|
marker/v2/schema/text/line.py
CHANGED
|
@@ -25,7 +25,8 @@ def strip_trailing_hyphens(line_text, next_line_text, line_html) -> str:
|
|
| 25 |
next_line_starts_lowercase = regex.match(rf"^\s?[{lowercase_letters}]", next_line_text)
|
| 26 |
|
| 27 |
if hyphen_regex.match(line_text) and next_line_starts_lowercase:
|
| 28 |
-
|
|
|
|
| 29 |
return line_html
|
| 30 |
|
| 31 |
|
|
|
|
| 25 |
next_line_starts_lowercase = regex.match(rf"^\s?[{lowercase_letters}]", next_line_text)
|
| 26 |
|
| 27 |
if hyphen_regex.match(line_text) and next_line_starts_lowercase:
|
| 28 |
+
line_html = replace_last(line_html, rf'[{HYPHENS}]', "")
|
| 29 |
+
|
| 30 |
return line_html
|
| 31 |
|
| 32 |
|