Merge remote-tracking branch 'origin/vik_v2' into dev-mose/marker-v2
Browse files- marker/v2/builders/structure.py +1 -1
- marker/v2/converters/__init__.py +1 -0
- marker/v2/converters/pdf.py +16 -14
- marker/v2/processors/equation.py +1 -1
- marker/v2/processors/sectionheader.py +84 -0
- marker/v2/processors/table.py +1 -1
- marker/v2/providers/pdf.py +1 -1
- marker/v2/renderers/__init__.py +2 -4
- marker/v2/renderers/html.py +12 -1
- marker/v2/renderers/markdown.py +17 -1
- marker/v2/schema/blocks/base.py +3 -0
- marker/v2/schema/blocks/sectionheader.py +3 -1
- marker/v2/schema/groups/page.py +6 -0
- marker/v2/util.py +17 -4
- tests/utils.py +5 -8
marker/v2/builders/structure.py
CHANGED
|
@@ -12,7 +12,7 @@ from marker.v2.schema.groups.page import PageGroup
|
|
| 12 |
class StructureBuilder(BaseBuilder):
|
| 13 |
gap_threshold: int = 10
|
| 14 |
|
| 15 |
-
def __init__(self, config
|
| 16 |
super().__init__(config)
|
| 17 |
|
| 18 |
def __call__(self, document: Document):
|
|
|
|
| 12 |
class StructureBuilder(BaseBuilder):
|
| 13 |
gap_threshold: int = 10
|
| 14 |
|
| 15 |
+
def __init__(self, config=None):
|
| 16 |
super().__init__(config)
|
| 17 |
|
| 18 |
def __call__(self, document: Document):
|
marker/v2/converters/__init__.py
CHANGED
|
@@ -8,6 +8,7 @@ from marker.v2.util import assign_config
|
|
| 8 |
class BaseConverter:
|
| 9 |
def __init__(self, config: Optional[BaseModel | dict] = None):
|
| 10 |
assign_config(self, config)
|
|
|
|
| 11 |
|
| 12 |
def __call__(self, *args, **kwargs):
|
| 13 |
raise NotImplementedError
|
|
|
|
| 8 |
class BaseConverter:
|
| 9 |
def __init__(self, config: Optional[BaseModel | dict] = None):
|
| 10 |
assign_config(self, config)
|
| 11 |
+
self.config = config
|
| 12 |
|
| 13 |
def __call__(self, *args, **kwargs):
|
| 14 |
raise NotImplementedError
|
marker/v2/converters/pdf.py
CHANGED
|
@@ -1,9 +1,8 @@
|
|
| 1 |
-
<<<<<<< HEAD
|
| 2 |
import os
|
| 3 |
-
|
| 4 |
-
from marker.v2.providers.pdf import PdfProvider
|
| 5 |
|
| 6 |
-
|
|
|
|
| 7 |
import tempfile
|
| 8 |
from typing import List, Optional
|
| 9 |
|
|
@@ -24,7 +23,7 @@ from marker.v2.renderers.markdown import MarkdownRenderer
|
|
| 24 |
|
| 25 |
|
| 26 |
class PdfConverter(BaseConverter):
|
| 27 |
-
def __init__(self, config
|
| 28 |
super().__init__(config)
|
| 29 |
|
| 30 |
self.layout_model = setup_layout_model()
|
|
@@ -33,21 +32,24 @@ class PdfConverter(BaseConverter):
|
|
| 33 |
self.table_rec_model = setup_table_rec_model()
|
| 34 |
self.detection_model = setup_detection_model()
|
| 35 |
|
| 36 |
-
def __call__(self, filepath: str
|
| 37 |
-
pdf_provider = PdfProvider(filepath,
|
| 38 |
|
| 39 |
-
layout_builder = LayoutBuilder(self.layout_model)
|
| 40 |
-
ocr_builder = OcrBuilder(self.detection_model, self.recognition_model)
|
| 41 |
-
document = DocumentBuilder()(pdf_provider, layout_builder, ocr_builder)
|
| 42 |
-
StructureBuilder()(document)
|
| 43 |
|
| 44 |
-
equation_processor = EquationProcessor(self.texify_model)
|
| 45 |
equation_processor(document)
|
| 46 |
|
| 47 |
-
table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model)
|
| 48 |
table_processor(document)
|
| 49 |
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
| 51 |
return renderer(document)
|
| 52 |
|
| 53 |
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
|
|
|
|
| 3 |
|
| 4 |
+
from marker.v2.processors.sectionheader import SectionHeaderProcessor
|
| 5 |
+
from marker.v2.providers.pdf import PdfProvider
|
| 6 |
import tempfile
|
| 7 |
from typing import List, Optional
|
| 8 |
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
class PdfConverter(BaseConverter):
|
| 26 |
+
def __init__(self, config=None):
|
| 27 |
super().__init__(config)
|
| 28 |
|
| 29 |
self.layout_model = setup_layout_model()
|
|
|
|
| 32 |
self.table_rec_model = setup_table_rec_model()
|
| 33 |
self.detection_model = setup_detection_model()
|
| 34 |
|
| 35 |
+
def __call__(self, filepath: str):
    """Run the full PDF pipeline on ``filepath`` and return the rendered output.

    The converter's ``config`` is handed to every provider, builder,
    processor and renderer that is created here. The return value is
    whatever ``MarkdownRenderer`` returns for the processed document.
    """
    cfg = self.config

    # Build the document: provider -> layout/OCR builders -> structure.
    provider = PdfProvider(filepath, cfg)
    layout = LayoutBuilder(self.layout_model, cfg)
    ocr = OcrBuilder(self.detection_model, self.recognition_model, cfg)
    document = DocumentBuilder(cfg)(provider, layout, ocr)
    StructureBuilder(cfg)(document)

    # Each processor is called for its effect on ``document``; return
    # values are ignored. Order: equations, tables, section headers.
    EquationProcessor(self.texify_model, cfg)(document)
    TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model, cfg)(document)
    SectionHeaderProcessor(cfg)(document)

    return MarkdownRenderer(cfg)(document)
|
| 54 |
|
| 55 |
|
marker/v2/processors/equation.py
CHANGED
|
@@ -16,7 +16,7 @@ class EquationProcessor(BaseProcessor):
|
|
| 16 |
batch_size = None
|
| 17 |
token_buffer = 256
|
| 18 |
|
| 19 |
-
def __init__(self, texify_model, config
|
| 20 |
super().__init__(config)
|
| 21 |
|
| 22 |
self.texify_model = texify_model
|
|
|
|
| 16 |
batch_size = None
|
| 17 |
token_buffer = 256
|
| 18 |
|
| 19 |
+
def __init__(self, texify_model, config=None):
|
| 20 |
super().__init__(config)
|
| 21 |
|
| 22 |
self.texify_model = texify_model
|
marker/v2/processors/sectionheader.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from marker.v2.processors import BaseProcessor
|
| 2 |
+
from marker.v2.schema import BlockTypes
|
| 3 |
+
from marker.v2.schema.document import Document
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
from sklearn.cluster import KMeans
|
| 7 |
+
from sklearn.exceptions import ConvergenceWarning
|
| 8 |
+
|
| 9 |
+
# Ignore sklearn warning about not converging
|
| 10 |
+
import warnings
|
| 11 |
+
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class SectionHeaderProcessor(BaseProcessor):
    """Assign a ``heading_level`` to every section header block of a document.

    The line heights of all section headers are clustered into height
    ranges; a header whose average line height reaches the lower bound of a
    range receives that range's rank (1 = tallest range) as its level.
    """
    block_types = (BlockTypes.SectionHeader, )
    level_count = 4  # bucketing is skipped unless there are more line heights than this
    merge_threshold = .25  # factor used when deciding whether adjacent clusters form separate ranges
    default_level = 2  # level given to headers that match no range
    height_tolerance = .99  # slack applied to a range's lower bound when matching

    def __call__(self, document: Document):
        """Set ``heading_level`` on each section header block in ``document``."""
        # Keep (block, heights) pairs instead of a dict keyed on
        # ``block.block_id``: that id appears to be page-local (BlockId pairs
        # it with a page_id), so headers on different pages could otherwise
        # overwrite each other's heights.
        header_heights = []
        for page in document.pages:
            for block in page.children:
                if block.block_type not in self.block_types:
                    continue

                heights = [
                    document.get_block(line_id).polygon.height
                    for line_id in block.structure
                    if line_id.block_type == BlockTypes.Line
                ]
                header_heights.append((block, heights))

        flat_line_heights = [h for _, heights in header_heights for h in heights]
        heading_ranges = self.bucket_headings(flat_line_heights)

        for block, heights in header_heights:
            if heights:
                avg_height = sum(heights) / len(heights)
                # Ranges are ordered tallest first, so the first match wins.
                for idx, (min_height, _max_height) in enumerate(heading_ranges):
                    if avg_height >= min_height * self.height_tolerance:
                        block.heading_level = idx + 1
                        break

            if block.heading_level is None:
                block.heading_level = self.default_level

    def bucket_headings(self, line_heights, num_levels=4):
        """Cluster ``line_heights`` and return ``(min, max)`` height ranges.

        Args:
            line_heights: flat list of line heights from all section headers.
            num_levels: number of KMeans clusters to fit.

        Returns:
            A list of ``(min_height, max_height)`` tuples sorted in descending
            order, or ``[]`` when there are too few samples to cluster.
        """
        # Too few samples: either not more than level_count, or fewer than
        # the number of clusters KMeans is asked to fit (which would raise).
        if len(line_heights) <= self.level_count or len(line_heights) < num_levels:
            return []

        data = np.asarray(line_heights).reshape(-1, 1)
        labels = KMeans(n_clusters=num_levels, random_state=0, n_init="auto").fit_predict(data)
        data_labels = np.concatenate([data, labels.reshape(-1, 1)], axis=1)
        # Order whole rows by height. np.sort(..., axis=0) would sort the
        # height and label columns independently and detach every height
        # from its own cluster label.
        data_labels = data_labels[np.argsort(data_labels[:, 0], kind="stable")]

        cluster_means = {
            int(label): float(np.mean(data_labels[data_labels[:, 1] == label, 0]))
            for label in np.unique(labels)
        }
        label_max = None
        label_min = None
        heading_ranges = []
        prev_cluster = None
        for value, label in data_labels:
            value = float(value)
            label = int(label)
            if prev_cluster is not None and label != prev_cluster:
                prev_cluster_mean = cluster_means[prev_cluster]
                cluster_mean = cluster_means[label]
                # Close the current range at a cluster boundary when this
                # condition holds; otherwise the range keeps growing across
                # the boundary.
                if cluster_mean * self.merge_threshold < prev_cluster_mean:
                    heading_ranges.append((label_min, label_max))
                    label_min = None
                    label_max = None

            label_min = value if label_min is None else min(label_min, value)
            label_max = value if label_max is None else max(label_max, value)
            prev_cluster = label

        if label_min is not None:
            heading_ranges.append((label_min, label_max))

        # Tallest range first, so index 0 maps to heading level 1.
        heading_ranges = sorted(heading_ranges, reverse=True)

        return heading_ranges
|
marker/v2/processors/table.py
CHANGED
|
@@ -18,7 +18,7 @@ class TableProcessor(BaseProcessor):
|
|
| 18 |
table_rec_batch_size = None
|
| 19 |
ocr_batch_size = None
|
| 20 |
|
| 21 |
-
def __init__(self, detection_model, ocr_model, table_rec_model, config
|
| 22 |
super().__init__(config)
|
| 23 |
|
| 24 |
self.detection_model = detection_model
|
|
|
|
| 18 |
table_rec_batch_size = None
|
| 19 |
ocr_batch_size = None
|
| 20 |
|
| 21 |
+
def __init__(self, detection_model, ocr_model, table_rec_model, config=None):
|
| 22 |
super().__init__(config)
|
| 23 |
|
| 24 |
self.detection_model = detection_model
|
marker/v2/providers/pdf.py
CHANGED
|
@@ -23,7 +23,7 @@ class PdfProvider(BaseProvider):
|
|
| 23 |
flatten_pdf: bool = True
|
| 24 |
force_ocr: bool = False
|
| 25 |
|
| 26 |
-
def __init__(self, filepath: str, config
|
| 27 |
super().__init__(filepath, config)
|
| 28 |
|
| 29 |
self.doc: pdfium.PdfDocument = pdfium.PdfDocument(self.filepath)
|
|
|
|
| 23 |
flatten_pdf: bool = True
|
| 24 |
force_ocr: bool = False
|
| 25 |
|
| 26 |
+
def __init__(self, filepath: str, config = None):
|
| 27 |
super().__init__(filepath, config)
|
| 28 |
|
| 29 |
self.doc: pdfium.PdfDocument = pdfium.PdfDocument(self.filepath)
|
marker/v2/renderers/__init__.py
CHANGED
|
@@ -3,16 +3,14 @@ from typing import Optional
|
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
| 5 |
from marker.v2.schema import BlockTypes
|
| 6 |
-
|
| 7 |
|
| 8 |
|
| 9 |
class BaseRenderer:
|
| 10 |
block_type: BlockTypes | None = None
|
| 11 |
|
| 12 |
def __init__(self, config: Optional[BaseModel | dict] = None):
|
| 13 |
-
|
| 14 |
-
for k in config.model_fields:
|
| 15 |
-
setattr(self, k, config[k])
|
| 16 |
|
| 17 |
def __call__(self, document):
|
| 18 |
# Children are in reading order
|
|
|
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
| 5 |
from marker.v2.schema import BlockTypes
|
| 6 |
+
from marker.v2.util import assign_config
|
| 7 |
|
| 8 |
|
| 9 |
class BaseRenderer:
|
| 10 |
block_type: BlockTypes | None = None
|
| 11 |
|
| 12 |
def __init__(self, config: Optional[BaseModel | dict] = None):
|
| 13 |
+
assign_config(self, config)
|
|
|
|
|
|
|
| 14 |
|
| 15 |
def __call__(self, document):
|
| 16 |
# Children are in reading order
|
marker/v2/renderers/html.py
CHANGED
|
@@ -1,12 +1,16 @@
|
|
| 1 |
import re
|
| 2 |
|
| 3 |
-
from bs4 import BeautifulSoup
|
| 4 |
from pydantic import BaseModel
|
| 5 |
|
| 6 |
from marker.v2.renderers import BaseRenderer
|
| 7 |
from marker.v2.schema import BlockTypes
|
| 8 |
from marker.v2.schema.blocks import BlockId
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
class HTMLOutput(BaseModel):
|
| 12 |
html: str
|
|
@@ -34,6 +38,8 @@ def merge_consecutive_tags(html, tag):
|
|
| 34 |
class HTMLRenderer(BaseRenderer):
|
| 35 |
remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
|
| 36 |
image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
|
|
|
|
|
|
|
| 37 |
|
| 38 |
def extract_image(self, document, image_id):
|
| 39 |
image_block = document.get_block(image_id)
|
|
@@ -65,6 +71,11 @@ class HTMLRenderer(BaseRenderer):
|
|
| 65 |
image_name = f"{ref_block_id.to_path()}.png"
|
| 66 |
images[image_name] = image
|
| 67 |
ref.replace_with(BeautifulSoup(f"<p><img src='{image_name}'></p>", 'html.parser'))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
else:
|
| 69 |
images.update(sub_images)
|
| 70 |
ref.replace_with(BeautifulSoup(f"{content}", 'html.parser'))
|
|
|
|
| 1 |
import re
|
| 2 |
|
| 3 |
+
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
|
| 4 |
from pydantic import BaseModel
|
| 5 |
|
| 6 |
from marker.v2.renderers import BaseRenderer
|
| 7 |
from marker.v2.schema import BlockTypes
|
| 8 |
from marker.v2.schema.blocks import BlockId
|
| 9 |
|
| 10 |
+
# Ignore beautifulsoup warnings
|
| 11 |
+
import warnings
|
| 12 |
+
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
|
| 13 |
+
|
| 14 |
|
| 15 |
class HTMLOutput(BaseModel):
|
| 16 |
html: str
|
|
|
|
| 38 |
class HTMLRenderer(BaseRenderer):
|
| 39 |
remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
|
| 40 |
image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
|
| 41 |
+
page_blocks: list = [BlockTypes.Page]
|
| 42 |
+
paginate_output: bool = False
|
| 43 |
|
| 44 |
def extract_image(self, document, image_id):
|
| 45 |
image_block = document.get_block(image_id)
|
|
|
|
| 71 |
image_name = f"{ref_block_id.to_path()}.png"
|
| 72 |
images[image_name] = image
|
| 73 |
ref.replace_with(BeautifulSoup(f"<p><img src='{image_name}'></p>", 'html.parser'))
|
| 74 |
+
elif ref_block_id.block_type in self.page_blocks:
|
| 75 |
+
images.update(sub_images)
|
| 76 |
+
if self.paginate_output:
|
| 77 |
+
content = f"<div class='page' data-page-id='{ref_block_id.page_id}'>{content}</div>"
|
| 78 |
+
ref.replace_with(BeautifulSoup(f"{content}", 'html.parser'))
|
| 79 |
else:
|
| 80 |
images.update(sub_images)
|
| 81 |
ref.replace_with(BeautifulSoup(f"{content}", 'html.parser'))
|
marker/v2/renderers/markdown.py
CHANGED
|
@@ -6,7 +6,19 @@ from marker.v2.schema.document import Document
|
|
| 6 |
|
| 7 |
|
| 8 |
class Markdownify(MarkdownConverter):
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
class MarkdownOutput(BaseModel):
|
|
@@ -15,10 +27,14 @@ class MarkdownOutput(BaseModel):
|
|
| 15 |
|
| 16 |
|
| 17 |
class MarkdownRenderer(HTMLRenderer):
|
|
|
|
|
|
|
| 18 |
def __call__(self, document: Document) -> MarkdownOutput:
|
| 19 |
document_output = document.render()
|
| 20 |
full_html, images = self.extract_html(document, document_output)
|
| 21 |
md_cls = Markdownify(
|
|
|
|
|
|
|
| 22 |
heading_style="ATX",
|
| 23 |
bullets="-",
|
| 24 |
escape_misc=False,
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
class Markdownify(MarkdownConverter):
    """MarkdownConverter that can prefix each page ``<div>`` with a separator."""

    def __init__(self, paginate_output, page_separator, **kwargs):
        """Store the pagination settings; other kwargs go to MarkdownConverter."""
        super().__init__(**kwargs)
        self.paginate_output = paginate_output
        self.page_separator = page_separator

    def convert_div(self, el, text, convert_as_inline):
        """Return ``text``, preceded by a page marker for page divs.

        A div counts as a page when ``page`` is among its classes. The marker
        is ``{<data-page-id>}`` followed by ``page_separator``, surrounded by
        blank lines, and is only emitted when ``paginate_output`` is truthy.
        """
        # Membership test instead of ``el['class'][0]``: indexing raises
        # IndexError on an empty class list and misses "page" when it is not
        # the first class.
        classes = el.get('class') or []
        if isinstance(classes, str):
            # Some parser configurations keep ``class`` as a single string.
            classes = classes.split()
        is_page = 'page' in classes

        if self.paginate_output and is_page:
            page_id = el['data-page-id']
            pagination_item = "\n\n" + "{" + str(page_id) + "}" + self.page_separator + "\n\n"
            return pagination_item + text
        return text
|
| 22 |
|
| 23 |
|
| 24 |
class MarkdownOutput(BaseModel):
|
|
|
|
| 27 |
|
| 28 |
|
| 29 |
class MarkdownRenderer(HTMLRenderer):
|
| 30 |
+
page_separator: str = "-" * 48
|
| 31 |
+
|
| 32 |
def __call__(self, document: Document) -> MarkdownOutput:
|
| 33 |
document_output = document.render()
|
| 34 |
full_html, images = self.extract_html(document, document_output)
|
| 35 |
md_cls = Markdownify(
|
| 36 |
+
self.paginate_output,
|
| 37 |
+
self.page_separator,
|
| 38 |
heading_style="ATX",
|
| 39 |
bullets="-",
|
| 40 |
escape_misc=False,
|
marker/v2/schema/blocks/base.py
CHANGED
|
@@ -28,6 +28,9 @@ class BlockId(BaseModel):
|
|
| 28 |
return f"/page/{self.page_id}"
|
| 29 |
return f"/page/{self.page_id}/{self.block_type.name}/{self.block_id}"
|
| 30 |
|
|
|
|
|
|
|
|
|
|
| 31 |
def __repr__(self):
|
| 32 |
return str(self)
|
| 33 |
|
|
|
|
| 28 |
return f"/page/{self.page_id}"
|
| 29 |
return f"/page/{self.page_id}/{self.block_type.name}/{self.block_id}"
|
| 30 |
|
| 31 |
+
def __hash__(self):
|
| 32 |
+
return hash(str(self))
|
| 33 |
+
|
| 34 |
def __repr__(self):
|
| 35 |
return str(self)
|
| 36 |
|
marker/v2/schema/blocks/sectionheader.py
CHANGED
|
@@ -4,8 +4,10 @@ from marker.v2.schema.blocks import Block
|
|
| 4 |
|
| 5 |
class SectionHeader(Block):
|
| 6 |
block_type: BlockTypes = BlockTypes.SectionHeader
|
|
|
|
| 7 |
|
| 8 |
def assemble_html(self, child_blocks, parent_structure):
|
| 9 |
template = super().assemble_html(child_blocks, parent_structure)
|
| 10 |
template = template.replace("\n", " ")
|
| 11 |
-
|
|
|
|
|
|
| 4 |
|
| 5 |
class SectionHeader(Block):
    # Block rendered as an <hN> heading element.
    block_type: BlockTypes = BlockTypes.SectionHeader
    # Heading depth used for the tag; a falsy value renders as <h2>.
    heading_level: int | None = None

    def assemble_html(self, child_blocks, parent_structure):
        """Wrap the parent's HTML, with newlines turned into spaces, in a heading tag."""
        inner = super().assemble_html(child_blocks, parent_structure)
        inner = inner.replace("\n", " ")
        level = self.heading_level or 2
        return f"<h{level}>{inner}</h{level}>"
|
marker/v2/schema/groups/page.py
CHANGED
|
@@ -48,6 +48,12 @@ class PageGroup(Block):
|
|
| 48 |
assert block.block_id == block_id.block_id
|
| 49 |
return block
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
def merge_blocks(
|
| 52 |
self,
|
| 53 |
page_lines: List[Line],
|
|
|
|
| 48 |
assert block.block_id == block_id.block_id
|
| 49 |
return block
|
| 50 |
|
| 51 |
+
def assemble_html(self, child_blocks, parent_structure=None):
    """Return one ``<content-ref>`` placeholder per child, concatenated in order.

    Each placeholder's ``src`` attribute is the child's ``id``. An empty
    ``child_blocks`` yields an empty string. ``parent_structure`` is unused.
    """
    return "".join(
        f"<content-ref src='{child.id}'></content-ref>" for child in child_blocks
    )
|
| 56 |
+
|
| 57 |
def merge_blocks(
|
| 58 |
self,
|
| 59 |
page_lines: List[Line],
|
marker/v2/util.py
CHANGED
|
@@ -2,11 +2,24 @@ from pydantic import BaseModel
|
|
| 2 |
|
| 3 |
|
| 4 |
def assign_config(cls, config: BaseModel | dict | None):
|
|
|
|
| 5 |
if config is None:
|
| 6 |
return
|
| 7 |
elif isinstance(config, BaseModel):
|
| 8 |
-
|
| 9 |
-
setattr(cls, k, config[k])
|
| 10 |
elif isinstance(config, dict):
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
|
| 4 |
def assign_config(cls, config: BaseModel | dict | None):
    """Copy matching configuration values onto ``cls`` as attributes.

    Args:
        cls: the object to configure (callers pass an instance, despite the name).
        config: a dict, a pydantic model, or ``None`` (no-op).

    Only keys that already exist as attributes on ``cls`` are assigned.
    Keys of the form ``"<ClassName>_<attr>"`` (e.g.
    ``"MarkdownRenderer_remove_blocks"``) target one class; they are applied
    in a second pass so they override a plain ``"<attr>"`` key.

    Raises:
        ValueError: if ``config`` is neither a dict nor a pydantic BaseModel.
    """
    if config is None:
        return
    if isinstance(config, BaseModel):
        # pydantic v2 deprecates .dict() in favour of .model_dump(); fall
        # back to .dict() for models that do not provide model_dump.
        dump = getattr(config, "model_dump", None) or config.dict
        dict_config = dump()
    elif isinstance(config, dict):
        dict_config = config
    else:
        raise ValueError("config must be a dict or a pydantic BaseModel")

    # Pass 1: plain keys that name an existing attribute.
    for key, value in dict_config.items():
        if hasattr(cls, key):
            setattr(cls, key, value)

    # Pass 2: class-specific keys, like "MarkdownRenderer_remove_blocks".
    prefix = cls.__class__.__name__ + "_"
    for key, value in dict_config.items():
        if not key.startswith(prefix):
            continue
        attr_name = key[len(prefix):]
        if hasattr(cls, attr_name):
            setattr(cls, attr_name, value)
|
tests/utils.py
CHANGED
|
@@ -11,10 +11,7 @@ from marker.v2.schema.document import Document
|
|
| 11 |
|
| 12 |
def setup_pdf_document(
|
| 13 |
filename='adversarial.pdf',
|
| 14 |
-
|
| 15 |
-
layout_builder_config=None,
|
| 16 |
-
ocr_builder_config=None,
|
| 17 |
-
document_builder_config=None
|
| 18 |
) -> Document:
|
| 19 |
dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
|
| 20 |
idx = dataset['filename'].index(filename)
|
|
@@ -27,9 +24,9 @@ def setup_pdf_document(
|
|
| 27 |
recognition_model = setup_recognition_model()
|
| 28 |
detection_model = setup_detection_model()
|
| 29 |
|
| 30 |
-
provider = PdfProvider(temp_pdf.name,
|
| 31 |
-
layout_builder = LayoutBuilder(layout_model,
|
| 32 |
-
ocr_builder = OcrBuilder(detection_model, recognition_model,
|
| 33 |
-
builder = DocumentBuilder(
|
| 34 |
document = builder(provider, layout_builder, ocr_builder)
|
| 35 |
return document
|
|
|
|
| 11 |
|
| 12 |
def setup_pdf_document(
|
| 13 |
filename='adversarial.pdf',
|
| 14 |
+
config=None,
|
|
|
|
|
|
|
|
|
|
| 15 |
) -> Document:
|
| 16 |
dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
|
| 17 |
idx = dataset['filename'].index(filename)
|
|
|
|
| 24 |
recognition_model = setup_recognition_model()
|
| 25 |
detection_model = setup_detection_model()
|
| 26 |
|
| 27 |
+
provider = PdfProvider(temp_pdf.name, config)
|
| 28 |
+
layout_builder = LayoutBuilder(layout_model, config)
|
| 29 |
+
ocr_builder = OcrBuilder(detection_model, recognition_model, config)
|
| 30 |
+
builder = DocumentBuilder(config)
|
| 31 |
document = builder(provider, layout_builder, ocr_builder)
|
| 32 |
return document
|