Merge branch 'dev' of https://github.com/datalab-to/marker into dev
Browse files
- marker/builders/layout.py +3 -1
- marker/builders/line.py +2 -2
- marker/builders/ocr.py +2 -0
- marker/models.py +3 -6
- marker/renderers/markdown.py +9 -0
- marker/schema/groups/page.py +2 -0
marker/builders/layout.py
CHANGED
|
@@ -146,7 +146,9 @@ class LayoutBuilder(BaseBuilder):
|
|
| 146 |
layout_page_size, provider_page_size
|
| 147 |
).fit_to_bounds((0, 0, *provider_page_size))
|
| 148 |
layout_block.top_k = {
|
| 149 |
-
BlockTypes[label]: prob
|
|
|
|
|
|
|
| 150 |
}
|
| 151 |
page.add_structure(layout_block)
|
| 152 |
|
|
|
|
| 146 |
layout_page_size, provider_page_size
|
| 147 |
).fit_to_bounds((0, 0, *provider_page_size))
|
| 148 |
layout_block.top_k = {
|
| 149 |
+
BlockTypes[label]: prob
|
| 150 |
+
for (label, prob) in bbox.top_k.items()
|
| 151 |
+
if label in BlockTypes.__members__
|
| 152 |
}
|
| 153 |
page.add_structure(layout_block)
|
| 154 |
|
marker/builders/line.py
CHANGED
|
@@ -70,7 +70,6 @@ class LineBuilder(BaseBuilder):
|
|
| 70 |
BlockTypes.Table,
|
| 71 |
BlockTypes.Form,
|
| 72 |
BlockTypes.TableOfContents,
|
| 73 |
-
BlockTypes.Equation,
|
| 74 |
)
|
| 75 |
disable_tqdm: Annotated[
|
| 76 |
bool,
|
|
@@ -81,6 +80,7 @@ class LineBuilder(BaseBuilder):
|
|
| 81 |
"Disable OCR for the document. This will only use the lines from the provider.",
|
| 82 |
] = False
|
| 83 |
keep_chars: Annotated[bool, "Keep individual characters."] = False
|
|
|
|
| 84 |
|
| 85 |
def __init__(
|
| 86 |
self,
|
|
@@ -191,7 +191,7 @@ class LineBuilder(BaseBuilder):
|
|
| 191 |
detection_boxes = []
|
| 192 |
if detection_result:
|
| 193 |
detection_boxes = [
|
| 194 |
-
PolygonBox(polygon=box.polygon) for box in detection_result.bboxes
|
| 195 |
]
|
| 196 |
|
| 197 |
detection_boxes = sort_text_lines(detection_boxes)
|
|
|
|
| 70 |
BlockTypes.Table,
|
| 71 |
BlockTypes.Form,
|
| 72 |
BlockTypes.TableOfContents,
|
|
|
|
| 73 |
)
|
| 74 |
disable_tqdm: Annotated[
|
| 75 |
bool,
|
|
|
|
| 80 |
"Disable OCR for the document. This will only use the lines from the provider.",
|
| 81 |
] = False
|
| 82 |
keep_chars: Annotated[bool, "Keep individual characters."] = False
|
| 83 |
+
detection_line_min_confidence: Annotated[float, "Minimum confidence for a detected line to be included"] = 0.8
|
| 84 |
|
| 85 |
def __init__(
|
| 86 |
self,
|
|
|
|
| 191 |
detection_boxes = []
|
| 192 |
if detection_result:
|
| 193 |
detection_boxes = [
|
| 194 |
+
PolygonBox(polygon=box.polygon) for box in detection_result.bboxes if box.confidence > self.detection_line_min_confidence
|
| 195 |
]
|
| 196 |
|
| 197 |
detection_boxes = sort_text_lines(detection_boxes)
|
marker/builders/ocr.py
CHANGED
|
@@ -47,6 +47,8 @@ class OcrBuilder(BaseBuilder):
|
|
| 47 |
BlockTypes.Figure,
|
| 48 |
BlockTypes.Picture,
|
| 49 |
BlockTypes.Table,
|
|
|
|
|
|
|
| 50 |
]
|
| 51 |
full_ocr_block_types: Annotated[
|
| 52 |
List[BlockTypes],
|
|
|
|
| 47 |
BlockTypes.Figure,
|
| 48 |
BlockTypes.Picture,
|
| 49 |
BlockTypes.Table,
|
| 50 |
+
BlockTypes.Form,
|
| 51 |
+
BlockTypes.TableOfContents,
|
| 52 |
]
|
| 53 |
full_ocr_block_types: Annotated[
|
| 54 |
List[BlockTypes],
|
marker/models.py
CHANGED
|
@@ -10,18 +10,15 @@ from surya.layout import LayoutPredictor
|
|
| 10 |
from surya.ocr_error import OCRErrorPredictor
|
| 11 |
from surya.recognition import RecognitionPredictor
|
| 12 |
from surya.table_rec import TableRecPredictor
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
def create_model_dict(
|
| 16 |
device=None, dtype=None, attention_implementation: str | None = None
|
| 17 |
) -> dict:
|
| 18 |
-
foundation_predictor = FoundationPredictor(
|
| 19 |
-
device=device, dtype=dtype, attention_implementation=attention_implementation
|
| 20 |
-
)
|
| 21 |
return {
|
| 22 |
-
"
|
| 23 |
-
"
|
| 24 |
-
"recognition_model": RecognitionPredictor(foundation_predictor),
|
| 25 |
"table_rec_model": TableRecPredictor(device=device, dtype=dtype),
|
| 26 |
"detection_model": DetectionPredictor(device=device, dtype=dtype),
|
| 27 |
"ocr_error_model": OCRErrorPredictor(device=device, dtype=dtype),
|
|
|
|
| 10 |
from surya.ocr_error import OCRErrorPredictor
|
| 11 |
from surya.recognition import RecognitionPredictor
|
| 12 |
from surya.table_rec import TableRecPredictor
|
| 13 |
+
from surya.settings import settings as surya_settings
|
| 14 |
|
| 15 |
|
| 16 |
def create_model_dict(
|
| 17 |
device=None, dtype=None, attention_implementation: str | None = None
|
| 18 |
) -> dict:
|
|
|
|
|
|
|
|
|
|
| 19 |
return {
|
| 20 |
+
"layout_model": LayoutPredictor(FoundationPredictor(checkpoint=surya_settings.LAYOUT_MODEL_CHECKPOINT, attention_implementation=attention_implementation, device=device, dtype=dtype)),
|
| 21 |
+
"recognition_model": RecognitionPredictor(FoundationPredictor(checkpoint=surya_settings.RECOGNITION_MODEL_CHECKPOINT, attention_implementation=attention_implementation, device=device, dtype=dtype)),
|
|
|
|
| 22 |
"table_rec_model": TableRecPredictor(device=device, dtype=dtype),
|
| 23 |
"detection_model": DetectionPredictor(device=device, dtype=dtype),
|
| 24 |
"ocr_error_model": OCRErrorPredictor(device=device, dtype=dtype),
|
marker/renderers/markdown.py
CHANGED
|
@@ -62,6 +62,7 @@ class Markdownify(MarkdownConverter):
|
|
| 62 |
page_separator,
|
| 63 |
inline_math_delimiters,
|
| 64 |
block_math_delimiters,
|
|
|
|
| 65 |
**kwargs,
|
| 66 |
):
|
| 67 |
super().__init__(**kwargs)
|
|
@@ -69,6 +70,7 @@ class Markdownify(MarkdownConverter):
|
|
| 69 |
self.page_separator = page_separator
|
| 70 |
self.inline_math_delimiters = inline_math_delimiters
|
| 71 |
self.block_math_delimiters = block_math_delimiters
|
|
|
|
| 72 |
|
| 73 |
def convert_div(self, el, text, parent_tags):
|
| 74 |
is_page = el.has_attr("class") and el["class"][0] == "page"
|
|
@@ -116,6 +118,9 @@ class Markdownify(MarkdownConverter):
|
|
| 116 |
)
|
| 117 |
|
| 118 |
def convert_table(self, el, text, parent_tags):
|
|
|
|
|
|
|
|
|
|
| 119 |
total_rows = len(el.find_all("tr"))
|
| 120 |
colspans = []
|
| 121 |
rowspan_cols = defaultdict(int)
|
|
@@ -268,6 +273,9 @@ class MarkdownRenderer(HTMLRenderer):
|
|
| 268 |
block_math_delimiters: Annotated[
|
| 269 |
Tuple[str], "The delimiters to use for block math."
|
| 270 |
] = ("$$", "$$")
|
|
|
|
|
|
|
|
|
|
| 271 |
|
| 272 |
@property
|
| 273 |
def md_cls(self):
|
|
@@ -284,6 +292,7 @@ class MarkdownRenderer(HTMLRenderer):
|
|
| 284 |
sup_symbol="<sup>",
|
| 285 |
inline_math_delimiters=self.inline_math_delimiters,
|
| 286 |
block_math_delimiters=self.block_math_delimiters,
|
|
|
|
| 287 |
)
|
| 288 |
|
| 289 |
def __call__(self, document: Document) -> MarkdownOutput:
|
|
|
|
| 62 |
page_separator,
|
| 63 |
inline_math_delimiters,
|
| 64 |
block_math_delimiters,
|
| 65 |
+
html_tables_in_markdown,
|
| 66 |
**kwargs,
|
| 67 |
):
|
| 68 |
super().__init__(**kwargs)
|
|
|
|
| 70 |
self.page_separator = page_separator
|
| 71 |
self.inline_math_delimiters = inline_math_delimiters
|
| 72 |
self.block_math_delimiters = block_math_delimiters
|
| 73 |
+
self.html_tables_in_markdown = html_tables_in_markdown
|
| 74 |
|
| 75 |
def convert_div(self, el, text, parent_tags):
|
| 76 |
is_page = el.has_attr("class") and el["class"][0] == "page"
|
|
|
|
| 118 |
)
|
| 119 |
|
| 120 |
def convert_table(self, el, text, parent_tags):
|
| 121 |
+
if self.html_tables_in_markdown:
|
| 122 |
+
return "\n\n" + str(el) + "\n\n"
|
| 123 |
+
|
| 124 |
total_rows = len(el.find_all("tr"))
|
| 125 |
colspans = []
|
| 126 |
rowspan_cols = defaultdict(int)
|
|
|
|
| 273 |
block_math_delimiters: Annotated[
|
| 274 |
Tuple[str], "The delimiters to use for block math."
|
| 275 |
] = ("$$", "$$")
|
| 276 |
+
html_tables_in_markdown: Annotated[
|
| 277 |
+
bool, "Return tables formatted as HTML, instead of in markdown"
|
| 278 |
+
] = False
|
| 279 |
|
| 280 |
@property
|
| 281 |
def md_cls(self):
|
|
|
|
| 292 |
sup_symbol="<sup>",
|
| 293 |
inline_math_delimiters=self.inline_math_delimiters,
|
| 294 |
block_math_delimiters=self.block_math_delimiters,
|
| 295 |
+
html_tables_in_markdown=self.html_tables_in_markdown
|
| 296 |
)
|
| 297 |
|
| 298 |
def __call__(self, document: Document) -> MarkdownOutput:
|
marker/schema/groups/page.py
CHANGED
|
@@ -169,6 +169,8 @@ class PageGroup(Group):
|
|
| 169 |
|
| 170 |
max_intersection_pct = 0
|
| 171 |
for block_idx, block in enumerate(structure_blocks):
|
|
|
|
|
|
|
| 172 |
max_intersection_pct = max(max_intersection_pct, np.max(intersection_matrix[block_idx]) / block.polygon.area)
|
| 173 |
|
| 174 |
return max_intersection_pct
|
|
|
|
| 169 |
|
| 170 |
max_intersection_pct = 0
|
| 171 |
for block_idx, block in enumerate(structure_blocks):
|
| 172 |
+
if block.polygon.area == 0:
|
| 173 |
+
continue
|
| 174 |
max_intersection_pct = max(max_intersection_pct, np.max(intersection_matrix[block_idx]) / block.polygon.area)
|
| 175 |
|
| 176 |
return max_intersection_pct
|