Spaces:

rt4u
/

marker

Sleeping

App Files Community

peppermenta committed on Sep 23

Commit

ffc1cfb

2 Parent(s): a45c553 723d583

Merge branch 'dev' of https://github.com/datalab-to/marker into dev

Browse files

Files changed (6) hide show

marker/builders/layout.py +3 -1
marker/builders/line.py +2 -2
marker/builders/ocr.py +2 -0
marker/models.py +3 -6
marker/renderers/markdown.py +9 -0
marker/schema/groups/page.py +2 -0

marker/builders/layout.py CHANGED Viewed

@@ -146,7 +146,9 @@ class LayoutBuilder(BaseBuilder):
                     layout_page_size, provider_page_size
                 ).fit_to_bounds((0, 0, *provider_page_size))
                 layout_block.top_k = {
-                    BlockTypes[label]: prob for (label, prob) in bbox.top_k.items()
                 }
                 page.add_structure(layout_block)

                     layout_page_size, provider_page_size
                 ).fit_to_bounds((0, 0, *provider_page_size))
                 layout_block.top_k = {
+                    BlockTypes[label]: prob
+                    for (label, prob) in bbox.top_k.items()
+                    if label in BlockTypes.__members__
                 }
                 page.add_structure(layout_block)

marker/builders/line.py CHANGED Viewed

@@ -70,7 +70,6 @@ class LineBuilder(BaseBuilder):
         BlockTypes.Table,
         BlockTypes.Form,
         BlockTypes.TableOfContents,
-        BlockTypes.Equation,
     )
     disable_tqdm: Annotated[
         bool,
@@ -81,6 +80,7 @@ class LineBuilder(BaseBuilder):
         "Disable OCR for the document. This will only use the lines from the provider.",
     ] = False
     keep_chars: Annotated[bool, "Keep individual characters."] = False
     def __init__(
         self,
@@ -191,7 +191,7 @@ class LineBuilder(BaseBuilder):
             detection_boxes = []
             if detection_result:
                 detection_boxes = [
-                    PolygonBox(polygon=box.polygon) for box in detection_result.bboxes
                 ]
             detection_boxes = sort_text_lines(detection_boxes)

         BlockTypes.Table,
         BlockTypes.Form,
         BlockTypes.TableOfContents,
     )
     disable_tqdm: Annotated[
         bool,
         "Disable OCR for the document. This will only use the lines from the provider.",
     ] = False
     keep_chars: Annotated[bool, "Keep individual characters."] = False
+    detection_line_min_confidence: Annotated[float, "Minimum confidence for a detected line to be included"] = 0.8
     def __init__(
         self,
             detection_boxes = []
             if detection_result:
                 detection_boxes = [
+                    PolygonBox(polygon=box.polygon) for box in detection_result.bboxes if box.confidence > self.detection_line_min_confidence
                 ]
             detection_boxes = sort_text_lines(detection_boxes)

marker/builders/ocr.py CHANGED Viewed

@@ -47,6 +47,8 @@ class OcrBuilder(BaseBuilder):
         BlockTypes.Figure,
         BlockTypes.Picture,
         BlockTypes.Table,
     ]
     full_ocr_block_types: Annotated[
         List[BlockTypes],

         BlockTypes.Figure,
         BlockTypes.Picture,
         BlockTypes.Table,
+        BlockTypes.Form,
+        BlockTypes.TableOfContents,
     ]
     full_ocr_block_types: Annotated[
         List[BlockTypes],

marker/models.py CHANGED Viewed

@@ -10,18 +10,15 @@ from surya.layout import LayoutPredictor
 from surya.ocr_error import OCRErrorPredictor
 from surya.recognition import RecognitionPredictor
 from surya.table_rec import TableRecPredictor
 def create_model_dict(
     device=None, dtype=None, attention_implementation: str | None = None
 ) -> dict:
-    foundation_predictor = FoundationPredictor(
-        device=device, dtype=dtype, attention_implementation=attention_implementation
-    )
     return {
-        "foundation_model": foundation_predictor,
-        "layout_model": LayoutPredictor(device=device, dtype=dtype),
-        "recognition_model": RecognitionPredictor(foundation_predictor),
         "table_rec_model": TableRecPredictor(device=device, dtype=dtype),
         "detection_model": DetectionPredictor(device=device, dtype=dtype),
         "ocr_error_model": OCRErrorPredictor(device=device, dtype=dtype),

 from surya.ocr_error import OCRErrorPredictor
 from surya.recognition import RecognitionPredictor
 from surya.table_rec import TableRecPredictor
+from surya.settings import settings as surya_settings
 def create_model_dict(
     device=None, dtype=None, attention_implementation: str | None = None
 ) -> dict:
     return {
+        "layout_model": LayoutPredictor(FoundationPredictor(checkpoint=surya_settings.LAYOUT_MODEL_CHECKPOINT, attention_implementation=attention_implementation, device=device, dtype=dtype)),
+        "recognition_model": RecognitionPredictor(FoundationPredictor(checkpoint=surya_settings.RECOGNITION_MODEL_CHECKPOINT, attention_implementation=attention_implementation, device=device, dtype=dtype)),
         "table_rec_model": TableRecPredictor(device=device, dtype=dtype),
         "detection_model": DetectionPredictor(device=device, dtype=dtype),
         "ocr_error_model": OCRErrorPredictor(device=device, dtype=dtype),

marker/renderers/markdown.py CHANGED Viewed

@@ -62,6 +62,7 @@ class Markdownify(MarkdownConverter):
         page_separator,
         inline_math_delimiters,
         block_math_delimiters,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -69,6 +70,7 @@ class Markdownify(MarkdownConverter):
         self.page_separator = page_separator
         self.inline_math_delimiters = inline_math_delimiters
         self.block_math_delimiters = block_math_delimiters
     def convert_div(self, el, text, parent_tags):
         is_page = el.has_attr("class") and el["class"][0] == "page"
@@ -116,6 +118,9 @@ class Markdownify(MarkdownConverter):
             )
     def convert_table(self, el, text, parent_tags):
         total_rows = len(el.find_all("tr"))
         colspans = []
         rowspan_cols = defaultdict(int)
@@ -268,6 +273,9 @@ class MarkdownRenderer(HTMLRenderer):
     block_math_delimiters: Annotated[
         Tuple[str], "The delimiters to use for block math."
     ] = ("$$", "$$")
     @property
     def md_cls(self):
@@ -284,6 +292,7 @@ class MarkdownRenderer(HTMLRenderer):
             sup_symbol="<sup>",
             inline_math_delimiters=self.inline_math_delimiters,
             block_math_delimiters=self.block_math_delimiters,
         )
     def __call__(self, document: Document) -> MarkdownOutput:

         page_separator,
         inline_math_delimiters,
         block_math_delimiters,
+        html_tables_in_markdown,
         **kwargs,
     ):
         super().__init__(**kwargs)
         self.page_separator = page_separator
         self.inline_math_delimiters = inline_math_delimiters
         self.block_math_delimiters = block_math_delimiters
+        self.html_tables_in_markdown = html_tables_in_markdown
     def convert_div(self, el, text, parent_tags):
         is_page = el.has_attr("class") and el["class"][0] == "page"
             )
     def convert_table(self, el, text, parent_tags):
+        if self.html_tables_in_markdown:
+            return "\n\n" + str(el) + "\n\n"
         total_rows = len(el.find_all("tr"))
         colspans = []
         rowspan_cols = defaultdict(int)
     block_math_delimiters: Annotated[
         Tuple[str], "The delimiters to use for block math."
     ] = ("$$", "$$")
+    html_tables_in_markdown: Annotated[
+        bool, "Return tables formatted as HTML, instead of in markdown"
+    ] = False
     @property
     def md_cls(self):
             sup_symbol="<sup>",
             inline_math_delimiters=self.inline_math_delimiters,
             block_math_delimiters=self.block_math_delimiters,
+            html_tables_in_markdown=self.html_tables_in_markdown
         )
     def __call__(self, document: Document) -> MarkdownOutput:

marker/schema/groups/page.py CHANGED Viewed

@@ -169,6 +169,8 @@ class PageGroup(Group):
         max_intersection_pct = 0
         for block_idx, block in enumerate(structure_blocks):
             max_intersection_pct = max(max_intersection_pct, np.max(intersection_matrix[block_idx]) / block.polygon.area)
         return max_intersection_pct

         max_intersection_pct = 0
         for block_idx, block in enumerate(structure_blocks):
+            if block.polygon.area == 0:
+                continue
             max_intersection_pct = max(max_intersection_pct, np.max(intersection_matrix[block_idx]) / block.polygon.area)
         return max_intersection_pct