peppermenta committed on
Commit
ffc1cfb
·
2 Parent(s): a45c553 723d583

Merge branch 'dev' of https://github.com/datalab-to/marker into dev

Browse files
marker/builders/layout.py CHANGED
@@ -146,7 +146,9 @@ class LayoutBuilder(BaseBuilder):
146
  layout_page_size, provider_page_size
147
  ).fit_to_bounds((0, 0, *provider_page_size))
148
  layout_block.top_k = {
149
- BlockTypes[label]: prob for (label, prob) in bbox.top_k.items()
 
 
150
  }
151
  page.add_structure(layout_block)
152
 
 
146
  layout_page_size, provider_page_size
147
  ).fit_to_bounds((0, 0, *provider_page_size))
148
  layout_block.top_k = {
149
+ BlockTypes[label]: prob
150
+ for (label, prob) in bbox.top_k.items()
151
+ if label in BlockTypes.__members__
152
  }
153
  page.add_structure(layout_block)
154
 
marker/builders/line.py CHANGED
@@ -70,7 +70,6 @@ class LineBuilder(BaseBuilder):
70
  BlockTypes.Table,
71
  BlockTypes.Form,
72
  BlockTypes.TableOfContents,
73
- BlockTypes.Equation,
74
  )
75
  disable_tqdm: Annotated[
76
  bool,
@@ -81,6 +80,7 @@ class LineBuilder(BaseBuilder):
81
  "Disable OCR for the document. This will only use the lines from the provider.",
82
  ] = False
83
  keep_chars: Annotated[bool, "Keep individual characters."] = False
 
84
 
85
  def __init__(
86
  self,
@@ -191,7 +191,7 @@ class LineBuilder(BaseBuilder):
191
  detection_boxes = []
192
  if detection_result:
193
  detection_boxes = [
194
- PolygonBox(polygon=box.polygon) for box in detection_result.bboxes
195
  ]
196
 
197
  detection_boxes = sort_text_lines(detection_boxes)
 
70
  BlockTypes.Table,
71
  BlockTypes.Form,
72
  BlockTypes.TableOfContents,
 
73
  )
74
  disable_tqdm: Annotated[
75
  bool,
 
80
  "Disable OCR for the document. This will only use the lines from the provider.",
81
  ] = False
82
  keep_chars: Annotated[bool, "Keep individual characters."] = False
83
+ detection_line_min_confidence: Annotated[float, "Minimum confidence for a detected line to be included"] = 0.8
84
 
85
  def __init__(
86
  self,
 
191
  detection_boxes = []
192
  if detection_result:
193
  detection_boxes = [
194
+ PolygonBox(polygon=box.polygon) for box in detection_result.bboxes if box.confidence > self.detection_line_min_confidence
195
  ]
196
 
197
  detection_boxes = sort_text_lines(detection_boxes)
marker/builders/ocr.py CHANGED
@@ -47,6 +47,8 @@ class OcrBuilder(BaseBuilder):
47
  BlockTypes.Figure,
48
  BlockTypes.Picture,
49
  BlockTypes.Table,
 
 
50
  ]
51
  full_ocr_block_types: Annotated[
52
  List[BlockTypes],
 
47
  BlockTypes.Figure,
48
  BlockTypes.Picture,
49
  BlockTypes.Table,
50
+ BlockTypes.Form,
51
+ BlockTypes.TableOfContents,
52
  ]
53
  full_ocr_block_types: Annotated[
54
  List[BlockTypes],
marker/models.py CHANGED
@@ -10,18 +10,15 @@ from surya.layout import LayoutPredictor
10
  from surya.ocr_error import OCRErrorPredictor
11
  from surya.recognition import RecognitionPredictor
12
  from surya.table_rec import TableRecPredictor
 
13
 
14
 
15
  def create_model_dict(
16
  device=None, dtype=None, attention_implementation: str | None = None
17
  ) -> dict:
18
- foundation_predictor = FoundationPredictor(
19
- device=device, dtype=dtype, attention_implementation=attention_implementation
20
- )
21
  return {
22
- "foundation_model": foundation_predictor,
23
- "layout_model": LayoutPredictor(device=device, dtype=dtype),
24
- "recognition_model": RecognitionPredictor(foundation_predictor),
25
  "table_rec_model": TableRecPredictor(device=device, dtype=dtype),
26
  "detection_model": DetectionPredictor(device=device, dtype=dtype),
27
  "ocr_error_model": OCRErrorPredictor(device=device, dtype=dtype),
 
10
  from surya.ocr_error import OCRErrorPredictor
11
  from surya.recognition import RecognitionPredictor
12
  from surya.table_rec import TableRecPredictor
13
+ from surya.settings import settings as surya_settings
14
 
15
 
16
  def create_model_dict(
17
  device=None, dtype=None, attention_implementation: str | None = None
18
  ) -> dict:
 
 
 
19
  return {
20
+ "layout_model": LayoutPredictor(FoundationPredictor(checkpoint=surya_settings.LAYOUT_MODEL_CHECKPOINT, attention_implementation=attention_implementation, device=device, dtype=dtype)),
21
+ "recognition_model": RecognitionPredictor(FoundationPredictor(checkpoint=surya_settings.RECOGNITION_MODEL_CHECKPOINT, attention_implementation=attention_implementation, device=device, dtype=dtype)),
 
22
  "table_rec_model": TableRecPredictor(device=device, dtype=dtype),
23
  "detection_model": DetectionPredictor(device=device, dtype=dtype),
24
  "ocr_error_model": OCRErrorPredictor(device=device, dtype=dtype),
marker/renderers/markdown.py CHANGED
@@ -62,6 +62,7 @@ class Markdownify(MarkdownConverter):
62
  page_separator,
63
  inline_math_delimiters,
64
  block_math_delimiters,
 
65
  **kwargs,
66
  ):
67
  super().__init__(**kwargs)
@@ -69,6 +70,7 @@ class Markdownify(MarkdownConverter):
69
  self.page_separator = page_separator
70
  self.inline_math_delimiters = inline_math_delimiters
71
  self.block_math_delimiters = block_math_delimiters
 
72
 
73
  def convert_div(self, el, text, parent_tags):
74
  is_page = el.has_attr("class") and el["class"][0] == "page"
@@ -116,6 +118,9 @@ class Markdownify(MarkdownConverter):
116
  )
117
 
118
  def convert_table(self, el, text, parent_tags):
 
 
 
119
  total_rows = len(el.find_all("tr"))
120
  colspans = []
121
  rowspan_cols = defaultdict(int)
@@ -268,6 +273,9 @@ class MarkdownRenderer(HTMLRenderer):
268
  block_math_delimiters: Annotated[
269
  Tuple[str], "The delimiters to use for block math."
270
  ] = ("$$", "$$")
 
 
 
271
 
272
  @property
273
  def md_cls(self):
@@ -284,6 +292,7 @@ class MarkdownRenderer(HTMLRenderer):
284
  sup_symbol="<sup>",
285
  inline_math_delimiters=self.inline_math_delimiters,
286
  block_math_delimiters=self.block_math_delimiters,
 
287
  )
288
 
289
  def __call__(self, document: Document) -> MarkdownOutput:
 
62
  page_separator,
63
  inline_math_delimiters,
64
  block_math_delimiters,
65
+ html_tables_in_markdown,
66
  **kwargs,
67
  ):
68
  super().__init__(**kwargs)
 
70
  self.page_separator = page_separator
71
  self.inline_math_delimiters = inline_math_delimiters
72
  self.block_math_delimiters = block_math_delimiters
73
+ self.html_tables_in_markdown = html_tables_in_markdown
74
 
75
  def convert_div(self, el, text, parent_tags):
76
  is_page = el.has_attr("class") and el["class"][0] == "page"
 
118
  )
119
 
120
  def convert_table(self, el, text, parent_tags):
121
+ if self.html_tables_in_markdown:
122
+ return "\n\n" + str(el) + "\n\n"
123
+
124
  total_rows = len(el.find_all("tr"))
125
  colspans = []
126
  rowspan_cols = defaultdict(int)
 
273
  block_math_delimiters: Annotated[
274
  Tuple[str], "The delimiters to use for block math."
275
  ] = ("$$", "$$")
276
+ html_tables_in_markdown: Annotated[
277
+ bool, "Return tables formatted as HTML, instead of in markdown"
278
+ ] = False
279
 
280
  @property
281
  def md_cls(self):
 
292
  sup_symbol="<sup>",
293
  inline_math_delimiters=self.inline_math_delimiters,
294
  block_math_delimiters=self.block_math_delimiters,
295
+ html_tables_in_markdown=self.html_tables_in_markdown
296
  )
297
 
298
  def __call__(self, document: Document) -> MarkdownOutput:
marker/schema/groups/page.py CHANGED
@@ -169,6 +169,8 @@ class PageGroup(Group):
169
 
170
  max_intersection_pct = 0
171
  for block_idx, block in enumerate(structure_blocks):
 
 
172
  max_intersection_pct = max(max_intersection_pct, np.max(intersection_matrix[block_idx]) / block.polygon.area)
173
 
174
  return max_intersection_pct
 
169
 
170
  max_intersection_pct = 0
171
  for block_idx, block in enumerate(structure_blocks):
172
+ if block.polygon.area == 0:
173
+ continue
174
  max_intersection_pct = max(max_intersection_pct, np.max(intersection_matrix[block_idx]) / block.polygon.area)
175
 
176
  return max_intersection_pct