Vik Paruchuri committed on
Commit
b730265
·
1 Parent(s): b2d41ef

Fix tests, add way to disable ocr

Browse files
marker/builders/line.py CHANGED
@@ -36,10 +36,6 @@ class LineBuilder(BaseBuilder):
36
  "The batch size to use for the ocr error detection model.",
37
  "Default is None, which will use the default batch size for the model.",
38
  ] = None
39
- enable_table_ocr: Annotated[
40
- bool,
41
- "Whether to skip OCR on tables. The TableProcessor will re-OCR them. Only enable if the TableProcessor is not running.",
42
- ] = False
43
  layout_coverage_min_lines: Annotated[
44
  int,
45
  "The minimum number of PdfProvider lines that must be covered by the layout model",
@@ -54,17 +50,10 @@ class LineBuilder(BaseBuilder):
54
  float,
55
  "If less pages than this threshold are good, OCR will happen in the document. Otherwise it will not.",
56
  ] = 0.85
57
- provider_line_detected_line_min_overlap_pct: Annotated[
58
- float,
59
- "The percentage of a provider line that has to be covered by a detected line",
60
- ] = 0.1
61
  provider_line_provider_line_min_overlap_pct: Annotated[
62
  float,
63
  "The percentage of a provider line that has to be covered by a detected line",
64
- ] = 0.1
65
- line_vertical_merge_threshold: Annotated[
66
- int, "The maximum pixel distance between y1s for two lines to be merged"
67
- ] = 8
68
  excluded_for_coverage: Annotated[
69
  Tuple[BlockTypes],
70
  "A list of block types to exclude from the layout coverage check.",
@@ -86,6 +75,10 @@ class LineBuilder(BaseBuilder):
86
  bool,
87
  "Disable tqdm progress bars.",
88
  ] = False
 
 
 
 
89
  keep_chars: Annotated[bool, "Keep individual characters."] = False
90
 
91
  def __init__(
@@ -169,6 +162,9 @@ class LineBuilder(BaseBuilder):
169
  ), # Ensure provider lines don't overflow the page or intersect
170
  ]
171
  )
 
 
 
172
  layout_good.append(provider_lines_good)
173
 
174
  run_detection = [not good for good in layout_good]
@@ -191,12 +187,12 @@ class LineBuilder(BaseBuilder):
191
  )
192
 
193
  # Setup detection results
 
194
  if detection_result:
195
  detection_boxes = [
196
  PolygonBox(polygon=box.polygon) for box in detection_result.bboxes
197
  ]
198
- else:
199
- detection_boxes = []
200
  detection_boxes = sort_text_lines(detection_boxes)
201
 
202
  if provider_lines_good:
@@ -257,6 +253,7 @@ class LineBuilder(BaseBuilder):
257
  provider_bboxes = [line.line.polygon.bbox for line in provider_lines]
258
  # Add a small margin to account for minor overflows
259
  page_bbox = document_page.polygon.expand(5, 5).bbox
 
260
  for bbox in provider_bboxes:
261
  if bbox[0] < page_bbox[0]:
262
  return False
@@ -275,7 +272,7 @@ class LineBuilder(BaseBuilder):
275
  )
276
 
277
  # There should be one intersection with itself
278
- if intersect_counts > 1:
279
  return False
280
 
281
  return True
 
36
  "The batch size to use for the ocr error detection model.",
37
  "Default is None, which will use the default batch size for the model.",
38
  ] = None
 
 
 
 
39
  layout_coverage_min_lines: Annotated[
40
  int,
41
  "The minimum number of PdfProvider lines that must be covered by the layout model",
 
50
  float,
51
  "If less pages than this threshold are good, OCR will happen in the document. Otherwise it will not.",
52
  ] = 0.85
 
 
 
 
53
  provider_line_provider_line_min_overlap_pct: Annotated[
54
  float,
55
  "The percentage of a provider line that has to be covered by a detected line",
56
+ ] = 0.15
 
 
 
57
  excluded_for_coverage: Annotated[
58
  Tuple[BlockTypes],
59
  "A list of block types to exclude from the layout coverage check.",
 
75
  bool,
76
  "Disable tqdm progress bars.",
77
  ] = False
78
+ disable_ocr: Annotated[
79
+ bool,
80
+ "Disable OCR for the document. This will only use the lines from the provider.",
81
+ ] = False
82
  keep_chars: Annotated[bool, "Keep individual characters."] = False
83
 
84
  def __init__(
 
162
  ), # Ensure provider lines don't overflow the page or intersect
163
  ]
164
  )
165
+ if self.disable_ocr:
166
+ provider_lines_good = True
167
+
168
  layout_good.append(provider_lines_good)
169
 
170
  run_detection = [not good for good in layout_good]
 
187
  )
188
 
189
  # Setup detection results
190
+ detection_boxes = []
191
  if detection_result:
192
  detection_boxes = [
193
  PolygonBox(polygon=box.polygon) for box in detection_result.bboxes
194
  ]
195
+
 
196
  detection_boxes = sort_text_lines(detection_boxes)
197
 
198
  if provider_lines_good:
 
253
  provider_bboxes = [line.line.polygon.bbox for line in provider_lines]
254
  # Add a small margin to account for minor overflows
255
  page_bbox = document_page.polygon.expand(5, 5).bbox
256
+
257
  for bbox in provider_bboxes:
258
  if bbox[0] < page_bbox[0]:
259
  return False
 
272
  )
273
 
274
  # There should be one intersection with itself
275
+ if intersect_counts > 2:
276
  return False
277
 
278
  return True
tests/builders/test_document_builder.py CHANGED
@@ -4,20 +4,40 @@ from marker.schema import BlockTypes
4
  from marker.schema.text.line import Line
5
 
6
 
 
7
  @pytest.mark.config({"page_range": [0]})
8
  def test_document_builder(pdf_document):
9
  first_page = pdf_document.pages[0]
10
- assert first_page.structure[0] == '/page/0/SectionHeader/0'
11
 
12
  first_block = first_page.get_block(first_page.structure[0])
13
  assert first_block.block_type == BlockTypes.SectionHeader
14
- assert first_block.text_extraction_method == 'pdftext'
15
 
16
  first_text_block: Line = first_page.get_block(first_block.structure[0])
17
  assert first_text_block.block_type == BlockTypes.Line
18
 
19
  first_span = first_page.get_block(first_text_block.structure[0])
20
  assert first_span.block_type == BlockTypes.Span
21
- assert first_span.text == 'Subspace Adversarial Training'
22
- assert first_span.font == 'NimbusRomNo9L-Medi'
23
- assert first_span.formats == ['plain']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  from marker.schema.text.line import Line
5
 
6
 
7
+ @pytest.mark.filename("thinkpython.pdf")
8
  @pytest.mark.config({"page_range": [0]})
9
  def test_document_builder(pdf_document):
10
  first_page = pdf_document.pages[0]
11
+ assert first_page.structure[0] == "/page/0/SectionHeader/0"
12
 
13
  first_block = first_page.get_block(first_page.structure[0])
14
  assert first_block.block_type == BlockTypes.SectionHeader
15
+ assert first_block.text_extraction_method == "pdftext"
16
 
17
  first_text_block: Line = first_page.get_block(first_block.structure[0])
18
  assert first_text_block.block_type == BlockTypes.Line
19
 
20
  first_span = first_page.get_block(first_text_block.structure[0])
21
  assert first_span.block_type == BlockTypes.Span
22
+ assert first_span.text == "Think Python"
23
+ assert first_span.font == "URWPalladioL-Roma"
24
+ assert first_span.formats == ["plain"]
25
+
26
+
27
+ @pytest.mark.config({"page_range": [0]})
28
+ def test_document_builder_inline_eq(pdf_document):
29
+ first_page = pdf_document.pages[0]
30
+ assert first_page.structure[0] == "/page/0/SectionHeader/0"
31
+
32
+ first_block = first_page.get_block(first_page.structure[0])
33
+ assert first_block.block_type == BlockTypes.SectionHeader
34
+ assert first_block.text_extraction_method == "surya"
35
+
36
+ first_text_block: Line = first_page.get_block(first_block.structure[0])
37
+ assert first_text_block.block_type == BlockTypes.Line
38
+
39
+ first_span = first_page.get_block(first_text_block.structure[0])
40
+ assert first_span.block_type == BlockTypes.Span
41
+ assert first_span.text == "Subspace Adversarial Training"
42
+ assert first_span.font == "NimbusRomNo9L-Medi"
43
+ assert first_span.formats == ["plain"]
tests/builders/test_layout_replace.py CHANGED
@@ -8,8 +8,11 @@ from marker.schema import BlockTypes
8
  from marker.schema.registry import get_block_class
9
 
10
 
 
11
  @pytest.mark.config({"page_range": [0]})
12
- def test_layout_replace(request, config, doc_provider, layout_model, ocr_error_model, detection_model):
 
 
13
  # The llm layout builder replaces blocks - this makes sure text is still merged properly
14
  layout_builder = LayoutBuilder(layout_model, config)
15
  line_builder = LineBuilder(detection_model, ocr_error_model, config)
@@ -35,8 +38,4 @@ def test_layout_replace(request, config, doc_provider, layout_model, ocr_error_m
35
  renderer = MarkdownRenderer(config)
36
  rendered = renderer(document)
37
 
38
- assert "worst-case perturbations" in rendered.markdown
39
- assert "projected gradient descent" in rendered.markdown
40
-
41
-
42
-
 
8
  from marker.schema.registry import get_block_class
9
 
10
 
11
+ @pytest.mark.filename("thinkpython.pdf")
12
  @pytest.mark.config({"page_range": [0]})
13
+ def test_layout_replace(
14
+ request, config, doc_provider, layout_model, ocr_error_model, detection_model
15
+ ):
16
  # The llm layout builder replaces blocks - this makes sure text is still merged properly
17
  layout_builder = LayoutBuilder(layout_model, config)
18
  line_builder = LineBuilder(detection_model, ocr_error_model, config)
 
38
  renderer = MarkdownRenderer(config)
39
  rendered = renderer(document)
40
 
41
+ assert "Think Python" in rendered.markdown
 
 
 
 
tests/converters/test_ocr_converter.py CHANGED
@@ -35,7 +35,7 @@ def check_bboxes(page: OCRJSONPageOutput, lines):
35
 
36
  @pytest.mark.config({"page_range": [0]})
37
  def test_ocr_converter(config, model_dict, temp_doc):
38
- _ocr_converter(config, model_dict, temp_doc, 83, 2)
39
 
40
 
41
  @pytest.mark.filename("pres.pdf")
 
35
 
36
  @pytest.mark.config({"page_range": [0]})
37
  def test_ocr_converter(config, model_dict, temp_doc):
38
+ _ocr_converter(config, model_dict, temp_doc, 85, 2)
39
 
40
 
41
  @pytest.mark.filename("pres.pdf")
tests/converters/test_pdf_converter.py CHANGED
@@ -6,7 +6,7 @@ from marker.renderers.markdown import MarkdownOutput
6
 
7
 
8
  @pytest.mark.output_format("markdown")
9
- @pytest.mark.config({"page_range": [0, 1, 2, 3, 7]})
10
  def test_pdf_converter(pdf_converter: PdfConverter, temp_doc):
11
  markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
12
  markdown = markdown_output.markdown
@@ -79,7 +79,7 @@ def test_pptx_converter(pdf_converter: PdfConverter, temp_doc):
79
 
80
 
81
  @pytest.mark.output_format("markdown")
82
- @pytest.mark.config({"page_range": [0, 1, 2, 3, 7]})
83
  def test_pdf_converter_bytes(pdf_converter: PdfConverter, temp_doc):
84
  with open(temp_doc.name, "rb") as f:
85
  data = f.read()
 
6
 
7
 
8
  @pytest.mark.output_format("markdown")
9
+ @pytest.mark.config({"page_range": [0, 1, 2, 3, 7], "disable_ocr": True})
10
  def test_pdf_converter(pdf_converter: PdfConverter, temp_doc):
11
  markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
12
  markdown = markdown_output.markdown
 
79
 
80
 
81
  @pytest.mark.output_format("markdown")
82
+ @pytest.mark.config({"page_range": [0, 1, 2, 3, 7], "disable_ocr": True})
83
  def test_pdf_converter_bytes(pdf_converter: PdfConverter, temp_doc):
84
  with open(temp_doc.name, "rb") as f:
85
  data = f.read()
tests/renderers/test_markdown_renderer.py CHANGED
@@ -5,13 +5,22 @@ from marker.schema import BlockTypes
5
  from marker.schema.blocks import TableCell
6
 
7
 
8
- @pytest.mark.config({"page_range": [0]})
9
  def test_markdown_renderer(pdf_document):
10
  renderer = MarkdownRenderer()
11
  md = renderer(pdf_document).markdown
12
 
13
  # Verify markdown
14
- assert '# Subspace Adversarial Training' in md
 
 
 
 
 
 
 
 
 
15
 
16
 
17
  @pytest.mark.config({"page_range": [0, 1], "paginate_output": True})
@@ -29,12 +38,14 @@ def test_markdown_renderer_pagination_blank_last_page(pdf_document):
29
  last_page = pdf_document.pages[-1]
30
  last_page.children = []
31
  last_page.structure = []
32
-
33
  renderer = MarkdownRenderer({"paginate_output": True})
34
  md = renderer(pdf_document).markdown
35
-
36
  # Should end with pagination marker and preserve trailing newlines
37
- assert md.endswith("}\n\n") or md.endswith("}------------------------------------------------\n\n")
 
 
38
 
39
 
40
  @pytest.mark.config({"page_range": [0, 1]})
@@ -48,9 +59,10 @@ def test_markdown_renderer_metadata(pdf_document):
48
  def test_markdown_renderer_images(pdf_document):
49
  renderer = MarkdownRenderer({"extract_images": False})
50
  markdown_output = renderer(pdf_document)
51
-
52
  assert len(markdown_output.images) == 0
53
- assert '![](' not in markdown_output.markdown
 
54
 
55
  @pytest.mark.config({"page_range": [5]})
56
  def test_markdown_renderer_tables(pdf_document):
@@ -74,5 +86,3 @@ def test_markdown_renderer_tables(pdf_document):
74
  renderer = MarkdownRenderer()
75
  md = renderer(pdf_document).markdown
76
  assert "54 <i>.45</i> 67<br>89 $x$" in md
77
-
78
-
 
5
  from marker.schema.blocks import TableCell
6
 
7
 
8
+ @pytest.mark.config({"page_range": [0], "disable_ocr": True})
9
  def test_markdown_renderer(pdf_document):
10
  renderer = MarkdownRenderer()
11
  md = renderer(pdf_document).markdown
12
 
13
  # Verify markdown
14
+ assert "# Subspace Adversarial Training" in md
15
+
16
+
17
+ @pytest.mark.config({"page_range": [0]})
18
+ def test_markdown_renderer_auto_ocr(pdf_document):
19
+ renderer = MarkdownRenderer()
20
+ md = renderer(pdf_document).markdown
21
+
22
+ # Verify markdown
23
+ assert "Subspace Adversarial Training" in md
24
 
25
 
26
  @pytest.mark.config({"page_range": [0, 1], "paginate_output": True})
 
38
  last_page = pdf_document.pages[-1]
39
  last_page.children = []
40
  last_page.structure = []
41
+
42
  renderer = MarkdownRenderer({"paginate_output": True})
43
  md = renderer(pdf_document).markdown
44
+
45
  # Should end with pagination marker and preserve trailing newlines
46
+ assert md.endswith("}\n\n") or md.endswith(
47
+ "}------------------------------------------------\n\n"
48
+ )
49
 
50
 
51
  @pytest.mark.config({"page_range": [0, 1]})
 
59
  def test_markdown_renderer_images(pdf_document):
60
  renderer = MarkdownRenderer({"extract_images": False})
61
  markdown_output = renderer(pdf_document)
62
+
63
  assert len(markdown_output.images) == 0
64
+ assert "![](" not in markdown_output.markdown
65
+
66
 
67
  @pytest.mark.config({"page_range": [5]})
68
  def test_markdown_renderer_tables(pdf_document):
 
86
  renderer = MarkdownRenderer()
87
  md = renderer(pdf_document).markdown
88
  assert "54 <i>.45</i> 67<br>89 $x$" in md