Vik Paruchuri committed on
Commit
6b71c3d
·
1 Parent(s): 4447b6c
marker/builders/llm_layout.py CHANGED
@@ -85,11 +85,11 @@ Choose the label you believe is the most accurate representation of the layout b
85
 
86
  Potential labels:
87
 
88
- - Figure - A figure or diagram in the document.
89
- - Picture - A picture or image in the document.
90
- - ComplexRegion - a complex region containing multiple elements, including pictures, text, tables, or figures.
91
- - Table - A table in the document.
92
- - Form - A form in the document.
93
 
94
  Respond only with one of `Figure`, `Picture`, `ComplexRegion`, `Table`, or `Form`.
95
 
 
85
 
86
  Potential labels:
87
 
88
+ - Picture
89
+ - Table
90
+ - Form
91
+ - Figure - A graph or diagram with text.
92
+ - ComplexRegion - a complex region containing multiple text and other elements.
93
 
94
  Respond only with one of `Figure`, `Picture`, `ComplexRegion`, `Table`, or `Form`.
95
 
tests/builders/test_ocr_pipeline.py CHANGED
@@ -23,7 +23,7 @@ def test_ocr_pipeline(pdf_document):
23
  # Ensure we match all text lines up properly
24
  # Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
25
  text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
26
- text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text,))
27
  assert len(text_lines) == 75
28
 
29
  # Ensure the bbox sizes match up
 
23
  # Ensure we match all text lines up properly
24
  # Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
25
  text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
26
+ text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text,BlockTypes.TextInlineMath))
27
  assert len(text_lines) == 75
28
 
29
  # Ensure the bbox sizes match up
tests/builders/test_rotated_bboxes.py CHANGED
@@ -10,7 +10,7 @@ def test_rotated_bboxes(pdf_document):
10
 
11
  # Ensure we match all text lines up properly
12
  text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
13
- text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text,))
14
  assert len(text_lines) == 84
15
 
16
  # Ensure the bbox sizes match up
 
10
 
11
  # Ensure we match all text lines up properly
12
  text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
13
+ text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath))
14
  assert len(text_lines) == 84
15
 
16
  # Ensure the bbox sizes match up
tests/processors/test_ignoretext.py CHANGED
@@ -10,7 +10,7 @@ def test_ignoretext_processor(pdf_document):
10
  processor = IgnoreTextProcessor()
11
  processor(pdf_document)
12
 
13
- page0_header = pdf_document.pages[0].contained_blocks(pdf_document, [BlockTypes.Text])[0]
14
- assert "bioRxiv" in page0_header.raw_text(pdf_document)
15
 
16
- assert page0_header.ignore_for_output is True
 
10
  processor = IgnoreTextProcessor()
11
  processor(pdf_document)
12
 
13
+ page1_header = pdf_document.pages[1].contained_blocks(pdf_document, [BlockTypes.Text])[0]
14
+ assert "bioRxiv" in page1_header.raw_text(pdf_document)
15
 
16
+ assert page1_header.ignore_for_output is True
tests/processors/test_llm_processors.py CHANGED
@@ -54,16 +54,31 @@ def test_llm_form_processor(pdf_document, detection_model, table_rec_model, reco
54
  @pytest.mark.filename("table_ex2.pdf")
55
  @pytest.mark.config({"page_range": [0]})
56
  def test_llm_table_processor(pdf_document, detection_model, table_rec_model, recognition_model, mocker):
57
- corrected_markdown = """
58
- | Column 1 | Column 2 | Column 3 | Column 4 |
59
- |----------|----------|----------|----------|
60
- | Value 1 | Value 2 | Value 3 | Value 4 |
61
- | Value 5 | Value 6 | Value 7 | Value 8 |
62
- | Value 9 | Value 10 | Value 11 | Value 12 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  """.strip()
64
 
65
  mock_cls = Mock()
66
- mock_cls.return_value.generate_response.return_value = {"corrected_markdown": corrected_markdown}
67
  mocker.patch("marker.processors.llm.GoogleModel", mock_cls)
68
 
69
  cell_processor = TableProcessor(detection_model, recognition_model, table_rec_model)
 
54
  @pytest.mark.filename("table_ex2.pdf")
55
  @pytest.mark.config({"page_range": [0]})
56
  def test_llm_table_processor(pdf_document, detection_model, table_rec_model, recognition_model, mocker):
57
+ corrected_html = """
58
+ <table>
59
+ <tr>
60
+ <td>Column 1</td>
61
+ <td>Column 2</td>
62
+ <td>Column 3</td>
63
+ <td>Column 4</td>
64
+ </tr>
65
+ <tr>
66
+ <td>Value 1</td>
67
+ <td>Value 2</td>
68
+ <td>Value 3</td>
69
+ <td>Value 4</td>
70
+ </tr>
71
+ <tr>
72
+ <td>Value 5</td>
73
+ <td>Value 6</td>
74
+ <td>Value 7</td>
75
+ <td>Value 8</td>
76
+ </tr>
77
+ </table>
78
  """.strip()
79
 
80
  mock_cls = Mock()
81
+ mock_cls.return_value.generate_response.return_value = {"corrected_html": corrected_html}
82
  mocker.patch("marker.processors.llm.GoogleModel", mock_cls)
83
 
84
  cell_processor = TableProcessor(detection_model, recognition_model, table_rec_model)
tests/schema/groups/test_list_grouping.py CHANGED
@@ -15,4 +15,5 @@ def test_list_grouping(pdf_document):
15
  if block.block_type == BlockTypes.ListGroup:
16
  list_groups.append(block)
17
 
18
- assert len(list_groups) == 1
 
 
15
  if block.block_type == BlockTypes.ListGroup:
16
  list_groups.append(block)
17
 
18
+ # The model breaks this up, since it has equations in it
19
+ assert len(list_groups) == 3