Vik Paruchuri
committed on
Commit
·
6b71c3d
1
Parent(s):
4447b6c
Fix tests
Browse files
marker/builders/llm_layout.py
CHANGED
|
@@ -85,11 +85,11 @@ Choose the label you believe is the most accurate representation of the layout b
|
|
| 85 |
|
| 86 |
Potential labels:
|
| 87 |
|
| 88 |
-
-
|
| 89 |
-
-
|
| 90 |
-
-
|
| 91 |
-
-
|
| 92 |
-
-
|
| 93 |
|
| 94 |
Respond only with one of `Figure`, `Picture`, `ComplexRegion`, `Table`, or `Form`.
|
| 95 |
|
|
|
|
| 85 |
|
| 86 |
Potential labels:
|
| 87 |
|
| 88 |
+
- Picture
|
| 89 |
+
- Table
|
| 90 |
+
- Form
|
| 91 |
+
- Figure - A graph or diagram with text.
|
| 92 |
+
- ComplexRegion - a complex region containing multiple text and other elements.
|
| 93 |
|
| 94 |
Respond only with one of `Figure`, `Picture`, `ComplexRegion`, `Table`, or `Form`.
|
| 95 |
|
tests/builders/test_ocr_pipeline.py
CHANGED
|
@@ -23,7 +23,7 @@ def test_ocr_pipeline(pdf_document):
|
|
| 23 |
# Ensure we match all text lines up properly
|
| 24 |
# Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
|
| 25 |
text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
|
| 26 |
-
text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text,))
|
| 27 |
assert len(text_lines) == 75
|
| 28 |
|
| 29 |
# Ensure the bbox sizes match up
|
|
|
|
| 23 |
# Ensure we match all text lines up properly
|
| 24 |
# Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
|
| 25 |
text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
|
| 26 |
+
text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text,BlockTypes.TextInlineMath))
|
| 27 |
assert len(text_lines) == 75
|
| 28 |
|
| 29 |
# Ensure the bbox sizes match up
|
tests/builders/test_rotated_bboxes.py
CHANGED
|
@@ -10,7 +10,7 @@ def test_rotated_bboxes(pdf_document):
|
|
| 10 |
|
| 11 |
# Ensure we match all text lines up properly
|
| 12 |
text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
|
| 13 |
-
text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text,))
|
| 14 |
assert len(text_lines) == 84
|
| 15 |
|
| 16 |
# Ensure the bbox sizes match up
|
|
|
|
| 10 |
|
| 11 |
# Ensure we match all text lines up properly
|
| 12 |
text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
|
| 13 |
+
text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath))
|
| 14 |
assert len(text_lines) == 84
|
| 15 |
|
| 16 |
# Ensure the bbox sizes match up
|
tests/processors/test_ignoretext.py
CHANGED
|
@@ -10,7 +10,7 @@ def test_ignoretext_processor(pdf_document):
|
|
| 10 |
processor = IgnoreTextProcessor()
|
| 11 |
processor(pdf_document)
|
| 12 |
|
| 13 |
-
|
| 14 |
-
assert "bioRxiv" in
|
| 15 |
|
| 16 |
-
assert
|
|
|
|
| 10 |
processor = IgnoreTextProcessor()
|
| 11 |
processor(pdf_document)
|
| 12 |
|
| 13 |
+
page1_header = pdf_document.pages[1].contained_blocks(pdf_document, [BlockTypes.Text])[0]
|
| 14 |
+
assert "bioRxiv" in page1_header.raw_text(pdf_document)
|
| 15 |
|
| 16 |
+
assert page1_header.ignore_for_output is True
|
tests/processors/test_llm_processors.py
CHANGED
|
@@ -54,16 +54,31 @@ def test_llm_form_processor(pdf_document, detection_model, table_rec_model, reco
|
|
| 54 |
@pytest.mark.filename("table_ex2.pdf")
|
| 55 |
@pytest.mark.config({"page_range": [0]})
|
| 56 |
def test_llm_table_processor(pdf_document, detection_model, table_rec_model, recognition_model, mocker):
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
""".strip()
|
| 64 |
|
| 65 |
mock_cls = Mock()
|
| 66 |
-
mock_cls.return_value.generate_response.return_value = {"
|
| 67 |
mocker.patch("marker.processors.llm.GoogleModel", mock_cls)
|
| 68 |
|
| 69 |
cell_processor = TableProcessor(detection_model, recognition_model, table_rec_model)
|
|
|
|
| 54 |
@pytest.mark.filename("table_ex2.pdf")
|
| 55 |
@pytest.mark.config({"page_range": [0]})
|
| 56 |
def test_llm_table_processor(pdf_document, detection_model, table_rec_model, recognition_model, mocker):
|
| 57 |
+
corrected_html = """
|
| 58 |
+
<table>
|
| 59 |
+
<tr>
|
| 60 |
+
<td>Column 1</td>
|
| 61 |
+
<td>Column 2</td>
|
| 62 |
+
<td>Column 3</td>
|
| 63 |
+
<td>Column 4</td>
|
| 64 |
+
</tr>
|
| 65 |
+
<tr>
|
| 66 |
+
<td>Value 1</td>
|
| 67 |
+
<td>Value 2</td>
|
| 68 |
+
<td>Value 3</td>
|
| 69 |
+
<td>Value 4</td>
|
| 70 |
+
</tr>
|
| 71 |
+
<tr>
|
| 72 |
+
<td>Value 5</td>
|
| 73 |
+
<td>Value 6</td>
|
| 74 |
+
<td>Value 7</td>
|
| 75 |
+
<td>Value 8</td>
|
| 76 |
+
</tr>
|
| 77 |
+
</table>
|
| 78 |
""".strip()
|
| 79 |
|
| 80 |
mock_cls = Mock()
|
| 81 |
+
mock_cls.return_value.generate_response.return_value = {"corrected_html": corrected_html}
|
| 82 |
mocker.patch("marker.processors.llm.GoogleModel", mock_cls)
|
| 83 |
|
| 84 |
cell_processor = TableProcessor(detection_model, recognition_model, table_rec_model)
|
tests/schema/groups/test_list_grouping.py
CHANGED
|
@@ -15,4 +15,5 @@ def test_list_grouping(pdf_document):
|
|
| 15 |
if block.block_type == BlockTypes.ListGroup:
|
| 16 |
list_groups.append(block)
|
| 17 |
|
| 18 |
-
|
|
|
|
|
|
| 15 |
if block.block_type == BlockTypes.ListGroup:
|
| 16 |
list_groups.append(block)
|
| 17 |
|
| 18 |
+
# The model breaks this up, since it has equations in it
|
| 19 |
+
assert len(list_groups) == 3
|