Spaces:

rt4u
/

marker

Sleeping

Vik Paruchuri committed on Dec 30, 2024

Commit

6b71c3d

1 Parent(s): 4447b6c

Fix tests

Files changed (6)

marker/builders/llm_layout.py CHANGED Viewed

@@ -85,11 +85,11 @@ Choose the label you believe is the most accurate representation of the layout b
 Potential labels:
-- Figure - A figure or diagram in the document.
-- Picture - A picture or image in the document.
-- ComplexRegion - a complex region containing multiple elements, including pictures, text, tables, or figures.
-- Table - A table in the document.
-- Form - A form in the document.
 Respond only with one of `Figure`, `Picture`, `ComplexRegion`, `Table`, or `Form`.

 Potential labels:
+- Picture
+- Table
+- Form
+- Figure - A graph or diagram with text.
+- ComplexRegion - a complex region containing multiple text and other elements.
 Respond only with one of `Figure`, `Picture`, `ComplexRegion`, `Table`, or `Form`.

tests/builders/test_ocr_pipeline.py CHANGED Viewed

@@ -23,7 +23,7 @@ def test_ocr_pipeline(pdf_document):
     # Ensure we match all text lines up properly
     # Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
     text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
-    text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text,))
     assert len(text_lines) == 75
     # Ensure the bbox sizes match up

     # Ensure we match all text lines up properly
     # Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
     text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
+    text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text,BlockTypes.TextInlineMath))
     assert len(text_lines) == 75
     # Ensure the bbox sizes match up

tests/builders/test_rotated_bboxes.py CHANGED Viewed

@@ -10,7 +10,7 @@ def test_rotated_bboxes(pdf_document):
     # Ensure we match all text lines up properly
     text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
-    text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text,))
     assert len(text_lines) == 84
     # Ensure the bbox sizes match up

     # Ensure we match all text lines up properly
     text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
+    text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath))
     assert len(text_lines) == 84
     # Ensure the bbox sizes match up

tests/processors/test_ignoretext.py CHANGED Viewed

@@ -10,7 +10,7 @@ def test_ignoretext_processor(pdf_document):
     processor = IgnoreTextProcessor()
     processor(pdf_document)
-    page0_header = pdf_document.pages[0].contained_blocks(pdf_document, [BlockTypes.Text])[0]
-    assert "bioRxiv" in page0_header.raw_text(pdf_document)
-    assert page0_header.ignore_for_output is True

     processor = IgnoreTextProcessor()
     processor(pdf_document)
+    page1_header = pdf_document.pages[1].contained_blocks(pdf_document, [BlockTypes.Text])[0]
+    assert "bioRxiv" in page1_header.raw_text(pdf_document)
+    assert page1_header.ignore_for_output is True

tests/processors/test_llm_processors.py CHANGED Viewed

@@ -54,16 +54,31 @@ def test_llm_form_processor(pdf_document, detection_model, table_rec_model, reco
 @pytest.mark.filename("table_ex2.pdf")
 @pytest.mark.config({"page_range": [0]})
 def test_llm_table_processor(pdf_document, detection_model, table_rec_model, recognition_model, mocker):
-    corrected_markdown = """
-| Column 1 | Column 2 | Column 3 | Column 4 |
-|----------|----------|----------|----------|
-| Value 1  | Value 2  | Value 3  | Value 4  |
-| Value 5  | Value 6  | Value 7  | Value 8  |
-| Value 9  | Value 10 | Value 11 | Value 12 |
     """.strip()
     mock_cls = Mock()
-    mock_cls.return_value.generate_response.return_value = {"corrected_markdown": corrected_markdown}
     mocker.patch("marker.processors.llm.GoogleModel", mock_cls)
     cell_processor = TableProcessor(detection_model, recognition_model, table_rec_model)

 @pytest.mark.filename("table_ex2.pdf")
 @pytest.mark.config({"page_range": [0]})
 def test_llm_table_processor(pdf_document, detection_model, table_rec_model, recognition_model, mocker):
+    corrected_html = """
+<table>
+    <tr>
+        <td>Column 1</td>
+        <td>Column 2</td>
+        <td>Column 3</td>
+        <td>Column 4</td>
+    </tr>
+    <tr>
+        <td>Value 1</td>
+        <td>Value 2</td>
+        <td>Value 3</td>
+        <td>Value 4</td>
+    </tr>
+    <tr>
+        <td>Value 5</td>
+        <td>Value 6</td>
+        <td>Value 7</td>
+        <td>Value 8</td>
+    </tr>
+</table>
     """.strip()
     mock_cls = Mock()
+    mock_cls.return_value.generate_response.return_value = {"corrected_html": corrected_html}
     mocker.patch("marker.processors.llm.GoogleModel", mock_cls)
     cell_processor = TableProcessor(detection_model, recognition_model, table_rec_model)

tests/schema/groups/test_list_grouping.py CHANGED Viewed

@@ -15,4 +15,5 @@ def test_list_grouping(pdf_document):
         if block.block_type == BlockTypes.ListGroup:
             list_groups.append(block)
-    assert len(list_groups) == 1

         if block.block_type == BlockTypes.ListGroup:
             list_groups.append(block)
+    # The model breaks this up, since it has equations in it
+    assert len(list_groups) == 3