Moses Paul R
committed on
Commit
·
aee20f6
1
Parent(s):
662dfec
make tests much faster and cleanup [skip ci]
Browse files
- marker/v2/schema/document.py +6 -5
- tests/conftest.py +6 -3
- tests/test_document_builder.py +3 -19
- tests/test_equation_processor.py +4 -6
- tests/test_garbled_pdf.py +3 -6
- tests/test_ocr_pipeline.py +4 -13
- tests/test_pdf_provider.py +7 -26
- tests/test_structure.py +3 -0
- tests/test_table_processor.py +4 -6
marker/v2/schema/document.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing import List
|
|
| 5 |
from pydantic import BaseModel
|
| 6 |
|
| 7 |
from marker.v2.schema import BlockTypes
|
| 8 |
-
from marker.v2.schema.blocks import BlockId, BlockOutput
|
| 9 |
from marker.v2.schema.groups.page import PageGroup
|
| 10 |
|
| 11 |
|
|
@@ -28,11 +28,12 @@ class Document(BaseModel):
|
|
| 28 |
return None
|
| 29 |
|
| 30 |
def get_page(self, page_id):
|
| 31 |
-
page
|
| 32 |
-
|
| 33 |
-
|
|
|
|
| 34 |
|
| 35 |
-
def assemble_html(self, child_blocks):
|
| 36 |
template = ""
|
| 37 |
for c in child_blocks:
|
| 38 |
template += f"<content-ref src='{c.id}'></content-ref>"
|
|
|
|
| 5 |
from pydantic import BaseModel
|
| 6 |
|
| 7 |
from marker.v2.schema import BlockTypes
|
| 8 |
+
from marker.v2.schema.blocks import Block, BlockId, BlockOutput
|
| 9 |
from marker.v2.schema.groups.page import PageGroup
|
| 10 |
|
| 11 |
|
|
|
|
| 28 |
return None
|
| 29 |
|
| 30 |
def get_page(self, page_id):
|
| 31 |
+
for page in self.pages:
|
| 32 |
+
if page.page_id == page_id:
|
| 33 |
+
return page
|
| 34 |
+
return None
|
| 35 |
|
| 36 |
+
def assemble_html(self, child_blocks: List[Block]):
|
| 37 |
template = ""
|
| 38 |
for c in child_blocks:
|
| 39 |
template += f"<content-ref src='{c.id}'></content-ref>"
|
tests/conftest.py
CHANGED
|
@@ -49,8 +49,11 @@ def table_rec_model():
|
|
| 49 |
|
| 50 |
@pytest.fixture(scope="function")
|
| 51 |
def pdf_provider(request):
|
| 52 |
-
|
| 53 |
-
filename =
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
|
| 56 |
idx = dataset['filename'].index(filename)
|
|
@@ -58,7 +61,7 @@ def pdf_provider(request):
|
|
| 58 |
temp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf")
|
| 59 |
temp_pdf.write(dataset['pdf'][idx])
|
| 60 |
temp_pdf.flush()
|
| 61 |
-
yield PdfProvider(temp_pdf.name)
|
| 62 |
|
| 63 |
|
| 64 |
@pytest.fixture(scope="function")
|
|
|
|
| 49 |
|
| 50 |
@pytest.fixture(scope="function")
|
| 51 |
def pdf_provider(request):
|
| 52 |
+
filename_mark = request.node.get_closest_marker("filename")
|
| 53 |
+
filename = filename_mark.args[0] if filename_mark else "adversarial.pdf"
|
| 54 |
+
|
| 55 |
+
config_mark = request.node.get_closest_marker("config")
|
| 56 |
+
config = config_mark.args[0] if config_mark else None
|
| 57 |
|
| 58 |
dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
|
| 59 |
idx = dataset['filename'].index(filename)
|
|
|
|
| 61 |
temp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf")
|
| 62 |
temp_pdf.write(dataset['pdf'][idx])
|
| 63 |
temp_pdf.flush()
|
| 64 |
+
yield PdfProvider(temp_pdf.name, config)
|
| 65 |
|
| 66 |
|
| 67 |
@pytest.fixture(scope="function")
|
tests/test_document_builder.py
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
|
|
|
|
|
| 1 |
from marker.v2.schema import BlockTypes
|
| 2 |
from marker.v2.schema.text.line import Line
|
| 3 |
|
| 4 |
|
|
|
|
| 5 |
def test_document_builder(pdf_document):
|
| 6 |
first_page = pdf_document.pages[0]
|
| 7 |
assert first_page.structure[0] == '/page/0/SectionHeader/0'
|
|
@@ -18,22 +21,3 @@ def test_document_builder(pdf_document):
|
|
| 18 |
assert first_span.text == 'Subspace Adversarial Training'
|
| 19 |
assert first_span.font == 'NimbusRomNo9L-Medi'
|
| 20 |
assert first_span.formats == ['plain']
|
| 21 |
-
|
| 22 |
-
last_block = first_page.get_block(first_page.structure[-1])
|
| 23 |
-
assert last_block.block_type == BlockTypes.Text
|
| 24 |
-
|
| 25 |
-
last_text_block: Line = first_page.get_block(last_block.structure[-1])
|
| 26 |
-
assert last_text_block.block_type == BlockTypes.Line
|
| 27 |
-
|
| 28 |
-
last_span = first_page.get_block(last_text_block.structure[-1])
|
| 29 |
-
assert last_span.block_type == BlockTypes.Span
|
| 30 |
-
assert last_span.text == 'prove the quality of single-step AT solutions. However,'
|
| 31 |
-
assert last_span.font == 'NimbusRomNo9L-Regu'
|
| 32 |
-
assert last_span.formats == ['plain']
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
if __name__ == "__main__":
|
| 36 |
-
from tests.utils import setup_pdf_document
|
| 37 |
-
|
| 38 |
-
pdf_document = setup_pdf_document("adversarial.pdf")
|
| 39 |
-
test_document_builder(pdf_document)
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
from marker.v2.schema import BlockTypes
|
| 4 |
from marker.v2.schema.text.line import Line
|
| 5 |
|
| 6 |
|
| 7 |
+
@pytest.mark.config({"page_range": [0]})
|
| 8 |
def test_document_builder(pdf_document):
|
| 9 |
first_page = pdf_document.pages[0]
|
| 10 |
assert first_page.structure[0] == '/page/0/SectionHeader/0'
|
|
|
|
| 21 |
assert first_span.text == 'Subspace Adversarial Training'
|
| 22 |
assert first_span.font == 'NimbusRomNo9L-Medi'
|
| 23 |
assert first_span.formats == ['plain']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_equation_processor.py
CHANGED
|
@@ -1,16 +1,14 @@
|
|
| 1 |
-
|
| 2 |
|
| 3 |
from marker.v2.schema import BlockTypes
|
| 4 |
from marker.v2.processors.equation import EquationProcessor
|
| 5 |
|
| 6 |
|
|
|
|
| 7 |
def test_equation_processor(pdf_document, texify_model):
|
| 8 |
processor = EquationProcessor(texify_model)
|
|
|
|
| 9 |
|
| 10 |
-
|
| 11 |
-
new_document.pages = [new_document.pages[0]]
|
| 12 |
-
processor(new_document)
|
| 13 |
-
|
| 14 |
-
for block in new_document.pages[0].children:
|
| 15 |
if block.block_type == BlockTypes.Equation:
|
| 16 |
assert block.latex is not None
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
|
| 3 |
from marker.v2.schema import BlockTypes
|
| 4 |
from marker.v2.processors.equation import EquationProcessor
|
| 5 |
|
| 6 |
|
| 7 |
+
@pytest.mark.config({"page_range": [0]})
|
| 8 |
def test_equation_processor(pdf_document, texify_model):
|
| 9 |
processor = EquationProcessor(texify_model)
|
| 10 |
+
processor(pdf_document)
|
| 11 |
|
| 12 |
+
for block in pdf_document.pages[0].children:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
if block.block_type == BlockTypes.Equation:
|
| 14 |
assert block.latex is not None
|
tests/test_garbled_pdf.py
CHANGED
|
@@ -1,12 +1,9 @@
|
|
|
|
|
| 1 |
from marker.v2.schema import BlockTypes
|
| 2 |
-
from marker.v2.schema.text.line import Line
|
| 3 |
-
from tests.utils import setup_pdf_document
|
| 4 |
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
"water_damage.pdf"
|
| 9 |
-
)
|
| 10 |
assert pdf_document.pages[0].structure[0] == '/page/0/Table/0'
|
| 11 |
|
| 12 |
table_block = pdf_document.pages[0].get_block(pdf_document.pages[0].structure[0])
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
from marker.v2.schema import BlockTypes
|
|
|
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
+
@pytest.mark.filename("water_damage.pdf")
|
| 6 |
+
def test_ocr_pipeline(pdf_document):
|
|
|
|
|
|
|
| 7 |
assert pdf_document.pages[0].structure[0] == '/page/0/Table/0'
|
| 8 |
|
| 9 |
table_block = pdf_document.pages[0].get_block(pdf_document.pages[0].structure[0])
|
tests/test_ocr_pipeline.py
CHANGED
|
@@ -1,16 +1,11 @@
|
|
|
|
|
|
|
|
| 1 |
from marker.v2.schema import BlockTypes
|
| 2 |
from marker.v2.schema.text.line import Line
|
| 3 |
-
from tests.utils import setup_pdf_document
|
| 4 |
-
|
| 5 |
|
| 6 |
-
def test_ocr_pipeline():
|
| 7 |
-
pdf_document = setup_pdf_document(
|
| 8 |
-
"adversarial.pdf",
|
| 9 |
-
pdf_provider_config={
|
| 10 |
-
"force_ocr": True
|
| 11 |
-
}
|
| 12 |
-
)
|
| 13 |
|
|
|
|
|
|
|
| 14 |
first_page = pdf_document.pages[0]
|
| 15 |
assert first_page.structure[0] == '/page/0/SectionHeader/0'
|
| 16 |
|
|
@@ -24,7 +19,3 @@ def test_ocr_pipeline():
|
|
| 24 |
first_span = first_page.get_block(first_text_block.structure[0])
|
| 25 |
assert first_span.block_type == BlockTypes.Span
|
| 26 |
assert first_span.text.strip() == 'Subspace Adversarial Training'
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
if __name__ == "__main__":
|
| 30 |
-
test_ocr_pipeline()
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
from marker.v2.schema import BlockTypes
|
| 4 |
from marker.v2.schema.text.line import Line
|
|
|
|
|
|
|
| 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
+
@pytest.mark.config({"force_ocr": True, "page_range": [0]})
|
| 8 |
+
def test_ocr_pipeline(pdf_document):
|
| 9 |
first_page = pdf_document.pages[0]
|
| 10 |
assert first_page.structure[0] == '/page/0/SectionHeader/0'
|
| 11 |
|
|
|
|
| 19 |
first_span = first_page.get_block(first_text_block.structure[0])
|
| 20 |
assert first_span.block_type == BlockTypes.Span
|
| 21 |
assert first_span.text.strip() == 'Subspace Adversarial Training'
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_pdf_provider.py
CHANGED
|
@@ -1,24 +1,13 @@
|
|
| 1 |
-
import
|
| 2 |
|
| 3 |
-
import datasets
|
| 4 |
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
-
|
| 8 |
-
def test_pdf_provider():
|
| 9 |
-
dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
|
| 10 |
-
idx = dataset['filename'].index('adversarial.pdf')
|
| 11 |
-
|
| 12 |
-
temp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf")
|
| 13 |
-
temp_pdf.write(dataset['pdf'][idx])
|
| 14 |
-
temp_pdf.flush()
|
| 15 |
-
|
| 16 |
-
provider = PdfProvider(temp_pdf.name)
|
| 17 |
-
assert len(provider) == 12
|
| 18 |
-
assert provider.get_image(0, 72).size == (612, 792)
|
| 19 |
-
assert provider.get_image(0, 96).size == (816, 1056)
|
| 20 |
-
|
| 21 |
-
spans_list = provider.get_page_spans(0)
|
| 22 |
assert len(spans_list) == 93
|
| 23 |
|
| 24 |
spans = spans_list[0]
|
|
@@ -26,11 +15,3 @@ def test_pdf_provider():
|
|
| 26 |
assert spans[0].text == "Subspace Adversarial Training"
|
| 27 |
assert spans[0].font == "NimbusRomNo9L-Medi"
|
| 28 |
assert spans[0].formats == ["plain"]
|
| 29 |
-
|
| 30 |
-
# for line in provider.get_page_lines(0):
|
| 31 |
-
# for span in line.spans:
|
| 32 |
-
# print(f"{span=}")
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
if __name__ == "__main__":
|
| 36 |
-
test_pdf_provider()
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
|
|
|
|
| 3 |
|
| 4 |
+
@pytest.mark.config({"page_range": [0]})
|
| 5 |
+
def test_pdf_provider(pdf_provider):
|
| 6 |
+
assert len(pdf_provider) == 12
|
| 7 |
+
assert pdf_provider.get_image(0, 72).size == (612, 792)
|
| 8 |
+
assert pdf_provider.get_image(0, 96).size == (816, 1056)
|
| 9 |
|
| 10 |
+
spans_list = pdf_provider.get_page_spans(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
assert len(spans_list) == 93
|
| 12 |
|
| 13 |
spans = spans_list[0]
|
|
|
|
| 15 |
assert spans[0].text == "Subspace Adversarial Training"
|
| 16 |
assert spans[0].font == "NimbusRomNo9L-Medi"
|
| 17 |
assert spans[0].formats == ["plain"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_structure.py
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
|
|
|
|
|
| 1 |
from marker.v2.builders.structure import StructureBuilder
|
| 2 |
|
| 3 |
|
|
|
|
| 4 |
def test_structure_builder(pdf_document):
|
| 5 |
structure = StructureBuilder()
|
| 6 |
structure(pdf_document)
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
from marker.v2.builders.structure import StructureBuilder
|
| 4 |
|
| 5 |
|
| 6 |
+
@pytest.mark.config({"page_range": [0]})
|
| 7 |
def test_structure_builder(pdf_document):
|
| 8 |
structure = StructureBuilder()
|
| 9 |
structure(pdf_document)
|
tests/test_table_processor.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
|
| 2 |
|
| 3 |
from tabled.schema import SpanTableCell
|
| 4 |
|
|
@@ -6,14 +6,12 @@ from marker.v2.schema import BlockTypes
|
|
| 6 |
from marker.v2.processors.table import TableProcessor
|
| 7 |
|
| 8 |
|
|
|
|
| 9 |
def test_table_processor(pdf_document, detection_model, recognition_model, table_rec_model):
|
| 10 |
processor = TableProcessor(detection_model, recognition_model, table_rec_model)
|
|
|
|
| 11 |
|
| 12 |
-
|
| 13 |
-
new_document.pages = new_document.pages[:5]
|
| 14 |
-
processor(new_document)
|
| 15 |
-
|
| 16 |
-
for block in new_document.pages[0].children:
|
| 17 |
if block.block_type == BlockTypes.Table:
|
| 18 |
assert block.cells is not None
|
| 19 |
assert len(block.cells) > 0
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
|
| 3 |
from tabled.schema import SpanTableCell
|
| 4 |
|
|
|
|
| 6 |
from marker.v2.processors.table import TableProcessor
|
| 7 |
|
| 8 |
|
| 9 |
+
@pytest.mark.config({"page_range": [5]})
|
| 10 |
def test_table_processor(pdf_document, detection_model, recognition_model, table_rec_model):
|
| 11 |
processor = TableProcessor(detection_model, recognition_model, table_rec_model)
|
| 12 |
+
processor(pdf_document)
|
| 13 |
|
| 14 |
+
for block in pdf_document.pages[0].children:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
if block.block_type == BlockTypes.Table:
|
| 16 |
assert block.cells is not None
|
| 17 |
assert len(block.cells) > 0
|