Moses Paul R committed on
Commit
aee20f6
·
1 Parent(s): 662dfec

make tests much faster and cleanup [skip ci]

Browse files
marker/v2/schema/document.py CHANGED
@@ -5,7 +5,7 @@ from typing import List
5
  from pydantic import BaseModel
6
 
7
  from marker.v2.schema import BlockTypes
8
- from marker.v2.schema.blocks import BlockId, BlockOutput
9
  from marker.v2.schema.groups.page import PageGroup
10
 
11
 
@@ -28,11 +28,12 @@ class Document(BaseModel):
28
  return None
29
 
30
  def get_page(self, page_id):
31
- page = self.pages[page_id]
32
- assert page.page_id == page_id, "Mismatch between page_id and page index"
33
- return page
 
34
 
35
- def assemble_html(self, child_blocks):
36
  template = ""
37
  for c in child_blocks:
38
  template += f"<content-ref src='{c.id}'></content-ref>"
 
5
  from pydantic import BaseModel
6
 
7
  from marker.v2.schema import BlockTypes
8
+ from marker.v2.schema.blocks import Block, BlockId, BlockOutput
9
  from marker.v2.schema.groups.page import PageGroup
10
 
11
 
 
28
  return None
29
 
30
  def get_page(self, page_id):
31
+ for page in self.pages:
32
+ if page.page_id == page_id:
33
+ return page
34
+ return None
35
 
36
+ def assemble_html(self, child_blocks: List[Block]):
37
  template = ""
38
  for c in child_blocks:
39
  template += f"<content-ref src='{c.id}'></content-ref>"
tests/conftest.py CHANGED
@@ -49,8 +49,11 @@ def table_rec_model():
49
 
50
  @pytest.fixture(scope="function")
51
  def pdf_provider(request):
52
- mark = request.node.get_closest_marker("filename")
53
- filename = mark.args[0] if mark else "adversarial.pdf"
 
 
 
54
 
55
  dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
56
  idx = dataset['filename'].index(filename)
@@ -58,7 +61,7 @@ def pdf_provider(request):
58
  temp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf")
59
  temp_pdf.write(dataset['pdf'][idx])
60
  temp_pdf.flush()
61
- yield PdfProvider(temp_pdf.name)
62
 
63
 
64
  @pytest.fixture(scope="function")
 
49
 
50
  @pytest.fixture(scope="function")
51
  def pdf_provider(request):
52
+ filename_mark = request.node.get_closest_marker("filename")
53
+ filename = filename_mark.args[0] if filename_mark else "adversarial.pdf"
54
+
55
+ config_mark = request.node.get_closest_marker("config")
56
+ config = config_mark.args[0] if config_mark else None
57
 
58
  dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
59
  idx = dataset['filename'].index(filename)
 
61
  temp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf")
62
  temp_pdf.write(dataset['pdf'][idx])
63
  temp_pdf.flush()
64
+ yield PdfProvider(temp_pdf.name, config)
65
 
66
 
67
  @pytest.fixture(scope="function")
tests/test_document_builder.py CHANGED
@@ -1,7 +1,10 @@
 
 
1
  from marker.v2.schema import BlockTypes
2
  from marker.v2.schema.text.line import Line
3
 
4
 
 
5
  def test_document_builder(pdf_document):
6
  first_page = pdf_document.pages[0]
7
  assert first_page.structure[0] == '/page/0/SectionHeader/0'
@@ -18,22 +21,3 @@ def test_document_builder(pdf_document):
18
  assert first_span.text == 'Subspace Adversarial Training'
19
  assert first_span.font == 'NimbusRomNo9L-Medi'
20
  assert first_span.formats == ['plain']
21
-
22
- last_block = first_page.get_block(first_page.structure[-1])
23
- assert last_block.block_type == BlockTypes.Text
24
-
25
- last_text_block: Line = first_page.get_block(last_block.structure[-1])
26
- assert last_text_block.block_type == BlockTypes.Line
27
-
28
- last_span = first_page.get_block(last_text_block.structure[-1])
29
- assert last_span.block_type == BlockTypes.Span
30
- assert last_span.text == 'prove the quality of single-step AT solutions. However,'
31
- assert last_span.font == 'NimbusRomNo9L-Regu'
32
- assert last_span.formats == ['plain']
33
-
34
-
35
- if __name__ == "__main__":
36
- from tests.utils import setup_pdf_document
37
-
38
- pdf_document = setup_pdf_document("adversarial.pdf")
39
- test_document_builder(pdf_document)
 
1
+ import pytest
2
+
3
  from marker.v2.schema import BlockTypes
4
  from marker.v2.schema.text.line import Line
5
 
6
 
7
+ @pytest.mark.config({"page_range": [0]})
8
  def test_document_builder(pdf_document):
9
  first_page = pdf_document.pages[0]
10
  assert first_page.structure[0] == '/page/0/SectionHeader/0'
 
21
  assert first_span.text == 'Subspace Adversarial Training'
22
  assert first_span.font == 'NimbusRomNo9L-Medi'
23
  assert first_span.formats == ['plain']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/test_equation_processor.py CHANGED
@@ -1,16 +1,14 @@
1
- from copy import deepcopy
2
 
3
  from marker.v2.schema import BlockTypes
4
  from marker.v2.processors.equation import EquationProcessor
5
 
6
 
 
7
  def test_equation_processor(pdf_document, texify_model):
8
  processor = EquationProcessor(texify_model)
 
9
 
10
- new_document = deepcopy(pdf_document)
11
- new_document.pages = [new_document.pages[0]]
12
- processor(new_document)
13
-
14
- for block in new_document.pages[0].children:
15
  if block.block_type == BlockTypes.Equation:
16
  assert block.latex is not None
 
1
+ import pytest
2
 
3
  from marker.v2.schema import BlockTypes
4
  from marker.v2.processors.equation import EquationProcessor
5
 
6
 
7
+ @pytest.mark.config({"page_range": [0]})
8
  def test_equation_processor(pdf_document, texify_model):
9
  processor = EquationProcessor(texify_model)
10
+ processor(pdf_document)
11
 
12
+ for block in pdf_document.pages[0].children:
 
 
 
 
13
  if block.block_type == BlockTypes.Equation:
14
  assert block.latex is not None
tests/test_garbled_pdf.py CHANGED
@@ -1,12 +1,9 @@
 
1
  from marker.v2.schema import BlockTypes
2
- from marker.v2.schema.text.line import Line
3
- from tests.utils import setup_pdf_document
4
 
5
 
6
- def test_ocr_pipeline():
7
- pdf_document = setup_pdf_document(
8
- "water_damage.pdf"
9
- )
10
  assert pdf_document.pages[0].structure[0] == '/page/0/Table/0'
11
 
12
  table_block = pdf_document.pages[0].get_block(pdf_document.pages[0].structure[0])
 
1
+ import pytest
2
  from marker.v2.schema import BlockTypes
 
 
3
 
4
 
5
+ @pytest.mark.filename("water_damage.pdf")
6
+ def test_ocr_pipeline(pdf_document):
 
 
7
  assert pdf_document.pages[0].structure[0] == '/page/0/Table/0'
8
 
9
  table_block = pdf_document.pages[0].get_block(pdf_document.pages[0].structure[0])
tests/test_ocr_pipeline.py CHANGED
@@ -1,16 +1,11 @@
 
 
1
  from marker.v2.schema import BlockTypes
2
  from marker.v2.schema.text.line import Line
3
- from tests.utils import setup_pdf_document
4
-
5
 
6
- def test_ocr_pipeline():
7
- pdf_document = setup_pdf_document(
8
- "adversarial.pdf",
9
- pdf_provider_config={
10
- "force_ocr": True
11
- }
12
- )
13
 
 
 
14
  first_page = pdf_document.pages[0]
15
  assert first_page.structure[0] == '/page/0/SectionHeader/0'
16
 
@@ -24,7 +19,3 @@ def test_ocr_pipeline():
24
  first_span = first_page.get_block(first_text_block.structure[0])
25
  assert first_span.block_type == BlockTypes.Span
26
  assert first_span.text.strip() == 'Subspace Adversarial Training'
27
-
28
-
29
- if __name__ == "__main__":
30
- test_ocr_pipeline()
 
1
+ import pytest
2
+
3
  from marker.v2.schema import BlockTypes
4
  from marker.v2.schema.text.line import Line
 
 
5
 
 
 
 
 
 
 
 
6
 
7
+ @pytest.mark.config({"force_ocr": True, "page_range": [0]})
8
+ def test_ocr_pipeline(pdf_document):
9
  first_page = pdf_document.pages[0]
10
  assert first_page.structure[0] == '/page/0/SectionHeader/0'
11
 
 
19
  first_span = first_page.get_block(first_text_block.structure[0])
20
  assert first_span.block_type == BlockTypes.Span
21
  assert first_span.text.strip() == 'Subspace Adversarial Training'
 
 
 
 
tests/test_pdf_provider.py CHANGED
@@ -1,24 +1,13 @@
1
- import tempfile
2
 
3
- import datasets
4
 
5
- from marker.v2.providers.pdf import PdfProvider
 
 
 
 
6
 
7
-
8
- def test_pdf_provider():
9
- dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
10
- idx = dataset['filename'].index('adversarial.pdf')
11
-
12
- temp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf")
13
- temp_pdf.write(dataset['pdf'][idx])
14
- temp_pdf.flush()
15
-
16
- provider = PdfProvider(temp_pdf.name)
17
- assert len(provider) == 12
18
- assert provider.get_image(0, 72).size == (612, 792)
19
- assert provider.get_image(0, 96).size == (816, 1056)
20
-
21
- spans_list = provider.get_page_spans(0)
22
  assert len(spans_list) == 93
23
 
24
  spans = spans_list[0]
@@ -26,11 +15,3 @@ def test_pdf_provider():
26
  assert spans[0].text == "Subspace Adversarial Training"
27
  assert spans[0].font == "NimbusRomNo9L-Medi"
28
  assert spans[0].formats == ["plain"]
29
-
30
- # for line in provider.get_page_lines(0):
31
- # for span in line.spans:
32
- # print(f"{span=}")
33
-
34
-
35
- if __name__ == "__main__":
36
- test_pdf_provider()
 
1
+ import pytest
2
 
 
3
 
4
+ @pytest.mark.config({"page_range": [0]})
5
+ def test_pdf_provider(pdf_provider):
6
+ assert len(pdf_provider) == 12
7
+ assert pdf_provider.get_image(0, 72).size == (612, 792)
8
+ assert pdf_provider.get_image(0, 96).size == (816, 1056)
9
 
10
+ spans_list = pdf_provider.get_page_spans(0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  assert len(spans_list) == 93
12
 
13
  spans = spans_list[0]
 
15
  assert spans[0].text == "Subspace Adversarial Training"
16
  assert spans[0].font == "NimbusRomNo9L-Medi"
17
  assert spans[0].formats == ["plain"]
 
 
 
 
 
 
 
 
tests/test_structure.py CHANGED
@@ -1,6 +1,9 @@
 
 
1
  from marker.v2.builders.structure import StructureBuilder
2
 
3
 
 
4
  def test_structure_builder(pdf_document):
5
  structure = StructureBuilder()
6
  structure(pdf_document)
 
1
+ import pytest
2
+
3
  from marker.v2.builders.structure import StructureBuilder
4
 
5
 
6
+ @pytest.mark.config({"page_range": [0]})
7
  def test_structure_builder(pdf_document):
8
  structure = StructureBuilder()
9
  structure(pdf_document)
tests/test_table_processor.py CHANGED
@@ -1,4 +1,4 @@
1
- from copy import deepcopy
2
 
3
  from tabled.schema import SpanTableCell
4
 
@@ -6,14 +6,12 @@ from marker.v2.schema import BlockTypes
6
  from marker.v2.processors.table import TableProcessor
7
 
8
 
 
9
  def test_table_processor(pdf_document, detection_model, recognition_model, table_rec_model):
10
  processor = TableProcessor(detection_model, recognition_model, table_rec_model)
 
11
 
12
- new_document = deepcopy(pdf_document)
13
- new_document.pages = new_document.pages[:5]
14
- processor(new_document)
15
-
16
- for block in new_document.pages[0].children:
17
  if block.block_type == BlockTypes.Table:
18
  assert block.cells is not None
19
  assert len(block.cells) > 0
 
1
+ import pytest
2
 
3
  from tabled.schema import SpanTableCell
4
 
 
6
  from marker.v2.processors.table import TableProcessor
7
 
8
 
9
+ @pytest.mark.config({"page_range": [5]})
10
  def test_table_processor(pdf_document, detection_model, recognition_model, table_rec_model):
11
  processor = TableProcessor(detection_model, recognition_model, table_rec_model)
12
+ processor(pdf_document)
13
 
14
+ for block in pdf_document.pages[0].children:
 
 
 
 
15
  if block.block_type == BlockTypes.Table:
16
  assert block.cells is not None
17
  assert len(block.cells) > 0