marker/tests/converters/test_ocr_converter.py
peppermenta's picture
Fix tests for block mode
2374d8a
import pytest
from marker.converters.ocr import OCRConverter
from marker.renderers.ocr_json import OCRJSONOutput, OCRJSONPageOutput
def _ocr_converter(config, model_dict, temp_pdf, line_count: int, eq_count: int):
    """Run ``OCRConverter`` on ``temp_pdf`` and sanity-check the output.

    Builds the converter from ``model_dict`` (passed as ``artifact_dict``)
    and ``config``, calls it with ``temp_pdf.name``, and asserts that the
    result has exactly one page containing ``eq_count`` children whose
    ``block_type`` is ``"Equation"``.

    Returns the converter output's ``children`` (the pages).
    """
    run_ocr = OCRConverter(artifact_dict=model_dict, config=config)
    result: OCRJSONOutput = run_ocr(temp_pdf.name)

    pages = result.children
    assert len(pages) == 1
    first_page = pages[0]

    # NOTE: ``line_count`` is accepted but currently not verified; the exact
    # check ``len(pages[0].children) == line_count`` is intentionally disabled.
    equations = [
        block for block in first_page.children if block.block_type == "Equation"
    ]
    assert len(equations) == eq_count

    return pages
def check_bboxes(page: OCRJSONPageOutput, lines):
    """Assert that every child of every line lies within the page bbox.

    Each entry of ``lines`` must have at least one child. A child's ``bbox``
    is compared index by index against ``page.bbox``: indices 0 and 1 must
    be >= the page's values, indices 2 and 3 must be <= the page's values.
    """
    page_box = page.bbox
    for line in lines:
        children = line.children
        assert len(children) > 0

        for child in line.children:
            child_box = child.bbox
            # All four comparisons are evaluated before the assertion runs.
            first_ok = child_box[0] >= page_box[0]
            second_ok = child_box[1] >= page_box[1]
            third_ok = child_box[2] <= page_box[2]
            fourth_ok = child_box[3] <= page_box[3]
            within_page = all((first_ok, second_ok, third_ok, fourth_ok))
            assert within_page, "Child bbox is outside page bbox"
@pytest.mark.config({"page_range": [0]})
def test_ocr_converter(config, model_dict, temp_doc):
    """Convert the ``temp_doc`` fixture with ``page_range`` set to ``[0]``.

    Passes 85 as the line count and expects 2 equation blocks.
    """
    _ocr_converter(
        config,
        model_dict,
        temp_doc,
        line_count=85,
        eq_count=2,
    )
@pytest.mark.filename("pres.pdf")
@pytest.mark.config({"page_range": [1], "force_ocr": True, "keep_chars": True})
def test_ocr_converter_force(config, model_dict, temp_doc):
    """Convert ``pres.pdf`` with ``force_ocr`` and ``keep_chars`` enabled.

    Expects no equation blocks, then checks that the children of every
    ``"Line"`` block on the page stay inside the page's bounding box.
    """
    pages = _ocr_converter(
        config, model_dict, temp_doc, line_count=10, eq_count=0
    )
    first_page = pages[0]
    line_blocks = [
        block for block in first_page.children if block.block_type == "Line"
    ]
    check_bboxes(first_page, line_blocks)
@pytest.mark.filename("pres.pdf")
@pytest.mark.config({"page_range": [1], "keep_chars": True})
def test_ocr_converter_keep(config, model_dict, temp_doc):
    """Convert ``pres.pdf`` with ``keep_chars`` enabled (no ``force_ocr``).

    Expects no equation blocks, then checks that the children of every
    ``"Line"`` block on the page stay inside the page's bounding box.
    """
    pages = _ocr_converter(
        config, model_dict, temp_doc, line_count=10, eq_count=0
    )
    first_page = pages[0]
    line_blocks = [
        block for block in first_page.children if block.block_type == "Line"
    ]
    check_bboxes(first_page, line_blocks)