Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

marker / tests /converters /test_ocr_converter.py

peppermenta

Fix tests for block mode

2374d8a 4 months ago

raw

history blame contribute delete

1.91 kB

	import pytest

	from marker.converters.ocr import OCRConverter
	from marker.renderers.ocr_json import OCRJSONOutput, OCRJSONPageOutput


	def _ocr_converter(config, model_dict, temp_pdf, line_count: int, eq_count: int):
	    """Run the OCR converter on ``temp_pdf`` and sanity-check the output.

	    Builds an ``OCRConverter`` from ``model_dict`` (as ``artifact_dict``) and
	    ``config``, converts the file at ``temp_pdf.name``, and asserts that
	    exactly one page is produced and that it holds ``eq_count`` children
	    whose ``block_type`` is ``"Equation"``.

	    ``line_count`` is accepted but not checked: the line-count assertion is
	    disabled in this helper.

	    Returns the pages (the ``children`` of the converter output) so callers
	    can run further checks on them.
	    """
	    run_ocr = OCRConverter(artifact_dict=model_dict, config=config)
	    result: OCRJSONOutput = run_ocr(temp_pdf.name)
	    page_outputs = result.children
	    assert len(page_outputs) == 1
	    # NOTE: the check `len(page_outputs[0].children) == line_count` is
	    # disabled here, so `line_count` currently goes unused.
	    equation_total = sum(
	        1 for block in page_outputs[0].children if block.block_type == "Equation"
	    )
	    assert equation_total == eq_count
	    return page_outputs


	def check_bboxes(page: OCRJSONPageOutput, lines):
	    """Assert that every child of every line lies within the page bbox.

	    Each entry of ``lines`` must have at least one child. For each child,
	    the first two bbox values must be >= the corresponding page bbox values
	    and the last two must be <= the corresponding page bbox values.

	    Raises ``AssertionError`` when a line has no children or when a child
	    bbox falls outside the page bbox.
	    """
	    bounds = page.bbox
	    for entry in lines:
	        assert len(entry.children) > 0
	        for piece in entry.children:
	            box = piece.bbox
	            # Both tuples are built eagerly, so all four comparisons run.
	            lower_ok = (box[0] >= bounds[0], box[1] >= bounds[1])
	            upper_ok = (box[2] <= bounds[2], box[3] <= bounds[3])
	            assert all(lower_ok + upper_ok), "Child bbox is outside page bbox"


	@pytest.mark.config({"page_range": [0]})
	def test_ocr_converter(config, model_dict, temp_doc):
	    """Convert page 0 of the default document and expect 2 equation blocks.

	    The expected line count (85) is passed through, but the helper does not
	    currently assert on it.
	    """
	    _ocr_converter(
	        config,
	        model_dict,
	        temp_doc,
	        line_count=85,
	        eq_count=2,
	    )


	@pytest.mark.filename("pres.pdf")
	@pytest.mark.config({"page_range": [1], "force_ocr": True, "keep_chars": True})
	def test_ocr_converter_force(config, model_dict, temp_doc):
	    """Convert page 1 of ``pres.pdf`` with ``force_ocr`` and check child bboxes.

	    Expects no equation blocks, then verifies that the children of every
	    "Line" block fall inside the page bbox.
	    """
	    converted = _ocr_converter(
	        config, model_dict, temp_doc, line_count=10, eq_count=0
	    )
	    only_page = converted[0]
	    # If the page has no "Line" blocks, the bbox check below passes vacuously.
	    line_blocks = [
	        block for block in only_page.children if block.block_type == "Line"
	    ]
	    check_bboxes(only_page, line_blocks)


	@pytest.mark.filename("pres.pdf")
	@pytest.mark.config({"page_range": [1], "keep_chars": True})
	def test_ocr_converter_keep(config, model_dict, temp_doc):
	    """Convert page 1 of ``pres.pdf`` with ``keep_chars`` and check child bboxes.

	    Expects no equation blocks, then verifies that the children of every
	    "Line" block fall inside the page bbox.
	    """
	    converted = _ocr_converter(
	        config, model_dict, temp_doc, line_count=10, eq_count=0
	    )
	    only_page = converted[0]
	    # If the page has no "Line" blocks, the bbox check below passes vacuously.
	    line_blocks = [
	        block for block in only_page.children if block.block_type == "Line"
	    ]
	    check_bboxes(only_page, line_blocks)