Commit
·
a993ef3
1
Parent(s):
bfc6099
Add tests for line merging logic
Browse files
tests/builders/test_line_builder.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
from marker.schema import BlockTypes
|
| 4 |
+
|
| 5 |
+
# Page contains provider lines that are longer than detected lines
|
| 6 |
+
# Bad merging here produces broken final OCR results when format_lines is enabled
|
| 7 |
+
@pytest.mark.filename("mixed_eng_hindi.pdf")
@pytest.mark.config({"page_range": [2], "format_lines": True})
def test_provider_detected_line_merge(pdf_document):
    """Check the number of Line blocks on the first page of the document.

    The expected total counts detected lines that were merged in with
    provider lines.
    """
    first_page = pdf_document.pages[0]
    line_types = (BlockTypes.Line,)
    merged_lines = first_page.contained_blocks(pdf_document, line_types)

    # This count includes detected lines merged in with provider lines
    line_count = len(merged_lines)
    assert line_count == 83
|
| 15 |
+
|
| 16 |
+
# Page provider lines only contain English; the Hindi text is missing
|
| 17 |
+
# format_lines should fill in the missing lines
|
| 18 |
+
@pytest.mark.filename("mixed_eng_hindi.pdf")
@pytest.mark.config({"page_range": [0], "format_lines": True})
def test_fill_missing_provider_lines(pdf_document):
    """Check that the first page's raw text contains the expected Hindi phrases."""
    first_page = pdf_document.pages[0]
    page_text = first_page.raw_text(pdf_document)

    # Both phrases must be present in the page text
    for hindi_phrase in ("प्राधिकार से प्रकाशित", "खान मंत्रालय"):
        assert hindi_phrase in page_text
|