peppermenta committed on
Commit
a993ef3
·
1 Parent(s): bfc6099

Add tests for line merging logic

Browse files
Files changed (1) hide show
  1. tests/builders/test_line_builder.py +24 -0
tests/builders/test_line_builder.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from marker.schema import BlockTypes
4
+
5
+ # Page contains provider lines that are longer than detected lines
6
+ # Incorrect merging breaks the final OCR results when format_lines is enabled
7
+ @pytest.mark.filename("mixed_eng_hindi.pdf")
8
+ @pytest.mark.config({"page_range": [2], "format_lines": True})
9
+ def test_provider_detected_line_merge(pdf_document):
10
+ page = pdf_document.pages[0]
11
+ text_lines = page.contained_blocks(pdf_document, (BlockTypes.Line,))
12
+
13
+ # This count includes detected lines merged in with provider lines
14
+ assert len(text_lines) == 83
15
+
16
+ # The provider lines on this page contain only English; the Hindi text is missing
17
+ # format_lines should fill in the missing lines
18
+ @pytest.mark.filename("mixed_eng_hindi.pdf")
19
+ @pytest.mark.config({"page_range": [0], "format_lines": True})
20
+ def test_fill_missing_provider_lines(pdf_document):
21
+ page = pdf_document.pages[0]
22
+ raw_text = page.raw_text(pdf_document)
23
+ assert "प्राधिकार से प्रकाशित" in raw_text
24
+ assert "खान मंत्रालय" in raw_text