Vik Paruchuri
committed on
Commit
·
b730265
1
Parent(s):
b2d41ef
Fix tests, add way to disable ocr
Browse files
marker/builders/line.py
CHANGED
|
@@ -36,10 +36,6 @@ class LineBuilder(BaseBuilder):
|
|
| 36 |
"The batch size to use for the ocr error detection model.",
|
| 37 |
"Default is None, which will use the default batch size for the model.",
|
| 38 |
] = None
|
| 39 |
-
enable_table_ocr: Annotated[
|
| 40 |
-
bool,
|
| 41 |
-
"Whether to skip OCR on tables. The TableProcessor will re-OCR them. Only enable if the TableProcessor is not running.",
|
| 42 |
-
] = False
|
| 43 |
layout_coverage_min_lines: Annotated[
|
| 44 |
int,
|
| 45 |
"The minimum number of PdfProvider lines that must be covered by the layout model",
|
|
@@ -54,17 +50,10 @@ class LineBuilder(BaseBuilder):
|
|
| 54 |
float,
|
| 55 |
"If less pages than this threshold are good, OCR will happen in the document. Otherwise it will not.",
|
| 56 |
] = 0.85
|
| 57 |
-
provider_line_detected_line_min_overlap_pct: Annotated[
|
| 58 |
-
float,
|
| 59 |
-
"The percentage of a provider line that has to be covered by a detected line",
|
| 60 |
-
] = 0.1
|
| 61 |
provider_line_provider_line_min_overlap_pct: Annotated[
|
| 62 |
float,
|
| 63 |
"The percentage of a provider line that has to be covered by a detected line",
|
| 64 |
-
] = 0.
|
| 65 |
-
line_vertical_merge_threshold: Annotated[
|
| 66 |
-
int, "The maximum pixel distance between y1s for two lines to be merged"
|
| 67 |
-
] = 8
|
| 68 |
excluded_for_coverage: Annotated[
|
| 69 |
Tuple[BlockTypes],
|
| 70 |
"A list of block types to exclude from the layout coverage check.",
|
|
@@ -86,6 +75,10 @@ class LineBuilder(BaseBuilder):
|
|
| 86 |
bool,
|
| 87 |
"Disable tqdm progress bars.",
|
| 88 |
] = False
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
keep_chars: Annotated[bool, "Keep individual characters."] = False
|
| 90 |
|
| 91 |
def __init__(
|
|
@@ -169,6 +162,9 @@ class LineBuilder(BaseBuilder):
|
|
| 169 |
), # Ensure provider lines don't overflow the page or intersect
|
| 170 |
]
|
| 171 |
)
|
|
|
|
|
|
|
|
|
|
| 172 |
layout_good.append(provider_lines_good)
|
| 173 |
|
| 174 |
run_detection = [not good for good in layout_good]
|
|
@@ -191,12 +187,12 @@ class LineBuilder(BaseBuilder):
|
|
| 191 |
)
|
| 192 |
|
| 193 |
# Setup detection results
|
|
|
|
| 194 |
if detection_result:
|
| 195 |
detection_boxes = [
|
| 196 |
PolygonBox(polygon=box.polygon) for box in detection_result.bboxes
|
| 197 |
]
|
| 198 |
-
|
| 199 |
-
detection_boxes = []
|
| 200 |
detection_boxes = sort_text_lines(detection_boxes)
|
| 201 |
|
| 202 |
if provider_lines_good:
|
|
@@ -257,6 +253,7 @@ class LineBuilder(BaseBuilder):
|
|
| 257 |
provider_bboxes = [line.line.polygon.bbox for line in provider_lines]
|
| 258 |
# Add a small margin to account for minor overflows
|
| 259 |
page_bbox = document_page.polygon.expand(5, 5).bbox
|
|
|
|
| 260 |
for bbox in provider_bboxes:
|
| 261 |
if bbox[0] < page_bbox[0]:
|
| 262 |
return False
|
|
@@ -275,7 +272,7 @@ class LineBuilder(BaseBuilder):
|
|
| 275 |
)
|
| 276 |
|
| 277 |
# There should be one intersection with itself
|
| 278 |
-
if intersect_counts >
|
| 279 |
return False
|
| 280 |
|
| 281 |
return True
|
|
|
|
| 36 |
"The batch size to use for the ocr error detection model.",
|
| 37 |
"Default is None, which will use the default batch size for the model.",
|
| 38 |
] = None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
layout_coverage_min_lines: Annotated[
|
| 40 |
int,
|
| 41 |
"The minimum number of PdfProvider lines that must be covered by the layout model",
|
|
|
|
| 50 |
float,
|
| 51 |
"If less pages than this threshold are good, OCR will happen in the document. Otherwise it will not.",
|
| 52 |
] = 0.85
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
provider_line_provider_line_min_overlap_pct: Annotated[
|
| 54 |
float,
|
| 55 |
"The percentage of a provider line that has to be covered by a detected line",
|
| 56 |
+
] = 0.15
|
|
|
|
|
|
|
|
|
|
| 57 |
excluded_for_coverage: Annotated[
|
| 58 |
Tuple[BlockTypes],
|
| 59 |
"A list of block types to exclude from the layout coverage check.",
|
|
|
|
| 75 |
bool,
|
| 76 |
"Disable tqdm progress bars.",
|
| 77 |
] = False
|
| 78 |
+
disable_ocr: Annotated[
|
| 79 |
+
bool,
|
| 80 |
+
"Disable OCR for the document. This will only use the lines from the provider.",
|
| 81 |
+
] = False
|
| 82 |
keep_chars: Annotated[bool, "Keep individual characters."] = False
|
| 83 |
|
| 84 |
def __init__(
|
|
|
|
| 162 |
), # Ensure provider lines don't overflow the page or intersect
|
| 163 |
]
|
| 164 |
)
|
| 165 |
+
if self.disable_ocr:
|
| 166 |
+
provider_lines_good = True
|
| 167 |
+
|
| 168 |
layout_good.append(provider_lines_good)
|
| 169 |
|
| 170 |
run_detection = [not good for good in layout_good]
|
|
|
|
| 187 |
)
|
| 188 |
|
| 189 |
# Setup detection results
|
| 190 |
+
detection_boxes = []
|
| 191 |
if detection_result:
|
| 192 |
detection_boxes = [
|
| 193 |
PolygonBox(polygon=box.polygon) for box in detection_result.bboxes
|
| 194 |
]
|
| 195 |
+
|
|
|
|
| 196 |
detection_boxes = sort_text_lines(detection_boxes)
|
| 197 |
|
| 198 |
if provider_lines_good:
|
|
|
|
| 253 |
provider_bboxes = [line.line.polygon.bbox for line in provider_lines]
|
| 254 |
# Add a small margin to account for minor overflows
|
| 255 |
page_bbox = document_page.polygon.expand(5, 5).bbox
|
| 256 |
+
|
| 257 |
for bbox in provider_bboxes:
|
| 258 |
if bbox[0] < page_bbox[0]:
|
| 259 |
return False
|
|
|
|
| 272 |
)
|
| 273 |
|
| 274 |
# There should be one intersection with itself
|
| 275 |
+
if intersect_counts > 2:
|
| 276 |
return False
|
| 277 |
|
| 278 |
return True
|
tests/builders/test_document_builder.py
CHANGED
|
@@ -4,20 +4,40 @@ from marker.schema import BlockTypes
|
|
| 4 |
from marker.schema.text.line import Line
|
| 5 |
|
| 6 |
|
|
|
|
| 7 |
@pytest.mark.config({"page_range": [0]})
|
| 8 |
def test_document_builder(pdf_document):
|
| 9 |
first_page = pdf_document.pages[0]
|
| 10 |
-
assert first_page.structure[0] ==
|
| 11 |
|
| 12 |
first_block = first_page.get_block(first_page.structure[0])
|
| 13 |
assert first_block.block_type == BlockTypes.SectionHeader
|
| 14 |
-
assert first_block.text_extraction_method ==
|
| 15 |
|
| 16 |
first_text_block: Line = first_page.get_block(first_block.structure[0])
|
| 17 |
assert first_text_block.block_type == BlockTypes.Line
|
| 18 |
|
| 19 |
first_span = first_page.get_block(first_text_block.structure[0])
|
| 20 |
assert first_span.block_type == BlockTypes.Span
|
| 21 |
-
assert first_span.text ==
|
| 22 |
-
assert first_span.font ==
|
| 23 |
-
assert first_span.formats == [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
from marker.schema.text.line import Line
|
| 5 |
|
| 6 |
|
| 7 |
+
@pytest.mark.filename("thinkpython.pdf")
|
| 8 |
@pytest.mark.config({"page_range": [0]})
|
| 9 |
def test_document_builder(pdf_document):
|
| 10 |
first_page = pdf_document.pages[0]
|
| 11 |
+
assert first_page.structure[0] == "/page/0/SectionHeader/0"
|
| 12 |
|
| 13 |
first_block = first_page.get_block(first_page.structure[0])
|
| 14 |
assert first_block.block_type == BlockTypes.SectionHeader
|
| 15 |
+
assert first_block.text_extraction_method == "pdftext"
|
| 16 |
|
| 17 |
first_text_block: Line = first_page.get_block(first_block.structure[0])
|
| 18 |
assert first_text_block.block_type == BlockTypes.Line
|
| 19 |
|
| 20 |
first_span = first_page.get_block(first_text_block.structure[0])
|
| 21 |
assert first_span.block_type == BlockTypes.Span
|
| 22 |
+
assert first_span.text == "Think Python"
|
| 23 |
+
assert first_span.font == "URWPalladioL-Roma"
|
| 24 |
+
assert first_span.formats == ["plain"]
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@pytest.mark.config({"page_range": [0]})
|
| 28 |
+
def test_document_builder_inline_eq(pdf_document):
|
| 29 |
+
first_page = pdf_document.pages[0]
|
| 30 |
+
assert first_page.structure[0] == "/page/0/SectionHeader/0"
|
| 31 |
+
|
| 32 |
+
first_block = first_page.get_block(first_page.structure[0])
|
| 33 |
+
assert first_block.block_type == BlockTypes.SectionHeader
|
| 34 |
+
assert first_block.text_extraction_method == "surya"
|
| 35 |
+
|
| 36 |
+
first_text_block: Line = first_page.get_block(first_block.structure[0])
|
| 37 |
+
assert first_text_block.block_type == BlockTypes.Line
|
| 38 |
+
|
| 39 |
+
first_span = first_page.get_block(first_text_block.structure[0])
|
| 40 |
+
assert first_span.block_type == BlockTypes.Span
|
| 41 |
+
assert first_span.text == "Subspace Adversarial Training"
|
| 42 |
+
assert first_span.font == "NimbusRomNo9L-Medi"
|
| 43 |
+
assert first_span.formats == ["plain"]
|
tests/builders/test_layout_replace.py
CHANGED
|
@@ -8,8 +8,11 @@ from marker.schema import BlockTypes
|
|
| 8 |
from marker.schema.registry import get_block_class
|
| 9 |
|
| 10 |
|
|
|
|
| 11 |
@pytest.mark.config({"page_range": [0]})
|
| 12 |
-
def test_layout_replace(
|
|
|
|
|
|
|
| 13 |
# The llm layout builder replaces blocks - this makes sure text is still merged properly
|
| 14 |
layout_builder = LayoutBuilder(layout_model, config)
|
| 15 |
line_builder = LineBuilder(detection_model, ocr_error_model, config)
|
|
@@ -35,8 +38,4 @@ def test_layout_replace(request, config, doc_provider, layout_model, ocr_error_m
|
|
| 35 |
renderer = MarkdownRenderer(config)
|
| 36 |
rendered = renderer(document)
|
| 37 |
|
| 38 |
-
assert "
|
| 39 |
-
assert "projected gradient descent" in rendered.markdown
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
| 8 |
from marker.schema.registry import get_block_class
|
| 9 |
|
| 10 |
|
| 11 |
+
@pytest.mark.filename("thinkpython.pdf")
|
| 12 |
@pytest.mark.config({"page_range": [0]})
|
| 13 |
+
def test_layout_replace(
|
| 14 |
+
request, config, doc_provider, layout_model, ocr_error_model, detection_model
|
| 15 |
+
):
|
| 16 |
# The llm layout builder replaces blocks - this makes sure text is still merged properly
|
| 17 |
layout_builder = LayoutBuilder(layout_model, config)
|
| 18 |
line_builder = LineBuilder(detection_model, ocr_error_model, config)
|
|
|
|
| 38 |
renderer = MarkdownRenderer(config)
|
| 39 |
rendered = renderer(document)
|
| 40 |
|
| 41 |
+
assert "Think Python" in rendered.markdown
|
|
|
|
|
|
|
|
|
|
|
|
tests/converters/test_ocr_converter.py
CHANGED
|
@@ -35,7 +35,7 @@ def check_bboxes(page: OCRJSONPageOutput, lines):
|
|
| 35 |
|
| 36 |
@pytest.mark.config({"page_range": [0]})
|
| 37 |
def test_ocr_converter(config, model_dict, temp_doc):
|
| 38 |
-
_ocr_converter(config, model_dict, temp_doc,
|
| 39 |
|
| 40 |
|
| 41 |
@pytest.mark.filename("pres.pdf")
|
|
|
|
| 35 |
|
| 36 |
@pytest.mark.config({"page_range": [0]})
|
| 37 |
def test_ocr_converter(config, model_dict, temp_doc):
|
| 38 |
+
_ocr_converter(config, model_dict, temp_doc, 85, 2)
|
| 39 |
|
| 40 |
|
| 41 |
@pytest.mark.filename("pres.pdf")
|
tests/converters/test_pdf_converter.py
CHANGED
|
@@ -6,7 +6,7 @@ from marker.renderers.markdown import MarkdownOutput
|
|
| 6 |
|
| 7 |
|
| 8 |
@pytest.mark.output_format("markdown")
|
| 9 |
-
@pytest.mark.config({"page_range": [0, 1, 2, 3, 7]})
|
| 10 |
def test_pdf_converter(pdf_converter: PdfConverter, temp_doc):
|
| 11 |
markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
|
| 12 |
markdown = markdown_output.markdown
|
|
@@ -79,7 +79,7 @@ def test_pptx_converter(pdf_converter: PdfConverter, temp_doc):
|
|
| 79 |
|
| 80 |
|
| 81 |
@pytest.mark.output_format("markdown")
|
| 82 |
-
@pytest.mark.config({"page_range": [0, 1, 2, 3, 7]})
|
| 83 |
def test_pdf_converter_bytes(pdf_converter: PdfConverter, temp_doc):
|
| 84 |
with open(temp_doc.name, "rb") as f:
|
| 85 |
data = f.read()
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
@pytest.mark.output_format("markdown")
|
| 9 |
+
@pytest.mark.config({"page_range": [0, 1, 2, 3, 7], "disable_ocr": True})
|
| 10 |
def test_pdf_converter(pdf_converter: PdfConverter, temp_doc):
|
| 11 |
markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
|
| 12 |
markdown = markdown_output.markdown
|
|
|
|
| 79 |
|
| 80 |
|
| 81 |
@pytest.mark.output_format("markdown")
|
| 82 |
+
@pytest.mark.config({"page_range": [0, 1, 2, 3, 7], "disable_ocr": True})
|
| 83 |
def test_pdf_converter_bytes(pdf_converter: PdfConverter, temp_doc):
|
| 84 |
with open(temp_doc.name, "rb") as f:
|
| 85 |
data = f.read()
|
tests/renderers/test_markdown_renderer.py
CHANGED
|
@@ -5,13 +5,22 @@ from marker.schema import BlockTypes
|
|
| 5 |
from marker.schema.blocks import TableCell
|
| 6 |
|
| 7 |
|
| 8 |
-
@pytest.mark.config({"page_range": [0]})
|
| 9 |
def test_markdown_renderer(pdf_document):
|
| 10 |
renderer = MarkdownRenderer()
|
| 11 |
md = renderer(pdf_document).markdown
|
| 12 |
|
| 13 |
# Verify markdown
|
| 14 |
-
assert
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
@pytest.mark.config({"page_range": [0, 1], "paginate_output": True})
|
|
@@ -29,12 +38,14 @@ def test_markdown_renderer_pagination_blank_last_page(pdf_document):
|
|
| 29 |
last_page = pdf_document.pages[-1]
|
| 30 |
last_page.children = []
|
| 31 |
last_page.structure = []
|
| 32 |
-
|
| 33 |
renderer = MarkdownRenderer({"paginate_output": True})
|
| 34 |
md = renderer(pdf_document).markdown
|
| 35 |
-
|
| 36 |
# Should end with pagination marker and preserve trailing newlines
|
| 37 |
-
assert md.endswith("}\n\n") or md.endswith(
|
|
|
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
@pytest.mark.config({"page_range": [0, 1]})
|
|
@@ -48,9 +59,10 @@ def test_markdown_renderer_metadata(pdf_document):
|
|
| 48 |
def test_markdown_renderer_images(pdf_document):
|
| 49 |
renderer = MarkdownRenderer({"extract_images": False})
|
| 50 |
markdown_output = renderer(pdf_document)
|
| 51 |
-
|
| 52 |
assert len(markdown_output.images) == 0
|
| 53 |
-
assert
|
|
|
|
| 54 |
|
| 55 |
@pytest.mark.config({"page_range": [5]})
|
| 56 |
def test_markdown_renderer_tables(pdf_document):
|
|
@@ -74,5 +86,3 @@ def test_markdown_renderer_tables(pdf_document):
|
|
| 74 |
renderer = MarkdownRenderer()
|
| 75 |
md = renderer(pdf_document).markdown
|
| 76 |
assert "54 <i>.45</i> 67<br>89 $x$" in md
|
| 77 |
-
|
| 78 |
-
|
|
|
|
| 5 |
from marker.schema.blocks import TableCell
|
| 6 |
|
| 7 |
|
| 8 |
+
@pytest.mark.config({"page_range": [0], "disable_ocr": True})
|
| 9 |
def test_markdown_renderer(pdf_document):
|
| 10 |
renderer = MarkdownRenderer()
|
| 11 |
md = renderer(pdf_document).markdown
|
| 12 |
|
| 13 |
# Verify markdown
|
| 14 |
+
assert "# Subspace Adversarial Training" in md
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@pytest.mark.config({"page_range": [0]})
|
| 18 |
+
def test_markdown_renderer_auto_ocr(pdf_document):
|
| 19 |
+
renderer = MarkdownRenderer()
|
| 20 |
+
md = renderer(pdf_document).markdown
|
| 21 |
+
|
| 22 |
+
# Verify markdown
|
| 23 |
+
assert "Subspace Adversarial Training" in md
|
| 24 |
|
| 25 |
|
| 26 |
@pytest.mark.config({"page_range": [0, 1], "paginate_output": True})
|
|
|
|
| 38 |
last_page = pdf_document.pages[-1]
|
| 39 |
last_page.children = []
|
| 40 |
last_page.structure = []
|
| 41 |
+
|
| 42 |
renderer = MarkdownRenderer({"paginate_output": True})
|
| 43 |
md = renderer(pdf_document).markdown
|
| 44 |
+
|
| 45 |
# Should end with pagination marker and preserve trailing newlines
|
| 46 |
+
assert md.endswith("}\n\n") or md.endswith(
|
| 47 |
+
"}------------------------------------------------\n\n"
|
| 48 |
+
)
|
| 49 |
|
| 50 |
|
| 51 |
@pytest.mark.config({"page_range": [0, 1]})
|
|
|
|
| 59 |
def test_markdown_renderer_images(pdf_document):
|
| 60 |
renderer = MarkdownRenderer({"extract_images": False})
|
| 61 |
markdown_output = renderer(pdf_document)
|
| 62 |
+
|
| 63 |
assert len(markdown_output.images) == 0
|
| 64 |
+
assert "
|
| 68 |
def test_markdown_renderer_tables(pdf_document):
|
|
|
|
| 86 |
renderer = MarkdownRenderer()
|
| 87 |
md = renderer(pdf_document).markdown
|
| 88 |
assert "54 <i>.45</i> 67<br>89 $x$" in md
|
|
|
|
|
|