Vik Paruchuri
committed on
Commit
·
1daa24b
1
Parent(s):
518215f
Add test for partial row splitting
Browse files
marker/processors/table.py
CHANGED
|
@@ -226,7 +226,8 @@ class TableProcessor(BaseProcessor):
|
|
| 226 |
new_cell_count += 1
|
| 227 |
|
| 228 |
# For each new row we add, shift up subsequent rows
|
| 229 |
-
|
|
|
|
| 230 |
else:
|
| 231 |
for cell in row_cells:
|
| 232 |
cell.row_id += shift_up
|
|
|
|
| 226 |
new_cell_count += 1
|
| 227 |
|
| 228 |
# For each new row we add, shift up subsequent rows
|
| 229 |
+
# The max is to account for partial rows
|
| 230 |
+
shift_up += max(line_lens) - 1
|
| 231 |
else:
|
| 232 |
for cell in row_cells:
|
| 233 |
cell.row_id += shift_up
|
tests/processors/test_table_processor.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
import pytest
|
| 2 |
from marker.renderers.json import JSONRenderer
|
| 3 |
|
|
@@ -63,3 +65,15 @@ def test_ocr_table(pdf_document, detection_model, recognition_model, table_rec_m
|
|
| 63 |
table_output = renderer(pdf_document)
|
| 64 |
assert "1.2E-38" in table_output.markdown
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
|
| 3 |
import pytest
|
| 4 |
from marker.renderers.json import JSONRenderer
|
| 5 |
|
|
|
|
| 65 |
table_output = renderer(pdf_document)
|
| 66 |
assert "1.2E-38" in table_output.markdown
|
| 67 |
|
| 68 |
+
|
| 69 |
+
@pytest.mark.config({"page_range": [11]})
|
| 70 |
+
def test_split_rows(pdf_document, detection_model, recognition_model, table_rec_model):
|
| 71 |
+
processor = TableProcessor(detection_model, recognition_model, table_rec_model)
|
| 72 |
+
processor(pdf_document)
|
| 73 |
+
|
| 74 |
+
table = pdf_document.contained_blocks((BlockTypes.Table,))[-1]
|
| 75 |
+
cells: List[TableCell] = table.contained_blocks(pdf_document, (BlockTypes.TableCell,))
|
| 76 |
+
unique_rows = len(set([cell.row_id for cell in cells]))
|
| 77 |
+
assert unique_rows == 6
|
| 78 |
+
|
| 79 |
+
|