Vik Paruchuri
committed on
Commit
·
66acddd
1
Parent(s):
72f1d99
Fix header row issues
Browse files- marker/processors/table.py +18 -13
- marker/renderers/markdown.py +11 -4
- tests/converters/test_table_converter.py +31 -0
marker/processors/table.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
from collections import defaultdict
|
| 2 |
from typing import Annotated, List
|
| 3 |
|
|
@@ -61,7 +62,7 @@ class TableProcessor(BaseProcessor):
|
|
| 61 |
table_data = []
|
| 62 |
for page in document.pages:
|
| 63 |
for block in page.contained_blocks(document, self.block_types):
|
| 64 |
-
image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size)
|
| 65 |
image = page.highres_image.crop(image_poly.bbox).convert("RGB")
|
| 66 |
|
| 67 |
table_data.append({
|
|
@@ -70,7 +71,7 @@ class TableProcessor(BaseProcessor):
|
|
| 70 |
"table_image": image,
|
| 71 |
"table_bbox": image_poly.bbox,
|
| 72 |
"img_size": page.highres_image.size,
|
| 73 |
-
"ocr_block":
|
| 74 |
})
|
| 75 |
|
| 76 |
extract_blocks = [t for t in table_data if not t["ocr_block"]]
|
|
@@ -133,7 +134,11 @@ class TableProcessor(BaseProcessor):
|
|
| 133 |
|
| 134 |
for k in cell_text:
|
| 135 |
# TODO: see if the text needs to be sorted (based on rotation)
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
def assign_pdftext_lines(self, extract_blocks: list, filepath: str):
|
| 139 |
table_inputs = []
|
|
@@ -166,21 +171,21 @@ class TableProcessor(BaseProcessor):
|
|
| 166 |
|
| 167 |
def assign_ocr_lines(self, ocr_blocks: list):
|
| 168 |
det_images = [t["table_image"] for t in ocr_blocks]
|
| 169 |
-
ocr_results: List[OCRResult] = self.recognition_model(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
for block, ocr_res in zip(ocr_blocks, ocr_results):
|
| 172 |
table_cells = []
|
| 173 |
for line in ocr_res.text_lines:
|
| 174 |
-
|
| 175 |
-
#
|
| 176 |
-
bbox = [
|
| 177 |
-
bbox[0] + block["table_bbox"][0],
|
| 178 |
-
bbox[1] + block["table_bbox"][1],
|
| 179 |
-
bbox[2] + block["table_bbox"][0],
|
| 180 |
-
bbox[3] + block["table_bbox"][1]
|
| 181 |
-
]
|
| 182 |
table_cells.append({
|
| 183 |
-
"bbox": bbox,
|
| 184 |
"text": line.text
|
| 185 |
})
|
| 186 |
block["table_text_lines"] = table_cells
|
|
|
|
| 1 |
+
import re
|
| 2 |
from collections import defaultdict
|
| 3 |
from typing import Annotated, List
|
| 4 |
|
|
|
|
| 62 |
table_data = []
|
| 63 |
for page in document.pages:
|
| 64 |
for block in page.contained_blocks(document, self.block_types):
|
| 65 |
+
image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size).expand(.01, .01)
|
| 66 |
image = page.highres_image.crop(image_poly.bbox).convert("RGB")
|
| 67 |
|
| 68 |
table_data.append({
|
|
|
|
| 71 |
"table_image": image,
|
| 72 |
"table_bbox": image_poly.bbox,
|
| 73 |
"img_size": page.highres_image.size,
|
| 74 |
+
"ocr_block": page.text_extraction_method == "surya",
|
| 75 |
})
|
| 76 |
|
| 77 |
extract_blocks = [t for t in table_data if not t["ocr_block"]]
|
|
|
|
| 134 |
|
| 135 |
for k in cell_text:
|
| 136 |
# TODO: see if the text needs to be sorted (based on rotation)
|
| 137 |
+
text = "\n".join([ct["text"] for ct in cell_text[k]])
|
| 138 |
+
# Replace . . . etc with ...
|
| 139 |
+
text = re.sub(r"(\s\.){3,}", "...", text) # Replace . . .
|
| 140 |
+
text = re.sub(r"\.{3,}", "...", text) # Replace ..., like in table of contents
|
| 141 |
+
table_cells[k].text = text
|
| 142 |
|
| 143 |
def assign_pdftext_lines(self, extract_blocks: list, filepath: str):
|
| 144 |
table_inputs = []
|
|
|
|
| 171 |
|
| 172 |
def assign_ocr_lines(self, ocr_blocks: list):
|
| 173 |
det_images = [t["table_image"] for t in ocr_blocks]
|
| 174 |
+
ocr_results: List[OCRResult] = self.recognition_model(
|
| 175 |
+
det_images,
|
| 176 |
+
[None] * len(det_images),
|
| 177 |
+
self.detection_model,
|
| 178 |
+
recognition_batch_size=self.get_recognition_batch_size(),
|
| 179 |
+
detection_batch_size=self.get_detector_batch_size()
|
| 180 |
+
)
|
| 181 |
|
| 182 |
for block, ocr_res in zip(ocr_blocks, ocr_results):
|
| 183 |
table_cells = []
|
| 184 |
for line in ocr_res.text_lines:
|
| 185 |
+
# Don't need to correct back to image size
|
| 186 |
+
# Table rec boxes are relative to the table
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
table_cells.append({
|
| 188 |
+
"bbox": line.bbox,
|
| 189 |
"text": line.text
|
| 190 |
})
|
| 191 |
block["table_text_lines"] = table_cells
|
marker/renderers/markdown.py
CHANGED
|
@@ -56,12 +56,14 @@ class Markdownify(MarkdownConverter):
|
|
| 56 |
def convert_table(self, el, text, convert_as_inline):
|
| 57 |
total_rows = len(el.find_all('tr'))
|
| 58 |
colspans = []
|
|
|
|
| 59 |
for row in el.find_all('tr'):
|
| 60 |
row_cols = 0
|
| 61 |
for cell in row.find_all(['td', 'th']):
|
| 62 |
colspan = int(cell.get('colspan', 1))
|
| 63 |
row_cols += colspan
|
| 64 |
colspans.append(row_cols)
|
|
|
|
| 65 |
total_cols = max(colspans)
|
| 66 |
|
| 67 |
grid = [[None for _ in range(total_cols)] for _ in range(total_rows)]
|
|
@@ -94,12 +96,13 @@ class Markdownify(MarkdownConverter):
|
|
| 94 |
if cell is not None:
|
| 95 |
col_widths[col_idx] = max(col_widths[col_idx], len(str(cell)))
|
| 96 |
|
| 97 |
-
|
| 98 |
-
markdown_lines.append('|' + '|'.join(f" {' ' * width} " for width in col_widths) + '|')
|
| 99 |
-
markdown_lines.append('|' + '|'.join('-' * (width + 2) for width in col_widths) + '|')
|
| 100 |
|
| 101 |
# Generate markdown rows
|
| 102 |
-
for row in grid:
|
|
|
|
|
|
|
|
|
|
| 103 |
line = []
|
| 104 |
for col_idx, cell in enumerate(row):
|
| 105 |
if cell is None:
|
|
@@ -108,6 +111,10 @@ class Markdownify(MarkdownConverter):
|
|
| 108 |
line.append(f" {cell}{' ' * padding} ")
|
| 109 |
markdown_lines.append('|' + '|'.join(line) + '|')
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
table_md = '\n'.join(markdown_lines)
|
| 112 |
return "\n\n" + table_md + "\n\n"
|
| 113 |
|
|
|
|
| 56 |
def convert_table(self, el, text, convert_as_inline):
|
| 57 |
total_rows = len(el.find_all('tr'))
|
| 58 |
colspans = []
|
| 59 |
+
is_header_row = []
|
| 60 |
for row in el.find_all('tr'):
|
| 61 |
row_cols = 0
|
| 62 |
for cell in row.find_all(['td', 'th']):
|
| 63 |
colspan = int(cell.get('colspan', 1))
|
| 64 |
row_cols += colspan
|
| 65 |
colspans.append(row_cols)
|
| 66 |
+
is_header_row.append(len(row.find_all('th')) == row_cols)
|
| 67 |
total_cols = max(colspans)
|
| 68 |
|
| 69 |
grid = [[None for _ in range(total_cols)] for _ in range(total_rows)]
|
|
|
|
| 96 |
if cell is not None:
|
| 97 |
col_widths[col_idx] = max(col_widths[col_idx], len(str(cell)))
|
| 98 |
|
| 99 |
+
add_header_line = lambda: markdown_lines.append('|' + '|'.join('-' * (width + 2) for width in col_widths) + '|')
|
|
|
|
|
|
|
| 100 |
|
| 101 |
# Generate markdown rows
|
| 102 |
+
for i, row in enumerate(grid):
|
| 103 |
+
if i == 1:
|
| 104 |
+
add_header_line()
|
| 105 |
+
|
| 106 |
line = []
|
| 107 |
for col_idx, cell in enumerate(row):
|
| 108 |
if cell is None:
|
|
|
|
| 111 |
line.append(f" {cell}{' ' * padding} ")
|
| 112 |
markdown_lines.append('|' + '|'.join(line) + '|')
|
| 113 |
|
| 114 |
+
# Handle one row tables
|
| 115 |
+
if total_rows == 1:
|
| 116 |
+
add_header_line()
|
| 117 |
+
|
| 118 |
table_md = '\n'.join(markdown_lines)
|
| 119 |
return "\n\n" + table_md + "\n\n"
|
| 120 |
|
tests/converters/test_table_converter.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from marker.converters.table import TableConverter
|
| 3 |
+
from marker.renderers.markdown import MarkdownOutput
|
| 4 |
+
from marker.util import classes_to_strings
|
| 5 |
+
|
| 6 |
+
def _table_converter(config, model_dict, renderer, temp_pdf):
    """Run ``TableConverter`` on ``temp_pdf`` and sanity-check its markdown.

    Shared body for the table-converter tests in this module. The converter
    is built from the supplied fixtures, invoked on ``temp_pdf.name``, and the
    resulting markdown is checked to be non-empty and to contain the word
    "cyclic".

    Args:
        config: Converter configuration, passed through as ``config``.
        model_dict: Passed through as the converter's ``artifact_dict``.
        renderer: Renderer class; converted to its string form with
            ``classes_to_strings`` before being handed to the converter.
        temp_pdf: Object whose ``name`` attribute is the path of the PDF to
            convert.
    """
    converter = TableConverter(
        artifact_dict=model_dict,
        processor_list=None,
        renderer=classes_to_strings([renderer])[0],
        config=config,
    )

    markdown_output: MarkdownOutput = converter(temp_pdf.name)
    markdown = markdown_output.markdown

    # The leftover debugging ``breakpoint()`` call was removed: it drops into
    # the debugger and stalls or fails non-interactive test runs.
    assert len(markdown) > 0
    # NOTE(review): "cyclic" is presumably text from a table on the selected
    # page of the fixture PDF -- confirm against the test document.
    assert "cyclic" in markdown
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [5]})
def test_table_converter(config, model_dict, renderer, temp_pdf):
    """Check table conversion to markdown with the ``page_range`` [5] config."""
    _table_converter(
        config=config,
        model_dict=model_dict,
        renderer=renderer,
        temp_pdf=temp_pdf,
    )
|
| 26 |
+
|
| 27 |
+
@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [5], "force_ocr": True})
def test_table_converter_ocr(config, model_dict, renderer, temp_pdf):
    """Check table conversion to markdown with ``force_ocr`` enabled.

    This test needs its own name: when it reused the name of the non-OCR
    test, this definition replaced the earlier one at import time and pytest
    collected only one of the two tests.
    """
    _table_converter(config, model_dict, renderer, temp_pdf)
|
| 31 |
+
|