Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

Vik Paruchuri committed on Jan 10

Commit

66acddd

1 Parent(s): 72f1d99

Fix header row issues

Browse files

Files changed (3) hide show

marker/processors/table.py +18 -13
marker/renderers/markdown.py +11 -4
tests/converters/test_table_converter.py +31 -0

marker/processors/table.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from collections import defaultdict
 from typing import Annotated, List
@@ -61,7 +62,7 @@ class TableProcessor(BaseProcessor):
         table_data = []
         for page in document.pages:
             for block in page.contained_blocks(document, self.block_types):
-                image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size)
                 image = page.highres_image.crop(image_poly.bbox).convert("RGB")
                 table_data.append({
@@ -70,7 +71,7 @@ class TableProcessor(BaseProcessor):
                     "table_image": image,
                     "table_bbox": image_poly.bbox,
                     "img_size": page.highres_image.size,
-                    "ocr_block": block.text_extraction_method == "surya",
                 })
         extract_blocks = [t for t in table_data if not t["ocr_block"]]
@@ -133,7 +134,11 @@ class TableProcessor(BaseProcessor):
             for k in cell_text:
                 # TODO: see if the text needs to be sorted (based on rotation)
-                table_cells[k].text = "\n".join([ct["text"] for ct in cell_text[k]])
     def assign_pdftext_lines(self, extract_blocks: list, filepath: str):
         table_inputs = []
@@ -166,21 +171,21 @@ class TableProcessor(BaseProcessor):
     def assign_ocr_lines(self, ocr_blocks: list):
         det_images = [t["table_image"] for t in ocr_blocks]
-        ocr_results: List[OCRResult] = self.recognition_model(det_images, [None] * len(det_images), self.detection_model, recognition_batch_size=self.get_recognition_batch_size(), detection_batch_size=self.get_detector_batch_size())
         for block, ocr_res in zip(ocr_blocks, ocr_results):
             table_cells = []
             for line in ocr_res.text_lines:
-                bbox = line.bbox
-                # Correct back to image size
-                bbox = [
-                    bbox[0] + block["table_bbox"][0],
-                    bbox[1] + block["table_bbox"][1],
-                    bbox[2] + block["table_bbox"][0],
-                    bbox[3] + block["table_bbox"][1]
-                ]
                 table_cells.append({
-                    "bbox": bbox,
                     "text": line.text
                 })
             block["table_text_lines"] = table_cells

+import re
 from collections import defaultdict
 from typing import Annotated, List
         table_data = []
         for page in document.pages:
             for block in page.contained_blocks(document, self.block_types):
+                image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size).expand(.01, .01)
                 image = page.highres_image.crop(image_poly.bbox).convert("RGB")
                 table_data.append({
                     "table_image": image,
                     "table_bbox": image_poly.bbox,
                     "img_size": page.highres_image.size,
+                    "ocr_block": page.text_extraction_method == "surya",
                 })
         extract_blocks = [t for t in table_data if not t["ocr_block"]]
             for k in cell_text:
                 # TODO: see if the text needs to be sorted (based on rotation)
+                text = "\n".join([ct["text"] for ct in cell_text[k]])
+                # Replace . . . etc with ...
+                text = re.sub(r"(\s\.){3,}", "...", text) # Replace . . .
+                text = re.sub(r"\.{3,}", "...", text) # Replace ..., like in table of contents
+                table_cells[k].text = text
     def assign_pdftext_lines(self, extract_blocks: list, filepath: str):
         table_inputs = []
     def assign_ocr_lines(self, ocr_blocks: list):
         det_images = [t["table_image"] for t in ocr_blocks]
+        ocr_results: List[OCRResult] = self.recognition_model(
+            det_images,
+            [None] * len(det_images),
+            self.detection_model,
+            recognition_batch_size=self.get_recognition_batch_size(),
+            detection_batch_size=self.get_detector_batch_size()
+        )
         for block, ocr_res in zip(ocr_blocks, ocr_results):
             table_cells = []
             for line in ocr_res.text_lines:
+                # Don't need to correct back to image size
+                # Table rec boxes are relative to the table
                 table_cells.append({
+                    "bbox": line.bbox,
                     "text": line.text
                 })
             block["table_text_lines"] = table_cells

marker/renderers/markdown.py CHANGED Viewed

@@ -56,12 +56,14 @@ class Markdownify(MarkdownConverter):
     def convert_table(self, el, text, convert_as_inline):
         total_rows = len(el.find_all('tr'))
         colspans = []
         for row in el.find_all('tr'):
             row_cols = 0
             for cell in row.find_all(['td', 'th']):
                 colspan = int(cell.get('colspan', 1))
                 row_cols += colspan
             colspans.append(row_cols)
         total_cols = max(colspans)
         grid = [[None for _ in range(total_cols)] for _ in range(total_rows)]
@@ -94,12 +96,13 @@ class Markdownify(MarkdownConverter):
                 if cell is not None:
                     col_widths[col_idx] = max(col_widths[col_idx], len(str(cell)))
-        # Generate header and separator
-        markdown_lines.append('|' + '|'.join(f" {' ' * width} " for width in col_widths) + '|')
-        markdown_lines.append('|' + '|'.join('-' * (width + 2) for width in col_widths) + '|')
         # Generate markdown rows
-        for row in grid:
             line = []
             for col_idx, cell in enumerate(row):
                 if cell is None:
@@ -108,6 +111,10 @@ class Markdownify(MarkdownConverter):
                 line.append(f" {cell}{' ' * padding} ")
             markdown_lines.append('|' + '|'.join(line) + '|')
         table_md = '\n'.join(markdown_lines)
         return "\n\n" + table_md + "\n\n"

     def convert_table(self, el, text, convert_as_inline):
         total_rows = len(el.find_all('tr'))
         colspans = []
+        is_header_row = []
         for row in el.find_all('tr'):
             row_cols = 0
             for cell in row.find_all(['td', 'th']):
                 colspan = int(cell.get('colspan', 1))
                 row_cols += colspan
             colspans.append(row_cols)
+            is_header_row.append(len(row.find_all('th')) == row_cols)
         total_cols = max(colspans)
         grid = [[None for _ in range(total_cols)] for _ in range(total_rows)]
                 if cell is not None:
                     col_widths[col_idx] = max(col_widths[col_idx], len(str(cell)))
+        add_header_line = lambda: markdown_lines.append('|' + '|'.join('-' * (width + 2) for width in col_widths) + '|')
         # Generate markdown rows
+        for i, row in enumerate(grid):
+            if i == 1:
+                add_header_line()
             line = []
             for col_idx, cell in enumerate(row):
                 if cell is None:
                 line.append(f" {cell}{' ' * padding} ")
             markdown_lines.append('|' + '|'.join(line) + '|')
+        # Handle one row tables
+        if total_rows == 1:
+            add_header_line()
         table_md = '\n'.join(markdown_lines)
         return "\n\n" + table_md + "\n\n"

tests/converters/test_table_converter.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import pytest
+from marker.converters.table import TableConverter
+from marker.renderers.markdown import MarkdownOutput
+from marker.util import classes_to_strings
+def _table_converter(config, model_dict, renderer, temp_pdf):
+    """Convert ``temp_pdf`` with TableConverter and sanity-check the markdown.
+    Shared body for the table converter tests: builds the converter from the
+    given fixtures, runs it on ``temp_pdf.name``, and asserts that the rendered
+    markdown is non-empty and contains the word "cyclic".
+    """
+    converter = TableConverter(
+        artifact_dict=model_dict,
+        processor_list=None,
+        renderer=classes_to_strings([renderer])[0],
+        config=config
+    )
+    markdown_output: MarkdownOutput = converter(temp_pdf.name)
+    markdown = markdown_output.markdown
+    assert len(markdown) > 0
+    assert "cyclic" in markdown
+@pytest.mark.output_format("markdown")
+@pytest.mark.config({"page_range": [5]})
+def test_table_converter(config, model_dict, renderer, temp_pdf):
+    """Table conversion with only ``page_range`` configured."""
+    _table_converter(config, model_dict, renderer, temp_pdf)
+@pytest.mark.output_format("markdown")
+@pytest.mark.config({"page_range": [5], "force_ocr": True})
+def test_table_converter_ocr(config, model_dict, renderer, temp_pdf):
+    """Same check with ``force_ocr`` enabled.
+    Uses its own name so it does not rebind ``test_table_converter`` and
+    both variants are collected by pytest.
+    """
+    _table_converter(config, model_dict, renderer, temp_pdf)