Vik Paruchuri committed on
Commit
66acddd
·
1 Parent(s): 72f1d99

Fix header row issues

Browse files
marker/processors/table.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from collections import defaultdict
2
  from typing import Annotated, List
3
 
@@ -61,7 +62,7 @@ class TableProcessor(BaseProcessor):
61
  table_data = []
62
  for page in document.pages:
63
  for block in page.contained_blocks(document, self.block_types):
64
- image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size)
65
  image = page.highres_image.crop(image_poly.bbox).convert("RGB")
66
 
67
  table_data.append({
@@ -70,7 +71,7 @@ class TableProcessor(BaseProcessor):
70
  "table_image": image,
71
  "table_bbox": image_poly.bbox,
72
  "img_size": page.highres_image.size,
73
- "ocr_block": block.text_extraction_method == "surya",
74
  })
75
 
76
  extract_blocks = [t for t in table_data if not t["ocr_block"]]
@@ -133,7 +134,11 @@ class TableProcessor(BaseProcessor):
133
 
134
  for k in cell_text:
135
  # TODO: see if the text needs to be sorted (based on rotation)
136
- table_cells[k].text = "\n".join([ct["text"] for ct in cell_text[k]])
 
 
 
 
137
 
138
  def assign_pdftext_lines(self, extract_blocks: list, filepath: str):
139
  table_inputs = []
@@ -166,21 +171,21 @@ class TableProcessor(BaseProcessor):
166
 
167
  def assign_ocr_lines(self, ocr_blocks: list):
168
  det_images = [t["table_image"] for t in ocr_blocks]
169
- ocr_results: List[OCRResult] = self.recognition_model(det_images, [None] * len(det_images), self.detection_model, recognition_batch_size=self.get_recognition_batch_size(), detection_batch_size=self.get_detector_batch_size())
 
 
 
 
 
 
170
 
171
  for block, ocr_res in zip(ocr_blocks, ocr_results):
172
  table_cells = []
173
  for line in ocr_res.text_lines:
174
- bbox = line.bbox
175
- # Correct back to image size
176
- bbox = [
177
- bbox[0] + block["table_bbox"][0],
178
- bbox[1] + block["table_bbox"][1],
179
- bbox[2] + block["table_bbox"][0],
180
- bbox[3] + block["table_bbox"][1]
181
- ]
182
  table_cells.append({
183
- "bbox": bbox,
184
  "text": line.text
185
  })
186
  block["table_text_lines"] = table_cells
 
1
+ import re
2
  from collections import defaultdict
3
  from typing import Annotated, List
4
 
 
62
  table_data = []
63
  for page in document.pages:
64
  for block in page.contained_blocks(document, self.block_types):
65
+ image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size).expand(.01, .01)
66
  image = page.highres_image.crop(image_poly.bbox).convert("RGB")
67
 
68
  table_data.append({
 
71
  "table_image": image,
72
  "table_bbox": image_poly.bbox,
73
  "img_size": page.highres_image.size,
74
+ "ocr_block": page.text_extraction_method == "surya",
75
  })
76
 
77
  extract_blocks = [t for t in table_data if not t["ocr_block"]]
 
134
 
135
  for k in cell_text:
136
  # TODO: see if the text needs to be sorted (based on rotation)
137
+ text = "\n".join([ct["text"] for ct in cell_text[k]])
138
+ # Replace . . . etc with ...
139
+ text = re.sub(r"(\s\.){3,}", "...", text) # Replace . . .
140
+ text = re.sub(r"\.{3,}", "...", text) # Replace ..., like in table of contents
141
+ table_cells[k].text = text
142
 
143
  def assign_pdftext_lines(self, extract_blocks: list, filepath: str):
144
  table_inputs = []
 
171
 
172
  def assign_ocr_lines(self, ocr_blocks: list):
173
  det_images = [t["table_image"] for t in ocr_blocks]
174
+ ocr_results: List[OCRResult] = self.recognition_model(
175
+ det_images,
176
+ [None] * len(det_images),
177
+ self.detection_model,
178
+ recognition_batch_size=self.get_recognition_batch_size(),
179
+ detection_batch_size=self.get_detector_batch_size()
180
+ )
181
 
182
  for block, ocr_res in zip(ocr_blocks, ocr_results):
183
  table_cells = []
184
  for line in ocr_res.text_lines:
185
+ # Don't need to correct back to image size
186
+ # Table rec boxes are relative to the table
 
 
 
 
 
 
187
  table_cells.append({
188
+ "bbox": line.bbox,
189
  "text": line.text
190
  })
191
  block["table_text_lines"] = table_cells
marker/renderers/markdown.py CHANGED
@@ -56,12 +56,14 @@ class Markdownify(MarkdownConverter):
56
  def convert_table(self, el, text, convert_as_inline):
57
  total_rows = len(el.find_all('tr'))
58
  colspans = []
 
59
  for row in el.find_all('tr'):
60
  row_cols = 0
61
  for cell in row.find_all(['td', 'th']):
62
  colspan = int(cell.get('colspan', 1))
63
  row_cols += colspan
64
  colspans.append(row_cols)
 
65
  total_cols = max(colspans)
66
 
67
  grid = [[None for _ in range(total_cols)] for _ in range(total_rows)]
@@ -94,12 +96,13 @@ class Markdownify(MarkdownConverter):
94
  if cell is not None:
95
  col_widths[col_idx] = max(col_widths[col_idx], len(str(cell)))
96
 
97
- # Generate header and separator
98
- markdown_lines.append('|' + '|'.join(f" {' ' * width} " for width in col_widths) + '|')
99
- markdown_lines.append('|' + '|'.join('-' * (width + 2) for width in col_widths) + '|')
100
 
101
  # Generate markdown rows
102
- for row in grid:
 
 
 
103
  line = []
104
  for col_idx, cell in enumerate(row):
105
  if cell is None:
@@ -108,6 +111,10 @@ class Markdownify(MarkdownConverter):
108
  line.append(f" {cell}{' ' * padding} ")
109
  markdown_lines.append('|' + '|'.join(line) + '|')
110
 
 
 
 
 
111
  table_md = '\n'.join(markdown_lines)
112
  return "\n\n" + table_md + "\n\n"
113
 
 
56
  def convert_table(self, el, text, convert_as_inline):
57
  total_rows = len(el.find_all('tr'))
58
  colspans = []
59
+ is_header_row = []
60
  for row in el.find_all('tr'):
61
  row_cols = 0
62
  for cell in row.find_all(['td', 'th']):
63
  colspan = int(cell.get('colspan', 1))
64
  row_cols += colspan
65
  colspans.append(row_cols)
66
+ is_header_row.append(len(row.find_all('th')) == row_cols)
67
  total_cols = max(colspans)
68
 
69
  grid = [[None for _ in range(total_cols)] for _ in range(total_rows)]
 
96
  if cell is not None:
97
  col_widths[col_idx] = max(col_widths[col_idx], len(str(cell)))
98
 
99
+ add_header_line = lambda: markdown_lines.append('|' + '|'.join('-' * (width + 2) for width in col_widths) + '|')
 
 
100
 
101
  # Generate markdown rows
102
+ for i, row in enumerate(grid):
103
+ if i == 1:
104
+ add_header_line()
105
+
106
  line = []
107
  for col_idx, cell in enumerate(row):
108
  if cell is None:
 
111
  line.append(f" {cell}{' ' * padding} ")
112
  markdown_lines.append('|' + '|'.join(line) + '|')
113
 
114
+ # Handle one row tables
115
+ if total_rows == 1:
116
+ add_header_line()
117
+
118
  table_md = '\n'.join(markdown_lines)
119
  return "\n\n" + table_md + "\n\n"
120
 
tests/converters/test_table_converter.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import pytest
from marker.converters.table import TableConverter
from marker.renderers.markdown import MarkdownOutput
from marker.util import classes_to_strings


def _table_converter(config, model_dict, renderer, temp_pdf):
    """Run ``TableConverter`` on the fixture PDF and check its markdown output.

    Shared body for the test variants below. Builds a converter from the
    supplied fixtures, converts ``temp_pdf.name``, and asserts that the
    resulting markdown is non-empty and contains the word "cyclic".
    """
    converter = TableConverter(
        artifact_dict=model_dict,
        processor_list=None,
        renderer=classes_to_strings([renderer])[0],
        config=config
    )

    markdown_output: MarkdownOutput = converter(temp_pdf.name)
    markdown = markdown_output.markdown

    # No breakpoint() here: a stray debugger call stalls or errors
    # non-interactive test runs.
    assert len(markdown) > 0
    # NOTE(review): presumably "cyclic" appears in a table on the selected
    # page of the fixture PDF -- confirm against the fixture.
    assert "cyclic" in markdown


@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [5]})
def test_table_converter(config, model_dict, renderer, temp_pdf):
    """Table conversion with the default configuration for page_range [5]."""
    _table_converter(config, model_dict, renderer, temp_pdf)


# Named distinctly from the test above: two module-level functions with the
# same name would leave only the last one visible to pytest collection.
@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [5], "force_ocr": True})
def test_table_converter_ocr(config, model_dict, renderer, temp_pdf):
    """Table conversion for page_range [5] with ``force_ocr`` enabled."""
    _table_converter(config, model_dict, renderer, temp_pdf)