Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

Vik Paruchuri committed on Aug 14, 2024

Commit

d090d63

1 Parent(s): c85fe35

Improve table benchmark, parsing

Browse files

Files changed (6) hide show

README.md +8 -0
benchmarks/table.py +3 -1
marker/benchmark/table.py +29 -12
marker/tables/cells.py +105 -105
marker/tables/table.py +0 -1
marker/tables/utils.py +1 -1

README.md CHANGED Viewed

@@ -209,6 +209,14 @@ This will benchmark marker against other text extraction methods.  It sets up ba
 Omit `--nougat` to exclude nougat from the benchmark.  I don't recommend running nougat on CPU, since it is very slow.
 # Thanks
 This work would not have been possible without amazing open source models and datasets, including (but not limited to):

 Omit `--nougat` to exclude nougat from the benchmark.  I don't recommend running nougat on CPU, since it is very slow.
+### Table benchmark
+There is a benchmark for table parsing, which you can run with:
+```shell
+python benchmarks/table.py test_data/tables.json
+```
 # Thanks
 This work would not have been possible without amazing open source models and datasets, including (but not limited to):

benchmarks/table.py CHANGED Viewed

@@ -3,6 +3,7 @@ import json
 import datasets
 from surya.schema import LayoutResult, LayoutBox
 from marker.benchmark.table import score_table
 from marker.schema.bbox import rescale_bbox
@@ -20,7 +21,7 @@ def main():
     ds = datasets.load_dataset(args.dataset, split="train")
     results = []
-    for i in range(len(ds)):
         row = ds[i]
         marker_page = Page(**json.loads(row["marker_page"]))
         table_bbox = row["table_bbox"]
@@ -55,6 +56,7 @@ def main():
         table_block = table_blocks[0]
         table_md = table_block.lines[0].spans[0].text
         results.append({
             "score": score_table(table_md, gpt4_table),
             "arxiv_id": row["arxiv_id"],

 import datasets
 from surya.schema import LayoutResult, LayoutBox
+from tqdm import tqdm
 from marker.benchmark.table import score_table
 from marker.schema.bbox import rescale_bbox
     ds = datasets.load_dataset(args.dataset, split="train")
     results = []
+    for i in tqdm(range(len(ds)), desc="Evaluating tables"):
         row = ds[i]
         marker_page = Page(**json.loads(row["marker_page"]))
         table_bbox = row["table_bbox"]
         table_block = table_blocks[0]
         table_md = table_block.lines[0].spans[0].text
         results.append({
             "score": score_table(table_md, gpt4_table),
             "arxiv_id": row["arxiv_id"],

marker/benchmark/table.py CHANGED Viewed

@@ -2,23 +2,40 @@ from rapidfuzz import fuzz
 import re
-def split_to_rows(table):
     table = table.strip()
     table = re.sub(r" {2,}", "", table)
     table_rows = table.split("\n")
-    return [t for t in table_rows if t.strip()]
 def score_table(hypothesis, reference):
-    hypothesis = split_to_rows(hypothesis)
-    reference = split_to_rows(reference)
     alignments = []
-    for row in reference:
-        max_alignment = 0
-        for hrow in hypothesis:
-            alignment = fuzz.ratio(hrow, row, score_cutoff=30) / 100
-            if alignment > max_alignment:
-                max_alignment = alignment
-        alignments.append(max_alignment)
-    return sum(alignments) / len(reference)

 import re
def split_to_cells(table):
    """Split a pipe-delimited table string into rows of cell strings.

    The text is stripped, every run of two or more spaces is deleted
    outright (not collapsed to a single space), and the result is split
    into lines.  Blank lines are dropped and each remaining line is split
    on "|".  Cells are not stripped, so a line that begins or ends with
    "|" produces an empty-string cell at that end.

    Returns a list of rows, each row being a list of cell strings.
    """
    squeezed = re.sub(r" {2,}", "", table.strip())
    return [line.split("|") for line in squeezed.split("\n") if line.strip()]
def align_rows(hypothesis, ref_row):
    """Return per-cell scores of the hypothesis row that best matches ref_row.

    Cells are compared positionally: cell i of a hypothesis row is scored
    against cell i of ``ref_row`` with ``fuzz.ratio(..., score_cutoff=30)``
    divided by 100.  A reference cell with no counterpart (the hypothesis
    row is shorter) scores 0.  A row's overall score is the mean of its
    cell scores; on ties the later hypothesis row wins.

    Args:
        hypothesis: list of rows, each a list of cell strings.
        ref_row: list of cell strings for one reference row.

    Returns:
        A list with one score per cell of ``ref_row``.  When ``hypothesis``
        has no rows, every cell scores 0, so an unmatched reference row is
        counted as a complete miss instead of being dropped from the
        caller's average.
    """
    # Default to all zeros: with no candidate rows the reference row is a miss.
    best_alignment = [0] * len(ref_row)
    best_alignment_score = 0
    if not ref_row:
        # Nothing to score; also avoids a division by zero below.
        return best_alignment

    for hyp_row in hypothesis:
        alignments = [
            fuzz.ratio(hyp_row[i], ref_cell, score_cutoff=30) / 100 if i < len(hyp_row) else 0
            for i, ref_cell in enumerate(ref_row)
        ]
        alignment_score = sum(alignments) / len(alignments)
        # ">=" keeps the original tie-breaking: the last best row is kept.
        if alignment_score >= best_alignment_score:
            best_alignment = alignments
            best_alignment_score = alignment_score
    return best_alignment
 def score_table(hypothesis, reference):
+    hypothesis = split_to_cells(hypothesis)
+    reference = split_to_cells(reference)
     alignments = []
+    for i in range(0, len(reference)):
+        alignments.extend(align_rows(hypothesis, reference[i]))
+    return sum(alignments) / len(alignments)

marker/tables/cells.py CHANGED Viewed

@@ -1,116 +1,116 @@
 from PIL import Image, ImageDraw
 import copy
 from marker.tables.edges import get_vertical_lines
 import numpy as np
-def get_column_lines(page, table_box, table_rows, align="l", y_tolerance=10, x_tolerance=4):
-    table_height = (table_box[3] - table_box[1]) * 2
-    table_width = table_box[2] - table_box[0]
-    img_size = (int(table_width), int(table_height))
-    draw_img = Image.new("RGB", img_size)
-    draw = ImageDraw.Draw(draw_img)
-    for row in table_rows:
         for cell in row:
-            line_bbox = list(copy.deepcopy(cell[0]))
-            match align:
-                case "l":
-                    line_bbox[2] = line_bbox[0]
-                case "r":
-                    line_bbox[0] = line_bbox[2]
-                case "c":
-                    line_bbox[0] = line_bbox[0] + (line_bbox[2] - line_bbox[0]) / 2
-                    line_bbox[2] = line_bbox[0]
-            line_bbox[1] -= y_tolerance
-            line_bbox[3] += y_tolerance
-            line_bbox[0] -= table_box[0]
-            line_bbox[2] -= table_box[0]
-            line_bbox[1] -= table_box[1]
-            line_bbox[3] -= table_box[1]
-            draw.rectangle(line_bbox, outline="red", width=x_tolerance)
-    np_img = np.array(draw_img, dtype=np.float32) / 255.0
-    columns = get_vertical_lines(np_img, divisor=2, x_tolerance=10, y_tolerance=1)
-    columns = sorted(columns, key=lambda x: x[0])
-    # Remove short columns (single cells, probably)
-    # Rescale coordinates back to image
-    rescaled = []
-    for c in columns:
-        if c[3] - c[1] < table_height / 5:
-            continue
-        c[0] += table_box[0]
-        c[2] += table_box[0]
-        c[1] += table_box[1]
-        c[3] += table_box[1]
-        rescaled.append(c)
-    return rescaled
-def assign_cells_to_columns(page, table_box, rows, tolerance=5):
-    alignments = ["l", "r", "c"]
-    columns = {}
-    for align in alignments:
-        columns[align] = get_column_lines(page, table_box, rows, align=align)
-    # Find the column alignment that is closest to the number of columns
-    max_cols = max([len(r) for r in rows])
-    columns = min(columns.items(), key=lambda x: abs(len(x) - max_cols))[1]
-    formatted_rows = []
-    for table_row in rows:
-        formatted_row = []
-        for cell_idx in range(len(table_row) - 1, -1, -1):
-            cell = copy.deepcopy(table_row[cell_idx])
-            cell_bbox = cell[0]
-            found = False
-            for j in range(len(columns) - 1, -1, -1):
-                if columns[j][0] - tolerance < cell_bbox[0]:
-                    if len(formatted_row) > 0:
-                        prev_column = formatted_row[-1][0]
-                        blanks = prev_column - j
-                        if blanks > 1:
-                            for b in range(1, blanks):
-                                formatted_row.append([prev_column - b, ""])
-                    formatted_row.append([j, cell[1]])
-                    found = True
                     break
-            if not found:
-                formatted_row.append([cell_idx, cell[1]])
-        formatted_rows.append(formatted_row[::-1])
-    # Ensure rows have sequential column indices
-    # Also identify the total number of columns
-    col_count = 0
-    for row in formatted_rows:
-        prev_col = -1
-        for col in row:
-            col_idx = col[0]
-            if col_idx <= prev_col:
-                col[0] = prev_col + 1
-            prev_col = col[0]
-            col_count = max(col_count, col[0] + 1)
-    # Assign cells to correct column positions
-    clean_rows = []
-    for row in formatted_rows:
-        clean_row = []
-        for col in range(col_count):
-            found = False
-            for cell in row:
-                if cell[0] == col:
-                    clean_row.append(cell)
-                    found = True
-                    break
-            if not found:
-                clean_row.append((col, ""))
-        clean_rows.append([cell[1] for cell in clean_row])
-    max_cols = max([len(r) for r in clean_rows])
-    for row in clean_rows:
-        while len(row) < max_cols:
             row.append("")
-    return clean_rows

 from PIL import Image, ImageDraw
 import copy
+from marker.schema.bbox import rescale_bbox, box_intersection_pct
+from marker.schema.page import Page
 from marker.tables.edges import get_vertical_lines
 import numpy as np
+from sklearn.cluster import DBSCAN
+from marker.settings import settings
def cluster_coords(coords, row_count):
    """Cluster 1-D coordinates with DBSCAN and return sorted cluster means.

    Args:
        coords: sequence of numeric coordinates; duplicates are removed
            before clustering.
        row_count: number of table rows; ``max(2, row_count // 4)`` is used
            as DBSCAN's ``min_samples``.

    Returns:
        A sorted list holding the mean coordinate of every distinct label
        DBSCAN assigned, or an empty list when ``coords`` is empty.
    """
    if len(coords) == 0:
        return []

    # DBSCAN expects a 2-D array, so the unique sorted values become a column.
    points = np.array(sorted(set(coords))).reshape(-1, 1)
    min_samples = max(2, row_count // 4)
    labels = DBSCAN(eps=.01, min_samples=min_samples).fit(points).labels_

    # NOTE(review): every distinct label is averaged, so if DBSCAN marks
    # outliers with a dedicated noise label they are merged into a single
    # separator as well - confirm that this is intended.
    return sorted(np.mean(points[labels == label]) for label in set(labels))
def find_column_separators(page: Page, table_box, rows, round_factor=.002, min_count=1):
    """Estimate column separator x-positions for a table on a page.

    The page's detected text lines are rescaled from
    ``page.text_lines.image_bbox`` to ``page.bbox`` and kept when their
    intersection with ``table_box`` exceeds
    ``settings.BBOX_INTERSECTION_THRESH``.  For every kept line the left
    edge, right edge and horizontal center are divided by the page width,
    snapped to a multiple of ``round_factor``, filtered to values that occur
    more than ``min_count`` times, and clustered with ``cluster_coords``.

    Args:
        page: page providing ``bbox`` and ``text_lines``.
        table_box: bounding box of the table.
        rows: table rows; only ``len(rows)`` is used (passed to clustering).
        round_factor: grid size, as a fraction of page width, that
            coordinates are snapped to before counting repeats.
        min_count: a snapped coordinate must occur more than this many
            times to be considered.

    Returns:
        A list of separator positions: 0, then the clustering result with
        the most separators (left edges win ties, then right edges, then
        centers), then 1.
    """
    from collections import Counter

    line_boxes = [p.bbox for p in page.text_lines.bboxes]
    line_boxes = [rescale_bbox(page.text_lines.image_bbox, page.bbox, box) for box in line_boxes]
    line_boxes = [box for box in line_boxes if box_intersection_pct(box, table_box) > settings.BBOX_INTERSECTION_THRESH]

    pwidth = page.bbox[2] - page.bbox[0]

    def snap(value):
        # Round to the nearest multiple of round_factor.  The previous
        # "value / round_factor * round_factor" left the value unchanged,
        # so near-identical edges were never counted as repeats.
        return round(value / round_factor) * round_factor

    def repeated(values):
        # Keep only values seen more than min_count times (duplicates kept).
        counts = Counter(values)
        return [v for v in values if counts[v] > min_count]

    left_edges = []
    right_edges = []
    centers = []
    for box in line_boxes:
        left = box[0] / pwidth
        right = box[2] / pwidth
        left_edges.append(snap(left))
        right_edges.append(snap(right))
        centers.append(snap((left + right) / 2))

    candidates = [
        cluster_coords(repeated(edges), len(rows))
        for edges in (left_edges, right_edges, centers)
    ]

    # Use the alignment that produced the most separators.
    separators = max(candidates, key=len)
    # Bracket the separators with the page's left (0) and right (1) borders.
    return [0, *separators, 1]
def assign_cells_to_columns(page, table_box, rows, round_factor=.002, tolerance=.01):
    """Arrange table cells into a rectangular grid of column-aligned text.

    Args:
        page: page providing ``bbox``; also passed to
            ``find_column_separators``.
        table_box: bounding box of the table.
        rows: list of rows; each row is a list of cells where ``cell[0]`` is
            the cell's bbox and ``cell[1]`` its text.
        round_factor: forwarded to ``find_column_separators``.
        tolerance: slack, as a fraction of page width, allowed when
            comparing a cell's left edge against a separator.

    Returns:
        A list of rows of strings, all of equal length, with "" for blank
        cells and with columns that are empty in every row removed.
        Returns an empty list when ``rows`` is empty.
    """
    if len(rows) == 0:
        # Nothing to lay out; previously this raised ValueError from max().
        return []

    separators = find_column_separators(page, table_box, rows, round_factor=round_factor)
    pwidth = page.bbox[2] - page.bbox[0]

    row_dicts = []
    for row in rows:
        new_row = {}
        last_col_index = -1
        # Counts cells in this row that fell past the last separator.
        additional_column_index = 0
        for cell in row:
            left_edge = cell[0][0] / pwidth
            column_index = -1
            # First separator right of the cell (within tolerance) that is
            # also beyond the column used by the previous cell in this row.
            for i, separator in enumerate(separators):
                if left_edge - tolerance < separator and last_col_index < i:
                    column_index = i
                    break
            if column_index == -1:
                # No separator fits: append as an extra trailing column.
                column_index = len(separators) + additional_column_index
                additional_column_index += 1
            new_row[column_index] = cell[1]
            last_col_index = column_index
        row_dicts.append(new_row)

    # A row without cells contributes no column index (default avoids ValueError).
    max_col_idx = max(max(row.keys(), default=0) for row in row_dicts)

    # Flatten each row to columns 1..max_col_idx, filling blanks with "".
    # Every flattened row therefore already has the same length.
    # NOTE(review): index 0 is skipped, so a cell that matched separators[0]
    # (the leading 0 added by find_column_separators) is not emitted -
    # confirm that this is intended.
    new_rows = [
        [row.get(col_idx, "") for col_idx in range(1, max_col_idx + 1)]
        for row in row_dicts
    ]

    # Drop columns that contain no text in any row.
    cols_to_remove = {
        idx
        for idx, col in enumerate(zip(*new_rows))
        if not any(cell.strip() for cell in col)
    }
    return [
        [cell for idx, cell in enumerate(row) if idx not in cols_to_remove]
        for row in new_rows
    ]

marker/tables/table.py CHANGED Viewed

@@ -7,7 +7,6 @@ from typing import List
 from marker.settings import settings
 from marker.tables.cells import assign_cells_to_columns
 from marker.tables.utils import sort_table_blocks, replace_dots, replace_newlines
-from marker.schema.bbox import BboxElement
 def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]:

 from marker.settings import settings
 from marker.tables.cells import assign_cells_to_columns
 from marker.tables.utils import sort_table_blocks, replace_dots, replace_newlines
 def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]:

marker/tables/utils.py CHANGED Viewed

@@ -34,4 +34,4 @@ def replace_dots(text):
 def replace_newlines(text):
     # Replace all newlines
     newline_pattern = re.compile(r'[\r\n]+')
-    return newline_pattern.sub(' ', text.strip())

 def replace_newlines(text):
     """Replace each run of CR/LF characters with one space, then strip the result."""
     # Stripping happens after substitution, so leading/trailing newlines
     # do not survive as spaces at either end.
     return re.sub(r'[\r\n]+', ' ', text).strip()