Vik Paruchuri committed
Commit · cc1d60d
1 Parent(s): c57d999

Refactor benchmarks
Browse files:
- benchmarks/overall/__init__.py +0 -0
- benchmarks/overall/overall.py +54 -26
- benchmarks/table/__init__.py +0 -0
- benchmarks/table/inference.py +139 -0
- benchmarks/table/table.py +5 -126
- benchmarks/verify_scores.py +1 -1
benchmarks/overall/__init__.py ADDED

File without changes
benchmarks/overall/overall.py CHANGED

@@ -1,13 +1,14 @@
 import json
 import os
-import traceback
 from collections import defaultdict
 from pathlib import Path
+from typing import Dict

 import click
 import datasets
 import tabulate
 from tqdm import tqdm
+import pypdfium2 as pdfium

 from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func
 from benchmarks.overall.schema import FullResult
@@ -28,12 +29,17 @@ def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_func

         gt_blocks = json.loads(sample["gt_blocks"])
         doc_type = sample["classification"]
+
         try:
             gt_html = [block["html"] for block in gt_blocks]
             scores = score_func(model_dict, sample, gt_html, **kwargs)
         except ValueError as e:
             print(f"Error with sample {idx}: {e}")
             continue
+        except pdfium.PdfiumError as e:
+            print(f"Error opening pdf: {e}")
+            continue
+
         averages_by_type[doc_type].append(scores["overall_score"])

         for score, gt_block in zip(scores["scores"], gt_blocks):
@@ -50,27 +56,48 @@ def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_func
         "average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores)
     }

-def print_scores(scores: FullResult, …
-…
-    for k in …
-…
-    print(…
-…
+def print_scores(scores: Dict[str, FullResult], out_path: Path, default_method="marker"):
+    inference_types = [default_method] + [k for k in scores.keys() if k != default_method]
+
+    document_types = list(scores[default_method]["averages_by_type"].keys())
+    document_rows = [[k] for k in document_types]
+    for k in inference_types:
+        for i, doc_type in enumerate(document_types):
+            avg = sum(scores[k]["averages_by_type"][doc_type]) / max(1, len(scores[k]["averages_by_type"][doc_type]))
+            document_rows[i].append(avg)
+
+    print("Document types")
+    document_type_table = tabulate.tabulate(document_rows, headers=["Document Type"] + inference_types, tablefmt="github")
+    print(document_type_table)
+    with open(out_path / "document_types.md", "w", encoding="utf-8") as f:
+        f.write(document_type_table)
+
+    block_types = list(scores[default_method]["averages_by_block_type"].keys())
+    block_rows = [[k] for k in block_types]
+    for k in inference_types:
+        for i, block_type in enumerate(block_types):
+            avg = sum(scores[k]["averages_by_block_type"][block_type]) / max(1, len(scores[k]["averages_by_block_type"][block_type]))
+            block_rows[i].append(avg)
+
+    print("Block types")
+    block_type_table = tabulate.tabulate(block_rows, headers=["Block Type"] + inference_types, tablefmt="github")
+    print(block_type_table)
+    with open(out_path / "block_types.md", "w", encoding="utf-8") as f:
+        f.write(block_type_table)
+
+    headers = ["Method", "Avg Score", "Avg Time"]
+    inference_rows = [[k] for k in inference_types]
+    for i, k in enumerate(inference_types):
+        inference_rows[i].append(scores[k]["average_score"])
+        inference_rows[i].append(scores[k]["average_time"])
+
+    print("Overall")
+    overall_table = tabulate.tabulate(inference_rows, headers=headers, tablefmt="github")
+    print(overall_table)
+    with open(out_path / "overall.md", "w", encoding="utf-8") as f:
+        f.write(overall_table)
+
+    print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")

 @click.command(help="Benchmark PDF to MD conversion.")
 @click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
@@ -85,6 +112,9 @@ def main(
     max_rows: int,
     use_llm: bool
 ):
+    out_path = Path(result_path)
+    out_path.mkdir(parents=True, exist_ok=True)
+
     allowed_methods = ["mathpix", ""]
     methods = other_methods.split(",")
     for method in methods:
@@ -104,11 +134,9 @@
     mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, score_func=mathpix_scoring_func, mathpix_ds=mathpix_ds)
     all_scores["mathpix"] = mathpix_scores

-…
-…
+    # Display formatted score tables
+    print_scores(all_scores, out_path)

-    out_path = Path(result_path)
-    out_path.mkdir(parents=True, exist_ok=True)
     with open(out_path / "overall.json", "w", encoding="utf-8") as f:
         json.dump(all_scores, f, indent=2, ensure_ascii=False)
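Note: the new print_scores builds all three reports the same way — collect per-category score lists, average them, and render a github-format table with tabulate. Below is a minimal standalone sketch of that pattern; the method names and score values are invented for illustration.

import tabulate

# Hypothetical per-method score lists, keyed the same way as FullResult["averages_by_type"]
scores = {
    "marker": {"averages_by_type": {"Scientific": [92.1, 88.4], "Financial": [95.0]}},
    "mathpix": {"averages_by_type": {"Scientific": [90.3, 85.9], "Financial": [93.2]}},
}

methods = list(scores.keys())
doc_types = list(scores["marker"]["averages_by_type"].keys())
rows = [[t] for t in doc_types]
for m in methods:
    for i, t in enumerate(doc_types):
        vals = scores[m]["averages_by_type"][t]
        rows[i].append(sum(vals) / max(1, len(vals)))  # same max(1, ...) guard as print_scores

print(tabulate.tabulate(rows, headers=["Document Type"] + methods, tablefmt="github"))

The max(1, ...) guard mirrors print_scores and avoids a ZeroDivisionError for categories with no samples.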
benchmarks/table/__init__.py ADDED

File without changes
benchmarks/table/inference.py ADDED

@@ -0,0 +1,139 @@
+import datasets
+import numpy as np
+from bs4 import BeautifulSoup
+import pypdfium2 as pdfium
+from tqdm import tqdm
+import base64
+import tempfile
+
+from benchmarks.table.gemini import gemini_table_rec
+from marker.config.parser import ConfigParser
+from marker.converters.table import TableConverter
+from marker.models import create_model_dict
+from marker.util import matrix_intersection_area
+
+
+def inference_tables(dataset, use_llm: bool, table_rec_batch_size: int | None, max_rows: int, use_gemini: bool):
+    models = create_model_dict()
+    config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size, "disable_tqdm": True})
+    total_unaligned = 0
+    results = []
+
+    dataset = datasets.load_dataset(dataset, split='train')
+    dataset = dataset.shuffle(seed=0)
+
+    iterations = len(dataset)
+    if max_rows is not None:
+        iterations = min(max_rows, len(dataset))
+
+    for i in tqdm(range(iterations), desc='Converting Tables'):
+        try:
+            row = dataset[i]
+            pdf_binary = base64.b64decode(row['pdf'])
+            gt_tables = row['tables']  # Already sorted by reading order, which is what marker returns
+
+            converter = TableConverter(
+                config=config_parser.generate_config_dict(),
+                artifact_dict=models,
+                processor_list=config_parser.get_processors(),
+                renderer=config_parser.get_renderer()
+            )
+
+            with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:
+                temp_pdf_file.write(pdf_binary)
+                temp_pdf_file.seek(0)
+                marker_json = converter(temp_pdf_file.name).children
+
+                doc = pdfium.PdfDocument(temp_pdf_file.name)
+                page_image = doc[0].render(scale=92 / 72).to_pil()
+
+            if len(marker_json) == 0 or len(gt_tables) == 0:
+                print(f'No tables detected, skipping...')
+                total_unaligned += len(gt_tables)
+                continue
+
+            marker_tables = extract_tables(marker_json)
+            marker_table_boxes = [table.bbox for table in marker_tables]
+            page_bbox = marker_json[0].bbox
+            w_scaler, h_scaler = page_image.width / page_bbox[2], page_image.height / page_bbox[3]
+            table_images = [
+                page_image.crop([bbox[0] * w_scaler, bbox[1] * h_scaler, bbox[2] * w_scaler, bbox[3] * h_scaler]) for bbox
+                in marker_table_boxes]
+
+            # Normalize the bboxes
+            for bbox in marker_table_boxes:
+                bbox[0] = bbox[0] / page_bbox[2]
+                bbox[1] = bbox[1] / page_bbox[3]
+                bbox[2] = bbox[2] / page_bbox[2]
+                bbox[3] = bbox[3] / page_bbox[3]
+
+            gt_boxes = [table['normalized_bbox'] for table in gt_tables]
+            gt_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in gt_boxes]
+            marker_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in marker_table_boxes]
+            table_alignments = matrix_intersection_area(gt_boxes, marker_table_boxes)
+
+            aligned_tables = []
+            used_tables = set()
+            unaligned_tables = set()
+            for table_idx, alignment in enumerate(table_alignments):
+                try:
+                    max_area = np.max(alignment)
+                    aligned_idx = np.argmax(alignment)
+                except ValueError:
+                    # No alignment found
+                    unaligned_tables.add(table_idx)
+                    continue
+
+                if aligned_idx in used_tables:
+                    # Marker table already aligned with another gt table
+                    unaligned_tables.add(table_idx)
+                    continue
+
+                # Gt table doesn't align well with any marker table
+                gt_table_pct = gt_areas[table_idx] / max_area
+                if not .75 < gt_table_pct < 1.25:
+                    unaligned_tables.add(table_idx)
+                    continue
+
+                # Marker table doesn't align with gt table
+                marker_table_pct = marker_areas[aligned_idx] / max_area
+                if not .75 < marker_table_pct < 1.25:
+                    unaligned_tables.add(table_idx)
+                    continue
+
+                gemini_html = ""
+                if use_gemini:
+                    gemini_html = gemini_table_rec(table_images[aligned_idx])
+
+                aligned_tables.append(
+                    (marker_tables[aligned_idx], gt_tables[table_idx], gemini_html)
+                )
+                used_tables.add(aligned_idx)
+
+            total_unaligned += len(unaligned_tables)
+
+            for marker_table, gt_table, gemini_table in aligned_tables:
+                gt_table_html = gt_table['html']
+
+                # marker wraps the table in <tbody> which fintabnet data doesn't
+                # Fintabnet doesn't use th tags, need to be replaced for fair comparison
+                marker_table_soup = BeautifulSoup(marker_table.html, 'html.parser')
+                tbody = marker_table_soup.find('tbody')
+                if tbody:
+                    tbody.unwrap()
+                for th_tag in marker_table_soup.find_all('th'):
+                    th_tag.name = 'td'
+                marker_table_html = str(marker_table_soup)
+                marker_table_html = marker_table_html.replace("<br>", " ")  # Fintabnet uses spaces instead of newlines
+                marker_table_html = marker_table_html.replace("\n", " ")  # Fintabnet uses spaces instead of newlines
+                gemini_table_html = gemini_table.replace("\n", " ")  # Fintabnet uses spaces instead of newlines
+
+                results.append({
+                    "marker_table": marker_table_html,
+                    "gt_table": gt_table_html,
+                    "gemini_table": gemini_table_html
+                })
+        except pdfium.PdfiumError:
+            print('Broken PDF, Skipping...')
+            continue
+    return results, total_unaligned
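Note: the alignment loop above pairs each ground-truth table with the marker table it overlaps most, then rejects pairs where the area ratio against the intersection falls outside 0.75–1.25. A self-contained sketch of that check follows; intersection_matrix is a local stand-in for marker.util.matrix_intersection_area (whose implementation is not shown in this diff), and the boxes are invented.

import numpy as np

def intersection_matrix(boxes_a, boxes_b):
    # Boxes are [x0, y0, x1, y1]; entry [i, j] is the intersection area of boxes_a[i] and boxes_b[j]
    out = np.zeros((len(boxes_a), len(boxes_b)))
    for i, a in enumerate(boxes_a):
        for j, b in enumerate(boxes_b):
            w = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
            h = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
            out[i, j] = w * h
    return out

gt_boxes = [[0.1, 0.1, 0.5, 0.4]]          # normalized ground-truth table bbox (invented)
marker_boxes = [[0.12, 0.11, 0.49, 0.41]]  # normalized predicted table bbox (invented)

gt_areas = [(b[2] - b[0]) * (b[3] - b[1]) for b in gt_boxes]
alignments = intersection_matrix(gt_boxes, marker_boxes)
for gt_idx, row in enumerate(alignments):
    max_area, pred_idx = np.max(row), np.argmax(row)
    # Accept only near-total mutual overlap: the gt area must be within 25% of the intersection
    aligned = max_area > 0 and 0.75 < gt_areas[gt_idx] / max_area < 1.25
    print(f"gt {gt_idx} -> marker {pred_idx}: aligned={aligned}")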
benchmarks/table/table.py CHANGED

@@ -1,33 +1,27 @@
 import os
+
+from benchmarks.table.inference import inference_tables
+
 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for an op, which is not supported on MPS

 from pathlib import Path
 from itertools import repeat
 from typing import List

-import numpy as np
-import base64
 import time
 import datasets
 from tqdm import tqdm
-import tempfile
 import click
 from tabulate import tabulate
 import json
-from bs4 import BeautifulSoup
 from concurrent.futures import ProcessPoolExecutor
-from …
-import pypdfium2 as pdfium
-from marker.util import matrix_intersection_area
-from marker.renderers.json import JSONOutput, JSONBlockOutput
+from marker.renderers.json import JSONBlockOutput
 from marker.settings import settings

 from marker.config.parser import ConfigParser
-from marker.converters.table import TableConverter
 from marker.models import create_model_dict

 from scoring import wrap_table_html, similarity_eval_html
-from gemini import gemini_table_rec

 def update_teds_score(result, prefix: str = "marker"):
     prediction, ground_truth = result[f'{prefix}_table'], result['gt_table']
@@ -64,128 +58,13 @@ def main(
     table_rec_batch_size: int | None,
     use_gemini: bool = False
 ):
-    models = create_model_dict()
-    config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size, "disable_tqdm": True})
     start = time.time()


     dataset = datasets.load_dataset(dataset, split='train')
     dataset = dataset.shuffle(seed=0)

-…
-    if max_rows is not None:
-        iterations = min(max_rows, len(dataset))
-
-    results = []
-    total_unaligned = 0
-    for i in tqdm(range(iterations), desc='Converting Tables'):
-        try:
-            row = dataset[i]
-            pdf_binary = base64.b64decode(row['pdf'])
-            gt_tables = row['tables'] #Already sorted by reading order, which is what marker returns
-
-            converter = TableConverter(
-                config=config_parser.generate_config_dict(),
-                artifact_dict=models,
-                processor_list=config_parser.get_processors(),
-                renderer=config_parser.get_renderer()
-            )
-
-            with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:
-                temp_pdf_file.write(pdf_binary)
-                temp_pdf_file.seek(0)
-                marker_json = converter(temp_pdf_file.name).children
-
-                doc = pdfium.PdfDocument(temp_pdf_file.name)
-                page_image = doc[0].render(scale=92/72).to_pil()
-
-            if len(marker_json) == 0 or len(gt_tables) == 0:
-                print(f'No tables detected, skipping...')
-                total_unaligned += len(gt_tables)
-                continue
-
-            marker_tables = extract_tables(marker_json)
-            marker_table_boxes = [table.bbox for table in marker_tables]
-            page_bbox = marker_json[0].bbox
-            w_scaler, h_scaler = page_image.width / page_bbox[2], page_image.height / page_bbox[3]
-            table_images = [page_image.crop([bbox[0] * w_scaler, bbox[1] * h_scaler, bbox[2] * w_scaler, bbox[3] * h_scaler]) for bbox in marker_table_boxes]
-
-            # Normalize the bboxes
-            for bbox in marker_table_boxes:
-                bbox[0] = bbox[0] / page_bbox[2]
-                bbox[1] = bbox[1] / page_bbox[3]
-                bbox[2] = bbox[2] / page_bbox[2]
-                bbox[3] = bbox[3] / page_bbox[3]
-
-            gt_boxes = [table['normalized_bbox'] for table in gt_tables]
-            gt_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in gt_boxes]
-            marker_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in marker_table_boxes]
-            table_alignments = matrix_intersection_area(gt_boxes, marker_table_boxes)
-
-            aligned_tables = []
-            used_tables = set()
-            unaligned_tables = set()
-            for table_idx, alignment in enumerate(table_alignments):
-                try:
-                    max_area = np.max(alignment)
-                    aligned_idx = np.argmax(alignment)
-                except ValueError:
-                    # No alignment found
-                    unaligned_tables.add(table_idx)
-                    continue
-
-                if aligned_idx in used_tables:
-                    # Marker table already aligned with another gt table
-                    unaligned_tables.add(table_idx)
-                    continue
-
-                # Gt table doesn't align well with any marker table
-                gt_table_pct = gt_areas[table_idx] / max_area
-                if not .75 < gt_table_pct < 1.25:
-                    unaligned_tables.add(table_idx)
-                    continue
-
-                # Marker table doesn't align with gt table
-                marker_table_pct = marker_areas[aligned_idx] / max_area
-                if not .75 < marker_table_pct < 1.25:
-                    unaligned_tables.add(table_idx)
-                    continue
-
-                gemini_html = ""
-                if use_gemini:
-                    gemini_html = gemini_table_rec(table_images[aligned_idx])
-
-                aligned_tables.append(
-                    (marker_tables[aligned_idx], gt_tables[table_idx], gemini_html)
-                )
-                used_tables.add(aligned_idx)
-
-            total_unaligned += len(unaligned_tables)
-
-            for marker_table, gt_table, gemini_table in aligned_tables:
-                gt_table_html = gt_table['html']
-
-                #marker wraps the table in <tbody> which fintabnet data doesn't
-                #Fintabnet doesn't use th tags, need to be replaced for fair comparison
-                marker_table_soup = BeautifulSoup(marker_table.html, 'html.parser')
-                tbody = marker_table_soup.find('tbody')
-                if tbody:
-                    tbody.unwrap()
-                for th_tag in marker_table_soup.find_all('th'):
-                    th_tag.name = 'td'
-                marker_table_html = str(marker_table_soup)
-                marker_table_html = marker_table_html.replace("<br>", " ") # Fintabnet uses spaces instead of newlines
-                marker_table_html = marker_table_html.replace("\n", " ") # Fintabnet uses spaces instead of newlines
-                gemini_table_html = gemini_table.replace("\n", " ") # Fintabnet uses spaces instead of newlines
-
-                results.append({
-                    "marker_table": marker_table_html,
-                    "gt_table": gt_table_html,
-                    "gemini_table": gemini_table_html
-                })
-        except PdfiumError:
-            print('Broken PDF, Skipping...')
-            continue
+    results, total_unaligned = inference_tables(dataset, use_llm, table_rec_batch_size, max_rows, use_gemini)

     print(f"Total time: {time.time() - start}.")
     print(f"Could not align {total_unaligned} tables from fintabnet.")
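Note: the HTML normalization that moved into benchmarks/table/inference.py is what keeps the TEDS comparison fair — marker wraps rows in <tbody> and emits <th> header cells, while fintabnet ground truth uses neither. A small sketch of that step with an invented table:

from bs4 import BeautifulSoup

marker_html = "<table><tbody><tr><th>Year</th><th>Revenue</th></tr></tbody></table>"  # invented example

soup = BeautifulSoup(marker_html, "html.parser")
tbody = soup.find("tbody")
if tbody:
    tbody.unwrap()                # drop the wrapper, keep its children
for th in soup.find_all("th"):
    th.name = "td"                # fintabnet uses <td> everywhere

normalized = str(soup).replace("<br>", " ").replace("\n", " ")
print(normalized)  # <table><tr><td>Year</td><td>Revenue</td></tr></table>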
benchmarks/verify_scores.py CHANGED

@@ -15,7 +15,7 @@ def verify_table_scores(file_path):
     with open(file_path, 'r') as file:
         data = json.load(file)

-    avg = sum([r["…
+    avg = sum([r["marker_score"] for r in data["marker"]]) / len(data)
     if avg < 0.7:
         raise ValueError("Average score is below the required threshold of 0.7")
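Note: the fixed line averages the per-table marker scores. Below is a hypothetical sketch of the file shape the check appears to expect, inferred from the code above (the real file is written by the table benchmark); this sketch divides by the number of results, whereas the committed line divides by len(data), the number of top-level keys.

# Hypothetical minimal shape of the scores file read by verify_table_scores
data = {
    "marker": [
        {"marker_score": 0.91},
        {"marker_score": 0.84},
    ]
}

avg = sum([r["marker_score"] for r in data["marker"]]) / len(data["marker"])  # average over results
if avg < 0.7:
    raise ValueError("Average score is below the required threshold of 0.7")
print(f"Average marker score: {avg:.2f}")  # 0.88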