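"""Benchmark PDF to markdown conversion.

Converts each PDF in a folder with naive text extraction, marker, and
optionally nougat, scores the output against reference markdown files,
and writes per-file and aggregate results to a JSON report.
"""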
import argparse
import json
import os
import shutil
import subprocess
import tempfile
import time
from collections import defaultdict

import fitz as pymupdf
from tabulate import tabulate
from tqdm import tqdm

from marker.benchmark.scoring import score_text
from marker.convert import convert_single_pdf
from marker.extract_text import naive_get_text
from marker.logger import configure_logging
from marker.models import load_all_models

configure_logging()


def nougat_prediction(pdf_filename, batch_size=1):
    # Run the nougat CLI on a single PDF and return the generated markdown.
    out_dir = tempfile.mkdtemp()
    subprocess.run(["nougat", pdf_filename, "-o", out_dir, "--no-skipping", "--recompute", "--batchsize", str(batch_size)], check=True)
    # Only one PDF goes into the fresh temp dir, so the single output file
    # is the prediction for it.
    md_file = os.listdir(out_dir)[0]
    with open(os.path.join(out_dir, md_file), "r") as f:
        data = f.read()
    shutil.rmtree(out_dir)
    return data
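
# Standalone usage sketch (assumes the nougat CLI is installed and on PATH;
# "paper.pdf" is a hypothetical local file):
#   markdown = nougat_prediction("paper.pdf", batch_size=2)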


def main():
    parser = argparse.ArgumentParser(description="Benchmark PDF to MD conversion. Needs source PDFs and a reference folder with the correct markdown.")
    parser.add_argument("in_folder", help="Input folder with PDF files")
    parser.add_argument("reference_folder", help="Reference folder with reference markdown files")
    parser.add_argument("out_file", help="Output filename")
    parser.add_argument("--nougat", action="store_true", help="Run nougat and compare", default=False)
    # Nougat batch size 1 uses about as much VRAM as default marker settings
    parser.add_argument("--nougat_batch_size", type=int, default=1, help="Batch size to use for nougat when making predictions.")
    parser.add_argument("--marker_parallel_factor", type=int, default=1, help="How much to multiply default parallel OCR workers and model batch sizes by.")
    parser.add_argument("--md_out_path", type=str, default=None, help="Output path for generated markdown files")
    args = parser.parse_args()
    # Naive extraction and marker are always benchmarked; nougat is opt-in.
    methods = ["naive", "marker"]
    if args.nougat:
        methods.append("nougat")

    model_lst = load_all_models()

    scores = defaultdict(dict)
    benchmark_files = os.listdir(args.in_folder)
    benchmark_files = [b for b in benchmark_files if b.endswith(".pdf")]
    times = defaultdict(dict)
    pages = defaultdict(int)
    for fname in tqdm(benchmark_files):
        md_filename = fname.rsplit(".", 1)[0] + ".md"

        # Load the ground-truth markdown for this PDF.
        reference_filename = os.path.join(args.reference_folder, md_filename)
        with open(reference_filename, "r") as f:
            reference = f.read()

        pdf_filename = os.path.join(args.in_folder, fname)
        doc = pymupdf.open(pdf_filename)
        pages[fname] = len(doc)

        for method in methods:
            start = time.time()
            if method == "marker":
                full_text, out_meta = convert_single_pdf(pdf_filename, model_lst, parallel_factor=args.marker_parallel_factor)
            elif method == "nougat":
                full_text = nougat_prediction(pdf_filename, batch_size=args.nougat_batch_size)
            elif method == "naive":
                full_text = naive_get_text(doc)
            else:
                raise ValueError(f"Unknown method {method}")

            times[method][fname] = time.time() - start

            # Score the generated markdown against the reference.
            score = score_text(full_text, reference)
            scores[method][fname] = score

            # Optionally save the generated markdown for manual inspection.
            if args.md_out_path:
                md_out_filename = f"{method}_{md_filename}"
                with open(os.path.join(args.md_out_path, md_out_filename), "w+") as f:
                    f.write(full_text)
    # Aggregate per-file stats into a JSON report.
    total_pages = sum(pages.values())
    with open(args.out_file, "w+") as f:
        write_data = defaultdict(dict)
        for method in methods:
            total_time = sum(times[method].values())
            file_stats = {
                fname:
                {
                    "time": times[method][fname],
                    "score": scores[method][fname],
                    "pages": pages[fname]
                }
                for fname in benchmark_files
            }
            write_data[method] = {
                "files": file_stats,
                "avg_score": sum(scores[method].values()) / len(scores[method]),
                "time_per_page": total_time / total_pages,
                "time_per_doc": total_time / len(scores[method])
            }

        json.dump(write_data, f, indent=4)
    # Print a summary table and a per-file score table to stdout.
    summary_table = []
    score_table = []
    score_headers = benchmark_files
    for method in methods:
        summary_table.append([method, write_data[method]["avg_score"], write_data[method]["time_per_page"], write_data[method]["time_per_doc"]])
        score_table.append([method, *[write_data[method]["files"][h]["score"] for h in score_headers]])

    print(tabulate(summary_table, headers=["Method", "Average Score", "Time per page", "Time per document"]))
    print("")
    print("Scores by file")
    print(tabulate(score_table, headers=["Method", *score_headers]))


if __name__ == "__main__":
    main()
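
# Example invocation (paths are illustrative, not part of the repo):
#   python benchmark.py data/pdfs data/references report.json --nougat --md_out_path out_md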