File size: 1,117 Bytes
c959776
 
 
29da3ef
10b0dcd
ffdc165
c959776
29da3ef
 
c959776
8650951
c959776
 
 
9591dd3
0408259
c959776
 
 
10b0dcd
0408259
c959776
0408259
ffdc165
 
 
 
8650951
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import argparse
import json
import os

from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models

# Run marker's logging configuration once at module load, before main() does any work.
configure_logging()


def main():
    """Convert a single PDF from the command line and write the results to disk.

    Command-line arguments:
        filename: PDF file to parse.
        output: Output file name for the converted text.
        --max_pages: Optional maximum number of pages to parse.
        --parallel_factor: Multiplier forwarded to ``convert_single_pdf``
            (defaults to 1).

    Side effects:
        Writes the text returned by ``convert_single_pdf`` to ``output`` as
        UTF-8, and writes the returned metadata as indented JSON to
        ``<output without its extension>_meta.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("filename", help="PDF file to parse")
    parser.add_argument("output", help="Output file name")
    parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
    parser.add_argument("--parallel_factor", type=int, default=1, help="How much to multiply default parallel OCR workers and model batch sizes by.")
    args = parser.parse_args()

    model_lst = load_all_models()
    full_text, out_meta = convert_single_pdf(
        args.filename,
        model_lst,
        max_pages=args.max_pages,
        parallel_factor=args.parallel_factor,
    )

    with open(args.output, "w", encoding="utf-8") as f:
        f.write(full_text)

    # Strip the extension from the final path component only. Splitting the
    # whole path on its last "." would cut outputs such as "./out/result" or
    # "v1.2/result" at a dot that belongs to a directory name.
    out_meta_filename = os.path.splitext(args.output)[0] + "_meta.json"
    with open(out_meta_filename, "w", encoding="utf-8") as f:
        f.write(json.dumps(out_meta, indent=4))


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()