File size: 1,677 Bytes
715ea00
 
12dd22b
069ad93
3c6746a
088f39c
c959776
088f39c
c959776
29da3ef
10b0dcd
c959776
069ad93
 
29da3ef
 
c959776
8650951
c959776
 
069ad93
9591dd3
afda82c
a6bdfaa
33d40b2
c959776
 
389e6a3
 
c959776
10b0dcd
715ea00
3c6746a
ffdc165
069ad93
c4a4f63
 
 
3c6746a
8650951
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import time

import pypdfium2 # Needs to be at the top to avoid warnings (kept ahead of the marker imports below)
import os

# Turn on the PyTorch MPS fallback flag. Per the original author's note,
# transformers uses .isin for a simple op, and that op is not supported on MPS.
# The variable is set here, before the marker modules below are imported.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

import argparse
from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models

from marker.output import save_markdown

# Logging is configured once, at import time, via marker's own helper.
configure_logging()


def main():
    """Convert a single PDF to markdown from the command line.

    Parses the CLI arguments, loads the models with ``load_all_models``,
    runs ``convert_single_pdf`` on the given file, and passes the result to
    ``save_markdown`` together with the output base folder. Finally prints
    the folder reported by ``save_markdown`` and the elapsed time.

    The reported time starts after the models are loaded, so it covers
    conversion and saving only.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("filename", help="PDF file to parse")
    parser.add_argument("output", help="Output base folder path")
    parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
    parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
    parser.add_argument("--langs", type=str, help="Optional languages to use for OCR, comma separated", default=None)
    parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
    args = parser.parse_args()

    # Tolerate spaces and stray commas in --langs ("en, fr" or "en,fr,"):
    # trim each entry, drop empty ones, and fall back to None (no explicit
    # languages) when nothing usable remains.
    langs = None
    if args.langs:
        langs = [lang.strip() for lang in args.langs.split(",") if lang.strip()] or None

    fname = args.filename
    model_lst = load_all_models()
    # Monotonic clock: the elapsed time is unaffected by system clock changes.
    start = time.perf_counter()
    full_text, images, out_meta = convert_single_pdf(
        fname,
        model_lst,
        max_pages=args.max_pages,
        langs=langs,
        batch_multiplier=args.batch_multiplier,
        start_page=args.start_page,
    )

    # Only the file name (without its directory) is passed on for saving.
    fname = os.path.basename(fname)
    subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)

    print(f"Saved markdown to the {subfolder_path} folder")
    print(f"Total time: {time.perf_counter() - start}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()