File size: 1,677 Bytes
715ea00 12dd22b 069ad93 3c6746a 088f39c c959776 088f39c c959776 29da3ef 10b0dcd c959776 069ad93 29da3ef c959776 8650951 c959776 069ad93 9591dd3 afda82c a6bdfaa 33d40b2 c959776 389e6a3 c959776 10b0dcd 715ea00 3c6746a ffdc165 069ad93 c4a4f63 3c6746a 8650951 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
import time
import pypdfium2 # Needs to be at the top to avoid warnings
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
import argparse
from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
from marker.output import save_markdown
configure_logging()
def main():
    """Convert a single PDF to markdown from the command line.

    Parses the command-line arguments, loads the models with
    ``load_all_models``, runs ``convert_single_pdf`` on the input file and
    passes the result to ``save_markdown``. Finally prints the folder
    returned by ``save_markdown`` and the elapsed time.

    The reported time covers conversion and saving only; the timer starts
    after the models have been loaded.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("filename", help="PDF file to parse")
    parser.add_argument("output", help="Output base folder path")
    parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
    parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
    parser.add_argument("--langs", type=str, help="Optional languages to use for OCR, comma separated", default=None)
    parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
    args = parser.parse_args()

    # Tolerate spaces and stray commas in --langs ("en, fr," -> ["en", "fr"]);
    # fall back to None when no usable entry is left.
    langs = None
    if args.langs:
        langs = [lang.strip() for lang in args.langs.split(",") if lang.strip()] or None

    fname = args.filename
    model_lst = load_all_models()

    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments made while the conversion is running.
    start = time.perf_counter()
    full_text, images, out_meta = convert_single_pdf(
        fname,
        model_lst,
        max_pages=args.max_pages,
        langs=langs,
        batch_multiplier=args.batch_multiplier,
        start_page=args.start_page,
    )

    # Only the file name (without its directory part) is passed to save_markdown.
    fname = os.path.basename(fname)
    subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)

    print(f"Saved markdown to the {subfolder_path} folder")
    print(f"Total time: {time.perf_counter() - start}")
# Run the converter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|