Vik Paruchuri
committed on
Commit
·
afda82c
1
Parent(s):
b2797ab
Set start page
Browse files
- convert.py +4 -0
- convert_single.py +2 -1
- marker/convert.py +8 -1
- marker/debug/data.py +2 -2
- marker/pdf/extract_text.py +12 -4
- pyproject.toml +1 -1
convert.py
CHANGED
|
@@ -107,7 +107,11 @@ def main():
|
|
| 107 |
|
| 108 |
mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work
|
| 109 |
model_lst = load_all_models()
|
|
|
|
| 110 |
for model in model_lst:
|
|
|
|
|
|
|
|
|
|
| 111 |
if model:
|
| 112 |
model.share_memory()
|
| 113 |
|
|
|
|
| 107 |
|
| 108 |
mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work
|
| 109 |
model_lst = load_all_models()
|
| 110 |
+
|
| 111 |
for model in model_lst:
|
| 112 |
+
if model.device.type == "mps":
|
| 113 |
+
raise ValueError("Cannot use MPS with torch multiprocessing share_memory. You have to use CUDA or CPU. Set the TORCH_DEVICE environment variable to change the device.")
|
| 114 |
+
|
| 115 |
if model:
|
| 116 |
model.share_memory()
|
| 117 |
|
convert_single.py
CHANGED
|
@@ -16,6 +16,7 @@ def main():
|
|
| 16 |
parser.add_argument("filename", help="PDF file to parse")
|
| 17 |
parser.add_argument("output", help="Output base folder path")
|
| 18 |
parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
|
|
|
|
| 19 |
parser.add_argument("--langs", type=str, help="Languages to use for OCR, comma separated", default=None)
|
| 20 |
parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
|
| 21 |
args = parser.parse_args()
|
|
@@ -24,7 +25,7 @@ def main():
|
|
| 24 |
|
| 25 |
fname = args.filename
|
| 26 |
model_lst = load_all_models()
|
| 27 |
-
full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier)
|
| 28 |
|
| 29 |
fname = os.path.basename(fname)
|
| 30 |
subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)
|
|
|
|
| 16 |
parser.add_argument("filename", help="PDF file to parse")
|
| 17 |
parser.add_argument("output", help="Output base folder path")
|
| 18 |
parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
|
| 19 |
+
parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
|
| 20 |
parser.add_argument("--langs", type=str, help="Languages to use for OCR, comma separated", default=None)
|
| 21 |
parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
|
| 22 |
args = parser.parse_args()
|
|
|
|
| 25 |
|
| 26 |
fname = args.filename
|
| 27 |
model_lst = load_all_models()
|
| 28 |
+
full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page)
|
| 29 |
|
| 30 |
fname = os.path.basename(fname)
|
| 31 |
subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)
|
marker/convert.py
CHANGED
|
@@ -34,6 +34,7 @@ def convert_single_pdf(
|
|
| 34 |
fname: str,
|
| 35 |
model_lst: List,
|
| 36 |
max_pages: int = None,
|
|
|
|
| 37 |
metadata: Optional[Dict] = None,
|
| 38 |
langs: Optional[List[str]] = None,
|
| 39 |
batch_multiplier: int = 1
|
|
@@ -66,12 +67,18 @@ def convert_single_pdf(
|
|
| 66 |
doc,
|
| 67 |
fname,
|
| 68 |
max_pages=max_pages,
|
|
|
|
| 69 |
)
|
| 70 |
out_meta.update({
|
| 71 |
"toc": toc,
|
| 72 |
"pages": len(pages),
|
| 73 |
})
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
# Unpack models from list
|
| 76 |
texify_model, layout_model, order_model, edit_model, detection_model, ocr_model = model_lst
|
| 77 |
|
|
@@ -99,7 +106,7 @@ def convert_single_pdf(
|
|
| 99 |
annotate_block_types(pages)
|
| 100 |
|
| 101 |
# Dump debug data if flags are set
|
| 102 |
-
dump_bbox_debug_data(doc, pages)
|
| 103 |
|
| 104 |
# Find reading order for blocks
|
| 105 |
# Sort blocks by reading order
|
|
|
|
| 34 |
fname: str,
|
| 35 |
model_lst: List,
|
| 36 |
max_pages: int = None,
|
| 37 |
+
start_page: int = None,
|
| 38 |
metadata: Optional[Dict] = None,
|
| 39 |
langs: Optional[List[str]] = None,
|
| 40 |
batch_multiplier: int = 1
|
|
|
|
| 67 |
doc,
|
| 68 |
fname,
|
| 69 |
max_pages=max_pages,
|
| 70 |
+
start_page=start_page
|
| 71 |
)
|
| 72 |
out_meta.update({
|
| 73 |
"toc": toc,
|
| 74 |
"pages": len(pages),
|
| 75 |
})
|
| 76 |
|
| 77 |
+
# Trim pages from doc to align with start page
|
| 78 |
+
if start_page:
|
| 79 |
+
for page_idx in range(start_page):
|
| 80 |
+
doc.del_page(0)
|
| 81 |
+
|
| 82 |
# Unpack models from list
|
| 83 |
texify_model, layout_model, order_model, edit_model, detection_model, ocr_model = model_lst
|
| 84 |
|
|
|
|
| 106 |
annotate_block_types(pages)
|
| 107 |
|
| 108 |
# Dump debug data if flags are set
|
| 109 |
+
dump_bbox_debug_data(doc, fname, pages)
|
| 110 |
|
| 111 |
# Find reading order for blocks
|
| 112 |
# Sort blocks by reading order
|
marker/debug/data.py
CHANGED
|
@@ -42,12 +42,12 @@ def dump_equation_debug_data(doc, images, converted_spans):
|
|
| 42 |
json.dump(data_lines, f)
|
| 43 |
|
| 44 |
|
| 45 |
-
def dump_bbox_debug_data(doc, blocks: List[Page]):
|
| 46 |
if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL < 2:
|
| 47 |
return
|
| 48 |
|
| 49 |
# Remove extension from doc name
|
| 50 |
-
doc_base =
|
| 51 |
|
| 52 |
debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_bbox.json")
|
| 53 |
debug_data = []
|
|
|
|
| 42 |
json.dump(data_lines, f)
|
| 43 |
|
| 44 |
|
| 45 |
+
def dump_bbox_debug_data(doc, fname, blocks: List[Page]):
|
| 46 |
if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL < 2:
|
| 47 |
return
|
| 48 |
|
| 49 |
# Remove extension from doc name
|
| 50 |
+
doc_base = fname.rsplit(".", 1)[0]
|
| 51 |
|
| 52 |
debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_bbox.json")
|
| 53 |
debug_data = []
|
marker/pdf/extract_text.py
CHANGED
|
@@ -74,13 +74,21 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
|
|
| 74 |
return out_page
|
| 75 |
|
| 76 |
|
| 77 |
-
def get_text_blocks(doc, fname, max_pages: Optional[int] = None) -> (List[Page], Dict):
|
| 78 |
toc = get_toc(doc)
|
| 79 |
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
if max_pages:
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=True, workers=settings.PDFTEXT_CPU_WORKERS)
|
| 86 |
marker_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(char_blocks)]
|
|
|
|
| 74 |
return out_page
|
| 75 |
|
| 76 |
|
| 77 |
+
def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Optional[int] = None) -> (List[Page], Dict):
|
| 78 |
toc = get_toc(doc)
|
| 79 |
|
| 80 |
+
if start_page:
|
| 81 |
+
assert start_page < len(doc)
|
| 82 |
+
else:
|
| 83 |
+
start_page = 0
|
| 84 |
+
|
| 85 |
if max_pages:
|
| 86 |
+
if max_pages + start_page > len(doc):
|
| 87 |
+
max_pages = len(doc) - start_page
|
| 88 |
+
else:
|
| 89 |
+
max_pages = len(doc) - start_page
|
| 90 |
+
|
| 91 |
+
page_range = range(start_page, start_page + max_pages)
|
| 92 |
|
| 93 |
char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=True, workers=settings.PDFTEXT_CPU_WORKERS)
|
| 94 |
marker_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(char_blocks)]
|
pyproject.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
-
version = "0.2.11"
|
| 4 |
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|
|
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
+
version = "0.2.12"
|
| 4 |
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|