Commit afda82c by Vik Paruchuri (parent: b2797ab)

Set start page
convert.py CHANGED
@@ -107,7 +107,11 @@ def main():
 
     mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work
     model_lst = load_all_models()
+
     for model in model_lst:
+        if model.device.type == "mps":
+            raise ValueError("Cannot use MPS with torch multiprocessing share_memory. You have to use CUDA or CPU. Set the TORCH_DEVICE environment variable to change the device.")
+
         if model:
             model.share_memory()
 
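The new guard means batch conversion must run on CUDA or CPU. A minimal usage sketch, assuming TORCH_DEVICE is read from the environment before the models are loaded (import path assumed, not part of this diff):

    import os

    # Must be set before marker loads its settings and models; "cuda" also works.
    os.environ["TORCH_DEVICE"] = "cpu"

    from marker.models import load_all_models  # assumed import path
    model_lst = load_all_models()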
convert_single.py CHANGED
@@ -16,6 +16,7 @@ def main():
     parser.add_argument("filename", help="PDF file to parse")
     parser.add_argument("output", help="Output base folder path")
     parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
+    parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
     parser.add_argument("--langs", type=str, help="Languages to use for OCR, comma separated", default=None)
     parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
     args = parser.parse_args()
@@ -24,7 +25,7 @@ def main():
 
     fname = args.filename
     model_lst = load_all_models()
-    full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier)
+    full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page)
 
     fname = os.path.basename(fname)
     subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)
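With the new flag, single-file conversion can begin partway through a document. A hypothetical invocation (file and output paths are made up):

    python convert_single.py paper.pdf ./out --start_page 4 --max_pages 10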
marker/convert.py CHANGED
@@ -34,6 +34,7 @@ def convert_single_pdf(
         fname: str,
         model_lst: List,
         max_pages: int = None,
+        start_page: int = None,
         metadata: Optional[Dict] = None,
         langs: Optional[List[str]] = None,
         batch_multiplier: int = 1
@@ -66,12 +67,18 @@ def convert_single_pdf(
         doc,
         fname,
         max_pages=max_pages,
+        start_page=start_page
     )
     out_meta.update({
         "toc": toc,
         "pages": len(pages),
     })
 
+    # Trim pages from doc to align with start page
+    if start_page:
+        for page_idx in range(start_page):
+            doc.del_page(0)
+
     # Unpack models from list
     texify_model, layout_model, order_model, edit_model, detection_model, ocr_model = model_lst
 
@@ -99,7 +106,7 @@ def convert_single_pdf(
     annotate_block_types(pages)
 
     # Dump debug data if flags are set
-    dump_bbox_debug_data(doc, pages)
+    dump_bbox_debug_data(doc, fname, pages)
 
     # Find reading order for blocks
     # Sort blocks by reading order
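Callers using the Python API get the same behaviour by passing start_page directly to convert_single_pdf. A minimal sketch with made-up paths and page numbers (model-loading import assumed):

    from marker.convert import convert_single_pdf
    from marker.models import load_all_models  # assumed import path

    model_lst = load_all_models()

    # Skip the first 3 pages, then convert at most 5 pages of the document.
    full_text, images, out_meta = convert_single_pdf(
        "example.pdf",
        model_lst,
        start_page=3,
        max_pages=5,
    )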
marker/debug/data.py CHANGED
@@ -42,12 +42,12 @@ def dump_equation_debug_data(doc, images, converted_spans):
         json.dump(data_lines, f)
 
 
-def dump_bbox_debug_data(doc, blocks: List[Page]):
+def dump_bbox_debug_data(doc, fname, blocks: List[Page]):
     if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL < 2:
         return
 
     # Remove extension from doc name
-    doc_base = os.path.basename(doc.name).rsplit(".", 1)[0]
+    doc_base = fname.rsplit(".", 1)[0]
 
     debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_bbox.json")
     debug_data = []
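The base name is now derived from the fname argument rather than doc.name. A small illustration of the stripping behaviour (filename is hypothetical):

    fname = "report.v2.pdf"
    doc_base = fname.rsplit(".", 1)[0]  # "report.v2" -- only the final extension is removed
    # Any directory prefix in fname would be kept in doc_base as well.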
marker/pdf/extract_text.py CHANGED
@@ -74,13 +74,21 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
     return out_page
 
 
-def get_text_blocks(doc, fname, max_pages: Optional[int] = None) -> (List[Page], Dict):
+def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Optional[int] = None) -> (List[Page], Dict):
     toc = get_toc(doc)
 
-    page_range = range(len(doc))
+    if start_page:
+        assert start_page < len(doc)
+    else:
+        start_page = 0
+
     if max_pages:
-        range_end = min(max_pages, len(doc))
-        page_range = range(range_end)
+        if max_pages + start_page > len(doc):
+            max_pages = len(doc) - start_page
+    else:
+        max_pages = len(doc) - start_page
+
+    page_range = range(start_page, start_page + max_pages)
 
     char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=True, workers=settings.PDFTEXT_CPU_WORKERS)
     marker_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(char_blocks)]
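A standalone walk-through of the new page_range logic with made-up numbers, showing how max_pages is clamped against the document length:

    doc_len = 10           # pretend the PDF has 10 pages
    start_page = 7
    max_pages = 5

    if start_page:
        assert start_page < doc_len
    else:
        start_page = 0

    if max_pages:
        if max_pages + start_page > doc_len:
            max_pages = doc_len - start_page  # 7 + 5 > 10, so max_pages becomes 3
    else:
        max_pages = doc_len - start_page

    page_range = range(start_page, start_page + max_pages)
    print(list(page_range))  # [7, 8, 9]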
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "0.2.11"
+version = "0.2.12"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"