Vik Paruchuri committed on
Commit
71b4e76
·
2 Parent(s): 412aa07 715ea00

Merge pull request #229 from VikParuchuri/dev

Browse files
Files changed (5) hide show
  1. convert_single.py +6 -0
  2. marker/models.py +7 -7
  3. marker/settings.py +1 -5
  4. poetry.lock +0 -0
  5. pyproject.toml +3 -3
convert_single.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import pypdfium2 # Needs to be at the top to avoid warnings
2
  import os
3
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
@@ -20,18 +22,22 @@ def main():
20
  parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
21
  parser.add_argument("--langs", type=str, help="Languages to use for OCR, comma separated", default=None)
22
  parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
 
23
  args = parser.parse_args()
24
 
25
  langs = args.langs.split(",") if args.langs else None
26
 
27
  fname = args.filename
28
  model_lst = load_all_models()
 
29
  full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page)
30
 
31
  fname = os.path.basename(fname)
32
  subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)
33
 
34
  print(f"Saved markdown to the {subfolder_path} folder")
 
 
35
 
36
 
37
  if __name__ == "__main__":
 
1
+ import time
2
+
3
  import pypdfium2 # Needs to be at the top to avoid warnings
4
  import os
5
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
 
22
  parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
23
  parser.add_argument("--langs", type=str, help="Languages to use for OCR, comma separated", default=None)
24
  parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
25
+ parser.add_argument("--debug", action="store_true", help="Enable debug logging", default=False)
26
  args = parser.parse_args()
27
 
28
  langs = args.langs.split(",") if args.langs else None
29
 
30
  fname = args.filename
31
  model_lst = load_all_models()
32
+ start = time.time()
33
  full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page)
34
 
35
  fname = os.path.basename(fname)
36
  subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)
37
 
38
  print(f"Saved markdown to the {subfolder_path} folder")
39
+ if args.debug:
40
+ print(f"Total time: {time.time() - start}")
41
 
42
 
43
  if __name__ == "__main__":
marker/models.py CHANGED
@@ -3,7 +3,7 @@ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers
3
 
4
 
5
  from marker.postprocessors.editor import load_editing_model
6
- from surya.model.detection import segformer
7
  from texify.model.model import load_model as load_texify_model
8
  from texify.model.processor import load_processor as load_texify_processor
9
  from marker.settings import settings
@@ -25,11 +25,11 @@ def setup_recognition_model(langs, device=None, dtype=None):
25
 
26
  def setup_detection_model(device=None, dtype=None):
27
  if device:
28
- model = segformer.load_model(device=device, dtype=dtype)
29
  else:
30
- model = segformer.load_model()
31
 
32
- processor = segformer.load_processor()
33
  model.processor = processor
34
  return model
35
 
@@ -46,10 +46,10 @@ def setup_texify_model(device=None, dtype=None):
46
 
47
  def setup_layout_model(device=None, dtype=None):
48
  if device:
49
- model = segformer.load_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT, device=device, dtype=dtype)
50
  else:
51
- model = segformer.load_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
52
- processor = segformer.load_processor(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
53
  model.processor = processor
54
  return model
55
 
 
3
 
4
 
5
  from marker.postprocessors.editor import load_editing_model
6
+ from surya.model.detection.model import load_model as load_detection_model, load_processor as load_detection_processor
7
  from texify.model.model import load_model as load_texify_model
8
  from texify.model.processor import load_processor as load_texify_processor
9
  from marker.settings import settings
 
25
 
26
  def setup_detection_model(device=None, dtype=None):
27
  if device:
28
+ model = load_detection_model(device=device, dtype=dtype)
29
  else:
30
+ model = load_detection_model()
31
 
32
+ processor = load_detection_processor()
33
  model.processor = processor
34
  return model
35
 
 
46
 
47
  def setup_layout_model(device=None, dtype=None):
48
  if device:
49
+ model = load_detection_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT, device=device, dtype=dtype)
50
  else:
51
+ model = load_detection_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
52
+ processor = load_detection_processor(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
53
  model.processor = processor
54
  return model
55
 
marker/settings.py CHANGED
@@ -67,7 +67,7 @@ class Settings(BaseSettings):
67
  # Layout model
68
  SURYA_LAYOUT_DPI: int = 96
69
  BAD_SPAN_TYPES: List[str] = ["Caption", "Footnote", "Page-footer", "Page-header", "Picture"]
70
- LAYOUT_MODEL_CHECKPOINT: str = "vikp/surya_layout2"
71
  BBOX_INTERSECTION_THRESH: float = 0.7 # How much the layout and pdf bboxes need to overlap to be the same
72
  LAYOUT_BATCH_SIZE: Optional[int] = None # Defaults to 12 for cuda, 6 otherwise
73
 
@@ -83,10 +83,6 @@ class Settings(BaseSettings):
83
  ENABLE_EDITOR_MODEL: bool = False # The editor model can create false positives
84
  EDITOR_CUTOFF_THRESH: float = 0.9 # Ignore predictions below this probability
85
 
86
- # Ray
87
- RAY_CACHE_PATH: Optional[str] = None # Where to save ray cache
88
- RAY_CORES_PER_WORKER: int = 1 # How many cpu cores to allocate per worker
89
-
90
  # Debug
91
  DEBUG: bool = False # Enable debug logging
92
  DEBUG_DATA_FOLDER: Optional[str] = None
 
67
  # Layout model
68
  SURYA_LAYOUT_DPI: int = 96
69
  BAD_SPAN_TYPES: List[str] = ["Caption", "Footnote", "Page-footer", "Page-header", "Picture"]
70
+ LAYOUT_MODEL_CHECKPOINT: str = "vikp/surya_layout3"
71
  BBOX_INTERSECTION_THRESH: float = 0.7 # How much the layout and pdf bboxes need to overlap to be the same
72
  LAYOUT_BATCH_SIZE: Optional[int] = None # Defaults to 12 for cuda, 6 otherwise
73
 
 
83
  ENABLE_EDITOR_MODEL: bool = False # The editor model can create false positives
84
  EDITOR_CUTOFF_THRESH: float = 0.9 # Ignore predictions below this probability
85
 
 
 
 
 
86
  # Debug
87
  DEBUG: bool = False # Enable debug logging
88
  DEBUG_DATA_FOLDER: Optional[str] = None
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
  [tool.poetry]
2
  name = "marker-pdf"
3
- version = "0.2.15"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
@@ -19,7 +19,7 @@ include = [
19
 
20
  [tool.poetry.dependencies]
21
  python = ">=3.9,<3.13,!=3.9.7"
22
- scikit-learn = "^1.3.2"
23
  Pillow = "^10.1.0"
24
  pydantic = "^2.4.2"
25
  pydantic-settings = "^2.0.3"
@@ -32,7 +32,7 @@ tabulate = "^0.9.0"
32
  ftfy = "^6.1.1"
33
  texify = "^0.1.10"
34
  rapidfuzz = "^3.8.1"
35
- surya-ocr = "^0.4.14"
36
  filetype = "^1.2.0"
37
  regex = "^2024.4.28"
38
  pdftext = "^0.3.10"
 
1
  [tool.poetry]
2
  name = "marker-pdf"
3
+ version = "0.2.16"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
 
19
 
20
  [tool.poetry.dependencies]
21
  python = ">=3.9,<3.13,!=3.9.7"
22
+ scikit-learn = "^1.3.2,<=1.4.2"
23
  Pillow = "^10.1.0"
24
  pydantic = "^2.4.2"
25
  pydantic-settings = "^2.0.3"
 
32
  ftfy = "^6.1.1"
33
  texify = "^0.1.10"
34
  rapidfuzz = "^3.8.1"
35
+ surya-ocr = "^0.4.15"
36
  filetype = "^1.2.0"
37
  regex = "^2024.4.28"
38
  pdftext = "^0.3.10"