Merge pull request #229 from VikParuchuri/dev
Files changed:
- convert_single.py +6 -0
- marker/models.py +7 -7
- marker/settings.py +1 -5
- poetry.lock +0 -0
- pyproject.toml +3 -3
convert_single.py
CHANGED
@@ -1,3 +1,5 @@
+import time
+
 import pypdfium2 # Needs to be at the top to avoid warnings
 import os
 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
@@ -20,18 +22,22 @@ def main():
     parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
     parser.add_argument("--langs", type=str, help="Languages to use for OCR, comma separated", default=None)
     parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
+    parser.add_argument("--debug", action="store_true", help="Enable debug logging", default=False)
     args = parser.parse_args()

     langs = args.langs.split(",") if args.langs else None

     fname = args.filename
     model_lst = load_all_models()
+    start = time.time()
     full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page)

     fname = os.path.basename(fname)
     subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)

     print(f"Saved markdown to the {subfolder_path} folder")
+    if args.debug:
+        print(f"Total time: {time.time() - start}")


 if __name__ == "__main__":
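The timing the new --debug flag prints can also be reproduced when calling marker from Python instead of the CLI. A minimal sketch, assuming the import paths used by marker 0.2.x and a placeholder input file name:

import time

from marker.convert import convert_single_pdf   # import path assumed from marker 0.2.x
from marker.models import load_all_models       # same loader used by convert_single.py above
from marker.output import save_markdown         # import path assumed from marker 0.2.x

model_lst = load_all_models()

start = time.time()  # same wall-clock measurement the --debug flag reports
full_text, images, out_meta = convert_single_pdf("paper.pdf", model_lst, batch_multiplier=2)  # "paper.pdf" is a placeholder
print(f"Total time: {time.time() - start:.1f}s")

save_markdown("output", "paper.pdf", full_text, images, out_meta)  # "output" folder is a placeholder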
marker/models.py
CHANGED
@@ -3,7 +3,7 @@ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers


 from marker.postprocessors.editor import load_editing_model
-from surya.model.detection import
+from surya.model.detection.model import load_model as load_detection_model, load_processor as load_detection_processor
 from texify.model.model import load_model as load_texify_model
 from texify.model.processor import load_processor as load_texify_processor
 from marker.settings import settings
@@ -25,11 +25,11 @@ def setup_recognition_model(langs, device=None, dtype=None):

 def setup_detection_model(device=None, dtype=None):
     if device:
-        model =
+        model = load_detection_model(device=device, dtype=dtype)
     else:
-        model =
+        model = load_detection_model()

-    processor =
+    processor = load_detection_processor()
     model.processor = processor
     return model

@@ -46,10 +46,10 @@ def setup_texify_model(device=None, dtype=None):

 def setup_layout_model(device=None, dtype=None):
     if device:
-        model =
+        model = load_detection_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT, device=device, dtype=dtype)
     else:
-        model =
+        model = load_detection_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
-    processor =
+    processor = load_detection_processor(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
     model.processor = processor
     return model

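For context, the text-detection and layout models now share a single loader pair and differ only in the checkpoint they load. A minimal sketch built only from the calls in this diff (device/dtype handling and error checking omitted):

from surya.model.detection.model import load_model as load_detection_model, load_processor as load_detection_processor

# Text detection model, default checkpoint
detection_model = load_detection_model()
detection_model.processor = load_detection_processor()

# Layout model: same loader, pointed at the layout checkpoint from marker's settings
layout_checkpoint = "vikp/surya_layout3"  # value of settings.LAYOUT_MODEL_CHECKPOINT in this PR
layout_model = load_detection_model(checkpoint=layout_checkpoint)
layout_model.processor = load_detection_processor(checkpoint=layout_checkpoint)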
marker/settings.py
CHANGED
@@ -67,7 +67,7 @@ class Settings(BaseSettings):
     # Layout model
     SURYA_LAYOUT_DPI: int = 96
     BAD_SPAN_TYPES: List[str] = ["Caption", "Footnote", "Page-footer", "Page-header", "Picture"]
-    LAYOUT_MODEL_CHECKPOINT: str = "vikp/
+    LAYOUT_MODEL_CHECKPOINT: str = "vikp/surya_layout3"
     BBOX_INTERSECTION_THRESH: float = 0.7 # How much the layout and pdf bboxes need to overlap to be the same
     LAYOUT_BATCH_SIZE: Optional[int] = None # Defaults to 12 for cuda, 6 otherwise

@@ -83,10 +83,6 @@ class Settings(BaseSettings):
     ENABLE_EDITOR_MODEL: bool = False # The editor model can create false positives
     EDITOR_CUTOFF_THRESH: float = 0.9 # Ignore predictions below this probability

-    # Ray
-    RAY_CACHE_PATH: Optional[str] = None # Where to save ray cache
-    RAY_CORES_PER_WORKER: int = 1 # How many cpu cores to allocate per worker
-
     # Debug
     DEBUG: bool = False # Enable debug logging
     DEBUG_DATA_FOLDER: Optional[str] = None
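Since these options live on a pydantic BaseSettings class, they can be overridden through environment variables. A minimal sketch, assuming the field names map directly to unprefixed environment variables (as the names in this diff suggest):

import os

# Set overrides before marker.settings is imported; field names taken from this diff
os.environ["DEBUG"] = "true"
os.environ["LAYOUT_MODEL_CHECKPOINT"] = "vikp/surya_layout3"

from marker.settings import settings  # imported after the overrides on purpose

print(settings.DEBUG)                    # True
print(settings.LAYOUT_MODEL_CHECKPOINT)  # vikp/surya_layout3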
poetry.lock
CHANGED
The diff for this file is too large to render. See raw diff.
pyproject.toml
CHANGED
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "0.2.
+version = "0.2.16"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"
@@ -19,7 +19,7 @@ include = [

 [tool.poetry.dependencies]
 python = ">=3.9,<3.13,!=3.9.7"
-scikit-learn = "^1.3.2"
+scikit-learn = "^1.3.2,<=1.4.2"
 Pillow = "^10.1.0"
 pydantic = "^2.4.2"
 pydantic-settings = "^2.0.3"
@@ -32,7 +32,7 @@ tabulate = "^0.9.0"
 ftfy = "^6.1.1"
 texify = "^0.1.10"
 rapidfuzz = "^3.8.1"
-surya-ocr = "^0.4.
+surya-ocr = "^0.4.15"
 filetype = "^1.2.0"
 regex = "^2024.4.28"
 pdftext = "^0.3.10"
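After upgrading, the bumped pins can be checked from Python with the standard library. A minimal sketch (package names taken from this diff):

from importlib.metadata import version

# Print the installed versions of the packages whose pins changed in this release
for pkg in ("marker-pdf", "surya-ocr", "scikit-learn"):
    print(pkg, version(pkg))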