Vik Paruchuri
committed on
Commit
·
a6bdfaa
1
Parent(s):
d090d63
Integrate new surya OCR model
Browse files- README.md +7 -9
- convert.py +2 -7
- convert_single.py +1 -1
- marker/convert.py +0 -4
- marker/equations/inference.py +2 -1
- marker/models.py +5 -5
- marker/ocr/lang.py +8 -0
- marker/settings.py +0 -2
- marker/tables/cells.py +0 -4
- poetry.lock +4 -4
- pyproject.toml +2 -2
README.md
CHANGED
|
@@ -88,32 +88,30 @@ First, some configuration:
|
|
| 88 |
|
| 89 |
- Inspect the settings in `marker/settings.py`. You can override any settings with environment variables.
|
| 90 |
- Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
|
| 91 |
-
|
| 92 |
-
- Depending on your document types, marker's average memory usage per task can vary slightly. You can configure `VRAM_PER_TASK` to adjust this if you notice tasks failing with GPU out of memory errors.
|
| 93 |
-
- By default, marker will use `surya` for OCR. Surya is slower on CPU, but more accurate than tesseract. If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above). If you don't want OCR at all, set `OCR_ENGINE` to `None`.
|
| 94 |
|
| 95 |
## Convert a single file
|
| 96 |
|
| 97 |
```shell
|
| 98 |
-
marker_single /path/to/file.pdf /path/to/output/folder --batch_multiplier 2 --max_pages 10
|
| 99 |
```
|
| 100 |
|
| 101 |
- `--batch_multiplier` is how much to multiply default batch sizes by if you have extra VRAM. Higher numbers will take more VRAM, but process faster. Set to 2 by default. The default batch sizes will take ~3GB of VRAM.
|
| 102 |
- `--max_pages` is the maximum number of pages to process. Omit this to convert the entire document.
|
| 103 |
-
- `--langs` is
|
| 104 |
|
| 105 |
-
|
| 106 |
|
| 107 |
## Convert multiple files
|
| 108 |
|
| 109 |
```shell
|
| 110 |
-
marker /path/to/input/folder /path/to/output/folder --workers
|
| 111 |
```
|
| 112 |
|
| 113 |
-
- `--workers` is the number of pdfs to convert at once. This is set to 1 by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage.
|
| 114 |
- `--max` is the maximum number of pdfs to convert. Omit this to convert all pdfs in the folder.
|
| 115 |
- `--min_length` is the minimum number of characters that need to be extracted from a pdf before it will be considered for processing. If you're processing a lot of pdfs, I recommend setting this to avoid OCRing pdfs that are mostly images. (slows everything down)
|
| 116 |
-
- `--metadata_file` is an optional path to a json file with metadata about the pdfs. If you provide it, it will be used to set the language for each pdf.
|
| 117 |
|
| 118 |
```
|
| 119 |
{
|
|
|
|
| 88 |
|
| 89 |
- Inspect the settings in `marker/settings.py`. You can override any settings with environment variables.
|
| 90 |
- Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
|
| 91 |
+
- By default, marker will use `surya` for OCR. Surya is slower on CPU, but more accurate than tesseract. It also doesn't require you to specify the languages in the document. If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above). If you don't want OCR at all, set `OCR_ENGINE` to `None`.
|
|
|
|
|
|
|
| 92 |
|
| 93 |
## Convert a single file
|
| 94 |
|
| 95 |
```shell
|
| 96 |
+
marker_single /path/to/file.pdf /path/to/output/folder --batch_multiplier 2 --max_pages 10
|
| 97 |
```
|
| 98 |
|
| 99 |
- `--batch_multiplier` is how much to multiply default batch sizes by if you have extra VRAM. Higher numbers will take more VRAM, but process faster. Set to 2 by default. The default batch sizes will take ~3GB of VRAM.
|
| 100 |
- `--max_pages` is the maximum number of pages to process. Omit this to convert the entire document.
|
| 101 |
+
- `--langs` is an optional comma-separated list of the languages in the document, for OCR. Optional by default, required if you use tesseract.
|
| 102 |
|
| 103 |
+
The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py). If you need more languages, you can use any language supported by [Tesseract](https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016) if you set `OCR_ENGINE` to `ocrmypdf`. If you don't need OCR, marker can work with any language.
|
| 104 |
|
| 105 |
## Convert multiple files
|
| 106 |
|
| 107 |
```shell
|
| 108 |
+
marker /path/to/input/folder /path/to/output/folder --workers 4 --max 10 --min_length 10000
|
| 109 |
```
|
| 110 |
|
| 111 |
+
- `--workers` is the number of pdfs to convert at once. This is set to 1 by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage. Marker will use 5GB of VRAM per worker at the peak, and 3.5GB average.
|
| 112 |
- `--max` is the maximum number of pdfs to convert. Omit this to convert all pdfs in the folder.
|
| 113 |
- `--min_length` is the minimum number of characters that need to be extracted from a pdf before it will be considered for processing. If you're processing a lot of pdfs, I recommend setting this to avoid OCRing pdfs that are mostly images. (slows everything down)
|
| 114 |
+
- `--metadata_file` is an optional path to a json file with metadata about the pdfs. If you provide it, it will be used to set the language for each pdf. Setting language is optional for surya (default), but required for tesseract. The format is:
|
| 115 |
|
| 116 |
```
|
| 117 |
{
|
convert.py
CHANGED
|
@@ -73,8 +73,8 @@ def main():
|
|
| 73 |
parser.add_argument("--chunk_idx", type=int, default=0, help="Chunk index to convert")
|
| 74 |
parser.add_argument("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
|
| 75 |
parser.add_argument("--max", type=int, default=None, help="Maximum number of pdfs to convert")
|
| 76 |
-
parser.add_argument("--workers", type=int, default=5, help="Number of worker processes to use")
|
| 77 |
-
parser.add_argument("--metadata_file", type=str, default=None, help="Metadata json file to use for
|
| 78 |
parser.add_argument("--min_length", type=int, default=None, help="Minimum length of pdf to convert")
|
| 79 |
|
| 80 |
args = parser.parse_args()
|
|
@@ -104,11 +104,6 @@ def main():
|
|
| 104 |
|
| 105 |
total_processes = min(len(files_to_convert), args.workers)
|
| 106 |
|
| 107 |
-
# Dynamically set GPU allocation per task based on GPU ram
|
| 108 |
-
if settings.CUDA:
|
| 109 |
-
tasks_per_gpu = settings.INFERENCE_RAM // settings.VRAM_PER_TASK if settings.CUDA else 0
|
| 110 |
-
total_processes = min(tasks_per_gpu, total_processes)
|
| 111 |
-
|
| 112 |
try:
|
| 113 |
mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work
|
| 114 |
except RuntimeError:
|
|
|
|
| 73 |
parser.add_argument("--chunk_idx", type=int, default=0, help="Chunk index to convert")
|
| 74 |
parser.add_argument("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
|
| 75 |
parser.add_argument("--max", type=int, default=None, help="Maximum number of pdfs to convert")
|
| 76 |
+
parser.add_argument("--workers", type=int, default=5, help="Number of worker processes to use. Peak VRAM usage per process is 5GB, but avg is closer to 3.5GB.")
|
| 77 |
+
parser.add_argument("--metadata_file", type=str, default=None, help="Metadata json file to use for languages")
|
| 78 |
parser.add_argument("--min_length", type=int, default=None, help="Minimum length of pdf to convert")
|
| 79 |
|
| 80 |
args = parser.parse_args()
|
|
|
|
| 104 |
|
| 105 |
total_processes = min(len(files_to_convert), args.workers)
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
try:
|
| 108 |
mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work
|
| 109 |
except RuntimeError:
|
convert_single.py
CHANGED
|
@@ -20,7 +20,7 @@ def main():
|
|
| 20 |
parser.add_argument("output", help="Output base folder path")
|
| 21 |
parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
|
| 22 |
parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
|
| 23 |
-
parser.add_argument("--langs", type=str, help="
|
| 24 |
parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
|
| 25 |
parser.add_argument("--debug", action="store_true", help="Enable debug logging", default=False)
|
| 26 |
args = parser.parse_args()
|
|
|
|
| 20 |
parser.add_argument("output", help="Output base folder path")
|
| 21 |
parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
|
| 22 |
parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
|
| 23 |
+
parser.add_argument("--langs", type=str, help="Optional languages to use for OCR, comma separated", default=None)
|
| 24 |
parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
|
| 25 |
parser.add_argument("--debug", action="store_true", help="Enable debug logging", default=False)
|
| 26 |
args = parser.parse_args()
|
marker/convert.py
CHANGED
|
@@ -43,10 +43,6 @@ def convert_single_pdf(
|
|
| 43 |
langs: Optional[List[str]] = None,
|
| 44 |
batch_multiplier: int = 1
|
| 45 |
) -> Tuple[str, Dict[str, Image.Image], Dict]:
|
| 46 |
-
# Set language needed for OCR
|
| 47 |
-
if langs is None:
|
| 48 |
-
langs = [settings.DEFAULT_LANG]
|
| 49 |
-
|
| 50 |
if metadata:
|
| 51 |
langs = metadata.get("languages", langs)
|
| 52 |
|
|
|
|
| 43 |
langs: Optional[List[str]] = None,
|
| 44 |
batch_multiplier: int = 1
|
| 45 |
) -> Tuple[str, Dict[str, Image.Image], Dict]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
if metadata:
|
| 47 |
langs = metadata.get("languages", langs)
|
| 48 |
|
marker/equations/inference.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
from texify.inference import batch_inference
|
|
|
|
| 2 |
|
| 3 |
from marker.settings import settings
|
| 4 |
import os
|
|
@@ -22,7 +23,7 @@ def get_latex_batched(images, token_counts, texify_model, batch_multiplier=1):
|
|
| 22 |
predictions = [""] * len(images)
|
| 23 |
batch_size = get_batch_size() * batch_multiplier
|
| 24 |
|
| 25 |
-
for i in range(0, len(images), batch_size):
|
| 26 |
# Dynamically set max length to save inference time
|
| 27 |
min_idx = i
|
| 28 |
max_idx = min(min_idx + batch_size, len(images))
|
|
|
|
| 1 |
from texify.inference import batch_inference
|
| 2 |
+
from tqdm import tqdm
|
| 3 |
|
| 4 |
from marker.settings import settings
|
| 5 |
import os
|
|
|
|
| 23 |
predictions = [""] * len(images)
|
| 24 |
batch_size = get_batch_size() * batch_multiplier
|
| 25 |
|
| 26 |
+
for i in tqdm(range(0, len(images), batch_size), desc="Recognizing equations"):
|
| 27 |
# Dynamically set max length to save inference time
|
| 28 |
min_idx = i
|
| 29 |
max_idx = min(min_idx + batch_size, len(images))
|
marker/models.py
CHANGED
|
@@ -13,11 +13,11 @@ from surya.model.ordering.model import load_model as load_order_model
|
|
| 13 |
from surya.model.ordering.processor import load_processor as load_order_processor
|
| 14 |
|
| 15 |
|
| 16 |
-
def setup_recognition_model(
|
| 17 |
if device:
|
| 18 |
-
rec_model = load_recognition_model(
|
| 19 |
else:
|
| 20 |
-
rec_model = load_recognition_model(
|
| 21 |
rec_processor = load_recognition_processor()
|
| 22 |
rec_model.processor = rec_processor
|
| 23 |
return rec_model
|
|
@@ -64,7 +64,7 @@ def setup_order_model(device=None, dtype=None):
|
|
| 64 |
return model
|
| 65 |
|
| 66 |
|
| 67 |
-
def load_all_models(
|
| 68 |
if device is not None:
|
| 69 |
assert dtype is not None, "Must provide dtype if device is provided"
|
| 70 |
|
|
@@ -75,7 +75,7 @@ def load_all_models(langs=None, device=None, dtype=None, force_load_ocr=False):
|
|
| 75 |
edit = load_editing_model(device, dtype)
|
| 76 |
|
| 77 |
# Only load recognition model if we'll need it for all pdfs
|
| 78 |
-
ocr = setup_recognition_model(
|
| 79 |
texify = setup_texify_model(device, dtype)
|
| 80 |
model_lst = [texify, layout, order, edit, detection, ocr]
|
| 81 |
return model_lst
|
|
|
|
| 13 |
from surya.model.ordering.processor import load_processor as load_order_processor
|
| 14 |
|
| 15 |
|
| 16 |
+
def setup_recognition_model(device=None, dtype=None):
|
| 17 |
if device:
|
| 18 |
+
rec_model = load_recognition_model(device=device, dtype=dtype)
|
| 19 |
else:
|
| 20 |
+
rec_model = load_recognition_model()
|
| 21 |
rec_processor = load_recognition_processor()
|
| 22 |
rec_model.processor = rec_processor
|
| 23 |
return rec_model
|
|
|
|
| 64 |
return model
|
| 65 |
|
| 66 |
|
| 67 |
+
def load_all_models(device=None, dtype=None, force_load_ocr=False):
|
| 68 |
if device is not None:
|
| 69 |
assert dtype is not None, "Must provide dtype if device is provided"
|
| 70 |
|
|
|
|
| 75 |
edit = load_editing_model(device, dtype)
|
| 76 |
|
| 77 |
# Only load recognition model if we'll need it for all pdfs
|
| 78 |
+
ocr = setup_recognition_model(device, dtype)
|
| 79 |
texify = setup_texify_model(device, dtype)
|
| 80 |
model_lst = [texify, layout, order, edit, detection, ocr]
|
| 81 |
return model_lst
|
marker/ocr/lang.py
CHANGED
|
@@ -15,10 +15,16 @@ def langs_to_ids(langs: List[str]):
|
|
| 15 |
|
| 16 |
def replace_langs_with_codes(langs):
|
| 17 |
if settings.OCR_ENGINE == "surya":
|
|
|
|
|
|
|
| 18 |
for i, lang in enumerate(langs):
|
| 19 |
if lang.title() in LANGUAGE_TO_CODE:
|
| 20 |
langs[i] = LANGUAGE_TO_CODE[lang.title()]
|
| 21 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
for i, lang in enumerate(langs):
|
| 23 |
if lang in LANGUAGE_TO_CODE:
|
| 24 |
langs[i] = LANGUAGE_TO_TESSERACT_CODE[lang]
|
|
@@ -27,6 +33,8 @@ def replace_langs_with_codes(langs):
|
|
| 27 |
|
| 28 |
def validate_langs(langs):
|
| 29 |
if settings.OCR_ENGINE == "surya":
|
|
|
|
|
|
|
| 30 |
for lang in langs:
|
| 31 |
if lang not in CODE_TO_LANGUAGE:
|
| 32 |
raise ValueError(f"Invalid language code {lang} for Surya OCR")
|
|
|
|
| 15 |
|
| 16 |
def replace_langs_with_codes(langs):
|
| 17 |
if settings.OCR_ENGINE == "surya":
|
| 18 |
+
if langs is None:
|
| 19 |
+
return
|
| 20 |
for i, lang in enumerate(langs):
|
| 21 |
if lang.title() in LANGUAGE_TO_CODE:
|
| 22 |
langs[i] = LANGUAGE_TO_CODE[lang.title()]
|
| 23 |
else:
|
| 24 |
+
if langs is None:
|
| 25 |
+
langs = [settings.DEFAULT_LANG]
|
| 26 |
+
print(f"No languages specified for tesseract, defaulting to {settings.DEFAULT_LANG}.")
|
| 27 |
+
|
| 28 |
for i, lang in enumerate(langs):
|
| 29 |
if lang in LANGUAGE_TO_CODE:
|
| 30 |
langs[i] = LANGUAGE_TO_TESSERACT_CODE[lang]
|
|
|
|
| 33 |
|
| 34 |
def validate_langs(langs):
|
| 35 |
if settings.OCR_ENGINE == "surya":
|
| 36 |
+
if langs is None:
|
| 37 |
+
return
|
| 38 |
for lang in langs:
|
| 39 |
if lang not in CODE_TO_LANGUAGE:
|
| 40 |
raise ValueError(f"Invalid language code {lang} for Surya OCR")
|
marker/settings.py
CHANGED
|
@@ -27,8 +27,6 @@ class Settings(BaseSettings):
|
|
| 27 |
|
| 28 |
return "cpu"
|
| 29 |
|
| 30 |
-
INFERENCE_RAM: int = 40 # How much VRAM each GPU has (in GB).
|
| 31 |
-
VRAM_PER_TASK: float = 4.5 # How much VRAM to allocate per task (in GB). Peak marker VRAM usage is around 5GB, but avg across workers is lower.
|
| 32 |
DEFAULT_LANG: str = "English" # Default language we assume files to be in, should be one of the keys in TESSERACT_LANGUAGES
|
| 33 |
|
| 34 |
SUPPORTED_FILETYPES: Dict = {
|
|
|
|
| 27 |
|
| 28 |
return "cpu"
|
| 29 |
|
|
|
|
|
|
|
| 30 |
DEFAULT_LANG: str = "English" # Default language we assume files to be in, should be one of the keys in TESSERACT_LANGUAGES
|
| 31 |
|
| 32 |
SUPPORTED_FILETYPES: Dict = {
|
marker/tables/cells.py
CHANGED
|
@@ -1,9 +1,5 @@
|
|
| 1 |
-
from PIL import Image, ImageDraw
|
| 2 |
-
import copy
|
| 3 |
-
|
| 4 |
from marker.schema.bbox import rescale_bbox, box_intersection_pct
|
| 5 |
from marker.schema.page import Page
|
| 6 |
-
from marker.tables.edges import get_vertical_lines
|
| 7 |
import numpy as np
|
| 8 |
from sklearn.cluster import DBSCAN
|
| 9 |
from marker.settings import settings
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from marker.schema.bbox import rescale_bbox, box_intersection_pct
|
| 2 |
from marker.schema.page import Page
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
from sklearn.cluster import DBSCAN
|
| 5 |
from marker.settings import settings
|
poetry.lock
CHANGED
|
@@ -3170,13 +3170,13 @@ tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"]
|
|
| 3170 |
|
| 3171 |
[[package]]
|
| 3172 |
name = "surya-ocr"
|
| 3173 |
-
version = "0.
|
| 3174 |
description = "OCR, layout, reading order, and line detection in 90+ languages"
|
| 3175 |
optional = false
|
| 3176 |
python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9"
|
| 3177 |
files = [
|
| 3178 |
-
{file = "surya_ocr-0.
|
| 3179 |
-
{file = "surya_ocr-0.
|
| 3180 |
]
|
| 3181 |
|
| 3182 |
[package.dependencies]
|
|
@@ -3771,4 +3771,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
|
|
| 3771 |
[metadata]
|
| 3772 |
lock-version = "2.0"
|
| 3773 |
python-versions = ">=3.9,<3.13,!=3.9.7"
|
| 3774 |
-
content-hash = "
|
|
|
|
| 3170 |
|
| 3171 |
[[package]]
|
| 3172 |
name = "surya-ocr"
|
| 3173 |
+
version = "0.5.0"
|
| 3174 |
description = "OCR, layout, reading order, and line detection in 90+ languages"
|
| 3175 |
optional = false
|
| 3176 |
python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9"
|
| 3177 |
files = [
|
| 3178 |
+
{file = "surya_ocr-0.5.0-py3-none-any.whl", hash = "sha256:e70516d74f3816c5b2a61bdf8f7eeb5fbd5670514bc5ae2eb0947d33c60c22d3"},
|
| 3179 |
+
{file = "surya_ocr-0.5.0.tar.gz", hash = "sha256:a80740c2b000d9630cf3d5525043c95096efaeb6b0892254ff32339a171e789a"},
|
| 3180 |
]
|
| 3181 |
|
| 3182 |
[package.dependencies]
|
|
|
|
| 3771 |
[metadata]
|
| 3772 |
lock-version = "2.0"
|
| 3773 |
python-versions = ">=3.9,<3.13,!=3.9.7"
|
| 3774 |
+
content-hash = "e1a4a5f18fbc4b7e3108b19910577bf3d09c2053e6bbd4c320c81990e143373d"
|
pyproject.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
-
version = "0.2.
|
| 4 |
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|
|
@@ -32,7 +32,7 @@ tabulate = "^0.9.0"
|
|
| 32 |
ftfy = "^6.1.1"
|
| 33 |
texify = "^0.1.10"
|
| 34 |
rapidfuzz = "^3.8.1"
|
| 35 |
-
surya-ocr = "^0.
|
| 36 |
filetype = "^1.2.0"
|
| 37 |
regex = "^2024.4.28"
|
| 38 |
pdftext = "^0.3.10"
|
|
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
+
version = "0.2.17"
|
| 4 |
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|
|
|
|
| 32 |
ftfy = "^6.1.1"
|
| 33 |
texify = "^0.1.10"
|
| 34 |
rapidfuzz = "^3.8.1"
|
| 35 |
+
surya-ocr = "^0.5.0"
|
| 36 |
filetype = "^1.2.0"
|
| 37 |
regex = "^2024.4.28"
|
| 38 |
pdftext = "^0.3.10"
|