Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

Vik Paruchuri committed on Aug 19, 2024

Commit

a6bdfaa

1 Parent(s): d090d63

Integrate new surya OCR model

Browse files

Files changed (11) hide show

README.md +7 -9
convert.py +2 -7
convert_single.py +1 -1
marker/convert.py +0 -4
marker/equations/inference.py +2 -1
marker/models.py +5 -5
marker/ocr/lang.py +8 -0
marker/settings.py +0 -2
marker/tables/cells.py +0 -4
poetry.lock +4 -4
pyproject.toml +2 -2

README.md CHANGED Viewed

@@ -88,32 +88,30 @@ First, some configuration:
 - Inspect the settings in `marker/settings.py`.  You can override any settings with environment variables.
 - Your torch device will be automatically detected, but you can override this.  For example, `TORCH_DEVICE=cuda`.
-  - If using GPU, set `INFERENCE_RAM` to your GPU VRAM (per GPU).  For example, if you have 16 GB of VRAM, set `INFERENCE_RAM=16`.
-  - Depending on your document types, marker's average memory usage per task can vary slightly.  You can configure `VRAM_PER_TASK` to adjust this if you notice tasks failing with GPU out of memory errors.
-- By default, marker will use `surya` for OCR.  Surya is slower on CPU, but more accurate than tesseract.  If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above).  If you don't want OCR at all, set `OCR_ENGINE` to `None`.
 ## Convert a single file
 ```shell
-marker_single /path/to/file.pdf /path/to/output/folder --batch_multiplier 2 --max_pages 10 --langs English
 ```
 - `--batch_multiplier` is how much to multiply default batch sizes by if you have extra VRAM.  Higher numbers will take more VRAM, but process faster.  Set to 2 by default.  The default batch sizes will take ~3GB of VRAM.
 - `--max_pages` is the maximum number of pages to process.  Omit this to convert the entire document.
-- `--langs` is a comma separated list of the languages in the document, for OCR
-Make sure the `DEFAULT_LANG` setting is set appropriately for your document.  The list of supported languages for OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py).  If you need more languages, you can use any language supported by [Tesseract](https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016) if you set `OCR_ENGINE` to `ocrmypdf`.  If you don't need OCR, marker can work with any language.
 ## Convert multiple files
 ```shell
-marker /path/to/input/folder /path/to/output/folder --workers 10 --max 10 --metadata_file /path/to/metadata.json --min_length 10000
 ```
-- `--workers` is the number of pdfs to convert at once.  This is set to 1 by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage. Parallelism will not increase beyond `INFERENCE_RAM / VRAM_PER_TASK` if you're using GPU.
 - `--max` is the maximum number of pdfs to convert.  Omit this to convert all pdfs in the folder.
 - `--min_length` is the minimum number of characters that need to be extracted from a pdf before it will be considered for processing.  If you're processing a lot of pdfs, I recommend setting this to avoid OCRing pdfs that are mostly images. (slows everything down)
-- `--metadata_file` is an optional path to a json file with metadata about the pdfs.  If you provide it, it will be used to set the language for each pdf.  If not, `DEFAULT_LANG` will be used. The format is:
 ```
 {

 - Inspect the settings in `marker/settings.py`.  You can override any settings with environment variables.
 - Your torch device will be automatically detected, but you can override this.  For example, `TORCH_DEVICE=cuda`.
+- By default, marker will use `surya` for OCR.  Surya is slower on CPU, but more accurate than tesseract.  It also doesn't require you to specify the languages in the document.  If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above).  If you don't want OCR at all, set `OCR_ENGINE` to `None`.
 ## Convert a single file
 ```shell
+marker_single /path/to/file.pdf /path/to/output/folder --batch_multiplier 2 --max_pages 10
 ```
 - `--batch_multiplier` is how much to multiply default batch sizes by if you have extra VRAM.  Higher numbers will take more VRAM, but process faster.  Set to 2 by default.  The default batch sizes will take ~3GB of VRAM.
 - `--max_pages` is the maximum number of pages to process.  Omit this to convert the entire document.
+- `--langs` is an optional comma-separated list of the languages in the document, for OCR.  Optional by default, required if you use tesseract.
+The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py).  If you need more languages, you can use any language supported by [Tesseract](https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016) if you set `OCR_ENGINE` to `ocrmypdf`.  If you don't need OCR, marker can work with any language.
 ## Convert multiple files
 ```shell
+marker /path/to/input/folder /path/to/output/folder --workers 4 --max 10 --min_length 10000
 ```
+- `--workers` is the number of pdfs to convert at once.  This is set to 1 by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage.  Marker will use 5GB of VRAM per worker at the peak, and 3.5GB average.
 - `--max` is the maximum number of pdfs to convert.  Omit this to convert all pdfs in the folder.
 - `--min_length` is the minimum number of characters that need to be extracted from a pdf before it will be considered for processing.  If you're processing a lot of pdfs, I recommend setting this to avoid OCRing pdfs that are mostly images. (slows everything down)
+- `--metadata_file` is an optional path to a json file with metadata about the pdfs.  If you provide it, it will be used to set the language for each pdf.  Setting language is optional for surya (default), but required for tesseract. The format is:
 ```
 {

convert.py CHANGED Viewed

@@ -73,8 +73,8 @@ def main():
     parser.add_argument("--chunk_idx", type=int, default=0, help="Chunk index to convert")
     parser.add_argument("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
     parser.add_argument("--max", type=int, default=None, help="Maximum number of pdfs to convert")
-    parser.add_argument("--workers", type=int, default=5, help="Number of worker processes to use")
-    parser.add_argument("--metadata_file", type=str, default=None, help="Metadata json file to use for filtering")
     parser.add_argument("--min_length", type=int, default=None, help="Minimum length of pdf to convert")
     args = parser.parse_args()
@@ -104,11 +104,6 @@ def main():
     total_processes = min(len(files_to_convert), args.workers)
-    # Dynamically set GPU allocation per task based on GPU ram
-    if settings.CUDA:
-        tasks_per_gpu = settings.INFERENCE_RAM // settings.VRAM_PER_TASK if settings.CUDA else 0
-        total_processes = min(tasks_per_gpu, total_processes)
     try:
         mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work
     except RuntimeError:

     parser.add_argument("--chunk_idx", type=int, default=0, help="Chunk index to convert")
     parser.add_argument("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
     parser.add_argument("--max", type=int, default=None, help="Maximum number of pdfs to convert")
+    parser.add_argument("--workers", type=int, default=5, help="Number of worker processes to use.  Peak VRAM usage per process is 5GB, but avg is closer to 3.5GB.")
+    parser.add_argument("--metadata_file", type=str, default=None, help="Metadata json file to use for languages")
     parser.add_argument("--min_length", type=int, default=None, help="Minimum length of pdf to convert")
     args = parser.parse_args()
     total_processes = min(len(files_to_convert), args.workers)
     try:
         mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work
     except RuntimeError:

convert_single.py CHANGED Viewed

@@ -20,7 +20,7 @@ def main():
     parser.add_argument("output", help="Output base folder path")
     parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
     parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
-    parser.add_argument("--langs", type=str, help="Languages to use for OCR, comma separated", default=None)
     parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
     parser.add_argument("--debug", action="store_true", help="Enable debug logging", default=False)
     args = parser.parse_args()

     parser.add_argument("output", help="Output base folder path")
     parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
     parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
+    parser.add_argument("--langs", type=str, help="Optional languages to use for OCR, comma separated", default=None)
     parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
     parser.add_argument("--debug", action="store_true", help="Enable debug logging", default=False)
     args = parser.parse_args()

marker/convert.py CHANGED Viewed

@@ -43,10 +43,6 @@ def convert_single_pdf(
         langs: Optional[List[str]] = None,
         batch_multiplier: int = 1
 ) -> Tuple[str, Dict[str, Image.Image], Dict]:
-    # Set language needed for OCR
-    if langs is None:
-        langs = [settings.DEFAULT_LANG]
     if metadata:
         langs = metadata.get("languages", langs)

         langs: Optional[List[str]] = None,
         batch_multiplier: int = 1
 ) -> Tuple[str, Dict[str, Image.Image], Dict]:
     if metadata:
         langs = metadata.get("languages", langs)

marker/equations/inference.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from texify.inference import batch_inference
 from marker.settings import settings
 import os
@@ -22,7 +23,7 @@ def get_latex_batched(images, token_counts, texify_model, batch_multiplier=1):
     predictions = [""] * len(images)
     batch_size = get_batch_size() * batch_multiplier
-    for i in range(0, len(images), batch_size):
         # Dynamically set max length to save inference time
         min_idx = i
         max_idx = min(min_idx + batch_size, len(images))

 from texify.inference import batch_inference
+from tqdm import tqdm
 from marker.settings import settings
 import os
     predictions = [""] * len(images)
     batch_size = get_batch_size() * batch_multiplier
+    for i in tqdm(range(0, len(images), batch_size), desc="Recognizing equations"):
         # Dynamically set max length to save inference time
         min_idx = i
         max_idx = min(min_idx + batch_size, len(images))

marker/models.py CHANGED Viewed

@@ -13,11 +13,11 @@ from surya.model.ordering.model import load_model as load_order_model
 from surya.model.ordering.processor import load_processor as load_order_processor
-def setup_recognition_model(langs, device=None, dtype=None):
     if device:
-        rec_model = load_recognition_model(langs=langs, device=device, dtype=dtype)
     else:
-        rec_model = load_recognition_model(langs=langs)
     rec_processor = load_recognition_processor()
     rec_model.processor = rec_processor
     return rec_model
@@ -64,7 +64,7 @@ def setup_order_model(device=None, dtype=None):
     return model
-def load_all_models(langs=None, device=None, dtype=None, force_load_ocr=False):
     if device is not None:
         assert dtype is not None, "Must provide dtype if device is provided"
@@ -75,7 +75,7 @@ def load_all_models(langs=None, device=None, dtype=None, force_load_ocr=False):
     edit = load_editing_model(device, dtype)
     # Only load recognition model if we'll need it for all pdfs
-    ocr = setup_recognition_model(langs, device, dtype)
     texify = setup_texify_model(device, dtype)
     model_lst = [texify, layout, order, edit, detection, ocr]
     return model_lst

 from surya.model.ordering.processor import load_processor as load_order_processor
+def setup_recognition_model(device=None, dtype=None):
     if device:
+        rec_model = load_recognition_model(device=device, dtype=dtype)
     else:
+        rec_model = load_recognition_model()
     rec_processor = load_recognition_processor()
     rec_model.processor = rec_processor
     return rec_model
     return model
+def load_all_models(device=None, dtype=None, force_load_ocr=False):
     if device is not None:
         assert dtype is not None, "Must provide dtype if device is provided"
     edit = load_editing_model(device, dtype)
     # Only load recognition model if we'll need it for all pdfs
+    ocr = setup_recognition_model(device, dtype)
     texify = setup_texify_model(device, dtype)
     model_lst = [texify, layout, order, edit, detection, ocr]
     return model_lst

marker/ocr/lang.py CHANGED Viewed

@@ -15,10 +15,16 @@ def langs_to_ids(langs: List[str]):
 def replace_langs_with_codes(langs):
     if settings.OCR_ENGINE == "surya":
         for i, lang in enumerate(langs):
             if lang.title() in LANGUAGE_TO_CODE:
                 langs[i] = LANGUAGE_TO_CODE[lang.title()]
     else:
         for i, lang in enumerate(langs):
             if lang in LANGUAGE_TO_CODE:
                 langs[i] = LANGUAGE_TO_TESSERACT_CODE[lang]
@@ -27,6 +33,8 @@ def replace_langs_with_codes(langs):
 def validate_langs(langs):
     if settings.OCR_ENGINE == "surya":
         for lang in langs:
             if lang not in CODE_TO_LANGUAGE:
                 raise ValueError(f"Invalid language code {lang} for Surya OCR")

 def replace_langs_with_codes(langs):
     if settings.OCR_ENGINE == "surya":
+        if langs is None:
+            return
         for i, lang in enumerate(langs):
             if lang.title() in LANGUAGE_TO_CODE:
                 langs[i] = LANGUAGE_TO_CODE[lang.title()]
     else:
+        if langs is None:
+            langs = [settings.DEFAULT_LANG]
+            print(f"No languages specified for tesseract, defaulting to {settings.DEFAULT_LANG}.")
         for i, lang in enumerate(langs):
             if lang in LANGUAGE_TO_CODE:
                 langs[i] = LANGUAGE_TO_TESSERACT_CODE[lang]
 def validate_langs(langs):
     if settings.OCR_ENGINE == "surya":
+        if langs is None:
+            return
         for lang in langs:
             if lang not in CODE_TO_LANGUAGE:
                 raise ValueError(f"Invalid language code {lang} for Surya OCR")

marker/settings.py CHANGED Viewed

@@ -27,8 +27,6 @@ class Settings(BaseSettings):
         return "cpu"
-    INFERENCE_RAM: int = 40 # How much VRAM each GPU has (in GB).
-    VRAM_PER_TASK: float = 4.5 # How much VRAM to allocate per task (in GB).  Peak marker VRAM usage is around 5GB, but avg across workers is lower.
     DEFAULT_LANG: str = "English" # Default language we assume files to be in, should be one of the keys in TESSERACT_LANGUAGES
     SUPPORTED_FILETYPES: Dict = {

         return "cpu"
     DEFAULT_LANG: str = "English" # Default language we assume files to be in, should be one of the keys in TESSERACT_LANGUAGES
     SUPPORTED_FILETYPES: Dict = {

marker/tables/cells.py CHANGED Viewed

@@ -1,9 +1,5 @@
-from PIL import Image, ImageDraw
-import copy
 from marker.schema.bbox import rescale_bbox, box_intersection_pct
 from marker.schema.page import Page
-from marker.tables.edges import get_vertical_lines
 import numpy as np
 from sklearn.cluster import DBSCAN
 from marker.settings import settings

 from marker.schema.bbox import rescale_bbox, box_intersection_pct
 from marker.schema.page import Page
 import numpy as np
 from sklearn.cluster import DBSCAN
 from marker.settings import settings

poetry.lock CHANGED Viewed

@@ -3170,13 +3170,13 @@ tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"]
 [[package]]
 name = "surya-ocr"
-version = "0.4.15"
 description = "OCR, layout, reading order, and line detection in 90+ languages"
 optional = false
 python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9"
 files = [
-    {file = "surya_ocr-0.4.15-py3-none-any.whl", hash = "sha256:32f4719e10b2f54dccac21f8b2d65e9fde06d41a2c34a47a8c9243a84a0e3cbb"},
-    {file = "surya_ocr-0.4.15.tar.gz", hash = "sha256:17a37ce1ae9c67afa774efac55df5e07a391b84c3aa5f17133acd7b398fc4dbf"},
 ]
 [package.dependencies]
@@ -3771,4 +3771,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13,!=3.9.7"
-content-hash = "555e4829eaa849c175de3ee87ca3941b737533728b4196c9d224db2ee39e6e85"

 [[package]]
 name = "surya-ocr"
+version = "0.5.0"
 description = "OCR, layout, reading order, and line detection in 90+ languages"
 optional = false
 python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9"
 files = [
+    {file = "surya_ocr-0.5.0-py3-none-any.whl", hash = "sha256:e70516d74f3816c5b2a61bdf8f7eeb5fbd5670514bc5ae2eb0947d33c60c22d3"},
+    {file = "surya_ocr-0.5.0.tar.gz", hash = "sha256:a80740c2b000d9630cf3d5525043c95096efaeb6b0892254ff32339a171e789a"},
 ]
 [package.dependencies]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13,!=3.9.7"
+content-hash = "e1a4a5f18fbc4b7e3108b19910577bf3d09c2053e6bbd4c320c81990e143373d"

pyproject.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "0.2.16"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"
@@ -32,7 +32,7 @@ tabulate = "^0.9.0"
 ftfy = "^6.1.1"
 texify = "^0.1.10"
 rapidfuzz = "^3.8.1"
-surya-ocr = "^0.4.15"
 filetype = "^1.2.0"
 regex = "^2024.4.28"
 pdftext = "^0.3.10"

 [tool.poetry]
 name = "marker-pdf"
+version = "0.2.17"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"
 ftfy = "^6.1.1"
 texify = "^0.1.10"
 rapidfuzz = "^3.8.1"
+surya-ocr = "^0.5.0"
 filetype = "^1.2.0"
 regex = "^2024.4.28"
 pdftext = "^0.3.10"