Merge pull request #213 from VikParuchuri/dev
Browse files
- README.md +1 -1
- convert.py +1 -0
- convert_single.py +2 -1
- marker/convert.py +4 -0
- marker/models.py +4 -0
- poetry.lock +0 -0
- pyproject.toml +3 -3
README.md
CHANGED
|
@@ -50,7 +50,7 @@ There's a hosted API for marker available [here](https://www.datalab.to/):
|
|
| 50 |
|
| 51 |
- Supports PDFs, word documents, and powerpoints
|
| 52 |
- 1/4th the price of leading cloud-based competitors
|
| 53 |
-
-
|
| 54 |
|
| 55 |
# Community
|
| 56 |
|
|
|
|
| 50 |
|
| 51 |
- Supports PDFs, word documents, and powerpoints
|
| 52 |
- 1/4th the price of leading cloud-based competitors
|
| 53 |
+
- Leverages [Modal](https://modal.com/) for high reliability without latency spikes
|
| 54 |
|
| 55 |
# Community
|
| 56 |
|
convert.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
|
|
|
|
| 3 |
os.environ["IN_STREAMLIT"] = "true" # Avoid multiprocessing inside surya
|
| 4 |
os.environ["PDFTEXT_CPU_WORKERS"] = "1" # Avoid multiprocessing inside pdftext
|
| 5 |
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
+
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
|
| 4 |
os.environ["IN_STREAMLIT"] = "true" # Avoid multiprocessing inside surya
|
| 5 |
os.environ["PDFTEXT_CPU_WORKERS"] = "1" # Avoid multiprocessing inside pdftext
|
| 6 |
|
convert_single.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
| 1 |
import pypdfium2 # Needs to be at the top to avoid warnings
|
| 2 |
-
import argparse
|
| 3 |
import os
|
|
|
|
| 4 |
|
|
|
|
| 5 |
from marker.convert import convert_single_pdf
|
| 6 |
from marker.logger import configure_logging
|
| 7 |
from marker.models import load_all_models
|
|
|
|
| 1 |
import pypdfium2 # Needs to be at the top to avoid warnings
|
|
|
|
| 2 |
import os
|
| 3 |
+
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
|
| 4 |
|
| 5 |
+
import argparse
|
| 6 |
from marker.convert import convert_single_pdf
|
| 7 |
from marker.logger import configure_logging
|
| 8 |
from marker.models import load_all_models
|
marker/convert.py
CHANGED
|
@@ -1,6 +1,10 @@
|
|
| 1 |
import warnings
|
| 2 |
warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
|
| 3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import pypdfium2 as pdfium # Needs to be at the top to avoid warnings
|
| 5 |
from PIL import Image
|
| 6 |
|
|
|
|
| 1 |
import warnings
|
| 2 |
warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
|
| 3 |
|
| 4 |
+
import os
|
| 5 |
+
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
|
| 6 |
+
|
| 7 |
+
|
| 8 |
import pypdfium2 as pdfium # Needs to be at the top to avoid warnings
|
| 9 |
from PIL import Image
|
| 10 |
|
marker/models.py
CHANGED
|
@@ -1,3 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from marker.postprocessors.editor import load_editing_model
|
| 2 |
from surya.model.detection import segformer
|
| 3 |
from texify.model.model import load_model as load_texify_model
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
|
| 3 |
+
|
| 4 |
+
|
| 5 |
from marker.postprocessors.editor import load_editing_model
|
| 6 |
from surya.model.detection import segformer
|
| 7 |
from texify.model.model import load_model as load_texify_model
|
poetry.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
-
version = "0.2.
|
| 4 |
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|
|
@@ -30,9 +30,9 @@ torch = "^2.2.2" # Issue with torch 2.3.0 and vision models - https://github.com
|
|
| 30 |
tqdm = "^4.66.1"
|
| 31 |
tabulate = "^0.9.0"
|
| 32 |
ftfy = "^6.1.1"
|
| 33 |
-
texify = "^0.1.
|
| 34 |
rapidfuzz = "^3.8.1"
|
| 35 |
-
surya-ocr = "^0.4.
|
| 36 |
filetype = "^1.2.0"
|
| 37 |
regex = "^2024.4.28"
|
| 38 |
pdftext = "^0.3.10"
|
|
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
+
version = "0.2.15"
|
| 4 |
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|
|
|
|
| 30 |
tqdm = "^4.66.1"
|
| 31 |
tabulate = "^0.9.0"
|
| 32 |
ftfy = "^6.1.1"
|
| 33 |
+
texify = "^0.1.10"
|
| 34 |
rapidfuzz = "^3.8.1"
|
| 35 |
+
surya-ocr = "^0.4.14"
|
| 36 |
filetype = "^1.2.0"
|
| 37 |
regex = "^2024.4.28"
|
| 38 |
pdftext = "^0.3.10"
|