Vik Paruchuri committed on
Commit
412aa07
·
2 Parent(s): affe15d 088f39c

Merge pull request #213 from VikParuchuri/dev

Browse files
Files changed (7) hide show
  1. README.md +1 -1
  2. convert.py +1 -0
  3. convert_single.py +2 -1
  4. marker/convert.py +4 -0
  5. marker/models.py +4 -0
  6. poetry.lock +0 -0
  7. pyproject.toml +3 -3
README.md CHANGED
@@ -50,7 +50,7 @@ There's a hosted API for marker available [here](https://www.datalab.to/):
50
 
51
  - Supports PDFs, word documents, and powerpoints
52
  - 1/4th the price of leading cloud-based competitors
53
- - Uses [modal](https://modal.com/) for high reliability without latency spikes
54
 
55
  # Community
56
 
 
50
 
51
  - Supports PDFs, word documents, and powerpoints
52
  - 1/4th the price of leading cloud-based competitors
53
+ - Leverages [Modal](https://modal.com/) for high reliability without latency spikes
54
 
55
  # Community
56
 
convert.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
 
 
3
  os.environ["IN_STREAMLIT"] = "true" # Avoid multiprocessing inside surya
4
  os.environ["PDFTEXT_CPU_WORKERS"] = "1" # Avoid multiprocessing inside pdftext
5
 
 
1
  import os
2
 
3
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
4
  os.environ["IN_STREAMLIT"] = "true" # Avoid multiprocessing inside surya
5
  os.environ["PDFTEXT_CPU_WORKERS"] = "1" # Avoid multiprocessing inside pdftext
6
 
convert_single.py CHANGED
@@ -1,7 +1,8 @@
1
  import pypdfium2 # Needs to be at the top to avoid warnings
2
- import argparse
3
  import os
 
4
 
 
5
  from marker.convert import convert_single_pdf
6
  from marker.logger import configure_logging
7
  from marker.models import load_all_models
 
1
  import pypdfium2 # Needs to be at the top to avoid warnings
 
2
  import os
3
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
4
 
5
+ import argparse
6
  from marker.convert import convert_single_pdf
7
  from marker.logger import configure_logging
8
  from marker.models import load_all_models
marker/convert.py CHANGED
@@ -1,6 +1,10 @@
1
  import warnings
2
  warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
3
 
 
 
 
 
4
  import pypdfium2 as pdfium # Needs to be at the top to avoid warnings
5
  from PIL import Image
6
 
 
1
  import warnings
2
  warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
3
 
4
+ import os
5
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
6
+
7
+
8
  import pypdfium2 as pdfium # Needs to be at the top to avoid warnings
9
  from PIL import Image
10
 
marker/models.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  from marker.postprocessors.editor import load_editing_model
2
  from surya.model.detection import segformer
3
  from texify.model.model import load_model as load_texify_model
 
1
+ import os
2
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
3
+
4
+
5
  from marker.postprocessors.editor import load_editing_model
6
  from surya.model.detection import segformer
7
  from texify.model.model import load_model as load_texify_model
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
  [tool.poetry]
2
  name = "marker-pdf"
3
- version = "0.2.14"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
@@ -30,9 +30,9 @@ torch = "^2.2.2" # Issue with torch 2.3.0 and vision models - https://github.com
30
  tqdm = "^4.66.1"
31
  tabulate = "^0.9.0"
32
  ftfy = "^6.1.1"
33
- texify = "^0.1.9"
34
  rapidfuzz = "^3.8.1"
35
- surya-ocr = "^0.4.12"
36
  filetype = "^1.2.0"
37
  regex = "^2024.4.28"
38
  pdftext = "^0.3.10"
 
1
  [tool.poetry]
2
  name = "marker-pdf"
3
+ version = "0.2.15"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
 
30
  tqdm = "^4.66.1"
31
  tabulate = "^0.9.0"
32
  ftfy = "^6.1.1"
33
+ texify = "^0.1.10"
34
  rapidfuzz = "^3.8.1"
35
+ surya-ocr = "^0.4.14"
36
  filetype = "^1.2.0"
37
  regex = "^2024.4.28"
38
  pdftext = "^0.3.10"