Vik Paruchuri
committed on
Commit
·
1f06427
1
Parent(s):
817e4ae
Fix pdftext workers config
Browse files
- marker_app.py +3 -3
- marker_server.py +3 -6
- run_marker_app.py +1 -1
marker_app.py
CHANGED
|
@@ -2,7 +2,6 @@ import os
|
|
| 2 |
|
| 3 |
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
| 4 |
os.environ["IN_STREAMLIT"] = "true"
|
| 5 |
-
os.environ["PDFTEXT_CPU_WORKERS"] = "1"
|
| 6 |
|
| 7 |
import base64
|
| 8 |
import io
|
|
@@ -25,8 +24,10 @@ def load_models():
|
|
| 25 |
|
| 26 |
def convert_pdf(fname: str, **kwargs) -> (str, Dict[str, Any], dict):
|
| 27 |
config_parser = ConfigParser(kwargs)
|
|
|
|
|
|
|
| 28 |
converter = PdfConverter(
|
| 29 |
-
config=
|
| 30 |
artifact_dict=model_dict,
|
| 31 |
processor_list=config_parser.get_processors(),
|
| 32 |
renderer=config_parser.get_renderer()
|
|
@@ -51,7 +52,6 @@ def img_to_html(img, img_alt):
|
|
| 51 |
def markdown_insert_images(markdown, images):
|
| 52 |
image_tags = re.findall(r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))', markdown)
|
| 53 |
|
| 54 |
-
print(image_tags)
|
| 55 |
for image in image_tags:
|
| 56 |
image_markdown = image[0]
|
| 57 |
image_alt = image[1]
|
|
|
|
| 2 |
|
| 3 |
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
| 4 |
os.environ["IN_STREAMLIT"] = "true"
|
|
|
|
| 5 |
|
| 6 |
import base64
|
| 7 |
import io
|
|
|
|
| 24 |
|
| 25 |
def convert_pdf(fname: str, **kwargs) -> (str, Dict[str, Any], dict):
|
| 26 |
config_parser = ConfigParser(kwargs)
|
| 27 |
+
config_dict = config_parser.generate_config_dict()
|
| 28 |
+
config_dict["pdftext_workers"] = 1
|
| 29 |
converter = PdfConverter(
|
| 30 |
+
config=config_dict,
|
| 31 |
artifact_dict=model_dict,
|
| 32 |
processor_list=config_parser.get_processors(),
|
| 33 |
renderer=config_parser.get_renderer()
|
|
|
|
| 52 |
def markdown_insert_images(markdown, images):
|
| 53 |
image_tags = re.findall(r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))', markdown)
|
| 54 |
|
|
|
|
| 55 |
for image in image_tags:
|
| 56 |
image_markdown = image[0]
|
| 57 |
image_alt = image[1]
|
marker_server.py
CHANGED
|
@@ -1,10 +1,5 @@
|
|
| 1 |
-
import argparse
|
| 2 |
-
import os
|
| 3 |
-
|
| 4 |
import click
|
| 5 |
|
| 6 |
-
os.environ["PDFTEXT_CPU_WORKERS"] = "1"
|
| 7 |
-
|
| 8 |
import uvicorn
|
| 9 |
from pydantic import BaseModel, Field
|
| 10 |
from starlette.responses import HTMLResponse
|
|
@@ -83,8 +78,10 @@ async def convert_pdf(
|
|
| 83 |
try:
|
| 84 |
options = params.model_dump()
|
| 85 |
config_parser = ConfigParser(options)
|
|
|
|
|
|
|
| 86 |
converter = PdfConverter(
|
| 87 |
-
config=
|
| 88 |
artifact_dict=app_data["models"],
|
| 89 |
processor_list=config_parser.get_processors(),
|
| 90 |
renderer=config_parser.get_renderer()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import click
|
| 2 |
|
|
|
|
|
|
|
| 3 |
import uvicorn
|
| 4 |
from pydantic import BaseModel, Field
|
| 5 |
from starlette.responses import HTMLResponse
|
|
|
|
| 78 |
try:
|
| 79 |
options = params.model_dump()
|
| 80 |
config_parser = ConfigParser(options)
|
| 81 |
+
config_dict = config_parser.generate_config_dict()
|
| 82 |
+
config_dict["pdftext_workers"] = 1
|
| 83 |
converter = PdfConverter(
|
| 84 |
+
config=config_dict,
|
| 85 |
artifact_dict=app_data["models"],
|
| 86 |
processor_list=config_parser.get_processors(),
|
| 87 |
renderer=config_parser.get_renderer()
|
run_marker_app.py
CHANGED
|
@@ -6,7 +6,7 @@ def run():
|
|
| 6 |
cur_dir = os.path.dirname(os.path.abspath(__file__))
|
| 7 |
app_path = os.path.join(cur_dir, "marker_app.py")
|
| 8 |
cmd = ["streamlit", "run", app_path]
|
| 9 |
-
subprocess.run(cmd, env={**os.environ, "IN_STREAMLIT": "true"
|
| 10 |
|
| 11 |
|
| 12 |
if __name__ == "__main__":
|
|
|
|
| 6 |
cur_dir = os.path.dirname(os.path.abspath(__file__))
|
| 7 |
app_path = os.path.join(cur_dir, "marker_app.py")
|
| 8 |
cmd = ["streamlit", "run", app_path]
|
| 9 |
+
subprocess.run(cmd, env={**os.environ, "IN_STREAMLIT": "true"})
|
| 10 |
|
| 11 |
|
| 12 |
if __name__ == "__main__":
|