Merge in master
Browse files
- chunk_convert.py +1 -1
- convert.py +1 -1
- convert_single.py +1 -1
- marker/scripts/__init__.py +0 -5
- marker/scripts/server.py +1 -1
- marker/scripts/streamlit_app.py +4 -7
- marker_app.py +1 -1
- marker_server.py +1 -1
- poetry.lock +0 -0
- pyproject.toml +3 -3
- signatures/version1/cla.json +32 -0
chunk_convert.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from marker.scripts import chunk_convert_cli
|
| 2 |
|
| 3 |
if __name__ == "__main__":
|
| 4 |
chunk_convert_cli()
|
|
|
|
| 1 |
+
from marker.scripts.chunk_convert import chunk_convert_cli
|
| 2 |
|
| 3 |
if __name__ == "__main__":
|
| 4 |
chunk_convert_cli()
|
convert.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from marker.scripts import convert_cli
|
| 2 |
|
| 3 |
if __name__ == "__main__":
|
| 4 |
convert_cli()
|
|
|
|
| 1 |
+
from marker.scripts.convert import convert_cli
|
| 2 |
|
| 3 |
if __name__ == "__main__":
|
| 4 |
convert_cli()
|
convert_single.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from marker.scripts import convert_single_cli
|
| 2 |
|
| 3 |
if __name__ == "__main__":
|
| 4 |
convert_single_cli()
|
|
|
|
| 1 |
+
from marker.scripts.convert_single import convert_single_cli
|
| 2 |
|
| 3 |
if __name__ == "__main__":
|
| 4 |
convert_single_cli()
|
marker/scripts/__init__.py
CHANGED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
from marker.scripts.convert_single import convert_single_cli
|
| 2 |
-
from marker.scripts.convert import convert_cli
|
| 3 |
-
from marker.scripts.server import server_cli
|
| 4 |
-
from marker.scripts.run_streamlit_app import streamlit_app_cli
|
| 5 |
-
from marker.scripts.chunk_convert import chunk_convert_cli
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/scripts/server.py
CHANGED
|
@@ -3,7 +3,6 @@ import traceback
|
|
| 3 |
import click
|
| 4 |
import os
|
| 5 |
|
| 6 |
-
import uvicorn
|
| 7 |
from pydantic import BaseModel, Field
|
| 8 |
from starlette.responses import HTMLResponse
|
| 9 |
|
|
@@ -163,6 +162,7 @@ async def convert_pdf_upload(
|
|
| 163 |
@click.option("--port", type=int, default=8000, help="Port to run the server on")
|
| 164 |
@click.option("--host", type=str, default="127.0.0.1", help="Host to run the server on")
|
| 165 |
def server_cli(port: int, host: str):
|
|
|
|
| 166 |
# Run the server
|
| 167 |
uvicorn.run(
|
| 168 |
app,
|
|
|
|
| 3 |
import click
|
| 4 |
import os
|
| 5 |
|
|
|
|
| 6 |
from pydantic import BaseModel, Field
|
| 7 |
from starlette.responses import HTMLResponse
|
| 8 |
|
|
|
|
| 162 |
@click.option("--port", type=int, default=8000, help="Port to run the server on")
|
| 163 |
@click.option("--host", type=str, default="127.0.0.1", help="Host to run the server on")
|
| 164 |
def server_cli(port: int, host: str):
|
| 165 |
+
import uvicorn
|
| 166 |
# Run the server
|
| 167 |
uvicorn.run(
|
| 168 |
app,
|
marker/scripts/streamlit_app.py
CHANGED
|
@@ -68,15 +68,12 @@ def markdown_insert_images(markdown, images):
|
|
| 68 |
def get_page_image(pdf_file, page_num, dpi=96):
|
| 69 |
if "pdf" in pdf_file.type:
|
| 70 |
doc = open_pdf(pdf_file)
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
page_indices=[page_num],
|
| 74 |
scale=dpi / 72,
|
| 75 |
-
)
|
| 76 |
-
png = list(renderer)[0]
|
| 77 |
-
png_image = png.convert("RGB")
|
| 78 |
else:
|
| 79 |
-
png_image = Image.open(
|
| 80 |
return png_image
|
| 81 |
|
| 82 |
|
|
|
|
| 68 |
def get_page_image(pdf_file, page_num, dpi=96):
|
| 69 |
if "pdf" in pdf_file.type:
|
| 70 |
doc = open_pdf(pdf_file)
|
| 71 |
+
page = doc[page_num]
|
| 72 |
+
png_image = page.render(
|
|
|
|
| 73 |
scale=dpi / 72,
|
| 74 |
+
).to_pil().convert("RGB")
|
|
|
|
|
|
|
| 75 |
else:
|
| 76 |
+
png_image = Image.open(pdf_file).convert("RGB")
|
| 77 |
return png_image
|
| 78 |
|
| 79 |
|
marker_app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from marker.scripts import streamlit_app_cli
|
| 2 |
|
| 3 |
if __name__ == "__main__":
|
| 4 |
streamlit_app_cli()
|
|
|
|
| 1 |
+
from marker.scripts.run_streamlit_app import streamlit_app_cli
|
| 2 |
|
| 3 |
if __name__ == "__main__":
|
| 4 |
streamlit_app_cli()
|
marker_server.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from marker.scripts import server_cli
|
| 2 |
|
| 3 |
if __name__ == "__main__":
|
| 4 |
server_cli()
|
|
|
|
| 1 |
+
from marker.scripts.server import server_cli
|
| 2 |
|
| 3 |
if __name__ == "__main__":
|
| 4 |
server_cli()
|
poetry.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
CHANGED
|
@@ -23,13 +23,12 @@ transformers = "^4.45.2"
|
|
| 23 |
python-dotenv = "^1.0.0"
|
| 24 |
torch = "^2.5.1"
|
| 25 |
tqdm = "^4.66.1"
|
| 26 |
-
tabulate = "^0.9.0"
|
| 27 |
ftfy = "^6.1.1"
|
| 28 |
texify = "^0.2.1"
|
| 29 |
rapidfuzz = "^3.8.1"
|
| 30 |
-
surya-ocr = "~0.
|
| 31 |
regex = "^2024.4.28"
|
| 32 |
-
pdftext = "~0.
|
| 33 |
markdownify = "^0.13.1"
|
| 34 |
click = "^8.1.7"
|
| 35 |
google-generativeai = "^0.8.3"
|
|
@@ -49,6 +48,7 @@ pytest-mock = "^3.14.0"
|
|
| 49 |
apted = "1.0.3"
|
| 50 |
distance = "0.1.3"
|
| 51 |
lxml = "5.3.0"
|
|
|
|
| 52 |
|
| 53 |
[tool.poetry.scripts]
|
| 54 |
marker = "marker.scripts.convert:convert_cli"
|
|
|
|
| 23 |
python-dotenv = "^1.0.0"
|
| 24 |
torch = "^2.5.1"
|
| 25 |
tqdm = "^4.66.1"
|
|
|
|
| 26 |
ftfy = "^6.1.1"
|
| 27 |
texify = "^0.2.1"
|
| 28 |
rapidfuzz = "^3.8.1"
|
| 29 |
+
surya-ocr = "~0.9.0"
|
| 30 |
regex = "^2024.4.28"
|
| 31 |
+
pdftext = "~0.5.0"
|
| 32 |
markdownify = "^0.13.1"
|
| 33 |
click = "^8.1.7"
|
| 34 |
google-generativeai = "^0.8.3"
|
|
|
|
| 48 |
apted = "1.0.3"
|
| 49 |
distance = "0.1.3"
|
| 50 |
lxml = "5.3.0"
|
| 51 |
+
tabulate = "^0.9.0"
|
| 52 |
|
| 53 |
[tool.poetry.scripts]
|
| 54 |
marker = "marker.scripts.convert:convert_cli"
|
signatures/version1/cla.json
CHANGED
|
@@ -111,6 +111,38 @@
|
|
| 111 |
"created_at": "2024-12-05T13:13:34Z",
|
| 112 |
"repoId": 712111618,
|
| 113 |
"pullRequestNo": 416
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
}
|
| 115 |
]
|
| 116 |
}
|
|
|
|
| 111 |
"created_at": "2024-12-05T13:13:34Z",
|
| 112 |
"repoId": 712111618,
|
| 113 |
"pullRequestNo": 416
|
| 114 |
+
},
|
| 115 |
+
{
|
| 116 |
+
"name": "tarun-menta",
|
| 117 |
+
"id": 66506307,
|
| 118 |
+
"comment_id": 2543907406,
|
| 119 |
+
"created_at": "2024-12-15T15:06:32Z",
|
| 120 |
+
"repoId": 712111618,
|
| 121 |
+
"pullRequestNo": 427
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"name": "ZeyuTeng96",
|
| 125 |
+
"id": 96521059,
|
| 126 |
+
"comment_id": 2567236036,
|
| 127 |
+
"created_at": "2025-01-02T02:36:02Z",
|
| 128 |
+
"repoId": 712111618,
|
| 129 |
+
"pullRequestNo": 452
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"name": "xiaoyao9184",
|
| 133 |
+
"id": 6614349,
|
| 134 |
+
"comment_id": 2571623521,
|
| 135 |
+
"created_at": "2025-01-05T13:15:34Z",
|
| 136 |
+
"repoId": 712111618,
|
| 137 |
+
"pullRequestNo": 463
|
| 138 |
+
},
|
| 139 |
+
{
|
| 140 |
+
"name": "yasyf",
|
| 141 |
+
"id": 709645,
|
| 142 |
+
"comment_id": 2571679069,
|
| 143 |
+
"created_at": "2025-01-05T16:23:12Z",
|
| 144 |
+
"repoId": 712111618,
|
| 145 |
+
"pullRequestNo": 464
|
| 146 |
}
|
| 147 |
]
|
| 148 |
}
|