Victor Yang
committed on
Commit
·
d836ba5
1
Parent(s):
883efb2
Add Dockerfile for Hugging Face Space deployment, update README with deployment instructions and API usage, and create requirements.txt for dependencies
Browse files- Dockerfile +31 -0
- README.md +79 -0
- marker/scripts/server.py +28 -2
- requirements.txt +36 -0
Dockerfile
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use PyTorch base image with CUDA support
|
| 2 |
+
FROM pytorch/pytorch:2.9.1-cuda12.6-cudnn9-runtime
|
| 3 |
+
|
| 4 |
+
# Set up a new user named "user" with user ID 1000
|
| 5 |
+
RUN useradd -m -u 1000 user
|
| 6 |
+
|
| 7 |
+
# Switch to the "user" user
|
| 8 |
+
USER user
|
| 9 |
+
|
| 10 |
+
# Set home to the user's home directory
|
| 11 |
+
ENV HOME=/home/user \
|
| 12 |
+
PATH=/home/user/.local/bin:$PATH
|
| 13 |
+
|
| 14 |
+
# Set the working directory to the user's home directory
|
| 15 |
+
WORKDIR $HOME/app
|
| 16 |
+
|
| 17 |
+
# Run pip commands only after switching to the non-root user with `USER user`, to avoid permission issues with Python packages
|
| 18 |
+
RUN pip install --no-cache-dir --upgrade pip
|
| 19 |
+
|
| 20 |
+
# Copy requirements.txt and install dependencies
|
| 21 |
+
COPY --chown=user requirements.txt $HOME/app/
|
| 22 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 23 |
+
|
| 24 |
+
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
| 25 |
+
COPY --chown=user . $HOME/app
|
| 26 |
+
|
| 27 |
+
# Expose port 7860 (Hugging Face Space default port)
|
| 28 |
+
EXPOSE 7860
|
| 29 |
+
|
| 30 |
+
# Set the startup command
|
| 31 |
+
CMD ["uvicorn", "marker.scripts.server:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,3 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Marker
|
| 2 |
|
| 3 |
Marker converts documents to markdown, JSON, chunks, and HTML quickly and accurately.
|
|
@@ -433,6 +442,76 @@ requests.post("http://localhost:8001/marker", data=json.dumps(post_data)).json()
|
|
| 433 |
|
| 434 |
Note that this is not a very robust API, and is only intended for small-scale use. If you want to use this server, but want a more robust conversion option, you can use the hosted [Datalab API](https://www.datalab.to/plans).
|
| 435 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
# Troubleshooting
|
| 437 |
|
| 438 |
There are some settings that you may find useful if things aren't working the way you expect:
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Marker PDF Converter
|
| 3 |
+
emoji: 📄
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
# Marker
|
| 11 |
|
| 12 |
Marker converts documents to markdown, JSON, chunks, and HTML quickly and accurately.
|
|
|
|
| 442 |
|
| 443 |
Note that this is not a very robust API, and is only intended for small-scale use. If you want to use this server, but want a more robust conversion option, you can use the hosted [Datalab API](https://www.datalab.to/plans).
|
| 444 |
|
| 445 |
+
## Deploying to Hugging Face Space
|
| 446 |
+
|
| 447 |
+
This project can be deployed to Hugging Face Spaces as a Docker Space. The API includes token authentication for security.
|
| 448 |
+
|
| 449 |
+
### Setting up Secrets
|
| 450 |
+
|
| 451 |
+
1. Go to your Space Settings on Hugging Face
|
| 452 |
+
2. Navigate to the "Variables and secrets" section
|
| 453 |
+
3. Create a new secret named `API_TOKEN`
|
| 454 |
+
4. Set the secret value to your desired authentication token
|
| 455 |
+
|
| 456 |
+
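The token will be automatically injected as an environment variable at runtime. If `API_TOKEN` is not set, the server does not validate the token at all, so always set it for a public Space.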
|
| 457 |
+
|
| 458 |
+
### Using the API
|
| 459 |
+
|
| 460 |
+
Once deployed, the API requires authentication via Bearer token in the request header:
|
| 461 |
+
|
| 462 |
+
```python
|
| 463 |
+
import requests
|
| 464 |
+
|
| 465 |
+
# Your token (set in Space secrets)
|
| 466 |
+
token = "your-token-here"
|
| 467 |
+
|
| 468 |
+
# Example: Convert PDF via file upload
|
| 469 |
+
url = "https://your-space.hf.space/marker/upload"
|
| 470 |
+
headers = {
|
| 471 |
+
"Authorization": f"Bearer {token}"
|
| 472 |
+
}
|
| 473 |
+
files = {
|
| 474 |
+
"file": ("document.pdf", open("document.pdf", "rb"), "application/pdf")
|
| 475 |
+
}
|
| 476 |
+
data = {
|
| 477 |
+
"output_format": "markdown",
|
| 478 |
+
"force_ocr": False
|
| 479 |
+
}
|
| 480 |
+
|
| 481 |
+
response = requests.post(url, headers=headers, files=files, data=data)
|
| 482 |
+
result = response.json()
|
| 483 |
+
print(result["output"])
|
| 484 |
+
```
|
| 485 |
+
|
| 486 |
+
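Or using the JSON endpoint (`token` is defined as in the previous example, and `filepath` must point to a file that already exists on the server side, i.e. inside the Space's container):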
|
| 487 |
+
|
| 488 |
+
```python
|
| 489 |
+
import requests
|
| 490 |
+
import json
|
| 491 |
+
|
| 492 |
+
url = "https://your-space.hf.space/marker"
|
| 493 |
+
headers = {
|
| 494 |
+
"Authorization": f"Bearer {token}",
|
| 495 |
+
"Content-Type": "application/json"
|
| 496 |
+
}
|
| 497 |
+
data = {
|
| 498 |
+
"filepath": "/path/to/file.pdf",
|
| 499 |
+
"output_format": "markdown"
|
| 500 |
+
}
|
| 501 |
+
|
| 502 |
+
response = requests.post(url, headers=headers, data=json.dumps(data))
|
| 503 |
+
result = response.json()
|
| 504 |
+
```
|
| 505 |
+
|
| 506 |
+
### API Endpoints
|
| 507 |
+
|
| 508 |
+
- `GET /` - API information page (no authentication required)
|
| 509 |
+
- `GET /docs` - Interactive API documentation (no authentication required)
|
| 510 |
+
- `POST /marker` - Convert PDF using file path (requires authentication)
|
| 511 |
+
- `POST /marker/upload` - Convert PDF via file upload (requires authentication)
|
| 512 |
+
|
| 513 |
+
All POST endpoints require the `Authorization: Bearer <token>` header with a valid token.
|
| 514 |
+
|
| 515 |
# Troubleshooting
|
| 516 |
|
| 517 |
There are some settings that you may find useful if things aren't working the way you expect:
|
marker/scripts/server.py
CHANGED
|
@@ -5,6 +5,8 @@ import os
|
|
| 5 |
|
| 6 |
from pydantic import BaseModel, Field
|
| 7 |
from starlette.responses import HTMLResponse
|
|
|
|
|
|
|
| 8 |
|
| 9 |
from marker.config.parser import ConfigParser
|
| 10 |
from marker.output import text_from_rendered
|
|
@@ -14,7 +16,6 @@ from contextlib import asynccontextmanager
|
|
| 14 |
from typing import Optional, Annotated
|
| 15 |
import io
|
| 16 |
|
| 17 |
-
from fastapi import FastAPI, Form, File, UploadFile
|
| 18 |
from marker.converters.pdf import PdfConverter
|
| 19 |
from marker.models import create_model_dict
|
| 20 |
from marker.settings import settings
|
|
@@ -38,6 +39,30 @@ async def lifespan(app: FastAPI):
|
|
| 38 |
|
| 39 |
app = FastAPI(lifespan=lifespan)
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
@app.get("/")
|
| 43 |
async def root():
|
|
@@ -128,7 +153,7 @@ async def _convert_pdf(params: CommonParams):
|
|
| 128 |
|
| 129 |
|
| 130 |
@app.post("/marker")
|
| 131 |
-
async def convert_pdf(params: CommonParams):
|
| 132 |
return await _convert_pdf(params)
|
| 133 |
|
| 134 |
|
|
@@ -141,6 +166,7 @@ async def convert_pdf_upload(
|
|
| 141 |
file: UploadFile = File(
|
| 142 |
..., description="The PDF file to convert.", media_type="application/pdf"
|
| 143 |
),
|
|
|
|
| 144 |
):
|
| 145 |
upload_path = os.path.join(UPLOAD_DIRECTORY, file.filename)
|
| 146 |
with open(upload_path, "wb+") as upload_file:
|
|
|
|
| 5 |
|
| 6 |
from pydantic import BaseModel, Field
|
| 7 |
from starlette.responses import HTMLResponse
|
| 8 |
+
from fastapi import FastAPI, Form, File, UploadFile, Depends, HTTPException, status
|
| 9 |
+
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
| 10 |
|
| 11 |
from marker.config.parser import ConfigParser
|
| 12 |
from marker.output import text_from_rendered
|
|
|
|
| 16 |
from typing import Optional, Annotated
|
| 17 |
import io
|
| 18 |
|
|
|
|
| 19 |
from marker.converters.pdf import PdfConverter
|
| 20 |
from marker.models import create_model_dict
|
| 21 |
from marker.settings import settings
|
|
|
|
| 39 |
|
| 40 |
app = FastAPI(lifespan=lifespan)
|
| 41 |
|
| 42 |
+
# Token verification
# auto_error=False makes HTTPBearer hand us None instead of rejecting requests
# that carry no Authorization header, so verify_token itself decides whether
# authentication is required (it is not when API_TOKEN is unset).
security = HTTPBearer(auto_error=False)


def verify_token(
    credentials: Optional[HTTPAuthorizationCredentials] = Depends(security),
):
    """
    Verify the bearer token from the Authorization header.

    The expected token is read from the API_TOKEN environment variable on
    every call.

    Args:
        credentials: Parsed Authorization header supplied by the HTTPBearer
            dependency, or None when the header is missing or is not a
            Bearer credential.

    Returns:
        True when access is granted.

    Raises:
        HTTPException: 401 when API_TOKEN is set and the supplied token is
            missing or does not match it.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    import secrets

    token = os.environ.get("API_TOKEN")

    if not token:
        # If no token is set in environment, allow access (for local development)
        # In production, you should raise an exception here
        return True

    supplied = credentials.credentials if credentials is not None else ""

    # Constant-time comparison avoids leaking how many leading characters of
    # the token matched; compare bytes so non-ASCII input cannot raise TypeError.
    if not secrets.compare_digest(supplied.encode("utf-8"), token.encode("utf-8")):
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid authentication token",
            headers={"WWW-Authenticate": "Bearer"},
        )

    return True
|
| 65 |
+
|
| 66 |
|
| 67 |
@app.get("/")
|
| 68 |
async def root():
|
|
|
|
| 153 |
|
| 154 |
|
| 155 |
@app.post("/marker")
|
| 156 |
+
async def convert_pdf(params: CommonParams, token_verified: bool = Depends(verify_token)):
|
| 157 |
return await _convert_pdf(params)
|
| 158 |
|
| 159 |
|
|
|
|
| 166 |
file: UploadFile = File(
|
| 167 |
..., description="The PDF file to convert.", media_type="application/pdf"
|
| 168 |
),
|
| 169 |
+
token_verified: bool = Depends(verify_token),
|
| 170 |
):
|
| 171 |
upload_path = os.path.join(UPLOAD_DIRECTORY, file.filename)
|
| 172 |
with open(upload_path, "wb+") as upload_file:
|
requirements.txt
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core dependencies for marker
|
| 2 |
+
# Note: PyTorch 2.9.1 is already included in the base image (pytorch/pytorch:2.9.1-cuda12.6-cudnn9-runtime)
|
| 3 |
+
# So we don't need to install torch separately
|
| 4 |
+
|
| 5 |
+
# FastAPI and server dependencies
|
| 6 |
+
fastapi>=0.115.4
|
| 7 |
+
uvicorn>=0.32.0
|
| 8 |
+
python-multipart>=0.0.16
|
| 9 |
+
|
| 10 |
+
# Marker core dependencies
|
| 11 |
+
Pillow>=10.1.0
|
| 12 |
+
pydantic>=2.4.2
|
| 13 |
+
pydantic-settings>=2.0.3
|
| 14 |
+
transformers>=4.45.2
|
| 15 |
+
python-dotenv>=1.0.0
|
| 16 |
+
tqdm>=4.66.1
|
| 17 |
+
ftfy>=6.1.1
|
| 18 |
+
rapidfuzz>=3.8.1
|
| 19 |
+
surya-ocr>=0.17.0
|
| 20 |
+
regex>=2024.4.28
|
| 21 |
+
pdftext~=0.6.3
|
| 22 |
+
markdownify>=1.1.0
|
| 23 |
+
click>=8.2.0
|
| 24 |
+
markdown2>=2.5.2
|
| 25 |
+
filetype>=1.2.0
|
| 26 |
+
google-genai>=1.0.0
|
| 27 |
+
anthropic>=0.46.0
|
| 28 |
+
scikit-learn>=1.6.1
|
| 29 |
+
openai>=1.65.2
|
| 30 |
+
|
| 31 |
+
# Optional dependencies for full document support
|
| 32 |
+
mammoth>=1.9.0
|
| 33 |
+
openpyxl>=3.1.5
|
| 34 |
+
python-pptx>=1.0.2
|
| 35 |
+
ebooklib>=0.18
|
| 36 |
+
weasyprint>=63.1
|