Vik Paruchuri
committed on
Commit
·
173a1b8
1
Parent(s):
5d1097f
Fix ray version
Browse files
- convert.py +0 -1
- marker/debug/data.py +1 -2
- marker/settings.py +1 -2
- poetry.lock +28 -29
- pyproject.toml +3 -3
convert.py
CHANGED
|
@@ -87,7 +87,6 @@ def main():
|
|
| 87 |
num_gpus=1 if settings.CUDA else 0,
|
| 88 |
storage=settings.RAY_CACHE_PATH,
|
| 89 |
_temp_dir=settings.RAY_CACHE_PATH,
|
| 90 |
-
dashboard_host=settings.RAY_DASHBOARD_HOST,
|
| 91 |
log_to_driver=settings.DEBUG
|
| 92 |
)
|
| 93 |
|
|
|
|
| 87 |
num_gpus=1 if settings.CUDA else 0,
|
| 88 |
storage=settings.RAY_CACHE_PATH,
|
| 89 |
_temp_dir=settings.RAY_CACHE_PATH,
|
|
|
|
| 90 |
log_to_driver=settings.DEBUG
|
| 91 |
)
|
| 92 |
|
marker/debug/data.py
CHANGED
|
@@ -21,11 +21,10 @@ def dump_equation_debug_data(doc, images, converted_spans):
|
|
| 21 |
assert len(converted_spans) == len(images)
|
| 22 |
|
| 23 |
data_lines = []
|
| 24 |
-
for idx, (image, converted_span) in enumerate(zip(images, converted_spans)):
|
| 25 |
if converted_span is None:
|
| 26 |
continue
|
| 27 |
# Image is a BytesIO object
|
| 28 |
-
pil_image = Image.open(image)
|
| 29 |
img_bytes = io.BytesIO()
|
| 30 |
pil_image.save(img_bytes, format="WEBP", lossless=True)
|
| 31 |
b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
|
|
|
|
| 21 |
assert len(converted_spans) == len(images)
|
| 22 |
|
| 23 |
data_lines = []
|
| 24 |
+
for idx, (pil_image, converted_span) in enumerate(zip(images, converted_spans)):
|
| 25 |
if converted_span is None:
|
| 26 |
continue
|
| 27 |
# Image is a BytesIO object
|
|
|
|
| 28 |
img_bytes = io.BytesIO()
|
| 29 |
pil_image.save(img_bytes, format="WEBP", lossless=True)
|
| 30 |
b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
|
marker/settings.py
CHANGED
|
@@ -78,7 +78,7 @@ class Settings(BaseSettings):
|
|
| 78 |
TEXIFY_MODEL_MAX: int = 384 # Max inference length for texify
|
| 79 |
TEXIFY_TOKEN_BUFFER: int = 256 # Number of tokens to buffer above max for texify
|
| 80 |
TEXIFY_DPI: int = 96 # DPI to render images at
|
| 81 |
-
TEXIFY_BATCH_SIZE: int =
|
| 82 |
TEXIFY_MODEL_NAME: str = "vikp/texify"
|
| 83 |
|
| 84 |
# Layout model
|
|
@@ -102,7 +102,6 @@ class Settings(BaseSettings):
|
|
| 102 |
|
| 103 |
# Ray
|
| 104 |
RAY_CACHE_PATH: Optional[str] = None # Where to save ray cache
|
| 105 |
-
RAY_DASHBOARD_HOST: str = "127.0.0.1"
|
| 106 |
RAY_CORES_PER_WORKER: int = 1 # How many cpu cores to allocate per worker
|
| 107 |
|
| 108 |
# Debug
|
|
|
|
| 78 |
TEXIFY_MODEL_MAX: int = 384 # Max inference length for texify
|
| 79 |
TEXIFY_TOKEN_BUFFER: int = 256 # Number of tokens to buffer above max for texify
|
| 80 |
TEXIFY_DPI: int = 96 # DPI to render images at
|
| 81 |
+
TEXIFY_BATCH_SIZE: int = 2 if TORCH_DEVICE_MODEL == "cpu" else 6 # Batch size for texify, lower on cpu due to float32
|
| 82 |
TEXIFY_MODEL_NAME: str = "vikp/texify"
|
| 83 |
|
| 84 |
# Layout model
|
|
|
|
| 102 |
|
| 103 |
# Ray
|
| 104 |
RAY_CACHE_PATH: Optional[str] = None # Where to save ray cache
|
|
|
|
| 105 |
RAY_CORES_PER_WORKER: int = 1 # How many cpu cores to allocate per worker
|
| 106 |
|
| 107 |
# Debug
|
poetry.lock
CHANGED
|
@@ -3341,31 +3341,31 @@ full = ["numpy"]
|
|
| 3341 |
|
| 3342 |
[[package]]
|
| 3343 |
name = "ray"
|
| 3344 |
-
version = "2.
|
| 3345 |
description = "Ray provides a simple, universal API for building distributed applications."
|
| 3346 |
optional = false
|
| 3347 |
-
python-versions = "
|
| 3348 |
files = [
|
| 3349 |
-
{file = "ray-2.
|
| 3350 |
-
{file = "ray-2.
|
| 3351 |
-
{file = "ray-2.
|
| 3352 |
-
{file = "ray-2.
|
| 3353 |
-
{file = "ray-2.
|
| 3354 |
-
{file = "ray-2.
|
| 3355 |
-
{file = "ray-2.
|
| 3356 |
-
{file = "ray-2.
|
| 3357 |
-
{file = "ray-2.
|
| 3358 |
-
{file = "ray-2.
|
| 3359 |
-
{file = "ray-2.
|
| 3360 |
-
{file = "ray-2.
|
| 3361 |
-
{file = "ray-2.
|
| 3362 |
-
{file = "ray-2.
|
| 3363 |
-
{file = "ray-2.
|
| 3364 |
-
{file = "ray-2.
|
| 3365 |
-
{file = "ray-2.
|
| 3366 |
-
{file = "ray-2.
|
| 3367 |
-
{file = "ray-2.
|
| 3368 |
-
{file = "ray-2.
|
| 3369 |
]
|
| 3370 |
|
| 3371 |
[package.dependencies]
|
|
@@ -3375,23 +3375,22 @@ filelock = "*"
|
|
| 3375 |
frozenlist = "*"
|
| 3376 |
jsonschema = "*"
|
| 3377 |
msgpack = ">=1.0.0,<2.0.0"
|
| 3378 |
-
numpy = {version = ">=1.19.3", markers = "python_version >= \"3.9\""}
|
| 3379 |
packaging = "*"
|
| 3380 |
protobuf = ">=3.15.3,<3.19.5 || >3.19.5"
|
| 3381 |
pyyaml = "*"
|
| 3382 |
requests = "*"
|
| 3383 |
|
| 3384 |
[package.extras]
|
| 3385 |
-
air = ["aiohttp (>=3.7)", "aiohttp-cors", "aiorwlock", "colorful", "fastapi", "fsspec", "gpustat (>=1.0.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "numpy (>=1.20)", "opencensus", "pandas", "pandas (>=1.3)", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pyarrow (>=6.0.1)", "pydantic (<2)", "requests", "smart-open", "starlette", "tensorboardX (>=1.9)", "uvicorn[standard]", "virtualenv (>=20.0.24,<20.21.1)", "watchfiles"]
|
| 3386 |
-
all = ["aiohttp (>=3.7)", "aiohttp-cors", "aiorwlock", "colorful", "dm-tree", "fastapi", "fsspec", "gpustat (>=1.0.0)", "grpcio (!=1.56.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "gymnasium (==0.28.1)", "lz4", "numpy (>=1.20)", "opencensus", "opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk", "pandas", "pandas (>=1.3)", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pyarrow (>=6.0.1)", "pydantic (<2)", "pyyaml", "ray-cpp (==2.
|
| 3387 |
client = ["grpcio (!=1.56.0)"]
|
| 3388 |
-
cpp = ["ray-cpp (==2.
|
| 3389 |
data = ["fsspec", "numpy (>=1.20)", "pandas (>=1.3)", "pyarrow (>=6.0.1)"]
|
| 3390 |
-
default = ["aiohttp (>=3.7)", "aiohttp-cors", "colorful", "gpustat (>=1.0.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "opencensus", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pydantic (<2)", "requests", "smart-open", "virtualenv (>=20.0.24,<20.21.1)"]
|
| 3391 |
observability = ["opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk"]
|
| 3392 |
rllib = ["dm-tree", "fsspec", "gymnasium (==0.28.1)", "lz4", "pandas", "pyarrow (>=6.0.1)", "pyyaml", "requests", "rich", "scikit-image", "scipy", "tensorboardX (>=1.9)", "typer"]
|
| 3393 |
-
serve = ["aiohttp (>=3.7)", "aiohttp-cors", "aiorwlock", "colorful", "fastapi", "gpustat (>=1.0.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "opencensus", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pydantic (<2)", "requests", "smart-open", "starlette", "uvicorn[standard]", "virtualenv (>=20.0.24,<20.21.1)", "watchfiles"]
|
| 3394 |
-
serve-grpc = ["aiohttp (>=3.7)", "aiohttp-cors", "aiorwlock", "colorful", "fastapi", "gpustat (>=1.0.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "opencensus", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pydantic (<2)", "requests", "smart-open", "starlette", "uvicorn[standard]", "virtualenv (>=20.0.24,<20.21.1)", "watchfiles"]
|
| 3395 |
train = ["fsspec", "pandas", "pyarrow (>=6.0.1)", "requests", "tensorboardX (>=1.9)"]
|
| 3396 |
tune = ["fsspec", "pandas", "pyarrow (>=6.0.1)", "requests", "tensorboardX (>=1.9)"]
|
| 3397 |
|
|
|
|
| 3341 |
|
| 3342 |
[[package]]
|
| 3343 |
name = "ray"
|
| 3344 |
+
version = "2.9.0"
|
| 3345 |
description = "Ray provides a simple, universal API for building distributed applications."
|
| 3346 |
optional = false
|
| 3347 |
+
python-versions = ">=3.8"
|
| 3348 |
files = [
|
| 3349 |
+
{file = "ray-2.9.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:eca277062646ef4ce87ffe249a0a816dba0b80c5720708c9973dcb6c17527fa1"},
|
| 3350 |
+
{file = "ray-2.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:15e075f647b52ec210538985b4cb2665f64fb76acab77f66f1893653964db64e"},
|
| 3351 |
+
{file = "ray-2.9.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:ef8ba4d6126d8aacfc611b967a23e3e9571edf010756277991e8de9af56bd0ee"},
|
| 3352 |
+
{file = "ray-2.9.0-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:bb79596c449c4ba027bc9839299617d8c876b1a5b61f16a1e401aa901ad45183"},
|
| 3353 |
+
{file = "ray-2.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:724ff0103919fb98181010cfbcd0d52a1b78b0dc84cbfd6e7ea0094b74e90a26"},
|
| 3354 |
+
{file = "ray-2.9.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:47d9d949e362112213bc53631b08183d1fe254d66d58131377cee913e5891597"},
|
| 3355 |
+
{file = "ray-2.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b2211c39bae3f415e32fe9fe23f67acfea4cff80fc37fb794a5767497ac8f2b7"},
|
| 3356 |
+
{file = "ray-2.9.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:1751d9672208b7142b9dbc6de9766ffc92e1a7fe522ca45bcc88bbf88ca5d202"},
|
| 3357 |
+
{file = "ray-2.9.0-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:13c555fe730fce355726e8dae7a7d6cedbe470a7e125748008ebfc44b0c5827d"},
|
| 3358 |
+
{file = "ray-2.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:1dcf0b476f97bd552531279bb8a1c0b677001433e522cc0f33ffe29c920ed693"},
|
| 3359 |
+
{file = "ray-2.9.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:585aa849afb1cadc0933dc5d251bb8fffe87b7b87b312ca66065b058e2fc2821"},
|
| 3360 |
+
{file = "ray-2.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b4108832754156cbf296402c5e44ad23758ac190ef923ff91036dbddde6a2d3d"},
|
| 3361 |
+
{file = "ray-2.9.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:06f34afc29fd392361435aa5425630d3851824e923263607cb0a5404083a23f9"},
|
| 3362 |
+
{file = "ray-2.9.0-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:d6f2335a1d7724143e2732e7c4761ee9b572ec924445515808b0951f362a4dbf"},
|
| 3363 |
+
{file = "ray-2.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:93372482171c69e5543aae4cb739bcbe671d5c7d498c0ce761c23813e0f35b84"},
|
| 3364 |
+
{file = "ray-2.9.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:013984b5d76b3ce63ab4616a5e57b4545524003d8b3df27df90007545cc6e364"},
|
| 3365 |
+
{file = "ray-2.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f245d0a45a32e67e1279bffc02b33ebe73fedd679c00f6b1623681275aa3f488"},
|
| 3366 |
+
{file = "ray-2.9.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:e54cef078e75718a56fe65d4b5be14e7193fc0743c6dba3e6d78ad1284e13556"},
|
| 3367 |
+
{file = "ray-2.9.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:dabba731106e3a5f0093d2eeae21c822db1f01768e7806eb4f39f06db94eec12"},
|
| 3368 |
+
{file = "ray-2.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:8de5efb388d503bb35d92f1570b8456cf3f2d01e856a9003814164356d2d75e7"},
|
| 3369 |
]
|
| 3370 |
|
| 3371 |
[package.dependencies]
|
|
|
|
| 3375 |
frozenlist = "*"
|
| 3376 |
jsonschema = "*"
|
| 3377 |
msgpack = ">=1.0.0,<2.0.0"
|
|
|
|
| 3378 |
packaging = "*"
|
| 3379 |
protobuf = ">=3.15.3,<3.19.5 || >3.19.5"
|
| 3380 |
pyyaml = "*"
|
| 3381 |
requests = "*"
|
| 3382 |
|
| 3383 |
[package.extras]
|
| 3384 |
+
air = ["aiohttp (>=3.7)", "aiohttp-cors", "aiorwlock", "colorful", "fastapi", "fsspec", "gpustat (>=1.0.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "numpy (>=1.20)", "opencensus", "pandas", "pandas (>=1.3)", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pyarrow (>=6.0.1)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart-open", "starlette", "tensorboardX (>=1.9)", "uvicorn[standard]", "virtualenv (>=20.0.24,<20.21.1)", "watchfiles"]
|
| 3385 |
+
all = ["aiohttp (>=3.7)", "aiohttp-cors", "aiorwlock", "colorful", "dm-tree", "fastapi", "fsspec", "gpustat (>=1.0.0)", "grpcio (!=1.56.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "gymnasium (==0.28.1)", "lz4", "numpy (>=1.20)", "opencensus", "opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk", "pandas", "pandas (>=1.3)", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pyarrow (>=6.0.1)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "pyyaml", "ray-cpp (==2.9.0)", "requests", "rich", "scikit-image", "scipy", "smart-open", "starlette", "tensorboardX (>=1.9)", "typer", "uvicorn[standard]", "virtualenv (>=20.0.24,<20.21.1)", "watchfiles"]
|
| 3386 |
client = ["grpcio (!=1.56.0)"]
|
| 3387 |
+
cpp = ["ray-cpp (==2.9.0)"]
|
| 3388 |
data = ["fsspec", "numpy (>=1.20)", "pandas (>=1.3)", "pyarrow (>=6.0.1)"]
|
| 3389 |
+
default = ["aiohttp (>=3.7)", "aiohttp-cors", "colorful", "gpustat (>=1.0.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "opencensus", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart-open", "virtualenv (>=20.0.24,<20.21.1)"]
|
| 3390 |
observability = ["opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk"]
|
| 3391 |
rllib = ["dm-tree", "fsspec", "gymnasium (==0.28.1)", "lz4", "pandas", "pyarrow (>=6.0.1)", "pyyaml", "requests", "rich", "scikit-image", "scipy", "tensorboardX (>=1.9)", "typer"]
|
| 3392 |
+
serve = ["aiohttp (>=3.7)", "aiohttp-cors", "aiorwlock", "colorful", "fastapi", "gpustat (>=1.0.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "opencensus", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart-open", "starlette", "uvicorn[standard]", "virtualenv (>=20.0.24,<20.21.1)", "watchfiles"]
|
| 3393 |
+
serve-grpc = ["aiohttp (>=3.7)", "aiohttp-cors", "aiorwlock", "colorful", "fastapi", "gpustat (>=1.0.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "opencensus", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart-open", "starlette", "uvicorn[standard]", "virtualenv (>=20.0.24,<20.21.1)", "watchfiles"]
|
| 3394 |
train = ["fsspec", "pandas", "pyarrow (>=6.0.1)", "requests", "tensorboardX (>=1.9)"]
|
| 3395 |
tune = ["fsspec", "pandas", "pyarrow (>=6.0.1)", "requests", "tensorboardX (>=1.9)"]
|
| 3396 |
|
pyproject.toml
CHANGED
|
@@ -27,11 +27,11 @@ PyMuPDF = "^1.23.5"
|
|
| 27 |
pymupdf-fonts = "^1.0.5"
|
| 28 |
pydantic = "^2.4.2"
|
| 29 |
pydantic-settings = "^2.0.3"
|
| 30 |
-
transformers = "^4.
|
| 31 |
numpy = "^1.26.1"
|
| 32 |
python-dotenv = "^1.0.0"
|
| 33 |
-
torch = "^2.1.
|
| 34 |
-
ray = "^2.
|
| 35 |
tqdm = "^4.66.1"
|
| 36 |
tabulate = "^0.9.0"
|
| 37 |
thefuzz = "^0.20.0"
|
|
|
|
| 27 |
pymupdf-fonts = "^1.0.5"
|
| 28 |
pydantic = "^2.4.2"
|
| 29 |
pydantic-settings = "^2.0.3"
|
| 30 |
+
transformers = "^4.36.2"
|
| 31 |
numpy = "^1.26.1"
|
| 32 |
python-dotenv = "^1.0.0"
|
| 33 |
+
torch = "^2.1.2"
|
| 34 |
+
ray = "^2.9.0"
|
| 35 |
tqdm = "^4.66.1"
|
| 36 |
tabulate = "^0.9.0"
|
| 37 |
thefuzz = "^0.20.0"
|