Vik Paruchuri
commited on
Commit
·
692dda6
1
Parent(s):
d7ecc8c
Python 3.13 support
Browse files- README.md +1 -1
- benchmarks/overall.py +1 -1
- poetry.lock +105 -121
- pyproject.toml +1 -1
- scripts/verify_benchmark_scores.py +1 -1
README.md
CHANGED
|
@@ -244,7 +244,7 @@ poetry install
|
|
| 244 |
Download the benchmark data [here](https://drive.google.com/file/d/1ZSeWDo2g1y0BRLT7KnbmytV2bjWARWba/view?usp=sharing) and unzip. Then run the overall benchmark like this:
|
| 245 |
|
| 246 |
```shell
|
| 247 |
-
python
|
| 248 |
```
|
| 249 |
|
| 250 |
This will benchmark marker against other text extraction methods. It sets up batch sizes for nougat and marker to use a similar amount of GPU RAM for each.
|
|
|
|
| 244 |
Download the benchmark data [here](https://drive.google.com/file/d/1ZSeWDo2g1y0BRLT7KnbmytV2bjWARWba/view?usp=sharing) and unzip. Then run the overall benchmark like this:
|
| 245 |
|
| 246 |
```shell
|
| 247 |
+
python benchmarks/overall.py data/pdfs data/references report.json --nougat
|
| 248 |
```
|
| 249 |
|
| 250 |
This will benchmark marker against other text extraction methods. It sets up batch sizes for nougat and marker to use a similar amount of GPU RAM for each.
|
benchmarks/overall.py
CHANGED
|
@@ -48,7 +48,7 @@ def nougat_prediction(pdf_filename, batch_size=1):
|
|
| 48 |
|
| 49 |
|
| 50 |
def main():
|
| 51 |
-
parser = argparse.ArgumentParser(description="Benchmark PDF to MD conversion. Needs source pdfs, and a
|
| 52 |
parser.add_argument("in_folder", help="Input PDF files")
|
| 53 |
parser.add_argument("reference_folder", help="Reference folder with reference markdown files")
|
| 54 |
parser.add_argument("out_file", help="Output filename")
|
|
|
|
| 48 |
|
| 49 |
|
| 50 |
def main():
|
| 51 |
+
parser = argparse.ArgumentParser(description="Benchmark PDF to MD conversion. Needs source pdfs, and a reference folder with the correct markdown.")
|
| 52 |
parser.add_argument("in_folder", help="Input PDF files")
|
| 53 |
parser.add_argument("reference_folder", help="Reference folder with reference markdown files")
|
| 54 |
parser.add_argument("out_file", help="Output filename")
|
poetry.lock
CHANGED
|
@@ -1935,13 +1935,13 @@ dill = ">=0.3.8"
|
|
| 1935 |
|
| 1936 |
[[package]]
|
| 1937 |
name = "narwhals"
|
| 1938 |
-
version = "1.9.
|
| 1939 |
description = "Extremely lightweight compatibility layer between dataframe libraries"
|
| 1940 |
optional = false
|
| 1941 |
python-versions = ">=3.8"
|
| 1942 |
files = [
|
| 1943 |
-
{file = "narwhals-1.9.
|
| 1944 |
-
{file = "narwhals-1.9.
|
| 1945 |
]
|
| 1946 |
|
| 1947 |
[package.extras]
|
|
@@ -2510,13 +2510,13 @@ testing = ["docopt", "pytest"]
|
|
| 2510 |
|
| 2511 |
[[package]]
|
| 2512 |
name = "pdftext"
|
| 2513 |
-
version = "0.3.
|
| 2514 |
description = "Extract structured text from pdfs quickly"
|
| 2515 |
optional = false
|
| 2516 |
-
python-versions = "
|
| 2517 |
files = [
|
| 2518 |
-
{file = "pdftext-0.3.
|
| 2519 |
-
{file = "pdftext-0.3.
|
| 2520 |
]
|
| 2521 |
|
| 2522 |
[package.dependencies]
|
|
@@ -3049,13 +3049,13 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
|
|
| 3049 |
|
| 3050 |
[[package]]
|
| 3051 |
name = "pydantic-settings"
|
| 3052 |
-
version = "2.
|
| 3053 |
description = "Settings management using Pydantic"
|
| 3054 |
optional = false
|
| 3055 |
python-versions = ">=3.8"
|
| 3056 |
files = [
|
| 3057 |
-
{file = "pydantic_settings-2.
|
| 3058 |
-
{file = "pydantic_settings-2.
|
| 3059 |
]
|
| 3060 |
|
| 3061 |
[package.dependencies]
|
|
@@ -3215,17 +3215,17 @@ files = [
|
|
| 3215 |
|
| 3216 |
[[package]]
|
| 3217 |
name = "pywinpty"
|
| 3218 |
-
version = "2.0.
|
| 3219 |
description = "Pseudo terminal support for Windows from Python."
|
| 3220 |
optional = false
|
| 3221 |
python-versions = ">=3.8"
|
| 3222 |
files = [
|
| 3223 |
-
{file = "pywinpty-2.0.
|
| 3224 |
-
{file = "pywinpty-2.0.
|
| 3225 |
-
{file = "pywinpty-2.0.
|
| 3226 |
-
{file = "pywinpty-2.0.
|
| 3227 |
-
{file = "pywinpty-2.0.
|
| 3228 |
-
{file = "pywinpty-2.0.
|
| 3229 |
]
|
| 3230 |
|
| 3231 |
[[package]]
|
|
@@ -4056,13 +4056,13 @@ win32 = ["pywin32"]
|
|
| 4056 |
|
| 4057 |
[[package]]
|
| 4058 |
name = "setuptools"
|
| 4059 |
-
version = "75.
|
| 4060 |
description = "Easily download, build, install, upgrade, and uninstall Python packages"
|
| 4061 |
optional = false
|
| 4062 |
python-versions = ">=3.8"
|
| 4063 |
files = [
|
| 4064 |
-
{file = "setuptools-75.
|
| 4065 |
-
{file = "setuptools-75.
|
| 4066 |
]
|
| 4067 |
|
| 4068 |
[package.extras]
|
|
@@ -4978,109 +4978,93 @@ files = [
|
|
| 4978 |
|
| 4979 |
[[package]]
|
| 4980 |
name = "yarl"
|
| 4981 |
-
version = "1.15.
|
| 4982 |
description = "Yet another URL library"
|
| 4983 |
optional = false
|
| 4984 |
-
python-versions = ">=3.
|
| 4985 |
files = [
|
| 4986 |
-
{file = "yarl-1.15.
|
| 4987 |
-
{file = "yarl-1.15.
|
| 4988 |
-
{file = "yarl-1.15.
|
| 4989 |
-
{file = "yarl-1.15.
|
| 4990 |
-
{file = "yarl-1.15.
|
| 4991 |
-
{file = "yarl-1.15.
|
| 4992 |
-
{file = "yarl-1.15.
|
| 4993 |
-
{file = "yarl-1.15.
|
| 4994 |
-
{file = "yarl-1.15.
|
| 4995 |
-
{file = "yarl-1.15.
|
| 4996 |
-
{file = "yarl-1.15.
|
| 4997 |
-
{file = "yarl-1.15.
|
| 4998 |
-
{file = "yarl-1.15.
|
| 4999 |
-
{file = "yarl-1.15.
|
| 5000 |
-
{file = "yarl-1.15.
|
| 5001 |
-
{file = "yarl-1.15.
|
| 5002 |
-
{file = "yarl-1.15.
|
| 5003 |
-
{file = "yarl-1.15.
|
| 5004 |
-
{file = "yarl-1.15.
|
| 5005 |
-
{file = "yarl-1.15.
|
| 5006 |
-
{file = "yarl-1.15.
|
| 5007 |
-
{file = "yarl-1.15.
|
| 5008 |
-
{file = "yarl-1.15.
|
| 5009 |
-
{file = "yarl-1.15.
|
| 5010 |
-
{file = "yarl-1.15.
|
| 5011 |
-
{file = "yarl-1.15.
|
| 5012 |
-
{file = "yarl-1.15.
|
| 5013 |
-
{file = "yarl-1.15.
|
| 5014 |
-
{file = "yarl-1.15.
|
| 5015 |
-
{file = "yarl-1.15.
|
| 5016 |
-
{file = "yarl-1.15.
|
| 5017 |
-
{file = "yarl-1.15.
|
| 5018 |
-
{file = "yarl-1.15.
|
| 5019 |
-
{file = "yarl-1.15.
|
| 5020 |
-
{file = "yarl-1.15.
|
| 5021 |
-
{file = "yarl-1.15.
|
| 5022 |
-
{file = "yarl-1.15.
|
| 5023 |
-
{file = "yarl-1.15.
|
| 5024 |
-
{file = "yarl-1.15.
|
| 5025 |
-
{file = "yarl-1.15.
|
| 5026 |
-
{file = "yarl-1.15.
|
| 5027 |
-
{file = "yarl-1.15.
|
| 5028 |
-
{file = "yarl-1.15.
|
| 5029 |
-
{file = "yarl-1.15.
|
| 5030 |
-
{file = "yarl-1.15.
|
| 5031 |
-
{file = "yarl-1.15.
|
| 5032 |
-
{file = "yarl-1.15.
|
| 5033 |
-
{file = "yarl-1.15.
|
| 5034 |
-
{file = "yarl-1.15.
|
| 5035 |
-
{file = "yarl-1.15.
|
| 5036 |
-
{file = "yarl-1.15.
|
| 5037 |
-
{file = "yarl-1.15.
|
| 5038 |
-
{file = "yarl-1.15.
|
| 5039 |
-
{file = "yarl-1.15.
|
| 5040 |
-
{file = "yarl-1.15.
|
| 5041 |
-
{file = "yarl-1.15.
|
| 5042 |
-
{file = "yarl-1.15.
|
| 5043 |
-
{file = "yarl-1.15.
|
| 5044 |
-
{file = "yarl-1.15.
|
| 5045 |
-
{file = "yarl-1.15.
|
| 5046 |
-
{file = "yarl-1.15.
|
| 5047 |
-
{file = "yarl-1.15.
|
| 5048 |
-
{file = "yarl-1.15.
|
| 5049 |
-
{file = "yarl-1.15.
|
| 5050 |
-
{file = "yarl-1.15.
|
| 5051 |
-
{file = "yarl-1.15.
|
| 5052 |
-
{file = "yarl-1.15.
|
| 5053 |
-
{file = "yarl-1.15.
|
| 5054 |
-
{file = "yarl-1.15.
|
| 5055 |
-
{file = "yarl-1.15.
|
| 5056 |
-
{file = "yarl-1.15.
|
| 5057 |
-
{file = "yarl-1.15.
|
| 5058 |
-
{file = "yarl-1.15.
|
| 5059 |
-
{file = "yarl-1.15.
|
| 5060 |
-
{file = "yarl-1.15.
|
| 5061 |
-
{file = "yarl-1.15.
|
| 5062 |
-
{file = "yarl-1.15.
|
| 5063 |
-
{file = "yarl-1.15.
|
| 5064 |
-
{file = "yarl-1.15.
|
| 5065 |
-
{file = "yarl-1.15.
|
| 5066 |
-
{file = "yarl-1.15.
|
| 5067 |
-
{file = "yarl-1.15.
|
| 5068 |
-
{file = "yarl-1.15.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:bed1b5dbf90bad3bfc19439258c97873eab453c71d8b6869c136346acfe497e7"},
|
| 5069 |
-
{file = "yarl-1.15.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed20a4bdc635f36cb19e630bfc644181dd075839b6fc84cac51c0f381ac472e2"},
|
| 5070 |
-
{file = "yarl-1.15.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d538df442c0d9665664ab6dd5fccd0110fa3b364914f9c85b3ef9b7b2e157980"},
|
| 5071 |
-
{file = "yarl-1.15.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c6cf1d92edf936ceedc7afa61b07e9d78a27b15244aa46bbcd534c7458ee1b"},
|
| 5072 |
-
{file = "yarl-1.15.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce44217ad99ffad8027d2fde0269ae368c86db66ea0571c62a000798d69401fb"},
|
| 5073 |
-
{file = "yarl-1.15.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b47a6000a7e833ebfe5886b56a31cb2ff12120b1efd4578a6fcc38df16cc77bd"},
|
| 5074 |
-
{file = "yarl-1.15.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:e52f77a0cd246086afde8815039f3e16f8d2be51786c0a39b57104c563c5cbb0"},
|
| 5075 |
-
{file = "yarl-1.15.2-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:f9ca0e6ce7774dc7830dc0cc4bb6b3eec769db667f230e7c770a628c1aa5681b"},
|
| 5076 |
-
{file = "yarl-1.15.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:136f9db0f53c0206db38b8cd0c985c78ded5fd596c9a86ce5c0b92afb91c3a19"},
|
| 5077 |
-
{file = "yarl-1.15.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:173866d9f7409c0fb514cf6e78952e65816600cb888c68b37b41147349fe0057"},
|
| 5078 |
-
{file = "yarl-1.15.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:6e840553c9c494a35e449a987ca2c4f8372668ee954a03a9a9685075228e5036"},
|
| 5079 |
-
{file = "yarl-1.15.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:458c0c65802d816a6b955cf3603186de79e8fdb46d4f19abaec4ef0a906f50a7"},
|
| 5080 |
-
{file = "yarl-1.15.2-cp39-cp39-win32.whl", hash = "sha256:5b48388ded01f6f2429a8c55012bdbd1c2a0c3735b3e73e221649e524c34a58d"},
|
| 5081 |
-
{file = "yarl-1.15.2-cp39-cp39-win_amd64.whl", hash = "sha256:81dadafb3aa124f86dc267a2168f71bbd2bfb163663661ab0038f6e4b8edb810"},
|
| 5082 |
-
{file = "yarl-1.15.2-py3-none-any.whl", hash = "sha256:0d3105efab7c5c091609abacad33afff33bdff0035bece164c98bcf5a85ef90a"},
|
| 5083 |
-
{file = "yarl-1.15.2.tar.gz", hash = "sha256:a39c36f4218a5bb668b4f06874d676d35a035ee668e6e7e3538835c703634b84"},
|
| 5084 |
]
|
| 5085 |
|
| 5086 |
[package.dependencies]
|
|
@@ -5091,4 +5075,4 @@ propcache = ">=0.2.0"
|
|
| 5091 |
[metadata]
|
| 5092 |
lock-version = "2.0"
|
| 5093 |
python-versions = "^3.10"
|
| 5094 |
-
content-hash = "
|
|
|
|
| 1935 |
|
| 1936 |
[[package]]
|
| 1937 |
name = "narwhals"
|
| 1938 |
+
version = "1.9.4"
|
| 1939 |
description = "Extremely lightweight compatibility layer between dataframe libraries"
|
| 1940 |
optional = false
|
| 1941 |
python-versions = ">=3.8"
|
| 1942 |
files = [
|
| 1943 |
+
{file = "narwhals-1.9.4-py3-none-any.whl", hash = "sha256:cdd16f73268a3f0d3327aa9e4c6ab25a0b277629d6710bef58e86f40e57e5cc9"},
|
| 1944 |
+
{file = "narwhals-1.9.4.tar.gz", hash = "sha256:5de1f2d7bfbe555573d945fe1d760469a05784f3e69b7bc1b5b1303aae7946a1"},
|
| 1945 |
]
|
| 1946 |
|
| 1947 |
[package.extras]
|
|
|
|
| 2510 |
|
| 2511 |
[[package]]
|
| 2512 |
name = "pdftext"
|
| 2513 |
+
version = "0.3.15"
|
| 2514 |
description = "Extract structured text from pdfs quickly"
|
| 2515 |
optional = false
|
| 2516 |
+
python-versions = "<4.0,>=3.10"
|
| 2517 |
files = [
|
| 2518 |
+
{file = "pdftext-0.3.15-py3-none-any.whl", hash = "sha256:3151abacd5c2cfed9975d090333b543151e14de439ebe0d228b935328d512f3d"},
|
| 2519 |
+
{file = "pdftext-0.3.15.tar.gz", hash = "sha256:3c6d55781c1adfd263cdc05c39cbea2c40d4e626439ab24078f860eca65c2e6c"},
|
| 2520 |
]
|
| 2521 |
|
| 2522 |
[package.dependencies]
|
|
|
|
| 3049 |
|
| 3050 |
[[package]]
|
| 3051 |
name = "pydantic-settings"
|
| 3052 |
+
version = "2.6.0"
|
| 3053 |
description = "Settings management using Pydantic"
|
| 3054 |
optional = false
|
| 3055 |
python-versions = ">=3.8"
|
| 3056 |
files = [
|
| 3057 |
+
{file = "pydantic_settings-2.6.0-py3-none-any.whl", hash = "sha256:4a819166f119b74d7f8c765196b165f95cc7487ce58ea27dec8a5a26be0970e0"},
|
| 3058 |
+
{file = "pydantic_settings-2.6.0.tar.gz", hash = "sha256:44a1804abffac9e6a30372bb45f6cafab945ef5af25e66b1c634c01dd39e0188"},
|
| 3059 |
]
|
| 3060 |
|
| 3061 |
[package.dependencies]
|
|
|
|
| 3215 |
|
| 3216 |
[[package]]
|
| 3217 |
name = "pywinpty"
|
| 3218 |
+
version = "2.0.14"
|
| 3219 |
description = "Pseudo terminal support for Windows from Python."
|
| 3220 |
optional = false
|
| 3221 |
python-versions = ">=3.8"
|
| 3222 |
files = [
|
| 3223 |
+
{file = "pywinpty-2.0.14-cp310-none-win_amd64.whl", hash = "sha256:0b149c2918c7974f575ba79f5a4aad58bd859a52fa9eb1296cc22aa412aa411f"},
|
| 3224 |
+
{file = "pywinpty-2.0.14-cp311-none-win_amd64.whl", hash = "sha256:cf2a43ac7065b3e0dc8510f8c1f13a75fb8fde805efa3b8cff7599a1ef497bc7"},
|
| 3225 |
+
{file = "pywinpty-2.0.14-cp312-none-win_amd64.whl", hash = "sha256:55dad362ef3e9408ade68fd173e4f9032b3ce08f68cfe7eacb2c263ea1179737"},
|
| 3226 |
+
{file = "pywinpty-2.0.14-cp313-none-win_amd64.whl", hash = "sha256:074fb988a56ec79ca90ed03a896d40707131897cefb8f76f926e3834227f2819"},
|
| 3227 |
+
{file = "pywinpty-2.0.14-cp39-none-win_amd64.whl", hash = "sha256:5725fd56f73c0531ec218663bd8c8ff5acc43c78962fab28564871b5fce053fd"},
|
| 3228 |
+
{file = "pywinpty-2.0.14.tar.gz", hash = "sha256:18bd9529e4a5daf2d9719aa17788ba6013e594ae94c5a0c27e83df3278b0660e"},
|
| 3229 |
]
|
| 3230 |
|
| 3231 |
[[package]]
|
|
|
|
| 4056 |
|
| 4057 |
[[package]]
|
| 4058 |
name = "setuptools"
|
| 4059 |
+
version = "75.2.0"
|
| 4060 |
description = "Easily download, build, install, upgrade, and uninstall Python packages"
|
| 4061 |
optional = false
|
| 4062 |
python-versions = ">=3.8"
|
| 4063 |
files = [
|
| 4064 |
+
{file = "setuptools-75.2.0-py3-none-any.whl", hash = "sha256:a7fcb66f68b4d9e8e66b42f9876150a3371558f98fa32222ffaa5bced76406f8"},
|
| 4065 |
+
{file = "setuptools-75.2.0.tar.gz", hash = "sha256:753bb6ebf1f465a1912e19ed1d41f403a79173a9acf66a42e7e6aec45c3c16ec"},
|
| 4066 |
]
|
| 4067 |
|
| 4068 |
[package.extras]
|
|
|
|
| 4978 |
|
| 4979 |
[[package]]
|
| 4980 |
name = "yarl"
|
| 4981 |
+
version = "1.15.4"
|
| 4982 |
description = "Yet another URL library"
|
| 4983 |
optional = false
|
| 4984 |
+
python-versions = ">=3.9"
|
| 4985 |
files = [
|
| 4986 |
+
{file = "yarl-1.15.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:551205388d1da18a9975302c9a274ba24788f53bb9bb86187496ebf9e938916e"},
|
| 4987 |
+
{file = "yarl-1.15.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eee724176b5bc50ee64905f559345448119b860a30b9489bd7a073f61baf925f"},
|
| 4988 |
+
{file = "yarl-1.15.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db818e33599f7b2e4c6507f2b2c24f45ff539a1b6e4e09163bb6f3cfb4616ca7"},
|
| 4989 |
+
{file = "yarl-1.15.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:07019a9de859c5a29916defd1e8c7557de6491a10bf50c49ff5284e6aedf5313"},
|
| 4990 |
+
{file = "yarl-1.15.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db64a20e78969fc66665d2e5fc96cb4f4dc80f2137d8fed4b5a650ad569bb60f"},
|
| 4991 |
+
{file = "yarl-1.15.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4076bfd8f1621449b19b9826848ed51bf0f2d1d38e82647c312c0730d8778903"},
|
| 4992 |
+
{file = "yarl-1.15.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c23a442973dba3646811c284fce3dddd7fe5c2bd674ac73a122198e8218d6115"},
|
| 4993 |
+
{file = "yarl-1.15.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b2bdb038b3f5c284e3919218c580dedc95f592c417a358361450b9519b22f7a8"},
|
| 4994 |
+
{file = "yarl-1.15.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:59db8e6888d5302b8dbca0c1026ddabe99d81d67cdc101941519e13ffc9050fe"},
|
| 4995 |
+
{file = "yarl-1.15.4-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:f3294ce265011547630a59c20085fcb6af8cc5fa1fa44a203251f7d86cd5d913"},
|
| 4996 |
+
{file = "yarl-1.15.4-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:4851618679ca70b863ba2e7109be5f09f8fd7715ec505bd42e5a947dcfde3a45"},
|
| 4997 |
+
{file = "yarl-1.15.4-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:dce1c56beef74d9c799a6ed94001693232a1402138292353a8ce302b64f457d9"},
|
| 4998 |
+
{file = "yarl-1.15.4-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:1e7468f31de61a82817f918743e5229fce774f73fad58487cdf88eef4f06d864"},
|
| 4999 |
+
{file = "yarl-1.15.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:527c68f48a91d953691291d3bce0209293aa5ad13ff05286ddb506791c331818"},
|
| 5000 |
+
{file = "yarl-1.15.4-cp310-cp310-win32.whl", hash = "sha256:c30115cecaf25fdcb67cc71c669d08425207f62d7a2f6d5416057c1460529216"},
|
| 5001 |
+
{file = "yarl-1.15.4-cp310-cp310-win_amd64.whl", hash = "sha256:df09c80f4bc2bc2efde309af383c3fe8fd8c51fe0519edb350b9c9e0af43ffa4"},
|
| 5002 |
+
{file = "yarl-1.15.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:76259901cf1ac3db65e7e6dff04775b626d0715f9b51d92b447351144c756a82"},
|
| 5003 |
+
{file = "yarl-1.15.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:98d8dc1e8133f86d916125deca9780d791b22645f0d62bafe1452d1cd5eac631"},
|
| 5004 |
+
{file = "yarl-1.15.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0d0f16c87c62b7a94b389ddf6a8c9d081265d788875c39f3a80108c4856eea7b"},
|
| 5005 |
+
{file = "yarl-1.15.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8de5328d91859b461899497980d4cc8269e84e2d18640f6ac643886fda9000bf"},
|
| 5006 |
+
{file = "yarl-1.15.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84937d00e2ea03616c40977de20189fa13a9213e5744a3c6afa0e7dd9141d69c"},
|
| 5007 |
+
{file = "yarl-1.15.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:691a3b498fdebef63308e8967bb598cfd326c56d628da82b799dd181bace4503"},
|
| 5008 |
+
{file = "yarl-1.15.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a706db0c3b7e4578ff34ed2b1d2507b08fd491346ffc64468786fdf1151d938"},
|
| 5009 |
+
{file = "yarl-1.15.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:adb6b5d07d17c32f9d34c9dd4a693637a72323cfcb1f8a52d57033ab2dd21e99"},
|
| 5010 |
+
{file = "yarl-1.15.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6e100c6c7d9e9d469009fd55cc4d7ad168d67d40758865c50da713f7ada491e5"},
|
| 5011 |
+
{file = "yarl-1.15.4-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:df6b254e55c8ac2362afaa651e3e53453aa19a095570792346245773b434176e"},
|
| 5012 |
+
{file = "yarl-1.15.4-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8721f8bedaa722c3c483cc06a1399cbfdb280eadf443aa5d324b0203cef2a75f"},
|
| 5013 |
+
{file = "yarl-1.15.4-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:1005921b30f4f39bf893946df6173567ff650307babb5ec04bbf64342a1f62c1"},
|
| 5014 |
+
{file = "yarl-1.15.4-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:ab79cc13307065a0b3ef087f09f0509996fc605d35d6642bb28e5d85b2648e1e"},
|
| 5015 |
+
{file = "yarl-1.15.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f337486742c700b102d640830aab3faf2848bed966b479a39e6783edd4ab1c6c"},
|
| 5016 |
+
{file = "yarl-1.15.4-cp311-cp311-win32.whl", hash = "sha256:20acf84bd1ce530065f8e957e4a5878fda4bc5f18cb02659828210e1519de54e"},
|
| 5017 |
+
{file = "yarl-1.15.4-cp311-cp311-win_amd64.whl", hash = "sha256:ab9ccf26cb3fa32747ba2a637a189d2d42386a2fc4afc10dbc7f85922dd23b0f"},
|
| 5018 |
+
{file = "yarl-1.15.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f923e94e93a37fd990e8336e0b9bedea533e7cbed14e0c572bf9357ef2a70681"},
|
| 5019 |
+
{file = "yarl-1.15.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3198da7d7c34e29fc8c823e0c3ce6c7274aac35760de557c2017489c7d98fc5a"},
|
| 5020 |
+
{file = "yarl-1.15.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d886de2ea81f513ba2d6820451d33b767a97c37867ba688d42e164b2dbca1362"},
|
| 5021 |
+
{file = "yarl-1.15.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ac85e760543129a1912a82438fc8075223e35eaa2d457d61cd83c27d00d17be"},
|
| 5022 |
+
{file = "yarl-1.15.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e58c5d07b1f78dd4cb180c5b3b82465cd281aaeee8aafea0e5d72a4b97922cb1"},
|
| 5023 |
+
{file = "yarl-1.15.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9060589d0acad1fca048861fa9ee3e8ed060f67894fa885969648ab6e9e99a54"},
|
| 5024 |
+
{file = "yarl-1.15.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccd6774aa7bebdf9ca608bb0839318757a71b8e0d2cf7b10c002bc8790bd343e"},
|
| 5025 |
+
{file = "yarl-1.15.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7694f109867ee428c21b85ae19fd31d164c691eb45cc95c561cfdeba237a12e3"},
|
| 5026 |
+
{file = "yarl-1.15.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:83e7154aa0d17f5c93d27ac01088fd9ab6673e7bab1acbd07cd7a865b980c045"},
|
| 5027 |
+
{file = "yarl-1.15.4-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:f16d1940c0cbc342f1d29d6212a006d172be616d2942c5c41966e8a3ce4c3be1"},
|
| 5028 |
+
{file = "yarl-1.15.4-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:7d5226c70af3ad9569ccc4ccc04ab65be79eeb22c87d7ae789c89e62ef76bbd6"},
|
| 5029 |
+
{file = "yarl-1.15.4-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f25906e4a72d9833e81717c39a39dee7297ff5cb44957d06d177a2ab8ef2ef7f"},
|
| 5030 |
+
{file = "yarl-1.15.4-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:0e07e4b17b648c880e8e42bf1ac0a730bde114961646ae1c2ec4433f0c11ca94"},
|
| 5031 |
+
{file = "yarl-1.15.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6f8136bde8dfa4477c6a85c79a366581b4a505b51a52b669318fb631d3f4f638"},
|
| 5032 |
+
{file = "yarl-1.15.4-cp312-cp312-win32.whl", hash = "sha256:ccbeaf5b18b173b9d78e332e017b30ba8bedcf03cdce1d13490b82a3f421bc98"},
|
| 5033 |
+
{file = "yarl-1.15.4-cp312-cp312-win_amd64.whl", hash = "sha256:f74f6ffdc633aefecbc80282242a5395058db9d1247fa7dd2f070ef84dc82583"},
|
| 5034 |
+
{file = "yarl-1.15.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:4f66a0eda48844508736e47ed476d8fdd7cdbf16a4053b5d439509a25f708504"},
|
| 5035 |
+
{file = "yarl-1.15.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:fd2bb86f40962d53a91def15a2f7684c62e081a7b96ec74ed0259c34b15973b9"},
|
| 5036 |
+
{file = "yarl-1.15.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f864b412557e69a6b953d62c01a0ed0ee342666298aa7f2a29af526bfa80f6e9"},
|
| 5037 |
+
{file = "yarl-1.15.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a79c0a8bbb046add85663af85e9993b691bf20c2a109518bd35e0ce77edfe42"},
|
| 5038 |
+
{file = "yarl-1.15.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de479e30abd2dfd49fdad3bd6953f2d930a45380be5143c0c9f7a1215cffc8cc"},
|
| 5039 |
+
{file = "yarl-1.15.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:21fabe58042f3e567b4edc75b2cf44cea02f228e41ac09d73de126bf685fe883"},
|
| 5040 |
+
{file = "yarl-1.15.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77390496f2f32437a721c854897f889abefae0f3009daf90a2f703508d96c920"},
|
| 5041 |
+
{file = "yarl-1.15.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3896bf15284dd23acab1f2e7fceb350d8da6f6f2436b922f7ec6b3de685d34ca"},
|
| 5042 |
+
{file = "yarl-1.15.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:590e2d733a82ecf004c5c531cbef0d6be328e93adec960024eb213f10cb9503e"},
|
| 5043 |
+
{file = "yarl-1.15.4-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:1ceb677fb583971351627eac70eec6763fbc889761828da7a276681b5e39742d"},
|
| 5044 |
+
{file = "yarl-1.15.4-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:69f628d2da1489b27959f4d63fdb326781fe484944dce94abbf919e416c54abe"},
|
| 5045 |
+
{file = "yarl-1.15.4-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:35a6b69cc44bda002705d6138346bf0a0234cbb7c26c3bf192513eb946aee6f9"},
|
| 5046 |
+
{file = "yarl-1.15.4-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:49f886e8dcf591275c6e20915b516fd81647857566b0c0158c52df1e468849c9"},
|
| 5047 |
+
{file = "yarl-1.15.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:49190eb2ece70313742b0ea51520340288a059674da1f39eefb589d598d9453e"},
|
| 5048 |
+
{file = "yarl-1.15.4-cp313-cp313-win32.whl", hash = "sha256:48334a6c8afee93097eb17c0a094234dac2d88da076c8cf372e09e2a5dcc4b66"},
|
| 5049 |
+
{file = "yarl-1.15.4-cp313-cp313-win_amd64.whl", hash = "sha256:f68025d6ba1816428b7de615c80f61cb03d5b7061158d4ced7696657a64aa59c"},
|
| 5050 |
+
{file = "yarl-1.15.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:8b569f4f511b59518ba6719feb5b8bf0a5d4115e6ac903c89e10a8a9ac656017"},
|
| 5051 |
+
{file = "yarl-1.15.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9fe17744d60fc404ac61f824118e1e15ce3c2e92eced9b8e22f3c7847acafbf2"},
|
| 5052 |
+
{file = "yarl-1.15.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:115346433fad2084ee3a1a925ccc0659990aa42e208ca54c278830a150a3caf3"},
|
| 5053 |
+
{file = "yarl-1.15.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:60165b8bc260f453321004b193770a66cc1b1a5c57c07d4b8dcc96839e7ad578"},
|
| 5054 |
+
{file = "yarl-1.15.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65a0168691373e08d869d48b62c8bed0af0cdaef19c76e11ad73b43901bbdb5a"},
|
| 5055 |
+
{file = "yarl-1.15.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:787532f00543a21b8f4ec3050b4e01b8fe437797903c0156a0b03dfca5e1ba6c"},
|
| 5056 |
+
{file = "yarl-1.15.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f51c9d173e5fa4b12d06ddca09a41cabbdeb660471dbe55432423eec095709ab"},
|
| 5057 |
+
{file = "yarl-1.15.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c96eaa30030e1cfafe533f3da8983812281235b7c50ef2a6c78ceca7aea1a0b"},
|
| 5058 |
+
{file = "yarl-1.15.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4feab2dcb725eb5b4835207ecf3d370ff7ce930b253cba5e681646cb80d64c2c"},
|
| 5059 |
+
{file = "yarl-1.15.4-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:de38b0b5b86e57efb129d179854e78b65cb8e294a8c75560877869c43aa2415a"},
|
| 5060 |
+
{file = "yarl-1.15.4-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:65e0467f90f2acf3bc83bbfeedece8f1fd84df8add1a54e9600ed7b7b5debdb0"},
|
| 5061 |
+
{file = "yarl-1.15.4-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:40c18f96696549e73b92dc12619f07019cbf5faefc1612608f967c144816e493"},
|
| 5062 |
+
{file = "yarl-1.15.4-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:46491b3e058de7b484e1c9fb20aa8441f06d6c9a18395d711c1c2a9ad6707d6a"},
|
| 5063 |
+
{file = "yarl-1.15.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:faa3dd7f4620ab5e5da7a0789d0aac78a9ad0376f102409d442ec5a4179e200a"},
|
| 5064 |
+
{file = "yarl-1.15.4-cp39-cp39-win32.whl", hash = "sha256:c33ea7c55a73be343f02361795caf52a187357ea07708fb1cae6661ee1d689c8"},
|
| 5065 |
+
{file = "yarl-1.15.4-cp39-cp39-win_amd64.whl", hash = "sha256:11b207061f28b4b6d980239b22ab0ecfadc47846b5a3b8e79f27fcc019d02cf9"},
|
| 5066 |
+
{file = "yarl-1.15.4-py3-none-any.whl", hash = "sha256:e5cc288111c450c0a54a74475591b206d3b1cb47dc71bb6200f6be8b1337184c"},
|
| 5067 |
+
{file = "yarl-1.15.4.tar.gz", hash = "sha256:a0c5e271058d148d730219ca4f33c5d841c6bd46e05b0da60fea7b516906ccd3"},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5068 |
]
|
| 5069 |
|
| 5070 |
[package.dependencies]
|
|
|
|
| 5075 |
[metadata]
|
| 5076 |
lock-version = "2.0"
|
| 5077 |
python-versions = "^3.10"
|
| 5078 |
+
content-hash = "4983d14f11f46193fc13b3256d1f7ed8d0877ad579cb8b30fbab8e87d3febb22"
|
pyproject.toml
CHANGED
|
@@ -35,7 +35,7 @@ rapidfuzz = "^3.8.1"
|
|
| 35 |
surya-ocr = "^0.6.3"
|
| 36 |
filetype = "^1.2.0"
|
| 37 |
regex = "^2024.4.28"
|
| 38 |
-
pdftext = "^0.3.
|
| 39 |
tabled-pdf = "^0.1.0"
|
| 40 |
|
| 41 |
[tool.poetry.group.dev.dependencies]
|
|
|
|
| 35 |
surya-ocr = "^0.6.3"
|
| 36 |
filetype = "^1.2.0"
|
| 37 |
regex = "^2024.4.28"
|
| 38 |
+
pdftext = "^0.3.15"
|
| 39 |
tabled-pdf = "^0.1.0"
|
| 40 |
|
| 41 |
[tool.poetry.group.dev.dependencies]
|
scripts/verify_benchmark_scores.py
CHANGED
|
@@ -9,7 +9,7 @@ def verify_scores(file_path):
|
|
| 9 |
multicolcnn_score = data["marker"]["files"]["multicolcnn.pdf"]["score"]
|
| 10 |
switch_trans_score = data["marker"]["files"]["switch_trans.pdf"]["score"]
|
| 11 |
|
| 12 |
-
if multicolcnn_score <= 0.
|
| 13 |
raise ValueError("One or more scores are below the required threshold of 0.4")
|
| 14 |
|
| 15 |
|
|
|
|
| 9 |
multicolcnn_score = data["marker"]["files"]["multicolcnn.pdf"]["score"]
|
| 10 |
switch_trans_score = data["marker"]["files"]["switch_trans.pdf"]["score"]
|
| 11 |
|
| 12 |
+
if multicolcnn_score <= 0.35 or switch_trans_score <= 0.39:
|
| 13 |
raise ValueError("One or more scores are below the required threshold of 0.4")
|
| 14 |
|
| 15 |
|