Vik Paruchuri
committed on
Commit
·
45ef16a
1
Parent(s):
aa38742
Improve force ocr, enable parallel factor below 1
Browse files
- .github/workflows/tests.yml +3 -1
- README.md +2 -1
- marker/convert.py +4 -4
- marker/debug/data.py +2 -2
- marker/ocr/page.py +2 -1
- marker/settings.py +9 -0
- poetry.lock +67 -1
- pyproject.toml +6 -2
- scripts/install/apt-requirements.txt +5 -1
- scripts/markdown_to_pdf.sh +1 -1
.github/workflows/tests.yml
CHANGED
|
@@ -17,7 +17,9 @@ jobs:
|
|
| 17 |
with:
|
| 18 |
python-version: 3.11
|
| 19 |
- name: Install system dependencies
|
| 20 |
-
run:
|
|
|
|
|
|
|
| 21 |
- name: Show tessdata folders
|
| 22 |
run: ls /usr/share/tesseract-ocr/
|
| 23 |
- name: Install python dependencies
|
|
|
|
| 17 |
with:
|
| 18 |
python-version: 3.11
|
| 19 |
- name: Install system dependencies
|
| 20 |
+
run: |
|
| 21 |
+
sudo apt-get update
|
| 22 |
+
cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y
|
| 23 |
- name: Show tessdata folders
|
| 24 |
run: ls /usr/share/tesseract-ocr/
|
| 25 |
- name: Install python dependencies
|
README.md
CHANGED
|
@@ -51,7 +51,7 @@ PDF is a tricky format, so marker will not always work perfectly. Here are some
|
|
| 51 |
- Marker will convert fewer equations to latex than nougat. This is because it has to first detect equations, then convert them without hallucination.
|
| 52 |
- Whitespace and indentations are not always respected.
|
| 53 |
- Not all lines/spans will be joined properly.
|
| 54 |
-
-
|
| 55 |
- This works best on digital PDFs that won't require a lot of OCR. It's optimized for speed, and limited OCR is used to fix errors.
|
| 56 |
|
| 57 |
# Installation
|
|
@@ -88,6 +88,7 @@ First, clone the repo:
|
|
| 88 |
- Install python requirements
|
| 89 |
- `poetry install`
|
| 90 |
- `poetry shell` to activate your poetry venv
|
|
|
|
| 91 |
|
| 92 |
# Usage
|
| 93 |
|
|
|
|
| 51 |
- Marker will convert fewer equations to latex than nougat. This is because it has to first detect equations, then convert them without hallucination.
|
| 52 |
- Whitespace and indentations are not always respected.
|
| 53 |
- Not all lines/spans will be joined properly.
|
| 54 |
+
- Languages similar to English (Spanish, French, German, Russian, etc.) have the best support. There is provisional support for Chinese, Japanese, Korean, and Hindi, but it may not work as well.
|
| 55 |
- This works best on digital PDFs that won't require a lot of OCR. It's optimized for speed, and limited OCR is used to fix errors.
|
| 56 |
|
| 57 |
# Installation
|
|
|
|
| 88 |
- Install python requirements
|
| 89 |
- `poetry install`
|
| 90 |
- `poetry shell` to activate your poetry venv
|
| 91 |
+
- On ARM macs (M1+), make sure to set the `TORCH_DEVICE` setting to `mps` (more details below) for a speedup
|
| 92 |
|
| 93 |
# Usage
|
| 94 |
|
marker/convert.py
CHANGED
|
@@ -92,7 +92,7 @@ def convert_single_pdf(
|
|
| 92 |
tess_lang,
|
| 93 |
spell_lang,
|
| 94 |
max_pages=max_pages,
|
| 95 |
-
parallel=parallel_factor * settings.OCR_PARALLEL_WORKERS
|
| 96 |
)
|
| 97 |
|
| 98 |
out_meta["toc"] = toc
|
|
@@ -109,7 +109,7 @@ def convert_single_pdf(
|
|
| 109 |
doc,
|
| 110 |
blocks,
|
| 111 |
layoutlm_model,
|
| 112 |
-
batch_size=settings.LAYOUT_BATCH_SIZE * parallel_factor
|
| 113 |
)
|
| 114 |
|
| 115 |
# Find headers and footers
|
|
@@ -125,7 +125,7 @@ def convert_single_pdf(
|
|
| 125 |
doc,
|
| 126 |
blocks,
|
| 127 |
order_model,
|
| 128 |
-
batch_size=settings.ORDERER_BATCH_SIZE * parallel_factor
|
| 129 |
)
|
| 130 |
|
| 131 |
# Fix code blocks
|
|
@@ -148,7 +148,7 @@ def convert_single_pdf(
|
|
| 148 |
blocks,
|
| 149 |
block_types,
|
| 150 |
nougat_model,
|
| 151 |
-
batch_size=settings.NOUGAT_BATCH_SIZE * parallel_factor
|
| 152 |
)
|
| 153 |
out_meta["block_stats"]["equations"] = eq_stats
|
| 154 |
|
|
|
|
| 92 |
tess_lang,
|
| 93 |
spell_lang,
|
| 94 |
max_pages=max_pages,
|
| 95 |
+
parallel=int(parallel_factor * settings.OCR_PARALLEL_WORKERS)
|
| 96 |
)
|
| 97 |
|
| 98 |
out_meta["toc"] = toc
|
|
|
|
| 109 |
doc,
|
| 110 |
blocks,
|
| 111 |
layoutlm_model,
|
| 112 |
+
batch_size=int(settings.LAYOUT_BATCH_SIZE * parallel_factor)
|
| 113 |
)
|
| 114 |
|
| 115 |
# Find headers and footers
|
|
|
|
| 125 |
doc,
|
| 126 |
blocks,
|
| 127 |
order_model,
|
| 128 |
+
batch_size=int(settings.ORDERER_BATCH_SIZE * parallel_factor)
|
| 129 |
)
|
| 130 |
|
| 131 |
# Fix code blocks
|
|
|
|
| 148 |
blocks,
|
| 149 |
block_types,
|
| 150 |
nougat_model,
|
| 151 |
+
batch_size=int(settings.NOUGAT_BATCH_SIZE * parallel_factor)
|
| 152 |
)
|
| 153 |
out_meta["block_stats"]["equations"] = eq_stats
|
| 154 |
|
marker/debug/data.py
CHANGED
|
@@ -11,7 +11,7 @@ import io
|
|
| 11 |
|
| 12 |
|
| 13 |
def dump_nougat_debug_data(doc, images, converted_spans):
|
| 14 |
-
if not settings.DEBUG_DATA_FOLDER:
|
| 15 |
return
|
| 16 |
|
| 17 |
if len(images) == 0:
|
|
@@ -44,7 +44,7 @@ def dump_nougat_debug_data(doc, images, converted_spans):
|
|
| 44 |
|
| 45 |
|
| 46 |
def dump_bbox_debug_data(doc, blocks: List[Page]):
|
| 47 |
-
if not settings.DEBUG_DATA_FOLDER:
|
| 48 |
return
|
| 49 |
|
| 50 |
# Remove extension from doc name
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
def dump_nougat_debug_data(doc, images, converted_spans):
|
| 14 |
+
if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL == 0:
|
| 15 |
return
|
| 16 |
|
| 17 |
if len(images) == 0:
|
|
|
|
| 44 |
|
| 45 |
|
| 46 |
def dump_bbox_debug_data(doc, blocks: List[Page]):
|
| 47 |
+
if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL < 2:
|
| 48 |
return
|
| 49 |
|
| 50 |
# Remove extension from doc name
|
marker/ocr/page.py
CHANGED
|
@@ -53,7 +53,8 @@ def ocr_entire_page_ocrmp(page, lang: str, spellchecker: Optional[SpellChecker]
|
|
| 53 |
outbytes,
|
| 54 |
language=lang,
|
| 55 |
output_type="pdf",
|
| 56 |
-
redo_ocr=True,
|
|
|
|
| 57 |
progress_bar=False,
|
| 58 |
optimize=False,
|
| 59 |
fast_web_view=1e6,
|
|
|
|
| 53 |
outbytes,
|
| 54 |
language=lang,
|
| 55 |
output_type="pdf",
|
| 56 |
+
redo_ocr=None if settings.OCR_ALL_PAGES else True,
|
| 57 |
+
force_ocr=True if settings.OCR_ALL_PAGES else None,
|
| 58 |
progress_bar=False,
|
| 59 |
optimize=False,
|
| 60 |
fast_web_view=1e6,
|
marker/settings.py
CHANGED
|
@@ -37,6 +37,10 @@ class Settings(BaseSettings):
|
|
| 37 |
"French": "fra",
|
| 38 |
"German": "deu",
|
| 39 |
"Russian": "rus",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
}
|
| 41 |
TESSERACT_TIMEOUT: int = 20 # When to give up on OCR
|
| 42 |
SPELLCHECK_LANGUAGES: Dict = {
|
|
@@ -46,6 +50,10 @@ class Settings(BaseSettings):
|
|
| 46 |
"French": "fr",
|
| 47 |
"German": "de",
|
| 48 |
"Russian": "ru",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
}
|
| 50 |
OCR_ALL_PAGES: bool = False # Run OCR on every page even if text can be extracted
|
| 51 |
OCR_PARALLEL_WORKERS: int = 2 # How many CPU workers to use for OCR
|
|
@@ -101,6 +109,7 @@ class Settings(BaseSettings):
|
|
| 101 |
# Debug
|
| 102 |
DEBUG: bool = False # Enable debug logging
|
| 103 |
DEBUG_DATA_FOLDER: Optional[str] = None
|
|
|
|
| 104 |
|
| 105 |
@computed_field
|
| 106 |
@property
|
|
|
|
| 37 |
"French": "fra",
|
| 38 |
"German": "deu",
|
| 39 |
"Russian": "rus",
|
| 40 |
+
"Chinese": "chi_sim",
|
| 41 |
+
"Japanese": "jpn",
|
| 42 |
+
"Korean": "kor",
|
| 43 |
+
"Hindi": "hin",
|
| 44 |
}
|
| 45 |
TESSERACT_TIMEOUT: int = 20 # When to give up on OCR
|
| 46 |
SPELLCHECK_LANGUAGES: Dict = {
|
|
|
|
| 50 |
"French": "fr",
|
| 51 |
"German": "de",
|
| 52 |
"Russian": "ru",
|
| 53 |
+
"Chinese": None,
|
| 54 |
+
"Japanese": None,
|
| 55 |
+
"Korean": None,
|
| 56 |
+
"Hindi": None,
|
| 57 |
}
|
| 58 |
OCR_ALL_PAGES: bool = False # Run OCR on every page even if text can be extracted
|
| 59 |
OCR_PARALLEL_WORKERS: int = 2 # How many CPU workers to use for OCR
|
|
|
|
| 109 |
# Debug
|
| 110 |
DEBUG: bool = False # Enable debug logging
|
| 111 |
DEBUG_DATA_FOLDER: Optional[str] = None
|
| 112 |
+
DEBUG_LEVEL: int = 0 # 0 to 2, 2 means log everything
|
| 113 |
|
| 114 |
@computed_field
|
| 115 |
@property
|
poetry.lock
CHANGED
|
@@ -968,6 +968,72 @@ files = [
|
|
| 968 |
[package.dependencies]
|
| 969 |
wcwidth = ">=0.2.12,<0.3.0"
|
| 970 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 971 |
[[package]]
|
| 972 |
name = "huggingface-hub"
|
| 973 |
version = "0.19.4"
|
|
@@ -5746,4 +5812,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
|
|
| 5746 |
[metadata]
|
| 5747 |
lock-version = "2.0"
|
| 5748 |
python-versions = ">=3.9,<3.13"
|
| 5749 |
-
content-hash = "
|
|
|
|
| 968 |
[package.dependencies]
|
| 969 |
wcwidth = ">=0.2.12,<0.3.0"
|
| 970 |
|
| 971 |
+
[[package]]
|
| 972 |
+
name = "grpcio"
|
| 973 |
+
version = "1.60.0"
|
| 974 |
+
description = "HTTP/2-based RPC framework"
|
| 975 |
+
optional = false
|
| 976 |
+
python-versions = ">=3.7"
|
| 977 |
+
files = [
|
| 978 |
+
{file = "grpcio-1.60.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:d020cfa595d1f8f5c6b343530cd3ca16ae5aefdd1e832b777f9f0eb105f5b139"},
|
| 979 |
+
{file = "grpcio-1.60.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:b98f43fcdb16172dec5f4b49f2fece4b16a99fd284d81c6bbac1b3b69fcbe0ff"},
|
| 980 |
+
{file = "grpcio-1.60.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:20e7a4f7ded59097c84059d28230907cd97130fa74f4a8bfd1d8e5ba18c81491"},
|
| 981 |
+
{file = "grpcio-1.60.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:452ca5b4afed30e7274445dd9b441a35ece656ec1600b77fff8c216fdf07df43"},
|
| 982 |
+
{file = "grpcio-1.60.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43e636dc2ce9ece583b3e2ca41df5c983f4302eabc6d5f9cd04f0562ee8ec1ae"},
|
| 983 |
+
{file = "grpcio-1.60.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6e306b97966369b889985a562ede9d99180def39ad42c8014628dd3cc343f508"},
|
| 984 |
+
{file = "grpcio-1.60.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f897c3b127532e6befdcf961c415c97f320d45614daf84deba0a54e64ea2457b"},
|
| 985 |
+
{file = "grpcio-1.60.0-cp310-cp310-win32.whl", hash = "sha256:b87efe4a380887425bb15f220079aa8336276398dc33fce38c64d278164f963d"},
|
| 986 |
+
{file = "grpcio-1.60.0-cp310-cp310-win_amd64.whl", hash = "sha256:a9c7b71211f066908e518a2ef7a5e211670761651039f0d6a80d8d40054047df"},
|
| 987 |
+
{file = "grpcio-1.60.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:fb464479934778d7cc5baf463d959d361954d6533ad34c3a4f1d267e86ee25fd"},
|
| 988 |
+
{file = "grpcio-1.60.0-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:4b44d7e39964e808b071714666a812049765b26b3ea48c4434a3b317bac82f14"},
|
| 989 |
+
{file = "grpcio-1.60.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:90bdd76b3f04bdb21de5398b8a7c629676c81dfac290f5f19883857e9371d28c"},
|
| 990 |
+
{file = "grpcio-1.60.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91229d7203f1ef0ab420c9b53fe2ca5c1fbeb34f69b3bc1b5089466237a4a134"},
|
| 991 |
+
{file = "grpcio-1.60.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b36a2c6d4920ba88fa98075fdd58ff94ebeb8acc1215ae07d01a418af4c0253"},
|
| 992 |
+
{file = "grpcio-1.60.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:297eef542156d6b15174a1231c2493ea9ea54af8d016b8ca7d5d9cc65cfcc444"},
|
| 993 |
+
{file = "grpcio-1.60.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:87c9224acba0ad8bacddf427a1c2772e17ce50b3042a789547af27099c5f751d"},
|
| 994 |
+
{file = "grpcio-1.60.0-cp311-cp311-win32.whl", hash = "sha256:95ae3e8e2c1b9bf671817f86f155c5da7d49a2289c5cf27a319458c3e025c320"},
|
| 995 |
+
{file = "grpcio-1.60.0-cp311-cp311-win_amd64.whl", hash = "sha256:467a7d31554892eed2aa6c2d47ded1079fc40ea0b9601d9f79204afa8902274b"},
|
| 996 |
+
{file = "grpcio-1.60.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:a7152fa6e597c20cb97923407cf0934e14224af42c2b8d915f48bc3ad2d9ac18"},
|
| 997 |
+
{file = "grpcio-1.60.0-cp312-cp312-macosx_10_10_universal2.whl", hash = "sha256:7db16dd4ea1b05ada504f08d0dca1cd9b926bed3770f50e715d087c6f00ad748"},
|
| 998 |
+
{file = "grpcio-1.60.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:b0571a5aef36ba9177e262dc88a9240c866d903a62799e44fd4aae3f9a2ec17e"},
|
| 999 |
+
{file = "grpcio-1.60.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6fd9584bf1bccdfff1512719316efa77be235469e1e3295dce64538c4773840b"},
|
| 1000 |
+
{file = "grpcio-1.60.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6a478581b1a1a8fdf3318ecb5f4d0cda41cacdffe2b527c23707c9c1b8fdb55"},
|
| 1001 |
+
{file = "grpcio-1.60.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:77c8a317f0fd5a0a2be8ed5cbe5341537d5c00bb79b3bb27ba7c5378ba77dbca"},
|
| 1002 |
+
{file = "grpcio-1.60.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1c30bb23a41df95109db130a6cc1b974844300ae2e5d68dd4947aacba5985aa5"},
|
| 1003 |
+
{file = "grpcio-1.60.0-cp312-cp312-win32.whl", hash = "sha256:2aef56e85901c2397bd557c5ba514f84de1f0ae5dd132f5d5fed042858115951"},
|
| 1004 |
+
{file = "grpcio-1.60.0-cp312-cp312-win_amd64.whl", hash = "sha256:e381fe0c2aa6c03b056ad8f52f8efca7be29fb4d9ae2f8873520843b6039612a"},
|
| 1005 |
+
{file = "grpcio-1.60.0-cp37-cp37m-linux_armv7l.whl", hash = "sha256:92f88ca1b956eb8427a11bb8b4a0c0b2b03377235fc5102cb05e533b8693a415"},
|
| 1006 |
+
{file = "grpcio-1.60.0-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:e278eafb406f7e1b1b637c2cf51d3ad45883bb5bd1ca56bc05e4fc135dfdaa65"},
|
| 1007 |
+
{file = "grpcio-1.60.0-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:a48edde788b99214613e440fce495bbe2b1e142a7f214cce9e0832146c41e324"},
|
| 1008 |
+
{file = "grpcio-1.60.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de2ad69c9a094bf37c1102b5744c9aec6cf74d2b635558b779085d0263166454"},
|
| 1009 |
+
{file = "grpcio-1.60.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:073f959c6f570797272f4ee9464a9997eaf1e98c27cb680225b82b53390d61e6"},
|
| 1010 |
+
{file = "grpcio-1.60.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c826f93050c73e7769806f92e601e0efdb83ec8d7c76ddf45d514fee54e8e619"},
|
| 1011 |
+
{file = "grpcio-1.60.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:9e30be89a75ee66aec7f9e60086fadb37ff8c0ba49a022887c28c134341f7179"},
|
| 1012 |
+
{file = "grpcio-1.60.0-cp37-cp37m-win_amd64.whl", hash = "sha256:b0fb2d4801546598ac5cd18e3ec79c1a9af8b8f2a86283c55a5337c5aeca4b1b"},
|
| 1013 |
+
{file = "grpcio-1.60.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:9073513ec380434eb8d21970e1ab3161041de121f4018bbed3146839451a6d8e"},
|
| 1014 |
+
{file = "grpcio-1.60.0-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:74d7d9fa97809c5b892449b28a65ec2bfa458a4735ddad46074f9f7d9550ad13"},
|
| 1015 |
+
{file = "grpcio-1.60.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:1434ca77d6fed4ea312901122dc8da6c4389738bf5788f43efb19a838ac03ead"},
|
| 1016 |
+
{file = "grpcio-1.60.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e61e76020e0c332a98290323ecfec721c9544f5b739fab925b6e8cbe1944cf19"},
|
| 1017 |
+
{file = "grpcio-1.60.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675997222f2e2f22928fbba640824aebd43791116034f62006e19730715166c0"},
|
| 1018 |
+
{file = "grpcio-1.60.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5208a57eae445ae84a219dfd8b56e04313445d146873117b5fa75f3245bc1390"},
|
| 1019 |
+
{file = "grpcio-1.60.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:428d699c8553c27e98f4d29fdc0f0edc50e9a8a7590bfd294d2edb0da7be3629"},
|
| 1020 |
+
{file = "grpcio-1.60.0-cp38-cp38-win32.whl", hash = "sha256:83f2292ae292ed5a47cdcb9821039ca8e88902923198f2193f13959360c01860"},
|
| 1021 |
+
{file = "grpcio-1.60.0-cp38-cp38-win_amd64.whl", hash = "sha256:705a68a973c4c76db5d369ed573fec3367d7d196673fa86614b33d8c8e9ebb08"},
|
| 1022 |
+
{file = "grpcio-1.60.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:c193109ca4070cdcaa6eff00fdb5a56233dc7610216d58fb81638f89f02e4968"},
|
| 1023 |
+
{file = "grpcio-1.60.0-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:676e4a44e740deaba0f4d95ba1d8c5c89a2fcc43d02c39f69450b1fa19d39590"},
|
| 1024 |
+
{file = "grpcio-1.60.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:5ff21e000ff2f658430bde5288cb1ac440ff15c0d7d18b5fb222f941b46cb0d2"},
|
| 1025 |
+
{file = "grpcio-1.60.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c86343cf9ff7b2514dd229bdd88ebba760bd8973dac192ae687ff75e39ebfab"},
|
| 1026 |
+
{file = "grpcio-1.60.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fd3b3968ffe7643144580f260f04d39d869fcc2cddb745deef078b09fd2b328"},
|
| 1027 |
+
{file = "grpcio-1.60.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:30943b9530fe3620e3b195c03130396cd0ee3a0d10a66c1bee715d1819001eaf"},
|
| 1028 |
+
{file = "grpcio-1.60.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b10241250cb77657ab315270b064a6c7f1add58af94befa20687e7c8d8603ae6"},
|
| 1029 |
+
{file = "grpcio-1.60.0-cp39-cp39-win32.whl", hash = "sha256:79a050889eb8d57a93ed21d9585bb63fca881666fc709f5d9f7f9372f5e7fd03"},
|
| 1030 |
+
{file = "grpcio-1.60.0-cp39-cp39-win_amd64.whl", hash = "sha256:8a97a681e82bc11a42d4372fe57898d270a2707f36c45c6676e49ce0d5c41353"},
|
| 1031 |
+
{file = "grpcio-1.60.0.tar.gz", hash = "sha256:2199165a1affb666aa24adf0c97436686d0a61bc5fc113c037701fb7c7fceb96"},
|
| 1032 |
+
]
|
| 1033 |
+
|
| 1034 |
+
[package.extras]
|
| 1035 |
+
protobuf = ["grpcio-tools (>=1.60.0)"]
|
| 1036 |
+
|
| 1037 |
[[package]]
|
| 1038 |
name = "huggingface-hub"
|
| 1039 |
version = "0.19.4"
|
|
|
|
| 5812 |
[metadata]
|
| 5813 |
lock-version = "2.0"
|
| 5814 |
python-versions = ">=3.9,<3.13"
|
| 5815 |
+
content-hash = "7d8f07f7b4ab7e802386b76d1add6ade5560636df131e8a7123436817638ad7c"
|
pyproject.toml
CHANGED
|
@@ -1,9 +1,12 @@
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker"
|
| 3 |
version = "0.1.0"
|
| 4 |
-
description = ""
|
| 5 |
-
authors = ["Vik Paruchuri <
|
| 6 |
readme = "README.md"
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
[tool.poetry.dependencies]
|
| 9 |
python = ">=3.9,<3.13"
|
|
@@ -29,6 +32,7 @@ ftfy = "^6.1.1"
|
|
| 29 |
nltk = "^3.8.1"
|
| 30 |
ocrmypdf = "^15.4.0"
|
| 31 |
bitsandbytes = "^0.41.2.post2"
|
|
|
|
| 32 |
|
| 33 |
[tool.poetry.group.dev.dependencies]
|
| 34 |
jupyter = "^1.0.0"
|
|
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker"
|
| 3 |
version = "0.1.0"
|
| 4 |
+
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
+
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|
| 7 |
+
license = "GPL-3.0-or-later"
|
| 8 |
+
repository = "https://github.com/VikParuchuri/marker"
|
| 9 |
+
keywords = ["pdf", "markdown", "ocr", "nlp"]
|
| 10 |
|
| 11 |
[tool.poetry.dependencies]
|
| 12 |
python = ">=3.9,<3.13"
|
|
|
|
| 32 |
nltk = "^3.8.1"
|
| 33 |
ocrmypdf = "^15.4.0"
|
| 34 |
bitsandbytes = "^0.41.2.post2"
|
| 35 |
+
grpcio = "^1.60.0"
|
| 36 |
|
| 37 |
[tool.poetry.group.dev.dependencies]
|
| 38 |
jupyter = "^1.0.0"
|
scripts/install/apt-requirements.txt
CHANGED
|
@@ -7,4 +7,8 @@ tesseract-ocr-deu
|
|
| 7 |
tesseract-ocr-por
|
| 8 |
tesseract-ocr-spa
|
| 9 |
tesseract-ocr-rus
|
| 10 |
-
tesseract-ocr-fra
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
tesseract-ocr-por
|
| 8 |
tesseract-ocr-spa
|
| 9 |
tesseract-ocr-rus
|
| 10 |
+
tesseract-ocr-fra
|
| 11 |
+
tesseract-ocr-chi-sim
|
| 12 |
+
tesseract-ocr-jpn
|
| 13 |
+
tesseract-ocr-kor
|
| 14 |
+
tesseract-ocr-hin
|
scripts/markdown_to_pdf.sh
CHANGED
|
@@ -7,4 +7,4 @@ if [ $# -ne 2 ]; then
|
|
| 7 |
exit 1
|
| 8 |
fi
|
| 9 |
|
| 10 |
-
pandoc $1 $2 --pdf-engine=xelatex --include-in-header=header.tex
|
|
|
|
| 7 |
exit 1
|
| 8 |
fi
|
| 9 |
|
| 10 |
+
pandoc $1 -o $2 --pdf-engine=xelatex --include-in-header=header.tex
|