Vik Paruchuri
committed on
Commit
·
55f74a0
1
Parent(s):
44d1d02
Avoid duplicate use llm flag
Browse files
- .github/workflows/scripts.yml +5 -1
- marker/config/parser.py +6 -0
- marker/config/printer.py +5 -0
- poetry.lock +5 -5
- pyproject.toml +2 -2
.github/workflows/scripts.yml
CHANGED
|
@@ -24,4 +24,8 @@ jobs:
|
|
| 24 |
- name: Test convert script
|
| 25 |
run: poetry run marker benchmark_data/pdfs --max_files 1 --workers 1 --page_range 0
|
| 26 |
- name: Text convert script multiple workers
|
| 27 |
-
run: poetry run marker benchmark_data/pdfs --max_files 2 --workers 2 --page_range 0-5
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
- name: Test convert script
|
| 25 |
run: poetry run marker benchmark_data/pdfs --max_files 1 --workers 1 --page_range 0
|
| 26 |
- name: Text convert script multiple workers
|
| 27 |
+
run: poetry run marker benchmark_data/pdfs --max_files 2 --workers 2 --page_range 0-5
|
| 28 |
+
- name: Test llm option
|
| 29 |
+
run: |
|
| 30 |
+
poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0 --use_llm > output.txt || true
|
| 31 |
+
grep -qv "UserWarning" output.txt
|
marker/config/parser.py
CHANGED
|
@@ -70,6 +70,12 @@ class ConfigParser:
|
|
| 70 |
)(fn)
|
| 71 |
|
| 72 |
# we put common options here
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
fn = click.option(
|
| 74 |
"--converter_cls",
|
| 75 |
type=str,
|
|
|
|
| 70 |
)(fn)
|
| 71 |
|
| 72 |
# we put common options here
|
| 73 |
+
fn = click.option(
|
| 74 |
+
"--use_llm",
|
| 75 |
+
is_flag=True,
|
| 76 |
+
default=False,
|
| 77 |
+
help="Use LLM for higher accuracy.",
|
| 78 |
+
)(fn)
|
| 79 |
fn = click.option(
|
| 80 |
"--converter_cls",
|
| 81 |
type=str,
|
marker/config/printer.py
CHANGED
|
@@ -6,6 +6,8 @@ from marker.config.crawler import crawler
|
|
| 6 |
|
| 7 |
|
| 8 |
class CustomClickPrinter(click.Command):
|
|
|
|
|
|
|
| 9 |
def parse_args(self, ctx, args):
|
| 10 |
display_help = "config" in args and "--help" in args
|
| 11 |
if display_help:
|
|
@@ -47,6 +49,9 @@ class CustomClickPrinter(click.Command):
|
|
| 47 |
# Add shared attribute options first
|
| 48 |
for attr, info in shared_attrs.items():
|
| 49 |
if info["type"] in attr_types:
|
|
|
|
|
|
|
|
|
|
| 50 |
ctx.command.params.append(
|
| 51 |
click.Option(
|
| 52 |
["--" + attr],
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
class CustomClickPrinter(click.Command):
|
| 9 |
+
force_flags = ["use_llm"]
|
| 10 |
+
|
| 11 |
def parse_args(self, ctx, args):
|
| 12 |
display_help = "config" in args and "--help" in args
|
| 13 |
if display_help:
|
|
|
|
| 49 |
# Add shared attribute options first
|
| 50 |
for attr, info in shared_attrs.items():
|
| 51 |
if info["type"] in attr_types:
|
| 52 |
+
if attr in self.force_flags:
|
| 53 |
+
continue
|
| 54 |
+
|
| 55 |
ctx.command.params.append(
|
| 56 |
click.Option(
|
| 57 |
["--" + attr],
|
poetry.lock
CHANGED
|
@@ -850,14 +850,14 @@ files = [
|
|
| 850 |
|
| 851 |
[[package]]
|
| 852 |
name = "click"
|
| 853 |
-
version = "8.
|
| 854 |
description = "Composable command line interface toolkit"
|
| 855 |
optional = false
|
| 856 |
-
python-versions = ">=3.
|
| 857 |
groups = ["main", "dev"]
|
| 858 |
files = [
|
| 859 |
-
{file = "click-8.
|
| 860 |
-
{file = "click-8.
|
| 861 |
]
|
| 862 |
|
| 863 |
[package.dependencies]
|
|
@@ -6467,4 +6467,4 @@ full = ["ebooklib", "mammoth", "openpyxl", "python-pptx", "weasyprint"]
|
|
| 6467 |
[metadata]
|
| 6468 |
lock-version = "2.1"
|
| 6469 |
python-versions = "^3.10"
|
| 6470 |
-
content-hash = "
|
|
|
|
| 850 |
|
| 851 |
[[package]]
|
| 852 |
name = "click"
|
| 853 |
+
version = "8.2.0"
|
| 854 |
description = "Composable command line interface toolkit"
|
| 855 |
optional = false
|
| 856 |
+
python-versions = ">=3.10"
|
| 857 |
groups = ["main", "dev"]
|
| 858 |
files = [
|
| 859 |
+
{file = "click-8.2.0-py3-none-any.whl", hash = "sha256:6b303f0b2aa85f1cb4e5303078fadcbcd4e476f114fab9b5007005711839325c"},
|
| 860 |
+
{file = "click-8.2.0.tar.gz", hash = "sha256:f5452aeddd9988eefa20f90f05ab66f17fce1ee2a36907fd30b05bbb5953814d"},
|
| 861 |
]
|
| 862 |
|
| 863 |
[package.dependencies]
|
|
|
|
| 6467 |
[metadata]
|
| 6468 |
lock-version = "2.1"
|
| 6469 |
python-versions = "^3.10"
|
| 6470 |
+
content-hash = "1f5c00e7588f89650cbe93b1ae34fa7b265d472b0b954e4a81a7b4912c2f3c01"
|
pyproject.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
-
version = "1.7.
|
| 4 |
description = "Convert documents to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|
|
@@ -30,7 +30,7 @@ surya-ocr = "^0.14.1"
|
|
| 30 |
regex = "^2024.4.28"
|
| 31 |
pdftext = "~0.6.2"
|
| 32 |
markdownify = "^0.13.1"
|
| 33 |
-
click = "^8.
|
| 34 |
markdown2 = "^2.5.2"
|
| 35 |
filetype = "^1.2.0"
|
| 36 |
scikit-learn = "^1.6.1"
|
|
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
+
version = "1.7.1"
|
| 4 |
description = "Convert documents to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|
|
|
|
| 30 |
regex = "^2024.4.28"
|
| 31 |
pdftext = "~0.6.2"
|
| 32 |
markdownify = "^0.13.1"
|
| 33 |
+
click = "^8.2.0"
|
| 34 |
markdown2 = "^2.5.2"
|
| 35 |
filetype = "^1.2.0"
|
| 36 |
scikit-learn = "^1.6.1"
|