Vik Paruchuri
committed on
Commit
·
44d1d02
1
Parent(s):
dcd5453
Fix click bug
Browse files
- README.md +1 -1
- marker/config/parser.py +0 -13
- marker/config/printer.py +39 -21
- marker/logger.py +0 -1
- marker/output.py +1 -1
- marker/renderers/extraction.py +2 -2
- tests/converters/test_extraction_converter.py +2 -2
README.md
CHANGED
|
@@ -43,7 +43,7 @@ As you can see, the use_llm mode offers higher accuracy than marker or gemini al
|
|
| 43 |
|
| 44 |
I want marker to be as widely accessible as possible, while still funding my development/training costs. Research and personal usage is always okay, but there are some restrictions on commercial usage.
|
| 45 |
|
| 46 |
-
The weights for the models are licensed `cc-by-nc-sa-4.0`, but I will waive that for any organization under \$
|
| 47 |
|
| 48 |
# Hosted API
|
| 49 |
|
|
|
|
| 43 |
|
| 44 |
I want marker to be as widely accessible as possible, while still funding my development/training costs. Research and personal usage is always okay, but there are some restrictions on commercial usage.
|
| 45 |
|
| 46 |
+
The weights for the models are licensed `cc-by-nc-sa-4.0`, but I will waive that for any organization under \$2M USD in gross revenue in the most recent 12-month period AND under \$2M in lifetime VC/angel funding raised. You also must not be competitive with the [Datalab API](https://www.datalab.to/). If you want to remove the GPL license requirements (dual-license) and/or use the weights commercially over the revenue limit, check out the options [here](https://www.datalab.to).
|
| 47 |
|
| 48 |
# Hosted API
|
| 49 |
|
marker/config/parser.py
CHANGED
|
@@ -12,7 +12,6 @@ from marker.renderers.json import JSONRenderer
|
|
| 12 |
from marker.renderers.markdown import MarkdownRenderer
|
| 13 |
from marker.settings import settings
|
| 14 |
from marker.util import classes_to_strings, parse_range_str, strings_to_classes
|
| 15 |
-
from marker.schema import BlockTypes
|
| 16 |
|
| 17 |
logger = get_logger()
|
| 18 |
|
|
@@ -71,11 +70,6 @@ class ConfigParser:
|
|
| 71 |
)(fn)
|
| 72 |
|
| 73 |
# we put common options here
|
| 74 |
-
fn = click.option(
|
| 75 |
-
"--use_llm",
|
| 76 |
-
default=False,
|
| 77 |
-
help="Enable higher quality processing with LLMs.",
|
| 78 |
-
)(fn)
|
| 79 |
fn = click.option(
|
| 80 |
"--converter_cls",
|
| 81 |
type=str,
|
|
@@ -88,13 +82,6 @@ class ConfigParser:
|
|
| 88 |
default=None,
|
| 89 |
help="LLM service to use - should be full import path, like marker.services.gemini.GoogleGeminiService",
|
| 90 |
)(fn)
|
| 91 |
-
|
| 92 |
-
# enum options
|
| 93 |
-
fn = click.option(
|
| 94 |
-
"--force_layout_block",
|
| 95 |
-
type=click.Choice(choices=[t.name for t in BlockTypes]),
|
| 96 |
-
default=None,
|
| 97 |
-
)(fn)
|
| 98 |
return fn
|
| 99 |
|
| 100 |
def generate_config_dict(self) -> Dict[str, any]:
|
|
|
|
| 12 |
from marker.renderers.markdown import MarkdownRenderer
|
| 13 |
from marker.settings import settings
|
| 14 |
from marker.util import classes_to_strings, parse_range_str, strings_to_classes
|
|
|
|
| 15 |
|
| 16 |
logger = get_logger()
|
| 17 |
|
|
|
|
| 70 |
)(fn)
|
| 71 |
|
| 72 |
# we put common options here
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
fn = click.option(
|
| 74 |
"--converter_cls",
|
| 75 |
type=str,
|
|
|
|
| 82 |
default=None,
|
| 83 |
help="LLM service to use - should be full import path, like marker.services.gemini.GoogleGeminiService",
|
| 84 |
)(fn)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
return fn
|
| 86 |
|
| 87 |
def generate_config_dict(self) -> Dict[str, any]:
|
marker/config/printer.py
CHANGED
|
@@ -7,11 +7,11 @@ from marker.config.crawler import crawler
|
|
| 7 |
|
| 8 |
class CustomClickPrinter(click.Command):
|
| 9 |
def parse_args(self, ctx, args):
|
| 10 |
-
|
| 11 |
-
display_help = 'config' in args and '--help' in args
|
| 12 |
if display_help:
|
| 13 |
click.echo(
|
| 14 |
-
"Here is a list of all the Builders, Processors, Converters, Providers and Renderers in Marker along with their attributes:"
|
|
|
|
| 15 |
|
| 16 |
# Keep track of shared attributes and their types
|
| 17 |
shared_attrs = {}
|
|
@@ -19,30 +19,42 @@ class CustomClickPrinter(click.Command):
|
|
| 19 |
# First pass: identify shared attributes and verify compatibility
|
| 20 |
for base_type, base_type_dict in crawler.class_config_map.items():
|
| 21 |
for class_name, class_map in base_type_dict.items():
|
| 22 |
-
for attr, (attr_type, formatted_type, default, metadata) in class_map[
|
|
|
|
|
|
|
| 23 |
if attr not in shared_attrs:
|
| 24 |
shared_attrs[attr] = {
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
| 30 |
}
|
| 31 |
-
shared_attrs[attr][
|
| 32 |
|
| 33 |
# These are the types of attrs that can be set from the command line
|
| 34 |
-
attr_types = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
# Add shared attribute options first
|
| 37 |
for attr, info in shared_attrs.items():
|
| 38 |
-
if info[
|
| 39 |
ctx.command.params.append(
|
| 40 |
click.Option(
|
| 41 |
["--" + attr],
|
| 42 |
-
type=info[
|
| 43 |
-
help=" ".join(info[
|
| 44 |
-
|
| 45 |
-
|
|
|
|
| 46 |
)
|
| 47 |
)
|
| 48 |
|
|
@@ -51,15 +63,21 @@ class CustomClickPrinter(click.Command):
|
|
| 51 |
if display_help:
|
| 52 |
click.echo(f"{base_type}s:")
|
| 53 |
for class_name, class_map in base_type_dict.items():
|
| 54 |
-
if display_help and class_map[
|
| 55 |
-
click.echo(
|
|
|
|
|
|
|
| 56 |
click.echo(" " * 4 + "Attributes:")
|
| 57 |
-
for attr, (attr_type, formatted_type, default, metadata) in class_map[
|
|
|
|
|
|
|
| 58 |
class_name_attr = class_name + "_" + attr
|
| 59 |
|
| 60 |
if display_help:
|
| 61 |
click.echo(" " * 8 + f"{attr} ({formatted_type}):")
|
| 62 |
-
click.echo(
|
|
|
|
|
|
|
| 63 |
|
| 64 |
if attr_type in attr_types:
|
| 65 |
is_flag = attr_type in [bool, Optional[bool]] and not default
|
|
@@ -71,7 +89,7 @@ class CustomClickPrinter(click.Command):
|
|
| 71 |
type=attr_type,
|
| 72 |
help=" ".join(metadata),
|
| 73 |
is_flag=is_flag,
|
| 74 |
-
default=None
|
| 75 |
)
|
| 76 |
)
|
| 77 |
|
|
|
|
| 7 |
|
| 8 |
class CustomClickPrinter(click.Command):
|
| 9 |
def parse_args(self, ctx, args):
|
| 10 |
+
display_help = "config" in args and "--help" in args
|
|
|
|
| 11 |
if display_help:
|
| 12 |
click.echo(
|
| 13 |
+
"Here is a list of all the Builders, Processors, Converters, Providers and Renderers in Marker along with their attributes:"
|
| 14 |
+
)
|
| 15 |
|
| 16 |
# Keep track of shared attributes and their types
|
| 17 |
shared_attrs = {}
|
|
|
|
| 19 |
# First pass: identify shared attributes and verify compatibility
|
| 20 |
for base_type, base_type_dict in crawler.class_config_map.items():
|
| 21 |
for class_name, class_map in base_type_dict.items():
|
| 22 |
+
for attr, (attr_type, formatted_type, default, metadata) in class_map[
|
| 23 |
+
"config"
|
| 24 |
+
].items():
|
| 25 |
if attr not in shared_attrs:
|
| 26 |
shared_attrs[attr] = {
|
| 27 |
+
"classes": [],
|
| 28 |
+
"type": attr_type,
|
| 29 |
+
"is_flag": attr_type in [bool, Optional[bool]]
|
| 30 |
+
and not default,
|
| 31 |
+
"metadata": metadata,
|
| 32 |
+
"default": default,
|
| 33 |
}
|
| 34 |
+
shared_attrs[attr]["classes"].append(class_name)
|
| 35 |
|
| 36 |
# These are the types of attrs that can be set from the command line
|
| 37 |
+
attr_types = [
|
| 38 |
+
str,
|
| 39 |
+
int,
|
| 40 |
+
float,
|
| 41 |
+
bool,
|
| 42 |
+
Optional[int],
|
| 43 |
+
Optional[float],
|
| 44 |
+
Optional[str],
|
| 45 |
+
]
|
| 46 |
|
| 47 |
# Add shared attribute options first
|
| 48 |
for attr, info in shared_attrs.items():
|
| 49 |
+
if info["type"] in attr_types:
|
| 50 |
ctx.command.params.append(
|
| 51 |
click.Option(
|
| 52 |
["--" + attr],
|
| 53 |
+
type=info["type"],
|
| 54 |
+
help=" ".join(info["metadata"])
|
| 55 |
+
+ f" (Applies to: {', '.join(info['classes'])})",
|
| 56 |
+
default=None, # This is important, or it sets all the default keys again in config
|
| 57 |
+
is_flag=info["is_flag"],
|
| 58 |
)
|
| 59 |
)
|
| 60 |
|
|
|
|
| 63 |
if display_help:
|
| 64 |
click.echo(f"{base_type}s:")
|
| 65 |
for class_name, class_map in base_type_dict.items():
|
| 66 |
+
if display_help and class_map["config"]:
|
| 67 |
+
click.echo(
|
| 68 |
+
f"\n {class_name}: {class_map['class_type'].__doc__ or ''}"
|
| 69 |
+
)
|
| 70 |
click.echo(" " * 4 + "Attributes:")
|
| 71 |
+
for attr, (attr_type, formatted_type, default, metadata) in class_map[
|
| 72 |
+
"config"
|
| 73 |
+
].items():
|
| 74 |
class_name_attr = class_name + "_" + attr
|
| 75 |
|
| 76 |
if display_help:
|
| 77 |
click.echo(" " * 8 + f"{attr} ({formatted_type}):")
|
| 78 |
+
click.echo(
|
| 79 |
+
"\n".join([f"{' ' * 12}" + desc for desc in metadata])
|
| 80 |
+
)
|
| 81 |
|
| 82 |
if attr_type in attr_types:
|
| 83 |
is_flag = attr_type in [bool, Optional[bool]] and not default
|
|
|
|
| 89 |
type=attr_type,
|
| 90 |
help=" ".join(metadata),
|
| 91 |
is_flag=is_flag,
|
| 92 |
+
default=None, # This is important, or it sets all the default keys again in config
|
| 93 |
)
|
| 94 |
)
|
| 95 |
|
marker/logger.py
CHANGED
|
@@ -18,7 +18,6 @@ def configure_logging():
|
|
| 18 |
|
| 19 |
# Ignore future warnings
|
| 20 |
warnings.simplefilter(action="ignore", category=FutureWarning)
|
| 21 |
-
warnings.simplefilter(action="ignore", category=UserWarning)
|
| 22 |
|
| 23 |
# Set component loglevels
|
| 24 |
logging.getLogger("PIL").setLevel(logging.ERROR)
|
|
|
|
| 18 |
|
| 19 |
# Ignore future warnings
|
| 20 |
warnings.simplefilter(action="ignore", category=FutureWarning)
|
|
|
|
| 21 |
|
| 22 |
# Set component loglevels
|
| 23 |
logging.getLogger("PIL").setLevel(logging.ERROR)
|
marker/output.py
CHANGED
|
@@ -62,7 +62,7 @@ def text_from_rendered(rendered: BaseModel):
|
|
| 62 |
elif isinstance(rendered, OCRJSONOutput):
|
| 63 |
return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
|
| 64 |
elif isinstance(rendered, ExtractionOutput):
|
| 65 |
-
return rendered.
|
| 66 |
else:
|
| 67 |
raise ValueError("Invalid output type")
|
| 68 |
|
|
|
|
| 62 |
elif isinstance(rendered, OCRJSONOutput):
|
| 63 |
return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
|
| 64 |
elif isinstance(rendered, ExtractionOutput):
|
| 65 |
+
return rendered.document_json, "json", {}
|
| 66 |
else:
|
| 67 |
raise ValueError("Invalid output type")
|
| 68 |
|
marker/renderers/extraction.py
CHANGED
|
@@ -42,7 +42,7 @@ def merge_keys(
|
|
| 42 |
|
| 43 |
class ExtractionOutput(BaseModel):
|
| 44 |
pages: Dict[int, ExtractionResult]
|
| 45 |
-
|
| 46 |
|
| 47 |
|
| 48 |
class ExtractionRenderer(BaseRenderer):
|
|
@@ -61,4 +61,4 @@ class ExtractionRenderer(BaseRenderer):
|
|
| 61 |
)
|
| 62 |
merge_keys(merged_result, outputs[pnum].extracted_data, merge_data)
|
| 63 |
|
| 64 |
-
return ExtractionOutput(pages=outputs,
|
|
|
|
| 42 |
|
| 43 |
class ExtractionOutput(BaseModel):
|
| 44 |
pages: Dict[int, ExtractionResult]
|
| 45 |
+
document_json: dict
|
| 46 |
|
| 47 |
|
| 48 |
class ExtractionRenderer(BaseRenderer):
|
|
|
|
| 61 |
)
|
| 62 |
merge_keys(merged_result, outputs[pnum].extracted_data, merge_data)
|
| 63 |
|
| 64 |
+
return ExtractionOutput(pages=outputs, document_json=merged_result)
|
tests/converters/test_extraction_converter.py
CHANGED
|
@@ -62,5 +62,5 @@ def test_extraction_converter_multiple_pages(extraction_converter, temp_doc):
|
|
| 62 |
result = extraction_converter(temp_doc.name)
|
| 63 |
|
| 64 |
assert result is not None
|
| 65 |
-
assert result.
|
| 66 |
-
assert result.
|
|
|
|
| 62 |
result = extraction_converter(temp_doc.name)
|
| 63 |
|
| 64 |
assert result is not None
|
| 65 |
+
assert result.document_json is not None
|
| 66 |
+
assert result.document_json == {"test_key": "test_value"}
|