Vik Paruchuri committed on
Commit
44d1d02
·
1 Parent(s): dcd5453

Fix click bug

Browse files
README.md CHANGED
@@ -43,7 +43,7 @@ As you can see, the use_llm mode offers higher accuracy than marker or gemini al
43
 
44
  I want marker to be as widely accessible as possible, while still funding my development/training costs. Research and personal usage is always okay, but there are some restrictions on commercial usage.
45
 
46
- The weights for the models are licensed `cc-by-nc-sa-4.0`, but I will waive that for any organization under \$5M USD in gross revenue in the most recent 12-month period AND under $5M in lifetime VC/angel funding raised. You also must not be competitive with the [Datalab API](https://www.datalab.to/). If you want to remove the GPL license requirements (dual-license) and/or use the weights commercially over the revenue limit, check out the options [here](https://www.datalab.to).
47
 
48
  # Hosted API
49
 
 
43
 
44
  I want marker to be as widely accessible as possible, while still funding my development/training costs. Research and personal usage is always okay, but there are some restrictions on commercial usage.
45
 
46
+ The weights for the models are licensed `cc-by-nc-sa-4.0`, but I will waive that for any organization under \$2M USD in gross revenue in the most recent 12-month period AND under \$2M in lifetime VC/angel funding raised. You also must not be competitive with the [Datalab API](https://www.datalab.to/). If you want to remove the GPL license requirements (dual-license) and/or use the weights commercially over the revenue limit, check out the options [here](https://www.datalab.to).
47
 
48
  # Hosted API
49
 
marker/config/parser.py CHANGED
@@ -12,7 +12,6 @@ from marker.renderers.json import JSONRenderer
12
  from marker.renderers.markdown import MarkdownRenderer
13
  from marker.settings import settings
14
  from marker.util import classes_to_strings, parse_range_str, strings_to_classes
15
- from marker.schema import BlockTypes
16
 
17
  logger = get_logger()
18
 
@@ -71,11 +70,6 @@ class ConfigParser:
71
  )(fn)
72
 
73
  # we put common options here
74
- fn = click.option(
75
- "--use_llm",
76
- default=False,
77
- help="Enable higher quality processing with LLMs.",
78
- )(fn)
79
  fn = click.option(
80
  "--converter_cls",
81
  type=str,
@@ -88,13 +82,6 @@ class ConfigParser:
88
  default=None,
89
  help="LLM service to use - should be full import path, like marker.services.gemini.GoogleGeminiService",
90
  )(fn)
91
-
92
- # enum options
93
- fn = click.option(
94
- "--force_layout_block",
95
- type=click.Choice(choices=[t.name for t in BlockTypes]),
96
- default=None,
97
- )(fn)
98
  return fn
99
 
100
  def generate_config_dict(self) -> Dict[str, any]:
 
12
  from marker.renderers.markdown import MarkdownRenderer
13
  from marker.settings import settings
14
  from marker.util import classes_to_strings, parse_range_str, strings_to_classes
 
15
 
16
  logger = get_logger()
17
 
 
70
  )(fn)
71
 
72
  # we put common options here
 
 
 
 
 
73
  fn = click.option(
74
  "--converter_cls",
75
  type=str,
 
82
  default=None,
83
  help="LLM service to use - should be full import path, like marker.services.gemini.GoogleGeminiService",
84
  )(fn)
 
 
 
 
 
 
 
85
  return fn
86
 
87
  def generate_config_dict(self) -> Dict[str, any]:
marker/config/printer.py CHANGED
@@ -7,11 +7,11 @@ from marker.config.crawler import crawler
7
 
8
  class CustomClickPrinter(click.Command):
9
  def parse_args(self, ctx, args):
10
-
11
- display_help = 'config' in args and '--help' in args
12
  if display_help:
13
  click.echo(
14
- "Here is a list of all the Builders, Processors, Converters, Providers and Renderers in Marker along with their attributes:")
 
15
 
16
  # Keep track of shared attributes and their types
17
  shared_attrs = {}
@@ -19,30 +19,42 @@ class CustomClickPrinter(click.Command):
19
  # First pass: identify shared attributes and verify compatibility
20
  for base_type, base_type_dict in crawler.class_config_map.items():
21
  for class_name, class_map in base_type_dict.items():
22
- for attr, (attr_type, formatted_type, default, metadata) in class_map['config'].items():
 
 
23
  if attr not in shared_attrs:
24
  shared_attrs[attr] = {
25
- 'classes': [],
26
- 'type': attr_type,
27
- 'is_flag': attr_type in [bool, Optional[bool]] and not default,
28
- 'metadata': metadata,
29
- 'default': default
 
30
  }
31
- shared_attrs[attr]['classes'].append(class_name)
32
 
33
  # These are the types of attrs that can be set from the command line
34
- attr_types = [str, int, float, bool, Optional[int], Optional[float], Optional[str]]
 
 
 
 
 
 
 
 
35
 
36
  # Add shared attribute options first
37
  for attr, info in shared_attrs.items():
38
- if info['type'] in attr_types:
39
  ctx.command.params.append(
40
  click.Option(
41
  ["--" + attr],
42
- type=info['type'],
43
- help=" ".join(info['metadata']) + f" (Applies to: {', '.join(info['classes'])})",
44
- default=None, # This is important, or it sets all the default keys again in config
45
- is_flag=info['is_flag'],
 
46
  )
47
  )
48
 
@@ -51,15 +63,21 @@ class CustomClickPrinter(click.Command):
51
  if display_help:
52
  click.echo(f"{base_type}s:")
53
  for class_name, class_map in base_type_dict.items():
54
- if display_help and class_map['config']:
55
- click.echo(f"\n {class_name}: {class_map['class_type'].__doc__ or ''}")
 
 
56
  click.echo(" " * 4 + "Attributes:")
57
- for attr, (attr_type, formatted_type, default, metadata) in class_map['config'].items():
 
 
58
  class_name_attr = class_name + "_" + attr
59
 
60
  if display_help:
61
  click.echo(" " * 8 + f"{attr} ({formatted_type}):")
62
- click.echo("\n".join([f'{" " * 12}' + desc for desc in metadata]))
 
 
63
 
64
  if attr_type in attr_types:
65
  is_flag = attr_type in [bool, Optional[bool]] and not default
@@ -71,7 +89,7 @@ class CustomClickPrinter(click.Command):
71
  type=attr_type,
72
  help=" ".join(metadata),
73
  is_flag=is_flag,
74
- default=None # This is important, or it sets all the default keys again in config
75
  )
76
  )
77
 
 
7
 
8
  class CustomClickPrinter(click.Command):
9
  def parse_args(self, ctx, args):
10
+ display_help = "config" in args and "--help" in args
 
11
  if display_help:
12
  click.echo(
13
+ "Here is a list of all the Builders, Processors, Converters, Providers and Renderers in Marker along with their attributes:"
14
+ )
15
 
16
  # Keep track of shared attributes and their types
17
  shared_attrs = {}
 
19
  # First pass: identify shared attributes and verify compatibility
20
  for base_type, base_type_dict in crawler.class_config_map.items():
21
  for class_name, class_map in base_type_dict.items():
22
+ for attr, (attr_type, formatted_type, default, metadata) in class_map[
23
+ "config"
24
+ ].items():
25
  if attr not in shared_attrs:
26
  shared_attrs[attr] = {
27
+ "classes": [],
28
+ "type": attr_type,
29
+ "is_flag": attr_type in [bool, Optional[bool]]
30
+ and not default,
31
+ "metadata": metadata,
32
+ "default": default,
33
  }
34
+ shared_attrs[attr]["classes"].append(class_name)
35
 
36
  # These are the types of attrs that can be set from the command line
37
+ attr_types = [
38
+ str,
39
+ int,
40
+ float,
41
+ bool,
42
+ Optional[int],
43
+ Optional[float],
44
+ Optional[str],
45
+ ]
46
 
47
  # Add shared attribute options first
48
  for attr, info in shared_attrs.items():
49
+ if info["type"] in attr_types:
50
  ctx.command.params.append(
51
  click.Option(
52
  ["--" + attr],
53
+ type=info["type"],
54
+ help=" ".join(info["metadata"])
55
+ + f" (Applies to: {', '.join(info['classes'])})",
56
+ default=None, # This is important, or it sets all the default keys again in config
57
+ is_flag=info["is_flag"],
58
  )
59
  )
60
 
 
63
  if display_help:
64
  click.echo(f"{base_type}s:")
65
  for class_name, class_map in base_type_dict.items():
66
+ if display_help and class_map["config"]:
67
+ click.echo(
68
+ f"\n {class_name}: {class_map['class_type'].__doc__ or ''}"
69
+ )
70
  click.echo(" " * 4 + "Attributes:")
71
+ for attr, (attr_type, formatted_type, default, metadata) in class_map[
72
+ "config"
73
+ ].items():
74
  class_name_attr = class_name + "_" + attr
75
 
76
  if display_help:
77
  click.echo(" " * 8 + f"{attr} ({formatted_type}):")
78
+ click.echo(
79
+ "\n".join([f"{' ' * 12}" + desc for desc in metadata])
80
+ )
81
 
82
  if attr_type in attr_types:
83
  is_flag = attr_type in [bool, Optional[bool]] and not default
 
89
  type=attr_type,
90
  help=" ".join(metadata),
91
  is_flag=is_flag,
92
+ default=None, # This is important, or it sets all the default keys again in config
93
  )
94
  )
95
 
marker/logger.py CHANGED
@@ -18,7 +18,6 @@ def configure_logging():
18
 
19
  # Ignore future warnings
20
  warnings.simplefilter(action="ignore", category=FutureWarning)
21
- warnings.simplefilter(action="ignore", category=UserWarning)
22
 
23
  # Set component loglevels
24
  logging.getLogger("PIL").setLevel(logging.ERROR)
 
18
 
19
  # Ignore future warnings
20
  warnings.simplefilter(action="ignore", category=FutureWarning)
 
21
 
22
  # Set component loglevels
23
  logging.getLogger("PIL").setLevel(logging.ERROR)
marker/output.py CHANGED
@@ -62,7 +62,7 @@ def text_from_rendered(rendered: BaseModel):
62
  elif isinstance(rendered, OCRJSONOutput):
63
  return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
64
  elif isinstance(rendered, ExtractionOutput):
65
- return rendered.json, "json", {}
66
  else:
67
  raise ValueError("Invalid output type")
68
 
 
62
  elif isinstance(rendered, OCRJSONOutput):
63
  return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
64
  elif isinstance(rendered, ExtractionOutput):
65
+ return rendered.document_json, "json", {}
66
  else:
67
  raise ValueError("Invalid output type")
68
 
marker/renderers/extraction.py CHANGED
@@ -42,7 +42,7 @@ def merge_keys(
42
 
43
  class ExtractionOutput(BaseModel):
44
  pages: Dict[int, ExtractionResult]
45
- json: dict
46
 
47
 
48
  class ExtractionRenderer(BaseRenderer):
@@ -61,4 +61,4 @@ class ExtractionRenderer(BaseRenderer):
61
  )
62
  merge_keys(merged_result, outputs[pnum].extracted_data, merge_data)
63
 
64
- return ExtractionOutput(pages=outputs, json=merged_result)
 
42
 
43
  class ExtractionOutput(BaseModel):
44
  pages: Dict[int, ExtractionResult]
45
+ document_json: dict
46
 
47
 
48
  class ExtractionRenderer(BaseRenderer):
 
61
  )
62
  merge_keys(merged_result, outputs[pnum].extracted_data, merge_data)
63
 
64
+ return ExtractionOutput(pages=outputs, document_json=merged_result)
tests/converters/test_extraction_converter.py CHANGED
@@ -62,5 +62,5 @@ def test_extraction_converter_multiple_pages(extraction_converter, temp_doc):
62
  result = extraction_converter(temp_doc.name)
63
 
64
  assert result is not None
65
- assert result.json is not None
66
- assert result.json == {"test_key": "test_value"}
 
62
  result = extraction_converter(temp_doc.name)
63
 
64
  assert result is not None
65
+ assert result.document_json is not None
66
+ assert result.document_json == {"test_key": "test_value"}