Vik Paruchuri committed on
Commit
948c3c2
·
1 Parent(s): 2fa0059

Iterate on llm processors

Browse files
marker/builders/structure.py CHANGED
@@ -2,6 +2,7 @@ from typing import Annotated
2
 
3
  from marker.builders import BaseBuilder
4
  from marker.schema import BlockTypes
 
5
  from marker.schema.document import Document
6
  from marker.schema.groups import ListGroup
7
  from marker.schema.groups.page import PageGroup
@@ -28,6 +29,7 @@ class StructureBuilder(BaseBuilder):
28
  for page in document.pages:
29
  self.group_caption_blocks(page)
30
  self.group_lists(page)
 
31
 
32
  def group_caption_blocks(self, page: PageGroup):
33
  gap_threshold_px = self.gap_threshold * page.polygon.height
@@ -110,3 +112,15 @@ class StructureBuilder(BaseBuilder):
110
  remove_ids.extend(block_structure)
111
 
112
  page.remove_structure_items(remove_ids)
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  from marker.builders import BaseBuilder
4
  from marker.schema import BlockTypes
5
+ from marker.schema.blocks import Text
6
  from marker.schema.document import Document
7
  from marker.schema.groups import ListGroup
8
  from marker.schema.groups.page import PageGroup
 
29
  for page in document.pages:
30
  self.group_caption_blocks(page)
31
  self.group_lists(page)
32
+ self.unmark_lists(page)
33
 
34
  def group_caption_blocks(self, page: PageGroup):
35
  gap_threshold_px = self.gap_threshold * page.polygon.height
 
112
  remove_ids.extend(block_structure)
113
 
114
  page.remove_structure_items(remove_ids)
115
+
116
+ def unmark_lists(self, page: PageGroup):
117
+ # If lists aren't grouped, unmark them as list items
118
+ for block_id in page.structure:
119
+ block = page.get_block(block_id)
120
+ if block.block_type == BlockTypes.ListItem:
121
+ generated_block = Text(
122
+ polygon=block.polygon,
123
+ page_id=block.page_id,
124
+ structure=block.structure,
125
+ )
126
+ page.replace_block(block, generated_block)
marker/processors/llm/llm_equation.py CHANGED
@@ -30,7 +30,7 @@ You'll receive an image of a math block that may contain one or more equations.
30
 
31
  Some guidelines:
32
  - Output valid html, where all the equations can render properly.
33
- - Use <math display="block"> as a block equation delimiter and <math> for inline equations.
34
  - Keep the LaTeX code inside the math tags simple, concise, and KaTeX compatible.
35
  - Enclose all equations in the correct math tags. Use multiple math tags inside the html to represent multiple equations.
36
  - Only use the html tags math, i, b, p, and br.
@@ -103,7 +103,12 @@ Output:
103
  return
104
 
105
  html_equation = response["html_equation"]
106
- if len(html_equation) < len(text) * .5:
 
 
 
 
 
107
  block.update_metadata(llm_error_count=1)
108
  return
109
 
 
30
 
31
  Some guidelines:
32
  - Output valid html, where all the equations can render properly.
33
+ - Use <math display="block"> as a block equation delimiter and <math> for inline equations. Do not use $ or $$ as delimiters.
34
  - Keep the LaTeX code inside the math tags simple, concise, and KaTeX compatible.
35
  - Enclose all equations in the correct math tags. Use multiple math tags inside the html to represent multiple equations.
36
  - Only use the html tags math, i, b, p, and br.
 
103
  return
104
 
105
  html_equation = response["html_equation"]
106
+ balanced_tags = html_equation.count("<math") == html_equation.count("</math>")
107
+ if not all([
108
+ html_equation,
109
+ balanced_tags,
110
+ len(html_equation) > len(text) * .3,
111
+ ]):
112
  block.update_metadata(llm_error_count=1)
113
  return
114
 
marker/processors/llm/llm_inlinemath.py CHANGED
@@ -40,7 +40,7 @@ Your task is to correct any errors in the extracted block, including math, forma
40
  3. Compare the extracted text to the corresponding text in the image.
41
  4. If there are no errors in any of the extracted text, output "No corrections needed".
42
  5. Correct any errors in the extracted text, including:
43
- * Inline math: Ensure all mathematical expressions are correctly formatted and rendered. Surround them with <math>...</math> tags. The math expressions should be rendered in simple, concise, KaTeX-compatible LaTeX.
44
  * If a math expression is not in LaTeX format, convert it to LaTeX format, and surround it with <math>...</math> tags.
45
  * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, subscripts/superscripts, and special characters. Use the <h1>, <h2>, <h3>, <h4>, <i>, <b>, <sup>, <sub>, and <span> tags to format the text as needed.
46
  * Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies.
@@ -133,8 +133,6 @@ Adversarial training <i>(AT)</i> <a href='#page-9-1'>[23]</a>, which aims to min
133
  return text_lines, extracted_lines
134
 
135
  def process_rewriting(self, document: Document, page: PageGroup, block: Block):
136
- SpanClass = get_block_class(BlockTypes.Span)
137
-
138
  block_text = self.get_block_text(block, document)
139
  prompt = self.text_math_rewriting_prompt.replace("{extracted_html}", block_text)
140
 
 
40
  3. Compare the extracted text to the corresponding text in the image.
41
  4. If there are no errors in any of the extracted text, output "No corrections needed".
42
  5. Correct any errors in the extracted text, including:
43
+ * Inline math: Ensure all mathematical expressions are correctly formatted and rendered. Surround them with <math>...</math> tags. The math expressions should be rendered in simple, concise, KaTeX-compatible LaTeX. Do not use $ or $$ as delimiters.
44
  * If a math expression is not in LaTeX format, convert it to LaTeX format, and surround it with <math>...</math> tags.
45
  * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, subscripts/superscripts, and special characters. Use the <h1>, <h2>, <h3>, <h4>, <i>, <b>, <sup>, <sub>, and <span> tags to format the text as needed.
46
  * Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies.
 
133
  return text_lines, extracted_lines
134
 
135
  def process_rewriting(self, document: Document, page: PageGroup, block: Block):
 
 
136
  block_text = self.get_block_text(block, document)
137
  prompt = self.text_math_rewriting_prompt.replace("{extracted_html}", block_text)
138
 
marker/processors/llm/llm_text.py CHANGED
@@ -33,7 +33,7 @@ The number of output lines MUST match the number of input lines. Stay as faithf
33
  2. Analyze the extracted lines.
34
  3. For each extracted line, compare it to the corresponding line in the image.
35
  4. Correct any errors in the extracted line, including:
36
- * Inline math: Ensure all mathematical expressions are correctly formatted and rendered. Use the `<math>` and `</math>` tags to surround inline math properly. Make sure the opening and closing tags appear in pairs, on the same line. The math should be written in simple, concise, KaTeX-compatible LaTeX.
37
  * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, subscripts/superscripts, and special characters. Use the `<i>`, `<b>`, `<sup>`, `<sub>`, and `<span>` tags to format the text as needed.
38
  * Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies.
39
  5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error. The formatting
 
33
  2. Analyze the extracted lines.
34
  3. For each extracted line, compare it to the corresponding line in the image.
35
  4. Correct any errors in the extracted line, including:
36
+ * Inline math: Ensure all mathematical expressions are correctly formatted and rendered. Use the `<math>` and `</math>` tags to surround inline math properly. Make sure the opening and closing tags appear in pairs, on the same line. The math should be written in simple, concise, KaTeX-compatible LaTeX. Do not use $ or $$ as delimiters.
37
  * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, subscripts/superscripts, and special characters. Use the `<i>`, `<b>`, `<sup>`, `<sub>`, and `<span>` tags to format the text as needed.
38
  * Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies.
39
  5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error. The formatting
marker/services/claude.py CHANGED
@@ -17,7 +17,7 @@ class ClaudeService(BaseService):
17
  claude_model_name: Annotated[
18
  str,
19
  "The name of the Google model to use for the service."
20
- ] = "claude-3-7-sonnet-20250219"
21
  claude_api_key: Annotated[
22
  str,
23
  "The Claude API key to use for the service."
 
17
  claude_model_name: Annotated[
18
  str,
19
  "The name of the Google model to use for the service."
20
+ ] = "claude-3-5-haiku-20241022"
21
  claude_api_key: Annotated[
22
  str,
23
  "The Claude API key to use for the service."