Vik Paruchuri
committed on
Commit
·
948c3c2
1
Parent(s):
2fa0059
Iterate on llm processors
Browse files
marker/builders/structure.py
CHANGED
|
@@ -2,6 +2,7 @@ from typing import Annotated
|
|
| 2 |
|
| 3 |
from marker.builders import BaseBuilder
|
| 4 |
from marker.schema import BlockTypes
|
|
|
|
| 5 |
from marker.schema.document import Document
|
| 6 |
from marker.schema.groups import ListGroup
|
| 7 |
from marker.schema.groups.page import PageGroup
|
|
@@ -28,6 +29,7 @@ class StructureBuilder(BaseBuilder):
|
|
| 28 |
for page in document.pages:
|
| 29 |
self.group_caption_blocks(page)
|
| 30 |
self.group_lists(page)
|
|
|
|
| 31 |
|
| 32 |
def group_caption_blocks(self, page: PageGroup):
|
| 33 |
gap_threshold_px = self.gap_threshold * page.polygon.height
|
|
@@ -110,3 +112,15 @@ class StructureBuilder(BaseBuilder):
|
|
| 110 |
remove_ids.extend(block_structure)
|
| 111 |
|
| 112 |
page.remove_structure_items(remove_ids)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from marker.builders import BaseBuilder
|
| 4 |
from marker.schema import BlockTypes
|
| 5 |
+
from marker.schema.blocks import Text
|
| 6 |
from marker.schema.document import Document
|
| 7 |
from marker.schema.groups import ListGroup
|
| 8 |
from marker.schema.groups.page import PageGroup
|
|
|
|
| 29 |
for page in document.pages:
|
| 30 |
self.group_caption_blocks(page)
|
| 31 |
self.group_lists(page)
|
| 32 |
+
self.unmark_lists(page)
|
| 33 |
|
| 34 |
def group_caption_blocks(self, page: PageGroup):
|
| 35 |
gap_threshold_px = self.gap_threshold * page.polygon.height
|
|
|
|
| 112 |
remove_ids.extend(block_structure)
|
| 113 |
|
| 114 |
page.remove_structure_items(remove_ids)
|
| 115 |
+
|
| 116 |
+
def unmark_lists(self, page: PageGroup):
    """Turn list items still present in the page structure into Text blocks.

    Walks every block id in ``page.structure``. Any block whose type is
    ``BlockTypes.ListItem`` is swapped, through ``page.replace_block``, for a
    freshly built ``Text`` block that reuses the original polygon, page id
    and structure.

    Args:
        page: The page group whose structure entries are inspected.
    """
    for structure_id in page.structure:
        candidate = page.get_block(structure_id)
        if candidate.block_type != BlockTypes.ListItem:
            continue

        # If lists aren't grouped, unmark them as list items.
        replacement = Text(
            polygon=candidate.polygon,
            page_id=candidate.page_id,
            structure=candidate.structure,
        )
        page.replace_block(candidate, replacement)
|
marker/processors/llm/llm_equation.py
CHANGED
|
@@ -30,7 +30,7 @@ You'll receive an image of a math block that may contain one or more equations.
|
|
| 30 |
|
| 31 |
Some guidelines:
|
| 32 |
- Output valid html, where all the equations can render properly.
|
| 33 |
-
- Use <math display="block"> as a block equation delimiter and <math> for inline equations.
|
| 34 |
- Keep the LaTeX code inside the math tags simple, concise, and KaTeX compatible.
|
| 35 |
- Enclose all equations in the correct math tags. Use multiple math tags inside the html to represent multiple equations.
|
| 36 |
- Only use the html tags math, i, b, p, and br.
|
|
@@ -103,7 +103,12 @@ Output:
|
|
| 103 |
return
|
| 104 |
|
| 105 |
html_equation = response["html_equation"]
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
block.update_metadata(llm_error_count=1)
|
| 108 |
return
|
| 109 |
|
|
|
|
| 30 |
|
| 31 |
Some guidelines:
|
| 32 |
- Output valid html, where all the equations can render properly.
|
| 33 |
+
- Use <math display="block"> as a block equation delimiter and <math> for inline equations. Do not use $ or $$ as delimiters.
|
| 34 |
- Keep the LaTeX code inside the math tags simple, concise, and KaTeX compatible.
|
| 35 |
- Enclose all equations in the correct math tags. Use multiple math tags inside the html to represent multiple equations.
|
| 36 |
- Only use the html tags math, i, b, p, and br.
|
|
|
|
| 103 |
return
|
| 104 |
|
| 105 |
html_equation = response["html_equation"]
|
| 106 |
+
balanced_tags = html_equation.count("<math") == html_equation.count("</math>")
|
| 107 |
+
if not all([
|
| 108 |
+
html_equation,
|
| 109 |
+
balanced_tags,
|
| 110 |
+
len(html_equation) > len(text) * .3,
|
| 111 |
+
]):
|
| 112 |
block.update_metadata(llm_error_count=1)
|
| 113 |
return
|
| 114 |
|
marker/processors/llm/llm_inlinemath.py
CHANGED
|
@@ -40,7 +40,7 @@ Your task is to correct any errors in the extracted block, including math, forma
|
|
| 40 |
3. Compare the extracted text to the corresponding text in the image.
|
| 41 |
4. If there are no errors in any of the extracted text, output "No corrections needed".
|
| 42 |
5. Correct any errors in the extracted text, including:
|
| 43 |
-
* Inline math: Ensure all mathematical expressions are correctly formatted and rendered. Surround them with <math>...</math> tags. The math expressions should be rendered in simple, concise, KaTeX-compatible LaTeX.
|
| 44 |
* If a math expression is not in LaTeX format, convert it to LaTeX format, and surround it with <math>...</math> tags.
|
| 45 |
* Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, subscripts/superscripts, and special characters. Use the <h1>, <h2>, <h3>, <h4>, <i>, <b>, <sup>, <sub>, and <span> tags to format the text as needed.
|
| 46 |
* Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies.
|
|
@@ -133,8 +133,6 @@ Adversarial training <i>(AT)</i> <a href='#page-9-1'>[23]</a>, which aims to min
|
|
| 133 |
return text_lines, extracted_lines
|
| 134 |
|
| 135 |
def process_rewriting(self, document: Document, page: PageGroup, block: Block):
|
| 136 |
-
SpanClass = get_block_class(BlockTypes.Span)
|
| 137 |
-
|
| 138 |
block_text = self.get_block_text(block, document)
|
| 139 |
prompt = self.text_math_rewriting_prompt.replace("{extracted_html}", block_text)
|
| 140 |
|
|
|
|
| 40 |
3. Compare the extracted text to the corresponding text in the image.
|
| 41 |
4. If there are no errors in any of the extracted text, output "No corrections needed".
|
| 42 |
5. Correct any errors in the extracted text, including:
|
| 43 |
+
* Inline math: Ensure all mathematical expressions are correctly formatted and rendered. Surround them with <math>...</math> tags. The math expressions should be rendered in simple, concise, KaTeX-compatible LaTeX. Do not use $ or $$ as delimiters.
|
| 44 |
* If a math expression is not in LaTeX format, convert it to LaTeX format, and surround it with <math>...</math> tags.
|
| 45 |
* Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, subscripts/superscripts, and special characters. Use the <h1>, <h2>, <h3>, <h4>, <i>, <b>, <sup>, <sub>, and <span> tags to format the text as needed.
|
| 46 |
* Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies.
|
|
|
|
| 133 |
return text_lines, extracted_lines
|
| 134 |
|
| 135 |
def process_rewriting(self, document: Document, page: PageGroup, block: Block):
|
|
|
|
|
|
|
| 136 |
block_text = self.get_block_text(block, document)
|
| 137 |
prompt = self.text_math_rewriting_prompt.replace("{extracted_html}", block_text)
|
| 138 |
|
marker/processors/llm/llm_text.py
CHANGED
|
@@ -33,7 +33,7 @@ The number of output lines MUST match the number of input lines. Stay as faithf
|
|
| 33 |
2. Analyze the extracted lines.
|
| 34 |
3. For each extracted line, compare it to the corresponding line in the image.
|
| 35 |
4. Correct any errors in the extracted line, including:
|
| 36 |
-
* Inline math: Ensure all mathematical expressions are correctly formatted and rendered. Use the `<math>` and `</math>` tags to surround inline math properly. Make sure the opening and closing tags appear in pairs, on the same line. The math should be written in simple, concise, KaTeX-compatible LaTeX.
|
| 37 |
* Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, subscripts/superscripts, and special characters. Use the `<i>`, `<b>`, `<sup>`, `<sub>`, and `<span>` tags to format the text as needed.
|
| 38 |
* Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies.
|
| 39 |
5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error. The formatting
|
|
|
|
| 33 |
2. Analyze the extracted lines.
|
| 34 |
3. For each extracted line, compare it to the corresponding line in the image.
|
| 35 |
4. Correct any errors in the extracted line, including:
|
| 36 |
+
* Inline math: Ensure all mathematical expressions are correctly formatted and rendered. Use the `<math>` and `</math>` tags to surround inline math properly. Make sure the opening and closing tags appear in pairs, on the same line. The math should be written in simple, concise, KaTeX-compatible LaTeX. Do not use $ or $$ as delimiters.
|
| 37 |
* Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, subscripts/superscripts, and special characters. Use the `<i>`, `<b>`, `<sup>`, `<sub>`, and `<span>` tags to format the text as needed.
|
| 38 |
* Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies.
|
| 39 |
5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error. The formatting
|
marker/services/claude.py
CHANGED
|
@@ -17,7 +17,7 @@ class ClaudeService(BaseService):
|
|
| 17 |
claude_model_name: Annotated[
|
| 18 |
str,
|
| 19 |
"The name of the Google model to use for the service."
|
| 20 |
-
] = "claude-3-
|
| 21 |
claude_api_key: Annotated[
|
| 22 |
str,
|
| 23 |
"The Claude API key to use for the service."
|
|
|
|
| 17 |
claude_model_name: Annotated[
|
| 18 |
str,
|
| 19 |
"The name of the Google model to use for the service."
|
| 20 |
+
] = "claude-3-5-haiku-20241022"
|
| 21 |
claude_api_key: Annotated[
|
| 22 |
str,
|
| 23 |
"The Claude API key to use for the service."
|