Spaces:

rt4u
/

marker

Sleeping

App Files Community

Vik Paruchuri committed on Feb 28

Commit

948c3c2

1 Parent(s): 2fa0059

Iterate on llm processors

Browse files

Files changed (5) hide show

marker/builders/structure.py +14 -0
marker/processors/llm/llm_equation.py +7 -2
marker/processors/llm/llm_inlinemath.py +1 -3
marker/processors/llm/llm_text.py +1 -1
marker/services/claude.py +1 -1

marker/builders/structure.py CHANGED Viewed

@@ -2,6 +2,7 @@ from typing import Annotated
 from marker.builders import BaseBuilder
 from marker.schema import BlockTypes
 from marker.schema.document import Document
 from marker.schema.groups import ListGroup
 from marker.schema.groups.page import PageGroup
@@ -28,6 +29,7 @@ class StructureBuilder(BaseBuilder):
         for page in document.pages:
             self.group_caption_blocks(page)
             self.group_lists(page)
     def group_caption_blocks(self, page: PageGroup):
         gap_threshold_px = self.gap_threshold * page.polygon.height
@@ -110,3 +112,15 @@ class StructureBuilder(BaseBuilder):
                 remove_ids.extend(block_structure)
         page.remove_structure_items(remove_ids)

 from marker.builders import BaseBuilder
 from marker.schema import BlockTypes
+from marker.schema.blocks import Text
 from marker.schema.document import Document
 from marker.schema.groups import ListGroup
 from marker.schema.groups.page import PageGroup
         for page in document.pages:
             self.group_caption_blocks(page)
             self.group_lists(page)
+            self.unmark_lists(page)
     def group_caption_blocks(self, page: PageGroup):
         gap_threshold_px = self.gap_threshold * page.polygon.height
                 remove_ids.extend(block_structure)
         page.remove_structure_items(remove_ids)
+    def unmark_lists(self, page: PageGroup):
+        # If lists aren't grouped, unmark them as list items
+        for block_id in page.structure:
+            block = page.get_block(block_id)
+            if block.block_type == BlockTypes.ListItem:
+                generated_block = Text(
+                    polygon=block.polygon,
+                    page_id=block.page_id,
+                    structure=block.structure,
+                )
+                page.replace_block(block, generated_block)

marker/processors/llm/llm_equation.py CHANGED Viewed

@@ -30,7 +30,7 @@ You'll receive an image of a math block that may contain one or more equations.
 Some guidelines:
 - Output valid html, where all the equations can render properly.
-- Use <math display="block"> as a block equation delimiter and <math> for inline equations.
 - Keep the LaTeX code inside the math tags simple, concise, and KaTeX compatible.
 - Enclose all equations in the correct math tags. Use multiple math tags inside the html to represent multiple equations.
 - Only use the html tags math, i, b, p, and br.
@@ -103,7 +103,12 @@ Output:
             return
         html_equation = response["html_equation"]
-        if len(html_equation) < len(text) * .5:
             block.update_metadata(llm_error_count=1)
             return

 Some guidelines:
 - Output valid html, where all the equations can render properly.
+- Use <math display="block"> as a block equation delimiter and <math> for inline equations.  Do not use $ or $$ as delimiters.
 - Keep the LaTeX code inside the math tags simple, concise, and KaTeX compatible.
 - Enclose all equations in the correct math tags. Use multiple math tags inside the html to represent multiple equations.
 - Only use the html tags math, i, b, p, and br.
             return
         html_equation = response["html_equation"]
+        balanced_tags = html_equation.count("<math") == html_equation.count("</math>")
+        if not all([
+            html_equation,
+            balanced_tags,
+            len(html_equation) > len(text) * .3,
+        ]):
             block.update_metadata(llm_error_count=1)
             return

marker/processors/llm/llm_inlinemath.py CHANGED Viewed

@@ -40,7 +40,7 @@ Your task is to correct any errors in the extracted block, including math, forma
 3. Compare the extracted text to the corresponding text in the image.
 4. If there are no errors in any of the extracted text, output "No corrections needed".
 5. Correct any errors in the extracted text, including:
-    * Inline math: Ensure all mathematical expressions are correctly formatted and rendered.  Surround them with <math>...</math> tags.  The math expressions should be rendered in simple, concise, KaTeX-compatible LaTeX.
       * If a math expression is not in LaTeX format, convert it to LaTeX format, and surround it with <math>...</math> tags.
     * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, subscripts/superscripts, and special characters.  Use the <h1>, <h2>, <h3>, <h4>, <i>, <b>, <sup>, <sub>, and <span> tags to format the text as needed.
     * Other inaccuracies:  If the image is handwritten then you may correct any spelling errors, or other discrepancies.
@@ -133,8 +133,6 @@ Adversarial training <i>(AT)</i> <a href='#page-9-1'>[23]</a>, which aims to min
         return text_lines, extracted_lines
     def process_rewriting(self, document: Document, page: PageGroup, block: Block):
-        SpanClass = get_block_class(BlockTypes.Span)
         block_text = self.get_block_text(block, document)
         prompt = self.text_math_rewriting_prompt.replace("{extracted_html}", block_text)

 3. Compare the extracted text to the corresponding text in the image.
 4. If there are no errors in any of the extracted text, output "No corrections needed".
 5. Correct any errors in the extracted text, including:
+    * Inline math: Ensure all mathematical expressions are correctly formatted and rendered.  Surround them with <math>...</math> tags.  The math expressions should be rendered in simple, concise, KaTeX-compatible LaTeX.  Do not use $ or $$ as delimiters.
       * If a math expression is not in LaTeX format, convert it to LaTeX format, and surround it with <math>...</math> tags.
     * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, subscripts/superscripts, and special characters.  Use the <h1>, <h2>, <h3>, <h4>, <i>, <b>, <sup>, <sub>, and <span> tags to format the text as needed.
     * Other inaccuracies:  If the image is handwritten then you may correct any spelling errors, or other discrepancies.
         return text_lines, extracted_lines
     def process_rewriting(self, document: Document, page: PageGroup, block: Block):
         block_text = self.get_block_text(block, document)
         prompt = self.text_math_rewriting_prompt.replace("{extracted_html}", block_text)

marker/processors/llm/llm_text.py CHANGED Viewed

@@ -33,7 +33,7 @@ The number of output lines MUST match the number of input lines.  Stay as faithf
 2. Analyze the extracted lines.
 3. For each extracted line, compare it to the corresponding line in the image.
 4. Correct any errors in the extracted line, including:
-    * Inline math: Ensure all mathematical expressions are correctly formatted and rendered.  Use the `<math>` and `</math>` tags to surround inline math properly.  Make sure the opening and closing tags appear in pairs, on the same line.  The math should be written in simple, concise, KaTeX-compatible LaTeX.
     * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, subscripts/superscripts, and special characters.  Use the `<i>`, `<b>`, `<sup>`, `<sub>`, and `<span>` tags to format the text as needed.
     * Other inaccuracies:  If the image is handwritten then you may correct any spelling errors, or other discrepancies.
 5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error.  The formatting

 2. Analyze the extracted lines.
 3. For each extracted line, compare it to the corresponding line in the image.
 4. Correct any errors in the extracted line, including:
+    * Inline math: Ensure all mathematical expressions are correctly formatted and rendered.  Use the `<math>` and `</math>` tags to surround inline math properly.  Make sure the opening and closing tags appear in pairs, on the same line.  The math should be written in simple, concise, KaTeX-compatible LaTeX.  Do not use $ or $$ as delimiters.
     * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, subscripts/superscripts, and special characters.  Use the `<i>`, `<b>`, `<sup>`, `<sub>`, and `<span>` tags to format the text as needed.
     * Other inaccuracies:  If the image is handwritten then you may correct any spelling errors, or other discrepancies.
 5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error.  The formatting

marker/services/claude.py CHANGED Viewed

@@ -17,7 +17,7 @@ class ClaudeService(BaseService):
     claude_model_name: Annotated[
         str,
         "The name of the Google model to use for the service."
-    ] = "claude-3-7-sonnet-20250219"
     claude_api_key: Annotated[
         str,
         "The Claude API key to use for the service."

     claude_model_name: Annotated[
         str,
         "The name of the Google model to use for the service."
+    ] = "claude-3-5-haiku-20241022"
     claude_api_key: Annotated[
         str,
         "The Claude API key to use for the service."