Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

Vik Paruchuri committed on Mar 7

Commit

1643ef3

1 Parent(s): b334fad

Misc fixes, benchmark updates

Browse files

Files changed (14) hide show

.gitignore +1 -0
benchmarks/overall/download/base.py +5 -2
benchmarks/overall/download/llamaparse.py +0 -1
benchmarks/overall/download/main.py +7 -5
benchmarks/overall/download/mistral.py +73 -0
benchmarks/overall/elo.py +3 -2
benchmarks/overall/methods/mistral.py +22 -0
benchmarks/overall/overall.py +5 -2
benchmarks/overall/registry.py +3 -1
marker/output.py +4 -1
marker/processors/llm/llm_form.py +3 -2
marker/processors/llm/llm_mathblock.py +5 -4
marker/processors/llm/llm_table_merge.py +3 -2
marker/services/vertex.py +8 -1

.gitignore CHANGED Viewed

@@ -13,6 +13,7 @@ temp.md
 temp
 conversion_results
 uploads
 # Byte-compiled / optimized / DLL files
 __pycache__/

 temp
 conversion_results
 uploads
+/cache
 # Byte-compiled / optimized / DLL files
 __pycache__/

benchmarks/overall/download/base.py CHANGED Viewed

@@ -32,10 +32,10 @@ class Downloader:
             "uuid": datasets.Value("string"),
             "time": datasets.Value("float"),
         }))
-        out_ds.push_to_hub(f"datalab-to/marker_benchmark_{self.service}")
     def generate_data(self):
-        max_rows = 2200
         for idx, sample in tqdm(enumerate(self.ds), desc=f"Saving {self.service} results"):
             cache_file = self.cache_path / f"{idx}.json"
             if cache_file.exists():
@@ -47,6 +47,9 @@ class Downloader:
             except JSONDecodeError as e:
                 print(f"Error with sample {idx}: {e}")
                 continue
             out_data["uuid"] = sample["uuid"]
             with cache_file.open("w") as f:

             "uuid": datasets.Value("string"),
             "time": datasets.Value("float"),
         }))
+        out_ds.push_to_hub(f"datalab-to/marker_benchmark_{self.service}", private=True)
     def generate_data(self):
+        max_rows = self.max_rows
         for idx, sample in tqdm(enumerate(self.ds), desc=f"Saving {self.service} results"):
             cache_file = self.cache_path / f"{idx}.json"
             if cache_file.exists():
             except JSONDecodeError as e:
                 print(f"Error with sample {idx}: {e}")
                 continue
+            except Exception as e:
+                print(f"Error with sample {idx}: {e}")
+                continue
             out_data["uuid"] = sample["uuid"]
             with cache_file.open("w") as f:

benchmarks/overall/download/llamaparse.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import io
-import os
 import time
 import requests

 import io
 import time
 import requests

benchmarks/overall/download/main.py CHANGED Viewed

@@ -2,17 +2,19 @@ import click
 from benchmarks.overall.download.llamaparse import LlamaParseDownloader
 from benchmarks.overall.download.mathpix import MathpixDownloader
 @click.command("Download data from inference services")
-@click.argument("service", type=click.Choice(["mathpix", "llamaparse"]))
-@click.argument("--max_rows", type=int, default=2200)
-@click.argument("--api_key", type=str, default=None)
-@click.argument("--app_id", type=str, default=None)
 def main(service: str, max_rows: int, api_key: str, app_id: str):
     registry = {
         "mathpix": MathpixDownloader,
-        "llamaparse": LlamaParseDownloader
     }
     downloader = registry[service](api_key, app_id, max_rows=max_rows)

 from benchmarks.overall.download.llamaparse import LlamaParseDownloader
 from benchmarks.overall.download.mathpix import MathpixDownloader
+from benchmarks.overall.download.mistral import MistralDownloader
 @click.command("Download data from inference services")
+@click.argument("service", type=click.Choice(["mathpix", "llamaparse", "mistral"]))
+@click.option("--max_rows", type=int, default=2200)
+@click.option("--api_key", type=str, default=None)
+@click.option("--app_id", type=str, default=None)
 def main(service: str, max_rows: int, api_key: str, app_id: str):
     registry = {
         "mathpix": MathpixDownloader,
+        "llamaparse": LlamaParseDownloader,
+        "mistral": MistralDownloader,
     }
     downloader = registry[service](api_key, app_id, max_rows=max_rows)

benchmarks/overall/download/mistral.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import io
+import time
+import requests
+from benchmarks.overall.download.base import Downloader
+class MistralDownloader(Downloader):
+    service = "mistral"
+    def get_html(self, pdf_bytes):
+        rand_name = str(time.time()) + ".pdf"
+        start = time.time()
+        buff = io.BytesIO(pdf_bytes)
+        md = upload_and_process_file(self.api_key, rand_name, buff)
+        end = time.time()
+        if isinstance(md, bytes):
+            md = md.decode("utf-8")
+        return {
+            "md": md,
+            "time": end - start,
+        }
+def upload_and_process_file(api_key: str, fname: str, buff):
+    headers = {
+        "Authorization": f"Bearer {api_key}"
+    }
+    upload_headers = headers.copy()
+    files = {
+        'file': (fname, buff, 'application/pdf'),
+        'purpose': (None, 'ocr')
+    }
+    upload_response = requests.post(
+        'https://api.mistral.ai/v1/files',
+        headers=upload_headers,
+        files=files
+    )
+    upload_response.raise_for_status()
+    file_id = upload_response.json()['id']
+    url_headers = headers.copy()
+    url_headers["Accept"] = "application/json"
+    url_response = requests.get(
+        f'https://api.mistral.ai/v1/files/{file_id}/url?expiry=24',
+        headers=url_headers
+    )
+    url_response.raise_for_status()
+    signed_url = url_response.json()['url']
+    ocr_headers = headers.copy()
+    ocr_headers["Content-Type"] = "application/json"
+    ocr_data = {
+        "model": "mistral-ocr-latest",
+        "document": {
+            "type": "document_url",
+            "document_url": signed_url
+        },
+        "include_image_base64": True
+    }
+    ocr_response = requests.post(
+        'https://api.mistral.ai/v1/ocr',
+        headers=ocr_headers,
+        json=ocr_data
+    )
+    ocr_response.raise_for_status()
+    result = ocr_response.json()
+    return result["pages"][0]["markdown"]

benchmarks/overall/elo.py CHANGED Viewed

@@ -176,7 +176,7 @@ def display_win_rates_table(win_rates: dict):
 @click.argument("dataset", type=str)
 @click.option("--methods", type=str, help="List of methods to compare: comma separated like marker,mathpix")
 @click.option("--row_samples", type=int, default=2, help="Number of samples per row")
-@click.option("--max_rows", type=int, default=100, help="Maximum number of rows to process")
 def main(
     dataset: str,
     methods: str,
@@ -187,8 +187,9 @@ def main(
     method_lst = methods.split(",")
     win_rates = {m: defaultdict(lambda: defaultdict(int)) for m in method_lst}
     comparer = Comparer()
-    for i in tqdm(range(min(len(ds), max_rows)), desc="Calculating win rates..."):
         row = ds[i]
         # Avoid any bias in ordering
         random.shuffle(method_lst)

 @click.argument("dataset", type=str)
 @click.option("--methods", type=str, help="List of methods to compare: comma separated like marker,mathpix")
 @click.option("--row_samples", type=int, default=2, help="Number of samples per row")
+@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process")
 def main(
     dataset: str,
     methods: str,
     method_lst = methods.split(",")
     win_rates = {m: defaultdict(lambda: defaultdict(int)) for m in method_lst}
     comparer = Comparer()
+    max_rows = max_rows or len(ds)
+    for i in tqdm(range(max_rows), desc="Calculating win rates..."):
         row = ds[i]
         # Avoid any bias in ordering
         random.shuffle(method_lst)

benchmarks/overall/methods/mistral.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import datasets
+from benchmarks.overall.methods import BaseMethod, BenchmarkResult
+class MistralMethod(BaseMethod):
+    mistral_ds: datasets.Dataset = None
+    def __call__(self, sample) -> BenchmarkResult:
+        uuid = sample["uuid"]
+        data = None
+        for row in self.mistral_ds:
+            if str(row["uuid"]) == str(uuid):
+                data = row
+                break
+        if not data:
+            raise ValueError(f"Could not find data for uuid {uuid}")
+        return {
+            "markdown": data["md"],
+            "time": data["time"]
+        }

benchmarks/overall/overall.py CHANGED Viewed

@@ -89,7 +89,7 @@ def get_method_scores(benchmark_dataset: datasets.Dataset, methods: List[str], s
 @click.command(help="Benchmark PDF to MD conversion.")
 @click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
 @click.option("--out_dataset", type=str, help="Path to the output dataset", default=None)
-@click.option("--methods", type=str, help="Comma separated list of other methods to compare against.  Possible values: marker,mathpix,llamaparse,docling", default="marker")
 @click.option("--scores", type=str, help="Comma separated list of scoring functions to use.  Possible values: heuristic,llm", default="heuristic")
 @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
 @click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
@@ -145,6 +145,9 @@ def main(
     if "llamaparse" in methods:
         artifacts["llamaparse_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_llamaparse", split="train")
     if "olmocr" in methods:
         from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
         model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview",
@@ -167,7 +170,7 @@ def main(
         if use_llm:
             out_dataset += "_llm"
         dataset = build_dataset(benchmark_dataset, result, score_types, max_rows=max_rows)
-        dataset.push_to_hub(out_dataset)
 if __name__ == "__main__":

 @click.command(help="Benchmark PDF to MD conversion.")
 @click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
 @click.option("--out_dataset", type=str, help="Path to the output dataset", default=None)
+@click.option("--methods", type=str, help="Comma separated list of other methods to compare against.  Possible values: marker,mathpix,llamaparse,docling,mistral", default="marker")
 @click.option("--scores", type=str, help="Comma separated list of scoring functions to use.  Possible values: heuristic,llm", default="heuristic")
 @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
 @click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
     if "llamaparse" in methods:
         artifacts["llamaparse_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_llamaparse", split="train")
+    if "mistral" in methods:
+        artifacts["mistral_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_mistral", split="train")
     if "olmocr" in methods:
         from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
         model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview",
         if use_llm:
             out_dataset += "_llm"
         dataset = build_dataset(benchmark_dataset, result, score_types, max_rows=max_rows)
+        dataset.push_to_hub(out_dataset, private=True)
 if __name__ == "__main__":

benchmarks/overall/registry.py CHANGED Viewed

@@ -3,6 +3,7 @@ from benchmarks.overall.methods.gt import GTMethod
 from benchmarks.overall.methods.llamaparse import LlamaParseMethod
 from benchmarks.overall.methods.marker import MarkerMethod
 from benchmarks.overall.methods.mathpix import MathpixMethod
 from benchmarks.overall.methods.olmocr import OlmOCRMethod
 from benchmarks.overall.scorers.heuristic import HeuristicScorer
 from benchmarks.overall.scorers.llm import LLMScorer
@@ -18,5 +19,6 @@ METHOD_REGISTRY = {
     "mathpix": MathpixMethod,
     "llamaparse": LlamaParseMethod,
     "docling": DoclingMethod,
-    "olmocr": OlmOCRMethod
 }

 from benchmarks.overall.methods.llamaparse import LlamaParseMethod
 from benchmarks.overall.methods.marker import MarkerMethod
 from benchmarks.overall.methods.mathpix import MathpixMethod
+from benchmarks.overall.methods.mistral import MistralMethod
 from benchmarks.overall.methods.olmocr import OlmOCRMethod
 from benchmarks.overall.scorers.heuristic import HeuristicScorer
 from benchmarks.overall.scorers.llm import LLMScorer
     "mathpix": MathpixMethod,
     "llamaparse": LlamaParseMethod,
     "docling": DoclingMethod,
+    "olmocr": OlmOCRMethod,
+    "mistral": MistralMethod
 }

marker/output.py CHANGED Viewed

@@ -7,9 +7,12 @@ from pydantic import BaseModel
 from marker.renderers.html import HTMLOutput
 from marker.renderers.json import JSONOutput, JSONBlockOutput
 from marker.renderers.markdown import MarkdownOutput
 from marker.settings import settings
-def json_to_html(block: JSONBlockOutput):
     # Utility function to take in json block output and give html for the block.
     if not getattr(block, "children", None):
         return block.html

 from marker.renderers.html import HTMLOutput
 from marker.renderers.json import JSONOutput, JSONBlockOutput
 from marker.renderers.markdown import MarkdownOutput
+from marker.schema.blocks import Block, BlockOutput
+from marker.schema.document import Document
 from marker.settings import settings
+def json_to_html(block: JSONBlockOutput | BlockOutput):
     # Utility function to take in json block output and give html for the block.
     if not getattr(block, "children", None):
         return block.html

marker/processors/llm/llm_form.py CHANGED Viewed

@@ -2,6 +2,7 @@ from typing import List
 from pydantic import BaseModel
 from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor, BlockData
 from marker.schema import BlockTypes
@@ -77,7 +78,7 @@ Comparison: The html representation has the labels in the first row and the valu
         prompt_data = []
         for block_data in self.inference_blocks(document):
             block = block_data["block"]
-            block_html = block.render(document).html
             prompt = self.form_rewriting_prompt.replace("{block_html}", block_html)
             image = self.extract_image(document, block)
             prompt_data.append({
@@ -92,7 +93,7 @@ Comparison: The html representation has the labels in the first row and the valu
     def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document):
         block = prompt_data["block"]
-        block_html = block.render(document).html
         if not response or "corrected_html" not in response:
             block.update_metadata(llm_error_count=1)

 from pydantic import BaseModel
+from marker.output import json_to_html
 from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor, BlockData
 from marker.schema import BlockTypes
         prompt_data = []
         for block_data in self.inference_blocks(document):
             block = block_data["block"]
+            block_html = json_to_html(block.render(document))
             prompt = self.form_rewriting_prompt.replace("{block_html}", block_html)
             image = self.extract_image(document, block)
             prompt_data.append({
     def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document):
         block = prompt_data["block"]
+        block_html = json_to_html(block.render(document))
         if not response or "corrected_html" not in response:
             block.update_metadata(llm_error_count=1)

marker/processors/llm/llm_mathblock.py CHANGED Viewed

@@ -5,6 +5,7 @@ from typing import List, Tuple, Annotated
 from pydantic import BaseModel
 from tqdm import tqdm
 from marker.processors.llm import BaseLLMComplexBlockProcessor
 from marker.schema import BlockTypes
@@ -27,8 +28,8 @@ class LLMMathBlockProcessor(BaseLLMComplexBlockProcessor):
     additional_block_types = (BlockTypes.Text, BlockTypes.Caption, BlockTypes.SectionHeader, BlockTypes.Footnote) # Seconday, can also contain math
     text_math_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
-You will receive an image of a text block and a set of extracted lines corresponding to the text in the image.
-Your task is to correct any errors in the extracted block, including math, formatting, and other inaccuracies, and output the corrected block in html format.  Stay as faithful to the original text as possible.
 **Instructions:**
@@ -39,7 +40,7 @@ Your task is to correct any errors in the extracted block, including math, forma
 5. If there are no errors in any of the extracted text, output "No corrections needed".
 6. Correct any errors in the extracted text, including:
     * Inline math: Ensure all mathematical expressions are correctly formatted and rendered.  Surround them with <math>...</math> tags.  The math expressions should be rendered in simple, concise, KaTeX-compatible LaTeX.  Do not use $ or $$ as delimiters.
-      * If a math expression is not in LaTeX format, convert it to LaTeX format, and surround it with <math>...</math> tags.
     * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, subscripts/superscripts, and special characters.  Use the <i>, <b>, <sup>, <sub>, and <span> tags to format the text as needed.
     * Other inaccuracies:  If the image is handwritten then you may correct any spelling errors, or other discrepancies.
     * Ensure lines wrap properly, and that newlines are not in the middle of sentences.
@@ -125,7 +126,7 @@ Adversarial training <i>(AT)</i> <a href='#page-9-1'>[23]</a>, which aims to min
         pbar.close()
     def get_block_text(self, block: Block, document: Document) -> str:
-        html = block.render(document).html
         return html
     def get_block_lines(self, block: Block, document: Document) -> Tuple[list, list]:

 from pydantic import BaseModel
 from tqdm import tqdm
+from marker.output import json_to_html
 from marker.processors.llm import BaseLLMComplexBlockProcessor
 from marker.schema import BlockTypes
     additional_block_types = (BlockTypes.Text, BlockTypes.Caption, BlockTypes.SectionHeader, BlockTypes.Footnote) # Seconday, can also contain math
     text_math_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
+You will receive an image of a text block and extracted text corresponding to the text in the image.
+Your task is to correct any errors in the extracted text, including math, formatting, and other inaccuracies, and output the corrected block in html format.  Stay as faithful to the text in the image as possible.
 **Instructions:**
 5. If there are no errors in any of the extracted text, output "No corrections needed".
 6. Correct any errors in the extracted text, including:
     * Inline math: Ensure all mathematical expressions are correctly formatted and rendered.  Surround them with <math>...</math> tags.  The math expressions should be rendered in simple, concise, KaTeX-compatible LaTeX.  Do not use $ or $$ as delimiters.
+    * If a math expression is not in LaTeX format, convert it to LaTeX format, and surround it with <math>...</math> tags.
     * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, subscripts/superscripts, and special characters.  Use the <i>, <b>, <sup>, <sub>, and <span> tags to format the text as needed.
     * Other inaccuracies:  If the image is handwritten then you may correct any spelling errors, or other discrepancies.
     * Ensure lines wrap properly, and that newlines are not in the middle of sentences.
         pbar.close()
     def get_block_text(self, block: Block, document: Document) -> str:
+        html = json_to_html(block.render(document))
         return html
     def get_block_lines(self, block: Block, document: Document) -> Tuple[list, list]:

marker/processors/llm/llm_table_merge.py CHANGED Viewed

@@ -5,6 +5,7 @@ from pydantic import BaseModel
 from tqdm import tqdm
 from PIL import Image
 from marker.processors.llm import BaseLLMComplexBlockProcessor
 from marker.schema import BlockTypes
 from marker.schema.blocks import Block, TableCell
@@ -235,8 +236,8 @@ Table 2
             start_image = start_block.get_image(document, highres=False)
             curr_image = curr_block.get_image(document, highres=False)
-            start_html = start_block.render(document).html
-            curr_html = curr_block.render(document).html
             prompt = self.table_merge_prompt.replace("{{table1}}", start_html).replace("{{table2}}", curr_html)

 from tqdm import tqdm
 from PIL import Image
+from marker.output import json_to_html
 from marker.processors.llm import BaseLLMComplexBlockProcessor
 from marker.schema import BlockTypes
 from marker.schema.blocks import Block, TableCell
             start_image = start_block.get_image(document, highres=False)
             curr_image = curr_block.get_image(document, highres=False)
+            start_html = json_to_html(start_block.render(document))
+            curr_html = json_to_html(curr_block.render(document))
             prompt = self.table_merge_prompt.replace("{{table1}}", start_html).replace("{{table2}}", curr_html)

marker/services/vertex.py CHANGED Viewed

@@ -17,11 +17,18 @@ class GoogleVertexService(BaseGeminiService):
         str,
         "The name of the Google model to use for the service."
     ] = "gemini-2.0-flash-001"
     def get_google_client(self, timeout: int):
         return genai.Client(
             vertexai=True,
             project=self.vertex_project_id,
             location=self.vertex_location,
-            http_options={"timeout": timeout * 1000} # Convert to milliseconds
         )

         str,
         "The name of the Google model to use for the service."
     ] = "gemini-2.0-flash-001"
+    vertex_dedicated: Annotated[
+        bool,
+        "Whether to use a dedicated Vertex AI instance."
+    ] = False
     def get_google_client(self, timeout: int):
+        http_options = {"timeout": timeout * 1000} # Convert to milliseconds
+        if self.vertex_dedicated:
+            http_options["headers"] = {"x-vertex-ai-llm-request-type": "dedicated"}
         return genai.Client(
             vertexai=True,
             project=self.vertex_project_id,
             location=self.vertex_location,
+            http_options=http_options,
         )