Vik Paruchuri committed on
Commit
6c81421
·
2 Parent(s): a9c09d7 38790b9

Improve bench

Browse files
benchmarks/overall/clean.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import subprocess
3
+ import tempfile
4
+ from pathlib import Path
5
+
6
+ import latex2mathml.converter
7
+
8
+ from marker.renderers.markdown import MarkdownRenderer
9
+
10
class MarkdownCleaner:
    """Canonicalizes markdown so outputs from different PDF-to-markdown
    methods can be compared fairly: pandoc round-trip normalization,
    math standardization, and whitespace/markup cleanup."""

    def __init__(self):
        pass

    def __call__(self, markdown):
        """Return a normalized, lowercased version of ``markdown``."""
        markdown = self.normalize_markdown(markdown)  # Use pandoc to normalize

        # Replace math expressions with standardized forms
        pattern = r'(?<!\\)\$(?:\$([^$]+)\$\$|\s*([^$\n]+?)\s*\$)'
        markdown = re.sub(pattern, self.standardize_math, markdown)

        # Replace image urls with a generic tag
        pattern = r'!\[(.*?)\]\((https?://[^\s\)]+)\)'
        markdown = re.sub(pattern, r'![link]', markdown)

        # Clean up stray html tags
        markdown = markdown.replace("<br>", "\n")
        markdown = re.sub(r"<sub>(.*?)</sub>", r"\1", markdown)
        markdown = re.sub(r"<sup>(.*?)</sup>", r"\1", markdown)
        markdown = re.sub(r"<span.*?>(.*?)</span>", r"\1", markdown)  # Remove span tags and keep content

        # Clean up markdown formatting.  Collapsing all whitespace first also
        # removes newlines, so a separate newline pass is unnecessary (the
        # previous re.sub(r"\n+", "\n", ...) after this line was dead code).
        markdown = re.sub(r"\s+", " ", markdown)
        markdown = re.sub(r"\.+", ".",
                          markdown)  # Replace repeated periods with a single period, like in table of contents
        markdown = re.sub(r"#+", "#", markdown)  # Replace repeated headers with a single header
        markdown = markdown.encode().decode('unicode-escape', errors="ignore")  # Decode unicode characters properly
        return markdown.strip().lower()

    @staticmethod
    def normalize_markdown(md_text: str) -> str:
        """Round-trip ``md_text`` through pandoc (markdown -> html -> markdown)
        to normalize formatting differences between generators.

        Raises subprocess.CalledProcessError if pandoc fails (check=True).
        """
        with tempfile.TemporaryDirectory() as tmp_dir:
            dirpath = Path(tmp_dir)
            input_file = dirpath / 'input.md'
            input_file.write_text(md_text, encoding='utf-8')

            # Markdown to HTML
            html_file = dirpath / 'temp.html'
            subprocess.run(
                [
                    'pandoc',
                    str(input_file),
                    '-f', 'markdown+tex_math_dollars',
                    '-t', 'html',
                    '-o', str(html_file),
                    '--quiet'
                ],
                check=True
            )

            # HTML to Markdown
            output_file = dirpath / 'output.md'
            subprocess.run(
                [
                    'pandoc',
                    str(html_file),
                    '-f', 'html',
                    '-t', 'markdown+tex_math_dollars',
                    '-o', str(output_file),
                    '--quiet'
                ],
                check=True
            )

            # Read back the normalized Markdown
            normalized_md = output_file.read_text(encoding='utf-8')

        return normalized_md

    def standardize_math(self, match):
        """re.sub callback: convert display math ($$...$$) to MathML and
        clean inline math ($...$) latex.  Falls back to the original text if
        conversion fails."""
        try:
            delim = "$$" if match.group(0).startswith('$$') else "$"
            math_content = match.group(1) or match.group(2)
            if delim == "$$":
                math_content = latex2mathml.converter.convert(math_content)
            else:
                math_content = self.clean_latex(math_content)
            return f'{delim}{math_content}{delim}'
        except Exception as e:
            print(f"Failed to standardize math expression: {match.group(0)} with error: {e}")
            return match.group(0)

    @staticmethod
    def clean_latex(latex_str):
        """Strip formatting wrappers (\\text, \\mathrm, ...) and rewrite
        common latex operators as plain-text equivalents."""
        latex_str = re.sub(r'\s+', ' ', latex_str.strip())
        for tag in [r'\\text', r'\\mathrm', r'\\mathbf', r'\\textbf']:
            latex_str = re.sub(tag + r'\{([^}]+)\}', r'\1', latex_str)

        # Longer commands listed before their prefixes so \leq/\geq are
        # handled before \le/\ge.
        replacements = {
            '\\times': '*',
            '\\cdot': '*',
            '\\div': '/',
            '\\leq': '<=',
            '\\geq': '>=',
            '\\le': '<=',
            '\\ge': '>=',
            '\\neq': '!=',
            '\\to': '\\rightarrow',
        }

        for old, new in replacements.items():
            # Require a non-letter after the command so short commands don't
            # mangle longer ones (plain str.replace turned \left into "<=ft"
            # and \cdots into "*s").  A lambda replacement avoids backslash
            # escape processing in the substitution template.
            latex_str = re.sub(
                re.escape(old) + r'(?![a-zA-Z])',
                lambda _m, repl=new: repl,
                latex_str
            )

        return latex_str
113
+
114
+
115
def convert_to_md(html):
    """Render an HTML fragment to markdown via marker's markdown renderer."""
    renderer = MarkdownRenderer()
    return renderer.md_cls.convert(html)
119
+
120
def clean_input(markdown):
    """Run ``markdown`` through a fresh MarkdownCleaner and return the result."""
    return MarkdownCleaner()(markdown)
123
+
124
+
125
+
benchmarks/overall/inference.py CHANGED
@@ -1,38 +1,37 @@
1
  import tempfile
2
  import time
3
 
4
- from bs4 import BeautifulSoup
5
-
6
- from benchmarks.overall.scoring import score_blocks
7
  from benchmarks.overall.schema import BlockScores
 
8
  from marker.converters.pdf import PdfConverter
9
 
10
- def get_marker_html(marker_models: dict, pdf_bytes: bytes, use_llm: bool):
11
  block_converter = PdfConverter(
12
  artifact_dict=marker_models,
13
- config={"page_range": [0], "disable_tqdm": True, "use_llm": use_llm},
14
- renderer="marker.renderers.html.HTMLRenderer"
15
  )
 
16
  with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
17
  f.write(pdf_bytes)
18
  rendered = block_converter(f.name)
19
- html = rendered.html
20
- soup = BeautifulSoup(html, "html.parser")
21
- inner_html = str(soup.find("body").decode_contents())
22
- return inner_html
23
 
24
 
25
- def marker_scoring_func(model_dict, sample, gt_html, use_llm=False, **kwargs) -> BlockScores:
26
  pdf_bytes = sample["pdf"] # This is a single page PDF
27
  start = time.time()
28
- marker_html = get_marker_html(model_dict, pdf_bytes, use_llm)
 
29
  total = time.time() - start
30
- scores = score_blocks(gt_html, marker_html)
31
  scores["time"] = total
 
32
  return scores
33
 
34
 
35
- def mathpix_scoring_func(model_dict, sample, gt_html, mathpix_ds=None, **kwargs) -> BlockScores:
36
  uuid = sample["uuid"]
37
  data = None
38
  for row in mathpix_ds:
@@ -42,7 +41,8 @@ def mathpix_scoring_func(model_dict, sample, gt_html, mathpix_ds=None, **kwargs)
42
  if not data:
43
  raise ValueError(f"Could not find data for uuid {uuid}")
44
 
45
- mathpix_md = data["md"]
46
- scores = score_blocks(gt_html, mathpix_md, convert=False)
47
  scores["time"] = data["time"]
 
48
  return scores
 
1
  import tempfile
2
  import time
3
 
4
+ from benchmarks.overall.clean import clean_input
 
 
5
  from benchmarks.overall.schema import BlockScores
6
+ from benchmarks.overall.scoring import score_blocks
7
  from marker.converters.pdf import PdfConverter
8
 
9
def get_marker_markdown(marker_models: dict, pdf_bytes: bytes, use_llm: bool):
    """Convert a single-page PDF (raw bytes) to markdown with marker.

    Args:
        marker_models: Preloaded marker model artifact dict.
        pdf_bytes: Raw contents of the PDF file.
        use_llm: Whether to enable the LLM-assisted conversion path.

    Returns:
        The rendered markdown string for page 0.
    """
    block_converter = PdfConverter(
        artifact_dict=marker_models,
        config={"page_range": [0], "disable_tqdm": True, "use_llm": use_llm}
    )

    with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
        f.write(pdf_bytes)
        # Flush buffered bytes to disk before the converter re-opens the file
        # by name; without this the converter can read a truncated/empty PDF.
        f.flush()
        rendered = block_converter(f.name)

    return rendered.markdown
 
 
20
 
21
 
22
def marker_scoring_func(model_dict, sample, gt_markdown, use_llm=False, **kwargs) -> BlockScores:
    """Run marker on a sample's single-page PDF, clean the output, and score
    it against the ground-truth markdown blocks.  Timing covers conversion
    plus cleaning."""
    start_time = time.time()
    markdown = clean_input(get_marker_markdown(model_dict, sample["pdf"], use_llm))
    elapsed = time.time() - start_time

    scores = score_blocks(gt_markdown, markdown)
    scores["time"] = elapsed
    scores["markdown"] = markdown
    return scores
32
 
33
 
34
+ def mathpix_scoring_func(model_dict, sample, gt_markdown, mathpix_ds=None, **kwargs) -> BlockScores:
35
  uuid = sample["uuid"]
36
  data = None
37
  for row in mathpix_ds:
 
41
  if not data:
42
  raise ValueError(f"Could not find data for uuid {uuid}")
43
 
44
+ mathpix_md = clean_input(data["md"])
45
+ scores = score_blocks(gt_markdown, mathpix_md)
46
  scores["time"] = data["time"]
47
+ scores["markdown"] = mathpix_md
48
  return scores
benchmarks/overall/overall.py CHANGED
@@ -7,9 +7,11 @@ from typing import Dict
7
  import click
8
  import datasets
9
  import tabulate
 
10
  from tqdm import tqdm
11
  import pypdfium2 as pdfium
12
 
 
13
  from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func
14
  from benchmarks.overall.schema import FullResult
15
  from marker.logger import configure_logging
@@ -32,7 +34,8 @@ def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_f
32
 
33
  try:
34
  gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0]
35
- scores = score_func(model_dict, sample, gt_html, **kwargs)
 
36
  except ValueError as e:
37
  print(f"Error with sample {idx}: {e}")
38
  continue
@@ -101,12 +104,14 @@ def print_scores(scores: Dict[str, FullResult], out_path: Path, default_method="
101
 
102
  @click.command(help="Benchmark PDF to MD conversion.")
103
  @click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
 
104
  @click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. Possible values: mathpix", default="")
105
  @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
106
  @click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
107
  @click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
108
  def main(
109
  dataset: str,
 
110
  other_methods: str,
111
  result_path: str,
112
  max_rows: int,
@@ -142,6 +147,11 @@ def main(
142
 
143
  print(f"Results saved to {out_path}.")
144
 
 
 
 
 
 
145
  if __name__ == "__main__":
146
  main()
147
 
 
7
  import click
8
  import datasets
9
  import tabulate
10
+ from benchmarks.overall.render import build_dataset
11
  from tqdm import tqdm
12
  import pypdfium2 as pdfium
13
 
14
+ from benchmarks.overall.clean import convert_to_md, clean_input
15
  from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func
16
  from benchmarks.overall.schema import FullResult
17
  from marker.logger import configure_logging
 
34
 
35
  try:
36
  gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0]
37
+ gt_markdown = [clean_input(convert_to_md(block)) for block in gt_html]
38
+ scores = score_func(model_dict, sample, gt_markdown, **kwargs)
39
  except ValueError as e:
40
  print(f"Error with sample {idx}: {e}")
41
  continue
 
104
 
105
  @click.command(help="Benchmark PDF to MD conversion.")
106
  @click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
107
+ @click.option("--out_dataset", type=str, help="Path to the output dataset", default=None)
108
  @click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. Possible values: mathpix", default="")
109
  @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
110
  @click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
111
  @click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
112
  def main(
113
  dataset: str,
114
+ out_dataset: str,
115
  other_methods: str,
116
  result_path: str,
117
  max_rows: int,
 
147
 
148
  print(f"Results saved to {out_path}.")
149
 
150
+ # Push up comparison dataset
151
+ if out_dataset is not None:
152
+ out_ds = build_dataset(ds, all_scores)
153
+ out_ds.push_to_hub(out_dataset)
154
+
155
  if __name__ == "__main__":
156
  main()
157
 
benchmarks/overall/render.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import tempfile
3
+ import pypdfium2 as pdfium
4
+ from typing import Dict
5
+ from collections import defaultdict
6
+ import re
7
+ import io
8
+ import json
9
+
10
+ from PIL import Image
11
+ import datasets
12
+ import markdown2
13
+ from playwright.sync_api import sync_playwright
14
+
15
+ from benchmarks.overall.schema import FullResult
16
+
17
def convert_to_html(md: str):
    """Convert markdown to HTML while shielding $...$ / $$...$$ math from the
    markdown processor, then restore the math (delimiters included)."""
    display_math = []
    inline_math = []

    # Swap math spans for unique placeholder tokens before rendering
    def stash_display(m):
        token = f"1BLOCKMATH{len(display_math)}1"
        display_math.append((token, f"$${m.group(1)}$$"))
        return token

    def stash_inline(m):
        token = f"1INLINEMATH{len(inline_math)}1"
        inline_math.append((token, f"${m.group(1)}$"))
        return token

    md = re.sub(r'\${2}(.*?)\${2}', stash_display, md, flags=re.DOTALL)
    md = re.sub(r'\$(.*?)\$', stash_inline, md)

    html = markdown2.markdown(md, extras=['tables'])

    # Put the original math expressions back
    for token, expr in display_math + inline_math:
        html = html.replace(token, expr)

    return html
46
+
47
+
48
def markdown_to_image(md: str) -> Image.Image:
    """Render markdown to a PIL image by screenshotting a KaTeX-enabled page
    in headless Chromium (playwright).

    The markdown is first converted to HTML (math preserved as $/$$ spans),
    then KaTeX auto-render typesets the math client-side before the
    full-page screenshot is taken.
    """
    html = convert_to_html(md)
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.set_content(f"""
        <head>
        <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.css" integrity="sha384-zh0CIslj+VczCZtlzBcjt5ppRcsAmDnRem7ESsYwWwg3m/OaJ2l4x7YBZl9Kxxib" crossorigin="anonymous">
        <!-- The loading of KaTeX is deferred to speed up page rendering -->
        <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.js" integrity="sha384-Rma6DA2IPUwhNxmrB/7S3Tno0YY7sFu9WSYMCuulLhIqYSGZ2gKCJWIqhBWqMQfh" crossorigin="anonymous"></script>
        <!-- To automatically render math in text elements, include the auto-render extension: -->
        <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous"></script>
        </head>
        <body>
        {html}
        <script>
            renderMathInElement(document.body, {{
                delimiters: [
                    {{left: '$$', right: '$$', display: true}},
                    {{left: '$', right: '$', display: false}}
                ]
            }});
        </script>
        </body>
        """)
        page.set_viewport_size({"width": 1200, "height": 800})
        # Fixed delay rather than an event wait: gives the deferred KaTeX
        # scripts time to load and typeset before the screenshot.
        page.wait_for_timeout(500)  # Wait for KaTeX to render
        screenshot_bytes = page.screenshot(full_page=True)
        browser.close()

    return Image.open(io.BytesIO(screenshot_bytes))
79
+
80
+
81
def build_dataset(ds: datasets.Dataset, all_scores: Dict[str, FullResult]) -> datasets.Dataset:
    """Assemble a side-by-side comparison dataset from benchmark results.

    Args:
        ds: Source benchmark dataset (rows contain img, gt_blocks,
            classification, language, uuid).
        all_scores: Per-method FullResult mapping, keyed by method name.

    Returns:
        A datasets.Dataset with per-method score/markdown/image/time columns
        plus ground-truth markdown and its rendered image.
    """
    # Local import avoids a circular import with benchmarks.overall.clean and
    # fixes a NameError: clean_input/convert_to_md were never imported at
    # module level in this file.
    from benchmarks.overall.clean import clean_input, convert_to_md

    # Keep only the dataset indices that every method successfully scored
    full_idxs = None
    for method in all_scores:
        result_idxs = list(all_scores[method]["raw_scores"].keys())
        if full_idxs is None:
            full_idxs = sorted(result_idxs)
        else:
            full_idxs = [f for f in full_idxs if f in result_idxs]

    ds_rows = defaultdict(dict)
    for idx in full_idxs:
        row = ds[idx]  # img, gt_blocks, classification, language, uuid
        for method in all_scores:
            method_row = all_scores[method]["raw_scores"][idx]
            ds_rows[idx].update({
                f"{method}_score": method_row["overall_score"],
                f"{method}_markdown": method_row["markdown"],
                f"{method}_image": markdown_to_image(method_row["markdown"]),
                f"{method}_time": method_row["time"]
            })
        # Ground truth: convert each HTML block to cleaned markdown
        gt_md = "\n\n".join([clean_input(convert_to_md(block)) for block in json.loads(row["gt_blocks"])])
        ds_rows[idx].update({
            "gt_markdown": gt_md,
            "gt_image": markdown_to_image(gt_md)
        })
    out_dataset = datasets.Dataset.from_list([ds_rows[k] for k in full_idxs])
    return out_dataset
109
+
benchmarks/overall/schema.py CHANGED
@@ -4,10 +4,9 @@ from typing import TypedDict, List, Dict, Optional
4
  class BlockScores(TypedDict):
5
  scores: List[float]
6
  order_score: float
7
- gt: List[str]
8
- method: str
9
  overall_score: float
10
  time: Optional[float]
 
11
 
12
 
13
  class FullResult(TypedDict):
 
4
  class BlockScores(TypedDict):
5
  scores: List[float]
6
  order_score: float
 
 
7
  overall_score: float
8
  time: Optional[float]
9
+ markdown: str
10
 
11
 
12
  class FullResult(TypedDict):
benchmarks/overall/scoring.py CHANGED
@@ -2,9 +2,8 @@ from typing import List
2
 
3
  from rapidfuzz import fuzz
4
 
 
5
  from benchmarks.overall.schema import BlockScores
6
- from marker.renderers.markdown import MarkdownRenderer
7
- import re
8
 
9
 
10
  def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float:
@@ -58,112 +57,19 @@ def find_fuzzy_alignments(
58
  })
59
  return alignments
60
 
61
- def convert_to_md(html):
62
- md = MarkdownRenderer()
63
- markdown = md.md_cls.convert(html)
64
- return markdown
65
-
66
- def standardize_markdown(markdown):
67
- # Replace math expressions
68
- pattern = r'(?<!\\)\$(?:\$([^$]+)\$\$|\s*([^$\n]+?)\s*\$)'
69
- markdown = re.sub(pattern, standardize_math, markdown)
70
-
71
- # Replace image urls
72
- pattern = r'!\[(.*?)\]\((https?://[^\s\)]+)\)'
73
- markdown = re.sub(pattern, r'![link]', markdown)
74
- markdown = strip_latex_symbols(markdown)
75
- markdown = replace_centered_lines(markdown)
76
-
77
- # Clean up html tags
78
- markdown = markdown.replace("<br>", "\n")
79
- markdown = re.sub(r"<sub>(.*?)</sub>", r"\1", markdown)
80
- markdown = re.sub(r"<sup>(.*?)</sup>", r"\1", markdown)
81
- markdown = re.sub(r"<span.*?>(.*?)</span>", r"\1", markdown) # Remove span tags and keep content
82
-
83
- # Clean up markdown
84
- markdown = re.sub(r"\s+", " ", markdown)
85
- markdown = re.sub(r"\n+", "\n", markdown)
86
- markdown = re.sub("\\.+", ".", markdown) # Replace repeated periods with a single period, like in table of contents
87
- markdown = re.sub("#+", "#", markdown) # Replace repeated headers with a single header
88
- markdown = re.sub(r"\$", "", markdown) # Remove equation delimiters
89
- markdown = markdown.encode().decode('unicode-escape', errors="ignore") # Decode unicode characters properly
90
- return markdown.strip().lower()
91
-
92
-
93
- def replace_centered_lines(text):
94
- def replace_match(m):
95
- content = m.group(0)
96
- dash_count = content.count('-')
97
- return '-' * dash_count
98
-
99
- pattern = r':-+:'
100
- return re.sub(pattern, replace_match, text)
101
-
102
-
103
- def strip_latex_symbols(text):
104
- # Handle short math mode sequences first - only match $ $ with brief content
105
- text = re.sub(r'\$\s*\\?[a-zA-Z]+\d?\s*\$', '', text)
106
-
107
- # Handle common patterns inside remaining math mode
108
- patterns = [
109
- r'\$\s*\\?[a-zA-Z]+\d?\s*\$', # \alpha or \alpha2 in math mode
110
- r'\$\s*\d+\\[a-zA-Z]+\s*\$', # 45\circ in math mode
111
- r'\$\s*[a-zA-Z0-9]\\[a-zA-Z]+\s*\$' # x\dagger in math mode
112
- ]
113
-
114
- pattern = '|'.join(patterns)
115
- return re.sub(pattern, '', text)
116
-
117
-
118
- def standardize_math(match):
119
- try:
120
- delim = "$$" if match.group(0).startswith('$$') else "$"
121
- math_content = match.group(1) or match.group(2)
122
- result = clean_latex(math_content)
123
- return f'{delim}{result}{delim}'
124
- except Exception as e:
125
- print(f"Failed to standardize math expression: {match.group(0)} with error: {e}")
126
- return match.group(0)
127
-
128
-
129
- def clean_latex(latex_str):
130
- latex_str = re.sub(r'\s+', ' ', latex_str.strip())
131
- for tag in [r'\\text', r'\\mathrm', r'\\mathbf', r'\\textbf']:
132
- latex_str = re.sub(tag + r'\{([^}]+)\}', r'\1', latex_str)
133
-
134
-
135
- replacements = {
136
- '\\times': '*',
137
- '\\cdot': '*',
138
- '\\div': '/',
139
- '\\le': '<=',
140
- '\\ge': '>=',
141
- '\\neq': '!=',
142
- '\\to': '\\rightarrow',
143
- }
144
-
145
- for old, new in replacements.items():
146
- latex_str = latex_str.replace(old, new)
147
-
148
- return latex_str
149
-
150
-
151
- def score_blocks(gt_html, method_html, convert=True) -> BlockScores:
152
- if convert:
153
- method_html = convert_to_md(method_html)
154
- method_html = standardize_markdown(method_html)
155
- gt = [standardize_markdown(convert_to_md(gt)) for gt in gt_html]
156
- alignments = find_fuzzy_alignments(method_html, gt)
157
  scores = [alignment["score"] for alignment in alignments]
158
 
159
  # Find order score
160
  orders = [alignment["start"] for alignment in alignments]
161
- correct_order = list(range(len(gt)))
162
- actual_order = sorted(range(len(gt)), key=lambda x: orders[x])
163
  order_score = kendall_tau(correct_order, actual_order)
164
 
165
  # Weight score by sequence length
166
- gt_weights = [len(g) for g in gt]
167
  weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]
168
 
169
  # Weight the score by sequence length
@@ -172,8 +78,6 @@ def score_blocks(gt_html, method_html, convert=True) -> BlockScores:
172
  return {
173
  "scores": scores,
174
  "order_score": order_score,
175
- "gt": gt,
176
- "method": method_html,
177
  "overall_score": overall_score,
178
  "time": None
179
  }
 
2
 
3
  from rapidfuzz import fuzz
4
 
5
+ from benchmarks.overall.clean import convert_to_md, MarkdownCleaner
6
  from benchmarks.overall.schema import BlockScores
 
 
7
 
8
 
9
  def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float:
 
57
  })
58
  return alignments
59
 
60
+
61
+ def score_blocks(gt_markdown: List[str], method_markdown: str) -> BlockScores:
62
+ alignments = find_fuzzy_alignments(method_markdown, gt_markdown)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  scores = [alignment["score"] for alignment in alignments]
64
 
65
  # Find order score
66
  orders = [alignment["start"] for alignment in alignments]
67
+ correct_order = list(range(len(gt_markdown)))
68
+ actual_order = sorted(range(len(gt_markdown)), key=lambda x: orders[x])
69
  order_score = kendall_tau(correct_order, actual_order)
70
 
71
  # Weight score by sequence length
72
+ gt_weights = [len(g) for g in gt_markdown]
73
  weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]
74
 
75
  # Weight the score by sequence length
 
78
  return {
79
  "scores": scores,
80
  "order_score": order_score,
 
 
81
  "overall_score": overall_score,
82
  "time": None
83
  }
marker/scripts/streamlit_app.py CHANGED
@@ -115,7 +115,10 @@ def pillow_image_to_base64_string(img: Image) -> str:
115
  return base64.b64encode(buffered.getvalue()).decode("utf-8")
116
 
117
 
118
- def block_display(image: Image, blocks: dict = {}, dpi=96):
 
 
 
119
  image_data_url = (
120
  'data:image/jpeg;base64,' + pillow_image_to_base64_string(image)
121
  )
 
115
  return base64.b64encode(buffered.getvalue()).decode("utf-8")
116
 
117
 
118
+ def block_display(image: Image, blocks: dict | None = None, dpi=96):
119
+ if blocks is None:
120
+ blocks = {}
121
+
122
  image_data_url = (
123
  'data:image/jpeg;base64,' + pillow_image_to_base64_string(image)
124
  )
marker/scripts/streamlit_app_blocks_viz.html CHANGED
@@ -114,7 +114,7 @@
114
  <body>
115
  <div style="text-align: center" class="image-container">
116
  <dialog id="block-info-dialog">
117
- <button
118
  class="close-button"
119
  onclick="document.querySelector('#block-info-dialog').close()"
120
  ></button>
@@ -147,17 +147,17 @@
147
  const BLOCK_TYPES = $block_types_json;
148
  const blocksById = {};
149
  const blockInfoDialog = document.querySelector("dialog#block-info-dialog");
150
-
151
  function blockTypeColor(blockType) {
152
  return COLORS[BLOCK_TYPES[blockType] % COLORS.length];
153
  }
154
-
155
  function traverseAndGenerateSVG(block) {
156
  let svg = "";
157
-
158
  if (block.polygon) {
159
  const color = blockTypeColor(block.block_type);
160
-
161
  // dollar signs are escaped because this files gets read into a template string
162
  svg += `<rect id="$${block.id}"
163
  class="block type-$${block.block_type}"
@@ -171,52 +171,52 @@
171
  }"
172
  fill=$${color} stroke=$${color}>
173
  </rect>`;
174
-
175
  blocksById[block.id] = block;
176
  }
177
-
178
  if (Array.isArray(block.children) && block.children.length > 0) {
179
  block.children.forEach((child) => {
180
  svg += traverseAndGenerateSVG(child);
181
  });
182
  }
183
-
184
  return svg;
185
  }
186
-
187
  if (Object.keys(BLOCKS).length == 0) {
188
  // bail out if no blocks
189
  return;
190
  }
191
-
192
  const [vbWidth, vbHeight] = BLOCKS.children[0].polygon[2];
193
  document
194
  .querySelector("svg")
195
  .setAttribute("viewBox", `0 0 $${vbWidth} $${vbHeight}`);
196
-
197
  const blocksOverlay = document.querySelector(".blocks-overlay");
198
  blocksOverlay.innerHTML = traverseAndGenerateSVG(BLOCKS.children[0]);
199
-
200
  tippy("rect.block", {
201
  content: (block) => block.getAttribute("data-type"),
202
  placement: "top-start",
203
  arrow: false,
204
  offset: [0, 5],
205
  });
206
-
207
  blocksOverlay.addEventListener("click", (event) => {
208
  if (event.target.tagName !== "rect") return;
209
-
210
  const blockId = event.target.id;
211
  const block = blocksById[blockId];
212
-
213
  blockInfoDialog.querySelector("h1").innerHTML = `
214
  $${blockId} <span style="color: $${blockTypeColor(block.block_type)}">($${block.block_type})</span>
215
  `;
216
  blockInfoDialog.querySelector(".text-content").textContent = block.html;
217
-
218
  blockInfoDialog.dataset.blockJSON = JSON.stringify(block, null, 2);
219
-
220
  if (block.images) {
221
  const imagesDiv = blockInfoDialog.querySelector(".images");
222
  imagesDiv.innerHTML = "";
 
114
  <body>
115
  <div style="text-align: center" class="image-container">
116
  <dialog id="block-info-dialog">
117
+ <button
118
  class="close-button"
119
  onclick="document.querySelector('#block-info-dialog').close()"
120
  ></button>
 
147
  const BLOCK_TYPES = $block_types_json;
148
  const blocksById = {};
149
  const blockInfoDialog = document.querySelector("dialog#block-info-dialog");
150
+
151
  function blockTypeColor(blockType) {
152
  return COLORS[BLOCK_TYPES[blockType] % COLORS.length];
153
  }
154
+
155
  function traverseAndGenerateSVG(block) {
156
  let svg = "";
157
+
158
  if (block.polygon) {
159
  const color = blockTypeColor(block.block_type);
160
+
161
  // dollar signs are escaped because this files gets read into a template string
162
  svg += `<rect id="$${block.id}"
163
  class="block type-$${block.block_type}"
 
171
  }"
172
  fill=$${color} stroke=$${color}>
173
  </rect>`;
174
+
175
  blocksById[block.id] = block;
176
  }
177
+
178
  if (Array.isArray(block.children) && block.children.length > 0) {
179
  block.children.forEach((child) => {
180
  svg += traverseAndGenerateSVG(child);
181
  });
182
  }
183
+
184
  return svg;
185
  }
186
+
187
  if (Object.keys(BLOCKS).length == 0) {
188
  // bail out if no blocks
189
  return;
190
  }
191
+
192
  const [vbWidth, vbHeight] = BLOCKS.children[0].polygon[2];
193
  document
194
  .querySelector("svg")
195
  .setAttribute("viewBox", `0 0 $${vbWidth} $${vbHeight}`);
196
+
197
  const blocksOverlay = document.querySelector(".blocks-overlay");
198
  blocksOverlay.innerHTML = traverseAndGenerateSVG(BLOCKS.children[0]);
199
+
200
  tippy("rect.block", {
201
  content: (block) => block.getAttribute("data-type"),
202
  placement: "top-start",
203
  arrow: false,
204
  offset: [0, 5],
205
  });
206
+
207
  blocksOverlay.addEventListener("click", (event) => {
208
  if (event.target.tagName !== "rect") return;
209
+
210
  const blockId = event.target.id;
211
  const block = blocksById[blockId];
212
+
213
  blockInfoDialog.querySelector("h1").innerHTML = `
214
  $${blockId} <span style="color: $${blockTypeColor(block.block_type)}">($${block.block_type})</span>
215
  `;
216
  blockInfoDialog.querySelector(".text-content").textContent = block.html;
217
+
218
  blockInfoDialog.dataset.blockJSON = JSON.stringify(block, null, 2);
219
+
220
  if (block.images) {
221
  const imagesDiv = blockInfoDialog.querySelector(".images");
222
  imagesDiv.innerHTML = "";
poetry.lock CHANGED
@@ -414,13 +414,13 @@ files = [
414
 
415
  [[package]]
416
  name = "certifi"
417
- version = "2024.12.14"
418
  description = "Python package for providing Mozilla's CA Bundle."
419
  optional = false
420
  python-versions = ">=3.6"
421
  files = [
422
- {file = "certifi-2024.12.14-py3-none-any.whl", hash = "sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56"},
423
- {file = "certifi-2024.12.14.tar.gz", hash = "sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db"},
424
  ]
425
 
426
  [[package]]
@@ -1212,6 +1212,92 @@ protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4
1212
  [package.extras]
1213
  grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"]
1214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1215
  [[package]]
1216
  name = "grpcio"
1217
  version = "1.70.0"
@@ -1905,6 +1991,17 @@ files = [
1905
  {file = "jupyterlab_widgets-3.0.13.tar.gz", hash = "sha256:a2966d385328c1942b683a8cd96b89b8dd82c8b8f81dda902bb2bc06d46f5bed"},
1906
  ]
1907
 
 
 
 
 
 
 
 
 
 
 
 
1908
  [[package]]
1909
  name = "lxml"
1910
  version = "5.3.0"
@@ -2729,6 +2826,18 @@ files = [
2729
  [package.dependencies]
2730
  nvidia-nvjitlink-cu12 = "*"
2731
 
 
 
 
 
 
 
 
 
 
 
 
 
2732
  [[package]]
2733
  name = "nvidia-nccl-cu12"
2734
  version = "2.21.5"
@@ -3065,6 +3174,26 @@ docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-a
3065
  test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"]
3066
  type = ["mypy (>=1.11.2)"]
3067
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3068
  [[package]]
3069
  name = "pluggy"
3070
  version = "1.5.0"
@@ -3552,6 +3681,23 @@ numpy = ">=1.16.4"
3552
  carto = ["pydeck-carto"]
3553
  jupyter = ["ipykernel (>=5.1.2)", "ipython (>=5.8.0)", "ipywidgets (>=7,<8)", "traitlets (>=4.3.2)"]
3554
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3555
  [[package]]
3556
  name = "pygments"
3557
  version = "2.19.1"
@@ -3696,13 +3842,13 @@ files = [
3696
 
3697
  [[package]]
3698
  name = "pytz"
3699
- version = "2024.2"
3700
  description = "World timezone definitions, modern and historical"
3701
  optional = false
3702
  python-versions = "*"
3703
  files = [
3704
- {file = "pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725"},
3705
- {file = "pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a"},
3706
  ]
3707
 
3708
  [[package]]
@@ -4641,13 +4787,13 @@ snowflake = ["snowflake-connector-python (>=2.8.0)", "snowflake-snowpark-python[
4641
 
4642
  [[package]]
4643
  name = "surya-ocr"
4644
- version = "0.10.1"
4645
  description = "OCR, layout, reading order, and table recognition in 90+ languages"
4646
  optional = false
4647
  python-versions = "<4.0,>=3.10"
4648
  files = [
4649
- {file = "surya_ocr-0.10.1-py3-none-any.whl", hash = "sha256:39fdc04ae1531e4b2ceb784e481a22941e53bb72f876fa1638677b5c4bd3c784"},
4650
- {file = "surya_ocr-0.10.1.tar.gz", hash = "sha256:0e57975df87f0dcc17ea6ff06dfe68ff5308c6610e42608a1038f8cbbd044e35"},
4651
  ]
4652
 
4653
  [package.dependencies]
@@ -4659,7 +4805,7 @@ pydantic = ">=2.5.3,<3.0.0"
4659
  pydantic-settings = ">=2.1.0,<3.0.0"
4660
  pypdfium2 = "4.30.0"
4661
  python-dotenv = ">=1.0.0,<2.0.0"
4662
- torch = ">=2.5.1,<2.6.0"
4663
  transformers = ">=4.41.0,<5.0.0"
4664
 
4665
  [[package]]
@@ -4844,28 +4990,31 @@ files = [
4844
 
4845
  [[package]]
4846
  name = "torch"
4847
- version = "2.5.1"
4848
  description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
4849
  optional = false
4850
- python-versions = ">=3.8.0"
4851
  files = [
4852
- {file = "torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:71328e1bbe39d213b8721678f9dcac30dfc452a46d586f1d514a6aa0a99d4744"},
4853
- {file = "torch-2.5.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:34bfa1a852e5714cbfa17f27c49d8ce35e1b7af5608c4bc6e81392c352dbc601"},
4854
- {file = "torch-2.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:32a037bd98a241df6c93e4c789b683335da76a2ac142c0973675b715102dc5fa"},
4855
- {file = "torch-2.5.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:23d062bf70776a3d04dbe74db950db2a5245e1ba4f27208a87f0d743b0d06e86"},
4856
- {file = "torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:de5b7d6740c4b636ef4db92be922f0edc425b65ed78c5076c43c42d362a45457"},
4857
- {file = "torch-2.5.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:340ce0432cad0d37f5a31be666896e16788f1adf8ad7be481196b503dad675b9"},
4858
- {file = "torch-2.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:603c52d2fe06433c18b747d25f5c333f9c1d58615620578c326d66f258686f9a"},
4859
- {file = "torch-2.5.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:31f8c39660962f9ae4eeec995e3049b5492eb7360dd4f07377658ef4d728fa4c"},
4860
- {file = "torch-2.5.1-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:ed231a4b3a5952177fafb661213d690a72caaad97d5824dd4fc17ab9e15cec03"},
4861
- {file = "torch-2.5.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:3f4b7f10a247e0dcd7ea97dc2d3bfbfc90302ed36d7f3952b0008d0df264e697"},
4862
- {file = "torch-2.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:73e58e78f7d220917c5dbfad1a40e09df9929d3b95d25e57d9f8558f84c9a11c"},
4863
- {file = "torch-2.5.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:8c712df61101964eb11910a846514011f0b6f5920c55dbf567bff8a34163d5b1"},
4864
- {file = "torch-2.5.1-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:9b61edf3b4f6e3b0e0adda8b3960266b9009d02b37555971f4d1c8f7a05afed7"},
4865
- {file = "torch-2.5.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:1f3b7fb3cf7ab97fae52161423f81be8c6b8afac8d9760823fd623994581e1a3"},
4866
- {file = "torch-2.5.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:7974e3dce28b5a21fb554b73e1bc9072c25dde873fa00d54280861e7a009d7dc"},
4867
- {file = "torch-2.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:46c817d3ea33696ad3b9df5e774dba2257e9a4cd3c4a3afbf92f6bb13ac5ce2d"},
4868
- {file = "torch-2.5.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:8046768b7f6d35b85d101b4b38cba8aa2f3cd51952bc4c06a49580f2ce682291"},
 
 
 
4869
  ]
4870
 
4871
  [package.dependencies]
@@ -4882,17 +5031,18 @@ nvidia-cufft-cu12 = {version = "11.2.1.3", markers = "platform_system == \"Linux
4882
  nvidia-curand-cu12 = {version = "10.3.5.147", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
4883
  nvidia-cusolver-cu12 = {version = "11.6.1.9", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
4884
  nvidia-cusparse-cu12 = {version = "12.3.1.170", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
 
4885
  nvidia-nccl-cu12 = {version = "2.21.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
4886
  nvidia-nvjitlink-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
4887
  nvidia-nvtx-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
4888
  setuptools = {version = "*", markers = "python_version >= \"3.12\""}
4889
  sympy = {version = "1.13.1", markers = "python_version >= \"3.9\""}
4890
- triton = {version = "3.1.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.13\""}
4891
- typing-extensions = ">=4.8.0"
4892
 
4893
  [package.extras]
4894
  opt-einsum = ["opt-einsum (>=3.3)"]
4895
- optree = ["optree (>=0.12.0)"]
4896
 
4897
  [[package]]
4898
  name = "tornado"
@@ -5021,21 +5171,18 @@ vision = ["Pillow (>=10.0.1,<=15.0)"]
5021
 
5022
  [[package]]
5023
  name = "triton"
5024
- version = "3.1.0"
5025
  description = "A language and compiler for custom Deep Learning operations"
5026
  optional = false
5027
  python-versions = "*"
5028
  files = [
5029
- {file = "triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8"},
5030
- {file = "triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f34f6e7885d1bf0eaaf7ba875a5f0ce6f3c13ba98f9503651c1e6dc6757ed5c"},
5031
- {file = "triton-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8182f42fd8080a7d39d666814fa36c5e30cc00ea7eeeb1a2983dbb4c99a0fdc"},
5032
- {file = "triton-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dadaca7fc24de34e180271b5cf864c16755702e9f63a16f62df714a8099126a"},
5033
- {file = "triton-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aafa9a20cd0d9fee523cd4504aa7131807a864cd77dcf6efe7e981f18b8c6c11"},
5034
  ]
5035
 
5036
- [package.dependencies]
5037
- filelock = "*"
5038
-
5039
  [package.extras]
5040
  build = ["cmake (>=3.20)", "lit"]
5041
  tests = ["autopep8", "flake8", "isort", "llnl-hatchet", "numpy", "pytest", "scipy (>=1.7.1)"]
@@ -5468,4 +5615,4 @@ propcache = ">=0.2.0"
5468
  [metadata]
5469
  lock-version = "2.0"
5470
  python-versions = "^3.10"
5471
- content-hash = "294f3036e322ab123bc681335d96606bbc2c8cb52a8a2c253874725b3180c2f7"
 
414
 
415
  [[package]]
416
  name = "certifi"
417
+ version = "2025.1.31"
418
  description = "Python package for providing Mozilla's CA Bundle."
419
  optional = false
420
  python-versions = ">=3.6"
421
  files = [
422
+ {file = "certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe"},
423
+ {file = "certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651"},
424
  ]
425
 
426
  [[package]]
 
1212
  [package.extras]
1213
  grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"]
1214
 
1215
+ [[package]]
1216
+ name = "greenlet"
1217
+ version = "3.1.1"
1218
+ description = "Lightweight in-process concurrent programming"
1219
+ optional = false
1220
+ python-versions = ">=3.7"
1221
+ files = [
1222
+ {file = "greenlet-3.1.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:0bbae94a29c9e5c7e4a2b7f0aae5c17e8e90acbfd3bf6270eeba60c39fce3563"},
1223
+ {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fde093fb93f35ca72a556cf72c92ea3ebfda3d79fc35bb19fbe685853869a83"},
1224
+ {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:36b89d13c49216cadb828db8dfa6ce86bbbc476a82d3a6c397f0efae0525bdd0"},
1225
+ {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94b6150a85e1b33b40b1464a3f9988dcc5251d6ed06842abff82e42632fac120"},
1226
+ {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:93147c513fac16385d1036b7e5b102c7fbbdb163d556b791f0f11eada7ba65dc"},
1227
+ {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da7a9bff22ce038e19bf62c4dd1ec8391062878710ded0a845bcf47cc0200617"},
1228
+ {file = "greenlet-3.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b2795058c23988728eec1f36a4e5e4ebad22f8320c85f3587b539b9ac84128d7"},
1229
+ {file = "greenlet-3.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ed10eac5830befbdd0c32f83e8aa6288361597550ba669b04c48f0f9a2c843c6"},
1230
+ {file = "greenlet-3.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:77c386de38a60d1dfb8e55b8c1101d68c79dfdd25c7095d51fec2dd800892b80"},
1231
+ {file = "greenlet-3.1.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:e4d333e558953648ca09d64f13e6d8f0523fa705f51cae3f03b5983489958c70"},
1232
+ {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09fc016b73c94e98e29af67ab7b9a879c307c6731a2c9da0db5a7d9b7edd1159"},
1233
+ {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d5e975ca70269d66d17dd995dafc06f1b06e8cb1ec1e9ed54c1d1e4a7c4cf26e"},
1234
+ {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2813dc3de8c1ee3f924e4d4227999285fd335d1bcc0d2be6dc3f1f6a318ec1"},
1235
+ {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e347b3bfcf985a05e8c0b7d462ba6f15b1ee1c909e2dcad795e49e91b152c383"},
1236
+ {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e8f8c9cb53cdac7ba9793c276acd90168f416b9ce36799b9b885790f8ad6c0a"},
1237
+ {file = "greenlet-3.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:62ee94988d6b4722ce0028644418d93a52429e977d742ca2ccbe1c4f4a792511"},
1238
+ {file = "greenlet-3.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1776fd7f989fc6b8d8c8cb8da1f6b82c5814957264d1f6cf818d475ec2bf6395"},
1239
+ {file = "greenlet-3.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:48ca08c771c268a768087b408658e216133aecd835c0ded47ce955381105ba39"},
1240
+ {file = "greenlet-3.1.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:4afe7ea89de619adc868e087b4d2359282058479d7cfb94970adf4b55284574d"},
1241
+ {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f406b22b7c9a9b4f8aa9d2ab13d6ae0ac3e85c9a809bd590ad53fed2bf70dc79"},
1242
+ {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c3a701fe5a9695b238503ce5bbe8218e03c3bcccf7e204e455e7462d770268aa"},
1243
+ {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2846930c65b47d70b9d178e89c7e1a69c95c1f68ea5aa0a58646b7a96df12441"},
1244
+ {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99cfaa2110534e2cf3ba31a7abcac9d328d1d9f1b95beede58294a60348fba36"},
1245
+ {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1443279c19fca463fc33e65ef2a935a5b09bb90f978beab37729e1c3c6c25fe9"},
1246
+ {file = "greenlet-3.1.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b7cede291382a78f7bb5f04a529cb18e068dd29e0fb27376074b6d0317bf4dd0"},
1247
+ {file = "greenlet-3.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:23f20bb60ae298d7d8656c6ec6db134bca379ecefadb0b19ce6f19d1f232a942"},
1248
+ {file = "greenlet-3.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:7124e16b4c55d417577c2077be379514321916d5790fa287c9ed6f23bd2ffd01"},
1249
+ {file = "greenlet-3.1.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:05175c27cb459dcfc05d026c4232f9de8913ed006d42713cb8a5137bd49375f1"},
1250
+ {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:935e943ec47c4afab8965954bf49bfa639c05d4ccf9ef6e924188f762145c0ff"},
1251
+ {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:667a9706c970cb552ede35aee17339a18e8f2a87a51fba2ed39ceeeb1004798a"},
1252
+ {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b8a678974d1f3aa55f6cc34dc480169d58f2e6d8958895d68845fa4ab566509e"},
1253
+ {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efc0f674aa41b92da8c49e0346318c6075d734994c3c4e4430b1c3f853e498e4"},
1254
+ {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0153404a4bb921f0ff1abeb5ce8a5131da56b953eda6e14b88dc6bbc04d2049e"},
1255
+ {file = "greenlet-3.1.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:275f72decf9932639c1c6dd1013a1bc266438eb32710016a1c742df5da6e60a1"},
1256
+ {file = "greenlet-3.1.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:c4aab7f6381f38a4b42f269057aee279ab0fc7bf2e929e3d4abfae97b682a12c"},
1257
+ {file = "greenlet-3.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:b42703b1cf69f2aa1df7d1030b9d77d3e584a70755674d60e710f0af570f3761"},
1258
+ {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1695e76146579f8c06c1509c7ce4dfe0706f49c6831a817ac04eebb2fd02011"},
1259
+ {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7876452af029456b3f3549b696bb36a06db7c90747740c5302f74a9e9fa14b13"},
1260
+ {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4ead44c85f8ab905852d3de8d86f6f8baf77109f9da589cb4fa142bd3b57b475"},
1261
+ {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8320f64b777d00dd7ccdade271eaf0cad6636343293a25074cc5566160e4de7b"},
1262
+ {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6510bf84a6b643dabba74d3049ead221257603a253d0a9873f55f6a59a65f822"},
1263
+ {file = "greenlet-3.1.1-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:04b013dc07c96f83134b1e99888e7a79979f1a247e2a9f59697fa14b5862ed01"},
1264
+ {file = "greenlet-3.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:411f015496fec93c1c8cd4e5238da364e1da7a124bcb293f085bf2860c32c6f6"},
1265
+ {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47da355d8687fd65240c364c90a31569a133b7b60de111c255ef5b606f2ae291"},
1266
+ {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98884ecf2ffb7d7fe6bd517e8eb99d31ff7855a840fa6d0d63cd07c037f6a981"},
1267
+ {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f1d4aeb8891338e60d1ab6127af1fe45def5259def8094b9c7e34690c8858803"},
1268
+ {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db32b5348615a04b82240cc67983cb315309e88d444a288934ee6ceaebcad6cc"},
1269
+ {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dcc62f31eae24de7f8dce72134c8651c58000d3b1868e01392baea7c32c247de"},
1270
+ {file = "greenlet-3.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1d3755bcb2e02de341c55b4fca7a745a24a9e7212ac953f6b3a48d117d7257aa"},
1271
+ {file = "greenlet-3.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:b8da394b34370874b4572676f36acabac172602abf054cbc4ac910219f3340af"},
1272
+ {file = "greenlet-3.1.1-cp37-cp37m-win32.whl", hash = "sha256:a0dfc6c143b519113354e780a50381508139b07d2177cb6ad6a08278ec655798"},
1273
+ {file = "greenlet-3.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:54558ea205654b50c438029505def3834e80f0869a70fb15b871c29b4575ddef"},
1274
+ {file = "greenlet-3.1.1-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:346bed03fe47414091be4ad44786d1bd8bef0c3fcad6ed3dee074a032ab408a9"},
1275
+ {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfc59d69fc48664bc693842bd57acfdd490acafda1ab52c7836e3fc75c90a111"},
1276
+ {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d21e10da6ec19b457b82636209cbe2331ff4306b54d06fa04b7c138ba18c8a81"},
1277
+ {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:37b9de5a96111fc15418819ab4c4432e4f3c2ede61e660b1e33971eba26ef9ba"},
1278
+ {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ef9ea3f137e5711f0dbe5f9263e8c009b7069d8a1acea822bd5e9dae0ae49c8"},
1279
+ {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:85f3ff71e2e60bd4b4932a043fbbe0f499e263c628390b285cb599154a3b03b1"},
1280
+ {file = "greenlet-3.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:95ffcf719966dd7c453f908e208e14cde192e09fde6c7186c8f1896ef778d8cd"},
1281
+ {file = "greenlet-3.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:03a088b9de532cbfe2ba2034b2b85e82df37874681e8c470d6fb2f8c04d7e4b7"},
1282
+ {file = "greenlet-3.1.1-cp38-cp38-win32.whl", hash = "sha256:8b8b36671f10ba80e159378df9c4f15c14098c4fd73a36b9ad715f057272fbef"},
1283
+ {file = "greenlet-3.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:7017b2be767b9d43cc31416aba48aab0d2309ee31b4dbf10a1d38fb7972bdf9d"},
1284
+ {file = "greenlet-3.1.1-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:396979749bd95f018296af156201d6211240e7a23090f50a8d5d18c370084dc3"},
1285
+ {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca9d0ff5ad43e785350894d97e13633a66e2b50000e8a183a50a88d834752d42"},
1286
+ {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f6ff3b14f2df4c41660a7dec01045a045653998784bf8cfcb5a525bdffffbc8f"},
1287
+ {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94ebba31df2aa506d7b14866fed00ac141a867e63143fe5bca82a8e503b36437"},
1288
+ {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73aaad12ac0ff500f62cebed98d8789198ea0e6f233421059fa68a5aa7220145"},
1289
+ {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63e4844797b975b9af3a3fb8f7866ff08775f5426925e1e0bbcfe7932059a12c"},
1290
+ {file = "greenlet-3.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7939aa3ca7d2a1593596e7ac6d59391ff30281ef280d8632fa03d81f7c5f955e"},
1291
+ {file = "greenlet-3.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d0028e725ee18175c6e422797c407874da24381ce0690d6b9396c204c7f7276e"},
1292
+ {file = "greenlet-3.1.1-cp39-cp39-win32.whl", hash = "sha256:5e06afd14cbaf9e00899fae69b24a32f2196c19de08fcb9f4779dd4f004e5e7c"},
1293
+ {file = "greenlet-3.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:3319aa75e0e0639bc15ff54ca327e8dc7a6fe404003496e3c6925cd3142e0e22"},
1294
+ {file = "greenlet-3.1.1.tar.gz", hash = "sha256:4ce3ac6cdb6adf7946475d7ef31777c26d94bccc377e070a7986bd2d5c515467"},
1295
+ ]
1296
+
1297
+ [package.extras]
1298
+ docs = ["Sphinx", "furo"]
1299
+ test = ["objgraph", "psutil"]
1300
+
1301
  [[package]]
1302
  name = "grpcio"
1303
  version = "1.70.0"
 
1991
  {file = "jupyterlab_widgets-3.0.13.tar.gz", hash = "sha256:a2966d385328c1942b683a8cd96b89b8dd82c8b8f81dda902bb2bc06d46f5bed"},
1992
  ]
1993
 
1994
+ [[package]]
1995
+ name = "latex2mathml"
1996
+ version = "3.77.0"
1997
+ description = "Pure Python library for LaTeX to MathML conversion"
1998
+ optional = false
1999
+ python-versions = ">=3.8.1,<4.0.0"
2000
+ files = [
2001
+ {file = "latex2mathml-3.77.0-py3-none-any.whl", hash = "sha256:5531e18a2a9eae7c24e257118b6a444cbba253cd27ff3e81f1bd6c41e88e786e"},
2002
+ {file = "latex2mathml-3.77.0.tar.gz", hash = "sha256:e2f501d1878f2e489c3f6f12786bef74c62f712d2770f7f3c837eb20a55d0a1e"},
2003
+ ]
2004
+
2005
  [[package]]
2006
  name = "lxml"
2007
  version = "5.3.0"
 
2826
  [package.dependencies]
2827
  nvidia-nvjitlink-cu12 = "*"
2828
 
2829
+ [[package]]
2830
+ name = "nvidia-cusparselt-cu12"
2831
+ version = "0.6.2"
2832
+ description = "NVIDIA cuSPARSELt"
2833
+ optional = false
2834
+ python-versions = "*"
2835
+ files = [
2836
+ {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:067a7f6d03ea0d4841c85f0c6f1991c5dda98211f6302cb83a4ab234ee95bef8"},
2837
+ {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9"},
2838
+ {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-win_amd64.whl", hash = "sha256:0057c91d230703924c0422feabe4ce768841f9b4b44d28586b6f6d2eb86fbe70"},
2839
+ ]
2840
+
2841
  [[package]]
2842
  name = "nvidia-nccl-cu12"
2843
  version = "2.21.5"
 
3174
  test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"]
3175
  type = ["mypy (>=1.11.2)"]
3176
 
3177
+ [[package]]
3178
+ name = "playwright"
3179
+ version = "1.49.1"
3180
+ description = "A high-level API to automate web browsers"
3181
+ optional = false
3182
+ python-versions = ">=3.9"
3183
+ files = [
3184
+ {file = "playwright-1.49.1-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:1041ffb45a0d0bc44d698d3a5aa3ac4b67c9bd03540da43a0b70616ad52592b8"},
3185
+ {file = "playwright-1.49.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:9f38ed3d0c1f4e0a6d1c92e73dd9a61f8855133249d6f0cec28648d38a7137be"},
3186
+ {file = "playwright-1.49.1-py3-none-macosx_11_0_universal2.whl", hash = "sha256:3be48c6d26dc819ca0a26567c1ae36a980a0303dcd4249feb6f59e115aaddfb8"},
3187
+ {file = "playwright-1.49.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:753ca90ee31b4b03d165cfd36e477309ebf2b4381953f2a982ff612d85b147d2"},
3188
+ {file = "playwright-1.49.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd9bc8dab37aa25198a01f555f0a2e2c3813fe200fef018ac34dfe86b34994b9"},
3189
+ {file = "playwright-1.49.1-py3-none-win32.whl", hash = "sha256:43b304be67f096058e587dac453ece550eff87b8fbed28de30f4f022cc1745bb"},
3190
+ {file = "playwright-1.49.1-py3-none-win_amd64.whl", hash = "sha256:47b23cb346283278f5b4d1e1990bcb6d6302f80c0aa0ca93dd0601a1400191df"},
3191
+ ]
3192
+
3193
+ [package.dependencies]
3194
+ greenlet = "3.1.1"
3195
+ pyee = "12.0.0"
3196
+
3197
  [[package]]
3198
  name = "pluggy"
3199
  version = "1.5.0"
 
3681
  carto = ["pydeck-carto"]
3682
  jupyter = ["ipykernel (>=5.1.2)", "ipython (>=5.8.0)", "ipywidgets (>=7,<8)", "traitlets (>=4.3.2)"]
3683
 
3684
+ [[package]]
3685
+ name = "pyee"
3686
+ version = "12.0.0"
3687
+ description = "A rough port of Node.js's EventEmitter to Python with a few tricks of its own"
3688
+ optional = false
3689
+ python-versions = ">=3.8"
3690
+ files = [
3691
+ {file = "pyee-12.0.0-py3-none-any.whl", hash = "sha256:7b14b74320600049ccc7d0e0b1becd3b4bd0a03c745758225e31a59f4095c990"},
3692
+ {file = "pyee-12.0.0.tar.gz", hash = "sha256:c480603f4aa2927d4766eb41fa82793fe60a82cbfdb8d688e0d08c55a534e145"},
3693
+ ]
3694
+
3695
+ [package.dependencies]
3696
+ typing-extensions = "*"
3697
+
3698
+ [package.extras]
3699
+ dev = ["black", "build", "flake8", "flake8-black", "isort", "jupyter-console", "mkdocs", "mkdocs-include-markdown-plugin", "mkdocstrings[python]", "pytest", "pytest-asyncio", "pytest-trio", "sphinx", "toml", "tox", "trio", "trio", "trio-typing", "twine", "twisted", "validate-pyproject[all]"]
3700
+
3701
  [[package]]
3702
  name = "pygments"
3703
  version = "2.19.1"
 
3842
 
3843
  [[package]]
3844
  name = "pytz"
3845
+ version = "2025.1"
3846
  description = "World timezone definitions, modern and historical"
3847
  optional = false
3848
  python-versions = "*"
3849
  files = [
3850
+ {file = "pytz-2025.1-py2.py3-none-any.whl", hash = "sha256:89dd22dca55b46eac6eda23b2d72721bf1bdfef212645d81513ef5d03038de57"},
3851
+ {file = "pytz-2025.1.tar.gz", hash = "sha256:c2db42be2a2518b28e65f9207c4d05e6ff547d1efa4086469ef855e4ab70178e"},
3852
  ]
3853
 
3854
  [[package]]
 
4787
 
4788
  [[package]]
4789
  name = "surya-ocr"
4790
+ version = "0.10.2"
4791
  description = "OCR, layout, reading order, and table recognition in 90+ languages"
4792
  optional = false
4793
  python-versions = "<4.0,>=3.10"
4794
  files = [
4795
+ {file = "surya_ocr-0.10.2-py3-none-any.whl", hash = "sha256:fbb590ae92b2a785e75ca25a53dd2ff59b1f56ec017a22f6127c9c7c62a1b910"},
4796
+ {file = "surya_ocr-0.10.2.tar.gz", hash = "sha256:ddbaf5d2f2cc0a08992446f889f782aa81e9e1cfa3fd957c124273365d411057"},
4797
  ]
4798
 
4799
  [package.dependencies]
 
4805
  pydantic-settings = ">=2.1.0,<3.0.0"
4806
  pypdfium2 = "4.30.0"
4807
  python-dotenv = ">=1.0.0,<2.0.0"
4808
+ torch = ">=2.5.1,<3.0.0"
4809
  transformers = ">=4.41.0,<5.0.0"
4810
 
4811
  [[package]]
 
4990
 
4991
  [[package]]
4992
  name = "torch"
4993
+ version = "2.6.0"
4994
  description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
4995
  optional = false
4996
+ python-versions = ">=3.9.0"
4997
  files = [
4998
+ {file = "torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6860df13d9911ac158f4c44031609700e1eba07916fff62e21e6ffa0a9e01961"},
4999
+ {file = "torch-2.6.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c4f103a49830ce4c7561ef4434cc7926e5a5fe4e5eb100c19ab36ea1e2b634ab"},
5000
+ {file = "torch-2.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:56eeaf2ecac90da5d9e35f7f35eb286da82673ec3c582e310a8d1631a1c02341"},
5001
+ {file = "torch-2.6.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:09e06f9949e1a0518c5b09fe95295bc9661f219d9ecb6f9893e5123e10696628"},
5002
+ {file = "torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:7979834102cd5b7a43cc64e87f2f3b14bd0e1458f06e9f88ffa386d07c7446e1"},
5003
+ {file = "torch-2.6.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:ccbd0320411fe1a3b3fec7b4d3185aa7d0c52adac94480ab024b5c8f74a0bf1d"},
5004
+ {file = "torch-2.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:46763dcb051180ce1ed23d1891d9b1598e07d051ce4c9d14307029809c4d64f7"},
5005
+ {file = "torch-2.6.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:94fc63b3b4bedd327af588696559f68c264440e2503cc9e6954019473d74ae21"},
5006
+ {file = "torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2bb8987f3bb1ef2675897034402373ddfc8f5ef0e156e2d8cfc47cacafdda4a9"},
5007
+ {file = "torch-2.6.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b789069020c5588c70d5c2158ac0aa23fd24a028f34a8b4fcb8fcb4d7efcf5fb"},
5008
+ {file = "torch-2.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7e1448426d0ba3620408218b50aa6ada88aeae34f7a239ba5431f6c8774b1239"},
5009
+ {file = "torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989"},
5010
+ {file = "torch-2.6.0-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:4874a73507a300a5d089ceaff616a569e7bb7c613c56f37f63ec3ffac65259cf"},
5011
+ {file = "torch-2.6.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a0d5e1b9874c1a6c25556840ab8920569a7a4137afa8a63a32cee0bc7d89bd4b"},
5012
+ {file = "torch-2.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:510c73251bee9ba02ae1cb6c9d4ee0907b3ce6020e62784e2d7598e0cfa4d6cc"},
5013
+ {file = "torch-2.6.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:ff96f4038f8af9f7ec4231710ed4549da1bdebad95923953a25045dcf6fd87e2"},
5014
+ {file = "torch-2.6.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:9ea955317cfcd3852b1402b62af258ce735c2edeee42ca9419b6bc889e5ae053"},
5015
+ {file = "torch-2.6.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:bb2c6c3e65049f081940f5ab15c9136c7de40d3f01192541c920a07c7c585b7e"},
5016
+ {file = "torch-2.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:683410f97984103148e31b38a8631acf31c3034c020c0f4d26171e7626d8317a"},
5017
+ {file = "torch-2.6.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:265f70de5fd45b864d924b64be1797f86e76c8e48a02c2a3a6fc7ec247d2226c"},
5018
  ]
5019
 
5020
  [package.dependencies]
 
5031
  nvidia-curand-cu12 = {version = "10.3.5.147", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
5032
  nvidia-cusolver-cu12 = {version = "11.6.1.9", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
5033
  nvidia-cusparse-cu12 = {version = "12.3.1.170", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
5034
+ nvidia-cusparselt-cu12 = {version = "0.6.2", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
5035
  nvidia-nccl-cu12 = {version = "2.21.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
5036
  nvidia-nvjitlink-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
5037
  nvidia-nvtx-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
5038
  setuptools = {version = "*", markers = "python_version >= \"3.12\""}
5039
  sympy = {version = "1.13.1", markers = "python_version >= \"3.9\""}
5040
+ triton = {version = "3.2.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
5041
+ typing-extensions = ">=4.10.0"
5042
 
5043
  [package.extras]
5044
  opt-einsum = ["opt-einsum (>=3.3)"]
5045
+ optree = ["optree (>=0.13.0)"]
5046
 
5047
  [[package]]
5048
  name = "tornado"
 
5171
 
5172
  [[package]]
5173
  name = "triton"
5174
+ version = "3.2.0"
5175
  description = "A language and compiler for custom Deep Learning operations"
5176
  optional = false
5177
  python-versions = "*"
5178
  files = [
5179
+ {file = "triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3e54983cd51875855da7c68ec05c05cf8bb08df361b1d5b69e05e40b0c9bd62"},
5180
+ {file = "triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8009a1fb093ee8546495e96731336a33fb8856a38e45bb4ab6affd6dbc3ba220"},
5181
+ {file = "triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d9b215efc1c26fa7eefb9a157915c92d52e000d2bf83e5f69704047e63f125c"},
5182
+ {file = "triton-3.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5dfa23ba84541d7c0a531dfce76d8bcd19159d50a4a8b14ad01e91734a5c1b0"},
5183
+ {file = "triton-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30ceed0eff2c4a73b14eb63e052992f44bbdf175f3fad21e1ac8097a772de7ee"},
5184
  ]
5185
 
 
 
 
5186
  [package.extras]
5187
  build = ["cmake (>=3.20)", "lit"]
5188
  tests = ["autopep8", "flake8", "isort", "llnl-hatchet", "numpy", "pytest", "scipy (>=1.7.1)"]
 
5615
  [metadata]
5616
  lock-version = "2.0"
5617
  python-versions = "^3.10"
5618
+ content-hash = "589d4265c99bb94e935eeae053707638d72da1eaca38f0d60c832210703bd5bc"
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
  [tool.poetry]
2
  name = "marker-pdf"
3
- version = "1.3.3"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
@@ -22,11 +22,11 @@ pydantic = "^2.4.2"
22
  pydantic-settings = "^2.0.3"
23
  transformers = "^4.45.2"
24
  python-dotenv = "^1.0.0"
25
- torch = "~2.5.1" # 2.6.0 appears to fail with mps
26
  tqdm = "^4.66.1"
27
  ftfy = "^6.1.1"
28
  rapidfuzz = "^3.8.1"
29
- surya-ocr = "~0.10.1"
30
  regex = "^2024.4.28"
31
  pdftext = "~0.5.1"
32
  markdownify = "^0.13.1"
@@ -49,6 +49,8 @@ apted = "1.0.3"
49
  distance = "0.1.3"
50
  lxml = "5.3.0"
51
  tabulate = "^0.9.0"
 
 
52
 
53
  [tool.poetry.scripts]
54
  marker = "marker.scripts.convert:convert_cli"
 
1
  [tool.poetry]
2
  name = "marker-pdf"
3
+ version = "1.3.4"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
 
22
  pydantic-settings = "^2.0.3"
23
  transformers = "^4.45.2"
24
  python-dotenv = "^1.0.0"
25
+ torch = "^2.5.1"
26
  tqdm = "^4.66.1"
27
  ftfy = "^6.1.1"
28
  rapidfuzz = "^3.8.1"
29
+ surya-ocr = "~0.10.2"
30
  regex = "^2024.4.28"
31
  pdftext = "~0.5.1"
32
  markdownify = "^0.13.1"
 
49
  distance = "0.1.3"
50
  lxml = "5.3.0"
51
  tabulate = "^0.9.0"
52
+ latex2mathml = "^3.77.0"
53
+ playwright = "^1.49.1"
54
 
55
  [tool.poetry.scripts]
56
  marker = "marker.scripts.convert:convert_cli"