Improve bench
Browse files
- benchmarks/overall/clean.py +125 -0
- benchmarks/overall/inference.py +16 -16
- benchmarks/overall/overall.py +11 -1
- benchmarks/overall/render.py +109 -0
- benchmarks/overall/schema.py +1 -2
- benchmarks/overall/scoring.py +7 -103
- marker/scripts/streamlit_app.py +4 -1
- marker/scripts/streamlit_app_blocks_viz.html +17 -17
- poetry.lock +189 -42
- pyproject.toml +5 -3
benchmarks/overall/clean.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import subprocess
|
| 3 |
+
import tempfile
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import latex2mathml.converter
|
| 7 |
+
|
| 8 |
+
from marker.renderers.markdown import MarkdownRenderer
|
| 9 |
+
|
| 10 |
+
class MarkdownCleaner:
    """Normalize markdown so text produced by different converters can be compared.

    Calling an instance runs the full pipeline: a pandoc round trip, math
    standardization, image-link replacement, stray-HTML removal, whitespace and
    punctuation collapsing, and finally lower-casing.
    """

    def __call__(self, markdown):
        """Return the normalized, lower-cased form of ``markdown``.

        Requires the ``pandoc`` executable (see ``normalize_markdown``).
        """
        markdown = self.normalize_markdown(markdown)  # Use pandoc to normalize

        # Replace math expressions with latexml
        pattern = r'(?<!\\)\$(?:\$([^$]+)\$\$|\s*([^$\n]+?)\s*\$)'
        markdown = re.sub(pattern, self.standardize_math, markdown)

        # Replace image urls with a generic tag
        pattern = r'!\[(.*?)\]\((https?://[^\s\)]+)\)'
        markdown = re.sub(pattern, r'![link]', markdown)

        # Clean up stray html tags
        markdown = markdown.replace("<br>", "\n")
        markdown = re.sub(r"<sub>(.*?)</sub>", r"\1", markdown)
        markdown = re.sub(r"<sup>(.*?)</sup>", r"\1", markdown)
        markdown = re.sub(r"<span.*?>(.*?)</span>", r"\1", markdown)  # Remove span tags and keep content

        # Clean up markdown formatting.
        # Collapsing all whitespace (newlines included) to single spaces makes a
        # separate newline-collapsing pass redundant.
        markdown = re.sub(r"\s+", " ", markdown)
        markdown = re.sub("\\.+", ".", markdown)  # Replace repeated periods with a single period, like in table of contents
        markdown = re.sub("#+", "#", markdown)  # Replace repeated headers with a single header
        markdown = self._decode_unicode_escapes(markdown)  # Decode unicode characters properly
        return markdown.strip().lower()

    @staticmethod
    def _decode_unicode_escapes(text):
        r"""Decode literal ``\uXXXX`` / ``\UXXXXXXXX`` escapes and nothing else.

        A blanket ``encode().decode('unicode-escape')`` re-reads the UTF-8 bytes
        as Latin-1 (turning non-ASCII text into mojibake) and also interprets
        LaTeX backslashes such as ``\frac`` or ``\alpha`` as control characters.
        Only explicit unicode escapes are translated here.
        """
        def _sub(match):
            code_point = int(match.group(1) or match.group(2), 16)
            # Leave lone surrogates and out-of-range values untouched.
            if 0xD800 <= code_point <= 0xDFFF or code_point > 0x10FFFF:
                return match.group(0)
            return chr(code_point)

        return re.sub(r'\\u([0-9a-fA-F]{4})|\\U([0-9a-fA-F]{8})', _sub, text)

    @staticmethod
    def normalize_markdown(md_text: str) -> str:
        """Round-trip ``md_text`` through pandoc (markdown -> HTML -> markdown).

        Raises:
            FileNotFoundError: if the ``pandoc`` executable is not available.
            subprocess.CalledProcessError: if either pandoc run fails.
        """
        with tempfile.TemporaryDirectory() as tmp_dir:
            dirpath = Path(tmp_dir)
            input_file = dirpath / 'input.md'
            input_file.write_text(md_text, encoding='utf-8')

            # Markdown to HTML
            html_file = dirpath / 'temp.html'
            subprocess.run(
                [
                    'pandoc',
                    str(input_file),
                    '-f', 'markdown+tex_math_dollars',
                    '-t', 'html',
                    '-o', str(html_file),
                    '--quiet'
                ],
                check=True
            )

            # HTML to Markdown
            output_file = dirpath / 'output.md'
            subprocess.run(
                [
                    'pandoc',
                    str(html_file),
                    '-f', 'html',
                    '-t', 'markdown+tex_math_dollars',
                    '-o', str(output_file),
                    '--quiet'
                ],
                check=True
            )

            # Read back the normalized Markdown
            normalized_md = output_file.read_text(encoding='utf-8')

        return normalized_md

    def standardize_math(self, match):
        """Regex callback that rewrites one ``$...$`` or ``$$...$$`` match.

        Block math is converted with ``latex2mathml``; inline math is simplified
        by ``clean_latex``. This is best effort: on any failure the original
        matched text is returned unchanged and the error is printed.
        """
        try:
            delim = "$$" if match.group(0).startswith('$$') else "$"
            math_content = match.group(1) or match.group(2)
            if delim == "$$":
                math_content = latex2mathml.converter.convert(math_content)
            else:
                math_content = self.clean_latex(math_content)
            return f'{delim}{math_content}{delim}'
        except Exception as e:
            print(f"Failed to standardize math expression: {match.group(0)} with error: {e}")
            return match.group(0)

    @staticmethod
    def clean_latex(latex_str):
        r"""Simplify an inline LaTeX string for text comparison.

        Collapses whitespace, unwraps ``\text{}``/``\mathrm{}``/``\mathbf{}``/
        ``\textbf{}``, and replaces a few operator commands with plain symbols.
        Commands are matched as whole names, so ``\le`` never rewrites the start
        of ``\left`` and ``\to`` never rewrites ``\top``.
        """
        latex_str = re.sub(r'\s+', ' ', latex_str.strip())
        for tag in [r'\\text', r'\\mathrm', r'\\mathbf', r'\\textbf']:
            latex_str = re.sub(tag + r'\{([^}]+)\}', r'\1', latex_str)

        replacements = {
            'times': '*',
            'cdot': '*',
            'div': '/',
            'leq': '<=',
            'le': '<=',
            'geq': '>=',
            'ge': '>=',
            'neq': '!=',
            'ne': '!=',
            'to': '\\rightarrow',
        }

        # The negative lookahead requires the command name to end here, i.e. it
        # must not be a prefix of a longer command.
        command_pattern = r'\\(' + '|'.join(replacements) + r')(?![a-zA-Z])'
        return re.sub(command_pattern, lambda m: replacements[m.group(1)], latex_str)
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def convert_to_md(html):
    """Convert ``html`` to markdown via the converter on marker's ``MarkdownRenderer``.

    A fresh renderer is built on every call and its ``md_cls.convert`` result
    is returned as-is.
    """
    return MarkdownRenderer().md_cls.convert(html)
|
| 119 |
+
|
| 120 |
+
def clean_input(markdown):
    """Run ``markdown`` through a freshly constructed ``MarkdownCleaner``."""
    return MarkdownCleaner()(markdown)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
|
benchmarks/overall/inference.py
CHANGED
|
@@ -1,38 +1,37 @@
|
|
| 1 |
import tempfile
|
| 2 |
import time
|
| 3 |
|
| 4 |
-
from
|
| 5 |
-
|
| 6 |
-
from benchmarks.overall.scoring import score_blocks
|
| 7 |
from benchmarks.overall.schema import BlockScores
|
|
|
|
| 8 |
from marker.converters.pdf import PdfConverter
|
| 9 |
|
| 10 |
-
def
|
| 11 |
block_converter = PdfConverter(
|
| 12 |
artifact_dict=marker_models,
|
| 13 |
-
config={"page_range": [0], "disable_tqdm": True, "use_llm": use_llm}
|
| 14 |
-
renderer="marker.renderers.html.HTMLRenderer"
|
| 15 |
)
|
|
|
|
| 16 |
with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
|
| 17 |
f.write(pdf_bytes)
|
| 18 |
rendered = block_converter(f.name)
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
inner_html = str(soup.find("body").decode_contents())
|
| 22 |
-
return inner_html
|
| 23 |
|
| 24 |
|
| 25 |
-
def marker_scoring_func(model_dict, sample,
|
| 26 |
pdf_bytes = sample["pdf"] # This is a single page PDF
|
| 27 |
start = time.time()
|
| 28 |
-
|
|
|
|
| 29 |
total = time.time() - start
|
| 30 |
-
scores = score_blocks(
|
| 31 |
scores["time"] = total
|
|
|
|
| 32 |
return scores
|
| 33 |
|
| 34 |
|
| 35 |
-
def mathpix_scoring_func(model_dict, sample,
|
| 36 |
uuid = sample["uuid"]
|
| 37 |
data = None
|
| 38 |
for row in mathpix_ds:
|
|
@@ -42,7 +41,8 @@ def mathpix_scoring_func(model_dict, sample, gt_html, mathpix_ds=None, **kwargs)
|
|
| 42 |
if not data:
|
| 43 |
raise ValueError(f"Could not find data for uuid {uuid}")
|
| 44 |
|
| 45 |
-
mathpix_md = data["md"]
|
| 46 |
-
scores = score_blocks(
|
| 47 |
scores["time"] = data["time"]
|
|
|
|
| 48 |
return scores
|
|
|
|
| 1 |
import tempfile
|
| 2 |
import time
|
| 3 |
|
| 4 |
+
from benchmarks.overall.clean import clean_input
|
|
|
|
|
|
|
| 5 |
from benchmarks.overall.schema import BlockScores
|
| 6 |
+
from benchmarks.overall.scoring import score_blocks
|
| 7 |
from marker.converters.pdf import PdfConverter
|
| 8 |
|
| 9 |
+
def get_marker_markdown(marker_models: dict, pdf_bytes: bytes, use_llm: bool):
    """Convert the first page of a PDF to markdown with marker.

    Args:
        marker_models: passed to ``PdfConverter`` as ``artifact_dict``.
        pdf_bytes: raw bytes of the PDF to convert.
        use_llm: forwarded to the converter config as ``use_llm``.

    Returns:
        The ``markdown`` attribute of the converter's rendered output.
    """
    block_converter = PdfConverter(
        artifact_dict=marker_models,
        config={"page_range": [0], "disable_tqdm": True, "use_llm": use_llm}
    )

    with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
        f.write(pdf_bytes)
        # The converter re-opens the file by name, so buffered bytes must reach
        # disk first; without the flush a small PDF can appear empty or cut off.
        f.flush()
        rendered = block_converter(f.name)

    return rendered.markdown
|
|
|
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
+
def marker_scoring_func(model_dict, sample, gt_markdown, use_llm=False, **kwargs) -> BlockScores:
    """Run marker on one sample and score its markdown against the ground truth.

    Args:
        model_dict: model artifacts forwarded to ``get_marker_markdown``.
        sample: dataset row; ``sample["pdf"]`` holds the PDF bytes.
        gt_markdown: ground-truth markdown blocks passed to ``score_blocks``.
        use_llm: forwarded to ``get_marker_markdown``.
        **kwargs: accepted and ignored here.

    Returns:
        The ``score_blocks`` result with ``"time"`` (conversion seconds) and
        ``"markdown"`` (the cleaned marker output) added.
    """
    pdf_bytes = sample["pdf"]  # This is a single page PDF

    # Time only the conversion. Cleaning shells out to pandoc twice and is
    # benchmark post-processing, so it must not count towards marker's time.
    start = time.perf_counter()
    marker_md = get_marker_markdown(model_dict, pdf_bytes, use_llm)
    total = time.perf_counter() - start

    marker_md = clean_input(marker_md)
    scores = score_blocks(gt_markdown, marker_md)
    scores["time"] = total
    scores["markdown"] = marker_md
    return scores
|
| 32 |
|
| 33 |
|
| 34 |
+
def mathpix_scoring_func(model_dict, sample, gt_markdown, mathpix_ds=None, **kwargs) -> BlockScores:
|
| 35 |
uuid = sample["uuid"]
|
| 36 |
data = None
|
| 37 |
for row in mathpix_ds:
|
|
|
|
| 41 |
if not data:
|
| 42 |
raise ValueError(f"Could not find data for uuid {uuid}")
|
| 43 |
|
| 44 |
+
mathpix_md = clean_input(data["md"])
|
| 45 |
+
scores = score_blocks(gt_markdown, mathpix_md)
|
| 46 |
scores["time"] = data["time"]
|
| 47 |
+
scores["markdown"] = mathpix_md
|
| 48 |
return scores
|
benchmarks/overall/overall.py
CHANGED
|
@@ -7,9 +7,11 @@ from typing import Dict
|
|
| 7 |
import click
|
| 8 |
import datasets
|
| 9 |
import tabulate
|
|
|
|
| 10 |
from tqdm import tqdm
|
| 11 |
import pypdfium2 as pdfium
|
| 12 |
|
|
|
|
| 13 |
from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func
|
| 14 |
from benchmarks.overall.schema import FullResult
|
| 15 |
from marker.logger import configure_logging
|
|
@@ -32,7 +34,8 @@ def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_f
|
|
| 32 |
|
| 33 |
try:
|
| 34 |
gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0]
|
| 35 |
-
|
|
|
|
| 36 |
except ValueError as e:
|
| 37 |
print(f"Error with sample {idx}: {e}")
|
| 38 |
continue
|
|
@@ -101,12 +104,14 @@ def print_scores(scores: Dict[str, FullResult], out_path: Path, default_method="
|
|
| 101 |
|
| 102 |
@click.command(help="Benchmark PDF to MD conversion.")
|
| 103 |
@click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
|
|
|
|
| 104 |
@click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. Possible values: mathpix", default="")
|
| 105 |
@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
|
| 106 |
@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
|
| 107 |
@click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
|
| 108 |
def main(
|
| 109 |
dataset: str,
|
|
|
|
| 110 |
other_methods: str,
|
| 111 |
result_path: str,
|
| 112 |
max_rows: int,
|
|
@@ -142,6 +147,11 @@ def main(
|
|
| 142 |
|
| 143 |
print(f"Results saved to {out_path}.")
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
if __name__ == "__main__":
|
| 146 |
main()
|
| 147 |
|
|
|
|
| 7 |
import click
|
| 8 |
import datasets
|
| 9 |
import tabulate
|
| 10 |
+
from benchmarks.overall.render import build_dataset
|
| 11 |
from tqdm import tqdm
|
| 12 |
import pypdfium2 as pdfium
|
| 13 |
|
| 14 |
+
from benchmarks.overall.clean import convert_to_md, clean_input
|
| 15 |
from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func
|
| 16 |
from benchmarks.overall.schema import FullResult
|
| 17 |
from marker.logger import configure_logging
|
|
|
|
| 34 |
|
| 35 |
try:
|
| 36 |
gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0]
|
| 37 |
+
gt_markdown = [clean_input(convert_to_md(block)) for block in gt_html]
|
| 38 |
+
scores = score_func(model_dict, sample, gt_markdown, **kwargs)
|
| 39 |
except ValueError as e:
|
| 40 |
print(f"Error with sample {idx}: {e}")
|
| 41 |
continue
|
|
|
|
| 104 |
|
| 105 |
@click.command(help="Benchmark PDF to MD conversion.")
|
| 106 |
@click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
|
| 107 |
+
@click.option("--out_dataset", type=str, help="Path to the output dataset", default=None)
|
| 108 |
@click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. Possible values: mathpix", default="")
|
| 109 |
@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
|
| 110 |
@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
|
| 111 |
@click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
|
| 112 |
def main(
|
| 113 |
dataset: str,
|
| 114 |
+
out_dataset: str,
|
| 115 |
other_methods: str,
|
| 116 |
result_path: str,
|
| 117 |
max_rows: int,
|
|
|
|
| 147 |
|
| 148 |
print(f"Results saved to {out_path}.")
|
| 149 |
|
| 150 |
+
# Push up comparison dataset
|
| 151 |
+
if out_dataset is not None:
|
| 152 |
+
out_ds = build_dataset(ds, all_scores)
|
| 153 |
+
out_ds.push_to_hub(out_dataset)
|
| 154 |
+
|
| 155 |
if __name__ == "__main__":
|
| 156 |
main()
|
| 157 |
|
benchmarks/overall/render.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import subprocess
|
| 2 |
+
import tempfile
|
| 3 |
+
import pypdfium2 as pdfium
|
| 4 |
+
from typing import Dict
|
| 5 |
+
from collections import defaultdict
|
| 6 |
+
import re
|
| 7 |
+
import io
|
| 8 |
+
import json
|
| 9 |
+
|
| 10 |
+
from PIL import Image
|
| 11 |
+
import datasets
|
| 12 |
+
import markdown2
|
| 13 |
+
from playwright.sync_api import sync_playwright
|
| 14 |
+
|
| 15 |
+
from benchmarks.overall.schema import FullResult
|
| 16 |
+
|
| 17 |
+
def convert_to_html(md: str):
    """Render markdown to HTML while leaving ``$...$`` / ``$$...$$`` math untouched.

    Math spans are swapped for alphanumeric placeholders before the markdown
    conversion and restored afterwards, so the markdown converter never sees
    their contents.
    """
    block_placeholders = []
    inline_placeholders = []

    def make_sub(store, tag, delim):
        # Build a regex callback that stashes each match in ``store``.
        def _sub(match):
            # The "END" terminator keeps placeholders prefix-free. With a bare
            # trailing "1", index 1 ("...MATH11") is a prefix of index 11
            # ("...MATH111") and str.replace would corrupt the latter.
            placeholder = f"1{tag}MATH{len(store)}END1"
            store.append((placeholder, f"{delim}{match.group(1)}{delim}"))
            return placeholder
        return _sub

    # Add placeholders for the math (block math first so "$$" is not split)
    md = re.sub(r'\${2}(.*?)\${2}', make_sub(block_placeholders, "BLOCK", "$$"), md, flags=re.DOTALL)
    md = re.sub(r'\$(.*?)\$', make_sub(inline_placeholders, "INLINE", "$"), md)

    html = markdown2.markdown(md, extras=['tables'])

    # Replace placeholders
    for placeholder, math_str in block_placeholders + inline_placeholders:
        html = html.replace(placeholder, math_str)

    return html
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def markdown_to_image(md: str) -> Image.Image:
    """Render markdown (with KaTeX math) in headless Chromium and screenshot it.

    Needs Playwright's Chromium and network access to the jsDelivr CDN for the
    KaTeX assets.

    Returns:
        A PIL image of the full rendered page.
    """
    html = convert_to_html(md)
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            # The KaTeX scripts are deferred, so they only run after the
            # document is parsed. The render call is therefore attached to
            # DOMContentLoaded; an inline call would run before
            # renderMathInElement exists.
            page.set_content(f"""
            <head>
                <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.css" integrity="sha384-zh0CIslj+VczCZtlzBcjt5ppRcsAmDnRem7ESsYwWwg3m/OaJ2l4x7YBZl9Kxxib" crossorigin="anonymous">
                <!-- The loading of KaTeX is deferred to speed up page rendering -->
                <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.js" integrity="sha384-Rma6DA2IPUwhNxmrB/7S3Tno0YY7sFu9WSYMCuulLhIqYSGZ2gKCJWIqhBWqMQfh" crossorigin="anonymous"></script>
                <!-- To automatically render math in text elements, include the auto-render extension: -->
                <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous"></script>
            </head>
            <body>
                {html}
                <script>
                    document.addEventListener("DOMContentLoaded", function() {{
                        renderMathInElement(document.body, {{
                            delimiters: [
                                {{left: '$$', right: '$$', display: true}},
                                {{left: '$', right: '$', display: false}}
                            ]
                        }});
                    }});
                </script>
            </body>
            """)
            page.set_viewport_size({"width": 1200, "height": 800})
            page.wait_for_timeout(500)  # Wait for KaTeX to render
            screenshot_bytes = page.screenshot(full_page=True)
        finally:
            # Close the browser even when rendering or the screenshot fails.
            browser.close()

    return Image.open(io.BytesIO(screenshot_bytes))
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def build_dataset(ds: datasets.Dataset, all_scores: Dict[str, FullResult]) -> datasets.Dataset:
    """Build a comparison dataset of per-method outputs next to the ground truth.

    Only rows scored by every method are kept. Each output row holds, per
    method, its overall score, markdown, rendered image and time, plus the
    ground-truth markdown and its rendered image.

    Args:
        ds: the benchmark dataset; rows are looked up by index and must carry a
            JSON-encoded ``gt_blocks`` field.
        all_scores: per-method results whose ``"raw_scores"`` map dataset index
            to that method's score row.
    """
    # Imported locally so the cleaning helpers are in scope for this function.
    from benchmarks.overall.clean import clean_input, convert_to_md

    # Get all the dataset indices that went through inference
    full_idxs = None
    for method in all_scores:
        result_idxs = set(all_scores[method]["raw_scores"])  # set: O(1) membership
        if full_idxs is None:
            full_idxs = sorted(result_idxs)
        else:
            full_idxs = [f for f in full_idxs if f in result_idxs]
    if full_idxs is None:
        # No methods were scored at all.
        full_idxs = []

    ds_rows = []
    for idx in full_idxs:
        row = ds[idx]  # img, gt_blocks, classification, language, uuid
        out_row = {}
        for method in all_scores:
            method_row = all_scores[method]["raw_scores"][idx]
            out_row.update({
                f"{method}_score": method_row["overall_score"],
                f"{method}_markdown": method_row["markdown"],
                f"{method}_image": markdown_to_image(method_row["markdown"]),
                f"{method}_time": method_row["time"]
            })

        # Convert each block's HTML (not the block dict itself), skipping empty
        # blocks, mirroring how the ground truth is prepared for scoring.
        gt_html = [block["html"] for block in json.loads(row["gt_blocks"]) if len(block["html"]) > 0]
        gt_md = "\n\n".join(clean_input(convert_to_md(html)) for html in gt_html)
        out_row.update({
            "gt_markdown": gt_md,
            "gt_image": markdown_to_image(gt_md)
        })
        ds_rows.append(out_row)

    out_dataset = datasets.Dataset.from_list(ds_rows)
    return out_dataset
|
| 109 |
+
|
benchmarks/overall/schema.py
CHANGED
|
@@ -4,10 +4,9 @@ from typing import TypedDict, List, Dict, Optional
|
|
| 4 |
class BlockScores(TypedDict):
|
| 5 |
scores: List[float]
|
| 6 |
order_score: float
|
| 7 |
-
gt: List[str]
|
| 8 |
-
method: str
|
| 9 |
overall_score: float
|
| 10 |
time: Optional[float]
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
class FullResult(TypedDict):
|
|
|
|
| 4 |
class BlockScores(TypedDict):
|
| 5 |
scores: List[float]
|
| 6 |
order_score: float
|
|
|
|
|
|
|
| 7 |
overall_score: float
|
| 8 |
time: Optional[float]
|
| 9 |
+
markdown: str
|
| 10 |
|
| 11 |
|
| 12 |
class FullResult(TypedDict):
|
benchmarks/overall/scoring.py
CHANGED
|
@@ -2,9 +2,8 @@ from typing import List
|
|
| 2 |
|
| 3 |
from rapidfuzz import fuzz
|
| 4 |
|
|
|
|
| 5 |
from benchmarks.overall.schema import BlockScores
|
| 6 |
-
from marker.renderers.markdown import MarkdownRenderer
|
| 7 |
-
import re
|
| 8 |
|
| 9 |
|
| 10 |
def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float:
|
|
@@ -58,112 +57,19 @@ def find_fuzzy_alignments(
|
|
| 58 |
})
|
| 59 |
return alignments
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
return markdown
|
| 65 |
-
|
| 66 |
-
def standardize_markdown(markdown):
|
| 67 |
-
# Replace math expressions
|
| 68 |
-
pattern = r'(?<!\\)\$(?:\$([^$]+)\$\$|\s*([^$\n]+?)\s*\$)'
|
| 69 |
-
markdown = re.sub(pattern, standardize_math, markdown)
|
| 70 |
-
|
| 71 |
-
# Replace image urls
|
| 72 |
-
pattern = r'!\[(.*?)\]\((https?://[^\s\)]+)\)'
|
| 73 |
-
markdown = re.sub(pattern, r'![link]', markdown)
|
| 74 |
-
markdown = strip_latex_symbols(markdown)
|
| 75 |
-
markdown = replace_centered_lines(markdown)
|
| 76 |
-
|
| 77 |
-
# Clean up html tags
|
| 78 |
-
markdown = markdown.replace("<br>", "\n")
|
| 79 |
-
markdown = re.sub(r"<sub>(.*?)</sub>", r"\1", markdown)
|
| 80 |
-
markdown = re.sub(r"<sup>(.*?)</sup>", r"\1", markdown)
|
| 81 |
-
markdown = re.sub(r"<span.*?>(.*?)</span>", r"\1", markdown) # Remove span tags and keep content
|
| 82 |
-
|
| 83 |
-
# Clean up markdown
|
| 84 |
-
markdown = re.sub(r"\s+", " ", markdown)
|
| 85 |
-
markdown = re.sub(r"\n+", "\n", markdown)
|
| 86 |
-
markdown = re.sub("\\.+", ".", markdown) # Replace repeated periods with a single period, like in table of contents
|
| 87 |
-
markdown = re.sub("#+", "#", markdown) # Replace repeated headers with a single header
|
| 88 |
-
markdown = re.sub(r"\$", "", markdown) # Remove equation delimiters
|
| 89 |
-
markdown = markdown.encode().decode('unicode-escape', errors="ignore") # Decode unicode characters properly
|
| 90 |
-
return markdown.strip().lower()
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
def replace_centered_lines(text):
|
| 94 |
-
def replace_match(m):
|
| 95 |
-
content = m.group(0)
|
| 96 |
-
dash_count = content.count('-')
|
| 97 |
-
return '-' * dash_count
|
| 98 |
-
|
| 99 |
-
pattern = r':-+:'
|
| 100 |
-
return re.sub(pattern, replace_match, text)
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
def strip_latex_symbols(text):
|
| 104 |
-
# Handle short math mode sequences first - only match $ $ with brief content
|
| 105 |
-
text = re.sub(r'\$\s*\\?[a-zA-Z]+\d?\s*\$', '', text)
|
| 106 |
-
|
| 107 |
-
# Handle common patterns inside remaining math mode
|
| 108 |
-
patterns = [
|
| 109 |
-
r'\$\s*\\?[a-zA-Z]+\d?\s*\$', # \alpha or \alpha2 in math mode
|
| 110 |
-
r'\$\s*\d+\\[a-zA-Z]+\s*\$', # 45\circ in math mode
|
| 111 |
-
r'\$\s*[a-zA-Z0-9]\\[a-zA-Z]+\s*\$' # x\dagger in math mode
|
| 112 |
-
]
|
| 113 |
-
|
| 114 |
-
pattern = '|'.join(patterns)
|
| 115 |
-
return re.sub(pattern, '', text)
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
def standardize_math(match):
|
| 119 |
-
try:
|
| 120 |
-
delim = "$$" if match.group(0).startswith('$$') else "$"
|
| 121 |
-
math_content = match.group(1) or match.group(2)
|
| 122 |
-
result = clean_latex(math_content)
|
| 123 |
-
return f'{delim}{result}{delim}'
|
| 124 |
-
except Exception as e:
|
| 125 |
-
print(f"Failed to standardize math expression: {match.group(0)} with error: {e}")
|
| 126 |
-
return match.group(0)
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
def clean_latex(latex_str):
|
| 130 |
-
latex_str = re.sub(r'\s+', ' ', latex_str.strip())
|
| 131 |
-
for tag in [r'\\text', r'\\mathrm', r'\\mathbf', r'\\textbf']:
|
| 132 |
-
latex_str = re.sub(tag + r'\{([^}]+)\}', r'\1', latex_str)
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
replacements = {
|
| 136 |
-
'\\times': '*',
|
| 137 |
-
'\\cdot': '*',
|
| 138 |
-
'\\div': '/',
|
| 139 |
-
'\\le': '<=',
|
| 140 |
-
'\\ge': '>=',
|
| 141 |
-
'\\neq': '!=',
|
| 142 |
-
'\\to': '\\rightarrow',
|
| 143 |
-
}
|
| 144 |
-
|
| 145 |
-
for old, new in replacements.items():
|
| 146 |
-
latex_str = latex_str.replace(old, new)
|
| 147 |
-
|
| 148 |
-
return latex_str
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
def score_blocks(gt_html, method_html, convert=True) -> BlockScores:
|
| 152 |
-
if convert:
|
| 153 |
-
method_html = convert_to_md(method_html)
|
| 154 |
-
method_html = standardize_markdown(method_html)
|
| 155 |
-
gt = [standardize_markdown(convert_to_md(gt)) for gt in gt_html]
|
| 156 |
-
alignments = find_fuzzy_alignments(method_html, gt)
|
| 157 |
scores = [alignment["score"] for alignment in alignments]
|
| 158 |
|
| 159 |
# Find order score
|
| 160 |
orders = [alignment["start"] for alignment in alignments]
|
| 161 |
-
correct_order = list(range(len(
|
| 162 |
-
actual_order = sorted(range(len(
|
| 163 |
order_score = kendall_tau(correct_order, actual_order)
|
| 164 |
|
| 165 |
# Weight score by sequence length
|
| 166 |
-
gt_weights = [len(g) for g in
|
| 167 |
weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]
|
| 168 |
|
| 169 |
# Weight the score by sequence length
|
|
@@ -172,8 +78,6 @@ def score_blocks(gt_html, method_html, convert=True) -> BlockScores:
|
|
| 172 |
return {
|
| 173 |
"scores": scores,
|
| 174 |
"order_score": order_score,
|
| 175 |
-
"gt": gt,
|
| 176 |
-
"method": method_html,
|
| 177 |
"overall_score": overall_score,
|
| 178 |
"time": None
|
| 179 |
}
|
|
|
|
| 2 |
|
| 3 |
from rapidfuzz import fuzz
|
| 4 |
|
| 5 |
+
from benchmarks.overall.clean import convert_to_md, MarkdownCleaner
|
| 6 |
from benchmarks.overall.schema import BlockScores
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float:
|
|
|
|
| 57 |
})
|
| 58 |
return alignments
|
| 59 |
|
| 60 |
+
|
| 61 |
+
def score_blocks(gt_markdown: List[str], method_markdown: str) -> BlockScores:
|
| 62 |
+
alignments = find_fuzzy_alignments(method_markdown, gt_markdown)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
scores = [alignment["score"] for alignment in alignments]
|
| 64 |
|
| 65 |
# Find order score
|
| 66 |
orders = [alignment["start"] for alignment in alignments]
|
| 67 |
+
correct_order = list(range(len(gt_markdown)))
|
| 68 |
+
actual_order = sorted(range(len(gt_markdown)), key=lambda x: orders[x])
|
| 69 |
order_score = kendall_tau(correct_order, actual_order)
|
| 70 |
|
| 71 |
# Weight score by sequence length
|
| 72 |
+
gt_weights = [len(g) for g in gt_markdown]
|
| 73 |
weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]
|
| 74 |
|
| 75 |
# Weight the score by sequence length
|
|
|
|
| 78 |
return {
|
| 79 |
"scores": scores,
|
| 80 |
"order_score": order_score,
|
|
|
|
|
|
|
| 81 |
"overall_score": overall_score,
|
| 82 |
"time": None
|
| 83 |
}
|
marker/scripts/streamlit_app.py
CHANGED
|
@@ -115,7 +115,10 @@ def pillow_image_to_base64_string(img: Image) -> str:
|
|
| 115 |
return base64.b64encode(buffered.getvalue()).decode("utf-8")
|
| 116 |
|
| 117 |
|
| 118 |
-
def block_display(image: Image, blocks: dict =
|
|
|
|
|
|
|
|
|
|
| 119 |
image_data_url = (
|
| 120 |
'data:image/jpeg;base64,' + pillow_image_to_base64_string(image)
|
| 121 |
)
|
|
|
|
| 115 |
return base64.b64encode(buffered.getvalue()).decode("utf-8")
|
| 116 |
|
| 117 |
|
| 118 |
+
def block_display(image: Image, blocks: dict | None = None, dpi=96):
|
| 119 |
+
if blocks is None:
|
| 120 |
+
blocks = {}
|
| 121 |
+
|
| 122 |
image_data_url = (
|
| 123 |
'data:image/jpeg;base64,' + pillow_image_to_base64_string(image)
|
| 124 |
)
|
marker/scripts/streamlit_app_blocks_viz.html
CHANGED
|
@@ -114,7 +114,7 @@
|
|
| 114 |
<body>
|
| 115 |
<div style="text-align: center" class="image-container">
|
| 116 |
<dialog id="block-info-dialog">
|
| 117 |
-
<button
|
| 118 |
class="close-button"
|
| 119 |
onclick="document.querySelector('#block-info-dialog').close()"
|
| 120 |
></button>
|
|
@@ -147,17 +147,17 @@
|
|
| 147 |
const BLOCK_TYPES = $block_types_json;
|
| 148 |
const blocksById = {};
|
| 149 |
const blockInfoDialog = document.querySelector("dialog#block-info-dialog");
|
| 150 |
-
|
| 151 |
function blockTypeColor(blockType) {
|
| 152 |
return COLORS[BLOCK_TYPES[blockType] % COLORS.length];
|
| 153 |
}
|
| 154 |
-
|
| 155 |
function traverseAndGenerateSVG(block) {
|
| 156 |
let svg = "";
|
| 157 |
-
|
| 158 |
if (block.polygon) {
|
| 159 |
const color = blockTypeColor(block.block_type);
|
| 160 |
-
|
| 161 |
// dollar signs are escaped because this files gets read into a template string
|
| 162 |
svg += `<rect id="$${block.id}"
|
| 163 |
class="block type-$${block.block_type}"
|
|
@@ -171,52 +171,52 @@
|
|
| 171 |
}"
|
| 172 |
fill=$${color} stroke=$${color}>
|
| 173 |
</rect>`;
|
| 174 |
-
|
| 175 |
blocksById[block.id] = block;
|
| 176 |
}
|
| 177 |
-
|
| 178 |
if (Array.isArray(block.children) && block.children.length > 0) {
|
| 179 |
block.children.forEach((child) => {
|
| 180 |
svg += traverseAndGenerateSVG(child);
|
| 181 |
});
|
| 182 |
}
|
| 183 |
-
|
| 184 |
return svg;
|
| 185 |
}
|
| 186 |
-
|
| 187 |
if (Object.keys(BLOCKS).length == 0) {
|
| 188 |
// bail out if no blocks
|
| 189 |
return;
|
| 190 |
}
|
| 191 |
-
|
| 192 |
const [vbWidth, vbHeight] = BLOCKS.children[0].polygon[2];
|
| 193 |
document
|
| 194 |
.querySelector("svg")
|
| 195 |
.setAttribute("viewBox", `0 0 $${vbWidth} $${vbHeight}`);
|
| 196 |
-
|
| 197 |
const blocksOverlay = document.querySelector(".blocks-overlay");
|
| 198 |
blocksOverlay.innerHTML = traverseAndGenerateSVG(BLOCKS.children[0]);
|
| 199 |
-
|
| 200 |
tippy("rect.block", {
|
| 201 |
content: (block) => block.getAttribute("data-type"),
|
| 202 |
placement: "top-start",
|
| 203 |
arrow: false,
|
| 204 |
offset: [0, 5],
|
| 205 |
});
|
| 206 |
-
|
| 207 |
blocksOverlay.addEventListener("click", (event) => {
|
| 208 |
if (event.target.tagName !== "rect") return;
|
| 209 |
-
|
| 210 |
const blockId = event.target.id;
|
| 211 |
const block = blocksById[blockId];
|
| 212 |
-
|
| 213 |
blockInfoDialog.querySelector("h1").innerHTML = `
|
| 214 |
$${blockId} <span style="color: $${blockTypeColor(block.block_type)}">($${block.block_type})</span>
|
| 215 |
`;
|
| 216 |
blockInfoDialog.querySelector(".text-content").textContent = block.html;
|
| 217 |
-
|
| 218 |
blockInfoDialog.dataset.blockJSON = JSON.stringify(block, null, 2);
|
| 219 |
-
|
| 220 |
if (block.images) {
|
| 221 |
const imagesDiv = blockInfoDialog.querySelector(".images");
|
| 222 |
imagesDiv.innerHTML = "";
|
|
|
|
| 114 |
<body>
|
| 115 |
<div style="text-align: center" class="image-container">
|
| 116 |
<dialog id="block-info-dialog">
|
| 117 |
+
<button
|
| 118 |
class="close-button"
|
| 119 |
onclick="document.querySelector('#block-info-dialog').close()"
|
| 120 |
></button>
|
|
|
|
| 147 |
const BLOCK_TYPES = $block_types_json;
|
| 148 |
const blocksById = {};
|
| 149 |
const blockInfoDialog = document.querySelector("dialog#block-info-dialog");
|
| 150 |
+
|
| 151 |
function blockTypeColor(blockType) {
|
| 152 |
return COLORS[BLOCK_TYPES[blockType] % COLORS.length];
|
| 153 |
}
|
| 154 |
+
|
| 155 |
function traverseAndGenerateSVG(block) {
|
| 156 |
let svg = "";
|
| 157 |
+
|
| 158 |
if (block.polygon) {
|
| 159 |
const color = blockTypeColor(block.block_type);
|
| 160 |
+
|
| 161 |
// dollar signs are escaped because this files gets read into a template string
|
| 162 |
svg += `<rect id="$${block.id}"
|
| 163 |
class="block type-$${block.block_type}"
|
|
|
|
| 171 |
}"
|
| 172 |
fill=$${color} stroke=$${color}>
|
| 173 |
</rect>`;
|
| 174 |
+
|
| 175 |
blocksById[block.id] = block;
|
| 176 |
}
|
| 177 |
+
|
| 178 |
if (Array.isArray(block.children) && block.children.length > 0) {
|
| 179 |
block.children.forEach((child) => {
|
| 180 |
svg += traverseAndGenerateSVG(child);
|
| 181 |
});
|
| 182 |
}
|
| 183 |
+
|
| 184 |
return svg;
|
| 185 |
}
|
| 186 |
+
|
| 187 |
if (Object.keys(BLOCKS).length == 0) {
|
| 188 |
// bail out if no blocks
|
| 189 |
return;
|
| 190 |
}
|
| 191 |
+
|
| 192 |
const [vbWidth, vbHeight] = BLOCKS.children[0].polygon[2];
|
| 193 |
document
|
| 194 |
.querySelector("svg")
|
| 195 |
.setAttribute("viewBox", `0 0 $${vbWidth} $${vbHeight}`);
|
| 196 |
+
|
| 197 |
const blocksOverlay = document.querySelector(".blocks-overlay");
|
| 198 |
blocksOverlay.innerHTML = traverseAndGenerateSVG(BLOCKS.children[0]);
|
| 199 |
+
|
| 200 |
tippy("rect.block", {
|
| 201 |
content: (block) => block.getAttribute("data-type"),
|
| 202 |
placement: "top-start",
|
| 203 |
arrow: false,
|
| 204 |
offset: [0, 5],
|
| 205 |
});
|
| 206 |
+
|
| 207 |
blocksOverlay.addEventListener("click", (event) => {
|
| 208 |
if (event.target.tagName !== "rect") return;
|
| 209 |
+
|
| 210 |
const blockId = event.target.id;
|
| 211 |
const block = blocksById[blockId];
|
| 212 |
+
|
| 213 |
blockInfoDialog.querySelector("h1").innerHTML = `
|
| 214 |
$${blockId} <span style="color: $${blockTypeColor(block.block_type)}">($${block.block_type})</span>
|
| 215 |
`;
|
| 216 |
blockInfoDialog.querySelector(".text-content").textContent = block.html;
|
| 217 |
+
|
| 218 |
blockInfoDialog.dataset.blockJSON = JSON.stringify(block, null, 2);
|
| 219 |
+
|
| 220 |
if (block.images) {
|
| 221 |
const imagesDiv = blockInfoDialog.querySelector(".images");
|
| 222 |
imagesDiv.innerHTML = "";
|
poetry.lock
CHANGED
|
@@ -414,13 +414,13 @@ files = [
|
|
| 414 |
|
| 415 |
[[package]]
|
| 416 |
name = "certifi"
|
| 417 |
-
version = "
|
| 418 |
description = "Python package for providing Mozilla's CA Bundle."
|
| 419 |
optional = false
|
| 420 |
python-versions = ">=3.6"
|
| 421 |
files = [
|
| 422 |
-
{file = "certifi-
|
| 423 |
-
{file = "certifi-
|
| 424 |
]
|
| 425 |
|
| 426 |
[[package]]
|
|
@@ -1212,6 +1212,92 @@ protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4
|
|
| 1212 |
[package.extras]
|
| 1213 |
grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"]
|
| 1214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1215 |
[[package]]
|
| 1216 |
name = "grpcio"
|
| 1217 |
version = "1.70.0"
|
|
@@ -1905,6 +1991,17 @@ files = [
|
|
| 1905 |
{file = "jupyterlab_widgets-3.0.13.tar.gz", hash = "sha256:a2966d385328c1942b683a8cd96b89b8dd82c8b8f81dda902bb2bc06d46f5bed"},
|
| 1906 |
]
|
| 1907 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1908 |
[[package]]
|
| 1909 |
name = "lxml"
|
| 1910 |
version = "5.3.0"
|
|
@@ -2729,6 +2826,18 @@ files = [
|
|
| 2729 |
[package.dependencies]
|
| 2730 |
nvidia-nvjitlink-cu12 = "*"
|
| 2731 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2732 |
[[package]]
|
| 2733 |
name = "nvidia-nccl-cu12"
|
| 2734 |
version = "2.21.5"
|
|
@@ -3065,6 +3174,26 @@ docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-a
|
|
| 3065 |
test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"]
|
| 3066 |
type = ["mypy (>=1.11.2)"]
|
| 3067 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3068 |
[[package]]
|
| 3069 |
name = "pluggy"
|
| 3070 |
version = "1.5.0"
|
|
@@ -3552,6 +3681,23 @@ numpy = ">=1.16.4"
|
|
| 3552 |
carto = ["pydeck-carto"]
|
| 3553 |
jupyter = ["ipykernel (>=5.1.2)", "ipython (>=5.8.0)", "ipywidgets (>=7,<8)", "traitlets (>=4.3.2)"]
|
| 3554 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3555 |
[[package]]
|
| 3556 |
name = "pygments"
|
| 3557 |
version = "2.19.1"
|
|
@@ -3696,13 +3842,13 @@ files = [
|
|
| 3696 |
|
| 3697 |
[[package]]
|
| 3698 |
name = "pytz"
|
| 3699 |
-
version = "
|
| 3700 |
description = "World timezone definitions, modern and historical"
|
| 3701 |
optional = false
|
| 3702 |
python-versions = "*"
|
| 3703 |
files = [
|
| 3704 |
-
{file = "pytz-
|
| 3705 |
-
{file = "pytz-
|
| 3706 |
]
|
| 3707 |
|
| 3708 |
[[package]]
|
|
@@ -4641,13 +4787,13 @@ snowflake = ["snowflake-connector-python (>=2.8.0)", "snowflake-snowpark-python[
|
|
| 4641 |
|
| 4642 |
[[package]]
|
| 4643 |
name = "surya-ocr"
|
| 4644 |
-
version = "0.10.
|
| 4645 |
description = "OCR, layout, reading order, and table recognition in 90+ languages"
|
| 4646 |
optional = false
|
| 4647 |
python-versions = "<4.0,>=3.10"
|
| 4648 |
files = [
|
| 4649 |
-
{file = "surya_ocr-0.10.
|
| 4650 |
-
{file = "surya_ocr-0.10.
|
| 4651 |
]
|
| 4652 |
|
| 4653 |
[package.dependencies]
|
|
@@ -4659,7 +4805,7 @@ pydantic = ">=2.5.3,<3.0.0"
|
|
| 4659 |
pydantic-settings = ">=2.1.0,<3.0.0"
|
| 4660 |
pypdfium2 = "4.30.0"
|
| 4661 |
python-dotenv = ">=1.0.0,<2.0.0"
|
| 4662 |
-
torch = ">=2.5.1,<
|
| 4663 |
transformers = ">=4.41.0,<5.0.0"
|
| 4664 |
|
| 4665 |
[[package]]
|
|
@@ -4844,28 +4990,31 @@ files = [
|
|
| 4844 |
|
| 4845 |
[[package]]
|
| 4846 |
name = "torch"
|
| 4847 |
-
version = "2.
|
| 4848 |
description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
|
| 4849 |
optional = false
|
| 4850 |
-
python-versions = ">=3.
|
| 4851 |
files = [
|
| 4852 |
-
{file = "torch-2.
|
| 4853 |
-
{file = "torch-2.
|
| 4854 |
-
{file = "torch-2.
|
| 4855 |
-
{file = "torch-2.
|
| 4856 |
-
{file = "torch-2.
|
| 4857 |
-
{file = "torch-2.
|
| 4858 |
-
{file = "torch-2.
|
| 4859 |
-
{file = "torch-2.
|
| 4860 |
-
{file = "torch-2.
|
| 4861 |
-
{file = "torch-2.
|
| 4862 |
-
{file = "torch-2.
|
| 4863 |
-
{file = "torch-2.
|
| 4864 |
-
{file = "torch-2.
|
| 4865 |
-
{file = "torch-2.
|
| 4866 |
-
{file = "torch-2.
|
| 4867 |
-
{file = "torch-2.
|
| 4868 |
-
{file = "torch-2.
|
|
|
|
|
|
|
|
|
|
| 4869 |
]
|
| 4870 |
|
| 4871 |
[package.dependencies]
|
|
@@ -4882,17 +5031,18 @@ nvidia-cufft-cu12 = {version = "11.2.1.3", markers = "platform_system == \"Linux
|
|
| 4882 |
nvidia-curand-cu12 = {version = "10.3.5.147", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 4883 |
nvidia-cusolver-cu12 = {version = "11.6.1.9", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 4884 |
nvidia-cusparse-cu12 = {version = "12.3.1.170", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
|
|
|
| 4885 |
nvidia-nccl-cu12 = {version = "2.21.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 4886 |
nvidia-nvjitlink-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 4887 |
nvidia-nvtx-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 4888 |
setuptools = {version = "*", markers = "python_version >= \"3.12\""}
|
| 4889 |
sympy = {version = "1.13.1", markers = "python_version >= \"3.9\""}
|
| 4890 |
-
triton = {version = "3.
|
| 4891 |
-
typing-extensions = ">=4.
|
| 4892 |
|
| 4893 |
[package.extras]
|
| 4894 |
opt-einsum = ["opt-einsum (>=3.3)"]
|
| 4895 |
-
optree = ["optree (>=0.
|
| 4896 |
|
| 4897 |
[[package]]
|
| 4898 |
name = "tornado"
|
|
@@ -5021,21 +5171,18 @@ vision = ["Pillow (>=10.0.1,<=15.0)"]
|
|
| 5021 |
|
| 5022 |
[[package]]
|
| 5023 |
name = "triton"
|
| 5024 |
-
version = "3.
|
| 5025 |
description = "A language and compiler for custom Deep Learning operations"
|
| 5026 |
optional = false
|
| 5027 |
python-versions = "*"
|
| 5028 |
files = [
|
| 5029 |
-
{file = "triton-3.
|
| 5030 |
-
{file = "triton-3.
|
| 5031 |
-
{file = "triton-3.
|
| 5032 |
-
{file = "triton-3.
|
| 5033 |
-
{file = "triton-3.
|
| 5034 |
]
|
| 5035 |
|
| 5036 |
-
[package.dependencies]
|
| 5037 |
-
filelock = "*"
|
| 5038 |
-
|
| 5039 |
[package.extras]
|
| 5040 |
build = ["cmake (>=3.20)", "lit"]
|
| 5041 |
tests = ["autopep8", "flake8", "isort", "llnl-hatchet", "numpy", "pytest", "scipy (>=1.7.1)"]
|
|
@@ -5468,4 +5615,4 @@ propcache = ">=0.2.0"
|
|
| 5468 |
[metadata]
|
| 5469 |
lock-version = "2.0"
|
| 5470 |
python-versions = "^3.10"
|
| 5471 |
-
content-hash = "
|
|
|
|
| 414 |
|
| 415 |
[[package]]
|
| 416 |
name = "certifi"
|
| 417 |
+
version = "2025.1.31"
|
| 418 |
description = "Python package for providing Mozilla's CA Bundle."
|
| 419 |
optional = false
|
| 420 |
python-versions = ">=3.6"
|
| 421 |
files = [
|
| 422 |
+
{file = "certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe"},
|
| 423 |
+
{file = "certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651"},
|
| 424 |
]
|
| 425 |
|
| 426 |
[[package]]
|
|
|
|
| 1212 |
[package.extras]
|
| 1213 |
grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"]
|
| 1214 |
|
| 1215 |
+
[[package]]
|
| 1216 |
+
name = "greenlet"
|
| 1217 |
+
version = "3.1.1"
|
| 1218 |
+
description = "Lightweight in-process concurrent programming"
|
| 1219 |
+
optional = false
|
| 1220 |
+
python-versions = ">=3.7"
|
| 1221 |
+
files = [
|
| 1222 |
+
{file = "greenlet-3.1.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:0bbae94a29c9e5c7e4a2b7f0aae5c17e8e90acbfd3bf6270eeba60c39fce3563"},
|
| 1223 |
+
{file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fde093fb93f35ca72a556cf72c92ea3ebfda3d79fc35bb19fbe685853869a83"},
|
| 1224 |
+
{file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:36b89d13c49216cadb828db8dfa6ce86bbbc476a82d3a6c397f0efae0525bdd0"},
|
| 1225 |
+
{file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94b6150a85e1b33b40b1464a3f9988dcc5251d6ed06842abff82e42632fac120"},
|
| 1226 |
+
{file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:93147c513fac16385d1036b7e5b102c7fbbdb163d556b791f0f11eada7ba65dc"},
|
| 1227 |
+
{file = "greenlet-3.1.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da7a9bff22ce038e19bf62c4dd1ec8391062878710ded0a845bcf47cc0200617"},
|
| 1228 |
+
{file = "greenlet-3.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b2795058c23988728eec1f36a4e5e4ebad22f8320c85f3587b539b9ac84128d7"},
|
| 1229 |
+
{file = "greenlet-3.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ed10eac5830befbdd0c32f83e8aa6288361597550ba669b04c48f0f9a2c843c6"},
|
| 1230 |
+
{file = "greenlet-3.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:77c386de38a60d1dfb8e55b8c1101d68c79dfdd25c7095d51fec2dd800892b80"},
|
| 1231 |
+
{file = "greenlet-3.1.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:e4d333e558953648ca09d64f13e6d8f0523fa705f51cae3f03b5983489958c70"},
|
| 1232 |
+
{file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09fc016b73c94e98e29af67ab7b9a879c307c6731a2c9da0db5a7d9b7edd1159"},
|
| 1233 |
+
{file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d5e975ca70269d66d17dd995dafc06f1b06e8cb1ec1e9ed54c1d1e4a7c4cf26e"},
|
| 1234 |
+
{file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2813dc3de8c1ee3f924e4d4227999285fd335d1bcc0d2be6dc3f1f6a318ec1"},
|
| 1235 |
+
{file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e347b3bfcf985a05e8c0b7d462ba6f15b1ee1c909e2dcad795e49e91b152c383"},
|
| 1236 |
+
{file = "greenlet-3.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e8f8c9cb53cdac7ba9793c276acd90168f416b9ce36799b9b885790f8ad6c0a"},
|
| 1237 |
+
{file = "greenlet-3.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:62ee94988d6b4722ce0028644418d93a52429e977d742ca2ccbe1c4f4a792511"},
|
| 1238 |
+
{file = "greenlet-3.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1776fd7f989fc6b8d8c8cb8da1f6b82c5814957264d1f6cf818d475ec2bf6395"},
|
| 1239 |
+
{file = "greenlet-3.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:48ca08c771c268a768087b408658e216133aecd835c0ded47ce955381105ba39"},
|
| 1240 |
+
{file = "greenlet-3.1.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:4afe7ea89de619adc868e087b4d2359282058479d7cfb94970adf4b55284574d"},
|
| 1241 |
+
{file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f406b22b7c9a9b4f8aa9d2ab13d6ae0ac3e85c9a809bd590ad53fed2bf70dc79"},
|
| 1242 |
+
{file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c3a701fe5a9695b238503ce5bbe8218e03c3bcccf7e204e455e7462d770268aa"},
|
| 1243 |
+
{file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2846930c65b47d70b9d178e89c7e1a69c95c1f68ea5aa0a58646b7a96df12441"},
|
| 1244 |
+
{file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99cfaa2110534e2cf3ba31a7abcac9d328d1d9f1b95beede58294a60348fba36"},
|
| 1245 |
+
{file = "greenlet-3.1.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1443279c19fca463fc33e65ef2a935a5b09bb90f978beab37729e1c3c6c25fe9"},
|
| 1246 |
+
{file = "greenlet-3.1.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b7cede291382a78f7bb5f04a529cb18e068dd29e0fb27376074b6d0317bf4dd0"},
|
| 1247 |
+
{file = "greenlet-3.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:23f20bb60ae298d7d8656c6ec6db134bca379ecefadb0b19ce6f19d1f232a942"},
|
| 1248 |
+
{file = "greenlet-3.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:7124e16b4c55d417577c2077be379514321916d5790fa287c9ed6f23bd2ffd01"},
|
| 1249 |
+
{file = "greenlet-3.1.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:05175c27cb459dcfc05d026c4232f9de8913ed006d42713cb8a5137bd49375f1"},
|
| 1250 |
+
{file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:935e943ec47c4afab8965954bf49bfa639c05d4ccf9ef6e924188f762145c0ff"},
|
| 1251 |
+
{file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:667a9706c970cb552ede35aee17339a18e8f2a87a51fba2ed39ceeeb1004798a"},
|
| 1252 |
+
{file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b8a678974d1f3aa55f6cc34dc480169d58f2e6d8958895d68845fa4ab566509e"},
|
| 1253 |
+
{file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efc0f674aa41b92da8c49e0346318c6075d734994c3c4e4430b1c3f853e498e4"},
|
| 1254 |
+
{file = "greenlet-3.1.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0153404a4bb921f0ff1abeb5ce8a5131da56b953eda6e14b88dc6bbc04d2049e"},
|
| 1255 |
+
{file = "greenlet-3.1.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:275f72decf9932639c1c6dd1013a1bc266438eb32710016a1c742df5da6e60a1"},
|
| 1256 |
+
{file = "greenlet-3.1.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:c4aab7f6381f38a4b42f269057aee279ab0fc7bf2e929e3d4abfae97b682a12c"},
|
| 1257 |
+
{file = "greenlet-3.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:b42703b1cf69f2aa1df7d1030b9d77d3e584a70755674d60e710f0af570f3761"},
|
| 1258 |
+
{file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1695e76146579f8c06c1509c7ce4dfe0706f49c6831a817ac04eebb2fd02011"},
|
| 1259 |
+
{file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7876452af029456b3f3549b696bb36a06db7c90747740c5302f74a9e9fa14b13"},
|
| 1260 |
+
{file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4ead44c85f8ab905852d3de8d86f6f8baf77109f9da589cb4fa142bd3b57b475"},
|
| 1261 |
+
{file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8320f64b777d00dd7ccdade271eaf0cad6636343293a25074cc5566160e4de7b"},
|
| 1262 |
+
{file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6510bf84a6b643dabba74d3049ead221257603a253d0a9873f55f6a59a65f822"},
|
| 1263 |
+
{file = "greenlet-3.1.1-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:04b013dc07c96f83134b1e99888e7a79979f1a247e2a9f59697fa14b5862ed01"},
|
| 1264 |
+
{file = "greenlet-3.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:411f015496fec93c1c8cd4e5238da364e1da7a124bcb293f085bf2860c32c6f6"},
|
| 1265 |
+
{file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47da355d8687fd65240c364c90a31569a133b7b60de111c255ef5b606f2ae291"},
|
| 1266 |
+
{file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98884ecf2ffb7d7fe6bd517e8eb99d31ff7855a840fa6d0d63cd07c037f6a981"},
|
| 1267 |
+
{file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f1d4aeb8891338e60d1ab6127af1fe45def5259def8094b9c7e34690c8858803"},
|
| 1268 |
+
{file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db32b5348615a04b82240cc67983cb315309e88d444a288934ee6ceaebcad6cc"},
|
| 1269 |
+
{file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dcc62f31eae24de7f8dce72134c8651c58000d3b1868e01392baea7c32c247de"},
|
| 1270 |
+
{file = "greenlet-3.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1d3755bcb2e02de341c55b4fca7a745a24a9e7212ac953f6b3a48d117d7257aa"},
|
| 1271 |
+
{file = "greenlet-3.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:b8da394b34370874b4572676f36acabac172602abf054cbc4ac910219f3340af"},
|
| 1272 |
+
{file = "greenlet-3.1.1-cp37-cp37m-win32.whl", hash = "sha256:a0dfc6c143b519113354e780a50381508139b07d2177cb6ad6a08278ec655798"},
|
| 1273 |
+
{file = "greenlet-3.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:54558ea205654b50c438029505def3834e80f0869a70fb15b871c29b4575ddef"},
|
| 1274 |
+
{file = "greenlet-3.1.1-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:346bed03fe47414091be4ad44786d1bd8bef0c3fcad6ed3dee074a032ab408a9"},
|
| 1275 |
+
{file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfc59d69fc48664bc693842bd57acfdd490acafda1ab52c7836e3fc75c90a111"},
|
| 1276 |
+
{file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d21e10da6ec19b457b82636209cbe2331ff4306b54d06fa04b7c138ba18c8a81"},
|
| 1277 |
+
{file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:37b9de5a96111fc15418819ab4c4432e4f3c2ede61e660b1e33971eba26ef9ba"},
|
| 1278 |
+
{file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ef9ea3f137e5711f0dbe5f9263e8c009b7069d8a1acea822bd5e9dae0ae49c8"},
|
| 1279 |
+
{file = "greenlet-3.1.1-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:85f3ff71e2e60bd4b4932a043fbbe0f499e263c628390b285cb599154a3b03b1"},
|
| 1280 |
+
{file = "greenlet-3.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:95ffcf719966dd7c453f908e208e14cde192e09fde6c7186c8f1896ef778d8cd"},
|
| 1281 |
+
{file = "greenlet-3.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:03a088b9de532cbfe2ba2034b2b85e82df37874681e8c470d6fb2f8c04d7e4b7"},
|
| 1282 |
+
{file = "greenlet-3.1.1-cp38-cp38-win32.whl", hash = "sha256:8b8b36671f10ba80e159378df9c4f15c14098c4fd73a36b9ad715f057272fbef"},
|
| 1283 |
+
{file = "greenlet-3.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:7017b2be767b9d43cc31416aba48aab0d2309ee31b4dbf10a1d38fb7972bdf9d"},
|
| 1284 |
+
{file = "greenlet-3.1.1-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:396979749bd95f018296af156201d6211240e7a23090f50a8d5d18c370084dc3"},
|
| 1285 |
+
{file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca9d0ff5ad43e785350894d97e13633a66e2b50000e8a183a50a88d834752d42"},
|
| 1286 |
+
{file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f6ff3b14f2df4c41660a7dec01045a045653998784bf8cfcb5a525bdffffbc8f"},
|
| 1287 |
+
{file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94ebba31df2aa506d7b14866fed00ac141a867e63143fe5bca82a8e503b36437"},
|
| 1288 |
+
{file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73aaad12ac0ff500f62cebed98d8789198ea0e6f233421059fa68a5aa7220145"},
|
| 1289 |
+
{file = "greenlet-3.1.1-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63e4844797b975b9af3a3fb8f7866ff08775f5426925e1e0bbcfe7932059a12c"},
|
| 1290 |
+
{file = "greenlet-3.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7939aa3ca7d2a1593596e7ac6d59391ff30281ef280d8632fa03d81f7c5f955e"},
|
| 1291 |
+
{file = "greenlet-3.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d0028e725ee18175c6e422797c407874da24381ce0690d6b9396c204c7f7276e"},
|
| 1292 |
+
{file = "greenlet-3.1.1-cp39-cp39-win32.whl", hash = "sha256:5e06afd14cbaf9e00899fae69b24a32f2196c19de08fcb9f4779dd4f004e5e7c"},
|
| 1293 |
+
{file = "greenlet-3.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:3319aa75e0e0639bc15ff54ca327e8dc7a6fe404003496e3c6925cd3142e0e22"},
|
| 1294 |
+
{file = "greenlet-3.1.1.tar.gz", hash = "sha256:4ce3ac6cdb6adf7946475d7ef31777c26d94bccc377e070a7986bd2d5c515467"},
|
| 1295 |
+
]
|
| 1296 |
+
|
| 1297 |
+
[package.extras]
|
| 1298 |
+
docs = ["Sphinx", "furo"]
|
| 1299 |
+
test = ["objgraph", "psutil"]
|
| 1300 |
+
|
| 1301 |
[[package]]
|
| 1302 |
name = "grpcio"
|
| 1303 |
version = "1.70.0"
|
|
|
|
| 1991 |
{file = "jupyterlab_widgets-3.0.13.tar.gz", hash = "sha256:a2966d385328c1942b683a8cd96b89b8dd82c8b8f81dda902bb2bc06d46f5bed"},
|
| 1992 |
]
|
| 1993 |
|
| 1994 |
+
[[package]]
|
| 1995 |
+
name = "latex2mathml"
|
| 1996 |
+
version = "3.77.0"
|
| 1997 |
+
description = "Pure Python library for LaTeX to MathML conversion"
|
| 1998 |
+
optional = false
|
| 1999 |
+
python-versions = ">=3.8.1,<4.0.0"
|
| 2000 |
+
files = [
|
| 2001 |
+
{file = "latex2mathml-3.77.0-py3-none-any.whl", hash = "sha256:5531e18a2a9eae7c24e257118b6a444cbba253cd27ff3e81f1bd6c41e88e786e"},
|
| 2002 |
+
{file = "latex2mathml-3.77.0.tar.gz", hash = "sha256:e2f501d1878f2e489c3f6f12786bef74c62f712d2770f7f3c837eb20a55d0a1e"},
|
| 2003 |
+
]
|
| 2004 |
+
|
| 2005 |
[[package]]
|
| 2006 |
name = "lxml"
|
| 2007 |
version = "5.3.0"
|
|
|
|
| 2826 |
[package.dependencies]
|
| 2827 |
nvidia-nvjitlink-cu12 = "*"
|
| 2828 |
|
| 2829 |
+
[[package]]
|
| 2830 |
+
name = "nvidia-cusparselt-cu12"
|
| 2831 |
+
version = "0.6.2"
|
| 2832 |
+
description = "NVIDIA cuSPARSELt"
|
| 2833 |
+
optional = false
|
| 2834 |
+
python-versions = "*"
|
| 2835 |
+
files = [
|
| 2836 |
+
{file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:067a7f6d03ea0d4841c85f0c6f1991c5dda98211f6302cb83a4ab234ee95bef8"},
|
| 2837 |
+
{file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9"},
|
| 2838 |
+
{file = "nvidia_cusparselt_cu12-0.6.2-py3-none-win_amd64.whl", hash = "sha256:0057c91d230703924c0422feabe4ce768841f9b4b44d28586b6f6d2eb86fbe70"},
|
| 2839 |
+
]
|
| 2840 |
+
|
| 2841 |
[[package]]
|
| 2842 |
name = "nvidia-nccl-cu12"
|
| 2843 |
version = "2.21.5"
|
|
|
|
| 3174 |
test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"]
|
| 3175 |
type = ["mypy (>=1.11.2)"]
|
| 3176 |
|
| 3177 |
+
[[package]]
|
| 3178 |
+
name = "playwright"
|
| 3179 |
+
version = "1.49.1"
|
| 3180 |
+
description = "A high-level API to automate web browsers"
|
| 3181 |
+
optional = false
|
| 3182 |
+
python-versions = ">=3.9"
|
| 3183 |
+
files = [
|
| 3184 |
+
{file = "playwright-1.49.1-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:1041ffb45a0d0bc44d698d3a5aa3ac4b67c9bd03540da43a0b70616ad52592b8"},
|
| 3185 |
+
{file = "playwright-1.49.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:9f38ed3d0c1f4e0a6d1c92e73dd9a61f8855133249d6f0cec28648d38a7137be"},
|
| 3186 |
+
{file = "playwright-1.49.1-py3-none-macosx_11_0_universal2.whl", hash = "sha256:3be48c6d26dc819ca0a26567c1ae36a980a0303dcd4249feb6f59e115aaddfb8"},
|
| 3187 |
+
{file = "playwright-1.49.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:753ca90ee31b4b03d165cfd36e477309ebf2b4381953f2a982ff612d85b147d2"},
|
| 3188 |
+
{file = "playwright-1.49.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd9bc8dab37aa25198a01f555f0a2e2c3813fe200fef018ac34dfe86b34994b9"},
|
| 3189 |
+
{file = "playwright-1.49.1-py3-none-win32.whl", hash = "sha256:43b304be67f096058e587dac453ece550eff87b8fbed28de30f4f022cc1745bb"},
|
| 3190 |
+
{file = "playwright-1.49.1-py3-none-win_amd64.whl", hash = "sha256:47b23cb346283278f5b4d1e1990bcb6d6302f80c0aa0ca93dd0601a1400191df"},
|
| 3191 |
+
]
|
| 3192 |
+
|
| 3193 |
+
[package.dependencies]
|
| 3194 |
+
greenlet = "3.1.1"
|
| 3195 |
+
pyee = "12.0.0"
|
| 3196 |
+
|
| 3197 |
[[package]]
|
| 3198 |
name = "pluggy"
|
| 3199 |
version = "1.5.0"
|
|
|
|
| 3681 |
carto = ["pydeck-carto"]
|
| 3682 |
jupyter = ["ipykernel (>=5.1.2)", "ipython (>=5.8.0)", "ipywidgets (>=7,<8)", "traitlets (>=4.3.2)"]
|
| 3683 |
|
| 3684 |
+
[[package]]
|
| 3685 |
+
name = "pyee"
|
| 3686 |
+
version = "12.0.0"
|
| 3687 |
+
description = "A rough port of Node.js's EventEmitter to Python with a few tricks of its own"
|
| 3688 |
+
optional = false
|
| 3689 |
+
python-versions = ">=3.8"
|
| 3690 |
+
files = [
|
| 3691 |
+
{file = "pyee-12.0.0-py3-none-any.whl", hash = "sha256:7b14b74320600049ccc7d0e0b1becd3b4bd0a03c745758225e31a59f4095c990"},
|
| 3692 |
+
{file = "pyee-12.0.0.tar.gz", hash = "sha256:c480603f4aa2927d4766eb41fa82793fe60a82cbfdb8d688e0d08c55a534e145"},
|
| 3693 |
+
]
|
| 3694 |
+
|
| 3695 |
+
[package.dependencies]
|
| 3696 |
+
typing-extensions = "*"
|
| 3697 |
+
|
| 3698 |
+
[package.extras]
|
| 3699 |
+
dev = ["black", "build", "flake8", "flake8-black", "isort", "jupyter-console", "mkdocs", "mkdocs-include-markdown-plugin", "mkdocstrings[python]", "pytest", "pytest-asyncio", "pytest-trio", "sphinx", "toml", "tox", "trio", "trio", "trio-typing", "twine", "twisted", "validate-pyproject[all]"]
|
| 3700 |
+
|
| 3701 |
[[package]]
|
| 3702 |
name = "pygments"
|
| 3703 |
version = "2.19.1"
|
|
|
|
| 3842 |
|
| 3843 |
[[package]]
|
| 3844 |
name = "pytz"
|
| 3845 |
+
version = "2025.1"
|
| 3846 |
description = "World timezone definitions, modern and historical"
|
| 3847 |
optional = false
|
| 3848 |
python-versions = "*"
|
| 3849 |
files = [
|
| 3850 |
+
{file = "pytz-2025.1-py2.py3-none-any.whl", hash = "sha256:89dd22dca55b46eac6eda23b2d72721bf1bdfef212645d81513ef5d03038de57"},
|
| 3851 |
+
{file = "pytz-2025.1.tar.gz", hash = "sha256:c2db42be2a2518b28e65f9207c4d05e6ff547d1efa4086469ef855e4ab70178e"},
|
| 3852 |
]
|
| 3853 |
|
| 3854 |
[[package]]
|
|
|
|
| 4787 |
|
| 4788 |
[[package]]
|
| 4789 |
name = "surya-ocr"
|
| 4790 |
+
version = "0.10.2"
|
| 4791 |
description = "OCR, layout, reading order, and table recognition in 90+ languages"
|
| 4792 |
optional = false
|
| 4793 |
python-versions = "<4.0,>=3.10"
|
| 4794 |
files = [
|
| 4795 |
+
{file = "surya_ocr-0.10.2-py3-none-any.whl", hash = "sha256:fbb590ae92b2a785e75ca25a53dd2ff59b1f56ec017a22f6127c9c7c62a1b910"},
|
| 4796 |
+
{file = "surya_ocr-0.10.2.tar.gz", hash = "sha256:ddbaf5d2f2cc0a08992446f889f782aa81e9e1cfa3fd957c124273365d411057"},
|
| 4797 |
]
|
| 4798 |
|
| 4799 |
[package.dependencies]
|
|
|
|
| 4805 |
pydantic-settings = ">=2.1.0,<3.0.0"
|
| 4806 |
pypdfium2 = "4.30.0"
|
| 4807 |
python-dotenv = ">=1.0.0,<2.0.0"
|
| 4808 |
+
torch = ">=2.5.1,<3.0.0"
|
| 4809 |
transformers = ">=4.41.0,<5.0.0"
|
| 4810 |
|
| 4811 |
[[package]]
|
|
|
|
| 4990 |
|
| 4991 |
[[package]]
|
| 4992 |
name = "torch"
|
| 4993 |
+
version = "2.6.0"
|
| 4994 |
description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
|
| 4995 |
optional = false
|
| 4996 |
+
python-versions = ">=3.9.0"
|
| 4997 |
files = [
|
| 4998 |
+
{file = "torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6860df13d9911ac158f4c44031609700e1eba07916fff62e21e6ffa0a9e01961"},
|
| 4999 |
+
{file = "torch-2.6.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c4f103a49830ce4c7561ef4434cc7926e5a5fe4e5eb100c19ab36ea1e2b634ab"},
|
| 5000 |
+
{file = "torch-2.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:56eeaf2ecac90da5d9e35f7f35eb286da82673ec3c582e310a8d1631a1c02341"},
|
| 5001 |
+
{file = "torch-2.6.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:09e06f9949e1a0518c5b09fe95295bc9661f219d9ecb6f9893e5123e10696628"},
|
| 5002 |
+
{file = "torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:7979834102cd5b7a43cc64e87f2f3b14bd0e1458f06e9f88ffa386d07c7446e1"},
|
| 5003 |
+
{file = "torch-2.6.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:ccbd0320411fe1a3b3fec7b4d3185aa7d0c52adac94480ab024b5c8f74a0bf1d"},
|
| 5004 |
+
{file = "torch-2.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:46763dcb051180ce1ed23d1891d9b1598e07d051ce4c9d14307029809c4d64f7"},
|
| 5005 |
+
{file = "torch-2.6.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:94fc63b3b4bedd327af588696559f68c264440e2503cc9e6954019473d74ae21"},
|
| 5006 |
+
{file = "torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2bb8987f3bb1ef2675897034402373ddfc8f5ef0e156e2d8cfc47cacafdda4a9"},
|
| 5007 |
+
{file = "torch-2.6.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b789069020c5588c70d5c2158ac0aa23fd24a028f34a8b4fcb8fcb4d7efcf5fb"},
|
| 5008 |
+
{file = "torch-2.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7e1448426d0ba3620408218b50aa6ada88aeae34f7a239ba5431f6c8774b1239"},
|
| 5009 |
+
{file = "torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989"},
|
| 5010 |
+
{file = "torch-2.6.0-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:4874a73507a300a5d089ceaff616a569e7bb7c613c56f37f63ec3ffac65259cf"},
|
| 5011 |
+
{file = "torch-2.6.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a0d5e1b9874c1a6c25556840ab8920569a7a4137afa8a63a32cee0bc7d89bd4b"},
|
| 5012 |
+
{file = "torch-2.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:510c73251bee9ba02ae1cb6c9d4ee0907b3ce6020e62784e2d7598e0cfa4d6cc"},
|
| 5013 |
+
{file = "torch-2.6.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:ff96f4038f8af9f7ec4231710ed4549da1bdebad95923953a25045dcf6fd87e2"},
|
| 5014 |
+
{file = "torch-2.6.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:9ea955317cfcd3852b1402b62af258ce735c2edeee42ca9419b6bc889e5ae053"},
|
| 5015 |
+
{file = "torch-2.6.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:bb2c6c3e65049f081940f5ab15c9136c7de40d3f01192541c920a07c7c585b7e"},
|
| 5016 |
+
{file = "torch-2.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:683410f97984103148e31b38a8631acf31c3034c020c0f4d26171e7626d8317a"},
|
| 5017 |
+
{file = "torch-2.6.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:265f70de5fd45b864d924b64be1797f86e76c8e48a02c2a3a6fc7ec247d2226c"},
|
| 5018 |
]
|
| 5019 |
|
| 5020 |
[package.dependencies]
|
|
|
|
| 5031 |
nvidia-curand-cu12 = {version = "10.3.5.147", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 5032 |
nvidia-cusolver-cu12 = {version = "11.6.1.9", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 5033 |
nvidia-cusparse-cu12 = {version = "12.3.1.170", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 5034 |
+
nvidia-cusparselt-cu12 = {version = "0.6.2", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 5035 |
nvidia-nccl-cu12 = {version = "2.21.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 5036 |
nvidia-nvjitlink-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 5037 |
nvidia-nvtx-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 5038 |
setuptools = {version = "*", markers = "python_version >= \"3.12\""}
|
| 5039 |
sympy = {version = "1.13.1", markers = "python_version >= \"3.9\""}
|
| 5040 |
+
triton = {version = "3.2.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
|
| 5041 |
+
typing-extensions = ">=4.10.0"
|
| 5042 |
|
| 5043 |
[package.extras]
|
| 5044 |
opt-einsum = ["opt-einsum (>=3.3)"]
|
| 5045 |
+
optree = ["optree (>=0.13.0)"]
|
| 5046 |
|
| 5047 |
[[package]]
|
| 5048 |
name = "tornado"
|
|
|
|
| 5171 |
|
| 5172 |
[[package]]
|
| 5173 |
name = "triton"
|
| 5174 |
+
version = "3.2.0"
|
| 5175 |
description = "A language and compiler for custom Deep Learning operations"
|
| 5176 |
optional = false
|
| 5177 |
python-versions = "*"
|
| 5178 |
files = [
|
| 5179 |
+
{file = "triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3e54983cd51875855da7c68ec05c05cf8bb08df361b1d5b69e05e40b0c9bd62"},
|
| 5180 |
+
{file = "triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8009a1fb093ee8546495e96731336a33fb8856a38e45bb4ab6affd6dbc3ba220"},
|
| 5181 |
+
{file = "triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d9b215efc1c26fa7eefb9a157915c92d52e000d2bf83e5f69704047e63f125c"},
|
| 5182 |
+
{file = "triton-3.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5dfa23ba84541d7c0a531dfce76d8bcd19159d50a4a8b14ad01e91734a5c1b0"},
|
| 5183 |
+
{file = "triton-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30ceed0eff2c4a73b14eb63e052992f44bbdf175f3fad21e1ac8097a772de7ee"},
|
| 5184 |
]
|
| 5185 |
|
|
|
|
|
|
|
|
|
|
| 5186 |
[package.extras]
|
| 5187 |
build = ["cmake (>=3.20)", "lit"]
|
| 5188 |
tests = ["autopep8", "flake8", "isort", "llnl-hatchet", "numpy", "pytest", "scipy (>=1.7.1)"]
|
|
|
|
| 5615 |
[metadata]
|
| 5616 |
lock-version = "2.0"
|
| 5617 |
python-versions = "^3.10"
|
| 5618 |
+
content-hash = "589d4265c99bb94e935eeae053707638d72da1eaca38f0d60c832210703bd5bc"
|
pyproject.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
-
version = "1.3.
|
| 4 |
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|
|
@@ -22,11 +22,11 @@ pydantic = "^2.4.2"
|
|
| 22 |
pydantic-settings = "^2.0.3"
|
| 23 |
transformers = "^4.45.2"
|
| 24 |
python-dotenv = "^1.0.0"
|
| 25 |
-
torch = "
|
| 26 |
tqdm = "^4.66.1"
|
| 27 |
ftfy = "^6.1.1"
|
| 28 |
rapidfuzz = "^3.8.1"
|
| 29 |
-
surya-ocr = "~0.10.
|
| 30 |
regex = "^2024.4.28"
|
| 31 |
pdftext = "~0.5.1"
|
| 32 |
markdownify = "^0.13.1"
|
|
@@ -49,6 +49,8 @@ apted = "1.0.3"
|
|
| 49 |
distance = "0.1.3"
|
| 50 |
lxml = "5.3.0"
|
| 51 |
tabulate = "^0.9.0"
|
|
|
|
|
|
|
| 52 |
|
| 53 |
[tool.poetry.scripts]
|
| 54 |
marker = "marker.scripts.convert:convert_cli"
|
|
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
+
version = "1.3.4"
|
| 4 |
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|
|
|
|
| 22 |
pydantic-settings = "^2.0.3"
|
| 23 |
transformers = "^4.45.2"
|
| 24 |
python-dotenv = "^1.0.0"
|
| 25 |
+
torch = "^2.5.1"
|
| 26 |
tqdm = "^4.66.1"
|
| 27 |
ftfy = "^6.1.1"
|
| 28 |
rapidfuzz = "^3.8.1"
|
| 29 |
+
surya-ocr = "~0.10.2"
|
| 30 |
regex = "^2024.4.28"
|
| 31 |
pdftext = "~0.5.1"
|
| 32 |
markdownify = "^0.13.1"
|
|
|
|
| 49 |
distance = "0.1.3"
|
| 50 |
lxml = "5.3.0"
|
| 51 |
tabulate = "^0.9.0"
|
| 52 |
+
latex2mathml = "^3.77.0"
|
| 53 |
+
playwright = "^1.49.1"
|
| 54 |
|
| 55 |
[tool.poetry.scripts]
|
| 56 |
marker = "marker.scripts.convert:convert_cli"
|