Vik Paruchuri committed on
Commit
6c81421
·
2 Parent(s): a9c09d7 38790b9

Improve bench

Browse files
benchmarks/overall/clean.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import subprocess
3
+ import tempfile
4
+ from pathlib import Path
5
+
6
+ import latex2mathml.converter
7
+
8
+ from marker.renderers.markdown import MarkdownRenderer
9
+
10
class MarkdownCleaner:
    """Canonicalizes markdown so outputs from different PDF-to-markdown
    methods can be compared fairly: pandoc round-trip normalization,
    math standardization, and whitespace/markup cleanup."""

    def __init__(self):
        pass

    def __call__(self, markdown):
        """Return a normalized, lowercased version of ``markdown``."""
        markdown = self.normalize_markdown(markdown)  # Use pandoc to normalize

        # Replace math expressions with standardized forms
        pattern = r'(?<!\\)\$(?:\$([^$]+)\$\$|\s*([^$\n]+?)\s*\$)'
        markdown = re.sub(pattern, self.standardize_math, markdown)

        # Replace image urls with a generic tag
        pattern = r'!\[(.*?)\]\((https?://[^\s\)]+)\)'
        markdown = re.sub(pattern, r'![link]', markdown)

        # Clean up stray html tags
        markdown = markdown.replace("<br>", "\n")
        markdown = re.sub(r"<sub>(.*?)</sub>", r"\1", markdown)
        markdown = re.sub(r"<sup>(.*?)</sup>", r"\1", markdown)
        markdown = re.sub(r"<span.*?>(.*?)</span>", r"\1", markdown)  # Remove span tags and keep content

        # Clean up markdown formatting.  Collapsing all whitespace first also
        # removes newlines, so a separate newline pass is unnecessary (the
        # previous re.sub(r"\n+", "\n", ...) after this line was dead code).
        markdown = re.sub(r"\s+", " ", markdown)
        markdown = re.sub(r"\.+", ".",
                          markdown)  # Replace repeated periods with a single period, like in table of contents
        markdown = re.sub(r"#+", "#", markdown)  # Replace repeated headers with a single header
        markdown = markdown.encode().decode('unicode-escape', errors="ignore")  # Decode unicode characters properly
        return markdown.strip().lower()

    @staticmethod
    def normalize_markdown(md_text: str) -> str:
        """Round-trip ``md_text`` through pandoc (markdown -> html -> markdown)
        to normalize formatting differences between generators.

        Raises subprocess.CalledProcessError if pandoc fails (check=True).
        """
        with tempfile.TemporaryDirectory() as tmp_dir:
            dirpath = Path(tmp_dir)
            input_file = dirpath / 'input.md'
            input_file.write_text(md_text, encoding='utf-8')

            # Markdown to HTML
            html_file = dirpath / 'temp.html'
            subprocess.run(
                [
                    'pandoc',
                    str(input_file),
                    '-f', 'markdown+tex_math_dollars',
                    '-t', 'html',
                    '-o', str(html_file),
                    '--quiet'
                ],
                check=True
            )

            # HTML to Markdown
            output_file = dirpath / 'output.md'
            subprocess.run(
                [
                    'pandoc',
                    str(html_file),
                    '-f', 'html',
                    '-t', 'markdown+tex_math_dollars',
                    '-o', str(output_file),
                    '--quiet'
                ],
                check=True
            )

            # Read back the normalized Markdown
            normalized_md = output_file.read_text(encoding='utf-8')

        return normalized_md

    def standardize_math(self, match):
        """re.sub callback: convert display math ($$...$$) to MathML and
        clean inline math ($...$) latex.  Falls back to the original text if
        conversion fails."""
        try:
            delim = "$$" if match.group(0).startswith('$$') else "$"
            math_content = match.group(1) or match.group(2)
            if delim == "$$":
                math_content = latex2mathml.converter.convert(math_content)
            else:
                math_content = self.clean_latex(math_content)
            return f'{delim}{math_content}{delim}'
        except Exception as e:
            print(f"Failed to standardize math expression: {match.group(0)} with error: {e}")
            return match.group(0)

    @staticmethod
    def clean_latex(latex_str):
        """Strip formatting wrappers (\\text, \\mathrm, ...) and rewrite
        common latex operators as plain-text equivalents."""
        latex_str = re.sub(r'\s+', ' ', latex_str.strip())
        for tag in [r'\\text', r'\\mathrm', r'\\mathbf', r'\\textbf']:
            latex_str = re.sub(tag + r'\{([^}]+)\}', r'\1', latex_str)

        # Longer commands listed before their prefixes so \leq/\geq are
        # handled before \le/\ge.
        replacements = {
            '\\times': '*',
            '\\cdot': '*',
            '\\div': '/',
            '\\leq': '<=',
            '\\geq': '>=',
            '\\le': '<=',
            '\\ge': '>=',
            '\\neq': '!=',
            '\\to': '\\rightarrow',
        }

        for old, new in replacements.items():
            # Require a non-letter after the command so short commands don't
            # mangle longer ones (plain str.replace turned \left into "<=ft"
            # and \cdots into "*s").  A lambda replacement avoids backslash
            # escape processing in the substitution template.
            latex_str = re.sub(
                re.escape(old) + r'(?![a-zA-Z])',
                lambda _m, repl=new: repl,
                latex_str
            )

        return latex_str
113
+
114
+
115
def convert_to_md(html):
    """Render an HTML fragment to markdown via marker's markdown renderer."""
    renderer = MarkdownRenderer()
    return renderer.md_cls.convert(html)
119
+
120
def clean_input(markdown):
    """Run ``markdown`` through a fresh MarkdownCleaner and return the result."""
    return MarkdownCleaner()(markdown)
123
+
124
+
125
+
benchmarks/overall/inference.py CHANGED
@@ -1,38 +1,37 @@
1
  import tempfile
2
  import time
3
 
4
- from bs4 import BeautifulSoup
5
-
6
- from benchmarks.overall.scoring import score_blocks
7
  from benchmarks.overall.schema import BlockScores
 
8
  from marker.converters.pdf import PdfConverter
9
 
10
- def get_marker_html(marker_models: dict, pdf_bytes: bytes, use_llm: bool):
11
  block_converter = PdfConverter(
12
  artifact_dict=marker_models,
13
- config={"page_range": [0], "disable_tqdm": True, "use_llm": use_llm},
14
- renderer="marker.renderers.html.HTMLRenderer"
15
  )
 
16
  with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
17
  f.write(pdf_bytes)
18
  rendered = block_converter(f.name)
19
- html = rendered.html
20
- soup = BeautifulSoup(html, "html.parser")
21
- inner_html = str(soup.find("body").decode_contents())
22
- return inner_html
23
 
24
 
25
- def marker_scoring_func(model_dict, sample, gt_html, use_llm=False, **kwargs) -> BlockScores:
26
  pdf_bytes = sample["pdf"] # This is a single page PDF
27
  start = time.time()
28
- marker_html = get_marker_html(model_dict, pdf_bytes, use_llm)
 
29
  total = time.time() - start
30
- scores = score_blocks(gt_html, marker_html)
31
  scores["time"] = total
 
32
  return scores
33
 
34
 
35
- def mathpix_scoring_func(model_dict, sample, gt_html, mathpix_ds=None, **kwargs) -> BlockScores:
36
  uuid = sample["uuid"]
37
  data = None
38
  for row in mathpix_ds:
@@ -42,7 +41,8 @@ def mathpix_scoring_func(model_dict, sample, gt_html, mathpix_ds=None, **kwargs)
42
  if not data:
43
  raise ValueError(f"Could not find data for uuid {uuid}")
44
 
45
- mathpix_md = data["md"]
46
- scores = score_blocks(gt_html, mathpix_md, convert=False)
47
  scores["time"] = data["time"]
 
48
  return scores
 
1
  import tempfile
2
  import time
3
 
4
+ from benchmarks.overall.clean import clean_input
 
 
5
  from benchmarks.overall.schema import BlockScores
6
+ from benchmarks.overall.scoring import score_blocks
7
  from marker.converters.pdf import PdfConverter
8
 
9
def get_marker_markdown(marker_models: dict, pdf_bytes: bytes, use_llm: bool):
    """Convert a single-page PDF (raw bytes) to markdown with marker.

    Args:
        marker_models: Preloaded marker model artifact dict.
        pdf_bytes: Raw contents of the PDF file.
        use_llm: Whether to enable the LLM-assisted conversion path.

    Returns:
        The rendered markdown string for page 0.
    """
    block_converter = PdfConverter(
        artifact_dict=marker_models,
        config={"page_range": [0], "disable_tqdm": True, "use_llm": use_llm}
    )

    with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
        f.write(pdf_bytes)
        # Flush buffered bytes to disk before the converter re-opens the file
        # by name; without this the converter can read a truncated/empty PDF.
        f.flush()
        rendered = block_converter(f.name)

    return rendered.markdown
 
 
20
 
21
 
22
def marker_scoring_func(model_dict, sample, gt_markdown, use_llm=False, **kwargs) -> BlockScores:
    """Run marker on a sample's single-page PDF, clean the output, and score
    it against the ground-truth markdown blocks.  Timing covers conversion
    plus cleaning."""
    start_time = time.time()
    markdown = clean_input(get_marker_markdown(model_dict, sample["pdf"], use_llm))
    elapsed = time.time() - start_time

    scores = score_blocks(gt_markdown, markdown)
    scores["time"] = elapsed
    scores["markdown"] = markdown
    return scores
32
 
33
 
34
+ def mathpix_scoring_func(model_dict, sample, gt_markdown, mathpix_ds=None, **kwargs) -> BlockScores:
35
  uuid = sample["uuid"]
36
  data = None
37
  for row in mathpix_ds:
 
41
  if not data:
42
  raise ValueError(f"Could not find data for uuid {uuid}")
43
 
44
+ mathpix_md = clean_input(data["md"])
45
+ scores = score_blocks(gt_markdown, mathpix_md)
46
  scores["time"] = data["time"]
47
+ scores["markdown"] = mathpix_md
48
  return scores
benchmarks/overall/overall.py CHANGED
@@ -7,9 +7,11 @@ from typing import Dict
7
  import click
8
  import datasets
9
  import tabulate
 
10
  from tqdm import tqdm
11
  import pypdfium2 as pdfium
12
 
 
13
  from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func
14
  from benchmarks.overall.schema import FullResult
15
  from marker.logger import configure_logging
@@ -32,7 +34,8 @@ def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_f
32
 
33
  try:
34
  gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0]
35
- scores = score_func(model_dict, sample, gt_html, **kwargs)
 
36
  except ValueError as e:
37
  print(f"Error with sample {idx}: {e}")
38
  continue
@@ -101,12 +104,14 @@ def print_scores(scores: Dict[str, FullResult], out_path: Path, default_method="
101
 
102
  @click.command(help="Benchmark PDF to MD conversion.")
103
  @click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
 
104
  @click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. Possible values: mathpix", default="")
105
  @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
106
  @click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
107
  @click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
108
  def main(
109
  dataset: str,
 
110
  other_methods: str,
111
  result_path: str,
112
  max_rows: int,
@@ -142,6 +147,11 @@ def main(
142
 
143
  print(f"Results saved to {out_path}.")
144
 
 
 
 
 
 
145
  if __name__ == "__main__":
146
  main()
147
 
 
7
  import click
8
  import datasets
9
  import tabulate
10
+ from benchmarks.overall.render import build_dataset
11
  from tqdm import tqdm
12
  import pypdfium2 as pdfium
13
 
14
+ from benchmarks.overall.clean import convert_to_md, clean_input
15
  from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func
16
  from benchmarks.overall.schema import FullResult
17
  from marker.logger import configure_logging
 
34
 
35
  try:
36
  gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0]
37
+ gt_markdown = [clean_input(convert_to_md(block)) for block in gt_html]
38
+ scores = score_func(model_dict, sample, gt_markdown, **kwargs)
39
  except ValueError as e:
40
  print(f"Error with sample {idx}: {e}")
41
  continue
 
104
 
105
  @click.command(help="Benchmark PDF to MD conversion.")
106
  @click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
107
+ @click.option("--out_dataset", type=str, help="Path to the output dataset", default=None)
108
  @click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. Possible values: mathpix", default="")
109
  @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
110
  @click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
111
  @click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
112
  def main(
113
  dataset: str,
114
+ out_dataset: str,
115
  other_methods: str,
116
  result_path: str,
117
  max_rows: int,
 
147
 
148
  print(f"Results saved to {out_path}.")
149
 
150
+ # Push up comparison dataset
151
+ if out_dataset is not None:
152
+ out_ds = build_dataset(ds, all_scores)
153
+ out_ds.push_to_hub(out_dataset)
154
+
155
  if __name__ == "__main__":
156
  main()
157
 
benchmarks/overall/render.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import tempfile
3
+ import pypdfium2 as pdfium
4
+ from typing import Dict
5
+ from collections import defaultdict
6
+ import re
7
+ import io
8
+ import json
9
+
10
+ from PIL import Image
11
+ import datasets
12
+ import markdown2
13
+ from playwright.sync_api import sync_playwright
14
+
15
+ from benchmarks.overall.schema import FullResult
16
+
17
def convert_to_html(md: str):
    """Convert markdown to HTML while shielding $...$ / $$...$$ math from the
    markdown processor, then restore the math (delimiters included)."""
    display_math = []
    inline_math = []

    # Swap math spans for unique placeholder tokens before rendering
    def stash_display(m):
        token = f"1BLOCKMATH{len(display_math)}1"
        display_math.append((token, f"$${m.group(1)}$$"))
        return token

    def stash_inline(m):
        token = f"1INLINEMATH{len(inline_math)}1"
        inline_math.append((token, f"${m.group(1)}$"))
        return token

    md = re.sub(r'\${2}(.*?)\${2}', stash_display, md, flags=re.DOTALL)
    md = re.sub(r'\$(.*?)\$', stash_inline, md)

    html = markdown2.markdown(md, extras=['tables'])

    # Put the original math expressions back
    for token, expr in display_math + inline_math:
        html = html.replace(token, expr)

    return html
46
+
47
+
48
def markdown_to_image(md: str) -> Image.Image:
    """Render markdown to a PIL image by screenshotting a KaTeX-enabled page
    in headless Chromium (playwright).

    The markdown is first converted to HTML (math preserved as $/$$ spans),
    then KaTeX auto-render typesets the math client-side before the
    full-page screenshot is taken.
    """
    html = convert_to_html(md)
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.set_content(f"""
        <head>
        <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.css" integrity="sha384-zh0CIslj+VczCZtlzBcjt5ppRcsAmDnRem7ESsYwWwg3m/OaJ2l4x7YBZl9Kxxib" crossorigin="anonymous">
        <!-- The loading of KaTeX is deferred to speed up page rendering -->
        <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.js" integrity="sha384-Rma6DA2IPUwhNxmrB/7S3Tno0YY7sFu9WSYMCuulLhIqYSGZ2gKCJWIqhBWqMQfh" crossorigin="anonymous"></script>
        <!-- To automatically render math in text elements, include the auto-render extension: -->
        <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous"></script>
        </head>
        <body>
        {html}
        <script>
            renderMathInElement(document.body, {{
                delimiters: [
                    {{left: '$$', right: '$$', display: true}},
                    {{left: '$', right: '$', display: false}}
                ]
            }});
        </script>
        </body>
        """)
        page.set_viewport_size({"width": 1200, "height": 800})
        # Fixed delay rather than an event wait: gives the deferred KaTeX
        # scripts time to load and typeset before the screenshot.
        page.wait_for_timeout(500)  # Wait for KaTeX to render
        screenshot_bytes = page.screenshot(full_page=True)
        browser.close()

    return Image.open(io.BytesIO(screenshot_bytes))
79
+
80
+
81
def build_dataset(ds: datasets.Dataset, all_scores: Dict[str, FullResult]) -> datasets.Dataset:
    """Assemble a side-by-side comparison dataset from benchmark results.

    Args:
        ds: Source benchmark dataset (rows contain img, gt_blocks,
            classification, language, uuid).
        all_scores: Per-method FullResult mapping, keyed by method name.

    Returns:
        A datasets.Dataset with per-method score/markdown/image/time columns
        plus ground-truth markdown and its rendered image.
    """
    # Local import avoids a circular import with benchmarks.overall.clean and
    # fixes a NameError: clean_input/convert_to_md were never imported at
    # module level in this file.
    from benchmarks.overall.clean import clean_input, convert_to_md

    # Keep only the dataset indices that every method successfully scored
    full_idxs = None
    for method in all_scores:
        result_idxs = list(all_scores[method]["raw_scores"].keys())
        if full_idxs is None:
            full_idxs = sorted(result_idxs)
        else:
            full_idxs = [f for f in full_idxs if f in result_idxs]

    ds_rows = defaultdict(dict)
    for idx in full_idxs:
        row = ds[idx]  # img, gt_blocks, classification, language, uuid
        for method in all_scores:
            method_row = all_scores[method]["raw_scores"][idx]
            ds_rows[idx].update({
                f"{method}_score": method_row["overall_score"],
                f"{method}_markdown": method_row["markdown"],
                f"{method}_image": markdown_to_image(method_row["markdown"]),
                f"{method}_time": method_row["time"]
            })
        # Ground truth: convert each HTML block to cleaned markdown
        gt_md = "\n\n".join([clean_input(convert_to_md(block)) for block in json.loads(row["gt_blocks"])])
        ds_rows[idx].update({
            "gt_markdown": gt_md,
            "gt_image": markdown_to_image(gt_md)
        })
    out_dataset = datasets.Dataset.from_list([ds_rows[k] for k in full_idxs])
    return out_dataset
109
+
benchmarks/overall/schema.py CHANGED
@@ -4,10 +4,9 @@ from typing import TypedDict, List, Dict, Optional
4
  class BlockScores(TypedDict):
5
  scores: List[float]
6
  order_score: float
7
- gt: List[str]
8
- method: str
9
  overall_score: float
10
  time: Optional[float]
 
11
 
12
 
13
  class FullResult(TypedDict):
 
4
  class BlockScores(TypedDict):
5
  scores: List[float]
6
  order_score: float
 
 
7
  overall_score: float
8
  time: Optional[float]
9
+ markdown: str
10
 
11
 
12
  class FullResult(TypedDict):
benchmarks/overall/scoring.py CHANGED
@@ -2,9 +2,8 @@ from typing import List
2
 
3
  from rapidfuzz import fuzz
4
 
 
5
  from benchmarks.overall.schema import BlockScores
6
- from marker.renderers.markdown import MarkdownRenderer
7
- import re
8
 
9
 
10
  def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float:
@@ -58,112 +57,19 @@ def find_fuzzy_alignments(
58
  })
59
  return alignments
60
 
61
- def convert_to_md(html):
62
- md = MarkdownRenderer()
63
- markdown = md.md_cls.convert(html)
64
- return markdown
65
-
66
- def standardize_markdown(markdown):
67
- # Replace math expressions
68
- pattern = r'(?<!\\)\$(?:\$([^$]+)\$\$|\s*([^$\n]+?)\s*\$)'
69
- markdown = re.sub(pattern, standardize_math, markdown)
70
-
71
- # Replace image urls
72
- pattern = r'!\[(.*?)\]\((https?://[^\s\)]+)\)'
73
- markdown = re.sub(pattern, r'![link]', markdown)
74
- markdown = strip_latex_symbols(markdown)
75
- markdown = replace_centered_lines(markdown)
76
-
77
- # Clean up html tags
78
- markdown = markdown.replace("<br>", "\n")
79
- markdown = re.sub(r"<sub>(.*?)</sub>", r"\1", markdown)
80
- markdown = re.sub(r"<sup>(.*?)</sup>", r"\1", markdown)
81
- markdown = re.sub(r"<span.*?>(.*?)</span>", r"\1", markdown) # Remove span tags and keep content
82
-
83
- # Clean up markdown
84
- markdown = re.sub(r"\s+", " ", markdown)
85
- markdown = re.sub(r"\n+", "\n", markdown)
86
- markdown = re.sub("\\.+", ".", markdown) # Replace repeated periods with a single period, like in table of contents
87
- markdown = re.sub("#+", "#", markdown) # Replace repeated headers with a single header
88
- markdown = re.sub(r"\$", "", markdown) # Remove equation delimiters
89
- markdown = markdown.encode().decode('unicode-escape', errors="ignore") # Decode unicode characters properly
90
- return markdown.strip().lower()
91
-
92
-
93
- def replace_centered_lines(text):
94
- def replace_match(m):
95
- content = m.group(0)
96
- dash_count = content.count('-')
97
- return '-' * dash_count
98
-
99
- pattern = r':-+:'
100
- return re.sub(pattern, replace_match, text)
101
-
102
-
103
- def strip_latex_symbols(text):
104
- # Handle short math mode sequences first - only match $ $ with brief content
105
- text = re.sub(r'\$\s*\\?[a-zA-Z]+\d?\s*\$', '', text)
106
-
107
- # Handle common patterns inside remaining math mode
108
- patterns = [
109
- r'\$\s*\\?[a-zA-Z]+\d?\s*\$', # \alpha or \alpha2 in math mode
110
- r'\$\s*\d+\\[a-zA-Z]+\s*\$', # 45\circ in math mode
111
- r'\$\s*[a-zA-Z0-9]\\[a-zA-Z]+\s*\$' # x\dagger in math mode
112
- ]
113
-
114
- pattern = '|'.join(patterns)
115
- return re.sub(pattern, '', text)
116
-
117
-
118
- def standardize_math(match):
119
- try:
120
- delim = "$$" if match.group(0).startswith('$$') else "$"
121
- math_content = match.group(1) or match.group(2)
122
- result = clean_latex(math_content)
123
- return f'{delim}{result}{delim}'
124
- except Exception as e:
125
- print(f"Failed to standardize math expression: {match.group(0)} with error: {e}")
126
- return match.group(0)
127
-
128
-
129
- def clean_latex(latex_str):
130
- latex_str = re.sub(r'\s+', ' ', latex_str.strip())
131
- for tag in [r'\\text', r'\\mathrm', r'\\mathbf', r'\\textbf']:
132
- latex_str = re.sub(tag + r'\{([^}]+)\}', r'\1', latex_str)
133
-
134
-
135
- replacements = {
136
- '\\times': '*',
137
- '\\cdot': '*',
138
- '\\div': '/',
139
- '\\le': '<=',
140
- '\\ge': '>=',
141
- '\\neq': '!=',
142
- '\\to': '\\rightarrow',
143
- }
144
-
145
- for old, new in replacements.items():
146
- latex_str = latex_str.replace(old, new)
147
-
148
- return latex_str
149
-
150
-
151
- def score_blocks(gt_html, method_html, convert=True) -> BlockScores:
152
- if convert:
153
- method_html = convert_to_md(method_html)
154
- method_html = standardize_markdown(method_html)
155
- gt = [standardize_markdown(convert_to_md(gt)) for gt in gt_html]
156
- alignments = find_fuzzy_alignments(method_html, gt)
157
  scores = [alignment["score"] for alignment in alignments]
158
 
159
  # Find order score
160
  orders = [alignment["start"] for alignment in alignments]
161
- correct_order = list(range(len(gt)))
162
- actual_order = sorted(range(len(gt)), key=lambda x: orders[x])
163
  order_score = kendall_tau(correct_order, actual_order)
164
 
165
  # Weight score by sequence length
166
- gt_weights = [len(g) for g in gt]
167
  weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]
168
 
169
  # Weight the score by sequence length
@@ -172,8 +78,6 @@ def score_blocks(gt_html, method_html, convert=True) -> BlockScores:
172
  return {
173
  "scores": scores,
174
  "order_score": order_score,
175
- "gt": gt,
176
- "method": method_html,
177
  "overall_score": overall_score,
178
  "time": None
179
  }
 
2
 
3
  from rapidfuzz import fuzz
4
 
5
+ from benchmarks.overall.clean import convert_to_md, MarkdownCleaner
6
  from benchmarks.overall.schema import BlockScores
 
 
7
 
8
 
9
  def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float:
 
57
  })
58
  return alignments
59
 
60
+
61
+ def score_blocks(gt_markdown: List[str], method_markdown: str) -> BlockScores:
62
+ alignments = find_fuzzy_alignments(method_markdown, gt_markdown)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  scores = [alignment["score"] for alignment in alignments]
64
 
65
  # Find order score
66
  orders = [alignment["start"] for alignment in alignments]
67
+ correct_order = list(range(len(gt_markdown)))
68
+ actual_order = sorted(range(len(gt_markdown)), key=lambda x: orders[x])
69
  order_score = kendall_tau(correct_order, actual_order)
70
 
71
  # Weight score by sequence length
72
+ gt_weights = [len(g) for g in gt_markdown]
73
  weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]
74
 
75
  # Weight the score by sequence length
 
78
  return {
79
  "scores": scores,
80
  "order_score": order_score,
 
 
81
  "overall_score": overall_score,
82
  "time": None
83
  }
marker/scripts/streamlit_app.py CHANGED
@@ -115,7 +115,10 @@ def pillow_image_to_base64_string(img: Image) -> str:
115
  return base64.b64encode(buffered.getvalue()).decode("utf-8")
116
 
117
 
118
- def block_display(image: Image, blocks: dict = {}, dpi=96):
 
 
 
119
  image_data_url = (
120
  'data:image/jpeg;base64,' + pillow_image_to_base64_string(image)
121
  )
 
115
  return base64.b64encode(buffered.getvalue()).decode("utf-8")
116
 
117
 
118
+ def block_display(image: Image, blocks: dict | None = None, dpi=96):
119
+ if blocks is None:
120
+ blocks = {}
121
+
122
  image_data_url = (
123
  'data:image/jpeg;base64,' + pillow_image_to_base64_string(image)
124
  )
marker/scripts/streamlit_app_blocks_viz.html CHANGED
@@ -114,7 +114,7 @@
114
  <body>
115
  <div style="text-align: center" class="image-container">
116
  <dialog id="block-info-dialog">
117
- <button
118
  class="close-button"
119
  onclick="document.querySelector('#block-info-dialog').close()"
120
  ></button>
@@ -147,17 +147,17 @@
147
  const BLOCK_TYPES = $block_types_json;
148
  const blocksById = {};
149
  const blockInfoDialog = document.querySelector("dialog#block-info-dialog");
150
-
151
  function blockTypeColor(blockType) {
152
  return COLORS[BLOCK_TYPES[blockType] % COLORS.length];
153
  }
154
-
155
  function traverseAndGenerateSVG(block) {
156
  let svg = "";
157
-
158
  if (block.polygon) {
159
  const color = blockTypeColor(block.block_type);
160
-
161
  // dollar signs are escaped because this files gets read into a template string
162
  svg += `<rect id="$${block.id}"
163
  class="block type-$${block.block_type}"
@@ -171,52 +171,52 @@
171
  }"
172
  fill=$${color} stroke=$${color}>
173
  </rect>`;
174
-
175
  blocksById[block.id] = block;
176
  }
177
-
178
  if (Array.isArray(block.children) && block.children.length > 0) {
179
  block.children.forEach((child) => {
180
  svg += traverseAndGenerateSVG(child);
181
  });
182
  }
183
-
184
  return svg;
185
  }
186
-
187
  if (Object.keys(BLOCKS).length == 0) {
188
  // bail out if no blocks
189
  return;
190
  }
191
-
192
  const [vbWidth, vbHeight] = BLOCKS.children[0].polygon[2];
193
  document
194
  .querySelector("svg")
195
  .setAttribute("viewBox", `0 0 $${vbWidth} $${vbHeight}`);
196
-
197
  const blocksOverlay = document.querySelector(".blocks-overlay");
198
  blocksOverlay.innerHTML = traverseAndGenerateSVG(BLOCKS.children[0]);
199
-
200
  tippy("rect.block", {
201
  content: (block) => block.getAttribute("data-type"),
202
  placement: "top-start",
203
  arrow: false,
204
  offset: [0, 5],
205
  });
206
-
207
  blocksOverlay.addEventListener("click", (event) => {
208
  if (event.target.tagName !== "rect") return;
209
-
210
  const blockId = event.target.id;
211
  const block = blocksById[blockId];
212
-
213
  blockInfoDialog.querySelector("h1").innerHTML = `
214
  $${blockId} <span style="color: $${blockTypeColor(block.block_type)}">($${block.block_type})</span>
215
  `;
216
  blockInfoDialog.querySelector(".text-content").textContent = block.html;
217
-
218
  blockInfoDialog.dataset.blockJSON = JSON.stringify(block, null, 2);
219
-
220
  if (block.images) {
221
  const imagesDiv = blockInfoDialog.querySelector(".images");
222
  imagesDiv.innerHTML = "";
 
114
  <body>
115
  <div style="text-align: center" class="image-container">
116
  <dialog id="block-info-dialog">
117
+ <button
118
  class="close-button"
119
  onclick="document.querySelector('#block-info-dialog').close()"
120
  ></button>
 
147
  const BLOCK_TYPES = $block_types_json;
148
  const blocksById = {};
149
  const blockInfoDialog = document.querySelector("dialog#block-info-dialog");
150
+
151
  function blockTypeColor(blockType) {
152
  return COLORS[BLOCK_TYPES[blockType] % COLORS.length];
153
  }
154
+
155
  function traverseAndGenerateSVG(block) {
156
  let svg = "";
157
+
158
  if (block.polygon) {
159
  const color = blockTypeColor(block.block_type);
160
+
161
  // dollar signs are escaped because this files gets read into a template string
162
  svg += `<rect id="$${block.id}"
163
  class="block type-$${block.block_type}"
 
171
  }"
172
  fill=$${color} stroke=$${color}>
173
  </rect>`;
174
+
175
  blocksById[block.id] = block;
176
  }
177
+
178
  if (Array.isArray(block.children) && block.children.length > 0) {
179
  block.children.forEach((child) => {
180
  svg += traverseAndGenerateSVG(child);
181
  });
182
  }
183
+
184
  return svg;
185
  }
186
+
187
  if (Object.keys(BLOCKS).length == 0) {
188
  // bail out if no blocks
189
  return;
190
  }
191
+
192
  const [vbWidth, vbHeight] = BLOCKS.children[0].polygon[2];
193
  document
194
  .querySelector("svg")
195
  .setAttribute("viewBox", `0 0 $${vbWidth} $${vbHeight}`);
196
+
197
  const blocksOverlay = document.querySelector(".blocks-overlay");
198
  blocksOverlay.innerHTML = traverseAndGenerateSVG(BLOCKS.children[0]);
199
+
200
  tippy("rect.block", {
201
  content: (block) => block.getAttribute("data-type"),
202
  placement: "top-start",
203
  arrow: false,
204
  offset: [0, 5],
205
  });
206
+
207
  blocksOverlay.addEventListener("click", (event) => {
208
  if (event.target.tagName !== "rect") return;
209
+
210
  const blockId = event.target.id;
211
  const block = blocksById[blockId];
212
+
213
  blockInfoDialog.querySelector("h1").innerHTML = `
214
  $${blockId} <span style="color: $${blockTypeColor(block.block_type)}">($${block.block_type})</span>
215
  `;
216
  blockInfoDialog.querySelector(".text-content").textContent = block.html;
217
+
218
  blockInfoDialog.dataset.blockJSON = JSON.stringify(block, null, 2);
219
+
220
  if (block.images) {
221
  const imagesDiv = blockInfoDialog.querySelector(".images");
222
  imagesDiv.innerHTML = "";
poetry.lock CHANGED
@@ -414,13 +414,13 @@ files = [
414
 
415
  [[package]]
416
  name = "certifi"
417
- version = "2024.12.14"
418
  description = "Python package for providing Mozilla's CA Bundle."
419
  optional = false
420
  python-versions = ">=3.6"
421
  files = [
422
- {file = "certifi-2024.12.14-py3-none-any.whl", hash = "sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56"},
423
- {file = "certifi-2024.12.14.tar.gz", hash = "sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db"},
424
  ]
425
 
426
  [[package]]
@@ -1212,6 +1212,92 @@ protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4
1212
  [package.extras]
1213
  grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"]
1214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1215
  [[package]]
1216
  name = "grpcio"
1217
  version = "1.70.0"
@@ -1905,6 +1991,17 @@ files = [
1905
  {file = "jupyterlab_widgets-3.0.13.tar.gz", hash = "sha256:a2966d385328c1942b683a8cd96b89b8dd82c8b8f81dda902bb2bc06d46f5bed"},
1906
  ]
1907
 
 
 
 
 
 
 
 
 
 
 
 
1908
  [[package]]
1909
  name = "lxml"
1910
  version = "5.3.0"
@@ -2729,6 +2826,18 @@ files = [
2729
  [package.dependencies]
2730
  nvidia-nvjitlink-cu12 = "*"
2731
 
 
 
 
 
 
 
 
 
 
 
 
 
2732
  [[package]]
2733
  name = "nvidia-nccl-cu12"
2734
  version = "2.21.5"
@@ -3065,6 +3174,26 @@ docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-a
3065
  test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"]
3066
  type = ["mypy (>=1.11.2)"]
3067
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3068
  [[package]]
3069
  name = "pluggy"
3070
  version = "1.5.0"
@@ -3552,6 +3681,23 @@ numpy = ">=1.16.4"
3552
  carto = ["pydeck-carto"]
3553
  jupyter = ["ipykernel (>=5.1.2)", "ipython (>=5.8.0)", "ipywidgets (>=7,<8)", "traitlets (>=4.3.2)"]
3554
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3555
  [[package]]
3556
  name = "pygments"
3557
  version = "2.19.1"
@@ -3696,13 +3842,13 @@ files = [
3696
 
3697
  [[package]]
3698
  name = "pytz"
3699
- version = "2024.2"
3700
  description = "World timezone definitions, modern and historical"
3701
  optional = false
3702
  python-versions = "*"
3703
  files = [
3704
- {file = "pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725"},
3705
- {file = "pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a"},
3706
  ]
3707
 
3708
  [[package]]
@@ -4641,13 +4787,13 @@ snowflake = ["snowflake-connector-python (>=2.8.0)", "snowflake-snowpark-python[
4641
 
4642
  [[package]]
4643
  name = "surya-ocr"
4644
- version = "0.10.1"
4645
  description = "OCR, layout, reading order, and table recognition in 90+ languages"
4646
  optional = false
4647
  python-versions = "<4.0,>=3.10"
4648
  files = [
4649
- {file = "surya_ocr-0.10.1-py3-none-any.whl", hash = "sha256:39fdc04ae1531e4b2ceb784e481a22941e53bb72f876fa1638677b5c4bd3c784"},
4650
- {file = "surya_ocr-0.10.1.tar.gz", hash = "sha256:0e57975df87f0dcc17ea6ff06dfe68ff5308c6610e42608a1038f8cbbd044e35"},
4651
  ]
4652
 
4653
  [package.dependencies]
@@ -4659,7 +4805,7 @@ pydantic = ">=2.5.3,<3.0.0"
4659
  pydantic-settings = ">=2.1.0,<3.0.0"
4660
  pypdfium2 = "4.30.0"
4661
  python-dotenv = ">=1.0.0,<2.0.0"
4662
- torch = ">=2.5.1,<2.6.0"
4663
  transformers = ">=4.41.0,<5.0.0"
4664
 
4665
  [[package]]
@@ -4844,28 +4990,31 @@ files = [
4844
 
4845
  [[package]]
4846
  name = "torch"
4847
- version = "2.5.1"
4848
  description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
4849
  optional = false
4850
- python-versions = ">=3.8.0"
4851
  files = [
4852
- {file = "torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:71328e1bbe39d213b8721678f9dcac30dfc452a46d586f1d514a6aa0a99d4744"},
4853
- {file = "torch-2.5.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:34bfa1a852e5714cbfa17f27c49d8ce35e1b7af5608c4bc6e81392c352dbc601"},
4854
- {file = "torch-2.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:32a037bd98a241df6c93e4c789b683335da76a2ac142c0973675b715102dc5fa"},
4855
- {file = "torch-2.5.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:23d062bf70776a3d04dbe74db950db2a5245e1ba4f27208a87f0d743b0d06e86"},
4856
- {file = "torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:de5b7d6740c4b636ef4db92be922f0edc425b65ed78c5076c43c42d362a45457"},
4857
- {file = "torch-2.5.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:340ce0432cad0d37f5a31be666896e16788f1adf8ad7be481196b503dad675b9"},
4858
- {file = "torch-2.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:603c52d2fe06433c18b747d25f5c333f9c1d58615620578c326d66f258686f9a"},
4859
- {file = "torch-2.5.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:31f8c39660962f9ae4eeec995e3049b5492eb7360dd4f07377658ef4d728fa4c"},
4860
- {file = "torch-2.5.1-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:ed231a4b3a5952177fafb661213d690a72caaad97d5824dd4fc17ab9e15cec03"},
4861
- {file = "torch-2.5.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:3f4b7f10a247e0dcd7ea97dc2d3bfbfc90302ed36d7f3952b0008d0df264e697"},
4862
- {file = "torch-2.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:73e58e78f7d220917c5dbfad1a40e09df9929d3b95d25e57d9f8558f84c9a11c"},
4863
- {file = "torch-2.5.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:8c712df61101964eb11910a846514011f0b6f5920c55dbf567bff8a34163d5b1"},
4864
- {file = "torch-2.5.1-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:9b61edf3b4f6e3b0e0adda8b3960266b9009d02b37555971f4d1c8f7a05afed7"},
4865
- {file = "torch-2.5.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:1f3b7fb3cf7ab97fae52161423f81be8c6b8afac8d9760823fd623994581e1a3"},
4866
- {file = "torch-2.5.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:7974e3dce28b5a21fb554b73e1bc9072c25dde873fa00d54280861e7a009d7dc"},
4867
- {file = "torch-2.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:46c817d3ea33696ad3b9df5e774dba2257e9a4cd3c4a3afbf92f6bb13ac5ce2d"},
4868
- {file = "torch-2.5.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:8046768b7f6d35b85d101b4b38cba8aa2f3cd51952bc4c06a49580f2ce682291"},
 
 
 
4869
  ]
4870
 
4871
  [package.dependencies]
@@ -4882,17 +5031,18 @@ nvidia-cufft-cu12 = {version = "11.2.1.3", markers = "platform_system == \"Linux
4882
  nvidia-curand-cu12 = {version = "10.3.5.147", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
4883
  nvidia-cusolver-cu12 = {version = "11.6.1.9", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
4884
  nvidia-cusparse-cu12 = {version = "12.3.1.170", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
 
4885
  nvidia-nccl-cu12 = {version = "2.21.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
4886
  nvidia-nvjitlink-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
4887
  nvidia-nvtx-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
4888
  setuptools = {version = "*", markers = "python_version >= \"3.12\""}
4889
  sympy = {version = "1.13.1", markers = "python_version >= \"3.9\""}
4890
- triton = {version = "3.1.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.13\""}
4891
- typing-extensions = ">=4.8.0"
4892
 
4893
  [package.extras]
4894
  opt-einsum = ["opt-einsum (>=3.3)"]
4895
- optree = ["optree (>=0.12.0)"]
4896
 
4897
  [[package]]
4898
  name = "tornado"
@@ -5021,21 +5171,18 @@ vision = ["Pillow (>=10.0.1,<=15.0)"]
5021
 
5022
  [[package]]
5023
  name = "triton"
5024
- version = "3.1.0"
5025
  description = "A language and compiler for custom Deep Learning operations"
5026
  optional = false
5027
  python-versions = "*"
5028
  files = [
5029
- {file = "triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8"},
5030
- {file = "triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f34f6e7885d1bf0eaaf7ba875a5f0ce6f3c13ba98f9503651c1e6dc6757ed5c"},
5031
- {file = "triton-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8182f42fd8080a7d39d666814fa36c5e30cc00ea7eeeb1a2983dbb4c99a0fdc"},
5032
- {file = "triton-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dadaca7fc24de34e180271b5cf864c16755702e9f63a16f62df714a8099126a"},
5033
- {file = "triton-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aafa9a20cd0d9fee523cd4504aa7131807a864cd77dcf6efe7e981f18b8c6c11"},
5034
  ]
5035
 
5036
- [package.dependencies]
5037
- filelock = "*"
5038
-
5039
  [package.extras]
5040
  build = ["cmake (>=3.20)", "lit"]
5041
  tests = ["autopep8", "flake8", "isort", "llnl-hatchet", "numpy", "pytest", "scipy (>=1.7.1)"]
@@ -5468,4 +5615,4 @@ propcache = ">=0.2.0"
5468
  [metadata]
5469
  lock-version = "2.0"
5470
  python-versions = "^3.10"
5471
- content-hash = "294f3036e322ab123bc681335d96606bbc2c8cb52a8a2c253874725b3180c2f7"
 
414
 
415
  [[package]]
416
  name = "certifi"
417
+ version = "2025.1.31"
418
  description = "Python package for providing Mozilla's CA Bundle."
419
  optional = false
420
  python-versions = ">=3.6"
421
  files = [
422
+ {file = "certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe"},
423
+ {file = "certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651"},
424
  ]
425
 
426
  [[package]]
 
1212
  [package.extras]
1213
  grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"]
1214
 
1215
+ [[package]]
1216
+ name = "greenlet"
1217
+ version = "3.1.1"
1218
+ description = "Lightweight in-process concurrent programming"
1219
+ optional = false
1220
+ python-versions = ">=3.7"
1221
+ files = [
1222
+ {file = "greenlet-3.1.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:0bbae94a29c9e5c7e4a2b7f0aae5c17e8e90acbfd3bf6270eeba60c39fce3563"},
1223
+ {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fde093fb93f35ca72a556cf72c92ea3ebfda3d79fc35bb19fbe685853869a83"},
1224
+ {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:36b89d13c49216cadb828db8dfa6ce86bbbc476a82d3a6c397f0efae0525bdd0"},
1225
+ {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94b6150a85e1b33b40b1464a3f9988dcc5251d6ed06842abff82e42632fac120"},
1226
+ {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:93147c513fac16385d1036b7e5b102c7fbbdb163d556b791f0f11eada7ba65dc"},
1227
+ {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da7a9bff22ce038e19bf62c4dd1ec8391062878710ded0a845bcf47cc0200617"},
1228
+ {file = "greenlet-3.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b2795058c23988728eec1f36a4e5e4ebad22f8320c85f3587b539b9ac84128d7"},
1229
+ {file = "greenlet-3.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ed10eac5830befbdd0c32f83e8aa6288361597550ba669b04c48f0f9a2c843c6"},
1230
+ {file = "greenlet-3.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:77c386de38a60d1dfb8e55b8c1101d68c79dfdd25c7095d51fec2dd800892b80"},
1231
+ {file = "greenlet-3.1.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:e4d333e558953648ca09d64f13e6d8f0523fa705f51cae3f03b5983489958c70"},
1232
+ {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09fc016b73c94e98e29af67ab7b9a879c307c6731a2c9da0db5a7d9b7edd1159"},
1233
+ {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d5e975ca70269d66d17dd995dafc06f1b06e8cb1ec1e9ed54c1d1e4a7c4cf26e"},
1234
+ {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2813dc3de8c1ee3f924e4d4227999285fd335d1bcc0d2be6dc3f1f6a318ec1"},
1235
+ {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e347b3bfcf985a05e8c0b7d462ba6f15b1ee1c909e2dcad795e49e91b152c383"},
1236
+ {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e8f8c9cb53cdac7ba9793c276acd90168f416b9ce36799b9b885790f8ad6c0a"},
1237
+ {file = "greenlet-3.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:62ee94988d6b4722ce0028644418d93a52429e977d742ca2ccbe1c4f4a792511"},
1238
+ {file = "greenlet-3.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1776fd7f989fc6b8d8c8cb8da1f6b82c5814957264d1f6cf818d475ec2bf6395"},
1239
+ {file = "greenlet-3.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:48ca08c771c268a768087b408658e216133aecd835c0ded47ce955381105ba39"},
1240
+ {file = "greenlet-3.1.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:4afe7ea89de619adc868e087b4d2359282058479d7cfb94970adf4b55284574d"},
1241
+ {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f406b22b7c9a9b4f8aa9d2ab13d6ae0ac3e85c9a809bd590ad53fed2bf70dc79"},
1242
+ {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c3a701fe5a9695b238503ce5bbe8218e03c3bcccf7e204e455e7462d770268aa"},
1243
+ {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2846930c65b47d70b9d178e89c7e1a69c95c1f68ea5aa0a58646b7a96df12441"},
1244
+ {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99cfaa2110534e2cf3ba31a7abcac9d328d1d9f1b95beede58294a60348fba36"},
1245
+ {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1443279c19fca463fc33e65ef2a935a5b09bb90f978beab37729e1c3c6c25fe9"},
1246
+ {file = "greenlet-3.1.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b7cede291382a78f7bb5f04a529cb18e068dd29e0fb27376074b6d0317bf4dd0"},
1247
+ {file = "greenlet-3.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:23f20bb60ae298d7d8656c6ec6db134bca379ecefadb0b19ce6f19d1f232a942"},
1248
+ {file = "greenlet-3.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:7124e16b4c55d417577c2077be379514321916d5790fa287c9ed6f23bd2ffd01"},
1249
+ {file = "greenlet-3.1.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:05175c27cb459dcfc05d026c4232f9de8913ed006d42713cb8a5137bd49375f1"},
1250
+ {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:935e943ec47c4afab8965954bf49bfa639c05d4ccf9ef6e924188f762145c0ff"},
1251
+ {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:667a9706c970cb552ede35aee17339a18e8f2a87a51fba2ed39ceeeb1004798a"},
1252
+ {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b8a678974d1f3aa55f6cc34dc480169d58f2e6d8958895d68845fa4ab566509e"},
1253
+ {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efc0f674aa41b92da8c49e0346318c6075d734994c3c4e4430b1c3f853e498e4"},
1254
+ {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0153404a4bb921f0ff1abeb5ce8a5131da56b953eda6e14b88dc6bbc04d2049e"},
1255
+ {file = "greenlet-3.1.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:275f72decf9932639c1c6dd1013a1bc266438eb32710016a1c742df5da6e60a1"},
1256
+ {file = "greenlet-3.1.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:c4aab7f6381f38a4b42f269057aee279ab0fc7bf2e929e3d4abfae97b682a12c"},
1257
+ {file = "greenlet-3.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:b42703b1cf69f2aa1df7d1030b9d77d3e584a70755674d60e710f0af570f3761"},
1258
+ {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1695e76146579f8c06c1509c7ce4dfe0706f49c6831a817ac04eebb2fd02011"},
1259
+ {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7876452af029456b3f3549b696bb36a06db7c90747740c5302f74a9e9fa14b13"},
1260
+ {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4ead44c85f8ab905852d3de8d86f6f8baf77109f9da589cb4fa142bd3b57b475"},
1261
+ {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8320f64b777d00dd7ccdade271eaf0cad6636343293a25074cc5566160e4de7b"},
1262
+ {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6510bf84a6b643dabba74d3049ead221257603a253d0a9873f55f6a59a65f822"},
1263
+ {file = "greenlet-3.1.1-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:04b013dc07c96f83134b1e99888e7a79979f1a247e2a9f59697fa14b5862ed01"},
1264
+ {file = "greenlet-3.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:411f015496fec93c1c8cd4e5238da364e1da7a124bcb293f085bf2860c32c6f6"},
1265
+ {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47da355d8687fd65240c364c90a31569a133b7b60de111c255ef5b606f2ae291"},
1266
+ {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98884ecf2ffb7d7fe6bd517e8eb99d31ff7855a840fa6d0d63cd07c037f6a981"},
1267
+ {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f1d4aeb8891338e60d1ab6127af1fe45def5259def8094b9c7e34690c8858803"},
1268
+ {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db32b5348615a04b82240cc67983cb315309e88d444a288934ee6ceaebcad6cc"},
1269
+ {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dcc62f31eae24de7f8dce72134c8651c58000d3b1868e01392baea7c32c247de"},
1270
+ {file = "greenlet-3.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1d3755bcb2e02de341c55b4fca7a745a24a9e7212ac953f6b3a48d117d7257aa"},
1271
+ {file = "greenlet-3.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:b8da394b34370874b4572676f36acabac172602abf054cbc4ac910219f3340af"},
1272
+ {file = "greenlet-3.1.1-cp37-cp37m-win32.whl", hash = "sha256:a0dfc6c143b519113354e780a50381508139b07d2177cb6ad6a08278ec655798"},
1273
+ {file = "greenlet-3.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:54558ea205654b50c438029505def3834e80f0869a70fb15b871c29b4575ddef"},
1274
+ {file = "greenlet-3.1.1-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:346bed03fe47414091be4ad44786d1bd8bef0c3fcad6ed3dee074a032ab408a9"},
1275
+ {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfc59d69fc48664bc693842bd57acfdd490acafda1ab52c7836e3fc75c90a111"},
1276
+ {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d21e10da6ec19b457b82636209cbe2331ff4306b54d06fa04b7c138ba18c8a81"},
1277
+ {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:37b9de5a96111fc15418819ab4c4432e4f3c2ede61e660b1e33971eba26ef9ba"},
1278
+ {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ef9ea3f137e5711f0dbe5f9263e8c009b7069d8a1acea822bd5e9dae0ae49c8"},
1279
+ {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:85f3ff71e2e60bd4b4932a043fbbe0f499e263c628390b285cb599154a3b03b1"},
1280
+ {file = "greenlet-3.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:95ffcf719966dd7c453f908e208e14cde192e09fde6c7186c8f1896ef778d8cd"},
1281
+ {file = "greenlet-3.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:03a088b9de532cbfe2ba2034b2b85e82df37874681e8c470d6fb2f8c04d7e4b7"},
1282
+ {file = "greenlet-3.1.1-cp38-cp38-win32.whl", hash = "sha256:8b8b36671f10ba80e159378df9c4f15c14098c4fd73a36b9ad715f057272fbef"},
1283
+ {file = "greenlet-3.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:7017b2be767b9d43cc31416aba48aab0d2309ee31b4dbf10a1d38fb7972bdf9d"},
1284
+ {file = "greenlet-3.1.1-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:396979749bd95f018296af156201d6211240e7a23090f50a8d5d18c370084dc3"},
1285
+ {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca9d0ff5ad43e785350894d97e13633a66e2b50000e8a183a50a88d834752d42"},
1286
+ {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f6ff3b14f2df4c41660a7dec01045a045653998784bf8cfcb5a525bdffffbc8f"},
1287
+ {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94ebba31df2aa506d7b14866fed00ac141a867e63143fe5bca82a8e503b36437"},
1288
+ {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73aaad12ac0ff500f62cebed98d8789198ea0e6f233421059fa68a5aa7220145"},
1289
+ {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63e4844797b975b9af3a3fb8f7866ff08775f5426925e1e0bbcfe7932059a12c"},
1290
+ {file = "greenlet-3.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7939aa3ca7d2a1593596e7ac6d59391ff30281ef280d8632fa03d81f7c5f955e"},
1291
+ {file = "greenlet-3.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d0028e725ee18175c6e422797c407874da24381ce0690d6b9396c204c7f7276e"},
1292
+ {file = "greenlet-3.1.1-cp39-cp39-win32.whl", hash = "sha256:5e06afd14cbaf9e00899fae69b24a32f2196c19de08fcb9f4779dd4f004e5e7c"},
1293
+ {file = "greenlet-3.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:3319aa75e0e0639bc15ff54ca327e8dc7a6fe404003496e3c6925cd3142e0e22"},
1294
+ {file = "greenlet-3.1.1.tar.gz", hash = "sha256:4ce3ac6cdb6adf7946475d7ef31777c26d94bccc377e070a7986bd2d5c515467"},
1295
+ ]
1296
+
1297
+ [package.extras]
1298
+ docs = ["Sphinx", "furo"]
1299
+ test = ["objgraph", "psutil"]
1300
+
1301
  [[package]]
1302
  name = "grpcio"
1303
  version = "1.70.0"
 
1991
  {file = "jupyterlab_widgets-3.0.13.tar.gz", hash = "sha256:a2966d385328c1942b683a8cd96b89b8dd82c8b8f81dda902bb2bc06d46f5bed"},
1992
  ]
1993
 
1994
+ [[package]]
1995
+ name = "latex2mathml"
1996
+ version = "3.77.0"
1997
+ description = "Pure Python library for LaTeX to MathML conversion"
1998
+ optional = false
1999
+ python-versions = ">=3.8.1,<4.0.0"
2000
+ files = [
2001
+ {file = "latex2mathml-3.77.0-py3-none-any.whl", hash = "sha256:5531e18a2a9eae7c24e257118b6a444cbba253cd27ff3e81f1bd6c41e88e786e"},
2002
+ {file = "latex2mathml-3.77.0.tar.gz", hash = "sha256:e2f501d1878f2e489c3f6f12786bef74c62f712d2770f7f3c837eb20a55d0a1e"},
2003
+ ]
2004
+
2005
  [[package]]
2006
  name = "lxml"
2007
  version = "5.3.0"
 
2826
  [package.dependencies]
2827
  nvidia-nvjitlink-cu12 = "*"
2828
 
2829
+ [[package]]
2830
+ name = "nvidia-cusparselt-cu12"
2831
+ version = "0.6.2"
2832
+ description = "NVIDIA cuSPARSELt"
2833
+ optional = false
2834
+ python-versions = "*"
2835
+ files = [
2836
+ {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:067a7f6d03ea0d4841c85f0c6f1991c5dda98211f6302cb83a4ab234ee95bef8"},
2837
+ {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9"},
2838
+ {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-win_amd64.whl", hash = "sha256:0057c91d230703924c0422feabe4ce768841f9b4b44d28586b6f6d2eb86fbe70"},
2839
+ ]
2840
+
2841
  [[package]]
2842
  name = "nvidia-nccl-cu12"
2843
  version = "2.21.5"
 
3174
  test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"]
3175
  type = ["mypy (>=1.11.2)"]
3176
 
3177
+ [[package]]
3178
+ name = "playwright"
3179
+ version = "1.49.1"
3180
+ description = "A high-level API to automate web browsers"
3181
+ optional = false
3182
+ python-versions = ">=3.9"
3183
+ files = [
3184
+ {file = "playwright-1.49.1-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:1041ffb45a0d0bc44d698d3a5aa3ac4b67c9bd03540da43a0b70616ad52592b8"},
3185
+ {file = "playwright-1.49.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:9f38ed3d0c1f4e0a6d1c92e73dd9a61f8855133249d6f0cec28648d38a7137be"},
3186
+ {file = "playwright-1.49.1-py3-none-macosx_11_0_universal2.whl", hash = "sha256:3be48c6d26dc819ca0a26567c1ae36a980a0303dcd4249feb6f59e115aaddfb8"},
3187
+ {file = "playwright-1.49.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:753ca90ee31b4b03d165cfd36e477309ebf2b4381953f2a982ff612d85b147d2"},
3188
+ {file = "playwright-1.49.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd9bc8dab37aa25198a01f555f0a2e2c3813fe200fef018ac34dfe86b34994b9"},
3189
+ {file = "playwright-1.49.1-py3-none-win32.whl", hash = "sha256:43b304be67f096058e587dac453ece550eff87b8fbed28de30f4f022cc1745bb"},
3190
+ {file = "playwright-1.49.1-py3-none-win_amd64.whl", hash = "sha256:47b23cb346283278f5b4d1e1990bcb6d6302f80c0aa0ca93dd0601a1400191df"},
3191
+ ]
3192
+
3193
+ [package.dependencies]
3194
+ greenlet = "3.1.1"
3195
+ pyee = "12.0.0"
3196
+
3197
  [[package]]
3198
  name = "pluggy"
3199
  version = "1.5.0"
 
3681
  carto = ["pydeck-carto"]
3682
  jupyter = ["ipykernel (>=5.1.2)", "ipython (>=5.8.0)", "ipywidgets (>=7,<8)", "traitlets (>=4.3.2)"]
3683
 
3684
+ [[package]]
3685
+ name = "pyee"
3686
+ version = "12.0.0"
3687
+ description = "A rough port of Node.js's EventEmitter to Python with a few tricks of its own"
3688
+ optional = false
3689
+ python-versions = ">=3.8"
3690
+ files = [
3691
+ {file = "pyee-12.0.0-py3-none-any.whl", hash = "sha256:7b14b74320600049ccc7d0e0b1becd3b4bd0a03c745758225e31a59f4095c990"},
3692
+ {file = "pyee-12.0.0.tar.gz", hash = "sha256:c480603f4aa2927d4766eb41fa82793fe60a82cbfdb8d688e0d08c55a534e145"},
3693
+ ]
3694
+
3695
+ [package.dependencies]
3696
+ typing-extensions = "*"
3697
+
3698
+ [package.extras]
3699
+ dev = ["black", "build", "flake8", "flake8-black", "isort", "jupyter-console", "mkdocs", "mkdocs-include-markdown-plugin", "mkdocstrings[python]", "pytest", "pytest-asyncio", "pytest-trio", "sphinx", "toml", "tox", "trio", "trio", "trio-typing", "twine", "twisted", "validate-pyproject[all]"]
3700
+
3701
  [[package]]
3702
  name = "pygments"
3703
  version = "2.19.1"
 
3842
 
3843
  [[package]]
3844
  name = "pytz"
3845
+ version = "2025.1"
3846
  description = "World timezone definitions, modern and historical"
3847
  optional = false
3848
  python-versions = "*"
3849
  files = [
3850
+ {file = "pytz-2025.1-py2.py3-none-any.whl", hash = "sha256:89dd22dca55b46eac6eda23b2d72721bf1bdfef212645d81513ef5d03038de57"},
3851
+ {file = "pytz-2025.1.tar.gz", hash = "sha256:c2db42be2a2518b28e65f9207c4d05e6ff547d1efa4086469ef855e4ab70178e"},
3852
  ]
3853
 
3854
  [[package]]
 
4787
 
4788
  [[package]]
4789
  name = "surya-ocr"
4790
+ version = "0.10.2"
4791
  description = "OCR, layout, reading order, and table recognition in 90+ languages"
4792
  optional = false
4793
  python-versions = "<4.0,>=3.10"
4794
  files = [
4795
+ {file = "surya_ocr-0.10.2-py3-none-any.whl", hash = "sha256:fbb590ae92b2a785e75ca25a53dd2ff59b1f56ec017a22f6127c9c7c62a1b910"},
4796
+ {file = "surya_ocr-0.10.2.tar.gz", hash = "sha256:ddbaf5d2f2cc0a08992446f889f782aa81e9e1cfa3fd957c124273365d411057"},
4797
  ]
4798
 
4799
  [package.dependencies]
 
4805
  pydantic-settings = ">=2.1.0,<3.0.0"
4806
  pypdfium2 = "4.30.0"
4807
  python-dotenv = ">=1.0.0,<2.0.0"
4808
+ torch = ">=2.5.1,<3.0.0"
4809
  transformers = ">=4.41.0,<5.0.0"
4810
 
4811
  [[package]]
 
4990
 
4991
  [[package]]
4992
  name = "torch"
4993
+ version = "2.6.0"
4994
  description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
4995
  optional = false
4996
+ python-versions = ">=3.9.0"
4997
  files = [
4998
+ {file = "torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6860df13d9911ac158f4c44031609700e1eba07916fff62e21e6ffa0a9e01961"},
4999
+ {file = "torch-2.6.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c4f103a49830ce4c7561ef4434cc7926e5a5fe4e5eb100c19ab36ea1e2b634ab"},
5000
+ {file = "torch-2.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:56eeaf2ecac90da5d9e35f7f35eb286da82673ec3c582e310a8d1631a1c02341"},
5001
+ {file = "torch-2.6.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:09e06f9949e1a0518c5b09fe95295bc9661f219d9ecb6f9893e5123e10696628"},
5002
+ {file = "torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:7979834102cd5b7a43cc64e87f2f3b14bd0e1458f06e9f88ffa386d07c7446e1"},
5003
+ {file = "torch-2.6.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:ccbd0320411fe1a3b3fec7b4d3185aa7d0c52adac94480ab024b5c8f74a0bf1d"},
5004
+ {file = "torch-2.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:46763dcb051180ce1ed23d1891d9b1598e07d051ce4c9d14307029809c4d64f7"},
5005
+ {file = "torch-2.6.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:94fc63b3b4bedd327af588696559f68c264440e2503cc9e6954019473d74ae21"},
5006
+ {file = "torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2bb8987f3bb1ef2675897034402373ddfc8f5ef0e156e2d8cfc47cacafdda4a9"},
5007
+ {file = "torch-2.6.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b789069020c5588c70d5c2158ac0aa23fd24a028f34a8b4fcb8fcb4d7efcf5fb"},
5008
+ {file = "torch-2.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7e1448426d0ba3620408218b50aa6ada88aeae34f7a239ba5431f6c8774b1239"},
5009
+ {file = "torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989"},
5010
+ {file = "torch-2.6.0-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:4874a73507a300a5d089ceaff616a569e7bb7c613c56f37f63ec3ffac65259cf"},
5011
+ {file = "torch-2.6.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a0d5e1b9874c1a6c25556840ab8920569a7a4137afa8a63a32cee0bc7d89bd4b"},
5012
+ {file = "torch-2.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:510c73251bee9ba02ae1cb6c9d4ee0907b3ce6020e62784e2d7598e0cfa4d6cc"},
5013
+ {file = "torch-2.6.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:ff96f4038f8af9f7ec4231710ed4549da1bdebad95923953a25045dcf6fd87e2"},
5014
+ {file = "torch-2.6.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:9ea955317cfcd3852b1402b62af258ce735c2edeee42ca9419b6bc889e5ae053"},
5015
+ {file = "torch-2.6.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:bb2c6c3e65049f081940f5ab15c9136c7de40d3f01192541c920a07c7c585b7e"},
5016
+ {file = "torch-2.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:683410f97984103148e31b38a8631acf31c3034c020c0f4d26171e7626d8317a"},
5017
+ {file = "torch-2.6.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:265f70de5fd45b864d924b64be1797f86e76c8e48a02c2a3a6fc7ec247d2226c"},
5018
  ]
5019
 
5020
  [package.dependencies]
 
5031
  nvidia-curand-cu12 = {version = "10.3.5.147", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
5032
  nvidia-cusolver-cu12 = {version = "11.6.1.9", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
5033
  nvidia-cusparse-cu12 = {version = "12.3.1.170", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
5034
+ nvidia-cusparselt-cu12 = {version = "0.6.2", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
5035
  nvidia-nccl-cu12 = {version = "2.21.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
5036
  nvidia-nvjitlink-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
5037
  nvidia-nvtx-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
5038
  setuptools = {version = "*", markers = "python_version >= \"3.12\""}
5039
  sympy = {version = "1.13.1", markers = "python_version >= \"3.9\""}
5040
+ triton = {version = "3.2.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
5041
+ typing-extensions = ">=4.10.0"
5042
 
5043
  [package.extras]
5044
  opt-einsum = ["opt-einsum (>=3.3)"]
5045
+ optree = ["optree (>=0.13.0)"]
5046
 
5047
  [[package]]
5048
  name = "tornado"
 
5171
 
5172
  [[package]]
5173
  name = "triton"
5174
+ version = "3.2.0"
5175
  description = "A language and compiler for custom Deep Learning operations"
5176
  optional = false
5177
  python-versions = "*"
5178
  files = [
5179
+ {file = "triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3e54983cd51875855da7c68ec05c05cf8bb08df361b1d5b69e05e40b0c9bd62"},
5180
+ {file = "triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8009a1fb093ee8546495e96731336a33fb8856a38e45bb4ab6affd6dbc3ba220"},
5181
+ {file = "triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d9b215efc1c26fa7eefb9a157915c92d52e000d2bf83e5f69704047e63f125c"},
5182
+ {file = "triton-3.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5dfa23ba84541d7c0a531dfce76d8bcd19159d50a4a8b14ad01e91734a5c1b0"},
5183
+ {file = "triton-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30ceed0eff2c4a73b14eb63e052992f44bbdf175f3fad21e1ac8097a772de7ee"},
5184
  ]
5185
 
 
 
 
5186
  [package.extras]
5187
  build = ["cmake (>=3.20)", "lit"]
5188
  tests = ["autopep8", "flake8", "isort", "llnl-hatchet", "numpy", "pytest", "scipy (>=1.7.1)"]
 
5615
  [metadata]
5616
  lock-version = "2.0"
5617
  python-versions = "^3.10"
5618
+ content-hash = "589d4265c99bb94e935eeae053707638d72da1eaca38f0d60c832210703bd5bc"
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
  [tool.poetry]
2
  name = "marker-pdf"
3
- version = "1.3.3"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
@@ -22,11 +22,11 @@ pydantic = "^2.4.2"
22
  pydantic-settings = "^2.0.3"
23
  transformers = "^4.45.2"
24
  python-dotenv = "^1.0.0"
25
- torch = "~2.5.1" # 2.6.0 appears to fail with mps
26
  tqdm = "^4.66.1"
27
  ftfy = "^6.1.1"
28
  rapidfuzz = "^3.8.1"
29
- surya-ocr = "~0.10.1"
30
  regex = "^2024.4.28"
31
  pdftext = "~0.5.1"
32
  markdownify = "^0.13.1"
@@ -49,6 +49,8 @@ apted = "1.0.3"
49
  distance = "0.1.3"
50
  lxml = "5.3.0"
51
  tabulate = "^0.9.0"
 
 
52
 
53
  [tool.poetry.scripts]
54
  marker = "marker.scripts.convert:convert_cli"
 
1
  [tool.poetry]
2
  name = "marker-pdf"
3
+ version = "1.3.4"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
 
22
  pydantic-settings = "^2.0.3"
23
  transformers = "^4.45.2"
24
  python-dotenv = "^1.0.0"
25
+ torch = "^2.5.1"
26
  tqdm = "^4.66.1"
27
  ftfy = "^6.1.1"
28
  rapidfuzz = "^3.8.1"
29
+ surya-ocr = "~0.10.2"
30
  regex = "^2024.4.28"
31
  pdftext = "~0.5.1"
32
  markdownify = "^0.13.1"
 
49
  distance = "0.1.3"
50
  lxml = "5.3.0"
51
  tabulate = "^0.9.0"
52
+ latex2mathml = "^3.77.0"
53
+ playwright = "^1.49.1"
54
 
55
  [tool.poetry.scripts]
56
  marker = "marker.scripts.convert:convert_cli"