Vik Paruchuri committed on
Commit
6828240
·
1 Parent(s): 692dda6

Fix misc bugs

Browse files
convert_single.py CHANGED
@@ -3,8 +3,6 @@ import time
3
  import pypdfium2 # Needs to be at the top to avoid warnings
4
  import os
5
 
6
- from marker.settings import settings
7
-
8
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
9
 
10
  import argparse
 
3
  import pypdfium2 # Needs to be at the top to avoid warnings
4
  import os
5
 
 
 
6
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
7
 
8
  import argparse
marker/convert.py CHANGED
@@ -1,7 +1,4 @@
1
  import warnings
2
-
3
- from marker.cleaners.toc import compute_toc
4
-
5
  warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
6
 
7
  import os
@@ -31,6 +28,7 @@ from marker.postprocessors.markdown import merge_spans, merge_lines, get_full_te
31
  from marker.cleaners.text import cleanup_text
32
  from marker.images.extract import extract_images
33
  from marker.images.save import images_to_dict
 
34
 
35
  from typing import List, Dict, Tuple, Optional
36
  from marker.settings import settings
 
1
  import warnings
 
 
 
2
  warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
3
 
4
  import os
 
28
  from marker.cleaners.text import cleanup_text
29
  from marker.images.extract import extract_images
30
  from marker.images.save import images_to_dict
31
+ from marker.cleaners.toc import compute_toc
32
 
33
  from typing import List, Dict, Tuple, Optional
34
  from marker.settings import settings
marker/equations/equations.py CHANGED
@@ -6,7 +6,7 @@ from marker.equations.inference import get_total_texify_tokens, get_latex_batche
6
  from marker.pdf.images import render_bbox_image
7
  from marker.schema.bbox import rescale_bbox
8
  from marker.schema.page import Page
9
- from marker.schema.block import Line, Span, Block, bbox_from_lines, split_block_lines, find_insert_block
10
  from marker.settings import settings
11
 
12
 
 
6
  from marker.pdf.images import render_bbox_image
7
  from marker.schema.bbox import rescale_bbox
8
  from marker.schema.page import Page
9
+ from marker.schema.block import Line, Span, Block, split_block_lines, find_insert_block
10
  from marker.settings import settings
11
 
12
 
poetry.lock CHANGED
@@ -2510,13 +2510,13 @@ testing = ["docopt", "pytest"]
2510
 
2511
  [[package]]
2512
  name = "pdftext"
2513
- version = "0.3.15"
2514
  description = "Extract structured text from pdfs quickly"
2515
  optional = false
2516
  python-versions = "<4.0,>=3.10"
2517
  files = [
2518
- {file = "pdftext-0.3.15-py3-none-any.whl", hash = "sha256:3151abacd5c2cfed9975d090333b543151e14de439ebe0d228b935328d512f3d"},
2519
- {file = "pdftext-0.3.15.tar.gz", hash = "sha256:3c6d55781c1adfd263cdc05c39cbea2c40d4e626439ab24078f860eca65c2e6c"},
2520
  ]
2521
 
2522
  [package.dependencies]
@@ -5075,4 +5075,4 @@ propcache = ">=0.2.0"
5075
  [metadata]
5076
  lock-version = "2.0"
5077
  python-versions = "^3.10"
5078
- content-hash = "4983d14f11f46193fc13b3256d1f7ed8d0877ad579cb8b30fbab8e87d3febb22"
 
2510
 
2511
  [[package]]
2512
  name = "pdftext"
2513
+ version = "0.3.16"
2514
  description = "Extract structured text from pdfs quickly"
2515
  optional = false
2516
  python-versions = "<4.0,>=3.10"
2517
  files = [
2518
+ {file = "pdftext-0.3.16-py3-none-any.whl", hash = "sha256:c541ee95496b51325fcdeebf9390a45cd2a1f511aac21b021bb45cca4634b6b7"},
2519
+ {file = "pdftext-0.3.16.tar.gz", hash = "sha256:a30f4fddafe1ad67fb45b18689942a78c6263ef4a53888bd107cc25c27974a9d"},
2520
  ]
2521
 
2522
  [package.dependencies]
 
5075
  [metadata]
5076
  lock-version = "2.0"
5077
  python-versions = "^3.10"
5078
+ content-hash = "fb9aaf82b97c40f5f850b3143558d44e7f6a501eb34e40bdfdc5d3a0aa58fecf"
pyproject.toml CHANGED
@@ -35,7 +35,7 @@ rapidfuzz = "^3.8.1"
35
  surya-ocr = "^0.6.3"
36
  filetype = "^1.2.0"
37
  regex = "^2024.4.28"
38
- pdftext = "^0.3.15"
39
  tabled-pdf = "^0.1.0"
40
 
41
  [tool.poetry.group.dev.dependencies]
 
35
  surya-ocr = "^0.6.3"
36
  filetype = "^1.2.0"
37
  regex = "^2024.4.28"
38
+ pdftext = "^0.3.16"
39
  tabled-pdf = "^0.1.0"
40
 
41
  [tool.poetry.group.dev.dependencies]
scripts/verify_benchmark_scores.py CHANGED
@@ -9,7 +9,7 @@ def verify_scores(file_path):
9
  multicolcnn_score = data["marker"]["files"]["multicolcnn.pdf"]["score"]
10
  switch_trans_score = data["marker"]["files"]["switch_trans.pdf"]["score"]
11
 
12
- if multicolcnn_score <= 0.35 or switch_trans_score <= 0.39:
13
  raise ValueError("One or more scores are below the required threshold of 0.4")
14
 
15
 
 
9
  multicolcnn_score = data["marker"]["files"]["multicolcnn.pdf"]["score"]
10
  switch_trans_score = data["marker"]["files"]["switch_trans.pdf"]["score"]
11
 
12
+ if multicolcnn_score <= 0.37 or switch_trans_score <= 0.4:
13
  raise ValueError("One or more scores are below the required threshold of 0.4")
14
 
15