Vik Paruchuri committed on
Commit
2c69783
·
1 Parent(s): ac26884

Initial integration

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. docs/install_ocrmypdf.md +0 -29
  2. marker/{v2/builders → builders}/__init__.py +1 -1
  3. marker/{v2/builders → builders}/document.py +13 -10
  4. marker/{v2/builders → builders}/layout.py +8 -8
  5. marker/{v2/builders → builders}/ocr.py +9 -9
  6. marker/{v2/builders → builders}/structure.py +6 -6
  7. marker/cleaners/bullets.py +0 -8
  8. marker/cleaners/code.py +0 -131
  9. marker/cleaners/fontstyle.py +0 -30
  10. marker/cleaners/headers.py +0 -82
  11. marker/cleaners/headings.py +0 -129
  12. marker/cleaners/text.py +0 -8
  13. marker/cleaners/toc.py +0 -29
  14. marker/convert.py +0 -32
  15. marker/{v2/converters → converters}/__init__.py +1 -1
  16. marker/{v2/converters → converters}/pdf.py +22 -22
  17. marker/debug/data.py +0 -109
  18. marker/debug/render.py +0 -62
  19. marker/equations/equations.py +0 -179
  20. marker/equations/inference.py +0 -51
  21. marker/images/extract.py +0 -77
  22. marker/images/save.py +0 -18
  23. marker/layout/layout.py +0 -113
  24. marker/layout/order.py +0 -73
  25. marker/logger.py +0 -3
  26. marker/models.py +15 -35
  27. marker/ocr/detection.py +0 -28
  28. marker/ocr/heuristics.py +0 -78
  29. marker/ocr/lang.py +0 -44
  30. marker/ocr/recognition.py +0 -182
  31. marker/ocr/tesseract.py +0 -97
  32. marker/pdf/extract_text.py +0 -114
  33. marker/pdf/images.py +0 -27
  34. marker/pdf/utils.py +0 -75
  35. marker/postprocessors/markdown.py +0 -254
  36. marker/{v2/processors → processors}/__init__.py +3 -3
  37. marker/{v2/processors → processors}/code.py +4 -4
  38. marker/{v2/processors → processors}/debug.py +3 -3
  39. marker/{v2/processors → processors}/document_toc.py +3 -3
  40. marker/{v2/processors → processors}/equation.py +3 -3
  41. marker/{v2/processors → processors}/ignoretext.py +3 -3
  42. marker/{v2/processors → processors}/sectionheader.py +3 -3
  43. marker/{v2/processors → processors}/table.py +3 -3
  44. marker/{v2/processors → processors}/text.py +4 -4
  45. marker/{v2/providers → providers}/__init__.py +3 -3
  46. marker/{v2/providers → providers}/pdf.py +37 -8
  47. marker/{ocr → providers}/utils.py +4 -1
  48. marker/{v2/renderers → renderers}/__init__.py +3 -3
  49. marker/{v2/renderers → renderers}/html.py +3 -3
  50. marker/{v2/renderers → renderers}/json.py +5 -5
docs/install_ocrmypdf.md DELETED
@@ -1,29 +0,0 @@
1
- ## Linux
2
-
3
- - Run `apt-get install ocrmypdf`
4
- - Install ghostscript > 9.55 by following [these instructions](https://ghostscript.readthedocs.io/en/latest/Install.html) or running `scripts/install/ghostscript_install.sh`.
5
- - Run `pip install ocrmypdf`
6
- - Install any tesseract language packages that you want (example `apt-get install tesseract-ocr-eng`)
7
- - Set the tesseract data folder path
8
- - Find the tesseract data folder `tessdata` with `find / -name tessdata`. Make sure to use the one corresponding to the latest tesseract version if you have multiple.
9
- - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
10
-
11
- ## Mac
12
-
13
- Only needed if using `ocrmypdf` as the ocr backend.
14
-
15
- - Run `brew install ocrmypdf`
16
- - Run `brew install tesseract-lang` to add language support
17
- - Run `pip install ocrmypdf`
18
- - Set the tesseract data folder path
19
- - Find the tesseract data folder `tessdata` with `brew list tesseract`
20
- - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
21
-
22
- ## Windows
23
-
24
- - Install `ocrmypdf` and ghostscript by following [these instructions](https://ocrmypdf.readthedocs.io/en/latest/installation.html#installing-on-windows)
25
- - Run `pip install ocrmypdf`
26
- - Install any tesseract language packages you want
27
- - Set the tesseract data folder path
28
- - Find the tesseract data folder `tessdata` with `brew list tesseract`
29
- - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/{v2/builders → builders}/__init__.py RENAMED
@@ -2,7 +2,7 @@ from typing import Optional
2
 
3
  from pydantic import BaseModel
4
 
5
- from marker.v2.util import assign_config
6
 
7
 
8
  class BaseBuilder:
 
2
 
3
  from pydantic import BaseModel
4
 
5
+ from marker.util import assign_config
6
 
7
 
8
  class BaseBuilder:
marker/{v2/builders → builders}/document.py RENAMED
@@ -1,15 +1,18 @@
1
  from marker.settings import settings
2
- from marker.v2.builders import BaseBuilder
3
- from marker.v2.builders.layout import LayoutBuilder
4
- from marker.v2.builders.ocr import OcrBuilder
5
- from marker.v2.providers.pdf import PdfProvider
6
- from marker.v2.schema import BlockTypes
7
- from marker.v2.schema.document import Document
8
- from marker.v2.schema.groups.page import PageGroup
9
- from marker.v2.schema.registry import get_block_class
10
 
11
 
12
  class DocumentBuilder(BaseBuilder):
 
 
 
13
  def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, ocr_builder: OcrBuilder):
14
  document = self.build_document(provider)
15
  layout_builder(document, provider)
@@ -21,8 +24,8 @@ class DocumentBuilder(BaseBuilder):
21
  initial_pages = [
22
  PageGroupClass(
23
  page_id=i,
24
- lowres_image=provider.get_image(i, settings.IMAGE_DPI),
25
- highres_image=provider.get_image(i, settings.HIGHRES_IMAGE_DPI),
26
  polygon=provider.get_page_bbox(i)
27
  ) for i in provider.page_range
28
  ]
 
1
  from marker.settings import settings
2
+ from marker.builders import BaseBuilder
3
+ from marker.builders.layout import LayoutBuilder
4
+ from marker.builders.ocr import OcrBuilder
5
+ from marker.providers.pdf import PdfProvider
6
+ from marker.schema import BlockTypes
7
+ from marker.schema.document import Document
8
+ from marker.schema.groups.page import PageGroup
9
+ from marker.schema.registry import get_block_class
10
 
11
 
12
  class DocumentBuilder(BaseBuilder):
13
+ lowres_image_dpi: int = 96
14
+ highres_image_dpi: int = 192
15
+
16
  def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, ocr_builder: OcrBuilder):
17
  document = self.build_document(provider)
18
  layout_builder(document, provider)
 
24
  initial_pages = [
25
  PageGroupClass(
26
  page_id=i,
27
+ lowres_image=provider.get_image(i, self.lowres_image_dpi),
28
+ highres_image=provider.get_image(i, self.highres_image_dpi),
29
  polygon=provider.get_page_bbox(i)
30
  ) for i in provider.page_range
31
  ]
marker/{v2/builders → builders}/layout.py RENAMED
@@ -5,14 +5,14 @@ from surya.schema import LayoutResult
5
  from surya.model.layout.encoderdecoder import SuryaLayoutModel
6
 
7
  from marker.settings import settings
8
- from marker.v2.builders import BaseBuilder
9
- from marker.v2.providers import ProviderOutput, ProviderPageLines
10
- from marker.v2.providers.pdf import PdfProvider
11
- from marker.v2.schema import BlockTypes
12
- from marker.v2.schema.document import Document
13
- from marker.v2.schema.groups.page import PageGroup
14
- from marker.v2.schema.polygon import PolygonBox
15
- from marker.v2.schema.registry import get_block_class
16
 
17
 
18
  class LayoutBuilder(BaseBuilder):
 
5
  from surya.model.layout.encoderdecoder import SuryaLayoutModel
6
 
7
  from marker.settings import settings
8
+ from marker.builders import BaseBuilder
9
+ from marker.providers import ProviderOutput, ProviderPageLines
10
+ from marker.providers.pdf import PdfProvider
11
+ from marker.schema import BlockTypes
12
+ from marker.schema.document import Document
13
+ from marker.schema.groups.page import PageGroup
14
+ from marker.schema.polygon import PolygonBox
15
+ from marker.schema.registry import get_block_class
16
 
17
 
18
  class LayoutBuilder(BaseBuilder):
marker/{v2/builders → builders}/ocr.py RENAMED
@@ -3,15 +3,15 @@ from surya.model.recognition.encoderdecoder import OCREncoderDecoderModel
3
  from surya.ocr import run_ocr
4
 
5
  from marker.settings import settings
6
- from marker.v2.builders import BaseBuilder
7
- from marker.v2.providers import ProviderOutput, ProviderPageLines
8
- from marker.v2.providers.pdf import PdfProvider
9
- from marker.v2.schema import BlockTypes
10
- from marker.v2.schema.document import Document
11
- from marker.v2.schema.polygon import PolygonBox
12
- from marker.v2.schema.registry import get_block_class
13
- from marker.v2.schema.text.line import Line
14
- from marker.v2.schema.text.span import Span
15
 
16
 
17
  class OcrBuilder(BaseBuilder):
 
3
  from surya.ocr import run_ocr
4
 
5
  from marker.settings import settings
6
+ from marker.builders import BaseBuilder
7
+ from marker.providers import ProviderOutput, ProviderPageLines
8
+ from marker.providers.pdf import PdfProvider
9
+ from marker.schema import BlockTypes
10
+ from marker.schema.document import Document
11
+ from marker.schema.polygon import PolygonBox
12
+ from marker.schema.registry import get_block_class
13
+ from marker.schema.text.line import Line
14
+ from marker.schema.text.span import Span
15
 
16
 
17
  class OcrBuilder(BaseBuilder):
marker/{v2/builders → builders}/structure.py RENAMED
@@ -1,9 +1,9 @@
1
- from marker.v2.builders import BaseBuilder
2
- from marker.v2.schema import BlockTypes
3
- from marker.v2.schema.document import Document
4
- from marker.v2.schema.groups import ListGroup
5
- from marker.v2.schema.groups.page import PageGroup
6
- from marker.v2.schema.registry import get_block_class
7
 
8
 
9
  class StructureBuilder(BaseBuilder):
 
1
+ from marker.builders import BaseBuilder
2
+ from marker.schema import BlockTypes
3
+ from marker.schema.document import Document
4
+ from marker.schema.groups import ListGroup
5
+ from marker.schema.groups.page import PageGroup
6
+ from marker.schema.registry import get_block_class
7
 
8
 
9
  class StructureBuilder(BaseBuilder):
marker/cleaners/bullets.py DELETED
@@ -1,8 +0,0 @@
1
- import re
2
-
3
-
4
- def replace_bullets(text):
5
- # Replace bullet characters with a -
6
- bullet_pattern = r"(^|[\n ])[‒●○■▪▫–—]( )"
7
- replaced_string = re.sub(bullet_pattern, r"\1-\2", text)
8
- return replaced_string
 
 
 
 
 
 
 
 
 
marker/cleaners/code.py DELETED
@@ -1,131 +0,0 @@
1
- from collections import Counter
2
- from statistics import mean, median
3
-
4
- from marker.schema.block import Span, Line
5
- from marker.schema.page import Page
6
- import re
7
- from typing import List
8
-
9
-
10
- def is_code_linelen(lines, thresh=80):
11
- # Decide based on chars per newline threshold
12
- total_alnum_chars = sum(len(re.findall(r'\w', line.prelim_text)) for line in lines)
13
- total_newlines = max(len(lines) - 1, 1)
14
-
15
- if total_alnum_chars == 0:
16
- return False
17
-
18
- ratio = total_alnum_chars / total_newlines
19
- return ratio < thresh
20
-
21
-
22
- def comment_count(lines):
23
- pattern = re.compile(r"^(//|#|'|--|/\*|'''|\"\"\"|--\[\[|<!--|%|%{|\(\*)")
24
- return sum([1 for line in lines if pattern.match(line)])
25
-
26
-
27
- def identify_code_blocks(pages: List[Page]):
28
- code_block_count = 0
29
- font_sizes = []
30
- line_heights = []
31
- for page in pages:
32
- font_sizes += page.get_font_sizes()
33
- line_heights += page.get_line_heights()
34
-
35
- avg_font_size = None
36
- avg_line_height = None
37
- if len(font_sizes) > 0:
38
- avg_line_height = median(line_heights)
39
- avg_font_size = mean(font_sizes)
40
-
41
- for page in pages:
42
- for block in page.blocks:
43
- if block.block_type != "Text":
44
- last_block = block
45
- continue
46
-
47
- # Ensure we have lines and spans
48
- if len(block.lines) == 0:
49
- continue
50
- if sum([len(line.spans) for line in block.lines]) == 0:
51
- continue
52
-
53
- min_start = block.get_min_line_start()
54
-
55
- is_indent = []
56
- line_fonts = []
57
- line_font_sizes = []
58
- block_line_heights = []
59
- for line in block.lines:
60
- line_fonts += [span.font for span in line.spans]
61
- line_font_sizes += [span.font_size for span in line.spans]
62
- block_line_heights.append(line.bbox[3] - line.bbox[1])
63
-
64
- is_indent.append(line.bbox[0] > min_start)
65
-
66
- comment_lines = comment_count([line.prelim_text for line in block.lines])
67
- is_code = [
68
- len(block.lines) > 3,
69
- is_code_linelen(block.lines),
70
- sum(is_indent) + comment_lines > len(block.lines) * .7, # Indentation and comments are a majority
71
- ]
72
-
73
- if avg_font_size is not None:
74
- font_checks = [
75
- mean(line_font_sizes) <= avg_font_size * .8, # Lower than average font size and line height
76
- mean(block_line_heights) < avg_line_height * .8
77
- ]
78
- is_code += font_checks
79
-
80
- if all(is_code):
81
- code_block_count += 1
82
- block.block_type = "Code"
83
-
84
- return code_block_count
85
-
86
-
87
- def indent_blocks(pages: List[Page]):
88
- span_counter = 0
89
- for page in pages:
90
- for block in page.blocks:
91
- if block.block_type != "Code":
92
- continue
93
-
94
- lines = []
95
- min_left = 1000 # will contain x- coord of column 0
96
- col_width = 0 # width of 1 char
97
- for line in block.lines:
98
- text = ""
99
- min_left = min(line.bbox[0], min_left)
100
- for span in line.spans:
101
- if col_width == 0 and len(span.text) > 0:
102
- col_width = (span.bbox[2] - span.bbox[0]) / len(span.text)
103
- text += span.text
104
- lines.append((line.bbox, text))
105
-
106
- block_text = ""
107
- blank_line = False
108
- for line in lines:
109
- text = line[1]
110
- if col_width == 0:
111
- prefix = ""
112
- else:
113
- prefix = " " * int((line[0][0] - min_left) / col_width)
114
- current_line_blank = len(text.strip()) == 0
115
- if blank_line and current_line_blank:
116
- # Don't put multiple blank lines in a row
117
- continue
118
-
119
- block_text += prefix + text + "\n"
120
- blank_line = current_line_blank
121
-
122
- new_span = Span(
123
- text=block_text,
124
- bbox=block.bbox,
125
- span_id=f"{span_counter}_fix_code",
126
- font=block.lines[0].spans[0].font,
127
- font_weight=block.lines[0].spans[0].font_weight,
128
- font_size=block.lines[0].spans[0].font_size,
129
- )
130
- span_counter += 1
131
- block.lines = [Line(spans=[new_span], bbox=block.bbox)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/cleaners/fontstyle.py DELETED
@@ -1,30 +0,0 @@
1
- from typing import List
2
-
3
- from marker.schema.page import Page
4
-
5
-
6
- def find_bold_italic(pages: List[Page], bold_min_weight=600):
7
- font_weights = []
8
- for page in pages:
9
- for block in page.blocks:
10
- # We don't want to bias our font stats
11
- if block.block_type in ["Title", "Section-header"]:
12
- continue
13
- for line in block.lines:
14
- for span in line.spans:
15
- if "bold" in span.font.lower():
16
- span.bold = True
17
- if "ital" in span.font.lower():
18
- span.italic = True
19
-
20
- font_weights.append(span.font_weight)
21
-
22
- if len(font_weights) == 0:
23
- return
24
-
25
- for page in pages:
26
- for block in page.blocks:
27
- for line in block.lines:
28
- for span in line.spans:
29
- if span.font_weight >= bold_min_weight:
30
- span.bold = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/cleaners/headers.py DELETED
@@ -1,82 +0,0 @@
1
- import re
2
- from collections import Counter
3
- from rapidfuzz import fuzz
4
-
5
- from marker.schema.merged import FullyMergedBlock
6
- from typing import List, Tuple
7
-
8
-
9
- def filter_common_elements(lines, page_count, threshold=.6):
10
- # We can't filter if we don't have enough pages to find common elements
11
- if page_count < 3:
12
- return []
13
- text = [s.text for line in lines for s in line.spans if len(s.text) > 4]
14
- counter = Counter(text)
15
- common = [k for k, v in counter.items() if v > page_count * threshold]
16
- bad_span_ids = [s.span_id for line in lines for s in line.spans if s.text in common]
17
- return bad_span_ids
18
-
19
-
20
- def filter_header_footer(all_page_blocks, max_selected_lines=2):
21
- first_lines = []
22
- last_lines = []
23
- for page in all_page_blocks:
24
- nonblank_lines = page.get_nonblank_lines()
25
- first_lines.extend(nonblank_lines[:max_selected_lines])
26
- last_lines.extend(nonblank_lines[-max_selected_lines:])
27
-
28
- bad_span_ids = filter_common_elements(first_lines, len(all_page_blocks))
29
- bad_span_ids += filter_common_elements(last_lines, len(all_page_blocks))
30
- return bad_span_ids
31
-
32
-
33
- def replace_leading_trailing_digits(string, replacement):
34
- string = re.sub(r'^\d+', replacement, string)
35
- string = re.sub(r'\d+$', replacement, string)
36
- return string
37
-
38
-
39
- def find_overlap_elements(lst: List[Tuple[str, int]], string_match_thresh=.9, min_overlap=.05) -> List[int]:
40
- # Initialize a list to store the elements that meet the criteria
41
- result = []
42
- titles = [l[0] for l in lst]
43
-
44
- for i, (str1, id_num) in enumerate(lst):
45
- overlap_count = 0 # Count the number of elements that overlap by at least 80%
46
-
47
- for j, str2 in enumerate(titles):
48
- if i != j and fuzz.ratio(str1, str2) >= string_match_thresh * 100:
49
- overlap_count += 1
50
-
51
- # Check if the element overlaps with at least 50% of other elements
52
- if overlap_count >= max(3.0, len(lst) * min_overlap):
53
- result.append(id_num)
54
-
55
- return result
56
-
57
-
58
- def filter_common_titles(merged_blocks: List[FullyMergedBlock]) -> List[FullyMergedBlock]:
59
- titles = []
60
- for i, block in enumerate(merged_blocks):
61
- if block.block_type in ["Title", "Section-header"]:
62
- text = block.text
63
- if text.strip().startswith("#"):
64
- text = re.sub(r'#+', '', text)
65
- text = text.strip()
66
- # Remove page numbers from start/end
67
- text = replace_leading_trailing_digits(text, "").strip()
68
- titles.append((text, i))
69
-
70
- bad_block_ids = find_overlap_elements(titles)
71
-
72
- new_blocks = []
73
- for i, block in enumerate(merged_blocks):
74
- if i in bad_block_ids:
75
- continue
76
- new_blocks.append(block)
77
-
78
- return new_blocks
79
-
80
-
81
-
82
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/cleaners/headings.py DELETED
@@ -1,129 +0,0 @@
1
- from collections import defaultdict
2
- from typing import List
3
- import numpy as np
4
- from sklearn.cluster import KMeans
5
-
6
- from marker.settings import settings
7
- from marker.schema.bbox import rescale_bbox
8
- from marker.schema.block import bbox_from_lines
9
- from marker.schema.page import Page
10
-
11
-
12
- def split_heading_blocks(pages: List[Page]):
13
- # Heading lines can be combined into regular text blocks sometimes by pdftext
14
- # Split up heading lines into separate blocks properly
15
- for page in pages:
16
- page_heading_boxes = [b for b in page.layout.bboxes if b.label in ["Title", "Section-header"]]
17
- page_heading_boxes = [(rescale_bbox(page.layout.image_bbox, page.bbox, b.bbox), b.label) for b in page_heading_boxes]
18
-
19
- new_blocks = []
20
- for block_idx, block in enumerate(page.blocks):
21
- if block.block_type not in ["Text"]:
22
- new_blocks.append(block)
23
- continue
24
-
25
- heading_lines = []
26
- for line_idx, line in enumerate(block.lines):
27
- for (heading_box, label) in page_heading_boxes:
28
- if line.intersection_pct(heading_box) > settings.BBOX_INTERSECTION_THRESH:
29
- heading_lines.append((line_idx, label))
30
- break
31
-
32
- if len(heading_lines) == 0:
33
- new_blocks.append(block)
34
- continue
35
-
36
- # Split up the block into separate blocks around headers
37
- start = 0
38
- for (heading_line, label) in heading_lines:
39
- if start < heading_line:
40
- copied_block = block.copy()
41
- copied_block.lines = block.lines[start:heading_line]
42
- copied_block.bbox = bbox_from_lines(copied_block.lines)
43
- new_blocks.append(copied_block)
44
-
45
- copied_block = block.copy()
46
- copied_block.lines = block.lines[heading_line:heading_line + 1]
47
- copied_block.block_type = label
48
- copied_block.bbox = bbox_from_lines(copied_block.lines)
49
- new_blocks.append(copied_block)
50
-
51
- start = heading_line + 1
52
- if start >= len(block.lines):
53
- break
54
-
55
- # Add any remaining lines
56
- if start < len(block.lines):
57
- copied_block = block.copy()
58
- copied_block.lines = block.lines[start:]
59
- copied_block.bbox = bbox_from_lines(copied_block.lines)
60
- new_blocks.append(copied_block)
61
-
62
- page.blocks = new_blocks
63
-
64
-
65
- def bucket_headings(line_heights, num_levels=settings.HEADING_LEVEL_COUNT):
66
- if len(line_heights) <= num_levels:
67
- return []
68
-
69
- data = np.asarray(line_heights).reshape(-1, 1)
70
- labels = KMeans(n_clusters=num_levels, random_state=0, n_init="auto").fit_predict(data)
71
- data_labels = np.concatenate([data, labels.reshape(-1, 1)], axis=1)
72
- data_labels = np.sort(data_labels, axis=0)
73
-
74
- cluster_means = {int(label): float(np.mean(data_labels[data_labels[:, 1] == label, 0])) for label in np.unique(labels)}
75
- label_max = None
76
- label_min = None
77
- heading_ranges = []
78
- prev_cluster = None
79
- for row in data_labels:
80
- value, label = row
81
- value = float(value)
82
- label = int(label)
83
- if prev_cluster is not None and label != prev_cluster:
84
- prev_cluster_mean = cluster_means[prev_cluster]
85
- cluster_mean = cluster_means[label]
86
- if cluster_mean * settings.HEADING_MERGE_THRESHOLD < prev_cluster_mean:
87
- heading_ranges.append((label_min, label_max))
88
- label_min = None
89
- label_max = None
90
-
91
- label_min = value if label_min is None else min(label_min, value)
92
- label_max = value if label_max is None else max(label_max, value)
93
- prev_cluster = label
94
-
95
- if label_min is not None:
96
- heading_ranges.append((label_min, label_max))
97
-
98
- heading_ranges = sorted(heading_ranges, reverse=True)
99
-
100
- return heading_ranges
101
-
102
-
103
- def infer_heading_levels(pages: List[Page], height_tol=.99):
104
- all_line_heights = []
105
- for page in pages:
106
- for block in page.blocks:
107
- if block.block_type not in ["Title", "Section-header"]:
108
- continue
109
-
110
- all_line_heights.extend([l.height for l in block.lines])
111
-
112
- heading_ranges = bucket_headings(all_line_heights)
113
-
114
- for page in pages:
115
- for block in page.blocks:
116
- if block.block_type not in ["Title", "Section-header"]:
117
- continue
118
-
119
- block_heights = [l.height for l in block.lines]
120
- if len(block_heights) > 0:
121
- avg_height = sum(block_heights) / len(block_heights)
122
- for idx, (min_height, max_height) in enumerate(heading_ranges):
123
- if avg_height >= min_height * height_tol:
124
- block.heading_level = idx + 1
125
- break
126
-
127
- if block.heading_level is None:
128
- block.heading_level = settings.HEADING_DEFAULT_LEVEL
129
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/cleaners/text.py DELETED
@@ -1,8 +0,0 @@
1
- import re
2
-
3
-
4
- def cleanup_text(full_text):
5
- full_text = re.sub(r'\n{3,}', '\n\n', full_text)
6
- full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text)
7
- full_text = full_text.replace('\xa0', ' ') # Replace non-breaking spaces
8
- return full_text
 
 
 
 
 
 
 
 
 
marker/cleaners/toc.py DELETED
@@ -1,29 +0,0 @@
1
- from typing import List
2
-
3
- from marker.schema.page import Page
4
-
5
-
6
- def get_pdf_toc(doc, max_depth=15):
7
- toc = doc.get_toc(max_depth=max_depth)
8
- toc_list = []
9
- for item in toc:
10
- list_item = {
11
- "title": item.title,
12
- "level": item.level,
13
- "page": item.page_index,
14
- }
15
- toc_list.append(list_item)
16
- return toc_list
17
-
18
-
19
- def compute_toc(pages: List[Page]):
20
- toc = []
21
- for page in pages:
22
- for block in page.blocks:
23
- if block.block_type in ["Title", "Section-header"]:
24
- toc.append({
25
- "title": block.prelim_text,
26
- "level": block.heading_level,
27
- "page": page.pnum
28
- })
29
- return toc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/convert.py CHANGED
@@ -1,42 +1,10 @@
1
  import warnings
2
-
3
- from marker.pdf.images import render_image
4
-
5
  warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
6
 
7
  import os
8
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
9
 
10
 
11
- import pypdfium2 as pdfium # Needs to be at the top to avoid warnings
12
- from PIL import Image
13
-
14
- from marker.utils import flush_cuda_memory
15
- from marker.tables.table import format_tables
16
- from marker.debug.data import dump_bbox_debug_data, draw_page_debug_images
17
- from marker.layout.layout import surya_layout, annotate_block_types
18
- from marker.layout.order import surya_order, sort_blocks_in_reading_order
19
- from marker.ocr.lang import replace_langs_with_codes, validate_langs
20
- from marker.ocr.detection import surya_detection
21
- from marker.ocr.recognition import run_ocr
22
- from marker.pdf.extract_text import get_text_blocks
23
- from marker.cleaners.headers import filter_header_footer, filter_common_titles
24
- from marker.equations.equations import replace_equations
25
- from marker.pdf.utils import find_filetype
26
- from marker.cleaners.code import identify_code_blocks, indent_blocks
27
- from marker.cleaners.bullets import replace_bullets
28
- from marker.cleaners.headings import split_heading_blocks, infer_heading_levels
29
- from marker.cleaners.fontstyle import find_bold_italic
30
- from marker.postprocessors.markdown import merge_spans, merge_lines, get_full_text
31
- from marker.cleaners.text import cleanup_text
32
- from marker.images.extract import extract_images
33
- from marker.images.save import images_to_dict
34
- from marker.cleaners.toc import compute_toc
35
-
36
- from typing import List, Dict, Tuple, Optional
37
- from marker.settings import settings
38
-
39
-
40
  def convert_single_pdf(
41
  fname: str,
42
  model_lst: List,
 
1
  import warnings
 
 
 
2
  warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
3
 
4
  import os
5
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
6
 
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  def convert_single_pdf(
9
  fname: str,
10
  model_lst: List,
marker/{v2/converters → converters}/__init__.py RENAMED
@@ -2,7 +2,7 @@ from typing import Optional
2
 
3
  from pydantic import BaseModel
4
 
5
- from marker.v2.util import assign_config
6
 
7
 
8
  class BaseConverter:
 
2
 
3
  from pydantic import BaseModel
4
 
5
+ from marker.util import assign_config
6
 
7
 
8
  class BaseConverter:
marker/{v2/converters → converters}/pdf.py RENAMED
@@ -1,13 +1,13 @@
1
  import json
2
 
3
  from marker.settings import settings
4
- from marker.v2.processors.code import CodeProcessor
5
- from marker.v2.processors.document_toc import DocumentTOCProcessor
6
- from marker.v2.providers.pdf import PdfProvider
7
  import os
8
 
9
- from marker.v2.renderers.json import JSONRenderer
10
- from marker.v2.util import parse_range_str
11
 
12
  os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
13
 
@@ -17,24 +17,24 @@ from typing import Dict, Type, List, Any
17
  import click
18
  import inspect
19
 
20
- from marker.v2.builders.document import DocumentBuilder
21
- from marker.v2.builders.layout import LayoutBuilder
22
- from marker.v2.builders.ocr import OcrBuilder
23
- from marker.v2.builders.structure import StructureBuilder
24
- from marker.v2.converters import BaseConverter
25
- from marker.v2.models import setup_detection_model, setup_layout_model, \
26
  setup_recognition_model, setup_table_rec_model, setup_texify_model
27
- from marker.v2.processors.equation import EquationProcessor
28
- from marker.v2.processors.sectionheader import SectionHeaderProcessor
29
- from marker.v2.processors.text import TextProcessor
30
- from marker.v2.processors.table import TableProcessor
31
- from marker.v2.renderers.markdown import MarkdownRenderer
32
- from marker.v2.schema import BlockTypes
33
- from marker.v2.schema.blocks import Block
34
- from marker.v2.schema.registry import register_block_class
35
- from marker.v2.processors.debug import DebugProcessor
36
- from marker.v2.processors import BaseProcessor
37
- from marker.v2.renderers import BaseRenderer
38
 
39
 
40
  class PdfConverter(BaseConverter):
 
1
  import json
2
 
3
  from marker.settings import settings
4
+ from marker.processors.code import CodeProcessor
5
+ from marker.processors.document_toc import DocumentTOCProcessor
6
+ from marker.providers.pdf import PdfProvider
7
  import os
8
 
9
+ from marker.renderers.json import JSONRenderer
10
+ from marker.util import parse_range_str
11
 
12
  os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
13
 
 
17
  import click
18
  import inspect
19
 
20
+ from marker.builders.document import DocumentBuilder
21
+ from marker.builders.layout import LayoutBuilder
22
+ from marker.builders.ocr import OcrBuilder
23
+ from marker.builders.structure import StructureBuilder
24
+ from marker.converters import BaseConverter
25
+ from marker.models import setup_detection_model, setup_layout_model, \
26
  setup_recognition_model, setup_table_rec_model, setup_texify_model
27
+ from marker.processors.equation import EquationProcessor
28
+ from marker.processors.sectionheader import SectionHeaderProcessor
29
+ from marker.processors.text import TextProcessor
30
+ from marker.processors.table import TableProcessor
31
+ from marker.renderers.markdown import MarkdownRenderer
32
+ from marker.schema import BlockTypes
33
+ from marker.schema.blocks import Block
34
+ from marker.schema.registry import register_block_class
35
+ from marker.processors.debug import DebugProcessor
36
+ from marker.processors import BaseProcessor
37
+ from marker.renderers import BaseRenderer
38
 
39
 
40
  class PdfConverter(BaseConverter):
marker/debug/data.py DELETED
@@ -1,109 +0,0 @@
1
- import json
2
- import math
3
- import os
4
- from typing import List
5
-
6
- from marker.debug.render import render_on_image
7
- from marker.schema.bbox import rescale_bbox
8
- from marker.schema.page import Page
9
- from marker.settings import settings
10
- from PIL import Image
11
-
12
-
13
- def draw_layout_page_debug_images(fname, pages: List[Page]):
14
- # Remove extension from doc name
15
- doc_base = os.path.basename(fname).rsplit(".", 1)[0]
16
-
17
- debug_folder = os.path.join(settings.DEBUG_DATA_FOLDER, doc_base)
18
- os.makedirs(debug_folder, exist_ok=True)
19
- for idx, page in enumerate(pages):
20
- img_size = (int(math.ceil(page.text_lines.image_bbox[2])), int(math.ceil(page.text_lines.image_bbox[3])))
21
- png_image = Image.new("RGB", img_size, color="white")
22
-
23
- line_bboxes = []
24
- line_text = []
25
- for block in page.blocks:
26
- for line in block.lines:
27
- line_bboxes.append(rescale_bbox(page.bbox, page.text_lines.image_bbox, line.bbox))
28
- line_text.append(line.prelim_text)
29
-
30
- render_on_image(line_bboxes, png_image, labels=line_text, color="black", draw_bbox=False)
31
- pdf_image = png_image.copy()
32
-
33
- line_bboxes = [line.bbox for line in page.text_lines.bboxes]
34
- render_on_image(line_bboxes, png_image, color="blue")
35
-
36
- layout_boxes = [rescale_bbox(page.layout.image_bbox, page.text_lines.image_bbox, box.bbox) for box in page.layout.bboxes]
37
- layout_labels = [box.label for box in page.layout.bboxes]
38
-
39
- render_on_image(layout_boxes, png_image, labels=layout_labels, color="red")
40
-
41
- order_labels = [str(i) for i in range(len(page.layout.bboxes))]
42
- render_on_image(layout_boxes, png_image, labels=order_labels, color="green", draw_bbox=False, label_offset=5)
43
-
44
- debug_file = os.path.join(debug_folder, f"layout_page_{idx}.png")
45
- png_image.save(debug_file)
46
-
47
- # PDF Image
48
-
49
- block_bboxes = [rescale_bbox(page.bbox, page.text_lines.image_bbox, block.bbox) for block in page.blocks]
50
- block_labels = [block.block_type for block in page.blocks]
51
- render_on_image(block_bboxes, pdf_image, labels=block_labels, color="red")
52
-
53
- block_order = [str(i) for i in range(len(page.blocks))]
54
- render_on_image(block_bboxes, pdf_image, labels=block_order, color="green", draw_bbox=False, label_offset=5)
55
-
56
- debug_file = os.path.join(debug_folder, f"pdf_page_{idx}.png")
57
- pdf_image.save(debug_file)
58
-
59
-
60
- def draw_pdf_page_debug_images(fname, pages: List[Page]):
61
- # Remove extension from doc name
62
- doc_base = os.path.basename(fname).rsplit(".", 1)[0]
63
-
64
- debug_folder = os.path.join(settings.DEBUG_DATA_FOLDER, doc_base)
65
- os.makedirs(debug_folder, exist_ok=True)
66
- for idx, page in enumerate(pages):
67
- img_size = (int(math.ceil(page.text_lines.image_bbox[2])), int(math.ceil(page.text_lines.image_bbox[3])))
68
- png_image = Image.new("RGB", img_size, color="white")
69
-
70
- line_bboxes = []
71
- line_text = []
72
- for block in page.blocks:
73
- for line in block.lines:
74
- line_bboxes.append(rescale_bbox(page.bbox, page.text_lines.image_bbox, line.bbox))
75
- line_text.append(line.prelim_text)
76
-
77
-
78
-
79
-
80
- def draw_page_debug_images(fname, pages: List[Page]):
81
- if not settings.DEBUG:
82
- return
83
-
84
- draw_layout_page_debug_images(fname, pages)
85
- draw_pdf_page_debug_images(fname, pages)
86
-
87
-
88
-
89
- def dump_bbox_debug_data(fname, pages: List[Page]):
90
- if not settings.DEBUG:
91
- return
92
-
93
- # Remove extension from doc name
94
- doc_base = os.path.basename(fname).rsplit(".", 1)[0]
95
-
96
- debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_bbox.json")
97
- debug_data = []
98
- for idx, page_blocks in enumerate(pages):
99
- page_data = page_blocks.model_dump(exclude=["images", "layout", "text_lines"])
100
- page_data["layout"] = page_blocks.layout.model_dump(exclude=["segmentation_map"])
101
- page_data["text_lines"] = page_blocks.text_lines.model_dump(exclude=["heatmap", "affinity_map"])
102
- debug_data.append(page_data)
103
-
104
- with open(debug_file, "w+") as f:
105
- json.dump(debug_data, f)
106
- print(f"Dumped bbox debug data to {debug_file}")
107
-
108
-
109
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/debug/render.py DELETED
@@ -1,62 +0,0 @@
1
- import requests
2
- from PIL import ImageDraw, ImageFont, Image
3
-
4
- from marker.settings import settings
5
- import os
6
-
7
-
8
- def get_font_path() -> str:
9
- font_path = settings.DEBUG_RENDER_FONT
10
-
11
- if not os.path.exists(font_path):
12
- os.makedirs(os.path.dirname(font_path), exist_ok=True)
13
- font_dl_path = f"{settings.FONT_DL_BASE}/{os.path.basename(font_path)}"
14
- with requests.get(font_dl_path, stream=True) as r, open(font_path, 'wb') as f:
15
- r.raise_for_status()
16
- for chunk in r.iter_content(chunk_size=8192):
17
- f.write(chunk)
18
-
19
- return font_path
20
-
21
-
22
- def get_text_size(text, font):
23
- im = Image.new(mode="P", size=(0, 0))
24
- draw = ImageDraw.Draw(im)
25
- _, _, width, height = draw.textbbox((0, 0), text=text, font=font)
26
- return width, height
27
-
28
-
29
- def render_on_image(bboxes, image, labels=None, label_offset=1, label_font_size=10, color: str | list='red', draw_bbox=True):
30
- draw = ImageDraw.Draw(image)
31
- font_path = get_font_path()
32
- label_font = ImageFont.truetype(font_path, label_font_size)
33
-
34
- for i, bbox in enumerate(bboxes):
35
- bbox = [int(p) for p in bbox]
36
- if draw_bbox:
37
- draw.rectangle(bbox, outline=color[i] if isinstance(color, list) else color, width=1)
38
-
39
- if labels is not None:
40
- label = labels[i]
41
- text_position = (
42
- bbox[0] + label_offset,
43
- bbox[1] + label_offset
44
- )
45
- text_size = get_text_size(label, label_font)
46
- if text_size[0] <= 0 or text_size[1] <= 0:
47
- continue
48
- box_position = (
49
- text_position[0],
50
- text_position[1],
51
- text_position[0] + text_size[0],
52
- text_position[1] + text_size[1]
53
- )
54
- draw.rectangle(box_position, fill="white")
55
- draw.text(
56
- text_position,
57
- label,
58
- fill=color[i] if isinstance(color, list) else color,
59
- font=label_font
60
- )
61
-
62
- return image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/equations/equations.py DELETED
@@ -1,179 +0,0 @@
1
- from collections import defaultdict
2
- from copy import deepcopy
3
- from typing import List
4
-
5
- from marker.equations.inference import get_total_texify_tokens, get_latex_batched
6
- from marker.pdf.images import render_bbox_image
7
- from marker.schema.bbox import rescale_bbox
8
- from marker.schema.page import Page
9
- from marker.schema.block import Line, Span, Block, split_block_lines, find_insert_block
10
- from marker.settings import settings
11
-
12
-
13
- def find_equation_blocks(page, processor):
14
- equation_blocks = []
15
- equation_regions = [l.bbox for l in page.layout.bboxes if l.label in ["Formula"]]
16
- equation_regions = [rescale_bbox(page.layout.image_bbox, page.bbox, b) for b in equation_regions]
17
-
18
- lines_to_remove = defaultdict(list)
19
- insert_points = {}
20
- equation_lines = defaultdict(list)
21
- for region_idx, region in enumerate(equation_regions):
22
- for block_idx, block in enumerate(page.blocks):
23
- for line_idx, line in enumerate(block.lines):
24
- if line.intersection_pct(region) > settings.BBOX_INTERSECTION_THRESH:
25
- # We will remove this line from the block
26
- lines_to_remove[region_idx].append((block_idx, line_idx))
27
- equation_lines[region_idx].append(line)
28
-
29
- if region_idx not in insert_points:
30
- insert_points[region_idx] = (block_idx, line_idx)
31
-
32
- # Account for regions where the lines were not detected
33
- for region_idx, region in enumerate(equation_regions):
34
- if region_idx in insert_points:
35
- continue
36
-
37
- insert_points[region_idx] = (find_insert_block(page.blocks, region), 0)
38
-
39
- block_lines_to_remove = defaultdict(set)
40
- for region_idx, equation_region in enumerate(equation_regions):
41
- if region_idx not in equation_lines or len(equation_lines[region_idx]) == 0:
42
- block_text = ""
43
- total_tokens = 0
44
- else:
45
- equation_block = equation_lines[region_idx]
46
- block_text = " ".join([line.prelim_text for line in equation_block])
47
- total_tokens = get_total_texify_tokens(block_text, processor)
48
-
49
- equation_insert = insert_points[region_idx]
50
- equation_insert_line_idx = equation_insert[1]
51
- equation_insert_line_idx -= len(
52
- [x for x in lines_to_remove[region_idx] if x[0] == equation_insert[0] and x[1] < equation_insert[1]])
53
-
54
- selected_blocks = [equation_insert[0], equation_insert_line_idx, total_tokens, block_text, equation_region]
55
- if total_tokens < settings.TEXIFY_MODEL_MAX:
56
- # Account for the lines we're about to remove
57
- for item in lines_to_remove[region_idx]:
58
- block_lines_to_remove[item[0]].add(item[1])
59
- equation_blocks.append(selected_blocks)
60
-
61
- # Remove the lines from the blocks
62
- for block_idx, bad_lines in block_lines_to_remove.items():
63
- block = page.blocks[block_idx]
64
- block.lines = [line for idx, line in enumerate(block.lines) if idx not in bad_lines]
65
-
66
- return equation_blocks
67
-
68
-
69
- def increment_insert_points(page_equation_blocks, insert_block_idx, insert_count):
70
- for idx, (block_idx, line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
71
- if block_idx >= insert_block_idx:
72
- page_equation_blocks[idx][0] += insert_count
73
-
74
-
75
- def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnum, processor):
76
- converted_spans = []
77
- idx = 0
78
- success_count = 0
79
- fail_count = 0
80
- for block_number, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
81
- latex_text = predictions[block_number]
82
- conditions = [
83
- get_total_texify_tokens(latex_text, processor) < settings.TEXIFY_MODEL_MAX, # Make sure we didn't get to the overall token max, indicates run-on
84
- len(latex_text) > len(block_text) * .7,
85
- len(latex_text.strip()) > 0
86
- ]
87
-
88
- new_block = Block(
89
- lines=[Line(
90
- spans=[
91
- Span(
92
- text=block_text.replace("\n", " "),
93
- bbox=equation_bbox,
94
- span_id=f"{pnum}_{idx}_fixeq",
95
- font="Latex",
96
- font_weight=0,
97
- font_size=0
98
- )
99
- ],
100
- bbox=equation_bbox
101
- )],
102
- bbox=equation_bbox,
103
- block_type="Formula",
104
- pnum=pnum
105
- )
106
-
107
- if not all(conditions):
108
- fail_count += 1
109
- else:
110
- success_count += 1
111
- new_block.lines[0].spans[0].text = latex_text.replace("\n", " ")
112
- converted_spans.append(deepcopy(new_block.lines[0].spans[0]))
113
-
114
- # Add in the new LaTeX block
115
- if insert_line_idx == 0:
116
- page_blocks.blocks.insert(insert_block_idx, new_block)
117
- increment_insert_points(page_equation_blocks, insert_block_idx, 1)
118
- elif insert_line_idx >= len(page_blocks.blocks[insert_block_idx].lines):
119
- page_blocks.blocks.insert(insert_block_idx + 1, new_block)
120
- increment_insert_points(page_equation_blocks, insert_block_idx + 1, 1)
121
- else:
122
- new_blocks = []
123
- for block_idx, block in enumerate(page_blocks.blocks):
124
- if block_idx == insert_block_idx:
125
- split_block = split_block_lines(block, insert_line_idx)
126
- new_blocks.append(split_block[0])
127
- new_blocks.append(new_block)
128
- new_blocks.append(split_block[1])
129
- increment_insert_points(page_equation_blocks, insert_block_idx, 2)
130
- else:
131
- new_blocks.append(block)
132
- page_blocks.blocks = new_blocks
133
-
134
- return success_count, fail_count, converted_spans
135
-
136
-
137
- def replace_equations(doc, pages: List[Page], texify_model, batch_multiplier=1):
138
- unsuccessful_ocr = 0
139
- successful_ocr = 0
140
-
141
- # Find potential equation regions, and length of text in each region
142
- equation_blocks = []
143
- for pnum, page in enumerate(pages):
144
- equation_blocks.append(find_equation_blocks(page, texify_model.processor))
145
-
146
- eq_count = sum([len(x) for x in equation_blocks])
147
-
148
- images = []
149
- token_counts = []
150
- for page_idx, page_equation_blocks in enumerate(equation_blocks):
151
- page_obj = doc[page_idx]
152
- for equation_idx, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
153
- png_image = render_bbox_image(page_obj, pages[page_idx], equation_bbox)
154
-
155
- images.append(png_image)
156
- token_counts.append(token_count)
157
-
158
- # Make batched predictions
159
- predictions = get_latex_batched(images, token_counts, texify_model, batch_multiplier=batch_multiplier)
160
-
161
- # Replace blocks with predictions
162
- page_start = 0
163
- converted_spans = []
164
- for page_idx, page_equation_blocks in enumerate(equation_blocks):
165
- page_equation_count = len(page_equation_blocks)
166
- page_predictions = predictions[page_start:page_start + page_equation_count]
167
- success_count, fail_count, converted_span = insert_latex_block(
168
- pages[page_idx],
169
- page_equation_blocks,
170
- page_predictions,
171
- page_idx,
172
- texify_model.processor
173
- )
174
- converted_spans.extend(converted_span)
175
- page_start += page_equation_count
176
- successful_ocr += success_count
177
- unsuccessful_ocr += fail_count
178
-
179
- return pages, {"successful_ocr": successful_ocr, "unsuccessful_ocr": unsuccessful_ocr, "equations": eq_count}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/equations/inference.py DELETED
@@ -1,51 +0,0 @@
1
- from texify.inference import batch_inference
2
- from tqdm import tqdm
3
-
4
- from marker.settings import settings
5
- import os
6
-
7
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
8
-
9
-
10
- def get_batch_size():
11
- if settings.TEXIFY_BATCH_SIZE is not None:
12
- return settings.TEXIFY_BATCH_SIZE
13
- elif settings.TORCH_DEVICE_MODEL == "cuda":
14
- return 6
15
- elif settings.TORCH_DEVICE_MODEL == "mps":
16
- return 6
17
- return 2
18
-
19
- def get_latex_batched(images, token_counts, texify_model, batch_multiplier=1):
20
- if len(images) == 0:
21
- return []
22
-
23
- predictions = [""] * len(images)
24
- batch_size = get_batch_size() * batch_multiplier
25
-
26
- for i in tqdm(range(0, len(images), batch_size), desc="Recognizing equations"):
27
- # Dynamically set max length to save inference time
28
- min_idx = i
29
- max_idx = min(min_idx + batch_size, len(images))
30
- max_length = max(token_counts[min_idx:max_idx])
31
- max_length = min(max_length, settings.TEXIFY_MODEL_MAX)
32
- max_length += settings.TEXIFY_TOKEN_BUFFER
33
-
34
- model_output = batch_inference(images[min_idx:max_idx], texify_model, texify_model.processor, max_tokens=max_length)
35
-
36
- for j, output in enumerate(model_output):
37
- token_count = get_total_texify_tokens(output, texify_model.processor)
38
- if token_count >= max_length - 1:
39
- output = ""
40
-
41
- image_idx = i + j
42
- predictions[image_idx] = output
43
- return predictions
44
-
45
-
46
- def get_total_texify_tokens(text, processor):
47
- tokenizer = processor.tokenizer
48
- tokens = tokenizer(text)
49
- return len(tokens["input_ids"])
50
-
51
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/images/extract.py DELETED
@@ -1,77 +0,0 @@
1
- from marker.images.save import get_image_filename
2
- from marker.pdf.images import render_bbox_image
3
- from marker.schema.bbox import rescale_bbox
4
- from marker.schema.block import find_insert_block, Span, Line
5
- from marker.settings import settings
6
-
7
-
8
- def find_image_blocks(page):
9
- image_blocks = []
10
- image_regions = [l.bbox for l in page.layout.bboxes if l.label in ["Figure", "Picture"]]
11
- image_regions = [rescale_bbox(page.layout.image_bbox, page.bbox, b) for b in image_regions]
12
-
13
- insert_points = {}
14
- for region_idx, region in enumerate(image_regions):
15
- for block_idx, block in enumerate(page.blocks):
16
- for line_idx, line in enumerate(block.lines):
17
- if line.intersection_pct(region) > settings.BBOX_INTERSECTION_THRESH:
18
- line.spans = [] # We will remove this line from the block
19
-
20
- if region_idx not in insert_points:
21
- insert_points[region_idx] = (block_idx, line_idx)
22
-
23
- # Account for images with no detected lines
24
- for region_idx, region in enumerate(image_regions):
25
- if region_idx in insert_points:
26
- continue
27
-
28
- insert_points[region_idx] = (find_insert_block(page.blocks, region), 0)
29
-
30
- for region_idx, image_region in enumerate(image_regions):
31
- image_insert = insert_points[region_idx]
32
- image_blocks.append([image_insert[0], image_insert[1], image_region])
33
-
34
- return image_blocks
35
-
36
-
37
- def extract_page_images(page_obj, page):
38
- page.images = []
39
- image_blocks = find_image_blocks(page)
40
-
41
- for image_idx, (block_idx, line_idx, bbox) in enumerate(image_blocks):
42
- if block_idx >= len(page.blocks):
43
- block_idx = len(page.blocks) - 1
44
- if block_idx < 0:
45
- continue
46
-
47
- block = page.blocks[block_idx]
48
- image = render_bbox_image(page_obj, page, bbox)
49
- image_filename = get_image_filename(page, image_idx)
50
- image_markdown = f"\n\n![{image_filename}]({image_filename})\n\n"
51
- image_span = Span(
52
- bbox=bbox,
53
- text=image_markdown,
54
- font="Image",
55
- rotation=0,
56
- font_weight=0,
57
- font_size=0,
58
- image=True,
59
- span_id=f"image_{image_idx}"
60
- )
61
-
62
- # Sometimes, the block has zero lines
63
- if len(block.lines) > line_idx:
64
- block.lines[line_idx].spans.append(image_span)
65
- else:
66
- line = Line(
67
- bbox=bbox,
68
- spans=[image_span]
69
- )
70
- block.lines.append(line)
71
- page.images.append(image)
72
-
73
-
74
- def extract_images(doc, pages):
75
- for page_idx, page in enumerate(pages):
76
- page_obj = doc[page_idx]
77
- extract_page_images(page_obj, page)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/images/save.py DELETED
@@ -1,18 +0,0 @@
1
- from typing import List
2
-
3
- from marker.schema.page import Page
4
-
5
-
6
- def get_image_filename(page: Page, image_idx):
7
- return f"{page.pnum}_image_{image_idx}.png"
8
-
9
-
10
- def images_to_dict(pages: List[Page]):
11
- images = {}
12
- for page in pages:
13
- if page.images is None:
14
- continue
15
- for image_idx, image in enumerate(page.images):
16
- image_filename = get_image_filename(page, image_idx)
17
- images[image_filename] = image
18
- return images
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/layout/layout.py DELETED
@@ -1,113 +0,0 @@
1
- from collections import defaultdict, Counter
2
- from typing import List
3
-
4
- from surya.layout import batch_layout_detection
5
-
6
- from marker.pdf.images import render_image
7
- from marker.schema.bbox import rescale_bbox
8
- from marker.schema.block import bbox_from_lines
9
- from marker.schema.page import Page
10
- from marker.settings import settings
11
-
12
-
13
- def get_batch_size():
14
- if settings.LAYOUT_BATCH_SIZE is not None:
15
- return settings.LAYOUT_BATCH_SIZE
16
- elif settings.TORCH_DEVICE_MODEL == "cuda":
17
- return 6
18
- return 6
19
-
20
-
21
- def surya_layout(images: list, pages: List[Page], layout_model, batch_multiplier=1):
22
- text_detection_results = [p.text_lines for p in pages]
23
-
24
- processor = layout_model.processor
25
- layout_results = batch_layout_detection(images, layout_model, processor, detection_results=text_detection_results, batch_size=int(get_batch_size() * batch_multiplier))
26
- for page, layout_result in zip(pages, layout_results):
27
- page.layout = layout_result
28
-
29
-
30
- def annotate_block_types(pages: List[Page]):
31
- for page in pages:
32
- max_intersections = {}
33
- for i, block in enumerate(page.blocks):
34
- for j, layout_block in enumerate(page.layout.bboxes):
35
- layout_bbox = layout_block.bbox
36
- layout_bbox = rescale_bbox(page.layout.image_bbox, page.bbox, layout_bbox)
37
- intersection_pct = block.intersection_pct(layout_bbox)
38
- if i not in max_intersections:
39
- max_intersections[i] = (intersection_pct, j)
40
- elif intersection_pct > max_intersections[i][0]:
41
- max_intersections[i] = (intersection_pct, j)
42
-
43
- for i, block in enumerate(page.blocks):
44
- block = page.blocks[i]
45
- block_type = None
46
- if i in max_intersections and max_intersections[i][0] > 0.0:
47
- j = max_intersections[i][1]
48
- block_type = page.layout.bboxes[j].label
49
- block.block_type = block_type
50
-
51
- # Smarter block layout assignment - first assign same as closest block
52
- # Next, fall back to text
53
- for i, block in enumerate(page.blocks):
54
- if block.block_type is not None:
55
- continue
56
- min_dist = None
57
- min_dist_idx = None
58
- for j, block2 in enumerate(page.blocks):
59
- if j == i or block2.block_type is None:
60
- continue
61
- dist = block.distance(block2.bbox)
62
- if min_dist_idx is None or dist < min_dist:
63
- min_dist = dist
64
- min_dist_idx = j
65
- for line in block2.lines:
66
- dist = block.distance(line.bbox)
67
- if dist < min_dist:
68
- min_dist = dist
69
- min_dist_idx = j
70
-
71
- if min_dist_idx is not None:
72
- block.block_type = page.blocks[min_dist_idx].block_type
73
-
74
- for i, block in enumerate(page.blocks):
75
- if block.block_type is None:
76
- block.block_type = settings.DEFAULT_BLOCK_TYPE
77
-
78
- def get_layout_label(block_labels: List[str]):
79
- counter = Counter(block_labels)
80
- return counter.most_common(1)[0][0]
81
-
82
- def generate_block(block, block_labels):
83
- block.bbox = bbox_from_lines(block.lines)
84
- block.block_type = get_layout_label(block_labels)
85
- return block
86
-
87
- # Merge blocks together, preserving pdf order
88
- curr_layout_idx = None
89
- curr_layout_block = None
90
- curr_block_labels = []
91
- new_blocks = []
92
- for i in range(len(page.blocks)):
93
- if i not in max_intersections or max_intersections[i][0] == 0:
94
- if curr_layout_block is not None:
95
- new_blocks.append(generate_block(curr_layout_block, curr_block_labels))
96
- curr_layout_block = None
97
- curr_layout_idx = None
98
- curr_block_labels = []
99
- new_blocks.append(page.blocks[i])
100
- elif max_intersections[i][1] != curr_layout_idx:
101
- if curr_layout_block is not None:
102
- new_blocks.append(generate_block(curr_layout_block, curr_block_labels))
103
- curr_layout_block = page.blocks[i].copy()
104
- curr_layout_idx = max_intersections[i][1]
105
- curr_block_labels = [page.blocks[i].block_type]
106
- else:
107
- curr_layout_block.lines.extend(page.blocks[i].lines)
108
- curr_block_labels.append(page.blocks[i].block_type)
109
-
110
- if curr_layout_block is not None:
111
- new_blocks.append(generate_block(curr_layout_block, curr_block_labels))
112
-
113
- page.blocks = new_blocks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/layout/order.py DELETED
@@ -1,73 +0,0 @@
1
- from collections import defaultdict
2
- from typing import List
3
-
4
- from surya.ordering import batch_ordering
5
-
6
- from marker.pdf.images import render_image
7
- from marker.pdf.utils import sort_block_group
8
- from marker.schema.bbox import rescale_bbox
9
- from marker.schema.page import Page
10
- from marker.settings import settings
11
-
12
-
13
- def get_batch_size():
14
- if settings.ORDER_BATCH_SIZE is not None:
15
- return settings.ORDER_BATCH_SIZE
16
- elif settings.TORCH_DEVICE_MODEL == "cuda":
17
- return 6
18
- elif settings.TORCH_DEVICE_MODEL == "mps":
19
- return 6
20
- return 6
21
-
22
-
23
- def surya_order(images: list, pages: List[Page], order_model, batch_multiplier=1):
24
- # Get bboxes for all pages
25
- bboxes = []
26
- for page in pages:
27
- bbox = [b.bbox for b in page.layout.bboxes][:settings.ORDER_MAX_BBOXES]
28
- bboxes.append(bbox)
29
-
30
- processor = order_model.processor
31
- order_results = batch_ordering(images, bboxes, order_model, processor, batch_size=int(get_batch_size() * batch_multiplier))
32
- for page, order_result in zip(pages, order_results):
33
- page.order = order_result
34
-
35
-
36
- def sort_blocks_in_reading_order(pages: List[Page]):
37
- for page in pages:
38
- order = page.order
39
- block_positions = {}
40
- max_position = 0
41
- for i, block in enumerate(page.blocks):
42
- for order_box in order.bboxes:
43
- order_bbox = order_box.bbox
44
- position = order_box.position
45
- order_bbox = rescale_bbox(order.image_bbox, page.bbox, order_bbox)
46
- block_intersection = block.intersection_pct(order_bbox)
47
- if i not in block_positions:
48
- block_positions[i] = (block_intersection, position)
49
- elif block_intersection > block_positions[i][0]:
50
- block_positions[i] = (block_intersection, position)
51
- max_position = max(max_position, position)
52
- block_groups = defaultdict(list)
53
- for i, block in enumerate(page.blocks):
54
- if i in block_positions:
55
- position = block_positions[i][1]
56
- else:
57
- max_position += 1
58
- position = max_position
59
-
60
- block_groups[position].append(block)
61
-
62
- new_blocks = []
63
- for position in sorted(block_groups.keys()):
64
- block_group = sort_block_group(block_groups[position])
65
- new_blocks.extend(block_group)
66
-
67
- # Ensure we properly put footers at the end of the page
68
- footer_blocks = [b for b in new_blocks if b.block_type in ["Footnote", "Page-footer"]]
69
- header_blocks = [b for b in new_blocks if b.block_type in ["Page-header"]]
70
- regular_blocks = [b for b in new_blocks if b.block_type not in ["Footnote", "Page-footer", "Page-header"]]
71
-
72
- new_blocks = header_blocks + regular_blocks + footer_blocks
73
- page.blocks = new_blocks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/logger.py CHANGED
@@ -5,8 +5,5 @@ import warnings
5
  def configure_logging():
6
  logging.basicConfig(level=logging.WARNING)
7
 
8
- logging.getLogger('pdfminer').setLevel(logging.ERROR)
9
  logging.getLogger('PIL').setLevel(logging.ERROR)
10
- logging.getLogger('fitz').setLevel(logging.ERROR)
11
- logging.getLogger('ocrmypdf').setLevel(logging.ERROR)
12
  warnings.simplefilter(action='ignore', category=FutureWarning)
 
5
  def configure_logging():
6
  logging.basicConfig(level=logging.WARNING)
7
 
 
8
  logging.getLogger('PIL').setLevel(logging.ERROR)
 
 
9
  warnings.simplefilter(action='ignore', category=FutureWarning)
marker/models.py CHANGED
@@ -1,21 +1,27 @@
1
  import os
 
2
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
3
 
4
 
5
  from surya.model.detection.model import load_model as load_detection_model, load_processor as load_detection_processor
6
- from surya.model.layout.model import load_model as load_layout_model, load_processor as load_layout_processor
 
7
  from texify.model.model import load_model as load_texify_model
8
  from texify.model.processor import load_processor as load_texify_processor
9
  from marker.settings import settings
10
  from surya.model.recognition.model import load_model as load_recognition_model
11
  from surya.model.recognition.processor import load_processor as load_recognition_processor
12
- from surya.model.ordering.model import load_model as load_order_model
13
- from surya.model.ordering.processor import load_processor as load_order_processor
14
  from surya.model.table_rec.model import load_model as load_table_model
15
  from surya.model.table_rec.processor import load_processor as load_table_processor
16
 
 
 
 
 
 
 
17
 
18
- def setup_table_rec_model(device=None, dtype=None):
19
  if device:
20
  table_model = load_table_model(device=device, dtype=dtype)
21
  else:
@@ -24,7 +30,7 @@ def setup_table_rec_model(device=None, dtype=None):
24
  return table_model
25
 
26
 
27
- def setup_recognition_model(device=None, dtype=None):
28
  if device:
29
  rec_model = load_recognition_model(device=device, dtype=dtype)
30
  else:
@@ -33,7 +39,7 @@ def setup_recognition_model(device=None, dtype=None):
33
  return rec_model
34
 
35
 
36
- def setup_detection_model(device=None, dtype=None):
37
  if device:
38
  model = load_detection_model(device=device, dtype=dtype)
39
  else:
@@ -42,7 +48,7 @@ def setup_detection_model(device=None, dtype=None):
42
  return model
43
 
44
 
45
- def setup_texify_model(device=None, dtype=None):
46
  if device:
47
  texify_model = load_texify_model(checkpoint=settings.TEXIFY_MODEL_NAME, device=device, dtype=dtype)
48
  else:
@@ -51,36 +57,10 @@ def setup_texify_model(device=None, dtype=None):
51
  return texify_model
52
 
53
 
54
- def setup_layout_model(device=None, dtype=None):
55
  if device:
56
  model = load_layout_model(device=device, dtype=dtype)
57
  else:
58
  model = load_layout_model()
59
  model.processor = load_layout_processor()
60
- return model
61
-
62
-
63
- def setup_order_model(device=None, dtype=None):
64
- if device:
65
- model = load_order_model(device=device, dtype=dtype)
66
- else:
67
- model = load_order_model()
68
- model.processor = load_order_processor()
69
- return model
70
-
71
-
72
- def load_all_models(device=None, dtype=None):
73
- if device is not None:
74
- assert dtype is not None, "Must provide dtype if device is provided"
75
-
76
- # langs is optional list of languages to prune from recognition MoE model
77
- detection = setup_detection_model(device, dtype)
78
- layout = setup_layout_model(device, dtype)
79
- order = setup_order_model(device, dtype)
80
-
81
- # Only load recognition model if we'll need it for all pdfs
82
- ocr = setup_recognition_model(device, dtype)
83
- texify = setup_texify_model(device, dtype)
84
- table_model = setup_table_rec_model(device, dtype)
85
- model_lst = [texify, layout, order, detection, ocr, table_model]
86
- return model_lst
 
1
  import os
2
+
3
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
4
 
5
 
6
  from surya.model.detection.model import load_model as load_detection_model, load_processor as load_detection_processor
7
+ from surya.model.layout.model import load_model as load_layout_model
8
+ from surya.model.layout.processor import load_processor as load_layout_processor
9
  from texify.model.model import load_model as load_texify_model
10
  from texify.model.processor import load_processor as load_texify_processor
11
  from marker.settings import settings
12
  from surya.model.recognition.model import load_model as load_recognition_model
13
  from surya.model.recognition.processor import load_processor as load_recognition_processor
 
 
14
  from surya.model.table_rec.model import load_model as load_table_model
15
  from surya.model.table_rec.processor import load_processor as load_table_processor
16
 
17
+ from texify.model.model import GenerateVisionEncoderDecoderModel
18
+ from surya.model.layout.encoderdecoder import SuryaLayoutModel
19
+ from surya.model.detection.model import EfficientViTForSemanticSegmentation
20
+ from surya.model.recognition.encoderdecoder import OCREncoderDecoderModel
21
+ from surya.model.table_rec.encoderdecoder import TableRecEncoderDecoderModel
22
+
23
 
24
+ def setup_table_rec_model(device=None, dtype=None) -> TableRecEncoderDecoderModel:
25
  if device:
26
  table_model = load_table_model(device=device, dtype=dtype)
27
  else:
 
30
  return table_model
31
 
32
 
33
+ def setup_recognition_model(device=None, dtype=None) -> OCREncoderDecoderModel:
34
  if device:
35
  rec_model = load_recognition_model(device=device, dtype=dtype)
36
  else:
 
39
  return rec_model
40
 
41
 
42
+ def setup_detection_model(device=None, dtype=None) -> EfficientViTForSemanticSegmentation:
43
  if device:
44
  model = load_detection_model(device=device, dtype=dtype)
45
  else:
 
48
  return model
49
 
50
 
51
+ def setup_texify_model(device=None, dtype=None) -> GenerateVisionEncoderDecoderModel:
52
  if device:
53
  texify_model = load_texify_model(checkpoint=settings.TEXIFY_MODEL_NAME, device=device, dtype=dtype)
54
  else:
 
57
  return texify_model
58
 
59
 
60
+ def setup_layout_model(device=None, dtype=None) -> SuryaLayoutModel:
61
  if device:
62
  model = load_layout_model(device=device, dtype=dtype)
63
  else:
64
  model = load_layout_model()
65
  model.processor = load_layout_processor()
66
+ return model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/ocr/detection.py DELETED
@@ -1,28 +0,0 @@
1
- from typing import List
2
-
3
- from pypdfium2 import PdfDocument
4
- from surya.detection import batch_text_detection
5
-
6
- from marker.pdf.images import render_image
7
- from marker.schema.page import Page
8
- from marker.settings import settings
9
-
10
-
11
- def get_batch_size():
12
- if settings.DETECTOR_BATCH_SIZE is not None:
13
- return settings.DETECTOR_BATCH_SIZE
14
- elif settings.TORCH_DEVICE_MODEL == "cuda":
15
- return 4
16
- return 4
17
-
18
-
19
- def surya_detection(images: list, pages: List[Page], det_model, batch_multiplier=1):
20
- processor = det_model.processor
21
-
22
- predictions = batch_text_detection(images, det_model, processor, batch_size=int(get_batch_size() * batch_multiplier))
23
- for (page, pred) in zip(pages, predictions):
24
- page.text_lines = pred
25
-
26
-
27
-
28
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/ocr/heuristics.py DELETED
@@ -1,78 +0,0 @@
1
- import re
2
- from typing import List
3
-
4
- from marker.ocr.utils import alphanum_ratio
5
- from marker.schema.bbox import rescale_bbox, box_intersection_pct
6
- from marker.schema.page import Page
7
- from marker.settings import settings
8
-
9
-
10
- def should_ocr_page(page: Page, no_text: bool, ocr_all_pages=False):
11
- detected_lines_found, total_lines = detected_line_coverage(page)
12
-
13
- # No reason to OCR page if it has no text lines
14
- if total_lines == 0:
15
- return False
16
-
17
- # OCR page if we got minimal text, or if we got too many spaces
18
- conditions = [
19
- no_text, # Full doc has no text, and needs full OCR
20
- (len(page.prelim_text) > 0 and detect_bad_ocr(page.prelim_text)), # Bad OCR
21
- detected_lines_found is False, # didn't extract text for all detected lines
22
- ]
23
-
24
- return any(conditions) or ocr_all_pages
25
-
26
-
27
- def detect_bad_ocr(text, space_threshold=.7, newline_threshold=.6, alphanum_threshold=.3):
28
- if len(text) == 0:
29
- # Assume OCR failed if we have no text
30
- return True
31
-
32
- spaces = len(re.findall(r'\s+', text))
33
- alpha_chars = len(re.sub(r'\s+', '', text))
34
- if spaces / (alpha_chars + spaces) > space_threshold:
35
- return True
36
-
37
- newlines = len(re.findall(r'\n+', text))
38
- non_newlines = len(re.sub(r'\n+', '', text))
39
- if newlines / (newlines + non_newlines) > newline_threshold:
40
- return True
41
-
42
- if alphanum_ratio(text) < alphanum_threshold: # Garbled text
43
- return True
44
-
45
- invalid_chars = len([c for c in text if c in settings.INVALID_CHARS])
46
- if invalid_chars > max(6.0, len(text) * .03):
47
- return True
48
-
49
- return False
50
-
51
-
52
- def no_text_found(pages: List[Page]):
53
- full_text = ""
54
- for page in pages:
55
- full_text += page.prelim_text
56
- return len(full_text.strip()) == 0
57
-
58
-
59
- def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.4):
60
- found_lines = 0
61
- for detected_line in page.text_lines.bboxes:
62
- # Get bbox and rescale to match dimensions of original page
63
- detected_bbox = detected_line.bbox
64
- detected_bbox = rescale_bbox(page.text_lines.image_bbox, page.bbox, detected_bbox)
65
-
66
- total_intersection = 0
67
- for block in page.blocks:
68
- for line in block.lines:
69
- intersection_pct = box_intersection_pct(detected_bbox, line.bbox)
70
- total_intersection += intersection_pct
71
- if total_intersection > intersect_thresh:
72
- found_lines += 1
73
-
74
- total_lines = len(page.text_lines.bboxes)
75
- if total_lines == 0:
76
- return True, 0
77
-
78
- return found_lines / total_lines > detection_thresh, total_lines
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/ocr/lang.py DELETED
@@ -1,44 +0,0 @@
1
- from typing import List
2
-
3
- from surya.languages import CODE_TO_LANGUAGE, LANGUAGE_TO_CODE
4
- from surya.model.recognition.tokenizer import _tokenize as lang_tokenize
5
-
6
- from marker.ocr.tesseract import LANGUAGE_TO_TESSERACT_CODE, TESSERACT_CODE_TO_LANGUAGE
7
- from marker.settings import settings
8
-
9
-
10
- def langs_to_ids(langs: List[str]):
11
- unique_langs = list(set(langs))
12
- _, lang_tokens = lang_tokenize("", unique_langs)
13
- return lang_tokens
14
-
15
-
16
- def replace_langs_with_codes(langs):
17
- if settings.OCR_ENGINE == "surya":
18
- if langs is None:
19
- return
20
- for i, lang in enumerate(langs):
21
- if lang.title() in LANGUAGE_TO_CODE:
22
- langs[i] = LANGUAGE_TO_CODE[lang.title()]
23
- else:
24
- if langs is None:
25
- langs = [settings.DEFAULT_LANG]
26
- print(f"No languages specified for tesseract, defaulting to {settings.DEFAULT_LANG}.")
27
-
28
- for i, lang in enumerate(langs):
29
- if lang in LANGUAGE_TO_CODE:
30
- langs[i] = LANGUAGE_TO_TESSERACT_CODE[lang]
31
- return langs
32
-
33
-
34
- def validate_langs(langs):
35
- if settings.OCR_ENGINE == "surya":
36
- if langs is None:
37
- return
38
- for lang in langs:
39
- if lang not in CODE_TO_LANGUAGE:
40
- raise ValueError(f"Invalid language code {lang} for Surya OCR")
41
- else:
42
- for lang in langs:
43
- if lang not in TESSERACT_CODE_TO_LANGUAGE:
44
- raise ValueError(f"Invalid language code {lang} for Tesseract")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/ocr/recognition.py DELETED
@@ -1,182 +0,0 @@
1
- import tempfile
2
- from copy import deepcopy
3
- from itertools import repeat
4
- from typing import List, Optional, Dict
5
-
6
- import pypdfium2 as pdfium
7
- import io
8
- from concurrent.futures import ThreadPoolExecutor
9
-
10
- from surya.ocr import run_recognition
11
-
12
- from marker.models import setup_recognition_model
13
- from marker.ocr.heuristics import should_ocr_page, no_text_found, detect_bad_ocr
14
- from marker.ocr.lang import langs_to_ids
15
- from marker.pdf.images import render_image
16
- from marker.schema.bbox import rescale_bbox
17
- from marker.schema.page import Page
18
- from marker.schema.block import Block, Line, Span
19
- from marker.settings import settings
20
- from marker.pdf.extract_text import get_text_blocks
21
-
22
-
23
- def get_batch_size():
24
- if settings.RECOGNITION_BATCH_SIZE is not None:
25
- return settings.RECOGNITION_BATCH_SIZE
26
- elif settings.TORCH_DEVICE_MODEL == "cuda":
27
- return 32
28
- elif settings.TORCH_DEVICE_MODEL == "mps":
29
- return 32
30
- return 32
31
-
32
-
33
- def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplier=1, ocr_all_pages=False) -> (List[Page], Dict):
34
- ocr_pages = 0
35
- ocr_success = 0
36
- ocr_failed = 0
37
- no_text = no_text_found(pages)
38
- ocr_idxs = []
39
- for pnum, page in enumerate(pages):
40
- ocr_needed = should_ocr_page(page, no_text, ocr_all_pages=ocr_all_pages)
41
- if ocr_needed:
42
- ocr_idxs.append(pnum)
43
- ocr_pages += 1
44
-
45
- # No pages need OCR
46
- if ocr_pages == 0:
47
- return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
48
-
49
- ocr_method = settings.OCR_ENGINE
50
- if ocr_method is None or ocr_method == "None":
51
- return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
52
- elif ocr_method == "surya":
53
- new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages, batch_multiplier=batch_multiplier)
54
- elif ocr_method == "ocrmypdf":
55
- new_pages = tesseract_recognition(doc, ocr_idxs, langs)
56
- else:
57
- raise ValueError(f"Unknown OCR method {ocr_method}")
58
-
59
- for orig_idx, page in zip(ocr_idxs, new_pages):
60
- if detect_bad_ocr(page.prelim_text) or len(page.prelim_text) == 0:
61
- ocr_failed += 1
62
- else:
63
- ocr_success += 1
64
- pages[orig_idx] = page
65
-
66
- return pages, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success, "ocr_engine": ocr_method}
67
-
68
-
69
- def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[Page], batch_multiplier=1) -> List[Optional[Page]]:
70
- # Slice images in higher resolution than detection happened in
71
- images = [render_image(doc[pnum], dpi=settings.SURYA_OCR_DPI) for pnum in page_idxs]
72
- box_scale = settings.SURYA_OCR_DPI / settings.SURYA_DETECTOR_DPI
73
-
74
- processor = rec_model.processor
75
- selected_pages = [p for i, p in enumerate(pages) if i in page_idxs]
76
-
77
- surya_langs = [langs] * len(page_idxs)
78
- detection_results = [p.text_lines.bboxes for p in selected_pages]
79
- polygons = deepcopy([[b.polygon for b in bboxes] for bboxes in detection_results])
80
-
81
- # Scale polygons to get correct image slices
82
- for j, poly in enumerate(polygons):
83
- skip_idxs = []
84
- for z, p in enumerate(poly):
85
- for i in range(len(p)):
86
- p[i] = [int(p[i][0] * box_scale), int(p[i][1] * box_scale)]
87
- x_coords = [p[i][0] for i in range(len(p))]
88
- y_coords = [p[i][1] for i in range(len(p))]
89
- bbox = [min(x_coords), min(y_coords), max(x_coords), max(y_coords)]
90
- if (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) == 0:
91
- skip_idxs.append(z)
92
- if len(skip_idxs) > 0:
93
- polygons[j] = [p for i, p in enumerate(poly) if i not in skip_idxs]
94
-
95
- results = run_recognition(images, surya_langs, rec_model, processor, polygons=polygons, batch_size=int(get_batch_size() * batch_multiplier))
96
-
97
- new_pages = []
98
- for idx, (page_idx, result, old_page) in enumerate(zip(page_idxs, results, selected_pages)):
99
- text_lines = old_page.text_lines
100
- ocr_results = result.text_lines
101
- blocks = []
102
- for i, line in enumerate(ocr_results):
103
- scaled_bbox = rescale_bbox([0, 0, images[idx].size[0], images[idx].size[1]], old_page.text_lines.image_bbox, line.bbox)
104
- block = Block(
105
- bbox=scaled_bbox,
106
- pnum=page_idx,
107
- lines=[Line(
108
- bbox=scaled_bbox,
109
- spans=[Span(
110
- text=line.text,
111
- bbox=scaled_bbox,
112
- span_id=f"{page_idx}_{i}",
113
- font="",
114
- font_weight=0,
115
- font_size=0,
116
- )
117
- ]
118
- )]
119
- )
120
- blocks.append(block)
121
- page = Page(
122
- blocks=blocks,
123
- pnum=page_idx,
124
- bbox=old_page.text_lines.image_bbox,
125
- rotation=0,
126
- text_lines=text_lines,
127
- ocr_method="surya"
128
- )
129
- new_pages.append(page)
130
- return new_pages
131
-
132
-
133
- def tesseract_recognition(doc, page_idxs, langs: List[str]) -> List[Optional[Page]]:
134
- pdf_pages = generate_single_page_pdfs(doc, page_idxs)
135
- with ThreadPoolExecutor(max_workers=settings.OCR_PARALLEL_WORKERS) as executor:
136
- pages = list(executor.map(_tesseract_recognition, pdf_pages, repeat(langs, len(pdf_pages))))
137
-
138
- return pages
139
-
140
-
141
- def generate_single_page_pdfs(doc, page_idxs) -> List[io.BytesIO]:
142
- pdf_pages = []
143
- for page_idx in page_idxs:
144
- blank_doc = pdfium.PdfDocument.new()
145
- blank_doc.import_pages(doc, pages=[page_idx])
146
- assert len(blank_doc) == 1, "Failed to import page"
147
-
148
- in_pdf = io.BytesIO()
149
- blank_doc.save(in_pdf)
150
- in_pdf.seek(0)
151
- pdf_pages.append(in_pdf)
152
- return pdf_pages
153
-
154
-
155
- def _tesseract_recognition(in_pdf, langs: List[str]) -> Optional[Page]:
156
- import ocrmypdf
157
- out_pdf = io.BytesIO()
158
-
159
- ocrmypdf.ocr(
160
- in_pdf,
161
- out_pdf,
162
- language=langs[0],
163
- output_type="pdf",
164
- redo_ocr=None,
165
- force_ocr=True,
166
- progress_bar=False,
167
- optimize=False,
168
- fast_web_view=1e6,
169
- skip_big=15, # skip images larger than 15 megapixels
170
- tesseract_timeout=settings.TESSERACT_TIMEOUT,
171
- tesseract_non_ocr_timeout=settings.TESSERACT_TIMEOUT,
172
- )
173
-
174
- with tempfile.NamedTemporaryFile() as f:
175
- f.write(out_pdf.getvalue())
176
- f.seek(0)
177
- new_doc = pdfium.PdfDocument(f.name)
178
- blocks, _ = get_text_blocks(new_doc, f.name, max_pages=1)
179
-
180
- page = blocks[0]
181
- page.ocr_method = "tesseract"
182
- return page
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/ocr/tesseract.py DELETED
@@ -1,97 +0,0 @@
1
- LANGUAGE_TO_TESSERACT_CODE = {
2
- 'Afrikaans': 'afr',
3
- 'Amharic': 'amh',
4
- 'Arabic': 'ara',
5
- 'Assamese': 'asm',
6
- 'Azerbaijani': 'aze',
7
- 'Belarusian': 'bel',
8
- 'Bulgarian': 'bul',
9
- 'Bengali': 'ben',
10
- 'Breton': 'bre',
11
- 'Bosnian': 'bos',
12
- 'Catalan': 'cat',
13
- 'Czech': 'ces',
14
- 'Welsh': 'cym',
15
- 'Danish': 'dan',
16
- 'German': 'deu',
17
- 'Greek': 'ell',
18
- 'English': 'eng',
19
- 'Esperanto': 'epo',
20
- 'Spanish': 'spa',
21
- 'Estonian': 'est',
22
- 'Basque': 'eus',
23
- 'Persian': 'fas',
24
- 'Finnish': 'fin',
25
- 'French': 'fra',
26
- 'Western Frisian': 'fry',
27
- 'Irish': 'gle',
28
- 'Scottish Gaelic': 'gla',
29
- 'Galician': 'glg',
30
- 'Gujarati': 'guj',
31
- 'Hausa': 'hau',
32
- 'Hebrew': 'heb',
33
- 'Hindi': 'hin',
34
- 'Croatian': 'hrv',
35
- 'Hungarian': 'hun',
36
- 'Armenian': 'hye',
37
- 'Indonesian': 'ind',
38
- 'Icelandic': 'isl',
39
- 'Italian': 'ita',
40
- 'Japanese': 'jpn',
41
- 'Javanese': 'jav',
42
- 'Georgian': 'kat',
43
- 'Kazakh': 'kaz',
44
- 'Khmer': 'khm',
45
- 'Kannada': 'kan',
46
- 'Korean': 'kor',
47
- 'Kurdish': 'kur',
48
- 'Kyrgyz': 'kir',
49
- 'Latin': 'lat',
50
- 'Lao': 'lao',
51
- 'Lithuanian': 'lit',
52
- 'Latvian': 'lav',
53
- 'Malagasy': 'mlg',
54
- 'Macedonian': 'mkd',
55
- 'Malayalam': 'mal',
56
- 'Mongolian': 'mon',
57
- 'Marathi': 'mar',
58
- 'Malay': 'msa',
59
- 'Burmese': 'mya',
60
- 'Nepali': 'nep',
61
- 'Dutch': 'nld',
62
- 'Norwegian': 'nor',
63
- 'Oromo': 'orm',
64
- 'Oriya': 'ori',
65
- 'Punjabi': 'pan',
66
- 'Polish': 'pol',
67
- 'Pashto': 'pus',
68
- 'Portuguese': 'por',
69
- 'Romanian': 'ron',
70
- 'Russian': 'rus',
71
- 'Sanskrit': 'san',
72
- 'Sindhi': 'snd',
73
- 'Sinhala': 'sin',
74
- 'Slovak': 'slk',
75
- 'Slovenian': 'slv',
76
- 'Somali': 'som',
77
- 'Albanian': 'sqi',
78
- 'Serbian': 'srp',
79
- 'Sundanese': 'sun',
80
- 'Swedish': 'swe',
81
- 'Swahili': 'swa',
82
- 'Tamil': 'tam',
83
- 'Telugu': 'tel',
84
- 'Thai': 'tha',
85
- 'Tagalog': 'tgl',
86
- 'Turkish': 'tur',
87
- 'Uyghur': 'uig',
88
- 'Ukrainian': 'ukr',
89
- 'Urdu': 'urd',
90
- 'Uzbek': 'uzb',
91
- 'Vietnamese': 'vie',
92
- 'Xhosa': 'xho',
93
- 'Yiddish': 'yid',
94
- 'Chinese': 'chi_sim',
95
- }
96
-
97
- TESSERACT_CODE_TO_LANGUAGE = {v:k for k,v in LANGUAGE_TO_TESSERACT_CODE.items()}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/pdf/extract_text.py DELETED
@@ -1,114 +0,0 @@
1
- import os
2
- from typing import List, Optional, Dict
3
-
4
- import pypdfium2 as pdfium
5
-
6
- from marker.cleaners.toc import get_pdf_toc
7
- from marker.pdf.utils import font_flags_decomposer
8
- from marker.settings import settings
9
- from marker.schema.block import Span, Line, Block
10
- from marker.schema.page import Page
11
- from pdftext.extraction import dictionary_output
12
-
13
- os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
14
-
15
-
16
- def pdftext_format_to_blocks(page, pnum: int) -> Page:
17
- page_blocks = []
18
- span_id = 0
19
- for block_idx, block in enumerate(page["blocks"]):
20
- for l in block["lines"]:
21
- block_lines = []
22
- spans = []
23
- for i, s in enumerate(l["spans"]):
24
- block_text = s["text"]
25
- # Remove trailing newlines and carriage returns (tesseract)
26
- while len(block_text) > 0 and block_text[-1] in ["\n", "\r"]:
27
- block_text = block_text[:-1]
28
-
29
- block_text = block_text.replace("-\n", "") # Remove hyphenated line breaks
30
- span_obj = Span(
31
- text=block_text, # Remove end of line newlines, not spaces
32
- bbox=s["bbox"],
33
- span_id=f"{pnum}_{span_id}",
34
- font=f"{s['font']['name']}_{font_flags_decomposer(s['font']['flags'])}", # Add font flags to end of font
35
- font_weight=s["font"]["weight"],
36
- font_size=s["font"]["size"],
37
- )
38
- spans.append(span_obj) # Text, bounding box, span id
39
- span_id += 1
40
- line_obj = Line(
41
- spans=spans,
42
- bbox=l["bbox"],
43
- )
44
- # Only select valid lines, with positive bboxes
45
- if line_obj.area >= 0:
46
- block_lines.append(line_obj)
47
-
48
- # Each block is a single line
49
- block_obj = Block(
50
- lines=block_lines,
51
- bbox=l["bbox"],
52
- pnum=pnum
53
- )
54
- # Only select blocks with lines
55
- if len(block_lines) > 0:
56
- page_blocks.append(block_obj)
57
-
58
- page_bbox = page["bbox"]
59
- page_width = abs(page_bbox[2] - page_bbox[0])
60
- page_height = abs(page_bbox[3] - page_bbox[1])
61
- rotation = page["rotation"]
62
-
63
- # Flip width and height if rotated
64
- if rotation == 90 or rotation == 270:
65
- page_width, page_height = page_height, page_width
66
-
67
- char_blocks = page["blocks"]
68
- page_bbox = [0, 0, page_width, page_height]
69
- out_page = Page(
70
- blocks=page_blocks,
71
- pnum=page["page"],
72
- bbox=page_bbox,
73
- rotation=rotation,
74
- char_blocks=char_blocks
75
- )
76
- return out_page
77
-
78
-
79
- def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Optional[int] = None) -> (List[Page], Dict):
80
- toc = get_pdf_toc(doc)
81
-
82
- if start_page:
83
- assert start_page < len(doc)
84
- else:
85
- start_page = 0
86
-
87
- if max_pages:
88
- if max_pages + start_page > len(doc):
89
- max_pages = len(doc) - start_page
90
- else:
91
- max_pages = len(doc) - start_page
92
-
93
- page_range = range(start_page, start_page + max_pages)
94
-
95
- char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=False, workers=settings.PDFTEXT_CPU_WORKERS, flatten_pdf=settings.FLATTEN_PDF)
96
- marker_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(char_blocks)]
97
-
98
- return marker_blocks, toc
99
-
100
-
101
- def naive_get_text(doc):
102
- full_text = ""
103
- for page_idx in range(len(doc)):
104
- page = doc.get_page(page_idx)
105
- text_page = page.get_textpage()
106
- full_text += text_page.get_text_bounded() + "\n"
107
- return full_text
108
-
109
-
110
- def get_length_of_text(fname: str) -> int:
111
- doc = pdfium.PdfDocument(fname)
112
- text = naive_get_text(doc).strip()
113
-
114
- return len(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/pdf/images.py DELETED
@@ -1,27 +0,0 @@
1
- import pypdfium2 as pdfium
2
- from pypdfium2 import PdfPage
3
-
4
- from marker.schema.page import Page
5
- from marker.schema.bbox import rescale_bbox
6
- from marker.settings import settings
7
-
8
-
9
- def render_image(page: pdfium.PdfPage, dpi):
10
- image = page.render(
11
- scale=dpi / 72,
12
- draw_annots=False
13
- ).to_pil()
14
- image = image.convert("RGB")
15
- return image
16
-
17
-
18
- def render_bbox_image(page_obj: PdfPage, page: Page, bbox):
19
- png_image = render_image(page_obj, settings.IMAGE_DPI)
20
- # Rescale original pdf bbox bounds to match png image size
21
- png_bbox = [0, 0, png_image.size[0], png_image.size[1]]
22
- rescaled_merged = rescale_bbox(page.bbox, png_bbox, bbox)
23
-
24
- # Crop out only the equation image
25
- png_image = png_image.crop(rescaled_merged)
26
- png_image = png_image.convert("RGB")
27
- return png_image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/pdf/utils.py DELETED
@@ -1,75 +0,0 @@
1
- from typing import Optional
2
-
3
- import filetype
4
-
5
- from marker.settings import settings
6
-
7
-
8
- def find_filetype(fpath):
9
- kind = filetype.guess(fpath)
10
- if kind is None:
11
- print(f"Could not determine filetype for {fpath}")
12
- return "other"
13
-
14
- mimetype = kind.mime
15
-
16
- # Get extensions from mimetype
17
- # The mimetype is not always consistent, so use in to check the most common formats
18
- if "pdf" in mimetype:
19
- return "pdf"
20
- elif mimetype in settings.SUPPORTED_FILETYPES:
21
- return settings.SUPPORTED_FILETYPES[mimetype]
22
- else:
23
- print(f"Found nonstandard filetype {mimetype}")
24
- return "other"
25
-
26
-
27
- def font_flags_decomposer(flags: Optional[int]) -> str:
28
- if flags is None:
29
- return ""
30
-
31
- flag_descriptions = []
32
- if flags & (1 << 0): # PDFFONT_FIXEDPITCH
33
- flag_descriptions.append("fixed_pitch")
34
- if flags & (1 << 1): # PDFFONT_SERIF
35
- flag_descriptions.append("serif")
36
- if flags & (1 << 2): # PDFFONT_SYMBOLIC
37
- flag_descriptions.append("symbolic")
38
- if flags & (1 << 3): # PDFFONT_SCRIPT
39
- flag_descriptions.append("script")
40
- if flags & (1 << 5): # PDFFONT_NONSYMBOLIC
41
- flag_descriptions.append("non_symbolic")
42
- if flags & (1 << 6): # PDFFONT_ITALIC
43
- flag_descriptions.append("italic")
44
- if flags & (1 << 16): # PDFFONT_ALLCAP
45
- flag_descriptions.append("all_cap")
46
- if flags & (1 << 17): # PDFFONT_SMALLCAP
47
- flag_descriptions.append("small_cap")
48
- if flags & (1 << 18): # PDFFONT_FORCEBOLD
49
- flag_descriptions.append("bold")
50
- if flags & (1 << 19): # PDFFONT_USEEXTERNATTR
51
- flag_descriptions.append("use_extern_attr")
52
-
53
- return "_".join(flag_descriptions)
54
-
55
-
56
- def sort_block_group(blocks, tolerance=1.25):
57
- vertical_groups = {}
58
- for block in blocks:
59
- if hasattr(block, "bbox"):
60
- bbox = block.bbox
61
- else:
62
- bbox = block["bbox"]
63
-
64
- group_key = round(bbox[1] / tolerance) * tolerance
65
- if group_key not in vertical_groups:
66
- vertical_groups[group_key] = []
67
- vertical_groups[group_key].append(block)
68
-
69
- # Sort each group horizontally and flatten the groups into a single list
70
- sorted_blocks = []
71
- for _, group in sorted(vertical_groups.items()):
72
- sorted_group = sorted(group, key=lambda x: x.bbox[0] if hasattr(x, "bbox") else x["bbox"][0])
73
- sorted_blocks.extend(sorted_group)
74
-
75
- return sorted_blocks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/postprocessors/markdown.py DELETED
@@ -1,254 +0,0 @@
1
- from marker.schema.merged import MergedLine, MergedBlock, FullyMergedBlock
2
- from marker.schema.page import Page
3
- import re
4
- import regex
5
- from typing import List
6
- from copy import deepcopy
7
-
8
- from marker.settings import settings
9
-
10
-
11
- def escape_markdown(text):
12
- # List of characters that need to be escaped in markdown
13
- characters_to_escape = r"[#]"
14
- # Escape each of these characters with a backslash
15
- escaped_text = re.sub(characters_to_escape, r'\\\g<0>', text)
16
- return escaped_text
17
-
18
-
19
- def surround_text(s, char_to_insert):
20
- leading_whitespace = re.match(r'^(\s*)', s).group(1)
21
- trailing_whitespace = re.search(r'(\s*)$', s).group(1)
22
- stripped_string = s.strip()
23
- modified_string = char_to_insert + stripped_string + char_to_insert
24
- final_string = leading_whitespace + modified_string + trailing_whitespace
25
- return final_string
26
-
27
-
28
- def merge_spans(pages: List[Page]) -> List[List[MergedBlock]]:
29
- merged_blocks = []
30
- for page in pages:
31
- page_blocks = []
32
- for blocknum, block in enumerate(page.blocks):
33
- block_lines = []
34
- for linenum, line in enumerate(block.lines):
35
- line_text = ""
36
- if len(line.spans) == 0:
37
- continue
38
- fonts = []
39
- for i, span in enumerate(line.spans):
40
- font = span.font.lower()
41
- next_span = None
42
- next_idx = 1
43
- while len(line.spans) > i + next_idx:
44
- next_span = line.spans[i + next_idx]
45
- next_idx += 1
46
- if len(next_span.text.strip()) > 2:
47
- break
48
-
49
- fonts.append(font)
50
- span_text = span.text
51
-
52
- # Don't bold or italicize very short sequences
53
- # Avoid bolding first and last sequence so lines can be joined properly
54
- if len(span_text) > 3 and 0 < i < len(line.spans) - 1:
55
- if span.italic and (not next_span or not next_span.italic):
56
- span_text = surround_text(span_text, "*")
57
- elif span.bold and (not next_span or not next_span.bold):
58
- span_text = surround_text(span_text, "**")
59
- line_text += span_text
60
- block_lines.append(MergedLine(
61
- text=line_text,
62
- fonts=fonts,
63
- bbox=line.bbox
64
- ))
65
- if len(block_lines) > 0:
66
- page_blocks.append(MergedBlock(
67
- lines=block_lines,
68
- pnum=page.pnum,
69
- bbox=block.bbox,
70
- block_type=block.block_type,
71
- heading_level=block.heading_level
72
- ))
73
- if len(page_blocks) == 0:
74
- page_blocks.append(MergedBlock(
75
- lines=[],
76
- pnum=page.pnum,
77
- bbox=page.bbox,
78
- block_type="Text",
79
- heading_level=None
80
- ))
81
- merged_blocks.append(page_blocks)
82
-
83
- return merged_blocks
84
-
85
-
86
- def block_surround(text, block_type, heading_level):
87
- if block_type == "Section-header":
88
- if not text.startswith("#"):
89
- asterisks = "#" * heading_level if heading_level is not None else "##"
90
- text = f"\n{asterisks} " + text.strip().title() + "\n"
91
- elif block_type == "Title":
92
- if not text.startswith("#"):
93
- text = "# " + text.strip().title() + "\n"
94
- elif block_type == "Table":
95
- text = "\n" + text + "\n"
96
- elif block_type == "List-item":
97
- text = escape_markdown(text.rstrip()) + "\n"
98
- elif block_type == "Code":
99
- text = "\n```\n" + text + "\n```\n"
100
- elif block_type == "Text":
101
- text = escape_markdown(text)
102
- elif block_type == "Formula":
103
- if text.strip().startswith("$$") and text.strip().endswith("$$"):
104
- text = text.strip()
105
- text = "\n" + text + "\n"
106
- elif block_type == "Caption":
107
- text = "\n" + escape_markdown(text) + "\n"
108
- return text
109
-
110
-
111
- def line_separator(block_text: str, prev_line: MergedLine, line: MergedLine, block_type: str, new_column: bool, new_page: bool, new_block: bool) -> str:
112
- lowercase_letters = r'\p{Ll}|\d'
113
- hyphens = r'-—¬'
114
-
115
- hyphen_regex = regex.compile(rf'.*[{hyphens}]\s?$', regex.DOTALL)
116
- hyphens_lowercase_regex = regex.compile(rf'.*[{lowercase_letters}][{hyphens}]\s?$', regex.DOTALL)
117
- line_starts_lowercase = regex.match(rf"^\s?[{lowercase_letters}]", line.text)
118
- prev_has_reference = regex.match(r"^\[\d+\]\s+[A-Z]", prev_line.text)
119
- has_reference = regex.match(r"^\[\d+\]\s+[A-Z]", line.text)
120
- has_numbered_item = regex.match(r"^\d+:\s+", line.text)
121
-
122
- line_text = line.text.lstrip()
123
- block_text = block_text.rstrip()
124
-
125
- if block_type in ["Text", "List-item", "Footnote", "Caption", "Figure"]:
126
- if has_reference or has_numbered_item:
127
- return block_text + "\n\n" + line_text
128
- elif hyphen_regex.match(block_text):
129
- if line_starts_lowercase and hyphens_lowercase_regex.match(block_text):
130
- return regex.split(rf"[{hyphens}]\s?$", block_text)[0].rstrip() + line_text
131
- return block_text + line_text
132
- elif new_page or new_column:
133
- if line_starts_lowercase:
134
- return block_text + " " + line_text
135
- return block_text + "\n\n" + line_text
136
- elif new_block:
137
- if prev_has_reference:
138
- return block_text + " " + line_text
139
- return block_text + "\n\n" + line_text
140
- else:
141
- # General case for joining lines with a space
142
- return block_text + " " + line_text
143
- elif block_type in ["Title", "Section-header"]:
144
- return block_text + " " + line_text
145
- elif block_type in ["Formula"]:
146
- return block_text + "\n" + line_text
147
- elif block_type in ["Code", "Table"]:
148
- return block_text + "\n\n" + line_text
149
- else:
150
- return block_text + " " + line_text
151
-
152
-
153
- def block_separator(prev_block: FullyMergedBlock, block: FullyMergedBlock):
154
- sep = "\n"
155
- if prev_block.block_type == "Text":
156
- sep = "\n\n"
157
-
158
- return sep + block.text
159
-
160
- def merge_lines(blocks: List[List[MergedBlock]], min_new_block_x_indent_percent=5.0):
161
- text_blocks = []
162
- prev_block = None
163
- prev_type = None
164
- prev_line = None
165
- block_text = ""
166
- block_type = ""
167
- prev_heading_level = None
168
- pnum = None
169
-
170
- for page_id, page in enumerate(blocks):
171
- # Insert pagination at every page boundary
172
- if settings.PAGINATE_OUTPUT:
173
- if block_text:
174
- text_blocks.append(
175
- FullyMergedBlock(
176
- text=block_surround(block_text, prev_type, prev_heading_level),
177
- block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE,
178
- page_start=False,
179
- pnum=pnum
180
- )
181
- )
182
- block_text = ""
183
- text_blocks.append(
184
- FullyMergedBlock(
185
- text="",
186
- block_type="Text",
187
- page_start=True,
188
- pnum=page[0].pnum
189
- )
190
- )
191
- for block_id, block in enumerate(page):
192
- first_block_in_page = block_id == 0
193
- block_type = block.block_type
194
- if (block_type != prev_type and prev_type) or (block.heading_level != prev_heading_level and prev_heading_level):
195
- text_blocks.append(
196
- FullyMergedBlock(
197
- text=block_surround(block_text, prev_type, prev_heading_level),
198
- block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE,
199
- page_start=False,
200
- pnum=block.pnum
201
- )
202
- )
203
- block_text = ""
204
- # Join lines in the block together properly
205
- for line_id, line in enumerate(block.lines):
206
- first_line_in_block = line_id == 0
207
- if prev_line is None:
208
- prev_line = deepcopy(line)
209
- if prev_block is None:
210
- prev_block = deepcopy(block)
211
- x_indent = line.x_start - prev_line.x_start
212
- y_indent = line.y_start - prev_line.y_start
213
- new_line = y_indent > prev_line.height
214
- new_column = line.x_start > prev_block.x_end
215
- new_block = first_line_in_block or \
216
- ( # we consider it a new block when there's an x indent from the previous line and it's a new line (y indent)
217
- ((x_indent/block.width) * 100) > min_new_block_x_indent_percent and new_line
218
- )
219
- new_page = first_line_in_block and first_block_in_page
220
- if block_text:
221
- block_text = line_separator(block_text, prev_line, line, block_type, new_column, new_page, new_block)
222
- else:
223
- block_text = line.text
224
- prev_line = line
225
- prev_block = block
226
- prev_type = block_type
227
- prev_heading_level = block.heading_level
228
- pnum = block.pnum
229
- # Append the final block
230
- text_blocks.append(
231
- FullyMergedBlock(
232
- text=block_surround(block_text, prev_type, prev_heading_level),
233
- block_type=block_type if block_type else settings.DEFAULT_BLOCK_TYPE,
234
- page_start=False,
235
- pnum=pnum
236
- )
237
- )
238
-
239
- text_blocks = [block for block in text_blocks if (block.text.strip() or block.page_start)]
240
- return text_blocks
241
-
242
-
243
- def get_full_text(text_blocks):
244
- full_text = ""
245
- prev_block = None
246
- for block in text_blocks:
247
- if block.page_start:
248
- full_text += "\n\n{" + str(block.pnum) + "}" + settings.PAGE_SEPARATOR
249
- elif prev_block:
250
- full_text += block_separator(prev_block, block)
251
- else:
252
- full_text += block.text
253
- prev_block = block
254
- return full_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/{v2/processors → processors}/__init__.py RENAMED
@@ -2,9 +2,9 @@ from typing import Optional, Tuple
2
 
3
  from pydantic import BaseModel
4
 
5
- from marker.v2.schema import BlockTypes
6
- from marker.v2.schema.document import Document
7
- from marker.v2.util import assign_config
8
 
9
 
10
  class BaseProcessor:
 
2
 
3
  from pydantic import BaseModel
4
 
5
+ from marker.schema import BlockTypes
6
+ from marker.schema.document import Document
7
+ from marker.util import assign_config
8
 
9
 
10
  class BaseProcessor:
marker/{v2/processors → processors}/code.py RENAMED
@@ -1,7 +1,7 @@
1
- from marker.v2.processors import BaseProcessor
2
- from marker.v2.schema import BlockTypes
3
- from marker.v2.schema.blocks import Code
4
- from marker.v2.schema.document import Document
5
 
6
 
7
  class CodeProcessor(BaseProcessor):
 
1
+ from marker.processors import BaseProcessor
2
+ from marker.schema import BlockTypes
3
+ from marker.schema.blocks import Code
4
+ from marker.schema.document import Document
5
 
6
 
7
  class CodeProcessor(BaseProcessor):
marker/{v2/processors → processors}/debug.py RENAMED
@@ -5,9 +5,9 @@ import requests
5
  from PIL import Image, ImageDraw, ImageFont
6
 
7
  from marker.settings import settings
8
- from marker.v2.processors import BaseProcessor
9
- from marker.v2.schema import BlockTypes
10
- from marker.v2.schema.document import Document
11
 
12
 
13
  class DebugProcessor(BaseProcessor):
 
5
  from PIL import Image, ImageDraw, ImageFont
6
 
7
  from marker.settings import settings
8
+ from marker.processors import BaseProcessor
9
+ from marker.schema import BlockTypes
10
+ from marker.schema.document import Document
11
 
12
 
13
  class DebugProcessor(BaseProcessor):
marker/{v2/processors → processors}/document_toc.py RENAMED
@@ -1,6 +1,6 @@
1
- from marker.v2.processors import BaseProcessor
2
- from marker.v2.schema import BlockTypes
3
- from marker.v2.schema.document import Document
4
 
5
 
6
  class DocumentTOCProcessor(BaseProcessor):
 
1
+ from marker.processors import BaseProcessor
2
+ from marker.schema import BlockTypes
3
+ from marker.schema.document import Document
4
 
5
 
6
  class DocumentTOCProcessor(BaseProcessor):
marker/{v2/processors → processors}/equation.py RENAMED
@@ -5,9 +5,9 @@ from texify.model.model import GenerateVisionEncoderDecoderModel
5
  from tqdm import tqdm
6
 
7
  from marker.settings import settings
8
- from marker.v2.processors import BaseProcessor
9
- from marker.v2.schema import BlockTypes
10
- from marker.v2.schema.document import Document
11
 
12
 
13
  class EquationProcessor(BaseProcessor):
 
5
  from tqdm import tqdm
6
 
7
  from marker.settings import settings
8
+ from marker.processors import BaseProcessor
9
+ from marker.schema import BlockTypes
10
+ from marker.schema.document import Document
11
 
12
 
13
  class EquationProcessor(BaseProcessor):
marker/{v2/processors → processors}/ignoretext.py RENAMED
@@ -1,8 +1,8 @@
1
  from collections import Counter
2
 
3
- from marker.v2.processors import BaseProcessor
4
- from marker.v2.schema import BlockTypes
5
- from marker.v2.schema.document import Document
6
 
7
 
8
  class IgnoreTextProcessor(BaseProcessor):
 
1
  from collections import Counter
2
 
3
+ from marker.processors import BaseProcessor
4
+ from marker.schema import BlockTypes
5
+ from marker.schema.document import Document
6
 
7
 
8
  class IgnoreTextProcessor(BaseProcessor):
marker/{v2/processors → processors}/sectionheader.py RENAMED
@@ -1,6 +1,6 @@
1
- from marker.v2.processors import BaseProcessor
2
- from marker.v2.schema import BlockTypes
3
- from marker.v2.schema.document import Document
4
 
5
  from typing import Dict, List
6
  import numpy as np
 
1
+ from marker.processors import BaseProcessor
2
+ from marker.schema import BlockTypes
3
+ from marker.schema.document import Document
4
 
5
  from typing import Dict, List
6
  import numpy as np
marker/{v2/processors → processors}/table.py RENAMED
@@ -7,9 +7,9 @@ from tabled.assignment import assign_rows_columns
7
  from tabled.inference.recognition import get_cells, recognize_tables
8
 
9
  from marker.settings import settings
10
- from marker.v2.processors import BaseProcessor
11
- from marker.v2.schema import BlockTypes
12
- from marker.v2.schema.document import Document
13
 
14
 
15
  class TableProcessor(BaseProcessor):
 
7
  from tabled.inference.recognition import get_cells, recognize_tables
8
 
9
  from marker.settings import settings
10
+ from marker.processors import BaseProcessor
11
+ from marker.schema import BlockTypes
12
+ from marker.schema.document import Document
13
 
14
 
15
  class TableProcessor(BaseProcessor):
marker/{v2/processors → processors}/text.py RENAMED
@@ -3,10 +3,10 @@ from typing import List
3
 
4
  import regex
5
 
6
- from marker.v2.processors import BaseProcessor
7
- from marker.v2.schema import BlockTypes
8
- from marker.v2.schema.document import Document
9
- from marker.v2.schema.text.line import Line
10
 
11
 
12
  class TextProcessor(BaseProcessor):
 
3
 
4
  import regex
5
 
6
+ from marker.processors import BaseProcessor
7
+ from marker.schema import BlockTypes
8
+ from marker.schema.document import Document
9
+ from marker.schema.text.line import Line
10
 
11
 
12
  class TextProcessor(BaseProcessor):
marker/{v2/providers → providers}/__init__.py RENAMED
@@ -2,9 +2,9 @@ from typing import List, Optional, Dict
2
 
3
  from pydantic import BaseModel
4
 
5
- from marker.v2.schema.text import Span
6
- from marker.v2.schema.text.line import Line
7
- from marker.v2.util import assign_config
8
 
9
 
10
  class ProviderOutput(BaseModel):
 
2
 
3
  from pydantic import BaseModel
4
 
5
+ from marker.schema.text import Span
6
+ from marker.schema.text.line import Line
7
+ from marker.util import assign_config
8
 
9
 
10
  class ProviderOutput(BaseModel):
marker/{v2/providers → providers}/pdf.py RENAMED
@@ -1,18 +1,19 @@
1
  import atexit
2
  import functools
 
3
  from typing import List, Set
4
 
5
  import pypdfium2 as pdfium
6
  from pdftext.extraction import dictionary_output
7
  from PIL import Image
8
 
9
- from marker.ocr.heuristics import detect_bad_ocr
10
- from marker.v2.providers import BaseProvider, ProviderOutput, ProviderPageLines
11
- from marker.v2.schema.polygon import PolygonBox
12
- from marker.v2.schema import BlockTypes
13
- from marker.v2.schema.registry import get_block_class
14
- from marker.v2.schema.text.line import Line
15
- from marker.v2.schema.text.span import Span
16
 
17
 
18
  class PdfProvider(BaseProvider):
@@ -20,6 +21,10 @@ class PdfProvider(BaseProvider):
20
  pdftext_workers: int = 4
21
  flatten_pdf: bool = True
22
  force_ocr: bool = False
 
 
 
 
23
 
24
  def __init__(self, filepath: str, config=None):
25
  super().__init__(filepath, config)
@@ -153,10 +158,34 @@ class PdfProvider(BaseProvider):
153
  text = text + "\n"
154
  if len(text.strip()) == 0:
155
  return False
156
- if detect_bad_ocr(text):
157
  return False
158
  return True
159
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  @functools.lru_cache(maxsize=None)
161
  def get_image(self, idx: int, dpi: int) -> Image.Image:
162
  page = self.doc[idx]
 
1
  import atexit
2
  import functools
3
+ import re
4
  from typing import List, Set
5
 
6
  import pypdfium2 as pdfium
7
  from pdftext.extraction import dictionary_output
8
  from PIL import Image
9
 
10
+ from marker.providers.utils import alphanum_ratio
11
+ from marker.providers import BaseProvider, ProviderOutput, ProviderPageLines
12
+ from marker.schema.polygon import PolygonBox
13
+ from marker.schema import BlockTypes
14
+ from marker.schema.registry import get_block_class
15
+ from marker.schema.text.line import Line
16
+ from marker.schema.text.span import Span
17
 
18
 
19
  class PdfProvider(BaseProvider):
 
21
  pdftext_workers: int = 4
22
  flatten_pdf: bool = True
23
  force_ocr: bool = False
24
+ ocr_invalid_chars: tuple = (chr(0xfffd), "οΏ½")
25
+ ocr_space_threshold: float = .7
26
+ ocr_newline_threshold: float = .6
27
+ ocr_alphanum_threshold: float = .3
28
 
29
  def __init__(self, filepath: str, config=None):
30
  super().__init__(filepath, config)
 
158
  text = text + "\n"
159
  if len(text.strip()) == 0:
160
  return False
161
+ if self.detect_bad_ocr(text):
162
  return False
163
  return True
164
 
165
+ def detect_bad_ocr(self, text):
166
+ if len(text) == 0:
167
+ # Assume OCR failed if we have no text
168
+ return True
169
+
170
+ spaces = len(re.findall(r'\s+', text))
171
+ alpha_chars = len(re.sub(r'\s+', '', text))
172
+ if spaces / (alpha_chars + spaces) > self.ocr_space_threshold:
173
+ return True
174
+
175
+ newlines = len(re.findall(r'\n+', text))
176
+ non_newlines = len(re.sub(r'\n+', '', text))
177
+ if newlines / (newlines + non_newlines) > self.ocr_newline_threshold:
178
+ return True
179
+
180
+ if alphanum_ratio(text) < self.ocr_alphanum_threshold: # Garbled text
181
+ return True
182
+
183
+ invalid_chars = len([c for c in text if c in self.ocr_invalid_chars])
184
+ if invalid_chars > max(6.0, len(text) * .03):
185
+ return True
186
+
187
+ return False
188
+
189
  @functools.lru_cache(maxsize=None)
190
  def get_image(self, idx: int, dpi: int) -> Image.Image:
191
  page = self.doc[idx]
marker/{ocr → providers}/utils.py RENAMED
@@ -1,3 +1,6 @@
 
 
 
1
  def alphanum_ratio(text):
2
  text = text.replace(" ", "")
3
  text = text.replace("\n", "")
@@ -7,4 +10,4 @@ def alphanum_ratio(text):
7
  return 1
8
 
9
  ratio = alphanumeric_count / len(text)
10
- return ratio
 
1
+ import re
2
+
3
+
4
  def alphanum_ratio(text):
5
  text = text.replace(" ", "")
6
  text = text.replace("\n", "")
 
10
  return 1
11
 
12
  ratio = alphanumeric_count / len(text)
13
+ return ratio
marker/{v2/renderers → renderers}/__init__.py RENAMED
@@ -6,9 +6,9 @@ from typing import Optional
6
  from bs4 import BeautifulSoup
7
  from pydantic import BaseModel
8
 
9
- from marker.v2.schema import BlockTypes
10
- from marker.v2.schema.blocks.base import BlockOutput, BlockId
11
- from marker.v2.util import assign_config
12
 
13
 
14
  class BaseRenderer:
 
6
  from bs4 import BeautifulSoup
7
  from pydantic import BaseModel
8
 
9
+ from marker.schema import BlockTypes
10
+ from marker.schema.blocks.base import BlockOutput, BlockId
11
+ from marker.util import assign_config
12
 
13
 
14
  class BaseRenderer:
marker/{v2/renderers → renderers}/html.py RENAMED
@@ -3,9 +3,9 @@ import re
3
  from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
4
  from pydantic import BaseModel
5
 
6
- from marker.v2.renderers import BaseRenderer
7
- from marker.v2.schema import BlockTypes
8
- from marker.v2.schema.blocks import BlockId
9
 
10
  # Ignore beautifulsoup warnings
11
  import warnings
 
3
  from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
4
  from pydantic import BaseModel
5
 
6
+ from marker.renderers import BaseRenderer
7
+ from marker.schema import BlockTypes
8
+ from marker.schema.blocks import BlockId
9
 
10
  # Ignore beautifulsoup warnings
11
  import warnings
marker/{v2/renderers → renderers}/json.py RENAMED
@@ -7,11 +7,11 @@ from typing import List, Dict
7
  from bs4 import BeautifulSoup
8
  from pydantic import BaseModel
9
 
10
- from marker.v2.schema.blocks import Block
11
- from marker.v2.renderers import BaseRenderer
12
- from marker.v2.schema import BlockTypes
13
- from marker.v2.schema.blocks import BlockId
14
- from marker.v2.schema.registry import get_block_class
15
 
16
 
17
  class JSONBlockOutput(BaseModel):
 
7
  from bs4 import BeautifulSoup
8
  from pydantic import BaseModel
9
 
10
+ from marker.schema.blocks import Block
11
+ from marker.renderers import BaseRenderer
12
+ from marker.schema import BlockTypes
13
+ from marker.schema.blocks import BlockId
14
+ from marker.schema.registry import get_block_class
15
 
16
 
17
  class JSONBlockOutput(BaseModel):