Vik Paruchuri committed on
Commit
2ad7f6b
·
1 Parent(s): 173a1b8

Remove pymupdf

Browse files
marker/bbox.py CHANGED
@@ -1,5 +1,3 @@
1
- import fitz as pymupdf
2
-
3
  def should_merge_blocks(box1, box2, tol=5):
4
  # Within tol y px, and to the right within tol px
5
  merge = [
@@ -60,22 +58,4 @@ def unnormalize_box(bbox, width, height):
60
  height * (bbox[1] / 1000),
61
  width * (bbox[2] / 1000),
62
  height * (bbox[3] / 1000),
63
- ]
64
-
65
-
66
- def correct_rotation(bbox, page):
67
- #bbox base is (x0, y0, x1, y1)
68
- rotation = page.rotation
69
- if rotation == 0:
70
- return bbox
71
-
72
- tl = pymupdf.Point(bbox[0], bbox[1]) * page.rotation_matrix
73
- br = pymupdf.Point(bbox[2], bbox[3]) * page.rotation_matrix
74
- if rotation == 90:
75
- bbox = [br[0], tl[1], tl[0], br[1]]
76
- elif rotation == 180:
77
- bbox = [br[0], br[1], tl[0], tl[1]]
78
- elif rotation == 270:
79
- bbox = [tl[0], br[1], br[0], tl[1]]
80
-
81
- return bbox
 
 
 
1
  def should_merge_blocks(box1, box2, tol=5):
2
  # Within tol y px, and to the right within tol px
3
  merge = [
 
58
  height * (bbox[1] / 1000),
59
  width * (bbox[2] / 1000),
60
  height * (bbox[3] / 1000),
61
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/cleaners/code.py CHANGED
@@ -1,7 +1,6 @@
1
  from marker.schema import Span, Line, Page
2
  import re
3
  from typing import List
4
- import fitz as pymupdf
5
 
6
 
7
  def is_code_linelen(lines, thresh=60):
@@ -102,13 +101,13 @@ def indent_blocks(blocks: List[Page]):
102
  if col_width == 0 and len(span.text) > 0:
103
  col_width = (span.bbox[2] - span.bbox[0]) / len(span.text)
104
  text += span.text
105
- lines.append((pymupdf.Rect(line.bbox), text))
106
 
107
  block_text = ""
108
  blank_line = False
109
  for line in lines:
110
  text = line[1]
111
- prefix = " " * int((line[0].x0 - min_left) / col_width)
112
  current_line_blank = len(text.strip()) == 0
113
  if blank_line and current_line_blank:
114
  # Don't put multiple blank lines in a row
@@ -120,9 +119,10 @@ def indent_blocks(blocks: List[Page]):
120
  new_span = Span(
121
  text=block_text,
122
  bbox=block.bbox,
123
- color=block.lines[0].spans[0].color,
124
  span_id=f"{span_counter}_fix_code",
125
  font=block.lines[0].spans[0].font,
 
 
126
  block_type="Code"
127
  )
128
  span_counter += 1
 
1
  from marker.schema import Span, Line, Page
2
  import re
3
  from typing import List
 
4
 
5
 
6
  def is_code_linelen(lines, thresh=60):
 
101
  if col_width == 0 and len(span.text) > 0:
102
  col_width = (span.bbox[2] - span.bbox[0]) / len(span.text)
103
  text += span.text
104
+ lines.append((line.bbox, text))
105
 
106
  block_text = ""
107
  blank_line = False
108
  for line in lines:
109
  text = line[1]
110
+ prefix = " " * int((line[0][0] - min_left) / col_width)
111
  current_line_blank = len(text.strip()) == 0
112
  if blank_line and current_line_blank:
113
  # Don't put multiple blank lines in a row
 
119
  new_span = Span(
120
  text=block_text,
121
  bbox=block.bbox,
 
122
  span_id=f"{span_counter}_fix_code",
123
  font=block.lines[0].spans[0].font,
124
+ font_weight=block.lines[0].spans[0].font_weight,
125
+ font_size=block.lines[0].spans[0].font_size,
126
  block_type="Code"
127
  )
128
  span_counter += 1
marker/cleaners/equations.py CHANGED
@@ -12,6 +12,7 @@ from PIL import Image, ImageDraw
12
 
13
  from marker.bbox import should_merge_blocks, merge_boxes
14
  from marker.debug.data import dump_equation_debug_data
 
15
  from marker.settings import settings
16
  from marker.schema import Page, Span, Line, Block, BlockType
17
  import os
@@ -51,9 +52,7 @@ def mask_bbox(png_image, bbox, selected_bboxes):
51
 
52
 
53
  def get_masked_image(page, bbox, selected_bboxes):
54
- pix = page.get_pixmap(dpi=settings.TEXIFY_DPI, clip=bbox)
55
- png = pix.pil_tobytes(format="PNG")
56
- png_image = Image.open(io.BytesIO(png))
57
  png_image = mask_bbox(png_image, bbox, selected_bboxes)
58
  png_image = png_image.convert("RGB")
59
  return png_image
@@ -212,7 +211,8 @@ def replace_blocks_with_latex(page_blocks: Page, merged_boxes, reformat_regions,
212
  bbox=merged_boxes[current_region],
213
  span_id=f"{pnum}_{idx}_fixeq",
214
  font="Latex",
215
- color=0,
 
216
  block_type="Formula"
217
  )
218
  ],
 
12
 
13
  from marker.bbox import should_merge_blocks, merge_boxes
14
  from marker.debug.data import dump_equation_debug_data
15
+ from marker.pdf.images import render_image
16
  from marker.settings import settings
17
  from marker.schema import Page, Span, Line, Block, BlockType
18
  import os
 
52
 
53
 
54
  def get_masked_image(page, bbox, selected_bboxes):
55
+ png_image = render_image(page, settings.TEXIFY_DPI)
 
 
56
  png_image = mask_bbox(png_image, bbox, selected_bboxes)
57
  png_image = png_image.convert("RGB")
58
  return png_image
 
211
  bbox=merged_boxes[current_region],
212
  span_id=f"{pnum}_{idx}_fixeq",
213
  font="Latex",
214
+ font_weight=0,
215
+ font_size=0,
216
  block_type="Formula"
217
  )
218
  ],
marker/cleaners/headers.py CHANGED
@@ -1,7 +1,7 @@
1
  import re
2
  from collections import Counter, defaultdict
3
  from itertools import chain
4
- from thefuzz import fuzz
5
 
6
  from sklearn.cluster import DBSCAN
7
  import numpy as np
 
1
  import re
2
  from collections import Counter, defaultdict
3
  from itertools import chain
4
+ from rapidfuzz import fuzz
5
 
6
  from sklearn.cluster import DBSCAN
7
  import numpy as np
marker/cleaners/table.py CHANGED
@@ -4,7 +4,6 @@ from copy import deepcopy
4
  from tabulate import tabulate
5
  from typing import List
6
  import re
7
- import textwrap
8
 
9
 
10
  def merge_table_blocks(blocks: List[Page]):
@@ -84,7 +83,8 @@ def create_new_tables(blocks: List[Page]):
84
  bbox=block.bbox,
85
  span_id=f"{table_idx}_fix_table",
86
  font="Table",
87
- color=0,
 
88
  block_type="Table",
89
  text=new_text
90
  )
 
4
  from tabulate import tabulate
5
  from typing import List
6
  import re
 
7
 
8
 
9
  def merge_table_blocks(blocks: List[Page]):
 
83
  bbox=block.bbox,
84
  span_id=f"{table_idx}_fix_table",
85
  font="Table",
86
+ font_size=0,
87
+ font_weight=0,
88
  block_type="Table",
89
  text=new_text
90
  )
marker/convert.py CHANGED
@@ -1,4 +1,4 @@
1
- import fitz as pymupdf
2
 
3
  from marker.cleaners.table import merge_table_blocks, create_new_tables
4
  from marker.debug.data import dump_bbox_debug_data
@@ -25,10 +25,10 @@ def find_filetype(fpath):
25
  # The mimetype is not always consistent, so use in to check the most common formats
26
  if "pdf" in mimetype:
27
  return "pdf"
28
- elif "epub" in mimetype:
29
- return "epub"
30
- elif "mobi" in mimetype:
31
- return "mobi"
32
  elif mimetype in settings.SUPPORTED_FILETYPES:
33
  return settings.SUPPORTED_FILETYPES[mimetype]
34
  else:
@@ -47,10 +47,12 @@ def get_length_of_text(fname: str) -> int:
47
  if filetype == "other":
48
  return 0
49
 
50
- doc = pymupdf.open(fname, filetype=filetype)
51
  full_text = ""
52
- for page in doc:
53
- full_text += page.get_text("text", sort=True, flags=settings.TEXT_FLAGS)
 
 
54
 
55
  return len(full_text)
56
 
@@ -81,11 +83,7 @@ def convert_single_pdf(
81
 
82
  out_meta["filetype"] = filetype
83
 
84
- doc = pymupdf.open(fname, filetype=filetype)
85
- if filetype != "pdf":
86
- conv = doc.convert_to_pdf()
87
- doc = pymupdf.open("pdf", conv)
88
-
89
  blocks, toc, ocr_stats = get_text_blocks(
90
  doc,
91
  tess_lang,
 
1
+ import pypdfium2 as pdfium
2
 
3
  from marker.cleaners.table import merge_table_blocks, create_new_tables
4
  from marker.debug.data import dump_bbox_debug_data
 
25
  # The mimetype is not always consistent, so use in to check the most common formats
26
  if "pdf" in mimetype:
27
  return "pdf"
28
+ #elif "epub" in mimetype:
29
+ # return "epub"
30
+ #elif "mobi" in mimetype:
31
+ # return "mobi"
32
  elif mimetype in settings.SUPPORTED_FILETYPES:
33
  return settings.SUPPORTED_FILETYPES[mimetype]
34
  else:
 
47
  if filetype == "other":
48
  return 0
49
 
50
+ doc = pdfium.PdfDocument(fname)
51
  full_text = ""
52
+ for page_idx in range(len(doc)):
53
+ page = doc.get_page(page_idx)
54
+ text_page = page.get_textpage()
55
+ full_text += text_page.get_text_bounded()
56
 
57
  return len(full_text)
58
 
 
83
 
84
  out_meta["filetype"] = filetype
85
 
86
+ doc = pdfium.PdfDocument(fname)
 
 
 
 
87
  blocks, toc, ocr_stats = get_text_blocks(
88
  doc,
89
  tess_lang,
marker/debug/data.py CHANGED
@@ -4,6 +4,7 @@ import os
4
  import zlib
5
  from typing import List
6
 
 
7
  from marker.schema import Page
8
  from marker.settings import settings
9
  from PIL import Image
@@ -54,9 +55,7 @@ def dump_bbox_debug_data(doc, blocks: List[Page]):
54
  for idx, page_blocks in enumerate(blocks):
55
  page = doc[idx]
56
 
57
- pix = page.get_pixmap(dpi=settings.TEXIFY_DPI, annots=False, clip=page_blocks.bbox)
58
- png = pix.pil_tobytes(format="PNG")
59
- png_image = Image.open(io.BytesIO(png))
60
  width, height = png_image.size
61
  max_dimension = 6000
62
  if width > max_dimension or height > max_dimension:
 
4
  import zlib
5
  from typing import List
6
 
7
+ from marker.pdf.images import render_image
8
  from marker.schema import Page
9
  from marker.settings import settings
10
  from PIL import Image
 
55
  for idx, page_blocks in enumerate(blocks):
56
  page = doc[idx]
57
 
58
+ png_image = render_image(page, dpi=settings.TEXIFY_DPI)
 
 
59
  width, height = png_image.size
60
  max_dimension = 6000
61
  if width > max_dimension or height > max_dimension:
marker/extract_text.py CHANGED
@@ -1,96 +1,66 @@
1
  import os
2
- from typing import Tuple, List, Optional
3
 
4
- from spellchecker import SpellChecker
5
 
6
- from marker.bbox import correct_rotation
7
- from marker.ocr.page import ocr_entire_page
8
  from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
9
  from marker.settings import settings
10
  from marker.schema import Span, Line, Block, Page
11
- from concurrent.futures import ThreadPoolExecutor
12
 
13
  os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
14
 
15
 
16
- def sort_rotated_text(page_blocks, tolerance=1.25):
17
- vertical_groups = {}
18
- for block in page_blocks:
19
- group_key = round(block.bbox[1] / tolerance) * tolerance
20
- if group_key not in vertical_groups:
21
- vertical_groups[group_key] = []
22
- vertical_groups[group_key].append(block)
23
-
24
- # Sort each group horizontally and flatten the groups into a single list
25
- sorted_page_blocks = []
26
- for _, group in sorted(vertical_groups.items()):
27
- sorted_group = sorted(group, key=lambda x: x.bbox[0])
28
- sorted_page_blocks.extend(sorted_group)
29
-
30
- return sorted_page_blocks
31
-
32
-
33
- def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optional[SpellChecker] = None, ocr=False) -> Tuple[List[Block], int]:
34
- page = doc[pnum]
35
- rotation = page.rotation
36
-
37
- if ocr:
38
- blocks = ocr_entire_page(page, tess_lang, spellchecker)
39
- else:
40
- blocks = page.get_text("dict", sort=True, flags=settings.TEXT_FLAGS)["blocks"]
41
-
42
  page_blocks = []
43
  span_id = 0
44
- for block_idx, block in enumerate(blocks):
45
  block_lines = []
46
  for l in block["lines"]:
47
  spans = []
48
  for i, s in enumerate(l["spans"]):
49
  block_text = s["text"]
50
- bbox = s["bbox"]
51
  span_obj = Span(
52
  text=block_text,
53
- bbox=correct_rotation(bbox, page),
54
  span_id=f"{pnum}_{span_id}",
55
- font=f"{s['font']}_{font_flags_decomposer(s['flags'])}", # Add font flags to end of font
56
- color=s["color"],
57
- ascender=s["ascender"],
58
- descender=s["descender"],
59
  )
60
  spans.append(span_obj) # Text, bounding box, span id
61
  span_id += 1
62
  line_obj = Line(
63
  spans=spans,
64
- bbox=correct_rotation(l["bbox"], page),
65
  )
66
  # Only select valid lines, with positive bboxes
67
- if line_obj.area > 0:
68
  block_lines.append(line_obj)
69
  block_obj = Block(
70
  lines=block_lines,
71
- bbox=correct_rotation(block["bbox"], page),
72
  pnum=pnum
73
  )
74
- # Only select blocks with multiple lines
75
  if len(block_lines) > 0:
76
  page_blocks.append(block_obj)
 
 
 
 
 
 
 
77
 
78
- # If the page was rotated, sort the text again
79
- if rotation > 0:
80
- page_blocks = sort_rotated_text(page_blocks)
81
- return page_blocks
82
 
83
-
84
- def convert_single_page(doc, pnum, tess_lang: str, spell_lang: Optional[str], no_text: bool, disable_ocr: bool = False, min_ocr_page: int = 2):
85
  ocr_pages = 0
86
  ocr_success = 0
87
  ocr_failed = 0
88
- spellchecker = None
89
  page_bbox = doc[pnum].bound()
90
- if spell_lang:
91
- spellchecker = SpellChecker(language=spell_lang)
92
 
93
- blocks = get_single_page_blocks(doc, pnum, tess_lang, spellchecker)
94
  page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox)
95
 
96
  # OCR page if we got minimal text, or if we got too many spaces
@@ -98,14 +68,14 @@ def convert_single_page(doc, pnum, tess_lang: str, spell_lang: Optional[str], no
98
  (
99
  no_text # Full doc has no text, and needs full OCR
100
  or
101
- (len(page_obj.prelim_text) > 0 and detect_bad_ocr(page_obj.prelim_text, spellchecker)) # Bad OCR
102
  ),
103
  min_ocr_page < pnum < len(doc) - 1,
104
  not disable_ocr
105
  ]
106
  if all(conditions) or settings.OCR_ALL_PAGES:
107
  page = doc[pnum]
108
- blocks = get_single_page_blocks(doc, pnum, tess_lang, spellchecker, ocr=True)
109
  page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox, rotation=page.rotation)
110
  ocr_pages = 1
111
  if len(blocks) == 0:
@@ -116,37 +86,43 @@ def convert_single_page(doc, pnum, tess_lang: str, spell_lang: Optional[str], no
116
 
117
 
118
  def get_text_blocks(doc, tess_lang: str, spell_lang: Optional[str], max_pages: Optional[int] = None, parallel: int = settings.OCR_PARALLEL_WORKERS):
119
- all_blocks = []
120
- toc = doc.get_toc()
121
  ocr_pages = 0
122
  ocr_failed = 0
123
  ocr_success = 0
124
- # This is a thread because most of the work happens in a separate process (tesseract)
125
- range_end = len(doc)
126
- no_text = len(naive_get_text(doc).strip()) == 0
127
  if max_pages:
128
  range_end = min(max_pages, len(doc))
129
- with ThreadPoolExecutor(max_workers=parallel) as pool:
130
- args_list = [(doc, pnum, tess_lang, spell_lang, no_text) for pnum in range(range_end)]
131
- if parallel == 1:
132
- func = map
133
- else:
134
- func = pool.map
135
- results = func(lambda a: convert_single_page(*a), args_list)
136
 
137
- for result in results:
138
- page_obj, ocr_stats = result
139
- all_blocks.append(page_obj)
140
- ocr_pages += ocr_stats["ocr_pages"]
141
- ocr_failed += ocr_stats["ocr_failed"]
142
- ocr_success += ocr_stats["ocr_success"]
143
 
144
  return all_blocks, toc, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success}
145
 
146
 
147
  def naive_get_text(doc):
148
  full_text = ""
149
- for page in doc:
150
- full_text += page.get_text("text", sort=True, flags=settings.TEXT_FLAGS)
151
- full_text += "\n"
 
152
  return full_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ from typing import List, Optional
3
 
4
+ import pypdfium2.internal as pdfium_i
5
 
 
 
6
  from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
7
  from marker.settings import settings
8
  from marker.schema import Span, Line, Block, Page
9
+ from pdftext.extraction import dictionary_output
10
 
11
  os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
12
 
13
 
14
def pdftext_format_to_blocks(page, pnum: int) -> Page:
    """Convert one page of pdftext dictionary output into a ``Page``.

    Args:
        page: Mapping describing a single page. This function reads its
            "blocks", "page", "bbox" and "rotation" keys. Each block provides
            "lines" and "bbox"; each line provides "spans" and "bbox"; each
            span provides "text", "bbox" and a "font" mapping with "name",
            "flags", "weight" and "size".
        pnum: Page number used as the span id prefix and stored on each block.

    Returns:
        A ``Page`` containing every block that kept at least one line.
    """
    page_blocks = []
    span_id = 0  # Running counter so span ids are unique within the page
    for block in page["blocks"]:
        block_lines = []
        for l in block["lines"]:
            spans = []
            for s in l["spans"]:
                font = s["font"]
                span_obj = Span(
                    text=s["text"],
                    bbox=s["bbox"],
                    span_id=f"{pnum}_{span_id}",
                    # Add font flags to end of font
                    font=f"{font['name']}_{font_flags_decomposer(font['flags'])}",
                    font_weight=font["weight"],
                    font_size=font["size"],
                )
                spans.append(span_obj)
                span_id += 1
            line_obj = Line(
                spans=spans,
                bbox=l["bbox"],
            )
            # Keep lines whose bbox area is non-negative; zero-area lines are
            # kept, only negative areas are dropped.
            if line_obj.area >= 0:
                block_lines.append(line_obj)
        block_obj = Block(
            lines=block_lines,
            bbox=block["bbox"],
            pnum=pnum
        )
        # Only select blocks with lines
        if block_lines:
            page_blocks.append(block_obj)

    # The page-level number comes from the pdftext output itself, not from pnum
    out_page = Page(
        blocks=page_blocks,
        pnum=page["page"],
        bbox=page["bbox"],
        rotation=page["rotation"],
    )
    return out_page
55
 
 
 
 
 
56
 
57
+ def ocr_page(doc, pnum, page: Page, tess_lang: str):
 
58
  ocr_pages = 0
59
  ocr_success = 0
60
  ocr_failed = 0
 
61
  page_bbox = doc[pnum].bound()
 
 
62
 
63
+ blocks = get_single_page_blocks(doc, pnum, tess_lang)
64
  page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox)
65
 
66
  # OCR page if we got minimal text, or if we got too many spaces
 
68
  (
69
  no_text # Full doc has no text, and needs full OCR
70
  or
71
+ (len(page_obj.prelim_text) > 0 and detect_bad_ocr(page_obj.prelim_text)) # Bad OCR
72
  ),
73
  min_ocr_page < pnum < len(doc) - 1,
74
  not disable_ocr
75
  ]
76
  if all(conditions) or settings.OCR_ALL_PAGES:
77
  page = doc[pnum]
78
+ blocks = get_single_page_blocks(doc, pnum, tess_lang, ocr=True)
79
  page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox, rotation=page.rotation)
80
  ocr_pages = 1
81
  if len(blocks) == 0:
 
86
 
87
 
88
  def get_text_blocks(doc, tess_lang: str, spell_lang: Optional[str], max_pages: Optional[int] = None, parallel: int = settings.OCR_PARALLEL_WORKERS):
89
+ toc = get_toc(doc)
 
90
  ocr_pages = 0
91
  ocr_failed = 0
92
  ocr_success = 0
93
+
94
+ page_range = range(len(doc))
 
95
  if max_pages:
96
  range_end = min(max_pages, len(doc))
97
+ page_range = range(range_end)
 
 
 
 
 
 
98
 
99
+ all_blocks = dictionary_output(doc, page_range=page_range)
100
+ all_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(all_blocks)]
 
 
 
 
101
 
102
  return all_blocks, toc, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success}
103
 
104
 
105
  def naive_get_text(doc):
106
  full_text = ""
107
+ for page_idx in range(len(doc)):
108
+ page = doc.get_page(page_idx)
109
+ text_page = page.get_textpage()
110
+ full_text += text_page.get_text_bounded() + "\n"
111
  return full_text
112
+
113
+
114
def get_toc(doc, max_depth=15):
    """Return the outline of ``doc`` as a list of plain dictionaries.

    Args:
        doc: Object whose ``get_toc(max_depth=...)`` yields outline items
            exposing ``title``, ``level``, ``is_closed``, ``n_kids``,
            ``page_index``, ``view_mode`` and ``view_pos``.
        max_depth: Forwarded unchanged to ``doc.get_toc``.

    Returns:
        One dictionary per outline item, in iteration order. ``view_mode`` is
        looked up through ``pdfium_i.ViewmodeToStr.get``; the other fields are
        copied as-is.
    """
    return [
        {
            "title": entry.title,
            "level": entry.level,
            "is_closed": entry.is_closed,
            "n_kids": entry.n_kids,
            "page_index": entry.page_index,
            "view_mode": pdfium_i.ViewmodeToStr.get(entry.view_mode),
            "view_pos": entry.view_pos,
        }
        for entry in doc.get_toc(max_depth=max_depth)
    ]
marker/logger.py CHANGED
@@ -1,5 +1,4 @@
1
  import logging
2
- import fitz as pymupdf
3
  import warnings
4
 
5
 
@@ -10,5 +9,4 @@ def configure_logging():
10
  logging.getLogger('PIL').setLevel(logging.ERROR)
11
  logging.getLogger('fitz').setLevel(logging.ERROR)
12
  logging.getLogger('ocrmypdf').setLevel(logging.ERROR)
13
- pymupdf.TOOLS.mupdf_display_errors(False)
14
  warnings.simplefilter(action='ignore', category=FutureWarning)
 
1
  import logging
 
2
  import warnings
3
 
4
 
 
9
  logging.getLogger('PIL').setLevel(logging.ERROR)
10
  logging.getLogger('fitz').setLevel(logging.ERROR)
11
  logging.getLogger('ocrmypdf').setLevel(logging.ERROR)
 
12
  warnings.simplefilter(action='ignore', category=FutureWarning)
marker/ocr/page.py CHANGED
@@ -1,9 +1,7 @@
1
  import io
2
  from typing import List, Optional
3
 
4
- import fitz as pymupdf
5
  import ocrmypdf
6
- from spellchecker import SpellChecker
7
 
8
  from marker.ocr.utils import detect_bad_ocr
9
  from marker.schema import Block
@@ -12,16 +10,16 @@ from marker.settings import settings
12
  ocrmypdf.configure_logging(verbosity=ocrmypdf.Verbosity.quiet)
13
 
14
 
15
- def ocr_entire_page(page, lang: str, spellchecker: Optional[SpellChecker] = None) -> List[Block]:
16
  if settings.OCR_ENGINE == "tesseract":
17
- return ocr_entire_page_tess(page, lang, spellchecker)
18
  elif settings.OCR_ENGINE == "ocrmypdf":
19
- return ocr_entire_page_ocrmp(page, lang, spellchecker)
20
  else:
21
  raise ValueError(f"Unknown OCR engine {settings.OCR_ENGINE}")
22
 
23
 
24
- def ocr_entire_page_tess(page, lang: str, spellchecker: Optional[SpellChecker] = None) -> List[Block]:
25
  try:
26
  full_tp = page.get_textpage_ocr(flags=settings.TEXT_FLAGS, dpi=settings.OCR_DPI, full=True, language=lang)
27
  blocks = page.get_text("dict", sort=True, flags=settings.TEXT_FLAGS, textpage=full_tp)["blocks"]
@@ -32,14 +30,14 @@ def ocr_entire_page_tess(page, lang: str, spellchecker: Optional[SpellChecker] =
32
 
33
  # Check if OCR worked. If it didn't, return empty list
34
  # OCR can fail if there is a scanned blank page with some faint text impressions, for example
35
- if detect_bad_ocr(full_text, spellchecker):
36
  return []
37
  except RuntimeError:
38
  return []
39
  return blocks
40
 
41
 
42
- def ocr_entire_page_ocrmp(page, lang: str, spellchecker: Optional[SpellChecker] = None) -> List[Block]:
43
  # Use ocrmypdf to get OCR text for the whole page
44
  src = page.parent # the page's document
45
  blank_doc = pymupdf.open() # make temporary 1-pager
@@ -71,7 +69,7 @@ def ocr_entire_page_ocrmp(page, lang: str, spellchecker: Optional[SpellChecker]
71
  if len(full_text) == 0:
72
  return []
73
 
74
- if detect_bad_ocr(full_text, spellchecker):
75
  return []
76
 
77
  return blocks
 
1
  import io
2
  from typing import List, Optional
3
 
 
4
  import ocrmypdf
 
5
 
6
  from marker.ocr.utils import detect_bad_ocr
7
  from marker.schema import Block
 
10
  ocrmypdf.configure_logging(verbosity=ocrmypdf.Verbosity.quiet)
11
 
12
 
13
+ def ocr_entire_page(page, lang: str) -> List[Block]:
14
  if settings.OCR_ENGINE == "tesseract":
15
+ return ocr_entire_page_tess(page, lang)
16
  elif settings.OCR_ENGINE == "ocrmypdf":
17
+ return ocr_entire_page_ocrmp(page, lang)
18
  else:
19
  raise ValueError(f"Unknown OCR engine {settings.OCR_ENGINE}")
20
 
21
 
22
+ def ocr_entire_page_tess(page, lang: str) -> List[Block]:
23
  try:
24
  full_tp = page.get_textpage_ocr(flags=settings.TEXT_FLAGS, dpi=settings.OCR_DPI, full=True, language=lang)
25
  blocks = page.get_text("dict", sort=True, flags=settings.TEXT_FLAGS, textpage=full_tp)["blocks"]
 
30
 
31
  # Check if OCR worked. If it didn't, return empty list
32
  # OCR can fail if there is a scanned blank page with some faint text impressions, for example
33
+ if detect_bad_ocr(full_text):
34
  return []
35
  except RuntimeError:
36
  return []
37
  return blocks
38
 
39
 
40
+ def ocr_entire_page_ocrmp(page, lang: str) -> List[Block]:
41
  # Use ocrmypdf to get OCR text for the whole page
42
  src = page.parent # the page's document
43
  blank_doc = pymupdf.open() # make temporary 1-pager
 
69
  if len(full_text) == 0:
70
  return []
71
 
72
+ if detect_bad_ocr(full_text):
73
  return []
74
 
75
  return blocks
marker/ocr/utils.py CHANGED
@@ -1,12 +1,11 @@
1
  from typing import Optional
2
 
3
  from nltk import wordpunct_tokenize
4
- from spellchecker import SpellChecker
5
  from marker.settings import settings
6
  import re
7
 
8
 
9
- def detect_bad_ocr(text, spellchecker: Optional[SpellChecker], misspell_threshold=.7, space_threshold=.6, newline_threshold=.5, alphanum_threshold=.4):
10
  if len(text) == 0:
11
  # Assume OCR failed if we have no text
12
  return True
@@ -15,11 +14,6 @@ def detect_bad_ocr(text, spellchecker: Optional[SpellChecker], misspell_threshol
15
  words = [w for w in words if w.strip()]
16
  alpha_words = [word for word in words if word.isalnum()]
17
 
18
- if spellchecker:
19
- misspelled = spellchecker.unknown(alpha_words)
20
- if len(misspelled) > len(alpha_words) * misspell_threshold:
21
- return True
22
-
23
  spaces = len(re.findall(r'\s+', text))
24
  alpha_chars = len(re.sub(r'\s+', '', text))
25
  if spaces / (alpha_chars + spaces) > space_threshold:
@@ -41,7 +35,8 @@ def detect_bad_ocr(text, spellchecker: Optional[SpellChecker], misspell_threshol
41
 
42
 
43
  def font_flags_decomposer(flags):
44
- """Make font flags human readable."""
 
45
  l = []
46
  if flags & 2 ** 0:
47
  l.append("superscript")
 
1
  from typing import Optional
2
 
3
  from nltk import wordpunct_tokenize
 
4
  from marker.settings import settings
5
  import re
6
 
7
 
8
+ def detect_bad_ocr(text, space_threshold=.6, newline_threshold=.5, alphanum_threshold=.4):
9
  if len(text) == 0:
10
  # Assume OCR failed if we have no text
11
  return True
 
14
  words = [w for w in words if w.strip()]
15
  alpha_words = [word for word in words if word.isalnum()]
16
 
 
 
 
 
 
17
  spaces = len(re.findall(r'\s+', text))
18
  alpha_chars = len(re.sub(r'\s+', '', text))
19
  if spaces / (alpha_chars + spaces) > space_threshold:
 
35
 
36
 
37
  def font_flags_decomposer(flags):
38
+ flags = int(flags)
39
+
40
  l = []
41
  if flags & 2 ** 0:
42
  l.append("superscript")
marker/ordering.py CHANGED
@@ -4,11 +4,11 @@ from typing import List
4
  import torch
5
  import sys, os
6
 
7
- from marker.extract_text import convert_single_page
8
  from transformers import LayoutLMv3ForSequenceClassification, LayoutLMv3Processor
9
  from PIL import Image
10
  import io
11
 
 
12
  from marker.schema import Page
13
  from marker.settings import settings
14
 
@@ -28,9 +28,7 @@ def get_inference_data(page, page_blocks: Page):
28
  bboxes = deepcopy([block.bbox for block in page_blocks.blocks])
29
  words = ["."] * len(bboxes)
30
 
31
- pix = page.get_pixmap(dpi=settings.LAYOUT_DPI, annots=False, clip=page_blocks.bbox)
32
- png = pix.pil_tobytes(format="PNG")
33
- rgb_image = Image.open(io.BytesIO(png)).convert("RGB")
34
 
35
  page_box = page_blocks.bbox
36
  pwidth = page_blocks.width
 
4
  import torch
5
  import sys, os
6
 
 
7
  from transformers import LayoutLMv3ForSequenceClassification, LayoutLMv3Processor
8
  from PIL import Image
9
  import io
10
 
11
+ from marker.pdf.images import render_image
12
  from marker.schema import Page
13
  from marker.settings import settings
14
 
 
28
  bboxes = deepcopy([block.bbox for block in page_blocks.blocks])
29
  words = ["."] * len(bboxes)
30
 
31
+ rgb_image = render_image(page, dpi=settings.LAYOUT_DPI)
 
 
32
 
33
  page_box = page_blocks.bbox
34
  pwidth = page_blocks.width
marker/pdf/images.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import pypdfium2 as pdfium
2
+
3
+
4
def render_image(page: pdfium.PdfPage, dpi):
    """Render ``page`` to an RGB PIL image at the requested ``dpi``.

    Annotations are not drawn.

    Args:
        page: The pypdfium2 page to rasterise.
        dpi: Target resolution; converted to a render scale of ``dpi / 72``.

    Returns:
        The rendered page as a PIL image in "RGB" mode.
    """
    # pypdfium2 takes a scale factor rather than a DPI value
    bitmap = page.render(scale=dpi / 72, draw_annots=False)
    return bitmap.to_pil().convert("RGB")
marker/schema.py CHANGED
@@ -56,9 +56,8 @@ class Span(BboxElement):
56
  text: str
57
  span_id: str
58
  font: str
59
- color: int
60
- ascender: Optional[float] = None
61
- descender: Optional[float] = None
62
  block_type: Optional[str] = None
63
  selected: bool = True
64
 
 
56
  text: str
57
  span_id: str
58
  font: str
59
+ font_weight: float
60
+ font_size: float
 
61
  block_type: Optional[str] = None
62
  selected: bool = True
63
 
marker/segmentation.py CHANGED
@@ -9,6 +9,8 @@ import io
9
  from PIL import Image
10
  from transformers import LayoutLMv3Processor
11
  import numpy as np
 
 
12
  from marker.settings import settings
13
  from marker.schema import Page, BlockType
14
  import torch
@@ -69,11 +71,7 @@ def get_page_encoding(page, page_blocks: Page):
69
  pwidth = page_blocks.width
70
  pheight = page_blocks.height
71
 
72
- pix = page.get_pixmap(dpi=settings.LAYOUT_DPI, annots=False, clip=page_blocks.bbox)
73
- png = pix.pil_tobytes(format="PNG")
74
- png_image = Image.open(io.BytesIO(png))
75
- # If it is too large, make it smaller for the model
76
- rgb_image = png_image.convert('RGB')
77
  rgb_width, rgb_height = rgb_image.size
78
 
79
  # Image is correct size wrt the pdf page
 
9
  from PIL import Image
10
  from transformers import LayoutLMv3Processor
11
  import numpy as np
12
+
13
+ from marker.pdf.images import render_image
14
  from marker.settings import settings
15
  from marker.schema import Page, BlockType
16
  import torch
 
71
  pwidth = page_blocks.width
72
  pheight = page_blocks.height
73
 
74
+ rgb_image = render_image(page, dpi=settings.LAYOUT_DPI)
 
 
 
 
75
  rgb_width, rgb_height = rgb_image.size
76
 
77
  # Image is correct size wrt the pdf page
marker/settings.py CHANGED
@@ -4,7 +4,6 @@ from typing import Optional, List, Dict
4
  from dotenv import find_dotenv
5
  from pydantic import computed_field
6
  from pydantic_settings import BaseSettings
7
- import fitz as pymupdf
8
  import torch
9
 
10
 
@@ -32,15 +31,12 @@ class Settings(BaseSettings):
32
 
33
  SUPPORTED_FILETYPES: Dict = {
34
  "application/pdf": "pdf",
35
- "application/epub+zip": "epub",
36
- "application/x-mobipocket-ebook": "mobi",
37
- "application/vnd.ms-xpsdocument": "xps",
38
- "application/x-fictionbook+xml": "fb2"
39
  }
40
 
41
- # PyMuPDF
42
- TEXT_FLAGS: int = pymupdf.TEXTFLAGS_DICT & ~pymupdf.TEXT_PRESERVE_LIGATURES & ~pymupdf.TEXT_PRESERVE_IMAGES
43
-
44
  # OCR
45
  INVALID_CHARS: List[str] = [chr(0xfffd), "�"]
46
  OCR_DPI: int = 400
 
4
  from dotenv import find_dotenv
5
  from pydantic import computed_field
6
  from pydantic_settings import BaseSettings
 
7
  import torch
8
 
9
 
 
31
 
32
  SUPPORTED_FILETYPES: Dict = {
33
  "application/pdf": "pdf",
34
+ #"application/epub+zip": "epub",
35
+ #"application/x-mobipocket-ebook": "mobi",
36
+ #"application/vnd.ms-xpsdocument": "xps",
37
+ #"application/x-fictionbook+xml": "fb2"
38
  }
39
 
 
 
 
40
  # OCR
41
  INVALID_CHARS: List[str] = [chr(0xfffd), "�"]
42
  OCR_DPI: int = 400
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -23,7 +23,6 @@ python = ">=3.9,<3.13,!=3.9.7"
23
  scikit-learn = "^1.3.2"
24
  Pillow = "^10.1.0"
25
  pytesseract = "^0.3.10"
26
- PyMuPDF = "^1.23.5"
27
  pymupdf-fonts = "^1.0.5"
28
  pydantic = "^2.4.2"
29
  pydantic-settings = "^2.0.3"
@@ -34,15 +33,15 @@ torch = "^2.1.2"
34
  ray = "^2.9.0"
35
  tqdm = "^4.66.1"
36
  tabulate = "^0.9.0"
37
- thefuzz = "^0.20.0"
38
  python-magic = "^0.4.27"
39
- pyspellchecker = "^0.7.2"
40
  ftfy = "^6.1.1"
41
  nltk = "^3.8.1"
42
  ocrmypdf = "^15.4.0"
43
- bitsandbytes = "^0.41.2.post2"
44
  grpcio = "^1.60.0"
45
  texify = "^0.1.8"
 
 
 
46
 
47
  [tool.poetry.group.dev.dependencies]
48
  jupyter = "^1.0.0"
 
23
  scikit-learn = "^1.3.2"
24
  Pillow = "^10.1.0"
25
  pytesseract = "^0.3.10"
 
26
  pymupdf-fonts = "^1.0.5"
27
  pydantic = "^2.4.2"
28
  pydantic-settings = "^2.0.3"
 
33
  ray = "^2.9.0"
34
  tqdm = "^4.66.1"
35
  tabulate = "^0.9.0"
 
36
  python-magic = "^0.4.27"
 
37
  ftfy = "^6.1.1"
38
  nltk = "^3.8.1"
39
  ocrmypdf = "^15.4.0"
 
40
  grpcio = "^1.60.0"
41
  texify = "^0.1.8"
42
+ pdftext = "^0.3.1"
43
+ rapidfuzz = "^3.8.1"
44
+ surya-ocr = "^0.4.0"
45
 
46
  [tool.poetry.group.dev.dependencies]
47
  jupyter = "^1.0.0"