Vik Paruchuri committed on
Commit
aa38742
·
1 Parent(s): 90342a4

Work with rotation

Browse files
README.md CHANGED
@@ -40,6 +40,10 @@ The above results are with marker and nougat setup so they each take ~3GB of VRA
40
 
41
  See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks.
42
 
 
 
 
 
43
  # Limitations
44
 
45
  PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address:
 
40
 
41
  See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks.
42
 
43
+ # Community
44
+
45
+ [Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development.
46
+
47
  # Limitations
48
 
49
  PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address:
marker/bbox.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  def should_merge_blocks(box1, box2, tol=5):
2
  # Within tol y px, and to the right within tol px
3
  merge = [
@@ -58,4 +60,22 @@ def unnormalize_box(bbox, width, height):
58
  height * (bbox[1] / 1000),
59
  width * (bbox[2] / 1000),
60
  height * (bbox[3] / 1000),
61
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz as pymupdf
2
+
3
  def should_merge_blocks(box1, box2, tol=5):
4
  # Within tol y px, and to the right within tol px
5
  merge = [
 
60
  height * (bbox[1] / 1000),
61
  width * (bbox[2] / 1000),
62
  height * (bbox[3] / 1000),
63
+ ]
64
+
65
+
66
def correct_rotation(bbox, page):
    """Re-express a bounding box using the page's rotation matrix.

    Args:
        bbox: Box given as (x0, y0, x1, y1).
        page: Object exposing ``rotation`` (degrees) and ``rotation_matrix``
            (presumably a PyMuPDF page -- confirm against callers).

    Returns:
        ``bbox`` itself when the rotation is 0 or is not one of 90/180/270;
        otherwise a new 4-element list built from the transformed corners.
    """
    angle = page.rotation
    if angle == 0:
        # Unrotated page: hand the box back untouched.
        return bbox

    # Push the two defining corners through the page's rotation matrix.
    matrix = page.rotation_matrix
    top_left = pymupdf.Point(bbox[0], bbox[1]) * matrix
    bottom_right = pymupdf.Point(bbox[2], bbox[3]) * matrix

    # After rotating, the original corners are no longer the top-left and
    # bottom-right ones, so the coordinates are re-paired per angle
    # (presumably to restore the (x0, y0, x1, y1) ordering).
    if angle == 90:
        return [bottom_right[0], top_left[1], top_left[0], bottom_right[1]]
    if angle == 180:
        return [bottom_right[0], bottom_right[1], top_left[0], top_left[1]]
    if angle == 270:
        return [top_left[0], bottom_right[1], bottom_right[0], top_left[1]]

    # Any other angle falls through with the box unchanged.
    return bbox
marker/debug/data.py CHANGED
@@ -14,6 +14,9 @@ def dump_nougat_debug_data(doc, images, converted_spans):
14
  if not settings.DEBUG_DATA_FOLDER:
15
  return
16
 
 
 
 
17
  # We attempted one conversion per image
18
  assert len(converted_spans) == len(images)
19
 
@@ -37,7 +40,7 @@ def dump_nougat_debug_data(doc, images, converted_spans):
37
 
38
  debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_equations.json")
39
  with open(debug_file, "w+") as f:
40
- json.dump(data_lines, f, indent=4)
41
 
42
 
43
  def dump_bbox_debug_data(doc, blocks: List[Page]):
@@ -70,7 +73,7 @@ def dump_bbox_debug_data(doc, blocks: List[Page]):
70
  debug_data.append(page_data)
71
 
72
  with open(debug_file, "w+") as f:
73
- json.dump(debug_data, f, indent=4)
74
 
75
 
76
 
 
14
  if not settings.DEBUG_DATA_FOLDER:
15
  return
16
 
17
+ if len(images) == 0:
18
+ return
19
+
20
  # We attempted one conversion per image
21
  assert len(converted_spans) == len(images)
22
 
 
40
 
41
  debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_equations.json")
42
  with open(debug_file, "w+") as f:
43
+ json.dump(data_lines, f)
44
 
45
 
46
  def dump_bbox_debug_data(doc, blocks: List[Page]):
 
73
  debug_data.append(page_data)
74
 
75
  with open(debug_file, "w+") as f:
76
+ json.dump(debug_data, f)
77
 
78
 
79
 
marker/extract_text.py CHANGED
@@ -3,6 +3,7 @@ from typing import Tuple, List, Optional
3
 
4
  from spellchecker import SpellChecker
5
 
 
6
  from marker.ocr.page import ocr_entire_page
7
  from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
8
  from marker.settings import settings
@@ -12,8 +13,27 @@ from concurrent.futures import ThreadPoolExecutor
12
  os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
13
 
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optional[SpellChecker] = None, ocr=False) -> Tuple[List[Block], int]:
16
  page = doc[pnum]
 
 
17
  if ocr:
18
  blocks = ocr_entire_page(page, tess_lang, spellchecker)
19
  else:
@@ -30,7 +50,7 @@ def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optiona
30
  bbox = s["bbox"]
31
  span_obj = Span(
32
  text=block_text,
33
- bbox=bbox,
34
  span_id=f"{pnum}_{span_id}",
35
  font=f"{s['font']}_{font_flags_decomposer(s['flags'])}", # Add font flags to end of font
36
  color=s["color"],
@@ -41,19 +61,23 @@ def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optiona
41
  span_id += 1
42
  line_obj = Line(
43
  spans=spans,
44
- bbox=l["bbox"]
45
  )
46
  # Only select valid lines, with positive bboxes
47
  if line_obj.area > 0:
48
  block_lines.append(line_obj)
49
  block_obj = Block(
50
  lines=block_lines,
51
- bbox=block["bbox"],
52
  pnum=pnum
53
  )
54
  # Only select blocks with at least one line
55
  if len(block_lines) > 0:
56
  page_blocks.append(block_obj)
 
 
 
 
57
  return page_blocks
58
 
59
 
@@ -80,8 +104,9 @@ def convert_single_page(doc, pnum, tess_lang: str, spell_lang: Optional[str], no
80
  not disable_ocr
81
  ]
82
  if all(conditions) or settings.OCR_ALL_PAGES:
 
83
  blocks = get_single_page_blocks(doc, pnum, tess_lang, spellchecker, ocr=True)
84
- page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox)
85
  ocr_pages = 1
86
  if len(blocks) == 0:
87
  ocr_failed = 1
 
3
 
4
  from spellchecker import SpellChecker
5
 
6
+ from marker.bbox import correct_rotation
7
  from marker.ocr.page import ocr_entire_page
8
  from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
9
  from marker.settings import settings
 
13
  os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
14
 
15
 
16
def sort_rotated_text(page_blocks, tolerance=1.25):
    """Order blocks top-to-bottom by row, then left-to-right within a row.

    Blocks are bucketed by ``bbox[1]`` rounded to the nearest multiple of
    ``tolerance``; buckets are emitted in ascending key order and each
    bucket is sorted by ``bbox[0]``.

    Args:
        page_blocks: Iterable of objects exposing a ``bbox`` sequence.
        tolerance: Bucket size used when rounding ``bbox[1]``.

    Returns:
        A new list containing the same blocks in the sorted order.
    """
    rows = {}
    for blk in page_blocks:
        # Snap the block's y coordinate onto a grid of step `tolerance`.
        row_key = round(blk.bbox[1] / tolerance) * tolerance
        rows.setdefault(row_key, []).append(blk)

    ordered = []
    for row_key in sorted(rows):
        # Stable sort keeps input order for blocks sharing the same x.
        ordered += sorted(rows[row_key], key=lambda b: b.bbox[0])
    return ordered
31
+
32
+
33
  def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optional[SpellChecker] = None, ocr=False) -> Tuple[List[Block], int]:
34
  page = doc[pnum]
35
+ rotation = page.rotation
36
+
37
  if ocr:
38
  blocks = ocr_entire_page(page, tess_lang, spellchecker)
39
  else:
 
50
  bbox = s["bbox"]
51
  span_obj = Span(
52
  text=block_text,
53
+ bbox=correct_rotation(bbox, page),
54
  span_id=f"{pnum}_{span_id}",
55
  font=f"{s['font']}_{font_flags_decomposer(s['flags'])}", # Add font flags to end of font
56
  color=s["color"],
 
61
  span_id += 1
62
  line_obj = Line(
63
  spans=spans,
64
+ bbox=correct_rotation(l["bbox"], page),
65
  )
66
  # Only select valid lines, with positive bboxes
67
  if line_obj.area > 0:
68
  block_lines.append(line_obj)
69
  block_obj = Block(
70
  lines=block_lines,
71
+ bbox=correct_rotation(block["bbox"], page),
72
  pnum=pnum
73
  )
74
  # Only select blocks with at least one line
75
  if len(block_lines) > 0:
76
  page_blocks.append(block_obj)
77
+
78
+ # If the page was rotated, sort the text again
79
+ if rotation > 0:
80
+ page_blocks = sort_rotated_text(page_blocks)
81
  return page_blocks
82
 
83
 
 
104
  not disable_ocr
105
  ]
106
  if all(conditions) or settings.OCR_ALL_PAGES:
107
+ page = doc[pnum]
108
  blocks = get_single_page_blocks(doc, pnum, tess_lang, spellchecker, ocr=True)
109
+ page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox, rotation=page.rotation)
110
  ocr_pages = 1
111
  if len(blocks) == 0:
112
  ocr_failed = 1
marker/schema.py CHANGED
@@ -1,5 +1,5 @@
1
  from collections import Counter
2
- from typing import List, Optional
3
 
4
  from pydantic import BaseModel, field_validator
5
  import ftfy
@@ -20,7 +20,6 @@ def find_span_type(span, page_blocks):
20
  class BboxElement(BaseModel):
21
  bbox: List[float]
22
 
23
-
24
  @field_validator('bbox')
25
  @classmethod
26
  def check_4_elements(cls, v: List[float]) -> List[float]:
@@ -134,6 +133,7 @@ class Page(BboxElement):
134
  blocks: List[Block]
135
  pnum: int
136
  column_count: Optional[int] = None
 
137
 
138
  def get_nonblank_lines(self):
139
  lines = self.get_all_lines()
 
1
  from collections import Counter
2
+ from typing import List, Optional, Tuple
3
 
4
  from pydantic import BaseModel, field_validator
5
  import ftfy
 
20
  class BboxElement(BaseModel):
21
  bbox: List[float]
22
 
 
23
  @field_validator('bbox')
24
  @classmethod
25
  def check_4_elements(cls, v: List[float]) -> List[float]:
 
133
  blocks: List[Block]
134
  pnum: int
135
  column_count: Optional[int] = None
136
+ rotation: Optional[int] = None # Rotation degrees of the page
137
 
138
  def get_nonblank_lines(self):
139
  lines = self.get_all_lines()
marker/settings.py CHANGED
@@ -54,8 +54,22 @@ class Settings(BaseSettings):
54
  # Nougat model
55
  NOUGAT_MODEL_MAX: int = 512 # Max inference length for nougat
56
  NOUGAT_TOKEN_BUFFER: int = 256 # Number of tokens to buffer above max for nougat
57
- NOUGAT_HALLUCINATION_WORDS: List[str] = ["[MISSING_PAGE_POST]", "## References\n", "**Figure Captions**\n", "Footnote",
58
- "\par\par\par", "## Chapter", "Fig.", "particle", "[REPEATS]", "[TRUNCATED]", "### ", "effective field strength", "\Phi_{\rm eff}"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  NOUGAT_DPI: int = 96 # DPI to render images at, matches default settings for nougat
60
  NOUGAT_MODEL_NAME: str = "0.1.0-small" # Name of the model to use
61
  NOUGAT_BATCH_SIZE: int = 6 if TORCH_DEVICE == "cuda" else 1 # Batch size for nougat, don't batch on cpu
 
54
  # Nougat model
55
  NOUGAT_MODEL_MAX: int = 512 # Max inference length for nougat
56
  NOUGAT_TOKEN_BUFFER: int = 256 # Number of tokens to buffer above max for nougat
57
# Marker strings associated with nougat hallucinations (presumably matched
# against model output; the consumer is not visible here -- confirm).
# LaTeX backslashes are escaped explicitly: in a plain string literal the
# "\r" inside "\rm" is a carriage return, so that entry could never match the
# literal LaTeX text, and "\p", "\P" and "\m" are invalid escape sequences
# that newer Python versions warn about.
NOUGAT_HALLUCINATION_WORDS: List[str] = [
    "[MISSING_PAGE_POST]",
    "## References\n",
    "**Figure Captions**\n",
    "Footnote",
    "\\par\\par\\par",
    "## Chapter",
    "Fig.",
    "particle",
    "[REPEATS]",
    "[TRUNCATED]",
    "### ",
    "effective field strength",
    "\\Phi_{\\rm eff}",
    "\\mathbf{\\mathbf",
]
73
  NOUGAT_DPI: int = 96 # DPI to render images at, matches default settings for nougat
74
  NOUGAT_MODEL_NAME: str = "0.1.0-small" # Name of the model to use
75
  NOUGAT_BATCH_SIZE: int = 6 if TORCH_DEVICE == "cuda" else 1 # Batch size for nougat, don't batch on cpu