Vik Paruchuri
committed on
Commit
·
aa38742
1
Parent(s):
90342a4
Work with rotation
Browse files- README.md +4 -0
- marker/bbox.py +21 -1
- marker/debug/data.py +5 -2
- marker/extract_text.py +29 -4
- marker/schema.py +2 -2
- marker/settings.py +16 -2
README.md
CHANGED
|
@@ -40,6 +40,10 @@ The above results are with marker and nougat setup so they each take ~3GB of VRA
|
|
| 40 |
|
| 41 |
See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks.
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
# Limitations
|
| 44 |
|
| 45 |
PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address:
|
|
|
|
| 40 |
|
| 41 |
See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks.
|
| 42 |
|
| 43 |
+
# Community
|
| 44 |
+
|
| 45 |
+
[Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development.
|
| 46 |
+
|
| 47 |
# Limitations
|
| 48 |
|
| 49 |
PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address:
|
marker/bbox.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
def should_merge_blocks(box1, box2, tol=5):
|
| 2 |
# Within tol y px, and to the right within tol px
|
| 3 |
merge = [
|
|
@@ -58,4 +60,22 @@ def unnormalize_box(bbox, width, height):
|
|
| 58 |
height * (bbox[1] / 1000),
|
| 59 |
width * (bbox[2] / 1000),
|
| 60 |
height * (bbox[3] / 1000),
|
| 61 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz as pymupdf
|
| 2 |
+
|
| 3 |
def should_merge_blocks(box1, box2, tol=5):
|
| 4 |
# Within tol y px, and to the right within tol px
|
| 5 |
merge = [
|
|
|
|
| 60 |
height * (bbox[1] / 1000),
|
| 61 |
width * (bbox[2] / 1000),
|
| 62 |
height * (bbox[3] / 1000),
|
| 63 |
+
]
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def correct_rotation(bbox, page):
    """Apply the page's rotation matrix to a bounding box.

    Args:
        bbox: Box given as (x0, y0, x1, y1).
        page: Page object exposing ``rotation`` (degrees) and
            ``rotation_matrix``.

    Returns:
        ``bbox`` itself, untouched, when ``page.rotation`` is 0. Otherwise a
        new list ``[x0, y0, x1, y1]`` built from the two transformed corners,
        ordered so that x0 <= x1 and y0 <= y1.
    """
    if page.rotation == 0:
        return bbox

    # Transform the two opposite corners of the box.
    matrix = page.rotation_matrix
    corner_a = pymupdf.Point(bbox[0], bbox[1]) * matrix
    corner_b = pymupdf.Point(bbox[2], bbox[3]) * matrix

    # After the transform the corners may have swapped roles on either axis.
    # Taking min/max per axis re-normalises the box without needing a
    # hand-written coordinate permutation for each rotation angle.
    return [
        min(corner_a[0], corner_b[0]),
        min(corner_a[1], corner_b[1]),
        max(corner_a[0], corner_b[0]),
        max(corner_a[1], corner_b[1]),
    ]
|
marker/debug/data.py
CHANGED
|
@@ -14,6 +14,9 @@ def dump_nougat_debug_data(doc, images, converted_spans):
|
|
| 14 |
if not settings.DEBUG_DATA_FOLDER:
|
| 15 |
return
|
| 16 |
|
|
|
|
|
|
|
|
|
|
| 17 |
# We attempted one conversion per image
|
| 18 |
assert len(converted_spans) == len(images)
|
| 19 |
|
|
@@ -37,7 +40,7 @@ def dump_nougat_debug_data(doc, images, converted_spans):
|
|
| 37 |
|
| 38 |
debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_equations.json")
|
| 39 |
with open(debug_file, "w+") as f:
|
| 40 |
-
json.dump(data_lines, f
|
| 41 |
|
| 42 |
|
| 43 |
def dump_bbox_debug_data(doc, blocks: List[Page]):
|
|
@@ -70,7 +73,7 @@ def dump_bbox_debug_data(doc, blocks: List[Page]):
|
|
| 70 |
debug_data.append(page_data)
|
| 71 |
|
| 72 |
with open(debug_file, "w+") as f:
|
| 73 |
-
json.dump(debug_data, f
|
| 74 |
|
| 75 |
|
| 76 |
|
|
|
|
| 14 |
if not settings.DEBUG_DATA_FOLDER:
|
| 15 |
return
|
| 16 |
|
| 17 |
+
if len(images) == 0:
|
| 18 |
+
return
|
| 19 |
+
|
| 20 |
# We attempted one conversion per image
|
| 21 |
assert len(converted_spans) == len(images)
|
| 22 |
|
|
|
|
| 40 |
|
| 41 |
debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_equations.json")
|
| 42 |
with open(debug_file, "w+") as f:
|
| 43 |
+
json.dump(data_lines, f)
|
| 44 |
|
| 45 |
|
| 46 |
def dump_bbox_debug_data(doc, blocks: List[Page]):
|
|
|
|
| 73 |
debug_data.append(page_data)
|
| 74 |
|
| 75 |
with open(debug_file, "w+") as f:
|
| 76 |
+
json.dump(debug_data, f)
|
| 77 |
|
| 78 |
|
| 79 |
|
marker/extract_text.py
CHANGED
|
@@ -3,6 +3,7 @@ from typing import Tuple, List, Optional
|
|
| 3 |
|
| 4 |
from spellchecker import SpellChecker
|
| 5 |
|
|
|
|
| 6 |
from marker.ocr.page import ocr_entire_page
|
| 7 |
from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
|
| 8 |
from marker.settings import settings
|
|
@@ -12,8 +13,27 @@ from concurrent.futures import ThreadPoolExecutor
|
|
| 12 |
os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
|
| 13 |
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optional[SpellChecker] = None, ocr=False) -> Tuple[List[Block], int]:
|
| 16 |
page = doc[pnum]
|
|
|
|
|
|
|
| 17 |
if ocr:
|
| 18 |
blocks = ocr_entire_page(page, tess_lang, spellchecker)
|
| 19 |
else:
|
|
@@ -30,7 +50,7 @@ def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optiona
|
|
| 30 |
bbox = s["bbox"]
|
| 31 |
span_obj = Span(
|
| 32 |
text=block_text,
|
| 33 |
-
bbox=bbox,
|
| 34 |
span_id=f"{pnum}_{span_id}",
|
| 35 |
font=f"{s['font']}_{font_flags_decomposer(s['flags'])}", # Add font flags to end of font
|
| 36 |
color=s["color"],
|
|
@@ -41,19 +61,23 @@ def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optiona
|
|
| 41 |
span_id += 1
|
| 42 |
line_obj = Line(
|
| 43 |
spans=spans,
|
| 44 |
-
bbox=l["bbox"]
|
| 45 |
)
|
| 46 |
# Only select valid lines, with positive bboxes
|
| 47 |
if line_obj.area > 0:
|
| 48 |
block_lines.append(line_obj)
|
| 49 |
block_obj = Block(
|
| 50 |
lines=block_lines,
|
| 51 |
-
bbox=block["bbox"],
|
| 52 |
pnum=pnum
|
| 53 |
)
|
| 54 |
# Only select blocks with multiple lines
|
| 55 |
if len(block_lines) > 0:
|
| 56 |
page_blocks.append(block_obj)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
return page_blocks
|
| 58 |
|
| 59 |
|
|
@@ -80,8 +104,9 @@ def convert_single_page(doc, pnum, tess_lang: str, spell_lang: Optional[str], no
|
|
| 80 |
not disable_ocr
|
| 81 |
]
|
| 82 |
if all(conditions) or settings.OCR_ALL_PAGES:
|
|
|
|
| 83 |
blocks = get_single_page_blocks(doc, pnum, tess_lang, spellchecker, ocr=True)
|
| 84 |
-
page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox)
|
| 85 |
ocr_pages = 1
|
| 86 |
if len(blocks) == 0:
|
| 87 |
ocr_failed = 1
|
|
|
|
| 3 |
|
| 4 |
from spellchecker import SpellChecker
|
| 5 |
|
| 6 |
+
from marker.bbox import correct_rotation
|
| 7 |
from marker.ocr.page import ocr_entire_page
|
| 8 |
from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
|
| 9 |
from marker.settings import settings
|
|
|
|
| 13 |
os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
|
| 14 |
|
| 15 |
|
| 16 |
+
def sort_rotated_text(page_blocks, tolerance=1.25):
    """Return the blocks ordered by row bucket, then left to right.

    Each block's ``bbox[1]`` is snapped to the nearest multiple of
    ``tolerance``; blocks sharing a snapped value count as the same row.
    Rows come out in ascending order of that value, and blocks within a row
    are ordered by ``bbox[0]``. Blocks that tie on both keep their input order.
    """
    def reading_order(blk):
        # (snapped y, x0): the sort is stable, so equal keys keep input order
        row = round(blk.bbox[1] / tolerance) * tolerance
        return row, blk.bbox[0]

    return sorted(page_blocks, key=reading_order)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optional[SpellChecker] = None, ocr=False) -> Tuple[List[Block], int]:
|
| 34 |
page = doc[pnum]
|
| 35 |
+
rotation = page.rotation
|
| 36 |
+
|
| 37 |
if ocr:
|
| 38 |
blocks = ocr_entire_page(page, tess_lang, spellchecker)
|
| 39 |
else:
|
|
|
|
| 50 |
bbox = s["bbox"]
|
| 51 |
span_obj = Span(
|
| 52 |
text=block_text,
|
| 53 |
+
bbox=correct_rotation(bbox, page),
|
| 54 |
span_id=f"{pnum}_{span_id}",
|
| 55 |
font=f"{s['font']}_{font_flags_decomposer(s['flags'])}", # Add font flags to end of font
|
| 56 |
color=s["color"],
|
|
|
|
| 61 |
span_id += 1
|
| 62 |
line_obj = Line(
|
| 63 |
spans=spans,
|
| 64 |
+
bbox=correct_rotation(l["bbox"], page),
|
| 65 |
)
|
| 66 |
# Only select valid lines, with positive bboxes
|
| 67 |
if line_obj.area > 0:
|
| 68 |
block_lines.append(line_obj)
|
| 69 |
block_obj = Block(
|
| 70 |
lines=block_lines,
|
| 71 |
+
bbox=correct_rotation(block["bbox"], page),
|
| 72 |
pnum=pnum
|
| 73 |
)
|
| 74 |
# Only select blocks with multiple lines
|
| 75 |
if len(block_lines) > 0:
|
| 76 |
page_blocks.append(block_obj)
|
| 77 |
+
|
| 78 |
+
# If the page was rotated, sort the text again
|
| 79 |
+
if rotation > 0:
|
| 80 |
+
page_blocks = sort_rotated_text(page_blocks)
|
| 81 |
return page_blocks
|
| 82 |
|
| 83 |
|
|
|
|
| 104 |
not disable_ocr
|
| 105 |
]
|
| 106 |
if all(conditions) or settings.OCR_ALL_PAGES:
|
| 107 |
+
page = doc[pnum]
|
| 108 |
blocks = get_single_page_blocks(doc, pnum, tess_lang, spellchecker, ocr=True)
|
| 109 |
+
page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox, rotation=page.rotation)
|
| 110 |
ocr_pages = 1
|
| 111 |
if len(blocks) == 0:
|
| 112 |
ocr_failed = 1
|
marker/schema.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
from collections import Counter
|
| 2 |
-
from typing import List, Optional
|
| 3 |
|
| 4 |
from pydantic import BaseModel, field_validator
|
| 5 |
import ftfy
|
|
@@ -20,7 +20,6 @@ def find_span_type(span, page_blocks):
|
|
| 20 |
class BboxElement(BaseModel):
|
| 21 |
bbox: List[float]
|
| 22 |
|
| 23 |
-
|
| 24 |
@field_validator('bbox')
|
| 25 |
@classmethod
|
| 26 |
def check_4_elements(cls, v: List[float]) -> List[float]:
|
|
@@ -134,6 +133,7 @@ class Page(BboxElement):
|
|
| 134 |
blocks: List[Block]
|
| 135 |
pnum: int
|
| 136 |
column_count: Optional[int] = None
|
|
|
|
| 137 |
|
| 138 |
def get_nonblank_lines(self):
|
| 139 |
lines = self.get_all_lines()
|
|
|
|
| 1 |
from collections import Counter
|
| 2 |
+
from typing import List, Optional, Tuple
|
| 3 |
|
| 4 |
from pydantic import BaseModel, field_validator
|
| 5 |
import ftfy
|
|
|
|
| 20 |
class BboxElement(BaseModel):
|
| 21 |
bbox: List[float]
|
| 22 |
|
|
|
|
| 23 |
@field_validator('bbox')
|
| 24 |
@classmethod
|
| 25 |
def check_4_elements(cls, v: List[float]) -> List[float]:
|
|
|
|
| 133 |
blocks: List[Block]
|
| 134 |
pnum: int
|
| 135 |
column_count: Optional[int] = None
|
| 136 |
+
rotation: Optional[int] = None # Rotation degrees of the page
|
| 137 |
|
| 138 |
def get_nonblank_lines(self):
|
| 139 |
lines = self.get_all_lines()
|
marker/settings.py
CHANGED
|
@@ -54,8 +54,22 @@ class Settings(BaseSettings):
|
|
| 54 |
# Nougat model
|
| 55 |
NOUGAT_MODEL_MAX: int = 512 # Max inference length for nougat
|
| 56 |
NOUGAT_TOKEN_BUFFER: int = 256 # Number of tokens to buffer above max for nougat
|
| 57 |
-
NOUGAT_HALLUCINATION_WORDS: List[str] = [
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
NOUGAT_DPI: int = 96 # DPI to render images at, matches default settings for nougat
|
| 60 |
NOUGAT_MODEL_NAME: str = "0.1.0-small" # Name of the model to use
|
| 61 |
NOUGAT_BATCH_SIZE: int = 6 if TORCH_DEVICE == "cuda" else 1 # Batch size for nougat, don't batch on cpu
|
|
|
|
| 54 |
# Nougat model
|
| 55 |
NOUGAT_MODEL_MAX: int = 512 # Max inference length for nougat
|
| 56 |
NOUGAT_TOKEN_BUFFER: int = 256 # Number of tokens to buffer above max for nougat
|
| 57 |
+
# Marker strings associated with nougat hallucinations (per the setting name;
# how they are matched is decided by the consuming code, not shown here).
# Entries containing LaTeX commands are raw strings: in a normal literal
# "\r" (as in "\rm") is a carriage return, and "\p", "\P", "\m" are invalid
# escape sequences. The "\n" entries are intentionally real newlines.
NOUGAT_HALLUCINATION_WORDS: List[str] = [
    "[MISSING_PAGE_POST]",
    "## References\n",
    "**Figure Captions**\n",
    "Footnote",
    r"\par\par\par",
    "## Chapter",
    "Fig.",
    "particle",
    "[REPEATS]",
    "[TRUNCATED]",
    "### ",
    "effective field strength",
    r"\Phi_{\rm eff}",
    r"\mathbf{\mathbf",
]
|
| 73 |
NOUGAT_DPI: int = 96 # DPI to render images at, matches default settings for nougat
|
| 74 |
NOUGAT_MODEL_NAME: str = "0.1.0-small" # Name of the model to use
|
| 75 |
NOUGAT_BATCH_SIZE: int = 6 if TORCH_DEVICE == "cuda" else 1 # Batch size for nougat, don't batch on cpu
|