Vik Paruchuri committed
Commit · a5c1c2e · 0 Parent(s)
Initial commit
- .gitignore +166 -0
- README.md +13 -0
- marker/bbox.py +61 -0
- marker/code.py +86 -0
- marker/equations.py +167 -0
- marker/extract_text.py +94 -0
- marker/headers.py +60 -0
- marker/markdown.py +163 -0
- marker/schema.py +176 -0
- marker/segmentation.py +139 -0
- marker/settings.py +20 -0
- parse.py +49 -0
- poetry.lock +0 -0
- pyproject.toml +28 -0
- requirements.txt +12 -0
.gitignore
ADDED
@@ -0,0 +1,166 @@
+private.py
+.DS_Store
+local.env
+experiments
+test_data
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
README.md
ADDED
@@ -0,0 +1,13 @@
+# Marker
+
+This project converts PDF to Markdown, balancing speed with quality:
+
+- Equations will be detected and converted to Latex. This is not 100% accurate.
+- All headers/footers/other artifacts will be removed.
+
+
+
+## Install
+
+- `poetry install`
+- Set `TESSDATA_PREFIX`
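For orientation, here is a minimal sketch of how the modules added in this commit fit together, condensed from parse.py further down. The input path is a hypothetical stand-in, and the equation, code-block, and header-filtering passes are omitted for brevity:

import fitz as pymupdf
from marker.extract_text import get_text_blocks
from marker.segmentation import detect_all_block_types
from marker.markdown import merge_spans, merge_lines, get_full_text

doc = pymupdf.open("example.pdf")  # hypothetical input PDF
blocks, toc = get_text_blocks(doc)  # PyMuPDF spans -> Page/Block/Line/Span schema
block_types = detect_all_block_types(doc, blocks)  # LayoutLMv3 layout labels per line
for page, page_types in zip(blocks, block_types):
    page.add_block_types(page_types)  # attach a block type to every span
markdown = get_full_text(merge_lines(merge_spans(blocks), blocks))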
marker/bbox.py
ADDED
@@ -0,0 +1,61 @@
+def should_merge_blocks(box1, box2, tol=10):
+    # Within tol y px, and to the right within tol px
+    merge = [
+        box2[0] > box1[0],
+        abs(box2[1] - box1[1]) < tol,  # Within tol y px
+        abs(box2[3] - box1[3]) < tol,  # Within tol y px
+        abs(box2[0] - box1[2]) < tol,  # Within tol x px
+    ]
+    return all(merge)
+
+
+def merge_boxes(box1, box2):
+    return (min(box1[0], box2[0]), min(box1[1], box2[1]), max(box2[2], box1[2]), max(box1[3], box2[3]))
+
+
+def boxes_intersect(box1, box2):
+    # Box1 intersects box2
+    return box1[0] < box2[2] and box1[2] > box2[0] and box1[1] < box2[3] and box1[3] > box2[1]
+
+
+def boxes_intersect_pct(box1, box2, pct=.9):
+    # determine the coordinates of the intersection rectangle
+    x_left = max(box1[0], box2[0])
+    y_top = max(box1[1], box2[1])
+    x_right = min(box1[2], box2[2])
+    y_bottom = min(box1[3], box2[3])
+
+    if x_right < x_left or y_bottom < y_top:
+        return 0.0
+
+    # The intersection of two axis-aligned bounding boxes is always an
+    # axis-aligned bounding box
+    intersection_area = (x_right - x_left) * (y_bottom - y_top)
+
+    # compute the area of both AABBs
+    bb1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+    bb2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
+
+    iou = intersection_area / float(bb1_area + bb2_area - intersection_area)
+    return iou > pct
+
+
+def multiple_boxes_intersect(box1, boxes):
+    for box2 in boxes:
+        if boxes_intersect(box1, box2):
+            return True
+    return False
+
+
+def box_contained(box1, box2):
+    # Box1 inside box2
+    return box1[0] > box2[0] and box1[1] > box2[1] and box1[2] < box2[2] and box1[3] < box2[3]
+
+
+def unnormalize_box(bbox, width, height):
+    return [
+        width * (bbox[0] / 1000),
+        height * (bbox[1] / 1000),
+        width * (bbox[2] / 1000),
+        height * (bbox[3] / 1000),
+    ]
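As a quick illustration of the merge heuristic above (coordinates are made up): two blocks sitting on the same text line that nearly touch horizontally satisfy all four tolerance checks and collapse into a single bounding box.

from marker.bbox import should_merge_blocks, merge_boxes

left = [100, 500, 200, 512]   # x0, y0, x1, y1
right = [205, 501, 320, 511]  # starts ~5px to the right of `left`, same line
assert should_merge_blocks(left, right)                   # all deltas within the 10px tolerance
assert merge_boxes(left, right) == (100, 500, 320, 512)   # union of the two boxes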
marker/code.py
ADDED
@@ -0,0 +1,86 @@
+from marker.schema import Span, Line, Page
+import re
+from typing import List
+import fitz as pymupdf
+
+
+def is_code_linelen(lines, thresh=50):
+    # Decide based on chars per newline threshold
+    total_alnum_chars = sum(len(re.findall(r'\w', line.prelim_text)) for line in lines)
+    total_newlines = len(lines) - 1
+
+    if total_alnum_chars == 0:
+        return False
+
+    ratio = total_alnum_chars / total_newlines
+    return ratio < thresh
+
+
+def identify_code_blocks(blocks: List[Page]):
+    for page in blocks:
+        try:
+            common_height = page.get_line_height_stats().most_common(1)[0][0]
+            common_start = page.get_line_start_stats().most_common(1)[0][0]
+        except IndexError:
+            continue
+
+        for block in page.blocks:
+            if len(block.lines) < 2:
+                continue
+            if block.most_common_block_type() != "Text":
+                continue
+
+            is_code = []
+            for line in block.lines:
+                fonts = [span.font for span in line.spans]
+                monospace_font = any([font for font in fonts if "mono" in font.lower() or "prop" in font.lower()])
+                line_height = line.bbox[3] - line.bbox[1]
+                line_start = line.bbox[0]
+                if line_height <= common_height and line_start > common_start and monospace_font:
+                    is_code.append(True)
+                else:
+                    is_code.append(False)
+            is_code = [
+                sum(is_code) > len(block.lines) / 1.5,
+                len(block.lines) > 4,
+                is_code_linelen(block.lines)
+            ]
+
+            if all(is_code):
+                block.set_block_type("Code")
+
+
+def indent_blocks(blocks: List[Page]):
+    span_counter = 0
+    for page in blocks:
+        for block in page.blocks:
+            if block.most_common_block_type() != "Code":
+                continue
+
+            lines = []
+            min_left = 1000  # will contain x- coord of column 0
+            col_width = 0  # width of 1 char
+            for line in block.lines:
+                text = ""
+                min_left = min(line.bbox[0], min_left)
+                for span in line.spans:
+                    if col_width == 0 and len(span.text) > 0:
+                        col_width = (span.bbox[2] - span.bbox[0]) / len(span.text)
+                    text += span.text
+                lines.append((pymupdf.Rect(line.bbox), text))
+
+            block_text = ""
+            for line in lines:
+                text = line[1]
+                prefix = " " * int((line[0].x0 - min_left) / col_width)
+                block_text += prefix + text + "\n"
+            new_span = Span(
+                text=block_text,
+                bbox=block.bbox,
+                color=block.lines[0].spans[0].color,
+                span_id=f"{span_counter}_fix_code",
+                font=block.lines[0].spans[0].font,
+                block_type="Code"
+            )
+            span_counter += 1
+            block.lines = [Line(spans=[new_span], bbox=block.bbox)]
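To make the is_code_linelen heuristic concrete, here is a small sketch; the make_line helper, its placeholder bounding boxes, and the sample text are all hypothetical. Short, code-like lines average far fewer than 50 word characters per line break, so the block passes the line-length test.

from marker.code import is_code_linelen
from marker.schema import Line, Span

def make_line(text, y):
    # hypothetical helper: one span per line, placeholder bbox and font
    bbox = [0, y, 100, y + 10]
    return Line(spans=[Span(text=text, bbox=bbox, span_id=f"s{y}", font="Courier_monospaced", color=0)], bbox=bbox)

lines = [make_line("for i in range(10):", 0), make_line("    total += i", 12), make_line("print(total)", 24)]
print(is_code_linelen(lines))  # True: ~15 alphanumeric chars per newline, well under the default threshold of 50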
marker/equations.py
ADDED
@@ -0,0 +1,167 @@
+import io
+from copy import deepcopy
+from typing import List
+
+from nougat import NougatModel
+from nougat.utils.checkpoint import get_checkpoint
+import re
+from PIL import Image, ImageDraw
+import fitz as pymupdf
+from marker.bbox import should_merge_blocks, merge_boxes, multiple_boxes_intersect
+from marker.settings import settings
+from marker.schema import Page, Span, Line, Block, BlockType
+from nougat.utils.device import move_to_device
+
+
+def load_model():
+    ckpt = get_checkpoint(None, model_tag="0.1.0-small")
+    nougat_model = NougatModel.from_pretrained(ckpt)
+    if settings.TORCH_DEVICE != "cpu":
+        is_cuda = "cuda" in settings.TORCH_DEVICE
+        move_to_device(nougat_model, bf16=is_cuda, cuda=is_cuda)
+    nougat_model.eval()
+    return nougat_model
+
+
+nougat_model = load_model()
+MODEL_MAX = nougat_model.config.max_length
+
+NOUGAT_HALLUCINATION_WORDS = ["[MISSING_PAGE_POST]", "## References\n", "**Figure Captions**\n", "Footnote", "\par\par\par", "## Chapter", "Fig."]
+
+
+def contains_equation(text):
+    # Define a regular expression pattern to look for operators and symbols commonly found in equations
+    pattern = re.compile(r'[=\^\√∑∏∫∂∆π≈≠≤≥∞∩∪∈∉∀∃∅∇λμσαβγδεζηθφχψω]')
+    # Search the text for the pattern
+    match = pattern.search(text)
+
+    # Alternative equation patterns
+    alt_pattern = re.compile(r' P(?=[ \n\(\)$])')
+    alt_match = alt_pattern.search(text)
+    # Return True if the pattern is found, otherwise return False
+    return bool(match) or bool(alt_match)
+
+
+def mask_bbox(png_image, bbox, selected_bboxes):
+    mask = Image.new('L', png_image.size, 0)  # 'L' mode for grayscale
+    draw = ImageDraw.Draw(mask)
+    first_x = bbox[0]
+    first_y = bbox[1]
+    bbox_height = bbox[3] - bbox[1]
+    bbox_width = bbox[2] - bbox[0]
+
+    for box in selected_bboxes:
+        # Fit the box to the selected region
+        new_box = (box[0] - first_x, box[1] - first_y, box[2] - first_x, box[3] - first_y)
+        # Fit mask to image bounds versus the pdf bounds
+        resized = (
+            new_box[0] / bbox_width * png_image.size[0],
+            new_box[1] / bbox_height * png_image.size[1],
+            new_box[2] / bbox_width * png_image.size[0],
+            new_box[3] / bbox_height * png_image.size[1]
+        )
+        draw.rectangle(resized, fill=255)
+
+    result = Image.composite(png_image, Image.new('RGBA', png_image.size, 'white'), mask)
+    return result
+
+
+def get_nougat_text(page, old_text, bbox, selected_bboxes, save_id, max_length=MODEL_MAX):
+    pix = page.get_pixmap(dpi=settings.DPI, clip=bbox)
+    png = pix.pil_tobytes(format="PNG")
+    png_image = Image.open(io.BytesIO(png))
+    png_image = mask_bbox(png_image, bbox, selected_bboxes)
+
+    nougat_model.config.max_length = min(max_length, MODEL_MAX)
+    output = nougat_model.inference(image=png_image)
+    return output["predictions"][0]
+
+
+def replace_equations(doc, blocks: List[Page], block_types: List[List[BlockType]]):
+    span_id = 0
+    new_blocks = []
+    for pnum, page in enumerate(blocks):
+        i = 0
+        new_page_blocks = []
+        equation_boxes = [b.bbox for b in block_types[pnum] if b.block_type == "Formula"]
+        while i < len(page.blocks):
+            block = page.blocks[i]
+            block_text = block.prelim_text
+            bbox = block.bbox
+            # Check if the block contains an equation
+            if not block.contains_equation(equation_boxes):
+                new_page_blocks.append(block)
+                i += 1
+                continue
+
+            selected_blocks = [i]
+            if i > 0:
+                j = 1
+                prev_block = page.blocks[i - j]
+                prev_bbox = prev_block.bbox
+                while (should_merge_blocks(prev_bbox, bbox) or prev_block.contains_equation(equation_boxes)) and i - j >= 0:
+                    bbox = merge_boxes(prev_bbox, bbox)
+                    prev_block = page.blocks[i - j]
+                    prev_bbox = prev_block.bbox
+                    block_text = prev_block.prelim_text + " " + block_text
+                    new_page_blocks = new_page_blocks[:-1]  # Remove the previous block, since we're merging it in
+                    j += 1
+                    selected_blocks.append(i - j)
+
+            if i < len(page.blocks) - 1:
+                next_block = page.blocks[i + 1]
+                next_bbox = next_block.bbox
+                while (should_merge_blocks(bbox, next_bbox) or next_block.contains_equation(equation_boxes)) and i + 1 < len(page.blocks):
+                    bbox = merge_boxes(bbox, next_bbox)
+                    block_text += " " + next_block.prelim_text
+                    i += 1
+                    selected_blocks.append(i)
+                    if i + 1 < len(page.blocks):
+                        next_block = page.blocks[i + 1]
+                        next_bbox = next_block.bbox
+
+            used_nougat = False
+            if len(block_text) < 2000:
+                selected_bboxes = [page.blocks[i].bbox for i in selected_blocks]
+                # This prevents hallucinations from running on for a long time
+                max_tokens = len(block_text) + 50
+                max_char_length = 2 * len(block_text) + 100
+                nougat_text = get_nougat_text(doc[pnum], block_text, bbox, selected_bboxes, f"{pnum}_{i}", max_length=max_tokens)
+                conditions = [
+                    len(nougat_text) > 0,
+                    not any([word in nougat_text for word in NOUGAT_HALLUCINATION_WORDS]),
+                    len(nougat_text) < max_char_length,  # Reduce hallucinations
+                    len(nougat_text) >= len(block_text) * .8
+                ]
+                if all(conditions):
+                    block_line = Line(
+                        spans=[
+                            Span(
+                                text=nougat_text,
+                                bbox=bbox,
+                                span_id=f"{pnum}_{span_id}_fixeq",
+                                font="Latex",
+                                color=0,
+                                block_type="Formula"
+                            )
+                        ],
+                        bbox=bbox
+                    )
+                    new_page_blocks.append(Block(
+                        lines=[block_line],
+                        bbox=bbox,
+                        pnum=pnum
+                    ))
+                    used_nougat = True
+                    span_id += 1
+
+            if not used_nougat:
+                for block_idx in selected_blocks:
+                    new_page_blocks.append(page.blocks[block_idx])
+
+            i += 1
+        # Assign back to page
+        new_page = deepcopy(page)
+        new_page.blocks = new_page_blocks
+        new_blocks.append(new_page)
+    return new_blocks
marker/extract_text.py
ADDED
@@ -0,0 +1,94 @@
+import fitz as pymupdf
+import os
+from marker.settings import settings
+from marker.schema import Span, Line, Block, Page
+
+os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
+
+
+def get_tessocr(page, old_text, bbox):
+    pix = page.get_pixmap(dpi=settings.DPI, clip=bbox)
+
+    ocrpdf = pymupdf.open("pdf", pix.pdfocr_tobytes())
+    ocrpage = ocrpdf[0]
+    new_text = ocrpage.get_text()  # extract OCR-ed text
+
+    # Tesseract ignores leading spaces, hence some corrections
+    lblanks = len(old_text) - len(old_text.lstrip())
+
+    # prefix OCRed text with this many spaces
+    new_text = " " * lblanks + new_text
+    return new_text
+
+
+def font_flags_decomposer(flags):
+    """Make font flags human readable."""
+    l = []
+    if flags & 2 ** 0:
+        l.append("superscript")
+    if flags & 2 ** 1:
+        l.append("italic")
+    if flags & 2 ** 2:
+        l.append("serifed")
+    else:
+        l.append("sans")
+    if flags & 2 ** 3:
+        l.append("monospaced")
+    else:
+        l.append("proportional")
+    if flags & 2 ** 4:
+        l.append("bold")
+    return "_".join(l)
+
+
+def get_single_page_blocks(page, pnum):
+    blocks = page.get_text("dict", sort=True,
+                           flags=~pymupdf.TEXT_PRESERVE_LIGATURES & pymupdf.TEXT_PRESERVE_WHITESPACE & ~pymupdf.TEXT_PRESERVE_IMAGES & ~pymupdf.TEXT_INHIBIT_SPACES & pymupdf.TEXT_DEHYPHENATE & pymupdf.TEXT_MEDIABOX_CLIP)["blocks"]
+    page_blocks = []
+    span_id = 0
+    for block_idx, block in enumerate(blocks):
+        block_lines = []
+        for l in block["lines"]:
+            spans = []
+            for i, s in enumerate(l["spans"]):
+                block_text = s["text"]
+                bbox = s["bbox"]
+                # Find if any of the elements in invalid chars are in block_text
+                if set(settings.INVALID_CHARS).intersection(block_text):  # invalid characters encountered!
+                    # invoke OCR
+                    block_text = get_tessocr(page, block_text, bbox)
+                    # print("block %i, bbox: %s, text: %s" % (block_idx, bbox, block_text))
+                span_obj = Span(
+                    text=block_text,
+                    bbox=bbox,
+                    span_id=f"{pnum}_{span_id}",
+                    font=f"{s['font']}_{font_flags_decomposer(s['flags'])}",  # Add font flags to end of font
+                    color=s["color"],
+                    ascender=s["ascender"],
+                    descender=s["descender"],
+                )
+                spans.append(span_obj)  # Text, bounding box, span id
+                span_id += 1
+            line_obj = Line(
+                spans=spans,
+                bbox=l["bbox"]
+            )
+            block_lines.append(line_obj)
+        block_obj = Block(
+            lines=block_lines,
+            bbox=block["bbox"],
+            pnum=pnum
+        )
+        page_blocks.append(block_obj)
+    return page_blocks
+
+
+def get_text_blocks(doc):
+    all_blocks = []
+    toc = doc.get_toc()
+    for pnum, page in enumerate(doc):
+        blocks = get_single_page_blocks(page, pnum)
+        page_obj = Page(blocks=blocks, pnum=pnum)
+        all_blocks.append(page_obj)
+
+    return all_blocks, toc
marker/headers.py
ADDED
@@ -0,0 +1,60 @@
+from collections import Counter, defaultdict
+from itertools import chain
+
+from sklearn.cluster import DBSCAN, HDBSCAN
+import numpy as np
+
+from collections import Counter
+from copy import deepcopy
+
+from marker.schema import Page
+from typing import List
+
+
+def filter_common_elements(lines, page_count):
+    text = [s.text for line in lines for s in line.spans]
+    counter = Counter(text)
+    common = [k for k, v in counter.items() if v > page_count * .4]
+    bad_span_ids = [s.text for line in lines for s in line.spans if s.span_id in common]
+    return bad_span_ids
+
+
+def filter_header_footer(all_page_blocks, max_selected_lines=2):
+    first_lines = []
+    last_lines = []
+    for page in all_page_blocks:
+        nonblank_lines = page.get_nonblank_lines()
+        first_lines.extend(nonblank_lines[:max_selected_lines])
+        last_lines.extend(nonblank_lines[-max_selected_lines:])
+
+    bad_span_ids = filter_common_elements(first_lines, len(all_page_blocks))
+    bad_span_ids += filter_common_elements(last_lines, len(all_page_blocks))
+    return bad_span_ids
+
+
+def categorize_blocks(all_page_blocks: List[Page]):
+    spans = list(chain.from_iterable([p.get_nonblank_spans() for p in all_page_blocks]))
+    X = np.array(
+        [(*s.bbox, len(s.text)) for s in spans]
+    )
+
+    dbscan = DBSCAN(eps=.1, min_samples=5)
+    dbscan.fit(X)
+    labels = dbscan.labels_
+    label_chars = defaultdict(int)
+    for i, label in enumerate(labels):
+        label_chars[label] += len(spans[i].text)
+
+    most_common_label = None
+    most_chars = 0
+    for i in label_chars.keys():
+        if label_chars[i] > most_chars:
+            most_common_label = i
+            most_chars = label_chars[i]
+
+    labels = [0 if label == most_common_label else 1 for label in labels]
+    bad_span_ids = [spans[i].span_id for i in range(len(spans)) if labels[i] == 1]
+
+    return bad_span_ids
+
+
marker/markdown.py
ADDED
@@ -0,0 +1,163 @@
+from marker.schema import MergedLine, MergedBlock, FullyMergedBlock, Page
+import re
+from typing import List
+
+
+def surround_text(s, char_to_insert):
+    leading_whitespace = re.match(r'^(\s*)', s).group(1)
+    trailing_whitespace = re.search(r'(\s*)$', s).group(1)
+    stripped_string = s.strip()
+    modified_string = char_to_insert + stripped_string + char_to_insert
+    final_string = leading_whitespace + modified_string + trailing_whitespace
+    return final_string
+
+
+def merge_spans(blocks):
+    merged_blocks = []
+    for page in blocks:
+        page_blocks = []
+        for blocknum, block in enumerate(page.blocks):
+            block_lines = []
+            block_types = []
+            for linenum, line in enumerate(block.lines):
+                line_text = ""
+                if len(line.spans) == 0:
+                    continue
+                fonts = []
+                for i, span in enumerate(line.spans):
+                    font = span.font.lower()
+                    next_font = None
+                    if len(line.spans) > i + 1:
+                        next_font = line.spans[i + 1].font.lower()
+                    fonts.append(font)
+                    block_types.append(span.block_type)
+                    span_text = span.text
+                    if "ital" in font and (not next_font or "ital" not in next_font):
+                        span_text = surround_text(span_text, "*")
+                    elif "bold" in font and (not next_font or "bold" not in next_font):
+                        span_text = surround_text(span_text, "**")
+                    line_text += span_text
+                block_lines.append(MergedLine(
+                    text=line_text,
+                    fonts=fonts,
+                    bbox=line.bbox
+                ))
+            if len(block_lines) > 0:
+                page_blocks.append(MergedBlock(
+                    lines=block_lines,
+                    pnum=block.pnum,
+                    bbox=block.bbox,
+                    block_types=block_types
+                ))
+        merged_blocks.append(page_blocks)
+
+    return merged_blocks
+
+
+def block_surround(text, block_type):
+    dot_pattern = re.compile(r'(\s*\.\s*){4,}')
+    dot_multiline_pattern = re.compile(r'.*(\s*\.\s*){4,}.*', re.DOTALL)
+    match block_type:
+        case "Section-header":
+            if not text.startswith("#"):
+                text = "\n## " + text.strip() + "\n"
+        case "Title":
+            if not text.startswith("#"):
+                text = "# " + text.strip() + "\n"
+        case "Table" if dot_multiline_pattern.match(text):
+            text = dot_pattern.sub(' ', text)
+        case "List-item":
+            pass
+        case "Code":
+            text = "```\n" + text + "\n```\n"
+        case _:
+            pass
+    return text
+
+
+def line_separator(line1, line2, block_type, is_continuation=False):
+    # Remove hyphen in current line if next line and current line appear to be joined
+    hyphen_pattern = re.compile(r'.*[a-z][-]\s?$', re.DOTALL)
+    if line1 and hyphen_pattern.match(line1) and re.match(r"^[a-z]", line2):
+        # Split on — or - from the right
+        line1 = re.split(r"[-—]\s?$", line1)[0]
+        return line1.rstrip() + line2.lstrip()
+
+    lowercase_pattern1 = re.compile(r'.*[a-z,]\s?$', re.DOTALL)
+    lowercase_pattern2 = re.compile(r'^\s?[A-Za-z]', re.DOTALL)
+    end_pattern = re.compile(r'.*[.?!]\s?$', re.DOTALL)
+
+    if block_type in ["Title", "Section-header"]:
+        return line1.rstrip() + " " + line2.lstrip()
+    elif lowercase_pattern1.match(line1) and lowercase_pattern2.match(line2):
+        return line1.rstrip() + " " + line2.lstrip()
+    elif is_continuation:
+        return line1.rstrip() + " " + line2.lstrip()
+    elif block_type == "Text" and end_pattern.match(line1):
+        return line1 + "\n\n" + line2
+    elif block_type == "Formula":
+        return line1 + " " + line2
+    else:
+        return line1 + "\n" + line2
+
+
+def block_separator(line1, line2, block_type1, block_type2):
+    sep = "\n"
+    if block_type1 == "Text":
+        sep = "\n\n"
+
+    return sep + line2
+
+
+def merge_lines(blocks, page_blocks: List[Page]):
+    text_blocks = []
+    prev_type = None
+    prev_line = None
+    block_text = ""
+    block_type = ""
+    common_line_heights = [p.get_line_height_stats() for p in page_blocks]
+    for page in blocks:
+        for block in page:
+            block_type = block.most_common_block_type()
+            if block_type != prev_type and prev_type:
+                text_blocks.append(
+                    FullyMergedBlock(
+                        text=block_surround(block_text, prev_type),
+                        block_type=prev_type
+                    )
+                )
+                block_text = ""
+
+            prev_type = block_type
+            common_line_height = common_line_heights[block.pnum].most_common(1)[0][0]
+            for i, line in enumerate(block.lines):
+                line_height = line.bbox[3] - line.bbox[1]
+                prev_line_height = prev_line.bbox[3] - prev_line.bbox[1] if prev_line else 0
+                prev_line_x = prev_line.bbox[0] if prev_line else 0
+                prev_line = line
+                is_continuation = line_height == prev_line_height and line.bbox[0] == prev_line_x
+                if block_text:
+                    block_text = line_separator(block_text, line.text, block_type, is_continuation)
+                else:
+                    block_text = line.text
+
+    # Append the final block
+    text_blocks.append(
+        FullyMergedBlock(
+            text=block_surround(block_text, prev_type),
+            block_type=block_type
+        )
+    )
+    return text_blocks
+
+
+def get_full_text(text_blocks):
+    full_text = ""
+    prev_block = None
+    for block in text_blocks:
+        if prev_block:
+            full_text += block_separator(prev_block.text, block.text, prev_block.block_type, block.block_type)
+        else:
+            full_text += block.text
+        prev_block = block
+    return full_text
marker/schema.py
ADDED
@@ -0,0 +1,176 @@
+from collections import Counter
+from typing import List
+
+from pydantic import BaseModel, field_validator
+
+from marker.bbox import boxes_intersect_pct, multiple_boxes_intersect
+from marker.settings import settings
+
+
+def find_span_type(span, page_blocks):
+    block_type = "Text"
+    for block in page_blocks:
+        if boxes_intersect_pct(span.bbox, block.bbox):
+            block_type = block.block_type
+            break
+    return block_type
+
+
+class BboxElement(BaseModel):
+    bbox: List[float]
+
+    @field_validator('bbox')
+    @classmethod
+    def check_4_elements(cls, v: List[float]) -> List[float]:
+        if len(v) != 4:
+            raise ValueError('bbox must have 4 elements')
+        return v
+
+
+class BlockType(BboxElement):
+    block_type: str
+
+
+class Span(BboxElement):
+    text: str
+    span_id: str
+    font: str
+    color: int
+    ascender: float | None = None
+    descender: float | None = None
+    block_type: str | None = None
+    selected: bool = True
+
+
+class Line(BboxElement):
+    spans: List[Span]
+
+    @property
+    def prelim_text(self):
+        return "".join([s.text for s in self.spans])
+
+    @property
+    def start(self):
+        return self.spans[0].bbox[0]
+
+
+class Block(BboxElement):
+    lines: List[Line]
+    pnum: int
+
+    @property
+    def prelim_text(self):
+        return "\n".join([l.prelim_text for l in self.lines])
+
+    def contains_equation(self, equation_boxes=None):
+        conditions = [s.block_type == "Formula" for l in self.lines for s in l.spans]
+        if equation_boxes:
+            conditions += [multiple_boxes_intersect(self.bbox, equation_boxes)]
+        return any(conditions)
+
+    def filter_spans(self, bad_span_ids):
+        new_lines = []
+        for line in self.lines:
+            new_spans = []
+            for span in line.spans:
+                if not span.span_id in bad_span_ids:
+                    new_spans.append(span)
+            line.spans = new_spans
+            if len(new_spans) > 0:
+                new_lines.append(line)
+        self.lines = new_lines
+
+    def filter_bad_span_types(self, block_types: List[BlockType]):
+        bad_spans = [b.bbox for b in block_types if b.block_type in settings.BAD_SPAN_TYPES]
+        new_lines = []
+        for line in self.lines:
+            new_spans = []
+            for span in line.spans:
+                if not multiple_boxes_intersect(span.bbox, bad_spans):
+                    new_spans.append(span)
+            line.spans = new_spans
+            if len(new_spans) > 0:
+                new_lines.append(line)
+        self.lines = new_lines
+
+    def most_common_block_type(self):
+        counter = Counter([s.block_type for l in self.lines for s in l.spans])
+        return counter.most_common(1)[0][0]
+
+    def set_block_type(self, block_type):
+        for line in self.lines:
+            for span in line.spans:
+                span.block_type = block_type
+
+
+class Page(BaseModel):
+    blocks: List[Block]
+    pnum: int
+
+    def get_nonblank_lines(self):
+        lines = self.get_all_lines()
+        nonblank_lines = [l for l in lines if l.prelim_text.strip()]
+        return nonblank_lines
+
+    def get_all_lines(self):
+        lines = [l for b in self.blocks for l in b.lines]
+        return lines
+
+    def get_nonblank_spans(self) -> List[Span]:
+        lines = [l for b in self.blocks for l in b.lines]
+        spans = [s for l in lines for s in l.spans if s.text.strip()]
+        return spans
+
+    def add_block_types(self, page_block_types):
+        if len(page_block_types) != len(self.get_all_lines()):
+            print(f"Warning: Number of detected lines {len(page_block_types)} does not match number of lines {len(self.get_all_lines())}")
+
+        i = 0
+        for block in self.blocks:
+            for line in block.lines:
+                if i < len(page_block_types):
+                    line_block_type = page_block_types[i].block_type
+                else:
+                    line_block_type = "Text"
+                i += 1
+                for span in line.spans:
+                    span.block_type = line_block_type
+
+    def get_font_stats(self):
+        fonts = [s.font for s in self.get_nonblank_spans()]
+        font_counts = Counter(fonts)
+        return font_counts
+
+    def get_line_height_stats(self):
+        heights = [l.bbox[3] - l.bbox[1] for l in self.get_nonblank_lines()]
+        height_counts = Counter(heights)
+        return height_counts
+
+    def get_line_start_stats(self):
+        starts = [l.bbox[0] for l in self.get_nonblank_lines()]
+        start_counts = Counter(starts)
+        return start_counts
+
+
+class MergedLine(BboxElement):
+    text: str
+    fonts: List[str]
+
+    def most_common_font(self):
+        counter = Counter(self.fonts)
+        return counter.most_common(1)[0][0]
+
+
+class MergedBlock(BboxElement):
+    lines: List[MergedLine]
+    pnum: int
+    block_types: List[str]
+
+    def most_common_block_type(self):
+        counter = Counter(self.block_types)
+        return counter.most_common(1)[0][0]
+
+
+class FullyMergedBlock(BaseModel):
+    text: str
+    block_type: str
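A small sketch of how the hierarchy above nests (all values hypothetical): spans carry text, font and color, lines group spans, blocks group lines, and a Page wraps the blocks of one PDF page.

from marker.schema import Span, Line, Block, Page

span = Span(text="Hello world", bbox=[10, 10, 90, 22], span_id="0_0", font="Times_serifed_proportional", color=0, block_type="Text")
line = Line(spans=[span], bbox=[10, 10, 90, 22])
block = Block(lines=[line], bbox=[10, 10, 90, 22], pnum=0)
page = Page(blocks=[block], pnum=0)

print(block.prelim_text)               # "Hello world"
print(block.most_common_block_type())  # "Text"
print(page.get_line_height_stats())    # Counter({12.0: 1})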
marker/segmentation.py
ADDED
@@ -0,0 +1,139 @@
+from typing import List
+
+from transformers import LayoutLMv3ForTokenClassification
+
+from marker.bbox import unnormalize_box
+from transformers.models.layoutlmv3.image_processing_layoutlmv3 import normalize_box
+import io
+from PIL import Image
+from transformers import LayoutLMv3Processor
+import numpy as np
+from marker.settings import settings
+from marker.schema import Page, BlockType
+import torch
+
+processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
+
+CHUNK_KEYS = ["input_ids", "attention_mask", "bbox", "offset_mapping"]
+NO_CHUNK_KEYS = ["pixel_values"]
+MODEL_MAX_LEN = 512
+CHUNK_OVERLAP = 128
+
+
+def load_model():
+    model = LayoutLMv3ForTokenClassification.from_pretrained("Kwan0/layoutlmv3-base-finetune-DocLayNet-100k").to(settings.TORCH_DEVICE)
+    model.config.id2label = {
+        0: "Caption",
+        1: "Footnote",
+        2: "Formula",
+        3: "List-item",
+        4: "Page-footer",
+        5: "Page-header",
+        6: "Picture",
+        7: "Section-header",
+        8: "Table",
+        9: "Text",
+        10: "Title"
+    }
+
+    model.config.label2id = d = {v: k for k, v in model.config.id2label.items()}
+    return model
+
+
+layoutlm_model = load_model()
+
+
+def detect_all_block_types(doc, blocks: List[Page]):
+    block_types = []
+    for pnum, page in enumerate(doc):
+        page_blocks = blocks[pnum]
+        predictions = detect_page_block_types(page, page_blocks)
+        block_types.append(predictions)
+    return block_types
+
+
+def detect_page_block_types(page, page_blocks: Page):
+    page_box = page.bound()
+    pwidth = page_box[2] - page_box[0]
+    pheight = page_box[3] - page_box[1]
+
+    pix = page.get_pixmap(dpi=400)
+    png = pix.pil_tobytes(format="PNG")
+    png_image = Image.open(io.BytesIO(png))
+    rgb_image = png_image.convert('RGB')
+
+    lines = page_blocks.get_all_lines()
+    boxes = [s.bbox for s in lines]
+    text = [s.prelim_text for s in lines]
+
+    predictions = make_predictions(rgb_image, text, boxes, pwidth, pheight)
+    return predictions
+
+
+def find_first_false(lst, start_idx):
+    # Traverse the list to the left from start_idx
+    for idx in range(start_idx, -1, -1):
+        if not lst[idx]:
+            return idx
+
+    return 0  # Return 0 if no false found (aka, no lines)
+
+
+def get_provisional_boxes(pred, box, is_subword, start_idx=0):
+    prov_predictions = [pred_ for idx, pred_ in enumerate(pred) if not is_subword[idx]][start_idx:]
+    prov_boxes = [box_ for idx, box_ in enumerate(box) if not is_subword[idx]][start_idx:]
+    return prov_predictions, prov_boxes
+
+
+def make_predictions(rgb_image, text, boxes, pwidth, pheight) -> List[BlockType]:
+    # Normalize boxes for model (scale to 1000x1000)
+    boxes = [normalize_box(box, pwidth, pheight) for box in boxes]
+    encoding = processor(rgb_image, text=text, boxes=boxes, return_offsets_mapping=True, return_tensors="pt", truncation=True, stride=CHUNK_OVERLAP, padding="max_length", max_length=MODEL_MAX_LEN, return_overflowing_tokens=True)
+    offset_mapping = encoding.pop('offset_mapping')
+    overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')
+
+    # change the shape of pixel values
+    x = []
+    for i in range(0, len(encoding['pixel_values'])):
+        x.append(encoding['pixel_values'][i])
+    x = torch.stack(x)
+    encoding['pixel_values'] = x
+
+    with torch.no_grad():
+        encoding = encoding.to(settings.TORCH_DEVICE)
+        outputs = layoutlm_model(**encoding)
+
+    logits = outputs.logits
+    # We take the highest score for each token, using argmax. This serves as the predicted label for each token.
+    predictions = logits.argmax(-1).squeeze().tolist()
+    token_boxes = encoding.bbox.squeeze().tolist()
+
+    if len(token_boxes) == MODEL_MAX_LEN:
+        predictions = [predictions]
+        token_boxes = [token_boxes]
+
+    predicted_block_types = []
+
+    for i, (pred, box, mapped) in enumerate(zip(predictions, token_boxes, offset_mapping)):
+        is_subword = np.array(mapped.squeeze().tolist())[:, 0] != 0
+        overlap_adjust = 0
+        if i > 0:
+            overlap_adjust = 1 + CHUNK_OVERLAP - sum(is_subword[:1 + CHUNK_OVERLAP])
+
+        prov_predictions, prov_boxes = get_provisional_boxes(pred, box, is_subword, overlap_adjust)
+
+        for prov_box, prov_prediction in zip(prov_boxes, prov_predictions):
+            if prov_box == [0, 0, 0, 0]:
+                continue
+            unnorm_box = unnormalize_box(prov_box, pwidth, pheight)
+            block_type = BlockType(
+                block_type=layoutlm_model.config.id2label[prov_prediction],
+                bbox=unnorm_box
+            )
+
+            # Sometimes blocks will cross chunks, unclear why
+            if len(predicted_block_types) == 0 or unnorm_box != predicted_block_types[-1].bbox:
+                predicted_block_types.append(block_type)
+
+    return predicted_block_types
+
marker/settings.py
ADDED
@@ -0,0 +1,20 @@
+import os
+from typing import Optional, List
+
+from dotenv import find_dotenv
+from pydantic_settings import BaseSettings
+
+
+class Settings(BaseSettings):
+    # Path settings
+    DPI: int = 400
+    INVALID_CHARS: List[str] = [chr(0xfffd), "~", chr(65533), "↵"]
+    TORCH_DEVICE: str = "cpu"
+    TESSDATA_PREFIX: str = ""
+    BAD_SPAN_TYPES: List[str] = ["Caption", "Footnote", "Page-footer", "Page-header", "Picture"]
+
+    class Config:
+        env_file = find_dotenv("local.env")
+
+
+settings = Settings()
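Since Settings is a pydantic BaseSettings class wired to local.env, each field can be overridden per machine without code changes. A minimal sketch (the local.env values shown in the comment are hypothetical examples, not part of this commit):

from marker.settings import settings

# Defaults come from the class above; a local.env file found by find_dotenv can override them, e.g.
#   TORCH_DEVICE=cuda
#   TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
print(settings.DPI, settings.TORCH_DEVICE)  # "400 cpu" unless overridden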
parse.py
ADDED
@@ -0,0 +1,49 @@
+import fitz as pymupdf
+from marker.extract_text import get_text_blocks
+from marker.headers import categorize_blocks, filter_header_footer
+from marker.equations import replace_equations
+from marker.segmentation import detect_all_block_types
+from marker.code import identify_code_blocks, indent_blocks
+from marker.markdown import merge_spans, merge_lines, get_full_text
+from marker.schema import Page, BlockType
+from typing import List
+from copy import deepcopy
+
+
+def annotate_spans(blocks: List[Page], block_types: List[BlockType]):
+    for i, page in enumerate(blocks):
+        page_block_types = block_types[i]
+        page.add_block_types(page_block_types)
+
+
+if __name__ == "__main__":
+    fname = "test_data/thinkpython.pdf"
+    doc = pymupdf.open(fname)
+    blocks, toc = get_text_blocks(doc)
+
+    block_types = detect_all_block_types(doc, blocks)
+
+    filtered = deepcopy(blocks)
+    annotate_spans(filtered, block_types)
+    identify_code_blocks(filtered)
+    indent_blocks(filtered)
+
+    bad_span_ids = categorize_blocks(blocks)
+    bad_span_ids += filter_header_footer(blocks)
+
+    # Copy to avoid changing original data
+
+    for page in filtered:
+        for block in page.blocks:
+            block.filter_spans(bad_span_ids)
+            block.filter_bad_span_types(block_types[page.pnum])
+
+    filtered = replace_equations(doc, filtered, block_types)
+
+    # Copy to avoid changing original data
+    merged_lines = merge_spans(filtered)
+    text_blocks = merge_lines(merged_lines, filtered)
+    full_text = get_full_text(text_blocks)
+
+    with open("test_data/thinkpython.md", "w+") as f:
+        f.write(full_text)
poetry.lock
ADDED
The diff for this file is too large to render.
pyproject.toml
ADDED
@@ -0,0 +1,28 @@
+[tool.poetry]
+name = "marker"
+version = "0.1.0"
+description = ""
+authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = ">=3.9,<3.13"
+scikit-learn = "^1.3.2"
+Pillow = "^10.1.0"
+pytesseract = "^0.3.10"
+PyMuPDF = "^1.23.5"
+pymupdf-fonts = "^1.0.5"
+pydantic = "^2.4.2"
+pydantic-settings = "^2.0.3"
+nougat-ocr = "^0.1.17"
+transformers = "^4.34.1"
+torch = "^2.1.0"
+numpy = "^1.26.1"
+python-dotenv = "^1.0.0"
+
+[tool.poetry.group.dev.dependencies]
+jupyter = "^1.0.0"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
requirements.txt
ADDED
@@ -0,0 +1,12 @@
+Pillow==9.5.0
+layoutparser
+torchvision
+git+https://github.com/facebookresearch/detectron2.git@v0.5#egg=detectron2
+pytesseract
+pymupdf
+pymupdf-fonts
+pydantic
+pydantic-settings
+nougat
+transformers
+scikit-learn