Vik Paruchuri
committed on
Commit
·
e0b8544
1
Parent(s):
182dbdb
Alter OCR thresholds
Browse files- README.md +1 -0
- marker/cleaners/equations.py +1 -2
- marker/extract_text.py +40 -20
- marker/schema.py +7 -0
- marker/settings.py +2 -3
- poetry.lock +15 -1
- pyproject.toml +2 -0
README.md
CHANGED
|
@@ -13,6 +13,7 @@ PDF is a tricky format, so this will not always work perfectly, but it is good e
|
|
| 13 |
## Install
|
| 14 |
|
| 15 |
- `poetry install`
|
|
|
|
| 16 |
- Install apt requirements
|
| 17 |
- Set `TESSDATA_PREFIX`
|
| 18 |
- Find tessdata folder
|
|
|
|
| 13 |
## Install
|
| 14 |
|
| 15 |
- `poetry install`
|
| 16 |
+
- Recommend installing tesseract 5 by following [these instructions](https://notesalexp.org/tesseract-ocr/html/). You may get tesseract 4 otherwise.
|
| 17 |
- Install apt requirements
|
| 18 |
- Set `TESSDATA_PREFIX`
|
| 19 |
- Find tessdata folder
|
marker/cleaners/equations.py
CHANGED
|
@@ -60,8 +60,7 @@ def mask_bbox(png_image, bbox, selected_bboxes):
|
|
| 60 |
|
| 61 |
|
| 62 |
def get_nougat_text(page, bbox, selected_bboxes, nougat_model, max_length=settings.NOUGAT_MODEL_MAX):
|
| 63 |
-
|
| 64 |
-
pix = page.get_pixmap(dpi=settings.NOUGAT_DPI, clip=bbox, matrix=mat)
|
| 65 |
png = pix.pil_tobytes(format="BMP")
|
| 66 |
png_image = Image.open(io.BytesIO(png))
|
| 67 |
png_image = mask_bbox(png_image, bbox, selected_bboxes)
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
def get_nougat_text(page, bbox, selected_bboxes, nougat_model, max_length=settings.NOUGAT_MODEL_MAX):
|
| 63 |
+
pix = page.get_pixmap(dpi=settings.NOUGAT_DPI, clip=bbox)
|
|
|
|
| 64 |
png = pix.pil_tobytes(format="BMP")
|
| 65 |
png_image = Image.open(io.BytesIO(png))
|
| 66 |
png_image = mask_bbox(png_image, bbox, selected_bboxes)
|
marker/extract_text.py
CHANGED
|
@@ -4,39 +4,53 @@ from marker.settings import settings
|
|
| 4 |
from marker.schema import Span, Line, Block, Page
|
| 5 |
import string
|
| 6 |
from spellchecker import SpellChecker
|
|
|
|
| 7 |
|
| 8 |
os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
|
| 9 |
TEXT_FLAGS = ~pymupdf.TEXT_PRESERVE_LIGATURES & pymupdf.TEXT_PRESERVE_WHITESPACE & ~pymupdf.TEXT_PRESERVE_IMAGES & ~pymupdf.TEXT_INHIBIT_SPACES & pymupdf.TEXT_DEHYPHENATE & pymupdf.TEXT_MEDIABOX_CLIP
|
| 10 |
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
def ocr_entire_page(page, lang: str, spell_lang: str | None):
|
| 13 |
-
mat = pymupdf.Matrix(settings.SEGMENT_ZOOM, settings.SEGMENT_ZOOM)
|
| 14 |
try:
|
| 15 |
-
full_tp = page.get_textpage_ocr(flags=TEXT_FLAGS, dpi=settings.DPI, full=True, language=lang
|
| 16 |
blocks = page.get_text("dict", sort=True, flags=TEXT_FLAGS, textpage=full_tp)["blocks"]
|
| 17 |
full_text = page.get_text("text", flags=TEXT_FLAGS, textpage=full_tp)
|
| 18 |
|
| 19 |
-
words = full_text.split()
|
| 20 |
-
words = [w for w in words if w.strip()]
|
| 21 |
-
alpha_words = [word for word in words if word.isalnum()]
|
| 22 |
-
nonalpha_words = [word for word in words if not word.isalnum()]
|
| 23 |
-
|
| 24 |
# Check spelling to determine if OCR worked
|
| 25 |
# If it didn't, return empty list
|
| 26 |
# OCR can fail if there is a scanned blank page with some faint text impressions, for example
|
| 27 |
-
if spell_lang:
|
| 28 |
-
|
| 29 |
-
misspelled = spell.unknown(alpha_words)
|
| 30 |
-
if len(misspelled) + len(nonalpha_words) > len(words) / 1.5:
|
| 31 |
-
return []
|
| 32 |
except RuntimeError:
|
| 33 |
return []
|
| 34 |
return blocks
|
| 35 |
|
| 36 |
|
| 37 |
def ocr_bbox(page, old_text, bbox, lang: str):
|
| 38 |
-
|
| 39 |
-
pix = page.get_pixmap(dpi=settings.DPI, clip=bbox, matrix=mat)
|
| 40 |
|
| 41 |
try:
|
| 42 |
ocrpdf = pymupdf.open("pdf", pix.pdfocr_tobytes(language=lang))
|
|
@@ -131,8 +145,11 @@ def get_single_page_blocks(page, pnum: int, tess_lang: str, spell_lang=None, ocr
|
|
| 131 |
def alphanum_ratio(text):
|
| 132 |
alphanumeric_count = sum([1 for c in text if c.isalnum()])
|
| 133 |
|
| 134 |
-
if
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
ratio = alphanumeric_count / len(text)
|
| 138 |
return ratio
|
|
@@ -141,6 +158,7 @@ def alphanum_ratio(text):
|
|
| 141 |
def get_text_blocks(doc, tess_lang: str, spell_lang: str, max_pages: int | None=None):
|
| 142 |
all_blocks = []
|
| 143 |
toc = doc.get_toc()
|
|
|
|
| 144 |
for pnum, page in enumerate(doc):
|
| 145 |
if max_pages and pnum >= max_pages:
|
| 146 |
break
|
|
@@ -149,16 +167,18 @@ def get_text_blocks(doc, tess_lang: str, spell_lang: str, max_pages: int | None=
|
|
| 149 |
|
| 150 |
# OCR page if we got minimal text, or if we got too many spaces
|
| 151 |
conditions = [
|
| 152 |
-
len(page_obj.get_nonblank_lines()) < 3,
|
| 153 |
(
|
| 154 |
-
|
| 155 |
-
|
| 156 |
),
|
| 157 |
2 < pnum < len(doc) - 2
|
| 158 |
]
|
| 159 |
if all(conditions):
|
| 160 |
blocks = get_single_page_blocks(page, pnum, tess_lang, spell_lang, ocr=True)
|
| 161 |
page_obj = Page(blocks=blocks, pnum=pnum)
|
| 162 |
-
|
|
|
|
|
|
|
| 163 |
|
|
|
|
| 164 |
return all_blocks, toc
|
|
|
|
| 4 |
from marker.schema import Span, Line, Block, Page
|
| 5 |
import string
|
| 6 |
from spellchecker import SpellChecker
|
| 7 |
+
from nltk.tokenize import wordpunct_tokenize
|
| 8 |
|
| 9 |
os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
|
| 10 |
# Text-extraction flags are a bitmask: OR together the options to enable.
# AND-ing distinct single-bit flags always evaluates to 0, which silently
# disabled every option. Ligatures, images and space inhibition stay off
# simply by being left out of the mask.
TEXT_FLAGS = pymupdf.TEXT_PRESERVE_WHITESPACE | pymupdf.TEXT_DEHYPHENATE | pymupdf.TEXT_MEDIABOX_CLIP
|
| 11 |
|
| 12 |
|
| 13 |
+
def detect_bad_ocr(text, spell_lang: str | None, misspell_threshold=.8, space_threshold=.5, newline_threshold=.3):
|
| 14 |
+
words = wordpunct_tokenize(text)
|
| 15 |
+
words = [w for w in words if w.strip()]
|
| 16 |
+
alpha_words = [word for word in words if word.isalnum()]
|
| 17 |
+
nonalpha_words = [word for word in words if not word.isalnum()]
|
| 18 |
+
|
| 19 |
+
if spell_lang:
|
| 20 |
+
spell = SpellChecker(language=spell_lang)
|
| 21 |
+
misspelled = spell.unknown(alpha_words)
|
| 22 |
+
if len(misspelled) + len(nonalpha_words) > len(words) * misspell_threshold:
|
| 23 |
+
return True
|
| 24 |
+
spaces = text.count(" ")
|
| 25 |
+
# More than 50% of chars are spaces
|
| 26 |
+
if spaces / len(text) > space_threshold:
|
| 27 |
+
return True
|
| 28 |
+
|
| 29 |
+
newlines = text.count("\n")
|
| 30 |
+
# More than 30% of chars are newlines
|
| 31 |
+
if newlines / len(text) > newline_threshold:
|
| 32 |
+
return True
|
| 33 |
+
return False
|
| 34 |
+
|
| 35 |
+
|
| 36 |
def ocr_entire_page(page, lang: str, spell_lang: str | None):
    """OCR the full page and return its text blocks.

    Builds an OCR text page with ``full=True`` at ``settings.DPI``, then reads
    both the block dictionary and the plain text from it. Returns an empty
    list when a ``RuntimeError`` is raised or when ``detect_bad_ocr`` flags
    the recognized text; otherwise returns the extracted blocks.
    """
    try:
        ocr_textpage = page.get_textpage_ocr(flags=TEXT_FLAGS, dpi=settings.DPI, full=True, language=lang)
        page_blocks = page.get_text("dict", sort=True, flags=TEXT_FLAGS, textpage=ocr_textpage)["blocks"]
        ocr_text = page.get_text("text", flags=TEXT_FLAGS, textpage=ocr_textpage)

        # Check the recognized text to determine whether OCR worked.
        # OCR can fail if there is a scanned blank page with some faint text
        # impressions, for example.
        ocr_failed = detect_bad_ocr(ocr_text, spell_lang)
    except RuntimeError:
        return []

    return [] if ocr_failed else page_blocks
|
| 50 |
|
| 51 |
|
| 52 |
def ocr_bbox(page, old_text, bbox, lang: str):
|
| 53 |
+
pix = page.get_pixmap(dpi=settings.SEGMENT_DPI, clip=bbox)
|
|
|
|
| 54 |
|
| 55 |
try:
|
| 56 |
ocrpdf = pymupdf.open("pdf", pix.pdfocr_tobytes(language=lang))
|
|
|
|
| 145 |
def alphanum_ratio(text):
    """Return the fraction of characters in ``text`` that are alphanumeric.

    Empty text returns 1 rather than dividing by zero.
    """
    if not text:
        # No characters means no alphanumeric characters either, so there is
        # only one possible outcome for empty input.
        return 1

    alphanumeric_count = sum(1 for c in text if c.isalnum())
    return alphanumeric_count / len(text)
|
|
|
|
| 158 |
def get_text_blocks(doc, tess_lang: str, spell_lang: str, max_pages: int | None=None):
|
| 159 |
all_blocks = []
|
| 160 |
toc = doc.get_toc()
|
| 161 |
+
page_extracted = False
|
| 162 |
for pnum, page in enumerate(doc):
|
| 163 |
if max_pages and pnum >= max_pages:
|
| 164 |
break
|
|
|
|
| 167 |
|
| 168 |
# OCR page if we got minimal text, or if we got too many spaces
|
| 169 |
conditions = [
|
|
|
|
| 170 |
(
|
| 171 |
+
(len(page_obj.get_nonblank_lines()) < 3 and not page_extracted) # Possibly PDF has no text, and needs full OCR
|
| 172 |
+
or alphanum_ratio(page_obj.prelim_text) < .6 # Garbled text
|
| 173 |
),
|
| 174 |
2 < pnum < len(doc) - 2
|
| 175 |
]
|
| 176 |
if all(conditions):
|
| 177 |
blocks = get_single_page_blocks(page, pnum, tess_lang, spell_lang, ocr=True)
|
| 178 |
page_obj = Page(blocks=blocks, pnum=pnum)
|
| 179 |
+
page_extracted = False
|
| 180 |
+
else:
|
| 181 |
+
page_extracted = True
|
| 182 |
|
| 183 |
+
all_blocks.append(page_obj)
|
| 184 |
return all_blocks, toc
|
marker/schema.py
CHANGED
|
@@ -2,6 +2,7 @@ from collections import Counter
|
|
| 2 |
from typing import List
|
| 3 |
|
| 4 |
from pydantic import BaseModel, field_validator
|
|
|
|
| 5 |
|
| 6 |
from marker.bbox import boxes_intersect_pct, multiple_boxes_intersect
|
| 7 |
from marker.settings import settings
|
|
@@ -63,6 +64,12 @@ class Span(BboxElement):
|
|
| 63 |
selected: bool = True
|
| 64 |
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
class Line(BboxElement):
|
| 67 |
spans: List[Span]
|
| 68 |
|
|
|
|
| 2 |
from typing import List
|
| 3 |
|
| 4 |
from pydantic import BaseModel, field_validator
|
| 5 |
+
import ftfy
|
| 6 |
|
| 7 |
from marker.bbox import boxes_intersect_pct, multiple_boxes_intersect
|
| 8 |
from marker.settings import settings
|
|
|
|
| 64 |
selected: bool = True
|
| 65 |
|
| 66 |
|
| 67 |
+
@field_validator('text')
@classmethod
def fix_unicode(cls, text: str) -> str:
    """Validator for the ``text`` field: return the value passed through ``ftfy.fix_text``."""
    repaired = ftfy.fix_text(text)
    return repaired
|
| 71 |
+
|
| 72 |
+
|
| 73 |
class Line(BboxElement):
|
| 74 |
spans: List[Span]
|
| 75 |
|
marker/settings.py
CHANGED
|
@@ -23,7 +23,7 @@ class Settings(BaseSettings):
|
|
| 23 |
# OCR
|
| 24 |
INVALID_CHARS: List[str] = [chr(0xfffd), chr(65533)]
|
| 25 |
DPI: int = 800
|
| 26 |
-
|
| 27 |
TESSDATA_PREFIX: str = ""
|
| 28 |
TESSERACT_LANGUAGES: Dict = {
|
| 29 |
"English": "eng",
|
|
@@ -46,8 +46,7 @@ class Settings(BaseSettings):
|
|
| 46 |
NOUGAT_MODEL_MAX: int = 1024 # Max inference length for nougat
|
| 47 |
NOUGAT_HALLUCINATION_WORDS: List[str] = ["[MISSING_PAGE_POST]", "## References\n", "**Figure Captions**\n", "Footnote",
|
| 48 |
"\par\par\par", "## Chapter", "Fig.", "particle"]
|
| 49 |
-
NOUGAT_DPI: int = 96 # DPI to render images at
|
| 50 |
-
NOUGAT_ZOOM: int = 3 # Zoom to render images at
|
| 51 |
|
| 52 |
# Layout Model
|
| 53 |
BAD_SPAN_TYPES: List[str] = ["Caption", "Footnote", "Page-footer", "Page-header", "Picture"]
|
|
|
|
| 23 |
# OCR
|
| 24 |
INVALID_CHARS: List[str] = [chr(0xfffd), chr(65533)]
|
| 25 |
DPI: int = 800
|
| 26 |
+
SEGMENT_DPI: int = 1200
|
| 27 |
TESSDATA_PREFIX: str = ""
|
| 28 |
TESSERACT_LANGUAGES: Dict = {
|
| 29 |
"English": "eng",
|
|
|
|
| 46 |
NOUGAT_MODEL_MAX: int = 1024 # Max inference length for nougat
|
| 47 |
NOUGAT_HALLUCINATION_WORDS: List[str] = ["[MISSING_PAGE_POST]", "## References\n", "**Figure Captions**\n", "Footnote",
|
| 48 |
"\par\par\par", "## Chapter", "Fig.", "particle"]
|
| 49 |
+
NOUGAT_DPI: int = 96 # DPI to render images at, matches default settings for nougat
|
|
|
|
| 50 |
|
| 51 |
# Layout Model
|
| 52 |
BAD_SPAN_TYPES: List[str] = ["Caption", "Footnote", "Page-footer", "Page-header", "Picture"]
|
poetry.lock
CHANGED
|
@@ -881,6 +881,20 @@ smb = ["smbprotocol"]
|
|
| 881 |
ssh = ["paramiko"]
|
| 882 |
tqdm = ["tqdm"]
|
| 883 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 884 |
[[package]]
|
| 885 |
name = "huggingface-hub"
|
| 886 |
version = "0.17.3"
|
|
@@ -5112,4 +5126,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
|
|
| 5112 |
[metadata]
|
| 5113 |
lock-version = "2.0"
|
| 5114 |
python-versions = ">=3.9,<3.13"
|
| 5115 |
-
content-hash = "
|
|
|
|
| 881 |
ssh = ["paramiko"]
|
| 882 |
tqdm = ["tqdm"]
|
| 883 |
|
| 884 |
+
[[package]]
|
| 885 |
+
name = "ftfy"
|
| 886 |
+
version = "6.1.1"
|
| 887 |
+
description = "Fixes mojibake and other problems with Unicode, after the fact"
|
| 888 |
+
optional = false
|
| 889 |
+
python-versions = ">=3.7,<4"
|
| 890 |
+
files = [
|
| 891 |
+
{file = "ftfy-6.1.1-py3-none-any.whl", hash = "sha256:0ffd33fce16b54cccaec78d6ec73d95ad370e5df5a25255c8966a6147bd667ca"},
|
| 892 |
+
{file = "ftfy-6.1.1.tar.gz", hash = "sha256:bfc2019f84fcd851419152320a6375604a0f1459c281b5b199b2cd0d2e727f8f"},
|
| 893 |
+
]
|
| 894 |
+
|
| 895 |
+
[package.dependencies]
|
| 896 |
+
wcwidth = ">=0.2.5"
|
| 897 |
+
|
| 898 |
[[package]]
|
| 899 |
name = "huggingface-hub"
|
| 900 |
version = "0.17.3"
|
|
|
|
| 5126 |
[metadata]
|
| 5127 |
lock-version = "2.0"
|
| 5128 |
python-versions = ">=3.9,<3.13"
|
| 5129 |
+
content-hash = "79ac2f866f67a7f018f227251d027ec89dca6c45d33337884ee8a7c3e749edd0"
|
pyproject.toml
CHANGED
|
@@ -25,6 +25,8 @@ tabulate = "^0.9.0"
|
|
| 25 |
thefuzz = "^0.20.0"
|
| 26 |
python-magic = "^0.4.27"
|
| 27 |
pyspellchecker = "^0.7.2"
|
|
|
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
[tool.poetry.group.dev.dependencies]
|
|
|
|
| 25 |
thefuzz = "^0.20.0"
|
| 26 |
python-magic = "^0.4.27"
|
| 27 |
pyspellchecker = "^0.7.2"
|
| 28 |
+
ftfy = "^6.1.1"
|
| 29 |
+
nltk = "^3.8.1"
|
| 30 |
|
| 31 |
|
| 32 |
[tool.poetry.group.dev.dependencies]
|