Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

Vik Paruchuri committed on Nov 2, 2023

Commit

e0b8544

1 Parent(s): 182dbdb

Alter OCR thresholds

Browse files

Files changed (7) hide show

README.md +1 -0
marker/cleaners/equations.py +1 -2
marker/extract_text.py +40 -20
marker/schema.py +7 -0
marker/settings.py +2 -3
poetry.lock +15 -1
pyproject.toml +2 -0

README.md CHANGED Viewed

@@ -13,6 +13,7 @@ PDF is a tricky format, so this will not always work perfectly, but it is good e
 ## Install
 - `poetry install`
 - Install apt requirements
 - Set `TESSDATA_PREFIX`
   - Find tessdata folder

 ## Install
 - `poetry install`
+- We recommend installing Tesseract 5 by following [these instructions](https://notesalexp.org/tesseract-ocr/html/); otherwise, you may get Tesseract 4.
 - Install apt requirements
 - Set `TESSDATA_PREFIX`
   - Find tessdata folder

marker/cleaners/equations.py CHANGED Viewed

@@ -60,8 +60,7 @@ def mask_bbox(png_image, bbox, selected_bboxes):
 def get_nougat_text(page, bbox, selected_bboxes, nougat_model, max_length=settings.NOUGAT_MODEL_MAX):
-    mat = pymupdf.Matrix(settings.NOUGAT_ZOOM, settings.NOUGAT_ZOOM)
-    pix = page.get_pixmap(dpi=settings.NOUGAT_DPI, clip=bbox, matrix=mat)
     png = pix.pil_tobytes(format="BMP")
     png_image = Image.open(io.BytesIO(png))
     png_image = mask_bbox(png_image, bbox, selected_bboxes)

 def get_nougat_text(page, bbox, selected_bboxes, nougat_model, max_length=settings.NOUGAT_MODEL_MAX):
+    pix = page.get_pixmap(dpi=settings.NOUGAT_DPI, clip=bbox)
     png = pix.pil_tobytes(format="BMP")
     png_image = Image.open(io.BytesIO(png))
     png_image = mask_bbox(png_image, bbox, selected_bboxes)

marker/extract_text.py CHANGED Viewed

@@ -4,39 +4,53 @@ from marker.settings import settings
 from marker.schema import Span, Line, Block, Page
 import string
 from spellchecker import SpellChecker
 os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
 TEXT_FLAGS = ~pymupdf.TEXT_PRESERVE_LIGATURES & pymupdf.TEXT_PRESERVE_WHITESPACE & ~pymupdf.TEXT_PRESERVE_IMAGES & ~pymupdf.TEXT_INHIBIT_SPACES & pymupdf.TEXT_DEHYPHENATE & pymupdf.TEXT_MEDIABOX_CLIP
 def ocr_entire_page(page, lang: str, spell_lang: str | None):
-    mat = pymupdf.Matrix(settings.SEGMENT_ZOOM, settings.SEGMENT_ZOOM)
     try:
-        full_tp = page.get_textpage_ocr(flags=TEXT_FLAGS, dpi=settings.DPI, full=True, language=lang, matrix=mat)
         blocks = page.get_text("dict", sort=True, flags=TEXT_FLAGS, textpage=full_tp)["blocks"]
         full_text = page.get_text("text", flags=TEXT_FLAGS, textpage=full_tp)
-        words = full_text.split()
-        words = [w for w in words if w.strip()]
-        alpha_words = [word for word in words if word.isalnum()]
-        nonalpha_words = [word for word in words if not word.isalnum()]
         # Check spelling to determine if OCR worked
         # If it didn't, return empty list
         # OCR can fail if there is a scanned blank page with some faint text impressions, for example
-        if spell_lang:
-            spell = SpellChecker(language=spell_lang)
-            misspelled = spell.unknown(alpha_words)
-            if len(misspelled) + len(nonalpha_words) > len(words) / 1.5:
-                return []
     except RuntimeError:
         return []
     return blocks
 def ocr_bbox(page, old_text, bbox, lang: str):
-    mat = pymupdf.Matrix(settings.SEGMENT_ZOOM, settings.SEGMENT_ZOOM)
-    pix = page.get_pixmap(dpi=settings.DPI, clip=bbox, matrix=mat)
     try:
         ocrpdf = pymupdf.open("pdf", pix.pdfocr_tobytes(language=lang))
@@ -131,8 +145,11 @@ def get_single_page_blocks(page, pnum: int, tess_lang: str, spell_lang=None, ocr
 def alphanum_ratio(text):
     alphanumeric_count = sum([1 for c in text if c.isalnum()])
-    if alphanumeric_count == 0:
-        return 0
     ratio = alphanumeric_count / len(text)
     return ratio
@@ -141,6 +158,7 @@ def alphanum_ratio(text):
 def get_text_blocks(doc, tess_lang: str, spell_lang: str, max_pages: int | None=None):
     all_blocks = []
     toc = doc.get_toc()
     for pnum, page in enumerate(doc):
         if max_pages and pnum >= max_pages:
             break
@@ -149,16 +167,18 @@ def get_text_blocks(doc, tess_lang: str, spell_lang: str, max_pages: int | None=
         # OCR page if we got minimal text, or if we got too many spaces
         conditions = [
-            len(page_obj.get_nonblank_lines()) < 3,
             (
-                    alphanum_ratio(page_obj.prelim_text) < .7 # Garbled or bad OCR text
-                    or (page_obj.prelim_text.count(" ") / len(page_obj.prelim_text)) > .3 ## too many spaces on the page
             ),
             2 < pnum < len(doc) - 2
         ]
         if all(conditions):
             blocks = get_single_page_blocks(page, pnum, tess_lang, spell_lang, ocr=True)
             page_obj = Page(blocks=blocks, pnum=pnum)
-        all_blocks.append(page_obj)
     return all_blocks, toc

 from marker.schema import Span, Line, Block, Page
 import string
 from spellchecker import SpellChecker
+from nltk.tokenize import wordpunct_tokenize
 os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
 TEXT_FLAGS = ~pymupdf.TEXT_PRESERVE_LIGATURES & pymupdf.TEXT_PRESERVE_WHITESPACE & ~pymupdf.TEXT_PRESERVE_IMAGES & ~pymupdf.TEXT_INHIBIT_SPACES & pymupdf.TEXT_DEHYPHENATE & pymupdf.TEXT_MEDIABOX_CLIP
+def detect_bad_ocr(text, spell_lang: str | None, misspell_threshold=.8, space_threshold=.5, newline_threshold=.3):
+    words = wordpunct_tokenize(text)
+    words = [w for w in words if w.strip()]
+    alpha_words = [word for word in words if word.isalnum()]
+    nonalpha_words = [word for word in words if not word.isalnum()]
+    if spell_lang:
+        spell = SpellChecker(language=spell_lang)
+        misspelled = spell.unknown(alpha_words)
+        if len(misspelled) + len(nonalpha_words) > len(words) * misspell_threshold:
+            return True
+    spaces = text.count(" ")
+    # More than 50% of chars are spaces
+    if spaces / len(text) > space_threshold:
+        return True
+    newlines = text.count("\n")
+    # More than 30% of chars are newlines
+    if newlines / len(text) > newline_threshold:
+        return True
+    return False
 def ocr_entire_page(page, lang: str, spell_lang: str | None):
     """Run OCR over a full page and return its text blocks.
 
     Args:
         page: Page object exposing ``get_textpage_ocr`` and ``get_text``
             (presumably a pymupdf page -- confirm against the caller).
         lang: Language passed to the OCR call as ``language``.
         spell_lang: Spell-check language forwarded to ``detect_bad_ocr``.
 
     Returns:
         The ``"blocks"`` list of the OCR'd page dict, or an empty list when
         a ``RuntimeError`` is raised or ``detect_bad_ocr`` rejects the text.
     """
     try:
         ocr_textpage = page.get_textpage_ocr(flags=TEXT_FLAGS, dpi=settings.DPI, full=True, language=lang)
         page_dict = page.get_text("dict", sort=True, flags=TEXT_FLAGS, textpage=ocr_textpage)
         ocr_blocks = page_dict["blocks"]
         plain_text = page.get_text("text", flags=TEXT_FLAGS, textpage=ocr_textpage)
         # OCR can produce junk, e.g. for a scanned blank page with faint text
         # impressions; let the quality heuristic decide whether to keep it.
         ocr_failed = detect_bad_ocr(plain_text, spell_lang)
     except RuntimeError:
         return []
     return [] if ocr_failed else ocr_blocks
 def ocr_bbox(page, old_text, bbox, lang: str):
+    pix = page.get_pixmap(dpi=settings.SEGMENT_DPI, clip=bbox)
     try:
         ocrpdf = pymupdf.open("pdf", pix.pdfocr_tobytes(language=lang))
 def alphanum_ratio(text):
     """Return the fraction of characters in ``text`` that are alphanumeric.
 
     Args:
         text: String to inspect.
 
     Returns:
         ``alphanumeric characters / len(text)``; ``1`` for empty text, so
         that empty text never falls below a minimum-ratio threshold.
     """
     if len(text) == 0:
         # Empty text has no characters to count; also avoids dividing by zero.
         return 1
     alphanumeric_count = sum(1 for c in text if c.isalnum())
     return alphanumeric_count / len(text)
 def get_text_blocks(doc, tess_lang: str, spell_lang: str, max_pages: int | None=None):
     all_blocks = []
     toc = doc.get_toc()
+    page_extracted = False
     for pnum, page in enumerate(doc):
         if max_pages and pnum >= max_pages:
             break
         # OCR page if we got minimal text, or if we got too many spaces
         conditions = [
             (
+                (len(page_obj.get_nonblank_lines()) < 3 and not page_extracted) # Possibly PDF has no text, and needs full OCR
+                or alphanum_ratio(page_obj.prelim_text) < .6 # Garbled text
             ),
             2 < pnum < len(doc) - 2
         ]
         if all(conditions):
             blocks = get_single_page_blocks(page, pnum, tess_lang, spell_lang, ocr=True)
             page_obj = Page(blocks=blocks, pnum=pnum)
+            page_extracted = False
+        else:
+            page_extracted = True
+        all_blocks.append(page_obj)
     return all_blocks, toc

marker/schema.py CHANGED Viewed

@@ -2,6 +2,7 @@ from collections import Counter
 from typing import List
 from pydantic import BaseModel, field_validator
 from marker.bbox import boxes_intersect_pct, multiple_boxes_intersect
 from marker.settings import settings
@@ -63,6 +64,12 @@ class Span(BboxElement):
     selected: bool = True
 class Line(BboxElement):
     spans: List[Span]

 from typing import List
 from pydantic import BaseModel, field_validator
+import ftfy
 from marker.bbox import boxes_intersect_pct, multiple_boxes_intersect
 from marker.settings import settings
     selected: bool = True
+    @field_validator('text')
+    @classmethod
+    def fix_unicode(cls, text: str) -> str:
+        return ftfy.fix_text(text)
 class Line(BboxElement):
     spans: List[Span]

marker/settings.py CHANGED Viewed

@@ -23,7 +23,7 @@ class Settings(BaseSettings):
     # OCR
     INVALID_CHARS: List[str] = [chr(0xfffd), chr(65533)]
     DPI: int = 800
-    SEGMENT_ZOOM: int = 2
     TESSDATA_PREFIX: str = ""
     TESSERACT_LANGUAGES: Dict = {
         "English": "eng",
@@ -46,8 +46,7 @@ class Settings(BaseSettings):
     NOUGAT_MODEL_MAX: int = 1024 # Max inference length for nougat
     NOUGAT_HALLUCINATION_WORDS: List[str] = ["[MISSING_PAGE_POST]", "## References\n", "**Figure Captions**\n", "Footnote",
                                   "\par\par\par", "## Chapter", "Fig.", "particle"]
-    NOUGAT_DPI: int = 96 # DPI to render images at
-    NOUGAT_ZOOM: int = 3 # Zoom to render images at
     # Layout Model
     BAD_SPAN_TYPES: List[str] = ["Caption", "Footnote", "Page-footer", "Page-header", "Picture"]

     # OCR
     INVALID_CHARS: List[str] = [chr(0xfffd), chr(65533)]
     DPI: int = 800
+    SEGMENT_DPI: int = 1200
     TESSDATA_PREFIX: str = ""
     TESSERACT_LANGUAGES: Dict = {
         "English": "eng",
     NOUGAT_MODEL_MAX: int = 1024 # Max inference length for nougat
     NOUGAT_HALLUCINATION_WORDS: List[str] = ["[MISSING_PAGE_POST]", "## References\n", "**Figure Captions**\n", "Footnote",
                                   "\par\par\par", "## Chapter", "Fig.", "particle"]
+    NOUGAT_DPI: int = 96 # DPI to render images at, matches default settings for nougat
     # Layout Model
     BAD_SPAN_TYPES: List[str] = ["Caption", "Footnote", "Page-footer", "Page-header", "Picture"]

poetry.lock CHANGED Viewed

@@ -881,6 +881,20 @@ smb = ["smbprotocol"]
 ssh = ["paramiko"]
 tqdm = ["tqdm"]
 [[package]]
 name = "huggingface-hub"
 version = "0.17.3"
@@ -5112,4 +5126,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13"
-content-hash = "75fb733b7de5d7f77571ba03e8e76fe59454d7bec6d7af8052d910f3d6d95e22"

 ssh = ["paramiko"]
 tqdm = ["tqdm"]
+[[package]]
+name = "ftfy"
+version = "6.1.1"
+description = "Fixes mojibake and other problems with Unicode, after the fact"
+optional = false
+python-versions = ">=3.7,<4"
+files = [
+    {file = "ftfy-6.1.1-py3-none-any.whl", hash = "sha256:0ffd33fce16b54cccaec78d6ec73d95ad370e5df5a25255c8966a6147bd667ca"},
+    {file = "ftfy-6.1.1.tar.gz", hash = "sha256:bfc2019f84fcd851419152320a6375604a0f1459c281b5b199b2cd0d2e727f8f"},
+]
+[package.dependencies]
+wcwidth = ">=0.2.5"
 [[package]]
 name = "huggingface-hub"
 version = "0.17.3"
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13"
+content-hash = "79ac2f866f67a7f018f227251d027ec89dca6c45d33337884ee8a7c3e749edd0"

pyproject.toml CHANGED Viewed

@@ -25,6 +25,8 @@ tabulate = "^0.9.0"
 thefuzz = "^0.20.0"
 python-magic = "^0.4.27"
 pyspellchecker = "^0.7.2"
 [tool.poetry.group.dev.dependencies]

 thefuzz = "^0.20.0"
 python-magic = "^0.4.27"
 pyspellchecker = "^0.7.2"
+ftfy = "^6.1.1"
+nltk = "^3.8.1"
 [tool.poetry.group.dev.dependencies]