Vik Paruchuri committed on
Commit
e0b8544
·
1 Parent(s): 182dbdb

Alter OCR thresholds

Browse files
README.md CHANGED
@@ -13,6 +13,7 @@ PDF is a tricky format, so this will not always work perfectly, but it is good e
13
  ## Install
14
 
15
  - `poetry install`
 
16
  - Install apt requirements
17
  - Set `TESSDATA_PREFIX`
18
  - Find tessdata folder
 
13
  ## Install
14
 
15
  - `poetry install`
16
+ - Recommend installing tesseract 5 by following [these instructions](https://notesalexp.org/tesseract-ocr/html/). You may get tesseract 4 otherwise.
17
  - Install apt requirements
18
  - Set `TESSDATA_PREFIX`
19
  - Find tessdata folder
marker/cleaners/equations.py CHANGED
@@ -60,8 +60,7 @@ def mask_bbox(png_image, bbox, selected_bboxes):
60
 
61
 
62
  def get_nougat_text(page, bbox, selected_bboxes, nougat_model, max_length=settings.NOUGAT_MODEL_MAX):
63
- mat = pymupdf.Matrix(settings.NOUGAT_ZOOM, settings.NOUGAT_ZOOM)
64
- pix = page.get_pixmap(dpi=settings.NOUGAT_DPI, clip=bbox, matrix=mat)
65
  png = pix.pil_tobytes(format="BMP")
66
  png_image = Image.open(io.BytesIO(png))
67
  png_image = mask_bbox(png_image, bbox, selected_bboxes)
 
60
 
61
 
62
  def get_nougat_text(page, bbox, selected_bboxes, nougat_model, max_length=settings.NOUGAT_MODEL_MAX):
63
+ pix = page.get_pixmap(dpi=settings.NOUGAT_DPI, clip=bbox)
 
64
  png = pix.pil_tobytes(format="BMP")
65
  png_image = Image.open(io.BytesIO(png))
66
  png_image = mask_bbox(png_image, bbox, selected_bboxes)
marker/extract_text.py CHANGED
@@ -4,39 +4,53 @@ from marker.settings import settings
4
  from marker.schema import Span, Line, Block, Page
5
  import string
6
  from spellchecker import SpellChecker
 
7
 
8
  os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
9
  TEXT_FLAGS = ~pymupdf.TEXT_PRESERVE_LIGATURES & pymupdf.TEXT_PRESERVE_WHITESPACE & ~pymupdf.TEXT_PRESERVE_IMAGES & ~pymupdf.TEXT_INHIBIT_SPACES & pymupdf.TEXT_DEHYPHENATE & pymupdf.TEXT_MEDIABOX_CLIP
10
 
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def ocr_entire_page(page, lang: str, spell_lang: str | None):
13
- mat = pymupdf.Matrix(settings.SEGMENT_ZOOM, settings.SEGMENT_ZOOM)
14
  try:
15
- full_tp = page.get_textpage_ocr(flags=TEXT_FLAGS, dpi=settings.DPI, full=True, language=lang, matrix=mat)
16
  blocks = page.get_text("dict", sort=True, flags=TEXT_FLAGS, textpage=full_tp)["blocks"]
17
  full_text = page.get_text("text", flags=TEXT_FLAGS, textpage=full_tp)
18
 
19
- words = full_text.split()
20
- words = [w for w in words if w.strip()]
21
- alpha_words = [word for word in words if word.isalnum()]
22
- nonalpha_words = [word for word in words if not word.isalnum()]
23
-
24
  # Check spelling to determine if OCR worked
25
  # If it didn't, return empty list
26
  # OCR can fail if there is a scanned blank page with some faint text impressions, for example
27
- if spell_lang:
28
- spell = SpellChecker(language=spell_lang)
29
- misspelled = spell.unknown(alpha_words)
30
- if len(misspelled) + len(nonalpha_words) > len(words) / 1.5:
31
- return []
32
  except RuntimeError:
33
  return []
34
  return blocks
35
 
36
 
37
  def ocr_bbox(page, old_text, bbox, lang: str):
38
- mat = pymupdf.Matrix(settings.SEGMENT_ZOOM, settings.SEGMENT_ZOOM)
39
- pix = page.get_pixmap(dpi=settings.DPI, clip=bbox, matrix=mat)
40
 
41
  try:
42
  ocrpdf = pymupdf.open("pdf", pix.pdfocr_tobytes(language=lang))
@@ -131,8 +145,11 @@ def get_single_page_blocks(page, pnum: int, tess_lang: str, spell_lang=None, ocr
131
  def alphanum_ratio(text):
132
  alphanumeric_count = sum([1 for c in text if c.isalnum()])
133
 
134
- if alphanumeric_count == 0:
135
- return 0
 
 
 
136
 
137
  ratio = alphanumeric_count / len(text)
138
  return ratio
@@ -141,6 +158,7 @@ def alphanum_ratio(text):
141
  def get_text_blocks(doc, tess_lang: str, spell_lang: str, max_pages: int | None=None):
142
  all_blocks = []
143
  toc = doc.get_toc()
 
144
  for pnum, page in enumerate(doc):
145
  if max_pages and pnum >= max_pages:
146
  break
@@ -149,16 +167,18 @@ def get_text_blocks(doc, tess_lang: str, spell_lang: str, max_pages: int | None=
149
 
150
  # OCR page if we got minimal text, or if we got too many spaces
151
  conditions = [
152
- len(page_obj.get_nonblank_lines()) < 3,
153
  (
154
- alphanum_ratio(page_obj.prelim_text) < .7 # Garbled or bad OCR text
155
- or (page_obj.prelim_text.count(" ") / len(page_obj.prelim_text)) > .3 ## too many spaces on the page
156
  ),
157
  2 < pnum < len(doc) - 2
158
  ]
159
  if all(conditions):
160
  blocks = get_single_page_blocks(page, pnum, tess_lang, spell_lang, ocr=True)
161
  page_obj = Page(blocks=blocks, pnum=pnum)
162
- all_blocks.append(page_obj)
 
 
163
 
 
164
  return all_blocks, toc
 
4
  from marker.schema import Span, Line, Block, Page
5
  import string
6
  from spellchecker import SpellChecker
7
+ from nltk.tokenize import wordpunct_tokenize
8
 
9
  os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
10
  TEXT_FLAGS = ~pymupdf.TEXT_PRESERVE_LIGATURES & pymupdf.TEXT_PRESERVE_WHITESPACE & ~pymupdf.TEXT_PRESERVE_IMAGES & ~pymupdf.TEXT_INHIBIT_SPACES & pymupdf.TEXT_DEHYPHENATE & pymupdf.TEXT_MEDIABOX_CLIP
11
 
12
 
13
+ def detect_bad_ocr(text, spell_lang: str | None, misspell_threshold=.8, space_threshold=.5, newline_threshold=.3):
14
+ words = wordpunct_tokenize(text)
15
+ words = [w for w in words if w.strip()]
16
+ alpha_words = [word for word in words if word.isalnum()]
17
+ nonalpha_words = [word for word in words if not word.isalnum()]
18
+
19
+ if spell_lang:
20
+ spell = SpellChecker(language=spell_lang)
21
+ misspelled = spell.unknown(alpha_words)
22
+ if len(misspelled) + len(nonalpha_words) > len(words) * misspell_threshold:
23
+ return True
24
+ spaces = text.count(" ")
25
+ # More than 50% of chars are spaces
26
+ if spaces / len(text) > space_threshold:
27
+ return True
28
+
29
+ newlines = text.count("\n")
30
+ # More than 30% of chars are newlines
31
+ if newlines / len(text) > newline_threshold:
32
+ return True
33
+ return False
34
+
35
+
36
  def ocr_entire_page(page, lang: str, spell_lang: str | None):
 
37
  try:
38
+ full_tp = page.get_textpage_ocr(flags=TEXT_FLAGS, dpi=settings.DPI, full=True, language=lang)
39
  blocks = page.get_text("dict", sort=True, flags=TEXT_FLAGS, textpage=full_tp)["blocks"]
40
  full_text = page.get_text("text", flags=TEXT_FLAGS, textpage=full_tp)
41
 
 
 
 
 
 
42
  # Check spelling to determine if OCR worked
43
  # If it didn't, return empty list
44
  # OCR can fail if there is a scanned blank page with some faint text impressions, for example
45
+ if detect_bad_ocr(full_text, spell_lang):
46
+ return []
 
 
 
47
  except RuntimeError:
48
  return []
49
  return blocks
50
 
51
 
52
  def ocr_bbox(page, old_text, bbox, lang: str):
53
+ pix = page.get_pixmap(dpi=settings.SEGMENT_DPI, clip=bbox)
 
54
 
55
  try:
56
  ocrpdf = pymupdf.open("pdf", pix.pdfocr_tobytes(language=lang))
 
145
  def alphanum_ratio(text):
146
  alphanumeric_count = sum([1 for c in text if c.isalnum()])
147
 
148
+ if len(text) == 0:
149
+ if alphanumeric_count == 0:
150
+ return 1
151
+ else:
152
+ return 0
153
 
154
  ratio = alphanumeric_count / len(text)
155
  return ratio
 
158
  def get_text_blocks(doc, tess_lang: str, spell_lang: str, max_pages: int | None=None):
159
  all_blocks = []
160
  toc = doc.get_toc()
161
+ page_extracted = False
162
  for pnum, page in enumerate(doc):
163
  if max_pages and pnum >= max_pages:
164
  break
 
167
 
168
  # OCR page if we got minimal text, or if we got too many spaces
169
  conditions = [
 
170
  (
171
+ (len(page_obj.get_nonblank_lines()) < 3 and not page_extracted) # Possibly PDF has no text, and needs full OCR
172
+ or alphanum_ratio(page_obj.prelim_text) < .6 # Garbled text
173
  ),
174
  2 < pnum < len(doc) - 2
175
  ]
176
  if all(conditions):
177
  blocks = get_single_page_blocks(page, pnum, tess_lang, spell_lang, ocr=True)
178
  page_obj = Page(blocks=blocks, pnum=pnum)
179
+ page_extracted = False
180
+ else:
181
+ page_extracted = True
182
 
183
+ all_blocks.append(page_obj)
184
  return all_blocks, toc
marker/schema.py CHANGED
@@ -2,6 +2,7 @@ from collections import Counter
2
  from typing import List
3
 
4
  from pydantic import BaseModel, field_validator
 
5
 
6
  from marker.bbox import boxes_intersect_pct, multiple_boxes_intersect
7
  from marker.settings import settings
@@ -63,6 +64,12 @@ class Span(BboxElement):
63
  selected: bool = True
64
 
65
 
 
 
 
 
 
 
66
  class Line(BboxElement):
67
  spans: List[Span]
68
 
 
2
  from typing import List
3
 
4
  from pydantic import BaseModel, field_validator
5
+ import ftfy
6
 
7
  from marker.bbox import boxes_intersect_pct, multiple_boxes_intersect
8
  from marker.settings import settings
 
64
  selected: bool = True
65
 
66
 
67
+ @field_validator('text')
68
+ @classmethod
69
+ def fix_unicode(cls, text: str) -> str:
70
+ return ftfy.fix_text(text)
71
+
72
+
73
  class Line(BboxElement):
74
  spans: List[Span]
75
 
marker/settings.py CHANGED
@@ -23,7 +23,7 @@ class Settings(BaseSettings):
23
  # OCR
24
  INVALID_CHARS: List[str] = [chr(0xfffd), chr(65533)]
25
  DPI: int = 800
26
- SEGMENT_ZOOM: int = 2
27
  TESSDATA_PREFIX: str = ""
28
  TESSERACT_LANGUAGES: Dict = {
29
  "English": "eng",
@@ -46,8 +46,7 @@ class Settings(BaseSettings):
46
  NOUGAT_MODEL_MAX: int = 1024 # Max inference length for nougat
47
  NOUGAT_HALLUCINATION_WORDS: List[str] = ["[MISSING_PAGE_POST]", "## References\n", "**Figure Captions**\n", "Footnote",
48
  "\par\par\par", "## Chapter", "Fig.", "particle"]
49
- NOUGAT_DPI: int = 96 # DPI to render images at
50
- NOUGAT_ZOOM: int = 3 # Zoom to render images at
51
 
52
  # Layout Model
53
  BAD_SPAN_TYPES: List[str] = ["Caption", "Footnote", "Page-footer", "Page-header", "Picture"]
 
23
  # OCR
24
  INVALID_CHARS: List[str] = [chr(0xfffd), chr(65533)]
25
  DPI: int = 800
26
+ SEGMENT_DPI: int = 1200
27
  TESSDATA_PREFIX: str = ""
28
  TESSERACT_LANGUAGES: Dict = {
29
  "English": "eng",
 
46
  NOUGAT_MODEL_MAX: int = 1024 # Max inference length for nougat
47
  NOUGAT_HALLUCINATION_WORDS: List[str] = ["[MISSING_PAGE_POST]", "## References\n", "**Figure Captions**\n", "Footnote",
48
  "\par\par\par", "## Chapter", "Fig.", "particle"]
49
+ NOUGAT_DPI: int = 96 # DPI to render images at, matches default settings for nougat
 
50
 
51
  # Layout Model
52
  BAD_SPAN_TYPES: List[str] = ["Caption", "Footnote", "Page-footer", "Page-header", "Picture"]
poetry.lock CHANGED
@@ -881,6 +881,20 @@ smb = ["smbprotocol"]
881
  ssh = ["paramiko"]
882
  tqdm = ["tqdm"]
883
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
884
  [[package]]
885
  name = "huggingface-hub"
886
  version = "0.17.3"
@@ -5112,4 +5126,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
5112
  [metadata]
5113
  lock-version = "2.0"
5114
  python-versions = ">=3.9,<3.13"
5115
- content-hash = "75fb733b7de5d7f77571ba03e8e76fe59454d7bec6d7af8052d910f3d6d95e22"
 
881
  ssh = ["paramiko"]
882
  tqdm = ["tqdm"]
883
 
884
+ [[package]]
885
+ name = "ftfy"
886
+ version = "6.1.1"
887
+ description = "Fixes mojibake and other problems with Unicode, after the fact"
888
+ optional = false
889
+ python-versions = ">=3.7,<4"
890
+ files = [
891
+ {file = "ftfy-6.1.1-py3-none-any.whl", hash = "sha256:0ffd33fce16b54cccaec78d6ec73d95ad370e5df5a25255c8966a6147bd667ca"},
892
+ {file = "ftfy-6.1.1.tar.gz", hash = "sha256:bfc2019f84fcd851419152320a6375604a0f1459c281b5b199b2cd0d2e727f8f"},
893
+ ]
894
+
895
+ [package.dependencies]
896
+ wcwidth = ">=0.2.5"
897
+
898
  [[package]]
899
  name = "huggingface-hub"
900
  version = "0.17.3"
 
5126
  [metadata]
5127
  lock-version = "2.0"
5128
  python-versions = ">=3.9,<3.13"
5129
+ content-hash = "79ac2f866f67a7f018f227251d027ec89dca6c45d33337884ee8a7c3e749edd0"
pyproject.toml CHANGED
@@ -25,6 +25,8 @@ tabulate = "^0.9.0"
25
  thefuzz = "^0.20.0"
26
  python-magic = "^0.4.27"
27
  pyspellchecker = "^0.7.2"
 
 
28
 
29
 
30
  [tool.poetry.group.dev.dependencies]
 
25
  thefuzz = "^0.20.0"
26
  python-magic = "^0.4.27"
27
  pyspellchecker = "^0.7.2"
28
+ ftfy = "^6.1.1"
29
+ nltk = "^3.8.1"
30
 
31
 
32
  [tool.poetry.group.dev.dependencies]