Vik Paruchuri
committed on
Commit
·
2ad7f6b
1
Parent(s):
173a1b8
Remove pymupdf
Browse files
- marker/bbox.py +1 -21
- marker/cleaners/code.py +4 -4
- marker/cleaners/equations.py +4 -4
- marker/cleaners/headers.py +1 -1
- marker/cleaners/table.py +2 -2
- marker/convert.py +11 -13
- marker/debug/data.py +2 -3
- marker/extract_text.py +51 -75
- marker/logger.py +0 -2
- marker/ocr/page.py +7 -9
- marker/ocr/utils.py +3 -8
- marker/ordering.py +2 -4
- marker/pdf/images.py +10 -0
- marker/schema.py +2 -3
- marker/segmentation.py +3 -5
- marker/settings.py +4 -8
- poetry.lock +0 -0
- pyproject.toml +3 -4
marker/bbox.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
| 1 |
-
import fitz as pymupdf
|
| 2 |
-
|
| 3 |
def should_merge_blocks(box1, box2, tol=5):
|
| 4 |
# Within tol y px, and to the right within tol px
|
| 5 |
merge = [
|
|
@@ -60,22 +58,4 @@ def unnormalize_box(bbox, width, height):
|
|
| 60 |
height * (bbox[1] / 1000),
|
| 61 |
width * (bbox[2] / 1000),
|
| 62 |
height * (bbox[3] / 1000),
|
| 63 |
-
]
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
def correct_rotation(bbox, page):
|
| 67 |
-
#bbox base is (x0, y0, x1, y1)
|
| 68 |
-
rotation = page.rotation
|
| 69 |
-
if rotation == 0:
|
| 70 |
-
return bbox
|
| 71 |
-
|
| 72 |
-
tl = pymupdf.Point(bbox[0], bbox[1]) * page.rotation_matrix
|
| 73 |
-
br = pymupdf.Point(bbox[2], bbox[3]) * page.rotation_matrix
|
| 74 |
-
if rotation == 90:
|
| 75 |
-
bbox = [br[0], tl[1], tl[0], br[1]]
|
| 76 |
-
elif rotation == 180:
|
| 77 |
-
bbox = [br[0], br[1], tl[0], tl[1]]
|
| 78 |
-
elif rotation == 270:
|
| 79 |
-
bbox = [tl[0], br[1], br[0], tl[1]]
|
| 80 |
-
|
| 81 |
-
return bbox
|
|
|
|
|
|
|
|
|
|
| 1 |
def should_merge_blocks(box1, box2, tol=5):
|
| 2 |
# Within tol y px, and to the right within tol px
|
| 3 |
merge = [
|
|
|
|
| 58 |
height * (bbox[1] / 1000),
|
| 59 |
width * (bbox[2] / 1000),
|
| 60 |
height * (bbox[3] / 1000),
|
| 61 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/cleaners/code.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
from marker.schema import Span, Line, Page
|
| 2 |
import re
|
| 3 |
from typing import List
|
| 4 |
-
import fitz as pymupdf
|
| 5 |
|
| 6 |
|
| 7 |
def is_code_linelen(lines, thresh=60):
|
|
@@ -102,13 +101,13 @@ def indent_blocks(blocks: List[Page]):
|
|
| 102 |
if col_width == 0 and len(span.text) > 0:
|
| 103 |
col_width = (span.bbox[2] - span.bbox[0]) / len(span.text)
|
| 104 |
text += span.text
|
| 105 |
-
lines.append((
|
| 106 |
|
| 107 |
block_text = ""
|
| 108 |
blank_line = False
|
| 109 |
for line in lines:
|
| 110 |
text = line[1]
|
| 111 |
-
prefix = " " * int((line[0]
|
| 112 |
current_line_blank = len(text.strip()) == 0
|
| 113 |
if blank_line and current_line_blank:
|
| 114 |
# Don't put multiple blank lines in a row
|
|
@@ -120,9 +119,10 @@ def indent_blocks(blocks: List[Page]):
|
|
| 120 |
new_span = Span(
|
| 121 |
text=block_text,
|
| 122 |
bbox=block.bbox,
|
| 123 |
-
color=block.lines[0].spans[0].color,
|
| 124 |
span_id=f"{span_counter}_fix_code",
|
| 125 |
font=block.lines[0].spans[0].font,
|
|
|
|
|
|
|
| 126 |
block_type="Code"
|
| 127 |
)
|
| 128 |
span_counter += 1
|
|
|
|
| 1 |
from marker.schema import Span, Line, Page
|
| 2 |
import re
|
| 3 |
from typing import List
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
def is_code_linelen(lines, thresh=60):
|
|
|
|
| 101 |
if col_width == 0 and len(span.text) > 0:
|
| 102 |
col_width = (span.bbox[2] - span.bbox[0]) / len(span.text)
|
| 103 |
text += span.text
|
| 104 |
+
lines.append((line.bbox, text))
|
| 105 |
|
| 106 |
block_text = ""
|
| 107 |
blank_line = False
|
| 108 |
for line in lines:
|
| 109 |
text = line[1]
|
| 110 |
+
prefix = " " * int((line[0][0] - min_left) / col_width)
|
| 111 |
current_line_blank = len(text.strip()) == 0
|
| 112 |
if blank_line and current_line_blank:
|
| 113 |
# Don't put multiple blank lines in a row
|
|
|
|
| 119 |
new_span = Span(
|
| 120 |
text=block_text,
|
| 121 |
bbox=block.bbox,
|
|
|
|
| 122 |
span_id=f"{span_counter}_fix_code",
|
| 123 |
font=block.lines[0].spans[0].font,
|
| 124 |
+
font_weight=block.lines[0].spans[0].font_weight,
|
| 125 |
+
font_size=block.lines[0].spans[0].font_size,
|
| 126 |
block_type="Code"
|
| 127 |
)
|
| 128 |
span_counter += 1
|
marker/cleaners/equations.py
CHANGED
|
@@ -12,6 +12,7 @@ from PIL import Image, ImageDraw
|
|
| 12 |
|
| 13 |
from marker.bbox import should_merge_blocks, merge_boxes
|
| 14 |
from marker.debug.data import dump_equation_debug_data
|
|
|
|
| 15 |
from marker.settings import settings
|
| 16 |
from marker.schema import Page, Span, Line, Block, BlockType
|
| 17 |
import os
|
|
@@ -51,9 +52,7 @@ def mask_bbox(png_image, bbox, selected_bboxes):
|
|
| 51 |
|
| 52 |
|
| 53 |
def get_masked_image(page, bbox, selected_bboxes):
|
| 54 |
-
|
| 55 |
-
png = pix.pil_tobytes(format="PNG")
|
| 56 |
-
png_image = Image.open(io.BytesIO(png))
|
| 57 |
png_image = mask_bbox(png_image, bbox, selected_bboxes)
|
| 58 |
png_image = png_image.convert("RGB")
|
| 59 |
return png_image
|
|
@@ -212,7 +211,8 @@ def replace_blocks_with_latex(page_blocks: Page, merged_boxes, reformat_regions,
|
|
| 212 |
bbox=merged_boxes[current_region],
|
| 213 |
span_id=f"{pnum}_{idx}_fixeq",
|
| 214 |
font="Latex",
|
| 215 |
-
|
|
|
|
| 216 |
block_type="Formula"
|
| 217 |
)
|
| 218 |
],
|
|
|
|
| 12 |
|
| 13 |
from marker.bbox import should_merge_blocks, merge_boxes
|
| 14 |
from marker.debug.data import dump_equation_debug_data
|
| 15 |
+
from marker.pdf.images import render_image
|
| 16 |
from marker.settings import settings
|
| 17 |
from marker.schema import Page, Span, Line, Block, BlockType
|
| 18 |
import os
|
|
|
|
| 52 |
|
| 53 |
|
| 54 |
def get_masked_image(page, bbox, selected_bboxes):
|
| 55 |
+
png_image = render_image(page, settings.TEXIFY_DPI)
|
|
|
|
|
|
|
| 56 |
png_image = mask_bbox(png_image, bbox, selected_bboxes)
|
| 57 |
png_image = png_image.convert("RGB")
|
| 58 |
return png_image
|
|
|
|
| 211 |
bbox=merged_boxes[current_region],
|
| 212 |
span_id=f"{pnum}_{idx}_fixeq",
|
| 213 |
font="Latex",
|
| 214 |
+
font_weight=0,
|
| 215 |
+
font_size=0,
|
| 216 |
block_type="Formula"
|
| 217 |
)
|
| 218 |
],
|
marker/cleaners/headers.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import re
|
| 2 |
from collections import Counter, defaultdict
|
| 3 |
from itertools import chain
|
| 4 |
-
from
|
| 5 |
|
| 6 |
from sklearn.cluster import DBSCAN
|
| 7 |
import numpy as np
|
|
|
|
| 1 |
import re
|
| 2 |
from collections import Counter, defaultdict
|
| 3 |
from itertools import chain
|
| 4 |
+
from rapidfuzz import fuzz
|
| 5 |
|
| 6 |
from sklearn.cluster import DBSCAN
|
| 7 |
import numpy as np
|
marker/cleaners/table.py
CHANGED
|
@@ -4,7 +4,6 @@ from copy import deepcopy
|
|
| 4 |
from tabulate import tabulate
|
| 5 |
from typing import List
|
| 6 |
import re
|
| 7 |
-
import textwrap
|
| 8 |
|
| 9 |
|
| 10 |
def merge_table_blocks(blocks: List[Page]):
|
|
@@ -84,7 +83,8 @@ def create_new_tables(blocks: List[Page]):
|
|
| 84 |
bbox=block.bbox,
|
| 85 |
span_id=f"{table_idx}_fix_table",
|
| 86 |
font="Table",
|
| 87 |
-
|
|
|
|
| 88 |
block_type="Table",
|
| 89 |
text=new_text
|
| 90 |
)
|
|
|
|
| 4 |
from tabulate import tabulate
|
| 5 |
from typing import List
|
| 6 |
import re
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
def merge_table_blocks(blocks: List[Page]):
|
|
|
|
| 83 |
bbox=block.bbox,
|
| 84 |
span_id=f"{table_idx}_fix_table",
|
| 85 |
font="Table",
|
| 86 |
+
font_size=0,
|
| 87 |
+
font_weight=0,
|
| 88 |
block_type="Table",
|
| 89 |
text=new_text
|
| 90 |
)
|
marker/convert.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import
|
| 2 |
|
| 3 |
from marker.cleaners.table import merge_table_blocks, create_new_tables
|
| 4 |
from marker.debug.data import dump_bbox_debug_data
|
|
@@ -25,10 +25,10 @@ def find_filetype(fpath):
|
|
| 25 |
# The mimetype is not always consistent, so use in to check the most common formats
|
| 26 |
if "pdf" in mimetype:
|
| 27 |
return "pdf"
|
| 28 |
-
elif "epub" in mimetype:
|
| 29 |
-
|
| 30 |
-
elif "mobi" in mimetype:
|
| 31 |
-
|
| 32 |
elif mimetype in settings.SUPPORTED_FILETYPES:
|
| 33 |
return settings.SUPPORTED_FILETYPES[mimetype]
|
| 34 |
else:
|
|
@@ -47,10 +47,12 @@ def get_length_of_text(fname: str) -> int:
|
|
| 47 |
if filetype == "other":
|
| 48 |
return 0
|
| 49 |
|
| 50 |
-
doc =
|
| 51 |
full_text = ""
|
| 52 |
-
for
|
| 53 |
-
|
|
|
|
|
|
|
| 54 |
|
| 55 |
return len(full_text)
|
| 56 |
|
|
@@ -81,11 +83,7 @@ def convert_single_pdf(
|
|
| 81 |
|
| 82 |
out_meta["filetype"] = filetype
|
| 83 |
|
| 84 |
-
doc =
|
| 85 |
-
if filetype != "pdf":
|
| 86 |
-
conv = doc.convert_to_pdf()
|
| 87 |
-
doc = pymupdf.open("pdf", conv)
|
| 88 |
-
|
| 89 |
blocks, toc, ocr_stats = get_text_blocks(
|
| 90 |
doc,
|
| 91 |
tess_lang,
|
|
|
|
| 1 |
+
import pypdfium2 as pdfium
|
| 2 |
|
| 3 |
from marker.cleaners.table import merge_table_blocks, create_new_tables
|
| 4 |
from marker.debug.data import dump_bbox_debug_data
|
|
|
|
| 25 |
# The mimetype is not always consistent, so use in to check the most common formats
|
| 26 |
if "pdf" in mimetype:
|
| 27 |
return "pdf"
|
| 28 |
+
#elif "epub" in mimetype:
|
| 29 |
+
# return "epub"
|
| 30 |
+
#elif "mobi" in mimetype:
|
| 31 |
+
# return "mobi"
|
| 32 |
elif mimetype in settings.SUPPORTED_FILETYPES:
|
| 33 |
return settings.SUPPORTED_FILETYPES[mimetype]
|
| 34 |
else:
|
|
|
|
| 47 |
if filetype == "other":
|
| 48 |
return 0
|
| 49 |
|
| 50 |
+
doc = pdfium.PdfDocument(fname)
|
| 51 |
full_text = ""
|
| 52 |
+
for page_idx in range(len(doc)):
|
| 53 |
+
page = doc.get_page(page_idx)
|
| 54 |
+
text_page = page.get_textpage()
|
| 55 |
+
full_text += text_page.get_text_bounded()
|
| 56 |
|
| 57 |
return len(full_text)
|
| 58 |
|
|
|
|
| 83 |
|
| 84 |
out_meta["filetype"] = filetype
|
| 85 |
|
| 86 |
+
doc = pdfium.PdfDocument(fname)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
blocks, toc, ocr_stats = get_text_blocks(
|
| 88 |
doc,
|
| 89 |
tess_lang,
|
marker/debug/data.py
CHANGED
|
@@ -4,6 +4,7 @@ import os
|
|
| 4 |
import zlib
|
| 5 |
from typing import List
|
| 6 |
|
|
|
|
| 7 |
from marker.schema import Page
|
| 8 |
from marker.settings import settings
|
| 9 |
from PIL import Image
|
|
@@ -54,9 +55,7 @@ def dump_bbox_debug_data(doc, blocks: List[Page]):
|
|
| 54 |
for idx, page_blocks in enumerate(blocks):
|
| 55 |
page = doc[idx]
|
| 56 |
|
| 57 |
-
|
| 58 |
-
png = pix.pil_tobytes(format="PNG")
|
| 59 |
-
png_image = Image.open(io.BytesIO(png))
|
| 60 |
width, height = png_image.size
|
| 61 |
max_dimension = 6000
|
| 62 |
if width > max_dimension or height > max_dimension:
|
|
|
|
| 4 |
import zlib
|
| 5 |
from typing import List
|
| 6 |
|
| 7 |
+
from marker.pdf.images import render_image
|
| 8 |
from marker.schema import Page
|
| 9 |
from marker.settings import settings
|
| 10 |
from PIL import Image
|
|
|
|
| 55 |
for idx, page_blocks in enumerate(blocks):
|
| 56 |
page = doc[idx]
|
| 57 |
|
| 58 |
+
png_image = render_image(page, dpi=settings.TEXIFY_DPI)
|
|
|
|
|
|
|
| 59 |
width, height = png_image.size
|
| 60 |
max_dimension = 6000
|
| 61 |
if width > max_dimension or height > max_dimension:
|
marker/extract_text.py
CHANGED
|
@@ -1,96 +1,66 @@
|
|
| 1 |
import os
|
| 2 |
-
from typing import
|
| 3 |
|
| 4 |
-
|
| 5 |
|
| 6 |
-
from marker.bbox import correct_rotation
|
| 7 |
-
from marker.ocr.page import ocr_entire_page
|
| 8 |
from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
|
| 9 |
from marker.settings import settings
|
| 10 |
from marker.schema import Span, Line, Block, Page
|
| 11 |
-
from
|
| 12 |
|
| 13 |
os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
|
| 14 |
|
| 15 |
|
| 16 |
-
def
|
| 17 |
-
vertical_groups = {}
|
| 18 |
-
for block in page_blocks:
|
| 19 |
-
group_key = round(block.bbox[1] / tolerance) * tolerance
|
| 20 |
-
if group_key not in vertical_groups:
|
| 21 |
-
vertical_groups[group_key] = []
|
| 22 |
-
vertical_groups[group_key].append(block)
|
| 23 |
-
|
| 24 |
-
# Sort each group horizontally and flatten the groups into a single list
|
| 25 |
-
sorted_page_blocks = []
|
| 26 |
-
for _, group in sorted(vertical_groups.items()):
|
| 27 |
-
sorted_group = sorted(group, key=lambda x: x.bbox[0])
|
| 28 |
-
sorted_page_blocks.extend(sorted_group)
|
| 29 |
-
|
| 30 |
-
return sorted_page_blocks
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optional[SpellChecker] = None, ocr=False) -> Tuple[List[Block], int]:
|
| 34 |
-
page = doc[pnum]
|
| 35 |
-
rotation = page.rotation
|
| 36 |
-
|
| 37 |
-
if ocr:
|
| 38 |
-
blocks = ocr_entire_page(page, tess_lang, spellchecker)
|
| 39 |
-
else:
|
| 40 |
-
blocks = page.get_text("dict", sort=True, flags=settings.TEXT_FLAGS)["blocks"]
|
| 41 |
-
|
| 42 |
page_blocks = []
|
| 43 |
span_id = 0
|
| 44 |
-
for block_idx, block in enumerate(blocks):
|
| 45 |
block_lines = []
|
| 46 |
for l in block["lines"]:
|
| 47 |
spans = []
|
| 48 |
for i, s in enumerate(l["spans"]):
|
| 49 |
block_text = s["text"]
|
| 50 |
-
bbox = s["bbox"]
|
| 51 |
span_obj = Span(
|
| 52 |
text=block_text,
|
| 53 |
-
bbox=
|
| 54 |
span_id=f"{pnum}_{span_id}",
|
| 55 |
-
font=f"{s['font']}_{font_flags_decomposer(s['flags'])}", # Add font flags to end of font
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
descender=s["descender"],
|
| 59 |
)
|
| 60 |
spans.append(span_obj) # Text, bounding box, span id
|
| 61 |
span_id += 1
|
| 62 |
line_obj = Line(
|
| 63 |
spans=spans,
|
| 64 |
-
bbox=
|
| 65 |
)
|
| 66 |
# Only select valid lines, with positive bboxes
|
| 67 |
-
if line_obj.area
|
| 68 |
block_lines.append(line_obj)
|
| 69 |
block_obj = Block(
|
| 70 |
lines=block_lines,
|
| 71 |
-
bbox=
|
| 72 |
pnum=pnum
|
| 73 |
)
|
| 74 |
-
# Only select blocks with
|
| 75 |
if len(block_lines) > 0:
|
| 76 |
page_blocks.append(block_obj)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
# If the page was rotated, sort the text again
|
| 79 |
-
if rotation > 0:
|
| 80 |
-
page_blocks = sort_rotated_text(page_blocks)
|
| 81 |
-
return page_blocks
|
| 82 |
|
| 83 |
-
|
| 84 |
-
def convert_single_page(doc, pnum, tess_lang: str, spell_lang: Optional[str], no_text: bool, disable_ocr: bool = False, min_ocr_page: int = 2):
|
| 85 |
ocr_pages = 0
|
| 86 |
ocr_success = 0
|
| 87 |
ocr_failed = 0
|
| 88 |
-
spellchecker = None
|
| 89 |
page_bbox = doc[pnum].bound()
|
| 90 |
-
if spell_lang:
|
| 91 |
-
spellchecker = SpellChecker(language=spell_lang)
|
| 92 |
|
| 93 |
-
blocks = get_single_page_blocks(doc, pnum, tess_lang
|
| 94 |
page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox)
|
| 95 |
|
| 96 |
# OCR page if we got minimal text, or if we got too many spaces
|
|
@@ -98,14 +68,14 @@ def convert_single_page(doc, pnum, tess_lang: str, spell_lang: Optional[str], no
|
|
| 98 |
(
|
| 99 |
no_text # Full doc has no text, and needs full OCR
|
| 100 |
or
|
| 101 |
-
(len(page_obj.prelim_text) > 0 and detect_bad_ocr(page_obj.prelim_text
|
| 102 |
),
|
| 103 |
min_ocr_page < pnum < len(doc) - 1,
|
| 104 |
not disable_ocr
|
| 105 |
]
|
| 106 |
if all(conditions) or settings.OCR_ALL_PAGES:
|
| 107 |
page = doc[pnum]
|
| 108 |
-
blocks = get_single_page_blocks(doc, pnum, tess_lang,
|
| 109 |
page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox, rotation=page.rotation)
|
| 110 |
ocr_pages = 1
|
| 111 |
if len(blocks) == 0:
|
|
@@ -116,37 +86,43 @@ def convert_single_page(doc, pnum, tess_lang: str, spell_lang: Optional[str], no
|
|
| 116 |
|
| 117 |
|
| 118 |
def get_text_blocks(doc, tess_lang: str, spell_lang: Optional[str], max_pages: Optional[int] = None, parallel: int = settings.OCR_PARALLEL_WORKERS):
|
| 119 |
-
|
| 120 |
-
toc = doc.get_toc()
|
| 121 |
ocr_pages = 0
|
| 122 |
ocr_failed = 0
|
| 123 |
ocr_success = 0
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
no_text = len(naive_get_text(doc).strip()) == 0
|
| 127 |
if max_pages:
|
| 128 |
range_end = min(max_pages, len(doc))
|
| 129 |
-
|
| 130 |
-
args_list = [(doc, pnum, tess_lang, spell_lang, no_text) for pnum in range(range_end)]
|
| 131 |
-
if parallel == 1:
|
| 132 |
-
func = map
|
| 133 |
-
else:
|
| 134 |
-
func = pool.map
|
| 135 |
-
results = func(lambda a: convert_single_page(*a), args_list)
|
| 136 |
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
all_blocks.append(page_obj)
|
| 140 |
-
ocr_pages += ocr_stats["ocr_pages"]
|
| 141 |
-
ocr_failed += ocr_stats["ocr_failed"]
|
| 142 |
-
ocr_success += ocr_stats["ocr_success"]
|
| 143 |
|
| 144 |
return all_blocks, toc, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success}
|
| 145 |
|
| 146 |
|
| 147 |
def naive_get_text(doc):
|
| 148 |
full_text = ""
|
| 149 |
-
for
|
| 150 |
-
|
| 151 |
-
|
|
|
|
| 152 |
return full_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
from typing import List, Optional
|
| 3 |
|
| 4 |
+
import pypdfium2.internal as pdfium_i
|
| 5 |
|
|
|
|
|
|
|
| 6 |
from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
|
| 7 |
from marker.settings import settings
|
| 8 |
from marker.schema import Span, Line, Block, Page
|
| 9 |
+
from pdftext.extraction import dictionary_output
|
| 10 |
|
| 11 |
os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
|
| 12 |
|
| 13 |
|
| 14 |
+
def pdftext_format_to_blocks(page, pnum: int) -> List[Block]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
page_blocks = []
|
| 16 |
span_id = 0
|
| 17 |
+
for block_idx, block in enumerate(page["blocks"]):
|
| 18 |
block_lines = []
|
| 19 |
for l in block["lines"]:
|
| 20 |
spans = []
|
| 21 |
for i, s in enumerate(l["spans"]):
|
| 22 |
block_text = s["text"]
|
|
|
|
| 23 |
span_obj = Span(
|
| 24 |
text=block_text,
|
| 25 |
+
bbox=s["bbox"],
|
| 26 |
span_id=f"{pnum}_{span_id}",
|
| 27 |
+
font=f"{s['font']['name']}_{font_flags_decomposer(s['font']['flags'])}", # Add font flags to end of font
|
| 28 |
+
font_weight=s["font"]["weight"],
|
| 29 |
+
font_size=s["font"]["size"],
|
|
|
|
| 30 |
)
|
| 31 |
spans.append(span_obj) # Text, bounding box, span id
|
| 32 |
span_id += 1
|
| 33 |
line_obj = Line(
|
| 34 |
spans=spans,
|
| 35 |
+
bbox=l["bbox"],
|
| 36 |
)
|
| 37 |
# Only select valid lines, with positive bboxes
|
| 38 |
+
if line_obj.area >= 0:
|
| 39 |
block_lines.append(line_obj)
|
| 40 |
block_obj = Block(
|
| 41 |
lines=block_lines,
|
| 42 |
+
bbox=block["bbox"],
|
| 43 |
pnum=pnum
|
| 44 |
)
|
| 45 |
+
# Only select blocks with lines
|
| 46 |
if len(block_lines) > 0:
|
| 47 |
page_blocks.append(block_obj)
|
| 48 |
+
out_page = Page(
|
| 49 |
+
blocks=page_blocks,
|
| 50 |
+
pnum=page["page"],
|
| 51 |
+
bbox=page["bbox"],
|
| 52 |
+
rotation=page["rotation"],
|
| 53 |
+
)
|
| 54 |
+
return out_page
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
+
def ocr_page(doc, pnum, page: Page, tess_lang: str):
|
|
|
|
| 58 |
ocr_pages = 0
|
| 59 |
ocr_success = 0
|
| 60 |
ocr_failed = 0
|
|
|
|
| 61 |
page_bbox = doc[pnum].bound()
|
|
|
|
|
|
|
| 62 |
|
| 63 |
+
blocks = get_single_page_blocks(doc, pnum, tess_lang)
|
| 64 |
page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox)
|
| 65 |
|
| 66 |
# OCR page if we got minimal text, or if we got too many spaces
|
|
|
|
| 68 |
(
|
| 69 |
no_text # Full doc has no text, and needs full OCR
|
| 70 |
or
|
| 71 |
+
(len(page_obj.prelim_text) > 0 and detect_bad_ocr(page_obj.prelim_text)) # Bad OCR
|
| 72 |
),
|
| 73 |
min_ocr_page < pnum < len(doc) - 1,
|
| 74 |
not disable_ocr
|
| 75 |
]
|
| 76 |
if all(conditions) or settings.OCR_ALL_PAGES:
|
| 77 |
page = doc[pnum]
|
| 78 |
+
blocks = get_single_page_blocks(doc, pnum, tess_lang, ocr=True)
|
| 79 |
page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox, rotation=page.rotation)
|
| 80 |
ocr_pages = 1
|
| 81 |
if len(blocks) == 0:
|
|
|
|
| 86 |
|
| 87 |
|
| 88 |
def get_text_blocks(doc, tess_lang: str, spell_lang: Optional[str], max_pages: Optional[int] = None, parallel: int = settings.OCR_PARALLEL_WORKERS):
|
| 89 |
+
toc = get_toc(doc)
|
|
|
|
| 90 |
ocr_pages = 0
|
| 91 |
ocr_failed = 0
|
| 92 |
ocr_success = 0
|
| 93 |
+
|
| 94 |
+
page_range = range(len(doc))
|
|
|
|
| 95 |
if max_pages:
|
| 96 |
range_end = min(max_pages, len(doc))
|
| 97 |
+
page_range = range(range_end)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
+
all_blocks = dictionary_output(doc, page_range=page_range)
|
| 100 |
+
all_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(all_blocks)]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
return all_blocks, toc, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success}
|
| 103 |
|
| 104 |
|
| 105 |
def naive_get_text(doc):
|
| 106 |
full_text = ""
|
| 107 |
+
for page_idx in range(len(doc)):
|
| 108 |
+
page = doc.get_page(page_idx)
|
| 109 |
+
text_page = page.get_textpage()
|
| 110 |
+
full_text += text_page.get_text_bounded() + "\n"
|
| 111 |
return full_text
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def get_toc(doc, max_depth=15):
|
| 115 |
+
toc = doc.get_toc(max_depth=max_depth)
|
| 116 |
+
toc_list = []
|
| 117 |
+
for item in toc:
|
| 118 |
+
list_item = {
|
| 119 |
+
"title": item.title,
|
| 120 |
+
"level": item.level,
|
| 121 |
+
"is_closed": item.is_closed,
|
| 122 |
+
"n_kids": item.n_kids,
|
| 123 |
+
"page_index": item.page_index,
|
| 124 |
+
"view_mode": pdfium_i.ViewmodeToStr.get(item.view_mode),
|
| 125 |
+
"view_pos": item.view_pos,
|
| 126 |
+
}
|
| 127 |
+
toc_list.append(list_item)
|
| 128 |
+
return toc_list
|
marker/logger.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
import logging
|
| 2 |
-
import fitz as pymupdf
|
| 3 |
import warnings
|
| 4 |
|
| 5 |
|
|
@@ -10,5 +9,4 @@ def configure_logging():
|
|
| 10 |
logging.getLogger('PIL').setLevel(logging.ERROR)
|
| 11 |
logging.getLogger('fitz').setLevel(logging.ERROR)
|
| 12 |
logging.getLogger('ocrmypdf').setLevel(logging.ERROR)
|
| 13 |
-
pymupdf.TOOLS.mupdf_display_errors(False)
|
| 14 |
warnings.simplefilter(action='ignore', category=FutureWarning)
|
|
|
|
| 1 |
import logging
|
|
|
|
| 2 |
import warnings
|
| 3 |
|
| 4 |
|
|
|
|
| 9 |
logging.getLogger('PIL').setLevel(logging.ERROR)
|
| 10 |
logging.getLogger('fitz').setLevel(logging.ERROR)
|
| 11 |
logging.getLogger('ocrmypdf').setLevel(logging.ERROR)
|
|
|
|
| 12 |
warnings.simplefilter(action='ignore', category=FutureWarning)
|
marker/ocr/page.py
CHANGED
|
@@ -1,9 +1,7 @@
|
|
| 1 |
import io
|
| 2 |
from typing import List, Optional
|
| 3 |
|
| 4 |
-
import fitz as pymupdf
|
| 5 |
import ocrmypdf
|
| 6 |
-
from spellchecker import SpellChecker
|
| 7 |
|
| 8 |
from marker.ocr.utils import detect_bad_ocr
|
| 9 |
from marker.schema import Block
|
|
@@ -12,16 +10,16 @@ from marker.settings import settings
|
|
| 12 |
ocrmypdf.configure_logging(verbosity=ocrmypdf.Verbosity.quiet)
|
| 13 |
|
| 14 |
|
| 15 |
-
def ocr_entire_page(page, lang: str
|
| 16 |
if settings.OCR_ENGINE == "tesseract":
|
| 17 |
-
return ocr_entire_page_tess(page, lang
|
| 18 |
elif settings.OCR_ENGINE == "ocrmypdf":
|
| 19 |
-
return ocr_entire_page_ocrmp(page, lang
|
| 20 |
else:
|
| 21 |
raise ValueError(f"Unknown OCR engine {settings.OCR_ENGINE}")
|
| 22 |
|
| 23 |
|
| 24 |
-
def ocr_entire_page_tess(page, lang: str
|
| 25 |
try:
|
| 26 |
full_tp = page.get_textpage_ocr(flags=settings.TEXT_FLAGS, dpi=settings.OCR_DPI, full=True, language=lang)
|
| 27 |
blocks = page.get_text("dict", sort=True, flags=settings.TEXT_FLAGS, textpage=full_tp)["blocks"]
|
|
@@ -32,14 +30,14 @@ def ocr_entire_page_tess(page, lang: str, spellchecker: Optional[SpellChecker] =
|
|
| 32 |
|
| 33 |
# Check if OCR worked. If it didn't, return empty list
|
| 34 |
# OCR can fail if there is a scanned blank page with some faint text impressions, for example
|
| 35 |
-
if detect_bad_ocr(full_text
|
| 36 |
return []
|
| 37 |
except RuntimeError:
|
| 38 |
return []
|
| 39 |
return blocks
|
| 40 |
|
| 41 |
|
| 42 |
-
def ocr_entire_page_ocrmp(page, lang: str
|
| 43 |
# Use ocrmypdf to get OCR text for the whole page
|
| 44 |
src = page.parent # the page's document
|
| 45 |
blank_doc = pymupdf.open() # make temporary 1-pager
|
|
@@ -71,7 +69,7 @@ def ocr_entire_page_ocrmp(page, lang: str, spellchecker: Optional[SpellChecker]
|
|
| 71 |
if len(full_text) == 0:
|
| 72 |
return []
|
| 73 |
|
| 74 |
-
if detect_bad_ocr(full_text
|
| 75 |
return []
|
| 76 |
|
| 77 |
return blocks
|
|
|
|
| 1 |
import io
|
| 2 |
from typing import List, Optional
|
| 3 |
|
|
|
|
| 4 |
import ocrmypdf
|
|
|
|
| 5 |
|
| 6 |
from marker.ocr.utils import detect_bad_ocr
|
| 7 |
from marker.schema import Block
|
|
|
|
| 10 |
ocrmypdf.configure_logging(verbosity=ocrmypdf.Verbosity.quiet)
|
| 11 |
|
| 12 |
|
| 13 |
+
def ocr_entire_page(page, lang: str) -> List[Block]:
|
| 14 |
if settings.OCR_ENGINE == "tesseract":
|
| 15 |
+
return ocr_entire_page_tess(page, lang)
|
| 16 |
elif settings.OCR_ENGINE == "ocrmypdf":
|
| 17 |
+
return ocr_entire_page_ocrmp(page, lang)
|
| 18 |
else:
|
| 19 |
raise ValueError(f"Unknown OCR engine {settings.OCR_ENGINE}")
|
| 20 |
|
| 21 |
|
| 22 |
+
def ocr_entire_page_tess(page, lang: str) -> List[Block]:
|
| 23 |
try:
|
| 24 |
full_tp = page.get_textpage_ocr(flags=settings.TEXT_FLAGS, dpi=settings.OCR_DPI, full=True, language=lang)
|
| 25 |
blocks = page.get_text("dict", sort=True, flags=settings.TEXT_FLAGS, textpage=full_tp)["blocks"]
|
|
|
|
| 30 |
|
| 31 |
# Check if OCR worked. If it didn't, return empty list
|
| 32 |
# OCR can fail if there is a scanned blank page with some faint text impressions, for example
|
| 33 |
+
if detect_bad_ocr(full_text):
|
| 34 |
return []
|
| 35 |
except RuntimeError:
|
| 36 |
return []
|
| 37 |
return blocks
|
| 38 |
|
| 39 |
|
| 40 |
+
def ocr_entire_page_ocrmp(page, lang: str) -> List[Block]:
|
| 41 |
# Use ocrmypdf to get OCR text for the whole page
|
| 42 |
src = page.parent # the page's document
|
| 43 |
blank_doc = pymupdf.open() # make temporary 1-pager
|
|
|
|
| 69 |
if len(full_text) == 0:
|
| 70 |
return []
|
| 71 |
|
| 72 |
+
if detect_bad_ocr(full_text):
|
| 73 |
return []
|
| 74 |
|
| 75 |
return blocks
|
marker/ocr/utils.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
| 1 |
from typing import Optional
|
| 2 |
|
| 3 |
from nltk import wordpunct_tokenize
|
| 4 |
-
from spellchecker import SpellChecker
|
| 5 |
from marker.settings import settings
|
| 6 |
import re
|
| 7 |
|
| 8 |
|
| 9 |
-
def detect_bad_ocr(text,
|
| 10 |
if len(text) == 0:
|
| 11 |
# Assume OCR failed if we have no text
|
| 12 |
return True
|
|
@@ -15,11 +14,6 @@ def detect_bad_ocr(text, spellchecker: Optional[SpellChecker], misspell_threshol
|
|
| 15 |
words = [w for w in words if w.strip()]
|
| 16 |
alpha_words = [word for word in words if word.isalnum()]
|
| 17 |
|
| 18 |
-
if spellchecker:
|
| 19 |
-
misspelled = spellchecker.unknown(alpha_words)
|
| 20 |
-
if len(misspelled) > len(alpha_words) * misspell_threshold:
|
| 21 |
-
return True
|
| 22 |
-
|
| 23 |
spaces = len(re.findall(r'\s+', text))
|
| 24 |
alpha_chars = len(re.sub(r'\s+', '', text))
|
| 25 |
if spaces / (alpha_chars + spaces) > space_threshold:
|
|
@@ -41,7 +35,8 @@ def detect_bad_ocr(text, spellchecker: Optional[SpellChecker], misspell_threshol
|
|
| 41 |
|
| 42 |
|
| 43 |
def font_flags_decomposer(flags):
|
| 44 |
-
|
|
|
|
| 45 |
l = []
|
| 46 |
if flags & 2 ** 0:
|
| 47 |
l.append("superscript")
|
|
|
|
| 1 |
from typing import Optional
|
| 2 |
|
| 3 |
from nltk import wordpunct_tokenize
|
|
|
|
| 4 |
from marker.settings import settings
|
| 5 |
import re
|
| 6 |
|
| 7 |
|
| 8 |
+
def detect_bad_ocr(text, space_threshold=.6, newline_threshold=.5, alphanum_threshold=.4):
|
| 9 |
if len(text) == 0:
|
| 10 |
# Assume OCR failed if we have no text
|
| 11 |
return True
|
|
|
|
| 14 |
words = [w for w in words if w.strip()]
|
| 15 |
alpha_words = [word for word in words if word.isalnum()]
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
spaces = len(re.findall(r'\s+', text))
|
| 18 |
alpha_chars = len(re.sub(r'\s+', '', text))
|
| 19 |
if spaces / (alpha_chars + spaces) > space_threshold:
|
|
|
|
| 35 |
|
| 36 |
|
| 37 |
def font_flags_decomposer(flags):
|
| 38 |
+
flags = int(flags)
|
| 39 |
+
|
| 40 |
l = []
|
| 41 |
if flags & 2 ** 0:
|
| 42 |
l.append("superscript")
|
marker/ordering.py
CHANGED
|
@@ -4,11 +4,11 @@ from typing import List
|
|
| 4 |
import torch
|
| 5 |
import sys, os
|
| 6 |
|
| 7 |
-
from marker.extract_text import convert_single_page
|
| 8 |
from transformers import LayoutLMv3ForSequenceClassification, LayoutLMv3Processor
|
| 9 |
from PIL import Image
|
| 10 |
import io
|
| 11 |
|
|
|
|
| 12 |
from marker.schema import Page
|
| 13 |
from marker.settings import settings
|
| 14 |
|
|
@@ -28,9 +28,7 @@ def get_inference_data(page, page_blocks: Page):
|
|
| 28 |
bboxes = deepcopy([block.bbox for block in page_blocks.blocks])
|
| 29 |
words = ["."] * len(bboxes)
|
| 30 |
|
| 31 |
-
|
| 32 |
-
png = pix.pil_tobytes(format="PNG")
|
| 33 |
-
rgb_image = Image.open(io.BytesIO(png)).convert("RGB")
|
| 34 |
|
| 35 |
page_box = page_blocks.bbox
|
| 36 |
pwidth = page_blocks.width
|
|
|
|
| 4 |
import torch
|
| 5 |
import sys, os
|
| 6 |
|
|
|
|
| 7 |
from transformers import LayoutLMv3ForSequenceClassification, LayoutLMv3Processor
|
| 8 |
from PIL import Image
|
| 9 |
import io
|
| 10 |
|
| 11 |
+
from marker.pdf.images import render_image
|
| 12 |
from marker.schema import Page
|
| 13 |
from marker.settings import settings
|
| 14 |
|
|
|
|
| 28 |
bboxes = deepcopy([block.bbox for block in page_blocks.blocks])
|
| 29 |
words = ["."] * len(bboxes)
|
| 30 |
|
| 31 |
+
rgb_image = render_image(page, dpi=settings.LAYOUT_DPI)
|
|
|
|
|
|
|
| 32 |
|
| 33 |
page_box = page_blocks.bbox
|
| 34 |
pwidth = page_blocks.width
|
marker/pdf/images.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pypdfium2 as pdfium
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def render_image(page: pdfium.PdfPage, dpi):
|
| 5 |
+
image = page.render(
|
| 6 |
+
scale=dpi / 72,
|
| 7 |
+
draw_annots=False
|
| 8 |
+
).to_pil()
|
| 9 |
+
image = image.convert("RGB")
|
| 10 |
+
return image
|
marker/schema.py
CHANGED
|
@@ -56,9 +56,8 @@ class Span(BboxElement):
|
|
| 56 |
text: str
|
| 57 |
span_id: str
|
| 58 |
font: str
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
descender: Optional[float] = None
|
| 62 |
block_type: Optional[str] = None
|
| 63 |
selected: bool = True
|
| 64 |
|
|
|
|
| 56 |
text: str
|
| 57 |
span_id: str
|
| 58 |
font: str
|
| 59 |
+
font_weight: float
|
| 60 |
+
font_size: float
|
|
|
|
| 61 |
block_type: Optional[str] = None
|
| 62 |
selected: bool = True
|
| 63 |
|
marker/segmentation.py
CHANGED
|
@@ -9,6 +9,8 @@ import io
|
|
| 9 |
from PIL import Image
|
| 10 |
from transformers import LayoutLMv3Processor
|
| 11 |
import numpy as np
|
|
|
|
|
|
|
| 12 |
from marker.settings import settings
|
| 13 |
from marker.schema import Page, BlockType
|
| 14 |
import torch
|
|
@@ -69,11 +71,7 @@ def get_page_encoding(page, page_blocks: Page):
|
|
| 69 |
pwidth = page_blocks.width
|
| 70 |
pheight = page_blocks.height
|
| 71 |
|
| 72 |
-
|
| 73 |
-
png = pix.pil_tobytes(format="PNG")
|
| 74 |
-
png_image = Image.open(io.BytesIO(png))
|
| 75 |
-
# If it is too large, make it smaller for the model
|
| 76 |
-
rgb_image = png_image.convert('RGB')
|
| 77 |
rgb_width, rgb_height = rgb_image.size
|
| 78 |
|
| 79 |
# Image is correct size wrt the pdf page
|
|
|
|
| 9 |
from PIL import Image
|
| 10 |
from transformers import LayoutLMv3Processor
|
| 11 |
import numpy as np
|
| 12 |
+
|
| 13 |
+
from marker.pdf.images import render_image
|
| 14 |
from marker.settings import settings
|
| 15 |
from marker.schema import Page, BlockType
|
| 16 |
import torch
|
|
|
|
| 71 |
pwidth = page_blocks.width
|
| 72 |
pheight = page_blocks.height
|
| 73 |
|
| 74 |
+
rgb_image = render_image(page, dpi=settings.LAYOUT_DPI)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
rgb_width, rgb_height = rgb_image.size
|
| 76 |
|
| 77 |
# Image is correct size wrt the pdf page
|
marker/settings.py
CHANGED
|
@@ -4,7 +4,6 @@ from typing import Optional, List, Dict
|
|
| 4 |
from dotenv import find_dotenv
|
| 5 |
from pydantic import computed_field
|
| 6 |
from pydantic_settings import BaseSettings
|
| 7 |
-
import fitz as pymupdf
|
| 8 |
import torch
|
| 9 |
|
| 10 |
|
|
@@ -32,15 +31,12 @@ class Settings(BaseSettings):
|
|
| 32 |
|
| 33 |
SUPPORTED_FILETYPES: Dict = {
|
| 34 |
"application/pdf": "pdf",
|
| 35 |
-
"application/epub+zip": "epub",
|
| 36 |
-
"application/x-mobipocket-ebook": "mobi",
|
| 37 |
-
"application/vnd.ms-xpsdocument": "xps",
|
| 38 |
-
"application/x-fictionbook+xml": "fb2"
|
| 39 |
}
|
| 40 |
|
| 41 |
-
# PyMuPDF
|
| 42 |
-
TEXT_FLAGS: int = pymupdf.TEXTFLAGS_DICT & ~pymupdf.TEXT_PRESERVE_LIGATURES & ~pymupdf.TEXT_PRESERVE_IMAGES
|
| 43 |
-
|
| 44 |
# OCR
|
| 45 |
INVALID_CHARS: List[str] = [chr(0xfffd), "�"]
|
| 46 |
OCR_DPI: int = 400
|
|
|
|
| 4 |
from dotenv import find_dotenv
|
| 5 |
from pydantic import computed_field
|
| 6 |
from pydantic_settings import BaseSettings
|
|
|
|
| 7 |
import torch
|
| 8 |
|
| 9 |
|
|
|
|
| 31 |
|
| 32 |
SUPPORTED_FILETYPES: Dict = {
|
| 33 |
"application/pdf": "pdf",
|
| 34 |
+
#"application/epub+zip": "epub",
|
| 35 |
+
#"application/x-mobipocket-ebook": "mobi",
|
| 36 |
+
#"application/vnd.ms-xpsdocument": "xps",
|
| 37 |
+
#"application/x-fictionbook+xml": "fb2"
|
| 38 |
}
|
| 39 |
|
|
|
|
|
|
|
|
|
|
| 40 |
# OCR
|
| 41 |
INVALID_CHARS: List[str] = [chr(0xfffd), "�"]
|
| 42 |
OCR_DPI: int = 400
|
poetry.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
CHANGED
|
@@ -23,7 +23,6 @@ python = ">=3.9,<3.13,!=3.9.7"
|
|
| 23 |
scikit-learn = "^1.3.2"
|
| 24 |
Pillow = "^10.1.0"
|
| 25 |
pytesseract = "^0.3.10"
|
| 26 |
-
PyMuPDF = "^1.23.5"
|
| 27 |
pymupdf-fonts = "^1.0.5"
|
| 28 |
pydantic = "^2.4.2"
|
| 29 |
pydantic-settings = "^2.0.3"
|
|
@@ -34,15 +33,15 @@ torch = "^2.1.2"
|
|
| 34 |
ray = "^2.9.0"
|
| 35 |
tqdm = "^4.66.1"
|
| 36 |
tabulate = "^0.9.0"
|
| 37 |
-
thefuzz = "^0.20.0"
|
| 38 |
python-magic = "^0.4.27"
|
| 39 |
-
pyspellchecker = "^0.7.2"
|
| 40 |
ftfy = "^6.1.1"
|
| 41 |
nltk = "^3.8.1"
|
| 42 |
ocrmypdf = "^15.4.0"
|
| 43 |
-
bitsandbytes = "^0.41.2.post2"
|
| 44 |
grpcio = "^1.60.0"
|
| 45 |
texify = "^0.1.8"
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
[tool.poetry.group.dev.dependencies]
|
| 48 |
jupyter = "^1.0.0"
|
|
|
|
| 23 |
scikit-learn = "^1.3.2"
|
| 24 |
Pillow = "^10.1.0"
|
| 25 |
pytesseract = "^0.3.10"
|
|
|
|
| 26 |
pymupdf-fonts = "^1.0.5"
|
| 27 |
pydantic = "^2.4.2"
|
| 28 |
pydantic-settings = "^2.0.3"
|
|
|
|
| 33 |
ray = "^2.9.0"
|
| 34 |
tqdm = "^4.66.1"
|
| 35 |
tabulate = "^0.9.0"
|
|
|
|
| 36 |
python-magic = "^0.4.27"
|
|
|
|
| 37 |
ftfy = "^6.1.1"
|
| 38 |
nltk = "^3.8.1"
|
| 39 |
ocrmypdf = "^15.4.0"
|
|
|
|
| 40 |
grpcio = "^1.60.0"
|
| 41 |
texify = "^0.1.8"
|
| 42 |
+
pdftext = "^0.3.1"
|
| 43 |
+
rapidfuzz = "^3.8.1"
|
| 44 |
+
surya-ocr = "^0.4.0"
|
| 45 |
|
| 46 |
[tool.poetry.group.dev.dependencies]
|
| 47 |
jupyter = "^1.0.0"
|