Vik Paruchuri
committed on
Commit
·
2c69783
1
Parent(s):
ac26884
Initial integration
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- docs/install_ocrmypdf.md +0 -29
- marker/{v2/builders → builders}/__init__.py +1 -1
- marker/{v2/builders → builders}/document.py +13 -10
- marker/{v2/builders → builders}/layout.py +8 -8
- marker/{v2/builders → builders}/ocr.py +9 -9
- marker/{v2/builders → builders}/structure.py +6 -6
- marker/cleaners/bullets.py +0 -8
- marker/cleaners/code.py +0 -131
- marker/cleaners/fontstyle.py +0 -30
- marker/cleaners/headers.py +0 -82
- marker/cleaners/headings.py +0 -129
- marker/cleaners/text.py +0 -8
- marker/cleaners/toc.py +0 -29
- marker/convert.py +0 -32
- marker/{v2/converters → converters}/__init__.py +1 -1
- marker/{v2/converters → converters}/pdf.py +22 -22
- marker/debug/data.py +0 -109
- marker/debug/render.py +0 -62
- marker/equations/equations.py +0 -179
- marker/equations/inference.py +0 -51
- marker/images/extract.py +0 -77
- marker/images/save.py +0 -18
- marker/layout/layout.py +0 -113
- marker/layout/order.py +0 -73
- marker/logger.py +0 -3
- marker/models.py +15 -35
- marker/ocr/detection.py +0 -28
- marker/ocr/heuristics.py +0 -78
- marker/ocr/lang.py +0 -44
- marker/ocr/recognition.py +0 -182
- marker/ocr/tesseract.py +0 -97
- marker/pdf/extract_text.py +0 -114
- marker/pdf/images.py +0 -27
- marker/pdf/utils.py +0 -75
- marker/postprocessors/markdown.py +0 -254
- marker/{v2/processors → processors}/__init__.py +3 -3
- marker/{v2/processors → processors}/code.py +4 -4
- marker/{v2/processors → processors}/debug.py +3 -3
- marker/{v2/processors → processors}/document_toc.py +3 -3
- marker/{v2/processors → processors}/equation.py +3 -3
- marker/{v2/processors → processors}/ignoretext.py +3 -3
- marker/{v2/processors → processors}/sectionheader.py +3 -3
- marker/{v2/processors → processors}/table.py +3 -3
- marker/{v2/processors → processors}/text.py +4 -4
- marker/{v2/providers → providers}/__init__.py +3 -3
- marker/{v2/providers → providers}/pdf.py +37 -8
- marker/{ocr → providers}/utils.py +4 -1
- marker/{v2/renderers → renderers}/__init__.py +3 -3
- marker/{v2/renderers → renderers}/html.py +3 -3
- marker/{v2/renderers → renderers}/json.py +5 -5
docs/install_ocrmypdf.md
DELETED
|
@@ -1,29 +0,0 @@
|
|
| 1 |
-
## Linux
|
| 2 |
-
|
| 3 |
-
- Run `apt-get install ocrmypdf`
|
| 4 |
-
- Install ghostscript > 9.55 by following [these instructions](https://ghostscript.readthedocs.io/en/latest/Install.html) or running `scripts/install/ghostscript_install.sh`.
|
| 5 |
-
- Run `pip install ocrmypdf`
|
| 6 |
-
- Install any tesseract language packages that you want (example `apt-get install tesseract-ocr-eng`)
|
| 7 |
-
- Set the tesseract data folder path
|
| 8 |
-
- Find the tesseract data folder `tessdata` with `find / -name tessdata`. Make sure to use the one corresponding to the latest tesseract version if you have multiple.
|
| 9 |
-
- Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
|
| 10 |
-
|
| 11 |
-
## Mac
|
| 12 |
-
|
| 13 |
-
Only needed if using `ocrmypdf` as the ocr backend.
|
| 14 |
-
|
| 15 |
-
- Run `brew install ocrmypdf`
|
| 16 |
-
- Run `brew install tesseract-lang` to add language support
|
| 17 |
-
- Run `pip install ocrmypdf`
|
| 18 |
-
- Set the tesseract data folder path
|
| 19 |
-
- Find the tesseract data folder `tessdata` with `brew list tesseract`
|
| 20 |
-
- Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
|
| 21 |
-
|
| 22 |
-
## Windows
|
| 23 |
-
|
| 24 |
-
- Install `ocrmypdf` and ghostscript by following [these instructions](https://ocrmypdf.readthedocs.io/en/latest/installation.html#installing-on-windows)
|
| 25 |
-
- Run `pip install ocrmypdf`
|
| 26 |
-
- Install any tesseract language packages you want
|
| 27 |
-
- Set the tesseract data folder path
|
| 28 |
-
- Find the tesseract data folder `tessdata` with `brew list tesseract`
|
| 29 |
-
- Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/{v2/builders → builders}/__init__.py
RENAMED
|
@@ -2,7 +2,7 @@ from typing import Optional
|
|
| 2 |
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
| 5 |
-
from marker.
|
| 6 |
|
| 7 |
|
| 8 |
class BaseBuilder:
|
|
|
|
| 2 |
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
| 5 |
+
from marker.util import assign_config
|
| 6 |
|
| 7 |
|
| 8 |
class BaseBuilder:
|
marker/{v2/builders → builders}/document.py
RENAMED
|
@@ -1,15 +1,18 @@
|
|
| 1 |
from marker.settings import settings
|
| 2 |
-
from marker.
|
| 3 |
-
from marker.
|
| 4 |
-
from marker.
|
| 5 |
-
from marker.
|
| 6 |
-
from marker.
|
| 7 |
-
from marker.
|
| 8 |
-
from marker.
|
| 9 |
-
from marker.
|
| 10 |
|
| 11 |
|
| 12 |
class DocumentBuilder(BaseBuilder):
|
|
|
|
|
|
|
|
|
|
| 13 |
def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, ocr_builder: OcrBuilder):
|
| 14 |
document = self.build_document(provider)
|
| 15 |
layout_builder(document, provider)
|
|
@@ -21,8 +24,8 @@ class DocumentBuilder(BaseBuilder):
|
|
| 21 |
initial_pages = [
|
| 22 |
PageGroupClass(
|
| 23 |
page_id=i,
|
| 24 |
-
lowres_image=provider.get_image(i,
|
| 25 |
-
highres_image=provider.get_image(i,
|
| 26 |
polygon=provider.get_page_bbox(i)
|
| 27 |
) for i in provider.page_range
|
| 28 |
]
|
|
|
|
| 1 |
from marker.settings import settings
|
| 2 |
+
from marker.builders import BaseBuilder
|
| 3 |
+
from marker.builders.layout import LayoutBuilder
|
| 4 |
+
from marker.builders.ocr import OcrBuilder
|
| 5 |
+
from marker.providers.pdf import PdfProvider
|
| 6 |
+
from marker.schema import BlockTypes
|
| 7 |
+
from marker.schema.document import Document
|
| 8 |
+
from marker.schema.groups.page import PageGroup
|
| 9 |
+
from marker.schema.registry import get_block_class
|
| 10 |
|
| 11 |
|
| 12 |
class DocumentBuilder(BaseBuilder):
|
| 13 |
+
lowres_image_dpi: int = 96
|
| 14 |
+
highres_image_dpi: int = 192
|
| 15 |
+
|
| 16 |
def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, ocr_builder: OcrBuilder):
|
| 17 |
document = self.build_document(provider)
|
| 18 |
layout_builder(document, provider)
|
|
|
|
| 24 |
initial_pages = [
|
| 25 |
PageGroupClass(
|
| 26 |
page_id=i,
|
| 27 |
+
lowres_image=provider.get_image(i, self.lowres_image_dpi),
|
| 28 |
+
highres_image=provider.get_image(i, self.highres_image_dpi),
|
| 29 |
polygon=provider.get_page_bbox(i)
|
| 30 |
) for i in provider.page_range
|
| 31 |
]
|
marker/{v2/builders → builders}/layout.py
RENAMED
|
@@ -5,14 +5,14 @@ from surya.schema import LayoutResult
|
|
| 5 |
from surya.model.layout.encoderdecoder import SuryaLayoutModel
|
| 6 |
|
| 7 |
from marker.settings import settings
|
| 8 |
-
from marker.
|
| 9 |
-
from marker.
|
| 10 |
-
from marker.
|
| 11 |
-
from marker.
|
| 12 |
-
from marker.
|
| 13 |
-
from marker.
|
| 14 |
-
from marker.
|
| 15 |
-
from marker.
|
| 16 |
|
| 17 |
|
| 18 |
class LayoutBuilder(BaseBuilder):
|
|
|
|
| 5 |
from surya.model.layout.encoderdecoder import SuryaLayoutModel
|
| 6 |
|
| 7 |
from marker.settings import settings
|
| 8 |
+
from marker.builders import BaseBuilder
|
| 9 |
+
from marker.providers import ProviderOutput, ProviderPageLines
|
| 10 |
+
from marker.providers.pdf import PdfProvider
|
| 11 |
+
from marker.schema import BlockTypes
|
| 12 |
+
from marker.schema.document import Document
|
| 13 |
+
from marker.schema.groups.page import PageGroup
|
| 14 |
+
from marker.schema.polygon import PolygonBox
|
| 15 |
+
from marker.schema.registry import get_block_class
|
| 16 |
|
| 17 |
|
| 18 |
class LayoutBuilder(BaseBuilder):
|
marker/{v2/builders → builders}/ocr.py
RENAMED
|
@@ -3,15 +3,15 @@ from surya.model.recognition.encoderdecoder import OCREncoderDecoderModel
|
|
| 3 |
from surya.ocr import run_ocr
|
| 4 |
|
| 5 |
from marker.settings import settings
|
| 6 |
-
from marker.
|
| 7 |
-
from marker.
|
| 8 |
-
from marker.
|
| 9 |
-
from marker.
|
| 10 |
-
from marker.
|
| 11 |
-
from marker.
|
| 12 |
-
from marker.
|
| 13 |
-
from marker.
|
| 14 |
-
from marker.
|
| 15 |
|
| 16 |
|
| 17 |
class OcrBuilder(BaseBuilder):
|
|
|
|
| 3 |
from surya.ocr import run_ocr
|
| 4 |
|
| 5 |
from marker.settings import settings
|
| 6 |
+
from marker.builders import BaseBuilder
|
| 7 |
+
from marker.providers import ProviderOutput, ProviderPageLines
|
| 8 |
+
from marker.providers.pdf import PdfProvider
|
| 9 |
+
from marker.schema import BlockTypes
|
| 10 |
+
from marker.schema.document import Document
|
| 11 |
+
from marker.schema.polygon import PolygonBox
|
| 12 |
+
from marker.schema.registry import get_block_class
|
| 13 |
+
from marker.schema.text.line import Line
|
| 14 |
+
from marker.schema.text.span import Span
|
| 15 |
|
| 16 |
|
| 17 |
class OcrBuilder(BaseBuilder):
|
marker/{v2/builders → builders}/structure.py
RENAMED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
-
from marker.
|
| 2 |
-
from marker.
|
| 3 |
-
from marker.
|
| 4 |
-
from marker.
|
| 5 |
-
from marker.
|
| 6 |
-
from marker.
|
| 7 |
|
| 8 |
|
| 9 |
class StructureBuilder(BaseBuilder):
|
|
|
|
| 1 |
+
from marker.builders import BaseBuilder
|
| 2 |
+
from marker.schema import BlockTypes
|
| 3 |
+
from marker.schema.document import Document
|
| 4 |
+
from marker.schema.groups import ListGroup
|
| 5 |
+
from marker.schema.groups.page import PageGroup
|
| 6 |
+
from marker.schema.registry import get_block_class
|
| 7 |
|
| 8 |
|
| 9 |
class StructureBuilder(BaseBuilder):
|
marker/cleaners/bullets.py
DELETED
|
@@ -1,8 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
def replace_bullets(text):
|
| 5 |
-
# Replace bullet characters with a -
|
| 6 |
-
bullet_pattern = r"(^|[\n ])[•●○■▪▫–—]( )"
|
| 7 |
-
replaced_string = re.sub(bullet_pattern, r"\1-\2", text)
|
| 8 |
-
return replaced_string
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/cleaners/code.py
DELETED
|
@@ -1,131 +0,0 @@
|
|
| 1 |
-
from collections import Counter
|
| 2 |
-
from statistics import mean, median
|
| 3 |
-
|
| 4 |
-
from marker.schema.block import Span, Line
|
| 5 |
-
from marker.schema.page import Page
|
| 6 |
-
import re
|
| 7 |
-
from typing import List
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
def is_code_linelen(lines, thresh=80):
|
| 11 |
-
# Decide based on chars per newline threshold
|
| 12 |
-
total_alnum_chars = sum(len(re.findall(r'\w', line.prelim_text)) for line in lines)
|
| 13 |
-
total_newlines = max(len(lines) - 1, 1)
|
| 14 |
-
|
| 15 |
-
if total_alnum_chars == 0:
|
| 16 |
-
return False
|
| 17 |
-
|
| 18 |
-
ratio = total_alnum_chars / total_newlines
|
| 19 |
-
return ratio < thresh
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
def comment_count(lines):
|
| 23 |
-
pattern = re.compile(r"^(//|#|'|--|/\*|'''|\"\"\"|--\[\[|<!--|%|%{|\(\*)")
|
| 24 |
-
return sum([1 for line in lines if pattern.match(line)])
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
def identify_code_blocks(pages: List[Page]):
|
| 28 |
-
code_block_count = 0
|
| 29 |
-
font_sizes = []
|
| 30 |
-
line_heights = []
|
| 31 |
-
for page in pages:
|
| 32 |
-
font_sizes += page.get_font_sizes()
|
| 33 |
-
line_heights += page.get_line_heights()
|
| 34 |
-
|
| 35 |
-
avg_font_size = None
|
| 36 |
-
avg_line_height = None
|
| 37 |
-
if len(font_sizes) > 0:
|
| 38 |
-
avg_line_height = median(line_heights)
|
| 39 |
-
avg_font_size = mean(font_sizes)
|
| 40 |
-
|
| 41 |
-
for page in pages:
|
| 42 |
-
for block in page.blocks:
|
| 43 |
-
if block.block_type != "Text":
|
| 44 |
-
last_block = block
|
| 45 |
-
continue
|
| 46 |
-
|
| 47 |
-
# Ensure we have lines and spans
|
| 48 |
-
if len(block.lines) == 0:
|
| 49 |
-
continue
|
| 50 |
-
if sum([len(line.spans) for line in block.lines]) == 0:
|
| 51 |
-
continue
|
| 52 |
-
|
| 53 |
-
min_start = block.get_min_line_start()
|
| 54 |
-
|
| 55 |
-
is_indent = []
|
| 56 |
-
line_fonts = []
|
| 57 |
-
line_font_sizes = []
|
| 58 |
-
block_line_heights = []
|
| 59 |
-
for line in block.lines:
|
| 60 |
-
line_fonts += [span.font for span in line.spans]
|
| 61 |
-
line_font_sizes += [span.font_size for span in line.spans]
|
| 62 |
-
block_line_heights.append(line.bbox[3] - line.bbox[1])
|
| 63 |
-
|
| 64 |
-
is_indent.append(line.bbox[0] > min_start)
|
| 65 |
-
|
| 66 |
-
comment_lines = comment_count([line.prelim_text for line in block.lines])
|
| 67 |
-
is_code = [
|
| 68 |
-
len(block.lines) > 3,
|
| 69 |
-
is_code_linelen(block.lines),
|
| 70 |
-
sum(is_indent) + comment_lines > len(block.lines) * .7, # Indentation and comments are a majority
|
| 71 |
-
]
|
| 72 |
-
|
| 73 |
-
if avg_font_size is not None:
|
| 74 |
-
font_checks = [
|
| 75 |
-
mean(line_font_sizes) <= avg_font_size * .8, # Lower than average font size and line height
|
| 76 |
-
mean(block_line_heights) < avg_line_height * .8
|
| 77 |
-
]
|
| 78 |
-
is_code += font_checks
|
| 79 |
-
|
| 80 |
-
if all(is_code):
|
| 81 |
-
code_block_count += 1
|
| 82 |
-
block.block_type = "Code"
|
| 83 |
-
|
| 84 |
-
return code_block_count
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
def indent_blocks(pages: List[Page]):
|
| 88 |
-
span_counter = 0
|
| 89 |
-
for page in pages:
|
| 90 |
-
for block in page.blocks:
|
| 91 |
-
if block.block_type != "Code":
|
| 92 |
-
continue
|
| 93 |
-
|
| 94 |
-
lines = []
|
| 95 |
-
min_left = 1000 # will contain x- coord of column 0
|
| 96 |
-
col_width = 0 # width of 1 char
|
| 97 |
-
for line in block.lines:
|
| 98 |
-
text = ""
|
| 99 |
-
min_left = min(line.bbox[0], min_left)
|
| 100 |
-
for span in line.spans:
|
| 101 |
-
if col_width == 0 and len(span.text) > 0:
|
| 102 |
-
col_width = (span.bbox[2] - span.bbox[0]) / len(span.text)
|
| 103 |
-
text += span.text
|
| 104 |
-
lines.append((line.bbox, text))
|
| 105 |
-
|
| 106 |
-
block_text = ""
|
| 107 |
-
blank_line = False
|
| 108 |
-
for line in lines:
|
| 109 |
-
text = line[1]
|
| 110 |
-
if col_width == 0:
|
| 111 |
-
prefix = ""
|
| 112 |
-
else:
|
| 113 |
-
prefix = " " * int((line[0][0] - min_left) / col_width)
|
| 114 |
-
current_line_blank = len(text.strip()) == 0
|
| 115 |
-
if blank_line and current_line_blank:
|
| 116 |
-
# Don't put multiple blank lines in a row
|
| 117 |
-
continue
|
| 118 |
-
|
| 119 |
-
block_text += prefix + text + "\n"
|
| 120 |
-
blank_line = current_line_blank
|
| 121 |
-
|
| 122 |
-
new_span = Span(
|
| 123 |
-
text=block_text,
|
| 124 |
-
bbox=block.bbox,
|
| 125 |
-
span_id=f"{span_counter}_fix_code",
|
| 126 |
-
font=block.lines[0].spans[0].font,
|
| 127 |
-
font_weight=block.lines[0].spans[0].font_weight,
|
| 128 |
-
font_size=block.lines[0].spans[0].font_size,
|
| 129 |
-
)
|
| 130 |
-
span_counter += 1
|
| 131 |
-
block.lines = [Line(spans=[new_span], bbox=block.bbox)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/cleaners/fontstyle.py
DELETED
|
@@ -1,30 +0,0 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
-
|
| 3 |
-
from marker.schema.page import Page
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
def find_bold_italic(pages: List[Page], bold_min_weight=600):
|
| 7 |
-
font_weights = []
|
| 8 |
-
for page in pages:
|
| 9 |
-
for block in page.blocks:
|
| 10 |
-
# We don't want to bias our font stats
|
| 11 |
-
if block.block_type in ["Title", "Section-header"]:
|
| 12 |
-
continue
|
| 13 |
-
for line in block.lines:
|
| 14 |
-
for span in line.spans:
|
| 15 |
-
if "bold" in span.font.lower():
|
| 16 |
-
span.bold = True
|
| 17 |
-
if "ital" in span.font.lower():
|
| 18 |
-
span.italic = True
|
| 19 |
-
|
| 20 |
-
font_weights.append(span.font_weight)
|
| 21 |
-
|
| 22 |
-
if len(font_weights) == 0:
|
| 23 |
-
return
|
| 24 |
-
|
| 25 |
-
for page in pages:
|
| 26 |
-
for block in page.blocks:
|
| 27 |
-
for line in block.lines:
|
| 28 |
-
for span in line.spans:
|
| 29 |
-
if span.font_weight >= bold_min_weight:
|
| 30 |
-
span.bold = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/cleaners/headers.py
DELETED
|
@@ -1,82 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
from collections import Counter
|
| 3 |
-
from rapidfuzz import fuzz
|
| 4 |
-
|
| 5 |
-
from marker.schema.merged import FullyMergedBlock
|
| 6 |
-
from typing import List, Tuple
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
def filter_common_elements(lines, page_count, threshold=.6):
|
| 10 |
-
# We can't filter if we don't have enough pages to find common elements
|
| 11 |
-
if page_count < 3:
|
| 12 |
-
return []
|
| 13 |
-
text = [s.text for line in lines for s in line.spans if len(s.text) > 4]
|
| 14 |
-
counter = Counter(text)
|
| 15 |
-
common = [k for k, v in counter.items() if v > page_count * threshold]
|
| 16 |
-
bad_span_ids = [s.span_id for line in lines for s in line.spans if s.text in common]
|
| 17 |
-
return bad_span_ids
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
def filter_header_footer(all_page_blocks, max_selected_lines=2):
|
| 21 |
-
first_lines = []
|
| 22 |
-
last_lines = []
|
| 23 |
-
for page in all_page_blocks:
|
| 24 |
-
nonblank_lines = page.get_nonblank_lines()
|
| 25 |
-
first_lines.extend(nonblank_lines[:max_selected_lines])
|
| 26 |
-
last_lines.extend(nonblank_lines[-max_selected_lines:])
|
| 27 |
-
|
| 28 |
-
bad_span_ids = filter_common_elements(first_lines, len(all_page_blocks))
|
| 29 |
-
bad_span_ids += filter_common_elements(last_lines, len(all_page_blocks))
|
| 30 |
-
return bad_span_ids
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
def replace_leading_trailing_digits(string, replacement):
|
| 34 |
-
string = re.sub(r'^\d+', replacement, string)
|
| 35 |
-
string = re.sub(r'\d+$', replacement, string)
|
| 36 |
-
return string
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
def find_overlap_elements(lst: List[Tuple[str, int]], string_match_thresh=.9, min_overlap=.05) -> List[int]:
|
| 40 |
-
# Initialize a list to store the elements that meet the criteria
|
| 41 |
-
result = []
|
| 42 |
-
titles = [l[0] for l in lst]
|
| 43 |
-
|
| 44 |
-
for i, (str1, id_num) in enumerate(lst):
|
| 45 |
-
overlap_count = 0 # Count the number of elements that overlap by at least 80%
|
| 46 |
-
|
| 47 |
-
for j, str2 in enumerate(titles):
|
| 48 |
-
if i != j and fuzz.ratio(str1, str2) >= string_match_thresh * 100:
|
| 49 |
-
overlap_count += 1
|
| 50 |
-
|
| 51 |
-
# Check if the element overlaps with at least 50% of other elements
|
| 52 |
-
if overlap_count >= max(3.0, len(lst) * min_overlap):
|
| 53 |
-
result.append(id_num)
|
| 54 |
-
|
| 55 |
-
return result
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
def filter_common_titles(merged_blocks: List[FullyMergedBlock]) -> List[FullyMergedBlock]:
|
| 59 |
-
titles = []
|
| 60 |
-
for i, block in enumerate(merged_blocks):
|
| 61 |
-
if block.block_type in ["Title", "Section-header"]:
|
| 62 |
-
text = block.text
|
| 63 |
-
if text.strip().startswith("#"):
|
| 64 |
-
text = re.sub(r'#+', '', text)
|
| 65 |
-
text = text.strip()
|
| 66 |
-
# Remove page numbers from start/end
|
| 67 |
-
text = replace_leading_trailing_digits(text, "").strip()
|
| 68 |
-
titles.append((text, i))
|
| 69 |
-
|
| 70 |
-
bad_block_ids = find_overlap_elements(titles)
|
| 71 |
-
|
| 72 |
-
new_blocks = []
|
| 73 |
-
for i, block in enumerate(merged_blocks):
|
| 74 |
-
if i in bad_block_ids:
|
| 75 |
-
continue
|
| 76 |
-
new_blocks.append(block)
|
| 77 |
-
|
| 78 |
-
return new_blocks
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/cleaners/headings.py
DELETED
|
@@ -1,129 +0,0 @@
|
|
| 1 |
-
from collections import defaultdict
|
| 2 |
-
from typing import List
|
| 3 |
-
import numpy as np
|
| 4 |
-
from sklearn.cluster import KMeans
|
| 5 |
-
|
| 6 |
-
from marker.settings import settings
|
| 7 |
-
from marker.schema.bbox import rescale_bbox
|
| 8 |
-
from marker.schema.block import bbox_from_lines
|
| 9 |
-
from marker.schema.page import Page
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
def split_heading_blocks(pages: List[Page]):
|
| 13 |
-
# Heading lines can be combined into regular text blocks sometimes by pdftext
|
| 14 |
-
# Split up heading lines into separate blocks properly
|
| 15 |
-
for page in pages:
|
| 16 |
-
page_heading_boxes = [b for b in page.layout.bboxes if b.label in ["Title", "Section-header"]]
|
| 17 |
-
page_heading_boxes = [(rescale_bbox(page.layout.image_bbox, page.bbox, b.bbox), b.label) for b in page_heading_boxes]
|
| 18 |
-
|
| 19 |
-
new_blocks = []
|
| 20 |
-
for block_idx, block in enumerate(page.blocks):
|
| 21 |
-
if block.block_type not in ["Text"]:
|
| 22 |
-
new_blocks.append(block)
|
| 23 |
-
continue
|
| 24 |
-
|
| 25 |
-
heading_lines = []
|
| 26 |
-
for line_idx, line in enumerate(block.lines):
|
| 27 |
-
for (heading_box, label) in page_heading_boxes:
|
| 28 |
-
if line.intersection_pct(heading_box) > settings.BBOX_INTERSECTION_THRESH:
|
| 29 |
-
heading_lines.append((line_idx, label))
|
| 30 |
-
break
|
| 31 |
-
|
| 32 |
-
if len(heading_lines) == 0:
|
| 33 |
-
new_blocks.append(block)
|
| 34 |
-
continue
|
| 35 |
-
|
| 36 |
-
# Split up the block into separate blocks around headers
|
| 37 |
-
start = 0
|
| 38 |
-
for (heading_line, label) in heading_lines:
|
| 39 |
-
if start < heading_line:
|
| 40 |
-
copied_block = block.copy()
|
| 41 |
-
copied_block.lines = block.lines[start:heading_line]
|
| 42 |
-
copied_block.bbox = bbox_from_lines(copied_block.lines)
|
| 43 |
-
new_blocks.append(copied_block)
|
| 44 |
-
|
| 45 |
-
copied_block = block.copy()
|
| 46 |
-
copied_block.lines = block.lines[heading_line:heading_line + 1]
|
| 47 |
-
copied_block.block_type = label
|
| 48 |
-
copied_block.bbox = bbox_from_lines(copied_block.lines)
|
| 49 |
-
new_blocks.append(copied_block)
|
| 50 |
-
|
| 51 |
-
start = heading_line + 1
|
| 52 |
-
if start >= len(block.lines):
|
| 53 |
-
break
|
| 54 |
-
|
| 55 |
-
# Add any remaining lines
|
| 56 |
-
if start < len(block.lines):
|
| 57 |
-
copied_block = block.copy()
|
| 58 |
-
copied_block.lines = block.lines[start:]
|
| 59 |
-
copied_block.bbox = bbox_from_lines(copied_block.lines)
|
| 60 |
-
new_blocks.append(copied_block)
|
| 61 |
-
|
| 62 |
-
page.blocks = new_blocks
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
def bucket_headings(line_heights, num_levels=settings.HEADING_LEVEL_COUNT):
|
| 66 |
-
if len(line_heights) <= num_levels:
|
| 67 |
-
return []
|
| 68 |
-
|
| 69 |
-
data = np.asarray(line_heights).reshape(-1, 1)
|
| 70 |
-
labels = KMeans(n_clusters=num_levels, random_state=0, n_init="auto").fit_predict(data)
|
| 71 |
-
data_labels = np.concatenate([data, labels.reshape(-1, 1)], axis=1)
|
| 72 |
-
data_labels = np.sort(data_labels, axis=0)
|
| 73 |
-
|
| 74 |
-
cluster_means = {int(label): float(np.mean(data_labels[data_labels[:, 1] == label, 0])) for label in np.unique(labels)}
|
| 75 |
-
label_max = None
|
| 76 |
-
label_min = None
|
| 77 |
-
heading_ranges = []
|
| 78 |
-
prev_cluster = None
|
| 79 |
-
for row in data_labels:
|
| 80 |
-
value, label = row
|
| 81 |
-
value = float(value)
|
| 82 |
-
label = int(label)
|
| 83 |
-
if prev_cluster is not None and label != prev_cluster:
|
| 84 |
-
prev_cluster_mean = cluster_means[prev_cluster]
|
| 85 |
-
cluster_mean = cluster_means[label]
|
| 86 |
-
if cluster_mean * settings.HEADING_MERGE_THRESHOLD < prev_cluster_mean:
|
| 87 |
-
heading_ranges.append((label_min, label_max))
|
| 88 |
-
label_min = None
|
| 89 |
-
label_max = None
|
| 90 |
-
|
| 91 |
-
label_min = value if label_min is None else min(label_min, value)
|
| 92 |
-
label_max = value if label_max is None else max(label_max, value)
|
| 93 |
-
prev_cluster = label
|
| 94 |
-
|
| 95 |
-
if label_min is not None:
|
| 96 |
-
heading_ranges.append((label_min, label_max))
|
| 97 |
-
|
| 98 |
-
heading_ranges = sorted(heading_ranges, reverse=True)
|
| 99 |
-
|
| 100 |
-
return heading_ranges
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
def infer_heading_levels(pages: List[Page], height_tol=.99):
|
| 104 |
-
all_line_heights = []
|
| 105 |
-
for page in pages:
|
| 106 |
-
for block in page.blocks:
|
| 107 |
-
if block.block_type not in ["Title", "Section-header"]:
|
| 108 |
-
continue
|
| 109 |
-
|
| 110 |
-
all_line_heights.extend([l.height for l in block.lines])
|
| 111 |
-
|
| 112 |
-
heading_ranges = bucket_headings(all_line_heights)
|
| 113 |
-
|
| 114 |
-
for page in pages:
|
| 115 |
-
for block in page.blocks:
|
| 116 |
-
if block.block_type not in ["Title", "Section-header"]:
|
| 117 |
-
continue
|
| 118 |
-
|
| 119 |
-
block_heights = [l.height for l in block.lines]
|
| 120 |
-
if len(block_heights) > 0:
|
| 121 |
-
avg_height = sum(block_heights) / len(block_heights)
|
| 122 |
-
for idx, (min_height, max_height) in enumerate(heading_ranges):
|
| 123 |
-
if avg_height >= min_height * height_tol:
|
| 124 |
-
block.heading_level = idx + 1
|
| 125 |
-
break
|
| 126 |
-
|
| 127 |
-
if block.heading_level is None:
|
| 128 |
-
block.heading_level = settings.HEADING_DEFAULT_LEVEL
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/cleaners/text.py
DELETED
|
@@ -1,8 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
def cleanup_text(full_text):
|
| 5 |
-
full_text = re.sub(r'\n{3,}', '\n\n', full_text)
|
| 6 |
-
full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text)
|
| 7 |
-
full_text = full_text.replace('\xa0', ' ') # Replace non-breaking spaces
|
| 8 |
-
return full_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/cleaners/toc.py
DELETED
|
@@ -1,29 +0,0 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
-
|
| 3 |
-
from marker.schema.page import Page
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
def get_pdf_toc(doc, max_depth=15):
|
| 7 |
-
toc = doc.get_toc(max_depth=max_depth)
|
| 8 |
-
toc_list = []
|
| 9 |
-
for item in toc:
|
| 10 |
-
list_item = {
|
| 11 |
-
"title": item.title,
|
| 12 |
-
"level": item.level,
|
| 13 |
-
"page": item.page_index,
|
| 14 |
-
}
|
| 15 |
-
toc_list.append(list_item)
|
| 16 |
-
return toc_list
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
def compute_toc(pages: List[Page]):
|
| 20 |
-
toc = []
|
| 21 |
-
for page in pages:
|
| 22 |
-
for block in page.blocks:
|
| 23 |
-
if block.block_type in ["Title", "Section-header"]:
|
| 24 |
-
toc.append({
|
| 25 |
-
"title": block.prelim_text,
|
| 26 |
-
"level": block.heading_level,
|
| 27 |
-
"page": page.pnum
|
| 28 |
-
})
|
| 29 |
-
return toc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/convert.py
CHANGED
|
@@ -1,42 +1,10 @@
|
|
| 1 |
import warnings
|
| 2 |
-
|
| 3 |
-
from marker.pdf.images import render_image
|
| 4 |
-
|
| 5 |
warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
|
| 6 |
|
| 7 |
import os
|
| 8 |
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
|
| 9 |
|
| 10 |
|
| 11 |
-
import pypdfium2 as pdfium # Needs to be at the top to avoid warnings
|
| 12 |
-
from PIL import Image
|
| 13 |
-
|
| 14 |
-
from marker.utils import flush_cuda_memory
|
| 15 |
-
from marker.tables.table import format_tables
|
| 16 |
-
from marker.debug.data import dump_bbox_debug_data, draw_page_debug_images
|
| 17 |
-
from marker.layout.layout import surya_layout, annotate_block_types
|
| 18 |
-
from marker.layout.order import surya_order, sort_blocks_in_reading_order
|
| 19 |
-
from marker.ocr.lang import replace_langs_with_codes, validate_langs
|
| 20 |
-
from marker.ocr.detection import surya_detection
|
| 21 |
-
from marker.ocr.recognition import run_ocr
|
| 22 |
-
from marker.pdf.extract_text import get_text_blocks
|
| 23 |
-
from marker.cleaners.headers import filter_header_footer, filter_common_titles
|
| 24 |
-
from marker.equations.equations import replace_equations
|
| 25 |
-
from marker.pdf.utils import find_filetype
|
| 26 |
-
from marker.cleaners.code import identify_code_blocks, indent_blocks
|
| 27 |
-
from marker.cleaners.bullets import replace_bullets
|
| 28 |
-
from marker.cleaners.headings import split_heading_blocks, infer_heading_levels
|
| 29 |
-
from marker.cleaners.fontstyle import find_bold_italic
|
| 30 |
-
from marker.postprocessors.markdown import merge_spans, merge_lines, get_full_text
|
| 31 |
-
from marker.cleaners.text import cleanup_text
|
| 32 |
-
from marker.images.extract import extract_images
|
| 33 |
-
from marker.images.save import images_to_dict
|
| 34 |
-
from marker.cleaners.toc import compute_toc
|
| 35 |
-
|
| 36 |
-
from typing import List, Dict, Tuple, Optional
|
| 37 |
-
from marker.settings import settings
|
| 38 |
-
|
| 39 |
-
|
| 40 |
def convert_single_pdf(
|
| 41 |
fname: str,
|
| 42 |
model_lst: List,
|
|
|
|
| 1 |
import warnings
|
|
|
|
|
|
|
|
|
|
| 2 |
warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
|
| 3 |
|
| 4 |
import os
|
| 5 |
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
|
| 6 |
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
def convert_single_pdf(
|
| 9 |
fname: str,
|
| 10 |
model_lst: List,
|
marker/{v2/converters → converters}/__init__.py
RENAMED
|
@@ -2,7 +2,7 @@ from typing import Optional
|
|
| 2 |
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
| 5 |
-
from marker.
|
| 6 |
|
| 7 |
|
| 8 |
class BaseConverter:
|
|
|
|
| 2 |
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
| 5 |
+
from marker.util import assign_config
|
| 6 |
|
| 7 |
|
| 8 |
class BaseConverter:
|
marker/{v2/converters → converters}/pdf.py
RENAMED
|
@@ -1,13 +1,13 @@
|
|
| 1 |
import json
|
| 2 |
|
| 3 |
from marker.settings import settings
|
| 4 |
-
from marker.
|
| 5 |
-
from marker.
|
| 6 |
-
from marker.
|
| 7 |
import os
|
| 8 |
|
| 9 |
-
from marker.
|
| 10 |
-
from marker.
|
| 11 |
|
| 12 |
os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
|
| 13 |
|
|
@@ -17,24 +17,24 @@ from typing import Dict, Type, List, Any
|
|
| 17 |
import click
|
| 18 |
import inspect
|
| 19 |
|
| 20 |
-
from marker.
|
| 21 |
-
from marker.
|
| 22 |
-
from marker.
|
| 23 |
-
from marker.
|
| 24 |
-
from marker.
|
| 25 |
-
from marker.
|
| 26 |
setup_recognition_model, setup_table_rec_model, setup_texify_model
|
| 27 |
-
from marker.
|
| 28 |
-
from marker.
|
| 29 |
-
from marker.
|
| 30 |
-
from marker.
|
| 31 |
-
from marker.
|
| 32 |
-
from marker.
|
| 33 |
-
from marker.
|
| 34 |
-
from marker.
|
| 35 |
-
from marker.
|
| 36 |
-
from marker.
|
| 37 |
-
from marker.
|
| 38 |
|
| 39 |
|
| 40 |
class PdfConverter(BaseConverter):
|
|
|
|
| 1 |
import json
|
| 2 |
|
| 3 |
from marker.settings import settings
|
| 4 |
+
from marker.processors.code import CodeProcessor
|
| 5 |
+
from marker.processors.document_toc import DocumentTOCProcessor
|
| 6 |
+
from marker.providers.pdf import PdfProvider
|
| 7 |
import os
|
| 8 |
|
| 9 |
+
from marker.renderers.json import JSONRenderer
|
| 10 |
+
from marker.util import parse_range_str
|
| 11 |
|
| 12 |
os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
|
| 13 |
|
|
|
|
| 17 |
import click
|
| 18 |
import inspect
|
| 19 |
|
| 20 |
+
from marker.builders.document import DocumentBuilder
|
| 21 |
+
from marker.builders.layout import LayoutBuilder
|
| 22 |
+
from marker.builders.ocr import OcrBuilder
|
| 23 |
+
from marker.builders.structure import StructureBuilder
|
| 24 |
+
from marker.converters import BaseConverter
|
| 25 |
+
from marker.models import setup_detection_model, setup_layout_model, \
|
| 26 |
setup_recognition_model, setup_table_rec_model, setup_texify_model
|
| 27 |
+
from marker.processors.equation import EquationProcessor
|
| 28 |
+
from marker.processors.sectionheader import SectionHeaderProcessor
|
| 29 |
+
from marker.processors.text import TextProcessor
|
| 30 |
+
from marker.processors.table import TableProcessor
|
| 31 |
+
from marker.renderers.markdown import MarkdownRenderer
|
| 32 |
+
from marker.schema import BlockTypes
|
| 33 |
+
from marker.schema.blocks import Block
|
| 34 |
+
from marker.schema.registry import register_block_class
|
| 35 |
+
from marker.processors.debug import DebugProcessor
|
| 36 |
+
from marker.processors import BaseProcessor
|
| 37 |
+
from marker.renderers import BaseRenderer
|
| 38 |
|
| 39 |
|
| 40 |
class PdfConverter(BaseConverter):
|
marker/debug/data.py
DELETED
|
@@ -1,109 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import math
|
| 3 |
-
import os
|
| 4 |
-
from typing import List
|
| 5 |
-
|
| 6 |
-
from marker.debug.render import render_on_image
|
| 7 |
-
from marker.schema.bbox import rescale_bbox
|
| 8 |
-
from marker.schema.page import Page
|
| 9 |
-
from marker.settings import settings
|
| 10 |
-
from PIL import Image
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
def draw_layout_page_debug_images(fname, pages: List[Page]):
|
| 14 |
-
# Remove extension from doc name
|
| 15 |
-
doc_base = os.path.basename(fname).rsplit(".", 1)[0]
|
| 16 |
-
|
| 17 |
-
debug_folder = os.path.join(settings.DEBUG_DATA_FOLDER, doc_base)
|
| 18 |
-
os.makedirs(debug_folder, exist_ok=True)
|
| 19 |
-
for idx, page in enumerate(pages):
|
| 20 |
-
img_size = (int(math.ceil(page.text_lines.image_bbox[2])), int(math.ceil(page.text_lines.image_bbox[3])))
|
| 21 |
-
png_image = Image.new("RGB", img_size, color="white")
|
| 22 |
-
|
| 23 |
-
line_bboxes = []
|
| 24 |
-
line_text = []
|
| 25 |
-
for block in page.blocks:
|
| 26 |
-
for line in block.lines:
|
| 27 |
-
line_bboxes.append(rescale_bbox(page.bbox, page.text_lines.image_bbox, line.bbox))
|
| 28 |
-
line_text.append(line.prelim_text)
|
| 29 |
-
|
| 30 |
-
render_on_image(line_bboxes, png_image, labels=line_text, color="black", draw_bbox=False)
|
| 31 |
-
pdf_image = png_image.copy()
|
| 32 |
-
|
| 33 |
-
line_bboxes = [line.bbox for line in page.text_lines.bboxes]
|
| 34 |
-
render_on_image(line_bboxes, png_image, color="blue")
|
| 35 |
-
|
| 36 |
-
layout_boxes = [rescale_bbox(page.layout.image_bbox, page.text_lines.image_bbox, box.bbox) for box in page.layout.bboxes]
|
| 37 |
-
layout_labels = [box.label for box in page.layout.bboxes]
|
| 38 |
-
|
| 39 |
-
render_on_image(layout_boxes, png_image, labels=layout_labels, color="red")
|
| 40 |
-
|
| 41 |
-
order_labels = [str(i) for i in range(len(page.layout.bboxes))]
|
| 42 |
-
render_on_image(layout_boxes, png_image, labels=order_labels, color="green", draw_bbox=False, label_offset=5)
|
| 43 |
-
|
| 44 |
-
debug_file = os.path.join(debug_folder, f"layout_page_{idx}.png")
|
| 45 |
-
png_image.save(debug_file)
|
| 46 |
-
|
| 47 |
-
# PDF Image
|
| 48 |
-
|
| 49 |
-
block_bboxes = [rescale_bbox(page.bbox, page.text_lines.image_bbox, block.bbox) for block in page.blocks]
|
| 50 |
-
block_labels = [block.block_type for block in page.blocks]
|
| 51 |
-
render_on_image(block_bboxes, pdf_image, labels=block_labels, color="red")
|
| 52 |
-
|
| 53 |
-
block_order = [str(i) for i in range(len(page.blocks))]
|
| 54 |
-
render_on_image(block_bboxes, pdf_image, labels=block_order, color="green", draw_bbox=False, label_offset=5)
|
| 55 |
-
|
| 56 |
-
debug_file = os.path.join(debug_folder, f"pdf_page_{idx}.png")
|
| 57 |
-
pdf_image.save(debug_file)
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
def draw_pdf_page_debug_images(fname, pages: List[Page]):
|
| 61 |
-
# Remove extension from doc name
|
| 62 |
-
doc_base = os.path.basename(fname).rsplit(".", 1)[0]
|
| 63 |
-
|
| 64 |
-
debug_folder = os.path.join(settings.DEBUG_DATA_FOLDER, doc_base)
|
| 65 |
-
os.makedirs(debug_folder, exist_ok=True)
|
| 66 |
-
for idx, page in enumerate(pages):
|
| 67 |
-
img_size = (int(math.ceil(page.text_lines.image_bbox[2])), int(math.ceil(page.text_lines.image_bbox[3])))
|
| 68 |
-
png_image = Image.new("RGB", img_size, color="white")
|
| 69 |
-
|
| 70 |
-
line_bboxes = []
|
| 71 |
-
line_text = []
|
| 72 |
-
for block in page.blocks:
|
| 73 |
-
for line in block.lines:
|
| 74 |
-
line_bboxes.append(rescale_bbox(page.bbox, page.text_lines.image_bbox, line.bbox))
|
| 75 |
-
line_text.append(line.prelim_text)
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
def draw_page_debug_images(fname, pages: List[Page]):
|
| 81 |
-
if not settings.DEBUG:
|
| 82 |
-
return
|
| 83 |
-
|
| 84 |
-
draw_layout_page_debug_images(fname, pages)
|
| 85 |
-
draw_pdf_page_debug_images(fname, pages)
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
def dump_bbox_debug_data(fname, pages: List[Page]):
|
| 90 |
-
if not settings.DEBUG:
|
| 91 |
-
return
|
| 92 |
-
|
| 93 |
-
# Remove extension from doc name
|
| 94 |
-
doc_base = os.path.basename(fname).rsplit(".", 1)[0]
|
| 95 |
-
|
| 96 |
-
debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_bbox.json")
|
| 97 |
-
debug_data = []
|
| 98 |
-
for idx, page_blocks in enumerate(pages):
|
| 99 |
-
page_data = page_blocks.model_dump(exclude=["images", "layout", "text_lines"])
|
| 100 |
-
page_data["layout"] = page_blocks.layout.model_dump(exclude=["segmentation_map"])
|
| 101 |
-
page_data["text_lines"] = page_blocks.text_lines.model_dump(exclude=["heatmap", "affinity_map"])
|
| 102 |
-
debug_data.append(page_data)
|
| 103 |
-
|
| 104 |
-
with open(debug_file, "w+") as f:
|
| 105 |
-
json.dump(debug_data, f)
|
| 106 |
-
print(f"Dumped bbox debug data to {debug_file}")
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/debug/render.py
DELETED
|
@@ -1,62 +0,0 @@
|
|
| 1 |
-
import requests
|
| 2 |
-
from PIL import ImageDraw, ImageFont, Image
|
| 3 |
-
|
| 4 |
-
from marker.settings import settings
|
| 5 |
-
import os
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
def get_font_path() -> str:
|
| 9 |
-
font_path = settings.DEBUG_RENDER_FONT
|
| 10 |
-
|
| 11 |
-
if not os.path.exists(font_path):
|
| 12 |
-
os.makedirs(os.path.dirname(font_path), exist_ok=True)
|
| 13 |
-
font_dl_path = f"{settings.FONT_DL_BASE}/{os.path.basename(font_path)}"
|
| 14 |
-
with requests.get(font_dl_path, stream=True) as r, open(font_path, 'wb') as f:
|
| 15 |
-
r.raise_for_status()
|
| 16 |
-
for chunk in r.iter_content(chunk_size=8192):
|
| 17 |
-
f.write(chunk)
|
| 18 |
-
|
| 19 |
-
return font_path
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
def get_text_size(text, font):
|
| 23 |
-
im = Image.new(mode="P", size=(0, 0))
|
| 24 |
-
draw = ImageDraw.Draw(im)
|
| 25 |
-
_, _, width, height = draw.textbbox((0, 0), text=text, font=font)
|
| 26 |
-
return width, height
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
def render_on_image(bboxes, image, labels=None, label_offset=1, label_font_size=10, color: str | list='red', draw_bbox=True):
|
| 30 |
-
draw = ImageDraw.Draw(image)
|
| 31 |
-
font_path = get_font_path()
|
| 32 |
-
label_font = ImageFont.truetype(font_path, label_font_size)
|
| 33 |
-
|
| 34 |
-
for i, bbox in enumerate(bboxes):
|
| 35 |
-
bbox = [int(p) for p in bbox]
|
| 36 |
-
if draw_bbox:
|
| 37 |
-
draw.rectangle(bbox, outline=color[i] if isinstance(color, list) else color, width=1)
|
| 38 |
-
|
| 39 |
-
if labels is not None:
|
| 40 |
-
label = labels[i]
|
| 41 |
-
text_position = (
|
| 42 |
-
bbox[0] + label_offset,
|
| 43 |
-
bbox[1] + label_offset
|
| 44 |
-
)
|
| 45 |
-
text_size = get_text_size(label, label_font)
|
| 46 |
-
if text_size[0] <= 0 or text_size[1] <= 0:
|
| 47 |
-
continue
|
| 48 |
-
box_position = (
|
| 49 |
-
text_position[0],
|
| 50 |
-
text_position[1],
|
| 51 |
-
text_position[0] + text_size[0],
|
| 52 |
-
text_position[1] + text_size[1]
|
| 53 |
-
)
|
| 54 |
-
draw.rectangle(box_position, fill="white")
|
| 55 |
-
draw.text(
|
| 56 |
-
text_position,
|
| 57 |
-
label,
|
| 58 |
-
fill=color[i] if isinstance(color, list) else color,
|
| 59 |
-
font=label_font
|
| 60 |
-
)
|
| 61 |
-
|
| 62 |
-
return image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/equations/equations.py
DELETED
|
@@ -1,179 +0,0 @@
|
|
| 1 |
-
from collections import defaultdict
|
| 2 |
-
from copy import deepcopy
|
| 3 |
-
from typing import List
|
| 4 |
-
|
| 5 |
-
from marker.equations.inference import get_total_texify_tokens, get_latex_batched
|
| 6 |
-
from marker.pdf.images import render_bbox_image
|
| 7 |
-
from marker.schema.bbox import rescale_bbox
|
| 8 |
-
from marker.schema.page import Page
|
| 9 |
-
from marker.schema.block import Line, Span, Block, split_block_lines, find_insert_block
|
| 10 |
-
from marker.settings import settings
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
def find_equation_blocks(page, processor):
|
| 14 |
-
equation_blocks = []
|
| 15 |
-
equation_regions = [l.bbox for l in page.layout.bboxes if l.label in ["Formula"]]
|
| 16 |
-
equation_regions = [rescale_bbox(page.layout.image_bbox, page.bbox, b) for b in equation_regions]
|
| 17 |
-
|
| 18 |
-
lines_to_remove = defaultdict(list)
|
| 19 |
-
insert_points = {}
|
| 20 |
-
equation_lines = defaultdict(list)
|
| 21 |
-
for region_idx, region in enumerate(equation_regions):
|
| 22 |
-
for block_idx, block in enumerate(page.blocks):
|
| 23 |
-
for line_idx, line in enumerate(block.lines):
|
| 24 |
-
if line.intersection_pct(region) > settings.BBOX_INTERSECTION_THRESH:
|
| 25 |
-
# We will remove this line from the block
|
| 26 |
-
lines_to_remove[region_idx].append((block_idx, line_idx))
|
| 27 |
-
equation_lines[region_idx].append(line)
|
| 28 |
-
|
| 29 |
-
if region_idx not in insert_points:
|
| 30 |
-
insert_points[region_idx] = (block_idx, line_idx)
|
| 31 |
-
|
| 32 |
-
# Account for regions where the lines were not detected
|
| 33 |
-
for region_idx, region in enumerate(equation_regions):
|
| 34 |
-
if region_idx in insert_points:
|
| 35 |
-
continue
|
| 36 |
-
|
| 37 |
-
insert_points[region_idx] = (find_insert_block(page.blocks, region), 0)
|
| 38 |
-
|
| 39 |
-
block_lines_to_remove = defaultdict(set)
|
| 40 |
-
for region_idx, equation_region in enumerate(equation_regions):
|
| 41 |
-
if region_idx not in equation_lines or len(equation_lines[region_idx]) == 0:
|
| 42 |
-
block_text = ""
|
| 43 |
-
total_tokens = 0
|
| 44 |
-
else:
|
| 45 |
-
equation_block = equation_lines[region_idx]
|
| 46 |
-
block_text = " ".join([line.prelim_text for line in equation_block])
|
| 47 |
-
total_tokens = get_total_texify_tokens(block_text, processor)
|
| 48 |
-
|
| 49 |
-
equation_insert = insert_points[region_idx]
|
| 50 |
-
equation_insert_line_idx = equation_insert[1]
|
| 51 |
-
equation_insert_line_idx -= len(
|
| 52 |
-
[x for x in lines_to_remove[region_idx] if x[0] == equation_insert[0] and x[1] < equation_insert[1]])
|
| 53 |
-
|
| 54 |
-
selected_blocks = [equation_insert[0], equation_insert_line_idx, total_tokens, block_text, equation_region]
|
| 55 |
-
if total_tokens < settings.TEXIFY_MODEL_MAX:
|
| 56 |
-
# Account for the lines we're about to remove
|
| 57 |
-
for item in lines_to_remove[region_idx]:
|
| 58 |
-
block_lines_to_remove[item[0]].add(item[1])
|
| 59 |
-
equation_blocks.append(selected_blocks)
|
| 60 |
-
|
| 61 |
-
# Remove the lines from the blocks
|
| 62 |
-
for block_idx, bad_lines in block_lines_to_remove.items():
|
| 63 |
-
block = page.blocks[block_idx]
|
| 64 |
-
block.lines = [line for idx, line in enumerate(block.lines) if idx not in bad_lines]
|
| 65 |
-
|
| 66 |
-
return equation_blocks
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
def increment_insert_points(page_equation_blocks, insert_block_idx, insert_count):
|
| 70 |
-
for idx, (block_idx, line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
|
| 71 |
-
if block_idx >= insert_block_idx:
|
| 72 |
-
page_equation_blocks[idx][0] += insert_count
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnum, processor):
|
| 76 |
-
converted_spans = []
|
| 77 |
-
idx = 0
|
| 78 |
-
success_count = 0
|
| 79 |
-
fail_count = 0
|
| 80 |
-
for block_number, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
|
| 81 |
-
latex_text = predictions[block_number]
|
| 82 |
-
conditions = [
|
| 83 |
-
get_total_texify_tokens(latex_text, processor) < settings.TEXIFY_MODEL_MAX, # Make sure we didn't get to the overall token max, indicates run-on
|
| 84 |
-
len(latex_text) > len(block_text) * .7,
|
| 85 |
-
len(latex_text.strip()) > 0
|
| 86 |
-
]
|
| 87 |
-
|
| 88 |
-
new_block = Block(
|
| 89 |
-
lines=[Line(
|
| 90 |
-
spans=[
|
| 91 |
-
Span(
|
| 92 |
-
text=block_text.replace("\n", " "),
|
| 93 |
-
bbox=equation_bbox,
|
| 94 |
-
span_id=f"{pnum}_{idx}_fixeq",
|
| 95 |
-
font="Latex",
|
| 96 |
-
font_weight=0,
|
| 97 |
-
font_size=0
|
| 98 |
-
)
|
| 99 |
-
],
|
| 100 |
-
bbox=equation_bbox
|
| 101 |
-
)],
|
| 102 |
-
bbox=equation_bbox,
|
| 103 |
-
block_type="Formula",
|
| 104 |
-
pnum=pnum
|
| 105 |
-
)
|
| 106 |
-
|
| 107 |
-
if not all(conditions):
|
| 108 |
-
fail_count += 1
|
| 109 |
-
else:
|
| 110 |
-
success_count += 1
|
| 111 |
-
new_block.lines[0].spans[0].text = latex_text.replace("\n", " ")
|
| 112 |
-
converted_spans.append(deepcopy(new_block.lines[0].spans[0]))
|
| 113 |
-
|
| 114 |
-
# Add in the new LaTeX block
|
| 115 |
-
if insert_line_idx == 0:
|
| 116 |
-
page_blocks.blocks.insert(insert_block_idx, new_block)
|
| 117 |
-
increment_insert_points(page_equation_blocks, insert_block_idx, 1)
|
| 118 |
-
elif insert_line_idx >= len(page_blocks.blocks[insert_block_idx].lines):
|
| 119 |
-
page_blocks.blocks.insert(insert_block_idx + 1, new_block)
|
| 120 |
-
increment_insert_points(page_equation_blocks, insert_block_idx + 1, 1)
|
| 121 |
-
else:
|
| 122 |
-
new_blocks = []
|
| 123 |
-
for block_idx, block in enumerate(page_blocks.blocks):
|
| 124 |
-
if block_idx == insert_block_idx:
|
| 125 |
-
split_block = split_block_lines(block, insert_line_idx)
|
| 126 |
-
new_blocks.append(split_block[0])
|
| 127 |
-
new_blocks.append(new_block)
|
| 128 |
-
new_blocks.append(split_block[1])
|
| 129 |
-
increment_insert_points(page_equation_blocks, insert_block_idx, 2)
|
| 130 |
-
else:
|
| 131 |
-
new_blocks.append(block)
|
| 132 |
-
page_blocks.blocks = new_blocks
|
| 133 |
-
|
| 134 |
-
return success_count, fail_count, converted_spans
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
def replace_equations(doc, pages: List[Page], texify_model, batch_multiplier=1):
|
| 138 |
-
unsuccessful_ocr = 0
|
| 139 |
-
successful_ocr = 0
|
| 140 |
-
|
| 141 |
-
# Find potential equation regions, and length of text in each region
|
| 142 |
-
equation_blocks = []
|
| 143 |
-
for pnum, page in enumerate(pages):
|
| 144 |
-
equation_blocks.append(find_equation_blocks(page, texify_model.processor))
|
| 145 |
-
|
| 146 |
-
eq_count = sum([len(x) for x in equation_blocks])
|
| 147 |
-
|
| 148 |
-
images = []
|
| 149 |
-
token_counts = []
|
| 150 |
-
for page_idx, page_equation_blocks in enumerate(equation_blocks):
|
| 151 |
-
page_obj = doc[page_idx]
|
| 152 |
-
for equation_idx, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
|
| 153 |
-
png_image = render_bbox_image(page_obj, pages[page_idx], equation_bbox)
|
| 154 |
-
|
| 155 |
-
images.append(png_image)
|
| 156 |
-
token_counts.append(token_count)
|
| 157 |
-
|
| 158 |
-
# Make batched predictions
|
| 159 |
-
predictions = get_latex_batched(images, token_counts, texify_model, batch_multiplier=batch_multiplier)
|
| 160 |
-
|
| 161 |
-
# Replace blocks with predictions
|
| 162 |
-
page_start = 0
|
| 163 |
-
converted_spans = []
|
| 164 |
-
for page_idx, page_equation_blocks in enumerate(equation_blocks):
|
| 165 |
-
page_equation_count = len(page_equation_blocks)
|
| 166 |
-
page_predictions = predictions[page_start:page_start + page_equation_count]
|
| 167 |
-
success_count, fail_count, converted_span = insert_latex_block(
|
| 168 |
-
pages[page_idx],
|
| 169 |
-
page_equation_blocks,
|
| 170 |
-
page_predictions,
|
| 171 |
-
page_idx,
|
| 172 |
-
texify_model.processor
|
| 173 |
-
)
|
| 174 |
-
converted_spans.extend(converted_span)
|
| 175 |
-
page_start += page_equation_count
|
| 176 |
-
successful_ocr += success_count
|
| 177 |
-
unsuccessful_ocr += fail_count
|
| 178 |
-
|
| 179 |
-
return pages, {"successful_ocr": successful_ocr, "unsuccessful_ocr": unsuccessful_ocr, "equations": eq_count}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/equations/inference.py
DELETED
|
@@ -1,51 +0,0 @@
|
|
| 1 |
-
from texify.inference import batch_inference
|
| 2 |
-
from tqdm import tqdm
|
| 3 |
-
|
| 4 |
-
from marker.settings import settings
|
| 5 |
-
import os
|
| 6 |
-
|
| 7 |
-
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
def get_batch_size():
|
| 11 |
-
if settings.TEXIFY_BATCH_SIZE is not None:
|
| 12 |
-
return settings.TEXIFY_BATCH_SIZE
|
| 13 |
-
elif settings.TORCH_DEVICE_MODEL == "cuda":
|
| 14 |
-
return 6
|
| 15 |
-
elif settings.TORCH_DEVICE_MODEL == "mps":
|
| 16 |
-
return 6
|
| 17 |
-
return 2
|
| 18 |
-
|
| 19 |
-
def get_latex_batched(images, token_counts, texify_model, batch_multiplier=1):
|
| 20 |
-
if len(images) == 0:
|
| 21 |
-
return []
|
| 22 |
-
|
| 23 |
-
predictions = [""] * len(images)
|
| 24 |
-
batch_size = get_batch_size() * batch_multiplier
|
| 25 |
-
|
| 26 |
-
for i in tqdm(range(0, len(images), batch_size), desc="Recognizing equations"):
|
| 27 |
-
# Dynamically set max length to save inference time
|
| 28 |
-
min_idx = i
|
| 29 |
-
max_idx = min(min_idx + batch_size, len(images))
|
| 30 |
-
max_length = max(token_counts[min_idx:max_idx])
|
| 31 |
-
max_length = min(max_length, settings.TEXIFY_MODEL_MAX)
|
| 32 |
-
max_length += settings.TEXIFY_TOKEN_BUFFER
|
| 33 |
-
|
| 34 |
-
model_output = batch_inference(images[min_idx:max_idx], texify_model, texify_model.processor, max_tokens=max_length)
|
| 35 |
-
|
| 36 |
-
for j, output in enumerate(model_output):
|
| 37 |
-
token_count = get_total_texify_tokens(output, texify_model.processor)
|
| 38 |
-
if token_count >= max_length - 1:
|
| 39 |
-
output = ""
|
| 40 |
-
|
| 41 |
-
image_idx = i + j
|
| 42 |
-
predictions[image_idx] = output
|
| 43 |
-
return predictions
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
def get_total_texify_tokens(text, processor):
|
| 47 |
-
tokenizer = processor.tokenizer
|
| 48 |
-
tokens = tokenizer(text)
|
| 49 |
-
return len(tokens["input_ids"])
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/images/extract.py
DELETED
|
@@ -1,77 +0,0 @@
|
|
| 1 |
-
from marker.images.save import get_image_filename
|
| 2 |
-
from marker.pdf.images import render_bbox_image
|
| 3 |
-
from marker.schema.bbox import rescale_bbox
|
| 4 |
-
from marker.schema.block import find_insert_block, Span, Line
|
| 5 |
-
from marker.settings import settings
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
def find_image_blocks(page):
|
| 9 |
-
image_blocks = []
|
| 10 |
-
image_regions = [l.bbox for l in page.layout.bboxes if l.label in ["Figure", "Picture"]]
|
| 11 |
-
image_regions = [rescale_bbox(page.layout.image_bbox, page.bbox, b) for b in image_regions]
|
| 12 |
-
|
| 13 |
-
insert_points = {}
|
| 14 |
-
for region_idx, region in enumerate(image_regions):
|
| 15 |
-
for block_idx, block in enumerate(page.blocks):
|
| 16 |
-
for line_idx, line in enumerate(block.lines):
|
| 17 |
-
if line.intersection_pct(region) > settings.BBOX_INTERSECTION_THRESH:
|
| 18 |
-
line.spans = [] # We will remove this line from the block
|
| 19 |
-
|
| 20 |
-
if region_idx not in insert_points:
|
| 21 |
-
insert_points[region_idx] = (block_idx, line_idx)
|
| 22 |
-
|
| 23 |
-
# Account for images with no detected lines
|
| 24 |
-
for region_idx, region in enumerate(image_regions):
|
| 25 |
-
if region_idx in insert_points:
|
| 26 |
-
continue
|
| 27 |
-
|
| 28 |
-
insert_points[region_idx] = (find_insert_block(page.blocks, region), 0)
|
| 29 |
-
|
| 30 |
-
for region_idx, image_region in enumerate(image_regions):
|
| 31 |
-
image_insert = insert_points[region_idx]
|
| 32 |
-
image_blocks.append([image_insert[0], image_insert[1], image_region])
|
| 33 |
-
|
| 34 |
-
return image_blocks
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
def extract_page_images(page_obj, page):
|
| 38 |
-
page.images = []
|
| 39 |
-
image_blocks = find_image_blocks(page)
|
| 40 |
-
|
| 41 |
-
for image_idx, (block_idx, line_idx, bbox) in enumerate(image_blocks):
|
| 42 |
-
if block_idx >= len(page.blocks):
|
| 43 |
-
block_idx = len(page.blocks) - 1
|
| 44 |
-
if block_idx < 0:
|
| 45 |
-
continue
|
| 46 |
-
|
| 47 |
-
block = page.blocks[block_idx]
|
| 48 |
-
image = render_bbox_image(page_obj, page, bbox)
|
| 49 |
-
image_filename = get_image_filename(page, image_idx)
|
| 50 |
-
image_markdown = f"\n\n\n\n"
|
| 51 |
-
image_span = Span(
|
| 52 |
-
bbox=bbox,
|
| 53 |
-
text=image_markdown,
|
| 54 |
-
font="Image",
|
| 55 |
-
rotation=0,
|
| 56 |
-
font_weight=0,
|
| 57 |
-
font_size=0,
|
| 58 |
-
image=True,
|
| 59 |
-
span_id=f"image_{image_idx}"
|
| 60 |
-
)
|
| 61 |
-
|
| 62 |
-
# Sometimes, the block has zero lines
|
| 63 |
-
if len(block.lines) > line_idx:
|
| 64 |
-
block.lines[line_idx].spans.append(image_span)
|
| 65 |
-
else:
|
| 66 |
-
line = Line(
|
| 67 |
-
bbox=bbox,
|
| 68 |
-
spans=[image_span]
|
| 69 |
-
)
|
| 70 |
-
block.lines.append(line)
|
| 71 |
-
page.images.append(image)
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
def extract_images(doc, pages):
|
| 75 |
-
for page_idx, page in enumerate(pages):
|
| 76 |
-
page_obj = doc[page_idx]
|
| 77 |
-
extract_page_images(page_obj, page)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/images/save.py
DELETED
|
@@ -1,18 +0,0 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
-
|
| 3 |
-
from marker.schema.page import Page
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
def get_image_filename(page: Page, image_idx):
|
| 7 |
-
return f"{page.pnum}_image_{image_idx}.png"
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
def images_to_dict(pages: List[Page]):
|
| 11 |
-
images = {}
|
| 12 |
-
for page in pages:
|
| 13 |
-
if page.images is None:
|
| 14 |
-
continue
|
| 15 |
-
for image_idx, image in enumerate(page.images):
|
| 16 |
-
image_filename = get_image_filename(page, image_idx)
|
| 17 |
-
images[image_filename] = image
|
| 18 |
-
return images
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/layout/layout.py
DELETED
|
@@ -1,113 +0,0 @@
|
|
| 1 |
-
from collections import defaultdict, Counter
|
| 2 |
-
from typing import List
|
| 3 |
-
|
| 4 |
-
from surya.layout import batch_layout_detection
|
| 5 |
-
|
| 6 |
-
from marker.pdf.images import render_image
|
| 7 |
-
from marker.schema.bbox import rescale_bbox
|
| 8 |
-
from marker.schema.block import bbox_from_lines
|
| 9 |
-
from marker.schema.page import Page
|
| 10 |
-
from marker.settings import settings
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
def get_batch_size():
|
| 14 |
-
if settings.LAYOUT_BATCH_SIZE is not None:
|
| 15 |
-
return settings.LAYOUT_BATCH_SIZE
|
| 16 |
-
elif settings.TORCH_DEVICE_MODEL == "cuda":
|
| 17 |
-
return 6
|
| 18 |
-
return 6
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
def surya_layout(images: list, pages: List[Page], layout_model, batch_multiplier=1):
|
| 22 |
-
text_detection_results = [p.text_lines for p in pages]
|
| 23 |
-
|
| 24 |
-
processor = layout_model.processor
|
| 25 |
-
layout_results = batch_layout_detection(images, layout_model, processor, detection_results=text_detection_results, batch_size=int(get_batch_size() * batch_multiplier))
|
| 26 |
-
for page, layout_result in zip(pages, layout_results):
|
| 27 |
-
page.layout = layout_result
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
def annotate_block_types(pages: List[Page]):
|
| 31 |
-
for page in pages:
|
| 32 |
-
max_intersections = {}
|
| 33 |
-
for i, block in enumerate(page.blocks):
|
| 34 |
-
for j, layout_block in enumerate(page.layout.bboxes):
|
| 35 |
-
layout_bbox = layout_block.bbox
|
| 36 |
-
layout_bbox = rescale_bbox(page.layout.image_bbox, page.bbox, layout_bbox)
|
| 37 |
-
intersection_pct = block.intersection_pct(layout_bbox)
|
| 38 |
-
if i not in max_intersections:
|
| 39 |
-
max_intersections[i] = (intersection_pct, j)
|
| 40 |
-
elif intersection_pct > max_intersections[i][0]:
|
| 41 |
-
max_intersections[i] = (intersection_pct, j)
|
| 42 |
-
|
| 43 |
-
for i, block in enumerate(page.blocks):
|
| 44 |
-
block = page.blocks[i]
|
| 45 |
-
block_type = None
|
| 46 |
-
if i in max_intersections and max_intersections[i][0] > 0.0:
|
| 47 |
-
j = max_intersections[i][1]
|
| 48 |
-
block_type = page.layout.bboxes[j].label
|
| 49 |
-
block.block_type = block_type
|
| 50 |
-
|
| 51 |
-
# Smarter block layout assignment - first assign same as closest block
|
| 52 |
-
# Next, fall back to text
|
| 53 |
-
for i, block in enumerate(page.blocks):
|
| 54 |
-
if block.block_type is not None:
|
| 55 |
-
continue
|
| 56 |
-
min_dist = None
|
| 57 |
-
min_dist_idx = None
|
| 58 |
-
for j, block2 in enumerate(page.blocks):
|
| 59 |
-
if j == i or block2.block_type is None:
|
| 60 |
-
continue
|
| 61 |
-
dist = block.distance(block2.bbox)
|
| 62 |
-
if min_dist_idx is None or dist < min_dist:
|
| 63 |
-
min_dist = dist
|
| 64 |
-
min_dist_idx = j
|
| 65 |
-
for line in block2.lines:
|
| 66 |
-
dist = block.distance(line.bbox)
|
| 67 |
-
if dist < min_dist:
|
| 68 |
-
min_dist = dist
|
| 69 |
-
min_dist_idx = j
|
| 70 |
-
|
| 71 |
-
if min_dist_idx is not None:
|
| 72 |
-
block.block_type = page.blocks[min_dist_idx].block_type
|
| 73 |
-
|
| 74 |
-
for i, block in enumerate(page.blocks):
|
| 75 |
-
if block.block_type is None:
|
| 76 |
-
block.block_type = settings.DEFAULT_BLOCK_TYPE
|
| 77 |
-
|
| 78 |
-
def get_layout_label(block_labels: List[str]):
|
| 79 |
-
counter = Counter(block_labels)
|
| 80 |
-
return counter.most_common(1)[0][0]
|
| 81 |
-
|
| 82 |
-
def generate_block(block, block_labels):
|
| 83 |
-
block.bbox = bbox_from_lines(block.lines)
|
| 84 |
-
block.block_type = get_layout_label(block_labels)
|
| 85 |
-
return block
|
| 86 |
-
|
| 87 |
-
# Merge blocks together, preserving pdf order
|
| 88 |
-
curr_layout_idx = None
|
| 89 |
-
curr_layout_block = None
|
| 90 |
-
curr_block_labels = []
|
| 91 |
-
new_blocks = []
|
| 92 |
-
for i in range(len(page.blocks)):
|
| 93 |
-
if i not in max_intersections or max_intersections[i][0] == 0:
|
| 94 |
-
if curr_layout_block is not None:
|
| 95 |
-
new_blocks.append(generate_block(curr_layout_block, curr_block_labels))
|
| 96 |
-
curr_layout_block = None
|
| 97 |
-
curr_layout_idx = None
|
| 98 |
-
curr_block_labels = []
|
| 99 |
-
new_blocks.append(page.blocks[i])
|
| 100 |
-
elif max_intersections[i][1] != curr_layout_idx:
|
| 101 |
-
if curr_layout_block is not None:
|
| 102 |
-
new_blocks.append(generate_block(curr_layout_block, curr_block_labels))
|
| 103 |
-
curr_layout_block = page.blocks[i].copy()
|
| 104 |
-
curr_layout_idx = max_intersections[i][1]
|
| 105 |
-
curr_block_labels = [page.blocks[i].block_type]
|
| 106 |
-
else:
|
| 107 |
-
curr_layout_block.lines.extend(page.blocks[i].lines)
|
| 108 |
-
curr_block_labels.append(page.blocks[i].block_type)
|
| 109 |
-
|
| 110 |
-
if curr_layout_block is not None:
|
| 111 |
-
new_blocks.append(generate_block(curr_layout_block, curr_block_labels))
|
| 112 |
-
|
| 113 |
-
page.blocks = new_blocks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/layout/order.py
DELETED
|
@@ -1,73 +0,0 @@
|
|
| 1 |
-
from collections import defaultdict
|
| 2 |
-
from typing import List
|
| 3 |
-
|
| 4 |
-
from surya.ordering import batch_ordering
|
| 5 |
-
|
| 6 |
-
from marker.pdf.images import render_image
|
| 7 |
-
from marker.pdf.utils import sort_block_group
|
| 8 |
-
from marker.schema.bbox import rescale_bbox
|
| 9 |
-
from marker.schema.page import Page
|
| 10 |
-
from marker.settings import settings
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
def get_batch_size():
|
| 14 |
-
if settings.ORDER_BATCH_SIZE is not None:
|
| 15 |
-
return settings.ORDER_BATCH_SIZE
|
| 16 |
-
elif settings.TORCH_DEVICE_MODEL == "cuda":
|
| 17 |
-
return 6
|
| 18 |
-
elif settings.TORCH_DEVICE_MODEL == "mps":
|
| 19 |
-
return 6
|
| 20 |
-
return 6
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
def surya_order(images: list, pages: List[Page], order_model, batch_multiplier=1):
|
| 24 |
-
# Get bboxes for all pages
|
| 25 |
-
bboxes = []
|
| 26 |
-
for page in pages:
|
| 27 |
-
bbox = [b.bbox for b in page.layout.bboxes][:settings.ORDER_MAX_BBOXES]
|
| 28 |
-
bboxes.append(bbox)
|
| 29 |
-
|
| 30 |
-
processor = order_model.processor
|
| 31 |
-
order_results = batch_ordering(images, bboxes, order_model, processor, batch_size=int(get_batch_size() * batch_multiplier))
|
| 32 |
-
for page, order_result in zip(pages, order_results):
|
| 33 |
-
page.order = order_result
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
def sort_blocks_in_reading_order(pages: List[Page]):
|
| 37 |
-
for page in pages:
|
| 38 |
-
order = page.order
|
| 39 |
-
block_positions = {}
|
| 40 |
-
max_position = 0
|
| 41 |
-
for i, block in enumerate(page.blocks):
|
| 42 |
-
for order_box in order.bboxes:
|
| 43 |
-
order_bbox = order_box.bbox
|
| 44 |
-
position = order_box.position
|
| 45 |
-
order_bbox = rescale_bbox(order.image_bbox, page.bbox, order_bbox)
|
| 46 |
-
block_intersection = block.intersection_pct(order_bbox)
|
| 47 |
-
if i not in block_positions:
|
| 48 |
-
block_positions[i] = (block_intersection, position)
|
| 49 |
-
elif block_intersection > block_positions[i][0]:
|
| 50 |
-
block_positions[i] = (block_intersection, position)
|
| 51 |
-
max_position = max(max_position, position)
|
| 52 |
-
block_groups = defaultdict(list)
|
| 53 |
-
for i, block in enumerate(page.blocks):
|
| 54 |
-
if i in block_positions:
|
| 55 |
-
position = block_positions[i][1]
|
| 56 |
-
else:
|
| 57 |
-
max_position += 1
|
| 58 |
-
position = max_position
|
| 59 |
-
|
| 60 |
-
block_groups[position].append(block)
|
| 61 |
-
|
| 62 |
-
new_blocks = []
|
| 63 |
-
for position in sorted(block_groups.keys()):
|
| 64 |
-
block_group = sort_block_group(block_groups[position])
|
| 65 |
-
new_blocks.extend(block_group)
|
| 66 |
-
|
| 67 |
-
# Ensure we properly put footers at the end of the page
|
| 68 |
-
footer_blocks = [b for b in new_blocks if b.block_type in ["Footnote", "Page-footer"]]
|
| 69 |
-
header_blocks = [b for b in new_blocks if b.block_type in ["Page-header"]]
|
| 70 |
-
regular_blocks = [b for b in new_blocks if b.block_type not in ["Footnote", "Page-footer", "Page-header"]]
|
| 71 |
-
|
| 72 |
-
new_blocks = header_blocks + regular_blocks + footer_blocks
|
| 73 |
-
page.blocks = new_blocks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/logger.py
CHANGED
|
@@ -5,8 +5,5 @@ import warnings
|
|
| 5 |
def configure_logging():
|
| 6 |
logging.basicConfig(level=logging.WARNING)
|
| 7 |
|
| 8 |
-
logging.getLogger('pdfminer').setLevel(logging.ERROR)
|
| 9 |
logging.getLogger('PIL').setLevel(logging.ERROR)
|
| 10 |
-
logging.getLogger('fitz').setLevel(logging.ERROR)
|
| 11 |
-
logging.getLogger('ocrmypdf').setLevel(logging.ERROR)
|
| 12 |
warnings.simplefilter(action='ignore', category=FutureWarning)
|
|
|
|
| 5 |
def configure_logging():
|
| 6 |
logging.basicConfig(level=logging.WARNING)
|
| 7 |
|
|
|
|
| 8 |
logging.getLogger('PIL').setLevel(logging.ERROR)
|
|
|
|
|
|
|
| 9 |
warnings.simplefilter(action='ignore', category=FutureWarning)
|
marker/models.py
CHANGED
|
@@ -1,21 +1,27 @@
|
|
| 1 |
import os
|
|
|
|
| 2 |
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
|
| 3 |
|
| 4 |
|
| 5 |
from surya.model.detection.model import load_model as load_detection_model, load_processor as load_detection_processor
|
| 6 |
-
from surya.model.layout.model import load_model as load_layout_model
|
|
|
|
| 7 |
from texify.model.model import load_model as load_texify_model
|
| 8 |
from texify.model.processor import load_processor as load_texify_processor
|
| 9 |
from marker.settings import settings
|
| 10 |
from surya.model.recognition.model import load_model as load_recognition_model
|
| 11 |
from surya.model.recognition.processor import load_processor as load_recognition_processor
|
| 12 |
-
from surya.model.ordering.model import load_model as load_order_model
|
| 13 |
-
from surya.model.ordering.processor import load_processor as load_order_processor
|
| 14 |
from surya.model.table_rec.model import load_model as load_table_model
|
| 15 |
from surya.model.table_rec.processor import load_processor as load_table_processor
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
-
def setup_table_rec_model(device=None, dtype=None):
|
| 19 |
if device:
|
| 20 |
table_model = load_table_model(device=device, dtype=dtype)
|
| 21 |
else:
|
|
@@ -24,7 +30,7 @@ def setup_table_rec_model(device=None, dtype=None):
|
|
| 24 |
return table_model
|
| 25 |
|
| 26 |
|
| 27 |
-
def setup_recognition_model(device=None, dtype=None):
|
| 28 |
if device:
|
| 29 |
rec_model = load_recognition_model(device=device, dtype=dtype)
|
| 30 |
else:
|
|
@@ -33,7 +39,7 @@ def setup_recognition_model(device=None, dtype=None):
|
|
| 33 |
return rec_model
|
| 34 |
|
| 35 |
|
| 36 |
-
def setup_detection_model(device=None, dtype=None):
|
| 37 |
if device:
|
| 38 |
model = load_detection_model(device=device, dtype=dtype)
|
| 39 |
else:
|
|
@@ -42,7 +48,7 @@ def setup_detection_model(device=None, dtype=None):
|
|
| 42 |
return model
|
| 43 |
|
| 44 |
|
| 45 |
-
def setup_texify_model(device=None, dtype=None):
|
| 46 |
if device:
|
| 47 |
texify_model = load_texify_model(checkpoint=settings.TEXIFY_MODEL_NAME, device=device, dtype=dtype)
|
| 48 |
else:
|
|
@@ -51,36 +57,10 @@ def setup_texify_model(device=None, dtype=None):
|
|
| 51 |
return texify_model
|
| 52 |
|
| 53 |
|
| 54 |
-
def setup_layout_model(device=None, dtype=None):
|
| 55 |
if device:
|
| 56 |
model = load_layout_model(device=device, dtype=dtype)
|
| 57 |
else:
|
| 58 |
model = load_layout_model()
|
| 59 |
model.processor = load_layout_processor()
|
| 60 |
-
return model
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
def setup_order_model(device=None, dtype=None):
|
| 64 |
-
if device:
|
| 65 |
-
model = load_order_model(device=device, dtype=dtype)
|
| 66 |
-
else:
|
| 67 |
-
model = load_order_model()
|
| 68 |
-
model.processor = load_order_processor()
|
| 69 |
-
return model
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
def load_all_models(device=None, dtype=None):
|
| 73 |
-
if device is not None:
|
| 74 |
-
assert dtype is not None, "Must provide dtype if device is provided"
|
| 75 |
-
|
| 76 |
-
# langs is optional list of languages to prune from recognition MoE model
|
| 77 |
-
detection = setup_detection_model(device, dtype)
|
| 78 |
-
layout = setup_layout_model(device, dtype)
|
| 79 |
-
order = setup_order_model(device, dtype)
|
| 80 |
-
|
| 81 |
-
# Only load recognition model if we'll need it for all pdfs
|
| 82 |
-
ocr = setup_recognition_model(device, dtype)
|
| 83 |
-
texify = setup_texify_model(device, dtype)
|
| 84 |
-
table_model = setup_table_rec_model(device, dtype)
|
| 85 |
-
model_lst = [texify, layout, order, detection, ocr, table_model]
|
| 86 |
-
return model_lst
|
|
|
|
| 1 |
import os
|
| 2 |
+
|
| 3 |
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
|
| 4 |
|
| 5 |
|
| 6 |
from surya.model.detection.model import load_model as load_detection_model, load_processor as load_detection_processor
|
| 7 |
+
from surya.model.layout.model import load_model as load_layout_model
|
| 8 |
+
from surya.model.layout.processor import load_processor as load_layout_processor
|
| 9 |
from texify.model.model import load_model as load_texify_model
|
| 10 |
from texify.model.processor import load_processor as load_texify_processor
|
| 11 |
from marker.settings import settings
|
| 12 |
from surya.model.recognition.model import load_model as load_recognition_model
|
| 13 |
from surya.model.recognition.processor import load_processor as load_recognition_processor
|
|
|
|
|
|
|
| 14 |
from surya.model.table_rec.model import load_model as load_table_model
|
| 15 |
from surya.model.table_rec.processor import load_processor as load_table_processor
|
| 16 |
|
| 17 |
+
from texify.model.model import GenerateVisionEncoderDecoderModel
|
| 18 |
+
from surya.model.layout.encoderdecoder import SuryaLayoutModel
|
| 19 |
+
from surya.model.detection.model import EfficientViTForSemanticSegmentation
|
| 20 |
+
from surya.model.recognition.encoderdecoder import OCREncoderDecoderModel
|
| 21 |
+
from surya.model.table_rec.encoderdecoder import TableRecEncoderDecoderModel
|
| 22 |
+
|
| 23 |
|
| 24 |
+
def setup_table_rec_model(device=None, dtype=None) -> TableRecEncoderDecoderModel:
|
| 25 |
if device:
|
| 26 |
table_model = load_table_model(device=device, dtype=dtype)
|
| 27 |
else:
|
|
|
|
| 30 |
return table_model
|
| 31 |
|
| 32 |
|
| 33 |
+
def setup_recognition_model(device=None, dtype=None) -> OCREncoderDecoderModel:
|
| 34 |
if device:
|
| 35 |
rec_model = load_recognition_model(device=device, dtype=dtype)
|
| 36 |
else:
|
|
|
|
| 39 |
return rec_model
|
| 40 |
|
| 41 |
|
| 42 |
+
def setup_detection_model(device=None, dtype=None) -> EfficientViTForSemanticSegmentation:
|
| 43 |
if device:
|
| 44 |
model = load_detection_model(device=device, dtype=dtype)
|
| 45 |
else:
|
|
|
|
| 48 |
return model
|
| 49 |
|
| 50 |
|
| 51 |
+
def setup_texify_model(device=None, dtype=None) -> GenerateVisionEncoderDecoderModel:
|
| 52 |
if device:
|
| 53 |
texify_model = load_texify_model(checkpoint=settings.TEXIFY_MODEL_NAME, device=device, dtype=dtype)
|
| 54 |
else:
|
|
|
|
| 57 |
return texify_model
|
| 58 |
|
| 59 |
|
| 60 |
+
def setup_layout_model(device=None, dtype=None) -> SuryaLayoutModel:
|
| 61 |
if device:
|
| 62 |
model = load_layout_model(device=device, dtype=dtype)
|
| 63 |
else:
|
| 64 |
model = load_layout_model()
|
| 65 |
model.processor = load_layout_processor()
|
| 66 |
+
return model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/ocr/detection.py
DELETED
|
@@ -1,28 +0,0 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
-
|
| 3 |
-
from pypdfium2 import PdfDocument
|
| 4 |
-
from surya.detection import batch_text_detection
|
| 5 |
-
|
| 6 |
-
from marker.pdf.images import render_image
|
| 7 |
-
from marker.schema.page import Page
|
| 8 |
-
from marker.settings import settings
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
def get_batch_size():
|
| 12 |
-
if settings.DETECTOR_BATCH_SIZE is not None:
|
| 13 |
-
return settings.DETECTOR_BATCH_SIZE
|
| 14 |
-
elif settings.TORCH_DEVICE_MODEL == "cuda":
|
| 15 |
-
return 4
|
| 16 |
-
return 4
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
def surya_detection(images: list, pages: List[Page], det_model, batch_multiplier=1):
|
| 20 |
-
processor = det_model.processor
|
| 21 |
-
|
| 22 |
-
predictions = batch_text_detection(images, det_model, processor, batch_size=int(get_batch_size() * batch_multiplier))
|
| 23 |
-
for (page, pred) in zip(pages, predictions):
|
| 24 |
-
page.text_lines = pred
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/ocr/heuristics.py
DELETED
|
@@ -1,78 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
from typing import List
|
| 3 |
-
|
| 4 |
-
from marker.ocr.utils import alphanum_ratio
|
| 5 |
-
from marker.schema.bbox import rescale_bbox, box_intersection_pct
|
| 6 |
-
from marker.schema.page import Page
|
| 7 |
-
from marker.settings import settings
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
def should_ocr_page(page: Page, no_text: bool, ocr_all_pages=False):
|
| 11 |
-
detected_lines_found, total_lines = detected_line_coverage(page)
|
| 12 |
-
|
| 13 |
-
# No reason to OCR page if it has no text lines
|
| 14 |
-
if total_lines == 0:
|
| 15 |
-
return False
|
| 16 |
-
|
| 17 |
-
# OCR page if we got minimal text, or if we got too many spaces
|
| 18 |
-
conditions = [
|
| 19 |
-
no_text, # Full doc has no text, and needs full OCR
|
| 20 |
-
(len(page.prelim_text) > 0 and detect_bad_ocr(page.prelim_text)), # Bad OCR
|
| 21 |
-
detected_lines_found is False, # didn't extract text for all detected lines
|
| 22 |
-
]
|
| 23 |
-
|
| 24 |
-
return any(conditions) or ocr_all_pages
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
def detect_bad_ocr(text, space_threshold=.7, newline_threshold=.6, alphanum_threshold=.3):
|
| 28 |
-
if len(text) == 0:
|
| 29 |
-
# Assume OCR failed if we have no text
|
| 30 |
-
return True
|
| 31 |
-
|
| 32 |
-
spaces = len(re.findall(r'\s+', text))
|
| 33 |
-
alpha_chars = len(re.sub(r'\s+', '', text))
|
| 34 |
-
if spaces / (alpha_chars + spaces) > space_threshold:
|
| 35 |
-
return True
|
| 36 |
-
|
| 37 |
-
newlines = len(re.findall(r'\n+', text))
|
| 38 |
-
non_newlines = len(re.sub(r'\n+', '', text))
|
| 39 |
-
if newlines / (newlines + non_newlines) > newline_threshold:
|
| 40 |
-
return True
|
| 41 |
-
|
| 42 |
-
if alphanum_ratio(text) < alphanum_threshold: # Garbled text
|
| 43 |
-
return True
|
| 44 |
-
|
| 45 |
-
invalid_chars = len([c for c in text if c in settings.INVALID_CHARS])
|
| 46 |
-
if invalid_chars > max(6.0, len(text) * .03):
|
| 47 |
-
return True
|
| 48 |
-
|
| 49 |
-
return False
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
def no_text_found(pages: List[Page]):
|
| 53 |
-
full_text = ""
|
| 54 |
-
for page in pages:
|
| 55 |
-
full_text += page.prelim_text
|
| 56 |
-
return len(full_text.strip()) == 0
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.4):
|
| 60 |
-
found_lines = 0
|
| 61 |
-
for detected_line in page.text_lines.bboxes:
|
| 62 |
-
# Get bbox and rescale to match dimensions of original page
|
| 63 |
-
detected_bbox = detected_line.bbox
|
| 64 |
-
detected_bbox = rescale_bbox(page.text_lines.image_bbox, page.bbox, detected_bbox)
|
| 65 |
-
|
| 66 |
-
total_intersection = 0
|
| 67 |
-
for block in page.blocks:
|
| 68 |
-
for line in block.lines:
|
| 69 |
-
intersection_pct = box_intersection_pct(detected_bbox, line.bbox)
|
| 70 |
-
total_intersection += intersection_pct
|
| 71 |
-
if total_intersection > intersect_thresh:
|
| 72 |
-
found_lines += 1
|
| 73 |
-
|
| 74 |
-
total_lines = len(page.text_lines.bboxes)
|
| 75 |
-
if total_lines == 0:
|
| 76 |
-
return True, 0
|
| 77 |
-
|
| 78 |
-
return found_lines / total_lines > detection_thresh, total_lines
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/ocr/lang.py
DELETED
|
@@ -1,44 +0,0 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
-
|
| 3 |
-
from surya.languages import CODE_TO_LANGUAGE, LANGUAGE_TO_CODE
|
| 4 |
-
from surya.model.recognition.tokenizer import _tokenize as lang_tokenize
|
| 5 |
-
|
| 6 |
-
from marker.ocr.tesseract import LANGUAGE_TO_TESSERACT_CODE, TESSERACT_CODE_TO_LANGUAGE
|
| 7 |
-
from marker.settings import settings
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
def langs_to_ids(langs: List[str]):
|
| 11 |
-
unique_langs = list(set(langs))
|
| 12 |
-
_, lang_tokens = lang_tokenize("", unique_langs)
|
| 13 |
-
return lang_tokens
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
def replace_langs_with_codes(langs):
|
| 17 |
-
if settings.OCR_ENGINE == "surya":
|
| 18 |
-
if langs is None:
|
| 19 |
-
return
|
| 20 |
-
for i, lang in enumerate(langs):
|
| 21 |
-
if lang.title() in LANGUAGE_TO_CODE:
|
| 22 |
-
langs[i] = LANGUAGE_TO_CODE[lang.title()]
|
| 23 |
-
else:
|
| 24 |
-
if langs is None:
|
| 25 |
-
langs = [settings.DEFAULT_LANG]
|
| 26 |
-
print(f"No languages specified for tesseract, defaulting to {settings.DEFAULT_LANG}.")
|
| 27 |
-
|
| 28 |
-
for i, lang in enumerate(langs):
|
| 29 |
-
if lang in LANGUAGE_TO_CODE:
|
| 30 |
-
langs[i] = LANGUAGE_TO_TESSERACT_CODE[lang]
|
| 31 |
-
return langs
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
def validate_langs(langs):
|
| 35 |
-
if settings.OCR_ENGINE == "surya":
|
| 36 |
-
if langs is None:
|
| 37 |
-
return
|
| 38 |
-
for lang in langs:
|
| 39 |
-
if lang not in CODE_TO_LANGUAGE:
|
| 40 |
-
raise ValueError(f"Invalid language code {lang} for Surya OCR")
|
| 41 |
-
else:
|
| 42 |
-
for lang in langs:
|
| 43 |
-
if lang not in TESSERACT_CODE_TO_LANGUAGE:
|
| 44 |
-
raise ValueError(f"Invalid language code {lang} for Tesseract")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/ocr/recognition.py
DELETED
|
@@ -1,182 +0,0 @@
|
|
| 1 |
-
import tempfile
|
| 2 |
-
from copy import deepcopy
|
| 3 |
-
from itertools import repeat
|
| 4 |
-
from typing import List, Optional, Dict
|
| 5 |
-
|
| 6 |
-
import pypdfium2 as pdfium
|
| 7 |
-
import io
|
| 8 |
-
from concurrent.futures import ThreadPoolExecutor
|
| 9 |
-
|
| 10 |
-
from surya.ocr import run_recognition
|
| 11 |
-
|
| 12 |
-
from marker.models import setup_recognition_model
|
| 13 |
-
from marker.ocr.heuristics import should_ocr_page, no_text_found, detect_bad_ocr
|
| 14 |
-
from marker.ocr.lang import langs_to_ids
|
| 15 |
-
from marker.pdf.images import render_image
|
| 16 |
-
from marker.schema.bbox import rescale_bbox
|
| 17 |
-
from marker.schema.page import Page
|
| 18 |
-
from marker.schema.block import Block, Line, Span
|
| 19 |
-
from marker.settings import settings
|
| 20 |
-
from marker.pdf.extract_text import get_text_blocks
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
def get_batch_size():
|
| 24 |
-
if settings.RECOGNITION_BATCH_SIZE is not None:
|
| 25 |
-
return settings.RECOGNITION_BATCH_SIZE
|
| 26 |
-
elif settings.TORCH_DEVICE_MODEL == "cuda":
|
| 27 |
-
return 32
|
| 28 |
-
elif settings.TORCH_DEVICE_MODEL == "mps":
|
| 29 |
-
return 32
|
| 30 |
-
return 32
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplier=1, ocr_all_pages=False) -> (List[Page], Dict):
|
| 34 |
-
ocr_pages = 0
|
| 35 |
-
ocr_success = 0
|
| 36 |
-
ocr_failed = 0
|
| 37 |
-
no_text = no_text_found(pages)
|
| 38 |
-
ocr_idxs = []
|
| 39 |
-
for pnum, page in enumerate(pages):
|
| 40 |
-
ocr_needed = should_ocr_page(page, no_text, ocr_all_pages=ocr_all_pages)
|
| 41 |
-
if ocr_needed:
|
| 42 |
-
ocr_idxs.append(pnum)
|
| 43 |
-
ocr_pages += 1
|
| 44 |
-
|
| 45 |
-
# No pages need OCR
|
| 46 |
-
if ocr_pages == 0:
|
| 47 |
-
return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
|
| 48 |
-
|
| 49 |
-
ocr_method = settings.OCR_ENGINE
|
| 50 |
-
if ocr_method is None or ocr_method == "None":
|
| 51 |
-
return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
|
| 52 |
-
elif ocr_method == "surya":
|
| 53 |
-
new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages, batch_multiplier=batch_multiplier)
|
| 54 |
-
elif ocr_method == "ocrmypdf":
|
| 55 |
-
new_pages = tesseract_recognition(doc, ocr_idxs, langs)
|
| 56 |
-
else:
|
| 57 |
-
raise ValueError(f"Unknown OCR method {ocr_method}")
|
| 58 |
-
|
| 59 |
-
for orig_idx, page in zip(ocr_idxs, new_pages):
|
| 60 |
-
if detect_bad_ocr(page.prelim_text) or len(page.prelim_text) == 0:
|
| 61 |
-
ocr_failed += 1
|
| 62 |
-
else:
|
| 63 |
-
ocr_success += 1
|
| 64 |
-
pages[orig_idx] = page
|
| 65 |
-
|
| 66 |
-
return pages, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success, "ocr_engine": ocr_method}
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[Page], batch_multiplier=1) -> List[Optional[Page]]:
|
| 70 |
-
# Slice images in higher resolution than detection happened in
|
| 71 |
-
images = [render_image(doc[pnum], dpi=settings.SURYA_OCR_DPI) for pnum in page_idxs]
|
| 72 |
-
box_scale = settings.SURYA_OCR_DPI / settings.SURYA_DETECTOR_DPI
|
| 73 |
-
|
| 74 |
-
processor = rec_model.processor
|
| 75 |
-
selected_pages = [p for i, p in enumerate(pages) if i in page_idxs]
|
| 76 |
-
|
| 77 |
-
surya_langs = [langs] * len(page_idxs)
|
| 78 |
-
detection_results = [p.text_lines.bboxes for p in selected_pages]
|
| 79 |
-
polygons = deepcopy([[b.polygon for b in bboxes] for bboxes in detection_results])
|
| 80 |
-
|
| 81 |
-
# Scale polygons to get correct image slices
|
| 82 |
-
for j, poly in enumerate(polygons):
|
| 83 |
-
skip_idxs = []
|
| 84 |
-
for z, p in enumerate(poly):
|
| 85 |
-
for i in range(len(p)):
|
| 86 |
-
p[i] = [int(p[i][0] * box_scale), int(p[i][1] * box_scale)]
|
| 87 |
-
x_coords = [p[i][0] for i in range(len(p))]
|
| 88 |
-
y_coords = [p[i][1] for i in range(len(p))]
|
| 89 |
-
bbox = [min(x_coords), min(y_coords), max(x_coords), max(y_coords)]
|
| 90 |
-
if (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) == 0:
|
| 91 |
-
skip_idxs.append(z)
|
| 92 |
-
if len(skip_idxs) > 0:
|
| 93 |
-
polygons[j] = [p for i, p in enumerate(poly) if i not in skip_idxs]
|
| 94 |
-
|
| 95 |
-
results = run_recognition(images, surya_langs, rec_model, processor, polygons=polygons, batch_size=int(get_batch_size() * batch_multiplier))
|
| 96 |
-
|
| 97 |
-
new_pages = []
|
| 98 |
-
for idx, (page_idx, result, old_page) in enumerate(zip(page_idxs, results, selected_pages)):
|
| 99 |
-
text_lines = old_page.text_lines
|
| 100 |
-
ocr_results = result.text_lines
|
| 101 |
-
blocks = []
|
| 102 |
-
for i, line in enumerate(ocr_results):
|
| 103 |
-
scaled_bbox = rescale_bbox([0, 0, images[idx].size[0], images[idx].size[1]], old_page.text_lines.image_bbox, line.bbox)
|
| 104 |
-
block = Block(
|
| 105 |
-
bbox=scaled_bbox,
|
| 106 |
-
pnum=page_idx,
|
| 107 |
-
lines=[Line(
|
| 108 |
-
bbox=scaled_bbox,
|
| 109 |
-
spans=[Span(
|
| 110 |
-
text=line.text,
|
| 111 |
-
bbox=scaled_bbox,
|
| 112 |
-
span_id=f"{page_idx}_{i}",
|
| 113 |
-
font="",
|
| 114 |
-
font_weight=0,
|
| 115 |
-
font_size=0,
|
| 116 |
-
)
|
| 117 |
-
]
|
| 118 |
-
)]
|
| 119 |
-
)
|
| 120 |
-
blocks.append(block)
|
| 121 |
-
page = Page(
|
| 122 |
-
blocks=blocks,
|
| 123 |
-
pnum=page_idx,
|
| 124 |
-
bbox=old_page.text_lines.image_bbox,
|
| 125 |
-
rotation=0,
|
| 126 |
-
text_lines=text_lines,
|
| 127 |
-
ocr_method="surya"
|
| 128 |
-
)
|
| 129 |
-
new_pages.append(page)
|
| 130 |
-
return new_pages
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
def tesseract_recognition(doc, page_idxs, langs: List[str]) -> List[Optional[Page]]:
|
| 134 |
-
pdf_pages = generate_single_page_pdfs(doc, page_idxs)
|
| 135 |
-
with ThreadPoolExecutor(max_workers=settings.OCR_PARALLEL_WORKERS) as executor:
|
| 136 |
-
pages = list(executor.map(_tesseract_recognition, pdf_pages, repeat(langs, len(pdf_pages))))
|
| 137 |
-
|
| 138 |
-
return pages
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
def generate_single_page_pdfs(doc, page_idxs) -> List[io.BytesIO]:
|
| 142 |
-
pdf_pages = []
|
| 143 |
-
for page_idx in page_idxs:
|
| 144 |
-
blank_doc = pdfium.PdfDocument.new()
|
| 145 |
-
blank_doc.import_pages(doc, pages=[page_idx])
|
| 146 |
-
assert len(blank_doc) == 1, "Failed to import page"
|
| 147 |
-
|
| 148 |
-
in_pdf = io.BytesIO()
|
| 149 |
-
blank_doc.save(in_pdf)
|
| 150 |
-
in_pdf.seek(0)
|
| 151 |
-
pdf_pages.append(in_pdf)
|
| 152 |
-
return pdf_pages
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
def _tesseract_recognition(in_pdf, langs: List[str]) -> Optional[Page]:
|
| 156 |
-
import ocrmypdf
|
| 157 |
-
out_pdf = io.BytesIO()
|
| 158 |
-
|
| 159 |
-
ocrmypdf.ocr(
|
| 160 |
-
in_pdf,
|
| 161 |
-
out_pdf,
|
| 162 |
-
language=langs[0],
|
| 163 |
-
output_type="pdf",
|
| 164 |
-
redo_ocr=None,
|
| 165 |
-
force_ocr=True,
|
| 166 |
-
progress_bar=False,
|
| 167 |
-
optimize=False,
|
| 168 |
-
fast_web_view=1e6,
|
| 169 |
-
skip_big=15, # skip images larger than 15 megapixels
|
| 170 |
-
tesseract_timeout=settings.TESSERACT_TIMEOUT,
|
| 171 |
-
tesseract_non_ocr_timeout=settings.TESSERACT_TIMEOUT,
|
| 172 |
-
)
|
| 173 |
-
|
| 174 |
-
with tempfile.NamedTemporaryFile() as f:
|
| 175 |
-
f.write(out_pdf.getvalue())
|
| 176 |
-
f.seek(0)
|
| 177 |
-
new_doc = pdfium.PdfDocument(f.name)
|
| 178 |
-
blocks, _ = get_text_blocks(new_doc, f.name, max_pages=1)
|
| 179 |
-
|
| 180 |
-
page = blocks[0]
|
| 181 |
-
page.ocr_method = "tesseract"
|
| 182 |
-
return page
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/ocr/tesseract.py
DELETED
|
@@ -1,97 +0,0 @@
|
|
| 1 |
-
LANGUAGE_TO_TESSERACT_CODE = {
|
| 2 |
-
'Afrikaans': 'afr',
|
| 3 |
-
'Amharic': 'amh',
|
| 4 |
-
'Arabic': 'ara',
|
| 5 |
-
'Assamese': 'asm',
|
| 6 |
-
'Azerbaijani': 'aze',
|
| 7 |
-
'Belarusian': 'bel',
|
| 8 |
-
'Bulgarian': 'bul',
|
| 9 |
-
'Bengali': 'ben',
|
| 10 |
-
'Breton': 'bre',
|
| 11 |
-
'Bosnian': 'bos',
|
| 12 |
-
'Catalan': 'cat',
|
| 13 |
-
'Czech': 'ces',
|
| 14 |
-
'Welsh': 'cym',
|
| 15 |
-
'Danish': 'dan',
|
| 16 |
-
'German': 'deu',
|
| 17 |
-
'Greek': 'ell',
|
| 18 |
-
'English': 'eng',
|
| 19 |
-
'Esperanto': 'epo',
|
| 20 |
-
'Spanish': 'spa',
|
| 21 |
-
'Estonian': 'est',
|
| 22 |
-
'Basque': 'eus',
|
| 23 |
-
'Persian': 'fas',
|
| 24 |
-
'Finnish': 'fin',
|
| 25 |
-
'French': 'fra',
|
| 26 |
-
'Western Frisian': 'fry',
|
| 27 |
-
'Irish': 'gle',
|
| 28 |
-
'Scottish Gaelic': 'gla',
|
| 29 |
-
'Galician': 'glg',
|
| 30 |
-
'Gujarati': 'guj',
|
| 31 |
-
'Hausa': 'hau',
|
| 32 |
-
'Hebrew': 'heb',
|
| 33 |
-
'Hindi': 'hin',
|
| 34 |
-
'Croatian': 'hrv',
|
| 35 |
-
'Hungarian': 'hun',
|
| 36 |
-
'Armenian': 'hye',
|
| 37 |
-
'Indonesian': 'ind',
|
| 38 |
-
'Icelandic': 'isl',
|
| 39 |
-
'Italian': 'ita',
|
| 40 |
-
'Japanese': 'jpn',
|
| 41 |
-
'Javanese': 'jav',
|
| 42 |
-
'Georgian': 'kat',
|
| 43 |
-
'Kazakh': 'kaz',
|
| 44 |
-
'Khmer': 'khm',
|
| 45 |
-
'Kannada': 'kan',
|
| 46 |
-
'Korean': 'kor',
|
| 47 |
-
'Kurdish': 'kur',
|
| 48 |
-
'Kyrgyz': 'kir',
|
| 49 |
-
'Latin': 'lat',
|
| 50 |
-
'Lao': 'lao',
|
| 51 |
-
'Lithuanian': 'lit',
|
| 52 |
-
'Latvian': 'lav',
|
| 53 |
-
'Malagasy': 'mlg',
|
| 54 |
-
'Macedonian': 'mkd',
|
| 55 |
-
'Malayalam': 'mal',
|
| 56 |
-
'Mongolian': 'mon',
|
| 57 |
-
'Marathi': 'mar',
|
| 58 |
-
'Malay': 'msa',
|
| 59 |
-
'Burmese': 'mya',
|
| 60 |
-
'Nepali': 'nep',
|
| 61 |
-
'Dutch': 'nld',
|
| 62 |
-
'Norwegian': 'nor',
|
| 63 |
-
'Oromo': 'orm',
|
| 64 |
-
'Oriya': 'ori',
|
| 65 |
-
'Punjabi': 'pan',
|
| 66 |
-
'Polish': 'pol',
|
| 67 |
-
'Pashto': 'pus',
|
| 68 |
-
'Portuguese': 'por',
|
| 69 |
-
'Romanian': 'ron',
|
| 70 |
-
'Russian': 'rus',
|
| 71 |
-
'Sanskrit': 'san',
|
| 72 |
-
'Sindhi': 'snd',
|
| 73 |
-
'Sinhala': 'sin',
|
| 74 |
-
'Slovak': 'slk',
|
| 75 |
-
'Slovenian': 'slv',
|
| 76 |
-
'Somali': 'som',
|
| 77 |
-
'Albanian': 'sqi',
|
| 78 |
-
'Serbian': 'srp',
|
| 79 |
-
'Sundanese': 'sun',
|
| 80 |
-
'Swedish': 'swe',
|
| 81 |
-
'Swahili': 'swa',
|
| 82 |
-
'Tamil': 'tam',
|
| 83 |
-
'Telugu': 'tel',
|
| 84 |
-
'Thai': 'tha',
|
| 85 |
-
'Tagalog': 'tgl',
|
| 86 |
-
'Turkish': 'tur',
|
| 87 |
-
'Uyghur': 'uig',
|
| 88 |
-
'Ukrainian': 'ukr',
|
| 89 |
-
'Urdu': 'urd',
|
| 90 |
-
'Uzbek': 'uzb',
|
| 91 |
-
'Vietnamese': 'vie',
|
| 92 |
-
'Xhosa': 'xho',
|
| 93 |
-
'Yiddish': 'yid',
|
| 94 |
-
'Chinese': 'chi_sim',
|
| 95 |
-
}
|
| 96 |
-
|
| 97 |
-
TESSERACT_CODE_TO_LANGUAGE = {v:k for k,v in LANGUAGE_TO_TESSERACT_CODE.items()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/pdf/extract_text.py
DELETED
|
@@ -1,114 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
from typing import List, Optional, Dict
|
| 3 |
-
|
| 4 |
-
import pypdfium2 as pdfium
|
| 5 |
-
|
| 6 |
-
from marker.cleaners.toc import get_pdf_toc
|
| 7 |
-
from marker.pdf.utils import font_flags_decomposer
|
| 8 |
-
from marker.settings import settings
|
| 9 |
-
from marker.schema.block import Span, Line, Block
|
| 10 |
-
from marker.schema.page import Page
|
| 11 |
-
from pdftext.extraction import dictionary_output
|
| 12 |
-
|
| 13 |
-
os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
def pdftext_format_to_blocks(page, pnum: int) -> Page:
    """Convert one pdftext page dictionary into a marker ``Page``.

    Every pdftext line is wrapped in its own single-line ``Block``; the
    original pdftext blocks are kept untouched on the result as
    ``char_blocks``.

    Args:
        page: pdftext page dict. The keys read here are ``"blocks"``,
            ``"bbox"``, ``"rotation"`` and ``"page"``.
        pnum: Page index used to build span ids and stored on each block.

    Returns:
        A ``Page`` whose bbox is normalised to ``[0, 0, width, height]``.
    """
    converted_blocks = []
    next_span_id = 0
    for raw_block in page["blocks"]:
        for raw_line in raw_block["lines"]:
            line_spans = []
            for raw_span in raw_line["spans"]:
                # Drop trailing newlines / carriage returns (tesseract).
                text = raw_span["text"].rstrip("\r\n")
                # Remove hyphenated line breaks that remain inside the span.
                text = text.replace("-\n", "")
                font_info = raw_span["font"]
                line_spans.append(
                    Span(
                        text=text,
                        bbox=raw_span["bbox"],
                        span_id=f"{pnum}_{next_span_id}",
                        # The decoded font flags are appended to the font name.
                        font=f"{font_info['name']}_{font_flags_decomposer(font_info['flags'])}",
                        font_weight=font_info["weight"],
                        font_size=font_info["size"],
                    )
                )
                next_span_id += 1

            line = Line(spans=line_spans, bbox=raw_line["bbox"])
            # Keep only lines whose bbox area is not negative.
            kept_lines = [line] if line.area >= 0 else []

            # Each block holds exactly one line and reuses that line's bbox.
            block = Block(lines=kept_lines, bbox=raw_line["bbox"], pnum=pnum)
            if len(kept_lines) > 0:
                converted_blocks.append(block)

    raw_bbox = page["bbox"]
    width = abs(raw_bbox[2] - raw_bbox[0])
    height = abs(raw_bbox[3] - raw_bbox[1])
    rotation = page["rotation"]

    # A quarter-turn rotation swaps the page's width and height.
    if rotation in (90, 270):
        width, height = height, width

    return Page(
        blocks=converted_blocks,
        pnum=page["page"],
        bbox=[0, 0, width, height],
        rotation=rotation,
        char_blocks=page["blocks"],
    )
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Optional[int] = None) -> (List[Page], Dict):
    """Extract marker pages and the table of contents for a PDF.

    Args:
        doc: Opened PDF document; only ``len(doc)`` and ``get_pdf_toc(doc)``
            are used here.
        fname: Path handed to pdftext's ``dictionary_output``.
        max_pages: Maximum number of pages to read. A falsy value means
            "every page from ``start_page`` on"; larger values are clamped
            to the pages that remain.
        start_page: Index of the first page to read. A falsy value means 0.

    Returns:
        ``(pages, toc)`` where ``pages`` are numbered from 0 relative to
        ``start_page``.
    """
    toc = get_pdf_toc(doc)
    total_pages = len(doc)

    if not start_page:
        start_page = 0
    else:
        assert start_page < total_pages

    # Never read past the end of the document.
    remaining = total_pages - start_page
    max_pages = min(max_pages, remaining) if max_pages else remaining

    char_blocks = dictionary_output(
        fname,
        page_range=range(start_page, start_page + max_pages),
        keep_chars=False,
        workers=settings.PDFTEXT_CPU_WORKERS,
        flatten_pdf=settings.FLATTEN_PDF,
    )
    marker_blocks = [
        pdftext_format_to_blocks(raw_page, index)
        for index, raw_page in enumerate(char_blocks)
    ]
    return marker_blocks, toc
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
def naive_get_text(doc):
    """Return the raw text of every page of ``doc``, one newline after each page.

    Args:
        doc: Document object supporting ``len(doc)`` and ``doc.get_page(i)``;
            each page must provide ``get_textpage().get_text_bounded()``.

    Returns:
        The concatenated page texts. An empty document yields ``""``.
    """
    # Collect the pieces and join once instead of growing a string with +=.
    page_texts = []
    for page_idx in range(len(doc)):
        text_page = doc.get_page(page_idx).get_textpage()
        page_texts.append(text_page.get_text_bounded() + "\n")
    return "".join(page_texts)
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
def get_length_of_text(fname: str) -> int:
    """Return the number of characters of text found in a PDF.

    Args:
        fname: Path of the PDF to open with pypdfium2.

    Returns:
        Length of the document text after stripping leading and trailing
        whitespace.
    """
    doc = pdfium.PdfDocument(fname)
    try:
        text = naive_get_text(doc).strip()
    finally:
        # Release the document handle explicitly rather than leaving it open
        # until garbage collection.
        doc.close()

    return len(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/pdf/images.py
DELETED
|
@@ -1,27 +0,0 @@
|
|
| 1 |
-
import pypdfium2 as pdfium
|
| 2 |
-
from pypdfium2 import PdfPage
|
| 3 |
-
|
| 4 |
-
from marker.schema.page import Page
|
| 5 |
-
from marker.schema.bbox import rescale_bbox
|
| 6 |
-
from marker.settings import settings
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
def render_image(page: pdfium.PdfPage, dpi):
    """Render a PDF page to an RGB PIL image.

    Args:
        page: pypdfium2 page to rasterise.
        dpi: Target resolution; passed to the renderer as ``dpi / 72``.

    Returns:
        The rendered page as a PIL image in ``"RGB"`` mode. Annotations are
        not drawn.
    """
    bitmap = page.render(scale=dpi / 72, draw_annots=False)
    return bitmap.to_pil().convert("RGB")
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
def render_bbox_image(page_obj: PdfPage, page: Page, bbox):
    """Render a page and crop the region described by ``bbox``.

    Args:
        page_obj: pypdfium2 page to render at ``settings.IMAGE_DPI``.
        page: marker page whose ``bbox`` is the coordinate space of ``bbox``.
        bbox: Region to extract, in the coordinates of ``page.bbox``.

    Returns:
        The cropped region as an RGB PIL image.
    """
    rendered = render_image(page_obj, settings.IMAGE_DPI)

    # Map the region from page coordinates onto the rendered pixel grid.
    pixel_bounds = [0, 0, rendered.size[0], rendered.size[1]]
    crop_box = rescale_bbox(page.bbox, pixel_bounds, bbox)

    return rendered.crop(crop_box).convert("RGB")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/pdf/utils.py
DELETED
|
@@ -1,75 +0,0 @@
|
|
| 1 |
-
from typing import Optional
|
| 2 |
-
|
| 3 |
-
import filetype
|
| 4 |
-
|
| 5 |
-
from marker.settings import settings
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
def find_filetype(fpath):
    """Classify a file as ``"pdf"``, a supported type, or ``"other"``.

    Args:
        fpath: Path handed to ``filetype.guess``.

    Returns:
        ``"pdf"`` when the detected MIME type contains "pdf", the mapped value
        from ``settings.SUPPORTED_FILETYPES`` when the MIME type is listed
        there, otherwise ``"other"`` (a message is printed in that case).
    """
    guess = filetype.guess(fpath)
    if guess is None:
        print(f"Could not determine filetype for {fpath}")
        return "other"

    mime = guess.mime

    # MIME strings are not always consistent, so PDFs are matched by substring.
    if "pdf" in mime:
        return "pdf"

    supported = settings.SUPPORTED_FILETYPES
    if mime in supported:
        return supported[mime]

    print(f"Found nonstandard filetype {mime}")
    return "other"
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
def font_flags_decomposer(flags: Optional[int]) -> str:
    """Translate a PDF font-flag bitmask into an underscore-joined description.

    Args:
        flags: Font flag bitmask, or ``None`` when no flags are available.

    Returns:
        Names of the set flags joined with ``"_"`` in ascending bit order;
        ``""`` when ``flags`` is ``None`` or no known bit is set.
    """
    if flags is None:
        return ""

    # (bit position, description), in the order the names are emitted.
    known_bits = (
        (0, "fixed_pitch"),       # PDFFONT_FIXEDPITCH
        (1, "serif"),             # PDFFONT_SERIF
        (2, "symbolic"),          # PDFFONT_SYMBOLIC
        (3, "script"),            # PDFFONT_SCRIPT
        (5, "non_symbolic"),      # PDFFONT_NONSYMBOLIC
        (6, "italic"),            # PDFFONT_ITALIC
        (16, "all_cap"),          # PDFFONT_ALLCAP
        (17, "small_cap"),        # PDFFONT_SMALLCAP
        (18, "bold"),             # PDFFONT_FORCEBOLD
        (19, "use_extern_attr"),  # PDFFONT_USEEXTERNATTR
    )
    return "_".join(name for bit, name in known_bits if flags & (1 << bit))
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
def sort_block_group(blocks, tolerance=1.25):
    """Order blocks top-to-bottom, then left-to-right within each row.

    Blocks are bucketed into rows by snapping their top coordinate
    (``bbox[1]``) to the nearest multiple of ``tolerance``.

    Args:
        blocks: Items exposing a bbox either as a ``bbox`` attribute or as a
            ``"bbox"`` mapping key.
        tolerance: Row height used for bucketing.

    Returns:
        A new list with the same items in reading order.
    """
    def bbox_of(item):
        return item.bbox if hasattr(item, "bbox") else item["bbox"]

    rows = {}
    for item in blocks:
        row_key = round(bbox_of(item)[1] / tolerance) * tolerance
        rows.setdefault(row_key, []).append(item)

    # Walk the rows from top to bottom, sorting each one by its left edge.
    ordered = []
    for row_key in sorted(rows):
        ordered.extend(sorted(rows[row_key], key=lambda item: bbox_of(item)[0]))

    return ordered
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/postprocessors/markdown.py
DELETED
|
@@ -1,254 +0,0 @@
|
|
| 1 |
-
from marker.schema.merged import MergedLine, MergedBlock, FullyMergedBlock
|
| 2 |
-
from marker.schema.page import Page
|
| 3 |
-
import re
|
| 4 |
-
import regex
|
| 5 |
-
from typing import List
|
| 6 |
-
from copy import deepcopy
|
| 7 |
-
|
| 8 |
-
from marker.settings import settings
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
def escape_markdown(text):
    """Backslash-escape characters that markdown would interpret.

    Only ``#`` is escaped at present.

    Args:
        text: String to escape.

    Returns:
        ``text`` with a backslash inserted before every ``#``.
    """
    # Character class listing everything that needs an escape.
    special_chars = r"[#]"
    return re.sub(special_chars, lambda match: "\\" + match.group(0), text)
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
def surround_text(s, char_to_insert):
    """Wrap the non-whitespace core of ``s`` in ``char_to_insert``.

    Leading and trailing whitespace stays outside the markers, so
    ``"  bold "`` with ``"**"`` becomes ``"  **bold** "``.

    Args:
        s: Text to wrap.
        char_to_insert: Marker placed directly before and after the core
            text (for example ``"*"`` or ``"**"``).

    Returns:
        The wrapped string. Input that is empty or whitespace-only is
        returned unchanged: there is nothing to emphasise, and wrapping it
        would duplicate the whitespace around an empty pair of markers.
    """
    core = s.strip()
    if not core:
        return s

    leading = s[: len(s) - len(s.lstrip())]
    trailing = s[len(s.rstrip()):]
    return leading + char_to_insert + core + char_to_insert + trailing
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
def _next_significant_span(spans, index):
    """Return the first span after ``index`` with more than two visible characters.

    When no later span qualifies, the last span examined (the final span of
    the line) is returned; ``None`` is returned only when ``index`` is already
    the last position.
    """
    candidate = None
    for candidate in spans[index + 1:]:
        if len(candidate.text.strip()) > 2:
            break
    return candidate


def merge_spans(pages: List[Page]) -> List[List[MergedBlock]]:
    """Collapse the spans of every line into text with markdown emphasis.

    Args:
        pages: Pages whose blocks, lines and spans are to be merged.

    Returns:
        One list of ``MergedBlock`` per input page. A page that yields no
        blocks receives a single empty "Text" block covering the page bbox so
        every page stays represented.
    """
    merged_pages = []
    for page in pages:
        page_out = []
        for block in page.blocks:
            merged_lines = []
            for line in block.lines:
                spans = line.spans
                if len(spans) == 0:
                    continue

                fonts = []
                pieces = []
                last_index = len(spans) - 1
                for idx, span in enumerate(spans):
                    fonts.append(span.font.lower())
                    following = _next_significant_span(spans, idx)
                    piece = span.text

                    # Very short runs are not emphasised, and neither are the
                    # first and last spans so lines can be joined properly.
                    if len(piece) > 3 and 0 < idx < last_index:
                        if span.italic and (not following or not following.italic):
                            piece = surround_text(piece, "*")
                        elif span.bold and (not following or not following.bold):
                            piece = surround_text(piece, "**")
                    pieces.append(piece)

                merged_lines.append(
                    MergedLine(text="".join(pieces), fonts=fonts, bbox=line.bbox)
                )

            if len(merged_lines) > 0:
                page_out.append(
                    MergedBlock(
                        lines=merged_lines,
                        pnum=page.pnum,
                        bbox=block.bbox,
                        block_type=block.block_type,
                        heading_level=block.heading_level,
                    )
                )

        if len(page_out) == 0:
            page_out.append(
                MergedBlock(
                    lines=[],
                    pnum=page.pnum,
                    bbox=page.bbox,
                    block_type="Text",
                    heading_level=None,
                )
            )
        merged_pages.append(page_out)

    return merged_pages
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
def block_surround(text, block_type, heading_level):
    """Decorate a block's text with the markdown that fits its type.

    Args:
        text: Merged text of the block.
        block_type: Layout label such as ``"Title"``, ``"Code"`` or ``"Text"``.
        heading_level: Number of ``#`` to use for section headers; ``None``
            falls back to ``##``.

    Returns:
        The decorated text. Unknown block types are returned unchanged.
    """
    if block_type == "Section-header":
        # Leave text that already carries a markdown heading marker alone.
        if not text.startswith("#"):
            hashes = "##" if heading_level is None else "#" * heading_level
            text = f"\n{hashes} " + text.strip().title() + "\n"
        return text

    if block_type == "Title":
        if not text.startswith("#"):
            text = "# " + text.strip().title() + "\n"
        return text

    if block_type == "Table":
        return "\n" + text + "\n"

    if block_type == "List-item":
        return escape_markdown(text.rstrip()) + "\n"

    if block_type == "Code":
        return "\n```\n" + text + "\n```\n"

    if block_type == "Text":
        return escape_markdown(text)

    if block_type == "Formula":
        trimmed = text.strip()
        # Only display math ($$ ... $$) is placed on its own lines.
        if trimmed.startswith("$$") and trimmed.endswith("$$"):
            text = "\n" + trimmed + "\n"
        return text

    if block_type == "Caption":
        return "\n" + escape_markdown(text) + "\n"

    return text
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
def line_separator(block_text: str, prev_line: MergedLine, line: MergedLine, block_type: str, new_column: bool, new_page: bool, new_block: bool) -> str:
    """Append ``line`` to ``block_text`` using the separator that fits the context.

    Args:
        block_text: Text accumulated so far for the current block.
        prev_line: Line that was appended before ``line``.
        line: Line to append.
        block_type: Layout label of the block being assembled.
        new_column: Whether ``line`` starts a new column.
        new_page: Whether ``line`` is the first line of a page.
        new_block: Whether ``line`` starts a new visual block.

    Returns:
        ``block_text`` (right-stripped) joined with ``line.text``
        (left-stripped) by nothing, a space, a newline or a blank line.
    """
    # Contents of a character class: lowercase letters and digits. No "|"
    # here — inside [...] it would be matched as a literal character.
    lowercase_letters = r'\p{Ll}\d'
    # Hyphen-minus, em dash (U+2014) and not sign (U+00AC), all of which can
    # mark a word broken across lines.
    hyphens = r'-—¬'

    hyphen_regex = regex.compile(rf'.*[{hyphens}]\s?$', regex.DOTALL)
    hyphens_lowercase_regex = regex.compile(rf'.*[{lowercase_letters}][{hyphens}]\s?$', regex.DOTALL)
    line_starts_lowercase = regex.match(rf"^\s?[{lowercase_letters}]", line.text)
    prev_has_reference = regex.match(r"^\[\d+\]\s+[A-Z]", prev_line.text)
    has_reference = regex.match(r"^\[\d+\]\s+[A-Z]", line.text)
    has_numbered_item = regex.match(r"^\d+:\s+", line.text)

    line_text = line.text.lstrip()
    block_text = block_text.rstrip()

    if block_type in ["Text", "List-item", "Footnote", "Caption", "Figure"]:
        if has_reference or has_numbered_item:
            # "[12] Author ..." or "3: ..." always starts a new paragraph.
            return block_text + "\n\n" + line_text
        elif hyphen_regex.match(block_text):
            if line_starts_lowercase and hyphens_lowercase_regex.match(block_text):
                # A word split across lines: drop the hyphen and rejoin it.
                return regex.split(rf"[{hyphens}]\s?$", block_text)[0].rstrip() + line_text
            return block_text + line_text
        elif new_page or new_column:
            # A lowercase start means the sentence continues across the break.
            if line_starts_lowercase:
                return block_text + " " + line_text
            return block_text + "\n\n" + line_text
        elif new_block:
            # Keep a wrapped bibliography entry together.
            if prev_has_reference:
                return block_text + " " + line_text
            return block_text + "\n\n" + line_text
        else:
            # General case for joining lines with a space
            return block_text + " " + line_text
    elif block_type in ["Title", "Section-header"]:
        return block_text + " " + line_text
    elif block_type in ["Formula"]:
        return block_text + "\n" + line_text
    elif block_type in ["Code", "Table"]:
        return block_text + "\n\n" + line_text
    else:
        return block_text + " " + line_text
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
def block_separator(prev_block: FullyMergedBlock, block: FullyMergedBlock):
    """Return ``block.text`` prefixed with the separator that follows ``prev_block``.

    A blank line is used after a "Text" block; a single newline after any
    other block type.
    """
    separator = "\n\n" if prev_block.block_type == "Text" else "\n"
    return separator + block.text
|
| 159 |
-
|
| 160 |
-
def merge_lines(blocks: List[List[MergedBlock]], min_new_block_x_indent_percent=5.0):
    """Join the lines of consecutive same-type blocks into output blocks.

    Text is accumulated across blocks (and pages) for as long as the block
    type and heading level stay the same, then flushed as one
    ``FullyMergedBlock``.

    Args:
        blocks: One list of merged blocks per page.
        min_new_block_x_indent_percent: Horizontal indent, as a percentage of
            the block width, above which a line that also starts lower on the
            page is treated as the start of a new block.

    Returns:
        The merged blocks, with blocks that have no text and are not
        page-start markers removed.
    """
    text_blocks = []
    prev_block = None
    prev_type = None
    prev_line = None
    block_text = ""
    block_type = ""
    prev_heading_level = None
    pnum = None

    def pending_block(page_number):
        # Wrap the text accumulated so far using the previous block's type.
        return FullyMergedBlock(
            text=block_surround(block_text, prev_type, prev_heading_level),
            block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE,
            page_start=False,
            pnum=page_number,
        )

    for page in blocks:
        # Insert pagination at every page boundary
        if settings.PAGINATE_OUTPUT:
            if block_text:
                text_blocks.append(pending_block(pnum))
                block_text = ""
            text_blocks.append(
                FullyMergedBlock(
                    text="",
                    block_type="Text",
                    page_start=True,
                    pnum=page[0].pnum,
                )
            )

        for block_id, block in enumerate(page):
            first_block_in_page = block_id == 0
            block_type = block.block_type

            type_changed = block_type != prev_type and prev_type
            level_changed = block.heading_level != prev_heading_level and prev_heading_level
            if type_changed or level_changed:
                text_blocks.append(pending_block(block.pnum))
                block_text = ""

            # Join lines in the block together properly
            for line_id, line in enumerate(block.lines):
                first_line_in_block = line_id == 0
                if prev_line is None:
                    prev_line = deepcopy(line)
                if prev_block is None:
                    prev_block = deepcopy(block)

                x_indent = line.x_start - prev_line.x_start
                y_indent = line.y_start - prev_line.y_start
                new_line = y_indent > prev_line.height
                new_column = line.x_start > prev_block.x_end
                # A new block is the first line of a block, or an indented
                # line that also starts lower than the previous one.
                new_block = first_line_in_block or (
                    ((x_indent / block.width) * 100) > min_new_block_x_indent_percent and new_line
                )
                new_page = first_line_in_block and first_block_in_page

                if block_text:
                    block_text = line_separator(block_text, prev_line, line, block_type, new_column, new_page, new_block)
                else:
                    block_text = line.text

                # NOTE(review): the nesting of these updates was reconstructed
                # from a listing without indentation — confirm that they
                # belong inside the per-line loop.
                prev_line = line
                prev_block = block

            prev_type = block_type
            prev_heading_level = block.heading_level
            pnum = block.pnum

    # Append the final block
    text_blocks.append(
        FullyMergedBlock(
            text=block_surround(block_text, prev_type, prev_heading_level),
            block_type=block_type if block_type else settings.DEFAULT_BLOCK_TYPE,
            page_start=False,
            pnum=pnum,
        )
    )

    return [merged for merged in text_blocks if (merged.text.strip() or merged.page_start)]
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
def get_full_text(text_blocks):
    """Assemble the final markdown string from merged blocks.

    Args:
        text_blocks: Blocks exposing ``page_start``, ``pnum``, ``text`` and
            ``block_type``.

    Returns:
        The concatenated document. A page-start block contributes a
        ``{pnum}`` marker followed by ``settings.PAGE_SEPARATOR``; the very
        first content block contributes its text as is; every later block is
        prefixed by ``block_separator``.
    """
    # Collect the pieces and join once instead of growing a string with +=.
    parts = []
    prev_block = None
    for block in text_blocks:
        if block.page_start:
            parts.append("\n\n{" + str(block.pnum) + "}" + settings.PAGE_SEPARATOR)
        elif prev_block:
            parts.append(block_separator(prev_block, block))
        else:
            parts.append(block.text)
        prev_block = block
    return "".join(parts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/{v2/processors → processors}/__init__.py
RENAMED
|
@@ -2,9 +2,9 @@ from typing import Optional, Tuple
|
|
| 2 |
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
| 5 |
-
from marker.
|
| 6 |
-
from marker.
|
| 7 |
-
from marker.
|
| 8 |
|
| 9 |
|
| 10 |
class BaseProcessor:
|
|
|
|
| 2 |
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
| 5 |
+
from marker.schema import BlockTypes
|
| 6 |
+
from marker.schema.document import Document
|
| 7 |
+
from marker.util import assign_config
|
| 8 |
|
| 9 |
|
| 10 |
class BaseProcessor:
|
marker/{v2/processors → processors}/code.py
RENAMED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
-
from marker.
|
| 2 |
-
from marker.
|
| 3 |
-
from marker.
|
| 4 |
-
from marker.
|
| 5 |
|
| 6 |
|
| 7 |
class CodeProcessor(BaseProcessor):
|
|
|
|
| 1 |
+
from marker.processors import BaseProcessor
|
| 2 |
+
from marker.schema import BlockTypes
|
| 3 |
+
from marker.schema.blocks import Code
|
| 4 |
+
from marker.schema.document import Document
|
| 5 |
|
| 6 |
|
| 7 |
class CodeProcessor(BaseProcessor):
|
marker/{v2/processors → processors}/debug.py
RENAMED
|
@@ -5,9 +5,9 @@ import requests
|
|
| 5 |
from PIL import Image, ImageDraw, ImageFont
|
| 6 |
|
| 7 |
from marker.settings import settings
|
| 8 |
-
from marker.
|
| 9 |
-
from marker.
|
| 10 |
-
from marker.
|
| 11 |
|
| 12 |
|
| 13 |
class DebugProcessor(BaseProcessor):
|
|
|
|
| 5 |
from PIL import Image, ImageDraw, ImageFont
|
| 6 |
|
| 7 |
from marker.settings import settings
|
| 8 |
+
from marker.processors import BaseProcessor
|
| 9 |
+
from marker.schema import BlockTypes
|
| 10 |
+
from marker.schema.document import Document
|
| 11 |
|
| 12 |
|
| 13 |
class DebugProcessor(BaseProcessor):
|
marker/{v2/processors → processors}/document_toc.py
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
from marker.
|
| 2 |
-
from marker.
|
| 3 |
-
from marker.
|
| 4 |
|
| 5 |
|
| 6 |
class DocumentTOCProcessor(BaseProcessor):
|
|
|
|
| 1 |
+
from marker.processors import BaseProcessor
|
| 2 |
+
from marker.schema import BlockTypes
|
| 3 |
+
from marker.schema.document import Document
|
| 4 |
|
| 5 |
|
| 6 |
class DocumentTOCProcessor(BaseProcessor):
|
marker/{v2/processors → processors}/equation.py
RENAMED
|
@@ -5,9 +5,9 @@ from texify.model.model import GenerateVisionEncoderDecoderModel
|
|
| 5 |
from tqdm import tqdm
|
| 6 |
|
| 7 |
from marker.settings import settings
|
| 8 |
-
from marker.
|
| 9 |
-
from marker.
|
| 10 |
-
from marker.
|
| 11 |
|
| 12 |
|
| 13 |
class EquationProcessor(BaseProcessor):
|
|
|
|
| 5 |
from tqdm import tqdm
|
| 6 |
|
| 7 |
from marker.settings import settings
|
| 8 |
+
from marker.processors import BaseProcessor
|
| 9 |
+
from marker.schema import BlockTypes
|
| 10 |
+
from marker.schema.document import Document
|
| 11 |
|
| 12 |
|
| 13 |
class EquationProcessor(BaseProcessor):
|
marker/{v2/processors → processors}/ignoretext.py
RENAMED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
from collections import Counter
|
| 2 |
|
| 3 |
-
from marker.
|
| 4 |
-
from marker.
|
| 5 |
-
from marker.
|
| 6 |
|
| 7 |
|
| 8 |
class IgnoreTextProcessor(BaseProcessor):
|
|
|
|
| 1 |
from collections import Counter
|
| 2 |
|
| 3 |
+
from marker.processors import BaseProcessor
|
| 4 |
+
from marker.schema import BlockTypes
|
| 5 |
+
from marker.schema.document import Document
|
| 6 |
|
| 7 |
|
| 8 |
class IgnoreTextProcessor(BaseProcessor):
|
marker/{v2/processors → processors}/sectionheader.py
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
from marker.
|
| 2 |
-
from marker.
|
| 3 |
-
from marker.
|
| 4 |
|
| 5 |
from typing import Dict, List
|
| 6 |
import numpy as np
|
|
|
|
| 1 |
+
from marker.processors import BaseProcessor
|
| 2 |
+
from marker.schema import BlockTypes
|
| 3 |
+
from marker.schema.document import Document
|
| 4 |
|
| 5 |
from typing import Dict, List
|
| 6 |
import numpy as np
|
marker/{v2/processors → processors}/table.py
RENAMED
|
@@ -7,9 +7,9 @@ from tabled.assignment import assign_rows_columns
|
|
| 7 |
from tabled.inference.recognition import get_cells, recognize_tables
|
| 8 |
|
| 9 |
from marker.settings import settings
|
| 10 |
-
from marker.
|
| 11 |
-
from marker.
|
| 12 |
-
from marker.
|
| 13 |
|
| 14 |
|
| 15 |
class TableProcessor(BaseProcessor):
|
|
|
|
| 7 |
from tabled.inference.recognition import get_cells, recognize_tables
|
| 8 |
|
| 9 |
from marker.settings import settings
|
| 10 |
+
from marker.processors import BaseProcessor
|
| 11 |
+
from marker.schema import BlockTypes
|
| 12 |
+
from marker.schema.document import Document
|
| 13 |
|
| 14 |
|
| 15 |
class TableProcessor(BaseProcessor):
|
marker/{v2/processors → processors}/text.py
RENAMED
|
@@ -3,10 +3,10 @@ from typing import List
|
|
| 3 |
|
| 4 |
import regex
|
| 5 |
|
| 6 |
-
from marker.
|
| 7 |
-
from marker.
|
| 8 |
-
from marker.
|
| 9 |
-
from marker.
|
| 10 |
|
| 11 |
|
| 12 |
class TextProcessor(BaseProcessor):
|
|
|
|
| 3 |
|
| 4 |
import regex
|
| 5 |
|
| 6 |
+
from marker.processors import BaseProcessor
|
| 7 |
+
from marker.schema import BlockTypes
|
| 8 |
+
from marker.schema.document import Document
|
| 9 |
+
from marker.schema.text.line import Line
|
| 10 |
|
| 11 |
|
| 12 |
class TextProcessor(BaseProcessor):
|
marker/{v2/providers → providers}/__init__.py
RENAMED
|
@@ -2,9 +2,9 @@ from typing import List, Optional, Dict
|
|
| 2 |
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
| 5 |
-
from marker.
|
| 6 |
-
from marker.
|
| 7 |
-
from marker.
|
| 8 |
|
| 9 |
|
| 10 |
class ProviderOutput(BaseModel):
|
|
|
|
| 2 |
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
| 5 |
+
from marker.schema.text import Span
|
| 6 |
+
from marker.schema.text.line import Line
|
| 7 |
+
from marker.util import assign_config
|
| 8 |
|
| 9 |
|
| 10 |
class ProviderOutput(BaseModel):
|
marker/{v2/providers → providers}/pdf.py
RENAMED
|
@@ -1,18 +1,19 @@
|
|
| 1 |
import atexit
|
| 2 |
import functools
|
|
|
|
| 3 |
from typing import List, Set
|
| 4 |
|
| 5 |
import pypdfium2 as pdfium
|
| 6 |
from pdftext.extraction import dictionary_output
|
| 7 |
from PIL import Image
|
| 8 |
|
| 9 |
-
from marker.
|
| 10 |
-
from marker.
|
| 11 |
-
from marker.
|
| 12 |
-
from marker.
|
| 13 |
-
from marker.
|
| 14 |
-
from marker.
|
| 15 |
-
from marker.
|
| 16 |
|
| 17 |
|
| 18 |
class PdfProvider(BaseProvider):
|
|
@@ -20,6 +21,10 @@ class PdfProvider(BaseProvider):
|
|
| 20 |
pdftext_workers: int = 4
|
| 21 |
flatten_pdf: bool = True
|
| 22 |
force_ocr: bool = False
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
def __init__(self, filepath: str, config=None):
|
| 25 |
super().__init__(filepath, config)
|
|
@@ -153,10 +158,34 @@ class PdfProvider(BaseProvider):
|
|
| 153 |
text = text + "\n"
|
| 154 |
if len(text.strip()) == 0:
|
| 155 |
return False
|
| 156 |
-
if detect_bad_ocr(text):
|
| 157 |
return False
|
| 158 |
return True
|
| 159 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
@functools.lru_cache(maxsize=None)
|
| 161 |
def get_image(self, idx: int, dpi: int) -> Image.Image:
|
| 162 |
page = self.doc[idx]
|
|
|
|
| 1 |
import atexit
|
| 2 |
import functools
|
| 3 |
+
import re
|
| 4 |
from typing import List, Set
|
| 5 |
|
| 6 |
import pypdfium2 as pdfium
|
| 7 |
from pdftext.extraction import dictionary_output
|
| 8 |
from PIL import Image
|
| 9 |
|
| 10 |
+
from marker.providers.utils import alphanum_ratio
|
| 11 |
+
from marker.providers import BaseProvider, ProviderOutput, ProviderPageLines
|
| 12 |
+
from marker.schema.polygon import PolygonBox
|
| 13 |
+
from marker.schema import BlockTypes
|
| 14 |
+
from marker.schema.registry import get_block_class
|
| 15 |
+
from marker.schema.text.line import Line
|
| 16 |
+
from marker.schema.text.span import Span
|
| 17 |
|
| 18 |
|
| 19 |
class PdfProvider(BaseProvider):
|
|
|
|
| 21 |
pdftext_workers: int = 4
|
| 22 |
flatten_pdf: bool = True
|
| 23 |
force_ocr: bool = False
|
| 24 |
+
# Characters that indicate undecodable text: U+FFFD REPLACEMENT CHARACTER,
# listed once via chr() and once as a literal. The literal is written as an
# escape so it cannot be corrupted by a re-encoding of this file.
ocr_invalid_chars: tuple = (chr(0xfffd), "\ufffd")
|
| 25 |
+
ocr_space_threshold: float = .7
|
| 26 |
+
ocr_newline_threshold: float = .6
|
| 27 |
+
ocr_alphanum_threshold: float = .3
|
| 28 |
|
| 29 |
def __init__(self, filepath: str, config=None):
|
| 30 |
super().__init__(filepath, config)
|
|
|
|
| 158 |
text = text + "\n"
|
| 159 |
if len(text.strip()) == 0:
|
| 160 |
return False
|
| 161 |
+
if self.detect_bad_ocr(text):
|
| 162 |
return False
|
| 163 |
return True
|
| 164 |
|
| 165 |
+
def detect_bad_ocr(self, text):
    """Decide whether extracted text looks like failed or garbled OCR.

    The checks run in order and the first one that trips returns ``True``:
    empty text, too many whitespace runs, too many newline runs, too low an
    alphanumeric ratio, or too many invalid characters.

    Args:
        text: Text extracted for a page.

    Returns:
        ``True`` when the text should be treated as bad OCR, else ``False``.
    """
    if len(text) == 0:
        # Assume OCR failed if we have no text
        return True

    # Runs of whitespace versus individual non-whitespace characters.
    whitespace_runs = len(re.findall(r'\s+', text))
    visible_chars = len(re.sub(r'\s+', '', text))
    if whitespace_runs / (visible_chars + whitespace_runs) > self.ocr_space_threshold:
        return True

    # Runs of newlines versus every other character.
    newline_runs = len(re.findall(r'\n+', text))
    other_chars = len(re.sub(r'\n+', '', text))
    if newline_runs / (newline_runs + other_chars) > self.ocr_newline_threshold:
        return True

    # Garbled text
    if alphanum_ratio(text) < self.ocr_alphanum_threshold:
        return True

    # Tolerate a few invalid characters: at least 6, or 3% of the text.
    invalid_count = sum(1 for ch in text if ch in self.ocr_invalid_chars)
    return invalid_count > max(6.0, len(text) * .03)
|
| 188 |
+
|
| 189 |
@functools.lru_cache(maxsize=None)
|
| 190 |
def get_image(self, idx: int, dpi: int) -> Image.Image:
|
| 191 |
page = self.doc[idx]
|
marker/{ocr → providers}/utils.py
RENAMED
|
@@ -1,3 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
def alphanum_ratio(text):
|
| 2 |
text = text.replace(" ", "")
|
| 3 |
text = text.replace("\n", "")
|
|
@@ -7,4 +10,4 @@ def alphanum_ratio(text):
|
|
| 7 |
return 1
|
| 8 |
|
| 9 |
ratio = alphanumeric_count / len(text)
|
| 10 |
-
return ratio
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
|
| 4 |
def alphanum_ratio(text):
|
| 5 |
text = text.replace(" ", "")
|
| 6 |
text = text.replace("\n", "")
|
|
|
|
| 10 |
return 1
|
| 11 |
|
| 12 |
ratio = alphanumeric_count / len(text)
|
| 13 |
+
return ratio
|
marker/{v2/renderers → renderers}/__init__.py
RENAMED
|
@@ -6,9 +6,9 @@ from typing import Optional
|
|
| 6 |
from bs4 import BeautifulSoup
|
| 7 |
from pydantic import BaseModel
|
| 8 |
|
| 9 |
-
from marker.
|
| 10 |
-
from marker.
|
| 11 |
-
from marker.
|
| 12 |
|
| 13 |
|
| 14 |
class BaseRenderer:
|
|
|
|
| 6 |
from bs4 import BeautifulSoup
|
| 7 |
from pydantic import BaseModel
|
| 8 |
|
| 9 |
+
from marker.schema import BlockTypes
|
| 10 |
+
from marker.schema.blocks.base import BlockOutput, BlockId
|
| 11 |
+
from marker.util import assign_config
|
| 12 |
|
| 13 |
|
| 14 |
class BaseRenderer:
|
marker/{v2/renderers → renderers}/html.py
RENAMED
|
@@ -3,9 +3,9 @@ import re
|
|
| 3 |
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
|
| 4 |
from pydantic import BaseModel
|
| 5 |
|
| 6 |
-
from marker.
|
| 7 |
-
from marker.
|
| 8 |
-
from marker.
|
| 9 |
|
| 10 |
# Ignore beautifulsoup warnings
|
| 11 |
import warnings
|
|
|
|
| 3 |
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
|
| 4 |
from pydantic import BaseModel
|
| 5 |
|
| 6 |
+
from marker.renderers import BaseRenderer
|
| 7 |
+
from marker.schema import BlockTypes
|
| 8 |
+
from marker.schema.blocks import BlockId
|
| 9 |
|
| 10 |
# Ignore beautifulsoup warnings
|
| 11 |
import warnings
|
marker/{v2/renderers → renderers}/json.py
RENAMED
|
@@ -7,11 +7,11 @@ from typing import List, Dict
|
|
| 7 |
from bs4 import BeautifulSoup
|
| 8 |
from pydantic import BaseModel
|
| 9 |
|
| 10 |
-
from marker.
|
| 11 |
-
from marker.
|
| 12 |
-
from marker.
|
| 13 |
-
from marker.
|
| 14 |
-
from marker.
|
| 15 |
|
| 16 |
|
| 17 |
class JSONBlockOutput(BaseModel):
|
|
|
|
| 7 |
from bs4 import BeautifulSoup
|
| 8 |
from pydantic import BaseModel
|
| 9 |
|
| 10 |
+
from marker.schema.blocks import Block
|
| 11 |
+
from marker.renderers import BaseRenderer
|
| 12 |
+
from marker.schema import BlockTypes
|
| 13 |
+
from marker.schema.blocks import BlockId
|
| 14 |
+
from marker.schema.registry import get_block_class
|
| 15 |
|
| 16 |
|
| 17 |
class JSONBlockOutput(BaseModel):
|