Vik Paruchuri committed
Commit · a5c1c2e · 0 Parent(s)
Initial commit
- .gitignore +166 -0
- README.md +13 -0
- marker/bbox.py +61 -0
- marker/code.py +86 -0
- marker/equations.py +167 -0
- marker/extract_text.py +94 -0
- marker/headers.py +60 -0
- marker/markdown.py +163 -0
- marker/schema.py +176 -0
- marker/segmentation.py +139 -0
- marker/settings.py +20 -0
- parse.py +49 -0
- poetry.lock +0 -0
- pyproject.toml +28 -0
- requirements.txt +12 -0
.gitignore
ADDED
@@ -0,0 +1,166 @@
+private.py
+.DS_Store
+local.env
+experiments
+test_data
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
README.md
ADDED
@@ -0,0 +1,13 @@
+# Marker
+
+This project converts PDF to Markdown, balancing speed with quality:
+
+- Equations will be detected and converted to Latex. This is not 100% accurate.
+- All headers/footers/other artifacts will be removed.
+
+
+
+## Install
+
+- `poetry install`
+- Set `TESSDATA_PREFIX`
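For orientation, here is a minimal sketch of how the modules added in this commit fit together, condensed from parse.py further down. The input path is a hypothetical stand-in, and the equation, code-block, and header-filtering passes are omitted for brevity:

import fitz as pymupdf
from marker.extract_text import get_text_blocks
from marker.segmentation import detect_all_block_types
from marker.markdown import merge_spans, merge_lines, get_full_text

doc = pymupdf.open("example.pdf")  # hypothetical input PDF
blocks, toc = get_text_blocks(doc)  # PyMuPDF spans -> Page/Block/Line/Span schema
block_types = detect_all_block_types(doc, blocks)  # LayoutLMv3 layout labels per line
for page, page_types in zip(blocks, block_types):
    page.add_block_types(page_types)  # attach a block type to every span
markdown = get_full_text(merge_lines(merge_spans(blocks), blocks))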
marker/bbox.py
ADDED
@@ -0,0 +1,61 @@
+def should_merge_blocks(box1, box2, tol=10):
+    # Within tol y px, and to the right within tol px
+    merge = [
+        box2[0] > box1[0],
+        abs(box2[1] - box1[1]) < tol,  # Within tol y px
+        abs(box2[3] - box1[3]) < tol,  # Within tol y px
+        abs(box2[0] - box1[2]) < tol,  # Within tol x px
+    ]
+    return all(merge)
+
+
+def merge_boxes(box1, box2):
+    return (min(box1[0], box2[0]), min(box1[1], box2[1]), max(box2[2], box1[2]), max(box1[3], box2[3]))
+
+
+def boxes_intersect(box1, box2):
+    # Box1 intersects box2
+    return box1[0] < box2[2] and box1[2] > box2[0] and box1[1] < box2[3] and box1[3] > box2[1]
+
+
+def boxes_intersect_pct(box1, box2, pct=.9):
+    # determine the coordinates of the intersection rectangle
+    x_left = max(box1[0], box2[0])
+    y_top = max(box1[1], box2[1])
+    x_right = min(box1[2], box2[2])
+    y_bottom = min(box1[3], box2[3])
+
+    if x_right < x_left or y_bottom < y_top:
+        return 0.0
+
+    # The intersection of two axis-aligned bounding boxes is always an
+    # axis-aligned bounding box
+    intersection_area = (x_right - x_left) * (y_bottom - y_top)
+
+    # compute the area of both AABBs
+    bb1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+    bb2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
+
+    iou = intersection_area / float(bb1_area + bb2_area - intersection_area)
+    return iou > pct
+
+
+def multiple_boxes_intersect(box1, boxes):
+    for box2 in boxes:
+        if boxes_intersect(box1, box2):
+            return True
+    return False
+
+
+def box_contained(box1, box2):
+    # Box1 inside box2
+    return box1[0] > box2[0] and box1[1] > box2[1] and box1[2] < box2[2] and box1[3] < box2[3]
+
+
+def unnormalize_box(bbox, width, height):
+    return [
+        width * (bbox[0] / 1000),
+        height * (bbox[1] / 1000),
+        width * (bbox[2] / 1000),
+        height * (bbox[3] / 1000),
+    ]
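As a quick illustration of the merge heuristic above (coordinates are made up): two blocks sitting on the same text line that nearly touch horizontally satisfy all four tolerance checks and collapse into a single bounding box.

from marker.bbox import should_merge_blocks, merge_boxes

left = [100, 500, 200, 512]   # x0, y0, x1, y1
right = [205, 501, 320, 511]  # starts ~5px to the right of `left`, same line
assert should_merge_blocks(left, right)                   # all deltas within the 10px tolerance
assert merge_boxes(left, right) == (100, 500, 320, 512)   # union of the two boxes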
marker/code.py
ADDED
@@ -0,0 +1,86 @@
+from marker.schema import Span, Line, Page
+import re
+from typing import List
+import fitz as pymupdf
+
+
+def is_code_linelen(lines, thresh=50):
+    # Decide based on chars per newline threshold
+    total_alnum_chars = sum(len(re.findall(r'\w', line.prelim_text)) for line in lines)
+    total_newlines = len(lines) - 1
+
+    if total_alnum_chars == 0:
+        return False
+
+    ratio = total_alnum_chars / total_newlines
+    return ratio < thresh
+
+
+def identify_code_blocks(blocks: List[Page]):
+    for page in blocks:
+        try:
+            common_height = page.get_line_height_stats().most_common(1)[0][0]
+            common_start = page.get_line_start_stats().most_common(1)[0][0]
+        except IndexError:
+            continue
+
+        for block in page.blocks:
+            if len(block.lines) < 2:
+                continue
+            if block.most_common_block_type() != "Text":
+                continue
+
+            is_code = []
+            for line in block.lines:
+                fonts = [span.font for span in line.spans]
+                monospace_font = any([font for font in fonts if "mono" in font.lower() or "prop" in font.lower()])
+                line_height = line.bbox[3] - line.bbox[1]
+                line_start = line.bbox[0]
+                if line_height <= common_height and line_start > common_start and monospace_font:
+                    is_code.append(True)
+                else:
+                    is_code.append(False)
+            is_code = [
+                sum(is_code) > len(block.lines) / 1.5,
+                len(block.lines) > 4,
+                is_code_linelen(block.lines)
+            ]
+
+            if all(is_code):
+                block.set_block_type("Code")
+
+
+def indent_blocks(blocks: List[Page]):
+    span_counter = 0
+    for page in blocks:
+        for block in page.blocks:
+            if block.most_common_block_type() != "Code":
+                continue
+
+            lines = []
+            min_left = 1000  # will contain x- coord of column 0
+            col_width = 0  # width of 1 char
+            for line in block.lines:
+                text = ""
+                min_left = min(line.bbox[0], min_left)
+                for span in line.spans:
+                    if col_width == 0 and len(span.text) > 0:
+                        col_width = (span.bbox[2] - span.bbox[0]) / len(span.text)
+                    text += span.text
+                lines.append((pymupdf.Rect(line.bbox), text))
+
+            block_text = ""
+            for line in lines:
+                text = line[1]
+                prefix = " " * int((line[0].x0 - min_left) / col_width)
+                block_text += prefix + text + "\n"
+            new_span = Span(
+                text=block_text,
+                bbox=block.bbox,
+                color=block.lines[0].spans[0].color,
+                span_id=f"{span_counter}_fix_code",
+                font=block.lines[0].spans[0].font,
+                block_type="Code"
+            )
+            span_counter += 1
+            block.lines = [Line(spans=[new_span], bbox=block.bbox)]
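To make the is_code_linelen heuristic concrete, here is a small sketch; the make_line helper, its placeholder bounding boxes, and the sample text are all hypothetical. Short, code-like lines average far fewer than 50 word characters per line break, so the block passes the line-length test.

from marker.code import is_code_linelen
from marker.schema import Line, Span

def make_line(text, y):
    # hypothetical helper: one span per line, placeholder bbox and font
    bbox = [0, y, 100, y + 10]
    return Line(spans=[Span(text=text, bbox=bbox, span_id=f"s{y}", font="Courier_monospaced", color=0)], bbox=bbox)

lines = [make_line("for i in range(10):", 0), make_line("    total += i", 12), make_line("print(total)", 24)]
print(is_code_linelen(lines))  # True: ~15 alphanumeric chars per newline, well under the default threshold of 50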
marker/equations.py
ADDED
@@ -0,0 +1,167 @@
+import io
+from copy import deepcopy
+from typing import List
+
+from nougat import NougatModel
+from nougat.utils.checkpoint import get_checkpoint
+import re
+from PIL import Image, ImageDraw
+import fitz as pymupdf
+from marker.bbox import should_merge_blocks, merge_boxes, multiple_boxes_intersect
+from marker.settings import settings
+from marker.schema import Page, Span, Line, Block, BlockType
+from nougat.utils.device import move_to_device
+
+
+def load_model():
+    ckpt = get_checkpoint(None, model_tag="0.1.0-small")
+    nougat_model = NougatModel.from_pretrained(ckpt)
+    if settings.TORCH_DEVICE != "cpu":
+        is_cuda = "cuda" in settings.TORCH_DEVICE
+        move_to_device(nougat_model, bf16=is_cuda, cuda=is_cuda)
+    nougat_model.eval()
+    return nougat_model
+
+
+nougat_model = load_model()
+MODEL_MAX = nougat_model.config.max_length
+
+NOUGAT_HALLUCINATION_WORDS = ["[MISSING_PAGE_POST]", "## References\n", "**Figure Captions**\n", "Footnote", "\par\par\par", "## Chapter", "Fig."]
+
+
+def contains_equation(text):
+    # Define a regular expression pattern to look for operators and symbols commonly found in equations
+    pattern = re.compile(r'[=\^\√∑∏∫∂∆π≈≠≤≥∞∩∪∈∉∀∃∅∇λμσαβγδεζηθφχψω]')
+    # Search the text for the pattern
+    match = pattern.search(text)
+
+    # Alternative equation patterns
+    alt_pattern = re.compile(r' P(?=[ \n\(\)$])')
+    alt_match = alt_pattern.search(text)
+    # Return True if the pattern is found, otherwise return False
+    return bool(match) or bool(alt_match)
+
+
+def mask_bbox(png_image, bbox, selected_bboxes):
+    mask = Image.new('L', png_image.size, 0)  # 'L' mode for grayscale
+    draw = ImageDraw.Draw(mask)
+    first_x = bbox[0]
+    first_y = bbox[1]
+    bbox_height = bbox[3] - bbox[1]
+    bbox_width = bbox[2] - bbox[0]
+
+    for box in selected_bboxes:
+        # Fit the box to the selected region
+        new_box = (box[0] - first_x, box[1] - first_y, box[2] - first_x, box[3] - first_y)
+        # Fit mask to image bounds versus the pdf bounds
+        resized = (
+            new_box[0] / bbox_width * png_image.size[0],
+            new_box[1] / bbox_height * png_image.size[1],
+            new_box[2] / bbox_width * png_image.size[0],
+            new_box[3] / bbox_height * png_image.size[1]
+        )
+        draw.rectangle(resized, fill=255)
+
+    result = Image.composite(png_image, Image.new('RGBA', png_image.size, 'white'), mask)
+    return result
+
+
+def get_nougat_text(page, old_text, bbox, selected_bboxes, save_id, max_length=MODEL_MAX):
+    pix = page.get_pixmap(dpi=settings.DPI, clip=bbox)
+    png = pix.pil_tobytes(format="PNG")
+    png_image = Image.open(io.BytesIO(png))
+    png_image = mask_bbox(png_image, bbox, selected_bboxes)
+
+    nougat_model.config.max_length = min(max_length, MODEL_MAX)
+    output = nougat_model.inference(image=png_image)
+    return output["predictions"][0]
+
+
+def replace_equations(doc, blocks: List[Page], block_types: List[List[BlockType]]):
+    span_id = 0
+    new_blocks = []
+    for pnum, page in enumerate(blocks):
+        i = 0
+        new_page_blocks = []
+        equation_boxes = [b.bbox for b in block_types[pnum] if b.block_type == "Formula"]
+        while i < len(page.blocks):
+            block = page.blocks[i]
+            block_text = block.prelim_text
+            bbox = block.bbox
+            # Check if the block contains an equation
+            if not block.contains_equation(equation_boxes):
+                new_page_blocks.append(block)
+                i += 1
+                continue
+
+            selected_blocks = [i]
+            if i > 0:
+                j = 1
+                prev_block = page.blocks[i - j]
+                prev_bbox = prev_block.bbox
+                while (should_merge_blocks(prev_bbox, bbox) or prev_block.contains_equation(equation_boxes)) and i - j >= 0:
+                    bbox = merge_boxes(prev_bbox, bbox)
+                    prev_block = page.blocks[i - j]
+                    prev_bbox = prev_block.bbox
+                    block_text = prev_block.prelim_text + " " + block_text
+                    new_page_blocks = new_page_blocks[:-1]  # Remove the previous block, since we're merging it in
+                    j += 1
+                    selected_blocks.append(i - j)
+
+            if i < len(page.blocks) - 1:
+                next_block = page.blocks[i + 1]
+                next_bbox = next_block.bbox
+                while (should_merge_blocks(bbox, next_bbox) or next_block.contains_equation(equation_boxes)) and i + 1 < len(page.blocks):
+                    bbox = merge_boxes(bbox, next_bbox)
+                    block_text += " " + next_block.prelim_text
+                    i += 1
+                    selected_blocks.append(i)
+                    if i + 1 < len(page.blocks):
+                        next_block = page.blocks[i + 1]
+                        next_bbox = next_block.bbox
+
+            used_nougat = False
+            if len(block_text) < 2000:
+                selected_bboxes = [page.blocks[i].bbox for i in selected_blocks]
+                # This prevents hallucinations from running on for a long time
+                max_tokens = len(block_text) + 50
+                max_char_length = 2 * len(block_text) + 100
+                nougat_text = get_nougat_text(doc[pnum], block_text, bbox, selected_bboxes, f"{pnum}_{i}", max_length=max_tokens)
+                conditions = [
+                    len(nougat_text) > 0,
+                    not any([word in nougat_text for word in NOUGAT_HALLUCINATION_WORDS]),
+                    len(nougat_text) < max_char_length,  # Reduce hallucinations
+                    len(nougat_text) >= len(block_text) * .8
+                ]
+                if all(conditions):
+                    block_line = Line(
+                        spans=[
+                            Span(
+                                text=nougat_text,
+                                bbox=bbox,
+                                span_id=f"{pnum}_{span_id}_fixeq",
+                                font="Latex",
+                                color=0,
+                                block_type="Formula"
+                            )
+                        ],
+                        bbox=bbox
+                    )
+                    new_page_blocks.append(Block(
+                        lines=[block_line],
+                        bbox=bbox,
+                        pnum=pnum
+                    ))
+                    used_nougat = True
+                    span_id += 1
+
+            if not used_nougat:
+                for block_idx in selected_blocks:
+                    new_page_blocks.append(page.blocks[block_idx])
+
+            i += 1
+        # Assign back to page
+        new_page = deepcopy(page)
+        new_page.blocks = new_page_blocks
+        new_blocks.append(new_page)
+    return new_blocks
marker/extract_text.py
ADDED
@@ -0,0 +1,94 @@
+import fitz as pymupdf
+import os
+from marker.settings import settings
+from marker.schema import Span, Line, Block, Page
+
+os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
+
+
+def get_tessocr(page, old_text, bbox):
+    pix = page.get_pixmap(dpi=settings.DPI, clip=bbox)
+
+    ocrpdf = pymupdf.open("pdf", pix.pdfocr_tobytes())
+    ocrpage = ocrpdf[0]
+    new_text = ocrpage.get_text()  # extract OCR-ed text
+
+    # Tesseract ignores leading spaces, hence some corrections
+    lblanks = len(old_text) - len(old_text.lstrip())
+
+    # prefix OCRed text with this many spaces
+    new_text = " " * lblanks + new_text
+    return new_text
+
+
+def font_flags_decomposer(flags):
+    """Make font flags human readable."""
+    l = []
+    if flags & 2 ** 0:
+        l.append("superscript")
+    if flags & 2 ** 1:
+        l.append("italic")
+    if flags & 2 ** 2:
+        l.append("serifed")
+    else:
+        l.append("sans")
+    if flags & 2 ** 3:
+        l.append("monospaced")
+    else:
+        l.append("proportional")
+    if flags & 2 ** 4:
+        l.append("bold")
+    return "_".join(l)
+
+
+def get_single_page_blocks(page, pnum):
+    blocks = page.get_text("dict", sort=True,
+                           flags=~pymupdf.TEXT_PRESERVE_LIGATURES & pymupdf.TEXT_PRESERVE_WHITESPACE & ~pymupdf.TEXT_PRESERVE_IMAGES & ~pymupdf.TEXT_INHIBIT_SPACES & pymupdf.TEXT_DEHYPHENATE & pymupdf.TEXT_MEDIABOX_CLIP)["blocks"]
+    page_blocks = []
+    span_id = 0
+    for block_idx, block in enumerate(blocks):
+        block_lines = []
+        for l in block["lines"]:
+            spans = []
+            for i, s in enumerate(l["spans"]):
+                block_text = s["text"]
+                bbox = s["bbox"]
+                # Find if any of the elements in invalid chars are in block_text
+                if set(settings.INVALID_CHARS).intersection(block_text):  # invalid characters encountered!
+                    # invoke OCR
+                    block_text = get_tessocr(page, block_text, bbox)
+                    # print("block %i, bbox: %s, text: %s" % (block_idx, bbox, block_text))
+                span_obj = Span(
+                    text=block_text,
+                    bbox=bbox,
+                    span_id=f"{pnum}_{span_id}",
+                    font=f"{s['font']}_{font_flags_decomposer(s['flags'])}",  # Add font flags to end of font
+                    color=s["color"],
+                    ascender=s["ascender"],
+                    descender=s["descender"],
+                )
+                spans.append(span_obj)  # Text, bounding box, span id
+                span_id += 1
+            line_obj = Line(
+                spans=spans,
+                bbox=l["bbox"]
+            )
+            block_lines.append(line_obj)
+        block_obj = Block(
+            lines=block_lines,
+            bbox=block["bbox"],
+            pnum=pnum
+        )
+        page_blocks.append(block_obj)
+    return page_blocks
+
+
+def get_text_blocks(doc):
+    all_blocks = []
+    toc = doc.get_toc()
+    for pnum, page in enumerate(doc):
+        blocks = get_single_page_blocks(page, pnum)
+        page_obj = Page(blocks=blocks, pnum=pnum)
+        all_blocks.append(page_obj)
+
+    return all_blocks, toc
marker/headers.py
ADDED
@@ -0,0 +1,60 @@
+from collections import Counter, defaultdict
+from itertools import chain
+
+from sklearn.cluster import DBSCAN, HDBSCAN
+import numpy as np
+
+from collections import Counter
+from copy import deepcopy
+
+from marker.schema import Page
+from typing import List
+
+
+def filter_common_elements(lines, page_count):
+    text = [s.text for line in lines for s in line.spans]
+    counter = Counter(text)
+    common = [k for k, v in counter.items() if v > page_count * .4]
+    bad_span_ids = [s.text for line in lines for s in line.spans if s.span_id in common]
+    return bad_span_ids
+
+
+def filter_header_footer(all_page_blocks, max_selected_lines=2):
+    first_lines = []
+    last_lines = []
+    for page in all_page_blocks:
+        nonblank_lines = page.get_nonblank_lines()
+        first_lines.extend(nonblank_lines[:max_selected_lines])
+        last_lines.extend(nonblank_lines[-max_selected_lines:])
+
+    bad_span_ids = filter_common_elements(first_lines, len(all_page_blocks))
+    bad_span_ids += filter_common_elements(last_lines, len(all_page_blocks))
+    return bad_span_ids
+
+
+def categorize_blocks(all_page_blocks: List[Page]):
+    spans = list(chain.from_iterable([p.get_nonblank_spans() for p in all_page_blocks]))
+    X = np.array(
+        [(*s.bbox, len(s.text)) for s in spans]
+    )
+
+    dbscan = DBSCAN(eps=.1, min_samples=5)
+    dbscan.fit(X)
+    labels = dbscan.labels_
+    label_chars = defaultdict(int)
+    for i, label in enumerate(labels):
+        label_chars[label] += len(spans[i].text)
+
+    most_common_label = None
+    most_chars = 0
+    for i in label_chars.keys():
+        if label_chars[i] > most_chars:
+            most_common_label = i
+            most_chars = label_chars[i]
+
+    labels = [0 if label == most_common_label else 1 for label in labels]
+    bad_span_ids = [spans[i].span_id for i in range(len(spans)) if labels[i] == 1]
+
+    return bad_span_ids
+
+
marker/markdown.py
ADDED
@@ -0,0 +1,163 @@
+from marker.schema import MergedLine, MergedBlock, FullyMergedBlock, Page
+import re
+from typing import List
+
+
+def surround_text(s, char_to_insert):
+    leading_whitespace = re.match(r'^(\s*)', s).group(1)
+    trailing_whitespace = re.search(r'(\s*)$', s).group(1)
+    stripped_string = s.strip()
+    modified_string = char_to_insert + stripped_string + char_to_insert
+    final_string = leading_whitespace + modified_string + trailing_whitespace
+    return final_string
+
+
+def merge_spans(blocks):
+    merged_blocks = []
+    for page in blocks:
+        page_blocks = []
+        for blocknum, block in enumerate(page.blocks):
+            block_lines = []
+            block_types = []
+            for linenum, line in enumerate(block.lines):
+                line_text = ""
+                if len(line.spans) == 0:
+                    continue
+                fonts = []
+                for i, span in enumerate(line.spans):
+                    font = span.font.lower()
+                    next_font = None
+                    if len(line.spans) > i + 1:
+                        next_font = line.spans[i + 1].font.lower()
+                    fonts.append(font)
+                    block_types.append(span.block_type)
+                    span_text = span.text
+                    if "ital" in font and (not next_font or "ital" not in next_font):
+                        span_text = surround_text(span_text, "*")
+                    elif "bold" in font and (not next_font or "bold" not in next_font):
+                        span_text = surround_text(span_text, "**")
+                    line_text += span_text
+                block_lines.append(MergedLine(
+                    text=line_text,
+                    fonts=fonts,
+                    bbox=line.bbox
+                ))
+            if len(block_lines) > 0:
+                page_blocks.append(MergedBlock(
+                    lines=block_lines,
+                    pnum=block.pnum,
+                    bbox=block.bbox,
+                    block_types=block_types
+                ))
+        merged_blocks.append(page_blocks)
+
+    return merged_blocks
+
+
+def block_surround(text, block_type):
+    dot_pattern = re.compile(r'(\s*\.\s*){4,}')
+    dot_multiline_pattern = re.compile(r'.*(\s*\.\s*){4,}.*', re.DOTALL)
+    match block_type:
+        case "Section-header":
+            if not text.startswith("#"):
+                text = "\n## " + text.strip() + "\n"
+        case "Title":
+            if not text.startswith("#"):
+                text = "# " + text.strip() + "\n"
+        case "Table" if dot_multiline_pattern.match(text):
+            text = dot_pattern.sub(' ', text)
+        case "List-item":
+            pass
+        case "Code":
+            text = "```\n" + text + "\n```\n"
+        case _:
+            pass
+    return text
+
+
+def line_separator(line1, line2, block_type, is_continuation=False):
+    # Remove hyphen in current line if next line and current line appear to be joined
+    hyphen_pattern = re.compile(r'.*[a-z][-]\s?$', re.DOTALL)
+    if line1 and hyphen_pattern.match(line1) and re.match(r"^[a-z]", line2):
+        # Split on — or - from the right
+        line1 = re.split(r"[-—]\s?$", line1)[0]
+        return line1.rstrip() + line2.lstrip()
+
+    lowercase_pattern1 = re.compile(r'.*[a-z,]\s?$', re.DOTALL)
+    lowercase_pattern2 = re.compile(r'^\s?[A-Za-z]', re.DOTALL)
+    end_pattern = re.compile(r'.*[.?!]\s?$', re.DOTALL)
+
+    if block_type in ["Title", "Section-header"]:
+        return line1.rstrip() + " " + line2.lstrip()
+    elif lowercase_pattern1.match(line1) and lowercase_pattern2.match(line2):
+        return line1.rstrip() + " " + line2.lstrip()
+    elif is_continuation:
+        return line1.rstrip() + " " + line2.lstrip()
+    elif block_type == "Text" and end_pattern.match(line1):
+        return line1 + "\n\n" + line2
+    elif block_type == "Formula":
+        return line1 + " " + line2
+    else:
+        return line1 + "\n" + line2
+
+
+def block_separator(line1, line2, block_type1, block_type2):
+    sep = "\n"
+    if block_type1 == "Text":
+        sep = "\n\n"
+
+    return sep + line2
+
+
+def merge_lines(blocks, page_blocks: List[Page]):
+    text_blocks = []
+    prev_type = None
+    prev_line = None
+    block_text = ""
+    block_type = ""
+    common_line_heights = [p.get_line_height_stats() for p in page_blocks]
+    for page in blocks:
+        for block in page:
+            block_type = block.most_common_block_type()
+            if block_type != prev_type and prev_type:
+                text_blocks.append(
+                    FullyMergedBlock(
+                        text=block_surround(block_text, prev_type),
+                        block_type=prev_type
+                    )
+                )
+                block_text = ""
+
+            prev_type = block_type
+            common_line_height = common_line_heights[block.pnum].most_common(1)[0][0]
+            for i, line in enumerate(block.lines):
+                line_height = line.bbox[3] - line.bbox[1]
+                prev_line_height = prev_line.bbox[3] - prev_line.bbox[1] if prev_line else 0
+                prev_line_x = prev_line.bbox[0] if prev_line else 0
+                prev_line = line
+                is_continuation = line_height == prev_line_height and line.bbox[0] == prev_line_x
+                if block_text:
+                    block_text = line_separator(block_text, line.text, block_type, is_continuation)
+                else:
+                    block_text = line.text
+
+    # Append the final block
+    text_blocks.append(
+        FullyMergedBlock(
+            text=block_surround(block_text, prev_type),
+            block_type=block_type
+        )
+    )
+    return text_blocks
+
+
+def get_full_text(text_blocks):
+    full_text = ""
+    prev_block = None
+    for block in text_blocks:
+        if prev_block:
+            full_text += block_separator(prev_block.text, block.text, prev_block.block_type, block.block_type)
+        else:
+            full_text += block.text
+        prev_block = block
+    return full_text
marker/schema.py
ADDED
@@ -0,0 +1,176 @@
+from collections import Counter
+from typing import List
+
+from pydantic import BaseModel, field_validator
+
+from marker.bbox import boxes_intersect_pct, multiple_boxes_intersect
+from marker.settings import settings
+
+
+def find_span_type(span, page_blocks):
+    block_type = "Text"
+    for block in page_blocks:
+        if boxes_intersect_pct(span.bbox, block.bbox):
+            block_type = block.block_type
+            break
+    return block_type
+
+
+class BboxElement(BaseModel):
+    bbox: List[float]
+
+    @field_validator('bbox')
+    @classmethod
+    def check_4_elements(cls, v: List[float]) -> List[float]:
+        if len(v) != 4:
+            raise ValueError('bbox must have 4 elements')
+        return v
+
+
+class BlockType(BboxElement):
+    block_type: str
+
+
+class Span(BboxElement):
+    text: str
+    span_id: str
+    font: str
+    color: int
+    ascender: float | None = None
+    descender: float | None = None
+    block_type: str | None = None
+    selected: bool = True
+
+
+class Line(BboxElement):
+    spans: List[Span]
+
+    @property
+    def prelim_text(self):
+        return "".join([s.text for s in self.spans])
+
+    @property
+    def start(self):
+        return self.spans[0].bbox[0]
+
+
+class Block(BboxElement):
+    lines: List[Line]
+    pnum: int
+
+    @property
+    def prelim_text(self):
+        return "\n".join([l.prelim_text for l in self.lines])
+
+    def contains_equation(self, equation_boxes=None):
+        conditions = [s.block_type == "Formula" for l in self.lines for s in l.spans]
+        if equation_boxes:
+            conditions += [multiple_boxes_intersect(self.bbox, equation_boxes)]
+        return any(conditions)
+
+    def filter_spans(self, bad_span_ids):
+        new_lines = []
+        for line in self.lines:
+            new_spans = []
+            for span in line.spans:
+                if not span.span_id in bad_span_ids:
+                    new_spans.append(span)
+            line.spans = new_spans
+            if len(new_spans) > 0:
+                new_lines.append(line)
+        self.lines = new_lines
+
+    def filter_bad_span_types(self, block_types: List[BlockType]):
+        bad_spans = [b.bbox for b in block_types if b.block_type in settings.BAD_SPAN_TYPES]
+        new_lines = []
+        for line in self.lines:
+            new_spans = []
+            for span in line.spans:
+                if not multiple_boxes_intersect(span.bbox, bad_spans):
+                    new_spans.append(span)
+            line.spans = new_spans
+            if len(new_spans) > 0:
+                new_lines.append(line)
+        self.lines = new_lines
+
+    def most_common_block_type(self):
+        counter = Counter([s.block_type for l in self.lines for s in l.spans])
+        return counter.most_common(1)[0][0]
+
+    def set_block_type(self, block_type):
+        for line in self.lines:
+            for span in line.spans:
+                span.block_type = block_type
+
+
+class Page(BaseModel):
+    blocks: List[Block]
+    pnum: int
+
+    def get_nonblank_lines(self):
+        lines = self.get_all_lines()
+        nonblank_lines = [l for l in lines if l.prelim_text.strip()]
+        return nonblank_lines
+
+    def get_all_lines(self):
+        lines = [l for b in self.blocks for l in b.lines]
+        return lines
+
+    def get_nonblank_spans(self) -> List[Span]:
+        lines = [l for b in self.blocks for l in b.lines]
+        spans = [s for l in lines for s in l.spans if s.text.strip()]
+        return spans
+
+    def add_block_types(self, page_block_types):
+        if len(page_block_types) != len(self.get_all_lines()):
+            print(f"Warning: Number of detected lines {len(page_block_types)} does not match number of lines {len(self.get_all_lines())}")
+
+        i = 0
+        for block in self.blocks:
+            for line in block.lines:
+                if i < len(page_block_types):
+                    line_block_type = page_block_types[i].block_type
+                else:
+                    line_block_type = "Text"
+                i += 1
+                for span in line.spans:
+                    span.block_type = line_block_type
+
+    def get_font_stats(self):
+        fonts = [s.font for s in self.get_nonblank_spans()]
+        font_counts = Counter(fonts)
+        return font_counts
+
+    def get_line_height_stats(self):
+        heights = [l.bbox[3] - l.bbox[1] for l in self.get_nonblank_lines()]
+        height_counts = Counter(heights)
+        return height_counts
+
+    def get_line_start_stats(self):
+        starts = [l.bbox[0] for l in self.get_nonblank_lines()]
+        start_counts = Counter(starts)
+        return start_counts
+
+
+class MergedLine(BboxElement):
+    text: str
+    fonts: List[str]
+
+    def most_common_font(self):
+        counter = Counter(self.fonts)
+        return counter.most_common(1)[0][0]
+
+
+class MergedBlock(BboxElement):
+    lines: List[MergedLine]
+    pnum: int
+    block_types: List[str]
+
+    def most_common_block_type(self):
+        counter = Counter(self.block_types)
+        return counter.most_common(1)[0][0]
+
+
+class FullyMergedBlock(BaseModel):
+    text: str
+    block_type: str
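A small sketch of how the hierarchy above nests (all values hypothetical): spans carry text, font and color, lines group spans, blocks group lines, and a Page wraps the blocks of one PDF page.

from marker.schema import Span, Line, Block, Page

span = Span(text="Hello world", bbox=[10, 10, 90, 22], span_id="0_0", font="Times_serifed_proportional", color=0, block_type="Text")
line = Line(spans=[span], bbox=[10, 10, 90, 22])
block = Block(lines=[line], bbox=[10, 10, 90, 22], pnum=0)
page = Page(blocks=[block], pnum=0)

print(block.prelim_text)               # "Hello world"
print(block.most_common_block_type())  # "Text"
print(page.get_line_height_stats())    # Counter({12.0: 1})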
marker/segmentation.py
ADDED
@@ -0,0 +1,139 @@
+from typing import List
+
+from transformers import LayoutLMv3ForTokenClassification
+
+from marker.bbox import unnormalize_box
+from transformers.models.layoutlmv3.image_processing_layoutlmv3 import normalize_box
+import io
+from PIL import Image
+from transformers import LayoutLMv3Processor
+import numpy as np
+from marker.settings import settings
+from marker.schema import Page, BlockType
+import torch
+
+processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
+
+CHUNK_KEYS = ["input_ids", "attention_mask", "bbox", "offset_mapping"]
+NO_CHUNK_KEYS = ["pixel_values"]
+MODEL_MAX_LEN = 512
+CHUNK_OVERLAP = 128
+
+
+def load_model():
+    model = LayoutLMv3ForTokenClassification.from_pretrained("Kwan0/layoutlmv3-base-finetune-DocLayNet-100k").to(settings.TORCH_DEVICE)
+    model.config.id2label = {
+        0: "Caption",
+        1: "Footnote",
+        2: "Formula",
+        3: "List-item",
+        4: "Page-footer",
+        5: "Page-header",
+        6: "Picture",
+        7: "Section-header",
+        8: "Table",
+        9: "Text",
+        10: "Title"
+    }
+
+    model.config.label2id = d = {v: k for k, v in model.config.id2label.items()}
+    return model
+
+
+layoutlm_model = load_model()
+
+
+def detect_all_block_types(doc, blocks: List[Page]):
+    block_types = []
+    for pnum, page in enumerate(doc):
+        page_blocks = blocks[pnum]
+        predictions = detect_page_block_types(page, page_blocks)
+        block_types.append(predictions)
+    return block_types
+
+
+def detect_page_block_types(page, page_blocks: Page):
+    page_box = page.bound()
+    pwidth = page_box[2] - page_box[0]
+    pheight = page_box[3] - page_box[1]
+
+    pix = page.get_pixmap(dpi=400)
+    png = pix.pil_tobytes(format="PNG")
+    png_image = Image.open(io.BytesIO(png))
+    rgb_image = png_image.convert('RGB')
+
+    lines = page_blocks.get_all_lines()
+    boxes = [s.bbox for s in lines]
+    text = [s.prelim_text for s in lines]
+
+    predictions = make_predictions(rgb_image, text, boxes, pwidth, pheight)
+    return predictions
+
+
+def find_first_false(lst, start_idx):
+    # Traverse the list to the left from start_idx
+    for idx in range(start_idx, -1, -1):
+        if not lst[idx]:
+            return idx
+
+    return 0  # Return 0 if no false found (aka, no lines)
+
+
+def get_provisional_boxes(pred, box, is_subword, start_idx=0):
+    prov_predictions = [pred_ for idx, pred_ in enumerate(pred) if not is_subword[idx]][start_idx:]
+    prov_boxes = [box_ for idx, box_ in enumerate(box) if not is_subword[idx]][start_idx:]
+    return prov_predictions, prov_boxes
+
+
+def make_predictions(rgb_image, text, boxes, pwidth, pheight) -> List[BlockType]:
+    # Normalize boxes for model (scale to 1000x1000)
+    boxes = [normalize_box(box, pwidth, pheight) for box in boxes]
+    encoding = processor(rgb_image, text=text, boxes=boxes, return_offsets_mapping=True, return_tensors="pt", truncation=True, stride=CHUNK_OVERLAP, padding="max_length", max_length=MODEL_MAX_LEN, return_overflowing_tokens=True)
+    offset_mapping = encoding.pop('offset_mapping')
+    overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')
+
+    # change the shape of pixel values
+    x = []
+    for i in range(0, len(encoding['pixel_values'])):
+        x.append(encoding['pixel_values'][i])
+    x = torch.stack(x)
+    encoding['pixel_values'] = x
+
+    with torch.no_grad():
+        encoding = encoding.to(settings.TORCH_DEVICE)
+        outputs = layoutlm_model(**encoding)
+
+    logits = outputs.logits
+    # We take the highest score for each token, using argmax. This serves as the predicted label for each token.
+    predictions = logits.argmax(-1).squeeze().tolist()
+    token_boxes = encoding.bbox.squeeze().tolist()
+
+    if len(token_boxes) == MODEL_MAX_LEN:
+        predictions = [predictions]
+        token_boxes = [token_boxes]
+
+    predicted_block_types = []
+
+    for i, (pred, box, mapped) in enumerate(zip(predictions, token_boxes, offset_mapping)):
+        is_subword = np.array(mapped.squeeze().tolist())[:, 0] != 0
+        overlap_adjust = 0
+        if i > 0:
+            overlap_adjust = 1 + CHUNK_OVERLAP - sum(is_subword[:1 + CHUNK_OVERLAP])
+
+        prov_predictions, prov_boxes = get_provisional_boxes(pred, box, is_subword, overlap_adjust)
+
+        for prov_box, prov_prediction in zip(prov_boxes, prov_predictions):
+            if prov_box == [0, 0, 0, 0]:
+                continue
+            unnorm_box = unnormalize_box(prov_box, pwidth, pheight)
+            block_type = BlockType(
+                block_type=layoutlm_model.config.id2label[prov_prediction],
+                bbox=unnorm_box
+            )
+
+            # Sometimes blocks will cross chunks, unclear why
+            if len(predicted_block_types) == 0 or unnorm_box != predicted_block_types[-1].bbox:
+                predicted_block_types.append(block_type)
+
+    return predicted_block_types
+
marker/settings.py
ADDED
@@ -0,0 +1,20 @@
+import os
+from typing import Optional, List
+
+from dotenv import find_dotenv
+from pydantic_settings import BaseSettings
+
+
+class Settings(BaseSettings):
+    # Path settings
+    DPI: int = 400
+    INVALID_CHARS: List[str] = [chr(0xfffd), "~", chr(65533), "↵"]
+    TORCH_DEVICE: str = "cpu"
+    TESSDATA_PREFIX: str = ""
+    BAD_SPAN_TYPES: List[str] = ["Caption", "Footnote", "Page-footer", "Page-header", "Picture"]
+
+    class Config:
+        env_file = find_dotenv("local.env")
+
+
+settings = Settings()
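Since Settings is a pydantic BaseSettings class wired to local.env, each field can be overridden per machine without code changes. A minimal sketch (the local.env values shown in the comment are hypothetical examples, not part of this commit):

from marker.settings import settings

# Defaults come from the class above; a local.env file found by find_dotenv can override them, e.g.
#   TORCH_DEVICE=cuda
#   TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
print(settings.DPI, settings.TORCH_DEVICE)  # "400 cpu" unless overridden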
parse.py
ADDED
@@ -0,0 +1,49 @@
+import fitz as pymupdf
+from marker.extract_text import get_text_blocks
+from marker.headers import categorize_blocks, filter_header_footer
+from marker.equations import replace_equations
+from marker.segmentation import detect_all_block_types
+from marker.code import identify_code_blocks, indent_blocks
+from marker.markdown import merge_spans, merge_lines, get_full_text
+from marker.schema import Page, BlockType
+from typing import List
+from copy import deepcopy
+
+
+def annotate_spans(blocks: List[Page], block_types: List[BlockType]):
+    for i, page in enumerate(blocks):
+        page_block_types = block_types[i]
+        page.add_block_types(page_block_types)
+
+
+if __name__ == "__main__":
+    fname = "test_data/thinkpython.pdf"
+    doc = pymupdf.open(fname)
+    blocks, toc = get_text_blocks(doc)
+
+    block_types = detect_all_block_types(doc, blocks)
+
+    filtered = deepcopy(blocks)
+    annotate_spans(filtered, block_types)
+    identify_code_blocks(filtered)
+    indent_blocks(filtered)
+
+    bad_span_ids = categorize_blocks(blocks)
+    bad_span_ids += filter_header_footer(blocks)
+
+    # Copy to avoid changing original data
+
+    for page in filtered:
+        for block in page.blocks:
+            block.filter_spans(bad_span_ids)
+            block.filter_bad_span_types(block_types[page.pnum])
+
+    filtered = replace_equations(doc, filtered, block_types)
+
+    # Copy to avoid changing original data
+    merged_lines = merge_spans(filtered)
+    text_blocks = merge_lines(merged_lines, filtered)
+    full_text = get_full_text(text_blocks)
+
+    with open("test_data/thinkpython.md", "w+") as f:
+        f.write(full_text)
poetry.lock
ADDED
The diff for this file is too large to render.
pyproject.toml
ADDED
@@ -0,0 +1,28 @@
+[tool.poetry]
+name = "marker"
+version = "0.1.0"
+description = ""
+authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = ">=3.9,<3.13"
+scikit-learn = "^1.3.2"
+Pillow = "^10.1.0"
+pytesseract = "^0.3.10"
+PyMuPDF = "^1.23.5"
+pymupdf-fonts = "^1.0.5"
+pydantic = "^2.4.2"
+pydantic-settings = "^2.0.3"
+nougat-ocr = "^0.1.17"
+transformers = "^4.34.1"
+torch = "^2.1.0"
+numpy = "^1.26.1"
+python-dotenv = "^1.0.0"
+
+[tool.poetry.group.dev.dependencies]
+jupyter = "^1.0.0"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
requirements.txt
ADDED
@@ -0,0 +1,12 @@
+Pillow==9.5.0
+layoutparser
+torchvision
+git+https://github.com/facebookresearch/detectron2.git@v0.5#egg=detectron2
+pytesseract
+pymupdf
+pymupdf-fonts
+pydantic
+pydantic-settings
+nougat
+transformers
+scikit-learn