Vik Paruchuri committed
Commit a5c1c2e · 0 Parent(s)

Initial commit
.gitignore ADDED
@@ -0,0 +1,166 @@
+ private.py
+ .DS_Store
+ local.env
+ experiments
+ test_data
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ .idea/
README.md ADDED
@@ -0,0 +1,13 @@
+ # Marker
+
+ This project converts PDFs to Markdown, balancing speed with quality:
+
+ - Equations are detected and converted to LaTeX. This is not 100% accurate.
+ - Headers, footers, and other page artifacts are removed.
+
+
+
+ ## Install
+
+ - `poetry install`
+ - Set `TESSDATA_PREFIX` to your Tesseract data directory (see the usage sketch below)
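
For usage, the entry point added in this commit is `parse.py` (run it with `python parse.py` after installing). Below is a condensed sketch of what it wires together, using the same hardcoded test path; it skips several steps (noted in the comments), so treat it as an illustration rather than the full pipeline.

```python
import fitz as pymupdf
from copy import deepcopy
from marker.extract_text import get_text_blocks
from marker.segmentation import detect_all_block_types
from marker.markdown import merge_spans, merge_lines, get_full_text

doc = pymupdf.open("test_data/thinkpython.pdf")     # path hardcoded in parse.py
blocks, toc = get_text_blocks(doc)                  # spans/lines/blocks per page
block_types = detect_all_block_types(doc, blocks)   # LayoutLMv3 layout label per line

filtered = deepcopy(blocks)
for page, page_types in zip(filtered, block_types):
    page.add_block_types(page_types)                # annotate spans with layout labels

# parse.py additionally runs code detection, header/footer filtering and Nougat
# equation replacement on `filtered` before merging.
text_blocks = merge_lines(merge_spans(filtered), filtered)
markdown = get_full_text(text_blocks)
```

See `parse.py` further down in this commit for the complete sequence.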
marker/bbox.py ADDED
@@ -0,0 +1,61 @@
+ def should_merge_blocks(box1, box2, tol=10):
+     # Within tol y px, and to the right within tol px
+     merge = [
+         box2[0] > box1[0],
+         abs(box2[1] - box1[1]) < tol,  # Within tol y px
+         abs(box2[3] - box1[3]) < tol,  # Within tol y px
+         abs(box2[0] - box1[2]) < tol,  # Within tol x px
+     ]
+     return all(merge)
+
+
+ def merge_boxes(box1, box2):
+     return (min(box1[0], box2[0]), min(box1[1], box2[1]), max(box2[2], box1[2]), max(box1[3], box2[3]))
+
+
+ def boxes_intersect(box1, box2):
+     # Box1 intersects box2
+     return box1[0] < box2[2] and box1[2] > box2[0] and box1[1] < box2[3] and box1[3] > box2[1]
+
+
+ def boxes_intersect_pct(box1, box2, pct=.9):
+     # determine the coordinates of the intersection rectangle
+     x_left = max(box1[0], box2[0])
+     y_top = max(box1[1], box2[1])
+     x_right = min(box1[2], box2[2])
+     y_bottom = min(box1[3], box2[3])
+
+     if x_right < x_left or y_bottom < y_top:
+         return 0.0
+
+     # The intersection of two axis-aligned bounding boxes is always an
+     # axis-aligned bounding box
+     intersection_area = (x_right - x_left) * (y_bottom - y_top)
+
+     # compute the area of both AABBs
+     bb1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+     bb2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
+
+     iou = intersection_area / float(bb1_area + bb2_area - intersection_area)
+     return iou > pct
+
+
+ def multiple_boxes_intersect(box1, boxes):
+     for box2 in boxes:
+         if boxes_intersect(box1, box2):
+             return True
+     return False
+
+
+ def box_contained(box1, box2):
+     # Box1 inside box2
+     return box1[0] > box2[0] and box1[1] > box2[1] and box1[2] < box2[2] and box1[3] < box2[3]
+
+
+ def unnormalize_box(bbox, width, height):
+     return [
+         width * (bbox[0] / 1000),
+         height * (bbox[1] / 1000),
+         width * (bbox[2] / 1000),
+         height * (bbox[3] / 1000),
+     ]
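
A small, hypothetical usage sketch of the helpers above (coordinates are invented; boxes are `(x0, y0, x1, y1)` in PDF points):

```python
from marker.bbox import should_merge_blocks, merge_boxes, boxes_intersect_pct

line_a = (100, 500, 220, 512)  # a text line
line_b = (225, 501, 340, 511)  # starts just to the right of line_a, same baseline

# Adjacent and vertically aligned, so the two blocks can be merged into one bbox
assert should_merge_blocks(line_a, line_b)
print(merge_boxes(line_a, line_b))          # (100, 500, 340, 512)

# IoU-style check: truthy only when the overlap exceeds the pct threshold
print(boxes_intersect_pct(line_a, line_a))  # True (identical boxes, IoU = 1.0)
print(boxes_intersect_pct(line_a, line_b))  # 0.0 (no overlap)
```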
marker/code.py ADDED
@@ -0,0 +1,86 @@
+ from marker.schema import Span, Line, Page
+ import re
+ from typing import List
+ import fitz as pymupdf
+
+
+ def is_code_linelen(lines, thresh=50):
+     # Decide based on chars per newline threshold
+     total_alnum_chars = sum(len(re.findall(r'\w', line.prelim_text)) for line in lines)
+     total_newlines = len(lines) - 1
+
+     if total_alnum_chars == 0:
+         return False
+
+     ratio = total_alnum_chars / total_newlines
+     return ratio < thresh
+
+
+ def identify_code_blocks(blocks: List[Page]):
+     for page in blocks:
+         try:
+             common_height = page.get_line_height_stats().most_common(1)[0][0]
+             common_start = page.get_line_start_stats().most_common(1)[0][0]
+         except IndexError:
+             continue
+
+         for block in page.blocks:
+             if len(block.lines) < 2:
+                 continue
+             if block.most_common_block_type() != "Text":
+                 continue
+
+             is_code = []
+             for line in block.lines:
+                 fonts = [span.font for span in line.spans]
+                 monospace_font = any([font for font in fonts if "mono" in font.lower() or "prop" in font.lower()])
+                 line_height = line.bbox[3] - line.bbox[1]
+                 line_start = line.bbox[0]
+                 if line_height <= common_height and line_start > common_start and monospace_font:
+                     is_code.append(True)
+                 else:
+                     is_code.append(False)
+             is_code = [
+                 sum(is_code) > len(block.lines) / 1.5,
+                 len(block.lines) > 4,
+                 is_code_linelen(block.lines)
+             ]
+
+             if all(is_code):
+                 block.set_block_type("Code")
+
+
+ def indent_blocks(blocks: List[Page]):
+     span_counter = 0
+     for page in blocks:
+         for block in page.blocks:
+             if block.most_common_block_type() != "Code":
+                 continue
+
+             lines = []
+             min_left = 1000  # will contain x-coord of column 0
+             col_width = 0  # width of 1 char
+             for line in block.lines:
+                 text = ""
+                 min_left = min(line.bbox[0], min_left)
+                 for span in line.spans:
+                     if col_width == 0 and len(span.text) > 0:
+                         col_width = (span.bbox[2] - span.bbox[0]) / len(span.text)
+                     text += span.text
+                 lines.append((pymupdf.Rect(line.bbox), text))
+
+             block_text = ""
+             for line in lines:
+                 text = line[1]
+                 prefix = " " * int((line[0].x0 - min_left) / col_width)
+                 block_text += prefix + text + "\n"
+             new_span = Span(
+                 text=block_text,
+                 bbox=block.bbox,
+                 color=block.lines[0].spans[0].color,
+                 span_id=f"{span_counter}_fix_code",
+                 font=block.lines[0].spans[0].font,
+                 block_type="Code"
+             )
+             span_counter += 1
+             block.lines = [Line(spans=[new_span], bbox=block.bbox)]
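
For reference, a quick trace of the indentation arithmetic in `indent_blocks` with invented numbers:

```python
# Invented values: how indent_blocks rebuilds leading whitespace for a code line.
min_left = 72.0   # left edge of the leftmost line in the code block (PDF points)
col_width = 6.0   # estimated width of one character, from a span bbox / char count

line_x0 = 102.0   # this line starts 30 points further right
prefix = " " * int((line_x0 - min_left) / col_width)
print(repr(prefix))  # '     '  -> five spaces of reconstructed indentation
```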
marker/equations.py ADDED
@@ -0,0 +1,167 @@
+ import io
+ from copy import deepcopy
+ from typing import List
+
+ from nougat import NougatModel
+ from nougat.utils.checkpoint import get_checkpoint
+ import re
+ from PIL import Image, ImageDraw
+ import fitz as pymupdf
+ from marker.bbox import should_merge_blocks, merge_boxes, multiple_boxes_intersect
+ from marker.settings import settings
+ from marker.schema import Page, Span, Line, Block, BlockType
+ from nougat.utils.device import move_to_device
+
+
+ def load_model():
+     ckpt = get_checkpoint(None, model_tag="0.1.0-small")
+     nougat_model = NougatModel.from_pretrained(ckpt)
+     if settings.TORCH_DEVICE != "cpu":
+         is_cuda = "cuda" in settings.TORCH_DEVICE
+         move_to_device(nougat_model, bf16=is_cuda, cuda=is_cuda)
+     nougat_model.eval()
+     return nougat_model
+
+
+ nougat_model = load_model()
+ MODEL_MAX = nougat_model.config.max_length
+
+ NOUGAT_HALLUCINATION_WORDS = ["[MISSING_PAGE_POST]", "## References\n", "**Figure Captions**\n", "Footnote", "\par\par\par", "## Chapter", "Fig."]
+
+
+ def contains_equation(text):
+     # Define a regular expression pattern to look for operators and symbols commonly found in equations
+     pattern = re.compile(r'[=\^\√∑∏∫∂∆π≈≠≤≥∞∩∪∈∉∀∃∅∇λμσαβγδεζηθφχψω]')
+     # Search the text for the pattern
+     match = pattern.search(text)
+
+     # Alternative equation patterns
+     alt_pattern = re.compile(r' P(?=[ \n\(\)$])')
+     alt_match = alt_pattern.search(text)
+     # Return True if the pattern is found, otherwise return False
+     return bool(match) or bool(alt_match)
+
+
+ def mask_bbox(png_image, bbox, selected_bboxes):
+     mask = Image.new('L', png_image.size, 0)  # 'L' mode for grayscale
+     draw = ImageDraw.Draw(mask)
+     first_x = bbox[0]
+     first_y = bbox[1]
+     bbox_height = bbox[3] - bbox[1]
+     bbox_width = bbox[2] - bbox[0]
+
+     for box in selected_bboxes:
+         # Fit the box to the selected region
+         new_box = (box[0] - first_x, box[1] - first_y, box[2] - first_x, box[3] - first_y)
+         # Fit mask to image bounds versus the pdf bounds
+         resized = (
+             new_box[0] / bbox_width * png_image.size[0],
+             new_box[1] / bbox_height * png_image.size[1],
+             new_box[2] / bbox_width * png_image.size[0],
+             new_box[3] / bbox_height * png_image.size[1]
+         )
+         draw.rectangle(resized, fill=255)
+
+     result = Image.composite(png_image, Image.new('RGBA', png_image.size, 'white'), mask)
+     return result
+
+
+ def get_nougat_text(page, old_text, bbox, selected_bboxes, save_id, max_length=MODEL_MAX):
+     pix = page.get_pixmap(dpi=settings.DPI, clip=bbox)
+     png = pix.pil_tobytes(format="PNG")
+     png_image = Image.open(io.BytesIO(png))
+     png_image = mask_bbox(png_image, bbox, selected_bboxes)
+
+     nougat_model.config.max_length = min(max_length, MODEL_MAX)
+     output = nougat_model.inference(image=png_image)
+     return output["predictions"][0]
+
+
+ def replace_equations(doc, blocks: List[Page], block_types: List[List[BlockType]]):
+     span_id = 0
+     new_blocks = []
+     for pnum, page in enumerate(blocks):
+         i = 0
+         new_page_blocks = []
+         equation_boxes = [b.bbox for b in block_types[pnum] if b.block_type == "Formula"]
+         while i < len(page.blocks):
+             block = page.blocks[i]
+             block_text = block.prelim_text
+             bbox = block.bbox
+             # Check if the block contains an equation
+             if not block.contains_equation(equation_boxes):
+                 new_page_blocks.append(block)
+                 i += 1
+                 continue
+
+             selected_blocks = [i]
+             if i > 0:
+                 j = 1
+                 prev_block = page.blocks[i - j]
+                 prev_bbox = prev_block.bbox
+                 while (should_merge_blocks(prev_bbox, bbox) or prev_block.contains_equation(equation_boxes)) and i - j >= 0:
+                     bbox = merge_boxes(prev_bbox, bbox)
+                     prev_block = page.blocks[i - j]
+                     prev_bbox = prev_block.bbox
+                     block_text = prev_block.prelim_text + " " + block_text
+                     new_page_blocks = new_page_blocks[:-1]  # Remove the previous block, since we're merging it in
+                     j += 1
+                     selected_blocks.append(i - j)
+
+             if i < len(page.blocks) - 1:
+                 next_block = page.blocks[i + 1]
+                 next_bbox = next_block.bbox
+                 while (should_merge_blocks(bbox, next_bbox) or next_block.contains_equation(equation_boxes)) and i + 1 < len(page.blocks):
+                     bbox = merge_boxes(bbox, next_bbox)
+                     block_text += " " + next_block.prelim_text
+                     i += 1
+                     selected_blocks.append(i)
+                     if i + 1 < len(page.blocks):
+                         next_block = page.blocks[i + 1]
+                         next_bbox = next_block.bbox
+
+             used_nougat = False
+             if len(block_text) < 2000:
+                 selected_bboxes = [page.blocks[i].bbox for i in selected_blocks]
+                 # This prevents hallucinations from running on for a long time
+                 max_tokens = len(block_text) + 50
+                 max_char_length = 2 * len(block_text) + 100
+                 nougat_text = get_nougat_text(doc[pnum], block_text, bbox, selected_bboxes, f"{pnum}_{i}", max_length=max_tokens)
+                 conditions = [
+                     len(nougat_text) > 0,
+                     not any([word in nougat_text for word in NOUGAT_HALLUCINATION_WORDS]),
+                     len(nougat_text) < max_char_length,  # Reduce hallucinations
+                     len(nougat_text) >= len(block_text) * .8
+                 ]
+                 if all(conditions):
+                     block_line = Line(
+                         spans=[
+                             Span(
+                                 text=nougat_text,
+                                 bbox=bbox,
+                                 span_id=f"{pnum}_{span_id}_fixeq",
+                                 font="Latex",
+                                 color=0,
+                                 block_type="Formula"
+                             )
+                         ],
+                         bbox=bbox
+                     )
+                     new_page_blocks.append(Block(
+                         lines=[block_line],
+                         bbox=bbox,
+                         pnum=pnum
+                     ))
+                     used_nougat = True
+                     span_id += 1
+
+             if not used_nougat:
+                 for block_idx in selected_blocks:
+                     new_page_blocks.append(page.blocks[block_idx])
+
+             i += 1
+         # Assign back to page
+         new_page = deepcopy(page)
+         new_page.blocks = new_page_blocks
+         new_blocks.append(new_page)
+     return new_blocks
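
A couple of invented strings showing what the `contains_equation` heuristic flags (note that importing `marker.equations` loads the Nougat checkpoint at import time, so treat this as a sketch rather than something to run casually):

```python
from marker.equations import contains_equation

print(contains_equation("The loss is L = ∑ (y - y_hat)^2"))    # True: '=', '∑' and '^' match
print(contains_equation("See the next chapter for details."))  # False: no equation-like symbols
```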
marker/extract_text.py ADDED
@@ -0,0 +1,94 @@
+ import fitz as pymupdf
+ import os
+ from marker.settings import settings
+ from marker.schema import Span, Line, Block, Page
+
+ os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
+
+
+ def get_tessocr(page, old_text, bbox):
+     pix = page.get_pixmap(dpi=settings.DPI, clip=bbox)
+
+     ocrpdf = pymupdf.open("pdf", pix.pdfocr_tobytes())
+     ocrpage = ocrpdf[0]
+     new_text = ocrpage.get_text()  # extract OCR-ed text
+
+     # Tesseract ignores leading spaces, hence some corrections
+     lblanks = len(old_text) - len(old_text.lstrip())
+
+     # prefix OCRed text with this many spaces
+     new_text = " " * lblanks + new_text
+     return new_text
+
+
+ def font_flags_decomposer(flags):
+     """Make font flags human readable."""
+     l = []
+     if flags & 2 ** 0:
+         l.append("superscript")
+     if flags & 2 ** 1:
+         l.append("italic")
+     if flags & 2 ** 2:
+         l.append("serifed")
+     else:
+         l.append("sans")
+     if flags & 2 ** 3:
+         l.append("monospaced")
+     else:
+         l.append("proportional")
+     if flags & 2 ** 4:
+         l.append("bold")
+     return "_".join(l)
+
+
+ def get_single_page_blocks(page, pnum):
+     blocks = page.get_text("dict", sort=True,
+                            flags=~pymupdf.TEXT_PRESERVE_LIGATURES & pymupdf.TEXT_PRESERVE_WHITESPACE & ~pymupdf.TEXT_PRESERVE_IMAGES & ~pymupdf.TEXT_INHIBIT_SPACES & pymupdf.TEXT_DEHYPHENATE & pymupdf.TEXT_MEDIABOX_CLIP)["blocks"]
+     page_blocks = []
+     span_id = 0
+     for block_idx, block in enumerate(blocks):
+         block_lines = []
+         for l in block["lines"]:
+             spans = []
+             for i, s in enumerate(l["spans"]):
+                 block_text = s["text"]
+                 bbox = s["bbox"]
+                 # Find if any of the elements in invalid chars are in block_text
+                 if set(settings.INVALID_CHARS).intersection(block_text):  # invalid characters encountered!
+                     # invoke OCR
+                     block_text = get_tessocr(page, block_text, bbox)
+                 # print("block %i, bbox: %s, text: %s" % (block_idx, bbox, block_text))
+                 span_obj = Span(
+                     text=block_text,
+                     bbox=bbox,
+                     span_id=f"{pnum}_{span_id}",
+                     font=f"{s['font']}_{font_flags_decomposer(s['flags'])}",  # Add font flags to end of font
+                     color=s["color"],
+                     ascender=s["ascender"],
+                     descender=s["descender"],
+                 )
+                 spans.append(span_obj)  # Text, bounding box, span id
+                 span_id += 1
+             line_obj = Line(
+                 spans=spans,
+                 bbox=l["bbox"]
+             )
+             block_lines.append(line_obj)
+         block_obj = Block(
+             lines=block_lines,
+             bbox=block["bbox"],
+             pnum=pnum
+         )
+         page_blocks.append(block_obj)
+     return page_blocks
+
+
+ def get_text_blocks(doc):
+     all_blocks = []
+     toc = doc.get_toc()
+     for pnum, page in enumerate(doc):
+         blocks = get_single_page_blocks(page, pnum)
+         page_obj = Page(blocks=blocks, pnum=pnum)
+         all_blocks.append(page_obj)
+
+     return all_blocks, toc
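
A small worked example of the bitmask handling in `font_flags_decomposer` (the flag value is invented; the bit positions follow PyMuPDF's span flags):

```python
from marker.extract_text import font_flags_decomposer

# Bits: 0 = superscript, 1 = italic, 2 = serifed, 3 = monospaced, 4 = bold
flags = (1 << 1) | (1 << 4)   # italic + bold, neither serifed nor monospaced
print(font_flags_decomposer(flags))  # italic_sans_proportional_bold
```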
marker/headers.py ADDED
@@ -0,0 +1,60 @@
+ from collections import Counter, defaultdict
+ from itertools import chain
+
+ from sklearn.cluster import DBSCAN, HDBSCAN
+ import numpy as np
+
+ from collections import Counter
+ from copy import deepcopy
+
+ from marker.schema import Page
+ from typing import List
+
+
+ def filter_common_elements(lines, page_count):
+     text = [s.text for line in lines for s in line.spans]
+     counter = Counter(text)
+     common = [k for k, v in counter.items() if v > page_count * .4]
+     # Return span ids (not text) so callers can match them against Span.span_id
+     bad_span_ids = [s.span_id for line in lines for s in line.spans if s.text in common]
+     return bad_span_ids
+
+
+ def filter_header_footer(all_page_blocks, max_selected_lines=2):
+     first_lines = []
+     last_lines = []
+     for page in all_page_blocks:
+         nonblank_lines = page.get_nonblank_lines()
+         first_lines.extend(nonblank_lines[:max_selected_lines])
+         last_lines.extend(nonblank_lines[-max_selected_lines:])
+
+     bad_span_ids = filter_common_elements(first_lines, len(all_page_blocks))
+     bad_span_ids += filter_common_elements(last_lines, len(all_page_blocks))
+     return bad_span_ids
+
+
+ def categorize_blocks(all_page_blocks: List[Page]):
+     spans = list(chain.from_iterable([p.get_nonblank_spans() for p in all_page_blocks]))
+     X = np.array(
+         [(*s.bbox, len(s.text)) for s in spans]
+     )
+
+     dbscan = DBSCAN(eps=.1, min_samples=5)
+     dbscan.fit(X)
+     labels = dbscan.labels_
+     label_chars = defaultdict(int)
+     for i, label in enumerate(labels):
+         label_chars[label] += len(spans[i].text)
+
+     most_common_label = None
+     most_chars = 0
+     for i in label_chars.keys():
+         if label_chars[i] > most_chars:
+             most_common_label = i
+             most_chars = label_chars[i]
+
+     labels = [0 if label == most_common_label else 1 for label in labels]
+     bad_span_ids = [spans[i].span_id for i in range(len(spans)) if labels[i] == 1]
+
+     return bad_span_ids
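
To make the frequency heuristic concrete: a span's text is treated as a header/footer candidate when it repeats on more than 40% of pages. An invented example of that threshold:

```python
from collections import Counter

# Invented data: the first non-blank line of each of 5 pages
first_line_texts = ["Think Python", "Think Python", "Chapter 2", "Think Python", "Think Python"]
page_count = 5

counter = Counter(first_line_texts)
common = [k for k, v in counter.items() if v > page_count * .4]
print(common)  # ['Think Python'] -> spans with this text are filtered out as headers
```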
marker/markdown.py ADDED
@@ -0,0 +1,163 @@
+ from marker.schema import MergedLine, MergedBlock, FullyMergedBlock, Page
+ import re
+ from typing import List
+
+
+ def surround_text(s, char_to_insert):
+     leading_whitespace = re.match(r'^(\s*)', s).group(1)
+     trailing_whitespace = re.search(r'(\s*)$', s).group(1)
+     stripped_string = s.strip()
+     modified_string = char_to_insert + stripped_string + char_to_insert
+     final_string = leading_whitespace + modified_string + trailing_whitespace
+     return final_string
+
+
+ def merge_spans(blocks):
+     merged_blocks = []
+     for page in blocks:
+         page_blocks = []
+         for blocknum, block in enumerate(page.blocks):
+             block_lines = []
+             block_types = []
+             for linenum, line in enumerate(block.lines):
+                 line_text = ""
+                 if len(line.spans) == 0:
+                     continue
+                 fonts = []
+                 for i, span in enumerate(line.spans):
+                     font = span.font.lower()
+                     next_font = None
+                     if len(line.spans) > i + 1:
+                         next_font = line.spans[i + 1].font.lower()
+                     fonts.append(font)
+                     block_types.append(span.block_type)
+                     span_text = span.text
+                     if "ital" in font and (not next_font or "ital" not in next_font):
+                         span_text = surround_text(span_text, "*")
+                     elif "bold" in font and (not next_font or "bold" not in next_font):
+                         span_text = surround_text(span_text, "**")
+                     line_text += span_text
+                 block_lines.append(MergedLine(
+                     text=line_text,
+                     fonts=fonts,
+                     bbox=line.bbox
+                 ))
+             if len(block_lines) > 0:
+                 page_blocks.append(MergedBlock(
+                     lines=block_lines,
+                     pnum=block.pnum,
+                     bbox=block.bbox,
+                     block_types=block_types
+                 ))
+         merged_blocks.append(page_blocks)
+
+     return merged_blocks
+
+
+ def block_surround(text, block_type):
+     dot_pattern = re.compile(r'(\s*\.\s*){4,}')
+     dot_multiline_pattern = re.compile(r'.*(\s*\.\s*){4,}.*', re.DOTALL)
+     match block_type:
+         case "Section-header":
+             if not text.startswith("#"):
+                 text = "\n## " + text.strip() + "\n"
+         case "Title":
+             if not text.startswith("#"):
+                 text = "# " + text.strip() + "\n"
+         case "Table" if dot_multiline_pattern.match(text):
+             text = dot_pattern.sub(' ', text)
+         case "List-item":
+             pass
+         case "Code":
+             text = "```\n" + text + "\n```\n"
+         case _:
+             pass
+     return text
+
+
+ def line_separator(line1, line2, block_type, is_continuation=False):
+     # Remove hyphen in current line if next line and current line appear to be joined
+     hyphen_pattern = re.compile(r'.*[a-z][-]\s?$', re.DOTALL)
+     if line1 and hyphen_pattern.match(line1) and re.match(r"^[a-z]", line2):
+         # Split on — or - from the right
+         line1 = re.split(r"[-—]\s?$", line1)[0]
+         return line1.rstrip() + line2.lstrip()
+
+     lowercase_pattern1 = re.compile(r'.*[a-z,]\s?$', re.DOTALL)
+     lowercase_pattern2 = re.compile(r'^\s?[A-Za-z]', re.DOTALL)
+     end_pattern = re.compile(r'.*[.?!]\s?$', re.DOTALL)
+
+     if block_type in ["Title", "Section-header"]:
+         return line1.rstrip() + " " + line2.lstrip()
+     elif lowercase_pattern1.match(line1) and lowercase_pattern2.match(line2):
+         return line1.rstrip() + " " + line2.lstrip()
+     elif is_continuation:
+         return line1.rstrip() + " " + line2.lstrip()
+     elif block_type == "Text" and end_pattern.match(line1):
+         return line1 + "\n\n" + line2
+     elif block_type == "Formula":
+         return line1 + " " + line2
+     else:
+         return line1 + "\n" + line2
+
+
+ def block_separator(line1, line2, block_type1, block_type2):
+     sep = "\n"
+     if block_type1 == "Text":
+         sep = "\n\n"
+
+     return sep + line2
+
+
+ def merge_lines(blocks, page_blocks: List[Page]):
+     text_blocks = []
+     prev_type = None
+     prev_line = None
+     block_text = ""
+     block_type = ""
+     common_line_heights = [p.get_line_height_stats() for p in page_blocks]
+     for page in blocks:
+         for block in page:
+             block_type = block.most_common_block_type()
+             if block_type != prev_type and prev_type:
+                 text_blocks.append(
+                     FullyMergedBlock(
+                         text=block_surround(block_text, prev_type),
+                         block_type=prev_type
+                     )
+                 )
+                 block_text = ""
+
+             prev_type = block_type
+             common_line_height = common_line_heights[block.pnum].most_common(1)[0][0]
+             for i, line in enumerate(block.lines):
+                 line_height = line.bbox[3] - line.bbox[1]
+                 prev_line_height = prev_line.bbox[3] - prev_line.bbox[1] if prev_line else 0
+                 prev_line_x = prev_line.bbox[0] if prev_line else 0
+                 prev_line = line
+                 is_continuation = line_height == prev_line_height and line.bbox[0] == prev_line_x
+                 if block_text:
+                     block_text = line_separator(block_text, line.text, block_type, is_continuation)
+                 else:
+                     block_text = line.text
+
+     # Append the final block
+     text_blocks.append(
+         FullyMergedBlock(
+             text=block_surround(block_text, prev_type),
+             block_type=block_type
+         )
+     )
+     return text_blocks
+
+
+ def get_full_text(text_blocks):
+     full_text = ""
+     prev_block = None
+     for block in text_blocks:
+         if prev_block:
+             full_text += block_separator(prev_block.text, block.text, prev_block.block_type, block.block_type)
+         else:
+             full_text += block.text
+         prev_block = block
+     return full_text
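
A few invented strings showing how the joining rules above behave:

```python
from marker.markdown import line_separator, surround_text

# De-hyphenation: a line ending in "-" joined with a lowercase continuation
print(line_separator("convolu-", "tional networks", "Text"))
# convolutional networks

# A sentence-ending line in a Text block starts a new paragraph
print(line_separator("First sentence.", "Second sentence.", "Text"))
# First sentence.
#
# Second sentence.

# Bold markers wrap the stripped text; surrounding whitespace stays outside them
print(surround_text("  heading ", "**"))  # "  **heading** "
```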
marker/schema.py ADDED
@@ -0,0 +1,176 @@
+ from collections import Counter
+ from typing import List
+
+ from pydantic import BaseModel, field_validator
+
+ from marker.bbox import boxes_intersect_pct, multiple_boxes_intersect
+ from marker.settings import settings
+
+
+ def find_span_type(span, page_blocks):
+     block_type = "Text"
+     for block in page_blocks:
+         if boxes_intersect_pct(span.bbox, block.bbox):
+             block_type = block.block_type
+             break
+     return block_type
+
+
+ class BboxElement(BaseModel):
+     bbox: List[float]
+
+     @field_validator('bbox')
+     @classmethod
+     def check_4_elements(cls, v: List[float]) -> List[float]:
+         if len(v) != 4:
+             raise ValueError('bbox must have 4 elements')
+         return v
+
+
+ class BlockType(BboxElement):
+     block_type: str
+
+
+ class Span(BboxElement):
+     text: str
+     span_id: str
+     font: str
+     color: int
+     ascender: float | None = None
+     descender: float | None = None
+     block_type: str | None = None
+     selected: bool = True
+
+
+ class Line(BboxElement):
+     spans: List[Span]
+
+     @property
+     def prelim_text(self):
+         return "".join([s.text for s in self.spans])
+
+     @property
+     def start(self):
+         return self.spans[0].bbox[0]
+
+
+ class Block(BboxElement):
+     lines: List[Line]
+     pnum: int
+
+     @property
+     def prelim_text(self):
+         return "\n".join([l.prelim_text for l in self.lines])
+
+     def contains_equation(self, equation_boxes=None):
+         conditions = [s.block_type == "Formula" for l in self.lines for s in l.spans]
+         if equation_boxes:
+             conditions += [multiple_boxes_intersect(self.bbox, equation_boxes)]
+         return any(conditions)
+
+     def filter_spans(self, bad_span_ids):
+         new_lines = []
+         for line in self.lines:
+             new_spans = []
+             for span in line.spans:
+                 if span.span_id not in bad_span_ids:
+                     new_spans.append(span)
+             line.spans = new_spans
+             if len(new_spans) > 0:
+                 new_lines.append(line)
+         self.lines = new_lines
+
+     def filter_bad_span_types(self, block_types: List[BlockType]):
+         bad_spans = [b.bbox for b in block_types if b.block_type in settings.BAD_SPAN_TYPES]
+         new_lines = []
+         for line in self.lines:
+             new_spans = []
+             for span in line.spans:
+                 if not multiple_boxes_intersect(span.bbox, bad_spans):
+                     new_spans.append(span)
+             line.spans = new_spans
+             if len(new_spans) > 0:
+                 new_lines.append(line)
+         self.lines = new_lines
+
+     def most_common_block_type(self):
+         counter = Counter([s.block_type for l in self.lines for s in l.spans])
+         return counter.most_common(1)[0][0]
+
+     def set_block_type(self, block_type):
+         for line in self.lines:
+             for span in line.spans:
+                 span.block_type = block_type
+
+
+ class Page(BaseModel):
+     blocks: List[Block]
+     pnum: int
+
+     def get_nonblank_lines(self):
+         lines = self.get_all_lines()
+         nonblank_lines = [l for l in lines if l.prelim_text.strip()]
+         return nonblank_lines
+
+     def get_all_lines(self):
+         lines = [l for b in self.blocks for l in b.lines]
+         return lines
+
+     def get_nonblank_spans(self) -> List[Span]:
+         lines = [l for b in self.blocks for l in b.lines]
+         spans = [s for l in lines for s in l.spans if s.text.strip()]
+         return spans
+
+     def add_block_types(self, page_block_types):
+         if len(page_block_types) != len(self.get_all_lines()):
+             print(f"Warning: Number of detected lines {len(page_block_types)} does not match number of lines {len(self.get_all_lines())}")
+
+         i = 0
+         for block in self.blocks:
+             for line in block.lines:
+                 if i < len(page_block_types):
+                     line_block_type = page_block_types[i].block_type
+                 else:
+                     line_block_type = "Text"
+                 i += 1
+                 for span in line.spans:
+                     span.block_type = line_block_type
+
+     def get_font_stats(self):
+         fonts = [s.font for s in self.get_nonblank_spans()]
+         font_counts = Counter(fonts)
+         return font_counts
+
+     def get_line_height_stats(self):
+         heights = [l.bbox[3] - l.bbox[1] for l in self.get_nonblank_lines()]
+         height_counts = Counter(heights)
+         return height_counts
+
+     def get_line_start_stats(self):
+         starts = [l.bbox[0] for l in self.get_nonblank_lines()]
+         start_counts = Counter(starts)
+         return start_counts
+
+
+ class MergedLine(BboxElement):
+     text: str
+     fonts: List[str]
+
+     def most_common_font(self):
+         counter = Counter(self.fonts)
+         return counter.most_common(1)[0][0]
+
+
+ class MergedBlock(BboxElement):
+     lines: List[MergedLine]
+     pnum: int
+     block_types: List[str]
+
+     def most_common_block_type(self):
+         counter = Counter(self.block_types)
+         return counter.most_common(1)[0][0]
+
+
+ class FullyMergedBlock(BaseModel):
+     text: str
+     block_type: str
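
A minimal invented example of the schema objects above (coordinates and text are made up):

```python
from marker.schema import Span, Line, Block

bbox = [72.0, 100.0, 200.0, 112.0]
span = Span(text="Hello world", bbox=bbox, span_id="0_0",
            font="Times-Roman_serifed_proportional", color=0)
line = Line(spans=[span], bbox=bbox)
block = Block(lines=[line], bbox=bbox, pnum=0)

print(block.prelim_text)               # Hello world
print(block.most_common_block_type())  # None until block types are assigned
block.set_block_type("Text")
print(block.most_common_block_type())  # Text
```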
marker/segmentation.py ADDED
@@ -0,0 +1,139 @@
+ from typing import List
+
+ from transformers import LayoutLMv3ForTokenClassification
+
+ from marker.bbox import unnormalize_box
+ from transformers.models.layoutlmv3.image_processing_layoutlmv3 import normalize_box
+ import io
+ from PIL import Image
+ from transformers import LayoutLMv3Processor
+ import numpy as np
+ from marker.settings import settings
+ from marker.schema import Page, BlockType
+ import torch
+
+ processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
+
+ CHUNK_KEYS = ["input_ids", "attention_mask", "bbox", "offset_mapping"]
+ NO_CHUNK_KEYS = ["pixel_values"]
+ MODEL_MAX_LEN = 512
+ CHUNK_OVERLAP = 128
+
+
+ def load_model():
+     model = LayoutLMv3ForTokenClassification.from_pretrained("Kwan0/layoutlmv3-base-finetune-DocLayNet-100k").to(settings.TORCH_DEVICE)
+     model.config.id2label = {
+         0: "Caption",
+         1: "Footnote",
+         2: "Formula",
+         3: "List-item",
+         4: "Page-footer",
+         5: "Page-header",
+         6: "Picture",
+         7: "Section-header",
+         8: "Table",
+         9: "Text",
+         10: "Title"
+     }
+
+     model.config.label2id = {v: k for k, v in model.config.id2label.items()}
+     return model
+
+
+ layoutlm_model = load_model()
+
+
+ def detect_all_block_types(doc, blocks: List[Page]):
+     block_types = []
+     for pnum, page in enumerate(doc):
+         page_blocks = blocks[pnum]
+         predictions = detect_page_block_types(page, page_blocks)
+         block_types.append(predictions)
+     return block_types
+
+
+ def detect_page_block_types(page, page_blocks: Page):
+     page_box = page.bound()
+     pwidth = page_box[2] - page_box[0]
+     pheight = page_box[3] - page_box[1]
+
+     pix = page.get_pixmap(dpi=400)
+     png = pix.pil_tobytes(format="PNG")
+     png_image = Image.open(io.BytesIO(png))
+     rgb_image = png_image.convert('RGB')
+
+     lines = page_blocks.get_all_lines()
+     boxes = [s.bbox for s in lines]
+     text = [s.prelim_text for s in lines]
+
+     predictions = make_predictions(rgb_image, text, boxes, pwidth, pheight)
+     return predictions
+
+
+ def find_first_false(lst, start_idx):
+     # Traverse the list to the left from start_idx
+     for idx in range(start_idx, -1, -1):
+         if not lst[idx]:
+             return idx
+
+     return 0  # Return 0 if no false found (aka, no lines)
+
+
+ def get_provisional_boxes(pred, box, is_subword, start_idx=0):
+     prov_predictions = [pred_ for idx, pred_ in enumerate(pred) if not is_subword[idx]][start_idx:]
+     prov_boxes = [box_ for idx, box_ in enumerate(box) if not is_subword[idx]][start_idx:]
+     return prov_predictions, prov_boxes
+
+
+ def make_predictions(rgb_image, text, boxes, pwidth, pheight) -> List[BlockType]:
+     # Normalize boxes for model (scale to 1000x1000)
+     boxes = [normalize_box(box, pwidth, pheight) for box in boxes]
+     encoding = processor(rgb_image, text=text, boxes=boxes, return_offsets_mapping=True, return_tensors="pt", truncation=True, stride=CHUNK_OVERLAP, padding="max_length", max_length=MODEL_MAX_LEN, return_overflowing_tokens=True)
+     offset_mapping = encoding.pop('offset_mapping')
+     overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')
+
+     # change the shape of pixel values
+     x = []
+     for i in range(0, len(encoding['pixel_values'])):
+         x.append(encoding['pixel_values'][i])
+     x = torch.stack(x)
+     encoding['pixel_values'] = x
+
+     with torch.no_grad():
+         encoding = encoding.to(settings.TORCH_DEVICE)
+         outputs = layoutlm_model(**encoding)
+
+     logits = outputs.logits
+     # We take the highest score for each token, using argmax. This serves as the predicted label for each token.
+     predictions = logits.argmax(-1).squeeze().tolist()
+     token_boxes = encoding.bbox.squeeze().tolist()
+
+     if len(token_boxes) == MODEL_MAX_LEN:
+         predictions = [predictions]
+         token_boxes = [token_boxes]
+
+     predicted_block_types = []
+
+     for i, (pred, box, mapped) in enumerate(zip(predictions, token_boxes, offset_mapping)):
+         is_subword = np.array(mapped.squeeze().tolist())[:, 0] != 0
+         overlap_adjust = 0
+         if i > 0:
+             overlap_adjust = 1 + CHUNK_OVERLAP - sum(is_subword[:1 + CHUNK_OVERLAP])
+
+         prov_predictions, prov_boxes = get_provisional_boxes(pred, box, is_subword, overlap_adjust)
+
+         for prov_box, prov_prediction in zip(prov_boxes, prov_predictions):
+             if prov_box == [0, 0, 0, 0]:
+                 continue
+             unnorm_box = unnormalize_box(prov_box, pwidth, pheight)
+             block_type = BlockType(
+                 block_type=layoutlm_model.config.id2label[prov_prediction],
+                 bbox=unnorm_box
+             )
+
+             # Sometimes blocks will cross chunks, unclear why
+             if len(predicted_block_types) == 0 or unnorm_box != predicted_block_types[-1].bbox:
+                 predicted_block_types.append(block_type)
+
+     return predicted_block_types
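
To illustrate the subword filtering used when decoding the token-level output, here is a tiny invented example (note that importing `marker.segmentation` downloads the LayoutLMv3 weights, so treat this as a sketch):

```python
from marker.segmentation import get_provisional_boxes

# Invented token-level output: 5 tokens, where tokens 2 and 4 are subword pieces
pred = [9, 7, 7, 10, 10]   # label ids per token (9 = Text, 7 = Section-header, 10 = Title)
box = [[1, 1, 2, 2], [3, 3, 4, 4], [3, 3, 4, 4], [5, 5, 6, 6], [5, 5, 6, 6]]
is_subword = [False, False, True, False, True]

# Subword tokens are dropped so each word keeps a single prediction and box
print(get_provisional_boxes(pred, box, is_subword))
# ([9, 7, 10], [[1, 1, 2, 2], [3, 3, 4, 4], [5, 5, 6, 6]])
```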
marker/settings.py ADDED
@@ -0,0 +1,20 @@
+ import os
+ from typing import Optional, List
+
+ from dotenv import find_dotenv
+ from pydantic_settings import BaseSettings
+
+
+ class Settings(BaseSettings):
+     # Path settings
+     DPI: int = 400
+     INVALID_CHARS: List[str] = [chr(0xfffd), "~", chr(65533), "↵"]
+     TORCH_DEVICE: str = "cpu"
+     TESSDATA_PREFIX: str = ""
+     BAD_SPAN_TYPES: List[str] = ["Caption", "Footnote", "Page-footer", "Page-header", "Picture"]
+
+     class Config:
+         env_file = find_dotenv("local.env")
+
+
+ settings = Settings()
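
Since `Settings` is a pydantic `BaseSettings` class with `local.env` as its env file, values can be overridden through environment variables or that file. A hypothetical sketch (the path and device are examples, not project defaults):

```python
import os

# These names mirror the fields on Settings above; the values are only examples.
os.environ["TESSDATA_PREFIX"] = "/usr/share/tesseract-ocr/5/tessdata"
os.environ["TORCH_DEVICE"] = "cuda"

from marker.settings import settings  # Settings() reads the environment at import

print(settings.TORCH_DEVICE)  # cuda
print(settings.DPI)           # 400 (default)
```

The same two keys could equally be placed in a `local.env` file next to the code.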
parse.py ADDED
@@ -0,0 +1,49 @@
+ import fitz as pymupdf
+ from marker.extract_text import get_text_blocks
+ from marker.headers import categorize_blocks, filter_header_footer
+ from marker.equations import replace_equations
+ from marker.segmentation import detect_all_block_types
+ from marker.code import identify_code_blocks, indent_blocks
+ from marker.markdown import merge_spans, merge_lines, get_full_text
+ from marker.schema import Page, BlockType
+ from typing import List
+ from copy import deepcopy
+
+
+ def annotate_spans(blocks: List[Page], block_types: List[BlockType]):
+     for i, page in enumerate(blocks):
+         page_block_types = block_types[i]
+         page.add_block_types(page_block_types)
+
+
+ if __name__ == "__main__":
+     fname = "test_data/thinkpython.pdf"
+     doc = pymupdf.open(fname)
+     blocks, toc = get_text_blocks(doc)
+
+     block_types = detect_all_block_types(doc, blocks)
+
+     filtered = deepcopy(blocks)
+     annotate_spans(filtered, block_types)
+     identify_code_blocks(filtered)
+     indent_blocks(filtered)
+
+     bad_span_ids = categorize_blocks(blocks)
+     bad_span_ids += filter_header_footer(blocks)
+
+     # Copy to avoid changing original data
+
+     for page in filtered:
+         for block in page.blocks:
+             block.filter_spans(bad_span_ids)
+             block.filter_bad_span_types(block_types[page.pnum])
+
+     filtered = replace_equations(doc, filtered, block_types)
+
+     # Copy to avoid changing original data
+     merged_lines = merge_spans(filtered)
+     text_blocks = merge_lines(merged_lines, filtered)
+     full_text = get_full_text(text_blocks)
+
+     with open("test_data/thinkpython.md", "w+") as f:
+         f.write(full_text)
poetry.lock ADDED
The diff for this file is too large to render.
 
pyproject.toml ADDED
@@ -0,0 +1,28 @@
+ [tool.poetry]
+ name = "marker"
+ version = "0.1.0"
+ description = ""
+ authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
+ readme = "README.md"
+
+ [tool.poetry.dependencies]
+ python = ">=3.9,<3.13"
+ scikit-learn = "^1.3.2"
+ Pillow = "^10.1.0"
+ pytesseract = "^0.3.10"
+ PyMuPDF = "^1.23.5"
+ pymupdf-fonts = "^1.0.5"
+ pydantic = "^2.4.2"
+ pydantic-settings = "^2.0.3"
+ nougat-ocr = "^0.1.17"
+ transformers = "^4.34.1"
+ torch = "^2.1.0"
+ numpy = "^1.26.1"
+ python-dotenv = "^1.0.0"
+
+ [tool.poetry.group.dev.dependencies]
+ jupyter = "^1.0.0"
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ Pillow==9.5.0
+ layoutparser
+ torchvision
+ git+https://github.com/facebookresearch/detectron2.git@v0.5#egg=detectron2
+ pytesseract
+ pymupdf
+ pymupdf-fonts
+ pydantic
+ pydantic-settings
+ nougat
+ transformers
+ scikit-learn