Vik Paruchuri committed on
Commit
a800c59
·
1 Parent(s): d440114

Flatten pdfs, fix page separators

Browse files
README.md CHANGED
@@ -212,7 +212,7 @@ Set `DEBUG=true` to save data to the `debug` subfolder in the marker root direct
212
  These settings can improve/change output quality:
213
 
214
  - `OCR_ALL_PAGES` will force OCR across the document. Many PDFs have bad text embedded due to older OCR engines being used.
215
- - `PAGINATE_OUTPUT` will put a horizontal rule between pages. Default: False.
216
  - `EXTRACT_IMAGES` will extract images and save separately. Default: True.
217
  - `BAD_SPAN_TYPES` specifies layout blocks to remove from the markdown output.
218
 
 
212
  These settings can improve/change output quality:
213
 
214
  - `OCR_ALL_PAGES` will force OCR across the document. Many PDFs have bad text embedded due to older OCR engines being used.
215
+ - `PAGINATE_OUTPUT` will put a horizontal rule between pages. Default: False. The horizontal rule will be `\n\n`, then the page number wrapped in curly braces (`{PAGE_NUMBER}`, for example `{3}`), then 48 single dashes `-`, then `\n\n`. The part after the page number (the dashes and trailing newlines) can be configured via the `PAGE_SEPARATOR` setting.
216
  - `EXTRACT_IMAGES` will extract images and save separately. Default: True.
217
  - `BAD_SPAN_TYPES` specifies layout blocks to remove from the markdown output.
218
 
marker/pdf/extract_text.py CHANGED
@@ -92,7 +92,7 @@ def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Opt
92
 
93
  page_range = range(start_page, start_page + max_pages)
94
 
95
- char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=False, workers=settings.PDFTEXT_CPU_WORKERS)
96
  marker_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(char_blocks)]
97
 
98
  return marker_blocks, toc
 
92
 
93
  page_range = range(start_page, start_page + max_pages)
94
 
95
+ char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=False, workers=settings.PDFTEXT_CPU_WORKERS, flatten_pdf=settings.FLATTEN_PDF)
96
  marker_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(char_blocks)]
97
 
98
  return marker_blocks, toc
marker/postprocessors/markdown.py CHANGED
@@ -64,11 +64,19 @@ def merge_spans(pages: List[Page]) -> List[List[MergedBlock]]:
64
  if len(block_lines) > 0:
65
  page_blocks.append(MergedBlock(
66
  lines=block_lines,
67
- pnum=block.pnum,
68
  bbox=block.bbox,
69
  block_type=block.block_type,
70
  heading_level=block.heading_level
71
  ))
 
 
 
 
 
 
 
 
72
  merged_blocks.append(page_blocks)
73
 
74
  return merged_blocks
@@ -139,9 +147,6 @@ def block_separator(prev_block: FullyMergedBlock, block: FullyMergedBlock):
139
  if prev_block.block_type == "Text":
140
  sep = "\n\n"
141
 
142
- if prev_block.page_end:
143
- sep = settings.PAGE_SEPARATOR
144
-
145
  return sep + block.text
146
 
147
 
@@ -152,8 +157,30 @@ def merge_lines(blocks: List[List[MergedBlock]], max_block_gap=15):
152
  block_text = ""
153
  block_type = ""
154
  prev_heading_level = None
 
155
 
156
  for idx, page in enumerate(blocks):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  for block in page:
158
  block_type = block.block_type
159
  if (block_type != prev_type and prev_type) or (block.heading_level != prev_heading_level and prev_heading_level):
@@ -161,13 +188,15 @@ def merge_lines(blocks: List[List[MergedBlock]], max_block_gap=15):
161
  FullyMergedBlock(
162
  text=block_surround(block_text, prev_type, prev_heading_level),
163
  block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE,
164
- page_end=False
 
165
  )
166
  )
167
  block_text = ""
168
 
169
  prev_type = block_type
170
  prev_heading_level = block.heading_level
 
171
  # Join lines in the block together properly
172
  for i, line in enumerate(block.lines):
173
  line_height = line.bbox[3] - line.bbox[1]
@@ -181,28 +210,17 @@ def merge_lines(blocks: List[List[MergedBlock]], max_block_gap=15):
181
  else:
182
  block_text = line.text
183
 
184
- # Force blocks to end at page boundaries
185
- if settings.PAGINATE_OUTPUT:
186
- text_blocks.append(
187
- FullyMergedBlock(
188
- text=block_surround(block_text, prev_type, prev_heading_level),
189
- block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE,
190
- page_end=True
191
- )
192
- )
193
- block_text = ""
194
-
195
-
196
  # Append the final block
197
  text_blocks.append(
198
  FullyMergedBlock(
199
  text=block_surround(block_text, prev_type, prev_heading_level),
200
  block_type=block_type if block_type else settings.DEFAULT_BLOCK_TYPE,
201
- page_end=False
 
202
  )
203
  )
204
 
205
- text_blocks = [block for block in text_blocks if block.text.strip()]
206
  return text_blocks
207
 
208
 
@@ -210,7 +228,9 @@ def get_full_text(text_blocks):
210
  full_text = ""
211
  prev_block = None
212
  for block in text_blocks:
213
- if prev_block:
 
 
214
  full_text += block_separator(prev_block, block)
215
  else:
216
  full_text += block.text
 
64
  if len(block_lines) > 0:
65
  page_blocks.append(MergedBlock(
66
  lines=block_lines,
67
+ pnum=page.pnum,
68
  bbox=block.bbox,
69
  block_type=block.block_type,
70
  heading_level=block.heading_level
71
  ))
72
+ if len(page_blocks) == 0:
73
+ page_blocks.append(MergedBlock(
74
+ lines=[],
75
+ pnum=page.pnum,
76
+ bbox=page.bbox,
77
+ block_type="Text",
78
+ heading_level=None
79
+ ))
80
  merged_blocks.append(page_blocks)
81
 
82
  return merged_blocks
 
147
  if prev_block.block_type == "Text":
148
  sep = "\n\n"
149
 
 
 
 
150
  return sep + block.text
151
 
152
 
 
157
  block_text = ""
158
  block_type = ""
159
  prev_heading_level = None
160
+ pnum = None
161
 
162
  for idx, page in enumerate(blocks):
163
+ # Insert pagination at every page boundary
164
+ if settings.PAGINATE_OUTPUT:
165
+ if block_text:
166
+ text_blocks.append(
167
+ FullyMergedBlock(
168
+ text=block_surround(block_text, prev_type, prev_heading_level),
169
+ block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE,
170
+ page_start=False,
171
+ pnum=pnum
172
+ )
173
+ )
174
+ block_text = ""
175
+ text_blocks.append(
176
+ FullyMergedBlock(
177
+ text="",
178
+ block_type="Text",
179
+ page_start=True,
180
+ pnum=page[0].pnum
181
+ )
182
+ )
183
+
184
  for block in page:
185
  block_type = block.block_type
186
  if (block_type != prev_type and prev_type) or (block.heading_level != prev_heading_level and prev_heading_level):
 
188
  FullyMergedBlock(
189
  text=block_surround(block_text, prev_type, prev_heading_level),
190
  block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE,
191
+ page_start=False,
192
+ pnum=block.pnum
193
  )
194
  )
195
  block_text = ""
196
 
197
  prev_type = block_type
198
  prev_heading_level = block.heading_level
199
+ pnum = block.pnum
200
  # Join lines in the block together properly
201
  for i, line in enumerate(block.lines):
202
  line_height = line.bbox[3] - line.bbox[1]
 
210
  else:
211
  block_text = line.text
212
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  # Append the final block
214
  text_blocks.append(
215
  FullyMergedBlock(
216
  text=block_surround(block_text, prev_type, prev_heading_level),
217
  block_type=block_type if block_type else settings.DEFAULT_BLOCK_TYPE,
218
+ page_start=False,
219
+ pnum=pnum
220
  )
221
  )
222
 
223
+ text_blocks = [block for block in text_blocks if (block.text.strip() or block.page_start)]
224
  return text_blocks
225
 
226
 
 
228
  full_text = ""
229
  prev_block = None
230
  for block in text_blocks:
231
+ if block.page_start:
232
+ full_text += "\n\n{" + str(block.pnum) + "}" + settings.PAGE_SEPARATOR
233
+ elif prev_block:
234
  full_text += block_separator(prev_block, block)
235
  else:
236
  full_text += block.text
marker/schema/merged.py CHANGED
@@ -25,4 +25,5 @@ class MergedBlock(BboxElement):
25
  class FullyMergedBlock(BaseModel):
26
  text: str
27
  block_type: str
28
- page_end: bool
 
 
25
  class FullyMergedBlock(BaseModel):
26
  text: str
27
  block_type: str
28
+ page_start: bool
29
+ pnum: int | None
marker/settings.py CHANGED
@@ -14,6 +14,7 @@ class Settings(BaseSettings):
14
  EXTRACT_IMAGES: bool = True # Extract images from pdfs and save them
15
  PAGINATE_OUTPUT: bool = False # Paginate output markdown
16
  BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 
17
 
18
  @computed_field
19
  @property
@@ -88,7 +89,7 @@ class Settings(BaseSettings):
88
  HEADING_DEFAULT_LEVEL: int = 2
89
 
90
  # Output
91
- PAGE_SEPARATOR: str = "\n\n" + "-" * 48 + "\n\n"
92
 
93
  # Debug
94
  DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data")
 
14
  EXTRACT_IMAGES: bool = True # Extract images from pdfs and save them
15
  PAGINATE_OUTPUT: bool = False # Paginate output markdown
16
  BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
17
+ FLATTEN_PDF: bool = True # Pull form field values into the PDF before converting to markdown
18
 
19
  @computed_field
20
  @property
 
89
  HEADING_DEFAULT_LEVEL: int = 2
90
 
91
  # Output
92
+ PAGE_SEPARATOR: str = "-" * 48 + "\n\n"
93
 
94
  # Debug
95
  DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data")
marker/tables/table.py CHANGED
@@ -73,6 +73,7 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
73
  fname,
74
  doc_idxs,
75
  [hr for i, hr in enumerate(img_sizes) if i in table_idxs],
 
76
  )
77
  text_lines = []
78
  out_img_sizes = []
 
73
  fname,
74
  doc_idxs,
75
  [hr for i, hr in enumerate(img_sizes) if i in table_idxs],
76
+ # Add flatten pdf here
77
  )
78
  text_lines = []
79
  out_img_sizes = []
poetry.lock CHANGED
@@ -4175,13 +4175,13 @@ snowflake = ["snowflake-connector-python (>=2.8.0)", "snowflake-snowpark-python[
4175
 
4176
  [[package]]
4177
  name = "surya-ocr"
4178
- version = "0.6.10"
4179
  description = "OCR, layout, reading order, and table recognition in 90+ languages"
4180
  optional = false
4181
  python-versions = "<4.0,>=3.10"
4182
  files = [
4183
- {file = "surya_ocr-0.6.10-py3-none-any.whl", hash = "sha256:e22038d226d73bead781abda761a3813bacb1261f47996a00f5679686e86434e"},
4184
- {file = "surya_ocr-0.6.10.tar.gz", hash = "sha256:7867dd02242a67e8d632d3f1343c62eb16e2068a98e36612dce1ec40065ff5b5"},
4185
  ]
4186
 
4187
  [package.dependencies]
@@ -5076,4 +5076,4 @@ propcache = ">=0.2.0"
5076
  [metadata]
5077
  lock-version = "2.0"
5078
  python-versions = "^3.10"
5079
- content-hash = "380f95b398ed6864345aa1ed6d5357bfef0045cfee6bf32450d71e6e05ec079f"
 
4175
 
4176
  [[package]]
4177
  name = "surya-ocr"
4178
+ version = "0.6.11"
4179
  description = "OCR, layout, reading order, and table recognition in 90+ languages"
4180
  optional = false
4181
  python-versions = "<4.0,>=3.10"
4182
  files = [
4183
+ {file = "surya_ocr-0.6.11-py3-none-any.whl", hash = "sha256:9b0c8638feda3f0f9db73a2ebceccdcbf5fdbb4cef0102b8d19837c455799347"},
4184
+ {file = "surya_ocr-0.6.11.tar.gz", hash = "sha256:d1415fcceae30cd44b08e8012d810efef51538dcf8d902fad035189b6219ee48"},
4185
  ]
4186
 
4187
  [package.dependencies]
 
5076
  [metadata]
5077
  lock-version = "2.0"
5078
  python-versions = "^3.10"
5079
+ content-hash = "e60697c44fdc30b1d5b48e5f4077ac4c65ff5844a49a4057f236dc6933a56dbb"
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
  [tool.poetry]
2
  name = "marker-pdf"
3
- version = "0.3.8"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
@@ -32,7 +32,7 @@ tabulate = "^0.9.0"
32
  ftfy = "^6.1.1"
33
  texify = "^0.2.0"
34
  rapidfuzz = "^3.8.1"
35
- surya-ocr = "^0.6.10"
36
  filetype = "^1.2.0"
37
  regex = "^2024.4.28"
38
  pdftext = "^0.3.17"
 
1
  [tool.poetry]
2
  name = "marker-pdf"
3
+ version = "0.3.9"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
 
32
  ftfy = "^6.1.1"
33
  texify = "^0.2.0"
34
  rapidfuzz = "^3.8.1"
35
+ surya-ocr = "^0.6.11"
36
  filetype = "^1.2.0"
37
  regex = "^2024.4.28"
38
  pdftext = "^0.3.17"