Vik Paruchuri
committed on
Commit
·
d440114
1
Parent(s):
e3aa5ff
Fix table recognition
Browse files
- marker/tables/table.py +11 -5
- pyproject.toml +1 -1
marker/tables/table.py
CHANGED
|
@@ -31,12 +31,12 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
|
|
| 31 |
table_counts = []
|
| 32 |
table_bboxes = []
|
| 33 |
img_sizes = []
|
|
|
|
| 34 |
|
| 35 |
-
for page in pages:
|
| 36 |
-
pnum = page.pnum
|
| 37 |
# The bbox for the entire table
|
| 38 |
bbox = [b.bbox for b in page.layout.bboxes if b.label == "Table"]
|
| 39 |
-
highres_img = render_image(doc[pnum], dpi=settings.SURYA_TABLE_DPI)
|
| 40 |
|
| 41 |
page_table_imgs = []
|
| 42 |
page_bboxes = []
|
|
@@ -48,11 +48,13 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
|
|
| 48 |
if len(bbox) == 0:
|
| 49 |
table_counts.append(0)
|
| 50 |
img_sizes.append(None)
|
|
|
|
| 51 |
continue
|
| 52 |
|
| 53 |
# Number of tables per page
|
| 54 |
table_counts.append(len(bbox))
|
| 55 |
img_sizes.append(highres_img.size)
|
|
|
|
| 56 |
|
| 57 |
for bb in bbox:
|
| 58 |
highres_bb = rescale_bbox(page.layout.image_bbox, [0, 0, highres_img.size[0], highres_img.size[1]], bb)
|
|
@@ -62,10 +64,14 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
|
|
| 62 |
table_imgs.extend(page_table_imgs)
|
| 63 |
table_bboxes.extend(page_bboxes)
|
| 64 |
|
| 65 |
-
table_idxs = [i for i, tc in enumerate(table_counts) if tc > 0]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
sel_text_lines = get_page_text_lines(
|
| 67 |
fname,
|
| 68 |
-
table_idxs,
|
| 69 |
[hr for i, hr in enumerate(img_sizes) if i in table_idxs],
|
| 70 |
)
|
| 71 |
text_lines = []
|
|
|
|
| 31 |
table_counts = []
|
| 32 |
table_bboxes = []
|
| 33 |
img_sizes = []
|
| 34 |
+
pnums = []
|
| 35 |
|
| 36 |
+
for page_idx, page in enumerate(pages):
|
|
|
|
| 37 |
# The bbox for the entire table
|
| 38 |
bbox = [b.bbox for b in page.layout.bboxes if b.label == "Table"]
|
| 39 |
+
highres_img = render_image(doc[page_idx], dpi=settings.SURYA_TABLE_DPI)
|
| 40 |
|
| 41 |
page_table_imgs = []
|
| 42 |
page_bboxes = []
|
|
|
|
| 48 |
if len(bbox) == 0:
|
| 49 |
table_counts.append(0)
|
| 50 |
img_sizes.append(None)
|
| 51 |
+
pnums.append(page.pnum)
|
| 52 |
continue
|
| 53 |
|
| 54 |
# Number of tables per page
|
| 55 |
table_counts.append(len(bbox))
|
| 56 |
img_sizes.append(highres_img.size)
|
| 57 |
+
pnums.append(page.pnum)
|
| 58 |
|
| 59 |
for bb in bbox:
|
| 60 |
highres_bb = rescale_bbox(page.layout.image_bbox, [0, 0, highres_img.size[0], highres_img.size[1]], bb)
|
|
|
|
| 64 |
table_imgs.extend(page_table_imgs)
|
| 65 |
table_bboxes.extend(page_bboxes)
|
| 66 |
|
| 67 |
+
# The page number in doc and in the original document are not the same
|
| 68 |
+
# Doc has had pages removed from the start to align to start_page
|
| 69 |
+
# This corrects for that
|
| 70 |
+
doc_idxs = [pnum for pnum, tc in zip(pnums, table_counts) if tc > 0]
|
| 71 |
+
table_idxs = [i for i, tc in enumerate(table_counts) if tc > 0]
|
| 72 |
sel_text_lines = get_page_text_lines(
|
| 73 |
fname,
|
| 74 |
+
doc_idxs,
|
| 75 |
[hr for i, hr in enumerate(img_sizes) if i in table_idxs],
|
| 76 |
)
|
| 77 |
text_lines = []
|
pyproject.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
-
version = "0.3.7"
|
| 4 |
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|
|
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
+
version = "0.3.8"
|
| 4 |
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|