Vik Paruchuri committed on
Commit
d440114
·
1 Parent(s): e3aa5ff

Fix table recognition

Browse files
Files changed (2) hide show
  1. marker/tables/table.py +11 -5
  2. pyproject.toml +1 -1
marker/tables/table.py CHANGED
@@ -31,12 +31,12 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
31
  table_counts = []
32
  table_bboxes = []
33
  img_sizes = []
 
34
 
35
- for page in pages:
36
- pnum = page.pnum
37
  # The bbox for the entire table
38
  bbox = [b.bbox for b in page.layout.bboxes if b.label == "Table"]
39
- highres_img = render_image(doc[pnum], dpi=settings.SURYA_TABLE_DPI)
40
 
41
  page_table_imgs = []
42
  page_bboxes = []
@@ -48,11 +48,13 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
48
  if len(bbox) == 0:
49
  table_counts.append(0)
50
  img_sizes.append(None)
 
51
  continue
52
 
53
  # Number of tables per page
54
  table_counts.append(len(bbox))
55
  img_sizes.append(highres_img.size)
 
56
 
57
  for bb in bbox:
58
  highres_bb = rescale_bbox(page.layout.image_bbox, [0, 0, highres_img.size[0], highres_img.size[1]], bb)
@@ -62,10 +64,14 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
62
  table_imgs.extend(page_table_imgs)
63
  table_bboxes.extend(page_bboxes)
64
 
65
- table_idxs = [i for i, c in enumerate(table_counts) if c > 0]
 
 
 
 
66
  sel_text_lines = get_page_text_lines(
67
  fname,
68
- table_idxs,
69
  [hr for i, hr in enumerate(img_sizes) if i in table_idxs],
70
  )
71
  text_lines = []
 
31
  table_counts = []
32
  table_bboxes = []
33
  img_sizes = []
34
+ pnums = []
35
 
36
+ for page_idx, page in enumerate(pages):
 
37
  # The bbox for the entire table
38
  bbox = [b.bbox for b in page.layout.bboxes if b.label == "Table"]
39
+ highres_img = render_image(doc[page_idx], dpi=settings.SURYA_TABLE_DPI)
40
 
41
  page_table_imgs = []
42
  page_bboxes = []
 
48
  if len(bbox) == 0:
49
  table_counts.append(0)
50
  img_sizes.append(None)
51
+ pnums.append(page.pnum)
52
  continue
53
 
54
  # Number of tables per page
55
  table_counts.append(len(bbox))
56
  img_sizes.append(highres_img.size)
57
+ pnums.append(page.pnum)
58
 
59
  for bb in bbox:
60
  highres_bb = rescale_bbox(page.layout.image_bbox, [0, 0, highres_img.size[0], highres_img.size[1]], bb)
 
64
  table_imgs.extend(page_table_imgs)
65
  table_bboxes.extend(page_bboxes)
66
 
67
+ # The page numbers in doc and in the original document are not the same
68
+ # Doc has had pages removed from the start to align to start_page
69
+ # This corrects for that
70
+ doc_idxs = [pnum for pnum, tc in zip(pnums, table_counts) if tc > 0]
71
+ table_idxs = [i for i, tc in enumerate(table_counts) if tc > 0]
72
  sel_text_lines = get_page_text_lines(
73
  fname,
74
+ doc_idxs,
75
  [hr for i, hr in enumerate(img_sizes) if i in table_idxs],
76
  )
77
  text_lines = []
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
  [tool.poetry]
2
  name = "marker-pdf"
3
- version = "0.3.7"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
 
1
  [tool.poetry]
2
  name = "marker-pdf"
3
+ version = "0.3.8"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"