Vik Paruchuri committed on
Commit
19a4543
·
1 Parent(s): fc75dc5

Add structure

Browse files
marker/v2/builders/__init__.py CHANGED
@@ -1,6 +1,8 @@
1
  class BaseBuilder:
2
  def __init__(self, config=None):
3
- self.config = config
 
 
4
 
5
  def __call__(self, data, *args, **kwargs):
6
  raise NotImplementedError
 
1
  class BaseBuilder:
2
  def __init__(self, config=None):
3
+ if config:
4
+ for k in config:
5
+ setattr(self, k, config[k])
6
 
7
  def __call__(self, data, *args, **kwargs):
8
  raise NotImplementedError
marker/v2/builders/structure.py CHANGED
@@ -1,53 +1,85 @@
 
 
1
  from pydantic import BaseModel
2
 
3
  from marker.v2.builders import BaseBuilder
4
  from marker.v2.schema.document import Document
5
- from marker.v2.schema.groups import GROUP_BLOCK_REGISTRY
 
6
 
7
 
8
- class StructureConfig(BaseModel):
9
  gap_threshold: int = 10
10
 
11
-
12
- class StructureBuilder(BaseBuilder):
13
- def __init__(self, config):
14
  super().__init__(config)
15
 
16
  def __call__(self, document: Document):
17
  for page in document.pages:
18
- initial_structure = [block._id for block in page.children]
 
 
19
 
20
- def group_caption_blocks(self, page):
21
  for i, block in enumerate(page.children):
22
- if block.block_type in ["Table", "Figure", "Picture"]:
23
- block_structure = [block._id]
24
-
25
- for j, prev_block in enumerate(page.children[:i][::-1]):
26
- if all([
27
- prev_block.block_type in ["Caption", "Footnote"],
28
- prev_block.minimum_gap(block) < self.config.gap_threshold
29
- ]):
30
- block_structure.append(prev_block._id)
31
- else:
32
- break
33
-
34
- for j, next_block in enumerate(page.children[i + 1:]):
35
- if all([
36
- next_block.block_type in ["Caption", "Footnote"],
37
- next_block.minimum_gap(block) < self.config.gap_threshold
38
- ]):
39
- block_structure.append(next_block._id)
40
- else:
41
- break
42
-
43
- if len(block_structure) > 0:
44
- new_block_cls = GROUP_BLOCK_REGISTRY[block.block_type]
45
- # TODO: fix the polygon to span all the blocks inside the grouped block
46
- # TODO: Add the structure, etc, to the block
47
- block = page.add_block(new_block_cls, block.polygon)
48
-
49
- # Table, Figure, Picture
50
- pass
51
-
52
- def group_lists(self, page):
53
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional
2
+
3
  from pydantic import BaseModel
4
 
5
  from marker.v2.builders import BaseBuilder
6
  from marker.v2.schema.document import Document
7
+ from marker.v2.schema.groups import GROUP_BLOCK_REGISTRY, ListGroup
8
+ from marker.v2.schema.groups.page import PageGroup
9
 
10
 
11
class StructureBuilder(BaseBuilder):
    """Build the reading structure of each page by grouping related blocks.

    For every page the builder first sets ``page.structure`` to the ids of
    all of the page's children, then collapses
    (1) tables/figures/pictures together with their neighbouring captions and
    footnotes and (2) runs of consecutive list items into group blocks. The
    members of a group are recorded in the group block's ``structure`` and
    replaced in ``page.structure`` by the single id of the group.
    """

    # Two blocks are grouped only when ``minimum_gap`` between them is
    # strictly below this value. NOTE(review): the unit is whatever
    # ``minimum_gap`` returns (presumably page coordinates) — confirm.
    gap_threshold: int = 10

    def __init__(self, config: Optional[BaseModel] = None):
        """Create the builder; *config* entries override the class defaults."""
        super().__init__(config)

    def __call__(self, document: Document):
        """Populate ``page.structure`` for every page of *document* in place."""
        for page in document.pages:
            page.structure = [block._id for block in page.children]
            self.group_caption_blocks(page)
            self.group_lists(page)

    def group_caption_blocks(self, page: PageGroup):
        """Merge each Table/Figure/Picture with its adjacent captions/footnotes.

        Neighbours are collected walking backwards and then forwards from the
        anchor block, stopping at the first block that is not a sufficiently
        close Caption/Footnote. When at least one neighbour is found a
        ``<Type>Group`` block is added to the page and takes the anchor's
        place in ``page.structure``; the neighbours are removed from it.
        """
        # Snapshot: page.add_block may modify page.children while we iterate.
        children = list(page.children)

        for i, block in enumerate(children):
            if block.block_type not in ["Table", "Figure", "Picture"]:
                continue

            block_structure = [block._id]
            selected_polygons = [block.polygon]

            # Closest preceding blocks first; prepend to keep reading order.
            for prev_block in reversed(children[:i]):
                if not self._is_groupable_caption(page, prev_block, block):
                    break
                block_structure.insert(0, prev_block._id)
                selected_polygons.append(prev_block.polygon)

            for next_block in children[i + 1:]:
                if not self._is_groupable_caption(page, next_block, block):
                    break
                block_structure.append(next_block._id)
                selected_polygons.append(next_block.polygon)

            if len(block_structure) > 1:
                # Create a merged block
                new_block_cls = GROUP_BLOCK_REGISTRY[block.block_type + "Group"]
                new_polygon = block.polygon.merge(selected_polygons)
                group_block = page.add_block(new_block_cls, new_polygon)
                group_block.structure = block_structure

                # Update the structure of the page to reflect the new block
                self._collapse_into_group(
                    page, block._id, block_structure, group_block._id
                )

    def group_lists(self, page: PageGroup):
        """Merge runs of consecutive, closely spaced ListItem blocks.

        Each run of two or more items becomes one ``ListGroup`` block that
        replaces the first item in ``page.structure``; the remaining items are
        removed from it. The gap is measured between successive items so that
        a long list is not split just because its last item is far from its
        first one.
        """
        # Snapshot: page.add_block may modify page.children while we iterate.
        children = list(page.children)
        grouped_ids = set()

        for i, block in enumerate(children):
            if block.block_type not in ["ListItem"]:
                continue
            if block._id in grouped_ids:
                # Already a member of a group started by an earlier item.
                continue

            block_structure = [block._id]
            selected_polygons = [block.polygon]
            last_item = block

            for next_block in children[i + 1:]:
                if (
                    next_block.block_type == "ListItem"
                    and next_block.minimum_gap(last_item) < self.gap_threshold
                ):
                    block_structure.append(next_block._id)
                    selected_polygons.append(next_block.polygon)
                    last_item = next_block
                else:
                    break

            if len(block_structure) > 1:
                new_polygon = block.polygon.merge(selected_polygons)
                group_block = page.add_block(ListGroup, new_polygon)
                group_block.structure = block_structure

                self._collapse_into_group(
                    page, block._id, block_structure, group_block._id
                )
                grouped_ids.update(block_structure)

    def _is_groupable_caption(self, page, candidate, anchor) -> bool:
        """Return True if *candidate* is a nearby, still unclaimed caption/footnote."""
        return (
            candidate.block_type in ["Caption", "Footnote"]
            # A caption between two figures must not be claimed twice.
            and candidate._id in page.structure
            and candidate.minimum_gap(anchor) < self.gap_threshold
        )

    @staticmethod
    def _collapse_into_group(page, anchor_id, member_ids, group_id):
        """Put *group_id* where *anchor_id* was and drop the other members."""
        anchor_idx = page.structure.index(anchor_id)
        page.structure[anchor_idx] = group_id
        for member_id in member_ids:
            if member_id != anchor_id:
                page.structure.remove(member_id)
marker/v2/converters/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from pydantic import BaseModel
4
+
5
+
6
class ConverterConfig(BaseModel):
    """Configuration handed to a converter."""

    # Path of the file to convert.
    filepath: str
    # Optional list of page numbers; defaults to None.
    # NOTE(review): presumably restricts conversion to these pages — no code
    # visible here reads it yet; confirm against the converters.
    page_range: List[int] | None = None
9
+
10
+
11
class BaseConverter:
    """Common base for converters: keeps the configuration, defers the work."""

    def __init__(self, config: ConverterConfig):
        """Remember *config* so subclasses can read it as ``self.config``."""
        self.config = config

    def __call__(self):
        """Run the conversion; concrete converters must override this.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError
marker/v2/converters/pdf.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from surya.model.layout.model import load_model
2
+ from surya.model.layout.processor import load_processor
3
+
4
+ from marker.v2.builders.document import DocumentBuilder
5
+ from marker.v2.builders.layout import LayoutBuilder
6
+ from marker.v2.builders.structure import StructureBuilder
7
+ from marker.v2.converters import BaseConverter
8
+ from marker.v2.providers.pdf import PdfProvider
9
+ from marker.v2.schema.config.pdf import PdfProviderConfig
10
+
11
+
12
class PdfConverter(BaseConverter):
    """Converter that turns the PDF at ``config.filepath`` into a document."""

    def __call__(self):
        """Build the document for the configured PDF and group its structure.

        Returns:
            The document produced by ``DocumentBuilder`` after
            ``StructureBuilder`` has been applied to it. (Previously the
            result was built and then discarded.)
        """
        pdf_provider = PdfProvider(self.config.filepath, PdfProviderConfig())

        # The layout model carries its processor as an attribute.
        layout_model = load_model()
        layout_model.processor = load_processor()
        layout_builder = LayoutBuilder(layout_model)

        document = DocumentBuilder()(pdf_provider, layout_builder)
        StructureBuilder()(document)
        return document
22
+
marker/v2/schema/__init__.py CHANGED
@@ -16,8 +16,8 @@ class Block(BaseModel):
16
 
17
  @property
18
  def _id(self):
19
- page_path = f"/page/{self.pnum}"
20
- if self.block_num is not None:
21
- return f"{page_path}/block/{self.block_num}"
22
  else:
23
  return page_path
 
16
 
17
  @property
18
  def _id(self):
19
+ page_path = f"/page/{self.page_id}"
20
+ if self.block_id is not None:
21
+ return f"{page_path}/block/{self.block_id}"
22
  else:
23
  return page_path
marker/v2/schema/groups/figure.py CHANGED
@@ -2,4 +2,4 @@ from marker.v2.schema import Block
2
 
3
 
4
  class FigureGroup(Block):
5
- pass
 
2
 
3
 
4
  class FigureGroup(Block):
5
+ block_type: str = "FigureGroup"
marker/v2/schema/groups/list.py CHANGED
@@ -2,4 +2,4 @@ from marker.v2.schema import Block
2
 
3
 
4
  class ListGroup(Block):
5
- pass
 
2
 
3
 
4
  class ListGroup(Block):
5
+ block_type: str = "ListGroup"
marker/v2/schema/groups/picture.py CHANGED
@@ -2,4 +2,4 @@ from marker.v2.schema import Block
2
 
3
 
4
  class PictureGroup(Block):
5
- pass
 
2
 
3
 
4
  class PictureGroup(Block):
5
+ block_type: str = "PictureGroup"
marker/v2/schema/groups/table.py CHANGED
@@ -2,4 +2,4 @@ from marker.v2.schema import Block
2
 
3
 
4
  class TableGroup(Block):
5
- pass
 
2
 
3
 
4
  class TableGroup(Block):
5
+ block_type: str = "TableGroup"
marker/v2/schema/polygon.py CHANGED
@@ -83,13 +83,6 @@ class PolygonBox(BaseModel):
83
  corner[1] = max(min(corner[1], bounds[3]), bounds[1])
84
  self.polygon = new_corners
85
 
86
- def merge(self, other):
87
- x1 = min(self.bbox[0], other.bbox[0])
88
- y1 = min(self.bbox[1], other.bbox[1])
89
- x2 = max(self.bbox[2], other.bbox[2])
90
- y2 = max(self.bbox[3], other.bbox[3])
91
- self.polygon = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
92
-
93
  def overlap_x(self, other):
94
  return max(0, min(self.bbox[2], other.bbox[2]) - max(self.bbox[0], other.bbox[0]))
95
 
@@ -113,6 +106,14 @@ class PolygonBox(BaseModel):
113
  intersection = self.intersection_area(other, x_margin, y_margin)
114
  return intersection / self.area
115
 
 
 
 
 
 
 
 
 
116
  @classmethod
117
  def from_bbox(cls, bbox: List[float]):
118
  return cls(polygon=[[bbox[0], bbox[1]], [bbox[2], bbox[1]], [bbox[2], bbox[3]], [bbox[0], bbox[3]]])
 
83
  corner[1] = max(min(corner[1], bounds[3]), bounds[1])
84
  self.polygon = new_corners
85
 
 
 
 
 
 
 
 
86
  def overlap_x(self, other):
87
  return max(0, min(self.bbox[2], other.bbox[2]) - max(self.bbox[0], other.bbox[0]))
88
 
 
106
  intersection = self.intersection_area(other, x_margin, y_margin)
107
  return intersection / self.area
108
 
109
def merge(self, others: "List[PolygonBox]") -> "PolygonBox":
    """Return a new box enclosing this polygon and every polygon in *others*.

    The result is the axis-aligned bounding rectangle of all corner points,
    with corners in the same order ``from_bbox`` uses: top-left, top-right,
    bottom-right, bottom-left. ``self`` is not modified. With an empty
    *others* the result is the bounding rectangle of this polygon alone.
    """
    points = [corner for box in (self, *others) for corner in box.polygon]
    xs = [point[0] for point in points]
    ys = [point[1] for point in points]
    x1, y1, x2, y2 = min(xs), min(ys), max(xs), max(ys)
    return PolygonBox(polygon=[[x1, y1], [x2, y1], [x2, y2], [x1, y2]])
116
+
117
  @classmethod
118
  def from_bbox(cls, bbox: List[float]):
119
  return cls(polygon=[[bbox[0], bbox[1]], [bbox[2], bbox[1]], [bbox[2], bbox[3]], [bbox[0], bbox[3]]])
pytest.ini ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [pytest]
2
+ testpaths=tests
tests/test_structure.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from marker.v2.builders.structure import StructureBuilder
2
+ from tests.utils import setup_pdf_document
3
+
4
+
5
def test_structure_builder():
    """The structure builder leaves a non-empty structure on the first page."""
    document = setup_pdf_document('adversarial.pdf')
    StructureBuilder()(document)
    first_page = document.pages[0]
    assert len(first_page.structure) > 0
tests/utils.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datasets
2
+ from surya.model.layout.model import load_model
3
+ from surya.model.layout.processor import load_processor
4
+ import tempfile
5
+
6
+ from marker.v2.builders.document import DocumentBuilder
7
+ from marker.v2.builders.layout import LayoutBuilder
8
+ from marker.v2.providers.pdf import PdfProvider
9
+ from marker.v2.schema.config.pdf import PdfProviderConfig
10
+ from marker.v2.schema.document import Document
11
+
12
+
13
def setup_pdf_document(filename: str) -> Document:
    """Build a document from the named PDF of the ``datalab-to/pdfs`` dataset.

    Args:
        filename: Value to look up in the dataset's ``filename`` column.

    Returns:
        The document produced by ``DocumentBuilder`` for that PDF.

    Raises:
        ValueError: if *filename* is not present in the dataset.
    """
    dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
    idx = dataset['filename'].index(filename)

    layout_model = load_model()
    layout_model.processor = load_processor()
    layout_builder = LayoutBuilder(layout_model)

    # The provider takes a path, so the PDF bytes are written to a temporary
    # file. The context manager removes it deterministically once the
    # document has been built, instead of relying on garbage collection.
    with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
        temp_pdf.write(dataset['pdf'][idx])
        temp_pdf.flush()

        provider = PdfProvider(temp_pdf.name, PdfProviderConfig())
        builder = DocumentBuilder()
        document = builder(provider, layout_builder)
    return document