Moses Paul R committed on
Commit
1aa3752
·
2 Parent(s): aee20f6 0c603b0

Merge remote-tracking branch 'origin/vik_v2' into dev-mose/marker-v2

Browse files
marker/v2/builders/structure.py CHANGED
@@ -12,7 +12,7 @@ from marker.v2.schema.groups.page import PageGroup
12
  class StructureBuilder(BaseBuilder):
13
  gap_threshold: int = 10
14
 
15
- def __init__(self, config: Optional[BaseModel] = None):
16
  super().__init__(config)
17
 
18
  def __call__(self, document: Document):
 
12
  class StructureBuilder(BaseBuilder):
13
  gap_threshold: int = 10
14
 
15
+ def __init__(self, config=None):
16
  super().__init__(config)
17
 
18
  def __call__(self, document: Document):
marker/v2/converters/__init__.py CHANGED
@@ -8,6 +8,7 @@ from marker.v2.util import assign_config
8
  class BaseConverter:
9
  def __init__(self, config: Optional[BaseModel | dict] = None):
10
  assign_config(self, config)
 
11
 
12
  def __call__(self, *args, **kwargs):
13
  raise NotImplementedError
 
8
  class BaseConverter:
9
  def __init__(self, config: Optional[BaseModel | dict] = None):
10
  assign_config(self, config)
11
+ self.config = config
12
 
13
  def __call__(self, *args, **kwargs):
14
  raise NotImplementedError
marker/v2/converters/pdf.py CHANGED
@@ -1,9 +1,8 @@
1
- <<<<<<< HEAD
2
  import os
3
- =======
4
- from marker.v2.providers.pdf import PdfProvider
5
 
6
- >>>>>>> origin/v2
 
7
  import tempfile
8
  from typing import List, Optional
9
 
@@ -24,7 +23,7 @@ from marker.v2.renderers.markdown import MarkdownRenderer
24
 
25
 
26
  class PdfConverter(BaseConverter):
27
- def __init__(self, config: Optional[BaseModel] = None):
28
  super().__init__(config)
29
 
30
  self.layout_model = setup_layout_model()
@@ -33,21 +32,24 @@ class PdfConverter(BaseConverter):
33
  self.table_rec_model = setup_table_rec_model()
34
  self.detection_model = setup_detection_model()
35
 
36
- def __call__(self, filepath: str, page_range: List[int] | None = None):
37
- pdf_provider = PdfProvider(filepath, {"page_range": page_range, "force_ocr": False})
38
 
39
- layout_builder = LayoutBuilder(self.layout_model)
40
- ocr_builder = OcrBuilder(self.detection_model, self.recognition_model)
41
- document = DocumentBuilder()(pdf_provider, layout_builder, ocr_builder)
42
- StructureBuilder()(document)
43
 
44
- equation_processor = EquationProcessor(self.texify_model)
45
  equation_processor(document)
46
 
47
- table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model)
48
  table_processor(document)
49
 
50
- renderer = MarkdownRenderer()
 
 
 
51
  return renderer(document)
52
 
53
 
 
 
1
  import os
2
+ os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
 
3
 
4
+ from marker.v2.processors.sectionheader import SectionHeaderProcessor
5
+ from marker.v2.providers.pdf import PdfProvider
6
  import tempfile
7
  from typing import List, Optional
8
 
 
23
 
24
 
25
  class PdfConverter(BaseConverter):
26
+ def __init__(self, config=None):
27
  super().__init__(config)
28
 
29
  self.layout_model = setup_layout_model()
 
32
  self.table_rec_model = setup_table_rec_model()
33
  self.detection_model = setup_detection_model()
34
 
35
+ def __call__(self, filepath: str):
36
+ pdf_provider = PdfProvider(filepath, self.config)
37
 
38
+ layout_builder = LayoutBuilder(self.layout_model, self.config)
39
+ ocr_builder = OcrBuilder(self.detection_model, self.recognition_model, self.config)
40
+ document = DocumentBuilder(self.config)(pdf_provider, layout_builder, ocr_builder)
41
+ StructureBuilder(self.config)(document)
42
 
43
+ equation_processor = EquationProcessor(self.texify_model, self.config)
44
  equation_processor(document)
45
 
46
+ table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model, self.config)
47
  table_processor(document)
48
 
49
+ section_header_processor = SectionHeaderProcessor(self.config)
50
+ section_header_processor(document)
51
+
52
+ renderer = MarkdownRenderer(self.config)
53
  return renderer(document)
54
 
55
 
marker/v2/processors/equation.py CHANGED
@@ -16,7 +16,7 @@ class EquationProcessor(BaseProcessor):
16
  batch_size = None
17
  token_buffer = 256
18
 
19
- def __init__(self, texify_model, config: Optional[BaseModel] = None):
20
  super().__init__(config)
21
 
22
  self.texify_model = texify_model
 
16
  batch_size = None
17
  token_buffer = 256
18
 
19
+ def __init__(self, texify_model, config=None):
20
  super().__init__(config)
21
 
22
  self.texify_model = texify_model
marker/v2/processors/sectionheader.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from marker.v2.processors import BaseProcessor
2
+ from marker.v2.schema import BlockTypes
3
+ from marker.v2.schema.document import Document
4
+
5
+ import numpy as np
6
+ from sklearn.cluster import KMeans
7
+ from sklearn.exceptions import ConvergenceWarning
8
+
9
+ # Ignore sklearn warning about not converging
10
+ import warnings
11
+ warnings.filterwarnings("ignore", category=ConvergenceWarning)
12
+
13
+
14
class SectionHeaderProcessor(BaseProcessor):
    """Assign a heading level to each section header from its line heights.

    The heights of all lines inside section-header blocks are clustered with
    KMeans. The clusters are turned into height ranges ordered tallest first,
    and every header receives the 1-based index of the first range whose
    (tolerance-scaled) minimum its average line height reaches. Headers that
    match no range fall back to ``default_level``.
    """
    block_types = (BlockTypes.SectionHeader, )
    # Default number of KMeans clusters, i.e. the most levels that can result.
    level_count = 4
    # Ratio used by bucket_headings to decide whether adjacent clusters merge.
    merge_threshold = .25
    # Level given to headers that do not fall into any height range.
    default_level = 2
    # Factor applied to a range's minimum height when matching a header.
    height_tolerance = .99

    def __call__(self, document: Document):
        """Set ``heading_level`` on every section-header block of ``document``.

        Args:
            document: Document whose ``pages[*].children`` are scanned for
                blocks with a type listed in ``block_types``.
        """
        # Keep (block, heights) pairs rather than a dict keyed by
        # block.block_id: BlockId pairs block_id with page_id, so block_id
        # alone appears to be page-scoped and would collide across pages.
        headers = []
        for page in document.pages:
            for block in page.children:
                if block.block_type not in self.block_types:
                    continue

                heights = [
                    document.get_block(ref).polygon.height
                    for ref in (block.structure or [])  # structure may be unset
                    if ref.block_type == BlockTypes.Line
                ]
                headers.append((block, heights))

        all_heights = [h for _, heights in headers for h in heights]
        heading_ranges = self.bucket_headings(all_heights)

        for block, heights in headers:
            if heights:
                avg_height = sum(heights) / len(heights)
                # Ranges are ordered tallest first, so the first match is the
                # highest-ranking level the header qualifies for.
                for level, (min_height, _max_height) in enumerate(heading_ranges, start=1):
                    if avg_height >= min_height * self.height_tolerance:
                        block.heading_level = level
                        break

            if block.heading_level is None:
                block.heading_level = self.default_level

    def bucket_headings(self, line_heights, num_levels=None):
        """Cluster line heights into ``(min, max)`` ranges, tallest first.

        Args:
            line_heights: Flat sequence of numeric line heights.
            num_levels: Number of KMeans clusters; defaults to
                ``self.level_count`` so a configured value is honoured.

        Returns:
            A list of ``(min_height, max_height)`` tuples sorted in descending
            order, or an empty list when there are not more samples than
            clusters.
        """
        if num_levels is None:
            num_levels = self.level_count
        if len(line_heights) <= num_levels:
            return []

        data = np.asarray(line_heights, dtype=float).reshape(-1, 1)
        labels = KMeans(n_clusters=num_levels, random_state=0, n_init="auto").fit_predict(data)

        # Sort rows by height while keeping each height paired with its own
        # label. (np.sort(..., axis=0) on the stacked array would sort the two
        # columns independently and scramble the pairing.)
        order = np.argsort(data[:, 0], kind="stable")
        values = data[order, 0]
        sorted_labels = labels[order]

        cluster_means = {
            int(label): float(values[sorted_labels == label].mean())
            for label in np.unique(labels)
        }

        heading_ranges = []
        label_min = None
        label_max = None
        prev_cluster = None
        for value, label in zip(values.tolist(), sorted_labels.tolist()):
            if prev_cluster is not None and label != prev_cluster:
                prev_cluster_mean = cluster_means[prev_cluster]
                cluster_mean = cluster_means[label]
                # Close the current range unless the previous cluster's mean
                # is at most merge_threshold times the new cluster's mean, in
                # which case the two clusters share one range.
                # NOTE(review): this merges clusters that are far apart rather
                # than close together - confirm that is the intent.
                if cluster_mean * self.merge_threshold < prev_cluster_mean:
                    heading_ranges.append((label_min, label_max))
                    label_min = None
                    label_max = None

            label_min = value if label_min is None else min(label_min, value)
            label_max = value if label_max is None else max(label_max, value)
            prev_cluster = label

        if label_min is not None:
            heading_ranges.append((label_min, label_max))

        return sorted(heading_ranges, reverse=True)
marker/v2/processors/table.py CHANGED
@@ -18,7 +18,7 @@ class TableProcessor(BaseProcessor):
18
  table_rec_batch_size = None
19
  ocr_batch_size = None
20
 
21
- def __init__(self, detection_model, ocr_model, table_rec_model, config: Optional[BaseModel] = None):
22
  super().__init__(config)
23
 
24
  self.detection_model = detection_model
 
18
  table_rec_batch_size = None
19
  ocr_batch_size = None
20
 
21
+ def __init__(self, detection_model, ocr_model, table_rec_model, config=None):
22
  super().__init__(config)
23
 
24
  self.detection_model = detection_model
marker/v2/providers/pdf.py CHANGED
@@ -23,7 +23,7 @@ class PdfProvider(BaseProvider):
23
  flatten_pdf: bool = True
24
  force_ocr: bool = False
25
 
26
- def __init__(self, filepath: str, config: Optional[BaseModel] = None):
27
  super().__init__(filepath, config)
28
 
29
  self.doc: pdfium.PdfDocument = pdfium.PdfDocument(self.filepath)
 
23
  flatten_pdf: bool = True
24
  force_ocr: bool = False
25
 
26
+ def __init__(self, filepath: str, config = None):
27
  super().__init__(filepath, config)
28
 
29
  self.doc: pdfium.PdfDocument = pdfium.PdfDocument(self.filepath)
marker/v2/renderers/__init__.py CHANGED
@@ -3,16 +3,14 @@ from typing import Optional
3
  from pydantic import BaseModel
4
 
5
  from marker.v2.schema import BlockTypes
6
-
7
 
8
 
9
  class BaseRenderer:
10
  block_type: BlockTypes | None = None
11
 
12
  def __init__(self, config: Optional[BaseModel | dict] = None):
13
- if config:
14
- for k in config.model_fields:
15
- setattr(self, k, config[k])
16
 
17
  def __call__(self, document):
18
  # Children are in reading order
 
3
  from pydantic import BaseModel
4
 
5
  from marker.v2.schema import BlockTypes
6
+ from marker.v2.util import assign_config
7
 
8
 
9
  class BaseRenderer:
10
  block_type: BlockTypes | None = None
11
 
12
  def __init__(self, config: Optional[BaseModel | dict] = None):
13
+ assign_config(self, config)
 
 
14
 
15
  def __call__(self, document):
16
  # Children are in reading order
marker/v2/renderers/html.py CHANGED
@@ -1,12 +1,16 @@
1
  import re
2
 
3
- from bs4 import BeautifulSoup
4
  from pydantic import BaseModel
5
 
6
  from marker.v2.renderers import BaseRenderer
7
  from marker.v2.schema import BlockTypes
8
  from marker.v2.schema.blocks import BlockId
9
 
 
 
 
 
10
 
11
  class HTMLOutput(BaseModel):
12
  html: str
@@ -34,6 +38,8 @@ def merge_consecutive_tags(html, tag):
34
  class HTMLRenderer(BaseRenderer):
35
  remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
36
  image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
 
 
37
 
38
  def extract_image(self, document, image_id):
39
  image_block = document.get_block(image_id)
@@ -65,6 +71,11 @@ class HTMLRenderer(BaseRenderer):
65
  image_name = f"{ref_block_id.to_path()}.png"
66
  images[image_name] = image
67
  ref.replace_with(BeautifulSoup(f"<p><img src='{image_name}'></p>", 'html.parser'))
 
 
 
 
 
68
  else:
69
  images.update(sub_images)
70
  ref.replace_with(BeautifulSoup(f"{content}", 'html.parser'))
 
1
  import re
2
 
3
+ from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
4
  from pydantic import BaseModel
5
 
6
  from marker.v2.renderers import BaseRenderer
7
  from marker.v2.schema import BlockTypes
8
  from marker.v2.schema.blocks import BlockId
9
 
10
+ # Ignore beautifulsoup warnings
11
+ import warnings
12
+ warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
13
+
14
 
15
  class HTMLOutput(BaseModel):
16
  html: str
 
38
  class HTMLRenderer(BaseRenderer):
39
  remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
40
  image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
41
+ page_blocks: list = [BlockTypes.Page]
42
+ paginate_output: bool = False
43
 
44
  def extract_image(self, document, image_id):
45
  image_block = document.get_block(image_id)
 
71
  image_name = f"{ref_block_id.to_path()}.png"
72
  images[image_name] = image
73
  ref.replace_with(BeautifulSoup(f"<p><img src='{image_name}'></p>", 'html.parser'))
74
+ elif ref_block_id.block_type in self.page_blocks:
75
+ images.update(sub_images)
76
+ if self.paginate_output:
77
+ content = f"<div class='page' data-page-id='{ref_block_id.page_id}'>{content}</div>"
78
+ ref.replace_with(BeautifulSoup(f"{content}", 'html.parser'))
79
  else:
80
  images.update(sub_images)
81
  ref.replace_with(BeautifulSoup(f"{content}", 'html.parser'))
marker/v2/renderers/markdown.py CHANGED
@@ -6,7 +6,19 @@ from marker.v2.schema.document import Document
6
 
7
 
8
  class Markdownify(MarkdownConverter):
9
- pass
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
 
12
  class MarkdownOutput(BaseModel):
@@ -15,10 +27,14 @@ class MarkdownOutput(BaseModel):
15
 
16
 
17
  class MarkdownRenderer(HTMLRenderer):
 
 
18
  def __call__(self, document: Document) -> MarkdownOutput:
19
  document_output = document.render()
20
  full_html, images = self.extract_html(document, document_output)
21
  md_cls = Markdownify(
 
 
22
  heading_style="ATX",
23
  bullets="-",
24
  escape_misc=False,
 
6
 
7
 
8
  class Markdownify(MarkdownConverter):
9
+ def __init__(self, paginate_output, page_separator, **kwargs):
10
+ super().__init__(**kwargs)
11
+ self.paginate_output = paginate_output
12
+ self.page_separator = page_separator
13
+
14
+ def convert_div(self, el, text, convert_as_inline):
15
+ is_page = el.has_attr('class') and el['class'][0] == 'page'
16
+ if self.paginate_output and is_page:
17
+ page_id = el['data-page-id']
18
+ pagination_item = "\n\n" + "{" + str(page_id) + "}" + self.page_separator + "\n\n"
19
+ return pagination_item + text
20
+ else:
21
+ return text
22
 
23
 
24
  class MarkdownOutput(BaseModel):
 
27
 
28
 
29
  class MarkdownRenderer(HTMLRenderer):
30
+ page_separator: str = "-" * 48
31
+
32
  def __call__(self, document: Document) -> MarkdownOutput:
33
  document_output = document.render()
34
  full_html, images = self.extract_html(document, document_output)
35
  md_cls = Markdownify(
36
+ self.paginate_output,
37
+ self.page_separator,
38
  heading_style="ATX",
39
  bullets="-",
40
  escape_misc=False,
marker/v2/schema/blocks/base.py CHANGED
@@ -28,6 +28,9 @@ class BlockId(BaseModel):
28
  return f"/page/{self.page_id}"
29
  return f"/page/{self.page_id}/{self.block_type.name}/{self.block_id}"
30
 
 
 
 
31
  def __repr__(self):
32
  return str(self)
33
 
 
28
  return f"/page/{self.page_id}"
29
  return f"/page/{self.page_id}/{self.block_type.name}/{self.block_id}"
30
 
31
+ def __hash__(self):
32
+ return hash(str(self))
33
+
34
  def __repr__(self):
35
  return str(self)
36
 
marker/v2/schema/blocks/sectionheader.py CHANGED
@@ -4,8 +4,10 @@ from marker.v2.schema.blocks import Block
4
 
5
  class SectionHeader(Block):
6
  block_type: BlockTypes = BlockTypes.SectionHeader
 
7
 
8
  def assemble_html(self, child_blocks, parent_structure):
9
  template = super().assemble_html(child_blocks, parent_structure)
10
  template = template.replace("\n", " ")
11
- return f"<h2>{template}</h2>"
 
 
4
 
5
  class SectionHeader(Block):
6
  block_type: BlockTypes = BlockTypes.SectionHeader
7
+ heading_level: int | None = None
8
 
9
  def assemble_html(self, child_blocks, parent_structure):
10
  template = super().assemble_html(child_blocks, parent_structure)
11
  template = template.replace("\n", " ")
12
+ tag = f"h{self.heading_level}" if self.heading_level else "h2"
13
+ return f"<{tag}>{template}</{tag}>"
marker/v2/schema/groups/page.py CHANGED
@@ -48,6 +48,12 @@ class PageGroup(Block):
48
  assert block.block_id == block_id.block_id
49
  return block
50
 
 
 
 
 
 
 
51
  def merge_blocks(
52
  self,
53
  page_lines: List[Line],
 
48
  assert block.block_id == block_id.block_id
49
  return block
50
 
51
def assemble_html(self, child_blocks, parent_structure=None):
    """Return one ``<content-ref>`` placeholder per child, in the given order.

    Args:
        child_blocks: Iterable of objects exposing an ``id`` attribute, which
            is interpolated into the ``src`` attribute of each placeholder.
        parent_structure: Unused; accepted for signature compatibility.

    Returns:
        The concatenated placeholders, or an empty string for no children.
    """
    # Build the string in a single join instead of repeated += in a loop.
    return "".join(f"<content-ref src='{c.id}'></content-ref>" for c in child_blocks)
56
+
57
  def merge_blocks(
58
  self,
59
  page_lines: List[Line],
marker/v2/util.py CHANGED
@@ -2,11 +2,24 @@ from pydantic import BaseModel
2
 
3
 
4
  def assign_config(cls, config: BaseModel | dict | None):
 
5
  if config is None:
6
  return
7
  elif isinstance(config, BaseModel):
8
- for k in config.model_fields:
9
- setattr(cls, k, config[k])
10
  elif isinstance(config, dict):
11
- for k, v in config.items():
12
- setattr(cls, k, v)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
 
4
  def assign_config(cls, config: BaseModel | dict | None):
5
+ cls_name = cls.__class__.__name__
6
  if config is None:
7
  return
8
  elif isinstance(config, BaseModel):
9
+ dict_config = config.dict()
 
10
  elif isinstance(config, dict):
11
+ dict_config = config
12
+ else:
13
+ raise ValueError("config must be a dict or a pydantic BaseModel")
14
+
15
+ for k in dict_config:
16
+ if hasattr(cls, k):
17
+ setattr(cls, k, dict_config[k])
18
+ for k in dict_config:
19
+ if cls_name not in k:
20
+ continue
21
+ # Enables using class-specific keys, like "MarkdownRenderer_remove_blocks"
22
+ split_k = k.removeprefix(cls_name + "_")
23
+
24
+ if hasattr(cls, split_k):
25
+ setattr(cls, split_k, dict_config[k])
tests/utils.py CHANGED
@@ -11,10 +11,7 @@ from marker.v2.schema.document import Document
11
 
12
  def setup_pdf_document(
13
  filename='adversarial.pdf',
14
- pdf_provider_config=None,
15
- layout_builder_config=None,
16
- ocr_builder_config=None,
17
- document_builder_config=None
18
  ) -> Document:
19
  dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
20
  idx = dataset['filename'].index(filename)
@@ -27,9 +24,9 @@ def setup_pdf_document(
27
  recognition_model = setup_recognition_model()
28
  detection_model = setup_detection_model()
29
 
30
- provider = PdfProvider(temp_pdf.name, pdf_provider_config)
31
- layout_builder = LayoutBuilder(layout_model, layout_builder_config)
32
- ocr_builder = OcrBuilder(detection_model, recognition_model, ocr_builder_config)
33
- builder = DocumentBuilder(document_builder_config)
34
  document = builder(provider, layout_builder, ocr_builder)
35
  return document
 
11
 
12
  def setup_pdf_document(
13
  filename='adversarial.pdf',
14
+ config=None,
 
 
 
15
  ) -> Document:
16
  dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
17
  idx = dataset['filename'].index(filename)
 
24
  recognition_model = setup_recognition_model()
25
  detection_model = setup_detection_model()
26
 
27
+ provider = PdfProvider(temp_pdf.name, config)
28
+ layout_builder = LayoutBuilder(layout_model, config)
29
+ ocr_builder = OcrBuilder(detection_model, recognition_model, config)
30
+ builder = DocumentBuilder(config)
31
  document = builder(provider, layout_builder, ocr_builder)
32
  return document