Moses Paul R committed on
Commit
6df89e5
·
1 Parent(s): 2798e13

integrate changes and increment surya version

Browse files
convert_single.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
 
3
- os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
4
 
5
  import time
6
 
 
1
  import os
2
 
3
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
4
 
5
  import time
6
 
marker/builders/high_quality_layout.py CHANGED
@@ -57,16 +57,16 @@ Here are the top k predictions from the model followed by the image:
57
  def __init__(self, layout_model: SuryaLayoutModel, config=None):
58
  super().__init__(layout_model, config)
59
 
60
- if self.google_api_key is not None:
61
- genai.configure(api_key=self.google_api_key)
62
- self.model = genai.GenerativeModel(self.model_name)
 
 
 
63
 
64
  def __call__(self, document: Document, provider: PdfProvider):
65
  super().__call__(document, provider)
66
 
67
- if self.model is None:
68
- return
69
-
70
  self.relabel_blocks(document)
71
 
72
  def relabel_blocks(self, document: Document):
 
57
  def __init__(self, layout_model: SuryaLayoutModel, config=None):
58
  super().__init__(layout_model, config)
59
 
60
+ self.model = None
61
+ if self.google_api_key is None:
62
+ raise ValueError("Google API key is not set")
63
+
64
+ genai.configure(api_key=self.google_api_key)
65
+ self.model = genai.GenerativeModel(self.model_name)
66
 
67
  def __call__(self, document: Document, provider: PdfProvider):
68
  super().__call__(document, provider)
69
 
 
 
 
70
  self.relabel_blocks(document)
71
 
72
  def relabel_blocks(self, document: Document):
marker/config/parser.py CHANGED
@@ -34,6 +34,7 @@ class ConfigParser:
34
  fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
35
  fn = click.option("--paginate_output", is_flag=True, default=False, help="Paginate output.")(fn)
36
  fn = click.option("--disable_image_extraction", is_flag=True, default=False, help="Disable image extraction.")(fn)
 
37
  return fn
38
 
39
  def generate_config_dict(self) -> Dict[str, any]:
@@ -69,6 +70,9 @@ class ConfigParser:
69
  case "disable_image_extraction":
70
  if v:
71
  config["extract_images"] = False
 
 
 
72
  return config
73
 
74
  def get_renderer(self):
 
34
  fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
35
  fn = click.option("--paginate_output", is_flag=True, default=False, help="Paginate output.")(fn)
36
  fn = click.option("--disable_image_extraction", is_flag=True, default=False, help="Disable image extraction.")(fn)
37
+ fn = click.option("--high_quality", is_flag=True, default=False, help="Enable high quality processing with Gemini.")(fn)
38
  return fn
39
 
40
  def generate_config_dict(self) -> Dict[str, any]:
 
70
  case "disable_image_extraction":
71
  if v:
72
  config["extract_images"] = False
73
+ case "high_quality":
74
+ if v:
75
+ config["high_quality"] = True
76
  return config
77
 
78
  def get_renderer(self):
marker/converters/pdf.py CHANGED
@@ -45,6 +45,7 @@ class PdfConverter(BaseConverter):
45
  instead of the defaults.
46
  """
47
  override_map: Dict[BlockTypes, Type[Block]] = defaultdict()
 
48
 
49
  def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | None = None, renderer: str | None = None, config=None):
50
  super().__init__(config)
@@ -81,6 +82,10 @@ class PdfConverter(BaseConverter):
81
  self.processor_list = processor_list
82
  self.renderer = renderer
83
 
 
 
 
 
84
  def resolve_dependencies(self, cls):
85
  init_signature = inspect.signature(cls.__init__)
86
  parameters = init_signature.parameters
@@ -102,7 +107,7 @@ class PdfConverter(BaseConverter):
102
 
103
  def __call__(self, filepath: str):
104
  pdf_provider = PdfProvider(filepath, self.config)
105
- layout_builder = self.resolve_dependencies(HighQualityLayoutBuilder)
106
  ocr_builder = self.resolve_dependencies(OcrBuilder)
107
  document = DocumentBuilder(self.config)(pdf_provider, layout_builder, ocr_builder)
108
  StructureBuilder(self.config)(document)
 
45
  instead of the defaults.
46
  """
47
  override_map: Dict[BlockTypes, Type[Block]] = defaultdict()
48
+ high_quality: bool = False
49
 
50
  def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | None = None, renderer: str | None = None, config=None):
51
  super().__init__(config)
 
82
  self.processor_list = processor_list
83
  self.renderer = renderer
84
 
85
+ self.layout_builder_class = LayoutBuilder
86
+ if self.high_quality:
87
+ self.layout_builder_class = HighQualityLayoutBuilder
88
+
89
  def resolve_dependencies(self, cls):
90
  init_signature = inspect.signature(cls.__init__)
91
  parameters = init_signature.parameters
 
107
 
108
  def __call__(self, filepath: str):
109
  pdf_provider = PdfProvider(filepath, self.config)
110
+ layout_builder = self.resolve_dependencies(self.layout_builder_class)
111
  ocr_builder = self.resolve_dependencies(OcrBuilder)
112
  document = DocumentBuilder(self.config)(pdf_provider, layout_builder, ocr_builder)
113
  StructureBuilder(self.config)(document)
marker/processors/high_quality_text.py CHANGED
@@ -40,6 +40,7 @@ class HighQualityTextProcessor(BaseProcessor):
40
  google_api_key: Optional[str] = settings.GOOGLE_API_KEY
41
  confidence_threshold: float = 0.7
42
  model_name: str = "gemini-1.5-flash"
 
43
 
44
  gemini_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
45
  You will receive an image of a text block and a set of extracted lines corresponding to the text in the image.
@@ -98,12 +99,18 @@ Output:
98
  def __init__(self, config=None):
99
  super().__init__(config)
100
 
101
- if self.google_api_key is not None:
102
- genai.configure(api_key=self.google_api_key)
103
- self.model = genai.GenerativeModel(self.model_name)
 
 
 
 
 
 
104
 
105
  def __call__(self, document: Document):
106
- if self.model is None:
107
  return
108
 
109
  self.rewrite_blocks(document)
 
40
  google_api_key: Optional[str] = settings.GOOGLE_API_KEY
41
  confidence_threshold: float = 0.7
42
  model_name: str = "gemini-1.5-flash"
43
+ high_quality: bool = False
44
 
45
  gemini_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
46
  You will receive an image of a text block and a set of extracted lines corresponding to the text in the image.
 
99
  def __init__(self, config=None):
100
  super().__init__(config)
101
 
102
+ self.model = None
103
+ if not self.high_quality:
104
+ return
105
+
106
+ if self.google_api_key is None:
107
+ raise ValueError("Google API key is not set")
108
+
109
+ genai.configure(api_key=self.google_api_key)
110
+ self.model = genai.GenerativeModel(self.model_name)
111
 
112
  def __call__(self, document: Document):
113
+ if not self.high_quality or self.model is None:
114
  return
115
 
116
  self.rewrite_blocks(document)
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -33,7 +33,7 @@ tabulate = "^0.9.0"
33
  ftfy = "^6.1.1"
34
  texify = "^0.2.1"
35
  rapidfuzz = "^3.8.1"
36
- surya-ocr = "~0.8.0"
37
  regex = "^2024.4.28"
38
  pdftext = "~0.4.0"
39
  tabled-pdf = "~0.2.0"
 
33
  ftfy = "^6.1.1"
34
  texify = "^0.2.1"
35
  rapidfuzz = "^3.8.1"
36
+ surya-ocr = "~0.8.1"
37
  regex = "^2024.4.28"
38
  pdftext = "~0.4.0"
39
  tabled-pdf = "~0.2.0"