Moses Paul R
committed on
Commit
·
6df89e5
1
Parent(s):
2798e13
integrate changes and increment surya version
Browse files
- convert_single.py +1 -1
- marker/builders/high_quality_layout.py +6 -6
- marker/config/parser.py +4 -0
- marker/converters/pdf.py +6 -1
- marker/processors/high_quality_text.py +11 -4
- poetry.lock +0 -0
- pyproject.toml +1 -1
convert_single.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
-
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
| 4 |
|
| 5 |
import time
|
| 6 |
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
+
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
|
| 4 |
|
| 5 |
import time
|
| 6 |
|
marker/builders/high_quality_layout.py
CHANGED
|
@@ -57,16 +57,16 @@ Here are the top k predictions from the model followed by the image:
|
|
| 57 |
def __init__(self, layout_model: SuryaLayoutModel, config=None):
|
| 58 |
super().__init__(layout_model, config)
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
def __call__(self, document: Document, provider: PdfProvider):
|
| 65 |
super().__call__(document, provider)
|
| 66 |
|
| 67 |
-
if self.model is None:
|
| 68 |
-
return
|
| 69 |
-
|
| 70 |
self.relabel_blocks(document)
|
| 71 |
|
| 72 |
def relabel_blocks(self, document: Document):
|
|
|
|
| 57 |
def __init__(self, layout_model: SuryaLayoutModel, config=None):
|
| 58 |
super().__init__(layout_model, config)
|
| 59 |
|
| 60 |
+
self.model = None
|
| 61 |
+
if self.google_api_key is None:
|
| 62 |
+
raise ValueError("Google API key is not set")
|
| 63 |
+
|
| 64 |
+
genai.configure(api_key=self.google_api_key)
|
| 65 |
+
self.model = genai.GenerativeModel(self.model_name)
|
| 66 |
|
| 67 |
def __call__(self, document: Document, provider: PdfProvider):
|
| 68 |
super().__call__(document, provider)
|
| 69 |
|
|
|
|
|
|
|
|
|
|
| 70 |
self.relabel_blocks(document)
|
| 71 |
|
| 72 |
def relabel_blocks(self, document: Document):
|
marker/config/parser.py
CHANGED
|
@@ -34,6 +34,7 @@ class ConfigParser:
|
|
| 34 |
fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
|
| 35 |
fn = click.option("--paginate_output", is_flag=True, default=False, help="Paginate output.")(fn)
|
| 36 |
fn = click.option("--disable_image_extraction", is_flag=True, default=False, help="Disable image extraction.")(fn)
|
|
|
|
| 37 |
return fn
|
| 38 |
|
| 39 |
def generate_config_dict(self) -> Dict[str, any]:
|
|
@@ -69,6 +70,9 @@ class ConfigParser:
|
|
| 69 |
case "disable_image_extraction":
|
| 70 |
if v:
|
| 71 |
config["extract_images"] = False
|
|
|
|
|
|
|
|
|
|
| 72 |
return config
|
| 73 |
|
| 74 |
def get_renderer(self):
|
|
|
|
| 34 |
fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
|
| 35 |
fn = click.option("--paginate_output", is_flag=True, default=False, help="Paginate output.")(fn)
|
| 36 |
fn = click.option("--disable_image_extraction", is_flag=True, default=False, help="Disable image extraction.")(fn)
|
| 37 |
+
fn = click.option("--high_quality", is_flag=True, default=False, help="Enable high quality processing with Gemini.")(fn)
|
| 38 |
return fn
|
| 39 |
|
| 40 |
def generate_config_dict(self) -> Dict[str, any]:
|
|
|
|
| 70 |
case "disable_image_extraction":
|
| 71 |
if v:
|
| 72 |
config["extract_images"] = False
|
| 73 |
+
case "high_quality":
|
| 74 |
+
if v:
|
| 75 |
+
config["high_quality"] = True
|
| 76 |
return config
|
| 77 |
|
| 78 |
def get_renderer(self):
|
marker/converters/pdf.py
CHANGED
|
@@ -45,6 +45,7 @@ class PdfConverter(BaseConverter):
|
|
| 45 |
instead of the defaults.
|
| 46 |
"""
|
| 47 |
override_map: Dict[BlockTypes, Type[Block]] = defaultdict()
|
|
|
|
| 48 |
|
| 49 |
def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | None = None, renderer: str | None = None, config=None):
|
| 50 |
super().__init__(config)
|
|
@@ -81,6 +82,10 @@ class PdfConverter(BaseConverter):
|
|
| 81 |
self.processor_list = processor_list
|
| 82 |
self.renderer = renderer
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
def resolve_dependencies(self, cls):
|
| 85 |
init_signature = inspect.signature(cls.__init__)
|
| 86 |
parameters = init_signature.parameters
|
|
@@ -102,7 +107,7 @@ class PdfConverter(BaseConverter):
|
|
| 102 |
|
| 103 |
def __call__(self, filepath: str):
|
| 104 |
pdf_provider = PdfProvider(filepath, self.config)
|
| 105 |
-
layout_builder = self.resolve_dependencies(LayoutBuilder)
|
| 106 |
ocr_builder = self.resolve_dependencies(OcrBuilder)
|
| 107 |
document = DocumentBuilder(self.config)(pdf_provider, layout_builder, ocr_builder)
|
| 108 |
StructureBuilder(self.config)(document)
|
|
|
|
| 45 |
instead of the defaults.
|
| 46 |
"""
|
| 47 |
override_map: Dict[BlockTypes, Type[Block]] = defaultdict()
|
| 48 |
+
high_quality: bool = False
|
| 49 |
|
| 50 |
def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | None = None, renderer: str | None = None, config=None):
|
| 51 |
super().__init__(config)
|
|
|
|
| 82 |
self.processor_list = processor_list
|
| 83 |
self.renderer = renderer
|
| 84 |
|
| 85 |
+
self.layout_builder_class = LayoutBuilder
|
| 86 |
+
if self.high_quality:
|
| 87 |
+
self.layout_builder_class = HighQualityLayoutBuilder
|
| 88 |
+
|
| 89 |
def resolve_dependencies(self, cls):
|
| 90 |
init_signature = inspect.signature(cls.__init__)
|
| 91 |
parameters = init_signature.parameters
|
|
|
|
| 107 |
|
| 108 |
def __call__(self, filepath: str):
|
| 109 |
pdf_provider = PdfProvider(filepath, self.config)
|
| 110 |
+
layout_builder = self.resolve_dependencies(self.layout_builder_class)
|
| 111 |
ocr_builder = self.resolve_dependencies(OcrBuilder)
|
| 112 |
document = DocumentBuilder(self.config)(pdf_provider, layout_builder, ocr_builder)
|
| 113 |
StructureBuilder(self.config)(document)
|
marker/processors/high_quality_text.py
CHANGED
|
@@ -40,6 +40,7 @@ class HighQualityTextProcessor(BaseProcessor):
|
|
| 40 |
google_api_key: Optional[str] = settings.GOOGLE_API_KEY
|
| 41 |
confidence_threshold: float = 0.7
|
| 42 |
model_name: str = "gemini-1.5-flash"
|
|
|
|
| 43 |
|
| 44 |
gemini_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
|
| 45 |
You will receive an image of a text block and a set of extracted lines corresponding to the text in the image.
|
|
@@ -98,12 +99,18 @@ Output:
|
|
| 98 |
def __init__(self, config=None):
|
| 99 |
super().__init__(config)
|
| 100 |
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
def __call__(self, document: Document):
|
| 106 |
-
if self.model is None:
|
| 107 |
return
|
| 108 |
|
| 109 |
self.rewrite_blocks(document)
|
|
|
|
| 40 |
google_api_key: Optional[str] = settings.GOOGLE_API_KEY
|
| 41 |
confidence_threshold: float = 0.7
|
| 42 |
model_name: str = "gemini-1.5-flash"
|
| 43 |
+
high_quality: bool = False
|
| 44 |
|
| 45 |
gemini_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
|
| 46 |
You will receive an image of a text block and a set of extracted lines corresponding to the text in the image.
|
|
|
|
| 99 |
def __init__(self, config=None):
|
| 100 |
super().__init__(config)
|
| 101 |
|
| 102 |
+
self.model = None
|
| 103 |
+
if not self.high_quality:
|
| 104 |
+
return
|
| 105 |
+
|
| 106 |
+
if self.google_api_key is None:
|
| 107 |
+
raise ValueError("Google API key is not set")
|
| 108 |
+
|
| 109 |
+
genai.configure(api_key=self.google_api_key)
|
| 110 |
+
self.model = genai.GenerativeModel(self.model_name)
|
| 111 |
|
| 112 |
def __call__(self, document: Document):
|
| 113 |
+
if not self.high_quality or self.model is None:
|
| 114 |
return
|
| 115 |
|
| 116 |
self.rewrite_blocks(document)
|
poetry.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
CHANGED
|
@@ -33,7 +33,7 @@ tabulate = "^0.9.0"
|
|
| 33 |
ftfy = "^6.1.1"
|
| 34 |
texify = "^0.2.1"
|
| 35 |
rapidfuzz = "^3.8.1"
|
| 36 |
-
surya-ocr = "~0.8.0"
|
| 37 |
regex = "^2024.4.28"
|
| 38 |
pdftext = "~0.4.0"
|
| 39 |
tabled-pdf = "~0.2.0"
|
|
|
|
| 33 |
ftfy = "^6.1.1"
|
| 34 |
texify = "^0.2.1"
|
| 35 |
rapidfuzz = "^3.8.1"
|
| 36 |
+
surya-ocr = "~0.8.1"
|
| 37 |
regex = "^2024.4.28"
|
| 38 |
pdftext = "~0.4.0"
|
| 39 |
tabled-pdf = "~0.2.0"
|