Vik Paruchuri
committed on
Commit
·
9cca8c4
1
Parent(s):
14b3a02
Test adjustments
Browse files
marker/renderers/markdown.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing import Annotated, Tuple
|
|
| 5 |
import regex
|
| 6 |
import six
|
| 7 |
from bs4 import NavigableString
|
| 8 |
-
from markdownify import MarkdownConverter,
|
| 9 |
from marker.logger import get_logger
|
| 10 |
from pydantic import BaseModel
|
| 11 |
|
|
@@ -70,7 +70,7 @@ class Markdownify(MarkdownConverter):
|
|
| 70 |
self.inline_math_delimiters = inline_math_delimiters
|
| 71 |
self.block_math_delimiters = block_math_delimiters
|
| 72 |
|
| 73 |
-
def convert_div(self, el, text,
|
| 74 |
is_page = el.has_attr("class") and el["class"][0] == "page"
|
| 75 |
if self.paginate_output and is_page:
|
| 76 |
page_id = el["data-page-id"]
|
|
@@ -81,7 +81,7 @@ class Markdownify(MarkdownConverter):
|
|
| 81 |
else:
|
| 82 |
return text
|
| 83 |
|
| 84 |
-
def convert_p(self, el, text,
|
| 85 |
hyphens = r"-—¬"
|
| 86 |
has_continuation = el.has_attr("class") and "has-continuation" in el["class"]
|
| 87 |
if has_continuation:
|
|
@@ -96,7 +96,7 @@ class Markdownify(MarkdownConverter):
|
|
| 96 |
return f"{text}"
|
| 97 |
return f"{text}\n\n" if text else "" # default convert_p behavior
|
| 98 |
|
| 99 |
-
def convert_math(self, el, text,
|
| 100 |
block = el.has_attr("display") and el["display"] == "block"
|
| 101 |
if block:
|
| 102 |
return (
|
|
@@ -117,7 +117,7 @@ class Markdownify(MarkdownConverter):
|
|
| 117 |
+ " "
|
| 118 |
)
|
| 119 |
|
| 120 |
-
def convert_table(self, el, text,
|
| 121 |
total_rows = len(el.find_all("tr"))
|
| 122 |
colspans = []
|
| 123 |
rowspan_cols = defaultdict(int)
|
|
@@ -214,30 +214,30 @@ class Markdownify(MarkdownConverter):
|
|
| 214 |
table_md = "\n".join(markdown_lines)
|
| 215 |
return "\n\n" + table_md + "\n\n"
|
| 216 |
|
| 217 |
-
def convert_a(self, el, text,
|
| 218 |
text = self.escape(text)
|
| 219 |
# Escape brackets and parentheses in text
|
| 220 |
text = re.sub(r"([\[\]()])", r"\\\1", text)
|
| 221 |
-
return super().convert_a(el, text,
|
| 222 |
|
| 223 |
-
def convert_span(self, el, text,
|
| 224 |
if el.get("id"):
|
| 225 |
return f'<span id="{el["id"]}">{text}</span>'
|
| 226 |
else:
|
| 227 |
return text
|
| 228 |
|
| 229 |
-
def escape(self, text):
|
| 230 |
-
text = super().escape(text)
|
| 231 |
if self.options["escape_dollars"]:
|
| 232 |
text = text.replace("$", r"\$")
|
| 233 |
return text
|
| 234 |
|
| 235 |
-
def process_text(self, el):
|
| 236 |
text = six.text_type(el) or ""
|
| 237 |
|
| 238 |
# normalize whitespace if we're not inside a preformatted element
|
| 239 |
if not el.find_parent("pre"):
|
| 240 |
-
text =
|
| 241 |
|
| 242 |
# escape special characters if we're not inside a preformatted or code element
|
| 243 |
if not el.find_parent(["pre", "code", "kbd", "samp", "math"]):
|
|
|
|
| 5 |
import regex
|
| 6 |
import six
|
| 7 |
from bs4 import NavigableString
|
| 8 |
+
from markdownify import MarkdownConverter, re_whitespace
|
| 9 |
from marker.logger import get_logger
|
| 10 |
from pydantic import BaseModel
|
| 11 |
|
|
|
|
| 70 |
self.inline_math_delimiters = inline_math_delimiters
|
| 71 |
self.block_math_delimiters = block_math_delimiters
|
| 72 |
|
| 73 |
+
def convert_div(self, el, text, parent_tags):
|
| 74 |
is_page = el.has_attr("class") and el["class"][0] == "page"
|
| 75 |
if self.paginate_output and is_page:
|
| 76 |
page_id = el["data-page-id"]
|
|
|
|
| 81 |
else:
|
| 82 |
return text
|
| 83 |
|
| 84 |
+
def convert_p(self, el, text, parent_tags):
|
| 85 |
hyphens = r"-—¬"
|
| 86 |
has_continuation = el.has_attr("class") and "has-continuation" in el["class"]
|
| 87 |
if has_continuation:
|
|
|
|
| 96 |
return f"{text}"
|
| 97 |
return f"{text}\n\n" if text else "" # default convert_p behavior
|
| 98 |
|
| 99 |
+
def convert_math(self, el, text, parent_tags):
|
| 100 |
block = el.has_attr("display") and el["display"] == "block"
|
| 101 |
if block:
|
| 102 |
return (
|
|
|
|
| 117 |
+ " "
|
| 118 |
)
|
| 119 |
|
| 120 |
+
def convert_table(self, el, text, parent_tags):
|
| 121 |
total_rows = len(el.find_all("tr"))
|
| 122 |
colspans = []
|
| 123 |
rowspan_cols = defaultdict(int)
|
|
|
|
| 214 |
table_md = "\n".join(markdown_lines)
|
| 215 |
return "\n\n" + table_md + "\n\n"
|
| 216 |
|
| 217 |
+
def convert_a(self, el, text, parent_tags):
|
| 218 |
text = self.escape(text)
|
| 219 |
# Escape brackets and parentheses in text
|
| 220 |
text = re.sub(r"([\[\]()])", r"\\\1", text)
|
| 221 |
+
return super().convert_a(el, text, parent_tags)
|
| 222 |
|
| 223 |
+
def convert_span(self, el, text, parent_tags):
|
| 224 |
if el.get("id"):
|
| 225 |
return f'<span id="{el["id"]}">{text}</span>'
|
| 226 |
else:
|
| 227 |
return text
|
| 228 |
|
| 229 |
+
def escape(self, text, parent_tags=None):
|
| 230 |
+
text = super().escape(text, parent_tags)
|
| 231 |
if self.options["escape_dollars"]:
|
| 232 |
text = text.replace("$", r"\$")
|
| 233 |
return text
|
| 234 |
|
| 235 |
+
def process_text(self, el, parent_tags=None):
|
| 236 |
text = six.text_type(el) or ""
|
| 237 |
|
| 238 |
# normalize whitespace if we're not inside a preformatted element
|
| 239 |
if not el.find_parent("pre"):
|
| 240 |
+
text = re_whitespace.sub(" ", text)
|
| 241 |
|
| 242 |
# escape special characters if we're not inside a preformatted or code element
|
| 243 |
if not el.find_parent(["pre", "code", "kbd", "samp", "math"]):
|
tests/converters/test_extraction_converter.py
CHANGED
|
@@ -49,19 +49,17 @@ def extraction_converter(config, model_dict, mock_llm_service):
|
|
| 49 |
|
| 50 |
|
| 51 |
@pytest.mark.config({"page_range": [0]})
|
| 52 |
-
def
|
| 53 |
-
config, model_dict, mock_llm_service, temp_doc
|
| 54 |
-
):
|
| 55 |
config["page_schema"] = "invalid json"
|
| 56 |
|
| 57 |
model_dict["llm_service"] = mock_llm_service
|
| 58 |
converter = ExtractionConverter(
|
| 59 |
artifact_dict=model_dict, processor_list=None, config=config
|
| 60 |
)
|
| 61 |
-
converter.llm_service = mock_llm_service
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
|
| 66 |
|
| 67 |
@pytest.mark.config({"page_range": [0, 1]})
|
|
|
|
| 49 |
|
| 50 |
|
| 51 |
@pytest.mark.config({"page_range": [0]})
|
| 52 |
+
def test_extraction_converter(config, model_dict, mock_llm_service, temp_doc):
|
|
|
|
|
|
|
| 53 |
config["page_schema"] = "invalid json"
|
| 54 |
|
| 55 |
model_dict["llm_service"] = mock_llm_service
|
| 56 |
converter = ExtractionConverter(
|
| 57 |
artifact_dict=model_dict, processor_list=None, config=config
|
| 58 |
)
|
| 59 |
+
converter.artifact_dict["llm_service"] = mock_llm_service()
|
| 60 |
|
| 61 |
+
results = converter(temp_doc.name)
|
| 62 |
+
assert results.document_json == '{"test_key": "test_value"}'
|
| 63 |
|
| 64 |
|
| 65 |
@pytest.mark.config({"page_range": [0, 1]})
|