Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

Vik Paruchuri commited on Jun 26

Commit

9cca8c4

1 Parent(s): 14b3a02

Test adjustments

Browse files

Files changed (2) hide show

marker/renderers/markdown.py +12 -12
tests/converters/test_extraction_converter.py +4 -6

marker/renderers/markdown.py CHANGED Viewed

@@ -5,7 +5,7 @@ from typing import Annotated, Tuple
 import regex
 import six
 from bs4 import NavigableString
-from markdownify import MarkdownConverter, whitespace_re
 from marker.logger import get_logger
 from pydantic import BaseModel
@@ -70,7 +70,7 @@ class Markdownify(MarkdownConverter):
         self.inline_math_delimiters = inline_math_delimiters
         self.block_math_delimiters = block_math_delimiters
-    def convert_div(self, el, text, convert_as_inline):
         is_page = el.has_attr("class") and el["class"][0] == "page"
         if self.paginate_output and is_page:
             page_id = el["data-page-id"]
@@ -81,7 +81,7 @@ class Markdownify(MarkdownConverter):
         else:
             return text
-    def convert_p(self, el, text, convert_as_inline):
         hyphens = r"-—¬"
         has_continuation = el.has_attr("class") and "has-continuation" in el["class"]
         if has_continuation:
@@ -96,7 +96,7 @@ class Markdownify(MarkdownConverter):
                 return f"{text}"
         return f"{text}\n\n" if text else ""  # default convert_p behavior
-    def convert_math(self, el, text, convert_as_inline):
         block = el.has_attr("display") and el["display"] == "block"
         if block:
             return (
@@ -117,7 +117,7 @@ class Markdownify(MarkdownConverter):
                 + " "
             )
-    def convert_table(self, el, text, convert_as_inline):
         total_rows = len(el.find_all("tr"))
         colspans = []
         rowspan_cols = defaultdict(int)
@@ -214,30 +214,30 @@ class Markdownify(MarkdownConverter):
         table_md = "\n".join(markdown_lines)
         return "\n\n" + table_md + "\n\n"
-    def convert_a(self, el, text, convert_as_inline):
         text = self.escape(text)
         # Escape brackets and parentheses in text
         text = re.sub(r"([\[\]()])", r"\\\1", text)
-        return super().convert_a(el, text, convert_as_inline)
-    def convert_span(self, el, text, convert_as_inline):
         if el.get("id"):
             return f'<span id="{el["id"]}">{text}</span>'
         else:
             return text
-    def escape(self, text):
-        text = super().escape(text)
         if self.options["escape_dollars"]:
             text = text.replace("$", r"\$")
         return text
-    def process_text(self, el):
         text = six.text_type(el) or ""
         # normalize whitespace if we're not inside a preformatted element
         if not el.find_parent("pre"):
-            text = whitespace_re.sub(" ", text)
         # escape special characters if we're not inside a preformatted or code element
         if not el.find_parent(["pre", "code", "kbd", "samp", "math"]):

 import regex
 import six
 from bs4 import NavigableString
+from markdownify import MarkdownConverter, re_whitespace
 from marker.logger import get_logger
 from pydantic import BaseModel
         self.inline_math_delimiters = inline_math_delimiters
         self.block_math_delimiters = block_math_delimiters
+    def convert_div(self, el, text, parent_tags):
         is_page = el.has_attr("class") and el["class"][0] == "page"
         if self.paginate_output and is_page:
             page_id = el["data-page-id"]
         else:
             return text
+    def convert_p(self, el, text, parent_tags):
         hyphens = r"-—¬"
         has_continuation = el.has_attr("class") and "has-continuation" in el["class"]
         if has_continuation:
                 return f"{text}"
         return f"{text}\n\n" if text else ""  # default convert_p behavior
+    def convert_math(self, el, text, parent_tags):
         block = el.has_attr("display") and el["display"] == "block"
         if block:
             return (
                 + " "
             )
+    def convert_table(self, el, text, parent_tags):
         total_rows = len(el.find_all("tr"))
         colspans = []
         rowspan_cols = defaultdict(int)
         table_md = "\n".join(markdown_lines)
         return "\n\n" + table_md + "\n\n"
+    def convert_a(self, el, text, parent_tags):
         text = self.escape(text)
         # Escape brackets and parentheses in text
         text = re.sub(r"([\[\]()])", r"\\\1", text)
+        return super().convert_a(el, text, parent_tags)
+    def convert_span(self, el, text, parent_tags):
         if el.get("id"):
             return f'<span id="{el["id"]}">{text}</span>'
         else:
             return text
+    def escape(self, text, parent_tags=None):
+        text = super().escape(text, parent_tags)
         if self.options["escape_dollars"]:
             text = text.replace("$", r"\$")
         return text
+    def process_text(self, el, parent_tags=None):
         text = six.text_type(el) or ""
         # normalize whitespace if we're not inside a preformatted element
         if not el.find_parent("pre"):
+            text = re_whitespace.sub(" ", text)
         # escape special characters if we're not inside a preformatted or code element
         if not el.find_parent(["pre", "code", "kbd", "samp", "math"]):

tests/converters/test_extraction_converter.py CHANGED Viewed

@@ -49,19 +49,17 @@ def extraction_converter(config, model_dict, mock_llm_service):
 @pytest.mark.config({"page_range": [0]})
-def test_extraction_converter_invalid_schema(
-    config, model_dict, mock_llm_service, temp_doc
-):
     config["page_schema"] = "invalid json"
     model_dict["llm_service"] = mock_llm_service
     converter = ExtractionConverter(
         artifact_dict=model_dict, processor_list=None, config=config
     )
-    converter.llm_service = mock_llm_service
-    with pytest.raises(ValueError):
-        converter(temp_doc.name)
 @pytest.mark.config({"page_range": [0, 1]})

 @pytest.mark.config({"page_range": [0]})
+def test_extraction_converter(config, model_dict, mock_llm_service, temp_doc):
     config["page_schema"] = "invalid json"
     model_dict["llm_service"] = mock_llm_service
     converter = ExtractionConverter(
         artifact_dict=model_dict, processor_list=None, config=config
     )
+    converter.artifact_dict["llm_service"] = mock_llm_service()
+    results = converter(temp_doc.name)
+    assert results.document_json == '{"test_key": "test_value"}'
 @pytest.mark.config({"page_range": [0, 1]})