Vik Paruchuri committed on
Commit
9cca8c4
·
1 Parent(s): 14b3a02

Test adjustments

Browse files
marker/renderers/markdown.py CHANGED
@@ -5,7 +5,7 @@ from typing import Annotated, Tuple
5
  import regex
6
  import six
7
  from bs4 import NavigableString
8
- from markdownify import MarkdownConverter, whitespace_re
9
  from marker.logger import get_logger
10
  from pydantic import BaseModel
11
 
@@ -70,7 +70,7 @@ class Markdownify(MarkdownConverter):
70
  self.inline_math_delimiters = inline_math_delimiters
71
  self.block_math_delimiters = block_math_delimiters
72
 
73
- def convert_div(self, el, text, convert_as_inline):
74
  is_page = el.has_attr("class") and el["class"][0] == "page"
75
  if self.paginate_output and is_page:
76
  page_id = el["data-page-id"]
@@ -81,7 +81,7 @@ class Markdownify(MarkdownConverter):
81
  else:
82
  return text
83
 
84
- def convert_p(self, el, text, convert_as_inline):
85
  hyphens = r"-—¬"
86
  has_continuation = el.has_attr("class") and "has-continuation" in el["class"]
87
  if has_continuation:
@@ -96,7 +96,7 @@ class Markdownify(MarkdownConverter):
96
  return f"{text}"
97
  return f"{text}\n\n" if text else "" # default convert_p behavior
98
 
99
- def convert_math(self, el, text, convert_as_inline):
100
  block = el.has_attr("display") and el["display"] == "block"
101
  if block:
102
  return (
@@ -117,7 +117,7 @@ class Markdownify(MarkdownConverter):
117
  + " "
118
  )
119
 
120
- def convert_table(self, el, text, convert_as_inline):
121
  total_rows = len(el.find_all("tr"))
122
  colspans = []
123
  rowspan_cols = defaultdict(int)
@@ -214,30 +214,30 @@ class Markdownify(MarkdownConverter):
214
  table_md = "\n".join(markdown_lines)
215
  return "\n\n" + table_md + "\n\n"
216
 
217
- def convert_a(self, el, text, convert_as_inline):
218
  text = self.escape(text)
219
  # Escape brackets and parentheses in text
220
  text = re.sub(r"([\[\]()])", r"\\\1", text)
221
- return super().convert_a(el, text, convert_as_inline)
222
 
223
- def convert_span(self, el, text, convert_as_inline):
224
  if el.get("id"):
225
  return f'<span id="{el["id"]}">{text}</span>'
226
  else:
227
  return text
228
 
229
- def escape(self, text):
230
- text = super().escape(text)
231
  if self.options["escape_dollars"]:
232
  text = text.replace("$", r"\$")
233
  return text
234
 
235
- def process_text(self, el):
236
  text = six.text_type(el) or ""
237
 
238
  # normalize whitespace if we're not inside a preformatted element
239
  if not el.find_parent("pre"):
240
- text = whitespace_re.sub(" ", text)
241
 
242
  # escape special characters if we're not inside a preformatted or code element
243
  if not el.find_parent(["pre", "code", "kbd", "samp", "math"]):
 
5
  import regex
6
  import six
7
  from bs4 import NavigableString
8
+ from markdownify import MarkdownConverter, re_whitespace
9
  from marker.logger import get_logger
10
  from pydantic import BaseModel
11
 
 
70
  self.inline_math_delimiters = inline_math_delimiters
71
  self.block_math_delimiters = block_math_delimiters
72
 
73
+ def convert_div(self, el, text, parent_tags):
74
  is_page = el.has_attr("class") and el["class"][0] == "page"
75
  if self.paginate_output and is_page:
76
  page_id = el["data-page-id"]
 
81
  else:
82
  return text
83
 
84
+ def convert_p(self, el, text, parent_tags):
85
  hyphens = r"-—¬"
86
  has_continuation = el.has_attr("class") and "has-continuation" in el["class"]
87
  if has_continuation:
 
96
  return f"{text}"
97
  return f"{text}\n\n" if text else "" # default convert_p behavior
98
 
99
+ def convert_math(self, el, text, parent_tags):
100
  block = el.has_attr("display") and el["display"] == "block"
101
  if block:
102
  return (
 
117
  + " "
118
  )
119
 
120
+ def convert_table(self, el, text, parent_tags):
121
  total_rows = len(el.find_all("tr"))
122
  colspans = []
123
  rowspan_cols = defaultdict(int)
 
214
  table_md = "\n".join(markdown_lines)
215
  return "\n\n" + table_md + "\n\n"
216
 
217
+ def convert_a(self, el, text, parent_tags):
218
  text = self.escape(text)
219
  # Escape brackets and parentheses in text
220
  text = re.sub(r"([\[\]()])", r"\\\1", text)
221
+ return super().convert_a(el, text, parent_tags)
222
 
223
+ def convert_span(self, el, text, parent_tags):
224
  if el.get("id"):
225
  return f'<span id="{el["id"]}">{text}</span>'
226
  else:
227
  return text
228
 
229
+ def escape(self, text, parent_tags=None):
230
+ text = super().escape(text, parent_tags)
231
  if self.options["escape_dollars"]:
232
  text = text.replace("$", r"\$")
233
  return text
234
 
235
+ def process_text(self, el, parent_tags=None):
236
  text = six.text_type(el) or ""
237
 
238
  # normalize whitespace if we're not inside a preformatted element
239
  if not el.find_parent("pre"):
240
+ text = re_whitespace.sub(" ", text)
241
 
242
  # escape special characters if we're not inside a preformatted or code element
243
  if not el.find_parent(["pre", "code", "kbd", "samp", "math"]):
tests/converters/test_extraction_converter.py CHANGED
@@ -49,19 +49,17 @@ def extraction_converter(config, model_dict, mock_llm_service):
49
 
50
 
51
  @pytest.mark.config({"page_range": [0]})
52
- def test_extraction_converter_invalid_schema(
53
- config, model_dict, mock_llm_service, temp_doc
54
- ):
55
  config["page_schema"] = "invalid json"
56
 
57
  model_dict["llm_service"] = mock_llm_service
58
  converter = ExtractionConverter(
59
  artifact_dict=model_dict, processor_list=None, config=config
60
  )
61
- converter.llm_service = mock_llm_service
62
 
63
- with pytest.raises(ValueError):
64
- converter(temp_doc.name)
65
 
66
 
67
  @pytest.mark.config({"page_range": [0, 1]})
 
49
 
50
 
51
  @pytest.mark.config({"page_range": [0]})
52
+ def test_extraction_converter(config, model_dict, mock_llm_service, temp_doc):
 
 
53
  config["page_schema"] = "invalid json"
54
 
55
  model_dict["llm_service"] = mock_llm_service
56
  converter = ExtractionConverter(
57
  artifact_dict=model_dict, processor_list=None, config=config
58
  )
59
+ converter.artifact_dict["llm_service"] = mock_llm_service()
60
 
61
+ results = converter(temp_doc.name)
62
+ assert results.document_json == '{"test_key": "test_value"}'
63
 
64
 
65
  @pytest.mark.config({"page_range": [0, 1]})