Tarun Menta committed on
Commit
8c63b8f
·
2 Parent(s): 15ab8fd 8f95248

Merge pull request #631 from VikParuchuri/tarun-dev

Browse files
marker/processors/table.py CHANGED
@@ -1,4 +1,5 @@
1
  import re
 
2
  from collections import defaultdict
3
  from copy import deepcopy
4
  from typing import Annotated, List
@@ -158,7 +159,8 @@ class TableProcessor(BaseProcessor):
158
  continue
159
  text = re.sub(r"(\s\.){2,}", "", text) # Replace . . .
160
  text = re.sub(r"\.{2,}", "", text) # Replace ..., like in table of contents
161
- fixed_text.append(self.normalize_spaces(fix_text(text)))
 
162
  return fixed_text
163
 
164
  @staticmethod
 
1
  import re
2
+ import html
3
  from collections import defaultdict
4
  from copy import deepcopy
5
  from typing import Annotated, List
 
159
  continue
160
  text = re.sub(r"(\s\.){2,}", "", text) # Replace . . .
161
  text = re.sub(r"\.{2,}", "", text) # Replace ..., like in table of contents
162
+ text = self.normalize_spaces(fix_text(text))
163
+ fixed_text.append(html.escape(text))
164
  return fixed_text
165
 
166
  @staticmethod
marker/providers/pdf.py CHANGED
@@ -247,7 +247,10 @@ class PdfProvider(BaseProvider):
247
  )
248
  if self.check_line_spans(lines):
249
  page_lines[page_id] = lines
250
- self.page_refs[page_id] = page["refs"]
 
 
 
251
 
252
  return page_lines
253
 
 
247
  )
248
  if self.check_line_spans(lines):
249
  page_lines[page_id] = lines
250
+
251
+ self.page_refs[page_id] = []
252
+ if page_refs:= page.get('refs', None):
253
+ self.page_refs[page_id] = page_refs
254
 
255
  return page_lines
256
 
marker/services/openai.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import json
3
+ import time
4
+ from io import BytesIO
5
+ from typing import Annotated, List, Union
6
+
7
+ import openai
8
+ import PIL
9
+ from openai import APITimeoutError, RateLimitError
10
+ from PIL import Image
11
+ from pydantic import BaseModel
12
+
13
+ from marker.schema.blocks import Block
14
+ from marker.services import BaseService
15
+
16
+
17
class OpenAIService(BaseService):
    """LLM service that sends a prompt plus images to an OpenAI-style chat API.

    Images are encoded as base64 WEBP data URLs and sent in a single user
    message followed by the text prompt. The response is requested in the
    shape of a pydantic schema and returned as a plain ``dict``.

    NOTE(review): ``self.max_retries`` and ``self.timeout`` are read but not
    defined here -- presumably provided by ``BaseService``; confirm.
    """

    openai_base_url: Annotated[
        str,
        "The base url to use for OpenAI-like models. No trailing slash."
    ] = "https://api.openai.com/v1"
    openai_model: Annotated[
        str,
        "The model name to use for OpenAI-like model."
    ] = "gpt-4o-mini"
    openai_api_key: Annotated[
        str,
        "The API key to use for the OpenAI-like service."
    ] = None

    def image_to_base64(self, image: PIL.Image.Image):
        """Return ``image`` serialized as WEBP and base64-encoded (UTF-8 str)."""
        image_bytes = BytesIO()
        image.save(image_bytes, format="WEBP")
        return base64.b64encode(image_bytes.getvalue()).decode("utf-8")

    def prepare_images(
        self, images: Union[Image.Image, List[Image.Image]]
    ) -> List[dict]:
        """Build one ``image_url`` content part per image.

        A single image is accepted and treated as a one-element list.
        """
        if isinstance(images, Image.Image):
            images = [images]

        return [
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/webp;base64,{self.image_to_base64(img)}",
                },
            }
            for img in images
        ]

    def __call__(
        self,
        prompt: str,
        image: PIL.Image.Image | List[PIL.Image.Image],
        block: Block,
        response_schema: type[BaseModel],
        max_retries: int | None = None,
        timeout: int | None = None,
    ):
        """Query the model and return the parsed JSON response.

        Args:
            prompt: Text instruction, sent after the images.
            image: One image or a list of images to attach.
            block: Block whose metadata is updated with token usage and a
                request count of 1 on success.
            response_schema: Pydantic model passed as ``response_format``.
            max_retries: Maximum number of attempts; defaults to
                ``self.max_retries``.
            timeout: Per-request timeout; defaults to ``self.timeout``.

        Returns:
            The decoded JSON object on success, or ``{}`` if every attempt hit
            a timeout / rate limit or any other exception occurred.
        """
        if max_retries is None:
            max_retries = self.max_retries

        if timeout is None:
            timeout = self.timeout

        if not isinstance(image, list):
            image = [image]

        client = self.get_client()
        image_data = self.prepare_images(image)

        messages = [
            {
                "role": "user",
                "content": [
                    *image_data,
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        tries = 0
        while tries < max_retries:
            try:
                response = client.beta.chat.completions.parse(
                    extra_headers={
                        "X-Title": "Marker",
                        "HTTP-Referer": "https://github.com/VikParuchuri/marker",
                    },
                    model=self.openai_model,
                    messages=messages,
                    timeout=timeout,
                    response_format=response_schema,
                )
                response_text = response.choices[0].message.content
                # ``usage`` is optional in the SDK response type and may be
                # missing on OpenAI-compatible backends; a missing value must
                # not cause a valid completion to be thrown away.
                usage = response.usage
                total_tokens = usage.total_tokens if usage is not None else 0
                block.update_metadata(llm_tokens_used=total_tokens, llm_request_count=1)
                return json.loads(response_text)
            except (APITimeoutError, RateLimitError) as e:
                # Transient failure (timeout or rate limit): back off linearly.
                tries += 1
                if tries >= max_retries:
                    # No attempt left: do not sleep or announce a retry.
                    print(
                        f"Rate limit or timeout error: {e}. Giving up after {tries} attempt(s)."
                    )
                    break
                wait_time = tries * 3
                print(
                    f"Rate limit or timeout error: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{max_retries})"
                )
                time.sleep(wait_time)
            except Exception as e:
                # Deliberate best-effort: any other failure is reported and
                # the caller receives an empty result instead of an exception.
                print(e)
                break

        return {}

    def get_client(self) -> openai.OpenAI:
        """Create an OpenAI client from the configured API key and base URL."""
        return openai.OpenAI(api_key=self.openai_api_key, base_url=self.openai_base_url)
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -43,6 +43,7 @@ openpyxl = {version = "^3.1.5", optional = true}
43
  python-pptx = {version = "^1.0.2", optional = true}
44
  ebooklib = {version = "^0.18", optional = true}
45
  weasyprint = {version = "^63.1", optional = true}
 
46
 
47
  [tool.poetry.group.dev.dependencies]
48
  jupyter = "^1.0.0"
 
43
  python-pptx = {version = "^1.0.2", optional = true}
44
  ebooklib = {version = "^0.18", optional = true}
45
  weasyprint = {version = "^63.1", optional = true}
46
+ openai = "^1.65.2"
47
 
48
  [tool.poetry.group.dev.dependencies]
49
  jupyter = "^1.0.0"
signatures/version1/cla.json CHANGED
@@ -183,6 +183,14 @@
183
  "created_at": "2025-02-16T23:02:34Z",
184
  "repoId": 712111618,
185
  "pullRequestNo": 555
 
 
 
 
 
 
 
 
186
  }
187
  ]
188
  }
 
183
  "created_at": "2025-02-16T23:02:34Z",
184
  "repoId": 712111618,
185
  "pullRequestNo": 555
186
+ },
187
+ {
188
+ "name": "vicenciomf2",
189
+ "id": 127889973,
190
+ "comment_id": 2676007412,
191
+ "created_at": "2025-02-22T04:34:27Z",
192
+ "repoId": 712111618,
193
+ "pullRequestNo": 574
194
  }
195
  ]
196
  }