Vik Paruchuri
committed on
Commit
·
78f3a66
1
Parent(s):
66acddd
Add image provider and tests
Browse files- README.md +26 -5
- marker/converters/pdf.py +3 -1
- marker/converters/table.py +3 -1
- marker/providers/__init__.py +2 -1
- marker/providers/image.py +45 -0
- marker/providers/registry.py +12 -0
- poetry.lock +76 -90
- pyproject.toml +2 -0
- tests/conftest.py +12 -0
- tests/providers/test_image_provider.py +17 -0
README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
# Marker
|
| 2 |
|
| 3 |
-
Marker converts PDFs to markdown, JSON, and HTML quickly and accurately.
|
| 4 |
|
| 5 |
- Supports a wide range of documents
|
| 6 |
- Supports all languages
|
|
@@ -63,11 +63,11 @@ There's a hosted API for marker available [here](https://www.datalab.to/):
|
|
| 63 |
PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address:
|
| 64 |
|
| 65 |
- Marker will only convert block equations
|
| 66 |
-
- Tables are not always formatted 100% correctly
|
| 67 |
- Forms are not converted optimally
|
| 68 |
- Very complex layouts, with nested tables and forms, may not work
|
| 69 |
|
| 70 |
-
Note: Passing the `--use_llm` flag will mostly solve
|
| 71 |
|
| 72 |
# Installation
|
| 73 |
|
|
@@ -84,7 +84,7 @@ pip install marker-pdf
|
|
| 84 |
First, some configuration:
|
| 85 |
|
| 86 |
- Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
|
| 87 |
-
- Some PDFs, even digital ones, have bad text in them. Set the `force_ocr` flag on the CLI or via configuration to ensure your PDF runs through OCR.
|
| 88 |
|
| 89 |
## Interactive App
|
| 90 |
|
|
@@ -101,6 +101,8 @@ marker_gui
|
|
| 101 |
marker_single /path/to/file.pdf
|
| 102 |
```
|
| 103 |
|
|
|
|
|
|
|
| 104 |
Options:
|
| 105 |
- `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
|
| 106 |
- `--output_format [markdown|json|html]`: Specify the format for the output results.
|
|
@@ -115,6 +117,7 @@ Options:
|
|
| 115 |
- `--config_json PATH`: Path to a JSON configuration file containing additional settings.
|
| 116 |
- `--languages TEXT`: Optionally specify which languages to use for OCR processing. Accepts a comma-separated list. Example: `--languages "en,fr,de"` for English, French, and German.
|
| 117 |
- `config --help`: List all available builders, processors, and converters, and their associated configuration. These values can be used to build a JSON configuration file for additional tweaking of marker defaults.
|
|
|
|
| 118 |
|
| 119 |
The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py). If you don't need OCR, marker can work with any language.
|
| 120 |
|
|
@@ -180,7 +183,7 @@ rendered = converter("FILEPATH")
|
|
| 180 |
|
| 181 |
### Extract blocks
|
| 182 |
|
| 183 |
-
Each document consists of one or more pages. Pages contain blocks, which can themselves contain other blocks. It's possible to
|
| 184 |
|
| 185 |
Here's an example of extracting all forms from a document:
|
| 186 |
|
|
@@ -198,6 +201,24 @@ forms = document.contained_blocks((BlockTypes.Form,))
|
|
| 198 |
|
| 199 |
Look at the processors for more examples of extracting and manipulating blocks.
|
| 200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
# Output Formats
|
| 202 |
|
| 203 |
## Markdown
|
|
|
|
| 1 |
# Marker
|
| 2 |
|
| 3 |
+
Marker converts PDFs and images to markdown, JSON, and HTML quickly and accurately.
|
| 4 |
|
| 5 |
- Supports a wide range of documents
|
| 6 |
- Supports all languages
|
|
|
|
| 63 |
PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address:
|
| 64 |
|
| 65 |
- Marker will only convert block equations
|
| 66 |
+
- Tables are not always formatted 100% correctly
|
| 67 |
- Forms are not converted optimally
|
| 68 |
- Very complex layouts, with nested tables and forms, may not work
|
| 69 |
|
| 70 |
+
Note: Passing the `--use_llm` flag will mostly solve these issues.
|
| 71 |
|
| 72 |
# Installation
|
| 73 |
|
|
|
|
| 84 |
First, some configuration:
|
| 85 |
|
| 86 |
- Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
|
| 87 |
+
- Some PDFs, even digital ones, have bad text in them. Set the `force_ocr` flag on the CLI or via configuration to ensure your PDF runs through OCR, or set the `strip_existing_ocr` flag to keep all digital text and strip out only any existing OCR text.
|
| 88 |
|
| 89 |
## Interactive App
|
| 90 |
|
|
|
|
| 101 |
marker_single /path/to/file.pdf
|
| 102 |
```
|
| 103 |
|
| 104 |
+
You can pass in PDFs or images.
|
| 105 |
+
|
| 106 |
Options:
|
| 107 |
- `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
|
| 108 |
- `--output_format [markdown|json|html]`: Specify the format for the output results.
|
|
|
|
| 117 |
- `--config_json PATH`: Path to a JSON configuration file containing additional settings.
|
| 118 |
- `--languages TEXT`: Optionally specify which languages to use for OCR processing. Accepts a comma-separated list. Example: `--languages "en,fr,de"` for English, French, and German.
|
| 119 |
- `config --help`: List all available builders, processors, and converters, and their associated configuration. These values can be used to build a JSON configuration file for additional tweaking of marker defaults.
|
| 120 |
+
- `--converter_cls`: One of `marker.converters.pdf.PdfConverter` (default) or `marker.converters.table.TableConverter`. The `PdfConverter` will convert the whole PDF; the `TableConverter` will only extract and convert tables.
|
| 121 |
|
| 122 |
The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py). If you don't need OCR, marker can work with any language.
|
| 123 |
|
|
|
|
| 183 |
|
| 184 |
### Extract blocks
|
| 185 |
|
| 186 |
+
Each document consists of one or more pages. Pages contain blocks, which can themselves contain other blocks. It's possible to programmatically manipulate these blocks.
|
| 187 |
|
| 188 |
Here's an example of extracting all forms from a document:
|
| 189 |
|
|
|
|
| 201 |
|
| 202 |
Look at the processors for more examples of extracting and manipulating blocks.
|
| 203 |
|
| 204 |
+
### Custom converters
|
| 205 |
+
|
| 206 |
+
You can also use custom converters to define your own conversion pipelines. For example, the `TableConverter` will only extract tables:
|
| 207 |
+
|
| 208 |
+
```python
|
| 209 |
+
from marker.converters.table import TableConverter
|
| 210 |
+
from marker.models import create_model_dict
|
| 211 |
+
from marker.output import text_from_rendered
|
| 212 |
+
|
| 213 |
+
converter = TableConverter(
|
| 214 |
+
artifact_dict=create_model_dict(),
|
| 215 |
+
)
|
| 216 |
+
rendered = converter("FILEPATH")
|
| 217 |
+
text, _, images = text_from_rendered(rendered)
|
| 218 |
+
```
|
| 219 |
+
|
| 220 |
+
This takes all the same configuration as the PdfConverter.
|
| 221 |
+
|
| 222 |
# Output Formats
|
| 223 |
|
| 224 |
## Markdown
|
marker/converters/pdf.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
from marker.processors import BaseProcessor
|
|
|
|
| 4 |
|
| 5 |
os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
|
| 6 |
|
|
@@ -120,7 +121,8 @@ class PdfConverter(BaseConverter):
|
|
| 120 |
return cls(**resolved_kwargs)
|
| 121 |
|
| 122 |
def build_document(self, filepath: str):
|
| 123 |
-
|
|
|
|
| 124 |
layout_builder = self.resolve_dependencies(self.layout_builder_class)
|
| 125 |
ocr_builder = self.resolve_dependencies(OcrBuilder)
|
| 126 |
document = DocumentBuilder(self.config)(pdf_provider, layout_builder, ocr_builder)
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
from marker.processors import BaseProcessor
|
| 4 |
+
from marker.providers.registry import provider_from_filepath
|
| 5 |
|
| 6 |
os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
|
| 7 |
|
|
|
|
| 121 |
return cls(**resolved_kwargs)
|
| 122 |
|
| 123 |
def build_document(self, filepath: str):
|
| 124 |
+
provider_cls = provider_from_filepath(filepath)
|
| 125 |
+
pdf_provider = provider_cls(filepath, self.config)
|
| 126 |
layout_builder = self.resolve_dependencies(self.layout_builder_class)
|
| 127 |
ocr_builder = self.resolve_dependencies(OcrBuilder)
|
| 128 |
document = DocumentBuilder(self.config)(pdf_provider, layout_builder, ocr_builder)
|
marker/converters/table.py
CHANGED
|
@@ -9,6 +9,7 @@ from marker.processors.llm.llm_form import LLMFormProcessor
|
|
| 9 |
from marker.processors.llm.llm_table import LLMTableProcessor
|
| 10 |
from marker.processors.table import TableProcessor
|
| 11 |
from marker.providers.pdf import PdfProvider
|
|
|
|
| 12 |
from marker.schema import BlockTypes
|
| 13 |
|
| 14 |
|
|
@@ -22,7 +23,8 @@ class TableConverter(PdfConverter):
|
|
| 22 |
converter_block_types: List[BlockTypes] = (BlockTypes.Table, BlockTypes.Form, BlockTypes.TableOfContents)
|
| 23 |
|
| 24 |
def build_document(self, filepath: str):
|
| 25 |
-
|
|
|
|
| 26 |
layout_builder = self.resolve_dependencies(self.layout_builder_class)
|
| 27 |
ocr_builder = self.resolve_dependencies(OcrBuilder)
|
| 28 |
document_builder = DocumentBuilder(self.config)
|
|
|
|
| 9 |
from marker.processors.llm.llm_table import LLMTableProcessor
|
| 10 |
from marker.processors.table import TableProcessor
|
| 11 |
from marker.providers.pdf import PdfProvider
|
| 12 |
+
from marker.providers.registry import provider_from_filepath
|
| 13 |
from marker.schema import BlockTypes
|
| 14 |
|
| 15 |
|
|
|
|
| 23 |
converter_block_types: List[BlockTypes] = (BlockTypes.Table, BlockTypes.Form, BlockTypes.TableOfContents)
|
| 24 |
|
| 25 |
def build_document(self, filepath: str):
|
| 26 |
+
provider_cls = provider_from_filepath(filepath)
|
| 27 |
+
pdf_provider = provider_cls(filepath, self.config)
|
| 28 |
layout_builder = self.resolve_dependencies(self.layout_builder_class)
|
| 29 |
ocr_builder = self.resolve_dependencies(OcrBuilder)
|
| 30 |
document_builder = DocumentBuilder(self.config)
|
marker/providers/__init__.py
CHANGED
|
@@ -3,6 +3,7 @@ from typing import List, Optional, Dict
|
|
| 3 |
from PIL import Image
|
| 4 |
from pydantic import BaseModel
|
| 5 |
|
|
|
|
| 6 |
from marker.schema.text import Span
|
| 7 |
from marker.schema.text.line import Line
|
| 8 |
from marker.util import assign_config
|
|
@@ -29,7 +30,7 @@ class BaseProvider:
|
|
| 29 |
def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
|
| 30 |
pass
|
| 31 |
|
| 32 |
-
def get_page_bbox(self, idx: int) ->
|
| 33 |
pass
|
| 34 |
|
| 35 |
def get_page_lines(self, idx: int) -> List[Line]:
|
|
|
|
| 3 |
from PIL import Image
|
| 4 |
from pydantic import BaseModel
|
| 5 |
|
| 6 |
+
from marker.schema.polygon import PolygonBox
|
| 7 |
from marker.schema.text import Span
|
| 8 |
from marker.schema.text.line import Line
|
| 9 |
from marker.util import assign_config
|
|
|
|
| 30 |
def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
|
| 31 |
pass
|
| 32 |
|
| 33 |
+
def get_page_bbox(self, idx: int) -> PolygonBox | None:
|
| 34 |
pass
|
| 35 |
|
| 36 |
def get_page_lines(self, idx: int) -> List[Line]:
|
marker/providers/image.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Annotated, Optional
|
| 2 |
+
from PIL import Image
|
| 3 |
+
|
| 4 |
+
from marker.providers import ProviderPageLines, BaseProvider
|
| 5 |
+
from marker.schema.polygon import PolygonBox
|
| 6 |
+
from marker.schema.text import Line
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class ImageProvider(BaseProvider):
    """Provider that exposes a single image file as a one-page document.

    The image at ``filepath`` is opened with PIL and stored as page 0.
    Every page starts with an empty list of lines, and each page's
    bounding box is derived from the pixel size of its image.
    """

    page_range: Annotated[
        Optional[List[int]],
        "The range of pages to process.",
        "Default is None, which will process all pages."
    ] = None

    # Number of pages this provider reports; one image file is one page.
    image_count: int = 1

    def __init__(self, filepath: str, config=None):
        """Open the image and build the per-page line and bbox tables.

        Args:
            filepath: Path of the image file to open.
            config: Optional configuration forwarded to ``BaseProvider``.

        Raises:
            AssertionError: If ``page_range`` contains an index outside
                ``0 .. image_count - 1``.
        """
        super().__init__(filepath, config)

        self.images = [Image.open(filepath)]
        self.page_lines: ProviderPageLines = {i: [] for i in range(self.image_count)}

        if self.page_range is None:
            self.page_range = range(self.image_count)

        # The upper bound is reported from image_count: this class never sets a
        # `doc` attribute, so formatting the message with `len(self.doc)` would
        # fail with AttributeError and hide the real validation error.
        assert max(self.page_range) < self.image_count and min(self.page_range) >= 0, \
            f"Invalid page range, values must be between 0 and {self.image_count - 1}. Min of provided page range is {min(self.page_range)} and max is {max(self.page_range)}."

        # Boxes are [x0, y0, x1, y1] in pixels, anchored at the top-left origin.
        self.page_bboxes = {i: [0, 0, self.images[i].size[0], self.images[i].size[1]] for i in self.page_range}

    def __len__(self):
        """Return the number of pages (images) held by this provider."""
        return self.image_count

    def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
        """Return the stored images for the requested page indices.

        ``dpi`` is accepted for interface compatibility but is not used;
        the images are returned as loaded.
        """
        return [self.images[i] for i in idxs]

    def get_page_bbox(self, idx: int) -> PolygonBox | None:
        """Return the page's bounding box as a ``PolygonBox``.

        Returns ``None`` when the stored bbox is falsy.
        """
        bbox = self.page_bboxes[idx]
        if bbox:
            return PolygonBox.from_bbox(bbox)
        return None

    def get_page_lines(self, idx: int) -> List[Line]:
        """Return the lines recorded for page ``idx`` (empty unless populated later)."""
        return self.page_lines[idx]
|
marker/providers/registry.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import filetype
|
| 2 |
+
|
| 3 |
+
from marker.providers.image import ImageProvider
|
| 4 |
+
from marker.providers.pdf import PdfProvider
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def provider_from_filepath(filepath: str):
    """Pick the provider class to use for the file at ``filepath``.

    Returns ``ImageProvider`` when ``filetype.image_match`` recognises the
    file as an image, and ``PdfProvider`` for everything else.
    """
    is_image = filetype.image_match(filepath) is not None
    return ImageProvider if is_image else PdfProvider
|
poetry.lock
CHANGED
|
@@ -4194,41 +4194,41 @@ torch = ["safetensors[numpy]", "torch (>=1.10)"]
|
|
| 4194 |
|
| 4195 |
[[package]]
|
| 4196 |
name = "scikit-learn"
|
| 4197 |
-
version = "1.6.
|
| 4198 |
description = "A set of python modules for machine learning and data mining"
|
| 4199 |
optional = false
|
| 4200 |
python-versions = ">=3.9"
|
| 4201 |
files = [
|
| 4202 |
-
{file = "scikit_learn-1.6.
|
| 4203 |
-
{file = "scikit_learn-1.6.
|
| 4204 |
-
{file = "scikit_learn-1.6.
|
| 4205 |
-
{file = "scikit_learn-1.6.
|
| 4206 |
-
{file = "scikit_learn-1.6.
|
| 4207 |
-
{file = "scikit_learn-1.6.
|
| 4208 |
-
{file = "scikit_learn-1.6.
|
| 4209 |
-
{file = "scikit_learn-1.6.
|
| 4210 |
-
{file = "scikit_learn-1.6.
|
| 4211 |
-
{file = "scikit_learn-1.6.
|
| 4212 |
-
{file = "scikit_learn-1.6.
|
| 4213 |
-
{file = "scikit_learn-1.6.
|
| 4214 |
-
{file = "scikit_learn-1.6.
|
| 4215 |
-
{file = "scikit_learn-1.6.
|
| 4216 |
-
{file = "scikit_learn-1.6.
|
| 4217 |
-
{file = "scikit_learn-1.6.
|
| 4218 |
-
{file = "scikit_learn-1.6.
|
| 4219 |
-
{file = "scikit_learn-1.6.
|
| 4220 |
-
{file = "scikit_learn-1.6.
|
| 4221 |
-
{file = "scikit_learn-1.6.
|
| 4222 |
-
{file = "scikit_learn-1.6.
|
| 4223 |
-
{file = "scikit_learn-1.6.
|
| 4224 |
-
{file = "scikit_learn-1.6.
|
| 4225 |
-
{file = "scikit_learn-1.6.
|
| 4226 |
-
{file = "scikit_learn-1.6.
|
| 4227 |
-
{file = "scikit_learn-1.6.
|
| 4228 |
-
{file = "scikit_learn-1.6.
|
| 4229 |
-
{file = "scikit_learn-1.6.
|
| 4230 |
-
{file = "scikit_learn-1.6.
|
| 4231 |
-
{file = "scikit_learn-1.6.
|
| 4232 |
]
|
| 4233 |
|
| 4234 |
[package.dependencies]
|
|
@@ -4248,53 +4248,60 @@ tests = ["black (>=24.3.0)", "matplotlib (>=3.3.4)", "mypy (>=1.9)", "numpydoc (
|
|
| 4248 |
|
| 4249 |
[[package]]
|
| 4250 |
name = "scipy"
|
| 4251 |
-
version = "1.
|
| 4252 |
description = "Fundamental algorithms for scientific computing in Python"
|
| 4253 |
optional = false
|
| 4254 |
python-versions = ">=3.10"
|
| 4255 |
files = [
|
| 4256 |
-
{file = "scipy-1.
|
| 4257 |
-
{file = "scipy-1.
|
| 4258 |
-
{file = "scipy-1.
|
| 4259 |
-
{file = "scipy-1.
|
| 4260 |
-
{file = "scipy-1.
|
| 4261 |
-
{file = "scipy-1.
|
| 4262 |
-
{file = "scipy-1.
|
| 4263 |
-
{file = "scipy-1.
|
| 4264 |
-
{file = "scipy-1.
|
| 4265 |
-
{file = "scipy-1.
|
| 4266 |
-
{file = "scipy-1.
|
| 4267 |
-
{file = "scipy-1.
|
| 4268 |
-
{file = "scipy-1.
|
| 4269 |
-
{file = "scipy-1.
|
| 4270 |
-
{file = "scipy-1.
|
| 4271 |
-
{file = "scipy-1.
|
| 4272 |
-
{file = "scipy-1.
|
| 4273 |
-
{file = "scipy-1.
|
| 4274 |
-
{file = "scipy-1.
|
| 4275 |
-
{file = "scipy-1.
|
| 4276 |
-
{file = "scipy-1.
|
| 4277 |
-
{file = "scipy-1.
|
| 4278 |
-
{file = "scipy-1.
|
| 4279 |
-
{file = "scipy-1.
|
| 4280 |
-
{file = "scipy-1.
|
| 4281 |
-
{file = "scipy-1.
|
| 4282 |
-
{file = "scipy-1.
|
| 4283 |
-
{file = "scipy-1.
|
| 4284 |
-
{file = "scipy-1.
|
| 4285 |
-
{file = "scipy-1.
|
| 4286 |
-
{file = "scipy-1.
|
| 4287 |
-
{file = "scipy-1.
|
| 4288 |
-
{file = "scipy-1.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4289 |
]
|
| 4290 |
|
| 4291 |
[package.dependencies]
|
| 4292 |
-
numpy = ">=1.23.5,<2.
|
| 4293 |
|
| 4294 |
[package.extras]
|
| 4295 |
dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy (==1.10.0)", "pycodestyle", "pydevtool", "rich-click", "ruff (>=0.0.292)", "types-psutil", "typing_extensions"]
|
| 4296 |
-
doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.
|
| 4297 |
-
test = ["Cython", "array-api-strict (>=2.0)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"]
|
| 4298 |
|
| 4299 |
[[package]]
|
| 4300 |
name = "send2trash"
|
|
@@ -4489,27 +4496,6 @@ mpmath = ">=1.1.0,<1.4"
|
|
| 4489 |
[package.extras]
|
| 4490 |
dev = ["hypothesis (>=6.70.0)", "pytest (>=7.1.0)"]
|
| 4491 |
|
| 4492 |
-
[[package]]
|
| 4493 |
-
name = "tabled-pdf"
|
| 4494 |
-
version = "0.2.0"
|
| 4495 |
-
description = "Detect and recognize tables in PDFs and images."
|
| 4496 |
-
optional = false
|
| 4497 |
-
python-versions = "<4.0,>=3.10"
|
| 4498 |
-
files = [
|
| 4499 |
-
{file = "tabled_pdf-0.2.0-py3-none-any.whl", hash = "sha256:7f055907d92e4a3322d8c23190eaf552d90dedb4da7f0833eb070c578a6ffe8f"},
|
| 4500 |
-
{file = "tabled_pdf-0.2.0.tar.gz", hash = "sha256:0751227326944bcbf3a6589746d648e802df91bab1545a8a7f0608e8b6c84913"},
|
| 4501 |
-
]
|
| 4502 |
-
|
| 4503 |
-
[package.dependencies]
|
| 4504 |
-
click = ">=8.1.7,<9.0.0"
|
| 4505 |
-
pydantic = ">=2.9.2,<3.0.0"
|
| 4506 |
-
pydantic-settings = ">=2.5.2,<3.0.0"
|
| 4507 |
-
pypdfium2 = ">=4.30.0,<5.0.0"
|
| 4508 |
-
python-dotenv = ">=1.0.1,<2.0.0"
|
| 4509 |
-
scikit-learn = ">=1.5.2,<2.0.0"
|
| 4510 |
-
surya-ocr = ">=0.8.0,<0.9.0"
|
| 4511 |
-
tabulate = ">=0.9.0,<0.10.0"
|
| 4512 |
-
|
| 4513 |
[[package]]
|
| 4514 |
name = "tabulate"
|
| 4515 |
version = "0.9.0"
|
|
@@ -5320,4 +5306,4 @@ propcache = ">=0.2.0"
|
|
| 5320 |
[metadata]
|
| 5321 |
lock-version = "2.0"
|
| 5322 |
python-versions = "^3.10"
|
| 5323 |
-
content-hash = "
|
|
|
|
| 4194 |
|
| 4195 |
[[package]]
|
| 4196 |
name = "scikit-learn"
|
| 4197 |
+
version = "1.6.1"
|
| 4198 |
description = "A set of python modules for machine learning and data mining"
|
| 4199 |
optional = false
|
| 4200 |
python-versions = ">=3.9"
|
| 4201 |
files = [
|
| 4202 |
+
{file = "scikit_learn-1.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d056391530ccd1e501056160e3c9673b4da4805eb67eb2bdf4e983e1f9c9204e"},
|
| 4203 |
+
{file = "scikit_learn-1.6.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:0c8d036eb937dbb568c6242fa598d551d88fb4399c0344d95c001980ec1c7d36"},
|
| 4204 |
+
{file = "scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8634c4bd21a2a813e0a7e3900464e6d593162a29dd35d25bdf0103b3fce60ed5"},
|
| 4205 |
+
{file = "scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:775da975a471c4f6f467725dff0ced5c7ac7bda5e9316b260225b48475279a1b"},
|
| 4206 |
+
{file = "scikit_learn-1.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:8a600c31592bd7dab31e1c61b9bbd6dea1b3433e67d264d17ce1017dbdce8002"},
|
| 4207 |
+
{file = "scikit_learn-1.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:72abc587c75234935e97d09aa4913a82f7b03ee0b74111dcc2881cba3c5a7b33"},
|
| 4208 |
+
{file = "scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b3b00cdc8f1317b5f33191df1386c0befd16625f49d979fe77a8d44cae82410d"},
|
| 4209 |
+
{file = "scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc4765af3386811c3ca21638f63b9cf5ecf66261cc4815c1db3f1e7dc7b79db2"},
|
| 4210 |
+
{file = "scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25fc636bdaf1cc2f4a124a116312d837148b5e10872147bdaf4887926b8c03d8"},
|
| 4211 |
+
{file = "scikit_learn-1.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:fa909b1a36e000a03c382aade0bd2063fd5680ff8b8e501660c0f59f021a6415"},
|
| 4212 |
+
{file = "scikit_learn-1.6.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:926f207c804104677af4857b2c609940b743d04c4c35ce0ddc8ff4f053cddc1b"},
|
| 4213 |
+
{file = "scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:2c2cae262064e6a9b77eee1c8e768fc46aa0b8338c6a8297b9b6759720ec0ff2"},
|
| 4214 |
+
{file = "scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1061b7c028a8663fb9a1a1baf9317b64a257fcb036dae5c8752b2abef31d136f"},
|
| 4215 |
+
{file = "scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e69fab4ebfc9c9b580a7a80111b43d214ab06250f8a7ef590a4edf72464dd86"},
|
| 4216 |
+
{file = "scikit_learn-1.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:70b1d7e85b1c96383f872a519b3375f92f14731e279a7b4c6cfd650cf5dffc52"},
|
| 4217 |
+
{file = "scikit_learn-1.6.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2ffa1e9e25b3d93990e74a4be2c2fc61ee5af85811562f1288d5d055880c4322"},
|
| 4218 |
+
{file = "scikit_learn-1.6.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:dc5cf3d68c5a20ad6d571584c0750ec641cc46aeef1c1507be51300e6003a7e1"},
|
| 4219 |
+
{file = "scikit_learn-1.6.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c06beb2e839ecc641366000ca84f3cf6fa9faa1777e29cf0c04be6e4d096a348"},
|
| 4220 |
+
{file = "scikit_learn-1.6.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8ca8cb270fee8f1f76fa9bfd5c3507d60c6438bbee5687f81042e2bb98e5a97"},
|
| 4221 |
+
{file = "scikit_learn-1.6.1-cp313-cp313-win_amd64.whl", hash = "sha256:7a1c43c8ec9fde528d664d947dc4c0789be4077a3647f232869f41d9bf50e0fb"},
|
| 4222 |
+
{file = "scikit_learn-1.6.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a17c1dea1d56dcda2fac315712f3651a1fea86565b64b48fa1bc090249cbf236"},
|
| 4223 |
+
{file = "scikit_learn-1.6.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:6a7aa5f9908f0f28f4edaa6963c0a6183f1911e63a69aa03782f0d924c830a35"},
|
| 4224 |
+
{file = "scikit_learn-1.6.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0650e730afb87402baa88afbf31c07b84c98272622aaba002559b614600ca691"},
|
| 4225 |
+
{file = "scikit_learn-1.6.1-cp313-cp313t-win_amd64.whl", hash = "sha256:3f59fe08dc03ea158605170eb52b22a105f238a5d512c4470ddeca71feae8e5f"},
|
| 4226 |
+
{file = "scikit_learn-1.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6849dd3234e87f55dce1db34c89a810b489ead832aaf4d4550b7ea85628be6c1"},
|
| 4227 |
+
{file = "scikit_learn-1.6.1-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:e7be3fa5d2eb9be7d77c3734ff1d599151bb523674be9b834e8da6abe132f44e"},
|
| 4228 |
+
{file = "scikit_learn-1.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:44a17798172df1d3c1065e8fcf9019183f06c87609b49a124ebdf57ae6cb0107"},
|
| 4229 |
+
{file = "scikit_learn-1.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8b7a3b86e411e4bce21186e1c180d792f3d99223dcfa3b4f597ecc92fa1a422"},
|
| 4230 |
+
{file = "scikit_learn-1.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:7a73d457070e3318e32bdb3aa79a8d990474f19035464dfd8bede2883ab5dc3b"},
|
| 4231 |
+
{file = "scikit_learn-1.6.1.tar.gz", hash = "sha256:b4fc2525eca2c69a59260f583c56a7557c6ccdf8deafdba6e060f94c1c59738e"},
|
| 4232 |
]
|
| 4233 |
|
| 4234 |
[package.dependencies]
|
|
|
|
| 4248 |
|
| 4249 |
[[package]]
|
| 4250 |
name = "scipy"
|
| 4251 |
+
version = "1.15.0"
|
| 4252 |
description = "Fundamental algorithms for scientific computing in Python"
|
| 4253 |
optional = false
|
| 4254 |
python-versions = ">=3.10"
|
| 4255 |
files = [
|
| 4256 |
+
{file = "scipy-1.15.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:aeac60d3562a7bf2f35549bdfdb6b1751c50590f55ce7322b4b2fc821dc27fca"},
|
| 4257 |
+
{file = "scipy-1.15.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:5abbdc6ede5c5fed7910cf406a948e2c0869231c0db091593a6b2fa78be77e5d"},
|
| 4258 |
+
{file = "scipy-1.15.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:eb1533c59f0ec6c55871206f15a5c72d1fae7ad3c0a8ca33ca88f7c309bbbf8c"},
|
| 4259 |
+
{file = "scipy-1.15.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:de112c2dae53107cfeaf65101419662ac0a54e9a088c17958b51c95dac5de56d"},
|
| 4260 |
+
{file = "scipy-1.15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2240e1fd0782e62e1aacdc7234212ee271d810f67e9cd3b8d521003a82603ef8"},
|
| 4261 |
+
{file = "scipy-1.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d35aef233b098e4de88b1eac29f0df378278e7e250a915766786b773309137c4"},
|
| 4262 |
+
{file = "scipy-1.15.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:1b29e4fc02e155a5fd1165f1e6a73edfdd110470736b0f48bcbe48083f0eee37"},
|
| 4263 |
+
{file = "scipy-1.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:0e5b34f8894f9904cc578008d1a9467829c1817e9f9cb45e6d6eeb61d2ab7731"},
|
| 4264 |
+
{file = "scipy-1.15.0-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:46e91b5b16909ff79224b56e19cbad65ca500b3afda69225820aa3afbf9ec020"},
|
| 4265 |
+
{file = "scipy-1.15.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:82bff2eb01ccf7cea8b6ee5274c2dbeadfdac97919da308ee6d8e5bcbe846443"},
|
| 4266 |
+
{file = "scipy-1.15.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:9c8254fe21dd2c6c8f7757035ec0c31daecf3bb3cffd93bc1ca661b731d28136"},
|
| 4267 |
+
{file = "scipy-1.15.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:c9624eeae79b18cab1a31944b5ef87aa14b125d6ab69b71db22f0dbd962caf1e"},
|
| 4268 |
+
{file = "scipy-1.15.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d13bbc0658c11f3d19df4138336e4bce2c4fbd78c2755be4bf7b8e235481557f"},
|
| 4269 |
+
{file = "scipy-1.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bdca4c7bb8dc41307e5f39e9e5d19c707d8e20a29845e7533b3bb20a9d4ccba0"},
|
| 4270 |
+
{file = "scipy-1.15.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6f376d7c767731477bac25a85d0118efdc94a572c6b60decb1ee48bf2391a73b"},
|
| 4271 |
+
{file = "scipy-1.15.0-cp311-cp311-win_amd64.whl", hash = "sha256:61513b989ee8d5218fbeb178b2d51534ecaddba050db949ae99eeb3d12f6825d"},
|
| 4272 |
+
{file = "scipy-1.15.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5beb0a2200372b7416ec73fdae94fe81a6e85e44eb49c35a11ac356d2b8eccc6"},
|
| 4273 |
+
{file = "scipy-1.15.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:fde0f3104dfa1dfbc1f230f65506532d0558d43188789eaf68f97e106249a913"},
|
| 4274 |
+
{file = "scipy-1.15.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:35c68f7044b4e7ad73a3e68e513dda946989e523df9b062bd3cf401a1a882192"},
|
| 4275 |
+
{file = "scipy-1.15.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:52475011be29dfcbecc3dfe3060e471ac5155d72e9233e8d5616b84e2b542054"},
|
| 4276 |
+
{file = "scipy-1.15.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5972e3f96f7dda4fd3bb85906a17338e65eaddfe47f750e240f22b331c08858e"},
|
| 4277 |
+
{file = "scipy-1.15.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe00169cf875bed0b3c40e4da45b57037dc21d7c7bf0c85ed75f210c281488f1"},
|
| 4278 |
+
{file = "scipy-1.15.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:161f80a98047c219c257bf5ce1777c574bde36b9d962a46b20d0d7e531f86863"},
|
| 4279 |
+
{file = "scipy-1.15.0-cp312-cp312-win_amd64.whl", hash = "sha256:327163ad73e54541a675240708244644294cb0a65cca420c9c79baeb9648e479"},
|
| 4280 |
+
{file = "scipy-1.15.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0fcb16eb04d84670722ce8d93b05257df471704c913cb0ff9dc5a1c31d1e9422"},
|
| 4281 |
+
{file = "scipy-1.15.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:767e8cf6562931f8312f4faa7ddea412cb783d8df49e62c44d00d89f41f9bbe8"},
|
| 4282 |
+
{file = "scipy-1.15.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:37ce9394cdcd7c5f437583fc6ef91bd290014993900643fdfc7af9b052d1613b"},
|
| 4283 |
+
{file = "scipy-1.15.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:6d26f17c64abd6c6c2dfb39920f61518cc9e213d034b45b2380e32ba78fde4c0"},
|
| 4284 |
+
{file = "scipy-1.15.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e2448acd79c6374583581a1ded32ac71a00c2b9c62dfa87a40e1dd2520be111"},
|
| 4285 |
+
{file = "scipy-1.15.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36be480e512d38db67f377add5b759fb117edd987f4791cdf58e59b26962bee4"},
|
| 4286 |
+
{file = "scipy-1.15.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ccb6248a9987193fe74363a2d73b93bc2c546e0728bd786050b7aef6e17db03c"},
|
| 4287 |
+
{file = "scipy-1.15.0-cp313-cp313-win_amd64.whl", hash = "sha256:952d2e9eaa787f0a9e95b6e85da3654791b57a156c3e6609e65cc5176ccfe6f2"},
|
| 4288 |
+
{file = "scipy-1.15.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:b1432102254b6dc7766d081fa92df87832ac25ff0b3d3a940f37276e63eb74ff"},
|
| 4289 |
+
{file = "scipy-1.15.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:4e08c6a36f46abaedf765dd2dfcd3698fa4bd7e311a9abb2d80e33d9b2d72c34"},
|
| 4290 |
+
{file = "scipy-1.15.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:ec915cd26d76f6fc7ae8522f74f5b2accf39546f341c771bb2297f3871934a52"},
|
| 4291 |
+
{file = "scipy-1.15.0-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:351899dd2a801edd3691622172bc8ea01064b1cada794f8641b89a7dc5418db6"},
|
| 4292 |
+
{file = "scipy-1.15.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e9baff912ea4f78a543d183ed6f5b3bea9784509b948227daaf6f10727a0e2e5"},
|
| 4293 |
+
{file = "scipy-1.15.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:cd9d9198a7fd9a77f0eb5105ea9734df26f41faeb2a88a0e62e5245506f7b6df"},
|
| 4294 |
+
{file = "scipy-1.15.0-cp313-cp313t-win_amd64.whl", hash = "sha256:129f899ed275c0515d553b8d31696924e2ca87d1972421e46c376b9eb87de3d2"},
|
| 4295 |
+
{file = "scipy-1.15.0.tar.gz", hash = "sha256:300742e2cc94e36a2880ebe464a1c8b4352a7b0f3e36ec3d2ac006cdbe0219ac"},
|
| 4296 |
]
|
| 4297 |
|
| 4298 |
[package.dependencies]
|
| 4299 |
+
numpy = ">=1.23.5,<2.5"
|
| 4300 |
|
| 4301 |
[package.extras]
|
| 4302 |
dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy (==1.10.0)", "pycodestyle", "pydevtool", "rich-click", "ruff (>=0.0.292)", "types-psutil", "typing_extensions"]
|
| 4303 |
+
doc = ["intersphinx_registry", "jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.16.5)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<8.0.0)", "sphinx-copybutton", "sphinx-design (>=0.4.0)"]
|
| 4304 |
+
test = ["Cython", "array-api-strict (>=2.0,<2.1.1)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"]
|
| 4305 |
|
| 4306 |
[[package]]
|
| 4307 |
name = "send2trash"
|
|
|
|
| 4496 |
[package.extras]
|
| 4497 |
dev = ["hypothesis (>=6.70.0)", "pytest (>=7.1.0)"]
|
| 4498 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4499 |
[[package]]
|
| 4500 |
name = "tabulate"
|
| 4501 |
version = "0.9.0"
|
|
|
|
| 5306 |
[metadata]
|
| 5307 |
lock-version = "2.0"
|
| 5308 |
python-versions = "^3.10"
|
| 5309 |
+
content-hash = "8118fd027892740419e08b6c5e0c1ff1231f6dc5ccd1b20e82d5ea2de358cb59"
|
pyproject.toml
CHANGED
|
@@ -40,6 +40,8 @@ markdownify = "^0.13.1"
|
|
| 40 |
click = "^8.1.7"
|
| 41 |
google-generativeai = "^0.8.3"
|
| 42 |
markdown2 = "^2.5.2"
|
|
|
|
|
|
|
| 43 |
|
| 44 |
[tool.poetry.group.dev.dependencies]
|
| 45 |
jupyter = "^1.0.0"
|
|
|
|
| 40 |
click = "^8.1.7"
|
| 41 |
google-generativeai = "^0.8.3"
|
| 42 |
markdown2 = "^2.5.2"
|
| 43 |
+
filetype = "^1.2.0"
|
| 44 |
+
scikit-learn = "^1.6.1"
|
| 45 |
|
| 46 |
[tool.poetry.group.dev.dependencies]
|
| 47 |
jupyter = "^1.0.0"
|
tests/conftest.py
CHANGED
|
@@ -2,6 +2,8 @@ from marker.providers.pdf import PdfProvider
|
|
| 2 |
import tempfile
|
| 3 |
from typing import Dict, Type
|
| 4 |
|
|
|
|
|
|
|
| 5 |
import datasets
|
| 6 |
import pytest
|
| 7 |
|
|
@@ -116,3 +118,13 @@ def renderer(request, config):
|
|
| 116 |
raise ValueError(f"Unknown output format: {output_format}")
|
| 117 |
else:
|
| 118 |
return MarkdownRenderer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import tempfile
|
| 3 |
from typing import Dict, Type
|
| 4 |
|
| 5 |
+
from PIL import Image, ImageDraw
|
| 6 |
+
|
| 7 |
import datasets
|
| 8 |
import pytest
|
| 9 |
|
|
|
|
| 118 |
raise ValueError(f"Unknown output format: {output_format}")
|
| 119 |
else:
|
| 120 |
return MarkdownRenderer
|
| 121 |
+
|
| 122 |
+
@pytest.fixture(scope="function")
def temp_image():
    """Yield a temporary PNG file holding a 512x512 white image with black text.

    The open ``NamedTemporaryFile`` handle is yielded, so tests reach the
    image through ``.name``; the file is removed when the fixture exits.
    """
    canvas = Image.new("RGB", (512, 512), color="white")
    ImageDraw.Draw(canvas).text((10, 10), "Hello, World!", fill="black")

    with tempfile.NamedTemporaryFile(suffix=".png") as f:
        # Saved by path while the handle is still open.
        canvas.save(f.name)
        f.flush()
        yield f
|
tests/providers/test_image_provider.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from marker.providers.image import ImageProvider
|
| 2 |
+
from marker.renderers.markdown import MarkdownOutput
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def test_image_provider(config, temp_image):
    """ImageProvider reports one page, returns the image as-is, and has no lines."""
    image_provider = ImageProvider(temp_image.name, config)
    assert len(image_provider) == 1

    rendered_pages = image_provider.get_images([0], 72)
    assert rendered_pages[0].size == (512, 512)

    lines_on_first_page = image_provider.get_page_lines(0)
    assert len(lines_on_first_page) == 0
|
| 12 |
+
|
| 13 |
+
def test_image_provider_conversion(pdf_converter, temp_image):
    """Converting the temporary image yields markdown containing its drawn text."""
    result: MarkdownOutput = pdf_converter(temp_image.name)
    converted_text = result.markdown
    assert "Hello, World!" in converted_text
|
| 16 |
+
|
| 17 |
+
|