Vik Paruchuri
committed on
Commit
·
b34699f
1
Parent(s):
282333c
Add pre-commit hook
Browse files- .pre-commit-config.yaml +12 -0
- marker/providers/registry.py +42 -15
- poetry.lock +0 -0
- pyproject.toml +1 -0
.pre-commit-config.yaml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
repos:
|
| 2 |
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
| 3 |
+
# Ruff version.
|
| 4 |
+
rev: v0.9.10
|
| 5 |
+
hooks:
|
| 6 |
+
# Run the linter.
|
| 7 |
+
- id: ruff
|
| 8 |
+
types_or: [ python, pyi ]
|
| 9 |
+
args: [ --fix ]
|
| 10 |
+
# Run the formatter.
|
| 11 |
+
- id: ruff-format
|
| 12 |
+
types_or: [ python, pyi ]
|
marker/providers/registry.py
CHANGED
|
@@ -12,6 +12,27 @@ from marker.providers.powerpoint import PowerPointProvider
|
|
| 12 |
from marker.providers.spreadsheet import SpreadSheetProvider
|
| 13 |
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
def provider_from_filepath(filepath: str):
|
| 16 |
if filetype.image_match(filepath) is not None:
|
| 17 |
return ImageProvider
|
|
@@ -19,34 +40,40 @@ def provider_from_filepath(filepath: str):
|
|
| 19 |
return PdfProvider
|
| 20 |
if match(filepath, (archive.Epub(),)) is not None:
|
| 21 |
return EpubProvider
|
| 22 |
-
if match(
|
| 23 |
-
filepath, (
|
| 24 |
-
document.Doc(),
|
| 25 |
-
document.Docx(),
|
| 26 |
-
document.Odt()
|
| 27 |
-
)) is not None:
|
| 28 |
return DocumentProvider
|
| 29 |
-
if
|
| 30 |
-
|
|
|
|
|
|
|
| 31 |
document.Xls(),
|
| 32 |
document.Xlsx(),
|
| 33 |
document.Ods(),
|
| 34 |
-
)
|
|
|
|
|
|
|
|
|
|
| 35 |
return SpreadSheetProvider
|
| 36 |
-
if
|
| 37 |
-
|
|
|
|
|
|
|
| 38 |
document.Ppt(),
|
| 39 |
document.Pptx(),
|
| 40 |
document.Odp(),
|
| 41 |
-
)
|
|
|
|
|
|
|
|
|
|
| 42 |
return PowerPointProvider
|
| 43 |
|
| 44 |
try:
|
| 45 |
-
soup = BeautifulSoup(open(filepath,
|
| 46 |
# Check if there are any HTML tags
|
| 47 |
if bool(soup.find()):
|
| 48 |
return HTMLProvider
|
| 49 |
-
except:
|
| 50 |
pass
|
| 51 |
|
| 52 |
-
|
|
|
|
|
|
| 12 |
from marker.providers.spreadsheet import SpreadSheetProvider
|
| 13 |
|
| 14 |
|
| 15 |
+
def provider_from_ext(filepath: str):
    """Choose a provider class using only the file extension.

    The extension is the text after the last "." in ``filepath``, stripped of
    surrounding whitespace and compared case-insensitively, so ``scan.JPG``
    and ``scan.jpg`` resolve to the same provider.

    Args:
        filepath: Path or file name to inspect. The file is not opened.

    Returns:
        The provider class mapped to the extension, or ``PdfProvider`` when
        the extension is empty or not recognised.
    """
    # With no dot in the path this yields the whole path, which matches none
    # of the known extensions and therefore ends at the PdfProvider default.
    ext = filepath.rsplit(".", 1)[-1].strip().lower()
    if not ext:
        return PdfProvider

    if ext in ("jpg", "jpeg", "png", "gif", "webp"):
        return ImageProvider
    if ext == "pdf":
        return PdfProvider
    if ext in ("doc", "docx", "odt"):
        return DocumentProvider
    if ext in ("xls", "xlsx", "ods"):
        return SpreadSheetProvider
    if ext in ("ppt", "pptx", "odp"):
        return PowerPointProvider
    if ext == "epub":
        return EpubProvider

    # Unknown extension: keep PDF as the default provider.
    return PdfProvider
|
| 34 |
+
|
| 35 |
+
|
| 36 |
def provider_from_filepath(filepath: str):
|
| 37 |
if filetype.image_match(filepath) is not None:
|
| 38 |
return ImageProvider
|
|
|
|
| 40 |
return PdfProvider
|
| 41 |
if match(filepath, (archive.Epub(),)) is not None:
|
| 42 |
return EpubProvider
|
| 43 |
+
if match(filepath, (document.Doc(), document.Docx(), document.Odt())) is not None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
return DocumentProvider
|
| 45 |
+
if (
|
| 46 |
+
match(
|
| 47 |
+
filepath,
|
| 48 |
+
(
|
| 49 |
document.Xls(),
|
| 50 |
document.Xlsx(),
|
| 51 |
document.Ods(),
|
| 52 |
+
),
|
| 53 |
+
)
|
| 54 |
+
is not None
|
| 55 |
+
):
|
| 56 |
return SpreadSheetProvider
|
| 57 |
+
if (
|
| 58 |
+
match(
|
| 59 |
+
filepath,
|
| 60 |
+
(
|
| 61 |
document.Ppt(),
|
| 62 |
document.Pptx(),
|
| 63 |
document.Odp(),
|
| 64 |
+
),
|
| 65 |
+
)
|
| 66 |
+
is not None
|
| 67 |
+
):
|
| 68 |
return PowerPointProvider
|
| 69 |
|
| 70 |
try:
|
| 71 |
+
soup = BeautifulSoup(open(filepath, "r").read(), "html.parser")
|
| 72 |
# Check if there are any HTML tags
|
| 73 |
if bool(soup.find()):
|
| 74 |
return HTMLProvider
|
| 75 |
+
except Exception:
|
| 76 |
pass
|
| 77 |
|
| 78 |
+
# Fall back to the file extension when content-based detection found no match
|
| 79 |
+
return provider_from_ext(filepath)
|
poetry.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
CHANGED
|
@@ -36,6 +36,7 @@ filetype = "^1.2.0"
|
|
| 36 |
scikit-learn = "^1.6.1"
|
| 37 |
google-genai = "^1.0.0"
|
| 38 |
anthropic = "^0.46.0"
|
|
|
|
| 39 |
|
| 40 |
# Optional dependencies for documents
|
| 41 |
mammoth = {version = "^1.9.0", optional = true}
|
|
|
|
| 36 |
scikit-learn = "^1.6.1"
|
| 37 |
google-genai = "^1.0.0"
|
| 38 |
anthropic = "^0.46.0"
|
| 39 |
+
pre-commit = "^4.2.0"
|
| 40 |
|
| 41 |
# Optional dependencies for documents
|
| 42 |
mammoth = {version = "^1.9.0", optional = true}
|