Vik Paruchuri committed on
Commit
b34699f
·
1 Parent(s): 282333c

Add pre-commit hook

Browse files
.pre-commit-config.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ # Ruff version.
4
+ rev: v0.9.10
5
+ hooks:
6
+ # Run the linter.
7
+ - id: ruff
8
+ types_or: [ python, pyi ]
9
+ args: [ --fix ]
10
+ # Run the formatter.
11
+ - id: ruff-format
12
+ types_or: [ python, pyi ]
marker/providers/registry.py CHANGED
@@ -12,6 +12,27 @@ from marker.providers.powerpoint import PowerPointProvider
12
  from marker.providers.spreadsheet import SpreadSheetProvider
13
 
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def provider_from_filepath(filepath: str):
16
  if filetype.image_match(filepath) is not None:
17
  return ImageProvider
@@ -19,34 +40,40 @@ def provider_from_filepath(filepath: str):
19
  return PdfProvider
20
  if match(filepath, (archive.Epub(),)) is not None:
21
  return EpubProvider
22
- if match(
23
- filepath, (
24
- document.Doc(),
25
- document.Docx(),
26
- document.Odt()
27
- )) is not None:
28
  return DocumentProvider
29
- if match(
30
- filepath, (
 
 
31
  document.Xls(),
32
  document.Xlsx(),
33
  document.Ods(),
34
- )) is not None:
 
 
 
35
  return SpreadSheetProvider
36
- if match(
37
- filepath, (
 
 
38
  document.Ppt(),
39
  document.Pptx(),
40
  document.Odp(),
41
- )) is not None:
 
 
 
42
  return PowerPointProvider
43
 
44
  try:
45
- soup = BeautifulSoup(open(filepath, 'r').read(), 'html.parser')
46
  # Check if there are any HTML tags
47
  if bool(soup.find()):
48
  return HTMLProvider
49
- except:
50
  pass
51
 
52
- return PdfProvider
 
 
12
  from marker.providers.spreadsheet import SpreadSheetProvider
13
 
14
 
15
+ def provider_from_ext(filepath: str):
16
+ ext = filepath.split(".")[-1].strip()
17
+ if not ext:
18
+ return PdfProvider
19
+
20
+ if ext in ["jpg", "jpeg", "png", "gif", "webp"]:
21
+ return ImageProvider
22
+ if ext in ["pdf"]:
23
+ return PdfProvider
24
+ if ext in ["doc", "docx", "odt"]:
25
+ return DocumentProvider
26
+ if ext in ["xls", "xlsx", "ods"]:
27
+ return SpreadSheetProvider
28
+ if ext in ["ppt", "pptx", "odp"]:
29
+ return PowerPointProvider
30
+ if ext in ["epub"]:
31
+ return EpubProvider
32
+
33
+ return PdfProvider
34
+
35
+
36
  def provider_from_filepath(filepath: str):
37
  if filetype.image_match(filepath) is not None:
38
  return ImageProvider
 
40
  return PdfProvider
41
  if match(filepath, (archive.Epub(),)) is not None:
42
  return EpubProvider
43
+ if match(filepath, (document.Doc(), document.Docx(), document.Odt())) is not None:
 
 
 
 
 
44
  return DocumentProvider
45
+ if (
46
+ match(
47
+ filepath,
48
+ (
49
  document.Xls(),
50
  document.Xlsx(),
51
  document.Ods(),
52
+ ),
53
+ )
54
+ is not None
55
+ ):
56
  return SpreadSheetProvider
57
+ if (
58
+ match(
59
+ filepath,
60
+ (
61
  document.Ppt(),
62
  document.Pptx(),
63
  document.Odp(),
64
+ ),
65
+ )
66
+ is not None
67
+ ):
68
  return PowerPointProvider
69
 
70
  try:
71
+ soup = BeautifulSoup(open(filepath, "r").read(), "html.parser")
72
  # Check if there are any HTML tags
73
  if bool(soup.find()):
74
  return HTMLProvider
75
+ except Exception:
76
  pass
77
 
78
+ # Fall back to the file extension when content-based detection finds no match
79
+ return provider_from_ext(filepath)
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -36,6 +36,7 @@ filetype = "^1.2.0"
36
  scikit-learn = "^1.6.1"
37
  google-genai = "^1.0.0"
38
  anthropic = "^0.46.0"
 
39
 
40
  # Optional dependencies for documents
41
  mammoth = {version = "^1.9.0", optional = true}
 
36
  scikit-learn = "^1.6.1"
37
  google-genai = "^1.0.0"
38
  anthropic = "^0.46.0"
39
+ pre-commit = "^4.2.0"
40
 
41
  # Optional dependencies for documents
42
  mammoth = {version = "^1.9.0", optional = true}