Vik Paruchuri committed on
Commit
76b8182
·
1 Parent(s): c5a5454

Get chunk conversion working

Browse files
.github/workflows/tests.yml CHANGED
@@ -24,7 +24,7 @@ jobs:
24
  - name: Download benchmark data
25
  run: |
26
  wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
27
- unzip benchmark_data.zip
28
  - name: Run benchmark test
29
  run: |
30
  poetry run python benchmark.py benchmark_data/pdfs benchmark_data/references report.json
 
24
  - name: Download benchmark data
25
  run: |
26
  wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
27
+ unzip -o benchmark_data.zip
28
  - name: Run benchmark test
29
  run: |
30
  poetry run python benchmark.py benchmark_data/pdfs benchmark_data/references report.json
chunk_convert.py CHANGED
@@ -1,5 +1,6 @@
1
  import argparse
2
  import subprocess
 
3
 
4
 
5
  def main():
@@ -8,8 +9,10 @@ def main():
8
  parser.add_argument("out_folder", help="Output folder")
9
  args = parser.parse_args()
10
 
 
 
11
  # Construct the command
12
- cmd = f"./chunk_convert.sh {args.in_folder} {args.out_folder}"
13
 
14
  # Execute the shell script
15
  subprocess.run(cmd, shell=True, check=True)
 
1
  import argparse
2
  import subprocess
3
+ import pkg_resources
4
 
5
 
6
  def main():
 
9
  parser.add_argument("out_folder", help="Output folder")
10
  args = parser.parse_args()
11
 
12
+ script_path = pkg_resources.resource_filename(__name__, 'chunk_convert.sh')
13
+
14
  # Construct the command
15
+ cmd = f"{script_path} {args.in_folder} {args.out_folder}"
16
 
17
  # Execute the shell script
18
  subprocess.run(cmd, shell=True, check=True)
convert.py CHANGED
@@ -24,6 +24,7 @@ def process_single_pdf(filepath: str, out_folder: str, model_refs, metadata: Opt
24
  fname = os.path.basename(filepath)
25
  if markdown_exists(out_folder, fname):
26
  return
 
27
  try:
28
  # Skip trying to convert files that don't have a lot of embedded text
29
  # This can indicate that they were scanned, and not OCRed properly
 
24
  fname = os.path.basename(filepath)
25
  if markdown_exists(out_folder, fname):
26
  return
27
+
28
  try:
29
  # Skip trying to convert files that don't have a lot of embedded text
30
  # This can indicate that they were scanned, and not OCRed properly
marker/output.py CHANGED
@@ -5,7 +5,6 @@ import json
5
  def get_subfolder_path(out_folder, fname):
6
  subfolder_name = fname.split(".")[0]
7
  subfolder_path = os.path.join(out_folder, subfolder_name)
8
- os.makedirs(subfolder_path, exist_ok=True)
9
  return subfolder_path
10
 
11
 
@@ -23,6 +22,8 @@ def markdown_exists(out_folder, fname):
23
 
24
  def save_markdown(out_folder, fname, full_text, images, out_metadata):
25
  subfolder_path = get_subfolder_path(out_folder, fname)
 
 
26
  markdown_filepath = get_markdown_filepath(out_folder, fname)
27
  out_meta_filepath = markdown_filepath.rsplit(".", 1)[0] + "_meta.json"
28
 
 
5
  def get_subfolder_path(out_folder, fname):
6
  subfolder_name = fname.split(".")[0]
7
  subfolder_path = os.path.join(out_folder, subfolder_name)
 
8
  return subfolder_path
9
 
10
 
 
22
 
23
  def save_markdown(out_folder, fname, full_text, images, out_metadata):
24
  subfolder_path = get_subfolder_path(out_folder, fname)
25
+ os.makedirs(subfolder_path, exist_ok=True)
26
+
27
  markdown_filepath = get_markdown_filepath(out_folder, fname)
28
  out_meta_filepath = markdown_filepath.rsplit(".", 1)[0] + "_meta.json"
29
 
marker/pdf/utils.py CHANGED
@@ -7,6 +7,9 @@ from marker.settings import settings
7
 
8
  def find_filetype(fpath):
9
  kind = filetype.guess(fpath)
 
 
 
10
 
11
  mimetype = kind.mime
12
 
 
7
 
8
  def find_filetype(fpath):
9
  kind = filetype.guess(fpath)
10
+ if kind is None:
11
+ print(f"Could not determine filetype for {fpath}")
12
+ return "other"
13
 
14
  mimetype = kind.mime
15
 
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
  [tool.poetry]
2
  name = "marker-pdf"
3
- version = "0.2.2"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
 
1
  [tool.poetry]
2
  name = "marker-pdf"
3
+ version = "0.2.3"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"