Spaces:

rt4u
/

marker

Sleeping

Vik Paruchuri committed on May 9, 2024

Commit

76b8182

1 Parent(s): c5a5454

Get chunk conversion working

Files changed (6) hide show

.github/workflows/tests.yml CHANGED Viewed

@@ -24,7 +24,7 @@ jobs:
       - name: Download benchmark data
         run: |
           wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
-          unzip benchmark_data.zip
       - name: Run benchmark test
         run: |
           poetry run python benchmark.py benchmark_data/pdfs benchmark_data/references report.json

       - name: Download benchmark data
         run: |
           wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
+          unzip -o benchmark_data.zip
       - name: Run benchmark test
         run: |
           poetry run python benchmark.py benchmark_data/pdfs benchmark_data/references report.json

chunk_convert.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import argparse
 import subprocess
 def main():
@@ -8,8 +9,10 @@ def main():
     parser.add_argument("out_folder", help="Output folder")
     args = parser.parse_args()
     # Construct the command
-    cmd = f"./chunk_convert.sh {args.in_folder} {args.out_folder}"
     # Execute the shell script
     subprocess.run(cmd, shell=True, check=True)

 import argparse
 import subprocess
+import pkg_resources
 def main():
     parser.add_argument("out_folder", help="Output folder")
     args = parser.parse_args()
+    script_path = pkg_resources.resource_filename(__name__, 'chunk_convert.sh')
     # Construct the command
+    cmd = f"{script_path} {args.in_folder} {args.out_folder}"
     # Execute the shell script
     subprocess.run(cmd, shell=True, check=True)

convert.py CHANGED Viewed

@@ -24,6 +24,7 @@ def process_single_pdf(filepath: str, out_folder: str, model_refs, metadata: Opt
     fname = os.path.basename(filepath)
     if markdown_exists(out_folder, fname):
         return
     try:
         # Skip trying to convert files that don't have a lot of embedded text
         # This can indicate that they were scanned, and not OCRed properly

     fname = os.path.basename(filepath)
     if markdown_exists(out_folder, fname):
         return
     try:
         # Skip trying to convert files that don't have a lot of embedded text
         # This can indicate that they were scanned, and not OCRed properly

marker/output.py CHANGED Viewed

@@ -5,7 +5,6 @@ import json
 def get_subfolder_path(out_folder, fname):
     subfolder_name = fname.split(".")[0]
     subfolder_path = os.path.join(out_folder, subfolder_name)
-    os.makedirs(subfolder_path, exist_ok=True)
     return subfolder_path
@@ -23,6 +22,8 @@ def markdown_exists(out_folder, fname):
 def save_markdown(out_folder, fname, full_text, images, out_metadata):
     subfolder_path = get_subfolder_path(out_folder, fname)
     markdown_filepath = get_markdown_filepath(out_folder, fname)
     out_meta_filepath = markdown_filepath.rsplit(".", 1)[0] + "_meta.json"

 def get_subfolder_path(out_folder, fname):
     subfolder_name = fname.split(".")[0]
     subfolder_path = os.path.join(out_folder, subfolder_name)
     return subfolder_path
 def save_markdown(out_folder, fname, full_text, images, out_metadata):
     subfolder_path = get_subfolder_path(out_folder, fname)
+    os.makedirs(subfolder_path, exist_ok=True)
     markdown_filepath = get_markdown_filepath(out_folder, fname)
     out_meta_filepath = markdown_filepath.rsplit(".", 1)[0] + "_meta.json"

marker/pdf/utils.py CHANGED Viewed

@@ -7,6 +7,9 @@ from marker.settings import settings
 def find_filetype(fpath):
     kind = filetype.guess(fpath)
     mimetype = kind.mime

 def find_filetype(fpath):
     kind = filetype.guess(fpath)
+    if kind is None:
+        print(f"Could not determine filetype for {fpath}")
+        return "other"
     mimetype = kind.mime

pyproject.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "0.2.2"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"

 [tool.poetry]
 name = "marker-pdf"
+version = "0.2.3"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"