Vik Paruchuri
committed on
Commit
·
76b8182
1
Parent(s):
c5a5454
Get chunk conversion working
Browse files
- .github/workflows/tests.yml +1 -1
- chunk_convert.py +4 -1
- convert.py +1 -0
- marker/output.py +2 -1
- marker/pdf/utils.py +3 -0
- pyproject.toml +1 -1
.github/workflows/tests.yml
CHANGED
|
@@ -24,7 +24,7 @@ jobs:
|
|
| 24 |
- name: Download benchmark data
|
| 25 |
run: |
|
| 26 |
wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
|
| 27 |
-
unzip benchmark_data.zip
|
| 28 |
- name: Run benchmark test
|
| 29 |
run: |
|
| 30 |
poetry run python benchmark.py benchmark_data/pdfs benchmark_data/references report.json
|
|
|
|
| 24 |
- name: Download benchmark data
|
| 25 |
run: |
|
| 26 |
wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
|
| 27 |
+
unzip -o benchmark_data.zip
|
| 28 |
- name: Run benchmark test
|
| 29 |
run: |
|
| 30 |
poetry run python benchmark.py benchmark_data/pdfs benchmark_data/references report.json
|
chunk_convert.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import argparse
|
| 2 |
import subprocess
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
def main():
|
|
@@ -8,8 +9,10 @@ def main():
|
|
| 8 |
parser.add_argument("out_folder", help="Output folder")
|
| 9 |
args = parser.parse_args()
|
| 10 |
|
|
|
|
|
|
|
| 11 |
# Construct the command
|
| 12 |
-
cmd = f"
|
| 13 |
|
| 14 |
# Execute the shell script
|
| 15 |
subprocess.run(cmd, shell=True, check=True)
|
|
|
|
| 1 |
import argparse
|
| 2 |
import subprocess
|
| 3 |
+
import pkg_resources
|
| 4 |
|
| 5 |
|
| 6 |
def main():
|
|
|
|
| 9 |
parser.add_argument("out_folder", help="Output folder")
|
| 10 |
args = parser.parse_args()
|
| 11 |
|
| 12 |
+
script_path = pkg_resources.resource_filename(__name__, 'chunk_convert.sh')
|
| 13 |
+
|
| 14 |
# Construct the command
|
| 15 |
+
cmd = f"{script_path} {args.in_folder} {args.out_folder}"
|
| 16 |
|
| 17 |
# Execute the shell script
|
| 18 |
subprocess.run(cmd, shell=True, check=True)
|
convert.py
CHANGED
|
@@ -24,6 +24,7 @@ def process_single_pdf(filepath: str, out_folder: str, model_refs, metadata: Opt
|
|
| 24 |
fname = os.path.basename(filepath)
|
| 25 |
if markdown_exists(out_folder, fname):
|
| 26 |
return
|
|
|
|
| 27 |
try:
|
| 28 |
# Skip trying to convert files that don't have a lot of embedded text
|
| 29 |
# This can indicate that they were scanned, and not OCRed properly
|
|
|
|
| 24 |
fname = os.path.basename(filepath)
|
| 25 |
if markdown_exists(out_folder, fname):
|
| 26 |
return
|
| 27 |
+
|
| 28 |
try:
|
| 29 |
# Skip trying to convert files that don't have a lot of embedded text
|
| 30 |
# This can indicate that they were scanned, and not OCRed properly
|
marker/output.py
CHANGED
|
@@ -5,7 +5,6 @@ import json
|
|
| 5 |
def get_subfolder_path(out_folder, fname):
|
| 6 |
subfolder_name = fname.split(".")[0]
|
| 7 |
subfolder_path = os.path.join(out_folder, subfolder_name)
|
| 8 |
-
os.makedirs(subfolder_path, exist_ok=True)
|
| 9 |
return subfolder_path
|
| 10 |
|
| 11 |
|
|
@@ -23,6 +22,8 @@ def markdown_exists(out_folder, fname):
|
|
| 23 |
|
| 24 |
def save_markdown(out_folder, fname, full_text, images, out_metadata):
|
| 25 |
subfolder_path = get_subfolder_path(out_folder, fname)
|
|
|
|
|
|
|
| 26 |
markdown_filepath = get_markdown_filepath(out_folder, fname)
|
| 27 |
out_meta_filepath = markdown_filepath.rsplit(".", 1)[0] + "_meta.json"
|
| 28 |
|
|
|
|
| 5 |
def get_subfolder_path(out_folder, fname):
|
| 6 |
subfolder_name = fname.split(".")[0]
|
| 7 |
subfolder_path = os.path.join(out_folder, subfolder_name)
|
|
|
|
| 8 |
return subfolder_path
|
| 9 |
|
| 10 |
|
|
|
|
| 22 |
|
| 23 |
def save_markdown(out_folder, fname, full_text, images, out_metadata):
|
| 24 |
subfolder_path = get_subfolder_path(out_folder, fname)
|
| 25 |
+
os.makedirs(subfolder_path, exist_ok=True)
|
| 26 |
+
|
| 27 |
markdown_filepath = get_markdown_filepath(out_folder, fname)
|
| 28 |
out_meta_filepath = markdown_filepath.rsplit(".", 1)[0] + "_meta.json"
|
| 29 |
|
marker/pdf/utils.py
CHANGED
|
@@ -7,6 +7,9 @@ from marker.settings import settings
|
|
| 7 |
|
| 8 |
def find_filetype(fpath):
|
| 9 |
kind = filetype.guess(fpath)
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
mimetype = kind.mime
|
| 12 |
|
|
|
|
| 7 |
|
| 8 |
def find_filetype(fpath):
|
| 9 |
kind = filetype.guess(fpath)
|
| 10 |
+
if kind is None:
|
| 11 |
+
print(f"Could not determine filetype for {fpath}")
|
| 12 |
+
return "other"
|
| 13 |
|
| 14 |
mimetype = kind.mime
|
| 15 |
|
pyproject.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
-
version = "0.2.
|
| 4 |
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|
|
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
+
version = "0.2.3"
|
| 4 |
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|