Vik Paruchuri
committed on
Commit
·
8650951
1
Parent(s):
9a62b5a
Add pypi package config
Browse files
- .github/workflows/publish.yml +29 -0
- benchmark.py +5 -5
- chunk_convert.py +19 -0
- chunk_convert.sh +0 -0
- convert.py +6 -2
- convert_single.py +6 -3
- marker/convert.py +0 -1
- pyproject.toml +19 -3
.github/workflows/publish.yml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Python package
|
| 2 |
+
on:
|
| 3 |
+
push:
|
| 4 |
+
tags:
|
| 5 |
+
- "v*.*.*"
|
| 6 |
+
jobs:
|
| 7 |
+
build:
|
| 8 |
+
runs-on: ubuntu-latest
|
| 9 |
+
steps:
|
| 10 |
+
- uses: actions/checkout@v3
|
| 11 |
+
- name: Set up Python 3.11
|
| 12 |
+
uses: actions/setup-python@v4
|
| 13 |
+
with:
|
| 14 |
+
python-version: 3.11
|
| 15 |
+
- name: Install python dependencies
|
| 16 |
+
run: |
|
| 17 |
+
pip install poetry
|
| 18 |
+
poetry install
|
| 19 |
+
poetry remove torch
|
| 20 |
+
poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu
|
| 21 |
+
- name: Build package
|
| 22 |
+
run: |
|
| 23 |
+
poetry build
|
| 24 |
+
- name: Publish package
|
| 25 |
+
env:
|
| 26 |
+
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
|
| 27 |
+
run: |
|
| 28 |
+
poetry config pypi-token.pypi "$PYPI_TOKEN"
|
| 29 |
+
poetry publish
|
benchmark.py
CHANGED
|
@@ -8,9 +8,6 @@ from tqdm import tqdm
|
|
| 8 |
from marker.convert import convert_single_pdf
|
| 9 |
from marker.logger import configure_logging
|
| 10 |
from marker.models import load_all_models
|
| 11 |
-
from marker.ordering import load_ordering_model
|
| 12 |
-
from marker.segmentation import load_layout_model
|
| 13 |
-
from marker.cleaners.equations import load_nougat_model
|
| 14 |
from marker.benchmark.scoring import score_text
|
| 15 |
from marker.extract_text import naive_get_text
|
| 16 |
import json
|
|
@@ -18,7 +15,6 @@ import os
|
|
| 18 |
import subprocess
|
| 19 |
import shutil
|
| 20 |
import fitz as pymupdf
|
| 21 |
-
from marker.settings import settings
|
| 22 |
from tabulate import tabulate
|
| 23 |
|
| 24 |
configure_logging()
|
|
@@ -34,7 +30,7 @@ def nougat_prediction(pdf_filename, batch_size=1):
|
|
| 34 |
return data
|
| 35 |
|
| 36 |
|
| 37 |
-
|
| 38 |
parser = argparse.ArgumentParser(description="Benchmark PDF to MD conversion. Needs source pdfs, and a refernece folder with the correct markdown.")
|
| 39 |
parser.add_argument("in_folder", help="Input PDF files")
|
| 40 |
parser.add_argument("reference_folder", help="Reference folder with reference markdown files")
|
|
@@ -126,3 +122,7 @@ if __name__ == "__main__":
|
|
| 126 |
print("Scores by file")
|
| 127 |
print(tabulate(score_table, headers=["Method", *score_headers]))
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
from marker.convert import convert_single_pdf
|
| 9 |
from marker.logger import configure_logging
|
| 10 |
from marker.models import load_all_models
|
|
|
|
|
|
|
|
|
|
| 11 |
from marker.benchmark.scoring import score_text
|
| 12 |
from marker.extract_text import naive_get_text
|
| 13 |
import json
|
|
|
|
| 15 |
import subprocess
|
| 16 |
import shutil
|
| 17 |
import fitz as pymupdf
|
|
|
|
| 18 |
from tabulate import tabulate
|
| 19 |
|
| 20 |
configure_logging()
|
|
|
|
| 30 |
return data
|
| 31 |
|
| 32 |
|
| 33 |
+
def main():
|
| 34 |
parser = argparse.ArgumentParser(description="Benchmark PDF to MD conversion. Needs source pdfs, and a refernece folder with the correct markdown.")
|
| 35 |
parser.add_argument("in_folder", help="Input PDF files")
|
| 36 |
parser.add_argument("reference_folder", help="Reference folder with reference markdown files")
|
|
|
|
| 122 |
print("Scores by file")
|
| 123 |
print(tabulate(score_table, headers=["Method", *score_headers]))
|
| 124 |
|
| 125 |
+
|
| 126 |
+
if __name__ == "__main__":
|
| 127 |
+
main()
|
| 128 |
+
|
chunk_convert.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import subprocess
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def main():
    """Command-line entry point that forwards two folders to ``./chunk_convert.sh``.

    Parses two positional command-line arguments, ``in_folder`` and
    ``out_folder``, and runs ``./chunk_convert.sh`` with exactly those two
    values as its arguments.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status (``check=True``).
        OSError: If the script cannot be started at all, e.g. it is missing
            or not executable.
    """
    parser = argparse.ArgumentParser(description="Convert a folder of PDFs to a folder of markdown files in chunks.")
    parser.add_argument("in_folder", help="Input folder with pdfs.")
    parser.add_argument("out_folder", help="Output folder")
    args = parser.parse_args()

    # Build the command as an argument list instead of an interpolated shell
    # string: each folder reaches the script as a single argument even when it
    # contains spaces, and shell metacharacters (";", "$(...)", "&&") in a
    # path are never interpreted by a shell.
    # NOTE: the script path is relative to the current working directory, so
    # this must be run from the directory that contains chunk_convert.sh.
    cmd = ["./chunk_convert.sh", args.in_folder, args.out_folder]

    # Execute the shell script; check=True turns a non-zero exit into an error.
    subprocess.run(cmd, check=True)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
if __name__ == "__main__":
|
| 19 |
+
main()
|
chunk_convert.sh
CHANGED
|
File without changes
|
convert.py
CHANGED
|
@@ -45,7 +45,7 @@ def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Option
|
|
| 45 |
print(traceback.format_exc())
|
| 46 |
|
| 47 |
|
| 48 |
-
|
| 49 |
parser = argparse.ArgumentParser(description="Convert multiple pdfs to markdown.")
|
| 50 |
parser.add_argument("in_folder", help="Input folder with pdfs.")
|
| 51 |
parser.add_argument("out_folder", help="Output folder")
|
|
@@ -121,4 +121,8 @@ if __name__ == "__main__":
|
|
| 121 |
progress_bar.update(1)
|
| 122 |
|
| 123 |
# Shutdown ray to free resources
|
| 124 |
-
ray.shutdown()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
print(traceback.format_exc())
|
| 46 |
|
| 47 |
|
| 48 |
+
def main():
|
| 49 |
parser = argparse.ArgumentParser(description="Convert multiple pdfs to markdown.")
|
| 50 |
parser.add_argument("in_folder", help="Input folder with pdfs.")
|
| 51 |
parser.add_argument("out_folder", help="Output folder")
|
|
|
|
| 121 |
progress_bar.update(1)
|
| 122 |
|
| 123 |
# Shutdown ray to free resources
|
| 124 |
+
ray.shutdown()
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
if __name__ == "__main__":
|
| 128 |
+
main()
|
convert_single.py
CHANGED
|
@@ -3,13 +3,12 @@ import argparse
|
|
| 3 |
from marker.convert import convert_single_pdf
|
| 4 |
from marker.logger import configure_logging
|
| 5 |
from marker.models import load_all_models
|
| 6 |
-
from marker.settings import settings
|
| 7 |
import json
|
| 8 |
|
| 9 |
configure_logging()
|
| 10 |
|
| 11 |
|
| 12 |
-
|
| 13 |
parser = argparse.ArgumentParser()
|
| 14 |
parser.add_argument("filename", help="PDF file to parse")
|
| 15 |
parser.add_argument("output", help="Output file name")
|
|
@@ -26,4 +25,8 @@ if __name__ == "__main__":
|
|
| 26 |
|
| 27 |
out_meta_filename = args.output.rsplit(".", 1)[0] + "_meta.json"
|
| 28 |
with open(out_meta_filename, "w+") as f:
|
| 29 |
-
f.write(json.dumps(out_meta, indent=4))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from marker.convert import convert_single_pdf
|
| 4 |
from marker.logger import configure_logging
|
| 5 |
from marker.models import load_all_models
|
|
|
|
| 6 |
import json
|
| 7 |
|
| 8 |
configure_logging()
|
| 9 |
|
| 10 |
|
| 11 |
+
def main():
|
| 12 |
parser = argparse.ArgumentParser()
|
| 13 |
parser.add_argument("filename", help="PDF file to parse")
|
| 14 |
parser.add_argument("output", help="Output file name")
|
|
|
|
| 25 |
|
| 26 |
out_meta_filename = args.output.rsplit(".", 1)[0] + "_meta.json"
|
| 27 |
with open(out_meta_filename, "w+") as f:
|
| 28 |
+
f.write(json.dumps(out_meta, indent=4))
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
if __name__ == "__main__":
|
| 32 |
+
main()
|
marker/convert.py
CHANGED
|
@@ -13,7 +13,6 @@ from marker.cleaners.bullets import replace_bullets
|
|
| 13 |
from marker.markdown import merge_spans, merge_lines, get_full_text
|
| 14 |
from marker.schema import Page, BlockType
|
| 15 |
from typing import List, Dict, Tuple, Optional
|
| 16 |
-
from copy import deepcopy
|
| 17 |
import re
|
| 18 |
import magic
|
| 19 |
from marker.settings import settings
|
|
|
|
| 13 |
from marker.markdown import merge_spans, merge_lines, get_full_text
|
| 14 |
from marker.schema import Page, BlockType
|
| 15 |
from typing import List, Dict, Tuple, Optional
|
|
|
|
| 16 |
import re
|
| 17 |
import magic
|
| 18 |
from marker.settings import settings
|
pyproject.toml
CHANGED
|
@@ -1,12 +1,22 @@
|
|
| 1 |
[tool.poetry]
|
| 2 |
-
name = "marker"
|
| 3 |
-
version = "0.1.0"
|
| 4 |
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|
| 7 |
license = "GPL-3.0-or-later"
|
| 8 |
repository = "https://github.com/VikParuchuri/marker"
|
| 9 |
keywords = ["pdf", "markdown", "ocr", "nlp"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
[tool.poetry.dependencies]
|
| 12 |
python = ">=3.9,<3.13"
|
|
@@ -37,6 +47,12 @@ grpcio = "^1.60.0"
|
|
| 37 |
[tool.poetry.group.dev.dependencies]
|
| 38 |
jupyter = "^1.0.0"
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
[build-system]
|
| 41 |
requires = ["poetry-core"]
|
| 42 |
-
build-backend = "poetry.core.masonry.api"
|
|
|
|
| 1 |
[tool.poetry]
|
| 2 |
+
name = "marker-pdf"
|
| 3 |
+
version = "0.1.1"
|
| 4 |
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|
| 7 |
license = "GPL-3.0-or-later"
|
| 8 |
repository = "https://github.com/VikParuchuri/marker"
|
| 9 |
keywords = ["pdf", "markdown", "ocr", "nlp"]
|
| 10 |
+
packages = [
|
| 11 |
+
{include = "marker"}
|
| 12 |
+
]
|
| 13 |
+
include = [
|
| 14 |
+
"convert.py",
|
| 15 |
+
"convert_single.py",
|
| 16 |
+
"chunk_convert.sh",
|
| 17 |
+
"benchmark.py",
|
| 18 |
+
"chunk_convert.py",
|
| 19 |
+
]
|
| 20 |
|
| 21 |
[tool.poetry.dependencies]
|
| 22 |
python = ">=3.9,<3.13"
|
|
|
|
| 47 |
[tool.poetry.group.dev.dependencies]
|
| 48 |
jupyter = "^1.0.0"
|
| 49 |
|
| 50 |
+
[tool.poetry.scripts]
|
| 51 |
+
marker = "convert:main"
|
| 52 |
+
marker_single = "convert_single:main"
|
| 53 |
+
marker_benchmark = "benchmark:main"
|
| 54 |
+
marker_chunk_convert = "chunk_convert:main"
|
| 55 |
+
|
| 56 |
[build-system]
|
| 57 |
requires = ["poetry-core"]
|
| 58 |
+
build-backend = "poetry.core.masonry.api"
|