Vik Paruchuri committed on
Commit
8650951
·
1 Parent(s): 9a62b5a

Add pypi package config

Browse files
.github/workflows/publish.yml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Python package
2
+ on:
3
+ push:
4
+ tags:
5
+ - "v*.*.*"
6
+ jobs:
7
+ build:
8
+ runs-on: ubuntu-latest
9
+ steps:
10
+ - uses: actions/checkout@v3
11
+ - name: Set up Python 3.11
12
+ uses: actions/setup-python@v4
13
+ with:
14
+ python-version: 3.11
15
+ - name: Install python dependencies
16
+ run: |
17
+ pip install poetry
18
+ poetry install
19
+ poetry remove torch
20
+ poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu
21
+ - name: Build package
22
+ run: |
23
+ poetry build
24
+ - name: Publish package
25
+ env:
26
+ PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
27
+ run: |
28
+ poetry config pypi-token.pypi "$PYPI_TOKEN"
29
+ poetry publish
benchmark.py CHANGED
@@ -8,9 +8,6 @@ from tqdm import tqdm
8
  from marker.convert import convert_single_pdf
9
  from marker.logger import configure_logging
10
  from marker.models import load_all_models
11
- from marker.ordering import load_ordering_model
12
- from marker.segmentation import load_layout_model
13
- from marker.cleaners.equations import load_nougat_model
14
  from marker.benchmark.scoring import score_text
15
  from marker.extract_text import naive_get_text
16
  import json
@@ -18,7 +15,6 @@ import os
18
  import subprocess
19
  import shutil
20
  import fitz as pymupdf
21
- from marker.settings import settings
22
  from tabulate import tabulate
23
 
24
  configure_logging()
@@ -34,7 +30,7 @@ def nougat_prediction(pdf_filename, batch_size=1):
34
  return data
35
 
36
 
37
- if __name__ == "__main__":
38
  parser = argparse.ArgumentParser(description="Benchmark PDF to MD conversion. Needs source pdfs, and a refernece folder with the correct markdown.")
39
  parser.add_argument("in_folder", help="Input PDF files")
40
  parser.add_argument("reference_folder", help="Reference folder with reference markdown files")
@@ -126,3 +122,7 @@ if __name__ == "__main__":
126
  print("Scores by file")
127
  print(tabulate(score_table, headers=["Method", *score_headers]))
128
 
 
 
 
 
 
8
  from marker.convert import convert_single_pdf
9
  from marker.logger import configure_logging
10
  from marker.models import load_all_models
 
 
 
11
  from marker.benchmark.scoring import score_text
12
  from marker.extract_text import naive_get_text
13
  import json
 
15
  import subprocess
16
  import shutil
17
  import fitz as pymupdf
 
18
  from tabulate import tabulate
19
 
20
  configure_logging()
 
30
  return data
31
 
32
 
33
+ def main():
34
  parser = argparse.ArgumentParser(description="Benchmark PDF to MD conversion. Needs source pdfs, and a refernece folder with the correct markdown.")
35
  parser.add_argument("in_folder", help="Input PDF files")
36
  parser.add_argument("reference_folder", help="Reference folder with reference markdown files")
 
122
  print("Scores by file")
123
  print(tabulate(score_table, headers=["Method", *score_headers]))
124
 
125
+
126
+ if __name__ == "__main__":
127
+ main()
128
+
chunk_convert.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import subprocess
3
+
4
+
5
+ def main():
6
+ parser = argparse.ArgumentParser(description="Convert a folder of PDFs to a folder of markdown files in chunks.")
7
+ parser.add_argument("in_folder", help="Input folder with pdfs.")
8
+ parser.add_argument("out_folder", help="Output folder")
9
+ args = parser.parse_args()
10
+
11
+ # Construct the command
12
+ cmd = f"./chunk_convert.sh {args.in_folder} {args.out_folder}"
13
+
14
+ # Execute the shell script
15
+ subprocess.run(cmd, shell=True, check=True)
16
+
17
+
18
+ if __name__ == "__main__":
19
+ main()
chunk_convert.sh CHANGED
File without changes
convert.py CHANGED
@@ -45,7 +45,7 @@ def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Option
45
  print(traceback.format_exc())
46
 
47
 
48
- if __name__ == "__main__":
49
  parser = argparse.ArgumentParser(description="Convert multiple pdfs to markdown.")
50
  parser.add_argument("in_folder", help="Input folder with pdfs.")
51
  parser.add_argument("out_folder", help="Output folder")
@@ -121,4 +121,8 @@ if __name__ == "__main__":
121
  progress_bar.update(1)
122
 
123
  # Shutdown ray to free resources
124
- ray.shutdown()
 
 
 
 
 
45
  print(traceback.format_exc())
46
 
47
 
48
+ def main():
49
  parser = argparse.ArgumentParser(description="Convert multiple pdfs to markdown.")
50
  parser.add_argument("in_folder", help="Input folder with pdfs.")
51
  parser.add_argument("out_folder", help="Output folder")
 
121
  progress_bar.update(1)
122
 
123
  # Shutdown ray to free resources
124
+ ray.shutdown()
125
+
126
+
127
+ if __name__ == "__main__":
128
+ main()
convert_single.py CHANGED
@@ -3,13 +3,12 @@ import argparse
3
  from marker.convert import convert_single_pdf
4
  from marker.logger import configure_logging
5
  from marker.models import load_all_models
6
- from marker.settings import settings
7
  import json
8
 
9
  configure_logging()
10
 
11
 
12
- if __name__ == "__main__":
13
  parser = argparse.ArgumentParser()
14
  parser.add_argument("filename", help="PDF file to parse")
15
  parser.add_argument("output", help="Output file name")
@@ -26,4 +25,8 @@ if __name__ == "__main__":
26
 
27
  out_meta_filename = args.output.rsplit(".", 1)[0] + "_meta.json"
28
  with open(out_meta_filename, "w+") as f:
29
- f.write(json.dumps(out_meta, indent=4))
 
 
 
 
 
3
  from marker.convert import convert_single_pdf
4
  from marker.logger import configure_logging
5
  from marker.models import load_all_models
 
6
  import json
7
 
8
  configure_logging()
9
 
10
 
11
+ def main():
12
  parser = argparse.ArgumentParser()
13
  parser.add_argument("filename", help="PDF file to parse")
14
  parser.add_argument("output", help="Output file name")
 
25
 
26
  out_meta_filename = args.output.rsplit(".", 1)[0] + "_meta.json"
27
  with open(out_meta_filename, "w+") as f:
28
+ f.write(json.dumps(out_meta, indent=4))
29
+
30
+
31
+ if __name__ == "__main__":
32
+ main()
marker/convert.py CHANGED
@@ -13,7 +13,6 @@ from marker.cleaners.bullets import replace_bullets
13
  from marker.markdown import merge_spans, merge_lines, get_full_text
14
  from marker.schema import Page, BlockType
15
  from typing import List, Dict, Tuple, Optional
16
- from copy import deepcopy
17
  import re
18
  import magic
19
  from marker.settings import settings
 
13
  from marker.markdown import merge_spans, merge_lines, get_full_text
14
  from marker.schema import Page, BlockType
15
  from typing import List, Dict, Tuple, Optional
 
16
  import re
17
  import magic
18
  from marker.settings import settings
pyproject.toml CHANGED
@@ -1,12 +1,22 @@
1
  [tool.poetry]
2
- name = "marker"
3
- version = "0.1.0"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
7
  license = "GPL-3.0-or-later"
8
  repository = "https://github.com/VikParuchuri/marker"
9
  keywords = ["pdf", "markdown", "ocr", "nlp"]
 
 
 
 
 
 
 
 
 
 
10
 
11
  [tool.poetry.dependencies]
12
  python = ">=3.9,<3.13"
@@ -37,6 +47,12 @@ grpcio = "^1.60.0"
37
  [tool.poetry.group.dev.dependencies]
38
  jupyter = "^1.0.0"
39
 
 
 
 
 
 
 
40
  [build-system]
41
  requires = ["poetry-core"]
42
- build-backend = "poetry.core.masonry.api"
 
1
  [tool.poetry]
2
+ name = "marker-pdf"
3
+ version = "0.1.1"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
7
  license = "GPL-3.0-or-later"
8
  repository = "https://github.com/VikParuchuri/marker"
9
  keywords = ["pdf", "markdown", "ocr", "nlp"]
10
+ packages = [
11
+ {include = "marker"}
12
+ ]
13
+ include = [
14
+ "convert.py",
15
+ "convert_single.py",
16
+ "chunk_convert.sh",
17
+ "benchmark.py",
18
+ "chunk_convert.py",
19
+ ]
20
 
21
  [tool.poetry.dependencies]
22
  python = ">=3.9,<3.13"
 
47
  [tool.poetry.group.dev.dependencies]
48
  jupyter = "^1.0.0"
49
 
50
+ [tool.poetry.scripts]
51
+ marker = "convert:main"
52
+ marker_single = "convert_single:main"
53
+ marker_benchmark = "benchmark:main"
54
+ marker_chunk_convert = "chunk_convert:main"
55
+
56
  [build-system]
57
  requires = ["poetry-core"]
58
+ build-backend = "poetry.core.masonry.api"