Vik Paruchuri committed on
Commit
ba2fd61
·
1 Parent(s): c59ca05

Hotfix scripts

Browse files
chunk_convert.py CHANGED
@@ -1,22 +1,4 @@
1
- import argparse
2
- import subprocess
3
- import pkg_resources
4
-
5
-
6
- def main():
7
- parser = argparse.ArgumentParser(description="Convert a folder of PDFs to a folder of markdown files in chunks.")
8
- parser.add_argument("in_folder", help="Input folder with pdfs.")
9
- parser.add_argument("out_folder", help="Output folder")
10
- args = parser.parse_args()
11
-
12
- script_path = pkg_resources.resource_filename(__name__, 'chunk_convert.sh')
13
-
14
- # Construct the command
15
- cmd = f"{script_path} {args.in_folder} {args.out_folder}"
16
-
17
- # Execute the shell script
18
- subprocess.run(cmd, shell=True, check=True)
19
-
20
 
21
  if __name__ == "__main__":
22
- main()
 
1
+ from marker.scripts import chunk_convert_cli
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  if __name__ == "__main__":
4
+ chunk_convert_cli()
convert.py CHANGED
@@ -1,116 +1,4 @@
1
- import os
2
-
3
- os.environ["GRPC_VERBOSITY"] = "ERROR"
4
- os.environ["GLOG_minloglevel"] = "2"
5
- os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
6
- os.environ["IN_STREAMLIT"] = "true" # Avoid multiprocessing inside surya
7
-
8
- import math
9
- import traceback
10
-
11
- import click
12
- import torch.multiprocessing as mp
13
- from tqdm import tqdm
14
-
15
- from marker.config.parser import ConfigParser
16
- from marker.converters.pdf import PdfConverter
17
- from marker.logger import configure_logging
18
- from marker.models import create_model_dict
19
- from marker.output import output_exists, save_output
20
- from marker.settings import settings
21
-
22
- configure_logging()
23
-
24
-
25
- def worker_init(model_dict):
26
- if model_dict is None:
27
- model_dict = create_model_dict()
28
-
29
- global model_refs
30
- model_refs = model_dict
31
-
32
-
33
- def worker_exit():
34
- global model_refs
35
- del model_refs
36
-
37
-
38
- def process_single_pdf(args):
39
- fpath, cli_options = args
40
- config_parser = ConfigParser(cli_options)
41
-
42
- out_folder = config_parser.get_output_folder(fpath)
43
- base_name = config_parser.get_base_filename(fpath)
44
- if cli_options.get('skip_existing') and output_exists(out_folder, base_name):
45
- return
46
-
47
- try:
48
- converter = PdfConverter(
49
- config=config_parser.generate_config_dict(),
50
- artifact_dict=model_refs,
51
- processor_list=config_parser.get_processors(),
52
- renderer=config_parser.get_renderer()
53
- )
54
- rendered = converter(fpath)
55
- out_folder = config_parser.get_output_folder(fpath)
56
- save_output(rendered, out_folder, base_name)
57
- except Exception as e:
58
- print(f"Error converting {fpath}: {e}")
59
- print(traceback.format_exc())
60
-
61
-
62
- @click.command()
63
- @click.argument("in_folder", type=str)
64
- @ConfigParser.common_options
65
- @click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert")
66
- @click.option("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
67
- @click.option("--max_files", type=int, default=None, help="Maximum number of pdfs to convert")
68
- @click.option("--workers", type=int, default=5, help="Number of worker processes to use.")
69
- @click.option("--skip_existing", is_flag=True, default=False, help="Skip existing converted files.")
70
- def main(in_folder: str, **kwargs):
71
- in_folder = os.path.abspath(in_folder)
72
- files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
73
- files = [f for f in files if os.path.isfile(f)]
74
-
75
- # Handle chunks if we're processing in parallel
76
- # Ensure we get all files into a chunk
77
- chunk_size = math.ceil(len(files) / kwargs["num_chunks"])
78
- start_idx = kwargs["chunk_idx"] * chunk_size
79
- end_idx = start_idx + chunk_size
80
- files_to_convert = files[start_idx:end_idx]
81
-
82
- # Limit files converted if needed
83
- if kwargs["max_files"]:
84
- files_to_convert = files_to_convert[:kwargs["max_files"]]
85
-
86
- # Disable nested multiprocessing
87
- kwargs["disable_multiprocessing"] = True
88
-
89
- total_processes = min(len(files_to_convert), kwargs["workers"])
90
-
91
- try:
92
- mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work
93
- except RuntimeError:
94
- raise RuntimeError("Set start method to spawn twice. This may be a temporary issue with the script. Please try running it again.")
95
-
96
- if settings.TORCH_DEVICE == "mps" or settings.TORCH_DEVICE_MODEL == "mps":
97
- model_dict = None
98
- else:
99
- model_dict = create_model_dict()
100
- for k, v in model_dict.items():
101
- v.share_memory()
102
-
103
- print(f"Converting {len(files_to_convert)} pdfs in chunk {kwargs['chunk_idx'] + 1}/{kwargs['num_chunks']} with {total_processes} processes and saving to {kwargs['output_dir']}")
104
- task_args = [(f, kwargs) for f in files_to_convert]
105
-
106
- with mp.Pool(processes=total_processes, initializer=worker_init, initargs=(model_dict,)) as pool:
107
- list(tqdm(pool.imap(process_single_pdf, task_args), total=len(task_args), desc="Processing PDFs", unit="pdf"))
108
-
109
- pool._worker_handler.terminate = worker_exit
110
-
111
- # Delete all CUDA tensors
112
- del model_dict
113
-
114
 
115
  if __name__ == "__main__":
116
  main()
 
1
+ from marker.scripts import convert_cli
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  if __name__ == "__main__":
4
  main()
convert_single.py CHANGED
@@ -1,43 +1,4 @@
1
- import os
2
-
3
- os.environ["GRPC_VERBOSITY"] = "ERROR"
4
- os.environ["GLOG_minloglevel"] = "2"
5
- os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
6
-
7
- import time
8
- import click
9
-
10
- from marker.config.parser import ConfigParser
11
- from marker.config.printer import CustomClickPrinter
12
- from marker.converters.pdf import PdfConverter
13
- from marker.logger import configure_logging
14
- from marker.models import create_model_dict
15
- from marker.output import save_output
16
-
17
- configure_logging()
18
-
19
-
20
- @click.command(cls=CustomClickPrinter, help="Convert a single PDF to markdown.")
21
- @click.argument("fpath", type=str)
22
- @ConfigParser.common_options
23
- def main(fpath: str, **kwargs):
24
- models = create_model_dict()
25
- start = time.time()
26
- config_parser = ConfigParser(kwargs)
27
-
28
- converter = PdfConverter(
29
- config=config_parser.generate_config_dict(),
30
- artifact_dict=models,
31
- processor_list=config_parser.get_processors(),
32
- renderer=config_parser.get_renderer()
33
- )
34
- rendered = converter(fpath)
35
- out_folder = config_parser.get_output_folder(fpath)
36
- save_output(rendered, out_folder, config_parser.get_base_filename(fpath))
37
-
38
- print(f"Saved markdown to {out_folder}")
39
- print(f"Total time: {time.time() - start}")
40
-
41
 
42
  if __name__ == "__main__":
43
- main()
 
1
+ from marker.scripts import convert_single_cli
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  if __name__ == "__main__":
4
+ convert_single_cli()
marker/scripts/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from marker.scripts.convert_single import convert_single_cli
2
+ from marker.scripts.convert import convert_cli
3
+ from marker.scripts.server import server_cli
4
+ from marker.scripts.run_streamlit_app import streamlit_app_cli
5
+ from marker.scripts.chunk_convert import chunk_convert_cli
marker/scripts/chunk_convert.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import subprocess
4
+ import pkg_resources
5
+
6
+
7
+ def chunk_convert_cli():
8
+ parser = argparse.ArgumentParser(description="Convert a folder of PDFs to a folder of markdown files in chunks.")
9
+ parser.add_argument("in_folder", help="Input folder with pdfs.")
10
+ parser.add_argument("out_folder", help="Output folder")
11
+ args = parser.parse_args()
12
+
13
+ cur_dir = os.path.dirname(os.path.abspath(__file__))
14
+ script_path = os.path.join(cur_dir, "chunk_convert.sh")
15
+
16
+ # Construct the command
17
+ cmd = f"{script_path} {args.in_folder} {args.out_folder}"
18
+
19
+ # Execute the shell script
20
+ subprocess.run(cmd, shell=True, check=True)
chunk_convert.sh → marker/scripts/chunk_convert.sh RENAMED
File without changes
marker/scripts/convert.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ os.environ["GRPC_VERBOSITY"] = "ERROR"
4
+ os.environ["GLOG_minloglevel"] = "2"
5
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
6
+ os.environ["IN_STREAMLIT"] = "true" # Avoid multiprocessing inside surya
7
+
8
+ import math
9
+ import traceback
10
+
11
+ import click
12
+ import torch.multiprocessing as mp
13
+ from tqdm import tqdm
14
+
15
+ from marker.config.parser import ConfigParser
16
+ from marker.config.printer import CustomClickPrinter
17
+ from marker.logger import configure_logging
18
+ from marker.models import create_model_dict
19
+ from marker.output import output_exists, save_output
20
+ from marker.settings import settings
21
+
22
+ configure_logging()
23
+
24
+
25
+ def worker_init(model_dict):
26
+ if model_dict is None:
27
+ model_dict = create_model_dict()
28
+
29
+ global model_refs
30
+ model_refs = model_dict
31
+
32
+
33
+ def worker_exit():
34
+ global model_refs
35
+ del model_refs
36
+
37
+
38
+ def process_single_pdf(args):
39
+ fpath, cli_options = args
40
+ config_parser = ConfigParser(cli_options)
41
+
42
+ out_folder = config_parser.get_output_folder(fpath)
43
+ base_name = config_parser.get_base_filename(fpath)
44
+ if cli_options.get('skip_existing') and output_exists(out_folder, base_name):
45
+ return
46
+
47
+ converter_cls = config_parser.get_converter_cls()
48
+
49
+ try:
50
+ converter = converter_cls(
51
+ config=config_parser.generate_config_dict(),
52
+ artifact_dict=model_refs,
53
+ processor_list=config_parser.get_processors(),
54
+ renderer=config_parser.get_renderer()
55
+ )
56
+ rendered = converter(fpath)
57
+ out_folder = config_parser.get_output_folder(fpath)
58
+ save_output(rendered, out_folder, base_name)
59
+ except Exception as e:
60
+ print(f"Error converting {fpath}: {e}")
61
+ print(traceback.format_exc())
62
+
63
+
64
+ @click.command(cls=CustomClickPrinter)
65
+ @click.argument("in_folder", type=str)
66
+ @ConfigParser.common_options
67
+ @click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert")
68
+ @click.option("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
69
+ @click.option("--max_files", type=int, default=None, help="Maximum number of pdfs to convert")
70
+ @click.option("--workers", type=int, default=5, help="Number of worker processes to use.")
71
+ @click.option("--skip_existing", is_flag=True, default=False, help="Skip existing converted files.")
72
+ def convert_cli(in_folder: str, **kwargs):
73
+ in_folder = os.path.abspath(in_folder)
74
+ files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
75
+ files = [f for f in files if os.path.isfile(f)]
76
+
77
+ # Handle chunks if we're processing in parallel
78
+ # Ensure we get all files into a chunk
79
+ chunk_size = math.ceil(len(files) / kwargs["num_chunks"])
80
+ start_idx = kwargs["chunk_idx"] * chunk_size
81
+ end_idx = start_idx + chunk_size
82
+ files_to_convert = files[start_idx:end_idx]
83
+
84
+ # Limit files converted if needed
85
+ if kwargs["max_files"]:
86
+ files_to_convert = files_to_convert[:kwargs["max_files"]]
87
+
88
+ # Disable nested multiprocessing
89
+ kwargs["disable_multiprocessing"] = True
90
+
91
+ total_processes = min(len(files_to_convert), kwargs["workers"])
92
+
93
+ try:
94
+ mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work
95
+ except RuntimeError:
96
+ raise RuntimeError("Set start method to spawn twice. This may be a temporary issue with the script. Please try running it again.")
97
+
98
+ if settings.TORCH_DEVICE == "mps" or settings.TORCH_DEVICE_MODEL == "mps":
99
+ model_dict = None
100
+ else:
101
+ model_dict = create_model_dict()
102
+ for k, v in model_dict.items():
103
+ v.share_memory()
104
+
105
+ print(f"Converting {len(files_to_convert)} pdfs in chunk {kwargs['chunk_idx'] + 1}/{kwargs['num_chunks']} with {total_processes} processes and saving to {kwargs['output_dir']}")
106
+ task_args = [(f, kwargs) for f in files_to_convert]
107
+
108
+ with mp.Pool(processes=total_processes, initializer=worker_init, initargs=(model_dict,)) as pool:
109
+ list(tqdm(pool.imap(process_single_pdf, task_args), total=len(task_args), desc="Processing PDFs", unit="pdf"))
110
+
111
+ pool._worker_handler.terminate = worker_exit
112
+
113
+ # Delete all CUDA tensors
114
+ del model_dict
marker/scripts/convert_single.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ os.environ["GRPC_VERBOSITY"] = "ERROR"
4
+ os.environ["GLOG_minloglevel"] = "2"
5
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
6
+
7
+ import time
8
+ import click
9
+
10
+ from marker.config.parser import ConfigParser
11
+ from marker.config.printer import CustomClickPrinter
12
+ from marker.logger import configure_logging
13
+ from marker.models import create_model_dict
14
+ from marker.output import save_output
15
+
16
+ configure_logging()
17
+
18
+
19
+ @click.command(cls=CustomClickPrinter, help="Convert a single PDF to markdown.")
20
+ @click.argument("fpath", type=str)
21
+ @ConfigParser.common_options
22
+ def convert_single_cli(fpath: str, **kwargs):
23
+ models = create_model_dict()
24
+ start = time.time()
25
+ config_parser = ConfigParser(kwargs)
26
+
27
+ converter_cls = config_parser.get_converter_cls()
28
+ converter = converter_cls(
29
+ config=config_parser.generate_config_dict(),
30
+ artifact_dict=models,
31
+ processor_list=config_parser.get_processors(),
32
+ renderer=config_parser.get_renderer()
33
+ )
34
+ rendered = converter(fpath)
35
+ out_folder = config_parser.get_output_folder(fpath)
36
+ save_output(rendered, out_folder, config_parser.get_base_filename(fpath))
37
+
38
+ print(f"Saved markdown to {out_folder}")
39
+ print(f"Total time: {time.time() - start}")
run_marker_app.py → marker/scripts/run_streamlit_app.py RENAMED
@@ -2,12 +2,8 @@ import subprocess
2
  import os
3
 
4
 
5
- def run():
6
  cur_dir = os.path.dirname(os.path.abspath(__file__))
7
- app_path = os.path.join(cur_dir, "marker_app.py")
8
  cmd = ["streamlit", "run", app_path]
9
  subprocess.run(cmd, env={**os.environ, "IN_STREAMLIT": "true"})
10
-
11
-
12
- if __name__ == "__main__":
13
- run()
 
2
  import os
3
 
4
 
5
+ def streamlit_app_cli():
6
  cur_dir = os.path.dirname(os.path.abspath(__file__))
7
+ app_path = os.path.join(cur_dir, "streamlit_app.py")
8
  cmd = ["streamlit", "run", app_path]
9
  subprocess.run(cmd, env={**os.environ, "IN_STREAMLIT": "true"})
 
 
 
 
marker/scripts/server.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import traceback
2
+
3
+ import click
4
+ import os
5
+
6
+ import uvicorn
7
+ from pydantic import BaseModel, Field
8
+ from starlette.responses import HTMLResponse
9
+
10
+ from marker.config.parser import ConfigParser
11
+ from marker.output import text_from_rendered
12
+
13
+ import base64
14
+ from contextlib import asynccontextmanager
15
+ from typing import Optional, Annotated
16
+ import io
17
+
18
+ from fastapi import FastAPI, Form, File, UploadFile
19
+ from marker.converters.pdf import PdfConverter
20
+ from marker.models import create_model_dict
21
+ from marker.settings import settings
22
+
23
+ app_data = {}
24
+
25
+
26
+ UPLOAD_DIRECTORY = "./uploads"
27
+ os.makedirs(UPLOAD_DIRECTORY, exist_ok=True)
28
+
29
+
30
+ @asynccontextmanager
31
+ async def lifespan(app: FastAPI):
32
+ app_data["models"] = create_model_dict()
33
+
34
+ yield
35
+
36
+ if "models" in app_data:
37
+ del app_data["models"]
38
+
39
+
40
+ app = FastAPI(lifespan=lifespan)
41
+
42
+
43
+ @app.get("/")
44
+ async def root():
45
+ return HTMLResponse(
46
+ """
47
+ <h1>Marker API</h1>
48
+ <ul>
49
+ <li><a href="/docs">API Documentation</a></li>
50
+ <li><a href="/marker">Run marker (post request only)</a></li>
51
+ </ul>
52
+ """
53
+ )
54
+
55
+
56
+ class CommonParams(BaseModel):
57
+ filepath: Annotated[
58
+ Optional[str], Field(description="The path to the PDF file to convert.")
59
+ ]
60
+ page_range: Annotated[
61
+ Optional[str],
62
+ Field(description="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20", example=None)
63
+ ] = None
64
+ languages: Annotated[
65
+ Optional[str],
66
+ Field(description="Comma separated list of languages to use for OCR. Must be either the names or codes from from https://github.com/VikParuchuri/surya/blob/master/surya/languages.py.", example=None)
67
+ ] = None
68
+ force_ocr: Annotated[
69
+ bool,
70
+ Field(
71
+ description="Force OCR on all pages of the PDF. Defaults to False. This can lead to worse results if you have good text in your PDFs (which is true in most cases)."
72
+ ),
73
+ ] = False
74
+ paginate_output: Annotated[
75
+ bool,
76
+ Field(
77
+ description="Whether to paginate the output. Defaults to False. If set to True, each page of the output will be separated by a horizontal rule that contains the page number (2 newlines, {PAGE_NUMBER}, 48 - characters, 2 newlines)."
78
+ ),
79
+ ] = False
80
+ output_format: Annotated[
81
+ str,
82
+ Field(description="The format to output the text in. Can be 'markdown', 'json', or 'html'. Defaults to 'markdown'.")
83
+ ] = "markdown"
84
+
85
+
86
+ async def _convert_pdf(params: CommonParams):
87
+ assert params.output_format in ["markdown", "json", "html"], "Invalid output format"
88
+ try:
89
+ options = params.model_dump()
90
+ print(options)
91
+ config_parser = ConfigParser(options)
92
+ config_dict = config_parser.generate_config_dict()
93
+ config_dict["pdftext_workers"] = 1
94
+ converter = PdfConverter(
95
+ config=config_dict,
96
+ artifact_dict=app_data["models"],
97
+ processor_list=config_parser.get_processors(),
98
+ renderer=config_parser.get_renderer()
99
+ )
100
+ rendered = converter(params.filepath)
101
+ text, _, images = text_from_rendered(rendered)
102
+ metadata = rendered.metadata
103
+ except Exception as e:
104
+ traceback.print_exc()
105
+ return {
106
+ "success": False,
107
+ "error": str(e),
108
+ }
109
+
110
+ encoded = {}
111
+ for k, v in images.items():
112
+ byte_stream = io.BytesIO()
113
+ v.save(byte_stream, format=settings.OUTPUT_IMAGE_FORMAT)
114
+ encoded[k] = base64.b64encode(byte_stream.getvalue()).decode(settings.OUTPUT_ENCODING)
115
+
116
+ return {
117
+ "format": params.output_format,
118
+ "output": text,
119
+ "images": encoded,
120
+ "metadata": metadata,
121
+ "success": True,
122
+ }
123
+
124
+ @app.post("/marker")
125
+ async def convert_pdf(
126
+ params: CommonParams
127
+ ):
128
+ return await _convert_pdf(params)
129
+
130
+
131
+
132
+ @app.post("/marker/upload")
133
+ async def convert_pdf_upload(
134
+ page_range: Optional[str] = Form(default=None),
135
+ languages: Optional[str] = Form(default=None),
136
+ force_ocr: Optional[bool] = Form(default=False),
137
+ paginate_output: Optional[bool] = Form(default=False),
138
+ output_format: Optional[str] = Form(default="markdown"),
139
+ file: UploadFile = File(
140
+ ..., description="The PDF file to convert.", media_type="application/pdf"
141
+ ),
142
+ ):
143
+ upload_path = os.path.join(UPLOAD_DIRECTORY, file.filename)
144
+ with open(upload_path, "wb+") as upload_file:
145
+ file_contents = await file.read()
146
+ upload_file.write(file_contents)
147
+
148
+ params = CommonParams(
149
+ filepath=upload_path,
150
+ page_range=page_range,
151
+ languages=languages,
152
+ force_ocr=force_ocr,
153
+ paginate_output=paginate_output,
154
+ output_format=output_format,
155
+ )
156
+ results = await _convert_pdf(params)
157
+ os.remove(upload_path)
158
+ return results
159
+
160
+
161
+ @click.command()
162
+ @click.option("--port", type=int, default=8000, help="Port to run the server on")
163
+ @click.option("--host", type=str, default="127.0.0.1", help="Host to run the server on")
164
+ def server_cli(port: int, host: str):
165
+ # Run the server
166
+ uvicorn.run(
167
+ app,
168
+ host=host,
169
+ port=port,
170
+ )
marker/scripts/streamlit_app.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from marker.settings import settings
4
+
5
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
6
+ os.environ["IN_STREAMLIT"] = "true"
7
+
8
+ import base64
9
+ import io
10
+ import re
11
+ import tempfile
12
+ from typing import Any, Dict
13
+
14
+ import pypdfium2
15
+ import streamlit as st
16
+ from PIL import Image
17
+
18
+ from marker.converters.pdf import PdfConverter
19
+ from marker.models import create_model_dict
20
+ from marker.config.parser import ConfigParser
21
+ from marker.output import text_from_rendered
22
+
23
+ @st.cache_resource()
24
+ def load_models():
25
+ return create_model_dict()
26
+
27
+
28
+ def convert_pdf(fname: str, config_parser: ConfigParser) -> (str, Dict[str, Any], dict):
29
+ config_dict = config_parser.generate_config_dict()
30
+ config_dict["pdftext_workers"] = 1
31
+ converter = PdfConverter(
32
+ config=config_dict,
33
+ artifact_dict=model_dict,
34
+ processor_list=config_parser.get_processors(),
35
+ renderer=config_parser.get_renderer()
36
+ )
37
+ return converter(fname)
38
+
39
+
40
+ def open_pdf(pdf_file):
41
+ stream = io.BytesIO(pdf_file.getvalue())
42
+ return pypdfium2.PdfDocument(stream)
43
+
44
+
45
+ def img_to_html(img, img_alt):
46
+ img_bytes = io.BytesIO()
47
+ img.save(img_bytes, format=settings.OUTPUT_IMAGE_FORMAT)
48
+ img_bytes = img_bytes.getvalue()
49
+ encoded = base64.b64encode(img_bytes).decode()
50
+ img_html = f'<img src="data:image/{settings.OUTPUT_IMAGE_FORMAT.lower()};base64,{encoded}" alt="{img_alt}" style="max-width: 100%;">'
51
+ return img_html
52
+
53
+
54
+ def markdown_insert_images(markdown, images):
55
+ image_tags = re.findall(r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))', markdown)
56
+
57
+ for image in image_tags:
58
+ image_markdown = image[0]
59
+ image_alt = image[1]
60
+ image_path = image[2]
61
+ if image_path in images:
62
+ markdown = markdown.replace(image_markdown, img_to_html(images[image_path], image_alt))
63
+ return markdown
64
+
65
+
66
+ @st.cache_data()
67
+ def get_page_image(pdf_file, page_num, dpi=96):
68
+ doc = open_pdf(pdf_file)
69
+ renderer = doc.render(
70
+ pypdfium2.PdfBitmap.to_pil,
71
+ page_indices=[page_num],
72
+ scale=dpi / 72,
73
+ )
74
+ png = list(renderer)[0]
75
+ png_image = png.convert("RGB")
76
+ return png_image
77
+
78
+
79
+ @st.cache_data()
80
+ def page_count(pdf_file):
81
+ doc = open_pdf(pdf_file)
82
+ return len(doc) - 1
83
+
84
+
85
+ st.set_page_config(layout="wide")
86
+ col1, col2 = st.columns([.5, .5])
87
+
88
+ model_dict = load_models()
89
+
90
+
91
+ st.markdown("""
92
+ # Marker Demo
93
+
94
+ This app will let you try marker, a PDF -> Markdown converter. It works with any languages, and extracts images, tables, equations, etc.
95
+
96
+ Find the project [here](https://github.com/VikParuchuri/marker).
97
+ """)
98
+
99
+ in_file = st.sidebar.file_uploader("PDF file:", type=["pdf"])
100
+
101
+ if in_file is None:
102
+ st.stop()
103
+
104
+ filetype = in_file.type
105
+
106
+ with col1:
107
+ page_count = page_count(in_file)
108
+ page_number = st.number_input(f"Page number out of {page_count}:", min_value=0, value=0, max_value=page_count)
109
+ pil_image = get_page_image(in_file, page_number)
110
+
111
+ st.image(pil_image, caption="PDF file (preview)", use_container_width=True)
112
+
113
+ page_range = st.sidebar.text_input("Page range to parse, comma separated like 0,5-10,20", value=f"{page_number}-{page_number}")
114
+ output_format = st.sidebar.selectbox("Output format", ["markdown", "json", "html"], index=0)
115
+ run_marker = st.sidebar.button("Run Marker")
116
+
117
+ use_llm = st.sidebar.checkbox("Use LLM", help="Use LLM for higher quality processing", value=False)
118
+ force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
119
+ strip_existing_ocr = st.sidebar.checkbox("Strip existing OCR", help="Strip existing OCR text from the PDF and re-OCR.", value=False)
120
+ debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
121
+
122
+ if not run_marker:
123
+ st.stop()
124
+
125
+ # Run Marker
126
+ with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb+") as temp_pdf:
127
+ temp_pdf.write(in_file.getvalue())
128
+ temp_pdf.seek(0)
129
+ filename = temp_pdf.name
130
+ cli_options = {
131
+ "output_format": output_format,
132
+ "page_range": page_range,
133
+ "force_ocr": force_ocr,
134
+ "debug": debug,
135
+ "output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
136
+ "use_llm": use_llm,
137
+ "strip_existing_ocr": strip_existing_ocr
138
+ }
139
+ config_parser = ConfigParser(cli_options)
140
+ rendered = convert_pdf(
141
+ filename,
142
+ config_parser
143
+ )
144
+ page_range = config_parser.generate_config_dict()["page_range"]
145
+ first_page = page_range[0] if page_range else 0
146
+
147
+ text, ext, images = text_from_rendered(rendered)
148
+ with col2:
149
+ if output_format == "markdown":
150
+ text = markdown_insert_images(text, images)
151
+ st.markdown(text, unsafe_allow_html=True)
152
+ elif output_format == "json":
153
+ st.json(text)
154
+ elif output_format == "html":
155
+ st.html(text)
156
+
157
+ if debug:
158
+ with col1:
159
+ debug_data_path = rendered.metadata.get("debug_data_path")
160
+ if debug_data_path:
161
+ pdf_image_path = os.path.join(debug_data_path, f"pdf_page_{first_page}.png")
162
+ img = Image.open(pdf_image_path)
163
+ st.image(img, caption="PDF debug image", use_container_width=True)
164
+ layout_image_path = os.path.join(debug_data_path, f"layout_page_{first_page}.png")
165
+ img = Image.open(layout_image_path)
166
+ st.image(img, caption="Layout debug image", use_container_width=True)
167
+
marker_app.py CHANGED
@@ -1,167 +1,4 @@
1
- import os
2
-
3
- from marker.settings import settings
4
-
5
- os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
6
- os.environ["IN_STREAMLIT"] = "true"
7
-
8
- import base64
9
- import io
10
- import re
11
- import tempfile
12
- from typing import Any, Dict
13
-
14
- import pypdfium2
15
- import streamlit as st
16
- from PIL import Image
17
-
18
- from marker.converters.pdf import PdfConverter
19
- from marker.models import create_model_dict
20
- from marker.config.parser import ConfigParser
21
- from marker.output import text_from_rendered
22
-
23
- @st.cache_resource()
24
- def load_models():
25
- return create_model_dict()
26
-
27
-
28
- def convert_pdf(fname: str, config_parser: ConfigParser) -> (str, Dict[str, Any], dict):
29
- config_dict = config_parser.generate_config_dict()
30
- config_dict["pdftext_workers"] = 1
31
- converter = PdfConverter(
32
- config=config_dict,
33
- artifact_dict=model_dict,
34
- processor_list=config_parser.get_processors(),
35
- renderer=config_parser.get_renderer()
36
- )
37
- return converter(fname)
38
-
39
-
40
- def open_pdf(pdf_file):
41
- stream = io.BytesIO(pdf_file.getvalue())
42
- return pypdfium2.PdfDocument(stream)
43
-
44
-
45
- def img_to_html(img, img_alt):
46
- img_bytes = io.BytesIO()
47
- img.save(img_bytes, format=settings.OUTPUT_IMAGE_FORMAT)
48
- img_bytes = img_bytes.getvalue()
49
- encoded = base64.b64encode(img_bytes).decode()
50
- img_html = f'<img src="data:image/{settings.OUTPUT_IMAGE_FORMAT.lower()};base64,{encoded}" alt="{img_alt}" style="max-width: 100%;">'
51
- return img_html
52
-
53
-
54
- def markdown_insert_images(markdown, images):
55
- image_tags = re.findall(r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))', markdown)
56
-
57
- for image in image_tags:
58
- image_markdown = image[0]
59
- image_alt = image[1]
60
- image_path = image[2]
61
- if image_path in images:
62
- markdown = markdown.replace(image_markdown, img_to_html(images[image_path], image_alt))
63
- return markdown
64
-
65
-
66
- @st.cache_data()
67
- def get_page_image(pdf_file, page_num, dpi=96):
68
- doc = open_pdf(pdf_file)
69
- renderer = doc.render(
70
- pypdfium2.PdfBitmap.to_pil,
71
- page_indices=[page_num],
72
- scale=dpi / 72,
73
- )
74
- png = list(renderer)[0]
75
- png_image = png.convert("RGB")
76
- return png_image
77
-
78
-
79
- @st.cache_data()
80
- def page_count(pdf_file):
81
- doc = open_pdf(pdf_file)
82
- return len(doc) - 1
83
-
84
-
85
- st.set_page_config(layout="wide")
86
- col1, col2 = st.columns([.5, .5])
87
-
88
- model_dict = load_models()
89
-
90
-
91
- st.markdown("""
92
- # Marker Demo
93
-
94
- This app will let you try marker, a PDF -> Markdown converter. It works with any languages, and extracts images, tables, equations, etc.
95
-
96
- Find the project [here](https://github.com/VikParuchuri/marker).
97
- """)
98
-
99
- in_file = st.sidebar.file_uploader("PDF file:", type=["pdf"])
100
-
101
- if in_file is None:
102
- st.stop()
103
-
104
- filetype = in_file.type
105
-
106
- with col1:
107
- page_count = page_count(in_file)
108
- page_number = st.number_input(f"Page number out of {page_count}:", min_value=0, value=0, max_value=page_count)
109
- pil_image = get_page_image(in_file, page_number)
110
-
111
- st.image(pil_image, caption="PDF file (preview)", use_container_width=True)
112
-
113
- page_range = st.sidebar.text_input("Page range to parse, comma separated like 0,5-10,20", value=f"{page_number}-{page_number}")
114
- output_format = st.sidebar.selectbox("Output format", ["markdown", "json", "html"], index=0)
115
- run_marker = st.sidebar.button("Run Marker")
116
-
117
- use_llm = st.sidebar.checkbox("Use LLM", help="Use LLM for higher quality processing", value=False)
118
- force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
119
- strip_existing_ocr = st.sidebar.checkbox("Strip existing OCR", help="Strip existing OCR text from the PDF and re-OCR.", value=False)
120
- debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
121
-
122
- if not run_marker:
123
- st.stop()
124
-
125
- # Run Marker
126
- with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb+") as temp_pdf:
127
- temp_pdf.write(in_file.getvalue())
128
- temp_pdf.seek(0)
129
- filename = temp_pdf.name
130
- cli_options = {
131
- "output_format": output_format,
132
- "page_range": page_range,
133
- "force_ocr": force_ocr,
134
- "debug": debug,
135
- "output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
136
- "use_llm": use_llm,
137
- "strip_existing_ocr": strip_existing_ocr
138
- }
139
- config_parser = ConfigParser(cli_options)
140
- rendered = convert_pdf(
141
- filename,
142
- config_parser
143
- )
144
- page_range = config_parser.generate_config_dict()["page_range"]
145
- first_page = page_range[0] if page_range else 0
146
-
147
- text, ext, images = text_from_rendered(rendered)
148
- with col2:
149
- if output_format == "markdown":
150
- text = markdown_insert_images(text, images)
151
- st.markdown(text, unsafe_allow_html=True)
152
- elif output_format == "json":
153
- st.json(text)
154
- elif output_format == "html":
155
- st.html(text)
156
-
157
- if debug:
158
- with col1:
159
- debug_data_path = rendered.metadata.get("debug_data_path")
160
- if debug_data_path:
161
- pdf_image_path = os.path.join(debug_data_path, f"pdf_page_{first_page}.png")
162
- img = Image.open(pdf_image_path)
163
- st.image(img, caption="PDF debug image", use_container_width=True)
164
- layout_image_path = os.path.join(debug_data_path, f"layout_page_{first_page}.png")
165
- img = Image.open(layout_image_path)
166
- st.image(img, caption="Layout debug image", use_container_width=True)
167
 
 
 
 
1
+ from marker.scripts import streamlit_app_cli
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ if __name__ == "__main__":
4
+ streamlit_app_cli()
marker_server.py CHANGED
@@ -1,174 +1,4 @@
1
- import traceback
2
-
3
- import click
4
- import os
5
-
6
- import uvicorn
7
- from pydantic import BaseModel, Field
8
- from starlette.responses import HTMLResponse
9
-
10
- from marker.config.parser import ConfigParser
11
- from marker.output import text_from_rendered
12
-
13
- import base64
14
- from contextlib import asynccontextmanager
15
- from typing import Optional, Annotated
16
- import io
17
-
18
- from fastapi import FastAPI, Form, File, UploadFile
19
- from marker.converters.pdf import PdfConverter
20
- from marker.models import create_model_dict
21
- from marker.settings import settings
22
-
23
- app_data = {}
24
-
25
-
26
- UPLOAD_DIRECTORY = "./uploads"
27
- os.makedirs(UPLOAD_DIRECTORY, exist_ok=True)
28
-
29
-
30
- @asynccontextmanager
31
- async def lifespan(app: FastAPI):
32
- app_data["models"] = create_model_dict()
33
-
34
- yield
35
-
36
- if "models" in app_data:
37
- del app_data["models"]
38
-
39
-
40
- app = FastAPI(lifespan=lifespan)
41
-
42
-
43
- @app.get("/")
44
- async def root():
45
- return HTMLResponse(
46
- """
47
- <h1>Marker API</h1>
48
- <ul>
49
- <li><a href="/docs">API Documentation</a></li>
50
- <li><a href="/marker">Run marker (post request only)</a></li>
51
- </ul>
52
- """
53
- )
54
-
55
-
56
- class CommonParams(BaseModel):
57
- filepath: Annotated[
58
- Optional[str], Field(description="The path to the PDF file to convert.")
59
- ]
60
- page_range: Annotated[
61
- Optional[str],
62
- Field(description="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20", example=None)
63
- ] = None
64
- languages: Annotated[
65
- Optional[str],
66
- Field(description="Comma separated list of languages to use for OCR. Must be either the names or codes from from https://github.com/VikParuchuri/surya/blob/master/surya/languages.py.", example=None)
67
- ] = None
68
- force_ocr: Annotated[
69
- bool,
70
- Field(
71
- description="Force OCR on all pages of the PDF. Defaults to False. This can lead to worse results if you have good text in your PDFs (which is true in most cases)."
72
- ),
73
- ] = False
74
- paginate_output: Annotated[
75
- bool,
76
- Field(
77
- description="Whether to paginate the output. Defaults to False. If set to True, each page of the output will be separated by a horizontal rule that contains the page number (2 newlines, {PAGE_NUMBER}, 48 - characters, 2 newlines)."
78
- ),
79
- ] = False
80
- output_format: Annotated[
81
- str,
82
- Field(description="The format to output the text in. Can be 'markdown', 'json', or 'html'. Defaults to 'markdown'.")
83
- ] = "markdown"
84
-
85
-
86
- async def _convert_pdf(params: CommonParams):
87
- assert params.output_format in ["markdown", "json", "html"], "Invalid output format"
88
- try:
89
- options = params.model_dump()
90
- print(options)
91
- config_parser = ConfigParser(options)
92
- config_dict = config_parser.generate_config_dict()
93
- config_dict["pdftext_workers"] = 1
94
- converter = PdfConverter(
95
- config=config_dict,
96
- artifact_dict=app_data["models"],
97
- processor_list=config_parser.get_processors(),
98
- renderer=config_parser.get_renderer()
99
- )
100
- rendered = converter(params.filepath)
101
- text, _, images = text_from_rendered(rendered)
102
- metadata = rendered.metadata
103
- except Exception as e:
104
- traceback.print_exc()
105
- return {
106
- "success": False,
107
- "error": str(e),
108
- }
109
-
110
- encoded = {}
111
- for k, v in images.items():
112
- byte_stream = io.BytesIO()
113
- v.save(byte_stream, format=settings.OUTPUT_IMAGE_FORMAT)
114
- encoded[k] = base64.b64encode(byte_stream.getvalue()).decode(settings.OUTPUT_ENCODING)
115
-
116
- return {
117
- "format": params.output_format,
118
- "output": text,
119
- "images": encoded,
120
- "metadata": metadata,
121
- "success": True,
122
- }
123
-
124
- @app.post("/marker")
125
- async def convert_pdf(
126
- params: CommonParams
127
- ):
128
- return await _convert_pdf(params)
129
-
130
-
131
-
132
- @app.post("/marker/upload")
133
- async def convert_pdf_upload(
134
- page_range: Optional[str] = Form(default=None),
135
- languages: Optional[str] = Form(default=None),
136
- force_ocr: Optional[bool] = Form(default=False),
137
- paginate_output: Optional[bool] = Form(default=False),
138
- output_format: Optional[str] = Form(default="markdown"),
139
- file: UploadFile = File(
140
- ..., description="The PDF file to convert.", media_type="application/pdf"
141
- ),
142
- ):
143
- upload_path = os.path.join(UPLOAD_DIRECTORY, file.filename)
144
- with open(upload_path, "wb+") as upload_file:
145
- file_contents = await file.read()
146
- upload_file.write(file_contents)
147
-
148
- params = CommonParams(
149
- filepath=upload_path,
150
- page_range=page_range,
151
- languages=languages,
152
- force_ocr=force_ocr,
153
- paginate_output=paginate_output,
154
- output_format=output_format,
155
- )
156
- results = await _convert_pdf(params)
157
- os.remove(upload_path)
158
- return results
159
-
160
-
161
- @click.command()
162
- @click.option("--port", type=int, default=8000, help="Port to run the server on")
163
- @click.option("--host", type=str, default="127.0.0.1", help="Host to run the server on")
164
- def main(port: int, host: str):
165
- # Run the server
166
- uvicorn.run(
167
- app,
168
- host=host,
169
- port=port,
170
- )
171
-
172
 
173
  if __name__ == "__main__":
174
- main()
 
1
+ from marker.scripts import server_cli
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  if __name__ == "__main__":
4
+ server_cli()
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
  [tool.poetry]
2
  name = "marker-pdf"
3
- version = "1.2.4"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
@@ -11,13 +11,7 @@ packages = [
11
  {include = "marker"}
12
  ]
13
  include = [
14
- "convert.py",
15
- "convert_single.py",
16
- "chunk_convert.sh",
17
- "chunk_convert.py",
18
- "marker_app.py",
19
- "run_marker_app.py",
20
- "marker_server.py",
21
  ]
22
 
23
  [tool.poetry.dependencies]
@@ -53,11 +47,11 @@ pytest = "^8.3.3"
53
  pytest-mock = "^3.14.0"
54
 
55
  [tool.poetry.scripts]
56
- marker = "convert:main"
57
- marker_single = "convert_single:main"
58
- marker_chunk_convert = "chunk_convert:main"
59
- marker_gui = "run_marker_app:run"
60
- marker_server = "marker_server:main"
61
 
62
  [build-system]
63
  requires = ["poetry-core"]
 
1
  [tool.poetry]
2
  name = "marker-pdf"
3
+ version = "1.2.5"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
 
11
  {include = "marker"}
12
  ]
13
  include = [
14
+ "marker/scripts/*.sh"
 
 
 
 
 
 
15
  ]
16
 
17
  [tool.poetry.dependencies]
 
47
  pytest-mock = "^3.14.0"
48
 
49
  [tool.poetry.scripts]
50
+ marker = "marker.scripts.convert:convert_cli"
51
+ marker_single = "marker.scripts.convert_single:convert_single_cli"
52
+ marker_chunk_convert = "marker.scripts.chunk_convert:chunk_convert_cli"
53
+ marker_gui = "marker.scripts.run_streamlit_app:streamlit_app_cli"
54
+ marker_server = "marker.scripts.server:server_cli"
55
 
56
  [build-system]
57
  requires = ["poetry-core"]