Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

Vik Paruchuri committed on Jan 19

Commit

ba2fd61

1 Parent(s): c59ca05

Hotfix scripts

Browse files

Files changed (14) hide show

chunk_convert.py +2 -20
convert.py +1 -113
convert_single.py +2 -41
marker/scripts/__init__.py +5 -0
marker/scripts/chunk_convert.py +20 -0
chunk_convert.sh → marker/scripts/chunk_convert.sh +0 -0
marker/scripts/convert.py +114 -0
marker/scripts/convert_single.py +39 -0
run_marker_app.py → marker/scripts/run_streamlit_app.py +2 -6
marker/scripts/server.py +170 -0
marker/scripts/streamlit_app.py +167 -0
marker_app.py +3 -166
marker_server.py +2 -172
pyproject.toml +7 -13

chunk_convert.py CHANGED Viewed

@@ -1,22 +1,4 @@
-import argparse
-import subprocess
-import pkg_resources
-def main():
-    parser = argparse.ArgumentParser(description="Convert a folder of PDFs to a folder of markdown files in chunks.")
-    parser.add_argument("in_folder", help="Input folder with pdfs.")
-    parser.add_argument("out_folder", help="Output folder")
-    args = parser.parse_args()
-    script_path = pkg_resources.resource_filename(__name__, 'chunk_convert.sh')
-    # Construct the command
-    cmd = f"{script_path} {args.in_folder} {args.out_folder}"
-    # Execute the shell script
-    subprocess.run(cmd, shell=True, check=True)
 if __name__ == "__main__":
-    main()

+from marker.scripts import chunk_convert_cli
 if __name__ == "__main__":
+    chunk_convert_cli()

convert.py CHANGED Viewed

@@ -1,116 +1,4 @@
-import os
-os.environ["GRPC_VERBOSITY"] = "ERROR"
-os.environ["GLOG_minloglevel"] = "2"
-os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
-os.environ["IN_STREAMLIT"] = "true" # Avoid multiprocessing inside surya
-import math
-import traceback
-import click
-import torch.multiprocessing as mp
-from tqdm import tqdm
-from marker.config.parser import ConfigParser
-from marker.converters.pdf import PdfConverter
-from marker.logger import configure_logging
-from marker.models import create_model_dict
-from marker.output import output_exists, save_output
-from marker.settings import settings
-configure_logging()
-def worker_init(model_dict):
-    if model_dict is None:
-        model_dict = create_model_dict()
-    global model_refs
-    model_refs = model_dict
-def worker_exit():
-    global model_refs
-    del model_refs
-def process_single_pdf(args):
-    fpath, cli_options = args
-    config_parser = ConfigParser(cli_options)
-    out_folder = config_parser.get_output_folder(fpath)
-    base_name = config_parser.get_base_filename(fpath)
-    if cli_options.get('skip_existing') and output_exists(out_folder, base_name):
-        return
-    try:
-        converter = PdfConverter(
-            config=config_parser.generate_config_dict(),
-            artifact_dict=model_refs,
-            processor_list=config_parser.get_processors(),
-            renderer=config_parser.get_renderer()
-        )
-        rendered = converter(fpath)
-        out_folder = config_parser.get_output_folder(fpath)
-        save_output(rendered, out_folder, base_name)
-    except Exception as e:
-        print(f"Error converting {fpath}: {e}")
-        print(traceback.format_exc())
-@click.command()
-@click.argument("in_folder", type=str)
-@ConfigParser.common_options
-@click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert")
-@click.option("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
-@click.option("--max_files", type=int, default=None, help="Maximum number of pdfs to convert")
-@click.option("--workers", type=int, default=5, help="Number of worker processes to use.")
-@click.option("--skip_existing", is_flag=True, default=False, help="Skip existing converted files.")
-def main(in_folder: str, **kwargs):
-    in_folder = os.path.abspath(in_folder)
-    files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
-    files = [f for f in files if os.path.isfile(f)]
-    # Handle chunks if we're processing in parallel
-    # Ensure we get all files into a chunk
-    chunk_size = math.ceil(len(files) / kwargs["num_chunks"])
-    start_idx = kwargs["chunk_idx"] * chunk_size
-    end_idx = start_idx + chunk_size
-    files_to_convert = files[start_idx:end_idx]
-    # Limit files converted if needed
-    if kwargs["max_files"]:
-        files_to_convert = files_to_convert[:kwargs["max_files"]]
-    # Disable nested multiprocessing
-    kwargs["disable_multiprocessing"] = True
-    total_processes = min(len(files_to_convert), kwargs["workers"])
-    try:
-        mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work
-    except RuntimeError:
-        raise RuntimeError("Set start method to spawn twice. This may be a temporary issue with the script. Please try running it again.")
-    if settings.TORCH_DEVICE == "mps" or settings.TORCH_DEVICE_MODEL == "mps":
-        model_dict = None
-    else:
-        model_dict = create_model_dict()
-        for k, v in model_dict.items():
-            v.share_memory()
-    print(f"Converting {len(files_to_convert)} pdfs in chunk {kwargs['chunk_idx'] + 1}/{kwargs['num_chunks']} with {total_processes} processes and saving to {kwargs['output_dir']}")
-    task_args = [(f, kwargs) for f in files_to_convert]
-    with mp.Pool(processes=total_processes, initializer=worker_init, initargs=(model_dict,)) as pool:
-        list(tqdm(pool.imap(process_single_pdf, task_args), total=len(task_args), desc="Processing PDFs", unit="pdf"))
-        pool._worker_handler.terminate = worker_exit
-    # Delete all CUDA tensors
-    del model_dict
 if __name__ == "__main__":
     main()

+from marker.scripts import convert_cli
 if __name__ == "__main__":
     main()

convert_single.py CHANGED Viewed

@@ -1,43 +1,4 @@
-import os
-os.environ["GRPC_VERBOSITY"] = "ERROR"
-os.environ["GLOG_minloglevel"] = "2"
-os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # Transformers uses .isin for a simple op, which is not supported on MPS
-import time
-import click
-from marker.config.parser import ConfigParser
-from marker.config.printer import CustomClickPrinter
-from marker.converters.pdf import PdfConverter
-from marker.logger import configure_logging
-from marker.models import create_model_dict
-from marker.output import save_output
-configure_logging()
-@click.command(cls=CustomClickPrinter, help="Convert a single PDF to markdown.")
-@click.argument("fpath", type=str)
-@ConfigParser.common_options
-def main(fpath: str, **kwargs):
-    models = create_model_dict()
-    start = time.time()
-    config_parser = ConfigParser(kwargs)
-    converter = PdfConverter(
-        config=config_parser.generate_config_dict(),
-        artifact_dict=models,
-        processor_list=config_parser.get_processors(),
-        renderer=config_parser.get_renderer()
-    )
-    rendered = converter(fpath)
-    out_folder = config_parser.get_output_folder(fpath)
-    save_output(rendered, out_folder, config_parser.get_base_filename(fpath))
-    print(f"Saved markdown to {out_folder}")
-    print(f"Total time: {time.time() - start}")
 if __name__ == "__main__":
-    main()

+from marker.scripts import convert_single_cli
 if __name__ == "__main__":
+    convert_single_cli()

marker/scripts/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from marker.scripts.convert_single import convert_single_cli
+from marker.scripts.convert import convert_cli
+from marker.scripts.server import server_cli
+from marker.scripts.run_streamlit_app import streamlit_app_cli
+from marker.scripts.chunk_convert import chunk_convert_cli

marker/scripts/chunk_convert.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import argparse
+import os
+import subprocess
+import pkg_resources
+def chunk_convert_cli():
+    parser = argparse.ArgumentParser(description="Convert a folder of PDFs to a folder of markdown files in chunks.")
+    parser.add_argument("in_folder", help="Input folder with pdfs.")
+    parser.add_argument("out_folder", help="Output folder")
+    args = parser.parse_args()
+    cur_dir = os.path.dirname(os.path.abspath(__file__))
+    script_path = os.path.join(cur_dir, "chunk_convert.sh")
+    # Construct the command
+    cmd = f"{script_path} {args.in_folder} {args.out_folder}"
+    # Execute the shell script
+    subprocess.run(cmd, shell=True, check=True)

chunk_convert.sh → marker/scripts/chunk_convert.sh RENAMED Viewed

File without changes

marker/scripts/convert.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import os
+os.environ["GRPC_VERBOSITY"] = "ERROR"
+os.environ["GLOG_minloglevel"] = "2"
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
+os.environ["IN_STREAMLIT"] = "true" # Avoid multiprocessing inside surya
+import math
+import traceback
+import click
+import torch.multiprocessing as mp
+from tqdm import tqdm
+from marker.config.parser import ConfigParser
+from marker.config.printer import CustomClickPrinter
+from marker.logger import configure_logging
+from marker.models import create_model_dict
+from marker.output import output_exists, save_output
+from marker.settings import settings
+configure_logging()
+def worker_init(model_dict):
+    if model_dict is None:
+        model_dict = create_model_dict()
+    global model_refs
+    model_refs = model_dict
+def worker_exit():  # Remove this process's module-level reference to the models.
+    global model_refs
+    del model_refs  # NameError if model_refs was never assigned in this process
+def process_single_pdf(args):
+    fpath, cli_options = args
+    config_parser = ConfigParser(cli_options)
+    out_folder = config_parser.get_output_folder(fpath)
+    base_name = config_parser.get_base_filename(fpath)
+    if cli_options.get('skip_existing') and output_exists(out_folder, base_name):
+        return
+    converter_cls = config_parser.get_converter_cls()
+    try:
+        converter = converter_cls(
+            config=config_parser.generate_config_dict(),
+            artifact_dict=model_refs,
+            processor_list=config_parser.get_processors(),
+            renderer=config_parser.get_renderer()
+        )
+        rendered = converter(fpath)
+        out_folder = config_parser.get_output_folder(fpath)
+        save_output(rendered, out_folder, base_name)
+    except Exception as e:
+        print(f"Error converting {fpath}: {e}")
+        print(traceback.format_exc())
+@click.command(cls=CustomClickPrinter)
+@click.argument("in_folder", type=str)
+@ConfigParser.common_options
+@click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert")
+@click.option("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
+@click.option("--max_files", type=int, default=None, help="Maximum number of pdfs to convert")
+@click.option("--workers", type=int, default=5, help="Number of worker processes to use.")
+@click.option("--skip_existing", is_flag=True, default=False, help="Skip existing converted files.")
+def convert_cli(in_folder: str, **kwargs):
+    in_folder = os.path.abspath(in_folder)
+    files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
+    files = [f for f in files if os.path.isfile(f)]
+    # Handle chunks if we're processing in parallel
+    # Ensure we get all files into a chunk
+    chunk_size = math.ceil(len(files) / kwargs["num_chunks"])
+    start_idx = kwargs["chunk_idx"] * chunk_size
+    end_idx = start_idx + chunk_size
+    files_to_convert = files[start_idx:end_idx]
+    # Limit files converted if needed
+    if kwargs["max_files"]:
+        files_to_convert = files_to_convert[:kwargs["max_files"]]
+    # Disable nested multiprocessing
+    kwargs["disable_multiprocessing"] = True
+    total_processes = min(len(files_to_convert), kwargs["workers"])
+    try:
+        mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work
+    except RuntimeError:
+        raise RuntimeError("Set start method to spawn twice. This may be a temporary issue with the script. Please try running it again.")
+    if settings.TORCH_DEVICE == "mps" or settings.TORCH_DEVICE_MODEL == "mps":
+        model_dict = None
+    else:
+        model_dict = create_model_dict()
+        for k, v in model_dict.items():
+            v.share_memory()
+    print(f"Converting {len(files_to_convert)} pdfs in chunk {kwargs['chunk_idx'] + 1}/{kwargs['num_chunks']} with {total_processes} processes and saving to {kwargs['output_dir']}")
+    task_args = [(f, kwargs) for f in files_to_convert]
+    with mp.Pool(processes=total_processes, initializer=worker_init, initargs=(model_dict,)) as pool:
+        list(tqdm(pool.imap(process_single_pdf, task_args), total=len(task_args), desc="Processing PDFs", unit="pdf"))
+        pool._worker_handler.terminate = worker_exit
+    # Delete all CUDA tensors
+    del model_dict

marker/scripts/convert_single.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import os
+os.environ["GRPC_VERBOSITY"] = "ERROR"
+os.environ["GLOG_minloglevel"] = "2"
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # Transformers uses .isin for a simple op, which is not supported on MPS
+import time
+import click
+from marker.config.parser import ConfigParser
+from marker.config.printer import CustomClickPrinter
+from marker.logger import configure_logging
+from marker.models import create_model_dict
+from marker.output import save_output
+configure_logging()
+@click.command(cls=CustomClickPrinter, help="Convert a single PDF to markdown.")
+@click.argument("fpath", type=str)
+@ConfigParser.common_options
+def convert_single_cli(fpath: str, **kwargs):
+    models = create_model_dict()
+    start = time.time()
+    config_parser = ConfigParser(kwargs)
+    converter_cls = config_parser.get_converter_cls()
+    converter = converter_cls(
+        config=config_parser.generate_config_dict(),
+        artifact_dict=models,
+        processor_list=config_parser.get_processors(),
+        renderer=config_parser.get_renderer()
+    )
+    rendered = converter(fpath)
+    out_folder = config_parser.get_output_folder(fpath)
+    save_output(rendered, out_folder, config_parser.get_base_filename(fpath))
+    print(f"Saved markdown to {out_folder}")
+    print(f"Total time: {time.time() - start}")

run_marker_app.py → marker/scripts/run_streamlit_app.py RENAMED Viewed

@@ -2,12 +2,8 @@ import subprocess
 import os
-def run():
     cur_dir = os.path.dirname(os.path.abspath(__file__))
-    app_path = os.path.join(cur_dir, "marker_app.py")
     cmd = ["streamlit", "run", app_path]
     subprocess.run(cmd, env={**os.environ, "IN_STREAMLIT": "true"})
-if __name__ == "__main__":
-    run()

 import os
+def streamlit_app_cli():  # Launch the Streamlit app located next to this module.
     cur_dir = os.path.dirname(os.path.abspath(__file__))
+    app_path = os.path.join(cur_dir, "streamlit_app.py")
     cmd = ["streamlit", "run", app_path]
     subprocess.run(cmd, env={**os.environ, "IN_STREAMLIT": "true"})  # child inherits the environment plus IN_STREAMLIT=true

marker/scripts/server.py ADDED Viewed

	@@ -0,0 +1,170 @@

+import traceback
+import click
+import os
+import uvicorn
+from pydantic import BaseModel, Field
+from starlette.responses import HTMLResponse
+from marker.config.parser import ConfigParser
+from marker.output import text_from_rendered
+import base64
+from contextlib import asynccontextmanager
+from typing import Optional, Annotated
+import io
+from fastapi import FastAPI, Form, File, UploadFile
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+from marker.settings import settings
+app_data = {}
+UPLOAD_DIRECTORY = "./uploads"
+os.makedirs(UPLOAD_DIRECTORY, exist_ok=True)
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    app_data["models"] = create_model_dict()
+    yield
+    if "models" in app_data:
+        del app_data["models"]
+app = FastAPI(lifespan=lifespan)
+@app.get("/")
+async def root():
+    return HTMLResponse(
+        """
+<h1>Marker API</h1>
+<ul>
+    <li><a href="/docs">API Documentation</a></li>
+    <li><a href="/marker">Run marker (post request only)</a></li>
+</ul>
+"""
+    )
+class CommonParams(BaseModel):  # Request parameters shared by the /marker and /marker/upload endpoints.
+    filepath: Annotated[
+        Optional[str], Field(description="The path to the PDF file to convert.")
+    ]  # no default is given for this field, unlike the ones below
+    page_range: Annotated[
+        Optional[str],
+        Field(description="Page range to convert, specify comma separated page numbers or ranges.  Example: 0,5-10,20", example=None)
+    ] = None
+    languages: Annotated[
+        Optional[str],
+        Field(description="Comma separated list of languages to use for OCR. Must be either the names or codes from from https://github.com/VikParuchuri/surya/blob/master/surya/languages.py.", example=None)
+    ] = None
+    force_ocr: Annotated[
+        bool,
+        Field(
+            description="Force OCR on all pages of the PDF.  Defaults to False.  This can lead to worse results if you have good text in your PDFs (which is true in most cases)."
+        ),
+    ] = False
+    paginate_output: Annotated[
+        bool,
+        Field(
+            description="Whether to paginate the output.  Defaults to False.  If set to True, each page of the output will be separated by a horizontal rule that contains the page number (2 newlines, {PAGE_NUMBER}, 48 - characters, 2 newlines)."
+        ),
+    ] = False
+    output_format: Annotated[
+        str,
+        Field(description="The format to output the text in.  Can be 'markdown', 'json', or 'html'.  Defaults to 'markdown'.")
+    ] = "markdown"  # the allowed values are checked in _convert_pdf, not by the model
+async def _convert_pdf(params: CommonParams):
+    assert params.output_format in ["markdown", "json", "html"], "Invalid output format"
+    try:
+        options = params.model_dump()
+        print(options)
+        config_parser = ConfigParser(options)
+        config_dict = config_parser.generate_config_dict()
+        config_dict["pdftext_workers"] = 1
+        converter = PdfConverter(
+            config=config_dict,
+            artifact_dict=app_data["models"],
+            processor_list=config_parser.get_processors(),
+            renderer=config_parser.get_renderer()
+        )
+        rendered = converter(params.filepath)
+        text, _, images = text_from_rendered(rendered)
+        metadata = rendered.metadata
+    except Exception as e:
+        traceback.print_exc()
+        return {
+            "success": False,
+            "error": str(e),
+        }
+    encoded = {}
+    for k, v in images.items():
+        byte_stream = io.BytesIO()
+        v.save(byte_stream, format=settings.OUTPUT_IMAGE_FORMAT)
+        encoded[k] = base64.b64encode(byte_stream.getvalue()).decode(settings.OUTPUT_ENCODING)
+    return {
+        "format": params.output_format,
+        "output": text,
+        "images": encoded,
+        "metadata": metadata,
+        "success": True,
+    }
+@app.post("/marker")
+async def convert_pdf(
+    params: CommonParams
+):
+    return await _convert_pdf(params)
+@app.post("/marker/upload")
+async def convert_pdf_upload(
+    page_range: Optional[str] = Form(default=None),
+    languages: Optional[str] = Form(default=None),
+    force_ocr: Optional[bool] = Form(default=False),
+    paginate_output: Optional[bool] = Form(default=False),
+    output_format: Optional[str] = Form(default="markdown"),
+    file: UploadFile = File(
+        ..., description="The PDF file to convert.", media_type="application/pdf"
+    ),
+):
+    upload_path = os.path.join(UPLOAD_DIRECTORY, file.filename)
+    with open(upload_path, "wb+") as upload_file:
+        file_contents = await file.read()
+        upload_file.write(file_contents)
+    params = CommonParams(
+        filepath=upload_path,
+        page_range=page_range,
+        languages=languages,
+        force_ocr=force_ocr,
+        paginate_output=paginate_output,
+        output_format=output_format,
+    )
+    results = await _convert_pdf(params)
+    os.remove(upload_path)
+    return results
+@click.command()
+@click.option("--port", type=int, default=8000, help="Port to run the server on")
+@click.option("--host", type=str, default="127.0.0.1", help="Host to run the server on")
+def server_cli(port: int, host: str):
+    # Run the server
+    uvicorn.run(
+        app,
+        host=host,
+        port=port,
+    )

marker/scripts/streamlit_app.py ADDED Viewed

	@@ -0,0 +1,167 @@

+import os
+from marker.settings import settings
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+os.environ["IN_STREAMLIT"] = "true"
+import base64
+import io
+import re
+import tempfile
+from typing import Any, Dict
+import pypdfium2
+import streamlit as st
+from PIL import Image
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+from marker.config.parser import ConfigParser
+from marker.output import text_from_rendered
+@st.cache_resource()
+def load_models():
+    return create_model_dict()
+def convert_pdf(fname: str, config_parser: ConfigParser) -> (str, Dict[str, Any], dict):
+    config_dict = config_parser.generate_config_dict()
+    config_dict["pdftext_workers"] = 1
+    converter = PdfConverter(
+        config=config_dict,
+        artifact_dict=model_dict,
+        processor_list=config_parser.get_processors(),
+        renderer=config_parser.get_renderer()
+    )
+    return converter(fname)
+def open_pdf(pdf_file):
+    stream = io.BytesIO(pdf_file.getvalue())
+    return pypdfium2.PdfDocument(stream)
+def img_to_html(img, img_alt):
+    img_bytes = io.BytesIO()
+    img.save(img_bytes, format=settings.OUTPUT_IMAGE_FORMAT)
+    img_bytes = img_bytes.getvalue()
+    encoded = base64.b64encode(img_bytes).decode()
+    img_html = f'<img src="data:image/{settings.OUTPUT_IMAGE_FORMAT.lower()};base64,{encoded}" alt="{img_alt}" style="max-width: 100%;">'
+    return img_html
+def markdown_insert_images(markdown, images):
+    image_tags = re.findall(r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))', markdown)
+    for image in image_tags:
+        image_markdown = image[0]
+        image_alt = image[1]
+        image_path = image[2]
+        if image_path in images:
+            markdown = markdown.replace(image_markdown, img_to_html(images[image_path], image_alt))
+    return markdown
+@st.cache_data()
+def get_page_image(pdf_file, page_num, dpi=96):
+    doc = open_pdf(pdf_file)
+    renderer = doc.render(
+        pypdfium2.PdfBitmap.to_pil,
+        page_indices=[page_num],
+        scale=dpi / 72,
+    )
+    png = list(renderer)[0]
+    png_image = png.convert("RGB")
+    return png_image
+@st.cache_data()
+def page_count(pdf_file):
+    doc = open_pdf(pdf_file)
+    return len(doc) - 1
+st.set_page_config(layout="wide")
+col1, col2 = st.columns([.5, .5])  # col1: page preview and debug images, col2: converted output
+model_dict = load_models()  # module-level; read by convert_pdf
+st.markdown("""
+# Marker Demo
+This app will let you try marker, a PDF -> Markdown converter. It works with any languages, and extracts images, tables, equations, etc.
+Find the project [here](https://github.com/VikParuchuri/marker).
+""")
+in_file = st.sidebar.file_uploader("PDF file:", type=["pdf"])
+if in_file is None:
+    st.stop()  # nothing to show until a file is uploaded
+filetype = in_file.type
+with col1:
+    page_count = page_count(in_file)  # rebinds the name page_count from the function to its result
+    page_number = st.number_input(f"Page number out of {page_count}:", min_value=0, value=0, max_value=page_count)
+    pil_image = get_page_image(in_file, page_number)
+    st.image(pil_image, caption="PDF file (preview)", use_container_width=True)
+page_range = st.sidebar.text_input("Page range to parse, comma separated like 0,5-10,20", value=f"{page_number}-{page_number}")  # defaults to the previewed page only
+output_format = st.sidebar.selectbox("Output format", ["markdown", "json", "html"], index=0)
+run_marker = st.sidebar.button("Run Marker")
+use_llm = st.sidebar.checkbox("Use LLM", help="Use LLM for higher quality processing", value=False)
+force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
+strip_existing_ocr = st.sidebar.checkbox("Strip existing OCR", help="Strip existing OCR text from the PDF and re-OCR.", value=False)
+debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
+if not run_marker:
+    st.stop()  # conversion only runs after the button is pressed
+# Run Marker
+with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb+") as temp_pdf:  # the converter is given a file path, so the upload is written to disk
+    temp_pdf.write(in_file.getvalue())
+    temp_pdf.seek(0)
+    filename = temp_pdf.name
+    cli_options = {
+        "output_format": output_format,
+        "page_range": page_range,
+        "force_ocr": force_ocr,
+        "debug": debug,
+        "output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
+        "use_llm": use_llm,
+        "strip_existing_ocr": strip_existing_ocr
+    }
+    config_parser = ConfigParser(cli_options)
+    rendered = convert_pdf(
+        filename,
+        config_parser
+    )
+    page_range = config_parser.generate_config_dict()["page_range"]  # rebinds page_range from the text input to the config value
+    first_page = page_range[0] if page_range else 0  # used below to pick the debug images
+text, ext, images = text_from_rendered(rendered)
+with col2:
+    if output_format == "markdown":
+        text = markdown_insert_images(text, images)
+        st.markdown(text, unsafe_allow_html=True)  # raw HTML is needed for the inlined <img> tags
+    elif output_format == "json":
+        st.json(text)
+    elif output_format == "html":
+        st.html(text)
+if debug:
+    with col1:
+        debug_data_path = rendered.metadata.get("debug_data_path")
+        if debug_data_path:
+            pdf_image_path = os.path.join(debug_data_path, f"pdf_page_{first_page}.png")
+            img = Image.open(pdf_image_path)
+            st.image(img, caption="PDF debug image", use_container_width=True)
+            layout_image_path = os.path.join(debug_data_path, f"layout_page_{first_page}.png")
+            img = Image.open(layout_image_path)
+            st.image(img, caption="Layout debug image", use_container_width=True)

marker_app.py CHANGED Viewed

@@ -1,167 +1,4 @@
-import os
-from marker.settings import settings
-os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
-os.environ["IN_STREAMLIT"] = "true"
-import base64
-import io
-import re
-import tempfile
-from typing import Any, Dict
-import pypdfium2
-import streamlit as st
-from PIL import Image
-from marker.converters.pdf import PdfConverter
-from marker.models import create_model_dict
-from marker.config.parser import ConfigParser
-from marker.output import text_from_rendered
-@st.cache_resource()
-def load_models():
-    return create_model_dict()
-def convert_pdf(fname: str, config_parser: ConfigParser) -> (str, Dict[str, Any], dict):
-    config_dict = config_parser.generate_config_dict()
-    config_dict["pdftext_workers"] = 1
-    converter = PdfConverter(
-        config=config_dict,
-        artifact_dict=model_dict,
-        processor_list=config_parser.get_processors(),
-        renderer=config_parser.get_renderer()
-    )
-    return converter(fname)
-def open_pdf(pdf_file):
-    stream = io.BytesIO(pdf_file.getvalue())
-    return pypdfium2.PdfDocument(stream)
-def img_to_html(img, img_alt):
-    img_bytes = io.BytesIO()
-    img.save(img_bytes, format=settings.OUTPUT_IMAGE_FORMAT)
-    img_bytes = img_bytes.getvalue()
-    encoded = base64.b64encode(img_bytes).decode()
-    img_html = f'<img src="data:image/{settings.OUTPUT_IMAGE_FORMAT.lower()};base64,{encoded}" alt="{img_alt}" style="max-width: 100%;">'
-    return img_html
-def markdown_insert_images(markdown, images):
-    image_tags = re.findall(r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))', markdown)
-    for image in image_tags:
-        image_markdown = image[0]
-        image_alt = image[1]
-        image_path = image[2]
-        if image_path in images:
-            markdown = markdown.replace(image_markdown, img_to_html(images[image_path], image_alt))
-    return markdown
-@st.cache_data()
-def get_page_image(pdf_file, page_num, dpi=96):
-    doc = open_pdf(pdf_file)
-    renderer = doc.render(
-        pypdfium2.PdfBitmap.to_pil,
-        page_indices=[page_num],
-        scale=dpi / 72,
-    )
-    png = list(renderer)[0]
-    png_image = png.convert("RGB")
-    return png_image
-@st.cache_data()
-def page_count(pdf_file):
-    doc = open_pdf(pdf_file)
-    return len(doc) - 1
-st.set_page_config(layout="wide")
-col1, col2 = st.columns([.5, .5])
-model_dict = load_models()
-st.markdown("""
-# Marker Demo
-This app will let you try marker, a PDF -> Markdown converter. It works with any languages, and extracts images, tables, equations, etc.
-Find the project [here](https://github.com/VikParuchuri/marker).
-""")
-in_file = st.sidebar.file_uploader("PDF file:", type=["pdf"])
-if in_file is None:
-    st.stop()
-filetype = in_file.type
-with col1:
-    page_count = page_count(in_file)
-    page_number = st.number_input(f"Page number out of {page_count}:", min_value=0, value=0, max_value=page_count)
-    pil_image = get_page_image(in_file, page_number)
-    st.image(pil_image, caption="PDF file (preview)", use_container_width=True)
-page_range = st.sidebar.text_input("Page range to parse, comma separated like 0,5-10,20", value=f"{page_number}-{page_number}")
-output_format = st.sidebar.selectbox("Output format", ["markdown", "json", "html"], index=0)
-run_marker = st.sidebar.button("Run Marker")
-use_llm = st.sidebar.checkbox("Use LLM", help="Use LLM for higher quality processing", value=False)
-force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
-strip_existing_ocr = st.sidebar.checkbox("Strip existing OCR", help="Strip existing OCR text from the PDF and re-OCR.", value=False)
-debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
-if not run_marker:
-    st.stop()
-# Run Marker
-with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb+") as temp_pdf:
-    temp_pdf.write(in_file.getvalue())
-    temp_pdf.seek(0)
-    filename = temp_pdf.name
-    cli_options = {
-        "output_format": output_format,
-        "page_range": page_range,
-        "force_ocr": force_ocr,
-        "debug": debug,
-        "output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
-        "use_llm": use_llm,
-        "strip_existing_ocr": strip_existing_ocr
-    }
-    config_parser = ConfigParser(cli_options)
-    rendered = convert_pdf(
-        filename,
-        config_parser
-    )
-    page_range = config_parser.generate_config_dict()["page_range"]
-    first_page = page_range[0] if page_range else 0
-text, ext, images = text_from_rendered(rendered)
-with col2:
-    if output_format == "markdown":
-        text = markdown_insert_images(text, images)
-        st.markdown(text, unsafe_allow_html=True)
-    elif output_format == "json":
-        st.json(text)
-    elif output_format == "html":
-        st.html(text)
-if debug:
-    with col1:
-        debug_data_path = rendered.metadata.get("debug_data_path")
-        if debug_data_path:
-            pdf_image_path = os.path.join(debug_data_path, f"pdf_page_{first_page}.png")
-            img = Image.open(pdf_image_path)
-            st.image(img, caption="PDF debug image", use_container_width=True)
-            layout_image_path = os.path.join(debug_data_path, f"layout_page_{first_page}.png")
-            img = Image.open(layout_image_path)
-            st.image(img, caption="Layout debug image", use_container_width=True)

+from marker.scripts import streamlit_app_cli
+if __name__ == "__main__":
+    streamlit_app_cli()

marker_server.py CHANGED Viewed

@@ -1,174 +1,4 @@
-import traceback
-import click
-import os
-import uvicorn
-from pydantic import BaseModel, Field
-from starlette.responses import HTMLResponse
-from marker.config.parser import ConfigParser
-from marker.output import text_from_rendered
-import base64
-from contextlib import asynccontextmanager
-from typing import Optional, Annotated
-import io
-from fastapi import FastAPI, Form, File, UploadFile
-from marker.converters.pdf import PdfConverter
-from marker.models import create_model_dict
-from marker.settings import settings
-app_data = {}
-UPLOAD_DIRECTORY = "./uploads"
-os.makedirs(UPLOAD_DIRECTORY, exist_ok=True)
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    app_data["models"] = create_model_dict()
-    yield
-    if "models" in app_data:
-        del app_data["models"]
-app = FastAPI(lifespan=lifespan)
-@app.get("/")
-async def root():
-    return HTMLResponse(
-        """
-<h1>Marker API</h1>
-<ul>
-    <li><a href="/docs">API Documentation</a></li>
-    <li><a href="/marker">Run marker (post request only)</a></li>
-</ul>
-"""
-    )
-class CommonParams(BaseModel):
-    filepath: Annotated[
-        Optional[str], Field(description="The path to the PDF file to convert.")
-    ]
-    page_range: Annotated[
-        Optional[str],
-        Field(description="Page range to convert, specify comma separated page numbers or ranges.  Example: 0,5-10,20", example=None)
-    ] = None
-    languages: Annotated[
-        Optional[str],
-        Field(description="Comma separated list of languages to use for OCR. Must be either the names or codes from from https://github.com/VikParuchuri/surya/blob/master/surya/languages.py.", example=None)
-    ] = None
-    force_ocr: Annotated[
-        bool,
-        Field(
-            description="Force OCR on all pages of the PDF.  Defaults to False.  This can lead to worse results if you have good text in your PDFs (which is true in most cases)."
-        ),
-    ] = False
-    paginate_output: Annotated[
-        bool,
-        Field(
-            description="Whether to paginate the output.  Defaults to False.  If set to True, each page of the output will be separated by a horizontal rule that contains the page number (2 newlines, {PAGE_NUMBER}, 48 - characters, 2 newlines)."
-        ),
-    ] = False
-    output_format: Annotated[
-        str,
-        Field(description="The format to output the text in.  Can be 'markdown', 'json', or 'html'.  Defaults to 'markdown'.")
-    ] = "markdown"
-async def _convert_pdf(params: CommonParams):
-    assert params.output_format in ["markdown", "json", "html"], "Invalid output format"
-    try:
-        options = params.model_dump()
-        print(options)
-        config_parser = ConfigParser(options)
-        config_dict = config_parser.generate_config_dict()
-        config_dict["pdftext_workers"] = 1
-        converter = PdfConverter(
-            config=config_dict,
-            artifact_dict=app_data["models"],
-            processor_list=config_parser.get_processors(),
-            renderer=config_parser.get_renderer()
-        )
-        rendered = converter(params.filepath)
-        text, _, images = text_from_rendered(rendered)
-        metadata = rendered.metadata
-    except Exception as e:
-        traceback.print_exc()
-        return {
-            "success": False,
-            "error": str(e),
-        }
-    encoded = {}
-    for k, v in images.items():
-        byte_stream = io.BytesIO()
-        v.save(byte_stream, format=settings.OUTPUT_IMAGE_FORMAT)
-        encoded[k] = base64.b64encode(byte_stream.getvalue()).decode(settings.OUTPUT_ENCODING)
-    return {
-        "format": params.output_format,
-        "output": text,
-        "images": encoded,
-        "metadata": metadata,
-        "success": True,
-    }
-@app.post("/marker")
-async def convert_pdf(
-    params: CommonParams
-):
-    return await _convert_pdf(params)
-@app.post("/marker/upload")
-async def convert_pdf_upload(
-    page_range: Optional[str] = Form(default=None),
-    languages: Optional[str] = Form(default=None),
-    force_ocr: Optional[bool] = Form(default=False),
-    paginate_output: Optional[bool] = Form(default=False),
-    output_format: Optional[str] = Form(default="markdown"),
-    file: UploadFile = File(
-        ..., description="The PDF file to convert.", media_type="application/pdf"
-    ),
-):
-    upload_path = os.path.join(UPLOAD_DIRECTORY, file.filename)
-    with open(upload_path, "wb+") as upload_file:
-        file_contents = await file.read()
-        upload_file.write(file_contents)
-    params = CommonParams(
-        filepath=upload_path,
-        page_range=page_range,
-        languages=languages,
-        force_ocr=force_ocr,
-        paginate_output=paginate_output,
-        output_format=output_format,
-    )
-    results = await _convert_pdf(params)
-    os.remove(upload_path)
-    return results
-@click.command()
-@click.option("--port", type=int, default=8000, help="Port to run the server on")
-@click.option("--host", type=str, default="127.0.0.1", help="Host to run the server on")
-def main(port: int, host: str):
-    # Run the server
-    uvicorn.run(
-        app,
-        host=host,
-        port=port,
-    )
 if __name__ == "__main__":
-    main()

+from marker.scripts import server_cli
 if __name__ == "__main__":
+    server_cli()

pyproject.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "1.2.4"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"
@@ -11,13 +11,7 @@ packages = [
     {include = "marker"}
 ]
 include = [
-    "convert.py",
-    "convert_single.py",
-    "chunk_convert.sh",
-    "chunk_convert.py",
-    "marker_app.py",
-    "run_marker_app.py",
-    "marker_server.py",
 ]
 [tool.poetry.dependencies]
@@ -53,11 +47,11 @@ pytest = "^8.3.3"
 pytest-mock = "^3.14.0"
 [tool.poetry.scripts]
-marker = "convert:main"
-marker_single = "convert_single:main"
-marker_chunk_convert = "chunk_convert:main"
-marker_gui = "run_marker_app:run"
-marker_server = "marker_server:main"
 [build-system]
 requires = ["poetry-core"]

 [tool.poetry]
 name = "marker-pdf"
+version = "1.2.5"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"
     {include = "marker"}
 ]
 include = [
+    "marker/scripts/*.sh"
 ]
 [tool.poetry.dependencies]
 pytest-mock = "^3.14.0"
 [tool.poetry.scripts]
+marker = "marker.scripts.convert:convert_cli"
+marker_single = "marker.scripts.convert_single:convert_single_cli"
+marker_chunk_convert = "marker.scripts.chunk_convert:chunk_convert_cli"
+marker_gui = "marker.scripts.run_streamlit_app:streamlit_app_cli"
+marker_server = "marker.scripts.server:server_cli"
 [build-system]
 requires = ["poetry-core"]