Vik Paruchuri
committed on
Commit
·
c959776
1
Parent(s):
fc65ff4
Handle tables
Browse files
- README.md +2 -0
- apt-requirements.txt +2 -0
- convert.py +80 -0
- convert_single.py +20 -0
- marker/{code.py → cleaners/code.py} +24 -22
- marker/{equations.py → cleaners/equations.py} +0 -0
- marker/{headers.py → cleaners/headers.py} +0 -0
- marker/cleaners/table.py +92 -0
- parse.py → marker/convert.py +9 -21
- marker/markdown.py +3 -5
- marker/schema.py +21 -0
- marker/segmentation.py +2 -1
- marker/settings.py +16 -4
- poetry.lock +165 -4
- pyproject.toml +5 -1
README.md
CHANGED
|
@@ -10,7 +10,9 @@ This project converts PDF to Markdown, balancing speed with quality:
|
|
| 10 |
## Install
|
| 11 |
|
| 12 |
- `poetry install`
|
|
|
|
| 13 |
- Set `TESSDATA_PREFIX`
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
## Usage
|
|
|
|
| 10 |
## Install
|
| 11 |
|
| 12 |
- `poetry install`
|
| 13 |
+
- Install apt requirements
|
| 14 |
- Set `TESSDATA_PREFIX`
|
| 15 |
+
- Find tessdata folder
|
| 16 |
|
| 17 |
|
| 18 |
## Usage
|
apt-requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
tesseract-ocr
|
| 2 |
+
libtesseract-dev
|
convert.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import os
|
| 3 |
+
import ray
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
+
|
| 6 |
+
from marker.convert import convert_single_pdf
|
| 7 |
+
from marker.segmentation import load_layout_model
|
| 8 |
+
from marker.cleaners.equations import load_nougat_model
|
| 9 |
+
from marker.settings import settings
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER)
|
| 13 |
+
def process_single_pdf(fname, out_folder, nougat_model, layout_model):
|
| 14 |
+
out_filename = fname.rsplit(".", 1)[0] + ".md"
|
| 15 |
+
out_filename = os.path.join(out_folder, os.path.basename(out_filename))
|
| 16 |
+
if os.path.exists(out_filename):
|
| 17 |
+
return
|
| 18 |
+
try:
|
| 19 |
+
full_text = convert_single_pdf(fname, layout_model, nougat_model)
|
| 20 |
+
with open(out_filename, "w+") as f:
|
| 21 |
+
f.write(full_text)
|
| 22 |
+
except Exception as e:
|
| 23 |
+
print(f"Error converting {fname}: {e}")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
if __name__ == "__main__":
|
| 27 |
+
parser = argparse.ArgumentParser(description="Convert multiple pdfs to markdown.")
|
| 28 |
+
parser.add_argument("in_folder", help="Input folder with pdfs.")
|
| 29 |
+
parser.add_argument("out_folder", help="Output folder")
|
| 30 |
+
parser.add_argument("--chunk_idx", type=int, default=0, help="Chunk index to convert")
|
| 31 |
+
parser.add_argument("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
|
| 32 |
+
parser.add_argument("--max", type=int, default=None, help="Maximum number of pdfs to convert")
|
| 33 |
+
parser.add_argument("--workers", type=int, default=5, help="Number of worker processes to use")
|
| 34 |
+
|
| 35 |
+
args = parser.parse_args()
|
| 36 |
+
|
| 37 |
+
in_folder = args.in_folder
|
| 38 |
+
out_folder = args.out_folder
|
| 39 |
+
files = [os.path.join(in_folder, f) for f in os.listdir(in_folder) if f.endswith(".pdf")]
|
| 40 |
+
os.makedirs(out_folder, exist_ok=True)
|
| 41 |
+
|
| 42 |
+
# Handle chunks if we're processing in parallel
|
| 43 |
+
chunk_size = len(files) // args.num_chunks
|
| 44 |
+
start_idx = args.chunk_idx * chunk_size
|
| 45 |
+
end_idx = start_idx + chunk_size
|
| 46 |
+
files_to_convert = files[start_idx:end_idx]
|
| 47 |
+
|
| 48 |
+
# Limit files converted if needed
|
| 49 |
+
if args.max:
|
| 50 |
+
files_to_convert = files_to_convert[:args.max]
|
| 51 |
+
|
| 52 |
+
total_processes = min(len(files), args.workers)
|
| 53 |
+
|
| 54 |
+
ray.init(
|
| 55 |
+
num_cpus=total_processes,
|
| 56 |
+
storage=settings.RAY_CACHE_PATH,
|
| 57 |
+
_temp_dir=settings.RAY_CACHE_PATH,
|
| 58 |
+
dashboard_host=settings.RAY_DASHBOARD_HOST
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
nougat_model = load_nougat_model()
|
| 62 |
+
layoutlm_model = load_layout_model()
|
| 63 |
+
|
| 64 |
+
nougat_ref = ray.put(nougat_model)
|
| 65 |
+
layoutlm_ref = ray.put(layoutlm_model)
|
| 66 |
+
|
| 67 |
+
print(f"Converting {len(files_to_convert)} pdfs with {total_processes} processes, and storing in {out_folder}")
|
| 68 |
+
futures = [process_single_pdf.remote(filename, out_folder, nougat_ref, layoutlm_ref) for filename in files_to_convert]
|
| 69 |
+
|
| 70 |
+
# Run all ray conversion tasks
|
| 71 |
+
progress_bar = tqdm(total=len(futures))
|
| 72 |
+
while len(futures) > 0:
|
| 73 |
+
finished, futures = ray.wait(
|
| 74 |
+
futures, timeout=7.0
|
| 75 |
+
)
|
| 76 |
+
finished_lst = ray.get(finished)
|
| 77 |
+
progress_bar.update(len(finished_lst))
|
| 78 |
+
|
| 79 |
+
# Shutdown ray to free resources
|
| 80 |
+
ray.shutdown()
|
convert_single.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
from marker.convert import convert_single_pdf
|
| 4 |
+
from marker.segmentation import load_layout_model
|
| 5 |
+
from marker.cleaners.equations import load_nougat_model
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
if __name__ == "__main__":
|
| 9 |
+
parser = argparse.ArgumentParser()
|
| 10 |
+
parser.add_argument("filename", help="PDF file to parse")
|
| 11 |
+
parser.add_argument("output", help="Output file name")
|
| 12 |
+
args = parser.parse_args()
|
| 13 |
+
|
| 14 |
+
fname = args.filename
|
| 15 |
+
layoutlm_model = load_layout_model()
|
| 16 |
+
nougat_model = load_nougat_model()
|
| 17 |
+
full_text = convert_single_pdf(fname, layoutlm_model, nougat_model)
|
| 18 |
+
|
| 19 |
+
with open(args.output, "w+") as f:
|
| 20 |
+
f.write(full_text)
|
marker/{code.py → cleaners/code.py}
RENAMED
|
@@ -4,10 +4,10 @@ from typing import List
|
|
| 4 |
import fitz as pymupdf
|
| 5 |
|
| 6 |
|
| 7 |
-
def is_code_linelen(lines, thresh=
|
| 8 |
# Decide based on chars per newline threshold
|
| 9 |
total_alnum_chars = sum(len(re.findall(r'\w', line.prelim_text)) for line in lines)
|
| 10 |
-
total_newlines = len(lines) - 1
|
| 11 |
|
| 12 |
if total_alnum_chars == 0:
|
| 13 |
return False
|
|
@@ -30,49 +30,51 @@ def identify_code_blocks(blocks: List[Page]):
|
|
| 30 |
else:
|
| 31 |
font_info += stats
|
| 32 |
most_common_font = font_info.most_common(1)[0][0]
|
|
|
|
| 33 |
for page in blocks:
|
| 34 |
try:
|
| 35 |
-
|
| 36 |
-
common_start = page.get_line_start_stats().most_common(1)[0][0]
|
| 37 |
except IndexError:
|
| 38 |
continue
|
| 39 |
|
| 40 |
for block in page.blocks:
|
| 41 |
-
if len(block.lines) < 2:
|
| 42 |
-
continue
|
| 43 |
if block.most_common_block_type() != "Text":
|
|
|
|
| 44 |
continue
|
| 45 |
|
| 46 |
-
|
| 47 |
line_fonts = []
|
| 48 |
for line in block.lines:
|
| 49 |
fonts = [span.font for span in line.spans]
|
| 50 |
line_fonts += fonts
|
| 51 |
-
line_height = line.bbox[3] - line.bbox[1]
|
| 52 |
line_start = line.bbox[0]
|
| 53 |
-
if line_start >
|
| 54 |
-
|
| 55 |
else:
|
| 56 |
-
|
| 57 |
comment_lines = comment_count([line.prelim_text for line in block.lines])
|
| 58 |
is_code = [
|
| 59 |
-
len(block.lines) >
|
| 60 |
-
sum([f != most_common_font for f in line_fonts]) > len(line_fonts)
|
| 61 |
-
(
|
| 62 |
-
sum(is_code) > len(block.lines) * .2
|
| 63 |
-
or
|
| 64 |
-
comment_lines > len(block.lines) * .1
|
| 65 |
-
), # 20% of lines are indented or comments
|
| 66 |
(
|
| 67 |
-
|
| 68 |
or
|
| 69 |
-
comment_lines > len(block.lines) * .
|
| 70 |
-
|
| 71 |
]
|
| 72 |
|
| 73 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
block.set_block_type("Code")
|
| 75 |
|
|
|
|
|
|
|
| 76 |
|
| 77 |
def indent_blocks(blocks: List[Page]):
|
| 78 |
span_counter = 0
|
|
|
|
| 4 |
import fitz as pymupdf
|
| 5 |
|
| 6 |
|
| 7 |
+
def is_code_linelen(lines, thresh=60):
|
| 8 |
# Decide based on chars per newline threshold
|
| 9 |
total_alnum_chars = sum(len(re.findall(r'\w', line.prelim_text)) for line in lines)
|
| 10 |
+
total_newlines = max(len(lines) - 1, 1)
|
| 11 |
|
| 12 |
if total_alnum_chars == 0:
|
| 13 |
return False
|
|
|
|
| 30 |
else:
|
| 31 |
font_info += stats
|
| 32 |
most_common_font = font_info.most_common(1)[0][0]
|
| 33 |
+
last_block = None
|
| 34 |
for page in blocks:
|
| 35 |
try:
|
| 36 |
+
min_start = page.get_min_line_start()
|
|
|
|
| 37 |
except IndexError:
|
| 38 |
continue
|
| 39 |
|
| 40 |
for block in page.blocks:
|
|
|
|
|
|
|
| 41 |
if block.most_common_block_type() != "Text":
|
| 42 |
+
last_block = block
|
| 43 |
continue
|
| 44 |
|
| 45 |
+
is_indent = []
|
| 46 |
line_fonts = []
|
| 47 |
for line in block.lines:
|
| 48 |
fonts = [span.font for span in line.spans]
|
| 49 |
line_fonts += fonts
|
|
|
|
| 50 |
line_start = line.bbox[0]
|
| 51 |
+
if line_start > min_start:
|
| 52 |
+
is_indent.append(True)
|
| 53 |
else:
|
| 54 |
+
is_indent.append(False)
|
| 55 |
comment_lines = comment_count([line.prelim_text for line in block.lines])
|
| 56 |
is_code = [
|
| 57 |
+
len(block.lines) > 3,
|
| 58 |
+
sum([f != most_common_font for f in line_fonts]) > len(line_fonts) * .8, # At least 80% of the fonts are not the most common, since code usually uses a different font from the main body text
|
| 59 |
+
is_code_linelen(block.lines),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
(
|
| 61 |
+
sum(is_indent) > len(block.lines) * .2
|
| 62 |
or
|
| 63 |
+
comment_lines > len(block.lines) * .2
|
| 64 |
+
), # 20% lines indented or 20% of the lines are comments
|
| 65 |
]
|
| 66 |
|
| 67 |
+
# Check if previous block is code, and this block is indented
|
| 68 |
+
is_code_prev = [
|
| 69 |
+
last_block and last_block.most_common_block_type() == "Code",
|
| 70 |
+
sum(is_indent) >= len(block.lines) * .8 # At least 80% indented
|
| 71 |
+
]
|
| 72 |
+
|
| 73 |
+
if all(is_code) or all(is_code_prev):
|
| 74 |
block.set_block_type("Code")
|
| 75 |
|
| 76 |
+
last_block = block
|
| 77 |
+
|
| 78 |
|
| 79 |
def indent_blocks(blocks: List[Page]):
|
| 80 |
span_counter = 0
|
marker/{equations.py → cleaners/equations.py}
RENAMED
|
File without changes
|
marker/{headers.py → cleaners/headers.py}
RENAMED
|
File without changes
|
marker/cleaners/table.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from marker.bbox import merge_boxes
|
| 2 |
+
from marker.schema import Line, Span, Block, Page
|
| 3 |
+
from copy import deepcopy
|
| 4 |
+
from tabulate import tabulate
|
| 5 |
+
from typing import List
|
| 6 |
+
import re
|
| 7 |
+
import textwrap
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def merge_table_blocks(blocks: List[Page]):
|
| 11 |
+
last_block = None
|
| 12 |
+
current_lines = []
|
| 13 |
+
current_bbox = None
|
| 14 |
+
for page in blocks:
|
| 15 |
+
new_page_blocks = []
|
| 16 |
+
for block in page.blocks:
|
| 17 |
+
if block.most_common_block_type() != "Table":
|
| 18 |
+
if len(current_lines) > 0:
|
| 19 |
+
new_block = Block(
|
| 20 |
+
lines=deepcopy(current_lines),
|
| 21 |
+
pnum=last_block.pnum,
|
| 22 |
+
bbox=current_bbox
|
| 23 |
+
)
|
| 24 |
+
new_page_blocks.append(new_block)
|
| 25 |
+
current_lines = []
|
| 26 |
+
current_bbox = None
|
| 27 |
+
|
| 28 |
+
new_page_blocks.append(block)
|
| 29 |
+
last_block = block
|
| 30 |
+
continue
|
| 31 |
+
|
| 32 |
+
current_lines.extend(block.lines)
|
| 33 |
+
if current_bbox is None:
|
| 34 |
+
current_bbox = block.bbox
|
| 35 |
+
else:
|
| 36 |
+
current_bbox = merge_boxes(current_bbox, block.bbox)
|
| 37 |
+
|
| 38 |
+
if len(current_lines) > 0:
|
| 39 |
+
new_block = Block(
|
| 40 |
+
lines=deepcopy(current_lines),
|
| 41 |
+
pnum=last_block.pnum,
|
| 42 |
+
bbox=current_bbox
|
| 43 |
+
)
|
| 44 |
+
blocks[-1].blocks.append(new_block)
|
| 45 |
+
current_lines = []
|
| 46 |
+
current_bbox = []
|
| 47 |
+
|
| 48 |
+
page.blocks = new_page_blocks
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def create_new_tables(blocks: List[Page]):
|
| 52 |
+
table_idx = 0
|
| 53 |
+
dot_pattern = re.compile(r'(\s*\.\s*){4,}')
|
| 54 |
+
dot_multiline_pattern = re.compile(r'.*(\s*\.\s*){4,}.*', re.DOTALL)
|
| 55 |
+
|
| 56 |
+
for page in blocks:
|
| 57 |
+
for block in page.blocks:
|
| 58 |
+
if block.most_common_block_type() != "Table" or len(block.lines) < 3:
|
| 59 |
+
continue
|
| 60 |
+
|
| 61 |
+
table_rows = []
|
| 62 |
+
y_coord = None
|
| 63 |
+
row = []
|
| 64 |
+
for line in block.lines:
|
| 65 |
+
for span in line.spans:
|
| 66 |
+
if y_coord != span.y_start:
|
| 67 |
+
if len(row) > 0:
|
| 68 |
+
table_rows.append(row)
|
| 69 |
+
row = []
|
| 70 |
+
y_coord = span.y_start
|
| 71 |
+
|
| 72 |
+
text = span.text
|
| 73 |
+
if dot_multiline_pattern.match(text):
|
| 74 |
+
text = dot_pattern.sub(' ', text)
|
| 75 |
+
row.append(text)
|
| 76 |
+
if len(row) > 0:
|
| 77 |
+
table_rows.append(row)
|
| 78 |
+
new_text = tabulate(table_rows, headers="firstrow", tablefmt="pipe")
|
| 79 |
+
new_span = Span(
|
| 80 |
+
bbox=block.bbox,
|
| 81 |
+
span_id=f"{table_idx}_fix_table",
|
| 82 |
+
font="Table",
|
| 83 |
+
color=0,
|
| 84 |
+
block_type="Table",
|
| 85 |
+
text=new_text
|
| 86 |
+
)
|
| 87 |
+
new_line = Line(
|
| 88 |
+
bbox=block.bbox,
|
| 89 |
+
spans=[new_span]
|
| 90 |
+
)
|
| 91 |
+
block.lines = [new_line]
|
| 92 |
+
table_idx += 1
|
parse.py → marker/convert.py
RENAMED
|
@@ -1,11 +1,9 @@
|
|
| 1 |
-
import argparse
|
| 2 |
-
|
| 3 |
import fitz as pymupdf
|
| 4 |
from marker.extract_text import get_text_blocks
|
| 5 |
-
from marker.headers import categorize_blocks, filter_header_footer
|
| 6 |
-
from marker.equations import replace_equations, load_nougat_model
|
| 7 |
from marker.segmentation import detect_all_block_types, load_layout_model
|
| 8 |
-
from marker.code import identify_code_blocks, indent_blocks
|
| 9 |
from marker.markdown import merge_spans, merge_lines, get_full_text
|
| 10 |
from marker.schema import Page, BlockType
|
| 11 |
from typing import List
|
|
@@ -18,35 +16,26 @@ def annotate_spans(blocks: List[Page], block_types: List[BlockType]):
|
|
| 18 |
page.add_block_types(page_block_types)
|
| 19 |
|
| 20 |
|
| 21 |
-
|
| 22 |
-
parser = argparse.ArgumentParser()
|
| 23 |
-
parser.add_argument("filename", help="PDF file to parse")
|
| 24 |
-
parser.add_argument("output", help="Output file name")
|
| 25 |
-
args = parser.parse_args()
|
| 26 |
-
|
| 27 |
-
fname = args.filename
|
| 28 |
doc = pymupdf.open(fname)
|
| 29 |
blocks, toc = get_text_blocks(doc)
|
| 30 |
|
| 31 |
-
layoutlm_model = load_layout_model()
|
| 32 |
block_types = detect_all_block_types(doc, blocks, layoutlm_model)
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
filtered = deepcopy(blocks)
|
| 35 |
annotate_spans(filtered, block_types)
|
| 36 |
identify_code_blocks(filtered)
|
| 37 |
indent_blocks(filtered)
|
| 38 |
|
| 39 |
-
bad_span_ids = categorize_blocks(blocks)
|
| 40 |
-
bad_span_ids += filter_header_footer(blocks)
|
| 41 |
-
|
| 42 |
-
# Copy to avoid changing original data
|
| 43 |
-
|
| 44 |
for page in filtered:
|
| 45 |
for block in page.blocks:
|
| 46 |
block.filter_spans(bad_span_ids)
|
| 47 |
block.filter_bad_span_types(block_types[page.pnum])
|
| 48 |
|
| 49 |
-
nougat_model = load_nougat_model()
|
| 50 |
filtered = replace_equations(doc, filtered, block_types, nougat_model)
|
| 51 |
|
| 52 |
# Copy to avoid changing original data
|
|
@@ -54,5 +43,4 @@ if __name__ == "__main__":
|
|
| 54 |
text_blocks = merge_lines(merged_lines, filtered)
|
| 55 |
full_text = get_full_text(text_blocks)
|
| 56 |
|
| 57 |
-
|
| 58 |
-
f.write(full_text)
|
|
|
|
|
|
|
|
|
|
| 1 |
import fitz as pymupdf
|
| 2 |
from marker.extract_text import get_text_blocks
|
| 3 |
+
from marker.cleaners.headers import categorize_blocks, filter_header_footer
|
| 4 |
+
from marker.cleaners.equations import replace_equations, load_nougat_model
|
| 5 |
from marker.segmentation import detect_all_block_types, load_layout_model
|
| 6 |
+
from marker.cleaners.code import identify_code_blocks, indent_blocks
|
| 7 |
from marker.markdown import merge_spans, merge_lines, get_full_text
|
| 8 |
from marker.schema import Page, BlockType
|
| 9 |
from typing import List
|
|
|
|
| 16 |
page.add_block_types(page_block_types)
|
| 17 |
|
| 18 |
|
| 19 |
+
def convert_single_pdf(fname: str, layoutlm_model, nougat_model):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
doc = pymupdf.open(fname)
|
| 21 |
blocks, toc = get_text_blocks(doc)
|
| 22 |
|
|
|
|
| 23 |
block_types = detect_all_block_types(doc, blocks, layoutlm_model)
|
| 24 |
|
| 25 |
+
# Find headers and footers
|
| 26 |
+
bad_span_ids = categorize_blocks(blocks)
|
| 27 |
+
bad_span_ids += filter_header_footer(blocks)
|
| 28 |
+
|
| 29 |
filtered = deepcopy(blocks)
|
| 30 |
annotate_spans(filtered, block_types)
|
| 31 |
identify_code_blocks(filtered)
|
| 32 |
indent_blocks(filtered)
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
for page in filtered:
|
| 35 |
for block in page.blocks:
|
| 36 |
block.filter_spans(bad_span_ids)
|
| 37 |
block.filter_bad_span_types(block_types[page.pnum])
|
| 38 |
|
|
|
|
| 39 |
filtered = replace_equations(doc, filtered, block_types, nougat_model)
|
| 40 |
|
| 41 |
# Copy to avoid changing original data
|
|
|
|
| 43 |
text_blocks = merge_lines(merged_lines, filtered)
|
| 44 |
full_text = get_full_text(text_blocks)
|
| 45 |
|
| 46 |
+
return full_text
|
|
|
marker/markdown.py
CHANGED
|
@@ -55,8 +55,6 @@ def merge_spans(blocks):
|
|
| 55 |
|
| 56 |
|
| 57 |
def block_surround(text, block_type):
|
| 58 |
-
dot_pattern = re.compile(r'(\s*\.\s*){4,}')
|
| 59 |
-
dot_multiline_pattern = re.compile(r'.*(\s*\.\s*){4,}.*', re.DOTALL)
|
| 60 |
match block_type:
|
| 61 |
case "Section-header":
|
| 62 |
if not text.startswith("#"):
|
|
@@ -64,8 +62,8 @@ def block_surround(text, block_type):
|
|
| 64 |
case "Title":
|
| 65 |
if not text.startswith("#"):
|
| 66 |
text = "# " + text.strip() + "\n"
|
| 67 |
-
case "Table"
|
| 68 |
-
text =
|
| 69 |
case "List-item":
|
| 70 |
pass
|
| 71 |
case "Code":
|
|
@@ -89,7 +87,7 @@ def line_separator(line1, line2, block_type, is_continuation=False):
|
|
| 89 |
|
| 90 |
if block_type in ["Title", "Section-header"]:
|
| 91 |
return line1.rstrip() + " " + line2.lstrip()
|
| 92 |
-
elif lowercase_pattern1.match(line1) and lowercase_pattern2.match(line2):
|
| 93 |
return line1.rstrip() + " " + line2.lstrip()
|
| 94 |
elif is_continuation:
|
| 95 |
return line1.rstrip() + " " + line2.lstrip()
|
|
|
|
| 55 |
|
| 56 |
|
| 57 |
def block_surround(text, block_type):
|
|
|
|
|
|
|
| 58 |
match block_type:
|
| 59 |
case "Section-header":
|
| 60 |
if not text.startswith("#"):
|
|
|
|
| 62 |
case "Title":
|
| 63 |
if not text.startswith("#"):
|
| 64 |
text = "# " + text.strip() + "\n"
|
| 65 |
+
case "Table":
|
| 66 |
+
text = "\n" + text + "\n"
|
| 67 |
case "List-item":
|
| 68 |
pass
|
| 69 |
case "Code":
|
|
|
|
| 87 |
|
| 88 |
if block_type in ["Title", "Section-header"]:
|
| 89 |
return line1.rstrip() + " " + line2.lstrip()
|
| 90 |
+
elif lowercase_pattern1.match(line1) and lowercase_pattern2.match(line2) and block_type == "Text":
|
| 91 |
return line1.rstrip() + " " + line2.lstrip()
|
| 92 |
elif is_continuation:
|
| 93 |
return line1.rstrip() + " " + line2.lstrip()
|
marker/schema.py
CHANGED
|
@@ -26,6 +26,22 @@ class BboxElement(BaseModel):
|
|
| 26 |
raise ValueError('bbox must have 4 elements')
|
| 27 |
return v
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
class BlockType(BboxElement):
|
| 31 |
block_type: str
|
|
@@ -151,6 +167,11 @@ class Page(BaseModel):
|
|
| 151 |
start_counts = Counter(starts)
|
| 152 |
return start_counts
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
class MergedLine(BboxElement):
|
| 156 |
text: str
|
|
|
|
| 26 |
raise ValueError('bbox must have 4 elements')
|
| 27 |
return v
|
| 28 |
|
| 29 |
+
@property
|
| 30 |
+
def height(self):
|
| 31 |
+
return self.bbox[3] - self.bbox[1]
|
| 32 |
+
|
| 33 |
+
@property
|
| 34 |
+
def width(self):
|
| 35 |
+
return self.bbox[2] - self.bbox[0]
|
| 36 |
+
|
| 37 |
+
@property
|
| 38 |
+
def x_start(self):
|
| 39 |
+
return self.bbox[0]
|
| 40 |
+
|
| 41 |
+
@property
|
| 42 |
+
def y_start(self):
|
| 43 |
+
return self.bbox[1]
|
| 44 |
+
|
| 45 |
|
| 46 |
class BlockType(BboxElement):
|
| 47 |
block_type: str
|
|
|
|
| 167 |
start_counts = Counter(starts)
|
| 168 |
return start_counts
|
| 169 |
|
| 170 |
+
def get_min_line_start(self):
|
| 171 |
+
starts = [l.bbox[0] for l in self.get_nonblank_lines() if l.spans[0].block_type == "Text"]
|
| 172 |
+
if len(starts) == 0:
|
| 173 |
+
raise IndexError("No lines found")
|
| 174 |
+
return min(starts)
|
| 175 |
|
| 176 |
class MergedLine(BboxElement):
|
| 177 |
text: str
|
marker/segmentation.py
CHANGED
|
@@ -19,6 +19,7 @@ NO_CHUNK_KEYS = ["pixel_values"]
|
|
| 19 |
|
| 20 |
def load_layout_model():
|
| 21 |
model = LayoutLMv3ForTokenClassification.from_pretrained("Kwan0/layoutlmv3-base-finetune-DocLayNet-100k").to(settings.TORCH_DEVICE)
|
|
|
|
| 22 |
model.config.id2label = {
|
| 23 |
0: "Caption",
|
| 24 |
1: "Footnote",
|
|
@@ -33,7 +34,7 @@ def load_layout_model():
|
|
| 33 |
10: "Title"
|
| 34 |
}
|
| 35 |
|
| 36 |
-
model.config.label2id =
|
| 37 |
return model
|
| 38 |
|
| 39 |
|
|
|
|
| 19 |
|
| 20 |
def load_layout_model():
|
| 21 |
model = LayoutLMv3ForTokenClassification.from_pretrained("Kwan0/layoutlmv3-base-finetune-DocLayNet-100k").to(settings.TORCH_DEVICE)
|
| 22 |
+
|
| 23 |
model.config.id2label = {
|
| 24 |
0: "Caption",
|
| 25 |
1: "Footnote",
|
|
|
|
| 34 |
10: "Title"
|
| 35 |
}
|
| 36 |
|
| 37 |
+
model.config.label2id = {v: k for k, v in model.config.id2label.items()}
|
| 38 |
return model
|
| 39 |
|
| 40 |
|
marker/settings.py
CHANGED
|
@@ -6,18 +6,30 @@ from pydantic_settings import BaseSettings
|
|
| 6 |
|
| 7 |
|
| 8 |
class Settings(BaseSettings):
|
| 9 |
-
#
|
| 10 |
-
DPI: int = 400
|
| 11 |
-
INVALID_CHARS: List[str] = [chr(0xfffd), "~", chr(65533), "↵"]
|
| 12 |
TORCH_DEVICE: str = "cpu"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
TESSDATA_PREFIX: str = ""
|
| 14 |
-
|
|
|
|
| 15 |
NOUGAT_MODEL_MAX: int = 1024 # Max inference length for nougat
|
| 16 |
NOUGAT_HALLUCINATION_WORDS: List[str] = ["[MISSING_PAGE_POST]", "## References\n", "**Figure Captions**\n", "Footnote",
|
| 17 |
"\par\par\par", "## Chapter", "Fig."]
|
|
|
|
|
|
|
|
|
|
| 18 |
LAYOUT_MODEL_MAX: int = 512
|
| 19 |
LAYOUT_CHUNK_OVERLAP: int = 128
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
class Config:
|
| 22 |
env_file = find_dotenv("local.env")
|
| 23 |
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
class Settings(BaseSettings):
|
| 9 |
+
# General
|
|
|
|
|
|
|
| 10 |
TORCH_DEVICE: str = "cpu"
|
| 11 |
+
|
| 12 |
+
# OCR
|
| 13 |
+
INVALID_CHARS: List[str] = [chr(0xfffd), "~", chr(65533), "↵"]
|
| 14 |
+
DPI: int = 400
|
| 15 |
TESSDATA_PREFIX: str = ""
|
| 16 |
+
|
| 17 |
+
# Nougat Model
|
| 18 |
NOUGAT_MODEL_MAX: int = 1024 # Max inference length for nougat
|
| 19 |
NOUGAT_HALLUCINATION_WORDS: List[str] = ["[MISSING_PAGE_POST]", "## References\n", "**Figure Captions**\n", "Footnote",
|
| 20 |
"\par\par\par", "## Chapter", "Fig."]
|
| 21 |
+
|
| 22 |
+
# Layout Model
|
| 23 |
+
BAD_SPAN_TYPES: List[str] = ["Caption", "Footnote", "Page-footer", "Page-header", "Picture"]
|
| 24 |
LAYOUT_MODEL_MAX: int = 512
|
| 25 |
LAYOUT_CHUNK_OVERLAP: int = 128
|
| 26 |
|
| 27 |
+
# Ray
|
| 28 |
+
RAY_CACHE_PATH: Optional[str] = None # Where to save ray cache
|
| 29 |
+
RAY_DASHBOARD_HOST: str = "127.0.0.1"
|
| 30 |
+
RAY_CORES_PER_WORKER: int = 1 # How many cpu cores to allocate per worker
|
| 31 |
+
|
| 32 |
+
|
| 33 |
class Config:
|
| 34 |
env_file = find_dotenv("local.env")
|
| 35 |
|
poetry.lock
CHANGED
|
@@ -1786,6 +1786,71 @@ docs = ["sphinx"]
|
|
| 1786 |
gmpy = ["gmpy2 (>=2.1.0a4)"]
|
| 1787 |
tests = ["pytest (>=4.6)"]
|
| 1788 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1789 |
[[package]]
|
| 1790 |
name = "multidict"
|
| 1791 |
version = "6.0.4"
|
|
@@ -2483,6 +2548,28 @@ files = [
|
|
| 2483 |
[package.dependencies]
|
| 2484 |
wcwidth = "*"
|
| 2485 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2486 |
[[package]]
|
| 2487 |
name = "psutil"
|
| 2488 |
version = "5.9.6"
|
|
@@ -3348,6 +3435,66 @@ files = [
|
|
| 3348 |
[package.extras]
|
| 3349 |
full = ["numpy"]
|
| 3350 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3351 |
[[package]]
|
| 3352 |
name = "referencing"
|
| 3353 |
version = "0.30.2"
|
|
@@ -4113,6 +4260,20 @@ files = [
|
|
| 4113 |
[package.dependencies]
|
| 4114 |
mpmath = ">=0.19"
|
| 4115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4116 |
[[package]]
|
| 4117 |
name = "terminado"
|
| 4118 |
version = "0.17.1"
|
|
@@ -4616,13 +4777,13 @@ zstd = ["zstandard (>=0.18.0)"]
|
|
| 4616 |
|
| 4617 |
[[package]]
|
| 4618 |
name = "wcwidth"
|
| 4619 |
-
version = "0.2.
|
| 4620 |
description = "Measures the displayed width of unicode strings in a terminal"
|
| 4621 |
optional = false
|
| 4622 |
python-versions = "*"
|
| 4623 |
files = [
|
| 4624 |
-
{file = "wcwidth-0.2.
|
| 4625 |
-
{file = "wcwidth-0.2.
|
| 4626 |
]
|
| 4627 |
|
| 4628 |
[[package]]
|
|
@@ -4900,4 +5061,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
|
|
| 4900 |
[metadata]
|
| 4901 |
lock-version = "2.0"
|
| 4902 |
python-versions = ">=3.9,<3.13"
|
| 4903 |
-
content-hash = "
|
|
|
|
| 1786 |
gmpy = ["gmpy2 (>=2.1.0a4)"]
|
| 1787 |
tests = ["pytest (>=4.6)"]
|
| 1788 |
|
| 1789 |
+
[[package]]
|
| 1790 |
+
name = "msgpack"
|
| 1791 |
+
version = "1.0.7"
|
| 1792 |
+
description = "MessagePack serializer"
|
| 1793 |
+
optional = false
|
| 1794 |
+
python-versions = ">=3.8"
|
| 1795 |
+
files = [
|
| 1796 |
+
{file = "msgpack-1.0.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:04ad6069c86e531682f9e1e71b71c1c3937d6014a7c3e9edd2aa81ad58842862"},
|
| 1797 |
+
{file = "msgpack-1.0.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cca1b62fe70d761a282496b96a5e51c44c213e410a964bdffe0928e611368329"},
|
| 1798 |
+
{file = "msgpack-1.0.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e50ebce52f41370707f1e21a59514e3375e3edd6e1832f5e5235237db933c98b"},
|
| 1799 |
+
{file = "msgpack-1.0.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a7b4f35de6a304b5533c238bee86b670b75b03d31b7797929caa7a624b5dda6"},
|
| 1800 |
+
{file = "msgpack-1.0.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28efb066cde83c479dfe5a48141a53bc7e5f13f785b92ddde336c716663039ee"},
|
| 1801 |
+
{file = "msgpack-1.0.7-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4cb14ce54d9b857be9591ac364cb08dc2d6a5c4318c1182cb1d02274029d590d"},
|
| 1802 |
+
{file = "msgpack-1.0.7-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b573a43ef7c368ba4ea06050a957c2a7550f729c31f11dd616d2ac4aba99888d"},
|
| 1803 |
+
{file = "msgpack-1.0.7-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ccf9a39706b604d884d2cb1e27fe973bc55f2890c52f38df742bc1d79ab9f5e1"},
|
| 1804 |
+
{file = "msgpack-1.0.7-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cb70766519500281815dfd7a87d3a178acf7ce95390544b8c90587d76b227681"},
|
| 1805 |
+
{file = "msgpack-1.0.7-cp310-cp310-win32.whl", hash = "sha256:b610ff0f24e9f11c9ae653c67ff8cc03c075131401b3e5ef4b82570d1728f8a9"},
|
| 1806 |
+
{file = "msgpack-1.0.7-cp310-cp310-win_amd64.whl", hash = "sha256:a40821a89dc373d6427e2b44b572efc36a2778d3f543299e2f24eb1a5de65415"},
|
| 1807 |
+
{file = "msgpack-1.0.7-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:576eb384292b139821c41995523654ad82d1916da6a60cff129c715a6223ea84"},
|
| 1808 |
+
{file = "msgpack-1.0.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:730076207cb816138cf1af7f7237b208340a2c5e749707457d70705715c93b93"},
|
| 1809 |
+
{file = "msgpack-1.0.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:85765fdf4b27eb5086f05ac0491090fc76f4f2b28e09d9350c31aac25a5aaff8"},
|
| 1810 |
+
{file = "msgpack-1.0.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3476fae43db72bd11f29a5147ae2f3cb22e2f1a91d575ef130d2bf49afd21c46"},
|
| 1811 |
+
{file = "msgpack-1.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d4c80667de2e36970ebf74f42d1088cc9ee7ef5f4e8c35eee1b40eafd33ca5b"},
|
| 1812 |
+
{file = "msgpack-1.0.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5b0bf0effb196ed76b7ad883848143427a73c355ae8e569fa538365064188b8e"},
|
| 1813 |
+
{file = "msgpack-1.0.7-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f9a7c509542db4eceed3dcf21ee5267ab565a83555c9b88a8109dcecc4709002"},
|
| 1814 |
+
{file = "msgpack-1.0.7-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:84b0daf226913133f899ea9b30618722d45feffa67e4fe867b0b5ae83a34060c"},
|
| 1815 |
+
{file = "msgpack-1.0.7-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ec79ff6159dffcc30853b2ad612ed572af86c92b5168aa3fc01a67b0fa40665e"},
|
| 1816 |
+
{file = "msgpack-1.0.7-cp311-cp311-win32.whl", hash = "sha256:3e7bf4442b310ff154b7bb9d81eb2c016b7d597e364f97d72b1acc3817a0fdc1"},
|
| 1817 |
+
{file = "msgpack-1.0.7-cp311-cp311-win_amd64.whl", hash = "sha256:3f0c8c6dfa6605ab8ff0611995ee30d4f9fcff89966cf562733b4008a3d60d82"},
|
| 1818 |
+
{file = "msgpack-1.0.7-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f0936e08e0003f66bfd97e74ee530427707297b0d0361247e9b4f59ab78ddc8b"},
|
| 1819 |
+
{file = "msgpack-1.0.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:98bbd754a422a0b123c66a4c341de0474cad4a5c10c164ceed6ea090f3563db4"},
|
| 1820 |
+
{file = "msgpack-1.0.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b291f0ee7961a597cbbcc77709374087fa2a9afe7bdb6a40dbbd9b127e79afee"},
|
| 1821 |
+
{file = "msgpack-1.0.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ebbbba226f0a108a7366bf4b59bf0f30a12fd5e75100c630267d94d7f0ad20e5"},
|
| 1822 |
+
{file = "msgpack-1.0.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e2d69948e4132813b8d1131f29f9101bc2c915f26089a6d632001a5c1349672"},
|
| 1823 |
+
{file = "msgpack-1.0.7-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bdf38ba2d393c7911ae989c3bbba510ebbcdf4ecbdbfec36272abe350c454075"},
|
| 1824 |
+
{file = "msgpack-1.0.7-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:993584fc821c58d5993521bfdcd31a4adf025c7d745bbd4d12ccfecf695af5ba"},
|
| 1825 |
+
{file = "msgpack-1.0.7-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:52700dc63a4676669b341ba33520f4d6e43d3ca58d422e22ba66d1736b0a6e4c"},
|
| 1826 |
+
{file = "msgpack-1.0.7-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e45ae4927759289c30ccba8d9fdce62bb414977ba158286b5ddaf8df2cddb5c5"},
|
| 1827 |
+
{file = "msgpack-1.0.7-cp312-cp312-win32.whl", hash = "sha256:27dcd6f46a21c18fa5e5deed92a43d4554e3df8d8ca5a47bf0615d6a5f39dbc9"},
|
| 1828 |
+
{file = "msgpack-1.0.7-cp312-cp312-win_amd64.whl", hash = "sha256:7687e22a31e976a0e7fc99c2f4d11ca45eff652a81eb8c8085e9609298916dcf"},
|
| 1829 |
+
{file = "msgpack-1.0.7-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5b6ccc0c85916998d788b295765ea0e9cb9aac7e4a8ed71d12e7d8ac31c23c95"},
|
| 1830 |
+
{file = "msgpack-1.0.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:235a31ec7db685f5c82233bddf9858748b89b8119bf4538d514536c485c15fe0"},
|
| 1831 |
+
{file = "msgpack-1.0.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cab3db8bab4b7e635c1c97270d7a4b2a90c070b33cbc00c99ef3f9be03d3e1f7"},
|
| 1832 |
+
{file = "msgpack-1.0.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0bfdd914e55e0d2c9e1526de210f6fe8ffe9705f2b1dfcc4aecc92a4cb4b533d"},
|
| 1833 |
+
{file = "msgpack-1.0.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36e17c4592231a7dbd2ed09027823ab295d2791b3b1efb2aee874b10548b7524"},
|
| 1834 |
+
{file = "msgpack-1.0.7-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:38949d30b11ae5f95c3c91917ee7a6b239f5ec276f271f28638dec9156f82cfc"},
|
| 1835 |
+
{file = "msgpack-1.0.7-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ff1d0899f104f3921d94579a5638847f783c9b04f2d5f229392ca77fba5b82fc"},
|
| 1836 |
+
{file = "msgpack-1.0.7-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dc43f1ec66eb8440567186ae2f8c447d91e0372d793dfe8c222aec857b81a8cf"},
|
| 1837 |
+
{file = "msgpack-1.0.7-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:dd632777ff3beaaf629f1ab4396caf7ba0bdd075d948a69460d13d44357aca4c"},
|
| 1838 |
+
{file = "msgpack-1.0.7-cp38-cp38-win32.whl", hash = "sha256:4e71bc4416de195d6e9b4ee93ad3f2f6b2ce11d042b4d7a7ee00bbe0358bd0c2"},
|
| 1839 |
+
{file = "msgpack-1.0.7-cp38-cp38-win_amd64.whl", hash = "sha256:8f5b234f567cf76ee489502ceb7165c2a5cecec081db2b37e35332b537f8157c"},
|
| 1840 |
+
{file = "msgpack-1.0.7-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:bfef2bb6ef068827bbd021017a107194956918ab43ce4d6dc945ffa13efbc25f"},
|
| 1841 |
+
{file = "msgpack-1.0.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:484ae3240666ad34cfa31eea7b8c6cd2f1fdaae21d73ce2974211df099a95d81"},
|
| 1842 |
+
{file = "msgpack-1.0.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3967e4ad1aa9da62fd53e346ed17d7b2e922cba5ab93bdd46febcac39be636fc"},
|
| 1843 |
+
{file = "msgpack-1.0.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8dd178c4c80706546702c59529ffc005681bd6dc2ea234c450661b205445a34d"},
|
| 1844 |
+
{file = "msgpack-1.0.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6ffbc252eb0d229aeb2f9ad051200668fc3a9aaa8994e49f0cb2ffe2b7867e7"},
|
| 1845 |
+
{file = "msgpack-1.0.7-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:822ea70dc4018c7e6223f13affd1c5c30c0f5c12ac1f96cd8e9949acddb48a61"},
|
| 1846 |
+
{file = "msgpack-1.0.7-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:384d779f0d6f1b110eae74cb0659d9aa6ff35aaf547b3955abf2ab4c901c4819"},
|
| 1847 |
+
{file = "msgpack-1.0.7-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f64e376cd20d3f030190e8c32e1c64582eba56ac6dc7d5b0b49a9d44021b52fd"},
|
| 1848 |
+
{file = "msgpack-1.0.7-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5ed82f5a7af3697b1c4786053736f24a0efd0a1b8a130d4c7bfee4b9ded0f08f"},
|
| 1849 |
+
{file = "msgpack-1.0.7-cp39-cp39-win32.whl", hash = "sha256:f26a07a6e877c76a88e3cecac8531908d980d3d5067ff69213653649ec0f60ad"},
|
| 1850 |
+
{file = "msgpack-1.0.7-cp39-cp39-win_amd64.whl", hash = "sha256:1dc93e8e4653bdb5910aed79f11e165c85732067614f180f70534f056da97db3"},
|
| 1851 |
+
{file = "msgpack-1.0.7.tar.gz", hash = "sha256:572efc93db7a4d27e404501975ca6d2d9775705c2d922390d878fcf768d92c87"},
|
| 1852 |
+
]
|
| 1853 |
+
|
| 1854 |
[[package]]
|
| 1855 |
name = "multidict"
|
| 1856 |
version = "6.0.4"
|
|
|
|
| 2548 |
[package.dependencies]
|
| 2549 |
wcwidth = "*"
|
| 2550 |
|
| 2551 |
+
[[package]]
|
| 2552 |
+
name = "protobuf"
|
| 2553 |
+
version = "4.24.4"
|
| 2554 |
+
description = ""
|
| 2555 |
+
optional = false
|
| 2556 |
+
python-versions = ">=3.7"
|
| 2557 |
+
files = [
|
| 2558 |
+
{file = "protobuf-4.24.4-cp310-abi3-win32.whl", hash = "sha256:ec9912d5cb6714a5710e28e592ee1093d68c5ebfeda61983b3f40331da0b1ebb"},
|
| 2559 |
+
{file = "protobuf-4.24.4-cp310-abi3-win_amd64.whl", hash = "sha256:1badab72aa8a3a2b812eacfede5020472e16c6b2212d737cefd685884c191085"},
|
| 2560 |
+
{file = "protobuf-4.24.4-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8e61a27f362369c2f33248a0ff6896c20dcd47b5d48239cb9720134bef6082e4"},
|
| 2561 |
+
{file = "protobuf-4.24.4-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:bffa46ad9612e6779d0e51ae586fde768339b791a50610d85eb162daeb23661e"},
|
| 2562 |
+
{file = "protobuf-4.24.4-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:b493cb590960ff863743b9ff1452c413c2ee12b782f48beca77c8da3e2ffe9d9"},
|
| 2563 |
+
{file = "protobuf-4.24.4-cp37-cp37m-win32.whl", hash = "sha256:dbbed8a56e56cee8d9d522ce844a1379a72a70f453bde6243e3c86c30c2a3d46"},
|
| 2564 |
+
{file = "protobuf-4.24.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6b7d2e1c753715dcfe9d284a25a52d67818dd43c4932574307daf836f0071e37"},
|
| 2565 |
+
{file = "protobuf-4.24.4-cp38-cp38-win32.whl", hash = "sha256:02212557a76cd99574775a81fefeba8738d0f668d6abd0c6b1d3adcc75503dbe"},
|
| 2566 |
+
{file = "protobuf-4.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:2fa3886dfaae6b4c5ed2730d3bf47c7a38a72b3a1f0acb4d4caf68e6874b947b"},
|
| 2567 |
+
{file = "protobuf-4.24.4-cp39-cp39-win32.whl", hash = "sha256:b77272f3e28bb416e2071186cb39efd4abbf696d682cbb5dc731308ad37fa6dd"},
|
| 2568 |
+
{file = "protobuf-4.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:9fee5e8aa20ef1b84123bb9232b3f4a5114d9897ed89b4b8142d81924e05d79b"},
|
| 2569 |
+
{file = "protobuf-4.24.4-py3-none-any.whl", hash = "sha256:80797ce7424f8c8d2f2547e2d42bfbb6c08230ce5832d6c099a37335c9c90a92"},
|
| 2570 |
+
{file = "protobuf-4.24.4.tar.gz", hash = "sha256:5a70731910cd9104762161719c3d883c960151eea077134458503723b60e3667"},
|
| 2571 |
+
]
|
| 2572 |
+
|
| 2573 |
[[package]]
|
| 2574 |
name = "psutil"
|
| 2575 |
version = "5.9.6"
|
|
|
|
| 3435 |
[package.extras]
|
| 3436 |
full = ["numpy"]
|
| 3437 |
|
| 3438 |
+
[[package]]
|
| 3439 |
+
name = "ray"
|
| 3440 |
+
version = "2.7.1"
|
| 3441 |
+
description = "Ray provides a simple, universal API for building distributed applications."
|
| 3442 |
+
optional = false
|
| 3443 |
+
python-versions = "*"
|
| 3444 |
+
files = [
|
| 3445 |
+
{file = "ray-2.7.1-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:4a2c98ab42881836894f20408ce40c0fd7fe5da7f0bc69cf22c951ccceda55ed"},
|
| 3446 |
+
{file = "ray-2.7.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:53800aadfc07152bc8672d5fa91bb4dc17d96b572a9bd436dd00fd2e0d07ef6a"},
|
| 3447 |
+
{file = "ray-2.7.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:17a425b4a2c2098f78fd0ab3831a35a53608d36466453e90c30a6495e9dce354"},
|
| 3448 |
+
{file = "ray-2.7.1-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:9681a8a7bf081e2244360206f3cd80d1a6adb4dc6330a507fd8c78ebe6e57365"},
|
| 3449 |
+
{file = "ray-2.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:148c77050ceab3c90739147bb86ac535e9590046cc36364ae9eb15469ea16fbc"},
|
| 3450 |
+
{file = "ray-2.7.1-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:0b0e80e26d6899820c12301626a74a209ab29373f46caf5b48c3ae3f99ec1bc7"},
|
| 3451 |
+
{file = "ray-2.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b5d13e910bb3449ef7b25084dcc4f0b9a763d3aa7b2fdd39e3b4d93d8c266951"},
|
| 3452 |
+
{file = "ray-2.7.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:0a6e8a736fe5294a0b0064679e59e393c66942db81fdf95804bdc1495d1f1651"},
|
| 3453 |
+
{file = "ray-2.7.1-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:f4c9f8a813444bd5346756db1a6d6e09a805b28b5fb6831e91b8d1324c12a888"},
|
| 3454 |
+
{file = "ray-2.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:85a8b0f122e4c14d2ee354fce9651834f7ffc9b60ebdce023a5ba8ca5841a6ee"},
|
| 3455 |
+
{file = "ray-2.7.1-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:bfa924bbc4042e83a0f31f058f08818418307252fceeee27c4c02bc0d3c02f3f"},
|
| 3456 |
+
{file = "ray-2.7.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:0f5657abb376eddf6b56489082d2f94ab36597a2f25da2849e2f66476b90dcc0"},
|
| 3457 |
+
{file = "ray-2.7.1-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:d548e1c67a512975c4241be64a8df2153ae6c29ee2f5b08834fadcad7dfc94a4"},
|
| 3458 |
+
{file = "ray-2.7.1-cp37-cp37m-win_amd64.whl", hash = "sha256:1f4c09a81971cc54d95be55b9b413fd12121a37528b402d1861a8fa0b4e85509"},
|
| 3459 |
+
{file = "ray-2.7.1-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:1f6d2508d117aac0b880d26a4db65a9f90def2d688709b62e0d039879c3afc7a"},
|
| 3460 |
+
{file = "ray-2.7.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32a6c0866d559d4e6c623ff220cd0790d2da1f3785073a5d0444b8f0486ff541"},
|
| 3461 |
+
{file = "ray-2.7.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:d035642e6033f43551a0c17e2363a392739f01df6b4072c5ed71cf3096936d33"},
|
| 3462 |
+
{file = "ray-2.7.1-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:a366569d1bd220a92af0dbe092821a11d1ff8ad7b00ed4f74b8a5f380e34ccc7"},
|
| 3463 |
+
{file = "ray-2.7.1-cp38-cp38-win_amd64.whl", hash = "sha256:6fe65dc7f83f1c617af3068d84f8c67f3371b1a48776e44ab6af54998891364c"},
|
| 3464 |
+
{file = "ray-2.7.1-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:3c1501ca56da394e07213efd5be42c2cf0a2eae68d76949d26a3133154d6d9ff"},
|
| 3465 |
+
{file = "ray-2.7.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:57f7e05ad275317158c447680705e046410f68d2a5992e16d07bbc2cc79da2b3"},
|
| 3466 |
+
{file = "ray-2.7.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:b5410ae53c765108c65821fc5e5968509579f98a64d275e103408e1b068e8ca8"},
|
| 3467 |
+
{file = "ray-2.7.1-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:1b096abab78b63db6c1a2633f242dd8b3c51e395b574215f3cb8e47f5d7364b9"},
|
| 3468 |
+
{file = "ray-2.7.1-cp39-cp39-win_amd64.whl", hash = "sha256:c03fe26443598bd7ad1c22de4585daec324bc03eabc04d3c2f805d9697a554d6"},
|
| 3469 |
+
]
|
| 3470 |
+
|
| 3471 |
+
[package.dependencies]
|
| 3472 |
+
aiosignal = "*"
|
| 3473 |
+
click = ">=7.0"
|
| 3474 |
+
filelock = "*"
|
| 3475 |
+
frozenlist = "*"
|
| 3476 |
+
jsonschema = "*"
|
| 3477 |
+
msgpack = ">=1.0.0,<2.0.0"
|
| 3478 |
+
numpy = {version = ">=1.19.3", markers = "python_version >= \"3.9\""}
|
| 3479 |
+
packaging = "*"
|
| 3480 |
+
protobuf = ">=3.15.3,<3.19.5 || >3.19.5"
|
| 3481 |
+
pyyaml = "*"
|
| 3482 |
+
requests = "*"
|
| 3483 |
+
|
| 3484 |
+
[package.extras]
|
| 3485 |
+
air = ["aiohttp (>=3.7)", "aiohttp-cors", "aiorwlock", "colorful", "fastapi", "fsspec", "gpustat (>=1.0.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "numpy (>=1.20)", "opencensus", "pandas", "pandas (>=1.3)", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pyarrow (>=6.0.1)", "pydantic (<2)", "requests", "smart-open", "starlette", "tensorboardX (>=1.9)", "uvicorn", "virtualenv (>=20.0.24,<20.21.1)", "watchfiles"]
|
| 3486 |
+
all = ["aiohttp (>=3.7)", "aiohttp-cors", "aiorwlock", "colorful", "dm-tree", "fastapi", "fsspec", "gpustat (>=1.0.0)", "grpcio (!=1.56.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "gymnasium (==0.28.1)", "lz4", "numpy (>=1.20)", "opencensus", "opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk", "pandas", "pandas (>=1.3)", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pyarrow (>=6.0.1)", "pydantic (<2)", "pyyaml", "ray-cpp (==2.7.1)", "requests", "rich", "scikit-image", "scipy", "smart-open", "starlette", "tensorboardX (>=1.9)", "typer", "uvicorn", "virtualenv (>=20.0.24,<20.21.1)", "watchfiles"]
|
| 3487 |
+
client = ["grpcio (!=1.56.0)"]
|
| 3488 |
+
cpp = ["ray-cpp (==2.7.1)"]
|
| 3489 |
+
data = ["fsspec", "numpy (>=1.20)", "pandas (>=1.3)", "pyarrow (>=6.0.1)"]
|
| 3490 |
+
default = ["aiohttp (>=3.7)", "aiohttp-cors", "colorful", "gpustat (>=1.0.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "opencensus", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pydantic (<2)", "requests", "smart-open", "virtualenv (>=20.0.24,<20.21.1)"]
|
| 3491 |
+
observability = ["opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk"]
|
| 3492 |
+
rllib = ["dm-tree", "fsspec", "gymnasium (==0.28.1)", "lz4", "pandas", "pyarrow (>=6.0.1)", "pyyaml", "requests", "rich", "scikit-image", "scipy", "tensorboardX (>=1.9)", "typer"]
|
| 3493 |
+
serve = ["aiohttp (>=3.7)", "aiohttp-cors", "aiorwlock", "colorful", "fastapi", "gpustat (>=1.0.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "opencensus", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pydantic (<2)", "requests", "smart-open", "starlette", "uvicorn", "virtualenv (>=20.0.24,<20.21.1)", "watchfiles"]
|
| 3494 |
+
serve-grpc = ["aiohttp (>=3.7)", "aiohttp-cors", "aiorwlock", "colorful", "fastapi", "gpustat (>=1.0.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "opencensus", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pydantic (<2)", "requests", "smart-open", "starlette", "uvicorn", "virtualenv (>=20.0.24,<20.21.1)", "watchfiles"]
|
| 3495 |
+
train = ["fsspec", "pandas", "pyarrow (>=6.0.1)", "requests", "tensorboardX (>=1.9)"]
|
| 3496 |
+
tune = ["fsspec", "pandas", "pyarrow (>=6.0.1)", "requests", "tensorboardX (>=1.9)"]
|
| 3497 |
+
|
| 3498 |
[[package]]
|
| 3499 |
name = "referencing"
|
| 3500 |
version = "0.30.2"
|
|
|
|
| 4260 |
[package.dependencies]
|
| 4261 |
mpmath = ">=0.19"
|
| 4262 |
|
| 4263 |
+
[[package]]
|
| 4264 |
+
name = "tabulate"
|
| 4265 |
+
version = "0.9.0"
|
| 4266 |
+
description = "Pretty-print tabular data"
|
| 4267 |
+
optional = false
|
| 4268 |
+
python-versions = ">=3.7"
|
| 4269 |
+
files = [
|
| 4270 |
+
{file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"},
|
| 4271 |
+
{file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"},
|
| 4272 |
+
]
|
| 4273 |
+
|
| 4274 |
+
[package.extras]
|
| 4275 |
+
widechars = ["wcwidth"]
|
| 4276 |
+
|
| 4277 |
[[package]]
|
| 4278 |
name = "terminado"
|
| 4279 |
version = "0.17.1"
|
|
|
|
| 4777 |
|
| 4778 |
[[package]]
|
| 4779 |
name = "wcwidth"
|
| 4780 |
+
version = "0.2.9"
|
| 4781 |
description = "Measures the displayed width of unicode strings in a terminal"
|
| 4782 |
optional = false
|
| 4783 |
python-versions = "*"
|
| 4784 |
files = [
|
| 4785 |
+
{file = "wcwidth-0.2.9-py2.py3-none-any.whl", hash = "sha256:9a929bd8380f6cd9571a968a9c8f4353ca58d7cd812a4822bba831f8d685b223"},
|
| 4786 |
+
{file = "wcwidth-0.2.9.tar.gz", hash = "sha256:a675d1a4a2d24ef67096a04b85b02deeecd8e226f57b5e3a72dbb9ed99d27da8"},
|
| 4787 |
]
|
| 4788 |
|
| 4789 |
[[package]]
|
|
|
|
| 5061 |
[metadata]
|
| 5062 |
lock-version = "2.0"
|
| 5063 |
python-versions = ">=3.9,<3.13"
|
| 5064 |
+
content-hash = "026459e6ec77505270a4430f661e19c2db1f7e49876b64b08b7fdc83729915bd"
|
pyproject.toml
CHANGED
|
@@ -16,9 +16,13 @@ pydantic = "^2.4.2"
|
|
| 16 |
pydantic-settings = "^2.0.3"
|
| 17 |
nougat-ocr = "^0.1.17"
|
| 18 |
transformers = "^4.34.1"
|
| 19 |
-
torch = "^2.1.0"
|
| 20 |
numpy = "^1.26.1"
|
| 21 |
python-dotenv = "^1.0.0"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
[tool.poetry.group.dev.dependencies]
|
| 24 |
jupyter = "^1.0.0"
|
|
|
|
| 16 |
pydantic-settings = "^2.0.3"
|
| 17 |
nougat-ocr = "^0.1.17"
|
| 18 |
transformers = "^4.34.1"
|
|
|
|
| 19 |
numpy = "^1.26.1"
|
| 20 |
python-dotenv = "^1.0.0"
|
| 21 |
+
torch = "^2.1.0"
|
| 22 |
+
ray = "^2.7.1"
|
| 23 |
+
tqdm = "^4.66.1"
|
| 24 |
+
tabulate = "^0.9.0"
|
| 25 |
+
|
| 26 |
|
| 27 |
[tool.poetry.group.dev.dependencies]
|
| 28 |
jupyter = "^1.0.0"
|