Vik Paruchuri
committed on
Commit
·
5330766
1
Parent(s):
4dcfb84
Improve table, markdown, and ocr
Browse files
- README.md +1 -1
- marker/cleaners/table.py +13 -7
- marker/ocr/heuristics.py +1 -1
- marker/ocr/recognition.py +1 -1
- marker/pdf/extract_text.py +5 -1
- marker/postprocessors/markdown.py +14 -9
- marker/settings.py +1 -5
- poetry.lock +82 -96
- pyproject.toml +1 -0
README.md
CHANGED
|
@@ -95,7 +95,7 @@ First, some configuration. Note that settings can be overridden with env vars,
|
|
| 95 |
- Your torch device will be automatically detected, but you can manually set it also. For example, `TORCH_DEVICE=cuda` or `TORCH_DEVICE=mps`. `cpu` is the default.
|
| 96 |
- If using GPU, set `INFERENCE_RAM` to your GPU VRAM (per GPU). For example, if you have 16 GB of VRAM, set `INFERENCE_RAM=16`.
|
| 97 |
- Depending on your document types, marker's average memory usage per task can vary slightly. You can configure `VRAM_PER_TASK` to adjust this if you notice tasks failing with GPU out of memory errors.
|
| 98 |
-
- By default, marker will use `
|
| 99 |
- Inspect the other settings in `marker/settings.py`. You can override any settings in the `local.env` file, or by setting environment variables.
|
| 100 |
|
| 101 |
|
|
|
|
| 95 |
- Your torch device will be automatically detected, but you can manually set it also. For example, `TORCH_DEVICE=cuda` or `TORCH_DEVICE=mps`. `cpu` is the default.
|
| 96 |
- If using GPU, set `INFERENCE_RAM` to your GPU VRAM (per GPU). For example, if you have 16 GB of VRAM, set `INFERENCE_RAM=16`.
|
| 97 |
- Depending on your document types, marker's average memory usage per task can vary slightly. You can configure `VRAM_PER_TASK` to adjust this if you notice tasks failing with GPU out of memory errors.
|
| 98 |
+
- By default, marker will use `surya` for OCR. Surya is slower on CPU, but more accurate than tesseract. If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above).
|
| 99 |
- Inspect the other settings in `marker/settings.py`. You can override any settings in the `local.env` file, or by setting environment variables.
|
| 100 |
|
| 101 |
|
marker/cleaners/table.py
CHANGED
|
@@ -6,10 +6,14 @@ from typing import List, Dict
|
|
| 6 |
import re
|
| 7 |
|
| 8 |
|
| 9 |
-
def
|
| 10 |
vertical_groups = {}
|
| 11 |
for block in blocks:
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
if group_key not in vertical_groups:
|
| 14 |
vertical_groups[group_key] = []
|
| 15 |
vertical_groups[group_key].append(block)
|
|
@@ -17,7 +21,7 @@ def sort_char_blocks(blocks, tolerance=1.25):
|
|
| 17 |
# Sort each group horizontally and flatten the groups into a single list
|
| 18 |
sorted_blocks = []
|
| 19 |
for _, group in sorted(vertical_groups.items()):
|
| 20 |
-
sorted_group = sorted(group, key=lambda x: x["bbox"][0])
|
| 21 |
sorted_blocks.extend(sorted_group)
|
| 22 |
|
| 23 |
return sorted_blocks
|
|
@@ -42,8 +46,10 @@ def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]:
|
|
| 42 |
table_rows = []
|
| 43 |
table_row = []
|
| 44 |
x_position = None
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
| 47 |
line_bbox = line.bbox
|
| 48 |
intersect_pct = box_intersection_pct(line_bbox, table_box)
|
| 49 |
if intersect_pct < .5 or len(line.spans) == 0:
|
|
@@ -116,9 +122,9 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]:
|
|
| 116 |
cell_bbox = None
|
| 117 |
prev_end = None
|
| 118 |
table_row = []
|
| 119 |
-
sorted_char_blocks =
|
| 120 |
for block_idx, block in enumerate(sorted_char_blocks):
|
| 121 |
-
sorted_block_lines =
|
| 122 |
for line_idx, line in enumerate(sorted_block_lines):
|
| 123 |
line_bbox = line["bbox"]
|
| 124 |
intersect_pct = box_intersection_pct(line_bbox, table_box)
|
|
|
|
| 6 |
import re
|
| 7 |
|
| 8 |
|
| 9 |
+
def sort_table_blocks(blocks, tolerance=5):
|
| 10 |
vertical_groups = {}
|
| 11 |
for block in blocks:
|
| 12 |
+
if hasattr(block, "bbox"):
|
| 13 |
+
bbox = block.bbox
|
| 14 |
+
else:
|
| 15 |
+
bbox = block["bbox"]
|
| 16 |
+
group_key = round(bbox[1] / tolerance) * tolerance
|
| 17 |
if group_key not in vertical_groups:
|
| 18 |
vertical_groups[group_key] = []
|
| 19 |
vertical_groups[group_key].append(block)
|
|
|
|
| 21 |
# Sort each group horizontally and flatten the groups into a single list
|
| 22 |
sorted_blocks = []
|
| 23 |
for _, group in sorted(vertical_groups.items()):
|
| 24 |
+
sorted_group = sorted(group, key=lambda x: x.bbox[0] if hasattr(x, "bbox") else x["bbox"][0])
|
| 25 |
sorted_blocks.extend(sorted_group)
|
| 26 |
|
| 27 |
return sorted_blocks
|
|
|
|
| 46 |
table_rows = []
|
| 47 |
table_row = []
|
| 48 |
x_position = None
|
| 49 |
+
sorted_blocks = sort_table_blocks(page.blocks)
|
| 50 |
+
for block_idx, block in enumerate(sorted_blocks):
|
| 51 |
+
sorted_lines = sort_table_blocks(block.lines)
|
| 52 |
+
for line_idx, line in enumerate(sorted_lines):
|
| 53 |
line_bbox = line.bbox
|
| 54 |
intersect_pct = box_intersection_pct(line_bbox, table_box)
|
| 55 |
if intersect_pct < .5 or len(line.spans) == 0:
|
|
|
|
| 122 |
cell_bbox = None
|
| 123 |
prev_end = None
|
| 124 |
table_row = []
|
| 125 |
+
sorted_char_blocks = sort_table_blocks(page.char_blocks)
|
| 126 |
for block_idx, block in enumerate(sorted_char_blocks):
|
| 127 |
+
sorted_block_lines = sort_table_blocks(block["lines"])
|
| 128 |
for line_idx, line in enumerate(sorted_block_lines):
|
| 129 |
line_bbox = line["bbox"]
|
| 130 |
intersect_pct = box_intersection_pct(line_bbox, table_box)
|
marker/ocr/heuristics.py
CHANGED
|
@@ -52,7 +52,7 @@ def no_text_found(pages: List[Page]):
|
|
| 52 |
return len(full_text.strip()) == 0
|
| 53 |
|
| 54 |
|
| 55 |
-
def detected_line_coverage(page: Page, intersect_thresh=.
|
| 56 |
found_lines = 0
|
| 57 |
for detected_line in page.text_lines.bboxes:
|
| 58 |
|
|
|
|
| 52 |
return len(full_text.strip()) == 0
|
| 53 |
|
| 54 |
|
| 55 |
+
def detected_line_coverage(page: Page, intersect_thresh=.4, detection_thresh=.3):
|
| 56 |
found_lines = 0
|
| 57 |
for detected_line in page.text_lines.bboxes:
|
| 58 |
|
marker/ocr/recognition.py
CHANGED
|
@@ -41,7 +41,7 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, parallel_factor
|
|
| 41 |
ocr_success += 1
|
| 42 |
pages[orig_idx] = page
|
| 43 |
|
| 44 |
-
return pages, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success}
|
| 45 |
|
| 46 |
|
| 47 |
def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[Page]) -> List[Optional[Page]]:
|
|
|
|
| 41 |
ocr_success += 1
|
| 42 |
pages[orig_idx] = page
|
| 43 |
|
| 44 |
+
return pages, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success, "ocr_engine": ocr_method}
|
| 45 |
|
| 46 |
|
| 47 |
def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[Page]) -> List[Optional[Page]]:
|
marker/pdf/extract_text.py
CHANGED
|
@@ -22,7 +22,11 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
|
|
| 22 |
for l in block["lines"]:
|
| 23 |
spans = []
|
| 24 |
for i, s in enumerate(l["spans"]):
|
| 25 |
-
block_text = s["text"]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
block_text = block_text.replace("-\n", "") # Remove hyphenated line breaks
|
| 27 |
span_obj = Span(
|
| 28 |
text=block_text, # Remove end of line newlines, not spaces
|
|
|
|
| 22 |
for l in block["lines"]:
|
| 23 |
spans = []
|
| 24 |
for i, s in enumerate(l["spans"]):
|
| 25 |
+
block_text = s["text"]
|
| 26 |
+
# Remove trailing newlines and carriage returns (tesseract)
|
| 27 |
+
while len(block_text) > 0 and block_text[-1] in ["\n", "\r"]:
|
| 28 |
+
block_text = block_text[:-1]
|
| 29 |
+
|
| 30 |
block_text = block_text.replace("-\n", "") # Remove hyphenated line breaks
|
| 31 |
span_obj = Span(
|
| 32 |
text=block_text, # Remove end of line newlines, not spaces
|
marker/postprocessors/markdown.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
from marker.schema.merged import MergedLine, MergedBlock, FullyMergedBlock
|
| 2 |
from marker.schema.page import Page
|
| 3 |
import re
|
|
|
|
| 4 |
from typing import List
|
| 5 |
|
| 6 |
|
|
@@ -80,31 +81,35 @@ def block_surround(text, block_type):
|
|
| 80 |
|
| 81 |
def line_separator(line1, line2, block_type, is_continuation=False):
|
| 82 |
# Should cover latin-derived languages and russian
|
| 83 |
-
lowercase_letters =
|
| 84 |
-
uppercase_letters = "A-ZÀ-ÖØ-ßА-ЯŞĆĂÂĐÊÔƠƯÞÐÆØÅ"
|
| 85 |
# Remove hyphen in current line if next line and current line appear to be joined
|
| 86 |
-
hyphen_pattern =
|
| 87 |
-
if line1 and hyphen_pattern.match(line1) and
|
| 88 |
# Split on — or - from the right
|
| 89 |
line1 = re.split(r"[-—]\s?$", line1)[0]
|
| 90 |
return line1.rstrip() + line2.lstrip()
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
if block_type in ["Title", "Section-header"]:
|
| 97 |
return line1.rstrip() + " " + line2.lstrip()
|
| 98 |
-
elif
|
| 99 |
return line1.rstrip() + " " + line2.lstrip()
|
| 100 |
elif is_continuation:
|
| 101 |
return line1.rstrip() + " " + line2.lstrip()
|
| 102 |
-
elif block_type == "Text" and
|
| 103 |
return line1 + "\n\n" + line2
|
| 104 |
elif block_type == "Formula":
|
| 105 |
return line1 + " " + line2
|
| 106 |
elif block_type == "Table":
|
| 107 |
return line1 + "\n\n" + line2
|
|
|
|
|
|
|
| 108 |
else:
|
| 109 |
return line1 + "\n" + line2
|
| 110 |
|
|
|
|
| 1 |
from marker.schema.merged import MergedLine, MergedBlock, FullyMergedBlock
|
| 2 |
from marker.schema.page import Page
|
| 3 |
import re
|
| 4 |
+
import regex
|
| 5 |
from typing import List
|
| 6 |
|
| 7 |
|
|
|
|
| 81 |
|
| 82 |
def line_separator(line1, line2, block_type, is_continuation=False):
|
| 83 |
# Should cover latin-derived languages and russian
|
| 84 |
+
lowercase_letters = r'(\p{Lo}+|\p{Ll}+)'
|
|
|
|
| 85 |
# Remove hyphen in current line if next line and current line appear to be joined
|
| 86 |
+
hyphen_pattern = regex.compile(rf'.*[{lowercase_letters}][-]\s?$', regex.DOTALL)
|
| 87 |
+
if line1 and hyphen_pattern.match(line1) and regex.match(rf"^\s?[{lowercase_letters}]", line2):
|
| 88 |
# Split on — or - from the right
|
| 89 |
line1 = re.split(r"[-—]\s?$", line1)[0]
|
| 90 |
return line1.rstrip() + line2.lstrip()
|
| 91 |
|
| 92 |
+
all_letters = r'\p{L}+'
|
| 93 |
+
sentence_continuations = r',;(—'
|
| 94 |
+
sentence_ends = r'。ๆ.?!'
|
| 95 |
+
line_end_pattern = regex.compile(rf'.*[{lowercase_letters}{sentence_continuations}]\s?$', regex.DOTALL)
|
| 96 |
+
line_start_pattern = regex.compile(rf'^\s?[{all_letters}]', regex.DOTALL)
|
| 97 |
+
sentence_end_pattern = regex.compile(rf'.*[{sentence_ends}]\s?$', regex.DOTALL)
|
| 98 |
|
| 99 |
if block_type in ["Title", "Section-header"]:
|
| 100 |
return line1.rstrip() + " " + line2.lstrip()
|
| 101 |
+
elif line_end_pattern.match(line1) and line_start_pattern.match(line2) and block_type == "Text":
|
| 102 |
return line1.rstrip() + " " + line2.lstrip()
|
| 103 |
elif is_continuation:
|
| 104 |
return line1.rstrip() + " " + line2.lstrip()
|
| 105 |
+
elif block_type == "Text" and sentence_end_pattern.match(line1):
|
| 106 |
return line1 + "\n\n" + line2
|
| 107 |
elif block_type == "Formula":
|
| 108 |
return line1 + " " + line2
|
| 109 |
elif block_type == "Table":
|
| 110 |
return line1 + "\n\n" + line2
|
| 111 |
+
elif block_type in ["Formula"]:
|
| 112 |
+
return line1.rstrip() + "\n\n" + line2.lstrip()
|
| 113 |
else:
|
| 114 |
return line1 + "\n" + line2
|
| 115 |
|
marker/settings.py
CHANGED
|
@@ -61,11 +61,7 @@ class Settings(BaseSettings):
|
|
| 61 |
if self.OCR_ENGINE is not None:
|
| 62 |
return self.OCR_ENGINE
|
| 63 |
|
| 64 |
-
|
| 65 |
-
if torch.cuda.is_available():
|
| 66 |
-
return "surya"
|
| 67 |
-
|
| 68 |
-
return "ocrmypdf"
|
| 69 |
|
| 70 |
# Texify model
|
| 71 |
TEXIFY_MODEL_MAX: int = 384 # Max inference length for texify
|
|
|
|
| 61 |
if self.OCR_ENGINE is not None:
|
| 62 |
return self.OCR_ENGINE
|
| 63 |
|
| 64 |
+
return "surya"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
# Texify model
|
| 67 |
TEXIFY_MODEL_MAX: int = 384 # Max inference length for texify
|
poetry.lock
CHANGED
|
@@ -3554,104 +3554,90 @@ rpds-py = ">=0.7.0"
|
|
| 3554 |
|
| 3555 |
[[package]]
|
| 3556 |
name = "regex"
|
| 3557 |
-
version = "
|
| 3558 |
description = "Alternative regular expression module, to replace re."
|
| 3559 |
optional = false
|
| 3560 |
-
python-versions = ">=3.
|
| 3561 |
files = [
|
| 3562 |
-
{file = "regex-
|
| 3563 |
-
{file = "regex-
|
| 3564 |
-
{file = "regex-
|
| 3565 |
-
{file = "regex-
|
| 3566 |
-
{file = "regex-
|
| 3567 |
-
{file = "regex-
|
| 3568 |
-
{file = "regex-
|
| 3569 |
-
{file = "regex-
|
| 3570 |
-
{file = "regex-
|
| 3571 |
-
{file = "regex-
|
| 3572 |
-
{file = "regex-
|
| 3573 |
-
{file = "regex-
|
| 3574 |
-
{file = "regex-
|
| 3575 |
-
{file = "regex-
|
| 3576 |
-
{file = "regex-
|
| 3577 |
-
{file = "regex-
|
| 3578 |
-
{file = "regex-
|
| 3579 |
-
{file = "regex-
|
| 3580 |
-
{file = "regex-
|
| 3581 |
-
{file = "regex-
|
| 3582 |
-
{file = "regex-
|
| 3583 |
-
{file = "regex-
|
| 3584 |
-
{file = "regex-
|
| 3585 |
-
{file = "regex-
|
| 3586 |
-
{file = "regex-
|
| 3587 |
-
{file = "regex-
|
| 3588 |
-
{file = "regex-
|
| 3589 |
-
{file = "regex-
|
| 3590 |
-
{file = "regex-
|
| 3591 |
-
{file = "regex-
|
| 3592 |
-
{file = "regex-
|
| 3593 |
-
{file = "regex-
|
| 3594 |
-
{file = "regex-
|
| 3595 |
-
{file = "regex-
|
| 3596 |
-
{file = "regex-
|
| 3597 |
-
{file = "regex-
|
| 3598 |
-
{file = "regex-
|
| 3599 |
-
{file = "regex-
|
| 3600 |
-
{file = "regex-
|
| 3601 |
-
{file = "regex-
|
| 3602 |
-
{file = "regex-
|
| 3603 |
-
{file = "regex-
|
| 3604 |
-
{file = "regex-
|
| 3605 |
-
{file = "regex-
|
| 3606 |
-
{file = "regex-
|
| 3607 |
-
{file = "regex-
|
| 3608 |
-
{file = "regex-
|
| 3609 |
-
{file = "regex-
|
| 3610 |
-
{file = "regex-
|
| 3611 |
-
{file = "regex-
|
| 3612 |
-
{file = "regex-
|
| 3613 |
-
{file = "regex-
|
| 3614 |
-
{file = "regex-
|
| 3615 |
-
{file = "regex-
|
| 3616 |
-
{file = "regex-
|
| 3617 |
-
{file = "regex-
|
| 3618 |
-
{file = "regex-
|
| 3619 |
-
{file = "regex-
|
| 3620 |
-
{file = "regex-
|
| 3621 |
-
{file = "regex-
|
| 3622 |
-
{file = "regex-
|
| 3623 |
-
{file = "regex-
|
| 3624 |
-
{file = "regex-
|
| 3625 |
-
{file = "regex-
|
| 3626 |
-
{file = "regex-
|
| 3627 |
-
{file = "regex-
|
| 3628 |
-
{file = "regex-
|
| 3629 |
-
{file = "regex-
|
| 3630 |
-
{file = "regex-
|
| 3631 |
-
{file = "regex-
|
| 3632 |
-
{file = "regex-
|
| 3633 |
-
{file = "regex-
|
| 3634 |
-
{file = "regex-
|
| 3635 |
-
{file = "regex-
|
| 3636 |
-
{file = "regex-
|
| 3637 |
-
{file = "regex-
|
| 3638 |
-
{file = "regex-
|
| 3639 |
-
{file = "regex-
|
| 3640 |
-
{file = "regex-
|
| 3641 |
-
{file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc6bb9aa69aacf0f6032c307da718f61a40cf970849e471254e0e91c56ffca95"},
|
| 3642 |
-
{file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:298dc6354d414bc921581be85695d18912bea163a8b23cac9a2562bbcd5088b1"},
|
| 3643 |
-
{file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2f4e475a80ecbd15896a976aa0b386c5525d0ed34d5c600b6d3ebac0a67c7ddf"},
|
| 3644 |
-
{file = "regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:531ac6cf22b53e0696f8e1d56ce2396311254eb806111ddd3922c9d937151dae"},
|
| 3645 |
-
{file = "regex-2023.12.25-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:22f3470f7524b6da61e2020672df2f3063676aff444db1daa283c2ea4ed259d6"},
|
| 3646 |
-
{file = "regex-2023.12.25-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:89723d2112697feaa320c9d351e5f5e7b841e83f8b143dba8e2d2b5f04e10923"},
|
| 3647 |
-
{file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0ecf44ddf9171cd7566ef1768047f6e66975788258b1c6c6ca78098b95cf9a3d"},
|
| 3648 |
-
{file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:905466ad1702ed4acfd67a902af50b8db1feeb9781436372261808df7a2a7bca"},
|
| 3649 |
-
{file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:4558410b7a5607a645e9804a3e9dd509af12fb72b9825b13791a37cd417d73a5"},
|
| 3650 |
-
{file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:7e316026cc1095f2a3e8cc012822c99f413b702eaa2ca5408a513609488cb62f"},
|
| 3651 |
-
{file = "regex-2023.12.25-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3b1de218d5375cd6ac4b5493e0b9f3df2be331e86520f23382f216c137913d20"},
|
| 3652 |
-
{file = "regex-2023.12.25-cp39-cp39-win32.whl", hash = "sha256:11a963f8e25ab5c61348d090bf1b07f1953929c13bd2309a0662e9ff680763c9"},
|
| 3653 |
-
{file = "regex-2023.12.25-cp39-cp39-win_amd64.whl", hash = "sha256:e693e233ac92ba83a87024e1d32b5f9ab15ca55ddd916d878146f4e3406b5c91"},
|
| 3654 |
-
{file = "regex-2023.12.25.tar.gz", hash = "sha256:29171aa128da69afdf4bde412d5bedc335f2ca8fcfe4489038577d05f16181e5"},
|
| 3655 |
]
|
| 3656 |
|
| 3657 |
[[package]]
|
|
@@ -5004,4 +4990,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
|
|
| 5004 |
[metadata]
|
| 5005 |
lock-version = "2.0"
|
| 5006 |
python-versions = ">=3.9,<3.13,!=3.9.7"
|
| 5007 |
-
content-hash = "
|
|
|
|
| 3554 |
|
| 3555 |
[[package]]
|
| 3556 |
name = "regex"
|
| 3557 |
+
version = "2024.4.28"
|
| 3558 |
description = "Alternative regular expression module, to replace re."
|
| 3559 |
optional = false
|
| 3560 |
+
python-versions = ">=3.8"
|
| 3561 |
files = [
|
| 3562 |
+
{file = "regex-2024.4.28-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd196d056b40af073d95a2879678585f0b74ad35190fac04ca67954c582c6b61"},
|
| 3563 |
+
{file = "regex-2024.4.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8bb381f777351bd534462f63e1c6afb10a7caa9fa2a421ae22c26e796fe31b1f"},
|
| 3564 |
+
{file = "regex-2024.4.28-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:47af45b6153522733aa6e92543938e97a70ce0900649ba626cf5aad290b737b6"},
|
| 3565 |
+
{file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99d6a550425cc51c656331af0e2b1651e90eaaa23fb4acde577cf15068e2e20f"},
|
| 3566 |
+
{file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf29304a8011feb58913c382902fde3395957a47645bf848eea695839aa101b7"},
|
| 3567 |
+
{file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:92da587eee39a52c91aebea8b850e4e4f095fe5928d415cb7ed656b3460ae79a"},
|
| 3568 |
+
{file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6277d426e2f31bdbacb377d17a7475e32b2d7d1f02faaecc48d8e370c6a3ff31"},
|
| 3569 |
+
{file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:28e1f28d07220c0f3da0e8fcd5a115bbb53f8b55cecf9bec0c946eb9a059a94c"},
|
| 3570 |
+
{file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:aaa179975a64790c1f2701ac562b5eeb733946eeb036b5bcca05c8d928a62f10"},
|
| 3571 |
+
{file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6f435946b7bf7a1b438b4e6b149b947c837cb23c704e780c19ba3e6855dbbdd3"},
|
| 3572 |
+
{file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:19d6c11bf35a6ad077eb23852827f91c804eeb71ecb85db4ee1386825b9dc4db"},
|
| 3573 |
+
{file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:fdae0120cddc839eb8e3c15faa8ad541cc6d906d3eb24d82fb041cfe2807bc1e"},
|
| 3574 |
+
{file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e672cf9caaf669053121f1766d659a8813bd547edef6e009205378faf45c67b8"},
|
| 3575 |
+
{file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f57515750d07e14743db55d59759893fdb21d2668f39e549a7d6cad5d70f9fea"},
|
| 3576 |
+
{file = "regex-2024.4.28-cp310-cp310-win32.whl", hash = "sha256:a1409c4eccb6981c7baabc8888d3550df518add6e06fe74fa1d9312c1838652d"},
|
| 3577 |
+
{file = "regex-2024.4.28-cp310-cp310-win_amd64.whl", hash = "sha256:1f687a28640f763f23f8a9801fe9e1b37338bb1ca5d564ddd41619458f1f22d1"},
|
| 3578 |
+
{file = "regex-2024.4.28-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:84077821c85f222362b72fdc44f7a3a13587a013a45cf14534df1cbbdc9a6796"},
|
| 3579 |
+
{file = "regex-2024.4.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b45d4503de8f4f3dc02f1d28a9b039e5504a02cc18906cfe744c11def942e9eb"},
|
| 3580 |
+
{file = "regex-2024.4.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:457c2cd5a646dd4ed536c92b535d73548fb8e216ebee602aa9f48e068fc393f3"},
|
| 3581 |
+
{file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b51739ddfd013c6f657b55a508de8b9ea78b56d22b236052c3a85a675102dc6"},
|
| 3582 |
+
{file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:459226445c7d7454981c4c0ce0ad1a72e1e751c3e417f305722bbcee6697e06a"},
|
| 3583 |
+
{file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:670fa596984b08a4a769491cbdf22350431970d0112e03d7e4eeaecaafcd0fec"},
|
| 3584 |
+
{file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe00f4fe11c8a521b173e6324d862ee7ee3412bf7107570c9b564fe1119b56fb"},
|
| 3585 |
+
{file = "regex-2024.4.28-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36f392dc7763fe7924575475736bddf9ab9f7a66b920932d0ea50c2ded2f5636"},
|
| 3586 |
+
{file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:23a412b7b1a7063f81a742463f38821097b6a37ce1e5b89dd8e871d14dbfd86b"},
|
| 3587 |
+
{file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f1d6e4b7b2ae3a6a9df53efbf199e4bfcff0959dbdb5fd9ced34d4407348e39a"},
|
| 3588 |
+
{file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:499334ad139557de97cbc4347ee921c0e2b5e9c0f009859e74f3f77918339257"},
|
| 3589 |
+
{file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:0940038bec2fe9e26b203d636c44d31dd8766abc1fe66262da6484bd82461ccf"},
|
| 3590 |
+
{file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:66372c2a01782c5fe8e04bff4a2a0121a9897e19223d9eab30c54c50b2ebeb7f"},
|
| 3591 |
+
{file = "regex-2024.4.28-cp311-cp311-win32.whl", hash = "sha256:c77d10ec3c1cf328b2f501ca32583625987ea0f23a0c2a49b37a39ee5c4c4630"},
|
| 3592 |
+
{file = "regex-2024.4.28-cp311-cp311-win_amd64.whl", hash = "sha256:fc0916c4295c64d6890a46e02d4482bb5ccf33bf1a824c0eaa9e83b148291f90"},
|
| 3593 |
+
{file = "regex-2024.4.28-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:08a1749f04fee2811c7617fdd46d2e46d09106fa8f475c884b65c01326eb15c5"},
|
| 3594 |
+
{file = "regex-2024.4.28-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b8eb28995771c087a73338f695a08c9abfdf723d185e57b97f6175c5051ff1ae"},
|
| 3595 |
+
{file = "regex-2024.4.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dd7ef715ccb8040954d44cfeff17e6b8e9f79c8019daae2fd30a8806ef5435c0"},
|
| 3596 |
+
{file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb0315a2b26fde4005a7c401707c5352df274460f2f85b209cf6024271373013"},
|
| 3597 |
+
{file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f2fc053228a6bd3a17a9b0a3f15c3ab3cf95727b00557e92e1cfe094b88cc662"},
|
| 3598 |
+
{file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7fe9739a686dc44733d52d6e4f7b9c77b285e49edf8570754b322bca6b85b4cc"},
|
| 3599 |
+
{file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74fcf77d979364f9b69fcf8200849ca29a374973dc193a7317698aa37d8b01c"},
|
| 3600 |
+
{file = "regex-2024.4.28-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:965fd0cf4694d76f6564896b422724ec7b959ef927a7cb187fc6b3f4e4f59833"},
|
| 3601 |
+
{file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2fef0b38c34ae675fcbb1b5db760d40c3fc3612cfa186e9e50df5782cac02bcd"},
|
| 3602 |
+
{file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bc365ce25f6c7c5ed70e4bc674f9137f52b7dd6a125037f9132a7be52b8a252f"},
|
| 3603 |
+
{file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:ac69b394764bb857429b031d29d9604842bc4cbfd964d764b1af1868eeebc4f0"},
|
| 3604 |
+
{file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:144a1fc54765f5c5c36d6d4b073299832aa1ec6a746a6452c3ee7b46b3d3b11d"},
|
| 3605 |
+
{file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2630ca4e152c221072fd4a56d4622b5ada876f668ecd24d5ab62544ae6793ed6"},
|
| 3606 |
+
{file = "regex-2024.4.28-cp312-cp312-win32.whl", hash = "sha256:7f3502f03b4da52bbe8ba962621daa846f38489cae5c4a7b5d738f15f6443d17"},
|
| 3607 |
+
{file = "regex-2024.4.28-cp312-cp312-win_amd64.whl", hash = "sha256:0dd3f69098511e71880fb00f5815db9ed0ef62c05775395968299cb400aeab82"},
|
| 3608 |
+
{file = "regex-2024.4.28-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:374f690e1dd0dbdcddea4a5c9bdd97632cf656c69113f7cd6a361f2a67221cb6"},
|
| 3609 |
+
{file = "regex-2024.4.28-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25f87ae6b96374db20f180eab083aafe419b194e96e4f282c40191e71980c666"},
|
| 3610 |
+
{file = "regex-2024.4.28-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5dbc1bcc7413eebe5f18196e22804a3be1bfdfc7e2afd415e12c068624d48247"},
|
| 3611 |
+
{file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f85151ec5a232335f1be022b09fbbe459042ea1951d8a48fef251223fc67eee1"},
|
| 3612 |
+
{file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:57ba112e5530530fd175ed550373eb263db4ca98b5f00694d73b18b9a02e7185"},
|
| 3613 |
+
{file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:224803b74aab56aa7be313f92a8d9911dcade37e5f167db62a738d0c85fdac4b"},
|
| 3614 |
+
{file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a54a047b607fd2d2d52a05e6ad294602f1e0dec2291152b745870afc47c1397"},
|
| 3615 |
+
{file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a2a512d623f1f2d01d881513af9fc6a7c46e5cfffb7dc50c38ce959f9246c94"},
|
| 3616 |
+
{file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c06bf3f38f0707592898428636cbb75d0a846651b053a1cf748763e3063a6925"},
|
| 3617 |
+
{file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1031a5e7b048ee371ab3653aad3030ecfad6ee9ecdc85f0242c57751a05b0ac4"},
|
| 3618 |
+
{file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d7a353ebfa7154c871a35caca7bfd8f9e18666829a1dc187115b80e35a29393e"},
|
| 3619 |
+
{file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7e76b9cfbf5ced1aca15a0e5b6f229344d9b3123439ffce552b11faab0114a02"},
|
| 3620 |
+
{file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:5ce479ecc068bc2a74cb98dd8dba99e070d1b2f4a8371a7dfe631f85db70fe6e"},
|
| 3621 |
+
{file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7d77b6f63f806578c604dca209280e4c54f0fa9a8128bb8d2cc5fb6f99da4150"},
|
| 3622 |
+
{file = "regex-2024.4.28-cp38-cp38-win32.whl", hash = "sha256:d84308f097d7a513359757c69707ad339da799e53b7393819ec2ea36bc4beb58"},
|
| 3623 |
+
{file = "regex-2024.4.28-cp38-cp38-win_amd64.whl", hash = "sha256:2cc1b87bba1dd1a898e664a31012725e48af826bf3971e786c53e32e02adae6c"},
|
| 3624 |
+
{file = "regex-2024.4.28-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7413167c507a768eafb5424413c5b2f515c606be5bb4ef8c5dee43925aa5718b"},
|
| 3625 |
+
{file = "regex-2024.4.28-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:108e2dcf0b53a7c4ab8986842a8edcb8ab2e59919a74ff51c296772e8e74d0ae"},
|
| 3626 |
+
{file = "regex-2024.4.28-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f1c5742c31ba7d72f2dedf7968998730664b45e38827637e0f04a2ac7de2f5f1"},
|
| 3627 |
+
{file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecc6148228c9ae25ce403eade13a0961de1cb016bdb35c6eafd8e7b87ad028b1"},
|
| 3628 |
+
{file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7d893c8cf0e2429b823ef1a1d360a25950ed11f0e2a9df2b5198821832e1947"},
|
| 3629 |
+
{file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4290035b169578ffbbfa50d904d26bec16a94526071ebec3dadbebf67a26b25e"},
|
| 3630 |
+
{file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44a22ae1cfd82e4ffa2066eb3390777dc79468f866f0625261a93e44cdf6482b"},
|
| 3631 |
+
{file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd24fd140b69f0b0bcc9165c397e9b2e89ecbeda83303abf2a072609f60239e2"},
|
| 3632 |
+
{file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:39fb166d2196413bead229cd64a2ffd6ec78ebab83fff7d2701103cf9f4dfd26"},
|
| 3633 |
+
{file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9301cc6db4d83d2c0719f7fcda37229691745168bf6ae849bea2e85fc769175d"},
|
| 3634 |
+
{file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7c3d389e8d76a49923683123730c33e9553063d9041658f23897f0b396b2386f"},
|
| 3635 |
+
{file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:99ef6289b62042500d581170d06e17f5353b111a15aa6b25b05b91c6886df8fc"},
|
| 3636 |
+
{file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:b91d529b47798c016d4b4c1d06cc826ac40d196da54f0de3c519f5a297c5076a"},
|
| 3637 |
+
{file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:43548ad74ea50456e1c68d3c67fff3de64c6edb85bcd511d1136f9b5376fc9d1"},
|
| 3638 |
+
{file = "regex-2024.4.28-cp39-cp39-win32.whl", hash = "sha256:05d9b6578a22db7dedb4df81451f360395828b04f4513980b6bd7a1412c679cc"},
|
| 3639 |
+
{file = "regex-2024.4.28-cp39-cp39-win_amd64.whl", hash = "sha256:3986217ec830c2109875be740531feb8ddafe0dfa49767cdcd072ed7e8927962"},
|
| 3640 |
+
{file = "regex-2024.4.28.tar.gz", hash = "sha256:83ab366777ea45d58f72593adf35d36ca911ea8bd838483c1823b883a121b0e4"},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3641 |
]
|
| 3642 |
|
| 3643 |
[[package]]
|
|
|
|
| 4990 |
[metadata]
|
| 4991 |
lock-version = "2.0"
|
| 4992 |
python-versions = ">=3.9,<3.13,!=3.9.7"
|
| 4993 |
+
content-hash = "459483572dd8347587db50c0e627b839b6b061af2af022ab8d893c70905b04cb"
|
pyproject.toml
CHANGED
|
@@ -40,6 +40,7 @@ rapidfuzz = "^3.8.1"
|
|
| 40 |
surya-ocr = "^0.4.0"
|
| 41 |
filetype = "^1.2.0"
|
| 42 |
pdftext = "^0.3.4"
|
|
|
|
| 43 |
|
| 44 |
[tool.poetry.group.dev.dependencies]
|
| 45 |
jupyter = "^1.0.0"
|
|
|
|
| 40 |
surya-ocr = "^0.4.0"
|
| 41 |
filetype = "^1.2.0"
|
| 42 |
pdftext = "^0.3.4"
|
| 43 |
+
regex = "^2024.4.28"
|
| 44 |
|
| 45 |
[tool.poetry.group.dev.dependencies]
|
| 46 |
jupyter = "^1.0.0"
|