Vik Paruchuri
committed on
Commit
·
331dc0d
1
Parent(s):
29da3ef
Add min length flag
Browse files
- chunk_convert.sh +1 -0
- convert.py +14 -3
- marker/convert.py +13 -0
- marker/extract_text.py +7 -2
- marker/ocr/segment.py +0 -27
- marker/ocr/utils.py +1 -1
- marker/settings.py +1 -1
chunk_convert.sh
CHANGED
|
@@ -37,6 +37,7 @@ for (( i=0; i<$NUM_DEVICES; i++ )); do
|
|
| 37 |
echo "Running convert.py on GPU $DEVICE_NUM"
|
| 38 |
cmd="CUDA_VISIBLE_DEVICES=$DEVICE_NUM python convert.py $INPUT_FOLDER $OUTPUT_FOLDER --num_chunks $NUM_DEVICES --chunk_idx $DEVICE_NUM --workers $NUM_WORKERS"
|
| 39 |
[[ -n "$METADATA_FILE" ]] && cmd="$cmd --metadata_file $METADATA_FILE"
|
|
|
|
| 40 |
eval $cmd &
|
| 41 |
|
| 42 |
sleep 5
|
|
|
|
| 37 |
echo "Running convert.py on GPU $DEVICE_NUM"
|
| 38 |
cmd="CUDA_VISIBLE_DEVICES=$DEVICE_NUM python convert.py $INPUT_FOLDER $OUTPUT_FOLDER --num_chunks $NUM_DEVICES --chunk_idx $DEVICE_NUM --workers $NUM_WORKERS"
|
| 39 |
[[ -n "$METADATA_FILE" ]] && cmd="$cmd --metadata_file $METADATA_FILE"
|
| 40 |
+
[[ -n "$MIN_LENGTH" ]] && cmd="$cmd --min_length $MIN_LENGTH"
|
| 41 |
eval $cmd &
|
| 42 |
|
| 43 |
sleep 5
|
convert.py
CHANGED
|
@@ -6,7 +6,7 @@ import ray
|
|
| 6 |
from tqdm import tqdm
|
| 7 |
import math
|
| 8 |
|
| 9 |
-
from marker.convert import convert_single_pdf
|
| 10 |
from marker.segmentation import load_layout_model
|
| 11 |
from marker.cleaners.equations import load_nougat_model
|
| 12 |
from marker.settings import settings
|
|
@@ -16,14 +16,23 @@ import json
|
|
| 16 |
|
| 17 |
configure_logging()
|
| 18 |
|
|
|
|
| 19 |
@ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0)
|
| 20 |
-
def process_single_pdf(fname: str, out_folder: str, nougat_model, layout_model, metadata: Dict | None=None):
|
| 21 |
out_filename = fname.rsplit(".", 1)[0] + ".md"
|
| 22 |
out_filename = os.path.join(out_folder, os.path.basename(out_filename))
|
| 23 |
out_meta_filename = out_filename.rsplit(".", 1)[0] + "_meta.json"
|
| 24 |
if os.path.exists(out_filename):
|
| 25 |
return
|
| 26 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
full_text, out_metadata = convert_single_pdf(fname, layout_model, nougat_model, metadata=metadata)
|
| 28 |
if len(full_text.strip()) > 0:
|
| 29 |
with open(out_filename, "w+") as f:
|
|
@@ -46,6 +55,7 @@ if __name__ == "__main__":
|
|
| 46 |
parser.add_argument("--max", type=int, default=None, help="Maximum number of pdfs to convert")
|
| 47 |
parser.add_argument("--workers", type=int, default=5, help="Number of worker processes to use")
|
| 48 |
parser.add_argument("--metadata_file", type=str, default=None, help="Metadata file to use for filtering")
|
|
|
|
| 49 |
|
| 50 |
args = parser.parse_args()
|
| 51 |
|
|
@@ -95,7 +105,8 @@ if __name__ == "__main__":
|
|
| 95 |
out_folder,
|
| 96 |
nougat_ref,
|
| 97 |
layoutlm_ref,
|
| 98 |
-
metadata.get(os.path.basename(filename))
|
|
|
|
| 99 |
) for filename in files_to_convert
|
| 100 |
]
|
| 101 |
|
|
|
|
| 6 |
from tqdm import tqdm
|
| 7 |
import math
|
| 8 |
|
| 9 |
+
from marker.convert import convert_single_pdf, get_length_of_text
|
| 10 |
from marker.segmentation import load_layout_model
|
| 11 |
from marker.cleaners.equations import load_nougat_model
|
| 12 |
from marker.settings import settings
|
|
|
|
| 16 |
|
| 17 |
configure_logging()
|
| 18 |
|
| 19 |
+
|
| 20 |
@ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0)
|
| 21 |
+
def process_single_pdf(fname: str, out_folder: str, nougat_model, layout_model, metadata: Dict | None=None, min_length: int | None = None):
|
| 22 |
out_filename = fname.rsplit(".", 1)[0] + ".md"
|
| 23 |
out_filename = os.path.join(out_folder, os.path.basename(out_filename))
|
| 24 |
out_meta_filename = out_filename.rsplit(".", 1)[0] + "_meta.json"
|
| 25 |
if os.path.exists(out_filename):
|
| 26 |
return
|
| 27 |
try:
|
| 28 |
+
# Skip trying to convert files that don't have a lot of embedded text
|
| 29 |
+
# This can indicate that they were scanned, and not OCRed properly
|
| 30 |
+
# Usually these files are not recent/high-quality
|
| 31 |
+
if min_length:
|
| 32 |
+
length = get_length_of_text(fname)
|
| 33 |
+
if length < min_length:
|
| 34 |
+
return
|
| 35 |
+
|
| 36 |
full_text, out_metadata = convert_single_pdf(fname, layout_model, nougat_model, metadata=metadata)
|
| 37 |
if len(full_text.strip()) > 0:
|
| 38 |
with open(out_filename, "w+") as f:
|
|
|
|
| 55 |
parser.add_argument("--max", type=int, default=None, help="Maximum number of pdfs to convert")
|
| 56 |
parser.add_argument("--workers", type=int, default=5, help="Number of worker processes to use")
|
| 57 |
parser.add_argument("--metadata_file", type=str, default=None, help="Metadata file to use for filtering")
|
| 58 |
+
parser.add_argument("--min_length", type=int, default=None, help="Minimum length of pdf to convert")
|
| 59 |
|
| 60 |
args = parser.parse_args()
|
| 61 |
|
|
|
|
| 105 |
out_folder,
|
| 106 |
nougat_ref,
|
| 107 |
layoutlm_ref,
|
| 108 |
+
metadata=metadata.get(os.path.basename(filename)),
|
| 109 |
+
min_length=args.min_length
|
| 110 |
) for filename in files_to_convert
|
| 111 |
]
|
| 112 |
|
marker/convert.py
CHANGED
|
@@ -39,6 +39,19 @@ def annotate_spans(blocks: List[Page], block_types: List[BlockType]):
|
|
| 39 |
page.add_block_types(page_block_types)
|
| 40 |
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
def convert_single_pdf(fname: str, layoutlm_model, nougat_model, max_pages=None, metadata: Dict | None=None) -> Tuple[str, Dict]:
|
| 43 |
lang = settings.DEFAULT_LANG
|
| 44 |
if metadata:
|
|
|
|
| 39 |
page.add_block_types(page_block_types)
|
| 40 |
|
| 41 |
|
| 42 |
+
def get_length_of_text(fname: str) -> int:
    """Return the total number of characters of embedded text in a document.

    The file is opened with pymupdf and the plain-text extraction of every
    page is measured; no OCR is performed here. Callers compare the result
    against a minimum length to decide whether a file is worth converting.

    Args:
        fname: Path of the document to inspect.

    Returns:
        The summed length of the text extracted from all pages, or 0 when
        ``find_filetype`` reports the file type as ``"other"``.
    """
    filetype = find_filetype(fname)
    if filetype == "other":
        return 0

    doc = pymupdf.open(fname, filetype=filetype)
    try:
        # Sum the per-page lengths instead of concatenating every page into
        # one large string; the total is identical and nothing is retained.
        return sum(
            len(page.get_text("text", sort=True, flags=settings.TEXT_FLAGS))
            for page in doc
        )
    finally:
        # Release the document handle explicitly: this helper runs once per
        # input file, so leaving documents open accumulates across a batch.
        doc.close()
|
| 53 |
+
|
| 54 |
+
|
| 55 |
def convert_single_pdf(fname: str, layoutlm_model, nougat_model, max_pages=None, metadata: Dict | None=None) -> Tuple[str, Dict]:
|
| 56 |
lang = settings.DEFAULT_LANG
|
| 57 |
if metadata:
|
marker/extract_text.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
from typing import Tuple, List
|
| 3 |
|
| 4 |
-
from marker.ocr.segment import ocr_bbox
|
| 5 |
from marker.ocr.page import ocr_entire_page_ocrmp
|
| 6 |
from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
|
| 7 |
from marker.settings import settings
|
|
@@ -61,6 +60,8 @@ def get_text_blocks(doc, tess_lang: str, spell_lang: str, max_pages: int | None=
|
|
| 61 |
extracted = [False]
|
| 62 |
ocr_pages = 0
|
| 63 |
min_ocr_page = 2
|
|
|
|
|
|
|
| 64 |
for pnum, page in enumerate(doc):
|
| 65 |
if max_pages and pnum >= max_pages:
|
| 66 |
break
|
|
@@ -81,9 +82,13 @@ def get_text_blocks(doc, tess_lang: str, spell_lang: str, max_pages: int | None=
|
|
| 81 |
page_obj = Page(blocks=blocks, pnum=pnum)
|
| 82 |
extracted.append(False)
|
| 83 |
ocr_pages += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
else:
|
| 85 |
if pnum > min_ocr_page:
|
| 86 |
extracted.append(True)
|
| 87 |
|
| 88 |
all_blocks.append(page_obj)
|
| 89 |
-
return all_blocks, toc, {"ocr_pages": ocr_pages}
|
|
|
|
| 1 |
import os
|
| 2 |
from typing import Tuple, List
|
| 3 |
|
|
|
|
| 4 |
from marker.ocr.page import ocr_entire_page_ocrmp
|
| 5 |
from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
|
| 6 |
from marker.settings import settings
|
|
|
|
| 60 |
extracted = [False]
|
| 61 |
ocr_pages = 0
|
| 62 |
min_ocr_page = 2
|
| 63 |
+
ocr_failed = 0
|
| 64 |
+
ocr_success = 0
|
| 65 |
for pnum, page in enumerate(doc):
|
| 66 |
if max_pages and pnum >= max_pages:
|
| 67 |
break
|
|
|
|
| 82 |
page_obj = Page(blocks=blocks, pnum=pnum)
|
| 83 |
extracted.append(False)
|
| 84 |
ocr_pages += 1
|
| 85 |
+
if len(blocks) == 0:
|
| 86 |
+
ocr_failed += 1
|
| 87 |
+
else:
|
| 88 |
+
ocr_success += 1
|
| 89 |
else:
|
| 90 |
if pnum > min_ocr_page:
|
| 91 |
extracted.append(True)
|
| 92 |
|
| 93 |
all_blocks.append(page_obj)
|
| 94 |
+
return all_blocks, toc, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success}
|
marker/ocr/segment.py
DELETED
|
@@ -1,27 +0,0 @@
|
|
| 1 |
-
import fitz as pymupdf
|
| 2 |
-
|
| 3 |
-
from marker.settings import settings
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
def ocr_bbox(page, old_text, bbox, lang: str):
|
| 7 |
-
pix = page.get_pixmap(dpi=settings.SEGMENT_DPI, clip=bbox)
|
| 8 |
-
|
| 9 |
-
try:
|
| 10 |
-
ocrpdf = pymupdf.open("pdf", pix.pdfocr_tobytes(language=lang))
|
| 11 |
-
ocrpage = ocrpdf[0]
|
| 12 |
-
new_text = ocrpage.get_text() # extract OCR-ed text
|
| 13 |
-
except RuntimeError:
|
| 14 |
-
# If the OCR fails, just return the original text
|
| 15 |
-
return old_text
|
| 16 |
-
|
| 17 |
-
if not new_text.strip():
|
| 18 |
-
# If the OCR data is blank, return old text
|
| 19 |
-
return old_text
|
| 20 |
-
|
| 21 |
-
# Tesseract ignores leading spaces, hence some corrections
|
| 22 |
-
lblanks = len(old_text) - len(old_text.lstrip())
|
| 23 |
-
rblanks = len(old_text) - len(old_text.rstrip())
|
| 24 |
-
|
| 25 |
-
# prefix/suffix OCRed text with this many spaces
|
| 26 |
-
new_text = " " * lblanks + new_text + " " * rblanks
|
| 27 |
-
return new_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
marker/ocr/utils.py
CHANGED
|
@@ -36,7 +36,7 @@ def detect_bad_ocr(text, spell_lang: str | None, misspell_threshold=.8, space_th
|
|
| 36 |
if char in settings.INVALID_CHARS:
|
| 37 |
invalid_chars += 1
|
| 38 |
|
| 39 |
-
if invalid_chars > 2:
|
| 40 |
return True
|
| 41 |
|
| 42 |
return False
|
|
|
|
| 36 |
if char in settings.INVALID_CHARS:
|
| 37 |
invalid_chars += 1
|
| 38 |
|
| 39 |
+
if invalid_chars > max(2.0, len(text) * .02):
|
| 40 |
return True
|
| 41 |
|
| 42 |
return False
|
marker/settings.py
CHANGED
|
@@ -24,7 +24,7 @@ class Settings(BaseSettings):
|
|
| 24 |
TEXT_FLAGS: int = pymupdf.TEXTFLAGS_DICT & ~pymupdf.TEXT_PRESERVE_LIGATURES & ~pymupdf.TEXT_PRESERVE_IMAGES
|
| 25 |
|
| 26 |
# OCR
|
| 27 |
-
INVALID_CHARS: List[str] = [chr(0xfffd)
|
| 28 |
DPI: int = 800
|
| 29 |
SEGMENT_DPI: int = 1200
|
| 30 |
TESSDATA_PREFIX: str = ""
|
|
|
|
| 24 |
TEXT_FLAGS: int = pymupdf.TEXTFLAGS_DICT & ~pymupdf.TEXT_PRESERVE_LIGATURES & ~pymupdf.TEXT_PRESERVE_IMAGES
|
| 25 |
|
| 26 |
# OCR
|
| 27 |
+
INVALID_CHARS: List[str] = [chr(0xfffd)]
|
| 28 |
DPI: int = 800
|
| 29 |
SEGMENT_DPI: int = 1200
|
| 30 |
TESSDATA_PREFIX: str = ""
|