Vik Paruchuri committed on
Commit
331dc0d
·
1 Parent(s): 29da3ef

Add min length flag

Browse files
chunk_convert.sh CHANGED
@@ -37,6 +37,7 @@ for (( i=0; i<$NUM_DEVICES; i++ )); do
37
  echo "Running convert.py on GPU $DEVICE_NUM"
38
  cmd="CUDA_VISIBLE_DEVICES=$DEVICE_NUM python convert.py $INPUT_FOLDER $OUTPUT_FOLDER --num_chunks $NUM_DEVICES --chunk_idx $DEVICE_NUM --workers $NUM_WORKERS"
39
  [[ -n "$METADATA_FILE" ]] && cmd="$cmd --metadata_file $METADATA_FILE"
 
40
  eval $cmd &
41
 
42
  sleep 5
 
37
  echo "Running convert.py on GPU $DEVICE_NUM"
38
  cmd="CUDA_VISIBLE_DEVICES=$DEVICE_NUM python convert.py $INPUT_FOLDER $OUTPUT_FOLDER --num_chunks $NUM_DEVICES --chunk_idx $DEVICE_NUM --workers $NUM_WORKERS"
39
  [[ -n "$METADATA_FILE" ]] && cmd="$cmd --metadata_file $METADATA_FILE"
40
+ [[ -n "$MIN_LENGTH" ]] && cmd="$cmd --min_length $MIN_LENGTH"
41
  eval $cmd &
42
 
43
  sleep 5
convert.py CHANGED
@@ -6,7 +6,7 @@ import ray
6
  from tqdm import tqdm
7
  import math
8
 
9
- from marker.convert import convert_single_pdf
10
  from marker.segmentation import load_layout_model
11
  from marker.cleaners.equations import load_nougat_model
12
  from marker.settings import settings
@@ -16,14 +16,23 @@ import json
16
 
17
  configure_logging()
18
 
 
19
  @ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0)
20
- def process_single_pdf(fname: str, out_folder: str, nougat_model, layout_model, metadata: Dict | None=None):
21
  out_filename = fname.rsplit(".", 1)[0] + ".md"
22
  out_filename = os.path.join(out_folder, os.path.basename(out_filename))
23
  out_meta_filename = out_filename.rsplit(".", 1)[0] + "_meta.json"
24
  if os.path.exists(out_filename):
25
  return
26
  try:
 
 
 
 
 
 
 
 
27
  full_text, out_metadata = convert_single_pdf(fname, layout_model, nougat_model, metadata=metadata)
28
  if len(full_text.strip()) > 0:
29
  with open(out_filename, "w+") as f:
@@ -46,6 +55,7 @@ if __name__ == "__main__":
46
  parser.add_argument("--max", type=int, default=None, help="Maximum number of pdfs to convert")
47
  parser.add_argument("--workers", type=int, default=5, help="Number of worker processes to use")
48
  parser.add_argument("--metadata_file", type=str, default=None, help="Metadata file to use for filtering")
 
49
 
50
  args = parser.parse_args()
51
 
@@ -95,7 +105,8 @@ if __name__ == "__main__":
95
  out_folder,
96
  nougat_ref,
97
  layoutlm_ref,
98
- metadata.get(os.path.basename(filename))
 
99
  ) for filename in files_to_convert
100
  ]
101
 
 
6
  from tqdm import tqdm
7
  import math
8
 
9
+ from marker.convert import convert_single_pdf, get_length_of_text
10
  from marker.segmentation import load_layout_model
11
  from marker.cleaners.equations import load_nougat_model
12
  from marker.settings import settings
 
16
 
17
  configure_logging()
18
 
19
+
20
  @ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0)
21
+ def process_single_pdf(fname: str, out_folder: str, nougat_model, layout_model, metadata: Dict | None=None, min_length: int | None = None):
22
  out_filename = fname.rsplit(".", 1)[0] + ".md"
23
  out_filename = os.path.join(out_folder, os.path.basename(out_filename))
24
  out_meta_filename = out_filename.rsplit(".", 1)[0] + "_meta.json"
25
  if os.path.exists(out_filename):
26
  return
27
  try:
28
+ # Skip trying to convert files that don't have a lot of embedded text
29
+ # This can indicate that they were scanned, and not OCRed properly
30
+ # Usually these files are not recent/high-quality
31
+ if min_length:
32
+ length = get_length_of_text(fname)
33
+ if length < min_length:
34
+ return
35
+
36
  full_text, out_metadata = convert_single_pdf(fname, layout_model, nougat_model, metadata=metadata)
37
  if len(full_text.strip()) > 0:
38
  with open(out_filename, "w+") as f:
 
55
  parser.add_argument("--max", type=int, default=None, help="Maximum number of pdfs to convert")
56
  parser.add_argument("--workers", type=int, default=5, help="Number of worker processes to use")
57
  parser.add_argument("--metadata_file", type=str, default=None, help="Metadata file to use for filtering")
58
+ parser.add_argument("--min_length", type=int, default=None, help="Minimum length of pdf to convert")
59
 
60
  args = parser.parse_args()
61
 
 
105
  out_folder,
106
  nougat_ref,
107
  layoutlm_ref,
108
+ metadata=metadata.get(os.path.basename(filename)),
109
+ min_length=args.min_length
110
  ) for filename in files_to_convert
111
  ]
112
 
marker/convert.py CHANGED
@@ -39,6 +39,19 @@ def annotate_spans(blocks: List[Page], block_types: List[BlockType]):
39
  page.add_block_types(page_block_types)
40
 
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  def convert_single_pdf(fname: str, layoutlm_model, nougat_model, max_pages=None, metadata: Dict | None=None) -> Tuple[str, Dict]:
43
  lang = settings.DEFAULT_LANG
44
  if metadata:
 
39
  page.add_block_types(page_block_types)
40
 
41
 
42
+ def get_length_of_text(fname: str) -> int:
43
+ filetype = find_filetype(fname)
44
+ if filetype == "other":
45
+ return 0
46
+
47
+ doc = pymupdf.open(fname, filetype=filetype)
48
+ full_text = ""
49
+ for page in doc:
50
+ full_text += page.get_text("text", sort=True, flags=settings.TEXT_FLAGS)
51
+
52
+ return len(full_text)
53
+
54
+
55
  def convert_single_pdf(fname: str, layoutlm_model, nougat_model, max_pages=None, metadata: Dict | None=None) -> Tuple[str, Dict]:
56
  lang = settings.DEFAULT_LANG
57
  if metadata:
marker/extract_text.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
  from typing import Tuple, List
3
 
4
- from marker.ocr.segment import ocr_bbox
5
  from marker.ocr.page import ocr_entire_page_ocrmp
6
  from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
7
  from marker.settings import settings
@@ -61,6 +60,8 @@ def get_text_blocks(doc, tess_lang: str, spell_lang: str, max_pages: int | None=
61
  extracted = [False]
62
  ocr_pages = 0
63
  min_ocr_page = 2
 
 
64
  for pnum, page in enumerate(doc):
65
  if max_pages and pnum >= max_pages:
66
  break
@@ -81,9 +82,13 @@ def get_text_blocks(doc, tess_lang: str, spell_lang: str, max_pages: int | None=
81
  page_obj = Page(blocks=blocks, pnum=pnum)
82
  extracted.append(False)
83
  ocr_pages += 1
 
 
 
 
84
  else:
85
  if pnum > min_ocr_page:
86
  extracted.append(True)
87
 
88
  all_blocks.append(page_obj)
89
- return all_blocks, toc, {"ocr_pages": ocr_pages}
 
1
  import os
2
  from typing import Tuple, List
3
 
 
4
  from marker.ocr.page import ocr_entire_page_ocrmp
5
  from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
6
  from marker.settings import settings
 
60
  extracted = [False]
61
  ocr_pages = 0
62
  min_ocr_page = 2
63
+ ocr_failed = 0
64
+ ocr_success = 0
65
  for pnum, page in enumerate(doc):
66
  if max_pages and pnum >= max_pages:
67
  break
 
82
  page_obj = Page(blocks=blocks, pnum=pnum)
83
  extracted.append(False)
84
  ocr_pages += 1
85
+ if len(blocks) == 0:
86
+ ocr_failed += 1
87
+ else:
88
+ ocr_success += 1
89
  else:
90
  if pnum > min_ocr_page:
91
  extracted.append(True)
92
 
93
  all_blocks.append(page_obj)
94
+ return all_blocks, toc, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success}
marker/ocr/segment.py DELETED
@@ -1,27 +0,0 @@
1
- import fitz as pymupdf
2
-
3
- from marker.settings import settings
4
-
5
-
6
- def ocr_bbox(page, old_text, bbox, lang: str):
7
- pix = page.get_pixmap(dpi=settings.SEGMENT_DPI, clip=bbox)
8
-
9
- try:
10
- ocrpdf = pymupdf.open("pdf", pix.pdfocr_tobytes(language=lang))
11
- ocrpage = ocrpdf[0]
12
- new_text = ocrpage.get_text() # extract OCR-ed text
13
- except RuntimeError:
14
- # If the OCR fails, just return the original text
15
- return old_text
16
-
17
- if not new_text.strip():
18
- # If the OCR data is blank, return old text
19
- return old_text
20
-
21
- # Tesseract ignores leading spaces, hence some corrections
22
- lblanks = len(old_text) - len(old_text.lstrip())
23
- rblanks = len(old_text) - len(old_text.rstrip())
24
-
25
- # prefix/suffix OCRed text with this many spaces
26
- new_text = " " * lblanks + new_text + " " * rblanks
27
- return new_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
marker/ocr/utils.py CHANGED
@@ -36,7 +36,7 @@ def detect_bad_ocr(text, spell_lang: str | None, misspell_threshold=.8, space_th
36
  if char in settings.INVALID_CHARS:
37
  invalid_chars += 1
38
 
39
- if invalid_chars > 2:
40
  return True
41
 
42
  return False
 
36
  if char in settings.INVALID_CHARS:
37
  invalid_chars += 1
38
 
39
+ if invalid_chars > max(2.0, len(text) * .02):
40
  return True
41
 
42
  return False
marker/settings.py CHANGED
@@ -24,7 +24,7 @@ class Settings(BaseSettings):
24
  TEXT_FLAGS: int = pymupdf.TEXTFLAGS_DICT & ~pymupdf.TEXT_PRESERVE_LIGATURES & ~pymupdf.TEXT_PRESERVE_IMAGES
25
 
26
  # OCR
27
- INVALID_CHARS: List[str] = [chr(0xfffd), chr(65533)]
28
  DPI: int = 800
29
  SEGMENT_DPI: int = 1200
30
  TESSDATA_PREFIX: str = ""
 
24
  TEXT_FLAGS: int = pymupdf.TEXTFLAGS_DICT & ~pymupdf.TEXT_PRESERVE_LIGATURES & ~pymupdf.TEXT_PRESERVE_IMAGES
25
 
26
  # OCR
27
+ INVALID_CHARS: List[str] = [chr(0xfffd)]
28
  DPI: int = 800
29
  SEGMENT_DPI: int = 1200
30
  TESSDATA_PREFIX: str = ""