Vik Paruchuri committed on
Commit
51a8322
·
1 Parent(s): 580302f

Minor cleanups

Browse files
README.md CHANGED
@@ -421,7 +421,7 @@ python benchmarks/overall.py data/pdfs data/references report.json
421
  The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. Run the benchmark with:
422
 
423
  ```shell
424
- python benchmarks/table/table.py table_report.json --max 1000
425
  ```
426
 
427
  # Thanks
 
421
  The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. Run the benchmark with:
422
 
423
  ```shell
424
+ python benchmarks/table/table.py table_report.json --max_rows 1000
425
  ```
426
 
427
  # Thanks
benchmarks/table/scoring.py CHANGED
@@ -1,16 +1,12 @@
1
- '''
2
  TEDS Code Adapted from https://github.com/ibm-aur-nlp/EDD
3
- '''
4
 
5
- from typing import List
6
-
7
- from tqdm import tqdm
8
  import distance
9
  from apted import APTED, Config
10
  from apted.helpers import Tree
11
  from lxml import html
12
  from collections import deque
13
- import numpy as np
14
 
15
  def wrap_table_html(table_html:str)->str:
16
  return f'<html><body>{table_html}</body></html>'
@@ -21,7 +17,9 @@ class TableTree(Tree):
21
  self.colspan = colspan
22
  self.rowspan = rowspan
23
  self.content = content
24
- self.children = list(children)
 
 
25
 
26
  def bracket(self):
27
  """Show tree using brackets notation"""
@@ -37,17 +35,12 @@ class TableTree(Tree):
37
  class CustomConfig(Config):
38
  @staticmethod
39
  def maximum(*sequences):
40
- """Get maximum possible value
41
- """
42
  return max(map(len, sequences))
43
 
44
  def normalized_distance(self, *sequences):
45
- """Get distance from 0 to 1
46
- """
47
  return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
48
 
49
  def rename(self, node1, node2):
50
- """Compares attributes of trees"""
51
  if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
52
  return 1.
53
  if node1.tag == 'td':
@@ -56,8 +49,9 @@ class CustomConfig(Config):
56
  return 0.
57
 
58
  def tokenize(node):
59
- ''' Tokenizes table cells
60
- '''
 
61
  global __tokens__
62
  __tokens__.append('<%s>' % node.tag)
63
  if node.text is not None:
@@ -70,8 +64,9 @@ def tokenize(node):
70
  __tokens__ += list(node.tail)
71
 
72
  def tree_convert_html(node, convert_cell=False, parent=None):
73
- ''' Converts HTML tree to the format required by apted
74
- '''
 
75
  global __tokens__
76
  if node.tag == 'td':
77
  if convert_cell:
@@ -95,9 +90,9 @@ def tree_convert_html(node, convert_cell=False, parent=None):
95
  return new_node
96
 
97
  def similarity_eval_html(pred, true, structure_only=False):
98
- ''' Computes TEDS score between the prediction and the ground truth of a
99
- given samples
100
- '''
101
  pred, true = html.fromstring(pred), html.fromstring(true)
102
  if pred.xpath('body/table') and true.xpath('body/table'):
103
  pred = pred.xpath('body/table')[0]
 
1
+ """
2
  TEDS Code Adapted from https://github.com/ibm-aur-nlp/EDD
3
+ """
4
 
 
 
 
5
  import distance
6
  from apted import APTED, Config
7
  from apted.helpers import Tree
8
  from lxml import html
9
  from collections import deque
 
10
 
11
  def wrap_table_html(table_html:str)->str:
12
  return f'<html><body>{table_html}</body></html>'
 
17
  self.colspan = colspan
18
  self.rowspan = rowspan
19
  self.content = content
20
+
21
+ # Sets self.name and self.children
22
+ super().__init__(tag, *children)
23
 
24
  def bracket(self):
25
  """Show tree using brackets notation"""
 
35
  class CustomConfig(Config):
36
  @staticmethod
37
  def maximum(*sequences):
 
 
38
  return max(map(len, sequences))
39
 
40
  def normalized_distance(self, *sequences):
 
 
41
  return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
42
 
43
  def rename(self, node1, node2):
 
44
  if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
45
  return 1.
46
  if node1.tag == 'td':
 
49
  return 0.
50
 
51
  def tokenize(node):
52
+ """
53
+ Tokenizes table cells
54
+ """
55
  global __tokens__
56
  __tokens__.append('<%s>' % node.tag)
57
  if node.text is not None:
 
64
  __tokens__ += list(node.tail)
65
 
66
  def tree_convert_html(node, convert_cell=False, parent=None):
67
+ """
68
+ Converts HTML tree to the format required by apted
69
+ """
70
  global __tokens__
71
  if node.tag == 'td':
72
  if convert_cell:
 
90
  return new_node
91
 
92
  def similarity_eval_html(pred, true, structure_only=False):
93
+ """
94
  Computes TEDS score between the prediction and the ground truth of a given sample
95
+ """
96
  pred, true = html.fromstring(pred), html.fromstring(true)
97
  if pred.xpath('body/table') and true.xpath('body/table'):
98
  pred = pred.xpath('body/table')[0]
benchmarks/table/table.py CHANGED
@@ -1,5 +1,7 @@
1
- import base64
2
  import os
 
 
 
3
  import time
4
  import datasets
5
  from tqdm import tqdm
@@ -11,8 +13,6 @@ from bs4 import BeautifulSoup
11
  from concurrent.futures import ThreadPoolExecutor
12
  from pypdfium2._helpers.misc import PdfiumError
13
 
14
- os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
15
-
16
  from marker.config.parser import ConfigParser
17
  from marker.converters.table import TableConverter
18
  from marker.models import create_model_dict
@@ -30,10 +30,10 @@ def update_teds_score(result):
30
  @click.command(help="Benchmark Table to HTML Conversion")
31
  @click.argument("out_file", type=str)
32
  @click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use")
33
- @click.option("--max", type=int, default=None, help="Maximum number of PDFs to process")
34
- def main(out_file, dataset, max):
35
  models = create_model_dict()
36
- config_parser = ConfigParser({})
37
  start = time.time()
38
 
39
 
@@ -41,8 +41,8 @@ def main(out_file, dataset, max):
41
  dataset = dataset.shuffle(seed=0)
42
 
43
  iterations = len(dataset)
44
- if max is not None:
45
- iterations = min(max, len(dataset))
46
 
47
  results = []
48
  for i in tqdm(range(iterations), desc='Converting Tables'):
@@ -55,7 +55,7 @@ def main(out_file, dataset, max):
55
  config=config_parser.generate_config_dict(),
56
  artifact_dict=models,
57
  processor_list=config_parser.get_processors(),
58
- renderer='marker.renderers.html.HTMLRenderer'
59
  )
60
 
61
  with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:
 
 
1
  import os
2
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
3
+
4
+ import base64
5
  import time
6
  import datasets
7
  from tqdm import tqdm
 
13
  from concurrent.futures import ThreadPoolExecutor
14
  from pypdfium2._helpers.misc import PdfiumError
15
 
 
 
16
  from marker.config.parser import ConfigParser
17
  from marker.converters.table import TableConverter
18
  from marker.models import create_model_dict
 
30
  @click.command(help="Benchmark Table to HTML Conversion")
31
  @click.argument("out_file", type=str)
32
  @click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use")
33
+ @click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process")
34
+ def main(out_file: str, dataset: str, max_rows: int):
35
  models = create_model_dict()
36
+ config_parser = ConfigParser({'output_format': 'html'})
37
  start = time.time()
38
 
39
 
 
41
  dataset = dataset.shuffle(seed=0)
42
 
43
  iterations = len(dataset)
44
+ if max_rows is not None:
45
+ iterations = min(max_rows, len(dataset))
46
 
47
  results = []
48
  for i in tqdm(range(iterations), desc='Converting Tables'):
 
55
  config=config_parser.generate_config_dict(),
56
  artifact_dict=models,
57
  processor_list=config_parser.get_processors(),
58
+ renderer=config_parser.get_renderer()
59
  )
60
 
61
  with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file: