Vik Paruchuri
committed on
Commit
·
51a8322
1
Parent(s):
580302f
Minor cleanups
Browse files
- README.md +1 -1
- benchmarks/table/scoring.py +14 -19
- benchmarks/table/table.py +9 -9
README.md
CHANGED
|
@@ -421,7 +421,7 @@ python benchmarks/overall.py data/pdfs data/references report.json
|
|
| 421 |
The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. Run the benchmark with:
|
| 422 |
|
| 423 |
```shell
|
| 424 |
-
python benchmarks/table/table.py table_report.json --
|
| 425 |
```
|
| 426 |
|
| 427 |
# Thanks
|
|
|
|
| 421 |
The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. Run the benchmark with:
|
| 422 |
|
| 423 |
```shell
|
| 424 |
+
python benchmarks/table/table.py table_report.json --max_rows 1000
|
| 425 |
```
|
| 426 |
|
| 427 |
# Thanks
|
benchmarks/table/scoring.py
CHANGED
|
@@ -1,16 +1,12 @@
|
|
| 1 |
-
|
| 2 |
TEDS Code Adapted from https://github.com/ibm-aur-nlp/EDD
|
| 3 |
-
|
| 4 |
|
| 5 |
-
from typing import List
|
| 6 |
-
|
| 7 |
-
from tqdm import tqdm
|
| 8 |
import distance
|
| 9 |
from apted import APTED, Config
|
| 10 |
from apted.helpers import Tree
|
| 11 |
from lxml import html
|
| 12 |
from collections import deque
|
| 13 |
-
import numpy as np
|
| 14 |
|
| 15 |
def wrap_table_html(table_html:str)->str:
|
| 16 |
return f'<html><body>{table_html}</body></html>'
|
|
@@ -21,7 +17,9 @@ class TableTree(Tree):
|
|
| 21 |
self.colspan = colspan
|
| 22 |
self.rowspan = rowspan
|
| 23 |
self.content = content
|
| 24 |
-
|
|
|
|
|
|
|
| 25 |
|
| 26 |
def bracket(self):
|
| 27 |
"""Show tree using brackets notation"""
|
|
@@ -37,17 +35,12 @@ class TableTree(Tree):
|
|
| 37 |
class CustomConfig(Config):
|
| 38 |
@staticmethod
|
| 39 |
def maximum(*sequences):
|
| 40 |
-
"""Get maximum possible value
|
| 41 |
-
"""
|
| 42 |
return max(map(len, sequences))
|
| 43 |
|
| 44 |
def normalized_distance(self, *sequences):
|
| 45 |
-
"""Get distance from 0 to 1
|
| 46 |
-
"""
|
| 47 |
return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
|
| 48 |
|
| 49 |
def rename(self, node1, node2):
|
| 50 |
-
"""Compares attributes of trees"""
|
| 51 |
if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
|
| 52 |
return 1.
|
| 53 |
if node1.tag == 'td':
|
|
@@ -56,8 +49,9 @@ class CustomConfig(Config):
|
|
| 56 |
return 0.
|
| 57 |
|
| 58 |
def tokenize(node):
|
| 59 |
-
|
| 60 |
-
|
|
|
|
| 61 |
global __tokens__
|
| 62 |
__tokens__.append('<%s>' % node.tag)
|
| 63 |
if node.text is not None:
|
|
@@ -70,8 +64,9 @@ def tokenize(node):
|
|
| 70 |
__tokens__ += list(node.tail)
|
| 71 |
|
| 72 |
def tree_convert_html(node, convert_cell=False, parent=None):
|
| 73 |
-
|
| 74 |
-
|
|
|
|
| 75 |
global __tokens__
|
| 76 |
if node.tag == 'td':
|
| 77 |
if convert_cell:
|
|
@@ -95,9 +90,9 @@ def tree_convert_html(node, convert_cell=False, parent=None):
|
|
| 95 |
return new_node
|
| 96 |
|
| 97 |
def similarity_eval_html(pred, true, structure_only=False):
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
pred, true = html.fromstring(pred), html.fromstring(true)
|
| 102 |
if pred.xpath('body/table') and true.xpath('body/table'):
|
| 103 |
pred = pred.xpath('body/table')[0]
|
|
|
|
| 1 |
+
""""
|
| 2 |
TEDS Code Adapted from https://github.com/ibm-aur-nlp/EDD
|
| 3 |
+
"""
|
| 4 |
|
|
|
|
|
|
|
|
|
|
| 5 |
import distance
|
| 6 |
from apted import APTED, Config
|
| 7 |
from apted.helpers import Tree
|
| 8 |
from lxml import html
|
| 9 |
from collections import deque
|
|
|
|
| 10 |
|
| 11 |
def wrap_table_html(table_html:str)->str:
|
| 12 |
return f'<html><body>{table_html}</body></html>'
|
|
|
|
| 17 |
self.colspan = colspan
|
| 18 |
self.rowspan = rowspan
|
| 19 |
self.content = content
|
| 20 |
+
|
| 21 |
+
# Sets self.name and self.children
|
| 22 |
+
super().__init__(tag, *children)
|
| 23 |
|
| 24 |
def bracket(self):
|
| 25 |
"""Show tree using brackets notation"""
|
|
|
|
| 35 |
class CustomConfig(Config):
|
| 36 |
@staticmethod
|
| 37 |
def maximum(*sequences):
|
|
|
|
|
|
|
| 38 |
return max(map(len, sequences))
|
| 39 |
|
| 40 |
def normalized_distance(self, *sequences):
|
|
|
|
|
|
|
| 41 |
return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
|
| 42 |
|
| 43 |
def rename(self, node1, node2):
|
|
|
|
| 44 |
if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
|
| 45 |
return 1.
|
| 46 |
if node1.tag == 'td':
|
|
|
|
| 49 |
return 0.
|
| 50 |
|
| 51 |
def tokenize(node):
|
| 52 |
+
"""
|
| 53 |
+
Tokenizes table cells
|
| 54 |
+
"""
|
| 55 |
global __tokens__
|
| 56 |
__tokens__.append('<%s>' % node.tag)
|
| 57 |
if node.text is not None:
|
|
|
|
| 64 |
__tokens__ += list(node.tail)
|
| 65 |
|
| 66 |
def tree_convert_html(node, convert_cell=False, parent=None):
|
| 67 |
+
"""
|
| 68 |
+
Converts HTML tree to the format required by apted
|
| 69 |
+
"""
|
| 70 |
global __tokens__
|
| 71 |
if node.tag == 'td':
|
| 72 |
if convert_cell:
|
|
|
|
| 90 |
return new_node
|
| 91 |
|
| 92 |
def similarity_eval_html(pred, true, structure_only=False):
|
| 93 |
+
"""
|
| 94 |
+
Computes TEDS score between the prediction and the ground truth of a given sample
|
| 95 |
+
"""
|
| 96 |
pred, true = html.fromstring(pred), html.fromstring(true)
|
| 97 |
if pred.xpath('body/table') and true.xpath('body/table'):
|
| 98 |
pred = pred.xpath('body/table')[0]
|
benchmarks/table/table.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
-
import base64
|
| 2 |
import os
|
|
|
|
|
|
|
|
|
|
| 3 |
import time
|
| 4 |
import datasets
|
| 5 |
from tqdm import tqdm
|
|
@@ -11,8 +13,6 @@ from bs4 import BeautifulSoup
|
|
| 11 |
from concurrent.futures import ThreadPoolExecutor
|
| 12 |
from pypdfium2._helpers.misc import PdfiumError
|
| 13 |
|
| 14 |
-
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
|
| 15 |
-
|
| 16 |
from marker.config.parser import ConfigParser
|
| 17 |
from marker.converters.table import TableConverter
|
| 18 |
from marker.models import create_model_dict
|
|
@@ -30,10 +30,10 @@ def update_teds_score(result):
|
|
| 30 |
@click.command(help="Benchmark Table to HTML Conversion")
|
| 31 |
@click.argument("out_file", type=str)
|
| 32 |
@click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use")
|
| 33 |
-
@click.option("--
|
| 34 |
-
def main(out_file, dataset,
|
| 35 |
models = create_model_dict()
|
| 36 |
-
config_parser = ConfigParser({})
|
| 37 |
start = time.time()
|
| 38 |
|
| 39 |
|
|
@@ -41,8 +41,8 @@ def main(out_file, dataset, max):
|
|
| 41 |
dataset = dataset.shuffle(seed=0)
|
| 42 |
|
| 43 |
iterations = len(dataset)
|
| 44 |
-
if
|
| 45 |
-
iterations = min(
|
| 46 |
|
| 47 |
results = []
|
| 48 |
for i in tqdm(range(iterations), desc='Converting Tables'):
|
|
@@ -55,7 +55,7 @@ def main(out_file, dataset, max):
|
|
| 55 |
config=config_parser.generate_config_dict(),
|
| 56 |
artifact_dict=models,
|
| 57 |
processor_list=config_parser.get_processors(),
|
| 58 |
-
renderer=
|
| 59 |
)
|
| 60 |
|
| 61 |
with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
|
| 3 |
+
|
| 4 |
+
import base64
|
| 5 |
import time
|
| 6 |
import datasets
|
| 7 |
from tqdm import tqdm
|
|
|
|
| 13 |
from concurrent.futures import ThreadPoolExecutor
|
| 14 |
from pypdfium2._helpers.misc import PdfiumError
|
| 15 |
|
|
|
|
|
|
|
| 16 |
from marker.config.parser import ConfigParser
|
| 17 |
from marker.converters.table import TableConverter
|
| 18 |
from marker.models import create_model_dict
|
|
|
|
| 30 |
@click.command(help="Benchmark Table to HTML Conversion")
|
| 31 |
@click.argument("out_file", type=str)
|
| 32 |
@click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use")
|
| 33 |
+
@click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process")
|
| 34 |
+
def main(out_file: str, dataset: str, max_rows: int):
|
| 35 |
models = create_model_dict()
|
| 36 |
+
config_parser = ConfigParser({'output_format': 'html'})
|
| 37 |
start = time.time()
|
| 38 |
|
| 39 |
|
|
|
|
| 41 |
dataset = dataset.shuffle(seed=0)
|
| 42 |
|
| 43 |
iterations = len(dataset)
|
| 44 |
+
if max_rows is not None:
|
| 45 |
+
iterations = min(max_rows, len(dataset))
|
| 46 |
|
| 47 |
results = []
|
| 48 |
for i in tqdm(range(iterations), desc='Converting Tables'):
|
|
|
|
| 55 |
config=config_parser.generate_config_dict(),
|
| 56 |
artifact_dict=models,
|
| 57 |
processor_list=config_parser.get_processors(),
|
| 58 |
+
renderer=config_parser.get_renderer()
|
| 59 |
)
|
| 60 |
|
| 61 |
with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:
|