Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

Vik Paruchuri committed on Jan 15

Commit

51a8322

1 Parent(s): 580302f

Minor cleanups

Browse files

Files changed (3) hide show

README.md +1 -1
benchmarks/table/scoring.py +14 -19
benchmarks/table/table.py +9 -9

README.md CHANGED Viewed

@@ -421,7 +421,7 @@ python benchmarks/overall.py data/pdfs data/references report.json
 The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. Run the benchmark with:
 ```shell
-python benchmarks/table/table.py table_report.json --max 1000
 ```
 # Thanks

 The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. Run the benchmark with:
 ```shell
+python benchmarks/table/table.py table_report.json --max_rows 1000
 ```
 # Thanks

benchmarks/table/scoring.py CHANGED Viewed

@@ -1,16 +1,12 @@
-'''
 TEDS Code Adapted from https://github.com/ibm-aur-nlp/EDD
-'''
-from typing import List
-from tqdm import tqdm
 import distance
 from apted import APTED, Config
 from apted.helpers import Tree
 from lxml import html
 from collections import deque
-import numpy as np
 def wrap_table_html(table_html:str)->str:
     return f'<html><body>{table_html}</body></html>'
@@ -21,7 +17,9 @@ class TableTree(Tree):
         self.colspan = colspan
         self.rowspan = rowspan
         self.content = content
-        self.children = list(children)
     def bracket(self):
         """Show tree using brackets notation"""
@@ -37,17 +35,12 @@ class TableTree(Tree):
 class CustomConfig(Config):
     @staticmethod
     def maximum(*sequences):
-        """Get maximum possible value
-        """
         return max(map(len, sequences))
     def normalized_distance(self, *sequences):
-        """Get distance from 0 to 1
-        """
         return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
     def rename(self, node1, node2):
-        """Compares attributes of trees"""
         if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
             return 1.
         if node1.tag == 'td':
@@ -56,8 +49,9 @@ class CustomConfig(Config):
         return 0.
 def tokenize(node):
-    ''' Tokenizes table cells
-    '''
     global __tokens__
     __tokens__.append('<%s>' % node.tag)
     if node.text is not None:
@@ -70,8 +64,9 @@ def tokenize(node):
             __tokens__ += list(node.tail)
 def tree_convert_html(node, convert_cell=False, parent=None):
-    ''' Converts HTML tree to the format required by apted
-    '''
     global __tokens__
     if node.tag == 'td':
         if convert_cell:
@@ -95,9 +90,9 @@ def tree_convert_html(node, convert_cell=False, parent=None):
         return new_node
 def similarity_eval_html(pred, true, structure_only=False):
-    ''' Computes TEDS score between the prediction and the ground truth of a
-        given samples
-    '''
     pred, true = html.fromstring(pred), html.fromstring(true)
     if pred.xpath('body/table') and true.xpath('body/table'):
         pred = pred.xpath('body/table')[0]

+"""
 TEDS Code Adapted from https://github.com/ibm-aur-nlp/EDD
+"""
 import distance
 from apted import APTED, Config
 from apted.helpers import Tree
 from lxml import html
 from collections import deque
 def wrap_table_html(table_html:str)->str:
     return f'<html><body>{table_html}</body></html>'
         self.colspan = colspan
         self.rowspan = rowspan
         self.content = content
+        # Sets self.name and self.children
+        super().__init__(tag, *children)
     def bracket(self):
         """Show tree using brackets notation"""
 class CustomConfig(Config):
     @staticmethod
     def maximum(*sequences):
         return max(map(len, sequences))
     def normalized_distance(self, *sequences):
         return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
     def rename(self, node1, node2):
         if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
             return 1.
         if node1.tag == 'td':
         return 0.
 def tokenize(node):
+    """
+    Tokenizes table cells
+    """
     global __tokens__
     __tokens__.append('<%s>' % node.tag)
     if node.text is not None:
             __tokens__ += list(node.tail)
 def tree_convert_html(node, convert_cell=False, parent=None):
+    """
+    Converts HTML tree to the format required by apted
+    """
     global __tokens__
     if node.tag == 'td':
         if convert_cell:
         return new_node
 def similarity_eval_html(pred, true, structure_only=False):
+    """
+    Computes TEDS score between the prediction and the ground truth of a given sample
+    """
     pred, true = html.fromstring(pred), html.fromstring(true)
     if pred.xpath('body/table') and true.xpath('body/table'):
         pred = pred.xpath('body/table')[0]

benchmarks/table/table.py CHANGED Viewed

@@ -1,5 +1,7 @@
-import base64
 import os
 import time
 import datasets
 from tqdm import tqdm
@@ -11,8 +13,6 @@ from bs4 import BeautifulSoup
 from concurrent.futures import ThreadPoolExecutor
 from pypdfium2._helpers.misc import PdfiumError
-os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # Transformers uses .isin for a simple op, which is not supported on MPS
 from marker.config.parser import ConfigParser
 from marker.converters.table import TableConverter
 from marker.models import create_model_dict
@@ -30,10 +30,10 @@ def update_teds_score(result):
 @click.command(help="Benchmark Table to HTML Conversion")
 @click.argument("out_file", type=str)
 @click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use")
-@click.option("--max", type=int, default=None, help="Maximum number of PDFs to process")
-def main(out_file, dataset, max):
     models = create_model_dict()
-    config_parser = ConfigParser({})
     start = time.time()
@@ -41,8 +41,8 @@ def main(out_file, dataset, max):
     dataset = dataset.shuffle(seed=0)
     iterations = len(dataset)
-    if max is not None:
-        iterations = min(max, len(dataset))
     results = []
     for i in tqdm(range(iterations), desc='Converting Tables'):
@@ -55,7 +55,7 @@ def main(out_file, dataset, max):
                 config=config_parser.generate_config_dict(),
                 artifact_dict=models,
                 processor_list=config_parser.get_processors(),
-                renderer='marker.renderers.html.HTMLRenderer'
             )
             with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:

 import os
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # Transformers uses .isin for a simple op, which is not supported on MPS
+import base64
 import time
 import datasets
 from tqdm import tqdm
 from concurrent.futures import ThreadPoolExecutor
 from pypdfium2._helpers.misc import PdfiumError
 from marker.config.parser import ConfigParser
 from marker.converters.table import TableConverter
 from marker.models import create_model_dict
 @click.command(help="Benchmark Table to HTML Conversion")
 @click.argument("out_file", type=str)
 @click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use")
+@click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process")
+def main(out_file: str, dataset: str, max_rows: int):
     models = create_model_dict()
+    config_parser = ConfigParser({'output_format': 'html'})
     start = time.time()
     dataset = dataset.shuffle(seed=0)
     iterations = len(dataset)
+    if max_rows is not None:
+        iterations = min(max_rows, len(dataset))
     results = []
     for i in tqdm(range(iterations), desc='Converting Tables'):
                 config=config_parser.generate_config_dict(),
                 artifact_dict=models,
                 processor_list=config_parser.get_processors(),
+                renderer=config_parser.get_renderer()
             )
             with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file: