File size: 478 Bytes
2c69783
1ecee7a
 
19a4543
 
 
70bf91d
632e817
0c603b0
70bf91d
19a4543
 
 
 
 
 
 
70bf91d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import atexit
import os
import tempfile

import datasets

from marker.providers.pdf import PdfProvider


def setup_pdf_provider(
    filename='adversarial.pdf',
    config=None,
) -> PdfProvider:
    """Build a ``PdfProvider`` for one PDF from the ``datalab-to/pdfs`` dataset.

    The ``train`` split of the dataset is loaded, the row whose ``filename``
    column equals *filename* is located, and its ``pdf`` payload is written
    to a temporary ``.pdf`` file whose path is handed to ``PdfProvider``.

    Args:
        filename: Value to look up in the dataset's ``filename`` column.
        config: Passed through unchanged as the second positional argument
            of ``PdfProvider``.

    Returns:
        A ``PdfProvider`` constructed from the temporary file's path.

    Raises:
        ValueError: If *filename* is not present in the dataset.
    """
    dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
    try:
        idx = dataset['filename'].index(filename)
    except ValueError as err:
        raise ValueError(
            f"{filename!r} not found in the 'datalab-to/pdfs' dataset"
        ) from err

    # Index the row first so that only this one PDF payload is fetched,
    # rather than pulling the entire 'pdf' column and then indexing into it.
    pdf_bytes = dataset[idx]['pdf']

    # delete=False: the provider only receives the *path*, so the file must
    # outlive the handle. With the default delete=True the file vanishes as
    # soon as the handle is garbage-collected after this function returns,
    # and on Windows it cannot even be reopened by name while still open.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
        temp_pdf.write(pdf_bytes)
    temp_path = temp_pdf.name

    def _remove_temp_pdf(path=temp_path):
        # Best-effort cleanup at interpreter exit; the file may already be
        # gone or (on some platforms) still held open by the provider.
        try:
            os.unlink(path)
        except OSError:
            pass

    # Registered before constructing the provider so the file is cleaned up
    # even if PdfProvider raises.
    atexit.register(_remove_temp_pdf)

    return PdfProvider(temp_path, config)