Spaces:
Sleeping
Sleeping
malte.ostendorff@telekom.de
committed on
Commit
·
3c258f1
0
Parent(s):
init
Browse files- .gitignore +176 -0
- README.md +22 -0
- app.py +235 -0
- languages.py +188 -0
- packages.txt +1 -0
- requirements.txt +5 -0
.gitignore
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Created by https://www.toptal.com/developers/gitignore/api/python
|
| 2 |
+
# Edit at https://www.toptal.com/developers/gitignore?templates=python
|
| 3 |
+
|
| 4 |
+
### Python ###
|
| 5 |
+
# Byte-compiled / optimized / DLL files
|
| 6 |
+
__pycache__/
|
| 7 |
+
*.py[cod]
|
| 8 |
+
*$py.class
|
| 9 |
+
|
| 10 |
+
# C extensions
|
| 11 |
+
*.so
|
| 12 |
+
|
| 13 |
+
# Distribution / packaging
|
| 14 |
+
.Python
|
| 15 |
+
build/
|
| 16 |
+
develop-eggs/
|
| 17 |
+
dist/
|
| 18 |
+
downloads/
|
| 19 |
+
eggs/
|
| 20 |
+
.eggs/
|
| 21 |
+
lib/
|
| 22 |
+
lib64/
|
| 23 |
+
parts/
|
| 24 |
+
sdist/
|
| 25 |
+
var/
|
| 26 |
+
wheels/
|
| 27 |
+
share/python-wheels/
|
| 28 |
+
*.egg-info/
|
| 29 |
+
.installed.cfg
|
| 30 |
+
*.egg
|
| 31 |
+
MANIFEST
|
| 32 |
+
|
| 33 |
+
# PyInstaller
|
| 34 |
+
# Usually these files are written by a python script from a template
|
| 35 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 36 |
+
*.manifest
|
| 37 |
+
*.spec
|
| 38 |
+
|
| 39 |
+
# Installer logs
|
| 40 |
+
pip-log.txt
|
| 41 |
+
pip-delete-this-directory.txt
|
| 42 |
+
|
| 43 |
+
# Unit test / coverage reports
|
| 44 |
+
htmlcov/
|
| 45 |
+
.tox/
|
| 46 |
+
.nox/
|
| 47 |
+
.coverage
|
| 48 |
+
.coverage.*
|
| 49 |
+
.cache
|
| 50 |
+
nosetests.xml
|
| 51 |
+
coverage.xml
|
| 52 |
+
*.cover
|
| 53 |
+
*.py,cover
|
| 54 |
+
.hypothesis/
|
| 55 |
+
.pytest_cache/
|
| 56 |
+
cover/
|
| 57 |
+
|
| 58 |
+
# Translations
|
| 59 |
+
*.mo
|
| 60 |
+
*.pot
|
| 61 |
+
|
| 62 |
+
# Django stuff:
|
| 63 |
+
*.log
|
| 64 |
+
local_settings.py
|
| 65 |
+
db.sqlite3
|
| 66 |
+
db.sqlite3-journal
|
| 67 |
+
|
| 68 |
+
# Flask stuff:
|
| 69 |
+
instance/
|
| 70 |
+
.webassets-cache
|
| 71 |
+
|
| 72 |
+
# Scrapy stuff:
|
| 73 |
+
.scrapy
|
| 74 |
+
|
| 75 |
+
# Sphinx documentation
|
| 76 |
+
docs/_build/
|
| 77 |
+
|
| 78 |
+
# PyBuilder
|
| 79 |
+
.pybuilder/
|
| 80 |
+
target/
|
| 81 |
+
|
| 82 |
+
# Jupyter Notebook
|
| 83 |
+
.ipynb_checkpoints
|
| 84 |
+
|
| 85 |
+
# IPython
|
| 86 |
+
profile_default/
|
| 87 |
+
ipython_config.py
|
| 88 |
+
|
| 89 |
+
# pyenv
|
| 90 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 91 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 92 |
+
# .python-version
|
| 93 |
+
|
| 94 |
+
# pipenv
|
| 95 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 96 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 97 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 98 |
+
# install all needed dependencies.
|
| 99 |
+
#Pipfile.lock
|
| 100 |
+
|
| 101 |
+
# poetry
|
| 102 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 103 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 104 |
+
# commonly ignored for libraries.
|
| 105 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 106 |
+
#poetry.lock
|
| 107 |
+
|
| 108 |
+
# pdm
|
| 109 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 110 |
+
#pdm.lock
|
| 111 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
| 112 |
+
# in version control.
|
| 113 |
+
# https://pdm.fming.dev/#use-with-ide
|
| 114 |
+
.pdm.toml
|
| 115 |
+
|
| 116 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 117 |
+
__pypackages__/
|
| 118 |
+
|
| 119 |
+
# Celery stuff
|
| 120 |
+
celerybeat-schedule
|
| 121 |
+
celerybeat.pid
|
| 122 |
+
|
| 123 |
+
# SageMath parsed files
|
| 124 |
+
*.sage.py
|
| 125 |
+
|
| 126 |
+
# Environments
|
| 127 |
+
.env
|
| 128 |
+
.venv
|
| 129 |
+
env/
|
| 130 |
+
venv/
|
| 131 |
+
ENV/
|
| 132 |
+
env.bak/
|
| 133 |
+
venv.bak/
|
| 134 |
+
|
| 135 |
+
# Spyder project settings
|
| 136 |
+
.spyderproject
|
| 137 |
+
.spyproject
|
| 138 |
+
|
| 139 |
+
# Rope project settings
|
| 140 |
+
.ropeproject
|
| 141 |
+
|
| 142 |
+
# mkdocs documentation
|
| 143 |
+
/site
|
| 144 |
+
|
| 145 |
+
# mypy
|
| 146 |
+
.mypy_cache/
|
| 147 |
+
.dmypy.json
|
| 148 |
+
dmypy.json
|
| 149 |
+
|
| 150 |
+
# Pyre type checker
|
| 151 |
+
.pyre/
|
| 152 |
+
|
| 153 |
+
# pytype static type analyzer
|
| 154 |
+
.pytype/
|
| 155 |
+
|
| 156 |
+
# Cython debug symbols
|
| 157 |
+
cython_debug/
|
| 158 |
+
|
| 159 |
+
# PyCharm
|
| 160 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 161 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 162 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 163 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 164 |
+
#.idea/
|
| 165 |
+
|
| 166 |
+
### Python Patch ###
|
| 167 |
+
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
|
| 168 |
+
poetry.toml
|
| 169 |
+
|
| 170 |
+
# ruff
|
| 171 |
+
.ruff_cache/
|
| 172 |
+
|
| 173 |
+
# LSP config files
|
| 174 |
+
pyrightconfig.json
|
| 175 |
+
|
| 176 |
+
# End of https://www.toptal.com/developers/gitignore/api/python
|
README.md
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Seed Crawl Annotator
|
| 3 |
+
emoji: 🐨
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: yellow
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.6.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Annotate Web Languages
|
| 14 |
+
|
| 15 |
+
## Usage
|
| 16 |
+
|
| 17 |
+
```bash
|
| 18 |
+
# Run the Gradio app
|
| 19 |
+
gradio app.py # auto reload
|
| 20 |
+
python app.py # static
|
| 21 |
+
|
| 22 |
+
```
|
app.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import random
|
| 5 |
+
import time
|
| 6 |
+
import gradio as gr
|
| 7 |
+
from selenium import webdriver
|
| 8 |
+
from selenium.common.exceptions import WebDriverException
|
| 9 |
+
from PIL import Image
|
| 10 |
+
from io import BytesIO
|
| 11 |
+
import base64
|
| 12 |
+
|
| 13 |
+
import trafilatura
|
| 14 |
+
|
| 15 |
+
from huggingface_hub import whoami
|
| 16 |
+
|
| 17 |
+
from languages import ISO_CODE_TO_LANGUAGE_NAME
|
| 18 |
+
|
| 19 |
+
# Offline mode skips the real browser and returns dummy data.
# Environment variables are always strings, so a plain truthiness check would
# treat OFFLINE=0 or OFFLINE=false as "enabled"; explicit falsy spellings are
# therefore excluded, while any other non-empty value still enables the mode.
OFFLINE = os.environ.get("OFFLINE", "").strip().lower() not in {"", "0", "false", "no", "off"}
|
| 20 |
+
|
| 21 |
+
def pil_image_to_base64(image):
    """Serialize *image* as PNG and return it as a ``data:`` URI string.

    The image object only needs a ``save(file_obj, format=...)`` method. The
    returned value can be used directly as the ``src`` of an HTML ``<img>`` tag.
    """
    stream = BytesIO()
    image.save(stream, format="PNG")
    stream.seek(0)

    encoded_png = base64.b64encode(stream.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{encoded_png}"
|
| 33 |
+
|
| 34 |
+
def fetch_screenshot_and_text_from_url(url):
    """Render *url* in headless Chrome and return a screenshot plus its text.

    Args:
        url: Address of the web page to load.

    Returns:
        A ``(screenshot_html_str, text)`` tuple. ``screenshot_html_str`` is a
        scrollable ``<div>`` wrapping the screenshot as a base64 ``<img>``.
        ``text`` is the content extracted by trafilatura; it is an empty
        string when nothing could be extracted or the browser failed.

    When ``OFFLINE`` is set, no browser is started and a placeholder image and
    dummy text are returned instead.
    """
    screen_width = 1080
    height = 350
    text = ""

    if OFFLINE:
        screenshot = Image.new('RGB', (350, height))
        text = f"Some dummy text for {url} (offline mode enabled)"
    else:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')

        # Bind the name before the try block: if webdriver.Chrome() itself
        # raises, the finally clause must not hit an unbound variable.
        driver = None
        try:
            driver = webdriver.Chrome(options=options)
            driver.get(url)
            driver.implicitly_wait(10)

            # Fetch the HTML of the loaded page
            html_str = driver.page_source

            # Execute JS to find the full height of the rendered page
            scroll_height = driver.execute_script("return document.body.scrollHeight")

            # Resize the window to the full page height so the screenshot
            # covers the whole page
            driver.set_window_size(screen_width, max(scroll_height + 200, 900))

            raw_screenshot = driver.get_screenshot_as_png()
            screenshot = Image.open(BytesIO(raw_screenshot))

            # trafilatura.extract() returns None when no content is found;
            # normalize to "" so callers always receive a string.
            text = trafilatura.extract(html_str) or ""
        except WebDriverException:
            # Fall back to a 1x1 placeholder; text stays empty so callers can
            # detect the failure.
            screenshot = Image.new('RGB', (1, 1))
        finally:
            if driver is not None:
                driver.quit()

    # Embed the base64-encoded image as an <img> tag into the HTML string
    screenshot_html_str = f"""<div style="width: 100%; height: {height}px; overflow-y: scroll;"><img src="{pil_image_to_base64(screenshot)}" /></div>"""

    return screenshot_html_str, text
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
with gr.Blocks(fill_height=True) as demo:
|
| 90 |
+
|
| 91 |
+
gr.Markdown(
|
| 92 |
+
"""
|
| 93 |
+
# Seed Crawl Annotator
|
| 94 |
+
""")
|
| 95 |
+
|
| 96 |
+
profile_state = gr.State([])
|
| 97 |
+
gr.LoginButton()
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
with gr.Column(visible=False) as wrapper_col:
|
| 101 |
+
def handle_login(profile: gr.OAuthProfile | None) -> dict:
    """Show or hide the annotation UI depending on the login state.

    With a profile, the username is stored in ``profile_state`` and
    ``wrapper_col`` becomes visible. Without one, the state is cleared, the
    column is hidden and a warning is shown.
    """
    if not profile:
        gr.Warning(f"You need to login to use this app.")
        return {
            profile_state: None,
            wrapper_col: gr.update(visible=False),
        }

    gr.Info(f"Logged in as {profile.username}")
    return {
        profile_state: f"{profile.username}",
        wrapper_col: gr.update(visible=True),
    }
|
| 114 |
+
|
| 115 |
+
demo.load(handle_login, inputs=None, outputs=[profile_state, wrapper_col])
|
| 116 |
+
|
| 117 |
+
url_field = gr.Textbox(label="Website URL", placeholder="Enter a URL you want to annotate", interactive=True)
|
| 118 |
+
|
| 119 |
+
with gr.Row():
|
| 120 |
+
set_random_btn = gr.Button("Set Random URL", variant="secondary", interactive=True)
|
| 121 |
+
|
| 122 |
+
load_btn = gr.Button("Annotate URL", variant="primary", interactive=True)
|
| 123 |
+
|
| 124 |
+
with gr.Row():
|
| 125 |
+
# The placeholder names the button that triggers loading ("Annotate URL").
extracted_text = gr.Textbox(label="Extracted text", max_lines=15, lines=15, visible=False, placeholder="Click on `Annotate URL` to fetch Web page's text content.")
|
| 126 |
+
|
| 127 |
+
screenshot_scrollable = gr.HTML(visible=False)
|
| 128 |
+
|
| 129 |
+
with gr.Column(visible=False) as output_col:
|
| 130 |
+
with gr.Row():
|
| 131 |
+
language_codes = gr.Dropdown(
|
| 132 |
+
[("unknown", "unknown")] + [(f"{code}: {name}", code) for code, name in ISO_CODE_TO_LANGUAGE_NAME.items()],
|
| 133 |
+
label="Language codes",
|
| 134 |
+
multiselect=True,
|
| 135 |
+
# allow_custom_value=True,
|
| 136 |
+
)
|
| 137 |
+
categories = gr.CheckboxGroup(["News", "Culture/History", "Government", "Political Parties", "Other"], label="Categories")
|
| 138 |
+
|
| 139 |
+
with gr.Row():
|
| 140 |
+
do_crawl_btn = gr.Button("✅ Do Crawl", elem_classes="success")
|
| 141 |
+
dont_crawl_btn = gr.Button("❌ Don't Crawl", elem_classes="error")
|
| 142 |
+
# random_subpage_btn = gr.Button("🔁 Load Another Subpage", variant="secondary")
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def set_random_url():
    """Return one URL picked at random from a fixed list of sample sites."""
    sample_sites = (
        "http://example.com",
        "https://wikipedia.org/",
        "https://occiglot.eu",
        "https://ostendorff.org",
        "https://fr.wikipedia.org/",
        "https://amazon.com/",
    )
    return random.choice(sample_sites)
|
| 156 |
+
|
| 157 |
+
set_random_btn.click(fn=set_random_url, outputs=url_field)
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def load_url(url):
    """Fetch screenshot and text for *url* and reveal the annotation widgets.

    Args:
        url: Address entered in the URL text box.

    Returns:
        A dict of component updates that shows the screenshot, the extracted
        text and the annotation column, and resets the language and category
        selections.

    Raises:
        gr.Error: If no screenshot markup or no text could be obtained.
    """
    screenshot_html_str, text = fetch_screenshot_and_text_from_url(url)

    if not screenshot_html_str or not text:
        # gr.Error is an exception; it is only shown to the user when raised.
        raise gr.Error("Could not fetch data for url")

    return {
        screenshot_scrollable: gr.update(value=screenshot_html_str, visible=True),
        extracted_text: gr.update(value=text, visible=True),
        output_col: gr.update(visible=True),
        language_codes: "unknown",  # reset the selection for the new URL
        categories: gr.update(value=None),
    }
|
| 174 |
+
|
| 175 |
+
load_btn.click(fn=load_url, inputs=url_field, outputs=[screenshot_scrollable, extracted_text, output_col, language_codes, categories], api_name="load_url")
|
| 176 |
+
|
| 177 |
+
def do_crawl(profile_state, url, language_codes, categories, do_crawl=True):
    """Acknowledge an annotation and reset the form for the next URL.

    NOTE: nothing is persisted here yet; ``url``, ``language_codes``,
    ``categories`` and ``do_crawl`` are accepted but currently unused.

    Args:
        profile_state: Username stored at login; falsy when not logged in.
        url: The annotated URL.
        language_codes: Language codes selected by the annotator.
        categories: Categories selected by the annotator.
        do_crawl: Whether the annotator wants the site to be crawled.

    Returns:
        A dict of component updates that clears the URL field and hides the
        annotation column, the extracted text and the screenshot.

    Raises:
        gr.Error: If the user is not authenticated.
    """
    if not profile_state:
        # gr.Error must be raised to be shown; the form is left untouched so
        # the annotation is not lost.
        raise gr.Error("Feedback could not be saved")

    gr.Info("Thanks for your feedback")

    return {
        url_field: "",
        output_col: gr.update(visible=False),
        extracted_text: gr.update(value=None, visible=False),
        screenshot_scrollable: gr.update(value="", visible=False),
    }
|
| 192 |
+
|
| 193 |
+
# def do_crawl(profile_state, url, language_codes, categories):
|
| 194 |
+
# return do_crawl_or_not(profile_state, url, language_codes, categories, do_crawl=True)
|
| 195 |
+
|
| 196 |
+
# def dont_crawl(profile_state, url, language_codes, categories):
|
| 197 |
+
# return do_crawl_or_not(profile_state, url, language_codes, categories, do_crawl=False)
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
do_crawl_btn.click(
|
| 201 |
+
fn=do_crawl,
|
| 202 |
+
inputs=[profile_state, url_field, language_codes, categories],
|
| 203 |
+
outputs=[
|
| 204 |
+
url_field,
|
| 205 |
+
output_col,
|
| 206 |
+
extracted_text,
|
| 207 |
+
screenshot_scrollable
|
| 208 |
+
],
|
| 209 |
+
api_name="do_crawl",
|
| 210 |
+
)
|
| 211 |
+
# "Don't Crawl" reuses the do_crawl handler but passes do_crawl=False.
# It gets its own api_name, since reusing "do_crawl" collides with the
# listener registered for do_crawl_btn.
dont_crawl_btn.click(
    fn=lambda profile, url, codes, cats: do_crawl(profile, url, codes, cats, do_crawl=False),
    inputs=[profile_state, url_field, language_codes, categories],
    outputs=[
        url_field,
        output_col,
        extracted_text,
        screenshot_scrollable,
    ],
    api_name="dont_crawl",
)
|
| 222 |
+
|
| 223 |
+
# dont_crawl_btn.click(fn=dont_crawl, inputs=[profile_state, url, language_codes, categories], outputs=[url, output_col, extracted_text, screenshot_scrollable], api_name="dont_crawl")
|
| 224 |
+
|
| 225 |
+
# def random_subpage(url):
|
| 226 |
+
# new_url = "http://example.com"
|
| 227 |
+
|
| 228 |
+
# return [new_url, *fetch_screenshot_and_text_from_url(new_url)]
|
| 229 |
+
|
| 230 |
+
# random_subpage_btn.click(fn=random_subpage, inputs=url, outputs=[url, screenshot_scrollable, extracted_text, output_col], api_name="load_random_subpage")
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
if __name__ == "__main__":
|
| 235 |
+
demo.launch()
|
languages.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Taken from:
|
| 2 |
+
# https://gist.github.com/jrnk/8eb57b065ea0b098d571
|
| 3 |
+
# Maps two-letter language codes to English language names.
# Names holding several alternatives separate them with "; ".
ISO_CODE_TO_LANGUAGE_NAME = {
    "aa": "Afar",
    "ab": "Abkhazian",
    "ae": "Avestan",
    "af": "Afrikaans",
    "ak": "Akan",
    "am": "Amharic",
    "an": "Aragonese",
    "ar": "Arabic",
    "as": "Assamese",
    "av": "Avaric",
    "ay": "Aymara",
    "az": "Azerbaijani",
    "ba": "Bashkir",
    "be": "Belarusian",
    "bg": "Bulgarian",
    "bh": "Bihari languages",
    "bi": "Bislama",
    "bm": "Bambara",
    "bn": "Bengali",
    "bo": "Tibetan",
    "br": "Breton",
    "bs": "Bosnian",
    "ca": "Catalan; Valencian",
    "ce": "Chechen",
    "ch": "Chamorro",
    "co": "Corsican",
    "cr": "Cree",
    "cs": "Czech",
    "cu": "Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic",
    "cv": "Chuvash",
    "cy": "Welsh",
    "da": "Danish",
    "de": "German",
    "dv": "Divehi; Dhivehi; Maldivian",
    "dz": "Dzongkha",
    "ee": "Ewe",
    "el": "Greek, Modern (1453-)",
    "en": "English",
    "eo": "Esperanto",
    "es": "Spanish; Castilian",
    "et": "Estonian",
    "eu": "Basque",
    "fa": "Persian",
    "ff": "Fulah",
    "fi": "Finnish",
    "fj": "Fijian",
    "fo": "Faroese",
    "fr": "French",
    "fy": "Western Frisian",
    "ga": "Irish",
    "gd": "Gaelic; Scottish Gaelic",
    "gl": "Galician",
    "gn": "Guarani",
    "gu": "Gujarati",
    "gv": "Manx",
    "ha": "Hausa",
    "he": "Hebrew",
    "hi": "Hindi",
    "ho": "Hiri Motu",
    "hr": "Croatian",
    "ht": "Haitian; Haitian Creole",
    "hu": "Hungarian",
    "hy": "Armenian",
    "hz": "Herero",
    "ia": "Interlingua (International Auxiliary Language Association)",
    "id": "Indonesian",
    "ie": "Interlingue; Occidental",
    "ig": "Igbo",
    "ii": "Sichuan Yi; Nuosu",
    "ik": "Inupiaq",
    "io": "Ido",
    "is": "Icelandic",
    "it": "Italian",
    "iu": "Inuktitut",
    "ja": "Japanese",
    "jv": "Javanese",
    "ka": "Georgian",
    "kg": "Kongo",
    "ki": "Kikuyu; Gikuyu",
    "kj": "Kuanyama; Kwanyama",
    "kk": "Kazakh",
    "kl": "Kalaallisut; Greenlandic",
    "km": "Central Khmer",
    "kn": "Kannada",
    "ko": "Korean",
    "kr": "Kanuri",
    "ks": "Kashmiri",
    "ku": "Kurdish",
    "kv": "Komi",
    "kw": "Cornish",
    "ky": "Kirghiz; Kyrgyz",
    "la": "Latin",
    "lb": "Luxembourgish; Letzeburgesch",
    "lg": "Ganda",
    "li": "Limburgan; Limburger; Limburgish",
    "ln": "Lingala",
    "lo": "Lao",
    "lt": "Lithuanian",
    "lu": "Luba-Katanga",
    "lv": "Latvian",
    "mg": "Malagasy",
    "mh": "Marshallese",
    "mi": "Maori",
    "mk": "Macedonian",
    "ml": "Malayalam",
    "mn": "Mongolian",
    "mr": "Marathi",
    "ms": "Malay",
    "mt": "Maltese",
    "my": "Burmese",
    "na": "Nauru",
    "nb": "Bokmål, Norwegian; Norwegian Bokmål",
    "nd": "Ndebele, North; North Ndebele",
    "ne": "Nepali",
    "ng": "Ndonga",
    "nl": "Dutch; Flemish",
    "nn": "Norwegian Nynorsk; Nynorsk, Norwegian",
    "no": "Norwegian",
    "nr": "Ndebele, South; South Ndebele",
    "nv": "Navajo; Navaho",
    "ny": "Chichewa; Chewa; Nyanja",
    "oc": "Occitan (post 1500)",
    "oj": "Ojibwa",
    "om": "Oromo",
    "or": "Oriya",
    "os": "Ossetian; Ossetic",
    "pa": "Panjabi; Punjabi",
    "pi": "Pali",
    "pl": "Polish",
    "ps": "Pushto; Pashto",
    "pt": "Portuguese",
    "qu": "Quechua",
    "rm": "Romansh",
    "rn": "Rundi",
    "ro": "Romanian; Moldavian; Moldovan",
    "ru": "Russian",
    "rw": "Kinyarwanda",
    "sa": "Sanskrit",
    "sc": "Sardinian",
    "sd": "Sindhi",
    "se": "Northern Sami",
    "sg": "Sango",
    "si": "Sinhala; Sinhalese",
    "sk": "Slovak",
    "sl": "Slovenian",
    "sm": "Samoan",
    "sn": "Shona",
    "so": "Somali",
    "sq": "Albanian",
    "sr": "Serbian",
    "ss": "Swati",
    "st": "Sotho, Southern",
    "su": "Sundanese",
    "sv": "Swedish",
    "sw": "Swahili",
    "ta": "Tamil",
    "te": "Telugu",
    "tg": "Tajik",
    "th": "Thai",
    "ti": "Tigrinya",
    "tk": "Turkmen",
    "tl": "Tagalog",
    "tn": "Tswana",
    "to": "Tonga (Tonga Islands)",
    "tr": "Turkish",
    "ts": "Tsonga",
    "tt": "Tatar",
    "tw": "Twi",
    "ty": "Tahitian",
    "ug": "Uighur; Uyghur",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "uz": "Uzbek",
    "ve": "Venda",
    "vi": "Vietnamese",
    "vo": "Volapük",
    "wa": "Walloon",
    "wo": "Wolof",
    "xh": "Xhosa",
    "yi": "Yiddish",
    "yo": "Yoruba",
    "za": "Zhuang; Chuang",
    "zh": "Chinese",
    "zu": "Zulu",
}
|
packages.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
chromium-driver
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
selenium >=4.0.0, < 5.0.0
|
| 2 |
+
gradio>=3.40.1
|
| 3 |
+
Pillow>=8.3.1,<9.0
|
| 4 |
+
trafilatura
|
| 5 |
+
gradio[oauth]
|