import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os
import xml.etree.ElementTree as ET
import re
# Coptic alphabet helper
COPTIC_ALPHABET = {
    'Ⲁ': 'Alpha', 'Ⲃ': 'Beta', 'Ⲅ': 'Gamma', 'Ⲇ': 'Delta', 'Ⲉ': 'Epsilon', 'Ⲍ': 'Zeta',
    'Ⲏ': 'Eta', 'Ⲑ': 'Theta', 'Ⲓ': 'Iota', 'Ⲕ': 'Kappa', 'Ⲗ': 'Lambda', 'Ⲙ': 'Mu',
    'Ⲛ': 'Nu', 'Ⲝ': 'Xi', 'Ⲟ': 'Omicron', 'Ⲡ': 'Pi', 'Ⲣ': 'Rho', 'Ⲥ': 'Sigma',
    'Ⲧ': 'Tau', 'Ⲩ': 'Upsilon', 'Ⲫ': 'Phi', 'Ⲭ': 'Chi', 'Ⲯ': 'Psi', 'Ⲱ': 'Omega',
    'Ϣ': 'Shai', 'Ϥ': 'Fai', 'Ϧ': 'Khei', 'Ϩ': 'Hori', 'Ϫ': 'Gangia', 'Ϭ': 'Shima', 'Ϯ': 'Ti'
}
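# The 24 Greek-derived letters plus the 7 Demotic-derived letters (Shai..Ti),
# in uppercase form. The sidebar alphabet reference iterates over this dict;
# the virtual keyboard further below uses the lowercase forms.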
# Coptic linguistic prompts
COPTIC_PROMPTS = {
    'dialect_analysis': "Analyze the Coptic dialect of this text and identify linguistic features:",
    'translation': "Translate this Coptic text to English, preserving theological and cultural context:",
    'transcription': "Provide a romanized transcription of this Coptic text:",
    'morphology': "Analyze the morphological structure of these Coptic words:",
    'lexicon_lookup': "Look up these Coptic words in the lexicon and provide Greek etymologies:"
}
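# These instructions are prepended to the user's chat message when a Coptic
# language is selected and an analysis type is chosen in the sidebar.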
# Lexicon loader
@st.cache_data
def load_coptic_lexicon(file_path=None):
    """Load a Coptic lexicon from various formats, including TEI XML."""
    if not file_path or not os.path.exists(file_path):
        return {}
    lexicon = {}
    try:
        # Handle XML format (TEI structure for the Comprehensive Coptic Lexicon)
        if file_path.endswith('.xml'):
            tree = ET.parse(file_path)
            root = tree.getroot()
            # Handle TEI namespace
            ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
            # Find entries in TEI format
            entries = root.findall('.//tei:entry', ns)
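            # Each <entry> is expected to carry a lemma form and one or more
            # senses, roughly (simplified sketch; details vary by lexicon release):
            #   <entry>
            #     <form type="lemma"><orth>ⲛⲟⲩⲧⲉ</orth></form>
            #     <sense><def>god, God</def></sense>
            #   </entry>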
            for entry in entries[:100]:  # Limit to first 100 entries for performance
                coptic_word = ""
                definition = ""
                # Extract Coptic headword from TEI structure
                form = entry.find('.//tei:form[@type="lemma"]', ns)
                if form is None:
                    form = entry.find('.//tei:form', ns)
                if form is not None:
                    orth = form.find('.//tei:orth', ns)
                    if orth is not None and orth.text:
                        coptic_word = orth.text.strip()
                # Extract definition from sense elements
                senses = entry.findall('.//tei:sense', ns)
                definitions = []
                for sense in senses[:2]:  # Limit to first 2 senses
                    def_elem = sense.find('.//tei:def', ns)
                    if def_elem is not None and def_elem.text:
                        definitions.append(def_elem.text.strip())
                if definitions:
                    definition = "; ".join(definitions)
                # Clean and store
                if coptic_word and definition:
                    # Clean Coptic word (preserve Coptic and Greek Unicode ranges)
                    coptic_word = re.sub(r'[^\u2C80-\u2CFF\u03B0-\u03FF\u1F00-\u1FFF\w\s\-]', '', coptic_word).strip()
                    if coptic_word:
                        lexicon[coptic_word] = definition[:200]  # Limit definition length
        # Handle plain-text formats
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
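                # Expected format (assumed): one entry per line, headword and
                # definition separated by a tab, pipe, comma, or semicolon,
                # e.g. "ⲛⲟⲩⲧⲉ<TAB>god, God"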
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    # Support multiple separators
                    separator = None
                    for sep in ['\t', '|', ',', ';']:
                        if sep in line:
                            separator = sep
                            break
                    if separator:
                        parts = line.split(separator, 1)
                        if len(parts) >= 2:
                            coptic_word = parts[0].strip()
                            definition = parts[1].strip()
                            lexicon[coptic_word] = definition
    except Exception as e:
        st.error(f"Error loading lexicon: {str(e)}")
    return lexicon
# Language detection and UI
LANGUAGES = {
    'en': 'English', 'es': 'Español', 'fr': 'Français', 'de': 'Deutsch',
    'zh': '中文', 'ja': '日本語', 'ar': 'العربية', 'hi': 'हिन्दी',
    'cop': 'Coptic (ⲘⲉⲧⲢⲉⲙ̀ⲛⲭⲏⲙⲓ)', 'cop-sa': 'Sahidic Coptic', 'cop-bo': 'Bohairic Coptic'
}
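# 'cop' is the ISO 639-2/639-3 code for Coptic; 'cop-sa' and 'cop-bo' are
# ad-hoc tags used here for the Sahidic and Bohairic dialects.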
st.set_page_config(page_title="Apertus Chat", layout="wide")
# Language selector
selected_lang = st.selectbox("Language / Langue / Idioma",
                             options=list(LANGUAGES.keys()),
                             format_func=lambda x: LANGUAGES[x])
# Sidebar for Coptic tools
with st.sidebar:
    st.header("Coptic Tools")
    # Lexicon file uploader
    lexicon_file = st.file_uploader("Upload Coptic Lexicon",
                                    type=['txt', 'tsv', 'csv', 'xml'],
                                    help="Supports: text (tab/pipe/comma/semicolon separated), CSV, TEI XML (Comprehensive Coptic Lexicon)")
    # Load lexicon
    if lexicon_file:
        # Save the uploaded file temporarily, keeping its extension so that
        # load_coptic_lexicon can tell XML input apart from plain text
        temp_path = f"temp_lexicon{os.path.splitext(lexicon_file.name)[1]}"
        with open(temp_path, "wb") as f:
            f.write(lexicon_file.getbuffer())
        coptic_lexicon = load_coptic_lexicon(temp_path)
        st.success(f"Loaded {len(coptic_lexicon)} lexicon entries")
    else:
        # Try to load the comprehensive lexicon if available
        comprehensive_lexicon_path = "Comprehensive_Coptic_Lexicon-v1.2-2020.xml"
        if os.path.exists(comprehensive_lexicon_path):
            coptic_lexicon = load_coptic_lexicon(comprehensive_lexicon_path)
            if coptic_lexicon:
                st.info(f"Loaded Comprehensive Coptic Lexicon: {len(coptic_lexicon)} entries")
            else:
                coptic_lexicon = {}
        else:
            coptic_lexicon = {}
    # Coptic alphabet reference
    with st.expander("Coptic Alphabet"):
        for letter, name in COPTIC_ALPHABET.items():
            st.text(f"{letter} - {name}")
    # Lexicon search
    if coptic_lexicon:
        st.subheader("Lexicon Search")
        # Virtual Coptic keyboard
        st.write("**Virtual Keyboard:**")
        coptic_letters = ['ⲁ', 'ⲃ', 'ⲅ', 'ⲇ', 'ⲉ', 'ⲍ', 'ⲏ', 'ⲑ', 'ⲓ', 'ⲕ', 'ⲗ', 'ⲙ', 'ⲛ', 'ⲝ', 'ⲟ', 'ⲡ', 'ⲣ', 'ⲥ', 'ⲧ', 'ⲩ', 'ⲫ', 'ⲭ', 'ⲯ', 'ⲱ', 'ϣ', 'ϥ', 'ϧ', 'ϩ', 'ϫ', 'ϭ', 'ϯ']
        # Create keyboard layout in rows of 8
        cols1 = st.columns(8)
        cols2 = st.columns(8)
        cols3 = st.columns(8)
        cols4 = st.columns(8)
        keyboard_input = ""
        keyboard_rows = [cols1, cols2, cols3, cols4]
        for i, letter in enumerate(coptic_letters):
            # 8 letters per row; clicking a button pre-fills the search box
            if keyboard_rows[i // 8][i % 8].button(letter, key=f"key_{letter}"):
                keyboard_input = letter
        # Search input
        search_term = st.text_input("Search Coptic word:", value=keyboard_input if keyboard_input else "")
        if search_term:
            if search_term in coptic_lexicon:
                st.write(f"**{search_term}**")
                st.write(coptic_lexicon[search_term])
            else:
                # Partial matches
                matches = [k for k in coptic_lexicon.keys() if search_term in k]
                if matches:
                    st.write("Partial matches:")
                    for match in matches[:5]:  # Show first 5 matches
                        st.write(f"**{match}** → {coptic_lexicon[match][:100]}...")
                else:
                    st.write("No matches found")
    # Linguistic analysis options
    if selected_lang in ['cop', 'cop-sa', 'cop-bo']:
        st.subheader("Analysis Type")
        analysis_type = st.selectbox("Choose analysis:",
                                     options=list(COPTIC_PROMPTS.keys()),
                                     format_func=lambda x: x.replace('_', ' ').title())
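# NOTE: analysis_type is only defined when a Coptic language is selected above;
# the chat handler below checks for it before prefixing the prompt.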
# Load model (cached)
@st.cache_resource
def load_model():
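    # Assumes a local copy of the Apertus 8B weights at the path below;
    # adjust to your own environment (or a Hugging Face Hub model ID).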
    model_path = "/home/aldn/Téléchargements/Apertus8B"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)
    return tokenizer, model
tokenizer, model = load_model()
# Chat interface
if "messages" not in st.session_state:
    st.session_state.messages = []
# Display chat history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])
# User input
if prompt := st.chat_input("Type your message..."):
    # Add Coptic-specific prompt prefix if applicable
    if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and 'analysis_type' in locals():
        full_prompt = f"{COPTIC_PROMPTS[analysis_type]} {prompt}"
        # Add lexicon context for lexicon lookup
        if analysis_type == 'lexicon_lookup' and coptic_lexicon:
            words_in_prompt = prompt.split()
            lexicon_matches = []
            for word in words_in_prompt:
                if word in coptic_lexicon:
                    lexicon_matches.append(f"{word} = {coptic_lexicon[word]}")
            if lexicon_matches:
                full_prompt += f"\n\nLexicon entries found: {'; '.join(lexicon_matches)}"
    else:
        full_prompt = prompt
    st.session_state.messages.append({"role": "user", "content": full_prompt})
    with st.chat_message("user"):
        st.markdown(full_prompt)
    # Generate response
    with st.chat_message("assistant"):
        messages = [{"role": "user", "content": full_prompt}]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer([text], return_tensors="pt")
        with torch.no_grad():
            # do_sample=True so that temperature/top_p actually take effect
            outputs = model.generate(**inputs, max_new_tokens=512, do_sample=True, temperature=0.8, top_p=0.9)
        # Decode only the newly generated tokens, skipping the prompt
        response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
        st.markdown(response)
        st.session_state.messages.append({"role": "assistant", "content": response})