""" |
|
|
HuggingFace Space for fine-tuning megalaa Coptic translation model |
|
|
|
|
|
This Gradio app provides a user-friendly interface for training the |
|
|
megalaa/coptic-english-translator model on your CopticScriptorium corpus. |
|
|
""" |

import gradio as gr
import subprocess
import threading
import time

# Shared state: written by the background training thread, polled by the UI.
training_status = {
    "running": False,
    "log": [],
    "completed": False,
    "error": None,
}


def train_model(train_file, val_file, num_epochs, batch_size, learning_rate, hf_token, model_repo_name):
    """Run model training on the uploaded data files."""
    global training_status

    training_status = {
        "running": True,
        "log": ["🚀 Starting training setup...\n"],
        "completed": False,
        "error": None,
    }

    try:
        train_path = "train.jsonl"
        val_path = "val.jsonl"

        # The gr.File inputs below are configured with type="binary", so
        # train_file and val_file arrive here as raw bytes.
        with open(train_path, "wb") as f:
            f.write(train_file)
        with open(val_path, "wb") as f:
            f.write(val_file)

        training_status["log"].append(f"✓ Training data saved: {train_path}\n")
        training_status["log"].append(f"✓ Validation data saved: {val_path}\n")

        script_content = f'''#!/usr/bin/env python3
import logging

import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
from huggingface_hub import HfApi, login
from evaluate import load

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# HuggingFace Hub configuration (the token is interpolated into this generated
# script, which lives only on the Space's ephemeral filesystem).
HF_TOKEN = "{hf_token}"
MODEL_REPO_NAME = "{model_repo_name}"

if HF_TOKEN:
    login(token=HF_TOKEN)
    logger.info("✓ Logged in to HuggingFace Hub")

# Greekification for megalaa models: the base model expects Coptic text mapped
# onto Greek characters, so inputs are converted before tokenization.
COPTIC_TO_GREEK = {{
    "ⲁ": "α", "ⲃ": "β", "ⲅ": "γ", "ⲇ": "δ", "ⲉ": "ε", "ⲋ": "ϛ",
    "ⲍ": "ζ", "ⲏ": "η", "ⲑ": "θ", "ⲓ": "ι", "ⲕ": "κ", "ⲗ": "λ",
    "ⲙ": "μ", "ⲛ": "ν", "ⲝ": "ξ", "ⲟ": "ο", "ⲡ": "π", "ⲣ": "ρ",
    "ⲥ": "σ", "ⲧ": "τ", "ⲩ": "υ", "ⲫ": "φ", "ⲭ": "χ", "ⲯ": "ψ",
    "ⲱ": "ω", "ϣ": "s", "ϥ": "f", "ϧ": "k", "ϩ": "h", "ϫ": "j",
    "ϭ": "c", "ϯ": "t",
}}


def greekify(text):
    if not text:
        return ""
    return "".join(COPTIC_TO_GREEK.get(c.lower(), c.lower()) for c in text)


def extract_parallel_texts(examples):
    coptic_texts = []
    english_texts = []

    for messages in examples['messages']:
        coptic_text = None
        english_text = None

        for msg in messages:
            if msg['role'] == 'user' and 'Coptic text to English:' in msg['content']:
                coptic_text = msg['content'].split('Coptic text to English:')[-1].strip()
            elif msg['role'] == 'assistant':
                english_text = msg['content']

        coptic_texts.append(coptic_text)
        english_texts.append(english_text)

    return {{'coptic': coptic_texts, 'english': english_texts}}


def preprocess_function(examples, tokenizer, max_length=256):
    greekified_coptic = [greekify(text.lower()) if text else "" for text in examples["coptic"]]

    model_inputs = tokenizer(
        greekified_coptic,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    labels = tokenizer(
        text_target=examples["english"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    # Mask pad tokens in the labels with -100 so the cross-entropy loss
    # ignores padded positions.
    labels["input_ids"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels_example]
        for labels_example in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def compute_metrics(eval_preds, tokenizer, metric):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    # Restore pad tokens where the trainer inserted -100 before decoding.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # sacreBLEU expects a list of references per prediction.
    decoded_labels = [[label] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {{"bleu": result["score"]}}


# Configuration
model_name = "megalaa/coptic-english-translator"
output_dir = "coptic_megalaa_finetuned"
num_epochs = {num_epochs}
batch_size = {batch_size}
learning_rate = {learning_rate}

logger.info("=" * 60)
logger.info("MEGALAA FINE-TUNING ON HUGGINGFACE SPACES")
logger.info("=" * 60)
logger.info(f"Base model: {{model_name}}")
logger.info(f"Epochs: {{num_epochs}}")
logger.info(f"Batch size: {{batch_size}}")
logger.info(f"Learning rate: {{learning_rate}}")

# Check GPU
if torch.cuda.is_available():
    logger.info(f"GPU: {{torch.cuda.get_device_name(0)}}")
    logger.info(f"GPU Memory: {{torch.cuda.get_device_properties(0).total_memory / (1024**3):.1f}} GB")
else:
    logger.warning("No GPU detected! Training will be very slow on CPU.")

# Load model
logger.info("\\nLoading model...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Load datasets
logger.info("Loading datasets...")
train_dataset = load_dataset('json', data_files='{train_path}', split='train')
val_dataset = load_dataset('json', data_files='{val_path}', split='train')

logger.info(f"Train samples: {{len(train_dataset):,}}")
logger.info(f"Validation samples: {{len(val_dataset):,}}")

# Extract parallel texts, then tokenize
logger.info("Processing datasets...")
train_dataset = train_dataset.map(extract_parallel_texts, batched=True, remove_columns=['messages'])
val_dataset = val_dataset.map(extract_parallel_texts, batched=True, remove_columns=['messages'])
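
# Defensive guard (assumes malformed rows can simply be dropped): discard pairs
# where either side could not be extracted, since extract_parallel_texts
# returns None for those and the tokenizer cannot handle None targets.
train_dataset = train_dataset.filter(lambda ex: ex['coptic'] and ex['english'])
val_dataset = val_dataset.filter(lambda ex: ex['coptic'] and ex['english'])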

tokenized_train = train_dataset.map(
    lambda examples: preprocess_function(examples, tokenizer),
    batched=True,
    remove_columns=['coptic', 'english'],
)
tokenized_val = val_dataset.map(
    lambda examples: preprocess_function(examples, tokenizer),
    batched=True,
    remove_columns=['coptic', 'english'],
)

# Setup training
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)
metric = load("sacrebleu")
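
# Note: preprocess_function already pads everything to max_length, so the
# collator's dynamic padding is effectively a no-op here; it is kept as a
# safeguard in case padding="max_length" is ever relaxed.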

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,
    learning_rate=learning_rate,
    warmup_steps=500,
    max_grad_norm=1.0,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=50,
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    predict_with_generate=True,
    generation_max_length=256,
    generation_num_beams=5,
    fp16=torch.cuda.is_available(),
    report_to="tensorboard",
    logging_dir=f"{{output_dir}}/logs",
    push_to_hub=False,
)
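
# With gradient_accumulation_steps=2, the effective batch size is
# 2 * per_device_train_batch_size (e.g. 16 for the UI default of 8).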

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=lambda eval_preds: compute_metrics(eval_preds, tokenizer, metric),
)

logger.info("\\nSTARTING TRAINING")
logger.info("=" * 60)

# Train
trainer.train()

# Save the best model locally
logger.info("\\nSaving final model...")
trainer.save_model(f"{{output_dir}}/final")
tokenizer.save_pretrained(f"{{output_dir}}/final")

# Push to HuggingFace Hub
if HF_TOKEN and MODEL_REPO_NAME:
    logger.info(f"\\nPushing model to HuggingFace Hub: {{MODEL_REPO_NAME}}")
    try:
        api = HfApi()
        api.create_repo(repo_id=MODEL_REPO_NAME, repo_type="model", exist_ok=True)

        # Upload all files from the final checkpoint
        api.upload_folder(
            folder_path=f"{{output_dir}}/final",
            repo_id=MODEL_REPO_NAME,
            repo_type="model",
        )
        logger.info(f"✅ Model successfully pushed to: https://huggingface.co/{{MODEL_REPO_NAME}}")
    except Exception as e:
        logger.error(f"❌ Failed to push to Hub: {{e}}")

# Final evaluation
logger.info("\\nFinal evaluation...")
eval_results = trainer.evaluate()

logger.info("\\n" + "=" * 60)
logger.info("TRAINING COMPLETE!")
logger.info("=" * 60)
for key, value in eval_results.items():
    logger.info(f"{{key}}: {{value}}")

logger.info(f"\\n✅ Model saved locally to: {{output_dir}}/final")
if HF_TOKEN and MODEL_REPO_NAME:
    logger.info(f"✅ Model available at: https://huggingface.co/{{MODEL_REPO_NAME}}")
'''

        # Write the generated script to disk; it runs in a separate process so
        # the Gradio app stays responsive while training.
        with open("train_script.py", "w") as f:
            f.write(script_content)

        training_status["log"].append("✓ Training script created\n")
        training_status["log"].append("🚀 Starting training...\n\n")

        # Stream the subprocess's combined stdout/stderr into the shared log,
        # line by line, so the UI can poll it.
        process = subprocess.Popen(
            ["python", "train_script.py"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
        )

        for line in process.stdout:
            training_status["log"].append(line)
            time.sleep(0.01)

        process.wait()

        if process.returncode == 0:
            training_status["completed"] = True
            training_status["log"].append("\n\n✅ TRAINING COMPLETED SUCCESSFULLY!\n")
            training_status["log"].append("📦 Model saved locally to: coptic_megalaa_finetuned/final\n")
            if hf_token and model_repo_name:
                training_status["log"].append(f"📦 Model pushed to: https://huggingface.co/{model_repo_name}\n")
        else:
            training_status["error"] = f"Training failed with exit code {process.returncode}"
            training_status["log"].append(f"\n\n❌ Training failed with exit code {process.returncode}\n")

    except Exception as e:
        training_status["error"] = str(e)
        training_status["log"].append(f"\n\n❌ Error: {e}\n")

    finally:
        training_status["running"] = False


def start_training(train_file, val_file, num_epochs, batch_size, learning_rate, hf_token, model_repo_name):
    """Start training in a background thread."""
    if training_status["running"]:
        return "⚠️ Training already in progress!"

    if not hf_token or not model_repo_name:
        return "⚠️ Please provide both HuggingFace Token and Model Repository Name!"

    thread = threading.Thread(
        target=train_model,
        args=(train_file, val_file, num_epochs, batch_size, learning_rate, hf_token, model_repo_name),
    )
    thread.daemon = True
    thread.start()

    return "🚀 Training started! Monitor progress in the logs below."


def get_training_log():
    """Return the current training log as a single string."""
    return "".join(training_status["log"])


def check_status():
    """Return a human-readable training status."""
    if training_status["completed"]:
        return "✅ Training completed!"
    elif training_status["error"]:
        return f"❌ Error: {training_status['error']}"
    elif training_status["running"]:
        return "🔄 Training in progress..."
    else:
        return "⏸️ Ready to train"


with gr.Blocks(title="Megalaa Coptic Fine-tuning") as demo:
    gr.Markdown("""
    # 🏛️ Megalaa Coptic Translation Fine-tuning

    Fine-tune the megalaa/coptic-english-translator model on your CopticScriptorium corpus.

    **⚙️ IMPORTANT:** Make sure this Space is running on a **T4 Small GPU** for optimal performance!
    """)

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🔑 HuggingFace Hub Configuration")
            hf_token_input = gr.Textbox(
                label="HuggingFace Token",
                placeholder="hf_...",
                type="password",
                info="Get your token from https://huggingface.co/settings/tokens",
            )
            model_repo_input = gr.Textbox(
                label="Model Repository Name",
                placeholder="username/coptic-megalaa-finetuned",
                info="Example: john-doe/coptic-megalaa-finetuned",
            )

            gr.Markdown("### 📤 Upload Training Data")
            # type="binary" makes Gradio pass the uploaded file's bytes to the
            # handler, matching the binary writes in train_model.
            train_file_upload = gr.File(
                label="Training Data (train.jsonl)",
                file_types=[".jsonl"],
                type="binary",
            )
            val_file_upload = gr.File(
                label="Validation Data (val.jsonl)",
                file_types=[".jsonl"],
                type="binary",
            )

            gr.Markdown("### ⚙️ Training Parameters")
            num_epochs = gr.Slider(
                minimum=1,
                maximum=10,
                value=5,
                step=1,
                label="Number of Epochs",
            )
            batch_size = gr.Slider(
                minimum=4,
                maximum=16,
                value=8,
                step=4,
                label="Batch Size",
            )
            learning_rate = gr.Number(
                value=2e-5,
                label="Learning Rate",
            )

            start_btn = gr.Button("🚀 Start Training", variant="primary", size="lg")
            status_text = gr.Textbox(label="Status", value="⏸️ Ready to train")

        with gr.Column():
            gr.Markdown("### 📊 Training Log")
            log_output = gr.Textbox(
                label="Real-time Training Log",
                lines=30,
                max_lines=30,
                autoscroll=True,
            )

    start_btn.click(
        fn=start_training,
        inputs=[train_file_upload, val_file_upload, num_epochs, batch_size, learning_rate, hf_token_input, model_repo_input],
        outputs=status_text,
    )

    # Poll the background thread every 2 seconds to refresh the log and status.
    demo.load(fn=get_training_log, outputs=log_output, every=2)
    demo.load(fn=check_status, outputs=status_text, every=2)

    gr.Markdown("""
    ---
    ### 📥 After Training

    When training completes, your fine-tuned model is automatically pushed to the HuggingFace Hub!

    **Next steps:**
    1. Visit your model repository at `https://huggingface.co/YOUR_USERNAME/MODEL_NAME`
    2. Download and test it with: `python evaluate_megalaa_model.py`
    3. Integrate it into your Coptic translation interface
    4. Share your model with the community!

    **Estimated training time:** 6-8 hours on a T4 GPU

    **Note:** The model is also saved temporarily to `coptic_megalaa_finetuned/final/` during training,
    but this local copy is lost when the Space restarts. Use the HuggingFace Hub version!
    """)


if __name__ == "__main__":
    demo.launch()