#!/usr/bin/env python3
"""
HuggingFace Space for fine-tuning the megalaa Coptic translation model.

This Gradio app provides a user-friendly interface for training the
megalaa/coptic-english-translator model on your CopticScriptorium corpus.
"""
import gradio as gr
import os
import shutil
import subprocess
import threading
import time
from pathlib import Path
# Global variable to track training status
training_status = {
    "running": False,
    "log": [],
    "completed": False,
    "error": None,
}
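# The background training thread mutates this dict; the Gradio polling callbacks read it.
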
def train_model(train_file, val_file, num_epochs, batch_size, learning_rate, hf_token, model_repo_name):
    """
    Start model training with the uploaded data files.
    """
    global training_status
    # Reset status
    training_status = {
        "running": True,
        "log": ["🚀 Starting training setup...\n"],
        "completed": False,
        "error": None,
    }
    try:
        # Copy the uploaded files into the working directory
        # (gr.File with type="filepath" hands us temp-file paths, not raw bytes)
        train_path = "train.jsonl"
        val_path = "val.jsonl"
        shutil.copy(train_file, train_path)
        shutil.copy(val_file, val_path)
        training_status["log"].append(f"✓ Training data saved: {train_path}\n")
        training_status["log"].append(f"✓ Validation data saved: {val_path}\n")
        # Create the training script. Note: the HF token is interpolated directly
        # into this file on disk, so it is visible to anyone with access to the Space.
        script_content = f'''#!/usr/bin/env python3
import os
import json
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
from huggingface_hub import HfApi, login
from evaluate import load
import numpy as np
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# HuggingFace Hub configuration
HF_TOKEN = "{hf_token}"
MODEL_REPO_NAME = "{model_repo_name}"
if HF_TOKEN:
    login(token=HF_TOKEN)
    logger.info("✓ Logged in to HuggingFace Hub")
# Greekification mapping for megalaa models
COPTIC_TO_GREEK = {{
    "ⲁ": "α", "ⲃ": "β", "ⲅ": "γ", "ⲇ": "δ", "ⲉ": "ε", "ⲋ": "ϛ",
    "ⲍ": "ζ", "ⲏ": "η", "ⲑ": "θ", "ⲓ": "ι", "ⲕ": "κ", "ⲗ": "λ",
    "ⲙ": "μ", "ⲛ": "ν", "ⲝ": "ξ", "ⲟ": "ο", "ⲡ": "π", "ⲣ": "ρ",
    "ⲥ": "σ", "ⲧ": "τ", "ⲩ": "υ", "ⲫ": "φ", "ⲭ": "χ", "ⲯ": "ψ",
    "ⲱ": "ω", "ϣ": "s", "ϥ": "f", "ϧ": "k", "ϩ": "h", "ϫ": "j",
    "ϭ": "c", "ϯ": "t",
}}
def greekify(text):
    if not text:
        return ""
    return "".join(COPTIC_TO_GREEK.get(c.lower(), c.lower()) for c in text)
def extract_parallel_texts(examples):
    coptic_texts = []
    english_texts = []
    for messages in examples['messages']:
        coptic_text = None
        english_text = None
        for msg in messages:
            if msg['role'] == 'user' and 'Coptic text to English:' in msg['content']:
                coptic_text = msg['content'].split('Coptic text to English:')[-1].strip()
            elif msg['role'] == 'assistant':
                english_text = msg['content']
        coptic_texts.append(coptic_text)
        english_texts.append(english_text)
    return {{'coptic': coptic_texts, 'english': english_texts}}
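# Expected JSONL record shape, inferred from extract_parallel_texts above (the exact
# user-prompt wording beyond the "Coptic text to English:" marker is an assumption):
# {{"messages": [{{"role": "user", "content": "Translate this Coptic text to English: ..."}},
#                {{"role": "assistant", "content": "..."}}]}}
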
def preprocess_function(examples, tokenizer, max_length=256):
    # Lowercase and greekify the Coptic source text, as the megalaa base model expects
    greekified_coptic = [greekify(text.lower()) if text else "" for text in examples["coptic"]]
    model_inputs = tokenizer(
        greekified_coptic,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )
    labels = tokenizer(
        text_target=examples["english"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )
    # Replace padding token ids in the labels with -100 so the loss ignores them
    labels["input_ids"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels_example]
        for labels_example in labels["input_ids"]
    ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
def compute_metrics(eval_preds, tokenizer, metric):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # Swap the -100 padding back to real pad tokens before decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # sacreBLEU expects a list of reference lists
    decoded_labels = [[label] for label in decoded_labels]
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {{"bleu": result["score"]}}
# Configuration
model_name = "megalaa/coptic-english-translator"
output_dir = "coptic_megalaa_finetuned"
num_epochs = {num_epochs}
batch_size = int({batch_size})  # Gradio sliders may deliver floats
learning_rate = {learning_rate}
logger.info("="*60)
logger.info("MEGALAA FINE-TUNING ON HUGGINGFACE SPACES")
logger.info("="*60)
logger.info(f"Base model: {{model_name}}")
logger.info(f"Epochs: {{num_epochs}}")
logger.info(f"Batch size: {{batch_size}}")
logger.info(f"Learning rate: {{learning_rate}}")
# Check GPU
if torch.cuda.is_available():
    logger.info(f"GPU: {{torch.cuda.get_device_name(0)}}")
    logger.info(f"GPU Memory: {{torch.cuda.get_device_properties(0).total_memory / (1024**3):.1f}} GB")
else:
    logger.warning("No GPU detected!")
# Load model
logger.info("\\nLoading model...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Load datasets
logger.info("Loading datasets...")
train_dataset = load_dataset('json', data_files='{train_path}', split='train')
val_dataset = load_dataset('json', data_files='{val_path}', split='train')
logger.info(f"Train samples: {{len(train_dataset):,}}")
logger.info(f"Validation samples: {{len(val_dataset):,}}")
# Extract and tokenize
logger.info("Processing datasets...")
train_dataset = train_dataset.map(extract_parallel_texts, batched=True, remove_columns=['messages'])
val_dataset = val_dataset.map(extract_parallel_texts, batched=True, remove_columns=['messages'])
tokenized_train = train_dataset.map(
    lambda examples: preprocess_function(examples, tokenizer),
    batched=True,
    remove_columns=['coptic', 'english'],
)
tokenized_val = val_dataset.map(
    lambda examples: preprocess_function(examples, tokenizer),
    batched=True,
    remove_columns=['coptic', 'english'],
)
# Setup training
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)
metric = load("sacrebleu")
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,  # effective train batch size = 2 * batch_size
    learning_rate=learning_rate,
    warmup_steps=500,
    max_grad_norm=1.0,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=50,
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    predict_with_generate=True,
    generation_max_length=256,
    generation_num_beams=5,
    fp16=torch.cuda.is_available(),
    report_to="tensorboard",
    logging_dir=f"{{output_dir}}/logs",
    push_to_hub=False,  # the upload is handled explicitly below
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=lambda eval_preds: compute_metrics(eval_preds, tokenizer, metric),
)
logger.info("\\nSTARTING TRAINING")
logger.info("="*60)
# Train
trainer.train()
# Save locally
logger.info("\\nSaving final model...")
trainer.save_model(f"{{output_dir}}/final")
tokenizer.save_pretrained(f"{{output_dir}}/final")
# Push to HuggingFace Hub
if HF_TOKEN and MODEL_REPO_NAME:
    logger.info(f"\\nPushing model to HuggingFace Hub: {{MODEL_REPO_NAME}}")
    try:
        api = HfApi()
        api.create_repo(repo_id=MODEL_REPO_NAME, repo_type="model", exist_ok=True)
        # Upload all files from the final checkpoint
        api.upload_folder(
            folder_path=f"{{output_dir}}/final",
            repo_id=MODEL_REPO_NAME,
            repo_type="model",
        )
        logger.info(f"✅ Model successfully pushed to: https://huggingface.co/{{MODEL_REPO_NAME}}")
    except Exception as e:
        logger.error(f"❌ Failed to push to Hub: {{e}}")
# Final evaluation
logger.info("\\nFinal evaluation...")
eval_results = trainer.evaluate()
logger.info("\\n" + "="*60)
logger.info("TRAINING COMPLETE!")
logger.info("="*60)
for key, value in eval_results.items():
    logger.info(f"{{key}}: {{value}}")
logger.info(f"\\n✅ Model saved locally to: {{output_dir}}/final")
if HF_TOKEN and MODEL_REPO_NAME:
    logger.info(f"✅ Model available at: https://huggingface.co/{{MODEL_REPO_NAME}}")
'''
        with open("train_script.py", "w") as f:
            f.write(script_content)
        training_status["log"].append("✓ Training script created\n")
        training_status["log"].append("🚀 Starting training...\n\n")
        # Run training in a subprocess so its output can be streamed
        process = subprocess.Popen(
            ["python", "train_script.py"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
        )
        # Stream output into the shared log
        for line in process.stdout:
            training_status["log"].append(line)
            time.sleep(0.01)  # small delay to allow UI updates
        process.wait()
        if process.returncode == 0:
            training_status["completed"] = True
            training_status["log"].append("\n\n✅ TRAINING COMPLETED SUCCESSFULLY!\n")
            training_status["log"].append("📦 Model saved locally to: coptic_megalaa_finetuned/final\n")
            if hf_token and model_repo_name:
                training_status["log"].append(f"📦 Model pushed to: https://huggingface.co/{model_repo_name}\n")
        else:
            training_status["error"] = f"Training failed with exit code {process.returncode}"
            training_status["log"].append(f"\n\n❌ Training failed with exit code {process.returncode}\n")
    except Exception as e:
        training_status["error"] = str(e)
        training_status["log"].append(f"\n\n❌ Error: {e}\n")
    finally:
        training_status["running"] = False

def start_training(train_file, val_file, num_epochs, batch_size, learning_rate, hf_token, model_repo_name):
    """
    Start training in a background thread.
    """
    if training_status["running"]:
        return "⚠️ Training already in progress!"
    if not hf_token or not model_repo_name:
        return "⚠️ Please provide both a HuggingFace token and a model repository name!"
    if train_file is None or val_file is None:
        return "⚠️ Please upload both training and validation data files!"
    # Start the training thread
    thread = threading.Thread(
        target=train_model,
        args=(train_file, val_file, num_epochs, batch_size, learning_rate, hf_token, model_repo_name),
    )
    thread.daemon = True
    thread.start()
    return "🚀 Training started! Monitor progress in the logs below."

def get_training_log():
    """
    Return the current training log.
    """
    return "".join(training_status["log"])

def check_status():
    """
    Return the current training status.
    """
    if training_status["completed"]:
        return "✅ Training completed!"
    elif training_status["error"]:
        return f"❌ Error: {training_status['error']}"
    elif training_status["running"]:
        return "🔄 Training in progress..."
    else:
        return "⏸️ Ready to train"

# Create Gradio interface
with gr.Blocks(title="Megalaa Coptic Fine-tuning") as demo:
    gr.Markdown("""
# 🏛️ Megalaa Coptic Translation Fine-tuning

Fine-tune the megalaa/coptic-english-translator model on your CopticScriptorium corpus.

**⚙️ IMPORTANT:** Make sure this Space is running on a **T4 Small GPU** for optimal performance!
""")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🔑 HuggingFace Hub Configuration")
            hf_token_input = gr.Textbox(
                label="HuggingFace Token",
                placeholder="hf_...",
                type="password",
                info="Get your token from https://huggingface.co/settings/tokens",
            )
            model_repo_input = gr.Textbox(
                label="Model Repository Name",
                placeholder="username/coptic-megalaa-finetuned",
                info="Example: john-doe/coptic-megalaa-finetuned",
            )
            gr.Markdown("### 📤 Upload Training Data")
            train_file_upload = gr.File(
                label="Training Data (train.jsonl)",
                file_types=[".jsonl"],
                type="filepath",  # hand train_model a path it can copy
            )
            val_file_upload = gr.File(
                label="Validation Data (val.jsonl)",
                file_types=[".jsonl"],
                type="filepath",
            )
            gr.Markdown("### ⚙️ Training Parameters")
            num_epochs = gr.Slider(
                minimum=1,
                maximum=10,
                value=5,
                step=1,
                label="Number of Epochs",
            )
            batch_size = gr.Slider(
                minimum=4,
                maximum=16,
                value=8,
                step=4,
                label="Batch Size",
            )
            learning_rate = gr.Number(
                value=2e-5,
                label="Learning Rate",
            )
            start_btn = gr.Button("🚀 Start Training", variant="primary", size="lg")
            status_text = gr.Textbox(label="Status", value="⏸️ Ready to train")
        with gr.Column():
            gr.Markdown("### 📊 Training Log")
            log_output = gr.Textbox(
                label="Real-time Training Log",
                lines=30,
                max_lines=30,
                autoscroll=True,  # refreshed by demo.load below
            )
    # Button actions
    start_btn.click(
        fn=start_training,
        inputs=[train_file_upload, val_file_upload, num_epochs, batch_size, learning_rate, hf_token_input, model_repo_input],
        outputs=status_text,
    )

    # Auto-refresh log and status every 2 seconds
    demo.load(fn=get_training_log, outputs=log_output, every=2)
    demo.load(fn=check_status, outputs=status_text, every=2)
    gr.Markdown("""
---
### 📥 After Training

When training completes, your fine-tuned model will be automatically pushed to HuggingFace Hub!

**Next steps:**
1. Visit your model repository at `https://huggingface.co/YOUR_USERNAME/MODEL_NAME`
2. Download and test it with `python evaluate_megalaa_model.py` (a minimal loading sketch follows below)
3. Integrate it into your Coptic translation interface
4. Share your model with the community!
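
A minimal loading sketch (illustrative; replace the repo id with your own):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

repo_id = "YOUR_USERNAME/MODEL_NAME"  # the repository this Space pushed to
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForSeq2SeqLM.from_pretrained(repo_id)
```

Remember to lowercase and greekify Coptic input, exactly as during training, before calling `model.generate`.
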
**Estimated training time:** 6-8 hours on a T4 GPU

**Note:** The model is also saved temporarily to `coptic_megalaa_finetuned/final/` during training,
but this local copy will be lost when the Space restarts. Use the HuggingFace Hub version!
""")
if __name__ == "__main__":
    demo.launch()