Spaces:

Norelad
/

coptic-translation-interface

Sleeping

File size: 14,887 Bytes
#!/usr/bin/env python3
"""
HuggingFace Space for fine-tuning the megalaa Coptic translation model

This Gradio app provides a user-friendly interface for training the
megalaa/coptic-english-translator model on your CopticScriptorium corpus.
"""

import os
import shutil
import subprocess
import sys
import threading
import time
from pathlib import Path

import gradio as gr

# Module-level record of the training run, shared between the UI callbacks
# and the background worker:
#   running   - True while a training run is in flight
#   log       - list of text fragments shown in the log panel
#   completed - True once a run has finished successfully
#   error     - error message of a failed run, otherwise None
training_status = dict(
    running=False,
    log=[],
    completed=False,
    error=None,
)


def _save_upload(upload, destination):
    """Persist one uploaded file to *destination*, whatever form it arrives in.

    Accepts raw bytes, a filesystem path (``str`` / ``os.PathLike``) or a
    file-like wrapper exposing the path through ``.name``.

    Raises:
        ValueError: if nothing was uploaded.
        TypeError: if the upload is of an unsupported kind.
    """
    if upload is None:
        raise ValueError(f"No file was uploaded for {destination}")

    if isinstance(upload, (bytes, bytearray, memoryview)):
        with open(destination, "wb") as f:
            f.write(upload)
        return

    if isinstance(upload, (str, os.PathLike)):
        source = upload
    else:
        source = getattr(upload, "name", None)
    if not source:
        raise TypeError(
            f"Unsupported upload type for {destination}: {type(upload).__name__}"
        )

    try:
        shutil.copyfile(source, destination)
    except shutil.SameFileError:
        # The upload already is the destination file; nothing to copy.
        pass


def _coerce_hyperparameters(num_epochs, batch_size, learning_rate):
    """Convert the UI values to plain numbers and reject unusable ones.

    The results are embedded as literals in the generated script, so they must
    be real, finite, positive numbers (``None``/``nan``/``inf`` would produce
    an invalid or meaningless script).
    """
    epochs = float(num_epochs)
    batch = int(batch_size)
    rate = float(learning_rate)

    unbounded = float("inf")
    if not (0 < epochs < unbounded and batch > 0 and 0 < rate < unbounded):
        raise ValueError(
            "Epochs, batch size and learning rate must be positive, finite numbers"
        )

    # Keep whole epoch counts as ints so the script/log shows "5", not "5.0".
    if epochs.is_integer():
        epochs = int(epochs)
    return epochs, batch, rate


def _render_training_script(train_path, val_path, num_epochs, batch_size, learning_rate):
    """Return the source code of the standalone fine-tuning script.

    Only validated numbers and the fixed data paths are interpolated (via
    ``repr``). Credentials are deliberately NOT embedded: the script reads
    them from the ``TRAIN_HF_TOKEN`` / ``TRAIN_MODEL_REPO`` environment
    variables so the token never lands on disk and user text cannot inject
    code into the script.
    """
    return f'''#!/usr/bin/env python3
import os
import json
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
from huggingface_hub import HfApi, login
from evaluate import load
import numpy as np
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# HuggingFace Hub configuration (supplied by the parent process via the environment)
HF_TOKEN = os.environ.get("TRAIN_HF_TOKEN", "")
MODEL_REPO_NAME = os.environ.get("TRAIN_MODEL_REPO", "")

if HF_TOKEN:
    login(token=HF_TOKEN)
    logger.info("✓ Logged in to HuggingFace Hub")

# Greekification for megalaa models
COPTIC_TO_GREEK = {{
    "ⲁ": "α", "ⲃ": "β", "ⲅ": "γ", "ⲇ": "δ", "ⲉ": "ε", "ⲋ": "ϛ",
    "ⲍ": "ζ", "ⲏ": "η", "ⲑ": "θ", "ⲓ": "ι", "ⲕ": "κ", "ⲗ": "λ",
    "ⲙ": "μ", "ⲛ": "ν", "ⲝ": "ξ", "ⲟ": "ο", "ⲡ": "π", "ⲣ": "ρ",
    "ⲥ": "σ", "ⲧ": "τ", "ⲩ": "υ", "ⲫ": "φ", "ⲭ": "χ", "ⲯ": "ψ",
    "ⲱ": "ω", "ϣ": "s", "ϥ": "f", "ϧ": "k", "ϩ": "h", "ϫ": "j",
    "ϭ": "c", "ϯ": "t",
}}

def greekify(text):
    if not text:
        return ""
    return "".join(COPTIC_TO_GREEK.get(c.lower(), c.lower()) for c in text)

def extract_parallel_texts(examples):
    coptic_texts = []
    english_texts = []

    for messages in examples['messages']:
        coptic_text = None
        english_text = None

        for msg in messages:
            if msg['role'] == 'user' and 'Coptic text to English:' in msg['content']:
                coptic_text = msg['content'].split('Coptic text to English:')[-1].strip()
            elif msg['role'] == 'assistant':
                english_text = msg['content']

        coptic_texts.append(coptic_text)
        english_texts.append(english_text)

    return {{'coptic': coptic_texts, 'english': english_texts}}

def preprocess_function(examples, tokenizer, max_length=256):
    greekified_coptic = [greekify(text.lower()) if text else "" for text in examples["coptic"]]
    # Same guard as the Coptic side: a missing translation becomes "" instead of None
    english_targets = [text if text else "" for text in examples["english"]]

    model_inputs = tokenizer(
        greekified_coptic,
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        text_target=english_targets,
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )

    labels["input_ids"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels_example]
        for labels_example in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def compute_metrics(eval_preds, tokenizer, metric):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_labels = [[label] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {{"bleu": result["score"]}}

# Configuration
model_name = "megalaa/coptic-english-translator"
output_dir = "coptic_megalaa_finetuned"
num_epochs = {num_epochs!r}
batch_size = {batch_size!r}
learning_rate = {learning_rate!r}

logger.info("="*60)
logger.info("MEGALAA FINE-TUNING ON HUGGINGFACE SPACES")
logger.info("="*60)
logger.info(f"Base model: {{model_name}}")
logger.info(f"Epochs: {{num_epochs}}")
logger.info(f"Batch size: {{batch_size}}")
logger.info(f"Learning rate: {{learning_rate}}")

# Check GPU
if torch.cuda.is_available():
    logger.info(f"GPU: {{torch.cuda.get_device_name(0)}}")
    logger.info(f"GPU Memory: {{torch.cuda.get_device_properties(0).total_memory / (1024**3):.1f}} GB")
else:
    logger.warning("No GPU detected!")

# Load model
logger.info("\\nLoading model...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Load datasets
logger.info("Loading datasets...")
train_dataset = load_dataset('json', data_files={train_path!r}, split='train')
val_dataset = load_dataset('json', data_files={val_path!r}, split='train')

logger.info(f"Train samples: {{len(train_dataset):,}}")
logger.info(f"Validation samples: {{len(val_dataset):,}}")

# Extract and tokenize
logger.info("Processing datasets...")
train_dataset = train_dataset.map(extract_parallel_texts, batched=True, remove_columns=['messages'])
val_dataset = val_dataset.map(extract_parallel_texts, batched=True, remove_columns=['messages'])

tokenized_train = train_dataset.map(
    lambda examples: preprocess_function(examples, tokenizer),
    batched=True,
    remove_columns=['coptic', 'english']
)
tokenized_val = val_dataset.map(
    lambda examples: preprocess_function(examples, tokenizer),
    batched=True,
    remove_columns=['coptic', 'english']
)

# Setup training
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)
metric = load("sacrebleu")

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,
    learning_rate=learning_rate,
    warmup_steps=500,
    max_grad_norm=1.0,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=50,
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    predict_with_generate=True,
    generation_max_length=256,
    generation_num_beams=5,
    fp16=torch.cuda.is_available(),
    report_to="tensorboard",
    logging_dir=f"{{output_dir}}/logs",
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=lambda eval_preds: compute_metrics(eval_preds, tokenizer, metric)
)

logger.info("\\nSTARTING TRAINING")
logger.info("="*60)

# Train
trainer.train()

# Save locally
logger.info("\\nSaving final model...")
trainer.save_model(f"{{output_dir}}/final")
tokenizer.save_pretrained(f"{{output_dir}}/final")

# Push to HuggingFace Hub
if HF_TOKEN and MODEL_REPO_NAME:
    logger.info(f"\\nPushing model to HuggingFace Hub: {{MODEL_REPO_NAME}}")
    try:
        api = HfApi()
        api.create_repo(repo_id=MODEL_REPO_NAME, repo_type="model", exist_ok=True)

        # Upload all files
        api.upload_folder(
            folder_path=f"{{output_dir}}/final",
            repo_id=MODEL_REPO_NAME,
            repo_type="model",
        )
        logger.info(f"✅ Model successfully pushed to: https://huggingface.co/{{MODEL_REPO_NAME}}")
    except Exception as e:
        logger.error(f"❌ Failed to push to Hub: {{e}}")

# Final evaluation
logger.info("\\nFinal evaluation...")
eval_results = trainer.evaluate()

logger.info("\\n" + "="*60)
logger.info("TRAINING COMPLETE!")
logger.info("="*60)
for key, value in eval_results.items():
    logger.info(f"{{key}}: {{value}}")

logger.info(f"\\n✅ Model saved locally to: {{output_dir}}/final")
if HF_TOKEN and MODEL_REPO_NAME:
    logger.info(f"✅ Model available at: https://huggingface.co/{{MODEL_REPO_NAME}}")
'''


def train_model(train_file, val_file, num_epochs, batch_size, learning_rate, hf_token, model_repo_name):
    """
    Run one fine-tuning job end to end and record its progress.

    Saves the two uploads as ``train.jsonl`` / ``val.jsonl``, writes
    ``train_script.py``, runs it in a child Python process and appends every
    output line to ``training_status["log"]``. The outcome is reported through
    ``training_status`` ("completed" / "error"); "running" is always cleared
    when the function exits. Nothing is returned and no exception escapes.

    Args:
        train_file: Training data as raw bytes, a file path, or an object
            whose ``.name`` is a file path.
        val_file: Validation data, same accepted forms as ``train_file``.
        num_epochs: Number of training epochs (positive number).
        batch_size: Per-device batch size (positive integer).
        learning_rate: Optimizer learning rate (positive number).
        hf_token: HuggingFace access token; handed to the child process via
            the environment, never written into the script.
        model_repo_name: Hub repository to push the final model to.
    """
    global training_status

    # Reset status
    training_status = {
        "running": True,
        "log": ["🚀 Starting training setup...\n"],
        "completed": False,
        "error": None
    }

    try:
        epochs, per_device_batch, rate = _coerce_hyperparameters(
            num_epochs, batch_size, learning_rate
        )
        # Pasted tokens often carry stray whitespace / a trailing newline.
        token = (hf_token or "").strip()
        repo = (model_repo_name or "").strip()

        # Save uploaded files
        train_path = "train.jsonl"
        val_path = "val.jsonl"
        _save_upload(train_file, train_path)
        _save_upload(val_file, val_path)

        training_status["log"].append(f"✓ Training data saved: {train_path}\n")
        training_status["log"].append(f"✓ Validation data saved: {val_path}\n")

        # Create training script (UTF-8: it contains Coptic/Greek letters and emoji)
        script_content = _render_training_script(
            train_path, val_path, epochs, per_device_batch, rate
        )
        with open("train_script.py", "w", encoding="utf-8") as f:
            f.write(script_content)

        training_status["log"].append("✓ Training script created\n")
        training_status["log"].append("🚀 Starting training...\n\n")

        # Credentials travel through the child's environment only.
        child_env = dict(os.environ)
        child_env["TRAIN_HF_TOKEN"] = token
        child_env["TRAIN_MODEL_REPO"] = repo
        child_env.setdefault("PYTHONIOENCODING", "utf-8")

        # Run training in a subprocess using the same interpreter as this app;
        # "-u" keeps the child's output unbuffered so the log stays live.
        with subprocess.Popen(
            [sys.executable or "python", "-u", "train_script.py"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            encoding="utf-8",
            errors="replace",
            bufsize=1,
            env=child_env,
        ) as process:
            # Stream output; reading the pipe blocks until a line is available.
            for line in process.stdout:
                training_status["log"].append(line)
        # Leaving the "with" block closes the pipe and waits for the child.

        if process.returncode == 0:
            training_status["completed"] = True
            training_status["log"].append("\n\n✅ TRAINING COMPLETED SUCCESSFULLY!\n")
            training_status["log"].append("📦 Model saved locally to: coptic_megalaa_finetuned/final\n")
            if token and repo:
                training_status["log"].append(f"📦 Model pushed to: https://huggingface.co/{repo}\n")
        else:
            training_status["error"] = f"Training failed with exit code {process.returncode}"
            training_status["log"].append(f"\n\n❌ Training failed with exit code {process.returncode}\n")

    except Exception as e:
        # Top-level boundary of the worker thread: surface the failure in the UI.
        training_status["error"] = str(e)
        training_status["log"].append(f"\n\n❌ Error: {str(e)}\n")

    finally:
        training_status["running"] = False


def start_training(train_file, val_file, num_epochs, batch_size, learning_rate, hf_token, model_repo_name):
    """
    Validate the request and launch training in a background thread.

    Returns a short status message for the UI. Training itself runs in a
    daemon thread executing ``train_model`` with the same arguments, so this
    function returns immediately.

    The request is rejected (with an explanatory message) when a run is
    already in progress, when the token or repository name is missing, or
    when either data file has not been uploaded.
    """
    if training_status["running"]:
        return "⚠️ Training already in progress!"

    if not hf_token or not model_repo_name:
        return "⚠️ Please provide both HuggingFace Token and Model Repository Name!"

    if train_file is None or val_file is None:
        return "⚠️ Please upload both the training and validation data files!"

    # Mark the run as started *before* the worker is scheduled; otherwise a
    # second click arriving before the thread sets the flag itself would pass
    # the "already in progress" check and start a concurrent run.
    training_status["running"] = True
    training_status["completed"] = False
    training_status["error"] = None

    # Start training thread
    worker = threading.Thread(
        target=train_model,
        args=(train_file, val_file, num_epochs, batch_size, learning_rate, hf_token, model_repo_name),
        daemon=True,
    )
    try:
        worker.start()
    except RuntimeError:
        # The thread could not be created: release the slot we just claimed.
        training_status["running"] = False
        raise

    return "🚀 Training started! Monitor progress in the logs below."


def get_training_log():
    """
    Concatenate all collected log fragments into a single string
    """
    fragments = training_status["log"]
    return "".join(fragments)


def check_status():
    """
    Summarize the current training state as a one-line message.

    Precedence: completed, then error, then running, otherwise idle.
    """
    if training_status["completed"]:
        return "✅ Training completed!"

    if training_status["error"]:
        return f"❌ Error: {training_status['error']}"

    if training_status["running"]:
        return "🔄 Training in progress..."

    return "⏸️ Ready to train"


# Build the Gradio UI. Components are declared inside nested context managers,
# so the order of the statements below defines the page layout: a two-column
# row (configuration on the left, log on the right) between two Markdown
# sections.
with gr.Blocks(title="Megalaa Coptic Fine-tuning") as demo:
    # Page header
    gr.Markdown("""
    # 🏛️ Megalaa Coptic Translation Fine-tuning

    Fine-tune the megalaa/coptic-english-translator model on your CopticScriptorium corpus.

    **⚙️ IMPORTANT:** Make sure this Space is running on **T4 Small GPU** for optimal performance!
    """)

    with gr.Row():
        # Left column: credentials, data uploads, hyperparameters, start button
        with gr.Column():
            gr.Markdown("### 🔑 HuggingFace Hub Configuration")
            # Rendered as a password field (type="password")
            hf_token_input = gr.Textbox(
                label="HuggingFace Token",
                placeholder="hf_...",
                type="password",
                info="Get your token from https://huggingface.co/settings/tokens"
            )
            model_repo_input = gr.Textbox(
                label="Model Repository Name",
                placeholder="username/coptic-megalaa-finetuned",
                info="Example: john-doe/coptic-megalaa-finetuned"
            )

            gr.Markdown("### 📤 Upload Training Data")
            # Both uploads are restricted to .jsonl files and are passed to
            # start_training as its first two arguments.
            train_file_upload = gr.File(
                label="Training Data (train.jsonl)",
                file_types=[".jsonl"]
            )
            val_file_upload = gr.File(
                label="Validation Data (val.jsonl)",
                file_types=[".jsonl"]
            )

            gr.Markdown("### ⚙️ Training Parameters")
            # Epochs: 1..10 in steps of 1, default 5
            num_epochs = gr.Slider(
                minimum=1,
                maximum=10,
                value=5,
                step=1,
                label="Number of Epochs"
            )
            # Batch size: 4..16 in steps of 4, default 8
            batch_size = gr.Slider(
                minimum=4,
                maximum=16,
                value=8,
                step=4,
                label="Batch Size"
            )
            learning_rate = gr.Number(
                value=2e-5,
                label="Learning Rate"
            )

            start_btn = gr.Button("🚀 Start Training", variant="primary", size="lg")
            # Written both by the button click and by the periodic
            # check_status refresh registered further down.
            status_text = gr.Textbox(label="Status", value="⏸️ Ready to train")

        # Right column: training log panel
        with gr.Column():
            gr.Markdown("### 📊 Training Log")
            # NOTE(review): no callable `value` is given here, so `every=2`
            # presumably has no effect on its own; the actual refresh comes
            # from the demo.load(...) call below - confirm against the
            # installed Gradio version.
            log_output = gr.Textbox(
                label="Real-time Training Log",
                lines=30,
                max_lines=30,
                autoscroll=True,
                every=2
            )

    # Button actions: the inputs are listed in the same order as
    # start_training's parameters; its return message goes to status_text.
    start_btn.click(
        fn=start_training,
        inputs=[train_file_upload, val_file_upload, num_epochs, batch_size, learning_rate, hf_token_input, model_repo_input],
        outputs=status_text
    )

    # Auto-refresh log and status (both registered with every=2).
    # NOTE(review): check_status writes to the same status_text as the click
    # handler, so a message returned by start_training is presumably replaced
    # on the next refresh - confirm this is intended.
    demo.load(fn=get_training_log, outputs=log_output, every=2)
    demo.load(fn=check_status, outputs=status_text, every=2)

    # Page footer: post-training instructions
    gr.Markdown("""
    ---
    ### 📥 After Training

    When training completes, your fine-tuned model will be automatically pushed to HuggingFace Hub!

    **Next steps:**
    1. Visit your model repository at `https://huggingface.co/YOUR_USERNAME/MODEL_NAME`
    2. Download and test with: `python evaluate_megalaa_model.py`
    3. Integrate into your Coptic translation interface
    4. Share your model with the community!

    **Estimated training time:** 6-8 hours on T4 GPU

    **Note:** The model is also saved temporarily to `coptic_megalaa_finetuned/final/` during training,
    but this local copy will be lost when the Space restarts. Use the HuggingFace Hub version!
    """)

# Start the Gradio app only when this file is run as a script (not on import).
if __name__ == "__main__":
    demo.launch()