exam-evaluator / src /train_baseline.py
KarmanovaLidiia
Initial clean commit for HF Space (models via Git LFS)
bcb314a
# src/train_baseline.py
from pathlib import Path
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import numpy as np
# Project root: the parent of the directory that contains this file.
ROOT = Path(__file__).resolve().parents[1]
# CSV table read by main() as the training data.
DATA_PATH = ROOT.joinpath("data", "processed", "features_with_semantics.csv")
# Destination file for the serialized CatBoost model.
MODEL_PATH = ROOT.joinpath("models", "catboost_baseline.cbm")
def _make_regressor():
    """Build an unfitted CatBoost regressor with the baseline hyperparameters."""
    return CatBoostRegressor(
        iterations=200,
        depth=6,
        learning_rate=0.05,
        loss_function="MAE",
        verbose=False,
        random_state=42,
    )


def main():
    """Cross-validate the CatBoost baseline and save a final model.

    Reads the table at ``DATA_PATH``, uses the ``score`` column as the target
    and every column except ``score``, ``question_text`` and ``answer_text``
    as features. Runs shuffled 5-fold cross-validation, printing the MAE of
    each fold and the mean MAE. Afterwards a fresh model with the same
    hyperparameters is fitted on the full dataset and written to
    ``MODEL_PATH``.
    """
    df = pd.read_csv(DATA_PATH, encoding="utf-8-sig")

    # Split features and target.
    target = df["score"]
    features = df.drop(columns=["score", "question_text", "answer_text"])

    # Cross-validation: used only to estimate the error, not to pick the
    # model that gets saved.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    maes = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(features), start=1):
        X_train, X_val = features.iloc[train_idx], features.iloc[val_idx]
        y_train, y_val = target.iloc[train_idx], target.iloc[val_idx]

        model = _make_regressor()
        model.fit(X_train, y_train)

        preds = model.predict(X_val)
        mae = mean_absolute_error(y_val, preds)
        maes.append(mae)
        print(f"Fold {fold}: MAE = {mae:.3f}")

    print(f"🔹 Среднее MAE: {np.mean(maes):.3f}")

    # Fit the model to be saved on all rows. Saving the model left over from
    # the last fold would persist one trained on only 4/5 of the data.
    final_model = _make_regressor()
    final_model.fit(features, target)

    # Make sure the output directory exists, and pass a plain string path so
    # saving does not depend on path-like support in the installed CatBoost.
    MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
    final_model.save_model(str(MODEL_PATH))
    print(f"✅ Модель сохранена: {MODEL_PATH}")
# Run the training pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()