#!/usr/bin/env python3
"""Quick test to verify the evaluator fix works."""

import sys
from pathlib import Path

# Add the project root (the parent of this script's directory) to sys.path
# so the `src` package can be imported.
sys.path.append(str(Path(__file__).parent.parent))

from src.logger import setup_logging
from src.config import Config
from src.evaluator import KoreanQAEvaluator

def test_single_evaluation():
    """Test single case evaluation to verify score extraction."""
    
    # Setup logging
    logger_setup = setup_logging(log_level="INFO")
    logger = logger_setup.get_logger(__name__)
    
    try:
        # Load config
        config_path = Path(__file__).parent / "config.yaml"
        config = Config(str(config_path))
        
        # Initialize evaluator
        evaluator = KoreanQAEvaluator(
            model_name=config.gemini_model,
            api_key=config.google_api_key,
            threshold=0.8,
            verbose_mode=True
        )
        
        # Test case: a Korean question asking for this month's total company
        # revenue, paired with an answer giving the January 2025 figure.
        input_text = "이번 달 우리 회사 전체 매출은 얼마야?"
        actual_output = "2025년 1월 삼광 Global 전체 매출은 335.4억원입니다."
        
        # Run evaluation
        logger.info("Testing single case evaluation...")
        results = evaluator.evaluate_single_case(input_text, actual_output)
        
        # Check if we got real scores
        detailed_results = results.get('detailed_results', [])
        if detailed_results:
            first_case = detailed_results[0]
            metrics = first_case.get('metrics', {})
            
            logger.info("Evaluation results:")
            for metric_name, metric_data in metrics.items():
                score = metric_data.get('score')
                passed = metric_data.get('passed')
                reason = metric_data.get('reason', '')
                
                logger.info(f"  {metric_name}: {score:.4f} ({'PASS' if passed else 'FAIL'})")
                if reason and not reason.startswith('Mock') and not reason.startswith('Fallback'):
                    logger.info("  โœ“ Real DeepEval score extracted successfully!")
                else:
                    logger.warning("  โš  Still using fallback/mock scores")
        
        return results
        
    except Exception as e:
        # logger.exception includes the traceback, which helps when the API call fails.
        logger.exception(f"Test failed: {e}")
        return None
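

# Illustrative helper: a minimal sketch, assuming evaluate_single_case keeps the
# signature used in test_single_evaluation above. It runs several hypothetical
# question/answer pairs through the same evaluator and collects the results.
def run_batch_evaluation(evaluator, cases, logger):
    """Evaluate a list of (question, answer) pairs one at a time."""
    all_results = []
    for question, answer in cases:
        logger.info(f"Evaluating: {question}")
        all_results.append(evaluator.evaluate_single_case(question, answer))
    return all_results
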

if __name__ == "__main__":
    results = test_single_evaluation()
    # Exit non-zero if the evaluation could not run, so callers/CI can detect it.
    sys.exit(0 if results is not None else 1)