#!/usr/bin/env python3
"""Quick test to verify the evaluator fix works."""

import sys
from pathlib import Path

# Add the project root (the parent of this script's directory) to sys.path
# so the `src` package can be imported.
sys.path.append(str(Path(__file__).parent.parent))

from src.logger import setup_logging
from src.config import Config
from src.evaluator import KoreanQAEvaluator

def test_single_evaluation():
    """Test single case evaluation to verify score extraction."""
    
    # Setup logging
    logger_setup = setup_logging(log_level="INFO")
    logger = logger_setup.get_logger(__name__)
    
    try:
        # Load config
        config_path = Path(__file__).parent / "config.yaml"
        config = Config(str(config_path))
        
        # Initialize evaluator
        evaluator = KoreanQAEvaluator(
            model_name=config.gemini_model,
            api_key=config.google_api_key,
            threshold=0.8,
            verbose_mode=True
        )
        
        # Test case: a Korean question asking for this month's total company
        # revenue, paired with an answer giving the January 2025 figure.
        input_text = "이번 달 우리 회사 전체 매출은 얼마야?"
        actual_output = "2025년 1월 삼광 Global 전체 매출은 335.4억원입니다."
        
        # Run evaluation
        logger.info("Testing single case evaluation...")
        results = evaluator.evaluate_single_case(input_text, actual_output)
        
        # Check if we got real scores
        detailed_results = results.get('detailed_results', [])
        if detailed_results:
            first_case = detailed_results[0]
            metrics = first_case.get('metrics', {})
            
            logger.info("Evaluation results:")
            for metric_name, metric_data in metrics.items():
                score = metric_data.get('score')
                passed = metric_data.get('passed')
                reason = metric_data.get('reason', '')
                
                logger.info(f"  {metric_name}: {score:.4f} ({'PASS' if passed else 'FAIL'})")
                if reason and not reason.startswith('Mock') and not reason.startswith('Fallback'):
                    logger.info("  โœ“ Real DeepEval score extracted successfully!")
                else:
                    logger.warning("  โš  Still using fallback/mock scores")
        
        return results
        
    except Exception as e:
        # logger.exception includes the traceback, which helps when the API call fails.
        logger.exception(f"Test failed: {e}")
        return None
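

# Illustrative helper: a minimal sketch, assuming evaluate_single_case keeps the
# signature used in test_single_evaluation above. It runs several hypothetical
# question/answer pairs through the same evaluator and collects the results.
def run_batch_evaluation(evaluator, cases, logger):
    """Evaluate a list of (question, answer) pairs one at a time."""
    all_results = []
    for question, answer in cases:
        logger.info(f"Evaluating: {question}")
        all_results.append(evaluator.evaluate_single_case(question, answer))
    return all_results
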

if __name__ == "__main__":
    results = test_single_evaluation()
    # Exit non-zero if the evaluation could not run, so callers/CI can detect it.
    sys.exit(0 if results is not None else 1)