#!/usr/bin/env python3
"""Quick test to verify the evaluator fix works."""
import sys
from pathlib import Path

# Add the project root to sys.path so the `src` package is importable
sys.path.append(str(Path(__file__).parent.parent))

from src.logger import setup_logging
from src.config import Config
from src.evaluator import KoreanQAEvaluator


def test_single_evaluation():
    """Test single case evaluation to verify score extraction."""
    # Set up logging
    logger_setup = setup_logging(log_level="INFO")
    logger = logger_setup.get_logger(__name__)

    try:
        # Load config
        config_path = Path(__file__).parent / "config.yaml"
        config = Config(str(config_path))

        # Initialize evaluator
        evaluator = KoreanQAEvaluator(
            model_name=config.gemini_model,
            api_key=config.google_api_key,
            threshold=0.8,
            verbose_mode=True,
        )
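        # threshold=0.8 is assumed here to be the pass/fail cutoff from which
        # the 'passed' flags logged below are derived.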

        # Test case (Korean): "How much is our company's total revenue this month?"
        input_text = "이번 달 우리 회사 전체 매출은 얼마야?"
        # Expected output (Korean): "Ilkwang Global's total revenue for
        # January 2025 is 33.54 billion KRW."
        actual_output = "2025년 1월 일광 Global 전체 매출은 335.4억원입니다."
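
        # The parsing below assumes evaluate_single_case returns a dict shaped
        # roughly like this (inferred from the keys accessed, not verified):
        #   {'detailed_results': [
        #       {'metrics': {'<metric_name>': {'score': 0.91, 'passed': True,
        #                                      'reason': '...'}}}]}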

        # Run evaluation
        logger.info("Testing single case evaluation...")
        results = evaluator.evaluate_single_case(input_text, actual_output)

        # Check whether we got real scores back
        detailed_results = results.get('detailed_results', [])
        if detailed_results:
            first_case = detailed_results[0]
            metrics = first_case.get('metrics', {})
logger.info("Evaluation results:")
for metric_name, metric_data in metrics.items():
score = metric_data.get('score')
passed = metric_data.get('passed')
reason = metric_data.get('reason', '')
logger.info(f" {metric_name}: {score:.4f} ({'PASS' if passed else 'FAIL'})")
if reason and not reason.startswith('Mock') and not reason.startswith('Fallback'):
logger.info(" โ Real DeepEval score extracted successfully!")
else:
logger.warning(" โ Still using fallback/mock scores")
return results

    except Exception as e:
        logger.error(f"Test failed: {e}")
        return None


if __name__ == "__main__":
    test_single_evaluation()
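
# To run this check directly (assuming this script lives one level below the
# project root, alongside config.yaml, with a Google API key in the config):
#   python this_script.py   # filename is illustrative; use the actual name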