"""Quick test to verify the evaluator fix works."""

import sys
from pathlib import Path

# Add the parent of this script's directory to the import path. This must run
# before the `src.*` imports below (presumably the project root, which holds
# the `src` package -- confirm against the repository layout).
sys.path.append(str(Path(__file__).parent.parent))

from src.logger import setup_logging
from src.config import Config
from src.evaluator import KoreanQAEvaluator


def test_single_evaluation():
    """Test single case evaluation to verify score extraction.

    Loads ``config.yaml`` from this script's directory, builds a
    ``KoreanQAEvaluator`` from the configured model name and API key,
    evaluates one hard-coded Korean question/answer pair, and logs every
    metric found under ``detailed_results[0]["metrics"]``.

    Returns:
        Whatever ``evaluate_single_case`` returned on success, or ``None``
        when any step raised (the error is logged, not re-raised).
    """
    logger_setup = setup_logging(log_level="INFO")
    logger = logger_setup.get_logger(__name__)

    # Broad catch is intentional here: this is the top-level boundary of a
    # diagnostic script, and any failure is reported through the logger.
    try:
        config_path = Path(__file__).parent / "config.yaml"
        config = Config(str(config_path))

        evaluator = KoreanQAEvaluator(
            model_name=config.gemini_model,
            api_key=config.google_api_key,
            threshold=0.8,
            verbose_mode=True,
        )

        # Sample Korean question and answer fed to the evaluator.
        # NOTE(review): these literals were restored from mis-decoded UTF-8;
        # confirm the wording against the original test data.
        input_text = "이번 달 우리 회사 전체 매출은 얼마야?"
        actual_output = "2025년 1월 삼광 Global 전체 매출은 335.4억원입니다."

        logger.info("Testing single case evaluation...")
        results = evaluator.evaluate_single_case(input_text, actual_output)

        detailed_results = results.get('detailed_results', [])
        if detailed_results:
            first_case = detailed_results[0]
            metrics = first_case.get('metrics', {})

            logger.info("Evaluation results:")
            for metric_name, metric_data in metrics.items():
                score = metric_data.get('score')
                passed = metric_data.get('passed')
                reason = metric_data.get('reason', '')

                # A missing score comes back as None, and None cannot be
                # formatted with ".4f"; report it as N/A instead of letting a
                # TypeError abort the whole run.
                score_text = "N/A" if score is None else f"{score:.4f}"
                logger.info(f" {metric_name}: {score_text} ({'PASS' if passed else 'FAIL'})")

                # A reason starting with "Mock" or "Fallback" (or no reason
                # at all) is treated as a placeholder score.
                if reason and not reason.startswith(('Mock', 'Fallback')):
                    logger.info(" ✓ Real DeepEval score extracted successfully!")
                else:
                    logger.warning(" ✗ Still using fallback/mock scores")

        return results

    except Exception as e:
        logger.error(f"Test failed: {e}")
        return None


if __name__ == "__main__":
    # test_single_evaluation() returns None when it caught and logged an
    # error; turn that into a non-zero exit status so a shell or CI job
    # running this script can detect the failure.
    sys.exit(0 if test_single_evaluation() is not None else 1)