jeanbaptdzd's picture
feat: Clean deployment to HuggingFace Space with model config test endpoint
8c0b652
#!/usr/bin/env python3
"""
Instruction following test suite.
Tests the model's ability to follow simple and complex instructions.
"""
import json
from typing import List, Dict, Any
from testing.core.base_tester import BaseTester, TestCase
class InstructionTester(BaseTester):
    """Test instruction following capabilities.

    Supplies a fixed set of insurance-regulation prompts and judges each
    model response with lightweight keyword heuristics.
    NOTE(review): presumably ``BaseTester`` drives ``load_test_cases`` and
    ``validate_response`` -- confirm against ``testing.core.base_tester``.
    """

    def load_test_cases(self) -> List[TestCase]:
        """Load instruction following test cases.

        Returns:
            Five ``TestCase`` objects (``simple_qa``, ``complex_instruction``,
            ``step_by_step``, ``comparison``, ``definition``). Each one
            expects a ``"response"`` key and carries its own ``max_tokens``.
        """
        return [
            TestCase(
                name="simple_qa",
                prompt="What is SFCR in European insurance regulation?",
                expected_keys=["response"],
                max_tokens=100,
            ),
            TestCase(
                name="complex_instruction",
                prompt="Explain the key components of Solvency II framework and how they differ from Basel III requirements.",
                expected_keys=["response"],
                max_tokens=200,
            ),
            TestCase(
                name="step_by_step",
                prompt="List the main steps an insurance company must follow to calculate their Solvency Capital Requirement (SCR).",
                expected_keys=["response"],
                max_tokens=150,
            ),
            TestCase(
                name="comparison",
                prompt="Compare the risk-based capital requirements between European Solvency II and US RBC frameworks.",
                expected_keys=["response"],
                max_tokens=180,
            ),
            TestCase(
                name="definition",
                prompt="Define 'Own Risk and Solvency Assessment' (ORSA) and explain its purpose in insurance regulation.",
                expected_keys=["response"],
                max_tokens=120,
            ),
        ]

    def validate_response(self, response: Dict[str, Any], test_case: TestCase) -> bool:
        """Validate instruction following response.

        Args:
            response: Mapping expected to hold the model output under the
                ``"response"`` key.
            test_case: The test case that produced ``response``; its ``name``
                selects which keyword heuristic is applied.

        Returns:
            ``True`` when the response text is a string of at least 10
            non-whitespace-padded characters and satisfies the heuristic for
            ``test_case.name`` (unknown names pass by default); ``False``
            otherwise, including when any unexpected error occurs.
        """
        try:
            # Check if response exists
            if "response" not in response:
                return False

            response_text = response["response"]

            # Reject non-string payloads explicitly instead of relying on an
            # AttributeError/TypeError being swallowed further down.
            if not isinstance(response_text, str):
                return False

            # Basic validation: empty or near-empty answers never pass.
            if len(response_text.strip()) < 10:
                return False

            # Every keyword comparison below is case-insensitive.
            lowered = response_text.lower()
            case_name = test_case.name

            if case_name == "simple_qa":
                # Should contain SFCR-related content
                return any(
                    keyword in lowered
                    for keyword in ("sfcr", "solvency", "capital", "requirement")
                )

            if case_name == "complex_instruction":
                # Should mention both Solvency II and Basel III
                return "solvency ii" in lowered and "basel iii" in lowered

            if case_name == "step_by_step":
                # Should contain numbered or bulleted steps. "step" is matched
                # case-insensitively so "Step 1:" style answers are accepted.
                if any(
                    marker in lowered
                    for marker in ("1.", "2.", "3.", "•", "step")
                ):
                    return True
                # A hyphen only counts as a bullet when it opens a line;
                # otherwise hyphenated words such as "risk-based" would make
                # this check pass for almost any prose.
                return any(
                    line.lstrip().startswith("-")
                    for line in response_text.splitlines()
                )

            if case_name == "comparison":
                # Should contain comparison words
                return any(
                    word in lowered
                    for word in (
                        "compare",
                        "difference",
                        "versus",
                        "vs",
                        "unlike",
                        "similar",
                    )
                )

            if case_name == "definition":
                # Should define ORSA
                return "orsa" in lowered or "own risk" in lowered

            # Default validation: unknown test names only need non-trivial text.
            return True
        except Exception as e:
            # Validation is a best-effort boundary: a malformed response or
            # test case must count as a failure, not abort the caller.
            print(f"Validation error: {e}")
            return False