Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| """ | |
| Instruction following test suite. | |
| Tests the model's ability to follow simple and complex instructions. | |
| """ | |
| import json | |
| from typing import List, Dict, Any | |
| from testing.core.base_tester import BaseTester, TestCase | |
class InstructionTester(BaseTester):
    """Test instruction following capabilities.

    Supplies a fixed set of insurance-regulation prompts and a keyword-based
    heuristic that decides whether a model response addresses each prompt.
    """

    def load_test_cases(self) -> List[TestCase]:
        """Load instruction following test cases.

        Returns:
            Five test cases (``simple_qa``, ``complex_instruction``,
            ``step_by_step``, ``comparison``, ``definition``). Each one
            expects a ``"response"`` key and sets its own ``max_tokens``.
        """
        return [
            TestCase(
                name="simple_qa",
                prompt="What is SFCR in European insurance regulation?",
                expected_keys=["response"],
                max_tokens=100,
            ),
            TestCase(
                name="complex_instruction",
                prompt="Explain the key components of Solvency II framework and how they differ from Basel III requirements.",
                expected_keys=["response"],
                max_tokens=200,
            ),
            TestCase(
                name="step_by_step",
                prompt="List the main steps an insurance company must follow to calculate their Solvency Capital Requirement (SCR).",
                expected_keys=["response"],
                max_tokens=150,
            ),
            TestCase(
                name="comparison",
                prompt="Compare the risk-based capital requirements between European Solvency II and US RBC frameworks.",
                expected_keys=["response"],
                max_tokens=180,
            ),
            TestCase(
                name="definition",
                prompt="Define 'Own Risk and Solvency Assessment' (ORSA) and explain its purpose in insurance regulation.",
                expected_keys=["response"],
                max_tokens=120,
            ),
        ]

    @staticmethod
    def _has_step_markers(text: str) -> bool:
        """Return True if *text* looks like a numbered or bulleted list.

        Accepted evidence:
          * the word "step" in any letter case,
          * a "•" bullet anywhere,
          * a line whose first token is "-" or "*" (a list bullet, as opposed
            to a hyphen inside a word such as "risk-based"),
          * a standalone token made of one or two digits followed by "." or
            ")" (e.g. "1." or "2)"), which excludes decimals such as "1.5".
        """
        if "step" in text.lower() or "•" in text:
            return True

        for line in text.splitlines():
            tokens = line.split()
            if not tokens:
                continue
            if tokens[0] in ("-", "*"):
                return True
            for token in tokens:
                if (
                    len(token) <= 3
                    and token[-1] in ".)"
                    and token[:-1].isdigit()
                ):
                    return True
        return False

    def validate_response(self, response: Dict[str, Any], test_case: TestCase) -> bool:
        """Validate instruction following response.

        Args:
            response: Model output; must contain a ``"response"`` entry.
            test_case: The test case being checked; only ``test_case.name``
                is read to pick the keyword heuristic.

        Returns:
            ``False`` when the ``"response"`` entry is missing, is not a
            string, or has fewer than 10 non-whitespace-trimmed characters;
            otherwise the result of the heuristic for the named test case.
            Test cases with an unrecognised name pass by default. Any
            unexpected error is printed and reported as ``False``.
        """
        try:
            if "response" not in response:
                return False

            response_text = response["response"]

            # Reject non-string payloads explicitly rather than relying on an
            # AttributeError from .strip() being swallowed below.
            if not isinstance(response_text, str):
                return False
            if len(response_text.strip()) < 10:
                return False

            # All keyword checks are case-insensitive; lower-case once.
            lowered = response_text.lower()
            name = test_case.name

            if name == "simple_qa":
                # Should contain SFCR-related content.
                return any(
                    keyword in lowered
                    for keyword in ("sfcr", "solvency", "capital", "requirement")
                )

            if name == "complex_instruction":
                # Should mention both Solvency II and Basel III.
                return "solvency ii" in lowered and "basel iii" in lowered

            if name == "step_by_step":
                # Should contain numbered or bulleted steps.
                return self._has_step_markers(response_text)

            if name == "comparison":
                # Should contain comparison words.
                return any(
                    word in lowered
                    for word in (
                        "compare", "difference", "versus", "vs", "unlike", "similar",
                    )
                )

            if name == "definition":
                # Should define ORSA.
                return "orsa" in lowered or "own risk" in lowered

            # Default validation: no specific heuristic for this test case.
            return True
        except Exception as e:
            # Keep the original contract: validation never raises; an
            # unexpected failure is reported and counted as a failed check.
            print(f"Validation error: {e}")
            return False