Spaces:

jeanbaptdzd
/

dragonllm-finance-models

Runtime error

App Files Files Community

dragonllm-finance-models / testing /suites /instruction_test.py

jeanbaptdzd

feat: Clean deployment to HuggingFace Space with model config test endpoint

8c0b652 2 months ago

raw

history blame contribute delete

3.76 kB

	#!/usr/bin/env python3
	"""
	Instruction following test suite.
	Tests the model's ability to follow simple and complex instructions.
	"""

	import json
	from typing import List, Dict, Any
	from testing.core.base_tester import BaseTester, TestCase

	class InstructionTester(BaseTester):
	    """Test instruction following capabilities.

	    Provides a fixed list of insurance/banking regulation prompts and a
	    keyword-based heuristic that decides whether a response to each prompt
	    looks acceptable.
	    """

	    def load_test_cases(self) -> List[TestCase]:
	        """Load instruction following test cases.

	        Returns:
	            Five ``TestCase`` objects (``simple_qa``, ``complex_instruction``,
	            ``step_by_step``, ``comparison``, ``definition``), each expecting
	            a ``"response"`` key and carrying its own ``max_tokens`` budget.
	        """
	        return [
	            TestCase(
	                name="simple_qa",
	                prompt="What is SFCR in European insurance regulation?",
	                expected_keys=["response"],
	                max_tokens=100,
	            ),
	            TestCase(
	                name="complex_instruction",
	                prompt="Explain the key components of Solvency II framework and how they differ from Basel III requirements.",
	                expected_keys=["response"],
	                max_tokens=200,
	            ),
	            TestCase(
	                name="step_by_step",
	                prompt="List the main steps an insurance company must follow to calculate their Solvency Capital Requirement (SCR).",
	                expected_keys=["response"],
	                max_tokens=150,
	            ),
	            TestCase(
	                name="comparison",
	                prompt="Compare the risk-based capital requirements between European Solvency II and US RBC frameworks.",
	                expected_keys=["response"],
	                max_tokens=180,
	            ),
	            TestCase(
	                name="definition",
	                prompt="Define 'Own Risk and Solvency Assessment' (ORSA) and explain its purpose in insurance regulation.",
	                expected_keys=["response"],
	                max_tokens=120,
	            ),
	        ]

	    def validate_response(self, response: Dict[str, Any], test_case: TestCase) -> bool:
	        """Validate instruction following response.

	        Args:
	            response: Mapping expected to hold the generated text under the
	                ``"response"`` key.
	            test_case: The test case the response belongs to; only its
	                ``name`` is used to pick the keyword check.

	        Returns:
	            ``False`` when the ``"response"`` key is missing, the text is not
	            a string, or it has fewer than 10 characters after stripping.
	            Otherwise the result of the keyword check for the named test
	            case, or ``True`` for a test case name without a dedicated check.
	            Any unexpected exception is reported on stdout and yields
	            ``False``.
	        """
	        try:
	            # Check if response exists
	            if "response" not in response:
	                return False

	            response_text = response["response"]

	            # Keyword matching only makes sense on text; reject other
	            # payload types explicitly instead of relying on an exception.
	            if not isinstance(response_text, str):
	                return False

	            # Basic validation: empty or near-empty answers never pass
	            if len(response_text.strip()) < 10:
	                return False

	            # All keyword checks are case-insensitive, so lowercase once.
	            lowered = response_text.lower()
	            case_name = test_case.name

	            if case_name == "simple_qa":
	                # Should contain SFCR-related content
	                return any(
	                    keyword in lowered
	                    for keyword in ["sfcr", "solvency", "capital", "requirement"]
	                )

	            if case_name == "complex_instruction":
	                # Should mention both Solvency II and Basel III
	                return "solvency ii" in lowered and "basel iii" in lowered

	            if case_name == "step_by_step":
	                # Should contain numbered or bulleted steps. Matching on the
	                # lowercased text lets "Step 1" / "STEP" count as well.
	                # NOTE(review): "-" also matches hyphens inside ordinary
	                # words, so this check is deliberately permissive.
	                return any(
	                    indicator in lowered
	                    for indicator in ["1.", "2.", "3.", "•", "-", "step"]
	                )

	            if case_name == "comparison":
	                # Should contain comparison words
	                return any(
	                    word in lowered
	                    for word in ["compare", "difference", "versus", "vs", "unlike", "similar"]
	                )

	            if case_name == "definition":
	                # Should define ORSA
	                return "orsa" in lowered or "own risk" in lowered

	            # Default validation
	            return True

	        except Exception as e:
	            # Best-effort boundary: a malformed response or test case counts
	            # as a failed validation rather than aborting the caller.
	            print(f"Validation error: {e}")
	            return False