import logging
from pathlib import Path
from typing import List, Optional

import pandas as pd
from deepeval.dataset import EvaluationDataset
from deepeval.test_case import LLMTestCase

logger = logging.getLogger(__name__)

class DatasetLoader:
    """Handle loading and processing of evaluation datasets."""

    def __init__(self):
        self.dataset = EvaluationDataset()

    def load_from_csv(self,
                      file_path: str,
                      input_col: str = "input",
                      output_col: str = "expected_output",
                      context_col: Optional[str] = None) -> EvaluationDataset:
        """
        Load dataset from a CSV file with comprehensive logging.

        Args:
            file_path: Path to the CSV file
            input_col: Column name for input questions
            output_col: Column name for expected outputs
            context_col: Optional column name for context

        Returns:
            EvaluationDataset: Loaded dataset
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                logger.error(f"Dataset file not found: {file_path}")
                raise FileNotFoundError(f"Dataset file not found: {file_path}")

            logger.info(f"Loading dataset from: {file_path}")

            # Read CSV file
            df = pd.read_csv(file_path)
            logger.info(f"CSV file loaded successfully. Shape: {df.shape}")

            # Validate required columns
            required_cols = [input_col, output_col]
            missing_cols = [col for col in required_cols if col not in df.columns]
            if missing_cols:
                logger.error(f"Missing required columns: {missing_cols}")
                logger.error(f"Available columns: {list(df.columns)}")
                raise ValueError(f"Missing required columns: {missing_cols}")

            # Log column information
            logger.info(f"Dataset columns: {list(df.columns)}")
            logger.info(f"Input column: {input_col}")
            logger.info(f"Output column: {output_col}")
            if context_col:
                logger.info(f"Context column: {context_col}")

            # Clean and validate data
            df = self._clean_data(df, input_col, output_col)

            # Load test cases from the cleaned dataframe so rows removed during
            # cleaning are not re-read from the raw CSV file
            for _, row in df.iterrows():
                self.dataset.add_test_case(
                    LLMTestCase(
                        input=str(row[input_col]),
                        actual_output=str(row[output_col]),
                        context=[str(row[context_col])] if context_col else None,
                    )
                )

            logger.info(f"Successfully loaded {len(self.dataset.test_cases)} test cases")

            # Log sample data
            self._log_sample_data(df, input_col, output_col)

            return self.dataset

        except Exception as e:
            logger.error(f"Error loading dataset: {e}")
            raise

    def _clean_data(self, df: pd.DataFrame, input_col: str, output_col: str) -> pd.DataFrame:
        """Clean and validate dataset."""
        logger.info("Cleaning dataset...")
        initial_count = len(df)

        # Remove rows with missing values in required columns
        df = df.dropna(subset=[input_col, output_col])

        # Remove empty strings
        df = df[df[input_col].str.strip() != '']
        df = df[df[output_col].str.strip() != '']

        final_count = len(df)
        removed_count = initial_count - final_count
        if removed_count > 0:
            logger.warning(f"Removed {removed_count} invalid rows during cleaning")

        logger.info(f"Dataset cleaned. Final count: {final_count} rows")
        return df

    def _log_sample_data(self, df: pd.DataFrame, input_col: str, output_col: str) -> None:
        """Log sample data for verification."""
        logger.info("Sample data from dataset:")
        # Use enumerate so sample numbering stays 1..3 even when cleaning left
        # non-contiguous dataframe indices
        for i, (_, row) in enumerate(df.head(3).iterrows(), start=1):
            logger.info(f"Sample {i}:")
            logger.info(f"  Input: {row[input_col][:100]}...")
            logger.info(f"  Output: {row[output_col][:100]}...")

    def get_dataset_stats(self) -> dict:
        """Get dataset statistics."""
        if not self.dataset.test_cases:
            return {"total_cases": 0}

        stats = {
            "total_cases": len(self.dataset.test_cases),
            "avg_input_length": sum(len(case.input) for case in self.dataset.test_cases) / len(self.dataset.test_cases),
            "avg_output_length": sum(len(case.actual_output or "") for case in self.dataset.test_cases) / len(self.dataset.test_cases)
        }

        logger.info(f"Dataset statistics: {stats}")
        return stats
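

# --- Example usage (illustrative sketch) ---
# Assumes a hypothetical "eval_data.csv" with "input" and "expected_output"
# columns; adjust the path and column names to match your dataset.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    loader = DatasetLoader()
    dataset = loader.load_from_csv("eval_data.csv")  # hypothetical file path
    loader.get_dataset_stats()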