File size: 4,840 Bytes
7dfe46c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128

import pandas as pd
from pathlib import Path
from deepeval.dataset import EvaluationDataset
from deepeval.test_case import LLMTestCase
import logging
from typing import List, Optional

logger = logging.getLogger(__name__)

class DatasetLoader:
    """Handle loading and processing of evaluation datasets."""
    
    def __init__(self):
        self.dataset = EvaluationDataset()
    
    def load_from_csv(self, 
                     file_path: str,
                     input_col: str = "input",
                     output_col: str = "expected_output",
                     context_col: Optional[str] = None) -> EvaluationDataset:
        """
        Load dataset from CSV file with comprehensive logging.
        
        Args:
            file_path: Path to the CSV file
            input_col: Column name for input questions
            output_col: Column name for expected outputs
            context_col: Optional column name for context
            
        Returns:
            EvaluationDataset: Loaded dataset
        """
        try:
            file_path = Path(file_path)
            
            if not file_path.exists():
                logger.error(f"Dataset file not found: {file_path}")
                raise FileNotFoundError(f"Dataset file not found: {file_path}")
            
            logger.info(f"Loading dataset from: {file_path}")
            
            # Read CSV file
            df = pd.read_csv(file_path)
            logger.info(f"CSV file loaded successfully. Shape: {df.shape}")
            
            # Validate required columns
            required_cols = [input_col, output_col]
            missing_cols = [col for col in required_cols if col not in df.columns]
            
            if missing_cols:
                logger.error(f"Missing required columns: {missing_cols}")
                logger.error(f"Available columns: {list(df.columns)}")
                raise ValueError(f"Missing required columns: {missing_cols}")
            
            # Log column information
            logger.info(f"Dataset columns: {list(df.columns)}")
            logger.info(f"Input column: {input_col}")
            logger.info(f"Output column: {output_col}")
            if context_col:
                logger.info(f"Context column: {context_col}")
            
            # Clean and validate data
            df = self._clean_data(df, input_col, output_col)
            
            # Load test cases
            self.dataset.add_test_cases_from_csv_file(
                file_path=str(file_path),
                input_col_name=input_col,
                actual_output_col_name=output_col,
            )
            
            logger.info(f"Successfully loaded {len(self.dataset.test_cases)} test cases")
            
            # Log sample data
            self._log_sample_data(df, input_col, output_col)
            
            return self.dataset
            
        except Exception as e:
            logger.error(f"Error loading dataset: {e}")
            raise
    
    def _clean_data(self, df: pd.DataFrame, input_col: str, output_col: str) -> pd.DataFrame:
        """Clean and validate dataset."""
        logger.info("Cleaning dataset...")
        
        initial_count = len(df)
        
        # Remove rows with missing values in required columns
        df = df.dropna(subset=[input_col, output_col])
        
        # Remove empty strings
        df = df[df[input_col].str.strip() != '']
        df = df[df[output_col].str.strip() != '']
        
        final_count = len(df)
        removed_count = initial_count - final_count
        
        if removed_count > 0:
            logger.warning(f"Removed {removed_count} invalid rows during cleaning")
        
        logger.info(f"Dataset cleaned. Final count: {final_count} rows")
        
        return df
    
    def _log_sample_data(self, df: pd.DataFrame, input_col: str, output_col: str) -> None:
        """Log sample data for verification."""
        logger.info("Sample data from dataset:")
        
        for i, row in df.head(3).iterrows():
            logger.info(f"Sample {i+1}:")
            logger.info(f"  Input: {row[input_col][:100]}...")
            logger.info(f"  Output: {row[output_col][:100]}...")
    
    def get_dataset_stats(self) -> dict:
        """Get dataset statistics."""
        if not self.dataset.test_cases:
            return {"total_cases": 0}
        
        stats = {
            "total_cases": len(self.dataset.test_cases),
            "avg_input_length": sum(len(case.input) for case in self.dataset.test_cases) / len(self.dataset.test_cases),
            "avg_output_length": sum(len(case.actual_output or "") for case in self.dataset.test_cases) / len(self.dataset.test_cases)
        }
        
        logger.info(f"Dataset statistics: {stats}")
        return stats