from typing import Optional, List, Dict, Any

from pydantic import BaseModel, Field


# ---------------------------------------------------------------------------
# Request models for the RAG optimization API.
# ---------------------------------------------------------------------------


class OptimizeRequest(BaseModel):
    """
    ๐Ÿ”ง Explicit optimization request for RAG (Retrieval-Augmented Generation) pipelines.

    Args:
        docs_path (str, optional): ๐Ÿ“‚ Folder containing your documents for RAG
            optimization. Default: "data/docs"
        retriever (List[str], optional): ๐Ÿ” Retriever type(s) to use.
            Default: ['faiss']. Example: 'bm25', 'faiss', 'chroma'
        embedding_model (List[str], optional): ๐Ÿง  Embedding model(s) to use.
            Default: ['sentence-transformers/all-MiniLM-L6-v2']
        strategy (List[str], optional): ๐ŸŽฏ RAG strategy to apply.
            Default: ['fixed']. Options: 'fixed', 'token', 'sentence'
        chunk_sizes (List[int], optional): ๐Ÿ“ List of chunk sizes to evaluate.
            Default: [200, 400, 600]
        overlaps (List[int], optional): ๐Ÿ” List of overlap values to test.
            Default: [50, 100, 200]
        rerankers (List[str], optional): โš–๏ธ Rerankers to apply after retrieval.
            Default: ['mmr']
        search_type (str, optional): ๐Ÿ” Search method for parameter exploration.
            Default: 'grid'. Options: 'grid', 'random', 'bayesian'
        trials (int, optional): ๐Ÿงช Number of optimization trials. Default: 5
        metric (str, optional): ๐Ÿ“ˆ Metric to optimize. Default: 'faithfulness'
        validation_choice (str, optional): โœ… Source of validation data.
            Default: 'generate'. Options: blank (use default), 'generate',
            local path, HF dataset ID
        llm_model (str, optional): ๐Ÿค– LLM used for QA generation if
            validation_choice='generate'. Default: 'gemini-2.5-flash-lite'
    """

    docs_path: Optional[str] = Field(
        default="data/docs",
        description="๐Ÿ“‚ Folder containing your documents for RAG optimization. Example: 'data/docs'",
    )
    retriever: Optional[List[str]] = Field(
        default=["faiss"],
        description="๐Ÿ” Retriever type to use. Example: 'bm25', 'faiss', 'chroma'",
    )
    embedding_model: Optional[List[str]] = Field(
        default=["sentence-transformers/all-MiniLM-L6-v2"],
        description="๐Ÿง  Embedding model name or path. Example: 'sentence-transformers/all-MiniLM-L6-v2'",
    )
    strategy: Optional[List[str]] = Field(
        default=["fixed"],
        description="๐ŸŽฏ RAG strategy name. Example: 'fixed', 'token', 'sentence'",
    )
    chunk_sizes: Optional[List[int]] = Field(
        default=[200, 400, 600],
        description="๐Ÿ“ List of chunk sizes to evaluate. Example: [200, 400, 600]",
    )
    overlaps: Optional[List[int]] = Field(
        default=[50, 100, 200],
        description="๐Ÿ” List of overlap values to test. Example: [50, 100, 200]",
    )
    rerankers: Optional[List[str]] = Field(
        default=["mmr"],
        description="โš–๏ธ Rerankers to apply after retrieval. Default: ['mmr']",
    )
    search_type: Optional[str] = Field(
        default="grid",
        description="๐Ÿ” Search method to explore parameter space. Options: 'grid', 'random', 'bayesian'",
    )
    trials: Optional[int] = Field(
        default=5,
        description="๐Ÿงช Number of optimization trials to run.",
    )
    metric: Optional[str] = Field(
        default="faithfulness",
        description="๐Ÿ“ˆ Evaluation metric for optimization. Options: 'faithfulness'",
    )
    validation_choice: Optional[str] = Field(
        default="generate",
        description=(
            "โœ… Validation data source. Options:\n"
            "   - Leave blank โ†’ use default 'validation_qa.json' if available\n"
            "   - 'generate' โ†’ auto-generate a validation QA file from your docs\n"
            "   - Path to a local JSON file (e.g. 'data/validation_qa.json')\n"
            "   - Hugging Face dataset ID (e.g. 'squad')"
        ),
    )
    llm_model: Optional[str] = Field(
        default="gemini-2.5-flash-lite",
        description="๐Ÿค– LLM used to generate QA dataset when validation_choice='generate'. Example: 'gemini-pro', 'gpt-4o-mini'",
    )


class AutotuneRequest(BaseModel):
    """
    โšก Automatically tunes RAG pipeline parameters based on document analysis.

    Args:
        docs_path (str, optional): ๐Ÿ“‚ Folder containing documents for RAG
            optimization. Default: "data/docs"
        embedding_model (str, optional): ๐Ÿง  Embedding model to analyze.
            Default: 'sentence-transformers/all-MiniLM-L6-v2'
        num_chunk_pairs (int, optional): ๐Ÿ”ข Number of chunk pairs to analyze.
            Default: 5
        metric (str, optional): ๐Ÿ“ˆ Metric to optimize. Default: 'faithfulness'
        search_type (str, optional): ๐Ÿ” Search method for parameter exploration.
            Default: 'grid'. Options: 'grid', 'random', 'bayesian'
        trials (int, optional): ๐Ÿงช Number of optimization trials. Default: 5
        validation_choice (str, optional): โœ… Source of validation data.
            Default: 'generate'. Options: blank, 'generate', local path,
            HF dataset ID
        llm_model (str, optional): ๐Ÿค– LLM used for QA generation if
            validation_choice='generate'. Default: 'gemini-2.5-flash-lite'
    """

    docs_path: Optional[str] = Field(
        default="data/docs",
        description="๐Ÿ“‚ Folder containing your documents for RAG optimization. Example: 'data/docs'",
    )
    embedding_model: Optional[str] = Field(
        default="sentence-transformers/all-MiniLM-L6-v2",
        description="๐Ÿง  Embedding model name or path. Example: 'sentence-transformers/all-MiniLM-L6-v2'",
    )
    num_chunk_pairs: Optional[int] = Field(
        default=5,
        description="๐Ÿ”ข Number of chunk pairs to analyze for tuning.",
    )
    metric: Optional[str] = Field(
        default="faithfulness",
        description="๐Ÿ“ˆ Evaluation metric for optimization. Options: 'faithfulness'",
    )
    search_type: Optional[str] = Field(
        default="grid",
        description="๐Ÿ” Search method to explore parameter space. Options: 'grid', 'random', 'bayesian'",
    )
    trials: Optional[int] = Field(
        default=5,
        description="๐Ÿงช Number of optimization trials to run.",
    )
    # NOTE: the default-file name below previously read 'validation_qa.jsonl'
    # here while OptimizeRequest said 'validation_qa.json'; normalized to
    # '.json' to match the local-path example used by both models.
    validation_choice: Optional[str] = Field(
        default="generate",
        description=(
            "โœ… Validation data source. Options:\n"
            "   - Leave blank โ†’ use default 'validation_qa.json' if available\n"
            "   - 'generate' โ†’ auto-generate a validation QA file from your docs\n"
            "   - Path to a local JSON file (e.g. 'data/validation_qa.json')\n"
            "   - Hugging Face dataset ID (e.g. 'squad')"
        ),
    )
    llm_model: Optional[str] = Field(
        default="gemini-2.5-flash-lite",
        description="๐Ÿค– LLM used to generate QA dataset when validation_choice='generate'. Example: 'gemini-pro', 'gpt-4o-mini'",
    )


class QARequest(BaseModel):
    """
    ๐Ÿงฉ Generate a validation QA dataset from documents for RAG evaluation.

    Args:
        docs_path (str): ๐Ÿ“‚ Folder containing documents. Default: 'data/docs'
        llm_model (str): ๐Ÿค– LLM model used for question generation.
            Default: 'gemini-2.5-flash-lite'
        batch_size (int): ๐Ÿ“ฆ Number of documents per batch. Default: 5
        min_q (int): โ“ Minimum number of questions per document. Default: 3
        max_q (int): โ“ Maximum number of questions per document. Default: 25
    """

    docs_path: str = Field(
        default="data/docs",
        description="๐Ÿ“‚ Folder containing your documents to generate QA pairs from. Example: 'data/docs'",
    )
    llm_model: str = Field(
        default="gemini-2.5-flash-lite",
        description="๐Ÿค– LLM model used for question generation. Example: 'gemini-2.5-flash-lite', 'gpt-4o-mini'",
    )
    batch_size: int = Field(
        default=5,
        description="๐Ÿ“ฆ Number of documents processed per generation batch.",
    )
    min_q: int = Field(
        default=3,
        description="โ“ Minimum number of questions per document.",
    )
    max_q: int = Field(
        default=25,
        description="โ“ Maximum number of questions per document.",
    )