File size: 12,833 Bytes
3aa91e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
"""Analysis agent for statistical analysis using Modal code execution."""

import re
from collections.abc import AsyncIterable
from typing import TYPE_CHECKING, Any

from agent_framework import (
    AgentRunResponse,
    AgentRunResponseUpdate,
    AgentThread,
    BaseAgent,
    ChatMessage,
    Role,
)
from pydantic import BaseModel, Field
from pydantic_ai import Agent

from src.agent_factory.judges import get_model
from src.tools.code_execution import CodeExecutionError, get_code_executor
from src.utils.models import Evidence

if TYPE_CHECKING:
    from src.services.embeddings import EmbeddingService


class AnalysisResult(BaseModel):
    """Result of statistical analysis.

    Produced by AnalysisAgent after generating, executing, and interpreting
    analysis code; serialized into the shared evidence store via model_dump().
    """

    # One of the three literal verdict strings; not enforced by a validator,
    # so callers should treat any other value as INCONCLUSIVE.
    verdict: str = Field(
        description="SUPPORTED, REFUTED, or INCONCLUSIVE",
    )
    confidence: float = Field(ge=0.0, le=1.0, description="Confidence in verdict (0-1)")
    statistical_evidence: str = Field(
        description="Summary of statistical findings from code execution"
    )
    code_generated: str = Field(description="Python code that was executed")
    execution_output: str = Field(description="Output from code execution")
    key_findings: list[str] = Field(default_factory=list, description="Key takeaways from analysis")
    limitations: list[str] = Field(default_factory=list, description="Limitations of the analysis")


class AnalysisAgent(BaseAgent):  # type: ignore[misc]
    """Performs statistical analysis using Modal code execution.

    This agent:
    1. Retrieves relevant evidence using RAG (if available)
    2. Generates Python code for statistical analysis
    3. Executes code in Modal sandbox
    4. Interprets results
    5. Returns verdict (SUPPORTED/REFUTED/INCONCLUSIVE)
    """

    def __init__(
        self,
        evidence_store: dict[str, Any],
        embedding_service: "EmbeddingService | None" = None,
    ) -> None:
        """Initialize the agent.

        Args:
            evidence_store: Shared mutable context; reads "hypotheses" and
                "current", writes "analysis".
            embedding_service: Optional RAG service for evidence retrieval.
        """
        super().__init__(
            name="AnalysisAgent",
            description="Performs statistical analysis of evidence using secure code execution",
        )
        self._evidence_store = evidence_store
        self._embeddings = embedding_service
        self._code_executor = get_code_executor()
        self._agent: Agent[None, str] | None = None  # LLM for code generation

    def _get_agent(self) -> Agent[None, str]:
        """Lazy initialization of LLM agent."""
        if self._agent is None:
            self._agent = Agent(
                model=get_model(),
                output_type=str,  # Returns code as string
                system_prompt=self._get_system_prompt(),
            )
        return self._agent

    def _get_system_prompt(self) -> str:
        """System prompt for code generation."""
        return """You are a biomedical data scientist specializing in statistical analysis.

Your task: Generate Python code to analyze research evidence and test hypotheses.

Guidelines:
1. Use pandas, numpy, scipy.stats for analysis
2. Generate code that prints clear, interpretable results
3. Include statistical tests (t-tests, chi-square, meta-analysis, etc.)
4. Calculate effect sizes and confidence intervals
5. Print summary statistics and test results
6. Keep code concise (<50 lines)
7. Set a variable called 'result' with final verdict

Available libraries:
- pandas==2.2.0
- numpy==1.26.4
- scipy==1.11.4
- matplotlib==3.8.2
- scikit-learn==1.4.0
- statsmodels==0.14.1

Output format:
Return ONLY executable Python code, no explanations or markdown.
"""

    async def run(
        self,
        messages: str | ChatMessage | list[str] | list[ChatMessage] | None = None,
        *,
        thread: AgentThread | None = None,
        **kwargs: Any,
    ) -> AgentRunResponse:
        """Analyze evidence and return verdict.

        Requires prior agents to have populated "hypotheses" and "current"
        in the shared evidence store; returns an error response otherwise.
        """
        # Extract query and hypothesis
        query = self._extract_query(messages)
        hypotheses = self._evidence_store.get("hypotheses", [])
        evidence = self._evidence_store.get("current", [])

        if not hypotheses:
            return self._error_response("No hypotheses available. Run HypothesisAgent first.")

        if not evidence:
            return self._error_response("No evidence available. Run SearchAgent first.")

        # Get primary hypothesis (list is non-empty; guard against a falsy entry)
        primary = hypotheses[0]
        if not primary:
            return self._error_response("No primary hypothesis found.")

        # Retrieve relevant evidence using RAG (if available)
        relevant_evidence = await self._retrieve_relevant_evidence(primary, evidence)

        # Generate analysis code
        code_prompt = self._create_code_generation_prompt(query, primary, relevant_evidence)

        try:
            # Generate code using LLM
            agent = self._get_agent()
            code_result = await agent.run(code_prompt)
            generated_code = code_result.output

            # Execute code in Modal sandbox
            execution_result = self._code_executor.execute(generated_code, timeout=120)

            if not execution_result["success"]:
                return self._error_response(f"Code execution failed: {execution_result['error']}")

            # Interpret results
            analysis_result = await self._interpret_results(
                query, primary, generated_code, execution_result
            )

            # Store analysis in shared context
            self._evidence_store["analysis"] = analysis_result.model_dump()

            # Format response
            response_text = self._format_response(analysis_result)

            return AgentRunResponse(
                messages=[ChatMessage(role=Role.ASSISTANT, text=response_text)],
                response_id=f"analysis-{analysis_result.verdict.lower()}",
                additional_properties={"analysis": analysis_result.model_dump()},
            )

        except CodeExecutionError as e:
            return self._error_response(f"Analysis failed: {e}")
        except Exception as e:
            return self._error_response(f"Unexpected error: {e}")

    async def _retrieve_relevant_evidence(
        self, hypothesis: Any, all_evidence: list[Evidence]
    ) -> list[Evidence]:
        """Retrieve most relevant evidence using RAG (if available)."""
        if not self._embeddings:
            # No RAG available, return top N evidence
            return all_evidence[:10]

        # Use embeddings to find relevant evidence
        # TODO: Implement semantic search with embeddings service
        # For now, just return all evidence
        return all_evidence[:10]

    def _create_code_generation_prompt(
        self, query: str, hypothesis: Any, evidence: list[Evidence]
    ) -> str:
        """Create prompt for code generation.

        NOTE(review): assumes the hypothesis object exposes drug/target/
        pathway/effect/confidence attributes — confirm against HypothesisAgent.
        """
        # Extract data from evidence
        evidence_summary = self._summarize_evidence(evidence)

        prompt = f"""Generate Python code to statistically analyze the following hypothesis:

**Original Question**: {query}

**Hypothesis**: {hypothesis.drug} β†’ {hypothesis.target} β†’ {hypothesis.pathway} β†’ {hypothesis.effect}
**Confidence**: {hypothesis.confidence:.0%}

**Evidence Summary**:
{evidence_summary}

**Task**:
1. Parse the evidence data
2. Perform appropriate statistical tests
3. Calculate effect sizes and confidence intervals
4. Determine verdict: SUPPORTED, REFUTED, or INCONCLUSIVE
5. Set result variable to verdict string

Generate executable Python code only (no markdown, no explanations).
"""
        return prompt

    def _summarize_evidence(self, evidence: list[Evidence]) -> str:
        """Summarize evidence for code generation prompt."""
        if not evidence:
            return "No evidence available."

        lines = []
        for i, ev in enumerate(evidence[:5], 1):  # Top 5 most relevant
            lines.append(f"{i}. {ev.content[:200]}...")
            lines.append(f"   Source: {ev.citation.title}")
            lines.append(f"   Relevance: {ev.relevance:.0%}\n")

        return "\n".join(lines)

    async def _interpret_results(
        self,
        query: str,
        hypothesis: Any,
        code: str,
        execution_result: dict[str, Any],
    ) -> AnalysisResult:
        """Interpret code execution results using LLM."""
        # Extract verdict from output
        stdout = execution_result["stdout"]
        upper = stdout.upper()
        verdict = "INCONCLUSIVE"  # Default

        # Word-boundary matching so "UNSUPPORTED" / "NOT SUPPORTED" are not
        # misread as a SUPPORTED verdict (a plain substring check was).
        if re.search(r"(?<![A-Z])SUPPORTED\b", upper) and not re.search(
            r"\bNOT\s+SUPPORTED\b", upper
        ):
            verdict = "SUPPORTED"
        elif re.search(r"(?<![A-Z])REFUTED\b", upper):
            verdict = "REFUTED"
        elif "INCONCLUSIVE" in upper:
            verdict = "INCONCLUSIVE"

        # Parse key findings from output
        key_findings = self._extract_findings(stdout)

        # Calculate confidence based on statistical significance
        confidence = self._calculate_confidence(stdout)

        return AnalysisResult(
            verdict=verdict,
            confidence=confidence,
            statistical_evidence=stdout.strip(),
            code_generated=code,
            execution_output=stdout,
            key_findings=key_findings,
            limitations=[
                "Analysis based on summary data only",
                "Limited to available evidence",
                "Statistical tests assume data independence",
            ],
        )

    def _extract_findings(self, output: str) -> list[str]:
        """Extract key findings from code output."""
        findings = []

        # Look for common statistical patterns
        lines = output.split("\n")
        for line in lines:
            line_lower = line.lower()
            if any(
                keyword in line_lower
                for keyword in ["p-value", "significant", "effect size", "correlation", "mean"]
            ):
                findings.append(line.strip())

        return findings[:5]  # Top 5 findings

    def _calculate_confidence(self, output: str) -> float:
        """Calculate confidence based on statistical results.

        Scans the output for reported p-values and maps the smallest one onto
        a coarse confidence scale; falls back to a medium default when no
        p-value is found or none parses as a float.
        """
        # Match "p-value: 0.01", "p = 0.03", "p < .001", "p-value: 1e-4", etc.
        # (the old pattern required the literal "value" and missed "p =" forms)
        p_values = re.findall(
            r"\bp[-\s]?(?:value)?\s*[:=<]\s*(\d*\.?\d+(?:e-?\d+)?)",
            output.lower(),
        )

        if p_values:
            try:
                min_p = min(float(p) for p in p_values)
            except ValueError:
                return 0.70
            # Higher confidence for lower p-values
            if min_p < 0.001:
                return 0.95
            elif min_p < 0.01:
                return 0.90
            elif min_p < 0.05:
                return 0.80
            else:
                return 0.60

        # Default medium confidence
        return 0.70

    def _format_response(self, result: AnalysisResult) -> str:
        """Format analysis result as markdown."""
        lines = [
            "## Statistical Analysis Complete\n",
            f"### Verdict: **{result.verdict}**",
            f"**Confidence**: {result.confidence:.0%}\n",
            "### Key Findings",
        ]

        for finding in result.key_findings:
            lines.append(f"- {finding}")

        lines.extend(
            [
                "\n### Statistical Evidence",
                "```",
                result.statistical_evidence,
                "```",
                "\n### Generated Code",
                "```python",
                result.code_generated,
                "```",
                "\n### Limitations",
            ]
        )

        for limitation in result.limitations:
            lines.append(f"- {limitation}")

        return "\n".join(lines)

    def _error_response(self, message: str) -> AgentRunResponse:
        """Create error response."""
        return AgentRunResponse(
            messages=[ChatMessage(role=Role.ASSISTANT, text=f"❌ **Error**: {message}")],
            response_id="analysis-error",
        )

    def _extract_query(
        self, messages: str | ChatMessage | list[str] | list[ChatMessage] | None
    ) -> str:
        """Extract query from messages.

        For lists, returns the most recent USER ChatMessage (or bare string),
        scanning from the end; returns "" when nothing matches.
        """
        if isinstance(messages, str):
            return messages
        elif isinstance(messages, ChatMessage):
            return messages.text or ""
        elif isinstance(messages, list):
            for msg in reversed(messages):
                if isinstance(msg, ChatMessage) and msg.role == Role.USER:
                    return msg.text or ""
                elif isinstance(msg, str):
                    return msg
        return ""

    async def run_stream(
        self,
        messages: str | ChatMessage | list[str] | list[ChatMessage] | None = None,
        *,
        thread: AgentThread | None = None,
        **kwargs: Any,
    ) -> AsyncIterable[AgentRunResponseUpdate]:
        """Streaming wrapper: runs to completion, yields one final update."""
        result = await self.run(messages, thread=thread, **kwargs)
        yield AgentRunResponseUpdate(messages=result.messages, response_id=result.response_id)