Your Name Claude committed on
Commit 4213e35 · 1 Parent(s): 6997480

Add /search endpoint with 355M perplexity ranking (Option B implementation)


NEW FEATURES:
- New /search endpoint returns structured JSON (no LLM response generation)
- Keeps Query Parser LLM for entity extraction + synonym expansion
- Implements 355M Clinical Trial GPT perplexity-based re-ranking
- Includes before/after benchmarking metrics for 355M impact
- Returns trials ranked by clinical relevance (70% hybrid + 30% perplexity)
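A minimal sketch of the scoring arithmetic described above, as implemented in rank_trials_with_355m_perplexity() below (the input values here are illustrative):

```python
# Illustrative numbers only: how a raw perplexity becomes a 0-1 score
# and is blended 70/30 with the hybrid search score.
hybrid_score = 0.82                                    # from hybrid RAG search (0-1)
perplexity = 45.0                                      # from the 355M model; lower = better

perplexity_score = 1.0 / (1.0 + perplexity / 100)      # -> ~0.69
combined_score = 0.7 * hybrid_score + 0.3 * perplexity_score   # -> ~0.78
```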

IMPLEMENTATION:
- rank_trials_with_355m_perplexity(): Uses perplexity scoring (not generation) to avoid hallucinations
- parse_trial_text_to_dict(): Parses trial text into structured fields
- process_query_structured(): Main function for /search endpoint
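A hedged sketch of calling the engine in-process (it assumes foundation_engine is importable, as app.py does; the query string is only an example):

```python
import foundation_engine

# parse_query_with_llm -> hybrid RAG search -> 355M perplexity re-rank,
# all wrapped inside process_query_structured().
result = foundation_engine.process_query_structured(
    "pembrolizumab trials in non-small cell lung cancer", top_k=5
)
for trial in result["trials"]:
    print(trial["nct_id"],
          trial["scoring"]["rank_after_355m"],
          trial["scoring"]["relevance_score"])
```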

BENCHMARKING:
- rank_before_355m: Original hybrid search ranking
- rank_after_355m: Final ranking after 355M perplexity adjustment (see the rank-change sketch after this list)
- perplexity: Raw perplexity score (lower = more relevant)
- perplexity_score: Normalized 0-1 score
- Processing time breakdown for each stage
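The rank-change metrics are plain differences between the two rankings; a small illustrative sketch (ranks made up):

```python
# (rank_before_355m, rank_after_355m) per 355M-scored trial; values are made up.
before_after = [(1, 2), (2, 1), (3, 3)]
rank_changes = [before - after for before, after in before_after]   # [-1, 1, 0]
average_rank_change = sum(rank_changes) / len(rank_changes)         # 0.0
max_rank_improvement = max(rank_changes)                            # 1
```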

API Response includes:
- query_analysis: Extracted entities and optimized search terms
- results: Total found, returned count, top relevance score
- trials[]: Structured trial data with scoring metadata
- benchmarking: Performance metrics and 355M ranking impact
- metadata: Model versions and database info
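An abbreviated, illustrative response body (field names follow process_query_structured(); every value below is made up):

```python
example_response = {
    "query": "pembrolizumab trials in NSCLC",
    "processing_time": 4.2,
    "query_analysis": {
        "extracted_entities": {"drugs": ["pembrolizumab"], "diseases": ["NSCLC"],
                               "companies": [], "endpoints": []},
        "optimized_search": "pembrolizumab NSCLC",
        "parsing_time": 0.8,
    },
    "results": {"total_found": 30, "returned": 10, "top_relevance_score": 0.80},
    "trials": [
        {
            "nct_id": "NCT00000000",          # placeholder ID
            "title": "...",
            "scoring": {"relevance_score": 0.80, "hybrid_score": 0.84,
                        "perplexity": 42.0, "perplexity_score": 0.70,
                        "rank_before_355m": 2, "rank_after_355m": 1,
                        "ranking_method": "355m_perplexity"},
            "url": "https://clinicaltrials.gov/study/NCT00000000",
        }
    ],
    "benchmarking": {"rag_search_time": 1.1, "355m_ranking_time": 2.9,
                     "trials_ranked_by_355m": 10, "average_rank_change": 0.4},
    "metadata": {"api_version": "2.0.0", "database_version": "2025-01-06"},
}
```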

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2)
  1. app.py +70 -5
  2. foundation_engine.py +475 -0
app.py CHANGED
@@ -40,6 +40,10 @@ class QueryResponse(BaseModel):
     summary: str
     processing_time: float
 
+class SearchRequest(BaseModel):
+    query: str
+    top_k: int = 10
+
 class HealthResponse(BaseModel):
     status: str
     trials_loaded: int
@@ -64,18 +68,22 @@ async def root():
     """API information"""
     return {
         "service": "Clinical Trial API",
-        "version": "1.0.0",
-        "description": "Production REST API for Foundation 1.2",
+        "version": "2.0.0",
+        "description": "Production REST API for Foundation 1.2 with 355M perplexity ranking",
         "status": "healthy",
         "endpoints": {
-            "POST /query": "Query clinical trials and get AI-generated summary",
+            "POST /search": "[NEW] Search trials with structured JSON output (includes 355M ranking)",
+            "POST /query": "Query clinical trials and get AI-generated summary (legacy)",
             "GET /health": "Health check",
             "GET /docs": "Interactive API documentation (Swagger UI)",
             "GET /redoc": "Alternative API documentation (ReDoc)"
         },
         "features": [
-            "Drug Scoring",
-            "355M foundation model"
+            "LLM Query Parser (entity extraction + synonyms)",
+            "Hybrid RAG Search (BM25 + semantic + inverted index)",
+            "355M Clinical Trial GPT perplexity-based ranking",
+            "Structured JSON output",
+            "Benchmarking metrics (before/after 355M scores)"
         ]
     }
 
@@ -123,6 +131,63 @@ async def query_trials(request: QueryRequest):
         logger.error(f"Error processing query: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Error processing query: {str(e)}")
 
+@app.post("/search")
+async def search_trials(request: SearchRequest):
+    """
+    Search clinical trials and get structured JSON results (NEW API v2.0)
+
+    This endpoint provides:
+    - Query parsing with LLM (entity extraction + synonym expansion)
+    - Hybrid RAG search (BM25 + semantic embeddings + inverted index)
+    - 355M Clinical Trial GPT perplexity-based re-ranking
+    - Structured JSON output with benchmarking data
+
+    **No response generation** - returns raw trial data for client-side processing
+
+    Args:
+    - **query**: Your question about clinical trials
+    - **top_k**: Number of trials to return (default: 10, max: 50)
+
+    Returns:
+    - Structured JSON with trials ranked by clinical relevance
+    - Includes before/after 355M ranking scores for benchmarking
+    - Processing time breakdown (query parsing, RAG search, 355M ranking)
+    """
+    try:
+        logger.info(f"[SEARCH API] Query received: {request.query[:100]}...")
+
+        # Validate top_k
+        if request.top_k > 50:
+            logger.warning(f"[SEARCH API] top_k={request.top_k} exceeds maximum 50, capping")
+            request.top_k = 50
+        elif request.top_k < 1:
+            logger.warning(f"[SEARCH API] top_k={request.top_k} is invalid, using default 10")
+            request.top_k = 10
+
+        start_time = time.time()
+
+        # Call the structured query processor
+        result = foundation_engine.process_query_structured(request.query, top_k=request.top_k)
+
+        processing_time = time.time() - start_time
+        logger.info(f"[SEARCH API] Query completed in {processing_time:.2f}s")
+
+        # Ensure processing_time is set
+        if 'processing_time' not in result or result['processing_time'] == 0:
+            result['processing_time'] = processing_time
+
+        return result
+
+    except Exception as e:
+        logger.error(f"[SEARCH API] Error processing query: {str(e)}")
+        import traceback
+        return {
+            "error": str(e),
+            "traceback": traceback.format_exc(),
+            "query": request.query,
+            "processing_time": time.time() - start_time if 'start_time' in locals() else 0
+        }
+
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)
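A hedged client-side example for the new endpoint; the host and port are assumed from the uvicorn.run() call above, and the query and timeout values are arbitrary:

```python
import requests

# POST /search with the SearchRequest schema (query, top_k).
resp = requests.post(
    "http://localhost:7860/search",
    json={"query": "CAR-T trials for relapsed lymphoma", "top_k": 5},
    timeout=120,
)
data = resp.json()
print(data["results"]["returned"], "trials returned")
for trial in data["trials"]:
    print(trial["scoring"]["rank_after_355m"], trial["nct_id"], trial["url"])
```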
foundation_engine.py CHANGED
@@ -1814,6 +1814,481 @@ Query Type Distribution:
     return report
 
 
+# ============================================================================
+# 355M PERPLEXITY-BASED RANKING (FOR STRUCTURED JSON API)
+# ============================================================================
+
+def rank_trials_with_355m_perplexity(query, trials_list, hf_token=None):
+    """
+    Rank trials using 355M Clinical Trial GPT perplexity scoring
+
+    This uses the model for SCORING not GENERATION to avoid hallucinations
+    Lower perplexity = more relevant trial
+
+    Args:
+        query: User query
+        trials_list: List of (score, trial_text) tuples from hybrid search
+        hf_token: Not needed (model runs locally)
+
+    Returns:
+        List of dicts with trial data and perplexity scores
+    """
+    import time
+    import re
+    import torch
+    from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+
+    start_time = time.time()
+
+    # Only rank top 10 trials (balance between accuracy and speed)
+    top_10 = trials_list[:10]
+
+    logger.info(f"[355M PERPLEXITY] Ranking {len(top_10)} trials with CT2 model...")
+
+    try:
+        # Load 355M model
+        tokenizer = GPT2TokenizerFast.from_pretrained("gmkdigitalmedia/CT2")
+        model = GPT2LMHeadModel.from_pretrained(
+            "gmkdigitalmedia/CT2",
+            torch_dtype=torch.float16,
+            device_map="auto"
+        )
+        model.eval()
+        tokenizer.pad_token = tokenizer.eos_token
+
+        ranked_trials = []
+
+        for idx, (hybrid_score, trial_text) in enumerate(top_10):
+            # Extract NCT ID
+            nct_match = re.search(r'NCT_ID:\s*(NCT\d+)', trial_text)
+            nct_id = nct_match.group(1) if nct_match else f"Trial_{idx+1}"
+
+            # Format test text for perplexity calculation
+            # The model calculates: "How natural is this query-trial pairing?"
+            test_text = f"""Query: {query}
+
+Relevant Clinical Trial:
+{trial_text[:800]}
+
+This trial is highly relevant because"""
+
+            # Calculate perplexity (lower = more relevant)
+            inputs = tokenizer(
+                test_text,
+                return_tensors="pt",
+                truncation=True,
+                max_length=512,
+                padding=True
+            ).to(model.device)
+
+            with torch.no_grad():
+                outputs = model(**inputs, labels=inputs.input_ids)
+                perplexity = torch.exp(outputs.loss).item()
+
+            # Convert perplexity to relevance score (0-1)
+            # Typical range: 10-1000, lower is better
+            perplexity_score = 1.0 / (1.0 + perplexity / 100)
+
+            # Combine with hybrid score (70% hybrid, 30% perplexity)
+            combined_score = 0.7 * hybrid_score + 0.3 * perplexity_score
+
+            logger.info(f"[355M] {nct_id}: Hybrid={hybrid_score:.3f}, "
+                        f"Perplexity={perplexity:.1f}, "
+                        f"Perplexity_Score={perplexity_score:.3f}, "
+                        f"Combined={combined_score:.3f}")
+
+            ranked_trials.append({
+                'nct_id': nct_id,
+                'trial_text': trial_text,
+                'hybrid_score': float(hybrid_score),
+                'perplexity': float(perplexity),
+                'perplexity_score': float(perplexity_score),
+                'combined_score': float(combined_score),
+                'rank_before_355m': idx + 1
+            })
+
+        # Sort by combined score (descending)
+        ranked_trials.sort(key=lambda x: x['combined_score'], reverse=True)
+
+        # Add final rank
+        for idx, trial in enumerate(ranked_trials):
+            trial['rank_after_355m'] = idx + 1
+
+        elapsed = time.time() - start_time
+        logger.info(f"[355M PERPLEXITY] ✓ Ranking complete in {elapsed:.1f}s")
+
+        # Add remaining trials (beyond top 10) without 355M scoring
+        for idx, (hybrid_score, trial_text) in enumerate(trials_list[10:], start=10):
+            nct_match = re.search(r'NCT_ID:\s*(NCT\d+)', trial_text)
+            nct_id = nct_match.group(1) if nct_match else f"Trial_{idx+1}"
+
+            ranked_trials.append({
+                'nct_id': nct_id,
+                'trial_text': trial_text,
+                'hybrid_score': float(hybrid_score),
+                'perplexity': None,
+                'perplexity_score': None,
+                'combined_score': float(hybrid_score),
+                'rank_before_355m': idx + 1,
+                'rank_after_355m': len(ranked_trials) + 1
+            })
+
+        return ranked_trials
+
+    except Exception as e:
+        logger.error(f"[355M PERPLEXITY] Error: {e}")
+        logger.warning("[355M PERPLEXITY] Falling back to hybrid scores only")
+
+        # Fallback: return trials with hybrid scores only
+        fallback_trials = []
+        for idx, (hybrid_score, trial_text) in enumerate(trials_list):
+            nct_match = re.search(r'NCT_ID:\s*(NCT\d+)', trial_text)
+            nct_id = nct_match.group(1) if nct_match else f"Trial_{idx+1}"
+
+            fallback_trials.append({
+                'nct_id': nct_id,
+                'trial_text': trial_text,
+                'hybrid_score': float(hybrid_score),
+                'perplexity': None,
+                'perplexity_score': None,
+                'combined_score': float(hybrid_score),
+                'rank_before_355m': idx + 1,
+                'rank_after_355m': idx + 1
+            })
+
+        return fallback_trials
+
+
+def parse_trial_text_to_dict(trial_text, nct_id):
+    """
+    Parse trial text into structured dictionary
+
+    Args:
+        trial_text: Raw trial text
+        nct_id: NCT ID
+
+    Returns:
+        Dict with parsed trial fields
+    """
+    import re
+
+    trial_dict = {
+        'nct_id': nct_id,
+        'title': '',
+        'sponsor': '',
+        'collaborators': [],
+        'phase': '',
+        'status': '',
+        'enrollment': None,
+        'conditions': [],
+        'interventions': [],
+        'primary_outcome': '',
+        'results_summary': '',
+        'start_date': '',
+        'completion_date': '',
+        'last_update': '',
+        'locations': []
+    }
+
+    # Extract fields using regex patterns
+    try:
+        # Title
+        title_match = re.search(r'TITLE:\s*([^\n]+)', trial_text, re.IGNORECASE)
+        if title_match:
+            trial_dict['title'] = title_match.group(1).strip()
+
+        # Sponsor
+        sponsor_match = re.search(r'SPONSOR:\s*([^\n]+)', trial_text, re.IGNORECASE)
+        if sponsor_match:
+            trial_dict['sponsor'] = sponsor_match.group(1).strip()
+
+        # Collaborators
+        collab_match = re.search(r'COLLABORATOR[S]?:\s*([^\n]+)', trial_text, re.IGNORECASE)
+        if collab_match:
+            collabs = collab_match.group(1).strip().split(',')
+            trial_dict['collaborators'] = [c.strip() for c in collabs if c.strip()]
+
+        # Phase
+        phase_match = re.search(r'PHASE:\s*([^\n]+)', trial_text, re.IGNORECASE)
+        if phase_match:
+            trial_dict['phase'] = phase_match.group(1).strip()
+
+        # Status
+        status_match = re.search(r'STATUS:\s*([^\n]+)', trial_text, re.IGNORECASE)
+        if status_match:
+            trial_dict['status'] = status_match.group(1).strip()
+
+        # Enrollment
+        enrollment_match = re.search(r'ENROLLMENT:\s*(\d+)', trial_text, re.IGNORECASE)
+        if enrollment_match:
+            trial_dict['enrollment'] = int(enrollment_match.group(1))
+
+        # Conditions
+        condition_match = re.search(r'CONDITION[S]?:\s*([^\n]+)', trial_text, re.IGNORECASE)
+        if condition_match:
+            conditions = condition_match.group(1).strip().split(',')
+            trial_dict['conditions'] = [c.strip() for c in conditions if c.strip()]
+
+        # Interventions
+        intervention_match = re.search(r'INTERVENTION[S]?:\s*([^\n]+)', trial_text, re.IGNORECASE)
+        if intervention_match:
+            interventions = intervention_match.group(1).strip().split(',')
+            trial_dict['interventions'] = [i.strip() for i in interventions if i.strip()]
+
+        # Primary outcome
+        outcome_match = re.search(r'PRIMARY[_ ]OUTCOME:\s*([^\n]+)', trial_text, re.IGNORECASE)
+        if outcome_match:
+            trial_dict['primary_outcome'] = outcome_match.group(1).strip()
+
+        # Results summary
+        results_match = re.search(r'RESULTS:\s*([^\n]+)', trial_text, re.IGNORECASE)
+        if results_match:
+            trial_dict['results_summary'] = results_match.group(1).strip()
+
+        # Dates
+        start_match = re.search(r'START[_ ]DATE:\s*([^\n]+)', trial_text, re.IGNORECASE)
+        if start_match:
+            trial_dict['start_date'] = start_match.group(1).strip()
+
+        completion_match = re.search(r'COMPLETION[_ ]DATE:\s*([^\n]+)', trial_text, re.IGNORECASE)
+        if completion_match:
+            trial_dict['completion_date'] = completion_match.group(1).strip()
+
+        # Locations
+        location_match = re.search(r'LOCATION[S]?:\s*([^\n]+)', trial_text, re.IGNORECASE)
+        if location_match:
+            locations = location_match.group(1).strip().split(',')
+            trial_dict['locations'] = [l.strip() for l in locations if l.strip()]
+
+    except Exception as e:
+        logger.warning(f"Error parsing trial {nct_id}: {e}")
+
+    return trial_dict
+
+
+def process_query_structured(query, top_k=10):
+    """
+    Process query and return structured JSON (no LLM response generation)
+
+    This is the new API endpoint that:
+    1. Uses LLM for query parsing/entity extraction
+    2. Performs hybrid RAG search
+    3. Ranks with 355M perplexity scoring
+    4. Returns structured JSON
+
+    Args:
+        query: User query
+        top_k: Number of trials to return
+
+    Returns:
+        Dict with structured response
+    """
+    import time
+
+    start_time = time.time()
+
+    result = {
+        'query': query,
+        'processing_time': 0,
+        'query_analysis': {},
+        'results': {},
+        'trials': [],
+        'benchmarking': {},
+        'metadata': {}
+    }
+
+    try:
+        # Step 1: Parse query with LLM
+        step1_start = time.time()
+        logger.info("[STRUCTURED API] Step 1: Parsing query with LLM...")
+
+        try:
+            parsed_query = parse_query_with_llm(query, hf_token=hf_token)
+            search_query = parsed_query['search_terms']
+
+            result['query_analysis'] = {
+                'extracted_entities': {
+                    'drugs': parsed_query.get('drugs', []),
+                    'diseases': parsed_query.get('diseases', []),
+                    'companies': parsed_query.get('companies', []),
+                    'endpoints': parsed_query.get('endpoints', [])
+                },
+                'optimized_search': search_query,
+                'parsing_time': time.time() - step1_start
+            }
+            logger.info(f"[STRUCTURED API] Query parsed in {time.time() - step1_start:.1f}s")
+
+        except Exception as e:
+            logger.warning(f"[STRUCTURED API] Query parsing failed: {e}, using original query")
+            search_query = query
+            parsed_query = {'drugs': [], 'diseases': [], 'companies': [], 'endpoints': []}
+            result['query_analysis'] = {
+                'extracted_entities': parsed_query,
+                'optimized_search': search_query,
+                'parsing_time': time.time() - step1_start,
+                'error': str(e)
+            }
+
+        # Step 2: Hybrid RAG search
+        step2_start = time.time()
+        logger.info("[STRUCTURED API] Step 2: Hybrid RAG search...")
+
+        # Get more candidates for 355M ranking
+        candidate_k = top_k * 3
+
+        # We need to get the candidate trials with scores
+        # Re-implement the key parts of retrieve_context_with_embeddings to get structured data
+        from collections import Counter
+        global doc_chunks, doc_embeddings, embedder, inverted_index
+
+        if doc_embeddings is None or len(doc_chunks) == 0:
+            raise Exception("Embeddings not loaded!")
+
+        # Extract keywords
+        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
+                      'is', 'are', 'was', 'were', 'be', 'been', 'being', 'what', 'how', 'do', 'you', 'know',
+                      'about', 'that', 'this', 'there', 'it'}
+        query_lower = search_query.lower()
+        import re
+        words = re.findall(r'\b\w+\b', query_lower)
+        query_terms = [w for w in words if len(w) > 2 and w not in stop_words]
+
+        # Keyword scoring with inverted index
+        keyword_scores = {}
+        if inverted_index is not None:
+            inv_index_candidates = set()
+            for term in query_terms:
+                if term in inverted_index:
+                    inv_index_candidates.update(inverted_index[term])
+
+            if inv_index_candidates:
+                drug_specific_terms = set()
+                for term in query_terms:
+                    if term in inverted_index and len(inverted_index[term]) < 100:
+                        drug_specific_terms.add(term)
+
+                for idx in inv_index_candidates:
+                    chunk_data = doc_chunks[idx]
+                    chunk_text = chunk_data[1] if isinstance(chunk_data, tuple) else chunk_data
+                    chunk_lower = chunk_text.lower()
+
+                    has_drug_match = any(drug_term in chunk_lower for drug_term in drug_specific_terms)
+
+                    if has_drug_match:
+                        keyword_scores[idx] = 1000.0
+                    else:
+                        keyword_scores[idx] = 1.0
+
+        # Semantic scoring
+        load_embedder()
+        query_embedding = embedder.encode([search_query])[0]
+        semantic_similarities = np.dot(doc_embeddings, query_embedding)
+
+        # Normalize and combine scores
+        if keyword_scores:
+            max_kw = max(keyword_scores.values())
+            keyword_scores_norm = {idx: score/max_kw for idx, score in keyword_scores.items()}
+        else:
+            keyword_scores_norm = {}
+
+        max_sem = semantic_similarities.max()
+        min_sem = semantic_similarities.min()
+        semantic_scores_norm = (semantic_similarities - min_sem) / (max_sem - min_sem + 1e-10)
+
+        # Combined scores
+        combined_scores = np.zeros(len(doc_chunks))
+        for idx in range(len(doc_chunks)):
+            kw_score = keyword_scores_norm.get(idx, 0.0)
+            sem_score = semantic_scores_norm[idx]
+            combined_scores[idx] = 0.5 * kw_score + 0.5 * sem_score if kw_score > 0 else sem_score
+
+        # Get top candidates
+        top_indices = np.argsort(combined_scores)[-candidate_k:][::-1]
+
+        # Format as (score, text) tuples
+        candidate_trials = [(combined_scores[i], doc_chunks[i][1] if isinstance(doc_chunks[i], tuple) else doc_chunks[i])
+                            for i in top_indices]
+
+        rag_time = time.time() - step2_start
+        logger.info(f"[STRUCTURED API] RAG search complete in {rag_time:.1f}s, found {len(candidate_trials)} candidates")
+
+        # Step 3: Rank with 355M perplexity
+        step3_start = time.time()
+        logger.info("[STRUCTURED API] Step 3: Ranking with 355M perplexity...")
+
+        ranked_trials = rank_trials_with_355m_perplexity(query, candidate_trials, hf_token=hf_token)
+
+        ranking_time = time.time() - step3_start
+        logger.info(f"[STRUCTURED API] 355M ranking complete in {ranking_time:.1f}s")
+
+        # Format results
+        result['results'] = {
+            'total_found': len(candidate_trials),
+            'returned': min(top_k, len(ranked_trials)),
+            'top_relevance_score': ranked_trials[0]['combined_score'] if ranked_trials else 0
+        }
+
+        # Parse trials and add to results
+        for trial_data in ranked_trials[:top_k]:
+            trial_dict = parse_trial_text_to_dict(trial_data['trial_text'], trial_data['nct_id'])
+            trial_dict['scoring'] = {
+                'relevance_score': trial_data['combined_score'],
+                'hybrid_score': trial_data['hybrid_score'],
+                'perplexity': trial_data['perplexity'],
+                'perplexity_score': trial_data['perplexity_score'],
+                'rank_before_355m': trial_data['rank_before_355m'],
+                'rank_after_355m': trial_data['rank_after_355m'],
+                'ranking_method': '355m_perplexity' if trial_data['perplexity'] is not None else 'hybrid_only'
+            }
+            trial_dict['url'] = f"https://clinicaltrials.gov/study/{trial_data['nct_id']}"
+            result['trials'].append(trial_dict)
+
+        # Benchmarking data
+        if ranked_trials:
+            # Calculate how much 355M changed the ranking
+            rank_changes = []
+            for trial in ranked_trials[:top_k]:
+                if trial['perplexity'] is not None:
+                    rank_change = trial['rank_before_355m'] - trial['rank_after_355m']
+                    rank_changes.append(rank_change)
+
+            result['benchmarking'] = {
+                'rag_search_time': rag_time,
+                '355m_ranking_time': ranking_time,
+                'total_processing_time': time.time() - start_time,
+                'trials_ranked_by_355m': len([t for t in ranked_trials if t['perplexity'] is not None]),
+                'average_rank_change': sum(rank_changes) / len(rank_changes) if rank_changes else 0,
+                'max_rank_improvement': max(rank_changes) if rank_changes else 0,
+                'top_3_perplexity_scores': [t['perplexity'] for t in ranked_trials[:3] if t['perplexity'] is not None]
+            }
+
+        # Metadata
+        result['metadata'] = {
+            'database_version': '2025-01-06',
+            'total_trials_searched': len(doc_chunks),
+            'api_version': '2.0.0',
+            'model_info': {
+                'query_parser': 'Llama-3.1-70B-Instruct',
+                'ranking_model': 'gmkdigitalmedia/CT2-355M',
+                'embedding_model': 'all-MiniLM-L6-v2'
+            }
+        }
+
+        result['processing_time'] = time.time() - start_time
+
+        logger.info(f"[STRUCTURED API] ✓ Complete in {result['processing_time']:.1f}s")
+
+        return result
+
+    except Exception as e:
+        logger.error(f"[STRUCTURED API] Error: {e}")
+        import traceback
+        result['error'] = str(e)
+        result['traceback'] = traceback.format_exc()
+        result['processing_time'] = time.time() - start_time
+        return result
+
+
 # ============================================================================
 # GRADIO INTERFACE
 # ============================================================================
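A small, hedged example of the flat `FIELD: value` layout that parse_trial_text_to_dict() expects; the sample trial text is invented purely to exercise the regex patterns above:

```python
sample_text = """NCT_ID: NCT01234567
TITLE: A Phase 2 Study of Example Drug in Example Disease
SPONSOR: Example Pharma
PHASE: Phase 2
STATUS: Completed
ENROLLMENT: 120
CONDITIONS: Example Disease
INTERVENTIONS: Example Drug, Placebo
"""

trial = parse_trial_text_to_dict(sample_text, "NCT01234567")
# trial["phase"] == "Phase 2"; trial["enrollment"] == 120
# trial["interventions"] == ["Example Drug", "Placebo"]
```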