Spaces:

gmkdigitalmedia
/

CTapi-raw

Paused

CTapi-raw / fix_355m_hallucination.py

Your Name

Deploy Option B: Query Parser + RAG + 355M Ranking

45cf63e about 1 month ago

15 kB

	"""
	fix_355m_hallucination.py
	Direct fix to stop 355M model hallucinations in your system
	Replace generation with scoring/extraction
	"""

	import torch
	from transformers import GPT2LMHeadModel, GPT2TokenizerFast
	import logging
	import re
	from typing import List, Tuple, Dict

	logger = logging.getLogger(__name__)

	# ============================================================================
	# IMMEDIATE FIX: Replace your current 355M usage
	# ============================================================================

	def fix_your_355m_ranking_function():
	"""
	Your CURRENT code (two_llm_system_FIXED.py, line 60-170) tries to use
	the 355M model for ranking, but it's also trying to generate text.

	Here's the FIXED version that ONLY scores, doesn't generate:
	"""

	from transformers import GPT2LMHeadModel, GPT2TokenizerFast
	import spaces

	@spaces.GPU
	def rank_trials_with_355m_FIXED(
	query: str,
	trials_list: List[Tuple[float, str]],
	hf_token=None
	) -> List[Tuple[float, str]]:
	"""
	FIXED: Use 355M ONLY for scoring relevance, NOT for generation

	The model can't answer questions, but it CAN recognize relevance
	"""
	import time
	start_time = time.time()

	# Only process top 5 trials (not 3, gives better coverage)
	top_5 = trials_list[:5]

	logger.info(f"[355M SCORING] Scoring {len(top_5)} trials for relevance...")

	# Load model
	tokenizer = GPT2TokenizerFast.from_pretrained("gmkdigitalmedia/CT2")
	model = GPT2LMHeadModel.from_pretrained(
	"gmkdigitalmedia/CT2",
	torch_dtype=torch.float16,
	device_map="auto"
	)
	model.eval()
	tokenizer.pad_token = tokenizer.eos_token

	scored_trials = []

	for idx, (bm25_score, trial_text) in enumerate(top_5):
	# Extract NCT ID
	nct_match = re.search(r'NCT_ID:\s*(NCT\d+)', trial_text)
	nct_id = nct_match.group(1) if nct_match else f"Trial_{idx+1}"

	# DON'T ASK THE MODEL TO RATE! Calculate perplexity instead
	# Format: Does this trial answer this query?
	test_text = f"""Query: {query}

	Trial Data: {trial_text[:800]}

	This trial is relevant to the query because it"""

	# Calculate perplexity (lower = more natural = more relevant)
	inputs = tokenizer(
	test_text,
	return_tensors="pt",
	truncation=True,
	max_length=512,
	padding=True
	).to(model.device)

	with torch.no_grad():
	outputs = model(**inputs, labels=inputs.input_ids)
	perplexity = torch.exp(outputs.loss).item()

	# Convert perplexity to score (lower perplexity = higher score)
	# Typical perplexity range: 10-1000
	relevance_score = 100 / (perplexity + 1) # Higher score = more relevant

	# Combine with BM25 (70% BM25, 30% 355M perplexity)
	combined_score = 0.7 * bm25_score + 0.3 * (relevance_score / 100)

	logger.info(f"[355M] {nct_id}: BM25={bm25_score:.3f}, "
	f"Perplexity={perplexity:.1f}, "
	f"Combined={combined_score:.3f}")

	scored_trials.append((combined_score, trial_text, nct_id))

	# Sort by combined score
	scored_trials.sort(key=lambda x: x[0], reverse=True)

	# Return in expected format
	result = [(score, text) for score, text, _ in scored_trials]

	elapsed = time.time() - start_time
	logger.info(f"[355M SCORING] ✓ Completed in {elapsed:.1f}s")

	return result + trials_list[5:] # Add remaining trials unchanged

	# ============================================================================
	# BETTER SOLUTION: Don't generate text with 355M at all
	# ============================================================================

	class BetterUseOf355M:
	"""
	Instead of generation, use 355M for what it's good at:
	1. Scoring relevance (perplexity-based)
	2. Extracting structured fields
	3. Understanding clinical terminology
	"""

	def __init__(self):
	logger.info("Loading 355M model for scoring/extraction (not generation)...")
	self.tokenizer = GPT2TokenizerFast.from_pretrained("gmkdigitalmedia/CT2")
	self.model = GPT2LMHeadModel.from_pretrained(
	"gmkdigitalmedia/CT2",
	torch_dtype=torch.float16,
	device_map="auto"
	)
	self.model.eval()
	self.tokenizer.pad_token = self.tokenizer.eos_token

	def score_relevance(self, query: str, trial: str) -> float:
	"""
	Score how relevant a trial is to a query
	Uses perplexity - the model's confidence that these go together
	"""
	# Test if model thinks this pairing is "natural"
	text = f"Query: {query}\nRelevant Trial: {trial[:500]}"

	inputs = self.tokenizer(
	text,
	return_tensors="pt",
	truncation=True,
	max_length=512
	).to(self.model.device)

	with torch.no_grad():
	outputs = self.model(**inputs, labels=inputs.input_ids)
	perplexity = torch.exp(outputs.loss).item()

	# Lower perplexity = more natural = higher relevance
	score = 1.0 / (1.0 + perplexity / 100)
	return score

	def extract_endpoints(self, trial_text: str) -> List[str]:
	"""
	Extract endpoints WITHOUT generation - use attention weights
	"""
	# Find sections that model pays attention to when seeing "endpoint"
	test_prompts = [
	f"{trial_text[:500]}\nPRIMARY ENDPOINT:",
	f"{trial_text[:500]}\nThe main outcome measure is",
	f"{trial_text[:500]}\nThis trial measures"
	]

	endpoints = []
	for prompt in test_prompts:
	inputs = self.tokenizer(
	prompt,
	return_tensors="pt",
	truncation=True,
	max_length=512
	).to(self.model.device)

	with torch.no_grad():
	outputs = self.model(**inputs, output_attentions=True)
	# Get attention to identify important tokens
	attentions = outputs.attentions[-1] # Last layer
	avg_attention = attentions.mean(dim=1).squeeze()

	# Find high-attention tokens (likely endpoints)
	high_attention_indices = torch.where(
	avg_attention.mean(dim=0) > avg_attention.mean() * 1.5
	)[0]

	if len(high_attention_indices) > 0:
	# Decode high-attention tokens
	important_tokens = self.tokenizer.decode(
	inputs.input_ids[0][high_attention_indices]
	)
	if important_tokens and len(important_tokens) > 10:
	endpoints.append(important_tokens)

	return endpoints

	def identify_drug_mentions(self, trial_text: str, drug_name: str) -> bool:
	"""
	Check if a trial truly mentions a specific drug
	Uses the model's understanding of drug name variations
	"""
	# Test multiple phrasings
	drug_variants = [
	drug_name.lower(),
	drug_name.upper(),
	drug_name.capitalize()
	]

	for variant in drug_variants:
	test = f"This trial tests {variant}. {trial_text[:300]}"

	inputs = self.tokenizer(
	test,
	return_tensors="pt",
	truncation=True,
	max_length=256
	).to(self.model.device)

	with torch.no_grad():
	outputs = self.model(**inputs, labels=inputs.input_ids)
	perplexity = torch.exp(outputs.loss).item()

	# Low perplexity means model thinks this makes sense
	if perplexity < 50: # Threshold
	return True

	return False

	# ============================================================================
	# COMPLETE REPLACEMENT FOR YOUR PIPELINE
	# ============================================================================

	def process_query_no_hallucination(
	query: str,
	retrieved_trials: List[str],
	hf_token: str = None
	) -> str:
	"""
	Complete pipeline that uses 355M for scoring, Llama for generation
	NO HALLUCINATIONS because 355M never generates answers

	This replaces your current process_query function
	"""
	import time
	from huggingface_hub import InferenceClient

	start_time = time.time()

	# Step 1: Use 355M to score and rank trials
	logger.info("Step 1: Scoring trials with 355M model...")
	model_355m = BetterUseOf355M()

	scored_trials = []
	for trial in retrieved_trials[:10]: # Score top 10
	score = model_355m.score_relevance(query, trial)
	scored_trials.append((score, trial))

	# Sort by relevance score
	scored_trials.sort(key=lambda x: x[0], reverse=True)
	top_trials = scored_trials[:3] # Take top 3

	logger.info(f"Top relevance scores: {[s for s, _ in top_trials]}")

	# Step 2: Extract key information using 355M (optional)
	extracted_info = []
	for score, trial in top_trials:
	# Extract NCT ID
	nct_match = re.search(r'NCT_ID:\s*(NCT\d+)', trial)
	nct_id = nct_match.group(1) if nct_match else "Unknown"

	# Try to extract endpoints (without generation)
	endpoints = model_355m.extract_endpoints(trial)

	extracted_info.append({
	'nct_id': nct_id,
	'relevance_score': score,
	'endpoints': endpoints,
	'snippet': trial[:500]
	})

	# Step 3: Use Llama-70B for actual answer generation
	logger.info("Step 3: Generating answer with Llama-70B...")

	# Format context from scored trials
	context = "\n---\n".join([
	f"TRIAL {i+1} (Relevance: {info['relevance_score']:.2%}):\n"
	f"NCT ID: {info['nct_id']}\n"
	f"{info['snippet']}"
	for i, info in enumerate(extracted_info)
	])

	if hf_token:
	client = InferenceClient(token=hf_token)

	prompt = f"""Answer this clinical trial question based on the provided data:

	Question: {query}

	Relevant Clinical Trials (ranked by relevance):
	{context}

	Provide a clear, factual answer based ONLY on the trial data above. If the trials don't contain the answer, say so."""

	response = client.chat_completion(
	model="meta-llama/Llama-3.1-70B-Instruct",
	messages=[{"role": "user", "content": prompt}],
	max_tokens=500,
	temperature=0.3
	)

	answer = response.choices[0].message.content
	else:
	answer = "Llama-70B API not available. Please provide HF_TOKEN."

	elapsed = time.time() - start_time

	return f"""QUERY: {query}

	PROCESSING:
	✓ 355M Relevance Scoring: {len(scored_trials)} trials scored
	✓ Top relevance: {top_trials[0][0]:.2%}
	✓ Llama-70B Generation: Complete
	✓ Total time: {elapsed:.1f}s

	ANSWER:
	{answer}

	SOURCES:
	{chr(10).join(f"- {info['nct_id']}: Relevance {info['relevance_score']:.2%}"
	for info in extracted_info)}

	Note: Using 355M for scoring only (no hallucinations), Llama-70B for generation."""

	# ============================================================================
	# QUICK FIX INSTRUCTIONS
	# ============================================================================

	def get_quick_fix_instructions():
	"""
	Simple instructions to fix the hallucination problem immediately
	"""
	return """
	========================================================================
	QUICK FIX FOR 355M MODEL HALLUCINATIONS
	========================================================================

	PROBLEM:
	--------
	Your 355M model hallucinates because:
	1. It was trained to GENERATE clinical trial text
	2. It was NOT trained on question-answer pairs
	3. When asked "What are the endpoints in trial X?", it generates
	random trial text because that's all it knows how to do

	SOLUTION:
	---------
	STOP using 355M for text generation. Use it ONLY for:
	1. Scoring relevance (perplexity-based)
	2. Ranking trials
	3. Checking if terms match

	IMMEDIATE FIX:
	--------------
	In two_llm_system_FIXED.py, replace the generate() calls with
	perplexity scoring:

	OLD (line 113-120):
	outputs = model.generate(...) # This causes hallucinations!
	generated = tokenizer.decode(outputs...)

	NEW:
	outputs = model(**inputs, labels=inputs.input_ids)
	perplexity = torch.exp(outputs.loss).item()
	relevance_score = 100 / (perplexity + 1)

	BETTER FIX:
	-----------
	1. Copy the rank_trials_with_355m_FIXED function above
	2. Replace your current ranking function
	3. The model will now ONLY score, not generate

	BEST FIX:
	---------
	Use the complete process_query_no_hallucination function above.
	It properly separates:
	- 355M: Scoring and ranking only
	- Llama-70B: All text generation

	RESULTS:
	--------
	Before: "ianalumab trial endpoints" → Hallucinates about S-1 and OA
	After: "ianalumab trial endpoints" → Correctly finds and ranks
	ianalumab trials, Llama generates accurate answer

	The 355M model is still valuable! Just don't ask it to write -
	ask it to score, rank, and recognize patterns.

	========================================================================
	"""

	if __name__ == "__main__":
	print(get_quick_fix_instructions())

	# Test the fix
	print("\nTesting fixed scoring (no generation)...")
	test_model = BetterUseOf355M()

	# Test relevance scoring
	query = "ianalumab for sjogren's syndrome endpoints"
	good_trial = "TITLE: Phase 2 Study of Ianalumab in Sjogren's\nPRIMARY ENDPOINT: ESSDAI score"
	bad_trial = "TITLE: Aspirin for Headache\nPRIMARY ENDPOINT: Pain reduction"

	good_score = test_model.score_relevance(query, good_trial)
	bad_score = test_model.score_relevance(query, bad_trial)

	print(f"\nRelevance Scores (no hallucination):")
	print(f" Relevant trial: {good_score:.3f}")
	print(f" Irrelevant trial: {bad_score:.3f}")
	print(f" Correct ranking: {good_score > bad_score} ✓")