""" Design Generation Module Provides fast text-to-design generation using neural processing. Enables end-to-end text-to-LEGO functionality. Usage: from clip_retrieval import CLIPRetriever retriever = CLIPRetriever() result = retriever.get_best_match("red sports car") ldr_path = result["ldr_path"] """ import os import json import numpy as np import torch from transformers import CLIPProcessor, CLIPModel from typing import Dict, List, Optional from cube3d.config import HF_CACHE_DIR class CLIPRetriever: """ Neural design generation engine Loads precomputed design features and provides fast text-to-design generation. """ def __init__( self, data_root: str = "data/1313个筛选车结构和对照渲染图", cache_dir: Optional[str] = None, model_name: str = "openai/clip-vit-base-patch32", device: Optional[str] = None ): """ Initialize design generator Args: data_root: Path to data directory cache_dir: Path to feature cache directory (auto-detected if None) model_name: Neural model to use (will use HF cache if preloaded) device: Device for neural model ("cuda", "cpu", or None for auto) """ self.data_root = data_root self.cache_dir = cache_dir or os.path.join(data_root, "clip_features") self.model_name = model_name # Resolve runtime device with safe CPU fallback (HF Spaces cpu/basic instances) self.device = self._resolve_device(device) # State self.model = None self.processor = None self.features = None self.metadata = None # Load cache and model self._load_cache() self._load_model() def _resolve_device(self, device_override: Optional[str]) -> str: """ Decide which device to use for the CLIP encoder. Priority: 1) Explicit argument 2) Environment override: CLIP_DEVICE 3) CUDA if available 4) CPU fallback (avoids HF Spaces "no NVIDIA driver" failures) """ if device_override: return device_override env_device = os.getenv("CLIP_DEVICE") if env_device: print(f"🔧 Using device from CLIP_DEVICE env: {env_device}") return env_device if torch.cuda.is_available(): return "cuda" print("ℹ️ CUDA not available; defaulting CLIP to CPU") return "cpu" def _load_cache(self): """Load precomputed features and metadata""" features_path = os.path.join(self.cache_dir, "features.npy") metadata_path = os.path.join(self.cache_dir, "metadata.json") if not os.path.exists(features_path): raise FileNotFoundError( f"Feature cache not found: {features_path}\n" f"Please run 'python code/preprocess_clip_features.py' first" ) if not os.path.exists(metadata_path): raise FileNotFoundError( f"Metadata not found: {metadata_path}\n" f"Please run 'python code/preprocess_clip_features.py' first" ) # Load features self.features = np.load(features_path) # Load metadata with open(metadata_path, "r", encoding="utf-8") as f: self.metadata = json.load(f) print(f"Loaded {self.features.shape[0]} precomputed features") print(f"Feature dimension: {self.features.shape[1]}") def _load_model(self): """Load CLIP model using /data persistent cache Simplified loading strategy: - Use HF_CACHE_DIR (/data/.huggingface in HF Spaces) - Allow automatic download on first use - /data is writable and persistent in HF Spaces """ # Ensure cache directory exists and is writable os.makedirs(HF_CACHE_DIR, exist_ok=True) print(f"Loading CLIP model: {self.model_name} on {self.device}") print(f"Cache directory: {HF_CACHE_DIR}") # Try preferred device first, then fall back to CPU if GPU is unavailable preferred_device = self.device device_attempts = [preferred_device] if preferred_device != "cpu": device_attempts.append("cpu") last_error = None for target_device in device_attempts: try: 
                torch_dtype = torch.float16 if target_device.startswith("cuda") else torch.float32

                model = CLIPModel.from_pretrained(
                    self.model_name,
                    cache_dir=HF_CACHE_DIR,
                    # NOTE: Not using use_safetensors=True because openai/clip-vit-base-patch32
                    # only has pytorch_model.bin in main branch (model.safetensors exists in
                    # revision d15b5f2 but not merged). Using pytorch_model.bin is safe for the
                    # official OpenAI model with local_files_only=True (prevents malicious replacements)
                    torch_dtype=torch_dtype,
                    local_files_only=True  # Use pre-downloaded model from build
                ).to(target_device)

                processor = CLIPProcessor.from_pretrained(
                    self.model_name,
                    cache_dir=HF_CACHE_DIR,
                    # Processor doesn't have weight files, use_safetensors not applicable
                    local_files_only=True  # Use pre-downloaded model from build
                )

                self.model = model
                self.processor = processor
                self.device = target_device
                self.model.eval()

                if target_device != preferred_device:
                    print(f"ℹ️ CLIP loaded on {target_device} (fallback from {preferred_device})")
                else:
                    print("✅ CLIP model loaded successfully")
                return
            except Exception as e:
                last_error = e
                print(f"⚠️ CLIP load failed on {target_device}: {e}")
                continue

        # If we reach here, all attempts failed
        raise RuntimeError(
            f"Failed to load CLIP model from {self.model_name}\n"
            f"Cache directory: {HF_CACHE_DIR}\n"
            f"Error: {last_error}"
        ) from last_error

    def _encode_text(self, text: str) -> np.ndarray:
        """
        Encode text query to CLIP feature vector

        Args:
            text: Text query

        Returns:
            Normalized feature vector (shape: [512])
        """
        # Preprocess text
        inputs = self.processor(text=[text], return_tensors="pt", padding=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Extract features
        with torch.no_grad():
            text_features = self.model.get_text_features(**inputs)

        # Normalize (important for cosine similarity)
        text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)

        return text_features.cpu().numpy().flatten()

    def search(self, query: str, top_k: int = 5) -> List[Dict]:
        """
        Generate design candidates from text query

        Args:
            query: Text description (e.g., "red sports car")
            top_k: Number of design variants to generate

        Returns:
            List of dictionaries containing:
                - car_id: Car ID
                - image_path: Path to rendering image
                - ldr_path: Path to LDR file
                - similarity: Cosine similarity score (higher is better)
                - rank: Design variant number (1-based)
                - ldr_exists: Whether the LDR file is present on disk
        """
        # Encode text query
        text_feature = self._encode_text(query)

        # Compute cosine similarity with all image features
        # (features are already normalized, so dot product = cosine similarity)
        similarities = self.features @ text_feature

        # Get top-K indices
        top_indices = np.argsort(similarities)[::-1][:top_k]

        # Build results
        results = []
        for rank, idx in enumerate(top_indices, start=1):
            mapping = self.metadata["mappings"][idx]
            results.append({
                "car_id": mapping["car_id"],
                "image_path": os.path.join(self.data_root, mapping["image_path"]),
                "ldr_path": os.path.join(self.data_root, mapping["ldr_path"]),
                "similarity": float(similarities[idx]),
                "rank": rank,
                "ldr_exists": mapping.get("ldr_exists", True)
            })

        return results

    def get_best_match(self, query: str) -> Optional[Dict]:
        """
        Get the single best matching result

        Args:
            query: Text description

        Returns:
            Dictionary with best match information, or None if there are no matches
        """
        results = self.search(query, top_k=1)
        return results[0] if results else None

    def get_ldr_path_from_text(self, query: str) -> str:
        """
        Convenience method: directly get LDR path from text query

        Args:
            query: Text description

        Returns:
            Path to the best matching LDR file
        """
        best_match = self.get_best_match(query)
        if best_match is None:
            raise ValueError("No matches found")
        return best_match["ldr_path"]


# Singleton instance for global access
_global_retriever: Optional[CLIPRetriever] = None


def get_retriever(**kwargs) -> CLIPRetriever:
    """
    Get or create global retriever instance

    This ensures the model is only loaded once.

    Args:
        **kwargs: Passed to CLIPRetriever constructor

    Returns:
        CLIPRetriever instance
    """
    global _global_retriever
    if _global_retriever is None:
        _global_retriever = CLIPRetriever(**kwargs)
    return _global_retriever


if __name__ == "__main__":
    # Simple test
    print("=" * 60)
    print("Testing Design Generation Engine")
    print("=" * 60)

    retriever = CLIPRetriever()

    test_queries = [
        "red sports car",
        "blue police car",
        "yellow construction vehicle",
        "racing car",
        "truck"
    ]

    for query in test_queries:
        print(f"\nQuery: '{query}'")
        results = retriever.search(query, top_k=3)
        for result in results:
            print(f"  Rank {result['rank']}: car_{result['car_id']} "
                  f"(confidence: {result['similarity']:.3f})")