AbstractPhil committed
Commit e4fdf48 · verified · 1 Parent(s): 5731dbc

Update app.py

Files changed (1)
  1. app.py +245 -16
app.py CHANGED
@@ -20,9 +20,17 @@ from diffusers import (
     DPMSolverMultistepScheduler,
     EulerDiscreteScheduler
 )
-from transformers import CLIPTextModel, CLIPTokenizer
+from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5Tokenizer
 from huggingface_hub import hf_hub_download
 
+# Import Lyra VAE from geovocab2
+try:
+    from geovocab2.train.model.vae.vae_lyra import MultiModalVAE, MultiModalVAEConfig
+    LYRA_AVAILABLE = True
+except ImportError:
+    print("⚠️ Lyra VAE not available - install geovocab2")
+    LYRA_AVAILABLE = False
+
 
 # ============================================================================
 # MODEL LOADING
 
@@ -38,7 +46,10 @@ class FlowMatchingPipeline:
         tokenizer: CLIPTokenizer,
         unet: UNet2DConditionModel,
         scheduler,
-        device: str = "cuda"
+        device: str = "cuda",
+        t5_encoder: Optional[T5EncoderModel] = None,
+        t5_tokenizer: Optional[T5Tokenizer] = None,
+        lyra_model: Optional[any] = None
     ):
         self.vae = vae
         self.text_encoder = text_encoder
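All three new parameters default to None, so call sites that stop at device keep working unchanged. A hypothetical construction sketch (the component variables are assumed to be loaded elsewhere; not taken from app.py):

pipe = FlowMatchingPipeline(
    vae=vae,                    # AutoencoderKL
    text_encoder=text_encoder,  # CLIPTextModel
    tokenizer=tokenizer,        # CLIPTokenizer
    unet=unet,                  # UNet2DConditionModel
    scheduler=scheduler,
    device="cuda",
    # t5_encoder, t5_tokenizer and lyra_model default to None,
    # so non-Lyra models build the pipeline exactly as before.
)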
 
@@ -47,6 +58,11 @@ class FlowMatchingPipeline:
         self.scheduler = scheduler
         self.device = device
 
+        # Lyra-specific components
+        self.t5_encoder = t5_encoder
+        self.t5_tokenizer = t5_tokenizer
+        self.lyra_model = lyra_model
+
         # VAE scaling factor
         self.vae_scale_factor = 0.18215
 
@@ -83,6 +99,90 @@ class FlowMatchingPipeline:
 
         return prompt_embeds, negative_prompt_embeds
 
+    def encode_prompt_lyra(self, prompt: str, negative_prompt: str = ""):
+        """Encode text prompts using Lyra VAE (CLIP + T5 fusion)."""
+        if self.lyra_model is None or self.t5_encoder is None:
+            raise ValueError("Lyra VAE components not initialized")
+
+        # Get CLIP embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids.to(self.device)
+
+        with torch.no_grad():
+            clip_embeds = self.text_encoder(text_input_ids)[0]
+
+        # Get T5 embeddings
+        t5_inputs = self.t5_tokenizer(
+            prompt,
+            max_length=77,
+            padding='max_length',
+            truncation=True,
+            return_tensors='pt'
+        ).to(self.device)
+
+        with torch.no_grad():
+            t5_embeds = self.t5_encoder(**t5_inputs).last_hidden_state
+
+        # Fuse through Lyra VAE
+        modality_inputs = {
+            'clip': clip_embeds,
+            't5': t5_embeds
+        }
+
+        with torch.no_grad():
+            reconstructions, mu, logvar = self.lyra_model(
+                modality_inputs,
+                target_modalities=['clip']
+            )
+        prompt_embeds = reconstructions['clip']
+
+        # Process negative prompt
+        if negative_prompt:
+            uncond_inputs = self.tokenizer(
+                negative_prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_input_ids = uncond_inputs.input_ids.to(self.device)
+
+            with torch.no_grad():
+                clip_embeds_uncond = self.text_encoder(uncond_input_ids)[0]
+
+            t5_inputs_uncond = self.t5_tokenizer(
+                negative_prompt,
+                max_length=77,
+                padding='max_length',
+                truncation=True,
+                return_tensors='pt'
+            ).to(self.device)
+
+            with torch.no_grad():
+                t5_embeds_uncond = self.t5_encoder(**t5_inputs_uncond).last_hidden_state
+
+            modality_inputs_uncond = {
+                'clip': clip_embeds_uncond,
+                't5': t5_embeds_uncond
+            }
+
+            with torch.no_grad():
+                reconstructions_uncond, _, _ = self.lyra_model(
+                    modality_inputs_uncond,
+                    target_modalities=['clip']
+                )
+            negative_prompt_embeds = reconstructions_uncond['clip']
+        else:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+
+        return prompt_embeds, negative_prompt_embeds
+
     @torch.no_grad()
     def __call__(
         self,
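Because encode_prompt_lyra reconstructs the CLIP modality, its outputs are meant to slot in wherever the plain CLIP embeddings were used. A hypothetical usage sketch (assumes pipe is a FlowMatchingPipeline built with the T5 and Lyra components; not taken from app.py):

cond, uncond = pipe.encode_prompt_lyra(
    "a crystalline cathedral at dawn, volumetric light",
    negative_prompt="low quality, blurry",
)
# With the default config shown later (seq_len=77, 768-dim modalities),
# both tensors are expected to be [1, 77, 768], the same shape the UNet
# already receives from encode_prompt().
print(cond.shape, uncond.shape)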
 
@@ -96,6 +196,7 @@ class FlowMatchingPipeline:
         use_flow_matching: bool = True,
         prediction_type: str = "epsilon",
         seed: Optional[int] = None,
+        use_lyra: bool = False,
         progress_callback=None
     ):
         """Generate image using flow matching or standard diffusion."""
 
@@ -106,10 +207,15 @@ class FlowMatchingPipeline:
         else:
             generator = None
 
-        # Encode prompts
-        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
-            prompt, negative_prompt
-        )
+        # Encode prompts - use Lyra if specified
+        if use_lyra and self.lyra_model is not None:
+            prompt_embeds, negative_prompt_embeds = self.encode_prompt_lyra(
+                prompt, negative_prompt
+            )
+        else:
+            prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+                prompt, negative_prompt
+            )
 
         # Prepare latents
         latent_channels = 4
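A hypothetical end-to-end call (only arguments visible in this diff are shown; the real __call__ accepts more, and passing the prompt positionally is an assumption):

result = pipe(
    "a futuristic cyberpunk city at night, neon lights",
    negative_prompt="low quality, blurry",
    use_flow_matching=False,   # the UI defaults Lyra to standard diffusion
    prediction_type="epsilon",
    seed=123,
    use_lyra=True,             # routes prompts through encode_prompt_lyra()
)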
 
@@ -257,12 +363,94 @@ def load_lune_checkpoint(repo_id: str, filename: str, device: str = "cuda"):
     return unet.to(device)
 
 
+def load_lyra_vae(repo_id: str = "AbstractPhil/vae-lyra", device: str = "cuda"):
+    """Load Lyra VAE from HuggingFace."""
+    if not LYRA_AVAILABLE:
+        print("⚠️ Lyra VAE not available - geovocab2 not installed")
+        return None
+
+    print(f"🎵 Loading Lyra VAE from {repo_id}...")
+
+    try:
+        # Download checkpoint
+        checkpoint_path = hf_hub_download(
+            repo_id=repo_id,
+            filename="best_model.pt",
+            repo_type="model"
+        )
+
+        print(f"✓ Downloaded checkpoint: {checkpoint_path}")
+
+        # Load checkpoint
+        checkpoint = torch.load(checkpoint_path, map_location="cpu")
+
+        # Extract config
+        if 'config' in checkpoint:
+            config_dict = checkpoint['config']
+        else:
+            # Use default config
+            config_dict = {
+                'modality_dims': {"clip": 768, "t5": 768},
+                'latent_dim': 768,
+                'seq_len': 77,
+                'encoder_layers': 3,
+                'decoder_layers': 3,
+                'hidden_dim': 1024,
+                'dropout': 0.1,
+                'fusion_strategy': 'cantor',
+                'fusion_heads': 8,
+                'fusion_dropout': 0.1
+            }
+
+        # Create VAE config
+        vae_config = MultiModalVAEConfig(
+            modality_dims=config_dict.get('modality_dims', {"clip": 768, "t5": 768}),
+            latent_dim=config_dict.get('latent_dim', 768),
+            seq_len=config_dict.get('seq_len', 77),
+            encoder_layers=config_dict.get('encoder_layers', 3),
+            decoder_layers=config_dict.get('decoder_layers', 3),
+            hidden_dim=config_dict.get('hidden_dim', 1024),
+            dropout=config_dict.get('dropout', 0.1),
+            fusion_strategy=config_dict.get('fusion_strategy', 'cantor'),
+            fusion_heads=config_dict.get('fusion_heads', 8),
+            fusion_dropout=config_dict.get('fusion_dropout', 0.1)
+        )
+
+        # Create model
+        lyra_model = MultiModalVAE(vae_config)
+
+        # Load weights
+        if 'model_state_dict' in checkpoint:
+            lyra_model.load_state_dict(checkpoint['model_state_dict'])
+        else:
+            lyra_model.load_state_dict(checkpoint)
+
+        lyra_model.to(device)
+        lyra_model.eval()
+
+        # Print info
+        print(f"✅ Lyra VAE loaded successfully")
+        if 'global_step' in checkpoint:
+            print(f"   Training step: {checkpoint['global_step']:,}")
+        if 'best_loss' in checkpoint:
+            print(f"   Best loss: {checkpoint['best_loss']:.4f}")
+        print(f"   Fusion strategy: {vae_config.fusion_strategy}")
+        print(f"   Latent dim: {vae_config.latent_dim}")
+
+        return lyra_model
+
+    except Exception as e:
+        print(f"❌ Failed to load Lyra VAE: {e}")
+        return None
+
+
 def initialize_pipeline(model_choice: str, device: str = "cuda"):
     """Initialize the complete pipeline."""
 
     print(f"🚀 Initializing {model_choice} pipeline...")
 
     is_lune = "Lune" in model_choice
+    is_lyra = "Lyra" in model_choice
 
     # Load base components
     print("Loading VAE...")
460
  torch_dtype=torch.float32
461
  ).to(device)
462
 
463
+ print("Loading CLIP text encoder...")
464
  text_encoder = CLIPTextModel.from_pretrained(
465
  "openai/clip-vit-large-patch14",
466
  torch_dtype=torch.float32
 
470
  "openai/clip-vit-large-patch14"
471
  )
472
 
473
+ # Load T5 and Lyra if needed
474
+ t5_encoder = None
475
+ t5_tokenizer = None
476
+ lyra_model = None
477
+
478
+ if is_lyra:
479
+ print("Loading T5-base encoder...")
480
+ t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")
481
+ t5_encoder = T5EncoderModel.from_pretrained(
482
+ "t5-base",
483
+ torch_dtype=torch.float32
484
+ ).to(device)
485
+ t5_encoder.eval()
486
+ print("✓ T5 loaded")
487
+
488
+ print("Loading Lyra VAE...")
489
+ lyra_model = load_lyra_vae(device=device)
490
+ if lyra_model is None:
491
+ raise ValueError("Failed to load Lyra VAE")
492
+
493
  # Load UNet based on model choice
494
  if is_lune:
495
  # Load latest checkpoint from repo
 
498
  filename = "sd15_flow_lune_e34_s34000.pt"
499
  unet = load_lune_checkpoint(repo_id, filename, device)
500
 
501
+ elif is_lyra or model_choice == "SD1.5 Base":
502
+ # Use standard SD1.5 UNet for both Lyra and base
503
  print("Loading SD1.5 base UNet...")
504
  unet = UNet2DConditionModel.from_pretrained(
505
  "runwayml/stable-diffusion-v1-5",
 
524
  tokenizer=tokenizer,
525
  unet=unet,
526
  scheduler=scheduler,
527
+ device=device,
528
+ t5_encoder=t5_encoder,
529
+ t5_tokenizer=t5_tokenizer,
530
+ lyra_model=lyra_model
531
  )
532
 
533
  # Set flag for Lune-specific VAE scaling
 
605
  # Get pipeline
606
  pipeline = get_pipeline(model_choice)
607
 
608
+ # Determine if we should use Lyra encoding
609
+ use_lyra = "Lyra" in model_choice
610
+
611
  # Generate
612
  progress(0.05, desc="Starting generation...")
613
 
 
@@ -407,6 +622,7 @@ def generate_image(
         use_flow_matching=use_flow_matching,
         prediction_type=prediction_type,
         seed=seed,
+        use_lyra=use_lyra,
         progress_callback=progress_callback
     )
 
@@ -432,7 +648,11 @@ def create_demo():
 
     **Geometric crystalline diffusion with flow matching** by [AbstractPhil](https://huggingface.co/AbstractPhil)
 
-    Generate images using SD1.5-based flow matching with pentachoron geometric structures.
+    Generate images using SD1.5-based models with geometric deep learning approaches:
+    - **Flow-Lune**: Flow matching with pentachoron geometric structures
+    - **Lyra-VAE**: Multi-modal fusion (CLIP+T5) via geometric attention
+    - **SD1.5 Base**: Standard baseline for comparison
+
     Achieves high quality with dramatically reduced step counts through geometric efficiency.
     """)
 
@@ -457,6 +677,7 @@ def create_demo():
                 label="Model",
                 choices=[
                     "Flow-Lune (Latest)",
+                    "Lyra-VAE (Geometric Fusion)",
                     "SD1.5 Base"
                 ],
                 value="Flow-Lune (Latest)"
 
@@ -554,11 +775,13 @@ def create_demo():
     - **Shift** controls the flow trajectory (2.0-2.5 recommended for Lune)
     - Lower shift = more direct path, higher shift = more exploration
     - **Lune** uses v_prediction by default for optimal results
+    - **Lyra** fuses CLIP+T5 encoders through geometric VAE for richer embeddings
     - **SD1.5 Base** uses epsilon (standard diffusion)
     - Lune operates in a scaled latent space (5.52x) for geometric efficiency
 
     ### Model Info:
     - **Flow-Lune**: Trained with flow matching on 500k SD1.5 distillation pairs
+    - **Lyra-VAE**: Multi-modal fusion (CLIP+T5) via Cantor geometric attention
     - **SD1.5 Base**: Standard Stable Diffusion 1.5 for comparison
 
     [📚 Learn more about geometric deep learning](https://github.com/AbstractEyes/lattice_vocabulary)
 
@@ -584,14 +807,14 @@ def create_demo():
             [
                 "A futuristic cyberpunk city at night, neon lights, rain-slicked streets, highly detailed",
                 "low quality, blurry",
-                "Flow-Lune (Latest)",
-                22,
-                8.0,
+                "Lyra-VAE (Geometric Fusion)",
+                30,
+                7.5,
                 512,
                 512,
-                2.5,
-                True,
-                "v_prediction",
+                0.0,
+                False,
+                "epsilon",
                 123,
                 False
             ],
 
@@ -631,6 +854,12 @@ def create_demo():
                 use_flow_matching: gr.update(value=False),
                 prediction_type: gr.update(value="epsilon")
             }
+        elif model_name == "Lyra-VAE (Geometric Fusion)":
+            # Lyra: disable flow matching (uses standard diffusion), use epsilon
+            return {
+                use_flow_matching: gr.update(value=False),
+                prediction_type: gr.update(value="epsilon")
+            }
         else:
             # Lune: enable flow matching, use v_prediction
             return {
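Handlers that return a dict keyed by components, like the one above, need those components listed as outputs when the event is wired up. A hypothetical wiring sketch (the handler name on_model_change is assumed; not from app.py):

model_choice.change(
    fn=on_model_change,
    inputs=model_choice,
    outputs=[use_flow_matching, prediction_type],
)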