Update app.py
app.py CHANGED
@@ -444,13 +444,13 @@ def load_lyra_vae(repo_id: str = "AbstractPhil/vae-lyra", device: str = "cuda"):
     return None


-def initialize_pipeline(model_choice: str, device: str = "cuda"):
     """Initialize the complete pipeline."""

     print(f"🚀 Initializing {model_choice} pipeline...")

     is_lune = "Lune" in model_choice
-    is_lyra = "Lyra" in model_choice

     # Load base components
     print("Loading VAE...")
@@ -460,46 +460,40 @@ def initialize_pipeline(model_choice: str, device: str = "cuda"):
         torch_dtype=torch.float32
     ).to(device)

-    print("Loading CLIP text encoder...")
     text_encoder = CLIPTextModel.from_pretrained(
-
         torch_dtype=torch.float32
     ).to(device)

     tokenizer = CLIPTokenizer.from_pretrained(
-
     )

-    #
-
-    t5_tokenizer =
-

-
-
-
-
-        "t5-base",
-        torch_dtype=torch.float32
-    ).to(device)
-    t5_encoder.eval()
-    print("✅ T5 loaded")
-
-    print("Loading Lyra VAE...")
-    lyra_model = load_lyra_vae(device=device)
-    if lyra_model is None:
-        raise ValueError("Failed to load Lyra VAE")

     # Load UNet based on model choice
     if is_lune:
         # Load latest checkpoint from repo
         repo_id = "AbstractPhil/sd15-flow-lune"
-        # Find latest checkpoint - for now use a known one
         filename = "sd15_flow_lune_e34_s34000.pt"
         unet = load_lune_checkpoint(repo_id, filename, device)

-    elif
-        # Use standard SD1.5 UNet
         print("Loading SD1.5 base UNet...")
         unet = UNet2DConditionModel.from_pretrained(
             "runwayml/stable-diffusion-v1-5",
@@ -543,15 +537,17 @@ def initialize_pipeline(model_choice: str, device: str = "cuda"):
 # Initialize with None, will load on first inference
 CURRENT_PIPELINE = None
 CURRENT_MODEL = None


-def get_pipeline(model_choice: str):
-    """Get or create pipeline for selected model."""
-    global CURRENT_PIPELINE, CURRENT_MODEL

-    if CURRENT_PIPELINE is None or CURRENT_MODEL != model_choice:
-        CURRENT_PIPELINE = initialize_pipeline(model_choice, device="cuda")
         CURRENT_MODEL = model_choice

     return CURRENT_PIPELINE

@@ -560,7 +556,7 @@ def get_pipeline(model_choice: str):
 # INFERENCE
 # ============================================================================

-def estimate_duration(num_steps: int, width: int, height: int) -> int:
     """Estimate GPU duration based on generation parameters."""
     # Base time per step (seconds)
     base_time_per_step = 0.3
@@ -568,18 +564,24 @@
     # Resolution scaling
     resolution_factor = (width * height) / (512 * 512)

-    # Total estimate
     estimated = num_steps * base_time_per_step * resolution_factor

     # Add 15 seconds for model loading overhead
     return int(estimated + 15)


-@spaces.GPU(duration=lambda *args: estimate_duration(args[
 def generate_image(
     prompt: str,
     negative_prompt: str,
     model_choice: str,
     num_steps: int,
     cfg_scale: float,
     width: int,
@@ -587,11 +589,12 @@
     shift: float,
     use_flow_matching: bool,
     prediction_type: str,
     seed: int,
     randomize_seed: bool,
     progress=gr.Progress()
 ):
-    """Generate image with ZeroGPU support."""

     # Randomize seed if requested
     if randomize_seed:
@@ -603,32 +606,68 @@

     try:
         # Get pipeline
-        pipeline = get_pipeline(model_choice)
-
-        # Determine if we should use Lyra encoding
-        use_lyra = "Lyra" in model_choice
-
-        # Generate
-        progress(0.05, desc="Starting generation...")
-
-        image = pipeline(
-            prompt=prompt,
-            negative_prompt=negative_prompt,
-            height=height,
-            width=width,
-            num_inference_steps=num_steps,
-            guidance_scale=cfg_scale,
-            shift=shift,
-            use_flow_matching=use_flow_matching,
-            prediction_type=prediction_type,
-            seed=seed,
-            use_lyra=use_lyra,
-            progress_callback=progress_callback
-        )

-

-

     except Exception as e:
         print(f"❌ Generation failed: {e}")
@@ -648,12 +687,13 @@ def create_demo():

         **Geometric crystalline diffusion with flow matching** by [AbstractPhil](https://huggingface.co/AbstractPhil)

-        Generate images using SD1.5-based models with geometric deep learning
-        - **Flow-Lune**: Flow matching with pentachoron geometric structures
-        - **
-        - **

-
         """)

         with gr.Row():
@@ -674,15 +714,34 @@

                 # Model selection
                 model_choice = gr.Dropdown(
-                    label="Model",
                     choices=[
                         "Flow-Lune (Latest)",
-                        "Lyra-VAE (Geometric Fusion)",
                         "SD1.5 Base"
                     ],
                     value="Flow-Lune (Latest)"
                 )

                 # Flow matching settings
                 with gr.Accordion("Flow Matching Settings", open=True):
                     use_flow_matching = gr.Checkbox(
@@ -759,10 +818,18 @@
                 generate_btn = gr.Button("🎨 Generate", variant="primary", size="lg")

             with gr.Column(scale=1):
-
-
-
-

                 output_seed = gr.Number(
                     label="Used Seed",
@@ -775,15 +842,22 @@
                 - **Shift** controls the flow trajectory (2.0-2.5 recommended for Lune)
                 - Lower shift = more direct path, higher shift = more exploration
                 - **Lune** uses v_prediction by default for optimal results
-                - **Lyra**
                 - **SD1.5 Base** uses epsilon (standard diffusion)
                 - Lune operates in a scaled latent space (5.52x) for geometric efficiency

                 ### Model Info:
                 - **Flow-Lune**: Trained with flow matching on 500k SD1.5 distillation pairs
-                - **Lyra
                 - **SD1.5 Base**: Standard Stable Diffusion 1.5 for comparison

                 [📚 Learn more about geometric deep learning](https://github.com/AbstractEyes/lattice_vocabulary)
                 """)

@@ -794,6 +868,7 @@
                     "A serene mountain landscape at golden hour, crystal clear lake reflecting snow-capped peaks, photorealistic, 8k",
                     "blurry, low quality",
                     "Flow-Lune (Latest)",
                     20,
                     7.5,
                     512,
@@ -801,44 +876,49 @@
                    2.5,
                    True,
                    "v_prediction",
                    42,
                    False
                ],
                [
                    "A futuristic cyberpunk city at night, neon lights, rain-slicked streets, highly detailed",
                    "low quality, blurry",
-                   "
-
                    7.5,
                    512,
                    512,
-
-
-                   "
                    123,
                    False
                ],
                [
                    "Portrait of a majestic lion, golden mane, dramatic lighting, wildlife photography",
                    "cartoon, painting",
-                   "
-
-
                    512,
                    512,
-
                    True,
-                   "v_prediction",
                    456,
                    False
                ]
            ],
            inputs=[
-               prompt, negative_prompt, model_choice, num_steps, cfg_scale,
-               width, height, shift, use_flow_matching, prediction_type,
                seed, randomize_seed
            ],
-           outputs=[
            fn=generate_image,
            cache_examples=False
        )
@@ -854,12 +934,6 @@
                    use_flow_matching: gr.update(value=False),
                    prediction_type: gr.update(value="epsilon")
                }
-           elif model_name == "Lyra-VAE (Geometric Fusion)":
-               # Lyra: disable flow matching (uses standard diffusion), use epsilon
-               return {
-                   use_flow_matching: gr.update(value=False),
-                   prediction_type: gr.update(value="epsilon")
-               }
            else:
                # Lune: enable flow matching, use v_prediction
                return {
@@ -867,20 +941,40 @@
                    prediction_type: gr.update(value="v_prediction")
                }

        model_choice.change(
            fn=on_model_change,
            inputs=[model_choice],
            outputs=[use_flow_matching, prediction_type]
        )

        generate_btn.click(
            fn=generate_image,
            inputs=[
-               prompt, negative_prompt, model_choice, num_steps, cfg_scale,
-               width, height, shift, use_flow_matching, prediction_type,
                seed, randomize_seed
            ],
-           outputs=[
        )

    return demo
@@ -444,13 +444,13 @@
     return None


+def initialize_pipeline(model_choice: str, clip_model: str = "openai/clip-vit-large-patch14", device: str = "cuda"):
     """Initialize the complete pipeline."""

     print(f"🚀 Initializing {model_choice} pipeline...")
+    print(f"  CLIP model: {clip_model}")

     is_lune = "Lune" in model_choice

     # Load base components
     print("Loading VAE...")
|
| 460 |
torch_dtype=torch.float32
|
| 461 |
).to(device)
|
| 462 |
|
| 463 |
+
print(f"Loading CLIP text encoder: {clip_model}...")
|
| 464 |
text_encoder = CLIPTextModel.from_pretrained(
|
| 465 |
+
clip_model,
|
| 466 |
torch_dtype=torch.float32
|
| 467 |
).to(device)
|
| 468 |
|
| 469 |
tokenizer = CLIPTokenizer.from_pretrained(
|
| 470 |
+
clip_model
|
| 471 |
)
|
| 472 |
|
| 473 |
+
# Always load T5 and Lyra for potential use
|
| 474 |
+
print("Loading T5-base encoder...")
|
| 475 |
+
t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")
|
| 476 |
+
t5_encoder = T5EncoderModel.from_pretrained(
|
| 477 |
+
"t5-base",
|
| 478 |
+
torch_dtype=torch.float32
|
| 479 |
+
).to(device)
|
| 480 |
+
t5_encoder.eval()
|
| 481 |
+
print("ā T5 loaded")
|
| 482 |
|
| 483 |
+
print("Loading Lyra VAE...")
|
| 484 |
+
lyra_model = load_lyra_vae(device=device)
|
| 485 |
+
if lyra_model is None:
|
| 486 |
+
print("ā ļø Lyra VAE not available - fusion disabled")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
|
| 488 |
# Load UNet based on model choice
|
| 489 |
if is_lune:
|
| 490 |
# Load latest checkpoint from repo
|
| 491 |
repo_id = "AbstractPhil/sd15-flow-lune"
|
|
|
|
| 492 |
filename = "sd15_flow_lune_e34_s34000.pt"
|
| 493 |
unet = load_lune_checkpoint(repo_id, filename, device)
|
| 494 |
|
| 495 |
+
elif model_choice == "SD1.5 Base":
|
| 496 |
+
# Use standard SD1.5 UNet
|
| 497 |
print("Loading SD1.5 base UNet...")
|
| 498 |
unet = UNet2DConditionModel.from_pretrained(
|
| 499 |
"runwayml/stable-diffusion-v1-5",
|
|
|
|
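Note on the swappable text encoder: the SD1.5 UNet's cross-attention layers expect 768-dimensional CLIP-L hidden states, so a variant from the new CLIP dropdown only drops in cleanly when its text encoder matches that width. A minimal sanity-check sketch (the helper name and the 768 expectation are illustrative assumptions, not part of this commit):

```python
from transformers import CLIPTextModel

def clip_matches_sd15(clip_model: str, expected_dim: int = 768) -> bool:
    # Hypothetical helper: verify the chosen CLIP text encoder produces
    # hidden states of the width the SD1.5 UNet cross-attention expects.
    encoder = CLIPTextModel.from_pretrained(clip_model)
    return encoder.config.hidden_size == expected_dim
```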
@@ -543,15 +537,17 @@
 # Initialize with None, will load on first inference
 CURRENT_PIPELINE = None
 CURRENT_MODEL = None
+CURRENT_CLIP_MODEL = None


+def get_pipeline(model_choice: str, clip_model: str):
+    """Get or create pipeline for selected model and CLIP variant."""
+    global CURRENT_PIPELINE, CURRENT_MODEL, CURRENT_CLIP_MODEL

+    if CURRENT_PIPELINE is None or CURRENT_MODEL != model_choice or CURRENT_CLIP_MODEL != clip_model:
+        CURRENT_PIPELINE = initialize_pipeline(model_choice, clip_model, device="cuda")
         CURRENT_MODEL = model_choice
+        CURRENT_CLIP_MODEL = clip_model

     return CURRENT_PIPELINE

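The pipeline cache now keys on both dropdown values: repeated calls with the same pair reuse the loaded pipeline, while changing either the base model or the CLIP variant triggers a rebuild. An illustrative call sequence using the defaults shown above:

```python
pipe = get_pipeline("Flow-Lune (Latest)", "openai/clip-vit-large-patch14")      # first call: builds
pipe = get_pipeline("Flow-Lune (Latest)", "openai/clip-vit-large-patch14")      # same pair: reused
pipe = get_pipeline("Flow-Lune (Latest)", "openai/clip-vit-large-patch14-336")  # CLIP changed: rebuilds
```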
@@ -560,7 +556,7 @@
 # INFERENCE
 # ============================================================================

+def estimate_duration(num_steps: int, width: int, height: int, use_lyra: bool = False) -> int:
     """Estimate GPU duration based on generation parameters."""
     # Base time per step (seconds)
     base_time_per_step = 0.3

@@ -568,18 +564,24 @@
     # Resolution scaling
     resolution_factor = (width * height) / (512 * 512)

+    # Total estimate for one generation
     estimated = num_steps * base_time_per_step * resolution_factor

+    # If Lyra enabled, we generate twice
+    if use_lyra:
+        estimated *= 2
+        estimated += 2  # Extra overhead for dual generation
+
     # Add 15 seconds for model loading overhead
     return int(estimated + 15)


+@spaces.GPU(duration=lambda *args: estimate_duration(args[4], args[6], args[7], args[11]))
 def generate_image(
     prompt: str,
     negative_prompt: str,
     model_choice: str,
+    clip_model: str,
     num_steps: int,
     cfg_scale: float,
     width: int,

@@ -587,11 +589,12 @@
     shift: float,
     use_flow_matching: bool,
     prediction_type: str,
+    use_lyra: bool,
     seed: int,
     randomize_seed: bool,
     progress=gr.Progress()
 ):
+    """Generate image with ZeroGPU support. Returns (standard_img, lyra_img, seed) or (img, None, seed)."""

     # Randomize seed if requested
     if randomize_seed:

@@ -603,32 +606,68 @@

     try:
         # Get pipeline
+        pipeline = get_pipeline(model_choice, clip_model)

+        if not use_lyra or pipeline.lyra_model is None:
+            # Standard generation only
+            progress(0.05, desc="Generating (standard)...")
+
+            image = pipeline(
+                prompt=prompt,
+                negative_prompt=negative_prompt,
+                height=height,
+                width=width,
+                num_inference_steps=num_steps,
+                guidance_scale=cfg_scale,
+                shift=shift,
+                use_flow_matching=use_flow_matching,
+                prediction_type=prediction_type,
+                seed=seed,
+                use_lyra=False,
+                progress_callback=progress_callback
+            )
+
+            progress(1.0, desc="Complete!")
+            return image, None, seed

+        else:
+            # Generate both standard and Lyra versions
+            progress(0.05, desc="Generating standard version...")
+
+            image_standard = pipeline(
+                prompt=prompt,
+                negative_prompt=negative_prompt,
+                height=height,
+                width=width,
+                num_inference_steps=num_steps,
+                guidance_scale=cfg_scale,
+                shift=shift,
+                use_flow_matching=use_flow_matching,
+                prediction_type=prediction_type,
+                seed=seed,
+                use_lyra=False,
+                progress_callback=lambda s, t, d: progress(0.05 + (s/t) * 0.45, desc=d)
+            )
+
+            progress(0.5, desc="Generating Lyra fusion version...")
+
+            image_lyra = pipeline(
+                prompt=prompt,
+                negative_prompt=negative_prompt,
+                height=height,
+                width=width,
+                num_inference_steps=num_steps,
+                guidance_scale=cfg_scale,
+                shift=shift,
+                use_flow_matching=use_flow_matching,
+                prediction_type=prediction_type,
+                seed=seed,
+                use_lyra=True,
+                progress_callback=lambda s, t, d: progress(0.5 + (s/t) * 0.45, desc=d)
+            )
+
+            progress(1.0, desc="Complete!")
+            return image_standard, image_lyra, seed

     except Exception as e:
         print(f"❌ Generation failed: {e}")
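The `duration` lambda indexes into `generate_image`'s positional arguments, so it is tightly coupled to the parameter order above: with `clip_model` and `use_lyra` added, `args[4]`, `args[6]`, `args[7]`, and `args[11]` line up with `num_steps`, `width`, `height`, and `use_lyra`. A named equivalent of the same mapping (the `gpu_duration` name is illustrative, not in the commit); if the signature order changes again, these indices must change with it:

```python
# Positional order of generate_image after this change:
#   0 prompt, 1 negative_prompt, 2 model_choice, 3 clip_model, 4 num_steps,
#   5 cfg_scale, 6 width, 7 height, 8 shift, 9 use_flow_matching,
#   10 prediction_type, 11 use_lyra, 12 seed, 13 randomize_seed
def gpu_duration(*args) -> int:
    num_steps, width, height, use_lyra = args[4], args[6], args[7], args[11]
    return estimate_duration(num_steps, width, height, use_lyra)

# @spaces.GPU(duration=gpu_duration) would be equivalent to the lambda used above.
```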
@@ -648,12 +687,13 @@

         **Geometric crystalline diffusion with flow matching** by [AbstractPhil](https://huggingface.co/AbstractPhil)

+        Generate images using SD1.5-based models with geometric deep learning:
+        - **Flow-Lune**: Flow matching with pentachoron geometric structures (15-25 steps)
+        - **SD1.5 Base**: Standard Stable Diffusion 1.5 baseline
+        - **Lyra VAE Toggle**: Add CLIP+T5 fusion for side-by-side comparison
+        - **CLIP Variants**: Different text encoders for varied semantic understanding

+        Enable Lyra to see both standard CLIP and geometric CLIP+T5 fusion results!
         """)

         with gr.Row():
@@ -674,15 +714,34 @@

                 # Model selection
                 model_choice = gr.Dropdown(
+                    label="Base Model",
                     choices=[
                         "Flow-Lune (Latest)",
                         "SD1.5 Base"
                     ],
                     value="Flow-Lune (Latest)"
                 )

+                # CLIP model selection
+                clip_model_choice = gr.Dropdown(
+                    label="CLIP Model",
+                    choices=[
+                        "openai/clip-vit-large-patch14",
+                        "openai/clip-vit-large-patch14-336",
+                        "laion/CLIP-ViT-L-14-laion2B-s32B-b82K",
+                        "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
+                    ],
+                    value="openai/clip-vit-large-patch14",
+                    info="Text encoder variant"
+                )
+
+                # Lyra toggle
+                use_lyra = gr.Checkbox(
+                    label="Enable Lyra VAE (CLIP+T5 Fusion)",
+                    value=False,
+                    info="Generate side-by-side comparison with geometric fusion"
+                )
+
                 # Flow matching settings
                 with gr.Accordion("Flow Matching Settings", open=True):
                     use_flow_matching = gr.Checkbox(
@@ -759,10 +818,18 @@
                 generate_btn = gr.Button("🎨 Generate", variant="primary", size="lg")

             with gr.Column(scale=1):
+                with gr.Row():
+                    output_image_standard = gr.Image(
+                        label="Standard Generation",
+                        type="pil",
+                        visible=True
+                    )
+
+                    output_image_lyra = gr.Image(
+                        label="Lyra Fusion 🔵",
+                        type="pil",
+                        visible=False
+                    )

                 output_seed = gr.Number(
                     label="Used Seed",
@@ -775,15 +842,22 @@
                 - **Shift** controls the flow trajectory (2.0-2.5 recommended for Lune)
                 - Lower shift = more direct path, higher shift = more exploration
                 - **Lune** uses v_prediction by default for optimal results
+                - **Lyra toggle** generates side-by-side comparison (CLIP vs CLIP+T5 fusion)
+                - **CLIP variants** may give different semantic interpretations
                 - **SD1.5 Base** uses epsilon (standard diffusion)
                 - Lune operates in a scaled latent space (5.52x) for geometric efficiency

                 ### Model Info:
                 - **Flow-Lune**: Trained with flow matching on 500k SD1.5 distillation pairs
+                - **Lyra VAE**: Multi-modal fusion (CLIP+T5) via Cantor geometric attention
                 - **SD1.5 Base**: Standard Stable Diffusion 1.5 for comparison

+                ### CLIP Models:
+                - **openai/clip-vit-large-patch14**: Standard CLIP-L (default)
+                - **openai/clip-vit-large-patch14-336**: Higher resolution CLIP-L
+                - **laion/CLIP-ViT-L-14**: LAION-trained CLIP-L variant
+                - **laion/CLIP-ViT-bigG-14**: Larger CLIP-G model
+
                 [📚 Learn more about geometric deep learning](https://github.com/AbstractEyes/lattice_vocabulary)
                 """)

@@ -794,6 +868,7 @@
                    "A serene mountain landscape at golden hour, crystal clear lake reflecting snow-capped peaks, photorealistic, 8k",
                    "blurry, low quality",
                    "Flow-Lune (Latest)",
+                   "openai/clip-vit-large-patch14",
                    20,
                    7.5,
                    512,

@@ -801,44 +876,49 @@
                    2.5,
                    True,
                    "v_prediction",
+                   False,
                    42,
                    False
                ],
                [
                    "A futuristic cyberpunk city at night, neon lights, rain-slicked streets, highly detailed",
                    "low quality, blurry",
+                   "Flow-Lune (Latest)",
+                   "openai/clip-vit-large-patch14",
+                   20,
                    7.5,
                    512,
                    512,
+                   2.5,
+                   True,
+                   "v_prediction",
+                   True,
                    123,
                    False
                ],
                [
                    "Portrait of a majestic lion, golden mane, dramatic lighting, wildlife photography",
                    "cartoon, painting",
+                   "SD1.5 Base",
+                   "openai/clip-vit-large-patch14",
+                   30,
+                   7.5,
                    512,
                    512,
+                   0.0,
+                   False,
+                   "epsilon",
                    True,
                    456,
                    False
                ]
            ],
            inputs=[
+               prompt, negative_prompt, model_choice, clip_model_choice, num_steps, cfg_scale,
+               width, height, shift, use_flow_matching, prediction_type, use_lyra,
                seed, randomize_seed
            ],
+           outputs=[output_image_standard, output_image_lyra, output_seed],
            fn=generate_image,
            cache_examples=False
        )
@@ -854,12 +934,6 @@
                    use_flow_matching: gr.update(value=False),
                    prediction_type: gr.update(value="epsilon")
                }
            else:
                # Lune: enable flow matching, use v_prediction
                return {

@@ -867,20 +941,40 @@
                    prediction_type: gr.update(value="v_prediction")
                }

+        # Update image visibility when Lyra toggle changes
+        def on_lyra_toggle(lyra_enabled):
+            """Show/hide Lyra comparison image."""
+            if lyra_enabled:
+                return {
+                    output_image_standard: gr.update(visible=True, label="Standard CLIP"),
+                    output_image_lyra: gr.update(visible=True, label="Lyra Fusion (CLIP+T5) 🔵")
+                }
+            else:
+                return {
+                    output_image_standard: gr.update(visible=True, label="Generated Image"),
+                    output_image_lyra: gr.update(visible=False)
+                }
+
        model_choice.change(
            fn=on_model_change,
            inputs=[model_choice],
            outputs=[use_flow_matching, prediction_type]
        )

+        use_lyra.change(
+            fn=on_lyra_toggle,
+            inputs=[use_lyra],
+            outputs=[output_image_standard, output_image_lyra]
+        )
+
        generate_btn.click(
            fn=generate_image,
            inputs=[
+               prompt, negative_prompt, model_choice, clip_model_choice, num_steps, cfg_scale,
+               width, height, shift, use_flow_matching, prediction_type, use_lyra,
                seed, randomize_seed
            ],
+           outputs=[output_image_standard, output_image_lyra, output_seed]
        )

    return demo
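`on_lyra_toggle` returns a dict keyed by output components, which Gradio matches against the `outputs` list passed to `use_lyra.change`. A self-contained sketch of the same show/hide pattern (component and function names are illustrative, not from app.py):

```python
import gradio as gr

with gr.Blocks() as demo:
    compare = gr.Checkbox(label="Compare", value=False)
    img_a = gr.Image(label="Generated Image")
    img_b = gr.Image(label="Comparison", visible=False)

    def toggle(enabled: bool):
        # Dict return: keys are output components, values are gr.update(...) patches
        return {
            img_a: gr.update(label="Standard" if enabled else "Generated Image"),
            img_b: gr.update(visible=enabled),
        }

    compare.change(fn=toggle, inputs=[compare], outputs=[img_a, img_b])
```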