Ahmedbelaid1 committed on
Commit e065062 · verified · 1 Parent(s): 7e74ef2

Upload 41 files

Files changed (41)
  1. .dockerignore +55 -0
  2. .gitattributes +2 -35
  3. .gitignore +164 -0
  4. API_README.md +209 -0
  5. DEPLOYMENT_GUIDE.md +271 -0
  6. Dockerfile +56 -0
  7. LICENSE +21 -0
  8. REACT_INTEGRATION.md +549 -0
  9. README.md +143 -11
  10. README_DOCKER.md +158 -0
  11. README_HF_SPACES.md +143 -0
  12. api_example.html +194 -0
  13. api_server.py +403 -0
  14. app.py +171 -0
  15. deploy_colab.ipynb +268 -0
  16. docker-compose.yml +31 -0
  17. gradio_app.py +187 -0
  18. requirements.txt +16 -0
  19. run.py +197 -0
  20. tsr/__pycache__/bake_texture.cpython-313.pyc +0 -0
  21. tsr/__pycache__/system.cpython-313.pyc +0 -0
  22. tsr/__pycache__/utils.cpython-313.pyc +0 -0
  23. tsr/bake_texture.py +191 -0
  24. tsr/models/__pycache__/isosurface.cpython-313.pyc +0 -0
  25. tsr/models/__pycache__/nerf_renderer.cpython-313.pyc +0 -0
  26. tsr/models/__pycache__/network_utils.cpython-313.pyc +0 -0
  27. tsr/models/isosurface.py +64 -0
  28. tsr/models/nerf_renderer.py +180 -0
  29. tsr/models/network_utils.py +124 -0
  30. tsr/models/tokenizers/__pycache__/image.cpython-313.pyc +0 -0
  31. tsr/models/tokenizers/__pycache__/triplane.cpython-313.pyc +0 -0
  32. tsr/models/tokenizers/image.py +66 -0
  33. tsr/models/tokenizers/triplane.py +45 -0
  34. tsr/models/transformer/__pycache__/attention.cpython-313.pyc +0 -0
  35. tsr/models/transformer/__pycache__/basic_transformer_block.cpython-313.pyc +0 -0
  36. tsr/models/transformer/__pycache__/transformer_1d.cpython-313.pyc +0 -0
  37. tsr/models/transformer/attention.py +653 -0
  38. tsr/models/transformer/basic_transformer_block.py +334 -0
  39. tsr/models/transformer/transformer_1d.py +219 -0
  40. tsr/system.py +205 -0
  41. tsr/utils.py +510 -0
.dockerignore ADDED
@@ -0,0 +1,55 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ *.egg-info/
+ dist/
+ build/
+ *.egg
+
+ # Virtual environments
+ venv/
+ env/
+ ENV/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # Git
+ .git/
+ .gitignore
+
+ # Docker
+ Dockerfile
+ .dockerignore
+
+ # Output files
+ output/
+ *.obj
+ *.glb
+ *.zip
+ *.png
+ *.jpg
+ *.jpeg
+
+ # Logs
+ *.log
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Model cache (will be downloaded at runtime)
+ .cache/
+
+ # Temporary files
+ *.tmp
+ temp/
.gitattributes CHANGED
@@ -1,35 +1,2 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # Auto detect text files and perform LF normalization
+ * text=auto
.gitignore ADDED
@@ -0,0 +1,164 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # default output directory
+ output/
+ outputs/
API_README.md ADDED
@@ -0,0 +1,209 @@
+ # TripoSR REST API
+
+ A FastAPI-based REST API server for TripoSR 3D mesh generation.
+
+ ## Starting the Server
+
+ ```bash
+ python api_server.py
+ ```
+
+ Or with uvicorn directly:
+
+ ```bash
+ uvicorn api_server:app --host 0.0.0.0 --port 8000
+ ```
+
+ The API will be available at `http://localhost:8000`.
+
+ ## API Endpoints
+
+ ### 1. Health Check
+
+ **GET** `/health`
+
+ Check if the API is running and get device information.
+
+ **Response:**
+ ```json
+ {
+   "status": "healthy",
+   "device": "cuda:0",
+   "cuda_available": true
+ }
+ ```
+
+ ### 2. Generate Mesh (File Download)
+
+ **POST** `/generate`
+
+ Generate a 3D mesh from an uploaded image and download the mesh file.
+
+ **Parameters:**
+ - `image` (file, required): Image file (PNG, JPG, JPEG)
+ - `do_remove_background` (boolean, default: true): Whether to remove the background
+ - `foreground_ratio` (float, default: 0.85): Ratio of foreground size (0.5-1.0)
+ - `mc_resolution` (int, default: 256): Marching cubes resolution (128, 160, 192, 224, 256, 288, 320)
+ - `format` (string, default: "obj"): Output format - "obj" or "glb"
+
+ **Response:** Mesh file download
+
+ **Example (cURL):**
+ ```bash
+ curl -X POST "http://localhost:8000/generate" \
+   -F "image=@chair.png" \
+   -F "do_remove_background=true" \
+   -F "foreground_ratio=0.85" \
+   -F "mc_resolution=256" \
+   -F "format=obj" \
+   --output mesh.obj
+ ```
+
+ ### 3. Generate Mesh (Base64)
+
+ **POST** `/generate-base64`
+
+ Generate a 3D mesh and return it as a base64-encoded string.
+
+ **Parameters:** Same as `/generate`
+
+ **Response:**
+ ```json
+ {
+   "success": true,
+   "format": "obj",
+   "mesh": "base64_encoded_mesh_data...",
+   "size": 1234567
+ }
+ ```
+
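+ The base64 payload can be decoded back into a binary mesh file on the client. A minimal sketch using `requests`, assuming only the response shape shown above:
+
+ ```python
+ import base64
+ import requests
+
+ with open("chair.png", "rb") as f:
+     resp = requests.post(
+         "http://localhost:8000/generate-base64",
+         files={"image": f},
+         data={"format": "obj"},
+     )
+ resp.raise_for_status()
+
+ payload = resp.json()
+ if payload.get("success"):
+     # "mesh" holds the base64 string; decode it back to raw bytes
+     mesh_bytes = base64.b64decode(payload["mesh"])
+     with open(f"mesh.{payload['format']}", "wb") as out:
+         out.write(mesh_bytes)
+ ```
+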
+ ## Frontend Integration Examples
+
+ ### JavaScript/Fetch
+
+ ```javascript
+ const formData = new FormData();
+ formData.append('image', imageFile);
+ formData.append('do_remove_background', true);
+ formData.append('foreground_ratio', 0.85);
+ formData.append('mc_resolution', 256);
+ formData.append('format', 'obj');
+
+ const response = await fetch('http://localhost:8000/generate', {
+   method: 'POST',
+   body: formData
+ });
+
+ if (response.ok) {
+   const blob = await response.blob();
+   const url = window.URL.createObjectURL(blob);
+   // Download or use the mesh file
+   const a = document.createElement('a');
+   a.href = url;
+   a.download = 'mesh.obj';
+   a.click();
+ }
+ ```
+
+ ### React Example
+
+ ```jsx
+ import { useState } from 'react';
+
+ function MeshGenerator() {
+   const [loading, setLoading] = useState(false);
+
+   const generateMesh = async (imageFile) => {
+     setLoading(true);
+     const formData = new FormData();
+     formData.append('image', imageFile);
+     formData.append('do_remove_background', true);
+     formData.append('foreground_ratio', 0.85);
+     formData.append('mc_resolution', 256);
+     formData.append('format', 'obj');
+
+     try {
+       const response = await fetch('http://localhost:8000/generate', {
+         method: 'POST',
+         body: formData
+       });
+
+       if (response.ok) {
+         const blob = await response.blob();
+         // Handle the mesh file
+         const url = window.URL.createObjectURL(blob);
+         // Download or display
+       }
+     } catch (error) {
+       console.error('Error:', error);
+     } finally {
+       setLoading(false);
+     }
+   };
+
+   return (
+     <div>
+       <input
+         type="file"
+         accept="image/*"
+         onChange={(e) => generateMesh(e.target.files[0])}
+       />
+       {loading && <p>Generating mesh...</p>}
+     </div>
+   );
+ }
+ ```
+
+ ### Python Client Example
+
+ ```python
+ import requests
+
+ url = "http://localhost:8000/generate"
+
+ with open("chair.png", "rb") as f:
+     files = {"image": f}
+     data = {
+         "do_remove_background": True,
+         "foreground_ratio": 0.85,
+         "mc_resolution": 256,
+         "format": "obj"
+     }
+     response = requests.post(url, files=files, data=data)
+
+ if response.status_code == 200:
+     with open("mesh.obj", "wb") as out:
+         out.write(response.content)
+     print("Mesh saved to mesh.obj")
+ ```
+
+ ## CORS Configuration
+
+ The API is configured to allow CORS from all origins by default. For production, update `allow_origins` in `api_server.py`:
+
+ ```python
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["https://your-frontend-domain.com"],  # Your frontend URL
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+ ```
+
+ ## Performance Notes
+
+ - Model initialization takes ~18-20 seconds on the first request
+ - Mesh generation typically takes 30-60 seconds depending on resolution
+ - A GPU (CUDA) is recommended for faster processing
+ - Consider implementing request queuing for production use (see the sketch below)
+
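+ One simple way to approximate queuing is to cap concurrent generations with an asyncio semaphore, so overlapping requests wait their turn instead of exhausting GPU memory. This is a minimal sketch, not part of `api_server.py`; the `/generate-queued` route and the `run_pipeline` helper are placeholders:
+
+ ```python
+ import asyncio
+ from fastapi import FastAPI, UploadFile
+
+ app = FastAPI()
+
+ # Allow one generation at a time; extra requests queue up here
+ GPU_SLOTS = asyncio.Semaphore(1)
+
+ def run_pipeline(image_bytes: bytes) -> dict:
+     # Placeholder for the blocking TripoSR inference call
+     return {"received_bytes": len(image_bytes)}
+
+ @app.post("/generate-queued")  # hypothetical endpoint, for illustration only
+ async def generate_queued(image: UploadFile):
+     async with GPU_SLOTS:
+         # Run the blocking call in a thread so the event loop stays responsive
+         return await asyncio.to_thread(run_pipeline, await image.read())
+ ```
+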
+ ## Error Handling
+
+ All endpoints return appropriate HTTP status codes:
+ - `200`: Success
+ - `400`: Bad request (invalid parameters)
+ - `500`: Server error (model processing failed)
+
+ Error responses include a JSON body with a `detail` field describing the error.
+
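+ Client code can read that field to report what went wrong. A short sketch with `requests`, assuming the server above is running locally:
+
+ ```python
+ import requests
+
+ with open("chair.png", "rb") as f:
+     resp = requests.post("http://localhost:8000/generate", files={"image": f})
+
+ if resp.status_code == 200:
+     with open("mesh.obj", "wb") as out:
+         out.write(resp.content)
+ else:
+     # Error bodies are JSON with a human-readable "detail" field
+     try:
+         print("Generation failed:", resp.json().get("detail"))
+     except ValueError:
+         print("Generation failed with HTTP status", resp.status_code)
+ ```
+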
DEPLOYMENT_GUIDE.md ADDED
@@ -0,0 +1,271 @@
+ # 🚀 Free Cloud Deployment Guide for TripoSR API
+
+ This guide covers multiple **FREE** options to deploy your TripoSR API in the cloud.
+
+ ## 📋 Table of Contents
+ 1. [Hugging Face Spaces (Recommended)](#1-hugging-face-spaces-recommended)
+ 2. [Google Colab](#2-google-colab)
+ 3. [Render.com (CPU Only)](#3-rendercom-cpu-only)
+ 4. [Railway.app](#4-railwayapp)
+
+ ---
+
+ ## 1. Hugging Face Spaces (Recommended) ⭐
+
+ **Best option** for this project - offers free GPU access and is designed for ML models.
+
+ ### Features:
+ - ✅ Free GPU (T4) available
+ - ✅ Persistent deployment
+ - ✅ Built-in CI/CD
+ - ✅ Public API endpoint
+ - ✅ Great for ML models
+
+ ### Setup Steps:
+
+ 1. **Create a Hugging Face account** at [huggingface.co](https://huggingface.co)
+
+ 2. **Create a new Space:**
+    - Go to https://huggingface.co/spaces
+    - Click "Create new Space"
+    - Name: `triposr-api`
+    - License: MIT
+    - Select SDK: **Docker**
+    - Hardware: **CPU basic** (start here, upgrade to GPU if needed)
+
+ 3. **Clone your Space repository:**
+    ```bash
+    git clone https://huggingface.co/spaces/YOUR_USERNAME/triposr-api
+    cd triposr-api
+    ```
+
+ 4. **Copy these files to the Space:**
+    - Copy `Dockerfile.huggingface` as `Dockerfile`
+    - Copy `api_server.py`
+    - Copy `requirements.txt`
+    - Copy the entire `tsr/` directory
+    - Copy `README.md`
+
+ 5. **Create a `README.md` header** (required by HF Spaces):
+    ```markdown
+    ---
+    title: TripoSR API
+    emoji: 🎨
+    colorFrom: blue
+    colorTo: purple
+    sdk: docker
+    pinned: false
+    license: mit
+    ---
+
+    # TripoSR API
+
+    Fast 3D reconstruction from a single image.
+    ```
+
+ 6. **Push to Hugging Face:**
+    ```bash
+    git add .
+    git commit -m "Initial deployment"
+    git push
+    ```
+
+ 7. **Your API will be available at:**
+    ```
+    https://YOUR_USERNAME-triposr-api.hf.space
+    ```
+
+ ### Upgrade to GPU (if needed):
+ - Go to your Space settings
+ - Under "Hardware", select **T4 small** (free tier)
+ - The Space will rebuild automatically
+
+ ---
+
+ ## 2. Google Colab
+
+ **Good for testing** - Free GPU but sessions expire after inactivity.
+
+ ### Features:
+ - ✅ Free GPU (T4/K80)
+ - ❌ Sessions expire (12-hour limit)
+ - ❌ Not persistent
+ - ✅ Good for testing/demos
+
+ ### Setup Steps:
+
+ 1. **Upload the Colab notebook:**
+    - Open [Google Colab](https://colab.research.google.com)
+    - Upload `deploy_colab.ipynb` (provided in this repo)
+
+ 2. **Run the notebook:**
+    - Enable GPU: Runtime → Change runtime type → GPU
+    - Run all cells
+    - The notebook will install dependencies and start the server
+
+ 3. **Access via ngrok tunnel:**
+    - The notebook creates a public URL using ngrok
+    - The URL will be displayed in the output
+    - Example: `https://abc123.ngrok.io`
+
+ **Note:** The URL changes every time you restart the notebook.
+
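+ The tunnel cell typically looks something like the sketch below, assuming `pyngrok` is installed in the runtime (the actual cell in `deploy_colab.ipynb` may differ):
+
+ ```python
+ # Colab cell: start the API in the background, then open a public tunnel
+ import subprocess
+ from pyngrok import ngrok
+
+ server = subprocess.Popen(
+     ["python", "-m", "uvicorn", "api_server:app", "--host", "0.0.0.0", "--port", "8000"]
+ )
+
+ # ngrok forwards a public URL to localhost:8000
+ tunnel = ngrok.connect(8000)
+ print("Public API URL:", tunnel.public_url)
+ ```
+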
+ ---
+
+ ## 3. Render.com (CPU Only)
+
+ **Limited functionality** - The free tier is CPU only, so 3D generation will be VERY slow.
+
+ ### Features:
+ - ✅ Free tier available
+ - ✅ Persistent deployment
+ - ❌ CPU only (slow inference)
+ - ✅ Auto-deploy from GitHub
+
+ ### Setup Steps:
+
+ 1. **Push code to GitHub:**
+    ```bash
+    git init
+    git add .
+    git commit -m "Initial commit"
+    git remote add origin YOUR_GITHUB_REPO_URL
+    git push -u origin main
+    ```
+
+ 2. **Create a Render account** at [render.com](https://render.com)
+
+ 3. **Create a new Web Service:**
+    - Click "New +" → "Web Service"
+    - Connect your GitHub repository
+    - Name: `triposr-api`
+    - Environment: **Docker**
+    - Plan: **Free**
+
+ 4. **Configure:**
+    - Build Command: (leave empty - using Dockerfile)
+    - Start Command: (leave empty - using Dockerfile CMD)
+
+ 5. **Deploy:**
+    - Click "Create Web Service"
+    - Wait for the build to complete (~10-15 minutes)
+
+ 6. **Your API will be available at:**
+    ```
+    https://triposr-api.onrender.com
+    ```
+
+ **⚠️ Warning:** CPU-only inference will be 10-50x slower than GPU!
+
+ ---
+
+ ## 4. Railway.app
+
+ **Trial credits** - $5 in free credits, then paid.
+
+ ### Features:
+ - ✅ $5 free trial credits
+ - ✅ Easy deployment
+ - ❌ No free GPU
+ - ✅ Good for CPU testing
+
+ ### Setup Steps:
+
+ 1. **Create a Railway account** at [railway.app](https://railway.app)
+
+ 2. **Create a new project:**
+    - Click "New Project"
+    - Select "Deploy from GitHub repo"
+    - Connect your repository
+
+ 3. **Configure:**
+    - Railway auto-detects the Dockerfile
+    - Add environment variables if needed
+
+ 4. **Deploy:**
+    - Railway builds and deploys automatically
+    - Get your public URL from the dashboard
+
+ ---
+
+ ## 🎯 Recommendation
+
+ **For production use:** Hugging Face Spaces with GPU
+ - Best balance of features, performance, and cost
+ - Designed for ML models
+ - Free GPU tier available
+
+ **For testing/demos:** Google Colab
+ - Quick setup
+ - Free GPU
+ - Good for temporary demos
+
+ **For CPU-only:** Render.com
+ - Persistent deployment
+ - But very slow for 3D generation
+
+ ---
+
+ ## 📊 Comparison Table
+
+ | Platform | GPU | Persistent | Free Tier | Best For |
+ |----------|-----|------------|-----------|----------|
+ | **Hugging Face Spaces** | ✅ T4 | ✅ | ✅ | Production |
+ | **Google Colab** | ✅ T4/K80 | ❌ | ✅ | Testing |
+ | **Render.com** | ❌ | ✅ | ✅ | CPU demos |
+ | **Railway.app** | ❌ | ✅ | $5 credits | Trial |
+
+ ---
+
+ ## 🔧 Testing Your Deployment
+
+ Once deployed, test your API:
+
+ ```bash
+ # Health check
+ curl https://YOUR_DEPLOYMENT_URL/health
+
+ # Generate a 3D model
+ curl -X POST https://YOUR_DEPLOYMENT_URL/generate \
+   -F "image=@test_image.png" \
+   -F "format=obj" \
+   -F "bake_texture_flag=true" \
+   -o output.zip
+ ```
+
+ ---
+
+ ## 📝 Notes
+
+ - **Model size:** The TripoSR model is ~1.5GB and will be downloaded on first run
+ - **Memory requirements:** Minimum 8GB RAM, 6GB VRAM for GPU
+ - **Cold starts:** The first request may take 30-60 seconds to load the model (see the warm-up sketch below)
+ - **Rate limits:** Free tiers may have rate limits or usage quotas
+
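+ Cold starts can be trimmed by fetching the weights before the first request, e.g. at container start. A sketch using `huggingface_hub`, assuming the weights are hosted in the `stabilityai/TripoSR` repo:
+
+ ```python
+ # Run once at startup (or in a Docker build step) to warm the model cache
+ from huggingface_hub import snapshot_download
+
+ # Downloads the ~1.5GB of weights into the local HF cache, so the first
+ # /generate request only pays for model loading, not the download
+ snapshot_download(repo_id="stabilityai/TripoSR")
+ ```
+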
+ ---
+
+ ## 🆘 Troubleshooting
+
+ ### "Out of memory" errors
+ - Reduce the `mc_resolution` parameter (default: 256)
+ - Use smaller images
+ - Upgrade to a larger instance
+
+ ### Slow generation
+ - Ensure GPU is enabled
+ - Check if running on CPU (much slower)
+ - Monitor instance resources
+
+ ### Build failures
+ - Check the Docker logs
+ - Ensure all dependencies are in `requirements.txt`
+ - Verify CUDA compatibility
+
+ ---
+
+ ## 📚 Additional Resources
+
+ - [TripoSR Paper](https://arxiv.org/abs/2403.02151)
+ - [Hugging Face Spaces Docs](https://huggingface.co/docs/hub/spaces)
+ - [Render Docs](https://render.com/docs)
+ - [Railway Docs](https://docs.railway.app)
Dockerfile ADDED
@@ -0,0 +1,56 @@
+ # Hugging Face Spaces optimized Dockerfile
+ # This uses a lighter base image suitable for HF Spaces
+
+ FROM python:3.10-slim
+
+ # Set environment variables
+ ENV DEBIAN_FRONTEND=noninteractive
+ ENV PYTHONUNBUFFERED=1
+ ENV TRANSFORMERS_CACHE=/tmp/transformers_cache
+ ENV HF_HOME=/tmp/hf_home
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     gcc \
+     g++ \
+     git \
+     ffmpeg \
+     libsm6 \
+     libxext6 \
+     libxrender-dev \
+     libgomp1 \
+     curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Set up the application directory
+ WORKDIR /app
+
+ # Copy the requirements file first for better Docker layer caching
+ COPY requirements.txt /app/
+
+ # Upgrade pip and setuptools
+ RUN pip install --no-cache-dir --upgrade pip setuptools wheel
+
+ # Install the CPU build of PyTorch (HF Spaces runs on CPU by default)
+ # Using the CPU build to reduce image size; note the CPU wheel cannot use a
+ # GPU - switch to a CUDA build if the Space is upgraded to GPU hardware
+ RUN pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu
+
+ # Install other dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the rest of the application code
+ COPY . /app
+
+ # Create an output directory for temporary files
+ RUN mkdir -p /app/output
+
+ # Expose the port (HF Spaces uses 7860 by default)
+ EXPOSE 7860
+
+ # Health check
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
+     CMD curl -f http://localhost:7860/health || exit 1
+
+ # Run the server on port 7860 (required by HF Spaces)
+ CMD ["python", "-m", "uvicorn", "api_server:app", "--host", "0.0.0.0", "--port", "7860"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Tripo AI & Stability AI
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
REACT_INTEGRATION.md ADDED
@@ -0,0 +1,549 @@
+ # TripoSR API - React Integration Guide
+
+ This guide shows how to integrate the TripoSR API into your React + Supabase project.
+
+ ## 1. Update CORS in the API Server
+
+ First, update `api_server.py` to allow your React app's origin:
+
+ ```python
+ # In api_server.py, update the CORS middleware:
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=[
+         "http://localhost:3000",  # React dev server
+         "http://localhost:5173",  # Vite dev server
+         "http://localhost:8080",  # Other common ports
+         # Add your production domain here
+     ],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+ ```
+
+ ## 2. React Hook for the TripoSR API
+
+ Create a custom hook: `src/hooks/useTripoSR.js` or `src/hooks/useTripoSR.ts`
+
+ ### TypeScript Version:
+
+ ```typescript
+ import { useState } from 'react';
+
+ interface GenerateMeshParams {
+   doRemoveBackground?: boolean;
+   foregroundRatio?: number;
+   mcResolution?: number;
+   format?: 'obj' | 'glb';
+ }
+
+ interface UseTripoSRReturn {
+   generateMesh: (imageFile: File, params?: GenerateMeshParams) => Promise<Blob | null>;
+   generateMeshBase64: (imageFile: File, params?: GenerateMeshParams) => Promise<string | null>;
+   loading: boolean;
+   error: string | null;
+ }
+
+ const API_URL = process.env.REACT_APP_TRIPOSR_API_URL || 'http://localhost:8000';
+
+ export const useTripoSR = (): UseTripoSRReturn => {
+   const [loading, setLoading] = useState(false);
+   const [error, setError] = useState<string | null>(null);
+
+   const generateMesh = async (
+     imageFile: File,
+     params: GenerateMeshParams = {}
+   ): Promise<Blob | null> => {
+     setLoading(true);
+     setError(null);
+
+     try {
+       const formData = new FormData();
+       formData.append('image', imageFile);
+       formData.append('do_remove_background', String(params.doRemoveBackground ?? true));
+       formData.append('foreground_ratio', String(params.foregroundRatio ?? 0.85));
+       formData.append('mc_resolution', String(params.mcResolution ?? 256));
+       formData.append('format', params.format ?? 'obj');
+
+       const response = await fetch(`${API_URL}/generate`, {
+         method: 'POST',
+         body: formData,
+       });
+
+       if (!response.ok) {
+         const errorData = await response.json().catch(() => ({ detail: 'Unknown error' }));
+         throw new Error(errorData.detail || `HTTP error! status: ${response.status}`);
+       }
+
+       const blob = await response.blob();
+       return blob;
+     } catch (err) {
+       const errorMessage = err instanceof Error ? err.message : 'Failed to generate mesh';
+       setError(errorMessage);
+       console.error('TripoSR API error:', err);
+       return null;
+     } finally {
+       setLoading(false);
+     }
+   };
+
+   const generateMeshBase64 = async (
+     imageFile: File,
+     params: GenerateMeshParams = {}
+   ): Promise<string | null> => {
+     setLoading(true);
+     setError(null);
+
+     try {
+       const formData = new FormData();
+       formData.append('image', imageFile);
+       formData.append('do_remove_background', String(params.doRemoveBackground ?? true));
+       formData.append('foreground_ratio', String(params.foregroundRatio ?? 0.85));
+       formData.append('mc_resolution', String(params.mcResolution ?? 256));
+       formData.append('format', params.format ?? 'obj');
+
+       const response = await fetch(`${API_URL}/generate-base64`, {
+         method: 'POST',
+         body: formData,
+       });
+
+       if (!response.ok) {
+         const errorData = await response.json().catch(() => ({ detail: 'Unknown error' }));
+         throw new Error(errorData.detail || `HTTP error! status: ${response.status}`);
+       }
+
+       const data = await response.json();
+       return data.mesh; // Base64-encoded mesh
+     } catch (err) {
+       const errorMessage = err instanceof Error ? err.message : 'Failed to generate mesh';
+       setError(errorMessage);
+       console.error('TripoSR API error:', err);
+       return null;
+     } finally {
+       setLoading(false);
+     }
+   };
+
+   return { generateMesh, generateMeshBase64, loading, error };
+ };
+ ```
+
+ ### JavaScript Version:
+
+ ```javascript
+ import { useState } from 'react';
+
+ const API_URL = process.env.REACT_APP_TRIPOSR_API_URL || 'http://localhost:8000';
+
+ export const useTripoSR = () => {
+   const [loading, setLoading] = useState(false);
+   const [error, setError] = useState(null);
+
+   const generateMesh = async (imageFile, params = {}) => {
+     setLoading(true);
+     setError(null);
+
+     try {
+       const formData = new FormData();
+       formData.append('image', imageFile);
+       formData.append('do_remove_background', String(params.doRemoveBackground ?? true));
+       formData.append('foreground_ratio', String(params.foregroundRatio ?? 0.85));
+       formData.append('mc_resolution', String(params.mcResolution ?? 256));
+       formData.append('format', params.format ?? 'obj');
+
+       const response = await fetch(`${API_URL}/generate`, {
+         method: 'POST',
+         body: formData,
+       });
+
+       if (!response.ok) {
+         const errorData = await response.json().catch(() => ({ detail: 'Unknown error' }));
+         throw new Error(errorData.detail || `HTTP error! status: ${response.status}`);
+       }
+
+       const blob = await response.blob();
+       return blob;
+     } catch (err) {
+       const errorMessage = err.message || 'Failed to generate mesh';
+       setError(errorMessage);
+       console.error('TripoSR API error:', err);
+       return null;
+     } finally {
+       setLoading(false);
+     }
+   };
+
+   const generateMeshBase64 = async (imageFile, params = {}) => {
+     setLoading(true);
+     setError(null);
+
+     try {
+       const formData = new FormData();
+       formData.append('image', imageFile);
+       formData.append('do_remove_background', String(params.doRemoveBackground ?? true));
+       formData.append('foreground_ratio', String(params.foregroundRatio ?? 0.85));
+       formData.append('mc_resolution', String(params.mcResolution ?? 256));
+       formData.append('format', params.format ?? 'obj');
+
+       const response = await fetch(`${API_URL}/generate-base64`, {
+         method: 'POST',
+         body: formData,
+       });
+
+       if (!response.ok) {
+         const errorData = await response.json().catch(() => ({ detail: 'Unknown error' }));
+         throw new Error(errorData.detail || `HTTP error! status: ${response.status}`);
+       }
+
+       const data = await response.json();
+       return data.mesh;
+     } catch (err) {
+       const errorMessage = err.message || 'Failed to generate mesh';
+       setError(errorMessage);
+       console.error('TripoSR API error:', err);
+       return null;
+     } finally {
+       setLoading(false);
+     }
+   };
+
+   return { generateMesh, generateMeshBase64, loading, error };
+ };
+ ```
+
+ ## 3. React Component Example
+
+ Create `src/components/MeshGenerator.tsx` or `.jsx`:
+
+ ```tsx
+ import React, { useState } from 'react';
+ import { useTripoSR } from '../hooks/useTripoSR';
+
+ const MeshGenerator: React.FC = () => {
+   const [selectedFile, setSelectedFile] = useState<File | null>(null);
+   const [preview, setPreview] = useState<string | null>(null);
+   const [meshUrl, setMeshUrl] = useState<string | null>(null);
+   const { generateMesh, loading, error } = useTripoSR();
+
+   const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
+     const file = e.target.files?.[0];
+     if (file) {
+       setSelectedFile(file);
+       const reader = new FileReader();
+       reader.onloadend = () => {
+         setPreview(reader.result as string);
+       };
+       reader.readAsDataURL(file);
+     }
+   };
+
+   const handleGenerate = async () => {
+     if (!selectedFile) return;
+
+     const blob = await generateMesh(selectedFile, {
+       doRemoveBackground: true,
+       foregroundRatio: 0.85,
+       mcResolution: 256,
+       format: 'obj',
+     });
+
+     if (blob) {
+       const url = window.URL.createObjectURL(blob);
+       setMeshUrl(url);
+     }
+   };
+
+   const handleDownload = () => {
+     if (meshUrl) {
+       const a = document.createElement('a');
+       a.href = meshUrl;
+       a.download = 'mesh.obj';
+       document.body.appendChild(a);
+       a.click();
+       document.body.removeChild(a);
+     }
+   };
+
+   return (
+     <div className="mesh-generator">
+       <h2>3D Mesh Generator</h2>
+
+       <div className="upload-section">
+         <input
+           type="file"
+           accept="image/*"
+           onChange={handleFileChange}
+           disabled={loading}
+         />
+         {preview && (
+           <img src={preview} alt="Preview" style={{ maxWidth: '300px', marginTop: '10px' }} />
+         )}
+       </div>
+
+       <button onClick={handleGenerate} disabled={!selectedFile || loading}>
+         {loading ? 'Generating...' : 'Generate Mesh'}
+       </button>
+
+       {error && <div className="error">Error: {error}</div>}
+
+       {meshUrl && (
+         <div className="result-section">
+           <p>Mesh generated successfully!</p>
+           <button onClick={handleDownload}>Download Mesh</button>
+           {/* You can also use a 3D viewer here */}
+         </div>
+       )}
+     </div>
+   );
+ };
+
+ export default MeshGenerator;
+ ```
+
+ ## 4. Integration with Supabase Storage
+
+ If you want to store mesh files in Supabase:
+
+ ```typescript
+ import React, { useState } from 'react';
+ import { useTripoSR } from '../hooks/useTripoSR';
+ import { supabase } from '../lib/supabase'; // Your Supabase client
+
+ const MeshGeneratorWithSupabase: React.FC = () => {
+   const { generateMesh, loading, error } = useTripoSR();
+   const [uploading, setUploading] = useState(false);
+
+   const handleGenerateAndUpload = async (imageFile: File, userId: string) => {
+     // Generate the mesh
+     const blob = await generateMesh(imageFile);
+     if (!blob) return;
+
+     // Upload to Supabase Storage
+     setUploading(true);
+     try {
+       const fileName = `${userId}/${Date.now()}_mesh.obj`;
+       const { data, error: uploadError } = await supabase.storage
+         .from('meshes') // Your storage bucket name
+         .upload(fileName, blob, {
+           contentType: 'application/octet-stream',
+           upsert: false,
+         });
+
+       if (uploadError) throw uploadError;
+
+       // Get the public URL
+       const { data: urlData } = supabase.storage
+         .from('meshes')
+         .getPublicUrl(fileName);
+
+       console.log('Mesh uploaded:', urlData.publicUrl);
+       return urlData.publicUrl;
+     } catch (err) {
+       console.error('Upload error:', err);
+     } finally {
+       setUploading(false);
+     }
+   };
+
+   // ... rest of component
+ };
+ ```
+
+ ## 5. Environment Variables
+
+ Create `.env.local` in your React project:
+
+ ```env
+ REACT_APP_TRIPOSR_API_URL=http://localhost:8000
+ ```
+
+ For production, update it to your deployed API URL.
+
+ ## 6. 3D Mesh Viewer Integration
+
+ To display the mesh in your React app, you can use libraries like:
+
+ - **react-three-fiber** + **drei**
+ - **@react-three/viewer**
+ - **model-viewer** (web component)
+
+ Example with `model-viewer`:
+
+ ```tsx
+ import '@google/model-viewer';
+
+ const MeshViewer: React.FC<{ meshUrl: string }> = ({ meshUrl }) => {
+   return (
+     <model-viewer
+       src={meshUrl}
+       alt="3D Mesh"
+       auto-rotate
+       camera-controls
+       style={{ width: '100%', height: '500px' }}
+     />
+   );
+ };
+ ```
+
+ ## 7. Complete Example with All Features
+
+ ```tsx
+ import React, { useState } from 'react';
+ import { useTripoSR } from '../hooks/useTripoSR';
+ import { supabase } from '../lib/supabase';
+
+ const CompleteMeshGenerator: React.FC = () => {
+   const [file, setFile] = useState<File | null>(null);
+   const [preview, setPreview] = useState<string | null>(null);
+   const [meshUrl, setMeshUrl] = useState<string | null>(null);
+   const [foregroundRatio, setForegroundRatio] = useState(0.85);
+   const [resolution, setResolution] = useState(256);
+   const [format, setFormat] = useState<'obj' | 'glb'>('obj');
+
+   const { generateMesh, loading, error } = useTripoSR();
+
+   const handleFileSelect = (e: React.ChangeEvent<HTMLInputElement>) => {
+     const selected = e.target.files?.[0];
+     if (selected) {
+       setFile(selected);
+       const reader = new FileReader();
+       reader.onload = () => setPreview(reader.result as string);
+       reader.readAsDataURL(selected);
+     }
+   };
+
+   const handleGenerate = async () => {
+     if (!file) return;
+
+     const blob = await generateMesh(file, {
+       doRemoveBackground: true,
+       foregroundRatio,
+       mcResolution: resolution,
+       format,
+     });
+
+     if (blob) {
+       const url = window.URL.createObjectURL(blob);
+       setMeshUrl(url);
+     }
+   };
+
+   const handleSaveToSupabase = async () => {
+     if (!meshUrl || !file) return;
+
+     const { data: { user } } = await supabase.auth.getUser();
+     if (!user) {
+       alert('Please log in to save meshes');
+       return;
+     }
+
+     const response = await fetch(meshUrl);
+     const blob = await response.blob();
+     const fileName = `${user.id}/${Date.now()}_mesh.${format}`;
+
+     const { error } = await supabase.storage
+       .from('meshes')
+       .upload(fileName, blob);
+
+     if (error) {
+       console.error('Upload error:', error);
+     } else {
+       alert('Mesh saved to Supabase!');
+     }
+   };
+
+   return (
+     <div style={{ padding: '20px' }}>
+       <h1>3D Mesh Generator</h1>
+
+       <div>
+         <input type="file" accept="image/*" onChange={handleFileSelect} />
+         {preview && <img src={preview} alt="Preview" style={{ maxWidth: '300px' }} />}
+       </div>
+
+       <div style={{ margin: '20px 0' }}>
+         <label>
+           Foreground Ratio: {foregroundRatio}
+           <input
+             type="range"
+             min="0.5"
+             max="1.0"
+             step="0.05"
+             value={foregroundRatio}
+             onChange={(e) => setForegroundRatio(parseFloat(e.target.value))}
+           />
+         </label>
+       </div>
+
+       <div style={{ margin: '20px 0' }}>
+         <label>
+           Resolution: {resolution}
+           <input
+             type="range"
+             min="128"
+             max="320"
+             step="32"
+             value={resolution}
+             onChange={(e) => setResolution(parseInt(e.target.value))}
+           />
+         </label>
+       </div>
+
+       <div style={{ margin: '20px 0' }}>
+         <label>
+           Format:
+           <select value={format} onChange={(e) => setFormat(e.target.value as 'obj' | 'glb')}>
+             <option value="obj">OBJ</option>
+             <option value="glb">GLB</option>
+           </select>
+         </label>
+       </div>
+
+       <button onClick={handleGenerate} disabled={!file || loading}>
+         {loading ? 'Generating...' : 'Generate Mesh'}
+       </button>
+
+       {error && <div style={{ color: 'red' }}>Error: {error}</div>}
+
+       {meshUrl && (
+         <div>
+           <p>Mesh generated!</p>
+           <a href={meshUrl} download={`mesh.${format}`}>
+             <button>Download</button>
+           </a>
+           <button onClick={handleSaveToSupabase}>Save to Supabase</button>
+         </div>
+       )}
+     </div>
+   );
+ };
+
+ export default CompleteMeshGenerator;
+ ```
+
+ ## 8. API Health Check
+
+ Add a health check on app load:
+
+ ```typescript
+ useEffect(() => {
+   const checkAPI = async () => {
+     try {
+       const response = await fetch(`${API_URL}/health`);
+       const data = await response.json();
+       console.log('TripoSR API status:', data);
+     } catch (err) {
+       console.error('TripoSR API is not available:', err);
+     }
+   };
+   checkAPI();
+ }, []);
+ ```
+
+ ## Notes
+
+ - Make sure your TripoSR API server is running before using the React app
+ - The API takes 30-60 seconds to generate a mesh, so show appropriate loading states
+ - Consider implementing request cancellation for better UX
+ - For production, deploy the API server and update the `REACT_APP_TRIPOSR_API_URL` environment variable
README.md CHANGED
@@ -1,11 +1,143 @@
- ---
- title: Triposr
- emoji: 🌍
- colorFrom: green
- colorTo: blue
- sdk: docker
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: TripoSR API
+ emoji: 🎨
+ colorFrom: blue
+ colorTo: purple
+ sdk: docker
+ pinned: false
+ license: mit
+ ---
+
+ # TripoSR API 🎨
+
+ Fast 3D reconstruction from a single image using TripoSR.
+
+ ## 🚀 Features
+
+ - **Fast 3D Generation**: Convert images to 3D models in seconds
+ - **Multiple Formats**: Export as OBJ or GLB
+ - **Texture Baking**: Optional texture atlas generation
+ - **Background Removal**: Automatic background removal
+ - **REST API**: Easy integration with any frontend
+
+ ## 📡 API Endpoints
+
+ ### Health Check
+ ```bash
+ GET /health
+ ```
+
+ ### Generate 3D Model
+ ```bash
+ POST /generate
+ ```
+
+ **Parameters:**
+ - `image`: Image file (PNG, JPG, JPEG)
+ - `do_remove_background`: Remove background (default: true)
+ - `foreground_ratio`: Foreground size ratio (default: 0.85)
+ - `mc_resolution`: Mesh resolution (default: 256)
+ - `format`: Output format - "obj" or "glb" (default: "obj")
+ - `bake_texture_flag`: Bake texture (default: true)
+ - `texture_resolution`: Texture resolution (default: 2048)
+ - `orientation`: Mesh orientation - "standard", "gradio", or "none" (default: "standard")
+
+ **Returns:**
+ - ZIP file containing mesh and texture (if texture baking enabled)
+ - Or mesh file only (if texture baking disabled)
+
+ ### Generate 3D Model (Base64)
+ ```bash
+ POST /generate-base64
+ ```
+
+ Same parameters as `/generate`, but returns JSON with base64-encoded mesh and texture.
+
+ ## 🧪 Example Usage
+
+ ### cURL
+ ```bash
+ curl -X POST https://YOUR-SPACE-URL/generate \
+   -F "image=@your_image.png" \
+   -F "format=obj" \
+   -F "bake_texture_flag=true" \
+   -o output.zip
+ ```
+
+ ### Python
+ ```python
+ import requests
+
+ url = "https://YOUR-SPACE-URL/generate"
+ files = {"image": open("your_image.png", "rb")}
+ data = {
+     "format": "obj",
+     "bake_texture_flag": True,
+     "mc_resolution": 256
+ }
+
+ response = requests.post(url, files=files, data=data)
+ with open("output.zip", "wb") as f:
+     f.write(response.content)
+ ```
+
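+ With `bake_texture_flag` enabled, `output.zip` holds the mesh plus its texture atlas. A short follow-up to unpack it (the file names inside the archive are illustrative):
+
+ ```python
+ import zipfile
+
+ # Extract the mesh and texture next to the script
+ with zipfile.ZipFile("output.zip") as zf:
+     zf.extractall("model/")
+     print(zf.namelist())  # e.g. ['mesh.obj', 'texture.png']
+ ```
+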
+ ### JavaScript
+ ```javascript
+ const formData = new FormData();
+ formData.append('image', fileInput.files[0]);
+ formData.append('format', 'obj');
+ formData.append('bake_texture_flag', 'true');
+
+ const response = await fetch('https://YOUR-SPACE-URL/generate', {
+   method: 'POST',
+   body: formData
+ });
+
+ const blob = await response.blob();
+ // Download or process the blob
+ ```
+
+ ## ⚙️ Configuration
+
+ ### GPU Support
+ This Space can run on CPU or GPU:
+ - **CPU**: Slower but free
+ - **GPU (T4)**: Much faster, may require upgrade in Space settings
+
+ To upgrade to GPU:
+ 1. Go to Space settings
+ 2. Select "T4 small" under Hardware
+ 3. Space will rebuild automatically
+
+ ### Performance
+ - **CPU**: ~30-60 seconds per image
+ - **GPU (T4)**: ~5-10 seconds per image
+ - **GPU (A100)**: ~1-2 seconds per image
+
+ ## 📚 Documentation
+
+ - [TripoSR Paper](https://arxiv.org/abs/2403.02151)
+ - [API Documentation](API_README.md)
+ - [React Integration Guide](REACT_INTEGRATION.md)
+
+ ## 🔗 Links
+
+ - [GitHub Repository](https://github.com/Ahmedbelaid/TripoSR-api)
+ - [Original TripoSR](https://github.com/VAST-AI-Research/TripoSR)
+ - [Stability AI](https://stability.ai/)
+ - [Tripo AI](https://www.tripo3d.ai/)
+
+ ## 📄 License
+
+ MIT License - see the [LICENSE](LICENSE) file for details.
+
+ ## 🙏 Credits
+
+ - **TripoSR Model**: [Stability AI](https://stability.ai/) and [Tripo AI](https://www.tripo3d.ai/)
+ - **API Implementation**: Community contribution
+
+ ## 🆘 Support
+
+ For issues or questions:
+ - Open an issue on [GitHub](https://github.com/Ahmedbelaid/TripoSR-api/issues)
+ - Join the [Discord](https://discord.gg/mvS9mCfMnQ)
README_DOCKER.md ADDED
@@ -0,0 +1,158 @@
+ # Docker Setup for TripoSR API
+
+ This guide explains how to build and run the TripoSR API using Docker.
+
+ ## Prerequisites
+
+ - Docker installed on your system
+ - (Optional) NVIDIA Docker runtime (nvidia-docker2) for GPU support
+ - At least 8GB of available RAM
+ - (For GPU) NVIDIA GPU with CUDA support
+
+ **Note:** This Dockerfile uses the `-devel` PyTorch image (instead of `-runtime`) because `torchmcubes` requires the CUDA development toolkit to compile with CUDA support. This results in a larger image size (~8-10GB) but is necessary for proper compilation.
+
+ ## Building the Docker Image
+
+ ### Basic Build
+
+ ```bash
+ docker build -t triposr-api:latest .
+ ```
+
+ ### Build with Specific Tag
+
+ ```bash
+ docker build -t triposr-api:v1.0 .
+ ```
+
+ ## Running the Container
+
+ ### CPU Mode
+
+ ```bash
+ docker run -d \
+   --name triposr-api \
+   -p 8000:8000 \
+   triposr-api:latest
+ ```
+
+ ### GPU Mode (NVIDIA)
+
+ First, ensure you have `nvidia-docker2` installed. Then run:
+
+ ```bash
+ docker run -d \
+   --name triposr-api \
+   --gpus all \
+   -p 8000:8000 \
+   -e CUDA_VISIBLE_DEVICES=0 \
+   triposr-api:latest
+ ```
+
+ ### Using Docker Compose
+
+ For easier management, use Docker Compose:
+
+ ```bash
+ # Start the service
+ docker-compose up -d
+
+ # View logs
+ docker-compose logs -f
+
+ # Stop the service
+ docker-compose down
+ ```
+
+ For GPU support with Docker Compose, uncomment the GPU-related lines in `docker-compose.yml`.
+
+ ## Verifying the Installation
+
+ Once the container is running, check the health endpoint:
+
+ ```bash
+ curl http://localhost:8000/health
+ ```
+
+ You should see a response like:
+ ```json
+ {
+   "status": "healthy",
+   "device": "cuda:0",
+   "cuda_available": true
+ }
+ ```
+
+ ## API Usage
+
+ The API will be available at `http://localhost:8000`. See `API_README.md` for detailed API documentation.
+
+ ### Example Request
+
+ ```bash
+ curl -X POST "http://localhost:8000/generate" \
+   -F "image=@your_image.jpg" \
+   -F "orientation=standard" \
+   -F "format=obj" \
+   --output mesh.zip
+ ```
+
+ ## Troubleshooting
+
+ ### Container Fails to Start
+
+ 1. Check logs: `docker logs triposr-api`
+ 2. Ensure port 8000 is not already in use
+ 3. Verify you have enough memory (at least 8GB recommended)
+
+ ### CUDA/GPU Issues
+
+ 1. Verify NVIDIA Docker runtime: `docker run --rm --gpus all nvidia/cuda:11.7.0-base-ubuntu20.04 nvidia-smi`
+ 2. Check CUDA availability in container: `docker exec triposr-api python -c "import torch; print(torch.cuda.is_available())"`
+
+ ### Out of Memory
+
+ If you encounter OOM errors:
+ - Reduce `mc_resolution` parameter (default: 256, try 128 or 64)
+ - Reduce `texture_resolution` parameter (default: 2048, try 1024)
+ - Use CPU mode if GPU memory is limited
+
+ ### Build Errors
+
+ If the build fails:
+ - Ensure you have a stable internet connection (the model will be downloaded)
+ - Check that all dependencies in `requirements.txt` are valid
+ - Try building with `--no-cache`: `docker build --no-cache -t triposr-api:latest .`
+
+ ## Environment Variables
+
+ - `CUDA_VISIBLE_DEVICES`: Set to a specific GPU ID (e.g., "0") or an empty string for CPU mode
+
+ ## Volumes
+
+ The container can mount volumes for:
+ - `/app/output`: Output directory for generated meshes (optional)
+
+ Example:
+ ```bash
+ docker run -d \
+   --name triposr-api \
+   -p 8000:8000 \
+   -v $(pwd)/output:/app/output \
+   triposr-api:latest
+ ```
+
+ ## Stopping and Removing
+
+ ```bash
+ # Stop the container
+ docker stop triposr-api
+
+ # Remove the container
+ docker rm triposr-api
+
+ # Remove the image
+ docker rmi triposr-api:latest
+ ```
README_HF_SPACES.md ADDED
@@ -0,0 +1,143 @@
+ ---
+ title: TripoSR API
+ emoji: 🎨
+ colorFrom: blue
+ colorTo: purple
+ sdk: docker
+ pinned: false
+ license: mit
+ ---
+
+ # TripoSR API 🎨
+
+ Fast 3D reconstruction from a single image using TripoSR.
+
+ ## 🚀 Features
+
+ - **Fast 3D Generation**: Convert images to 3D models in seconds
+ - **Multiple Formats**: Export as OBJ or GLB
+ - **Texture Baking**: Optional texture atlas generation
+ - **Background Removal**: Automatic background removal
+ - **REST API**: Easy integration with any frontend
+
+ ## 📡 API Endpoints
+
+ ### Health Check
+ ```bash
+ GET /health
+ ```
+
+ ### Generate 3D Model
+ ```bash
+ POST /generate
+ ```
+
+ **Parameters:**
+ - `image`: Image file (PNG, JPG, JPEG)
+ - `do_remove_background`: Remove background (default: true)
+ - `foreground_ratio`: Foreground size ratio (default: 0.85)
+ - `mc_resolution`: Mesh resolution (default: 256)
+ - `format`: Output format - "obj" or "glb" (default: "obj")
+ - `bake_texture_flag`: Bake texture (default: true)
+ - `texture_resolution`: Texture resolution (default: 2048)
+ - `orientation`: Mesh orientation - "standard", "gradio", or "none" (default: "standard")
+
+ **Returns:**
+ - ZIP file containing mesh and texture (if texture baking enabled)
+ - Or mesh file only (if texture baking disabled)
+
+ ### Generate 3D Model (Base64)
+ ```bash
+ POST /generate-base64
+ ```
+
+ Same parameters as `/generate`, but returns JSON with base64-encoded mesh and texture.
+
+ ## 🧪 Example Usage
+
+ ### cURL
+ ```bash
+ curl -X POST https://YOUR-SPACE-URL/generate \
+   -F "image=@your_image.png" \
+   -F "format=obj" \
+   -F "bake_texture_flag=true" \
+   -o output.zip
+ ```
+
+ ### Python
+ ```python
+ import requests
+
+ url = "https://YOUR-SPACE-URL/generate"
+ files = {"image": open("your_image.png", "rb")}
+ data = {
+     "format": "obj",
+     "bake_texture_flag": True,
+     "mc_resolution": 256
+ }
+
+ response = requests.post(url, files=files, data=data)
+ with open("output.zip", "wb") as f:
+     f.write(response.content)
+ ```
+
+ ### JavaScript
+ ```javascript
+ const formData = new FormData();
+ formData.append('image', fileInput.files[0]);
+ formData.append('format', 'obj');
+ formData.append('bake_texture_flag', 'true');
+
+ const response = await fetch('https://YOUR-SPACE-URL/generate', {
+   method: 'POST',
+   body: formData
+ });
+
+ const blob = await response.blob();
+ // Download or process the blob
+ ```
+
+ ## ⚙️ Configuration
+
+ ### GPU Support
+ This Space can run on CPU or GPU:
+ - **CPU**: Slower but free
+ - **GPU (T4)**: Much faster, may require upgrade in Space settings
+
+ To upgrade to GPU:
+ 1. Go to Space settings
+ 2. Select "T4 small" under Hardware
+ 3. Space will rebuild automatically
+
+ ### Performance
+ - **CPU**: ~30-60 seconds per image
+ - **GPU (T4)**: ~5-10 seconds per image
+ - **GPU (A100)**: ~1-2 seconds per image
+
+ ## 📚 Documentation
+
+ - [TripoSR Paper](https://arxiv.org/abs/2403.02151)
+ - [API Documentation](API_README.md)
+ - [React Integration Guide](REACT_INTEGRATION.md)
+
+ ## 🔗 Links
+
+ - [GitHub Repository](https://github.com/Ahmedbelaid/TripoSR-api)
+ - [Original TripoSR](https://github.com/VAST-AI-Research/TripoSR)
+ - [Stability AI](https://stability.ai/)
+ - [Tripo AI](https://www.tripo3d.ai/)
+
+ ## 📄 License
+
+ MIT License - see [LICENSE](LICENSE) file for details.
+
+ ## 🙏 Credits
+
+ - **TripoSR Model**: [Stability AI](https://stability.ai/) and [Tripo AI](https://www.tripo3d.ai/)
+ - **API Implementation**: Community contribution
+
+ ## 🆘 Support
+
+ For issues or questions:
+ - Open an issue on [GitHub](https://github.com/Ahmedbelaid/TripoSR-api/issues)
+ - Join the [Discord](https://discord.gg/mvS9mCfMnQ)
api_example.html ADDED
@@ -0,0 +1,194 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>TripoSR API Example</title>
7
+ <style>
8
+ body {
9
+ font-family: Arial, sans-serif;
10
+ max-width: 800px;
11
+ margin: 50px auto;
12
+ padding: 20px;
13
+ }
14
+ .container {
15
+ border: 1px solid #ddd;
16
+ padding: 20px;
17
+ border-radius: 8px;
18
+ }
19
+ input[type="file"] {
20
+ margin: 10px 0;
21
+ }
22
+ button {
23
+ background-color: #4CAF50;
24
+ color: white;
25
+ padding: 10px 20px;
26
+ border: none;
27
+ border-radius: 4px;
28
+ cursor: pointer;
29
+ font-size: 16px;
30
+ }
31
+ button:hover {
32
+ background-color: #45a049;
33
+ }
34
+ button:disabled {
35
+ background-color: #cccccc;
36
+ cursor: not-allowed;
37
+ }
38
+ .preview {
39
+ margin: 20px 0;
40
+ }
41
+ .preview img {
42
+ max-width: 100%;
43
+ border: 1px solid #ddd;
44
+ border-radius: 4px;
45
+ }
46
+ .result {
47
+ margin-top: 20px;
48
+ padding: 10px;
49
+ background-color: #f0f0f0;
50
+ border-radius: 4px;
51
+ }
52
+ .error {
53
+ color: red;
54
+ margin-top: 10px;
55
+ }
56
+ .loading {
57
+ color: #666;
58
+ font-style: italic;
59
+ }
60
+ </style>
61
+ </head>
62
+ <body>
63
+ <div class="container">
64
+ <h1>TripoSR API Example</h1>
65
+ <p>Upload an image to generate a 3D mesh</p>
66
+
67
+ <form id="meshForm">
68
+ <div>
69
+ <label for="imageInput">Select Image:</label><br>
70
+ <input type="file" id="imageInput" accept="image/*" required>
71
+ </div>
72
+
73
+ <div style="margin: 15px 0;">
74
+ <label>
75
+ <input type="checkbox" id="removeBg" checked>
76
+ Remove Background
77
+ </label>
78
+ </div>
79
+
80
+ <div style="margin: 15px 0;">
81
+ <label for="foregroundRatio">Foreground Ratio: <span id="ratioValue">0.85</span></label><br>
82
+ <input type="range" id="foregroundRatio" min="0.5" max="1.0" step="0.05" value="0.85">
83
+ </div>
84
+
85
+ <div style="margin: 15px 0;">
86
+ <label for="resolution">Resolution: <span id="resValue">256</span></label><br>
87
+ <input type="range" id="resolution" min="128" max="320" step="32" value="256">
88
+ </div>
89
+
90
+ <div style="margin: 15px 0;">
91
+ <label for="format">Format:</label>
92
+ <select id="format">
93
+ <option value="obj">OBJ</option>
94
+ <option value="glb">GLB</option>
95
+ </select>
96
+ </div>
97
+
98
+ <button type="submit" id="generateBtn">Generate Mesh</button>
99
+ </form>
100
+
101
+ <div class="preview" id="preview"></div>
102
+ <div id="result"></div>
103
+ </div>
104
+
105
+ <script>
106
+ const API_URL = 'http://localhost:8000';
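+ // Point this at your deployed endpoint (e.g. your HF Space URL) when not running locally.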
107
+
108
+ // Update slider values
109
+ document.getElementById('foregroundRatio').addEventListener('input', (e) => {
110
+ document.getElementById('ratioValue').textContent = e.target.value;
111
+ });
112
+
113
+ document.getElementById('resolution').addEventListener('input', (e) => {
114
+ document.getElementById('resValue').textContent = e.target.value;
115
+ });
116
+
117
+ // Preview image
118
+ document.getElementById('imageInput').addEventListener('change', (e) => {
119
+ const file = e.target.files[0];
120
+ if (file) {
121
+ const reader = new FileReader();
122
+ reader.onload = (e) => {
123
+ const preview = document.getElementById('preview');
124
+ preview.innerHTML = `<img src="${e.target.result}" alt="Preview">`;
125
+ };
126
+ reader.readAsDataURL(file);
127
+ }
128
+ });
129
+
130
+ // Handle form submission
131
+ document.getElementById('meshForm').addEventListener('submit', async (e) => {
132
+ e.preventDefault();
133
+
134
+ const formData = new FormData();
135
+ const imageInput = document.getElementById('imageInput');
136
+ const removeBg = document.getElementById('removeBg').checked;
137
+ const foregroundRatio = parseFloat(document.getElementById('foregroundRatio').value);
138
+ const resolution = parseInt(document.getElementById('resolution').value);
139
+ const format = document.getElementById('format').value;
140
+
141
+ if (!imageInput.files[0]) {
142
+ alert('Please select an image');
143
+ return;
144
+ }
145
+
146
+ formData.append('image', imageInput.files[0]);
147
+ formData.append('do_remove_background', removeBg);
148
+ formData.append('foreground_ratio', foregroundRatio);
149
+ formData.append('mc_resolution', resolution);
150
+ formData.append('format', format);
+ // The server bakes a texture by default and returns a ZIP archive; request
+ // the plain mesh file so the download filename below matches the content.
+ formData.append('bake_texture_flag', 'false');
151
+
152
+ const generateBtn = document.getElementById('generateBtn');
153
+ const resultDiv = document.getElementById('result');
154
+
155
+ generateBtn.disabled = true;
156
+ generateBtn.textContent = 'Generating...';
157
+ resultDiv.innerHTML = '<div class="loading">Processing image and generating mesh. This may take 30-60 seconds...</div>';
158
+
159
+ try {
160
+ const response = await fetch(`${API_URL}/generate`, {
161
+ method: 'POST',
162
+ body: formData
163
+ });
164
+
165
+ if (!response.ok) {
166
+ const error = await response.json();
167
+ throw new Error(error.detail || 'Failed to generate mesh');
168
+ }
169
+
170
+ // Get the mesh file
171
+ const blob = await response.blob();
172
+ const url = window.URL.createObjectURL(blob);
173
+
174
+ resultDiv.innerHTML = `
175
+ <div class="result">
176
+ <h3>Success! Mesh generated</h3>
177
+ <p>Format: ${format.toUpperCase()}</p>
178
+ <a href="${url}" download="mesh.${format}">
179
+ <button>Download Mesh</button>
180
+ </a>
181
+ </div>
182
+ `;
183
+
184
+ } catch (error) {
185
+ resultDiv.innerHTML = `<div class="error">Error: ${error.message}</div>`;
186
+ } finally {
187
+ generateBtn.disabled = false;
188
+ generateBtn.textContent = 'Generate Mesh';
189
+ }
190
+ });
191
+ </script>
192
+ </body>
193
+ </html>
194
+
api_server.py ADDED
@@ -0,0 +1,403 @@
1
+ import logging
2
+ import os
3
+ import tempfile
4
+ import base64
5
+ import zipfile
6
+ from io import BytesIO
7
+ from typing import Optional
8
+
9
+ import numpy as np
10
+ import rembg
11
+ import torch
12
+ from PIL import Image
13
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Form
14
+ from fastapi.responses import JSONResponse, FileResponse
15
+ from fastapi.middleware.cors import CORSMiddleware
16
+ from pydantic import BaseModel
17
+
18
+ from tsr.system import TSR
19
+ from tsr.utils import remove_background, resize_foreground, apply_mesh_orientation
20
+ from tsr.bake_texture import bake_texture
21
+
22
+ # Configure logging
23
+ logging.basicConfig(
24
+ format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO
25
+ )
26
+
27
+ # Initialize FastAPI app
28
+ app = FastAPI(title="TripoSR API", version="1.0.0")
29
+
30
+ # Enable CORS for frontend
31
+ app.add_middleware(
32
+ CORSMiddleware,
33
+ allow_origins=[
34
+ "http://localhost:3000", # React default port
35
+ "http://localhost:5173", # Vite default port
36
+ "http://localhost:8080", # Vue default port
37
+ "http://127.0.0.1:3000",
38
+ "http://127.0.0.1:5173",
39
+ "http://127.0.0.1:8080",
40
+ "https://huggingface.co",
41
+ "https://*.hf.space", # Add this
42
+ # Add your production frontend URL here
43
+ # "https://your-frontend-domain.com",
44
+ ],
45
+ allow_credentials=True,
46
+ allow_methods=["*"],
47
+ allow_headers=["*"],
48
+ )
49
+
50
+ # Initialize model
51
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
52
+ logging.info(f"Using device: {device}")
53
+
54
+ logging.info("Loading TripoSR model...")
55
+ model = TSR.from_pretrained(
56
+ "stabilityai/TripoSR",
57
+ config_name="config.yaml",
58
+ weight_name="model.ckpt",
59
+ )
60
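+ # Smaller chunks reduce peak VRAM at the cost of speed (see --chunk-size in run.py).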
+ model.renderer.set_chunk_size(8192)
61
+ model.to(device)
62
+ logging.info("Model loaded successfully!")
63
+
64
+ rembg_session = rembg.new_session()
65
+
66
+
67
+ class GenerateRequest(BaseModel):
68
+ do_remove_background: bool = True
69
+ foreground_ratio: float = 0.85
70
+ mc_resolution: int = 256
71
+ format: str = "obj" # obj or glb
72
+
73
+
74
+ def preprocess_image(image: Image.Image, do_remove_background: bool, foreground_ratio: float) -> Image.Image:
75
+ """Preprocess the input image."""
76
+ def fill_background(img):
77
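+ # Alpha-composite the RGBA image onto a neutral 0.5 gray background.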
+ img = np.array(img).astype(np.float32) / 255.0
78
+ img = img[:, :, :3] * img[:, :, 3:4] + (1 - img[:, :, 3:4]) * 0.5
79
+ return Image.fromarray((img * 255.0).astype(np.uint8))
80
+
81
+ if do_remove_background:
82
+ image = image.convert("RGB")
83
+ image = remove_background(image, rembg_session)
84
+ image = resize_foreground(image, foreground_ratio)
85
+ image = fill_background(image)
86
+ else:
87
+ if image.mode == "RGBA":
88
+ image = fill_background(image)
89
+
90
+ return image
91
+
92
+
93
+ @app.get("/")
94
+ async def root():
95
+ return {"message": "TripoSR API is running", "device": device}
96
+
97
+
98
+ @app.get("/health")
99
+ async def health():
100
+ return {"status": "healthy", "device": device, "cuda_available": torch.cuda.is_available()}
101
+
102
+
103
+ @app.post("/generate")
104
+ async def generate_mesh(
105
+ image: UploadFile = File(...),
106
+ do_remove_background: bool = Form(True),
107
+ foreground_ratio: float = Form(0.85),
108
+ mc_resolution: int = Form(256),
109
+ format: str = Form("obj"),
110
+ bake_texture_flag: bool = Form(True),
111
+ texture_resolution: int = Form(2048),
112
+ orientation: str = Form("standard")
113
+ ):
114
+ """
115
+ Generate a 3D mesh from an uploaded image with optional texture baking.
116
+
117
+ Parameters:
118
+ - image: Image file (PNG, JPG, JPEG)
119
+ - do_remove_background: Whether to remove background (default: True)
120
+ - foreground_ratio: Ratio of foreground size (default: 0.85)
121
+ - mc_resolution: Marching cubes resolution (default: 256)
122
+ - format: Output format - "obj" or "glb" (default: "obj")
123
+ - bake_texture_flag: Whether to bake texture (default: True)
124
+ - texture_resolution: Texture atlas resolution (default: 2048)
125
+ - orientation: Mesh orientation - "standard" (Y-up, Z-forward), "gradio" (Gradio viewer), or "none" (original) (default: "standard")
126
+
127
+ Returns:
128
+ - If bake_texture=True: ZIP file containing mesh and texture.png
129
+ - If bake_texture=False: Mesh file only
130
+ """
131
+ try:
132
+ # Validate format
133
+ if format not in ["obj", "glb"]:
134
+ raise HTTPException(status_code=400, detail="Format must be 'obj' or 'glb'")
135
+
136
+ # Read and validate image
137
+ image_data = await image.read()
138
+ input_image = Image.open(BytesIO(image_data))
139
+
140
+ if input_image.mode not in ["RGB", "RGBA"]:
141
+ input_image = input_image.convert("RGB")
142
+
143
+ logging.info(f"Processing image: {image.filename}, size: {input_image.size}")
144
+
145
+ # Preprocess image
146
+ processed_image = preprocess_image(input_image, do_remove_background, foreground_ratio)
147
+
148
+ # Generate mesh
149
+ logging.info("Running model...")
150
+ with torch.no_grad():
151
+ scene_codes = model([processed_image], device=device)
152
+
153
+ # Check if xatlas is available for texture baking
154
+ xatlas_available = False
155
+ if bake_texture_flag:
156
+ try:
157
+ import xatlas
158
+ xatlas_available = True
159
+ logging.info("xatlas found, texture baking enabled")
160
+ except ImportError:
161
+ logging.warning("xatlas not available - texture baking requires xatlas. Using vertex colors instead.")
162
+ logging.warning("To enable texture baking, install: pip install xatlas==0.0.9 moderngl==5.10.0")
163
+ bake_texture_flag = False
164
+
165
+ # Always extract the mesh with vertex colors: has_vertex_color=True returns
+ # the per-vertex colors the model inferred from the input image.
168
+ logging.info("Extracting mesh with vertex colors (true colors from image)...")
169
+ meshes = model.extract_mesh(scene_codes, has_vertex_color=True, resolution=mc_resolution)
170
+ mesh = meshes[0]
171
+
172
+ # Apply orientation transformation for better 3D viewing
173
+ # Options: "standard" (Y-up, Z-forward), "gradio" (Gradio viewer), "none" (original)
174
+ if orientation not in ["standard", "gradio", "none"]:
175
+ orientation = "standard"
176
+ logging.info(f"Applying mesh orientation: {orientation}")
177
+ mesh = apply_mesh_orientation(mesh, orientation=orientation)
178
+
179
+ if bake_texture_flag and xatlas_available:
180
+ # Bake texture
181
+ logging.info("Baking texture...")
182
+ bake_output = bake_texture(mesh, model, scene_codes[0], texture_resolution)
183
+
184
+ # Save mesh with UV mapping
185
+ mesh_temp = tempfile.NamedTemporaryFile(suffix=f".{format}", delete=False)
186
+ xatlas.export(
187
+ mesh_temp.name,
188
+ mesh.vertices[bake_output["vmapping"]],
189
+ bake_output["indices"],
190
+ bake_output["uvs"],
191
+ mesh.vertex_normals[bake_output["vmapping"]]
192
+ )
193
+ mesh_temp.close()
194
+
195
+ # Save texture
196
+ texture_temp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
197
+ Image.fromarray(
198
+ (bake_output["colors"] * 255.0).astype(np.uint8)
199
+ ).transpose(Image.FLIP_TOP_BOTTOM).save(texture_temp.name)
200
+ texture_temp.close()
201
+
202
+ # Create ZIP file with both mesh and texture
203
+ zip_temp = tempfile.NamedTemporaryFile(suffix=".zip", delete=False)
204
+ with zipfile.ZipFile(zip_temp.name, 'w', zipfile.ZIP_DEFLATED) as zipf:
205
+ zipf.write(mesh_temp.name, f"mesh.{format}")
206
+ zipf.write(texture_temp.name, "texture.png")
207
+ zip_temp.close()
208
+
209
+ logging.info(f"Mesh and texture saved to: {zip_temp.name}")
210
+
211
+ # Clean up individual files
212
+ os.unlink(mesh_temp.name)
213
+ os.unlink(texture_temp.name)
214
+
215
+ return FileResponse(
216
+ zip_temp.name,
217
+ media_type="application/zip",
218
+ filename="mesh_with_texture.zip",
219
+ headers={"X-File-Path": zip_temp.name}
220
+ )
221
+ else:
222
+ # Save the mesh with the vertex colors the model inferred from the input image.
224
+ temp_file = tempfile.NamedTemporaryFile(suffix=f".{format}", delete=False)
225
+ try:
226
+ mesh.export(temp_file.name)
227
+ except AttributeError as e:
228
+ if "ptp" in str(e) and format == "glb":
229
+ # Fallback to OBJ if GLB export fails due to NumPy 2.0 compatibility
230
+ logging.warning(f"GLB export failed due to NumPy compatibility, falling back to OBJ format")
231
+ temp_file.close()
232
+ os.unlink(temp_file.name)
233
+ temp_file = tempfile.NamedTemporaryFile(suffix=".obj", delete=False)
234
+ format = "obj"
235
+ mesh.export(temp_file.name)
236
+ else:
237
+ raise
238
+ temp_file.close()
239
+
240
+ logging.info(f"Mesh with vertex colors (from image) saved to: {temp_file.name}")
241
+
242
+ return FileResponse(
243
+ temp_file.name,
244
+ media_type="application/octet-stream",
245
+ filename=f"mesh.{format}",
246
+ headers={"X-File-Path": temp_file.name}
247
+ )
248
+
249
+ except Exception as e:
250
+ logging.error(f"Error generating mesh: {str(e)}", exc_info=True)
251
+ raise HTTPException(status_code=500, detail=f"Error generating mesh: {str(e)}")
252
+
253
+
254
+ @app.post("/generate-base64")
255
+ async def generate_mesh_base64(
256
+ image: UploadFile = File(...),
257
+ do_remove_background: bool = Form(True),
258
+ foreground_ratio: float = Form(0.85),
259
+ mc_resolution: int = Form(256),
260
+ format: str = Form("obj"),
261
+ bake_texture_flag: bool = Form(True),
262
+ texture_resolution: int = Form(2048),
263
+ orientation: str = Form("standard")
264
+ ):
265
+ """
266
+ Generate a 3D mesh with texture and return as base64 encoded strings.
267
+ Useful for frontend that wants to handle the mesh data directly.
268
+
269
+ Returns JSON with mesh and texture (if baked) as base64 strings.
270
+ """
271
+ try:
272
+ if format not in ["obj", "glb"]:
273
+ raise HTTPException(status_code=400, detail="Format must be 'obj' or 'glb'")
274
+
275
+ # Read and validate image
276
+ image_data = await image.read()
277
+ input_image = Image.open(BytesIO(image_data))
278
+
279
+ if input_image.mode not in ["RGB", "RGBA"]:
280
+ input_image = input_image.convert("RGB")
281
+
282
+ logging.info(f"Processing image: {image.filename}")
283
+
284
+ # Preprocess image
285
+ processed_image = preprocess_image(input_image, do_remove_background, foreground_ratio)
286
+
287
+ # Generate mesh
288
+ logging.info("Running model...")
289
+ with torch.no_grad():
290
+ scene_codes = model([processed_image], device=device)
291
+
292
+ # Check if xatlas is available for texture baking
293
+ xatlas_available = False
294
+ if bake_texture_flag:
295
+ try:
296
+ import xatlas
297
+ xatlas_available = True
298
+ logging.info("xatlas found, texture baking enabled")
299
+ except ImportError:
300
+ logging.warning("xatlas not available - texture baking requires xatlas. Using vertex colors instead.")
301
+ bake_texture_flag = False
302
+
303
+ # Always extract the mesh with vertex colors: has_vertex_color=True returns
+ # the per-vertex colors the model inferred from the input image.
305
+ logging.info("Extracting mesh with vertex colors (true colors from image)...")
306
+ meshes = model.extract_mesh(scene_codes, has_vertex_color=True, resolution=mc_resolution)
307
+ mesh = meshes[0]
308
+
309
+ # Apply orientation transformation for better 3D viewing
310
+ # Options: "standard" (Y-up, Z-forward), "gradio" (Gradio viewer), "none" (original)
311
+ if orientation not in ["standard", "gradio", "none"]:
312
+ orientation = "standard"
313
+ logging.info(f"Applying mesh orientation: {orientation}")
314
+ mesh = apply_mesh_orientation(mesh, orientation=orientation)
315
+
316
+ if bake_texture_flag and xatlas_available:
317
+ # Bake texture (creates texture atlas from model colors)
318
+ logging.info("Baking texture...")
319
+ bake_output = bake_texture(mesh, model, scene_codes[0], texture_resolution)
320
+
321
+ # Save mesh with UV mapping
322
+ mesh_temp = tempfile.NamedTemporaryFile(suffix=f".{format}", delete=False)
323
+ xatlas.export(
324
+ mesh_temp.name,
325
+ mesh.vertices[bake_output["vmapping"]],
326
+ bake_output["indices"],
327
+ bake_output["uvs"],
328
+ mesh.vertex_normals[bake_output["vmapping"]]
329
+ )
330
+
331
+ # Save texture
332
+ texture_temp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
333
+ Image.fromarray(
334
+ (bake_output["colors"] * 255.0).astype(np.uint8)
335
+ ).transpose(Image.FLIP_TOP_BOTTOM).save(texture_temp.name)
336
+
337
+ # Read and encode to base64
338
+ with open(mesh_temp.name, "rb") as f:
339
+ mesh_data = f.read()
340
+ with open(texture_temp.name, "rb") as f:
341
+ texture_data = f.read()
342
+
343
+ mesh_base64 = base64.b64encode(mesh_data).decode("utf-8")
344
+ texture_base64 = base64.b64encode(texture_data).decode("utf-8")
345
+
346
+ # Clean up
347
+ os.unlink(mesh_temp.name)
348
+ os.unlink(texture_temp.name)
349
+
350
+ return JSONResponse({
351
+ "success": True,
352
+ "format": format,
353
+ "mesh": mesh_base64,
354
+ "texture": texture_base64,
355
+ "mesh_size": len(mesh_data),
356
+ "texture_size": len(texture_data),
357
+ "has_texture": True
358
+ })
359
+ else:
360
+ # Save mesh without texture
361
+ temp_file = tempfile.NamedTemporaryFile(suffix=f".{format}", delete=False)
362
+ try:
363
+ mesh.export(temp_file.name)
364
+ except AttributeError as e:
365
+ if "ptp" in str(e) and format == "glb":
366
+ # Fallback to OBJ if GLB export fails due to NumPy 2.0 compatibility
367
+ logging.warning(f"GLB export failed due to NumPy compatibility, falling back to OBJ format")
368
+ temp_file.close()
369
+ os.unlink(temp_file.name)
370
+ temp_file = tempfile.NamedTemporaryFile(suffix=".obj", delete=False)
371
+ format = "obj"
372
+ mesh.export(temp_file.name)
373
+ else:
374
+ raise
375
+
376
+ # Read file and encode to base64
377
+ with open(temp_file.name, "rb") as f:
378
+ mesh_data = f.read()
379
+
380
+ mesh_base64 = base64.b64encode(mesh_data).decode("utf-8")
381
+
382
+ # Clean up
383
+ os.unlink(temp_file.name)
384
+
385
+ return JSONResponse({
386
+ "success": True,
387
+ "format": format,
388
+ "mesh": mesh_base64,
389
+ "mesh_size": len(mesh_data),
390
+ "has_texture": False
391
+ })
392
+
393
+ except Exception as e:
394
+ logging.error(f"Error generating mesh: {str(e)}", exc_info=True)
395
+ raise HTTPException(status_code=500, detail=f"Error generating mesh: {str(e)}")
396
+
397
+
398
+ if __name__ == "__main__":
399
+ import uvicorn
400
+ # Use 127.0.0.1 for localhost access, or 0.0.0.0 for network access
401
+ # For local development, use 127.0.0.1
402
+ uvicorn.run(app, host="127.0.0.1", port=8000, reload=False)
403
+
app.py ADDED
@@ -0,0 +1,171 @@
1
+ import logging
2
+ import os
3
+ import tempfile
4
+
5
+ import gradio as gr
6
+ import numpy as np
7
+ import rembg
8
+ import torch
9
+ from PIL import Image
10
+ from functools import partial
11
+
12
+ from tsr.system import TSR
13
+ from tsr.utils import remove_background, resize_foreground, to_gradio_3d_orientation
14
+
15
+
16
+ if torch.cuda.is_available():
17
+ device = "cuda:0"
18
+ else:
19
+ device = "cpu"
20
+
21
+ model = TSR.from_pretrained(
22
+ "stabilityai/TripoSR",
23
+ config_name="config.yaml",
24
+ weight_name="model.ckpt",
25
+ )
26
+
27
+ # adjust the chunk size to balance between speed and memory usage
28
+ model.renderer.set_chunk_size(8192)
29
+ model.to(device)
30
+
31
+ rembg_session = rembg.new_session()
32
+
33
+
34
+ def check_input_image(input_image):
35
+ if input_image is None:
36
+ raise gr.Error("No image uploaded!")
37
+
38
+
39
+ def preprocess(input_image, do_remove_background, foreground_ratio):
40
+ def fill_background(image):
41
+ image = np.array(image).astype(np.float32) / 255.0
42
+ image = image[:, :, :3] * image[:, :, 3:4] + (1 - image[:, :, 3:4]) * 0.5
43
+ image = Image.fromarray((image * 255.0).astype(np.uint8))
44
+ return image
45
+
46
+ if do_remove_background:
47
+ image = input_image.convert("RGB")
48
+ image = remove_background(image, rembg_session)
49
+ image = resize_foreground(image, foreground_ratio)
50
+ image = fill_background(image)
51
+ else:
52
+ image = input_image
53
+ if image.mode == "RGBA":
54
+ image = fill_background(image)
55
+ return image
56
+
57
+
58
+ def generate(image, mc_resolution, formats=["obj", "glb"]):
59
+ scene_codes = model(image, device=device)
60
+ mesh = model.extract_mesh(scene_codes, True, resolution=mc_resolution)[0]
61
+ mesh = to_gradio_3d_orientation(mesh)
62
+ rv = []
63
+ for format in formats:
64
+ mesh_path = tempfile.NamedTemporaryFile(suffix=f".{format}", delete=False)
65
+ mesh.export(mesh_path.name)
66
+ rv.append(mesh_path.name)
67
+ return rv
68
+
69
+
70
+ def run_example(image_pil):
71
+ preprocessed = preprocess(image_pil, False, 0.9)
72
+ mesh_name_obj, mesh_name_glb = generate(preprocessed, 256, ["obj", "glb"])
73
+ return preprocessed, mesh_name_obj, mesh_name_glb
74
+
75
+
76
+ with gr.Blocks(title="TripoSR") as interface:
77
+ gr.Markdown(
78
+ """
79
+ # TripoSR Demo
80
+ [TripoSR](https://github.com/VAST-AI-Research/TripoSR) is a state-of-the-art open-source model for **fast** feedforward 3D reconstruction from a single image, collaboratively developed by [Tripo AI](https://www.tripo3d.ai/) and [Stability AI](https://stability.ai/).
81
+
82
+ **Tips:**
83
+ 1. If the result is unsatisfactory, try changing the foreground ratio; it may improve the results.
84
+ 2. It's better to disable "Remove Background" for the provided examples (except for the last one), since they have already been preprocessed.
85
+ 3. Otherwise, disable the "Remove Background" option only if your input image is RGBA with a transparent background and its contents are centered, occupying more than 70% of the image width or height.
86
+ """
87
+ )
88
+ with gr.Row(variant="panel"):
89
+ with gr.Column():
90
+ with gr.Row():
91
+ input_image = gr.Image(
92
+ label="Input Image",
93
+ image_mode="RGBA",
94
+ sources="upload",
95
+ type="pil",
96
+ elem_id="content_image",
97
+ )
98
+ processed_image = gr.Image(label="Processed Image", interactive=False)
99
+ with gr.Row():
100
+ with gr.Group():
101
+ do_remove_background = gr.Checkbox(
102
+ label="Remove Background", value=True
103
+ )
104
+ foreground_ratio = gr.Slider(
105
+ label="Foreground Ratio",
106
+ minimum=0.5,
107
+ maximum=1.0,
108
+ value=0.85,
109
+ step=0.05,
110
+ )
111
+ mc_resolution = gr.Slider(
112
+ label="Marching Cubes Resolution",
113
+ minimum=32,
114
+ maximum=320,
115
+ value=256,
116
+ step=32
117
+ )
118
+ with gr.Row():
119
+ submit = gr.Button("Generate", elem_id="generate", variant="primary")
120
+ with gr.Column():
121
+ with gr.Tab("OBJ"):
122
+ output_model_obj = gr.Model3D(
123
+ label="Output Model (OBJ Format)",
124
+ interactive=False,
125
+ )
126
+ gr.Markdown("Note: The model shown here is flipped. Download to get correct results.")
127
+ with gr.Tab("GLB"):
128
+ output_model_glb = gr.Model3D(
129
+ label="Output Model (GLB Format)",
130
+ interactive=False,
131
+ )
132
+ gr.Markdown("Note: The model shown here has a darker appearance. Download to get correct results.")
133
+ with gr.Row(variant="panel"):
134
+ gr.Examples(
135
+ examples=[
136
+ "examples/hamburger.png",
137
+ "examples/poly_fox.png",
138
+ "examples/robot.png",
139
+ "examples/teapot.png",
140
+ "examples/tiger_girl.png",
141
+ "examples/horse.png",
142
+ "examples/flamingo.png",
143
+ "examples/unicorn.png",
144
+ "examples/chair.png",
145
+ "examples/iso_house.png",
146
+ "examples/marble.png",
147
+ "examples/police_woman.png",
148
+ "examples/captured.jpeg",
149
+ ],
150
+ inputs=[input_image],
151
+ outputs=[processed_image, output_model_obj, output_model_glb],
152
+ cache_examples=False,
153
+ fn=partial(run_example),
154
+ label="Examples",
155
+ examples_per_page=20,
156
+ )
157
+ submit.click(fn=check_input_image, inputs=[input_image]).success(
158
+ fn=preprocess,
159
+ inputs=[input_image, do_remove_background, foreground_ratio],
160
+ outputs=[processed_image],
161
+ ).success(
162
+ fn=generate,
163
+ inputs=[processed_image, mc_resolution],
164
+ outputs=[output_model_obj, output_model_glb],
165
+ )
166
+
167
+
168
+ # For Hugging Face Spaces, we just need to assign the interface
169
+ # The launch() is handled automatically by Spaces
170
+ app = interface
171
+
deploy_colab.ipynb ADDED
@@ -0,0 +1,268 @@
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "language_info": {
14
+ "name": "python"
15
+ },
16
+ "accelerator": "GPU"
17
+ },
18
+ "cells": [
19
+ {
20
+ "cell_type": "markdown",
21
+ "source": [
22
+ "# 🎨 TripoSR API on Google Colab\n",
23
+ "\n",
24
+ "This notebook deploys the TripoSR API on Google Colab with a public URL using ngrok.\n",
25
+ "\n",
26
+ "## 📋 Setup Instructions\n",
27
+ "\n",
28
+ "1. **Enable GPU**: Runtime → Change runtime type → GPU (T4)\n",
29
+ "2. **Get ngrok token**: Sign up at [ngrok.com](https://ngrok.com) and get your authtoken\n",
30
+ "3. **Run all cells** in order\n",
31
+ "4. **Copy the public URL** from the output\n",
32
+ "\n",
33
+ "⚠️ **Note**: The session will expire after 12 hours of inactivity or 24 hours maximum."
34
+ ],
35
+ "metadata": {
36
+ "id": "header"
37
+ }
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "source": [
42
+ "# Check GPU availability\n",
43
+ "!nvidia-smi"
44
+ ],
45
+ "metadata": {
46
+ "id": "check_gpu"
47
+ },
48
+ "execution_count": null,
49
+ "outputs": []
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "source": [
54
+ "# Clone the repository\n",
55
+ "!git clone https://github.com/Ahmedbelaid/TripoSR-api.git\n",
56
+ "%cd TripoSR-api"
57
+ ],
58
+ "metadata": {
59
+ "id": "clone_repo"
60
+ },
61
+ "execution_count": null,
62
+ "outputs": []
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "source": [
67
+ "# Install dependencies\n",
68
+ "!pip install -q torch torchvision\n",
69
+ "!pip install -q -r requirements.txt"
70
+ ],
71
+ "metadata": {
72
+ "id": "install_deps"
73
+ },
74
+ "execution_count": null,
75
+ "outputs": []
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "source": [
80
+ "# Install ngrok for public URL\n",
81
+ "!pip install -q pyngrok\n",
82
+ "\n",
83
+ "from pyngrok import ngrok, conf\n",
84
+ "import getpass\n",
85
+ "\n",
86
+ "# Get ngrok authtoken\n",
87
+ "print(\"Get your authtoken from: https://dashboard.ngrok.com/get-started/your-authtoken\")\n",
88
+ "authtoken = getpass.getpass(\"Enter your ngrok authtoken: \")\n",
89
+ "\n",
90
+ "# Set ngrok authtoken\n",
91
+ "conf.get_default().auth_token = authtoken"
92
+ ],
93
+ "metadata": {
94
+ "id": "setup_ngrok"
95
+ },
96
+ "execution_count": null,
97
+ "outputs": []
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "source": [
102
+ "# Start the API server in the background\n",
103
+ "import subprocess\n",
104
+ "import time\n",
105
+ "\n",
106
+ "# Start server\n",
107
+ "process = subprocess.Popen(\n",
108
+ " [\"python\", \"-m\", \"uvicorn\", \"api_server:app\", \"--host\", \"0.0.0.0\", \"--port\", \"8000\"],\n",
109
+ " stdout=subprocess.PIPE,\n",
110
+ " stderr=subprocess.PIPE\n",
111
+ ")\n",
112
+ "\n",
113
+ "# Wait for server to start\n",
114
+ "print(\"Starting server...\")\n",
115
+ "time.sleep(10)\n",
116
+ "\n",
117
+ "# Create ngrok tunnel\n",
118
+ "public_url = ngrok.connect(8000)\n",
119
+ "\n",
120
+ "print(\"\\n\" + \"=\"*60)\n",
121
+ "print(\"🚀 TripoSR API is now running!\")\n",
122
+ "print(\"=\"*60)\n",
123
+ "print(f\"\\n📡 Public URL: {public_url}\")\n",
124
+ "print(f\"\\n🔍 Health Check: {public_url}/health\")\n",
125
+ "print(f\"\\n📝 API Docs: {public_url}/docs\")\n",
126
+ "print(\"\\n\" + \"=\"*60)\n",
127
+ "print(\"\\n⚠️ Keep this notebook running to keep the API active\")\n",
128
+ "print(\"⚠️ Session will expire after 12 hours of inactivity\\n\")"
129
+ ],
130
+ "metadata": {
131
+ "id": "start_server"
132
+ },
133
+ "execution_count": null,
134
+ "outputs": []
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "source": [
139
+ "# Test the API\n",
140
+ "import requests\n",
141
+ "\n",
142
+ "# Health check\n",
143
+ "response = requests.get(f\"{public_url}/health\")\n",
144
+ "print(\"Health Check Response:\")\n",
145
+ "print(response.json())"
146
+ ],
147
+ "metadata": {
148
+ "id": "test_api"
149
+ },
150
+ "execution_count": null,
151
+ "outputs": []
152
+ },
153
+ {
154
+ "cell_type": "markdown",
155
+ "source": [
156
+ "## 🧪 Example: Generate 3D Model\n",
157
+ "\n",
158
+ "Upload an image and generate a 3D model:"
159
+ ],
160
+ "metadata": {
161
+ "id": "example_header"
162
+ }
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "source": [
167
+ "from google.colab import files\n",
168
+ "import requests\n",
169
+ "\n",
170
+ "# Upload an image\n",
171
+ "print(\"Upload an image file:\")\n",
172
+ "uploaded = files.upload()\n",
173
+ "\n",
174
+ "# Get the filename\n",
175
+ "filename = list(uploaded.keys())[0]\n",
176
+ "\n",
177
+ "# Generate 3D model\n",
178
+ "print(f\"\\nGenerating 3D model from {filename}...\")\n",
179
+ "with open(filename, 'rb') as f:\n",
180
+ " files_dict = {'image': f}\n",
181
+ " data = {\n",
182
+ " 'format': 'obj',\n",
183
+ " 'bake_texture_flag': True,\n",
184
+ " 'mc_resolution': 256\n",
185
+ " }\n",
186
+ " response = requests.post(f\"{public_url}/generate\", files=files_dict, data=data)\n",
187
+ "\n",
188
+ "if response.status_code == 200:\n",
189
+ " # Save the output\n",
190
+ " output_filename = 'output.zip'\n",
191
+ " with open(output_filename, 'wb') as f:\n",
192
+ " f.write(response.content)\n",
193
+ " print(f\"\\n✅ Success! 3D model saved to {output_filename}\")\n",
194
+ " \n",
195
+ " # Download the file\n",
196
+ " files.download(output_filename)\n",
197
+ "else:\n",
198
+ " print(f\"\\n❌ Error: {response.status_code}\")\n",
199
+ " print(response.text)"
200
+ ],
201
+ "metadata": {
202
+ "id": "generate_example"
203
+ },
204
+ "execution_count": null,
205
+ "outputs": []
206
+ },
207
+ {
208
+ "cell_type": "markdown",
209
+ "source": [
210
+ "## 📊 Monitor Server Logs\n",
211
+ "\n",
212
+ "Run this cell to see server logs:"
213
+ ],
214
+ "metadata": {
215
+ "id": "logs_header"
216
+ }
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "source": [
221
+ "# View server logs\n",
222
+ "import time\n",
223
+ "\n",
224
+ "print(\"Server logs (press Stop to exit):\")\n",
225
+ "print(\"=\"*60)\n",
226
+ "\n",
227
+ "while True:\n",
228
+ " output = process.stdout.readline()\n",
229
+ " if output:\n",
230
+ " print(output.decode().strip())\n",
231
+ " error = process.stderr.readline()\n",
232
+ " if error:\n",
233
+ " print(error.decode().strip())\n",
234
+ " time.sleep(0.1)"
235
+ ],
236
+ "metadata": {
237
+ "id": "view_logs"
238
+ },
239
+ "execution_count": null,
240
+ "outputs": []
241
+ },
242
+ {
243
+ "cell_type": "markdown",
244
+ "source": [
245
+ "## 🛑 Stop Server\n",
246
+ "\n",
247
+ "Run this cell to stop the server:"
248
+ ],
249
+ "metadata": {
250
+ "id": "stop_header"
251
+ }
252
+ },
253
+ {
254
+ "cell_type": "code",
255
+ "source": [
256
+ "# Stop the server\n",
257
+ "process.terminate()\n",
258
+ "ngrok.disconnect(public_url)\n",
259
+ "print(\"Server stopped.\")"
260
+ ],
261
+ "metadata": {
262
+ "id": "stop_server"
263
+ },
264
+ "execution_count": null,
265
+ "outputs": []
266
+ }
267
+ ]
268
+ }
docker-compose.yml ADDED
@@ -0,0 +1,31 @@
1
+ version: '3.8'
2
+
3
+ services:
4
+ triposr-api:
5
+ build:
6
+ context: .
7
+ dockerfile: Dockerfile
8
+ container_name: triposr-api
9
+ ports:
10
+ - "8000:8000"
11
+ environment:
12
+ - CUDA_VISIBLE_DEVICES=0 # Use first GPU, set to empty string to use CPU
13
+ volumes:
14
+ # Optional: Mount a volume for output files if you want to persist them
15
+ - ./output:/app/output
16
+ # Uncomment the following lines if you have NVIDIA GPU and want to use it
17
+ # deploy:
18
+ # resources:
19
+ # reservations:
20
+ # devices:
21
+ # - driver: nvidia
22
+ # count: 1
23
+ # capabilities: [gpu]
24
+ restart: unless-stopped
25
+ healthcheck:
26
+ test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
27
+ interval: 30s
28
+ timeout: 10s
29
+ retries: 3
30
+ start_period: 60s
31
+
gradio_app.py ADDED
@@ -0,0 +1,187 @@
1
+ import logging
2
+ import os
3
+ import tempfile
4
+ import time
5
+
6
+ import gradio as gr
7
+ import numpy as np
8
+ import rembg
9
+ import torch
10
+ from PIL import Image
11
+ from functools import partial
12
+
13
+ from tsr.system import TSR
14
+ from tsr.utils import remove_background, resize_foreground, to_gradio_3d_orientation
15
+
16
+ import argparse
17
+
18
+
19
+ if torch.cuda.is_available():
20
+ device = "cuda:0"
21
+ else:
22
+ device = "cpu"
23
+
24
+ model = TSR.from_pretrained(
25
+ "stabilityai/TripoSR",
26
+ config_name="config.yaml",
27
+ weight_name="model.ckpt",
28
+ )
29
+
30
+ # adjust the chunk size to balance between speed and memory usage
31
+ model.renderer.set_chunk_size(8192)
32
+ model.to(device)
33
+
34
+ rembg_session = rembg.new_session()
35
+
36
+
37
+ def check_input_image(input_image):
38
+ if input_image is None:
39
+ raise gr.Error("No image uploaded!")
40
+
41
+
42
+ def preprocess(input_image, do_remove_background, foreground_ratio):
43
+ def fill_background(image):
44
+ image = np.array(image).astype(np.float32) / 255.0
45
+ image = image[:, :, :3] * image[:, :, 3:4] + (1 - image[:, :, 3:4]) * 0.5
46
+ image = Image.fromarray((image * 255.0).astype(np.uint8))
47
+ return image
48
+
49
+ if do_remove_background:
50
+ image = input_image.convert("RGB")
51
+ image = remove_background(image, rembg_session)
52
+ image = resize_foreground(image, foreground_ratio)
53
+ image = fill_background(image)
54
+ else:
55
+ image = input_image
56
+ if image.mode == "RGBA":
57
+ image = fill_background(image)
58
+ return image
59
+
60
+
61
+ def generate(image, mc_resolution, formats=["obj", "glb"]):
62
+ scene_codes = model(image, device=device)
63
+ mesh = model.extract_mesh(scene_codes, True, resolution=mc_resolution)[0]
64
+ mesh = to_gradio_3d_orientation(mesh)
65
+ rv = []
66
+ for format in formats:
67
+ mesh_path = tempfile.NamedTemporaryFile(suffix=f".{format}", delete=False)
68
+ mesh.export(mesh_path.name)
69
+ rv.append(mesh_path.name)
70
+ return rv
71
+
72
+
73
+ def run_example(image_pil):
74
+ preprocessed = preprocess(image_pil, False, 0.9)
75
+ mesh_name_obj, mesh_name_glb = generate(preprocessed, 256, ["obj", "glb"])
76
+ return preprocessed, mesh_name_obj, mesh_name_glb
77
+
78
+
79
+ with gr.Blocks(title="TripoSR") as interface:
80
+ gr.Markdown(
81
+ """
82
+ # TripoSR Demo
83
+ [TripoSR](https://github.com/VAST-AI-Research/TripoSR) is a state-of-the-art open-source model for **fast** feedforward 3D reconstruction from a single image, collaboratively developed by [Tripo AI](https://www.tripo3d.ai/) and [Stability AI](https://stability.ai/).
84
+
85
+ **Tips:**
86
+ 1. If the result is unsatisfactory, try changing the foreground ratio; it may improve the results.
87
+ 2. It's better to disable "Remove Background" for the provided examples (except for the last one), since they have already been preprocessed.
88
+ 3. Otherwise, disable the "Remove Background" option only if your input image is RGBA with a transparent background and its contents are centered, occupying more than 70% of the image width or height.
89
+ """
90
+ )
91
+ with gr.Row(variant="panel"):
92
+ with gr.Column():
93
+ with gr.Row():
94
+ input_image = gr.Image(
95
+ label="Input Image",
96
+ image_mode="RGBA",
97
+ sources="upload",
98
+ type="pil",
99
+ elem_id="content_image",
100
+ )
101
+ processed_image = gr.Image(label="Processed Image", interactive=False)
102
+ with gr.Row():
103
+ with gr.Group():
104
+ do_remove_background = gr.Checkbox(
105
+ label="Remove Background", value=True
106
+ )
107
+ foreground_ratio = gr.Slider(
108
+ label="Foreground Ratio",
109
+ minimum=0.5,
110
+ maximum=1.0,
111
+ value=0.85,
112
+ step=0.05,
113
+ )
114
+ mc_resolution = gr.Slider(
115
+ label="Marching Cubes Resolution",
116
+ minimum=32,
117
+ maximum=320,
118
+ value=256,
119
+ step=32
120
+ )
121
+ with gr.Row():
122
+ submit = gr.Button("Generate", elem_id="generate", variant="primary")
123
+ with gr.Column():
124
+ with gr.Tab("OBJ"):
125
+ output_model_obj = gr.Model3D(
126
+ label="Output Model (OBJ Format)",
127
+ interactive=False,
128
+ )
129
+ gr.Markdown("Note: The model shown here is flipped. Download to get correct results.")
130
+ with gr.Tab("GLB"):
131
+ output_model_glb = gr.Model3D(
132
+ label="Output Model (GLB Format)",
133
+ interactive=False,
134
+ )
135
+ gr.Markdown("Note: The model shown here has a darker appearance. Download to get correct results.")
136
+ with gr.Row(variant="panel"):
137
+ gr.Examples(
138
+ examples=[
139
+ "examples/hamburger.png",
140
+ "examples/poly_fox.png",
141
+ "examples/robot.png",
142
+ "examples/teapot.png",
143
+ "examples/tiger_girl.png",
144
+ "examples/horse.png",
145
+ "examples/flamingo.png",
146
+ "examples/unicorn.png",
147
+ "examples/chair.png",
148
+ "examples/iso_house.png",
149
+ "examples/marble.png",
150
+ "examples/police_woman.png",
151
+ "examples/captured.jpeg",
152
+ ],
153
+ inputs=[input_image],
154
+ outputs=[processed_image, output_model_obj, output_model_glb],
155
+ cache_examples=False,
156
+ fn=partial(run_example),
157
+ label="Examples",
158
+ examples_per_page=20,
159
+ )
160
+ submit.click(fn=check_input_image, inputs=[input_image]).success(
161
+ fn=preprocess,
162
+ inputs=[input_image, do_remove_background, foreground_ratio],
163
+ outputs=[processed_image],
164
+ ).success(
165
+ fn=generate,
166
+ inputs=[processed_image, mc_resolution],
167
+ outputs=[output_model_obj, output_model_glb],
168
+ )
169
+
170
+
171
+
172
+ if __name__ == '__main__':
173
+ parser = argparse.ArgumentParser()
174
+ parser.add_argument('--username', type=str, default=None, help='Username for authentication')
175
+ parser.add_argument('--password', type=str, default=None, help='Password for authentication')
176
+ parser.add_argument('--port', type=int, default=7860, help='Port to run the server listener on')
177
+ parser.add_argument("--listen", action='store_true', help="launch gradio with 0.0.0.0 as server name, allowing to respond to network requests")
178
+ parser.add_argument("--share", action='store_true', help="use share=True for gradio and make the UI accessible through their site")
179
+ parser.add_argument("--queuesize", type=int, default=1, help="launch gradio queue max_size")
180
+ args = parser.parse_args()
181
+ interface.queue(max_size=args.queuesize)
182
+ interface.launch(
183
+ auth=(args.username, args.password) if (args.username and args.password) else None,
184
+ share=args.share,
185
+ server_name="0.0.0.0" if args.listen else None,
186
+ server_port=args.port
187
+ )
requirements.txt ADDED
@@ -0,0 +1,16 @@
1
+ torch>=2.0.0
2
+ omegaconf==2.3.0
3
+ Pillow==10.1.0
4
+ einops==0.7.0
5
+ git+https://github.com/tatsy/torchmcubes.git
6
+ transformers==4.35.0
7
+ trimesh==4.0.5
8
+ rembg
9
+ huggingface-hub
10
+ imageio[ffmpeg]
11
+ gradio==3.50.2
12
+ xatlas==0.0.9
13
+ moderngl==5.10.0
14
+ fastapi>=0.100.0
15
+ uvicorn[standard]>=0.23.0
16
+ python-multipart
run.py ADDED
@@ -0,0 +1,197 @@
1
+ import argparse
2
+ import logging
3
+ import os
4
+ import time
5
+
6
+ import numpy as np
7
+ import rembg
8
+ import torch
9
+ from PIL import Image
10
+
11
+ from tsr.system import TSR
12
+ from tsr.utils import remove_background, resize_foreground, save_video
13
+ from tsr.bake_texture import bake_texture
14
+
15
+
16
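+ # Simple wall-clock timer for the log; cuda.synchronize() ensures queued GPU work is included in the measurement.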
+ class Timer:
17
+ def __init__(self):
18
+ self.items = {}
19
+ self.time_scale = 1000.0 # ms
20
+ self.time_unit = "ms"
21
+
22
+ def start(self, name: str) -> None:
23
+ if torch.cuda.is_available():
24
+ torch.cuda.synchronize()
25
+ self.items[name] = time.time()
26
+ logging.info(f"{name} ...")
27
+
28
+ def end(self, name: str) -> None:
29
+ if name not in self.items:
30
+ return
31
+ if torch.cuda.is_available():
32
+ torch.cuda.synchronize()
33
+ start_time = self.items.pop(name)
34
+ delta = time.time() - start_time
35
+ t = delta * self.time_scale
36
+ logging.info(f"{name} finished in {t:.2f}{self.time_unit}.")
37
+
38
+
39
+ timer = Timer()
40
+
41
+
42
+ logging.basicConfig(
43
+ format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO
44
+ )
45
+ parser = argparse.ArgumentParser()
46
+ parser.add_argument("image", type=str, nargs="+", help="Path to input image(s).")
47
+ parser.add_argument(
48
+ "--device",
49
+ default="cuda:0",
50
+ type=str,
51
+ help="Device to use. If no CUDA-compatible device is found, will fallback to 'cpu'. Default: 'cuda:0'",
52
+ )
53
+ parser.add_argument(
54
+ "--pretrained-model-name-or-path",
55
+ default="stabilityai/TripoSR",
56
+ type=str,
57
+ help="Path to the pretrained model. Could be either a huggingface model id is or a local path. Default: 'stabilityai/TripoSR'",
58
+ )
59
+ parser.add_argument(
60
+ "--chunk-size",
61
+ default=8192,
62
+ type=int,
63
+ help="Evaluation chunk size for surface extraction and rendering. Smaller chunk size reduces VRAM usage but increases computation time. 0 for no chunking. Default: 8192",
64
+ )
65
+ parser.add_argument(
66
+ "--mc-resolution",
67
+ default=256,
68
+ type=int,
69
+ help="Marching cubes grid resolution. Default: 256"
70
+ )
71
+ parser.add_argument(
72
+ "--no-remove-bg",
73
+ action="store_true",
74
+ help="If specified, the background will NOT be automatically removed from the input image, and the input image should be an RGB image with gray background and properly-sized foreground. Default: false",
75
+ )
76
+ parser.add_argument(
77
+ "--foreground-ratio",
78
+ default=0.85,
79
+ type=float,
80
+ help="Ratio of the foreground size to the image size. Only used when --no-remove-bg is not specified. Default: 0.85",
81
+ )
82
+ parser.add_argument(
83
+ "--output-dir",
84
+ default="output/",
85
+ type=str,
86
+ help="Output directory to save the results. Default: 'output/'",
87
+ )
88
+ parser.add_argument(
89
+ "--model-save-format",
90
+ default="obj",
91
+ type=str,
92
+ choices=["obj", "glb"],
93
+ help="Format to save the extracted mesh. Default: 'obj'",
94
+ )
95
+ parser.add_argument(
96
+ "--bake-texture",
97
+ action="store_true",
98
+ help="Bake a texture atlas for the extracted mesh, instead of vertex colors",
99
+ )
100
+ parser.add_argument(
101
+ "--texture-resolution",
102
+ default=2048,
103
+ type=int,
104
+ help="Texture atlas resolution, only useful with --bake-texture. Default: 2048"
105
+ )
106
+ parser.add_argument(
107
+ "--render",
108
+ action="store_true",
109
+ help="If specified, save a NeRF-rendered video. Default: false",
110
+ )
111
+ args = parser.parse_args()
112
+
113
+ output_dir = args.output_dir
114
+ os.makedirs(output_dir, exist_ok=True)
115
+
116
+ device = args.device
117
+ if not torch.cuda.is_available():
118
+ device = "cpu"
119
+
120
+ timer.start("Initializing model")
121
+ model = TSR.from_pretrained(
122
+ args.pretrained_model_name_or_path,
123
+ config_name="config.yaml",
124
+ weight_name="model.ckpt",
125
+ )
126
+ model.renderer.set_chunk_size(args.chunk_size)
127
+ model.to(device)
128
+ timer.end("Initializing model")
129
+
130
+ timer.start("Processing images")
131
+ images = []
132
+
133
+ if args.no_remove_bg:
134
+ rembg_session = None
135
+ else:
136
+ rembg_session = rembg.new_session()
137
+
138
+ for i, image_path in enumerate(args.image):
139
+ if args.no_remove_bg:
140
+ image = np.array(Image.open(image_path).convert("RGB"))
141
+ else:
142
+ image = remove_background(Image.open(image_path), rembg_session)
143
+ image = resize_foreground(image, args.foreground_ratio)
144
+ image = np.array(image).astype(np.float32) / 255.0
145
+ image = image[:, :, :3] * image[:, :, 3:4] + (1 - image[:, :, 3:4]) * 0.5
146
+ image = Image.fromarray((image * 255.0).astype(np.uint8))
147
+ if not os.path.exists(os.path.join(output_dir, str(i))):
148
+ os.makedirs(os.path.join(output_dir, str(i)))
149
+ image.save(os.path.join(output_dir, str(i), f"input.png"))
150
+ images.append(image)
151
+ timer.end("Processing images")
152
+
153
+ for i, image in enumerate(images):
154
+ logging.info(f"Running image {i + 1}/{len(images)} ...")
155
+
156
+ timer.start("Running model")
157
+ with torch.no_grad():
158
+ scene_codes = model([image], device=device)
159
+ timer.end("Running model")
160
+
161
+ if args.render:
162
+ timer.start("Rendering")
163
+ render_images = model.render(scene_codes, n_views=30, return_type="pil")
164
+ for ri, render_image in enumerate(render_images[0]):
165
+ render_image.save(os.path.join(output_dir, str(i), f"render_{ri:03d}.png"))
166
+ save_video(
167
+ render_images[0], os.path.join(output_dir, str(i), f"render.mp4"), fps=30
168
+ )
169
+ timer.end("Rendering")
170
+
171
+ timer.start("Extracting mesh")
172
+ meshes = model.extract_mesh(scene_codes, not args.bake_texture, resolution=args.mc_resolution)
173
+ timer.end("Extracting mesh")
174
+
175
+ out_mesh_path = os.path.join(output_dir, str(i), f"mesh.{args.model_save_format}")
176
+ if args.bake_texture:
177
+ try:
178
+ import xatlas
179
+ except ImportError:
180
+ raise ImportError(
181
+ "xatlas is required for texture baking. Please install it with: pip install xatlas==0.0.9\n"
182
+ "Note: This requires Microsoft Visual C++ Build Tools to compile."
183
+ )
184
+ out_texture_path = os.path.join(output_dir, str(i), "texture.png")
185
+
186
+ timer.start("Baking texture")
187
+ bake_output = bake_texture(meshes[0], model, scene_codes[0], args.texture_resolution)
188
+ timer.end("Baking texture")
189
+
190
+ timer.start("Exporting mesh and texture")
191
+ xatlas.export(out_mesh_path, meshes[0].vertices[bake_output["vmapping"]], bake_output["indices"], bake_output["uvs"], meshes[0].vertex_normals[bake_output["vmapping"]])
192
+ Image.fromarray((bake_output["colors"] * 255.0).astype(np.uint8)).transpose(Image.FLIP_TOP_BOTTOM).save(out_texture_path)
193
+ timer.end("Exporting mesh and texture")
194
+ else:
195
+ timer.start("Exporting mesh")
196
+ meshes[0].export(out_mesh_path)
197
+ timer.end("Exporting mesh")
tsr/__pycache__/bake_texture.cpython-313.pyc ADDED
Binary file (7.68 kB).
 
tsr/__pycache__/system.cpython-313.pyc ADDED
Binary file (9.9 kB).
 
tsr/__pycache__/utils.cpython-313.pyc ADDED
Binary file (24 kB).
 
tsr/bake_texture.py ADDED
@@ -0,0 +1,191 @@
1
+ import numpy as np
2
+ import torch
3
+ import trimesh
4
+ from PIL import Image
5
+
6
+ try:
7
+ import xatlas
8
+ import moderngl
9
+ _HAS_XATLAS = True
10
+ except ImportError:
11
+ _HAS_XATLAS = False
12
+
13
+
14
+ def make_atlas(mesh, texture_resolution, texture_padding):
15
+ if not _HAS_XATLAS:
16
+ raise ImportError(
17
+ "xatlas is required for texture baking. Please install it with: pip install xatlas==0.0.9\n"
18
+ "Note: This requires Microsoft Visual C++ Build Tools to compile."
19
+ )
20
+ atlas = xatlas.Atlas()
21
+ atlas.add_mesh(mesh.vertices, mesh.faces)
22
+ options = xatlas.PackOptions()
23
+ options.resolution = texture_resolution
24
+ options.padding = texture_padding
25
+ options.bilinear = True
26
+ atlas.generate(pack_options=options)
27
+ vmapping, indices, uvs = atlas[0]
28
+ return {
29
+ "vmapping": vmapping,
30
+ "indices": indices,
31
+ "uvs": uvs,
32
+ }
33
+
34
+
35
+ def rasterize_position_atlas(
36
+ mesh, atlas_vmapping, atlas_indices, atlas_uvs, texture_resolution, texture_padding
37
+ ):
38
+ if not _HAS_XATLAS:
39
+ raise ImportError(
40
+ "moderngl is required for texture baking. Please install it with: pip install moderngl==5.10.0\n"
41
+ "Note: This requires Microsoft Visual C++ Build Tools to compile."
42
+ )
43
+ ctx = moderngl.create_context(standalone=True)
44
+ basic_prog = ctx.program(
45
+ vertex_shader="""
46
+ #version 330
47
+ in vec2 in_uv;
48
+ in vec3 in_pos;
49
+ out vec3 v_pos;
50
+ void main() {
51
+ v_pos = in_pos;
52
+ gl_Position = vec4(in_uv * 2.0 - 1.0, 0.0, 1.0);
53
+ }
54
+ """,
55
+ fragment_shader="""
56
+ #version 330
57
+ in vec3 v_pos;
58
+ out vec4 o_col;
59
+ void main() {
60
+ o_col = vec4(v_pos, 1.0);
61
+ }
62
+ """,
63
+ )
64
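+ # Second pass: a geometry shader extrudes each triangle edge outward by
+ # u_dilation texels, padding UV island borders to prevent seam bleeding.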
+ gs_prog = ctx.program(
65
+ vertex_shader="""
66
+ #version 330
67
+ in vec2 in_uv;
68
+ in vec3 in_pos;
69
+ out vec3 vg_pos;
70
+ void main() {
71
+ vg_pos = in_pos;
72
+ gl_Position = vec4(in_uv * 2.0 - 1.0, 0.0, 1.0);
73
+ }
74
+ """,
75
+ geometry_shader="""
76
+ #version 330
77
+ uniform float u_resolution;
78
+ uniform float u_dilation;
79
+ layout (triangles) in;
80
+ layout (triangle_strip, max_vertices = 12) out;
81
+ in vec3 vg_pos[];
82
+ out vec3 vf_pos;
83
+ void lineSegment(int aidx, int bidx) {
84
+ vec2 a = gl_in[aidx].gl_Position.xy;
85
+ vec2 b = gl_in[bidx].gl_Position.xy;
86
+ vec3 aCol = vg_pos[aidx];
87
+ vec3 bCol = vg_pos[bidx];
88
+
89
+ vec2 dir = normalize((b - a) * u_resolution);
90
+ vec2 offset = vec2(-dir.y, dir.x) * u_dilation / u_resolution;
91
+
92
+ gl_Position = vec4(a + offset, 0.0, 1.0);
93
+ vf_pos = aCol;
94
+ EmitVertex();
95
+ gl_Position = vec4(a - offset, 0.0, 1.0);
96
+ vf_pos = aCol;
97
+ EmitVertex();
98
+ gl_Position = vec4(b + offset, 0.0, 1.0);
99
+ vf_pos = bCol;
100
+ EmitVertex();
101
+ gl_Position = vec4(b - offset, 0.0, 1.0);
102
+ vf_pos = bCol;
103
+ EmitVertex();
104
+ }
105
+ void main() {
106
+ lineSegment(0, 1);
107
+ lineSegment(1, 2);
108
+ lineSegment(2, 0);
109
+ EndPrimitive();
110
+ }
111
+ """,
112
+ fragment_shader="""
113
+ #version 330
114
+ in vec3 vf_pos;
115
+ out vec4 o_col;
116
+ void main() {
117
+ o_col = vec4(vf_pos, 1.0);
118
+ }
119
+ """,
120
+ )
121
+ uvs = atlas_uvs.flatten().astype("f4")
122
+ pos = mesh.vertices[atlas_vmapping].flatten().astype("f4")
123
+ indices = atlas_indices.flatten().astype("i4")
124
+ vbo_uvs = ctx.buffer(uvs)
125
+ vbo_pos = ctx.buffer(pos)
126
+ ibo = ctx.buffer(indices)
127
+ vao_content = [
128
+ vbo_uvs.bind("in_uv", layout="2f"),
129
+ vbo_pos.bind("in_pos", layout="3f"),
130
+ ]
131
+ basic_vao = ctx.vertex_array(basic_prog, vao_content, ibo)
132
+ gs_vao = ctx.vertex_array(gs_prog, vao_content, ibo)
133
+ fbo = ctx.framebuffer(
134
+ color_attachments=[
135
+ ctx.texture((texture_resolution, texture_resolution), 4, dtype="f4")
136
+ ]
137
+ )
138
+ fbo.use()
139
+ fbo.clear(0.0, 0.0, 0.0, 0.0)
140
+ gs_prog["u_resolution"].value = texture_resolution
141
+ gs_prog["u_dilation"].value = texture_padding
142
+ gs_vao.render()
143
+ basic_vao.render()
144
+
145
+ fbo_bytes = fbo.color_attachments[0].read()
146
+ fbo_np = np.frombuffer(fbo_bytes, dtype="f4").reshape(
147
+ texture_resolution, texture_resolution, 4
148
+ )
149
+ return fbo_np
150
+
151
+
152
+ def positions_to_colors(model, scene_code, positions_texture, texture_resolution):
153
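+ # Each atlas texel stores a 3D position (xyz) plus a coverage flag in the
+ # alpha channel; query the triplane decoder at those positions for colors.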
+ positions = torch.tensor(positions_texture.reshape(-1, 4)[:, :-1])
154
+ with torch.no_grad():
155
+ queried_grid = model.renderer.query_triplane(
156
+ model.decoder,
157
+ positions,
158
+ scene_code,
159
+ )
160
+ rgb_f = queried_grid["color"].numpy().reshape(-1, 3)
161
+ rgba_f = np.insert(rgb_f, 3, positions_texture.reshape(-1, 4)[:, -1], axis=1)
162
+ rgba_f[rgba_f[:, -1] == 0.0] = [0, 0, 0, 0]
163
+ return rgba_f.reshape(texture_resolution, texture_resolution, 4)
164
+
165
+
166
+ def bake_texture(mesh, model, scene_code, texture_resolution):
167
+ if not _HAS_XATLAS:
168
+ raise ImportError(
169
+ "xatlas and moderngl are required for texture baking. Please install them with:\n"
170
+ " pip install xatlas==0.0.9 moderngl==5.10.0\n"
171
+ "Note: These require Microsoft Visual C++ Build Tools to compile."
172
+ )
173
+ texture_padding = round(max(2, texture_resolution / 256))
174
+ atlas = make_atlas(mesh, texture_resolution, texture_padding)
175
+ positions_texture = rasterize_position_atlas(
176
+ mesh,
177
+ atlas["vmapping"],
178
+ atlas["indices"],
179
+ atlas["uvs"],
180
+ texture_resolution,
181
+ texture_padding,
182
+ )
183
+ colors_texture = positions_to_colors(
184
+ model, scene_code, positions_texture, texture_resolution
185
+ )
186
+ return {
187
+ "vmapping": atlas["vmapping"],
188
+ "indices": atlas["indices"],
189
+ "uvs": atlas["uvs"],
190
+ "colors": colors_texture,
191
+ }
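A minimal usage sketch for the baking pipeline above. It assumes a loaded tsr.system.TSR model and scene codes from a forward pass, as wired up in this repo's run.py; the extract_mesh call is an assumption taken from that flow, and all variable names are illustrative:

import trimesh
from tsr.bake_texture import bake_texture

# Assumed setup (see run.py): `model` is a loaded TSR instance and
# `scene_codes` came from model(image, device=...).
meshes = model.extract_mesh(scene_codes, has_vertex_color=False)
mesh: trimesh.Trimesh = meshes[0]

baked = bake_texture(mesh, model, scene_codes[0], texture_resolution=2048)
# baked["colors"] is an (H, W, 4) float32 RGBA atlas; baked["uvs"],
# baked["indices"], and baked["vmapping"] describe the UV layout.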
tsr/models/__pycache__/isosurface.cpython-313.pyc ADDED
Binary file (4.26 kB)
tsr/models/__pycache__/nerf_renderer.cpython-313.pyc ADDED
Binary file (8.46 kB)
tsr/models/__pycache__/network_utils.cpython-313.pyc ADDED
Binary file (5.88 kB)
tsr/models/isosurface.py ADDED
@@ -0,0 +1,64 @@
1
+ from typing import Callable, Optional, Tuple
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ try:
8
+ from torchmcubes import marching_cubes
9
+ _HAS_TORCHMCUBES = True
10
+ except ImportError:
11
+ _HAS_TORCHMCUBES = False
12
+ marching_cubes = None
13
+
14
+
15
+ class IsosurfaceHelper(nn.Module):
16
+ points_range: Tuple[float, float] = (0, 1)
17
+
18
+ @property
19
+ def grid_vertices(self) -> torch.FloatTensor:
20
+ raise NotImplementedError
21
+
22
+
23
+ class MarchingCubeHelper(IsosurfaceHelper):
24
+ def __init__(self, resolution: int) -> None:
25
+ super().__init__()
26
+ if not _HAS_TORCHMCUBES:
27
+ raise ImportError(
28
+ "torchmcubes is required for mesh extraction. Please install it with:\n"
29
+ " pip install git+https://github.com/tatsy/torchmcubes.git\n"
30
+ "Note: This requires Microsoft Visual C++ Build Tools to compile."
31
+ )
32
+ self.resolution = resolution
33
+ self.mc_func: Callable = marching_cubes
34
+ self._grid_vertices: Optional[torch.FloatTensor] = None
35
+
36
+ @property
37
+ def grid_vertices(self) -> torch.FloatTensor:
38
+ if self._grid_vertices is None:
39
+ # keep the vertices on CPU so that we can support very large resolution
40
+ x, y, z = (
41
+ torch.linspace(*self.points_range, self.resolution),
42
+ torch.linspace(*self.points_range, self.resolution),
43
+ torch.linspace(*self.points_range, self.resolution),
44
+ )
45
+ x, y, z = torch.meshgrid(x, y, z, indexing="ij")
46
+ verts = torch.cat(
47
+ [x.reshape(-1, 1), y.reshape(-1, 1), z.reshape(-1, 1)], dim=-1
48
+ ).reshape(-1, 3)
49
+ self._grid_vertices = verts
50
+ return self._grid_vertices
51
+
52
+ def forward(
53
+ self,
54
+ level: torch.FloatTensor,
55
+ ) -> Tuple[torch.FloatTensor, torch.LongTensor]:
56
+ level = -level.view(self.resolution, self.resolution, self.resolution)
57
+ try:
58
+ v_pos, t_pos_idx = self.mc_func(level.detach(), 0.0)
59
+ except AttributeError:
60
+ print("torchmcubes was not compiled with CUDA support, use CPU version instead.")
61
+ v_pos, t_pos_idx = self.mc_func(level.detach().cpu(), 0.0)
62
+ v_pos = v_pos[..., [2, 1, 0]]
63
+ v_pos = v_pos / (self.resolution - 1.0)
64
+ return v_pos.to(level.device), t_pos_idx.to(level.device)
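A short sketch of driving MarchingCubeHelper with a synthetic signed field (a sphere), assuming torchmcubes is installed; only the helper itself comes from the file above:

import torch
from tsr.models.isosurface import MarchingCubeHelper

helper = MarchingCubeHelper(resolution=64)
pts = helper.grid_vertices                # (64**3, 3), coordinates in [0, 1]
level = (pts - 0.5).norm(dim=-1) - 0.3    # zero level set = sphere of radius 0.3
v_pos, t_pos_idx = helper(level)          # mesh vertices in [0, 1] and faces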
tsr/models/nerf_renderer.py ADDED
@@ -0,0 +1,180 @@
1
+ from dataclasses import dataclass
2
+ from typing import Dict
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from einops import rearrange, reduce
7
+
8
+ from ..utils import (
9
+ BaseModule,
10
+ chunk_batch,
11
+ get_activation,
12
+ rays_intersect_bbox,
13
+ scale_tensor,
14
+ )
15
+
16
+
17
+ class TriplaneNeRFRenderer(BaseModule):
18
+ @dataclass
19
+ class Config(BaseModule.Config):
20
+ radius: float
21
+
22
+ feature_reduction: str = "concat"
23
+ density_activation: str = "trunc_exp"
24
+ density_bias: float = -1.0
25
+ color_activation: str = "sigmoid"
26
+ num_samples_per_ray: int = 128
27
+ randomized: bool = False
28
+
29
+ cfg: Config
30
+
31
+ def configure(self) -> None:
32
+ assert self.cfg.feature_reduction in ["concat", "mean"]
33
+ self.chunk_size = 0
34
+
35
+ def set_chunk_size(self, chunk_size: int):
36
+ assert (
37
+ chunk_size >= 0
38
+ ), "chunk_size must be a non-negative integer (0 for no chunking)."
39
+ self.chunk_size = chunk_size
40
+
41
+ def query_triplane(
42
+ self,
43
+ decoder: torch.nn.Module,
44
+ positions: torch.Tensor,
45
+ triplane: torch.Tensor,
46
+ ) -> Dict[str, torch.Tensor]:
47
+ input_shape = positions.shape[:-1]
48
+ positions = positions.view(-1, 3)
49
+
50
+ # positions in (-radius, radius)
51
+ # normalized to (-1, 1) for grid sample
52
+ positions = scale_tensor(
53
+ positions, (-self.cfg.radius, self.cfg.radius), (-1, 1)
54
+ )
55
+
56
+ def _query_chunk(x):
57
+ indices2D: torch.Tensor = torch.stack(
58
+ (x[..., [0, 1]], x[..., [0, 2]], x[..., [1, 2]]),
59
+ dim=-3,
60
+ )
61
+ out: torch.Tensor = F.grid_sample(
62
+ rearrange(triplane, "Np Cp Hp Wp -> Np Cp Hp Wp", Np=3),
63
+ rearrange(indices2D, "Np N Nd -> Np () N Nd", Np=3),
64
+ align_corners=False,
65
+ mode="bilinear",
66
+ )
67
+ if self.cfg.feature_reduction == "concat":
68
+ out = rearrange(out, "Np Cp () N -> N (Np Cp)", Np=3)
69
+ elif self.cfg.feature_reduction == "mean":
70
+ out = reduce(out, "Np Cp () N -> N Cp", Np=3, reduction="mean")
71
+ else:
72
+ raise NotImplementedError
73
+
74
+ net_out: Dict[str, torch.Tensor] = decoder(out)
75
+ return net_out
76
+
77
+ if self.chunk_size > 0:
78
+ net_out = chunk_batch(_query_chunk, self.chunk_size, positions)
79
+ else:
80
+ net_out = _query_chunk(positions)
81
+
82
+ net_out["density_act"] = get_activation(self.cfg.density_activation)(
83
+ net_out["density"] + self.cfg.density_bias
84
+ )
85
+ net_out["color"] = get_activation(self.cfg.color_activation)(
86
+ net_out["features"]
87
+ )
88
+
89
+ net_out = {k: v.view(*input_shape, -1) for k, v in net_out.items()}
90
+
91
+ return net_out
92
+
93
+ def _forward(
94
+ self,
95
+ decoder: torch.nn.Module,
96
+ triplane: torch.Tensor,
97
+ rays_o: torch.Tensor,
98
+ rays_d: torch.Tensor,
99
+ **kwargs,
100
+ ):
101
+ rays_shape = rays_o.shape[:-1]
102
+ rays_o = rays_o.view(-1, 3)
103
+ rays_d = rays_d.view(-1, 3)
104
+ n_rays = rays_o.shape[0]
105
+
106
+ t_near, t_far, rays_valid = rays_intersect_bbox(rays_o, rays_d, self.cfg.radius)
107
+ t_near, t_far = t_near[rays_valid], t_far[rays_valid]
108
+
109
+ t_vals = torch.linspace(
110
+ 0, 1, self.cfg.num_samples_per_ray + 1, device=triplane.device
111
+ )
112
+ t_mid = (t_vals[:-1] + t_vals[1:]) / 2.0
113
+ z_vals = t_near * (1 - t_mid[None]) + t_far * t_mid[None] # (N_rays, N_samples)
114
+
115
+ xyz = (
116
+ rays_o[:, None, :] + z_vals[..., None] * rays_d[..., None, :]
117
+ ) # (N_rays, N_sample, 3)
118
+
119
+ mlp_out = self.query_triplane(
120
+ decoder=decoder,
121
+ positions=xyz,
122
+ triplane=triplane,
123
+ )
124
+
125
+ eps = 1e-10
126
+ # deltas = z_vals[:, 1:] - z_vals[:, :-1] # (N_rays, N_samples)
127
+ deltas = t_vals[1:] - t_vals[:-1] # (N_samples,)
128
+ alpha = 1 - torch.exp(
129
+ -deltas * mlp_out["density_act"][..., 0]
130
+ ) # (N_rays, N_samples)
131
+ accum_prod = torch.cat(
132
+ [
133
+ torch.ones_like(alpha[:, :1]),
134
+ torch.cumprod(1 - alpha[:, :-1] + eps, dim=-1),
135
+ ],
136
+ dim=-1,
137
+ )
138
+ weights = alpha * accum_prod # (N_rays, N_samples)
139
+ comp_rgb_ = (weights[..., None] * mlp_out["color"]).sum(dim=-2) # (N_rays, 3)
140
+ opacity_ = weights.sum(dim=-1) # (N_rays)
141
+
142
+ comp_rgb = torch.zeros(
143
+ n_rays, 3, dtype=comp_rgb_.dtype, device=comp_rgb_.device
144
+ )
145
+ opacity = torch.zeros(n_rays, dtype=opacity_.dtype, device=opacity_.device)
146
+ comp_rgb[rays_valid] = comp_rgb_
147
+ opacity[rays_valid] = opacity_
148
+
149
+ comp_rgb += 1 - opacity[..., None]
150
+ comp_rgb = comp_rgb.view(*rays_shape, 3)
151
+
152
+ return comp_rgb
153
+
154
+ def forward(
155
+ self,
156
+ decoder: torch.nn.Module,
157
+ triplane: torch.Tensor,
158
+ rays_o: torch.Tensor,
159
+ rays_d: torch.Tensor,
160
+ ) -> Dict[str, torch.Tensor]:
161
+ if triplane.ndim == 4:
162
+ comp_rgb = self._forward(decoder, triplane, rays_o, rays_d)
163
+ else:
164
+ comp_rgb = torch.stack(
165
+ [
166
+ self._forward(decoder, triplane[i], rays_o[i], rays_d[i])
167
+ for i in range(triplane.shape[0])
168
+ ],
169
+ dim=0,
170
+ )
171
+
172
+ return comp_rgb
173
+
174
+ def train(self, mode=True):
175
+ self.randomized = mode and self.cfg.randomized
176
+ return super().train(mode=mode)
177
+
178
+ def eval(self):
179
+ self.randomized = False
180
+ return super().eval()
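For reference, the alpha/accum_prod/weights block in _forward above is the standard discrete volume-rendering quadrature. With per-sample density \sigma_i and step \delta_i:

\alpha_i = 1 - \exp(-\sigma_i \delta_i), \qquad T_i = \prod_{j<i} (1 - \alpha_j), \qquad w_i = T_i \, \alpha_i, \qquad C = \sum_i w_i c_i + \Big(1 - \sum_i w_i\Big)

The trailing (1 - \sum_i w_i) term is the white-background compositing performed by `comp_rgb += 1 - opacity[..., None]`.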
tsr/models/network_utils.py ADDED
@@ -0,0 +1,124 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from einops import rearrange
7
+
8
+ from ..utils import BaseModule
9
+
10
+
11
+ class TriplaneUpsampleNetwork(BaseModule):
12
+ @dataclass
13
+ class Config(BaseModule.Config):
14
+ in_channels: int
15
+ out_channels: int
16
+
17
+ cfg: Config
18
+
19
+ def configure(self) -> None:
20
+ self.upsample = nn.ConvTranspose2d(
21
+ self.cfg.in_channels, self.cfg.out_channels, kernel_size=2, stride=2
22
+ )
23
+
24
+ def forward(self, triplanes: torch.Tensor) -> torch.Tensor:
25
+ triplanes_up = rearrange(
26
+ self.upsample(
27
+ rearrange(triplanes, "B Np Ci Hp Wp -> (B Np) Ci Hp Wp", Np=3)
28
+ ),
29
+ "(B Np) Co Hp Wp -> B Np Co Hp Wp",
30
+ Np=3,
31
+ )
32
+ return triplanes_up
33
+
34
+
35
+ class NeRFMLP(BaseModule):
36
+ @dataclass
37
+ class Config(BaseModule.Config):
38
+ in_channels: int
39
+ n_neurons: int
40
+ n_hidden_layers: int
41
+ activation: str = "relu"
42
+ bias: bool = True
43
+ weight_init: Optional[str] = "kaiming_uniform"
44
+ bias_init: Optional[str] = None
45
+
46
+ cfg: Config
47
+
48
+ def configure(self) -> None:
49
+ layers = [
50
+ self.make_linear(
51
+ self.cfg.in_channels,
52
+ self.cfg.n_neurons,
53
+ bias=self.cfg.bias,
54
+ weight_init=self.cfg.weight_init,
55
+ bias_init=self.cfg.bias_init,
56
+ ),
57
+ self.make_activation(self.cfg.activation),
58
+ ]
59
+ for i in range(self.cfg.n_hidden_layers - 1):
60
+ layers += [
61
+ self.make_linear(
62
+ self.cfg.n_neurons,
63
+ self.cfg.n_neurons,
64
+ bias=self.cfg.bias,
65
+ weight_init=self.cfg.weight_init,
66
+ bias_init=self.cfg.bias_init,
67
+ ),
68
+ self.make_activation(self.cfg.activation),
69
+ ]
70
+ layers += [
71
+ self.make_linear(
72
+ self.cfg.n_neurons,
73
+ 4, # density 1 + features 3
74
+ bias=self.cfg.bias,
75
+ weight_init=self.cfg.weight_init,
76
+ bias_init=self.cfg.bias_init,
77
+ )
78
+ ]
79
+ self.layers = nn.Sequential(*layers)
80
+
81
+ def make_linear(
82
+ self,
83
+ dim_in,
84
+ dim_out,
85
+ bias=True,
86
+ weight_init=None,
87
+ bias_init=None,
88
+ ):
89
+ layer = nn.Linear(dim_in, dim_out, bias=bias)
90
+
91
+ if weight_init is None:
92
+ pass
93
+ elif weight_init == "kaiming_uniform":
94
+ torch.nn.init.kaiming_uniform_(layer.weight, nonlinearity="relu")
95
+ else:
96
+ raise NotImplementedError
97
+
98
+ if bias:
99
+ if bias_init is None:
100
+ pass
101
+ elif bias_init == "zero":
102
+ torch.nn.init.zeros_(layer.bias)
103
+ else:
104
+ raise NotImplementedError
105
+
106
+ return layer
107
+
108
+ def make_activation(self, activation):
109
+ if activation == "relu":
110
+ return nn.ReLU(inplace=True)
111
+ elif activation == "silu":
112
+ return nn.SiLU(inplace=True)
113
+ else:
114
+ raise NotImplementedError
115
+
116
+ def forward(self, x):
117
+ inp_shape = x.shape[:-1]
118
+ x = x.reshape(-1, x.shape[-1])
119
+
120
+ features = self.layers(x)
121
+ features = features.reshape(*inp_shape, -1)
122
+ out = {"density": features[..., 0:1], "features": features[..., 1:4]}
123
+
124
+ return out
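A small shape-check sketch for NeRFMLP. The config values are illustrative (the real ones come from the model config), and passing a plain dict assumes BaseModule parses it through its Config dataclass as in tsr/utils.py:

import torch
from tsr.models.network_utils import NeRFMLP

mlp = NeRFMLP({
    "in_channels": 120, "n_neurons": 64, "n_hidden_layers": 9,
    "activation": "silu",
})
x = torch.randn(2, 4096, 120)                  # (..., in_channels)
out = mlp(x)
assert out["density"].shape == (2, 4096, 1)    # 1 density channel
assert out["features"].shape == (2, 4096, 3)   # 3 color-feature channels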
tsr/models/tokenizers/__pycache__/image.cpython-313.pyc ADDED
Binary file (3.59 kB)
tsr/models/tokenizers/__pycache__/triplane.cpython-313.pyc ADDED
Binary file (2.77 kB)
tsr/models/tokenizers/image.py ADDED
@@ -0,0 +1,66 @@
1
+ from dataclasses import dataclass
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from einops import rearrange
6
+ from huggingface_hub import hf_hub_download
7
+ from transformers.models.vit.modeling_vit import ViTModel
8
+
9
+ from ...utils import BaseModule
10
+
11
+
12
+ class DINOSingleImageTokenizer(BaseModule):
13
+ @dataclass
14
+ class Config(BaseModule.Config):
15
+ pretrained_model_name_or_path: str = "facebook/dino-vitb16"
16
+ enable_gradient_checkpointing: bool = False
17
+
18
+ cfg: Config
19
+
20
+ def configure(self) -> None:
21
+ self.model: ViTModel = ViTModel(
22
+ ViTModel.config_class.from_pretrained(
23
+ hf_hub_download(
24
+ repo_id=self.cfg.pretrained_model_name_or_path,
25
+ filename="config.json",
26
+ )
27
+ )
28
+ )
29
+
30
+ if self.cfg.enable_gradient_checkpointing:
31
+ self.model.encoder.gradient_checkpointing = True
32
+
33
+ self.register_buffer(
34
+ "image_mean",
35
+ torch.as_tensor([0.485, 0.456, 0.406]).reshape(1, 1, 3, 1, 1),
36
+ persistent=False,
37
+ )
38
+ self.register_buffer(
39
+ "image_std",
40
+ torch.as_tensor([0.229, 0.224, 0.225]).reshape(1, 1, 3, 1, 1),
41
+ persistent=False,
42
+ )
43
+
44
+ def forward(self, images: torch.FloatTensor, **kwargs) -> torch.FloatTensor:
45
+ packed = False
46
+ if images.ndim == 4:
47
+ packed = True
48
+ images = images.unsqueeze(1)
49
+
50
+ batch_size, n_input_views = images.shape[:2]
51
+ images = (images - self.image_mean) / self.image_std
52
+ out = self.model(
53
+ rearrange(images, "B N C H W -> (B N) C H W"), interpolate_pos_encoding=True
54
+ )
55
+ local_features, global_features = out.last_hidden_state, out.pooler_output
56
+ local_features = local_features.permute(0, 2, 1)
57
+ local_features = rearrange(
58
+ local_features, "(B N) Ct Nt -> B N Ct Nt", B=batch_size
59
+ )
60
+ if packed:
61
+ local_features = local_features.squeeze(1)
62
+
63
+ return local_features
64
+
65
+ def detokenize(self, *args, **kwargs):
66
+ raise NotImplementedError
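A usage sketch for the DINO tokenizer. Constructing with an empty dict again assumes the BaseModule config parsing; note that configure() builds the ViT from config.json only, so meaningful weights arrive when the enclosing TSR checkpoint is loaded:

import torch
from tsr.models.tokenizers.image import DINOSingleImageTokenizer

tok = DINOSingleImageTokenizer({})    # defaults to facebook/dino-vitb16
imgs = torch.rand(1, 3, 512, 512)     # RGB in [0, 1]; the resolution is
feats = tok(imgs)                     # flexible via interpolate_pos_encoding
# feats: (B, 768, num_tokens) local features; the pooled output is discarded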
tsr/models/tokenizers/triplane.py ADDED
@@ -0,0 +1,45 @@
1
+ import math
2
+ from dataclasses import dataclass
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from einops import rearrange, repeat
7
+
8
+ from ...utils import BaseModule
9
+
10
+
11
+ class Triplane1DTokenizer(BaseModule):
12
+ @dataclass
13
+ class Config(BaseModule.Config):
14
+ plane_size: int
15
+ num_channels: int
16
+
17
+ cfg: Config
18
+
19
+ def configure(self) -> None:
20
+ self.embeddings = nn.Parameter(
21
+ torch.randn(
22
+ (3, self.cfg.num_channels, self.cfg.plane_size, self.cfg.plane_size),
23
+ dtype=torch.float32,
24
+ )
25
+ * 1
26
+ / math.sqrt(self.cfg.num_channels)
27
+ )
28
+
29
+ def forward(self, batch_size: int) -> torch.Tensor:
30
+ return rearrange(
31
+ repeat(self.embeddings, "Np Ct Hp Wp -> B Np Ct Hp Wp", B=batch_size),
32
+ "B Np Ct Hp Wp -> B Ct (Np Hp Wp)",
33
+ )
34
+
35
+ def detokenize(self, tokens: torch.Tensor) -> torch.Tensor:
36
+ batch_size, Ct, Nt = tokens.shape
37
+ assert Nt == self.cfg.plane_size**2 * 3
38
+ assert Ct == self.cfg.num_channels
39
+ return rearrange(
40
+ tokens,
41
+ "B Ct (Np Hp Wp) -> B Np Ct Hp Wp",
42
+ Np=3,
43
+ Hp=self.cfg.plane_size,
44
+ Wp=self.cfg.plane_size,
45
+ )
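A round-trip sketch for Triplane1DTokenizer; the plane size and channel count are illustrative, and the dict-style config assumes BaseModule parsing:

import torch
from tsr.models.tokenizers.triplane import Triplane1DTokenizer

tok = Triplane1DTokenizer({"plane_size": 32, "num_channels": 1024})
tokens = tok(batch_size=2)         # (2, 1024, 3 * 32 * 32) learned queries
planes = tok.detokenize(tokens)    # (2, 3, 1024, 32, 32) per-plane features
assert planes.shape == (2, 3, 1024, 32, 32)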
tsr/models/transformer/__pycache__/attention.cpython-313.pyc ADDED
Binary file (23.5 kB)
tsr/models/transformer/__pycache__/basic_transformer_block.cpython-313.pyc ADDED
Binary file (13.6 kB)
tsr/models/transformer/__pycache__/transformer_1d.cpython-313.pyc ADDED
Binary file (7.52 kB)
tsr/models/transformer/attention.py ADDED
@@ -0,0 +1,653 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # --------
16
+ #
17
+ # Modified 2024 by the Tripo AI and Stability AI Team.
18
+ #
19
+ # Copyright (c) 2024 Tripo AI & Stability AI
20
+ #
21
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
22
+ # of this software and associated documentation files (the "Software"), to deal
23
+ # in the Software without restriction, including without limitation the rights
24
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
25
+ # copies of the Software, and to permit persons to whom the Software is
26
+ # furnished to do so, subject to the following conditions:
27
+ #
28
+ # The above copyright notice and this permission notice shall be included in all
29
+ # copies or substantial portions of the Software.
30
+ #
31
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
32
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
33
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
34
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
35
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
36
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37
+ # SOFTWARE.
38
+
39
+ from typing import Optional
40
+
41
+ import torch
42
+ import torch.nn.functional as F
43
+ from torch import nn
44
+
45
+
46
+ class Attention(nn.Module):
47
+ r"""
48
+ A cross attention layer.
49
+
50
+ Parameters:
51
+ query_dim (`int`):
52
+ The number of channels in the query.
53
+ cross_attention_dim (`int`, *optional*):
54
+ The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
55
+ heads (`int`, *optional*, defaults to 8):
56
+ The number of heads to use for multi-head attention.
57
+ dim_head (`int`, *optional*, defaults to 64):
58
+ The number of channels in each head.
59
+ dropout (`float`, *optional*, defaults to 0.0):
60
+ The dropout probability to use.
61
+ bias (`bool`, *optional*, defaults to False):
62
+ Set to `True` for the query, key, and value linear layers to contain a bias parameter.
63
+ upcast_attention (`bool`, *optional*, defaults to False):
64
+ Set to `True` to upcast the attention computation to `float32`.
65
+ upcast_softmax (`bool`, *optional*, defaults to False):
66
+ Set to `True` to upcast the softmax computation to `float32`.
67
+ cross_attention_norm (`str`, *optional*, defaults to `None`):
68
+ The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`.
69
+ cross_attention_norm_num_groups (`int`, *optional*, defaults to 32):
70
+ The number of groups to use for the group norm in the cross attention.
71
+ added_kv_proj_dim (`int`, *optional*, defaults to `None`):
72
+ The number of channels to use for the added key and value projections. If `None`, no projection is used.
73
+ norm_num_groups (`int`, *optional*, defaults to `None`):
74
+ The number of groups to use for the group norm in the attention.
75
+ spatial_norm_dim (`int`, *optional*, defaults to `None`):
76
+ The number of channels to use for the spatial normalization.
77
+ out_bias (`bool`, *optional*, defaults to `True`):
78
+ Set to `True` to use a bias in the output linear layer.
79
+ scale_qk (`bool`, *optional*, defaults to `True`):
80
+ Set to `True` to scale the query and key by `1 / sqrt(dim_head)`.
81
+ only_cross_attention (`bool`, *optional*, defaults to `False`):
82
+ Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if
83
+ `added_kv_proj_dim` is not `None`.
84
+ eps (`float`, *optional*, defaults to 1e-5):
85
+ An additional value added to the denominator in group normalization that is used for numerical stability.
86
+ rescale_output_factor (`float`, *optional*, defaults to 1.0):
87
+ A factor to rescale the output by dividing it with this value.
88
+ residual_connection (`bool`, *optional*, defaults to `False`):
89
+ Set to `True` to add the residual connection to the output.
90
+ _from_deprecated_attn_block (`bool`, *optional*, defaults to `False`):
91
+ Set to `True` if the attention block is loaded from a deprecated state dict.
92
+ processor (`AttnProcessor`, *optional*, defaults to `None`):
93
+ The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used and
94
+ `AttnProcessor` otherwise.
95
+ """
96
+
97
+ def __init__(
98
+ self,
99
+ query_dim: int,
100
+ cross_attention_dim: Optional[int] = None,
101
+ heads: int = 8,
102
+ dim_head: int = 64,
103
+ dropout: float = 0.0,
104
+ bias: bool = False,
105
+ upcast_attention: bool = False,
106
+ upcast_softmax: bool = False,
107
+ cross_attention_norm: Optional[str] = None,
108
+ cross_attention_norm_num_groups: int = 32,
109
+ added_kv_proj_dim: Optional[int] = None,
110
+ norm_num_groups: Optional[int] = None,
111
+ out_bias: bool = True,
112
+ scale_qk: bool = True,
113
+ only_cross_attention: bool = False,
114
+ eps: float = 1e-5,
115
+ rescale_output_factor: float = 1.0,
116
+ residual_connection: bool = False,
117
+ _from_deprecated_attn_block: bool = False,
118
+ processor: Optional["AttnProcessor"] = None,
119
+ out_dim: Optional[int] = None,
120
+ ):
121
+ super().__init__()
122
+ self.inner_dim = out_dim if out_dim is not None else dim_head * heads
123
+ self.query_dim = query_dim
124
+ self.cross_attention_dim = (
125
+ cross_attention_dim if cross_attention_dim is not None else query_dim
126
+ )
127
+ self.upcast_attention = upcast_attention
128
+ self.upcast_softmax = upcast_softmax
129
+ self.rescale_output_factor = rescale_output_factor
130
+ self.residual_connection = residual_connection
131
+ self.dropout = dropout
132
+ self.fused_projections = False
133
+ self.out_dim = out_dim if out_dim is not None else query_dim
134
+
135
+ # we make use of this private variable to know whether this class is loaded
136
+ # with a deprecated state dict so that we can convert it on the fly
137
+ self._from_deprecated_attn_block = _from_deprecated_attn_block
138
+
139
+ self.scale_qk = scale_qk
140
+ self.scale = dim_head**-0.5 if self.scale_qk else 1.0
141
+
142
+ self.heads = out_dim // dim_head if out_dim is not None else heads
143
+ # for slice_size > 0 the attention score computation
144
+ # is split across the batch axis to save memory
145
+ # You can set slice_size with `set_attention_slice`
146
+ self.sliceable_head_dim = heads
147
+
148
+ self.added_kv_proj_dim = added_kv_proj_dim
149
+ self.only_cross_attention = only_cross_attention
150
+
151
+ if self.added_kv_proj_dim is None and self.only_cross_attention:
152
+ raise ValueError(
153
+ "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
154
+ )
155
+
156
+ if norm_num_groups is not None:
157
+ self.group_norm = nn.GroupNorm(
158
+ num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True
159
+ )
160
+ else:
161
+ self.group_norm = None
162
+
163
+ self.spatial_norm = None
164
+
165
+ if cross_attention_norm is None:
166
+ self.norm_cross = None
167
+ elif cross_attention_norm == "layer_norm":
168
+ self.norm_cross = nn.LayerNorm(self.cross_attention_dim)
169
+ elif cross_attention_norm == "group_norm":
170
+ if self.added_kv_proj_dim is not None:
171
+ # The given `encoder_hidden_states` are initially of shape
172
+ # (batch_size, seq_len, added_kv_proj_dim) before being projected
173
+ # to (batch_size, seq_len, cross_attention_dim). The norm is applied
174
+ # before the projection, so we need to use `added_kv_proj_dim` as
175
+ # the number of channels for the group norm.
176
+ norm_cross_num_channels = added_kv_proj_dim
177
+ else:
178
+ norm_cross_num_channels = self.cross_attention_dim
179
+
180
+ self.norm_cross = nn.GroupNorm(
181
+ num_channels=norm_cross_num_channels,
182
+ num_groups=cross_attention_norm_num_groups,
183
+ eps=1e-5,
184
+ affine=True,
185
+ )
186
+ else:
187
+ raise ValueError(
188
+ f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
189
+ )
190
+
191
+ linear_cls = nn.Linear
192
+
193
+ self.linear_cls = linear_cls
194
+ self.to_q = linear_cls(query_dim, self.inner_dim, bias=bias)
195
+
196
+ if not self.only_cross_attention:
197
+ # only relevant for the `AddedKVProcessor` classes
198
+ self.to_k = linear_cls(self.cross_attention_dim, self.inner_dim, bias=bias)
199
+ self.to_v = linear_cls(self.cross_attention_dim, self.inner_dim, bias=bias)
200
+ else:
201
+ self.to_k = None
202
+ self.to_v = None
203
+
204
+ if self.added_kv_proj_dim is not None:
205
+ self.add_k_proj = linear_cls(added_kv_proj_dim, self.inner_dim)
206
+ self.add_v_proj = linear_cls(added_kv_proj_dim, self.inner_dim)
207
+
208
+ self.to_out = nn.ModuleList([])
209
+ self.to_out.append(linear_cls(self.inner_dim, self.out_dim, bias=out_bias))
210
+ self.to_out.append(nn.Dropout(dropout))
211
+
212
+ # set attention processor
213
+ # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
214
+ # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
215
+ # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
216
+ if processor is None:
217
+ processor = (
218
+ AttnProcessor2_0()
219
+ if hasattr(F, "scaled_dot_product_attention") and self.scale_qk
220
+ else AttnProcessor()
221
+ )
222
+ self.set_processor(processor)
223
+
224
+ def set_processor(self, processor: "AttnProcessor") -> None:
225
+ self.processor = processor
226
+
227
+ def forward(
228
+ self,
229
+ hidden_states: torch.FloatTensor,
230
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
231
+ attention_mask: Optional[torch.FloatTensor] = None,
232
+ **cross_attention_kwargs,
233
+ ) -> torch.Tensor:
234
+ r"""
235
+ The forward method of the `Attention` class.
236
+
237
+ Args:
238
+ hidden_states (`torch.Tensor`):
239
+ The hidden states of the query.
240
+ encoder_hidden_states (`torch.Tensor`, *optional*):
241
+ The hidden states of the encoder.
242
+ attention_mask (`torch.Tensor`, *optional*):
243
+ The attention mask to use. If `None`, no mask is applied.
244
+ **cross_attention_kwargs:
245
+ Additional keyword arguments to pass along to the cross attention.
246
+
247
+ Returns:
248
+ `torch.Tensor`: The output of the attention layer.
249
+ """
250
+ # The `Attention` class can call different attention processors / attention functions
251
+ # here we simply pass along all tensors to the selected processor class
252
+ # For standard processors that are defined here, `**cross_attention_kwargs` is empty
253
+ return self.processor(
254
+ self,
255
+ hidden_states,
256
+ encoder_hidden_states=encoder_hidden_states,
257
+ attention_mask=attention_mask,
258
+ **cross_attention_kwargs,
259
+ )
260
+
261
+ def batch_to_head_dim(self, tensor: torch.Tensor) -> torch.Tensor:
262
+ r"""
263
+ Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size // heads, seq_len, dim * heads]`. `heads`
264
+ is the number of heads initialized while constructing the `Attention` class.
265
+
266
+ Args:
267
+ tensor (`torch.Tensor`): The tensor to reshape.
268
+
269
+ Returns:
270
+ `torch.Tensor`: The reshaped tensor.
271
+ """
272
+ head_size = self.heads
273
+ batch_size, seq_len, dim = tensor.shape
274
+ tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
275
+ tensor = tensor.permute(0, 2, 1, 3).reshape(
276
+ batch_size // head_size, seq_len, dim * head_size
277
+ )
278
+ return tensor
279
+
280
+ def head_to_batch_dim(self, tensor: torch.Tensor, out_dim: int = 3) -> torch.Tensor:
281
+ r"""
282
+ Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size, seq_len, heads, dim // heads]`. `heads` is
283
+ the number of heads initialized while constructing the `Attention` class.
284
+
285
+ Args:
286
+ tensor (`torch.Tensor`): The tensor to reshape.
287
+ out_dim (`int`, *optional*, defaults to `3`): The output dimension of the tensor. If `3`, the tensor is
288
+ reshaped to `[batch_size * heads, seq_len, dim // heads]`.
289
+
290
+ Returns:
291
+ `torch.Tensor`: The reshaped tensor.
292
+ """
293
+ head_size = self.heads
294
+ batch_size, seq_len, dim = tensor.shape
295
+ tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
296
+ tensor = tensor.permute(0, 2, 1, 3)
297
+
298
+ if out_dim == 3:
299
+ tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size)
300
+
301
+ return tensor
302
+
303
+ def get_attention_scores(
304
+ self,
305
+ query: torch.Tensor,
306
+ key: torch.Tensor,
307
+ attention_mask: torch.Tensor = None,
308
+ ) -> torch.Tensor:
309
+ r"""
310
+ Compute the attention scores.
311
+
312
+ Args:
313
+ query (`torch.Tensor`): The query tensor.
314
+ key (`torch.Tensor`): The key tensor.
315
+ attention_mask (`torch.Tensor`, *optional*): The attention mask to use. If `None`, no mask is applied.
316
+
317
+ Returns:
318
+ `torch.Tensor`: The attention probabilities/scores.
319
+ """
320
+ dtype = query.dtype
321
+ if self.upcast_attention:
322
+ query = query.float()
323
+ key = key.float()
324
+
325
+ if attention_mask is None:
326
+ baddbmm_input = torch.empty(
327
+ query.shape[0],
328
+ query.shape[1],
329
+ key.shape[1],
330
+ dtype=query.dtype,
331
+ device=query.device,
332
+ )
333
+ beta = 0
334
+ else:
335
+ baddbmm_input = attention_mask
336
+ beta = 1
337
+
338
+ attention_scores = torch.baddbmm(
339
+ baddbmm_input,
340
+ query,
341
+ key.transpose(-1, -2),
342
+ beta=beta,
343
+ alpha=self.scale,
344
+ )
345
+ del baddbmm_input
346
+
347
+ if self.upcast_softmax:
348
+ attention_scores = attention_scores.float()
349
+
350
+ attention_probs = attention_scores.softmax(dim=-1)
351
+ del attention_scores
352
+
353
+ attention_probs = attention_probs.to(dtype)
354
+
355
+ return attention_probs
356
+
357
+ def prepare_attention_mask(
358
+ self,
359
+ attention_mask: torch.Tensor,
360
+ target_length: int,
361
+ batch_size: int,
362
+ out_dim: int = 3,
363
+ ) -> torch.Tensor:
364
+ r"""
365
+ Prepare the attention mask for the attention computation.
366
+
367
+ Args:
368
+ attention_mask (`torch.Tensor`):
369
+ The attention mask to prepare.
370
+ target_length (`int`):
371
+ The target length of the attention mask. This is the length of the attention mask after padding.
372
+ batch_size (`int`):
373
+ The batch size, which is used to repeat the attention mask.
374
+ out_dim (`int`, *optional*, defaults to `3`):
375
+ The output dimension of the attention mask. Can be either `3` or `4`.
376
+
377
+ Returns:
378
+ `torch.Tensor`: The prepared attention mask.
379
+ """
380
+ head_size = self.heads
381
+ if attention_mask is None:
382
+ return attention_mask
383
+
384
+ current_length: int = attention_mask.shape[-1]
385
+ if current_length != target_length:
386
+ if attention_mask.device.type == "mps":
387
+ # HACK: MPS: Does not support padding by greater than dimension of input tensor.
388
+ # Instead, we can manually construct the padding tensor.
389
+ padding_shape = (
390
+ attention_mask.shape[0],
391
+ attention_mask.shape[1],
392
+ target_length,
393
+ )
394
+ padding = torch.zeros(
395
+ padding_shape,
396
+ dtype=attention_mask.dtype,
397
+ device=attention_mask.device,
398
+ )
399
+ attention_mask = torch.cat([attention_mask, padding], dim=2)
400
+ else:
401
+ # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
402
+ # we want to instead pad by (0, remaining_length), where remaining_length is:
403
+ # remaining_length: int = target_length - current_length
404
+ # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
405
+ attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
406
+
407
+ if out_dim == 3:
408
+ if attention_mask.shape[0] < batch_size * head_size:
409
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
410
+ elif out_dim == 4:
411
+ attention_mask = attention_mask.unsqueeze(1)
412
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=1)
413
+
414
+ return attention_mask
415
+
416
+ def norm_encoder_hidden_states(
417
+ self, encoder_hidden_states: torch.Tensor
418
+ ) -> torch.Tensor:
419
+ r"""
420
+ Normalize the encoder hidden states. Requires `self.norm_cross` to be specified when constructing the
421
+ `Attention` class.
422
+
423
+ Args:
424
+ encoder_hidden_states (`torch.Tensor`): Hidden states of the encoder.
425
+
426
+ Returns:
427
+ `torch.Tensor`: The normalized encoder hidden states.
428
+ """
429
+ assert (
430
+ self.norm_cross is not None
431
+ ), "self.norm_cross must be defined to call self.norm_encoder_hidden_states"
432
+
433
+ if isinstance(self.norm_cross, nn.LayerNorm):
434
+ encoder_hidden_states = self.norm_cross(encoder_hidden_states)
435
+ elif isinstance(self.norm_cross, nn.GroupNorm):
436
+ # Group norm norms along the channels dimension and expects
437
+ # input to be in the shape of (N, C, *). In this case, we want
438
+ # to norm along the hidden dimension, so we need to move
439
+ # (batch_size, sequence_length, hidden_size) ->
440
+ # (batch_size, hidden_size, sequence_length)
441
+ encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
442
+ encoder_hidden_states = self.norm_cross(encoder_hidden_states)
443
+ encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
444
+ else:
445
+ assert False
446
+
447
+ return encoder_hidden_states
448
+
449
+ @torch.no_grad()
450
+ def fuse_projections(self, fuse=True):
451
+ is_cross_attention = self.cross_attention_dim != self.query_dim
452
+ device = self.to_q.weight.data.device
453
+ dtype = self.to_q.weight.data.dtype
454
+
455
+ if not is_cross_attention:
456
+ # fetch weight matrices.
457
+ concatenated_weights = torch.cat(
458
+ [self.to_q.weight.data, self.to_k.weight.data, self.to_v.weight.data]
459
+ )
460
+ in_features = concatenated_weights.shape[1]
461
+ out_features = concatenated_weights.shape[0]
462
+
463
+ # create a new single projection layer and copy over the weights.
464
+ self.to_qkv = self.linear_cls(
465
+ in_features, out_features, bias=False, device=device, dtype=dtype
466
+ )
467
+ self.to_qkv.weight.copy_(concatenated_weights)
468
+
469
+ else:
470
+ concatenated_weights = torch.cat(
471
+ [self.to_k.weight.data, self.to_v.weight.data]
472
+ )
473
+ in_features = concatenated_weights.shape[1]
474
+ out_features = concatenated_weights.shape[0]
475
+
476
+ self.to_kv = self.linear_cls(
477
+ in_features, out_features, bias=False, device=device, dtype=dtype
478
+ )
479
+ self.to_kv.weight.copy_(concatenated_weights)
480
+
481
+ self.fused_projections = fuse
482
+
483
+
484
+ class AttnProcessor:
485
+ r"""
486
+ Default processor for performing attention-related computations.
487
+ """
488
+
489
+ def __call__(
490
+ self,
491
+ attn: Attention,
492
+ hidden_states: torch.FloatTensor,
493
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
494
+ attention_mask: Optional[torch.FloatTensor] = None,
495
+ ) -> torch.Tensor:
496
+ residual = hidden_states
497
+
498
+ input_ndim = hidden_states.ndim
499
+
500
+ if input_ndim == 4:
501
+ batch_size, channel, height, width = hidden_states.shape
502
+ hidden_states = hidden_states.view(
503
+ batch_size, channel, height * width
504
+ ).transpose(1, 2)
505
+
506
+ batch_size, sequence_length, _ = (
507
+ hidden_states.shape
508
+ if encoder_hidden_states is None
509
+ else encoder_hidden_states.shape
510
+ )
511
+ attention_mask = attn.prepare_attention_mask(
512
+ attention_mask, sequence_length, batch_size
513
+ )
514
+
515
+ if attn.group_norm is not None:
516
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
517
+ 1, 2
518
+ )
519
+
520
+ query = attn.to_q(hidden_states)
521
+
522
+ if encoder_hidden_states is None:
523
+ encoder_hidden_states = hidden_states
524
+ elif attn.norm_cross:
525
+ encoder_hidden_states = attn.norm_encoder_hidden_states(
526
+ encoder_hidden_states
527
+ )
528
+
529
+ key = attn.to_k(encoder_hidden_states)
530
+ value = attn.to_v(encoder_hidden_states)
531
+
532
+ query = attn.head_to_batch_dim(query)
533
+ key = attn.head_to_batch_dim(key)
534
+ value = attn.head_to_batch_dim(value)
535
+
536
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
537
+ hidden_states = torch.bmm(attention_probs, value)
538
+ hidden_states = attn.batch_to_head_dim(hidden_states)
539
+
540
+ # linear proj
541
+ hidden_states = attn.to_out[0](hidden_states)
542
+ # dropout
543
+ hidden_states = attn.to_out[1](hidden_states)
544
+
545
+ if input_ndim == 4:
546
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
547
+ batch_size, channel, height, width
548
+ )
549
+
550
+ if attn.residual_connection:
551
+ hidden_states = hidden_states + residual
552
+
553
+ hidden_states = hidden_states / attn.rescale_output_factor
554
+
555
+ return hidden_states
556
+
557
+
558
+ class AttnProcessor2_0:
559
+ r"""
560
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
561
+ """
562
+
563
+ def __init__(self):
564
+ if not hasattr(F, "scaled_dot_product_attention"):
565
+ raise ImportError(
566
+ "AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
567
+ )
568
+
569
+ def __call__(
570
+ self,
571
+ attn: Attention,
572
+ hidden_states: torch.FloatTensor,
573
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
574
+ attention_mask: Optional[torch.FloatTensor] = None,
575
+ ) -> torch.FloatTensor:
576
+ residual = hidden_states
577
+
578
+ input_ndim = hidden_states.ndim
579
+
580
+ if input_ndim == 4:
581
+ batch_size, channel, height, width = hidden_states.shape
582
+ hidden_states = hidden_states.view(
583
+ batch_size, channel, height * width
584
+ ).transpose(1, 2)
585
+
586
+ batch_size, sequence_length, _ = (
587
+ hidden_states.shape
588
+ if encoder_hidden_states is None
589
+ else encoder_hidden_states.shape
590
+ )
591
+
592
+ if attention_mask is not None:
593
+ attention_mask = attn.prepare_attention_mask(
594
+ attention_mask, sequence_length, batch_size
595
+ )
596
+ # scaled_dot_product_attention expects attention_mask shape to be
597
+ # (batch, heads, source_length, target_length)
598
+ attention_mask = attention_mask.view(
599
+ batch_size, attn.heads, -1, attention_mask.shape[-1]
600
+ )
601
+
602
+ if attn.group_norm is not None:
603
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
604
+ 1, 2
605
+ )
606
+
607
+ query = attn.to_q(hidden_states)
608
+
609
+ if encoder_hidden_states is None:
610
+ encoder_hidden_states = hidden_states
611
+ elif attn.norm_cross:
612
+ encoder_hidden_states = attn.norm_encoder_hidden_states(
613
+ encoder_hidden_states
614
+ )
615
+
616
+ key = attn.to_k(encoder_hidden_states)
617
+ value = attn.to_v(encoder_hidden_states)
618
+
619
+ inner_dim = key.shape[-1]
620
+ head_dim = inner_dim // attn.heads
621
+
622
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
623
+
624
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
625
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
626
+
627
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
628
+ # TODO: add support for attn.scale when we move to Torch 2.1
629
+ hidden_states = F.scaled_dot_product_attention(
630
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
631
+ )
632
+
633
+ hidden_states = hidden_states.transpose(1, 2).reshape(
634
+ batch_size, -1, attn.heads * head_dim
635
+ )
636
+ hidden_states = hidden_states.to(query.dtype)
637
+
638
+ # linear proj
639
+ hidden_states = attn.to_out[0](hidden_states)
640
+ # dropout
641
+ hidden_states = attn.to_out[1](hidden_states)
642
+
643
+ if input_ndim == 4:
644
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
645
+ batch_size, channel, height, width
646
+ )
647
+
648
+ if attn.residual_connection:
649
+ hidden_states = hidden_states + residual
650
+
651
+ hidden_states = hidden_states / attn.rescale_output_factor
652
+
653
+ return hidden_states
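A self-contained sketch of the Attention module above in both self- and cross-attention modes; all sizes are illustrative. With the default cross_attention_dim=None, the context must share the query dimension:

import torch
from tsr.models.transformer.attention import Attention

attn = Attention(query_dim=512, heads=8, dim_head=64)
x = torch.randn(2, 1024, 512)    # (batch, seq_len, query_dim)
ctx = torch.randn(2, 77, 512)    # cross-attention context
y_self = attn(x)                                # (2, 1024, 512)
y_cross = attn(x, encoder_hidden_states=ctx)    # (2, 1024, 512)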
tsr/models/transformer/basic_transformer_block.py ADDED
@@ -0,0 +1,334 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # --------
16
+ #
17
+ # Modified 2024 by the Tripo AI and Stability AI Team.
18
+ #
19
+ # Copyright (c) 2024 Tripo AI & Stability AI
20
+ #
21
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
22
+ # of this software and associated documentation files (the "Software"), to deal
23
+ # in the Software without restriction, including without limitation the rights
24
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
25
+ # copies of the Software, and to permit persons to whom the Software is
26
+ # furnished to do so, subject to the following conditions:
27
+ #
28
+ # The above copyright notice and this permission notice shall be included in all
29
+ # copies or substantial portions of the Software.
30
+ #
31
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
32
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
33
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
34
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
35
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
36
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37
+ # SOFTWARE.
38
+
39
+ from typing import Optional
40
+
41
+ import torch
42
+ import torch.nn.functional as F
43
+ from torch import nn
44
+
45
+ from .attention import Attention
46
+
47
+
48
+ class BasicTransformerBlock(nn.Module):
49
+ r"""
50
+ A basic Transformer block.
51
+
52
+ Parameters:
53
+ dim (`int`): The number of channels in the input and output.
54
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
55
+ attention_head_dim (`int`): The number of channels in each head.
56
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
57
+ cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
58
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
59
+ attention_bias (`bool`, *optional*, defaults to `False`):
60
+ Configure if the attentions should contain a bias parameter.
61
+ only_cross_attention (`bool`, *optional*):
62
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
63
+ double_self_attention (`bool`, *optional*):
64
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
65
+ upcast_attention (`bool`, *optional*):
66
+ Whether to upcast the attention computation to float32. This is useful for mixed precision training.
67
+ norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
68
+ Whether to use learnable elementwise affine parameters for normalization.
69
+ norm_type (`str`, *optional*, defaults to `"layer_norm"`):
70
+ The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
71
+ final_dropout (`bool` *optional*, defaults to False):
72
+ Whether to apply a final dropout after the last feed-forward layer.
73
+ """
74
+
75
+ def __init__(
76
+ self,
77
+ dim: int,
78
+ num_attention_heads: int,
79
+ attention_head_dim: int,
80
+ dropout=0.0,
81
+ cross_attention_dim: Optional[int] = None,
82
+ activation_fn: str = "geglu",
83
+ attention_bias: bool = False,
84
+ only_cross_attention: bool = False,
85
+ double_self_attention: bool = False,
86
+ upcast_attention: bool = False,
87
+ norm_elementwise_affine: bool = True,
88
+ norm_type: str = "layer_norm",
89
+ final_dropout: bool = False,
90
+ ):
91
+ super().__init__()
92
+ self.only_cross_attention = only_cross_attention
93
+
94
+ assert norm_type == "layer_norm"
95
+
96
+ # Define 3 blocks. Each block has its own normalization layer.
97
+ # 1. Self-Attn
98
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
99
+ self.attn1 = Attention(
100
+ query_dim=dim,
101
+ heads=num_attention_heads,
102
+ dim_head=attention_head_dim,
103
+ dropout=dropout,
104
+ bias=attention_bias,
105
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
106
+ upcast_attention=upcast_attention,
107
+ )
108
+
109
+ # 2. Cross-Attn
110
+ if cross_attention_dim is not None or double_self_attention:
111
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
112
+ # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
113
+ # the second cross attention block.
114
+ self.norm2 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
115
+
116
+ self.attn2 = Attention(
117
+ query_dim=dim,
118
+ cross_attention_dim=(
119
+ cross_attention_dim if not double_self_attention else None
120
+ ),
121
+ heads=num_attention_heads,
122
+ dim_head=attention_head_dim,
123
+ dropout=dropout,
124
+ bias=attention_bias,
125
+ upcast_attention=upcast_attention,
126
+ ) # is self-attn if encoder_hidden_states is none
127
+ else:
128
+ self.norm2 = None
129
+ self.attn2 = None
130
+
131
+ # 3. Feed-forward
132
+ self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
133
+ self.ff = FeedForward(
134
+ dim,
135
+ dropout=dropout,
136
+ activation_fn=activation_fn,
137
+ final_dropout=final_dropout,
138
+ )
139
+
140
+ # let chunk size default to None
141
+ self._chunk_size = None
142
+ self._chunk_dim = 0
143
+
144
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
145
+ # Sets chunk feed-forward
146
+ self._chunk_size = chunk_size
147
+ self._chunk_dim = dim
148
+
149
+ def forward(
150
+ self,
151
+ hidden_states: torch.FloatTensor,
152
+ attention_mask: Optional[torch.FloatTensor] = None,
153
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
154
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
155
+ ) -> torch.FloatTensor:
156
+ # Notice that normalization is always applied before the real computation in the following blocks.
157
+ # 0. Self-Attention
158
+ norm_hidden_states = self.norm1(hidden_states)
159
+
160
+ attn_output = self.attn1(
161
+ norm_hidden_states,
162
+ encoder_hidden_states=(
163
+ encoder_hidden_states if self.only_cross_attention else None
164
+ ),
165
+ attention_mask=attention_mask,
166
+ )
167
+
168
+ hidden_states = attn_output + hidden_states
169
+
170
+ # 3. Cross-Attention
171
+ if self.attn2 is not None:
172
+ norm_hidden_states = self.norm2(hidden_states)
173
+
174
+ attn_output = self.attn2(
175
+ norm_hidden_states,
176
+ encoder_hidden_states=encoder_hidden_states,
177
+ attention_mask=encoder_attention_mask,
178
+ )
179
+ hidden_states = attn_output + hidden_states
180
+
181
+ # 4. Feed-forward
182
+ norm_hidden_states = self.norm3(hidden_states)
183
+
184
+ if self._chunk_size is not None:
185
+ # "feed_forward_chunk_size" can be used to save memory
186
+ if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
187
+ raise ValueError(
188
+ f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
189
+ )
190
+
191
+ num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
192
+ ff_output = torch.cat(
193
+ [
194
+ self.ff(hid_slice)
195
+ for hid_slice in norm_hidden_states.chunk(
196
+ num_chunks, dim=self._chunk_dim
197
+ )
198
+ ],
199
+ dim=self._chunk_dim,
200
+ )
201
+ else:
202
+ ff_output = self.ff(norm_hidden_states)
203
+
204
+ hidden_states = ff_output + hidden_states
205
+
206
+ return hidden_states
207
+
208
+
209
+ class FeedForward(nn.Module):
210
+ r"""
211
+ A feed-forward layer.
212
+
213
+ Parameters:
214
+ dim (`int`): The number of channels in the input.
215
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
216
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
217
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
218
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
219
+ final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
220
+ """
221
+
222
+ def __init__(
223
+ self,
224
+ dim: int,
225
+ dim_out: Optional[int] = None,
226
+ mult: int = 4,
227
+ dropout: float = 0.0,
228
+ activation_fn: str = "geglu",
229
+ final_dropout: bool = False,
230
+ ):
231
+ super().__init__()
232
+ inner_dim = int(dim * mult)
233
+ dim_out = dim_out if dim_out is not None else dim
234
+ linear_cls = nn.Linear
235
+
236
+ if activation_fn == "gelu":
237
+ act_fn = GELU(dim, inner_dim)
238
+ if activation_fn == "gelu-approximate":
239
+ act_fn = GELU(dim, inner_dim, approximate="tanh")
240
+ elif activation_fn == "geglu":
241
+ act_fn = GEGLU(dim, inner_dim)
242
+ elif activation_fn == "geglu-approximate":
243
+ act_fn = ApproximateGELU(dim, inner_dim)
244
+
245
+ self.net = nn.ModuleList([])
246
+ # project in
247
+ self.net.append(act_fn)
248
+ # project dropout
249
+ self.net.append(nn.Dropout(dropout))
250
+ # project out
251
+ self.net.append(linear_cls(inner_dim, dim_out))
252
+ # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
253
+ if final_dropout:
254
+ self.net.append(nn.Dropout(dropout))
255
+
256
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
257
+ for module in self.net:
258
+ hidden_states = module(hidden_states)
259
+ return hidden_states
260
+
261
+
262
+ class GELU(nn.Module):
263
+ r"""
264
+ GELU activation function with tanh approximation support with `approximate="tanh"`.
265
+
266
+ Parameters:
267
+ dim_in (`int`): The number of channels in the input.
268
+ dim_out (`int`): The number of channels in the output.
269
+ approximate (`str`, *optional*, defaults to `"none"`): If `"tanh"`, use tanh approximation.
270
+ """
271
+
272
+ def __init__(self, dim_in: int, dim_out: int, approximate: str = "none"):
273
+ super().__init__()
274
+ self.proj = nn.Linear(dim_in, dim_out)
275
+ self.approximate = approximate
276
+
277
+ def gelu(self, gate: torch.Tensor) -> torch.Tensor:
278
+ if gate.device.type != "mps":
279
+ return F.gelu(gate, approximate=self.approximate)
280
+ # mps: gelu is not implemented for float16
281
+ return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(
282
+ dtype=gate.dtype
283
+ )
284
+
285
+ def forward(self, hidden_states):
286
+ hidden_states = self.proj(hidden_states)
287
+ hidden_states = self.gelu(hidden_states)
288
+ return hidden_states
289
+
290
+
291
+ class GEGLU(nn.Module):
292
+ r"""
293
+ A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
294
+
295
+ Parameters:
296
+ dim_in (`int`): The number of channels in the input.
297
+ dim_out (`int`): The number of channels in the output.
298
+ """
299
+
300
+ def __init__(self, dim_in: int, dim_out: int):
301
+ super().__init__()
302
+ linear_cls = nn.Linear
303
+
304
+ self.proj = linear_cls(dim_in, dim_out * 2)
305
+
306
+ def gelu(self, gate: torch.Tensor) -> torch.Tensor:
307
+ if gate.device.type != "mps":
308
+ return F.gelu(gate)
309
+ # mps: gelu is not implemented for float16
310
+ return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
311
+
312
+ def forward(self, hidden_states, scale: float = 1.0):
313
+ args = ()
314
+ hidden_states, gate = self.proj(hidden_states, *args).chunk(2, dim=-1)
315
+ return hidden_states * self.gelu(gate)
316
+
317
+
318
+ class ApproximateGELU(nn.Module):
319
+ r"""
320
+ The approximate form of Gaussian Error Linear Unit (GELU). For more details, see section 2:
321
+ https://arxiv.org/abs/1606.08415.
322
+
323
+ Parameters:
324
+ dim_in (`int`): The number of channels in the input.
325
+ dim_out (`int`): The number of channels in the output.
326
+ """
327
+
328
+ def __init__(self, dim_in: int, dim_out: int):
329
+ super().__init__()
330
+ self.proj = nn.Linear(dim_in, dim_out)
331
+
332
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
333
+ x = self.proj(x)
334
+ return x * torch.sigmoid(1.702 * x)
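A sketch wiring BasicTransformerBlock with cross-attention conditioning (e.g. on image tokens, as in the TSR backbone); all sizes are illustrative:

import torch
from tsr.models.transformer.basic_transformer_block import BasicTransformerBlock

block = BasicTransformerBlock(
    dim=512, num_attention_heads=8, attention_head_dim=64,
    cross_attention_dim=768,
)
x = torch.randn(2, 1024, 512)     # e.g. triplane tokens
ctx = torch.randn(2, 257, 768)    # e.g. DINO image tokens
y = block(x, encoder_hidden_states=ctx)    # (2, 1024, 512)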
tsr/models/transformer/transformer_1d.py ADDED
@@ -0,0 +1,219 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# --------
+#
+# Modified 2024 by the Tripo AI and Stability AI Team.
+#
+# Copyright (c) 2024 Tripo AI & Stability AI
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+from torch import nn
+
+from ...utils import BaseModule
+from .basic_transformer_block import BasicTransformerBlock
+
+
+class Transformer1D(BaseModule):
+    @dataclass
+    class Config(BaseModule.Config):
+        num_attention_heads: int = 16
+        attention_head_dim: int = 88
+        in_channels: Optional[int] = None
+        out_channels: Optional[int] = None
+        num_layers: int = 1
+        dropout: float = 0.0
+        norm_num_groups: int = 32
+        cross_attention_dim: Optional[int] = None
+        attention_bias: bool = False
+        activation_fn: str = "geglu"
+        only_cross_attention: bool = False
+        double_self_attention: bool = False
+        upcast_attention: bool = False
+        norm_type: str = "layer_norm"
+        norm_elementwise_affine: bool = True
+        gradient_checkpointing: bool = False
+
+    cfg: Config
+
+    def configure(self) -> None:
+        self.num_attention_heads = self.cfg.num_attention_heads
+        self.attention_head_dim = self.cfg.attention_head_dim
+        inner_dim = self.num_attention_heads * self.attention_head_dim
+
+        linear_cls = nn.Linear
+
+        # 1. Define input layers
+        self.in_channels = self.cfg.in_channels
+
+        self.norm = torch.nn.GroupNorm(
+            num_groups=self.cfg.norm_num_groups,
+            num_channels=self.cfg.in_channels,
+            eps=1e-6,
+            affine=True,
+        )
+        self.proj_in = linear_cls(self.cfg.in_channels, inner_dim)
+
+        # 2. Define transformer blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    self.num_attention_heads,
+                    self.attention_head_dim,
+                    dropout=self.cfg.dropout,
+                    cross_attention_dim=self.cfg.cross_attention_dim,
+                    activation_fn=self.cfg.activation_fn,
+                    attention_bias=self.cfg.attention_bias,
+                    only_cross_attention=self.cfg.only_cross_attention,
+                    double_self_attention=self.cfg.double_self_attention,
+                    upcast_attention=self.cfg.upcast_attention,
+                    norm_type=self.cfg.norm_type,
+                    norm_elementwise_affine=self.cfg.norm_elementwise_affine,
+                )
+                for _ in range(self.cfg.num_layers)
+            ]
+        )
+
+        # 3. Define output layers
+        self.out_channels = (
+            self.cfg.in_channels
+            if self.cfg.out_channels is None
+            else self.cfg.out_channels
+        )
+        # the residual connection in forward() requires out_channels == in_channels
+        self.proj_out = linear_cls(inner_dim, self.cfg.in_channels)
+
+        self.gradient_checkpointing = self.cfg.gradient_checkpointing
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+    ):
+        """
+        The [`Transformer1D`] forward method.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, sequence length)`):
+                Input `hidden_states`.
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+                Conditional embeddings for the cross-attention layer. If not given, cross-attention defaults to
+                self-attention.
+            attention_mask (`torch.Tensor`, *optional*):
+                An attention mask of shape `(batch, key_tokens)` applied to `hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. The mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            encoder_attention_mask (`torch.Tensor`, *optional*):
+                Cross-attention mask applied to `encoder_hidden_states`. Two formats are supported:
+
+                    * Mask `(batch, sequence_length)` True = keep, False = discard.
+                    * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
+
+                If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
+                above. This bias will be added to the cross-attention scores.
+
+        Returns:
+            `torch.FloatTensor`
+        """
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+        # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+        # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch, 1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None and attention_mask.ndim == 2:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #   (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+            encoder_attention_mask = (
+                1 - encoder_attention_mask.to(hidden_states.dtype)
+            ) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        # 1. Input
+        batch, _, seq_len = hidden_states.shape
+        residual = hidden_states
+
+        hidden_states = self.norm(hidden_states)
+        inner_dim = hidden_states.shape[1]
+        hidden_states = hidden_states.permute(0, 2, 1).reshape(
+            batch, seq_len, inner_dim
+        )
+        hidden_states = self.proj_in(hidden_states)
+
+        # 2. Blocks
+        for block in self.transformer_blocks:
+            if self.training and self.gradient_checkpointing:
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    block,
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    use_reentrant=False,
+                )
+            else:
+                hidden_states = block(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+
+        # 3. Output
+        hidden_states = self.proj_out(hidden_states)
+        hidden_states = (
+            hidden_states.reshape(batch, seq_len, inner_dim)
+            .permute(0, 2, 1)
+            .contiguous()
+        )
+
+        output = hidden_states + residual
+
+        return output
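
The only subtle step in `forward` above is the mask handling: a 0/1 keep mask is turned into an additive bias before it reaches the attention blocks. A standalone sketch of that conversion, mirroring the two lines in `forward` (values here are illustrative):

    import torch

    mask = torch.tensor([[1, 1, 0]], dtype=torch.float32)  # 1 = keep, 0 = discard
    bias = (1 - mask) * -10000.0  # keep -> 0.0, discard -> -10000.0
    bias = bias.unsqueeze(1)      # (batch, 1, key_tokens), broadcasts over query tokens
    assert bias.shape == (1, 1, 3)
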
tsr/system.py ADDED
@@ -0,0 +1,205 @@
+import os
+from dataclasses import dataclass
+from typing import List, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import trimesh
+from einops import rearrange
+from huggingface_hub import hf_hub_download
+from omegaconf import OmegaConf
+from PIL import Image
+
+from .models.isosurface import MarchingCubeHelper
+from .utils import (
+    BaseModule,
+    ImagePreprocessor,
+    find_class,
+    get_spherical_cameras,
+    scale_tensor,
+)
+
+
+class TSR(BaseModule):
+    @dataclass
+    class Config(BaseModule.Config):
+        cond_image_size: int
+
+        image_tokenizer_cls: str
+        image_tokenizer: dict
+
+        tokenizer_cls: str
+        tokenizer: dict
+
+        backbone_cls: str
+        backbone: dict
+
+        post_processor_cls: str
+        post_processor: dict
+
+        decoder_cls: str
+        decoder: dict
+
+        renderer_cls: str
+        renderer: dict
+
+    cfg: Config
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: str, config_name: str, weight_name: str
+    ):
+        # accept either a local directory or a Hugging Face Hub repo id
+        if os.path.isdir(pretrained_model_name_or_path):
+            config_path = os.path.join(pretrained_model_name_or_path, config_name)
+            weight_path = os.path.join(pretrained_model_name_or_path, weight_name)
+        else:
+            config_path = hf_hub_download(
+                repo_id=pretrained_model_name_or_path, filename=config_name
+            )
+            weight_path = hf_hub_download(
+                repo_id=pretrained_model_name_or_path, filename=weight_name
+            )
+
+        cfg = OmegaConf.load(config_path)
+        OmegaConf.resolve(cfg)
+        model = cls(cfg)
+        ckpt = torch.load(weight_path, map_location="cpu")
+        model.load_state_dict(ckpt)
+        return model
+
+    def configure(self):
+        self.image_tokenizer = find_class(self.cfg.image_tokenizer_cls)(
+            self.cfg.image_tokenizer
+        )
+        self.tokenizer = find_class(self.cfg.tokenizer_cls)(self.cfg.tokenizer)
+        self.backbone = find_class(self.cfg.backbone_cls)(self.cfg.backbone)
+        self.post_processor = find_class(self.cfg.post_processor_cls)(
+            self.cfg.post_processor
+        )
+        self.decoder = find_class(self.cfg.decoder_cls)(self.cfg.decoder)
+        self.renderer = find_class(self.cfg.renderer_cls)(self.cfg.renderer)
+        self.image_processor = ImagePreprocessor()
+        self.isosurface_helper = None
+
+    def forward(
+        self,
+        image: Union[
+            PIL.Image.Image,
+            np.ndarray,
+            torch.FloatTensor,
+            List[PIL.Image.Image],
+            List[np.ndarray],
+            List[torch.FloatTensor],
+        ],
+        device: str,
+    ) -> torch.FloatTensor:
+        rgb_cond = self.image_processor(image, self.cfg.cond_image_size)[:, None].to(
+            device
+        )
+        batch_size = rgb_cond.shape[0]
+
+        input_image_tokens: torch.Tensor = self.image_tokenizer(
+            rearrange(rgb_cond, "B Nv H W C -> B Nv C H W", Nv=1),
+        )
+
+        input_image_tokens = rearrange(
+            input_image_tokens, "B Nv C Nt -> B (Nv Nt) C", Nv=1
+        )
+
+        tokens: torch.Tensor = self.tokenizer(batch_size)
+
+        tokens = self.backbone(
+            tokens,
+            encoder_hidden_states=input_image_tokens,
+        )
+
+        scene_codes = self.post_processor(self.tokenizer.detokenize(tokens))
+        return scene_codes
+
+    def render(
+        self,
+        scene_codes,
+        n_views: int,
+        elevation_deg: float = 0.0,
+        camera_distance: float = 1.9,
+        fovy_deg: float = 40.0,
+        height: int = 256,
+        width: int = 256,
+        return_type: str = "pil",
+    ):
+        rays_o, rays_d = get_spherical_cameras(
+            n_views, elevation_deg, camera_distance, fovy_deg, height, width
+        )
+        rays_o, rays_d = rays_o.to(scene_codes.device), rays_d.to(scene_codes.device)
+
+        def process_output(image: torch.FloatTensor):
+            if return_type == "pt":
+                return image
+            elif return_type == "np":
+                return image.detach().cpu().numpy()
+            elif return_type == "pil":
+                return Image.fromarray(
+                    (image.detach().cpu().numpy() * 255.0).astype(np.uint8)
+                )
+            else:
+                raise NotImplementedError
+
+        images = []
+        for scene_code in scene_codes:
+            images_ = []
+            for i in range(n_views):
+                with torch.no_grad():
+                    image = self.renderer(
+                        self.decoder, scene_code, rays_o[i], rays_d[i]
+                    )
+                images_.append(process_output(image))
+            images.append(images_)
+
+        return images
+
+    def set_marching_cubes_resolution(self, resolution: int):
+        if (
+            self.isosurface_helper is not None
+            and self.isosurface_helper.resolution == resolution
+        ):
+            return
+        self.isosurface_helper = MarchingCubeHelper(resolution)
+
+    def extract_mesh(self, scene_codes, has_vertex_color, resolution: int = 256, threshold: float = 25.0):
+        self.set_marching_cubes_resolution(resolution)
+        meshes = []
+        for scene_code in scene_codes:
+            with torch.no_grad():
+                density = self.renderer.query_triplane(
+                    self.decoder,
+                    scale_tensor(
+                        self.isosurface_helper.grid_vertices.to(scene_codes.device),
+                        self.isosurface_helper.points_range,
+                        (-self.renderer.cfg.radius, self.renderer.cfg.radius),
+                    ),
+                    scene_code,
+                )["density_act"]
+            # negate (density - threshold) so the zero level set sits at density == threshold
+            v_pos, t_pos_idx = self.isosurface_helper(-(density - threshold))
+            v_pos = scale_tensor(
+                v_pos,
+                self.isosurface_helper.points_range,
+                (-self.renderer.cfg.radius, self.renderer.cfg.radius),
+            )
+            color = None
+            if has_vertex_color:
+                with torch.no_grad():
+                    color = self.renderer.query_triplane(
+                        self.decoder,
+                        v_pos,
+                        scene_code,
+                    )["color"]
+            mesh = trimesh.Trimesh(
+                vertices=v_pos.cpu().numpy(),
+                faces=t_pos_idx.cpu().numpy(),
+                vertex_colors=color.cpu().numpy() if has_vertex_color else None,
+            )
+            meshes.append(mesh)
+        return meshes
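
Typical end-to-end usage of `TSR` looks like the sketch below. This is a minimal sketch, not part of the upload: it assumes the upstream `stabilityai/TripoSR` checkpoint layout (`config.yaml` plus `model.ckpt`), and `input.png`/`output.obj` are illustrative paths:

    import torch
    from PIL import Image

    from tsr.system import TSR

    model = TSR.from_pretrained(
        "stabilityai/TripoSR", config_name="config.yaml", weight_name="model.ckpt"
    )
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    image = Image.open("input.png").convert("RGB")
    with torch.no_grad():
        scene_codes = model(image, device=device)  # forward() -> scene codes
    meshes = model.extract_mesh(scene_codes, has_vertex_color=True, resolution=256)
    meshes[0].export("output.obj")                 # trimesh handles the export
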
tsr/utils.py ADDED
@@ -0,0 +1,510 @@
+import importlib
+import math
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import imageio
+import numpy as np
+import PIL.Image
+import rembg
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import trimesh
+from omegaconf import DictConfig, OmegaConf
+from PIL import Image
+
+
+def parse_structured(fields: Any, cfg: Optional[Union[dict, DictConfig]] = None) -> Any:
+    scfg = OmegaConf.merge(OmegaConf.structured(fields), cfg)
+    return scfg
+
+
+def find_class(cls_string):
+    module_string = ".".join(cls_string.split(".")[:-1])
+    cls_name = cls_string.split(".")[-1]
+    module = importlib.import_module(module_string, package=None)
+    cls = getattr(module, cls_name)
+    return cls
+
+
+def get_intrinsic_from_fov(fov, H, W, bs=-1):
+    focal_length = 0.5 * H / np.tan(0.5 * fov)
+    intrinsic = np.identity(3, dtype=np.float32)
+    intrinsic[0, 0] = focal_length
+    intrinsic[1, 1] = focal_length
+    intrinsic[0, 2] = W / 2.0
+    intrinsic[1, 2] = H / 2.0
+
+    if bs > 0:
+        intrinsic = intrinsic[None].repeat(bs, axis=0)
+
+    return torch.from_numpy(intrinsic)
+
+
+class BaseModule(nn.Module):
+    @dataclass
+    class Config:
+        pass
+
+    cfg: Config  # add this to every subclass of BaseModule to enable static type checking
+
+    def __init__(
+        self, cfg: Optional[Union[dict, DictConfig]] = None, *args, **kwargs
+    ) -> None:
+        super().__init__()
+        self.cfg = parse_structured(self.Config, cfg)
+        self.configure(*args, **kwargs)
+
+    def configure(self, *args, **kwargs) -> None:
+        raise NotImplementedError
+
+
+class ImagePreprocessor:
+    def convert_and_resize(
+        self,
+        image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
+        size: int,
+    ):
+        if isinstance(image, PIL.Image.Image):
+            image = torch.from_numpy(np.array(image).astype(np.float32) / 255.0)
+        elif isinstance(image, np.ndarray):
+            if image.dtype == np.uint8:
+                image = torch.from_numpy(image.astype(np.float32) / 255.0)
+            else:
+                image = torch.from_numpy(image)
+        elif isinstance(image, torch.Tensor):
+            pass
+
+        batched = image.ndim == 4
+
+        if not batched:
+            image = image[None, ...]
+        image = F.interpolate(
+            image.permute(0, 3, 1, 2),
+            (size, size),
+            mode="bilinear",
+            align_corners=False,
+            antialias=True,
+        ).permute(0, 2, 3, 1)
+        if not batched:
+            image = image[0]
+        return image
+
+    def __call__(
+        self,
+        image: Union[
+            PIL.Image.Image,
+            np.ndarray,
+            torch.FloatTensor,
+            List[PIL.Image.Image],
+            List[np.ndarray],
+            List[torch.FloatTensor],
+        ],
+        size: int,
+    ) -> Any:
+        if isinstance(image, (np.ndarray, torch.FloatTensor)) and image.ndim == 4:
+            image = self.convert_and_resize(image, size)
+        else:
+            if not isinstance(image, list):
+                image = [image]
+            image = [self.convert_and_resize(im, size) for im in image]
+            image = torch.stack(image, dim=0)
+        return image
+
+
+def rays_intersect_bbox(
+    rays_o: torch.Tensor,
+    rays_d: torch.Tensor,
+    radius: float,
+    near: float = 0.0,
+    valid_thresh: float = 0.01,
+):
+    input_shape = rays_o.shape[:-1]
+    rays_o, rays_d = rays_o.view(-1, 3), rays_d.view(-1, 3)
+    rays_d_valid = torch.where(
+        rays_d.abs() < 1e-6, torch.full_like(rays_d, 1e-6), rays_d
+    )
+    if type(radius) in [int, float]:
+        radius = torch.FloatTensor(
+            [[-radius, radius], [-radius, radius], [-radius, radius]]
+        ).to(rays_o.device)
+    radius = (
+        1.0 - 1.0e-3
+    ) * radius  # tighten the radius to make sure the intersection point lies in the bounding box
+    interx0 = (radius[..., 1] - rays_o) / rays_d_valid
+    interx1 = (radius[..., 0] - rays_o) / rays_d_valid
+    t_near = torch.minimum(interx0, interx1).amax(dim=-1).clamp_min(near)
+    t_far = torch.maximum(interx0, interx1).amin(dim=-1)
+
+    # check whether a ray intersects the bbox or not
+    rays_valid = t_far - t_near > valid_thresh
+
+    t_near[torch.where(~rays_valid)] = 0.0
+    t_far[torch.where(~rays_valid)] = 0.0
+
+    t_near = t_near.view(*input_shape, 1)
+    t_far = t_far.view(*input_shape, 1)
+    rays_valid = rays_valid.view(*input_shape)
+
+    return t_near, t_far, rays_valid
+
+
+def chunk_batch(func: Callable, chunk_size: int, *args, **kwargs) -> Any:
+    if chunk_size <= 0:
+        return func(*args, **kwargs)
+    B = None
+    for arg in list(args) + list(kwargs.values()):
+        if isinstance(arg, torch.Tensor):
+            B = arg.shape[0]
+            break
+    assert (
+        B is not None
+    ), "No tensor found in args or kwargs, cannot determine batch size."
+    out = defaultdict(list)
+    out_type = None
+    # max(1, B) to support B == 0
+    for i in range(0, max(1, B), chunk_size):
+        out_chunk = func(
+            *[
+                arg[i : i + chunk_size] if isinstance(arg, torch.Tensor) else arg
+                for arg in args
+            ],
+            **{
+                k: arg[i : i + chunk_size] if isinstance(arg, torch.Tensor) else arg
+                for k, arg in kwargs.items()
+            },
+        )
+        if out_chunk is None:
+            continue
+        out_type = type(out_chunk)
+        if isinstance(out_chunk, torch.Tensor):
+            out_chunk = {0: out_chunk}
+        elif isinstance(out_chunk, tuple) or isinstance(out_chunk, list):
+            chunk_length = len(out_chunk)
+            out_chunk = {i: chunk for i, chunk in enumerate(out_chunk)}
+        elif isinstance(out_chunk, dict):
+            pass
+        else:
+            raise TypeError(
+                f"Return value of func must be in type [torch.Tensor, list, tuple, dict], got {type(out_chunk)}."
+            )
+        for k, v in out_chunk.items():
+            v = v if torch.is_grad_enabled() else v.detach()
+            out[k].append(v)
+
+    if out_type is None:
+        return None
+
+    out_merged: Dict[Any, Optional[torch.Tensor]] = {}
+    for k, v in out.items():
+        if all([vv is None for vv in v]):
+            # allow None in return value
+            out_merged[k] = None
+        elif all([isinstance(vv, torch.Tensor) for vv in v]):
+            out_merged[k] = torch.cat(v, dim=0)
+        else:
+            raise TypeError(
+                f"Unsupported types in return value of func: {[type(vv) for vv in v if not isinstance(vv, torch.Tensor)]}"
+            )
+
+    if out_type is torch.Tensor:
+        return out_merged[0]
+    elif out_type in [tuple, list]:
+        return out_type([out_merged[i] for i in range(chunk_length)])
+    elif out_type is dict:
+        return out_merged
+
+
+ValidScale = Union[Tuple[float, float], torch.FloatTensor]
+
+
+def scale_tensor(dat: torch.FloatTensor, inp_scale: ValidScale, tgt_scale: ValidScale):
+    if inp_scale is None:
+        inp_scale = (0, 1)
+    if tgt_scale is None:
+        tgt_scale = (0, 1)
+    if isinstance(tgt_scale, torch.FloatTensor):
+        assert dat.shape[-1] == tgt_scale.shape[-1]
+    dat = (dat - inp_scale[0]) / (inp_scale[1] - inp_scale[0])
+    dat = dat * (tgt_scale[1] - tgt_scale[0]) + tgt_scale[0]
+    return dat
+
+
+def get_activation(name) -> Callable:
+    if name is None:
+        return lambda x: x
+    name = name.lower()
+    if name == "none":
+        return lambda x: x
+    elif name == "exp":
+        return lambda x: torch.exp(x)
+    elif name == "sigmoid":
+        return lambda x: torch.sigmoid(x)
+    elif name == "tanh":
+        return lambda x: torch.tanh(x)
+    elif name == "softplus":
+        return lambda x: F.softplus(x)
+    else:
+        try:
+            return getattr(F, name)
+        except AttributeError:
+            raise ValueError(f"Unknown activation function: {name}")
+
+
+def get_ray_directions(
+    H: int,
+    W: int,
+    focal: Union[float, Tuple[float, float]],
+    principal: Optional[Tuple[float, float]] = None,
+    use_pixel_centers: bool = True,
+    normalize: bool = True,
+) -> torch.FloatTensor:
+    """
+    Get ray directions for all pixels in camera coordinates.
+    Reference: https://www.scratchapixel.com/lessons/3d-basic-rendering/
+               ray-tracing-generating-camera-rays/standard-coordinate-systems
+
+    Inputs:
+        H, W, focal, principal, use_pixel_centers: image height, width, focal length, principal point, and whether to use pixel centers
+    Outputs:
+        directions: (H, W, 3), the direction of the rays in camera coordinates
+    """
+    pixel_center = 0.5 if use_pixel_centers else 0
+
+    if isinstance(focal, float):
+        fx, fy = focal, focal
+        cx, cy = W / 2, H / 2
+    else:
+        fx, fy = focal
+        assert principal is not None
+        cx, cy = principal
+
+    i, j = torch.meshgrid(
+        torch.arange(W, dtype=torch.float32) + pixel_center,
+        torch.arange(H, dtype=torch.float32) + pixel_center,
+        indexing="xy",
+    )
+
+    directions = torch.stack([(i - cx) / fx, -(j - cy) / fy, -torch.ones_like(i)], -1)
+
+    if normalize:
+        directions = F.normalize(directions, dim=-1)
+
+    return directions
+
+
+def get_rays(
+    directions,
+    c2w,
+    keepdim=False,
+    normalize=False,
+) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
+    # Rotate ray directions from camera coordinates to world coordinates
+    assert directions.shape[-1] == 3
+
+    if directions.ndim == 2:  # (N_rays, 3)
+        if c2w.ndim == 2:  # (4, 4)
+            c2w = c2w[None, :, :]
+        assert c2w.ndim == 3  # (N_rays, 4, 4) or (1, 4, 4)
+        rays_d = (directions[:, None, :] * c2w[:, :3, :3]).sum(-1)  # (N_rays, 3)
+        rays_o = c2w[:, :3, 3].expand(rays_d.shape)
+    elif directions.ndim == 3:  # (H, W, 3)
+        assert c2w.ndim in [2, 3]
+        if c2w.ndim == 2:  # (4, 4)
+            rays_d = (directions[:, :, None, :] * c2w[None, None, :3, :3]).sum(
+                -1
+            )  # (H, W, 3)
+            rays_o = c2w[None, None, :3, 3].expand(rays_d.shape)
+        elif c2w.ndim == 3:  # (B, 4, 4)
+            rays_d = (directions[None, :, :, None, :] * c2w[:, None, None, :3, :3]).sum(
+                -1
+            )  # (B, H, W, 3)
+            rays_o = c2w[:, None, None, :3, 3].expand(rays_d.shape)
+    elif directions.ndim == 4:  # (B, H, W, 3)
+        assert c2w.ndim == 3  # (B, 4, 4)
+        rays_d = (directions[:, :, :, None, :] * c2w[:, None, None, :3, :3]).sum(
+            -1
+        )  # (B, H, W, 3)
+        rays_o = c2w[:, None, None, :3, 3].expand(rays_d.shape)
+
+    if normalize:
+        rays_d = F.normalize(rays_d, dim=-1)
+    if not keepdim:
+        rays_o, rays_d = rays_o.reshape(-1, 3), rays_d.reshape(-1, 3)
+
+    return rays_o, rays_d
+
+
+def get_spherical_cameras(
+    n_views: int,
+    elevation_deg: float,
+    camera_distance: float,
+    fovy_deg: float,
+    height: int,
+    width: int,
+):
+    # Use 0 to 360*(n_views-1)/n_views so the first and last positions do not
+    # coincide, while still covering the full 360 degrees
+    azimuth_deg = torch.linspace(0, 360.0 * (n_views - 1) / n_views, n_views)
+    elevation_deg = torch.full_like(azimuth_deg, elevation_deg)
+    camera_distances = torch.full_like(elevation_deg, camera_distance)
+
+    elevation = elevation_deg * math.pi / 180
+    azimuth = azimuth_deg * math.pi / 180
+
+    # convert spherical coordinates to cartesian coordinates
+    # right hand coordinate system, x back, y right, z up
+    # elevation in (-90, 90), azimuth from +x to +y in (-180, 180)
+    camera_positions = torch.stack(
+        [
+            camera_distances * torch.cos(elevation) * torch.cos(azimuth),
+            camera_distances * torch.cos(elevation) * torch.sin(azimuth),
+            camera_distances * torch.sin(elevation),
+        ],
+        dim=-1,
+    )
+
+    # default scene center at origin
+    center = torch.zeros_like(camera_positions)
+    # default camera up direction as +z
+    up = torch.as_tensor([0, 0, 1], dtype=torch.float32)[None, :].repeat(n_views, 1)
+
+    fovy = torch.full_like(elevation_deg, fovy_deg) * math.pi / 180
+
+    lookat = F.normalize(center - camera_positions, dim=-1)
+    # pass dim=-1 explicitly: torch.cross would otherwise pick the first size-3 dim
+    right = F.normalize(torch.cross(lookat, up, dim=-1), dim=-1)
+    up = F.normalize(torch.cross(right, lookat, dim=-1), dim=-1)
+    c2w3x4 = torch.cat(
+        [torch.stack([right, up, -lookat], dim=-1), camera_positions[:, :, None]],
+        dim=-1,
+    )
+    c2w = torch.cat([c2w3x4, torch.zeros_like(c2w3x4[:, :1])], dim=1)
+    c2w[:, 3, 3] = 1.0
+
+    # get directions by dividing directions_unit_focal by focal length
+    focal_length = 0.5 * height / torch.tan(0.5 * fovy)
+    directions_unit_focal = get_ray_directions(
+        H=height,
+        W=width,
+        focal=1.0,
+    )
+    directions = directions_unit_focal[None, :, :, :].repeat(n_views, 1, 1, 1)
+    directions[:, :, :, :2] = (
+        directions[:, :, :, :2] / focal_length[:, None, None, None]
+    )
+    # must use normalize=True to normalize directions here
+    rays_o, rays_d = get_rays(directions, c2w, keepdim=True, normalize=True)
+
+    return rays_o, rays_d
+
+
+def remove_background(
+    image: PIL.Image.Image,
+    rembg_session: Any = None,
+    force: bool = False,
+    **rembg_kwargs,
+) -> PIL.Image.Image:
+    do_remove = True
+    if image.mode == "RGBA" and image.getextrema()[3][0] < 255:
+        # the image already has an alpha channel with transparency; skip removal
+        do_remove = False
+    do_remove = do_remove or force
+    if do_remove:
+        image = rembg.remove(image, session=rembg_session, **rembg_kwargs)
+    return image
+
+
+def resize_foreground(
+    image: PIL.Image.Image,
+    ratio: float,
+) -> PIL.Image.Image:
+    image = np.array(image)
+    assert image.shape[-1] == 4
+    alpha = np.where(image[..., 3] > 0)
+    y1, y2, x1, x2 = (
+        alpha[0].min(),
+        alpha[0].max(),
+        alpha[1].min(),
+        alpha[1].max(),
+    )
+    # crop the foreground (bounds are inclusive, hence the +1)
+    fg = image[y1 : y2 + 1, x1 : x2 + 1]
+    # pad to square
+    size = max(fg.shape[0], fg.shape[1])
+    ph0, pw0 = (size - fg.shape[0]) // 2, (size - fg.shape[1]) // 2
+    ph1, pw1 = size - fg.shape[0] - ph0, size - fg.shape[1] - pw0
+    new_image = np.pad(
+        fg,
+        ((ph0, ph1), (pw0, pw1), (0, 0)),
+        mode="constant",
+        constant_values=((0, 0), (0, 0), (0, 0)),
+    )
+
+    # compute padding according to the ratio
+    new_size = int(new_image.shape[0] / ratio)
+    # pad to new_size, evenly on both sides
+    ph0, pw0 = (new_size - size) // 2, (new_size - size) // 2
+    ph1, pw1 = new_size - size - ph0, new_size - size - pw0
+    new_image = np.pad(
+        new_image,
+        ((ph0, ph1), (pw0, pw1), (0, 0)),
+        mode="constant",
+        constant_values=((0, 0), (0, 0), (0, 0)),
+    )
+    new_image = PIL.Image.fromarray(new_image)
+    return new_image
+
+
+def save_video(
+    frames: List[PIL.Image.Image],
+    output_path: str,
+    fps: int = 30,
+):
+    # use imageio to save video
+    frames = [np.array(frame) for frame in frames]
+    writer = imageio.get_writer(output_path, fps=fps)
+    for frame in frames:
+        writer.append_data(frame)
+    writer.close()
+
+
+def to_gradio_3d_orientation(mesh):
+    # rotate so the mesh displays upright in the Gradio 3D viewer
+    mesh.apply_transform(trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0]))
+    mesh.apply_transform(trimesh.transformations.rotation_matrix(np.pi / 2, [0, 1, 0]))
+    return mesh
+
+
+def to_standard_3d_orientation(mesh):
+    """
+    Convert mesh to the standard 3D viewer orientation (Y-up, Z-forward),
+    which works well with most 3D viewers.
+    """
+    # Rotate -90 degrees around the X axis (to make Y up instead of Z)
+    mesh.apply_transform(trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0]))
+    return mesh
+
+
+def apply_mesh_orientation(mesh, orientation="standard"):
+    """
+    Apply an orientation transformation to a mesh.
+
+    Args:
+        mesh: Trimesh mesh object
+        orientation: Orientation type
+            - "standard": Standard 3D viewer orientation (Y-up, Z-forward)
+            - "gradio": Gradio 3D viewer orientation
+            - "none": No transformation (original orientation)
+
+    Returns:
+        Transformed mesh
+    """
+    if orientation == "standard":
+        return to_standard_3d_orientation(mesh)
+    elif orientation == "gradio":
+        return to_gradio_3d_orientation(mesh)
+    elif orientation == "none":
+        return mesh
+    else:
+        raise ValueError(f"Unknown orientation: {orientation}. Must be 'standard', 'gradio', or 'none'")