Deepseek-R1-1.5b-API

Running

App Files Community

Echo-ai committed on Feb 3

Commit

f9e8a03

verified ·

1 Parent(s): b839d79

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -100

app.py CHANGED Viewed

@@ -1,18 +1,15 @@
 import os
 import requests
-import time
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import StreamingResponse, HTMLResponse
 from llama_cpp import Llama
 from pydantic import BaseModel
 import uvicorn
-from typing import Generator
-import threading
 # Configuration
-MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"  # Changed to Q4 for faster inference
-MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"
 MODEL_DIR = "model"
 MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)
@@ -36,8 +33,8 @@ else:
 # Initialize FastAPI
 app = FastAPI(
     title="DeepSeek-R1 OpenAI-Compatible API",
-    description="Optimized OpenAI-compatible API with streaming support",
-    version="2.0.0"
 )
 # CORS Configuration
@@ -48,68 +45,36 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# Global model loader with optimized settings
-print("Loading model with optimized settings...")
 try:
     llm = Llama(
         model_path=MODEL_PATH,
-        n_ctx=1024,  # Reduced context window for faster processing
-        n_threads=8,  # Increased threads for better CPU utilization
-        n_batch=512,  # Larger batch size for improved throughput
         n_gpu_layers=0,
-        use_mlock=True,  # Prevent swapping to disk
         verbose=False
     )
-    print("Model loaded with optimized settings!")
 except Exception as e:
     raise RuntimeError(f"Failed to load model: {str(e)}")
-# Streaming generator
-def generate_stream(prompt: str, max_tokens: int, temperature: float, top_p: float) -> Generator[str, None, None]:
-    start_time = time.time()
-    stream = llm.create_completion(
-        prompt=prompt,
-        max_tokens=max_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        stop=["</s>"],
-        stream=True
-    )
-    for chunk in stream:
-        delta = chunk['choices'][0]['text']
-        yield f"data: {delta}\n\n"
-        # Early stopping if taking too long
-        if time.time() - start_time > 30:  # 30s timeout
-            break
-# OpenAI-Compatible Request Schema
-class ChatCompletionRequest(BaseModel):
-    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
-    messages: list[dict]
-    max_tokens: int = 256
-    temperature: float = 0.7
-    top_p: float = 0.9
-    stream: bool = False
-# Enhanced root endpoint with performance info
 @app.get("/", response_class=HTMLResponse)
 async def root():
     return f"""
     <html>
         <head>
-            <title>DeepSeek-R1 Optimized API</title>
             <style>
                 body {{ font-family: Arial, sans-serif; max-width: 800px; margin: 20px auto; padding: 0 20px; }}
                 .warning {{ color: #dc3545; background: #ffeef0; padding: 15px; border-radius: 5px; }}
-                .info {{ color: #0c5460; background: #d1ecf1; padding: 15px; border-radius: 5px; }}
                 a {{ color: #007bff; text-decoration: none; }}
                 code {{ background: #f8f9fa; padding: 2px 4px; border-radius: 4px; }}
             </style>
         </head>
         <body>
-            <h1>DeepSeek-R1 Optimized API</h1>
             <div class="warning">
                 <h3>⚠️ Important Notice</h3>
@@ -119,29 +84,29 @@ async def root():
                 3. Set visibility to Private</p>
             </div>
-            <div class="info">
-                <h3>⚡ Performance Optimizations</h3>
-                <ul>
-                    <li>Quantization: Q4_K_M (optimized speed/quality balance)</li>
-                    <li>Batch processing: 512 tokens/chunk</li>
-                    <li>Streaming support with 30s timeout</li>
-                    <li>8 CPU threads utilization</li>
-                </ul>
-            </div>
             <h2>API Documentation</h2>
             <ul>
                 <li><a href="/docs">Interactive Swagger Documentation</a></li>
                 <li><a href="/redoc">ReDoc Documentation</a></li>
             </ul>
-            <h2>Example Streaming Request</h2>
             <pre>
-curl -N -X POST "{os.environ.get('SPACE_HOST', 'http://localhost:7860')}/v1/chat/completions" \\
 -H "Content-Type: application/json" \\
 -d '{{
   "messages": [{{"role": "user", "content": "Explain quantum computing"}}],
-  "stream": true,
   "max_tokens": 150
 }}'
             </pre>
@@ -149,26 +114,30 @@ curl -N -X POST "{os.environ.get('SPACE_HOST', 'http://localhost:7860')}/v1/chat
     </html>
     """
-# Async endpoint handler
 @app.post("/v1/chat/completions")
 async def chat_completion(request: ChatCompletionRequest):
     try:
         prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in request.messages])
         prompt += "\nassistant:"
-        if request.stream:
-            return StreamingResponse(
-                generate_stream(
-                    prompt=prompt,
-                    max_tokens=request.max_tokens,
-                    temperature=request.temperature,
-                    top_p=request.top_p
-                ),
-                media_type="text/event-stream"
-            )
-        # Non-streaming response
-        start_time = time.time()
         response = llm(
             prompt=prompt,
             max_tokens=request.max_tokens,
@@ -177,12 +146,8 @@ async def chat_completion(request: ChatCompletionRequest):
             stop=["</s>"]
         )
-        return {
-            "id": f"chatcmpl-{int(time.time())}",
-            "object": "chat.completion",
-            "created": int(time.time()),
-            "model": request.model,
-            "choices": [{
                 "index": 0,
                 "message": {
                     "role": "assistant",
@@ -190,32 +155,18 @@ async def chat_completion(request: ChatCompletionRequest):
                 },
                 "finish_reason": "stop"
             }],
-            "usage": {
                 "prompt_tokens": len(prompt),
                 "completion_tokens": len(response['choices'][0]['text']),
                 "total_tokens": len(prompt) + len(response['choices'][0]['text'])
             }
-        }
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 @app.get("/health")
-async def health_check():
-    return {
-        "status": "healthy",
-        "model_loaded": True,
-        "performance_settings": {
-            "n_threads": llm.params.n_threads,
-            "n_ctx": llm.params.n_ctx,
-            "n_batch": llm.params.n_batch
-        }
-    }
 if __name__ == "__main__":
-    uvicorn.run(
-        app,
-        host="0.0.0.0",
-        port=7860,
-        timeout_keep_alive=300  # Keep alive for streaming connections
-    )

 import os
 import requests
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import HTMLResponse
 from llama_cpp import Llama
 from pydantic import BaseModel
 import uvicorn
 # Configuration
+MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
+MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
 MODEL_DIR = "model"
 MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)
 # Initialize FastAPI
 app = FastAPI(
     title="DeepSeek-R1 OpenAI-Compatible API",
+    description="OpenAI-compatible API for DeepSeek-R1-Distill-Qwen-1.5B",
+    version="1.0.0"
 )
 # CORS Configuration
     allow_headers=["*"],
 )
+# Load the model
+print("Loading model...")
 try:
     llm = Llama(
         model_path=MODEL_PATH,
+        n_ctx=2048,
+        n_threads=4,
         n_gpu_layers=0,
         verbose=False
     )
+    print("Model loaded successfully!")
 except Exception as e:
     raise RuntimeError(f"Failed to load model: {str(e)}")
+# Root endpoint with documentation
 @app.get("/", response_class=HTMLResponse)
 async def root():
     return f"""
     <html>
         <head>
+            <title>DeepSeek-R1 OpenAI API</title>
             <style>
                 body {{ font-family: Arial, sans-serif; max-width: 800px; margin: 20px auto; padding: 0 20px; }}
                 .warning {{ color: #dc3545; background: #ffeef0; padding: 15px; border-radius: 5px; }}
                 a {{ color: #007bff; text-decoration: none; }}
                 code {{ background: #f8f9fa; padding: 2px 4px; border-radius: 4px; }}
             </style>
         </head>
         <body>
+            <h1>DeepSeek-R1 OpenAI-Compatible API</h1>
             <div class="warning">
                 <h3>⚠️ Important Notice</h3>
                 3. Set visibility to Private</p>
             </div>
             <h2>API Documentation</h2>
             <ul>
                 <li><a href="/docs">Interactive Swagger Documentation</a></li>
                 <li><a href="/redoc">ReDoc Documentation</a></li>
             </ul>
+            <h2>API Endpoints</h2>
+            <h3>Chat Completion</h3>
+            <p><code>POST /v1/chat/completions</code></p>
+            <p>Parameters:</p>
+            <ul>
+                <li><strong>messages</strong>: List of message objects</li>
+                <li><strong>max_tokens</strong>: Maximum response length (default: 128)</li>
+                <li><strong>temperature</strong>: Sampling temperature (default: 0.7)</li>
+                <li><strong>top_p</strong>: Nucleus sampling threshold (default: 0.9)</li>
+            </ul>
+            <h2>Example Request</h2>
             <pre>
+curl -X POST "{os.environ.get('SPACE_HOST', 'http://localhost:7860')}/v1/chat/completions" \\
 -H "Content-Type: application/json" \\
 -d '{{
   "messages": [{{"role": "user", "content": "Explain quantum computing"}}],
   "max_tokens": 150
 }}'
             </pre>
     </html>
     """
+# OpenAI-Compatible Request Schema
+class ChatCompletionRequest(BaseModel):
+    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
+    messages: list[dict]
+    max_tokens: int = 128
+    temperature: float = 0.7
+    top_p: float = 0.9
+    stream: bool = False
+# OpenAI-Compatible Response Schema
+class ChatCompletionResponse(BaseModel):
+    id: str = "chatcmpl-12345"
+    object: str = "chat.completion"
+    created: int = 1693161600
+    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
+    choices: list[dict]
+    usage: dict
 @app.post("/v1/chat/completions")
 async def chat_completion(request: ChatCompletionRequest):
     try:
         prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in request.messages])
         prompt += "\nassistant:"
         response = llm(
             prompt=prompt,
             max_tokens=request.max_tokens,
             stop=["</s>"]
         )
+        return ChatCompletionResponse(
+            choices=[{
                 "index": 0,
                 "message": {
                     "role": "assistant",
                 },
                 "finish_reason": "stop"
             }],
+            usage={
                 "prompt_tokens": len(prompt),
                 "completion_tokens": len(response['choices'][0]['text']),
                 "total_tokens": len(prompt) + len(response['choices'][0]['text'])
             }
+        )
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 @app.get("/health")
+def health_check():
+    return {"status": "healthy"}
 if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)