## Due to a small bug when installing exllamav2 directly from the dev branch, the CUDA toolkit paths must be configured first
import cuda_bug
cuda_bug.install_cuda_toolkit_requirements()
##

import gradio as gr
from gradio.data_classes import FileData
from huggingface_hub import snapshot_download
from pathlib import Path
import base64
import spaces
import os
import sys
import torch

from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Tokenizer,
    ExLlamaV2VisionTower,
)

from exllamav2.generator import (
    ExLlamaV2DynamicGenerator,
    ExLlamaV2Sampler,
)

from PIL import Image
import requests
from tqdm import tqdm
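
# Defaults for the UI controls and the list of EXL2 quantizations offered in the dropdown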
default_max_context = 16384
default_max_output = 512
default_bpw = "4.0bpw"

available_models = [
    "2.5bpw",
    "3.0bpw",
    "3.5bpw",
    "4.0bpw",
    "4.5bpw",
    "5.0bpw",
    "6.0bpw",
    "8.0bpw"
]
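
# Pre-fetch every quant revision at startup so switching quants in the UI never waits on a download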
dirs = {}
for model in tqdm(available_models):
    dirs.update({model: snapshot_download(repo_id="turboderp/pixtral-12b-exl2", revision=model)})
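
# Assumption: this Space targets ZeroGPU, so the handler needs the `spaces.GPU` decorator to be
# allocated a GPU per request (the `import spaces` above is otherwise unused)
@spaces.GPU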
def run_inference(message, history, model_picked, context_size, max_output):
    if not model_picked:
        model_picked = default_bpw
    if not context_size:
        context_size = default_max_context
    if not max_output:
        max_output = default_max_output
    local_dir = dirs[model_picked]

    # Load the vision tower, model, cache and tokenizer only once a GPU is available
    config = ExLlamaV2Config(local_dir)
    config.max_seq_len = context_size

    vision_model = ExLlamaV2VisionTower(config)
    vision_model.load(progress=True)

    model = ExLlamaV2(config)
    cache = ExLlamaV2Cache(model, lazy=True, max_seq_len=context_size)
    model.load_autosplit(cache, progress=True)
    tokenizer = ExLlamaV2Tokenizer(config)

    generator = ExLlamaV2DynamicGenerator(
        model=model,
        cache=cache,
        tokenizer=tokenizer
    )
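
    # Note: the vision tower, model, cache and generator are rebuilt on every request;
    # nothing stays resident on the GPU between calls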

    # Build the prompt from the chat history
    prompt = ""
    image_prompt = ""
    images_embeddings = []
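    # Each image is registered with the vision tower under a {{IMAGE_n}} alias placed in the prompt
    # text, so exllamav2 can substitute the corresponding embeddings at generation time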
    for couple in history:
        if type(couple[0]) is tuple:
            images_embeddings += [
                vision_model.get_image_embeddings(
                    model=model,
                    tokenizer=tokenizer,
                    image=img,
                    text_alias=alias,
                )
                for (alias, img) in [("{{IMAGE_" + str(len(images_embeddings) + i + 1) + "}}", Image.open(path)) for i, path in enumerate(couple[0])]
            ]
            image_prompt = ""
            for i in range(len(couple[0])):
                image_prompt += "{{IMAGE_" + str(len(images_embeddings) - len(couple[0]) + i + 1) + "}}"
        elif couple[0]:
            prompt += "[INST]" + image_prompt + couple[0] + "[/INST]"
            prompt += couple[1] + "</s>"
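
    # Append the current user turn (text plus any newly attached images)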
    if type(message) is dict:
        images_embeddings += [
            vision_model.get_image_embeddings(
                model=model,
                tokenizer=tokenizer,
                image=img,
                text_alias=alias,
            )
            for (alias, img) in [("{{IMAGE_" + str(len(images_embeddings) + i + 1) + "}}", Image.open(path['path'] if type(path) is dict else path)) for i, path in enumerate(message['files'])]
        ]
        image_prompt = ""
        for i in range(len(message['files'])):
            image_prompt += "{{IMAGE_" + str(len(images_embeddings) - len(message['files']) + i + 1) + "}}"
        prompt += "[INST]" + image_prompt + message["text"] + "[/INST]"
    else:
        prompt += "[INST]" + image_prompt + message + "[/INST]"

    print(prompt)

    # Generating Response
    output = generator.generate(
        prompt=prompt,
        max_new_tokens=max_output,
        add_bos=True,
        encode_special_tokens=True,
        decode_special_tokens=True,
        stop_conditions=[tokenizer.eos_token_id],
        gen_settings=ExLlamaV2Sampler.Settings.greedy(),
        embeddings=images_embeddings
    )
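    # The generator returns the prompt followed by the completion here, so keep only the text
    # after the final [/INST] marker (the assistant's reply)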
    result = output.split("[/INST]")[-1]
    print(result)
    return result
| description="""A demo chat interface with Pixtral 12B EXL2 Quants, deployed using **ExllamaV2**! | |
| The model will be loaded once the GPU is available. This space specifically will load by default Pixtral at 4bpw from the following repository: [turboderp/pixtral-12b-exl2](https://huggingface.co/turboderp/pixtral-12b-exl2). Other quantization options are available. | |
| The current version of ExllamaV2 running is the dev branch, not the master branch: [ExllamaV2](https://github.com/turboderp/exllamav2/tree/dev). | |
| The model at **4bpw and 16k context size fits in less than 12GB of VRAM**, and at **2.5bpw and short context can potentially fit in 8GB of VRAM**! | |
| The current default settings are: | |
| - Model Quant: 4.0bpw | |
| - Context Size: 16k tokens | |
| - Max Output: 512 tokens | |
| You can select other quants and experiment! | |
| Thanks, turboderp!""" | |

examples = [
    [
        {"text": "What are the similarities and differences between these two experiments?", "files": ["test_image_1.jpg", "test_image_2.jpg"]},
    ]
]

drop = gr.Dropdown(available_models, label="EXL2 Quant", value=default_bpw)
context_size_gradio = gr.Slider(minimum=256, maximum=32768, label="Context Size", value=default_max_context, step=1)
output_length_gradio = gr.Slider(minimum=1, maximum=4096, label="Max Output Length", value=default_max_output, step=1)

demo = gr.ChatInterface(
    fn=run_inference,
    examples=examples,
    title="Pixtral 12B EXL2",
    multimodal=True,
    description=description,
    additional_inputs=[drop, context_size_gradio, output_length_gradio],
)
demo.queue().launch()