import gradio as gr

# Load the hosted Meta-Llama-3-8B model from the Hugging Face Hub (served via
# the Inference API) and launch a default Gradio demo for it.
gr.load("models/meta-llama/Meta-Llama-3-8B").launch()
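
# A minimal sketch of the same call with authentication, assuming the access
# token is exposed as the environment variable HF_TOKEN (the gated Llama-3
# weights require an accepted license and a valid token):
#
# import os
# gr.load("models/meta-llama/Meta-Llama-3-8B", hf_token=os.environ["HF_TOKEN"]).launch()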
# import transformers
# import torch
# import os
# import streamlit as st  # needed for st.secrets below
# os.environ["HF_TOKEN"] = st.secrets["HF_TOKEN"]
# os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["HF_TOKEN"]
# # os.environ["USE_FLASH_ATTENTION"] = "1"
# print(f"Device name: {torch.cuda.get_device_properties('cuda').name}")
# print(f"FlashAttention available: {torch.backends.cuda.flash_sdp_enabled()}")
# print(f"torch version: {torch.__version__}")
# # Earlier attempt with the base (non-instruct) model:
# # model_id = "meta-llama/Meta-Llama-3-8B"
# # pipeline = transformers.pipeline(
# #     "text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto"
# # )
# # pipeline("Hey how are you doing today?")
# model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# pipeline = transformers.pipeline(
# "text-generation",
# model=model_id,
# model_kwargs={"torch_dtype": torch.bfloat16},
# device_map="auto",
# )
# messages = [
# {
# "role": "system",
# "content": "You are a pirate chatbot who always responds in pirate speak!",
# },
# {"role": "user", "content": "Who are you?"},
# ]
# prompt = pipeline.tokenizer.apply_chat_template(
# messages, tokenize=False, add_generation_prompt=True
# )
# terminators = [
# pipeline.tokenizer.eos_token_id,
# pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
# ]
# outputs = pipeline(
# prompt,
# max_new_tokens=256,
# eos_token_id=terminators,
# do_sample=True,
# temperature=0.6,
# top_p=0.9,
# )
# print(outputs[0]["generated_text"][len(prompt) :])
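#
# A minimal sketch of serving the local pipeline through a Gradio chat UI
# instead of gr.load(); chat_fn and its single-turn wiring are illustrative
# additions, not part of the original experiment:
#
# def chat_fn(message, history):
#     msgs = [{"role": "user", "content": message}]
#     p = pipeline.tokenizer.apply_chat_template(
#         msgs, tokenize=False, add_generation_prompt=True
#     )
#     out = pipeline(p, max_new_tokens=256, eos_token_id=terminators)
#     return out[0]["generated_text"][len(p):]
#
# gr.ChatInterface(chat_fn).launch()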
# print("hello")