# LLMhistory / generators.py
import json
import os

import aiohttp
import spaces
import torch
from huggingface_hub import AsyncInferenceClient
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LlamaForCausalLM,
    LlamaTokenizer,
)


async def query_llm(payload, model_name):
    """POST a raw payload to the serverless Inference API and return the parsed JSON."""
    headers = {"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"}
    async with aiohttp.ClientSession() as session:
        async with session.post(
            f"https://api-inference.huggingface.co/models/{model_name}",
            headers=headers,
            json=payload,
        ) as response:
            return await response.json()


async def generate_mistral_7bvo1(system_input, user_input):
    """Stream a chat completion from Mistral-7B-Instruct-v0.1."""
    client = AsyncInferenceClient(
        "mistralai/Mistral-7B-Instruct-v0.1",
        token=os.getenv('HF_TOKEN'),
    )
    async for message in await client.chat_completion(
        messages=[
            {"role": "system", "content": system_input},
            {"role": "user", "content": user_input},
        ],
        max_tokens=256,
        stream=True,
    ):
        yield message.choices[0].delta.content


async def generate_gpt2(system_input, user_input):
    """Generate a completion with GPT-2 via the Inference API (no chat template)."""
    prompt = f"{system_input}\n{user_input}"
    output = await query_llm({"inputs": prompt}, "openai-community/gpt2")
    # The API echoes the prompt, so strip it from the returned text.
    yield output[0]["generated_text"].replace(prompt, '')


async def generate_llama2(system_input, user_input):
    """Stream a chat completion from Llama-2-7b-chat."""
    client = AsyncInferenceClient(
        "meta-llama/Llama-2-7b-chat-hf",
        token=os.getenv('HF_TOKEN'),
    )
    async for message in await client.chat_completion(
        messages=[
            {"role": "system", "content": system_input},
            {"role": "user", "content": user_input},
        ],
        max_tokens=256,
        stream=True,
    ):
        yield message.choices[0].delta.content


@spaces.GPU(duration=120)
async def generate_openllama(system_input, user_input):
    """Load OpenLLaMA-3B locally on the GPU worker and return a single completion."""
    model_path = 'openlm-research/open_llama_3b_v2'
    tokenizer = LlamaTokenizer.from_pretrained(model_path)
    model = LlamaForCausalLM.from_pretrained(
        model_path, torch_dtype=torch.float16, device_map='cuda',
    )
    input_text = f"{system_input}\n{user_input}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
    output = model.generate(input_ids, max_length=128)
    return tokenizer.decode(output[0], skip_special_tokens=True)


@spaces.GPU(duration=120)
async def generate_bloom(system_input, user_input):
    """Load BLOOM-7B1 locally on the GPU worker and return a single completion."""
    model_path = 'bigscience/bloom-7b1'
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # BLOOM is not a Llama architecture, so load it with AutoModelForCausalLM.
    model = AutoModelForCausalLM.from_pretrained(
        model_path, torch_dtype=torch.float16, device_map='cuda',
    )
    input_text = f"{system_input}\n{user_input}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
    output = model.generate(input_ids, max_length=128)
    return tokenizer.decode(output[0], skip_special_tokens=True)


async def generate_llama3(system_input, user_input):
    """Stream a chat completion from Llama-3.1-8B-Instruct."""
    client = AsyncInferenceClient(
        "meta-llama/Meta-Llama-3.1-8B-Instruct",
        token=os.getenv('HF_TOKEN'),
    )
    try:
        async for message in await client.chat_completion(
            messages=[
                {"role": "system", "content": system_input},
                {"role": "user", "content": user_input},
            ],
            max_tokens=256,
            stream=True,
        ):
            yield message.choices[0].delta.content
    except json.JSONDecodeError:
        # The streaming endpoint occasionally ends with a malformed chunk; ignore it.
        pass
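

# --- Usage sketch (not part of the original Space) ---------------------------
# A minimal illustration of how these streaming generators might be consumed
# from an asyncio entry point. It assumes HF_TOKEN is set in the environment;
# the prompts below are hypothetical placeholders, not values from the Space.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        # Stream tokens from one of the hosted models and print them as they arrive.
        async for chunk in generate_mistral_7bvo1(
            "You are a helpful assistant.",
            "Say hello in one sentence.",
        ):
            if chunk:
                print(chunk, end="", flush=True)
        print()

    asyncio.run(_demo())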