import os

import gradio as gr
from ctransformers import AutoModelForCausalLM


# Load the quantized Mistral 7B Instruct model (GGUF) via ctransformers.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
    model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    model_type="mistral",
    temperature=0.7,
    gpu_layers=0,                 # 0 = run entirely on CPU
    stream=True,                  # yield tokens incrementally instead of one string
    threads=os.cpu_count() or 1,  # use all CPU cores; fall back to 1 if unknown
    max_new_tokens=10000,
)
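
# Note: on first run, from_pretrained() downloads the GGUF weights from the
# Hugging Face Hub, so startup can take a while. A quick smoke test of the raw
# model (a minimal sketch; with stream=True the call returns a generator of
# text chunks, so join them into one string):
#   print("".join(llm("[INST] Say hello. [/INST]")))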


# Generate streaming model predictions for the chat interface.
def predict(message, history):
    # history arrives as a list of [user, assistant] pairs; add the new user turn.
    history_transformer_format = history + [[message, ""]]

    # Flatten the conversation into the turn-marker format used by this demo.
    messages = "</s>".join(
        "\n<|user|>:" + user + "</s>\n<|assistant|>:" + assistant
        for user, assistant in history_transformer_format
    )

    prompt = f"[INST]{messages}[/INST]"
    message_out = ""
    for text in llm(prompt=prompt):  # stream=True makes this a generator of chunks
        message_out += text
        yield message_out  # Gradio re-renders the partial response as it grows
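
# Sanity-check the streaming generator without the UI (a sketch; the empty
# list stands in for Gradio's history of [user, assistant] pairs):
#   chunks = list(predict("What is the capital of France?", []))
#   print(chunks[-1])  # the last yield holds the full accumulated reply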

# Set up and launch the Gradio chat interface.
gr.ChatInterface(
    predict,
    title="Test Mistral 7B",
    description="Ask Mistral any question",
    examples=["How do I cook a fish?", "Who is the president of the US now?"],
).launch()
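
# To try it out (a sketch of the expected setup; the script name app.py is an
# assumption, package names are as published on PyPI):
#   pip install gradio ctransformers
#   python app.py
# Then open the local URL Gradio prints (http://127.0.0.1:7860 by default).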