import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig

fixed_llm_name = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"

model = AutoModelForCausalLM.from_pretrained(
    fixed_llm_name,
    device_map="auto",        # automatically spreads the model across available CPU/GPU memory
    trust_remote_code=False,  # prevents running custom model code from the repo on your machine
    revision="main",          # which revision of the model repo to use
)

tokenizer = AutoTokenizer.from_pretrained(fixed_llm_name, use_fast=True)

# Attach the fine-tuned LoRA adapter on top of the quantized base model
config = PeftConfig.from_pretrained("chihhuiho/VirtualTA")
model = PeftModel.from_pretrained(model, "chihhuiho/VirtualTA")

instructions_string = (
    "Assume you are a virtual teaching assistant in the statistical and machine "
    "learning course. Your job is to communicate with students, answer technical "
    "questions and help the student to solve the problem. Please respond to the "
    "following post from a student."
)

prompt_template = lambda comment: f"[INST] {instructions_string} {comment} [/INST]"


def predict(comment):
    prompt = prompt_template(comment)
    model.eval()  # evaluation mode (deactivates dropout modules)

    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt")

    # Generate the response (assumes a CUDA GPU is available)
    outputs = model.generate(
        input_ids=inputs["input_ids"].to("cuda"),
        max_new_tokens=400,
        pad_token_id=tokenizer.eos_token_id,
    )
    outputs = tokenizer.batch_decode(outputs)[0]

    # Keep only the model's reply: drop everything up to [/INST], any
    # follow-on [INST] turns, and the trailing </s> end-of-sequence token
    outputs = outputs.split("[/INST]")[1]
    outputs = outputs.split("[INST]")[0]
    outputs = outputs.split("</s>")[0]
    return outputs.strip()


iface = gr.Interface(fn=predict, inputs="text", outputs="text")
iface.launch()
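
# --------------------------------------------------------------------------
# Optional sanity check, run from a *separate* terminal/process while the app
# above is serving. This is a minimal sketch, assuming the app is up at
# Gradio's default URL (http://127.0.0.1:7860) and the gradio_client package
# is installed (pip install gradio_client); the student question below is a
# made-up example.
#
# from gradio_client import Client
#
# client = Client("http://127.0.0.1:7860/")
# answer = client.predict(
#     "What is the difference between L1 and L2 regularization?",  # the single text input
#     api_name="/predict",  # default endpoint name for a gr.Interface
# )
# print(answer)
# --------------------------------------------------------------------------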