import threading

import torch
import gradio as gr
from transformers import TextIteratorStreamer, AutoProcessor, LlavaForConditionalGeneration
from PIL import Image
import spaces
import accelerate

DESCRIPTION = '''

Krypton 🕋

This uses an Open Source model from xtuner/llava-llama-3-8b-v1_1-transformers

'''

model_id = "xtuner/llava-llama-3-8b-v1_1-transformers"

# Load the weights in half precision and move the model to the GPU once,
# at import time, so every chat request reuses the same instance.
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
).to('cuda')

processor = AutoProcessor.from_pretrained(model_id)

# NOTE(review): 128009 is presumably the id of the <|eot_id|> token that
# closes each turn in the prompt template below -- confirm against the
# tokenizer. Using it as EOS makes generation stop at the end of the reply.
model.generation_config.eos_token_id = 128009


def _latest_image(message, history):
    """Return the most recent image reference for this conversation, or None.

    Files attached to the current message take priority. Otherwise the
    history is scanned for user turns stored as tuples (file uploads) and
    the last one found is reused, so follow-up questions keep referring to
    the previously uploaded image.
    """
    files = message["files"]
    if files:
        newest = files[-1]
        # Depending on the Gradio version, a file is either a dict carrying
        # a "path" entry or the path itself.
        return newest["path"] if isinstance(newest, dict) else newest

    image = None
    for turn in history:
        if isinstance(turn[0], tuple):
            image = turn[0][0]
    return image


def _to_pil(image):
    """Convert an image reference (path, array or PIL image) to an RGB PIL image."""
    if isinstance(image, Image.Image):
        return image.convert("RGB")
    if hasattr(image, "astype"):
        # Array-like input (e.g. a numpy array of pixel values).
        return Image.fromarray(image.astype("uint8"), "RGB")
    # Gradio hands uploaded files over as filesystem paths.
    return Image.open(image).convert("RGB")


@spaces.GPU(duration=120)
def krypton(input, history):
    """Answer a multimodal chat message with the LLaVA model.

    Args:
        input: Message dict from ``gr.MultimodalTextbox``; reads ``"text"``
            (the user's question) and ``"files"`` (uploaded images).
        history: Previous chat turns; searched for an earlier image when the
            current message has none attached.

    Returns:
        The text generated by the model for this turn (prompt excluded).

    Raises:
        gr.Error: If no image is available in the message or the history.
    """
    image = _latest_image(input, history)
    if image is None:
        # gr.Error must be raised (not merely constructed) to reach the UI.
        raise gr.Error("You need to upload an image please for krypton to work.")

    pil_image = _to_pil(image)

    # Llama-3 chat template; "<image>" marks where the processor/model
    # inserts the image features.
    prompt = (
        f"<|start_header_id|>user<|end_header_id|>\n\n<image>\n{input['text']}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )

    # Keyword arguments avoid depending on the processor's positional order.
    inputs = processor(text=prompt, images=pil_image, return_tensors='pt').to('cuda', torch.float16)
    outputs = model.generate(**inputs, max_new_tokens=200, do_sample=False)

    # generate() returns the prompt tokens followed by the new tokens; decode
    # only the newly generated part so the question is not echoed back.
    prompt_length = inputs["input_ids"].shape[1]
    output_text = processor.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    print(output_text)
    return output_text


chatbot = gr.Chatbot(height=600, label="Krypt AI")
chat_input = gr.MultimodalTextbox(
    interactive=True,
    file_types=["image"],
    placeholder="Enter your question or upload an image.",
    show_label=False,
)

with gr.Blocks(fill_height=True) as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        fn=krypton,
        chatbot=chatbot,
        fill_height=True,
        multimodal=True,
        textbox=chat_input,
    )

if __name__ == "__main__":
    demo.launch()