pszemraj's picture
✏️
7e75f34
raw
history blame contribute delete
No virus
7.68 kB
import gradio as gr
import torch
from gradio.themes.utils import sizes
from transformers import AutoModelForCausalLM, AutoTokenizer
import utils
from constants import END_OF_TEXT, MIN_TEMPERATURE
# Load the tokenizer and model.
# The checkpoint id is shared by both loaders, so it is spelled out once.
_CHECKPOINT = "BEE-spoke-data/smol_llama-101M-GQA-python"

tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT, use_fast=False)
# Padding setup, in the original order: the pad id is first pointed at the
# EOS id, then the pad token string is set from constants.END_OF_TEXT.
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = END_OF_TEXT

# Load the causal LM (placement delegated to device_map="auto") and hand the
# result straight to torch.compile; `model` is the compiled wrapper.
model = torch.compile(
    AutoModelForCausalLM.from_pretrained(_CHECKPOINT, device_map="auto"),
    mode="reduce-overhead",
)
# UI things
# Stylesheet text, later passed to gr.Blocks(css=...).
_styles = utils.get_file_as_string("styles.css")

# Read ./README.md and let utils.get_sections split it on "---" (up_to=5);
# the five returned sections are unpacked into the page-copy variables.
readme_file_content = utils.get_file_as_string("README.md", path="./")
manifest, description, disclaimer, base_model_info, formats = utils.get_sections(
    readme_file_content, "---", up_to=5
)

# Font preference list for the theme: a Google font first, then generic
# fallbacks.
_font_stack = [
    gr.themes.GoogleFont("IBM Plex Sans", [400, 600]),
    "ui-sans-serif",
    "system-ui",
    "sans-serif",
]

# Soft theme with yellow/orange accents on a slate neutral palette.
theme = gr.themes.Soft(
    font=_font_stack,
    neutral_hue="slate",
    primary_hue="yellow",
    radius_size=sizes.radius_sm,
    secondary_hue="orange",
    text_size=sizes.text_lg,
)
def run_inference(
    prompt, temperature, max_new_tokens, top_p, repetition_penalty
) -> str:
    """Sample a completion for ``prompt`` with the module-level model.

    Args:
        prompt: Text to tokenize and feed to ``model.generate``.
        temperature: Sampling temperature; values below ``MIN_TEMPERATURE``
            are raised to ``MIN_TEMPERATURE``.
        max_new_tokens: Forwarded to ``generate`` as ``max_new_tokens``.
        top_p: Forwarded to ``generate`` as ``top_p``.
        repetition_penalty: Forwarded to ``generate`` as
            ``repetition_penalty``.

    Returns:
        The first sequence returned by ``generate``, decoded with special
        tokens skipped.
    """
    # Tokenize to PyTorch tensors and move them next to the model.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Fixed sampling settings plus the caller-controlled knobs.
    sampling_options = dict(
        do_sample=True,
        epsilon_cutoff=1e-3,
        max_new_tokens=max_new_tokens,
        min_new_tokens=2,
        no_repeat_ngram_size=6,
        renormalize_logits=True,
        repetition_penalty=repetition_penalty,
        # Clamp from below so the UI's 0.0 setting never reaches generate().
        temperature=max(temperature, MIN_TEMPERATURE),
        top_p=top_p,
    )
    generated = model.generate(**encoded, **sampling_options)

    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]
# Example prompts shown in the UI. Every example uses the same generation
# settings, so only the prompt text varies per row.
_EXAMPLE_PROMPTS = (
    'def greet(name: str) -> None:\n """\n Greets the user\n """\n print(f"Hello,',
    'for i in range(5):\n """\n Loop through 0 to 4\n """\n print(i,',
    'x = 10\n"""Check if x is greater than 5"""\nif x > 5:',
    "def square(x: int) -> int:\n return",
    'import math\n"""Math operations"""\nmath.',
    'def is_even(n) -> bool:\n """\n Check if a number is even\n """\n if n % 2 == 0:',
    'while True:\n """Infinite loop example"""\n print("Infinite loop,',
    "def sum_list(lst: list[int]) -> int:\n total = 0\n for item in lst:",
    'try:\n """\n Exception handling\n """\n x = int(input("Enter a number: "))\nexcept ValueError:',
    'def divide(a: float, b: float) -> float:\n """\n Divide a by b\n """\n if b != 0:',
)

# Trailing columns of each row, in order:
# temperature, max new tokens, top-p, repetition penalty.
_EXAMPLE_SETTINGS = (0.2, 64, 0.9, 1.2)

# One fresh list per row: [prompt, temperature, max_new_tokens, top_p,
# repetition_penalty].
examples = [[prompt, *_EXAMPLE_SETTINGS] for prompt in _EXAMPLE_PROMPTS]
# Define the Gradio Blocks interface
# NOTE(review): the container nesting below was reconstructed from a copy of
# this file whose indentation had been flattened -- confirm the layout against
# the deployed app.
with gr.Blocks(theme=theme, analytics_enabled=False, css=_styles) as demo:
    with gr.Column():
        gr.Markdown(description)
        with gr.Row():
            with gr.Column():
                # Prompt box, pre-filled with the first example prompt.
                instruction = gr.Textbox(
                    value=examples[0][0],
                    placeholder="Enter your code here",
                    label="Code",
                    elem_id="q-input",
                )
                submit = gr.Button("Generate", variant="primary")
                output = gr.Code(elem_id="q-output", language="python", lines=10)
                with gr.Row():
                    with gr.Column():
                        with gr.Accordion("Advanced settings", open=False):
                            with gr.Row():
                                column_1, column_2 = gr.Column(), gr.Column()
                                with column_1:
                                    temperature = gr.Slider(
                                        label="Temperature",
                                        value=0.2,
                                        minimum=0.0,
                                        maximum=1.0,
                                        step=0.05,
                                        interactive=True,
                                        info="Higher values produce more diverse outputs",
                                    )
                                    max_new_tokens = gr.Slider(
                                        label="Max new tokens",
                                        value=64,
                                        minimum=32,
                                        maximum=512,
                                        step=32,
                                        interactive=True,
                                        info="Number of tokens to generate",
                                    )
                                with column_2:
                                    top_p = gr.Slider(
                                        label="Top-p (nucleus sampling)",
                                        value=0.90,
                                        minimum=0.0,
                                        maximum=1,
                                        step=0.05,
                                        interactive=True,
                                        info="Higher values sample more low-probability tokens",
                                    )
                                    repetition_penalty = gr.Slider(
                                        label="Repetition penalty",
                                        value=1.2,
                                        minimum=1.0,
                                        maximum=2.0,
                                        step=0.05,
                                        interactive=True,
                                        info="Penalize repeated tokens",
                                    )
                    with gr.Column():
                        # Informational only: this dropdown is not passed to
                        # any event handler in this block.
                        version = gr.Dropdown(
                            [
                                "smol_llama-101M-GQA-python",
                            ],
                            value="smol_llama-101M-GQA-python",
                            label="Version",
                            info="",
                        )
                gr.Markdown(disclaimer)

                # The five components that map 1:1 onto run_inference's
                # parameters and onto the columns of each row in `examples`.
                # Sharing one list keeps the examples table and the button
                # handler in agreement. The examples previously listed a sixth
                # component (`version`) that no example row or handler
                # argument corresponds to.
                generation_inputs = [
                    instruction,
                    temperature,
                    max_new_tokens,
                    top_p,
                    repetition_penalty,
                ]
                gr.Examples(
                    examples=examples,
                    inputs=generation_inputs,
                    cache_examples=False,
                    fn=run_inference,
                    outputs=[output],
                )
                gr.Markdown(base_model_info)
                gr.Markdown(formats)

    # Event wiring must happen inside the Blocks context.
    submit.click(
        run_inference,
        inputs=generation_inputs,
        outputs=[output],
        # preprocess=False,
        # batch=False,
        show_progress=True,
    )

# .queue(max_size=10, api_open=False)
demo.launch(
    debug=True,
    show_api=False,
    # Only create a public share link when utils reports a Colab runtime.
    share=utils.is_google_colab(),
)