import gradio as gr
import torch
from typing import Optional

from transformers import AutoModelForCausalLM, LlamaTokenizer
from peft import PeftModel

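# Load the base model in 8-bit to fit on a single GPU: load_in_8bit requires
# the bitsandbytes package, and device_map="auto" lets accelerate decide where
# to place the layers.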
model = AutoModelForCausalLM.from_pretrained(
    "Johntad110/llama-2-7b-amharic-tokenizer",
    return_dict=True,
    load_in_8bit=True,
    device_map="auto",
    low_cpu_mem_usage=True,
    attn_implementation="sdpa"
)

tokenizer = LlamaTokenizer.from_pretrained(
    "Johntad110/llama-2-7b-amharic-tokenizer"
)

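# If the tokenizer vocabulary and the model's embedding table disagree in size
# (the Amharic tokenizer extends the base vocabulary), resize the embeddings.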
embedding_size = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) != embedding_size:
    model.resize_token_embeddings(len(tokenizer))

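# Attach the Amharic PEFT adapter on top of the (resized) base model.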
model = PeftModel.from_pretrained(model, "Johntad110/llama-2-amharic-peft")

model.eval()  # Set model to evaluation mode


def generate_text(
    prompt: str,
    max_new_tokens: int = 256,  # cap on generated tokens; the UI below only supplies the prompt
    seed: int = 42,
    do_sample: bool = True,
    min_length: Optional[int] = None,
    use_cache: bool = True,
    top_p: float = 1.0,
    temperature: float = 1.0,
    top_k: int = 1,
    repetition_penalty: float = 1.0,
    length_penalty: float = 1.0,
):
    """
    Generate a completion for `prompt` with user-configurable sampling parameters.

    The Gradio interface below only passes the prompt, so the remaining
    arguments keep the defaults declared here.
    """

    # Seed the RNGs so repeated runs with the same prompt are reproducible.
    torch.cuda.manual_seed(seed)
    torch.manual_seed(seed)

    # Tokenize the prompt and move it to the model's device
    # (device_map="auto" decides the actual placement).
    batch = tokenizer(prompt, return_tensors="pt")
    batch = {k: v.to(model.device) for k, v in batch.items()}

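    # Run generation without tracking gradients; the sampling arguments are
    # passed straight through to `generate`.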
    with torch.no_grad():
        outputs = model.generate(
            **batch,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            top_p=top_p,
            temperature=temperature,
            min_length=min_length,
            use_cache=use_cache,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            length_penalty=length_penalty,
        )

    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return output_text


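# The interface exposes only the prompt; all other generate_text arguments
# fall back to the defaults in its signature.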
interface = gr.Interface(
    fn=generate_text,
    inputs=[gr.Textbox(label="Prompt")],
    outputs="text"
)

interface.launch(debug=True)