jordigonzm committed on
Commit
31cc7d7
1 Parent(s): 1f24ef3

app.py gradio changes

Files changed (2)
  1. app.py +20 -17
  2. test.py +32 -0
app.py CHANGED
@@ -1,11 +1,15 @@
 import os
 from threading import Thread
 from typing import Iterator
-
 import gradio as gr
 import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+import huggingface_hub
+
+# Get the Hugging Face token
+token = os.environ.get("HUGGINGFACE_HUB_TOKEN", None)
+huggingface_hub.login(token=token)
 
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
@@ -17,10 +21,10 @@ DESCRIPTION = """\
 
 This Space demonstrates model [Llama-2-13b-chat](https://huggingface.co/meta-llama/Llama-2-13b-chat) by Meta, a Llama 2 model with 13B parameters fine-tuned for chat instructions. Feel free to play with it, or duplicate to run generations without a queue! If you want to run your own service, you can also [deploy the model on Inference Endpoints](https://huggingface.co/inference-endpoints).
 
-🔎 For more details about the Llama 2 family of models and how to use them with `transformers`, take a look [at our blog post](https://huggingface.co/blog/llama2).
+For more details about the Llama 2 family of models and how to use them with `transformers`, take a look [at our blog post](https://huggingface.co/blog/llama2).
 
-🔨 Looking for an even more powerful model? Check out the large [**70B** model demo](https://huggingface.co/spaces/ysharma/Explore_llamav2_with_TGI).
-🐇 For a smaller model that you can run on many GPUs, check our [7B model demo](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat).
+Looking for an even more powerful model? Check out the demo of the large [**70B**](https://huggingface.co/spaces/ysharma/Explore_llamav2_with_TGI) model!
+For a smaller model that you can run on many GPUs, take a look at our [7B model demo](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat).
 
 """
 
@@ -28,21 +32,25 @@ LICENSE = """
 <p/>
 
 ---
-As a derivate work of [Llama-2-13b-chat](https://huggingface.co/meta-llama/Llama-2-13b-chat) by Meta,
-this demo is governed by the original [license](https://huggingface.co/spaces/huggingface-projects/llama-2-13b-chat/blob/main/LICENSE.txt) and [acceptable use policy](https://huggingface.co/spaces/huggingface-projects/llama-2-13b-chat/blob/main/USE_POLICY.md).
+As a derivative work of [Llama-2-13b-chat](https://huggingface.co/meta-llama/Llama-2-13b-chat) by Meta,
+this demo is governed by the [original license](https://huggingface.co/spaces/huggingface-projects/llama-2-13b-chat/blob/main/LICENSE.txt) and the [acceptable use policy](https://huggingface.co/spaces/huggingface-projects/llama-2-13b-chat/blob/main/USE_POLICY.md).
 """
 
 if not torch.cuda.is_available():
-    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
+    DESCRIPTION += "\n<p>Running on CPU. This demo does not work on CPU.</p>"
 
 
-if torch.cuda.is_available():
+model = None
+tokenizer = None
+
+@spaces.GPU
+def load_model():
+    global model, tokenizer
     model_id = "meta-llama/Llama-2-13b-chat-hf"
     model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_4bit=True)
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     tokenizer.use_default_system_prompt = False
 
-
 @spaces.GPU
 def generate(
     message: str,
@@ -88,6 +96,9 @@ def generate(
     yield "".join(outputs)
 
 
+if gr.running:
+    load_model()
+
 chat_interface = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
@@ -128,14 +139,6 @@ chat_interface = gr.ChatInterface(
             value=1.2,
         ),
     ],
-    stop_btn=None,
-    examples=[
-        ["Hello there! How are you doing?"],
-        ["Can you explain briefly to me what is the Python programming language?"],
-        ["Explain the plot of Cinderella in a sentence."],
-        ["How many hours does it take a man to eat a Helicopter?"],
-        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
-    ],
     cache_examples=False,
 )
 
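The startup change assumes a `HUGGINGFACE_HUB_TOKEN` secret is configured for the Space so the gated Llama 2 weights can be fetched after login. A minimal sketch of exercising that same token-based login outside the Space, assuming the same environment variable name (the `whoami` check is only an illustration, not part of the commit):

import os

import huggingface_hub

# Read the token from the environment, mirroring the login added in app.py.
token = os.environ.get("HUGGINGFACE_HUB_TOKEN", None)

# login(token=None) falls back to an interactive prompt, so only log in when the secret is set.
if token:
    huggingface_hub.login(token=token)
    # Sanity check: confirm the credentials resolve to an account before the Space
    # tries to download the gated meta-llama/Llama-2-13b-chat-hf weights.
    print(huggingface_hub.whoami()["name"])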
test.py ADDED
@@ -0,0 +1,32 @@
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import os
+import huggingface_hub
+
+# Get the Hugging Face token
+token = os.environ.get("HUGGINGFACE_HUB_TOKEN", None)
+huggingface_hub.login(token=token)
+
+# Model ID
+model_id = "CohereForAI/aya-23-35B"
+
+# Load the tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id)
+
+# Format the message with the chat template
+messages = [{"role": "user", "content": "Anneme onu ne kadar sevdiğimi anlatan bir mektup yaz"}]
+input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
+
+# Generate text
+gen_tokens = model.generate(
+    input_ids,
+    max_new_tokens=100,
+    do_sample=True,
+    temperature=0.3,
+    force_download=True
+)
+
+# Decode the generated tokens
+gen_text = tokenizer.decode(gen_tokens[0], skip_special_tokens=True)
+print(gen_text)
+
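For reference, a minimal sketch of the same chat-template generation flow, with the `force_download` re-download option passed to `from_pretrained` (a Hub download argument) rather than to `generate()`; it assumes you are already logged in to the Hub as above, that `accelerate` is installed for `device_map="auto"`, and that enough GPU memory is available for the 35B checkpoint:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "CohereForAI/aya-23-35B"

# force_download is applied while fetching the checkpoint from the Hub.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto", force_download=True)

# Build the prompt with the model's chat template.
messages = [{"role": "user", "content": "Anneme onu ne kadar sevdiğimi anlatan bir mektup yaz"}]
input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)

# Sample a short completion and decode it.
gen_tokens = model.generate(input_ids, max_new_tokens=100, do_sample=True, temperature=0.3)
print(tokenizer.decode(gen_tokens[0], skip_special_tokens=True))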