Yhhxhfh committed on
Commit
360d510
1 Parent(s): beafae7

Create app,py

Browse files
Files changed (1) hide show
  1. app,py +67 -0
app,py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ import torch
4
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer
5
+ from datasets import load_dataset, concatenate_datasets
6
+ from huggingface_hub import login
7
+ from autotrain import AutoTrain
8
+ import time
9
+
10
# Load variables from a local .env file (if one exists) into the process
# environment so HUGGINGFACE_TOKEN can be read below.
load_dotenv()

# Fail fast with a clear message instead of handing token=None to login().
# Without a token, login() falls back to prompting for one, which cannot be
# answered when this script runs unattended.
hf_token = os.getenv('HUGGINGFACE_TOKEN')
if not hf_token:
    raise RuntimeError(
        "HUGGINGFACE_TOKEN is not set; define it in the environment or in .env"
    )
login(token=hf_token)

model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# GPT-2 ships without a padding token, so any tokenizer call that pads
# (this script uses padding='max_length') raises unless one is assigned.
# Reusing the end-of-sequence token avoids growing the vocabulary.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)
# Keep the model configuration in agreement with the tokenizer's padding id.
model.config.pad_token_id = tokenizer.pad_token_id
16
+
17
def _as_text_dataset(dataset, batch_to_texts):
    """Reduce *dataset* to a single string column named 'text'.

    The three corpora used below have unrelated schemas, and
    concatenate_datasets() only accepts datasets whose features match, so each
    one is first projected onto the same one-column layout.

    Args:
        dataset: A loaded dataset split.
        batch_to_texts: Callable that receives a batch (dict of column name to
            list of values) and returns one string per row.

    Returns:
        A dataset whose only column is 'text'.
    """
    return dataset.map(
        lambda batch: {'text': batch_to_texts(batch)},
        batched=True,
        remove_columns=dataset.column_names,
    )


# daily_dialog stores each conversation as a list of utterances in 'dialog'.
dataset_humanizado = _as_text_dataset(
    load_dataset('daily_dialog', split='train'),
    lambda batch: ['\n'.join(turns) for turns in batch['dialog']],
)
# code_search_net keeps the function source in 'func_code_string'.
dataset_codigo = _as_text_dataset(
    load_dataset('code_search_net', split='train'),
    lambda batch: batch['func_code_string'],
)
# openai_humaneval publishes a single 'test' split (there is no 'train');
# prompt plus canonical solution gives one complete function per row.
dataset_prompts = _as_text_dataset(
    load_dataset('openai_humaneval', split='test'),
    lambda batch: [
        prompt + solution
        for prompt, solution in zip(batch['prompt'], batch['canonical_solution'])
    ],
)

combined_dataset = concatenate_datasets([
    dataset_humanizado,
    dataset_codigo,
    dataset_prompts,
])
26
+
27
def tokenize_function(examples):
    """Tokenize the 'text' entry of *examples* with the module-level tokenizer.

    Args:
        examples: Mapping that provides the input under the key 'text'.

    Returns:
        Whatever the tokenizer call returns for that input, truncated and
        padded to exactly 512 tokens.
    """
    texts = examples['text']
    encoding_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 512,
    }
    return tokenizer(texts, **encoding_options)
29
+
30
# Tokenize the combined corpus in batches.
tokenized_dataset = combined_dataset.map(tokenize_function, batched=True)

# Settings handed to AutoTrain below.
# NOTE(review): the exact meaning of each key is defined by the `autotrain`
# import, which is not visible here; the comments assume the conventional
# Hugging Face training-argument semantics. Confirm against that API.
training_args = {
    "output_dir": './results',
    # NOTE(review): 100 sequences of 512 tokens per device is large for
    # GPT-2; confirm it fits the target hardware.
    "per_device_train_batch_size": 100,
    "per_device_eval_batch_size": 100,
    # Was 0, which asks for no passes over the data, so nothing was trained.
    "num_train_epochs": 1,
    "learning_rate": 1e-5,
    # NOTE(review): -1 is kept as written; confirm how the trainer treats a
    # negative logging interval.
    "logging_steps": -1,
    "max_grad_norm": 1,
    "save_total_limit": 1,
    "seed": 42,
    "weight_decay": 0,
    "warmup_ratio": 0.0,
    "evaluation_strategy": "no",
    "optim": "adamw_torch",
    "lr_scheduler_type": "constant",
    # Was 2098989848, far beyond the number of positions the model has
    # embeddings for; take the real limit from the model configuration.
    "model_max_length": model.config.n_positions,
}

autotrain = AutoTrain(model=model, args=training_args)
51
+
52
# The decorator below comes from the `spaces` package, which was referenced
# but never imported, so defining the function raised NameError.
# NOTE(review): ZeroGPU guidance is to import `spaces` before CUDA is
# initialised; confirm nothing above this point touches the GPU.
import spaces


@spaces.GPU  # the documented decorator name is upper-case GPU, not `gpu`
def run_training():
    """Train, publish to the Hub, and repeat indefinitely.

    Each iteration calls ``autotrain.train`` on ``tokenized_dataset``, pushes
    the model and the tokenizer to 'Yhhxhfh/nombre_de_tu_modelo', and sleeps
    5 seconds. Any ``Exception`` is reported and the iteration is retried
    after 10 seconds.

    The loop has no exit condition: the function returns only if an exception
    that is not an ``Exception`` subclass (for example ``KeyboardInterrupt``)
    propagates out of it.
    """
    import traceback

    while True:
        try:
            autotrain.train(tokenized_dataset)
            model.push_to_hub(
                'Yhhxhfh/nombre_de_tu_modelo',
                repo_type='model',
                use_temp_dir=True,
                commit_message="Actualización del modelo",
            )
            tokenizer.push_to_hub(
                'Yhhxhfh/nombre_de_tu_modelo',
                repo_type='model',
                use_temp_dir=True,
                commit_message="Actualización del tokenizador",
            )
            time.sleep(5)
        except Exception as e:
            print(f"Error durante el entrenamiento: {e}. Reiniciando el proceso de entrenamiento...")
            # The one-line message hides where the failure happened; print
            # the full traceback so repeated failures can be diagnosed.
            traceback.print_exc()
            time.sleep(10)
63
+
64
import shutil

# run_training() loops forever, so control leaves it only through an
# exception it does not catch (e.g. KeyboardInterrupt or SystemExit).
# Placing the cleanup in `finally` lets ./results be removed on that path;
# written after the call, it could never execute.
try:
    run_training()
finally:
    shutil.rmtree('./results', ignore_errors=True)