Spaces:

Yhhxhfh
/

dgdgdgdgd

Sleeping

App Files Files Community

Yhhxhfh committed 8 days ago

Commit

360d510

•

1 Parent(s): beafae7

Create app,py

Browse files

Files changed (1) hide show

app,py +67 -0

app,py ADDED Viewed

	@@ -0,0 +1,67 @@

+import os
+import time
+
+import spaces
+import torch
+from autotrain import AutoTrain
+from datasets import load_dataset, concatenate_datasets
+from dotenv import load_dotenv
+from huggingface_hub import login
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
+load_dotenv()
+login(token=os.getenv('HUGGINGFACE_TOKEN'))
+model_name = 'gpt2'
+tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+model = GPT2LMHeadModel.from_pretrained(model_name)
+dataset_humanizado = load_dataset('daily_dialog', split='train')
+dataset_codigo = load_dataset('code_search_net', split='train')
+dataset_prompts = load_dataset('openai_humaneval', split='train')
+combined_dataset = concatenate_datasets([
+    dataset_humanizado,
+    dataset_codigo,
+    dataset_prompts
+])
+def tokenize_function(examples):
+    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)
+tokenized_dataset = combined_dataset.map(tokenize_function, batched=True)
+training_args = {
+    "output_dir": './results',
+    "per_device_train_batch_size": 100,
+    "per_device_eval_batch_size": 100,
+    "num_train_epochs": 0,
+    "learning_rate": 1e-5,
+    "logging_steps": -1,
+    "max_grad_norm": 1,
+    "save_total_limit": 1,
+    "seed": 42,
+    "weight_decay": 0,
+    "warmup_ratio": 0.0,
+    "evaluation_strategy": "no",
+    "optim": "adamw_torch",
+    "lr_scheduler_type": "constant",
+    "model_max_length": 2098989848
+}
+autotrain = AutoTrain(model=model, args=training_args)
+@spaces.gpu
+def run_training():
+    while True:
+        try:
+            autotrain.train(tokenized_dataset)
+            model.push_to_hub('Yhhxhfh/nombre_de_tu_modelo', repo_type='model', use_temp_dir=True, commit_message="Actualización del modelo")
+            tokenizer.push_to_hub('Yhhxhfh/nombre_de_tu_modelo', repo_type='model', use_temp_dir=True, commit_message="Actualización del tokenizador")
+            time.sleep(5)
+        except Exception as e:
+            print(f"Error durante el entrenamiento: {e}. Reiniciando el proceso de entrenamiento...")
+            time.sleep(10)
+run_training()
+import shutil
+shutil.rmtree('./results', ignore_errors=True)