lucidmorto committed
Commit
b7588d3
1 Parent(s): fa2c7a7

feat: Improve training settings and dataset handling


- Updated dataset to "fddemarco/pushshift-reddit-comments"
- Adjusted data field from "text" to "body"
- Increased tokenization max length to 256
- Enhanced training settings: more epochs, larger batch size, gradient accumulation, mixed precision (effective batch size sketched below)
- Integrated a linear learning-rate scheduler with warmup steps
- Switched to using the GPU if available
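
The batch-size, gradient-accumulation, and mixed-precision changes interact: each optimizer update now consumes per_device_train_batch_size * gradient_accumulation_steps examples per device, and fp16 only takes effect on CUDA hardware. A minimal sketch of that arithmetic and the device check, using the values from the diff below (the standalone variables here are illustrative, not part of the commit):

import torch

# Values taken from the updated Seq2SeqTrainingArguments in this commit.
per_device_train_batch_size = 32
gradient_accumulation_steps = 2
num_devices = max(torch.cuda.device_count(), 1)  # assumes single-node training

# Examples consumed per optimizer update (the "effective" batch size).
effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_devices
print(f"Effective batch size: {effective_batch_size}")  # 64 on a single GPU

# fp16=True and use_cpu=False only pay off when a CUDA device is actually present.
print(f"Mixed precision possible: {torch.cuda.is_available()}")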

Files changed (1)
  1. humanizer.py +23 -12
humanizer.py CHANGED
@@ -1,6 +1,6 @@
 from datasets import load_dataset, DatasetDict
 from transformers import AutoTokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
-from transformers import EarlyStoppingCallback
+from transformers import EarlyStoppingCallback, get_linear_schedule_with_warmup
 from transformers.integrations import TensorBoardCallback
 import torch
 import logging
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
 
 # Load the dataset and take only 1000 samples
 logger.info("Loading dataset...")
-dataset = load_dataset("LucasChu/reddit_comments")
+dataset = load_dataset("fddemarco/pushshift-reddit-comments", split="train")
 dataset = dataset.shuffle(seed=42)
 logger.info("Dataset loaded, shuffled, and truncated to 10,000 samples.")
 
@@ -32,7 +32,7 @@ def generate_formal_text(text):
 
 # Prepare the dataset
 def prepare_data(example):
-    example["formal_text"] = generate_formal_text(example["text"]) # Changed from "comment" to "text"
+    example["formal_text"] = generate_formal_text(example["body"]) # Changed from "text" to "body"
     return example
 
 logger.info("Preparing dataset...")
@@ -44,8 +44,8 @@ model_name = "t5-base"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 def tokenize_function(examples):
-    model_inputs = tokenizer(examples["formal_text"], max_length=128, truncation=True, padding="max_length")
-    labels = tokenizer(examples["text"], max_length=128, truncation=True, padding="max_length") # Changed from "comment" to "text"
+    model_inputs = tokenizer(examples["formal_text"], max_length=256, truncation=True, padding="max_length")
+    labels = tokenizer(examples["body"], max_length=256, truncation=True, padding="max_length")
     model_inputs["labels"] = labels["input_ids"]
     return model_inputs
 
@@ -63,19 +63,29 @@ model = T5ForConditionalGeneration.from_pretrained(model_name)
 
 training_args = Seq2SeqTrainingArguments(
     output_dir="./results",
-    num_train_epochs=1,
-    per_device_train_batch_size=16,
-    warmup_steps=100,
+    num_train_epochs=3, # Increase number of epochs
+    per_device_train_batch_size=32, # Increase batch size if memory allows
+    per_device_eval_batch_size=32,
+    warmup_steps=500,
     weight_decay=0.01,
     logging_dir="./logs",
     logging_steps=100,
-    evaluation_strategy="steps" if "test" in available_splits else "no",
-    eval_steps=500,
+    evaluation_strategy="steps",
+    eval_steps=1000,
     save_steps=1000,
-    use_cpu=True,
+    use_cpu=False, # Use GPU if available
     load_best_model_at_end=True,
     metric_for_best_model="eval_loss",
-    greater_is_better=False
+    greater_is_better=False,
+    fp16=True, # Enable mixed precision training if GPU supports it
+    gradient_accumulation_steps=2, # Accumulate gradients to simulate larger batch sizes
+)
+
+optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
+scheduler = get_linear_schedule_with_warmup(
+    optimizer,
+    num_warmup_steps=500,
+    num_training_steps=len(tokenized_dataset["train"]) * training_args.num_train_epochs
 )
 
 trainer = Seq2SeqTrainer(
@@ -84,6 +94,7 @@ trainer = Seq2SeqTrainer(
     train_dataset=tokenized_dataset["train"],
     eval_dataset=tokenized_dataset.get("test"),
     tokenizer=tokenizer,
+    optimizers=(optimizer, scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3), TensorBoardCallback()]
 )
 logger.info("Model and trainer set up.")