CCRss committed
Commit 1ceb4d2
1 Parent(s): 1ba1821

Update README.md

Files changed (1):
  1. README.md +24 -1
README.md CHANGED
````diff
@@ -19,18 +19,34 @@ The **qqp_kz** model is paraphrasing tool tailored for the Kazakh language. It i
 Data Preprocessing
 The dataset used for training the qqp_kz model undergoes rigorous preprocessing to ensure compatibility and optimal performance:
 ```python
+# Importing necessary modules from the transformers library
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
+# Initializing the tokenizer for the specific model. This tokenizer is used to convert
+# text input into a format that is understandable by the model.
 tokenizer = AutoTokenizer.from_pretrained("CCRss/tokenizer_t5_kz")
 
+# Define a function for preprocessing the data. This function takes an example
+# (which includes source and target texts) and tokenizes both texts using the tokenizer.
+# The tokenized output is then formatted to a fixed length for consistent model input.
 def preprocess_data(example):
+    # Extracting the source and target texts from the example
     source = example["src"]
     target = example["trg"]
+
+    # Tokenizing the source text with padding and truncation to ensure a fixed length
     source_inputs = tokenizer(source, padding="max_length", truncation=True, max_length=128)
+
+    # Tokenizing the target text with padding and truncation to ensure a fixed length
     target_inputs = tokenizer(target, padding="max_length", truncation=True, max_length=128)
+
+    # Returning the tokenized inputs, combining both source and target, and setting the target as labels
     return {**source_inputs, **target_inputs, "labels": target_inputs["input_ids"]}
 
+# Applying the preprocessing function to the dataset, effectively transforming all text data
+# into a tokenized format suitable for the Seq2Seq model.
 encoded_dataset = dataset.map(preprocess_data)
+# Setting the format of the dataset to PyTorch tensors for compatibility with the training framework.
 encoded_dataset.set_format("torch")
 
 ```
@@ -39,11 +55,17 @@ encoded_dataset.set_format("torch")
 The model is trained with the following configuration:
 
 ```python
+
+# Importing necessary classes for training from the transformers library
 from transformers import TrainingArguments, Seq2SeqTrainer
 
+# Name of the pretrained model to be used for Seq2Seq learning
 name_of_model = "humarin/chatgpt_paraphraser_on_T5_base"
+# Loading the model from the pretrained weights
 model = AutoModelForSeq2SeqLM.from_pretrained(name_of_model)
 
+# Setting up training arguments. This includes batch size, learning rate, number of epochs,
+# directories for saving results and logs, and evaluation strategy.
 training_args = Seq2SeqTrainingArguments(
     per_device_train_batch_size=21,
     gradient_accumulation_steps=3,
@@ -57,6 +79,7 @@ training_args = Seq2SeqTrainingArguments(
     evaluation_strategy="steps"
 )
 
+# Initializing the trainer with the model, training arguments, and the datasets for training and evaluation.
 trainer = Seq2SeqTrainer(
     model=model,
     args=training_args,
@@ -64,8 +87,8 @@ trainer = Seq2SeqTrainer(
     eval_dataset=encoded_dataset['valid']
 )
 
+# Starting the training process of the model using the specified datasets and training arguments.
 trainer.train()
-
 ```
 
 ### Usage
````
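
The diff context ends at the `### Usage` heading, whose body is not part of this commit. For orientation only, a minimal inference sketch for the resulting paraphraser might look like the following; the checkpoint id `CCRss/qqp_kz`, the example sentence, and the generation settings are assumptions, not taken from the commit itself.

```python
# Minimal usage sketch (not part of this commit).
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer repo is named in the README; the model checkpoint id below is assumed.
tokenizer = AutoTokenizer.from_pretrained("CCRss/tokenizer_t5_kz")
model = AutoModelForSeq2SeqLM.from_pretrained("CCRss/qqp_kz")  # assumed checkpoint id

text = "Сіз қалайсыз?"  # example Kazakh sentence to paraphrase

# Tokenize the input the same way the training data was preprocessed (max_length=128)
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)

# Generate a paraphrase; beam search values here are illustrative defaults
outputs = model.generate(**inputs, max_length=128, num_beams=5, num_return_sequences=1)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```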