lucidmorto committed
Commit fa2c7a7
1 Parent(s): bd7288e

feat: Upgrade model from t5-small to t5-base


Upgraded the model from t5-small to t5-base for improved performance and accuracy. Increased the maximum generation length from 150 to 300 tokens, allowing more detailed outputs. Removed the 10,000-sample dataset truncation so the entire dataset is used for training and evaluation.
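As a quick check of the new settings, here is a minimal sketch of the generation path after this commit (same tokenizer and generate arguments as app.py below; the example input string is a placeholder, not from the repo):

# Minimal sketch of the updated generation call; the input text is a placeholder.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

input_ids = tokenizer("summarize: " + "Example input text to rewrite.", return_tensors="pt", max_length=512, truncation=True).input_ids
# max_length raised from 150 to 300 tokens to allow longer outputs
outputs = model.generate(input_ids, max_length=300, num_return_sequences=1, no_repeat_ngram_size=2)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))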

Files changed (2)
  1. app.py +2 -2
  2. humanizer.py +1 -2
app.py CHANGED
@@ -1,13 +1,13 @@
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
-model_name = "t5-small"
+model_name = "t5-base"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 
 def generate_text(input_text):
     input_ids = tokenizer("summarize: " + input_text, return_tensors="pt", max_length=512, truncation=True).input_ids
-    outputs = model.generate(input_ids, max_length=150, num_return_sequences=1, no_repeat_ngram_size=2)
+    outputs = model.generate(input_ids, max_length=300, num_return_sequences=1, no_repeat_ngram_size=2)
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 iface = gr.Interface(
humanizer.py CHANGED
@@ -13,7 +13,6 @@ logger = logging.getLogger(__name__)
 logger.info("Loading dataset...")
 dataset = load_dataset("LucasChu/reddit_comments")
 dataset = dataset.shuffle(seed=42)
-dataset["train"] = dataset["train"].select(range(10000))
 logger.info("Dataset loaded, shuffled, and truncated to 10,000 samples.")
 
 # Split the train dataset into train and test
@@ -41,7 +40,7 @@ processed_dataset = {split: data.map(prepare_data) for split, data in dataset.it
 logger.info("Dataset prepared.")
 
 # Tokenize the dataset
-model_name = "t5-small"
+model_name = "t5-base"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 def tokenize_function(examples):
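For reference, a minimal sketch of the data loading step after the truncation is removed. The train/test split shown here is an assumption (the actual split call sits outside the visible hunk); test_size=0.1 is illustrative only:

# Sketch of loading the full dataset without the 10,000-sample cap.
from datasets import load_dataset

dataset = load_dataset("LucasChu/reddit_comments")
dataset = dataset.shuffle(seed=42)

# Assumed split step; the real call is not shown in this diff.
split = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_data, test_data = split["train"], split["test"]
print(len(train_data), len(test_data))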