Spaces:
Runtime error
Runtime error
lucidmorto
committed on
Commit
•
fa2c7a7
1
Parent(s):
bd7288e
feat: Upgrade model from t5-small to t5-base
Browse files
Upgraded the model from t5-small to t5-base for improved performance and accuracy. Additionally, increased the maximum generation length to 300 tokens in text generation, enhancing the capacity for more detailed outputs. Removed dataset truncation to utilize the entire dataset, helping in better model training and evaluation.
- app.py +2 -2
- humanizer.py +1 -2
app.py
CHANGED
@@ -1,13 +1,13 @@
|
|
1 |
import gradio as gr
|
2 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
3 |
|
4 |
-
model_name = "t5-
|
5 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
6 |
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
|
7 |
|
8 |
def generate_text(input_text):
|
9 |
input_ids = tokenizer("summarize: " + input_text, return_tensors="pt", max_length=512, truncation=True).input_ids
|
10 |
-
outputs = model.generate(input_ids, max_length=
|
11 |
return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
12 |
|
13 |
iface = gr.Interface(
|
|
|
1 |
import gradio as gr
|
2 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
3 |
|
4 |
+
model_name = "t5-base"
|
5 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
6 |
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
|
7 |
|
8 |
def generate_text(input_text):
|
9 |
input_ids = tokenizer("summarize: " + input_text, return_tensors="pt", max_length=512, truncation=True).input_ids
|
10 |
+
outputs = model.generate(input_ids, max_length=300, num_return_sequences=1, no_repeat_ngram_size=2)
|
11 |
return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
12 |
|
13 |
iface = gr.Interface(
|
humanizer.py
CHANGED
@@ -13,7 +13,6 @@ logger = logging.getLogger(__name__)
|
|
13 |
logger.info("Loading dataset...")
|
14 |
dataset = load_dataset("LucasChu/reddit_comments")
|
15 |
dataset = dataset.shuffle(seed=42)
|
16 |
-
dataset["train"] = dataset["train"].select(range(10000))
|
17 |
logger.info("Dataset loaded, shuffled, and truncated to 10,000 samples.")
|
18 |
|
19 |
# Split the train dataset into train and test
|
@@ -41,7 +40,7 @@ processed_dataset = {split: data.map(prepare_data) for split, data in dataset.it
|
|
41 |
logger.info("Dataset prepared.")
|
42 |
|
43 |
# Tokenize the dataset
|
44 |
-
model_name = "t5-
|
45 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
46 |
|
47 |
def tokenize_function(examples):
|
|
|
13 |
logger.info("Loading dataset...")
|
14 |
dataset = load_dataset("LucasChu/reddit_comments")
|
15 |
dataset = dataset.shuffle(seed=42)
|
|
|
16 |
logger.info("Dataset loaded, shuffled, and truncated to 10,000 samples.")
|
17 |
|
18 |
# Split the train dataset into train and test
|
|
|
40 |
logger.info("Dataset prepared.")
|
41 |
|
42 |
# Tokenize the dataset
|
43 |
+
model_name = "t5-base"
|
44 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
45 |
|
46 |
def tokenize_function(examples):
|