from datasets import Dataset
from transformers import AutoTokenizer

from datafile import QL, resume_data_dict

# Load your custom data: pair each query with the resume answer its label points to.
data = []
for i in range(len(QL["labels"])):
    data.append({
        "question": QL["queries"][i],
        "answer": resume_data_dict[QL["labels"][i]],
    })

# Create a Dataset. from_list expects a list of example dicts;
# from_dict would expect a dict of columns instead.
dataset = Dataset.from_list(data)

# Load the tokenizer. DialoGPT's GPT-2 tokenizer has no pad token by default,
# so reuse the end-of-sequence token for padding.
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    # Tokenize the questions as model inputs.
    inputs = [f"Question: {q}" for q in examples["question"]]
    model_inputs = tokenizer(inputs, padding="max_length", truncation=True)

    # Set up the tokenizer for targets and tokenize the answers as labels.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["answer"], padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing to the whole dataset in batches.
tokenized_dataset = dataset.map(preprocess_function, batched=True)
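For readers without `datafile.py`: the import suggests `QL` holds parallel `queries` and `labels` lists, and `resume_data_dict` maps each label to an answer string. That structure is inferred from how the objects are indexed above, not confirmed by the source; a minimal, hypothetical stand-in with the assumed shape:

# Hypothetical stand-in for datafile.py, showing only the assumed structure.
# QL pairs each user query with the label of the resume section that answers it;
# resume_data_dict maps those labels to the answer text.
QL = {
    "queries": ["What is your work experience?", "Which skills do you have?"],
    "labels": ["experience", "skills"],
}
resume_data_dict = {
    "experience": "Three years as a data analyst at Example Corp.",
    "skills": "Python, SQL, and machine learning.",
}

With data shaped like this, `tokenized_dataset.column_names` should include `input_ids`, `attention_mask`, and `labels` after the `map` call, alongside the original `question` and `answer` columns.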