"""Gradio app that answers a predefined question about an uploaded PDF.

The PDF text is extracted with PyPDF2 and passed, together with the selected
question, to an extractive question-answering model (DistilBERT fine-tuned on
SQuAD). The answer is the highest-scoring span of the document text.
"""

import io
import os

import gradio as gr
import PyPDF2
import requests
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Download and load pre-trained model and tokenizer
model_name = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Define a list of pre-defined questions
predefined_questions = [
    "What is the purpose of this document?",
    "What is the main topic of the document?",
    "Who is the target audience?",
    "What is the author's main argument?",
    "What is the conclusion of the document?",
]


def _read_pdf_bytes(pdf_file):
    """Return the raw bytes of the uploaded PDF.

    Accepts raw bytes, a filesystem path, an object whose ``name`` attribute
    is a path to an existing file, or any object with a ``read()`` method.
    """
    if isinstance(pdf_file, (bytes, bytearray)):
        return bytes(pdf_file)

    if isinstance(pdf_file, (str, os.PathLike)):
        path = pdf_file
    else:
        path = getattr(pdf_file, "name", None)

    # Prefer re-opening the path: it does not depend on the current read
    # position or open/closed state of the handle that was passed in.
    if isinstance(path, (str, os.PathLike)) and os.path.isfile(path):
        with open(path, "rb") as handle:
            return handle.read()

    return pdf_file.read()


def _extract_pdf_text(data):
    """Return the text of every page in the PDF bytes, joined by newlines."""
    stream = io.BytesIO(data)

    if hasattr(PyPDF2, "PdfReader"):
        # Current PyPDF2 API; the legacy names below were removed in 3.0.
        reader = PyPDF2.PdfReader(stream)
        page_texts = [page.extract_text() or "" for page in reader.pages]
    else:
        reader = PyPDF2.PdfFileReader(stream)
        page_texts = [
            reader.getPage(i).extractText() or ""
            for i in range(reader.getNumPages())
        ]

    # A newline between pages keeps the last word of one page from fusing
    # with the first word of the next.
    return "\n".join(page_texts).strip()


def _best_answer(question, text, max_answer_tokens=30, stride=128, batch_size=8):
    """Return the best-scoring answer span for *question* within *text*.

    The text is split into overlapping windows that fit the model's maximum
    input length. Every window is scored and the span with the highest
    ``start_logit + end_logit`` across all windows is returned. Spans are
    limited to document tokens, to ``end >= start`` and to at most
    *max_answer_tokens* tokens. Returns an empty string when no window
    contains any document token.
    """
    max_length = min(
        tokenizer.model_max_length,
        getattr(model.config, "max_position_embeddings", tokenizer.model_max_length),
    )
    encoded = tokenizer(
        question,
        text,
        truncation="only_second",  # only the document is split, never the question
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        padding=True,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]
    window_count, window_len = input_ids.shape

    # span_ok[s, e] is True when (s, e) is an admissible start/end pair.
    positions = torch.arange(window_len)
    offsets = positions[None, :] - positions[:, None]
    span_ok = (offsets >= 0) & (offsets < max_answer_tokens)

    neg_inf = float("-inf")
    best_score = neg_inf
    best_token_ids = None

    with torch.no_grad():  # inference only; no autograd graph needed
        for first in range(0, window_count, batch_size):
            batch_ids = input_ids[first:first + batch_size]
            batch_mask = attention_mask[first:first + batch_size]
            result = model(
                input_ids=batch_ids,
                attention_mask=batch_mask,
                return_dict=True,
            )

            for row in range(batch_ids.shape[0]):
                # sequence_ids: None for special/padding tokens, 0 for the
                # question, 1 for the document text.
                sequence_ids = encoded.sequence_ids(first + row)
                in_context = torch.tensor(
                    [sid == 1 for sid in sequence_ids], dtype=torch.bool
                )
                if not bool(in_context.any()):
                    continue

                start = result.start_logits[row].masked_fill(~in_context, neg_inf)
                end = result.end_logits[row].masked_fill(~in_context, neg_inf)
                pair_scores = (start[:, None] + end[None, :]).masked_fill(
                    ~span_ok, neg_inf
                )

                flat_index = int(pair_scores.argmax())
                span_start, span_end = divmod(flat_index, window_len)
                score = float(pair_scores[span_start, span_end])
                if score > best_score:
                    best_score = score
                    best_token_ids = batch_ids[row, span_start:span_end + 1].tolist()

    if best_token_ids is None:
        return ""

    tokens = tokenizer.convert_ids_to_tokens(best_token_ids)
    return tokenizer.convert_tokens_to_string(tokens).strip()


def answer_questions(pdf_file, question):
    """Answer *question* using the text of the uploaded PDF.

    Args:
        pdf_file: The uploaded PDF as delivered by the Gradio file input
            (a path string or a file-like object, depending on the Gradio
            version), or ``None`` when nothing was uploaded.
        question: The question text selected in the dropdown, or ``None``
            when nothing was selected.

    Returns:
        The answer text extracted from the document, or a short message
        explaining why no answer could be produced.
    """
    if pdf_file is None:
        return "Please upload a PDF document."
    if not question:
        return "Please select a question."

    # Load PDF file and extract text
    text = _extract_pdf_text(_read_pdf_bytes(pdf_file))
    if not text:
        return "No extractable text was found in this PDF."

    # Perform question answering
    answer = _best_answer(question, text)
    return answer or "No answer found."


if hasattr(gr, "File"):
    # Top-level components; the gr.inputs / gr.outputs namespaces were
    # removed in Gradio 4.
    inputs = [
        gr.File(label="PDF document"),
        gr.Dropdown(label="Question", choices=predefined_questions),
    ]
    outputs = gr.Textbox(label="Answer")
else:
    inputs = [
        gr.inputs.File(label="PDF document"),
        gr.inputs.Dropdown(label="Question", choices=predefined_questions),
    ]
    outputs = gr.outputs.Textbox(label="Answer")

demo = gr.Interface(
    fn=answer_questions,
    inputs=inputs,
    outputs=outputs,
    title="PDF Question Answering Tool",
    description="Upload a PDF document and select a question from the dropdown. The app will use a pre-trained model to find the answer.",
)

if __name__ == "__main__":
    demo.launch()