import gradio as gr
import PyPDF2
import io
import requests
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Questions offered to the user in the UI dropdown
predefined_questions = [
    "What is the purpose of this document?",
    "What is the main topic of the document?",
    "Who is the target audience?",
    "What is the author's main argument?",
    "What is the conclusion of the document?",
]

# Checkpoint used for question answering; the tokenizer and the model are
# both loaded from it once, at module import time
model_name = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

def _extract_pdf_text(pdf_file):
    """Return the concatenated, stripped text of every page in *pdf_file*."""
    stream = io.BytesIO(pdf_file.read())
    if hasattr(PyPDF2, "PdfReader"):
        # Newer PyPDF2 naming; the legacy camelCase API below raises in 3.x.
        pages = PyPDF2.PdfReader(stream).pages
        page_texts = [page.extract_text() or "" for page in pages]
    else:
        # Older PyPDF2 releases only ship the camelCase API.
        reader = PyPDF2.PdfFileReader(stream)
        page_texts = [
            reader.getPage(i).extractText()
            for i in range(reader.getNumPages())
        ]
    return "".join(page_texts).strip()


def answer_questions(pdf_file, question):
    """Answer *question* from the text of an uploaded PDF.

    A whole document rarely fits into the model's input window, so the
    text is split into overlapping token windows. Every window is scored
    on its own and the span with the highest combined start/end logit is
    returned.

    Args:
        pdf_file: Binary file-like object exposing ``read()``, or ``None``
            when nothing was uploaded.
        question: The question to answer.

    Returns:
        The decoded answer span, an empty string when no window yields a
        valid span, or a short prompt message when an input is missing or
        the PDF contains no extractable text.
    """
    if pdf_file is None:
        return "Please upload a PDF document."
    if not question:
        return "Please select a question."

    text = _extract_pdf_text(pdf_file)
    if not text:
        return "No extractable text was found in the PDF."

    # Common SQuAD-style windowing: at most 384 tokens per window, with
    # consecutive windows sharing 128 context tokens so that an answer
    # straddling a boundary is seen whole in at least one window.
    max_length = min(384, tokenizer.model_max_length)
    stride = 128

    # Only the context (second sequence) is truncated; the overflow comes
    # back as additional windows instead of being dropped.
    encoded = tokenizer(
        question,
        text,
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
    )
    windows = encoded["input_ids"]
    if windows and isinstance(windows[0], int):
        # A non-fast tokenizer returns one flat list of ids rather than a
        # list of windows; fall back to that single truncated window.
        windows = [windows]

    best_score = float("-inf")
    best_ids = []
    with torch.no_grad():  # inference only, no gradients needed
        for input_ids in windows:
            outputs = model(torch.tensor([input_ids]), return_dict=True)
            start_logits = outputs.start_logits[0]
            end_logits = outputs.end_logits[0]
            answer_start = start_logits.argmax().item()
            answer_end = end_logits.argmax().item()
            if answer_end < answer_start:
                # Inverted span: this window has no usable answer.
                continue
            score = (start_logits[answer_start] + end_logits[answer_end]).item()
            if score > best_score:
                best_score = score
                best_ids = input_ids[answer_start:answer_end + 1]

    return tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(best_ids)
    )

# Input components, listed in the same order as answer_questions' parameters
inputs = [
    gr.inputs.File(label="PDF document"),
    gr.inputs.Dropdown(choices=predefined_questions, label="Question"),
]

# Single text component that displays the returned answer
outputs = gr.outputs.Textbox(label="Answer")

# Assemble the interface around answer_questions, then launch it
demo = gr.Interface(
    fn=answer_questions,
    inputs=inputs,
    outputs=outputs,
    title="PDF Question Answering Tool",
    description="Upload a PDF document and select a question from the dropdown. The app will use a pre-trained model to find the answer.",
)
demo.launch()