import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

import fitz
import gradio as gr
from pathlib import Path

from langchain.prompts import PromptTemplate
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langdetect import detect
from transformers import BitsAndBytesConfig

# Maximum number of characters sent to the model per request.
CONTEXT_WINDOW = 50_000

# 4-bit NF4 quantization settings (these only take effect when the model is loaded locally).
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
)

llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-Nemo-Instruct-2407",  # alternative: "mistralai/Mistral-7B-Instruct-v0.3"
    task="text-generation",
    model_kwargs={"quantization_config": quantization_config},
    max_new_tokens=4096,
    temperature=0.5,
    do_sample=False,
)
# llm_engine_hf = ChatHuggingFace(llm=llm)


def read_pdf(file_path):
    """Extract the text of every page of a PDF with PyMuPDF."""
    logger.info("Reading a PDF file")
    try:
        pdf_document = fitz.open(file_path)
        text = ""
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            text += page.get_text()
        if not text.strip():
            message = (
                "PDF contains no text. The file may be password-protected, "
                "corrupted, or made up of images only."
            )
            logger.info(message)
            return message
        return text
    except Exception as e:
        error_message = f"Error reading PDF file: {e}"
        logger.error(error_message)
        return error_message


def read_txt(file_path):
    """Read a plain-text file as UTF-8."""
    logger.info("Reading a TXT file")
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        return text
    except Exception as e:
        error_message = f"Error reading TXT file: {e}"
        logger.error(error_message)
        return error_message


def summarize(file):
    """Read the uploaded file, detect its language, and summarize it chunk by chunk."""
    # Read the content of the uploaded file
    file_path = file.name
    if file_path.endswith('.pdf'):
        text = read_pdf(file_path)
    else:
        text = read_txt(file_path)
    logger.info("Length of text is %d", len(text))

    # Detect the language from the first chunk so the summary is written in the same language.
    lang = detect(text[:CONTEXT_WINDOW])

    template_translate = '''
Please carefully read the following document:

{TEXT}

After reading through the document, pinpoint the key points and main ideas covered in the text. Organize these key points into a concise bulleted list that summarizes the essential information from the document. The summary should be in {LANG} language.
'''

    prompt_summarize = PromptTemplate(
        template=template_translate,
        input_variables=["TEXT", "LANG"],
    )

    # Split the text into CONTEXT_WINDOW-sized chunks and summarize each one separately.
    summaries = []
    for i in range(0, len(text), CONTEXT_WINDOW):
        chunk = text[i:i + CONTEXT_WINDOW]
        formatted_prompt = prompt_summarize.format(TEXT=chunk, LANG=lang)
        summary = llm.invoke(formatted_prompt)
        summaries.append(summary)
    logger.info("Split the text into %d chunks.", len(summaries))

    final_summary = "\n\n".join(summaries)
    return final_summary


def download_summary(output_text):
    """Write the summary to summary.txt and return its path, or None if there is no text."""
    if output_text:
        file_path = Path('summary.txt')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(output_text)
        return file_path
    return None


def create_download_file(summary_text):
    file_path = download_summary(summary_text)
    return str(file_path) if file_path else None


# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Document Summarizer")

    with gr.Row():
        with gr.Column():
            file = gr.File(label="Submit a file")
        with gr.Column():
            output_text = gr.Textbox(label="Summary", lines=20)

    submit_button = gr.Button("Summarize")
    submit_button.click(summarize, inputs=[file], outputs=output_text)

    download_button = gr.Button("Download Summary")
    download_button.click(
        fn=create_download_file,
        inputs=[output_text],
        outputs=gr.File(),
    )

# Run the Gradio app
demo.launch(share=True)
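
# A minimal sketch of calling summarize() programmatically, without the Gradio UI.
# Illustrative only: "report.pdf" is a hypothetical local file, and any object
# exposing a .name attribute (as Gradio's file upload does) would work here.
#
#   from types import SimpleNamespace
#   print(summarize(SimpleNamespace(name="report.pdf")))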