"""Streamlit app: show an uploaded PDF, its AcroForm fields, and its text."""

import json
import base64
import pathlib

import pdfplumber
import streamlit as st
import fillpdf
from fillpdf import fillpdfs


##########################################################
# Display PDF function
def displayPDF(file):
    """Render the PDF stored at ``file`` inline in the Streamlit page.

    The file is read in binary mode, base64-encoded, and embedded as a
    ``data:`` URI inside an ``<iframe>`` emitted through ``st.markdown``.

    Args:
        file: Path of the PDF file to read.
    """
    with open(file, "rb") as f:
        base64_pdf = base64.b64encode(f.read()).decode("utf-8")

    # The encoded bytes must actually be interpolated into the markup;
    # an empty HTML string renders nothing.
    pdf_display = (
        f'<iframe src="data:application/pdf;base64,{base64_pdf}" '
        'width="700" height="1000" type="application/pdf"></iframe>'
    )

    # unsafe_allow_html is required, otherwise the iframe tag is escaped.
    st.markdown(pdf_display, unsafe_allow_html=True)


##########################################################
st.set_page_config(page_title="AcroForms Data Extractor")

st.title("AcroForms Data Extractor")

st.markdown("""
This app allows you to extract AcroForms data from PDF files.
Simply upload a PDF file and the app will generate a downloadable text file
containing the extracted data.
""")

# Upload PDF
pdf_file = st.file_uploader("Upload PDF File", type=["pdf"])

if pdf_file is not None:
    # The uploaded name is client-supplied: keep only its final component so
    # the file cannot be written outside the uploads directory.
    safe_name = pathlib.Path(pdf_file.name).name or "upload.pdf"

    # Save file to a directory (exist_ok avoids a check-then-create race).
    uploads_dir = pathlib.Path("uploads")
    uploads_dir.mkdir(parents=True, exist_ok=True)
    pdf_path = uploads_dir / safe_name
    pdf_path.write_bytes(pdf_file.getbuffer())

    # Display PDF
    st.divider()
    st.markdown("**PDF Display:**")
    displayPDF(pdf_path)

    # Print Form Data
    st.divider()
    form_data = fillpdfs.get_form_fields(str(pdf_path))
    st.markdown("\n\n**PDF AcroForm:**")
    st.write(form_data)

    # Convert dictionary into string
    form_txt = json.dumps(form_data)

    # Download button
    st.download_button(
        label='Download AcroForm JSON',
        data=form_txt,
        file_name='form.json',
        mime='application/json',
    )

    # Print Number of Pages and Extract Text
    st.divider()
    st.markdown("**PDF to Text:**")

    # Read from the saved copy so parsing does not depend on the current
    # position of the upload stream.
    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages

        # Number of Pages
        st.markdown("**Number of Pages**")
        st.write(f"Number of Pages: {len(pages)}")

        # Extract Metadata
        st.markdown("**Metadata**")
        metadata = pdf.metadata
        st.code(metadata)

        # Extract Text: every page is followed by a blank line. A page that
        # yields no text (None or "") contributes only the separator.
        text = "".join(
            (page.extract_text(layout=True) or "") + "\n\n"
            for page in pages
        )

        st.markdown("**Text**")
        st.text(text)

        # Allow text to be downloaded; stem drops only the final extension,
        # regardless of its case.
        st.download_button(
            label="Download PDF Text",
            data=text,
            file_name=f"{pathlib.Path(safe_name).stem}_text.txt",
            mime="text/plain",
        )