dminhk committed on
Commit
ac446e5
1 Parent(s): bed9972

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -0
app.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import base64
3
+ import pathlib
4
+ import pdfplumber
5
+ import streamlit as st
6
+ import fillpdf
7
+ from fillpdf import fillpdfs
8
+
9
+ ##########################################################
10
+ # Display PDF function
11
def displayPDF(file):
    """Render the PDF stored at path ``file`` inline in the Streamlit page.

    The file's bytes are base64-encoded and placed in a ``data:`` URI on an
    HTML ``<embed>`` element (800x800), which is emitted through
    ``st.markdown`` with raw HTML enabled.
    """
    # Read the whole document as bytes.
    with open(file, "rb") as pdf_handle:
        raw_bytes = pdf_handle.read()

    # base64 yields ASCII, so decoding as UTF-8 gives a plain str for the URI.
    encoded = base64.b64encode(raw_bytes).decode('utf-8')

    # Build the <embed> markup around the encoded payload.
    embed_html = (
        '<embed src="data:application/pdf;base64,' + encoded + '" '
        'width="800" height="800" type="application/pdf">'
    )

    # unsafe_allow_html is needed so the tag is rendered rather than escaped.
    st.markdown(embed_html, unsafe_allow_html=True)
19
+ ##########################################################
20
+
21
# Page chrome: the same title is used for the browser tab and the heading.
APP_TITLE = "AcroForms Data Extractor"
st.set_page_config(page_title=APP_TITLE)
st.title(APP_TITLE)

# Short usage blurb shown under the heading.
INTRO_MARKDOWN = """
This app allows you to extract AcroForms data from PDF files. Simply upload a PDF file and the app will generate a downloadable text file containing the extracted data.
"""
st.markdown(INTRO_MARKDOWN)

# Upload widget restricted to PDF files; the result is None until a file
# has been chosen.
pdf_file = st.file_uploader(label="Upload PDF File", type=["pdf"])
31
+
32
if pdf_file is not None:
    # Handle one uploaded PDF: save it, preview it, dump its AcroForm fields
    # as JSON, and extract its page count, metadata, and text.

    # The uploaded file's name comes from the uploader's browser, so keep only
    # its final path component; a name containing directory parts (e.g.
    # "../app.py") must not be able to write outside the uploads directory.
    safe_name = pathlib.Path(pdf_file.name).name

    # Save the upload to disk: displayPDF and fillpdfs both read from a path.
    # exist_ok avoids the check-then-create race between concurrent sessions.
    uploads_dir = pathlib.Path("uploads")
    uploads_dir.mkdir(parents=True, exist_ok=True)
    pdf_path = str(uploads_dir / safe_name)
    with open(pdf_path, "wb") as f:
        f.write(pdf_file.getbuffer())

    # Display PDF
    st.divider()
    st.markdown("**PDF Display:**")
    displayPDF(pdf_path)

    # Show the AcroForm fields and offer them as a JSON download
    st.divider()
    form_data = fillpdfs.get_form_fields(pdf_path)
    st.markdown("\n\n**PDF AcroForm:**")
    st.write(form_data)
    st.download_button(
        label='Download AcroForm JSON',
        data=json.dumps(form_data),
        file_name='form.json',
        mime='application/json',
    )

    # Show number of pages, metadata, and extracted text
    st.divider()
    st.markdown("**PDF to Text:**")
    with pdfplumber.open(pdf_file) as pdf:
        pages = pdf.pages

        # Number of Pages
        st.markdown("**Number of Pages**")
        st.write(f"Number of Pages: {len(pages)}")

        # Metadata
        st.markdown("**Metadata**")
        st.code(pdf.metadata)

        # Extract text while the document is still open. `or ""` guards
        # against a page yielding None instead of a string, which would make
        # the concatenation raise TypeError.
        text = "".join(
            (page.extract_text(layout=True) or "") + "\n\n" for page in pages
        )

    st.markdown("**Text**")
    st.text(text)

    # Allow text to be downloaded. Path.stem drops only the final extension,
    # whatever its letter case, instead of deleting every ".pdf" substring.
    st.download_button(
        label="Download PDF Text",
        data=text,
        file_name=f"{pathlib.Path(safe_name).stem}_text.txt",
        mime="text/plain",
    )