mkoot007's picture
Update app.py
406399b
raw
history blame
No virus
1.93 kB
import gradio as gr
import re
from docx import Document # Use python-docx to read DOCX files
from PyPDF2 import PdfFileReader # Import PdfFileReader from PyPDF2
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: Whatever the PyPDF2 reader accepts as its source; it is
            passed straight through to the reader's constructor.

    Returns:
        str: The text of all pages, joined in page order with no separator.
    """
    try:
        # PyPDF2 3.x refuses to instantiate the legacy PdfFileReader class,
        # so prefer its replacement whenever the installed release has it.
        from PyPDF2 import PdfReader
    except ImportError:
        # Older PyPDF2 releases only provide the legacy camelCase API.
        pdf = PdfFileReader(pdf_file)
        return "".join(
            pdf.getPage(page_num).extractText()
            for page_num in range(pdf.getNumPages())
        )

    reader = PdfReader(pdf_file)
    # `or ""` keeps the join safe should a page yield no text at all.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Function to extract text from a DOCX file
def extract_text_from_docx(docx_file):
    """Return the text of a DOCX document, one paragraph per line.

    Args:
        docx_file: The source handed unchanged to ``Document``.

    Returns:
        str: The ``text`` of each paragraph, joined with newlines.
    """
    paragraphs = Document(docx_file).paragraphs
    lines = [paragraph.text for paragraph in paragraphs]
    return "\n".join(lines)
# Function to extract information from a resume
def extract_info_from_resume(resume_path):
    """Extract the name, email address and phone number from a resume.

    Args:
        resume_path: The resume to parse. Either an object exposing the file
            name as ``.name`` (e.g. an uploaded-file wrapper or a
            ``pathlib.Path``) or a plain string path. It is forwarded
            unchanged to the PDF/DOCX text extractor.

    Returns:
        dict: ``{"Name": ..., "Email": ..., "Phone": ...}``. A field for which
        nothing matched holds a "... not found" placeholder string.

    Raises:
        ValueError: If the file name does not end in ``.pdf`` or ``.docx``
            (compared case-insensitively).
    """
    # Plain string paths have no ``.name``; fall back to the value itself.
    # Lower-casing makes "CV.PDF" and "Resume.Docx" acceptable as well.
    filename = str(getattr(resume_path, "name", resume_path)).lower()
    if filename.endswith(".pdf"):
        text = extract_text_from_pdf(resume_path)
    elif filename.endswith(".docx"):
        text = extract_text_from_docx(resume_path)
    else:
        raise ValueError("Unsupported file format. Only PDF and DOCX are supported.")

    # Heuristic: the first run of two or more capitalised words.
    name_pattern = r"([A-Z][a-z]+(?: [A-Z][a-z]+)+)"
    # The domain must end in a dotted label, so sentence punctuation after the
    # address ("mail me at a@b.com.") is not captured; "+" tags are kept.
    email_pattern = r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+"
    # Ten digits in 3-3-4 grouping with optional parentheses and separators.
    phone_pattern = r"(\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})"

    return {
        "Name": _first_match(name_pattern, text, "Name not found"),
        "Email": _first_match(email_pattern, text, "Email not found"),
        "Phone": _first_match(phone_pattern, text, "Phone number not found"),
    }


def _first_match(pattern, text, fallback):
    """Return the first match of *pattern* in *text*, or *fallback* if none."""
    found = re.search(pattern, text)
    return found.group() if found else fallback
# Define a Gradio interface
# One file-upload input feeds the resume parser; its dict result is shown as JSON.
resume_upload = gr.inputs.File(type="file")
iface = gr.Interface(
    fn=extract_info_from_resume,
    inputs=resume_upload,
    outputs="json",
)

# Launch the interface with sharing enabled.
iface.launch(share=True)