Priyanka-Balivada committed on
Commit
604af6d
•
1 Parent(s): 4ee43f7

Update app.py

Files changed (1)
  1. app.py +32 -61
app.py CHANGED
@@ -1,3 +1,4 @@
+# Import necessary libraries
 import streamlit as st
 import nltk
 from gensim.models.doc2vec import Doc2Vec, TaggedDocument
@@ -8,21 +9,17 @@ import re
 import matplotlib.pyplot as plt
 import seaborn as sns
 import spacy
-import re
-import pandas as pd
-import matplotlib.pyplot as plt
-import seaborn as sns
 
+# Download necessary NLTK data
 nltk.download('punkt')
 
-
+# Define regular expressions for pattern matching
 float_regex = re.compile(r'^\d{1,2}(\.\d{1,2})?$')
 email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
 float_digit_regex = re.compile(r'^\d{10}$')
-email_with_phone_regex = email_with_phone_regex = re.compile(
-    r'(\d{10}).|.(\d{10})')
-
+email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')
 
+# Function to extract text from a PDF file
 def extract_text_from_pdf(pdf_file):
     pdf_reader = PyPDF2.PdfReader(pdf_file)
     text = ""
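As a quick sanity check of the patterns defined above (sample inputs are invented for illustration): float_regex validates a CGPA-style number, float_digit_regex a ten-digit phone number, and email_pattern pulls addresses out of free text.

import re

float_regex = re.compile(r'^\d{1,2}(\.\d{1,2})?$')
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
float_digit_regex = re.compile(r'^\d{10}$')

print(bool(float_regex.match("8.75")))              # True: 1-2 digits plus an optional 2-digit fraction
print(bool(float_regex.match("123")))               # False: three digits before the decimal point
print(bool(float_digit_regex.match("9876543210")))  # True: exactly ten digits, the phone-number shape
print(re.findall(email_pattern, "Reach me at jane.doe@example.com"))  # ['jane.doe@example.com']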
@@ -30,97 +27,74 @@ def extract_text_from_pdf(pdf_file):
         text += pdf_reader.pages[page_num].extract_text()
     return text
 
-
+# Function to tokenize text using the NLP model
 def tokenize_text(text, nlp_model):
     doc = nlp_model(text, disable=["tagger", "parser"])
     tokens = [(token.text.lower(), token.label_) for token in doc.ents]
     return tokens
 
-
+# Function to extract CGPA from a resume
 def extract_cgpa(resume_text):
-    # Define a regular expression pattern for CGPA extraction
     cgpa_pattern = r'\b(?:CGPA|GPA|C\.G\.PA|Cumulative GPA)\s*:?[\s-]([0-9]+(?:\.[0-9]+)?)\b|\b([0-9]+(?:\.[0-9]+)?)\s(?:CGPA|GPA)\b'
-
-    # Search for CGPA pattern in the text
     match = re.search(cgpa_pattern, resume_text, re.IGNORECASE)
-
-    # Check if a match is found
     if match:
-        # Extract CGPA value
         cgpa = match.group(1) if match.group(1) else match.group(2)
         return float(cgpa)
     else:
         return None
 
-
+# Function to extract skills from a resume
 def extract_skills(text, skills_keywords):
-    skills = [skill.lower()
-              for skill in skills_keywords if re.search(r'\b' + re.escape(skill.lower()) + r'\b', text.lower())]
+    skills = [skill.lower() for skill in skills_keywords if re.search(r'\b' + re.escape(skill.lower()) + r'\b', text.lower())]
    return skills
 
-
+# Function to preprocess text
 def preprocess_text(text):
     return word_tokenize(text.lower())
 
-@st.cache_data
-def load_data(results):
-    df = pd.DataFrame(results)
-    return df
-
+# Function to train a Doc2Vec model
 def train_doc2vec_model(documents):
     model = Doc2Vec(vector_size=20, min_count=2, epochs=50)
     model.build_vocab(documents)
-    model.train(documents, total_examples=model.corpus_count,
-                epochs=model.epochs)
+    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
     return model
 
-
+# Function to calculate similarity between two texts
 def calculate_similarity(model, text1, text2):
     vector1 = model.infer_vector(preprocess_text(text1))
     vector2 = model.infer_vector(preprocess_text(text2))
     return model.dv.cosine_similarities(vector1, [vector2])[0]
 
-
+# Function to calculate accuracy
 def accuracy_calculation(true_positives, false_positives, false_negatives):
     total = true_positives + false_positives + false_negatives
     accuracy = true_positives / total if total != 0 else 0
     return accuracy
 
-
 # Streamlit Frontend
 st.markdown("# Resume Matching Tool 📃📃")
 st.markdown("An application to match resumes with a job description.")
 
 # Sidebar - File Upload for Resumes
 st.sidebar.markdown("## Upload Resumes PDF")
-resumes_files = st.sidebar.file_uploader(
-    "Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
+resumes_files = st.sidebar.file_uploader("Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
 
 if resumes_files:
     # Sidebar - File Upload for Job Descriptions
     st.sidebar.markdown("## Upload Job Description PDF")
-    job_descriptions_file = st.sidebar.file_uploader(
-        "Upload Job Description PDF", type=["pdf"])
-
-    # Get skills keywords from user input
-    skills_keywords_input = st.sidebar.text_input(
-        "Enter skills keywords separated by commas (e.g., python, java, machine learning):")
-    skills_keywords = [skill.strip()
-                       for skill in skills_keywords_input.split(',') if skill.strip()]
+    job_descriptions_file = st.sidebar.file_uploader("Upload Job Description PDF", type=["pdf"])
 
     if job_descriptions_file:
+        # Load the pre-trained NLP model
        nlp_model_path = "en_Resume_Matching_Keywords"
        nlp = spacy.load(nlp_model_path)
-
+
         # Backend Processing
         job_description_text = extract_text_from_pdf(job_descriptions_file)
-        resumes_texts = [extract_text_from_pdf(
-            resume_file) for resume_file in resumes_files]
+        resumes_texts = [extract_text_from_pdf(resume_file) for resume_file in resumes_files]
         job_description_text = extract_text_from_pdf(job_descriptions_file)
         job_description_tokens = tokenize_text(job_description_text, nlp)
 
-        # st.subheader("Matching Keywords")
-
         # Initialize counters
         overall_skill_matches = 0
         overall_qualification_matches = 0
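A minimal sketch of how the helpers above behave, assuming the definitions in this file (all sample strings are invented; a two-document corpus makes the Doc2Vec score essentially noise, which is why the app trains on the full set of uploaded resumes):

# CGPA extraction: group 1 fires when the keyword precedes the value, group 2 when it follows
print(extract_cgpa("Graduated with CGPA: 8.7"))   # 8.7
print(extract_cgpa("Secured 9.1 CGPA at XYZ"))    # 9.1
print(extract_cgpa("No grade listed"))            # None

# Skill matching: the \b anchors keep "java" from matching inside "javascript"
sample = "Built services in Java and Python; some R experience."
print(extract_skills(sample, ["python", "java", "javascript", "r"]))  # ['python', 'java', 'r']

# Doc2Vec training and inference on a toy corpus
docs = [TaggedDocument(words=preprocess_text(t), tags=[str(i)])
        for i, t in enumerate(["python developer resume", "java engineer resume"])]
model = train_doc2vec_model(docs)
print(calculate_similarity(model, "python developer", "python"))  # a float in [-1, 1]

# Note: true_positives / (TP + FP + FN) is the Jaccard index rather than classical accuracy
print(accuracy_calculation(8, 1, 1))  # 0.8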
@@ -193,8 +167,7 @@ if resumes_files:
         overall_qualification_matches += qualificationMatch
 
         # Add count of matched skills for this resume to the list
-        skills_counts_all_resumes.append(
-            [resume_text.count(skill.lower()) for skill in job_skills])
+        skills_counts_all_resumes.append([resume_text.count(skill.lower()) for skill in job_skills])
 
         # Create a dictionary for the current resume and append to the results list
         result_dict = {
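Worth noting: str.count is a plain substring count, so short keywords can be overcounted; a quick illustration with invented text (the \b-anchored extract_skills above avoids exactly this):

resume_text = "java and javascript projects"
job_skills = ["java", "python"]
print([resume_text.count(skill.lower()) for skill in job_skills])  # [2, 0] -- "java" is also found inside "javascript"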
@@ -214,20 +187,22 @@ if resumes_files:
         # Display overall matches
         st.subheader("Overall Matches")
         st.write(f"Total Skill Matches: {overall_skill_matches}")
-        st.write(
-            f"Total Qualification Matches: {overall_qualification_matches}")
+        st.write(f"Total Qualification Matches: {overall_qualification_matches}")
         st.write(f"Job Qualifications: {job_qualifications}")
         st.write(f"Job Skills: {job_skills}")
 
         # Display individual results in a table
-        results_df =load_data(results_list)
+        results_df = pd.DataFrame(results_list)
         st.subheader("Individual Results")
         st.dataframe(results_df)
-        tagged_resumes = [TaggedDocument(words=preprocess_text(
-            text), tags=[str(i)]) for i, text in enumerate(resumes_texts)]
+        tagged_resumes = [TaggedDocument(words=preprocess_text(text), tags=[str(i)]) for i, text in enumerate(resumes_texts)]
         model_resumes = train_doc2vec_model(tagged_resumes)
-
+
         st.subheader("\nHeatmap:")
+
+        # Get skills keywords from user input
+        skills_keywords_input = st.text_input("Enter skills keywords separated by commas (e.g., python, java, machine learning):")
+        skills_keywords = [skill.strip() for skill in skills_keywords_input.split(',') if skill.strip()]
 
         if skills_keywords:
             # Calculate the similarity score between each skill keyword and the resume text
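The split-and-strip parse used for the keyword input tolerates stray spaces and empty entries; a standalone illustration with a made-up input string:

skills_keywords_input = " python , java,, machine learning "
skills_keywords = [skill.strip() for skill in skills_keywords_input.split(',') if skill.strip()]
print(skills_keywords)  # ['python', 'java', 'machine learning']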
@@ -235,20 +210,16 @@ if resumes_files:
             for resume_text in resumes_texts:
                 resume_text_similarity_scores = []
                 for skill in skills_keywords:
-                    similarity_score = calculate_similarity(
-                        model_resumes, resume_text, skill)
+                    similarity_score = calculate_similarity(model_resumes, resume_text, skill)
                     resume_text_similarity_scores.append(similarity_score)
                 skills_similarity_scores.append(resume_text_similarity_scores)
 
             # Create a DataFrame with the similarity scores and set the index to the names of the PDFs
-            skills_similarity_df = pd.DataFrame(
-                skills_similarity_scores, columns=skills_keywords, index=[resume_file.name for resume_file in resumes_files])
+            skills_similarity_df = pd.DataFrame(skills_similarity_scores, columns=skills_keywords, index=[resume_file.name for resume_file in resumes_files])
 
             # Plot the heatmap
             fig, ax = plt.subplots(figsize=(12, 8))
-
-            sns.heatmap(skills_similarity_df,
-                        cmap='YlGnBu', annot=True, fmt=".2f", ax=ax)
+            sns.heatmap(skills_similarity_df, cmap='YlGnBu', annot=True, fmt=".2f", ax=ax)
             ax.set_title('Heatmap for Skills Similarity')
             ax.set_xlabel('Skills')
             ax.set_ylabel('Resumes')
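A self-contained sketch of the heatmap's input shape, with two invented resumes, two skills, and dummy scores:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

scores = [[0.62, 0.10],
          [0.08, 0.71]]  # one row per resume, one column per skill
df = pd.DataFrame(scores, columns=["python", "java"], index=["alice.pdf", "bob.pdf"])
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(df, cmap='YlGnBu', annot=True, fmt=".2f", ax=ax)
ax.set_title('Heatmap for Skills Similarity')
plt.show()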
@@ -264,4 +235,4 @@ if resumes_files:
     else:
         st.warning("Please upload the Job Description PDF to proceed.")
 else:
-    st.warning("Please upload Resumes PDF to proceed.")
+    st.warning("Please upload Resumes PDF to proceed.")
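Finally, although extract_text_from_pdf is fed Streamlit UploadedFile objects here, PyPDF2.PdfReader also accepts an ordinary binary file object, so the pipeline can be smoke-tested locally (the filename is hypothetical):

with open("sample_resume.pdf", "rb") as f:  # hypothetical local file
    text = extract_text_from_pdf(f)
print(text[:200])
print(extract_cgpa(text))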
 