import io
import os
from json import JSONDecodeError
from typing import Union

import PyPDF2
from dotenv import load_dotenv
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_community.callbacks import get_openai_callback
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_core.messages import BaseMessage
from langchain_openai import ChatOpenAI
from pydantic import ValidationError

from resume_template import Resume

load_dotenv()


def pdf_to_string(file: Union[io.BytesIO, str, bytes]) -> str:
    """
    Convert a PDF file to a string.

    Parameters:
        file: The PDF to read — an io.BytesIO stream, a filesystem path,
            or the raw PDF bytes.

    Returns:
        str: The text extracted from every page, concatenated in order.
    """
    # Only wrap raw bytes in a stream of our own; a caller-supplied
    # stream (or a path) is passed through to PdfReader unchanged.
    owns_stream = isinstance(file, bytes)
    f = io.BytesIO(file) if owns_stream else file
    pdf_reader = PyPDF2.PdfReader(f)
    text = ''.join(page.extract_text() for page in pdf_reader.pages)
    # Close only the stream this function created; never close a
    # caller-owned BytesIO out from under the caller.
    if owns_stream:
        f.close()
    return text


class ResumeParser:
    """Parse resumes with either an OpenAI chat model or a Hugging Face
    inference endpoint, using prompt templates read from ``prompts/``."""

    # Hugging Face repos accepted by set_model() for the endpoint backend.
    list_llm = [
        "mistralai/mistral-7b-instruct-v0.2",
        "mistralai/Mixtral-8x7B-Instruct-v0.1",
        "microsoft/phi-2",
        "google/gemma-7b-it",
        "HuggingFaceH4/zephyr-7b-beta",
        "HuggingFaceH4/zephyr-7b-gemma-v0.1",
    ]

    def __init__(self, use_openai=False, openai_key=""):
        """
        Parameters:
            use_openai (bool): If True, use an OpenAI chat model; otherwise
                use a Hugging Face inference endpoint.
            openai_key (str): OpenAI API key; falls back to the
                OPENAI_API_KEY environment variable when empty.
        """
        self.use_openai = use_openai
        self.openai_key = ""
        self.model = None
        self.hf_model_name = self.list_llm[0]
        self.openai_model_name = "gpt-3.5-turbo"  # "gpt-4-1106-preview"
        self.set_key(openai_key)
        self.set_model()

    def set_key(self, openai_key=""):
        """Store the OpenAI key, falling back to OPENAI_API_KEY from the environment."""
        self.openai_key = openai_key
        if len(self.openai_key) == 0 and "OPENAI_API_KEY" in os.environ:
            self.openai_key = os.environ["OPENAI_API_KEY"]

    def set_model(self, hf_model_name=""):
        """Instantiate the LLM backend.

        Parameters:
            hf_model_name (str): Optional Hugging Face repo id; honored only
                when it appears in ``list_llm`` and OpenAI is not in use.
        """
        if self.use_openai:
            print(f"using OpenAI Model {self.openai_model_name}")
            self.model = ChatOpenAI(temperature=0, model=self.openai_model_name,
                                    max_tokens=2500, openai_api_key=self.openai_key)
        else:
            if hf_model_name != "" and hf_model_name in self.list_llm:
                self.hf_model_name = hf_model_name
            print(f"using HF Model {self.hf_model_name}")
            self.model = HuggingFaceEndpoint(repo_id=self.hf_model_name,
                                             temperature=0.001, max_new_tokens=4000)

    def extract_resume_fields(self, full_text):
        """
        Analyze a resume text and extract structured information using a
        specified language model.

        Parameters:
            full_text (str): The text content of the resume.

        Returns:
            Resume | str: A parsed ``Resume`` object on success, or the raw
            model output string when parsing/validation fails.
        """
        # The Resume object is imported from the local resume_template file
        with open("prompts/resume_extraction.prompt", "r") as f:
            template = f.read()
        with open("prompts/json.schema", "r") as f:
            json_schema = f.read()

        parser = PydanticOutputParser(pydantic_object=Resume)
        prompt_template = PromptTemplate(
            template=template,
            input_variables=["resume"],
            partial_variables={"json_schema": json_schema},
        )

        # Invoke the language model and process the resume
        formatted_input = prompt_template.format_prompt(resume=full_text)
        llm = self.model
        with get_openai_callback() as cb:
            output = llm.invoke(formatted_input.to_string())
            print(cb)

        # Chat models return a message object; unwrap to plain text.
        if isinstance(output, BaseMessage):
            output = output.content

        # Best effort: fall back to the raw output when the model's reply
        # is not valid JSON / does not match the Resume schema.
        try:
            return parser.parse(output)
        except ValidationError as e:
            print(f"Validation error: {e}")
            return output
        except JSONDecodeError as e:
            print(f"JSONDecodeError error: {e}")
            return output
        except Exception as e:
            print(f"Exception: {e}")
            return output

    def match_resume_with_job_description(self, resume_txt, job_description):
        """
        Score/describe how well a resume matches a job description.

        Parameters:
            resume_txt (str): The resume text.
            job_description (str): The job description text.

        Returns:
            str: The raw language-model response.
        """
        with open("prompts/job_description_matching.prompt", "r") as f:
            template = f.read()

        prompt_template = PromptTemplate(
            template=template,
            input_variables=["resume", "job_description"],
        )

        # Invoke the language model and process the resume
        formatted_input = prompt_template.format_prompt(resume=resume_txt,
                                                        job_description=job_description)
        llm = self.model
        with get_openai_callback() as cb:
            output = llm.invoke(formatted_input.to_string())
            print(cb)

        if isinstance(output, BaseMessage):
            output = output.content
        return output


class StructuredResumeParser:
    """Parse resumes via ``with_structured_output(Resume)`` using either an
    OpenAI chat model or a Together-hosted model through the OpenAI API."""

    # Models served through the Together OpenAI-compatible endpoint.
    list_llm = [
        "mistralai/Mixtral-8x7B-Instruct-v0.1",
        "mistralai/mistral-7b-instruct-v0.2",
        "microsoft/phi-2",
        "google/gemma-7b-it",
        "HuggingFaceH4/zephyr-7b-beta",
        "HuggingFaceH4/zephyr-7b-gemma-v0.1",
    ]

    def __init__(self, use_openai=False, openai_key=""):
        """
        Parameters:
            use_openai (bool): If True, use an OpenAI chat model; otherwise
                use a Together-hosted model (requires TOGETHER_API_KEY).
            openai_key (str): OpenAI API key; falls back to the
                OPENAI_API_KEY environment variable when empty.
        """
        self.use_openai = use_openai
        self.openai_key = ""
        self.model = None
        self.hf_model_name = self.list_llm[0]
        self.openai_model_name = "gpt-3.5-turbo"  # "gpt-4-1106-preview"
        self.set_key(openai_key)
        self.set_model()

    def set_key(self, openai_key=""):
        """Store the OpenAI key, falling back to OPENAI_API_KEY from the environment."""
        self.openai_key = openai_key
        if len(self.openai_key) == 0 and "OPENAI_API_KEY" in os.environ:
            self.openai_key = os.environ["OPENAI_API_KEY"]

    def set_model(self, hf_model_name=""):
        """Instantiate the LLM backend.

        Parameters:
            hf_model_name (str): Optional Together model id; honored only
                when it appears in ``list_llm`` and OpenAI is not in use.
        """
        if self.use_openai:
            print(f"using OpenAI Model {self.openai_model_name}")
            self.model = ChatOpenAI(temperature=0, model=self.openai_model_name,
                                    max_tokens=2500, openai_api_key=self.openai_key)
        else:
            if hf_model_name != "" and hf_model_name in self.list_llm:
                self.hf_model_name = hf_model_name
            print(f"using Together Model {self.hf_model_name}")
            # Together exposes an OpenAI-compatible API, so ChatOpenAI is
            # reused with a custom base_url and the TOGETHER_API_KEY.
            self.model = ChatOpenAI(
                base_url="https://api.together.xyz/v1",
                api_key=os.environ["TOGETHER_API_KEY"],
                model=self.hf_model_name,
            )

    def extract_resume_fields(self, full_text):
        """
        Analyze a resume text and extract structured information using a
        specified language model.

        Parameters:
            full_text (str): The text content of the resume.

        Returns:
            Resume: The structured output produced by the model.
        """
        # The Resume object is imported from the local resume_template file
        with open("prompts/resume_extraction_structured.prompt", "r") as f:
            template = f.read()

        prompt_template = PromptTemplate(
            template=template,
            input_variables=["resume"],
        )

        # Invoke the language model and process the resume
        formatted_input = prompt_template.format_prompt(resume=full_text)
        # Structured output binds the Resume schema to the model call.
        llm = self.model.with_structured_output(Resume)
        with get_openai_callback() as cb:
            output = llm.invoke(formatted_input.to_string())
            print(cb)
        return output

    def match_resume_with_job_description(self, resume_txt, job_description):
        """
        Score/describe how well a resume matches a job description.

        Parameters:
            resume_txt (str): The resume text.
            job_description (str): The job description text.

        Returns:
            str: The raw language-model response.
        """
        with open("prompts/job_description_matching.prompt", "r") as f:
            template = f.read()

        prompt_template = PromptTemplate(
            template=template,
            input_variables=["resume", "job_description"],
        )

        # Invoke the language model and process the resume
        formatted_input = prompt_template.format_prompt(resume=resume_txt,
                                                        job_description=job_description)
        llm = self.model
        with get_openai_callback() as cb:
            output = llm.invoke(formatted_input.to_string())
            print(cb)

        if isinstance(output, BaseMessage):
            output = output.content
        return output


if __name__ == "__main__":
    # Smoke test: parse a sample resume with the structured (Together) backend.
    resume_file = "samples/0.pdf"
    text = pdf_to_string(resume_file)
    p = StructuredResumeParser(use_openai=False)
    resume_data = p.extract_resume_fields(text)
    print(resume_data)