Skip to content


added llm_job_parser
Browse files Browse the repository at this point in the history
  • Loading branch information
feder-cr committed Dec 4, 2024
1 parent 22bfbf0 commit 8967a1e
Showing 1 changed file with 181 additions and 0 deletions.
181 changes: 181 additions & 0 deletions src/ai_hawk/libs/resume_and_cover_builder/llm/
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
import os
import tempfile
import textwrap
import time
from src.ai_hawk.libs.resume_and_cover_builder.utils import LoggerChatModel
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed
from loguru import logger
from pathlib import Path
from langchain_core.prompt_values import StringPromptValue
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import TokenTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from lib_resume_builder_AIHawk.config import global_config
from langchain_community.document_loaders import TextLoader
import logging
import re # Per la parsing regex, soprattutto in `parse_wait_time_from_error_message`
from requests.exceptions import HTTPError as HTTPStatusError # Gestione degli errori HTTP
import openai

# Load environment variables from the .env file.
# NOTE(review): the original comment announced this step and `load_dotenv`
# is imported at the top of the file, but the call itself was missing.
load_dotenv()

# Configure the log file: make sure the target folder exists, then attach a
# file sink to loguru with the rotation/retention settings below.
log_folder = 'log/resume/gpt_resume'
# exist_ok=True makes this safe when the folder is already present and avoids
# the check-then-create race of an explicit os.path.exists() test.
os.makedirs(log_folder, exist_ok=True)
log_path = Path(log_folder).resolve()
logger.add(log_path / "gpt_resume.log", rotation="1 day", compression="zip", retention="7 days", level="DEBUG")

class LLMResumer:
    """Fetch a job posting and extract structured details from it with an LLM.

    Typical use: call ``get_job_description_from_url`` once to populate
    ``self.job_description``, then call the ``extract_*`` helpers (or
    ``extract_all_details``) to query that description.
    """

    def __init__(self, openai_api_key, strings):
        """Set up the chat model and the embeddings client.

        Args:
            openai_api_key: API key forwarded to the OpenAI chat and
                embeddings clients.
            strings: Object exposing prompt templates; this class reads
                ``strings.summarize_prompt_template``.
        """
        # NOTE(review): LoggerChatModel is assumed to wrap a chat-model
        # instance (ChatOpenAI is imported for that purpose) — confirm
        # against its definition in the project utils module.
        self.llm_cheap = LoggerChatModel(
            ChatOpenAI(
                model_name="gpt-4o-mini",
                openai_api_key=openai_api_key,
                temperature=0.4,
            )
        )
        self.strings = strings
        # Embeddings client used to build the FAISS index over the page text.
        self.llm_embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

    @staticmethod
    def _preprocess_template_string(template: str) -> str:
        """Remove the common leading indentation from a template string.

        Declared as a static method because it takes no ``self`` yet is
        invoked as ``self._preprocess_template_string(...)``.

        Args:
            template: The template string to preprocess.

        Returns:
            The dedented template string.
        """
        return textwrap.dedent(template)

    def get_job_description_from_url(self, url_job_description):
        """Load a job posting page and store an LLM summary of it.

        The page body is fetched with a Selenium driver, split into token
        chunks, indexed in a FAISS vector store, queried for the full job
        description, and the answer is summarized. The result is stored in
        ``self.job_description``.

        Args:
            url_job_description: URL of the job posting page.
        """
        from lib_resume_builder_AIHawk.utils import create_driver_selenium

        driver = create_driver_selenium()
        try:
            # Navigate to the posting before reading the DOM; without this
            # the driver would read whatever blank page it started on.
            driver.get(url_job_description)
            body_element = driver.find_element("tag name", "body")
            response = body_element.get_attribute("outerHTML")
        finally:
            # Always release the browser, even if the page lookup fails.
            driver.quit()

        # TextLoader reads from a path, so the HTML goes through a temp file.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".html", mode="w", encoding="utf-8"
        ) as temp_file:
            temp_file.write(response or "")
            temp_file_path = temp_file.name
        try:
            loader = TextLoader(temp_file_path, encoding="utf-8", autodetect_encoding=True)
            document = loader.load()
        finally:
            # delete=False means the file must be cleaned up explicitly.
            os.remove(temp_file_path)

        text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=50)
        all_splits = text_splitter.split_documents(document)
        vectorstore = FAISS.from_documents(documents=all_splits, embedding=self.llm_embeddings)

        prompt = PromptTemplate(
            template=(
                "You are an expert job description analyst. Your role is to meticulously analyze and interpret job descriptions.\n"
                "After analyzing the job description, answer the following question in a clear, and informative manner.\n"
                "Question: {question}\n"
                "Job Description: {context}\n"
            ),
            input_variables=["question", "context"],
        )

        def format_docs(docs):
            """Join the retrieved chunks into one context string."""
            return "\n\n".join(doc.page_content for doc in docs)

        context_formatter = vectorstore.as_retriever() | format_docs
        question_passthrough = RunnablePassthrough()
        chain_job_description = prompt | self.llm_cheap | StrOutputParser()

        summarize_prompt_template = self._preprocess_template_string(
            self.strings.summarize_prompt_template
        )
        prompt_summarize = ChatPromptTemplate.from_template(summarize_prompt_template)
        chain_summarize = prompt_summarize | self.llm_cheap | StrOutputParser()

        # Retrieval + Q&A first, then feed the answer to the summarizer
        # under the "text" key.
        qa_chain = (
            {
                "context": context_formatter,
                "question": question_passthrough,
            }
            | chain_job_description
            | (lambda output: {"text": output})
            | chain_summarize
        )
        result = qa_chain.invoke("Provide, full job description")
        self.job_description = result

    def extract_company_name(self):
        """Extract the company name from the job description.

        Returns:
            The extracted company name.
        """
        return self._extract_information("What is the company name in this job description?")

    def extract_role(self):
        """Extract the role/title being sought from the job description.

        Returns:
            The extracted role or title.
        """
        return self._extract_information("What is the role or title being sought in this job description?")

    def extract_location(self):
        """Extract the location from the job description.

        Returns:
            The extracted location.
        """
        return self._extract_information("What is the location mentioned in this job description?")

    def extract_recruiter_email(self):
        """Extract the recruiter's email address from the job description.

        Returns:
            The extracted recruiter email.
        """
        return self._extract_information("What is the recruiter's email address in this job description?")

    def _extract_information(self, question):
        """Ask the LLM a specific question about the stored job description.

        Args:
            question: The question to put to the LLM.

        Returns:
            The LLM's answer with surrounding whitespace stripped.

        Raises:
            ValueError: If ``get_job_description_from_url`` has not been run
                yet, so no job description is stored on the instance.
        """
        if not hasattr(self, 'job_description'):
            raise ValueError("Job description not found. Please run get_job_description_from_url first.")

        prompt = PromptTemplate(
            template=(
                "You are an expert in extracting specific information from job descriptions.\n"
                "Carefully read the job description below and provide a clear and concise answer to the question.\n"
                "Job Description: {job_description}\n"
                "Question: {question}\n"
            ),
            input_variables=["job_description", "question"],
        )

        chain = prompt | self.llm_cheap | StrOutputParser()
        result = chain.invoke({
            "job_description": self.job_description,
            "question": question,
        })
        return result.strip()

    def extract_all_details(self):
        """Extract company name, role, location and recruiter email.

        Returns:
            A dict with the keys ``company_name``, ``role``, ``location``
            and ``recruiter_email``.
        """
        return {
            'company_name': self.extract_company_name(),
            'role': self.extract_role(),
            'location': self.extract_location(),
            'recruiter_email': self.extract_recruiter_email(),
        }

0 comments on commit 8967a1e

Please sign in to comment.