feat: add agents/app
agents/app/rag/__init__.py  (new file; 0 lines)
agents/app/rag/embeddings.py  (new file; 10 lines)
@@ -0,0 +1,10 @@
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings


def load_embeddings():
    load_dotenv()
    # model = "text-embedding-ada-002"
    model = "text-embedding-3-small"

    return OpenAIEmbeddings(model=model)
agents/app/rag/llm.py  (new file; 17 lines)
@@ -0,0 +1,17 @@
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI


def load_llm_openai():
    load_dotenv()
    # model = "gpt-3.5-turbo-0125"
    # model = "gpt-4o"
    model = "gpt-4o-mini"

    llm = ChatOpenAI(
        model=model,
        temperature=0.1,
        max_tokens=2000,
    )

    return llm
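Note (not in the commit): both load_embeddings() and load_llm_openai() call load_dotenv(), so the OpenAI key is expected in a .env file, which langchain_openai then reads from the environment. A minimal .env sketch with a placeholder value:

OPENAI_API_KEY=sk-...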
agents/app/rag/rag_chain.py  (new file; 46 lines)
@@ -0,0 +1,46 @@
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain


def create_rag_chain(llm, retriever):
    contextualize_q_system_prompt = """
    Given a chat history and the latest user question \
    which might reference context in the chat history,
    formulate a standalone question \
    which can be understood without the chat history.
    Do NOT answer the question, \
    just reformulate it if needed and otherwise return it as is.
    """
    contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", contextualize_q_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    history_aware_retriever = create_history_aware_retriever(
        llm, retriever, contextualize_q_prompt
    )

    # ----- Chain with chat history -----
    qa_system_prompt = """
    You are an assistant for question-answering tasks. \
    Use the following pieces of retrieved context to answer the question. \
    If you don't know the answer, just say that you don't know. \
    Make the answer long enough to fully address \
    what is being asked; \
    do not limit its length.\
    {context}"""
    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", qa_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

    return create_retrieval_chain(
        history_aware_retriever, question_answer_chain)
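Not part of the commit, but a minimal invocation sketch for the chain that create_rag_chain returns; the llm and retriever arguments are assumed to come from the helpers in this commit, and the question and answer strings are purely illustrative. create_retrieval_chain yields a runnable whose result dict exposes the generated text under the "answer" key:

from langchain_core.messages import AIMessage, HumanMessage

rag_chain = create_rag_chain(llm, retriever)
chat_history = [
    HumanMessage(content="What is this document about?"),
    AIMessage(content="It is a sample annual report."),
]
result = rag_chain.invoke(
    {"input": "Summarize its key figures.", "chat_history": chat_history}
)
print(result["answer"])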
agents/app/rag/retriever.py  (new file; 18 lines)
@@ -0,0 +1,18 @@
from langchain_chroma import Chroma


def create_retriever(embeddings, persist_directory: str):
    # Load the vectorstore
    # vectordb = Chroma.from_documents(
    #     persist_directory=st.session_state.persist_directory,
    #     # This is the directory of the vectorstore for the user's
    #     # document currently loaded in session_state.
    #     embedding_function=embeddings,
    # )
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings,
    )

    # Create the retriever so it returns the most relevant chunks.
    return vectordb.as_retriever(search_kwargs={"k": 6})
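A quick check sketch, not part of the commit; the persist directory below is hypothetical and assumes create_vectorstore already built a store for that document. Retrievers are runnables, so invoke() takes the query string directly:

retriever = create_retriever(embeddings, "agents/app/embeddings/my_doc")
docs = retriever.invoke("example query")  # up to 6 chunks, per search_kwargs
print(docs[0].page_content[:200])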
agents/app/rag/split_docs.py  (new file; 42 lines)
@@ -0,0 +1,42 @@
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os


# def load_split_docs(file_name: str) -> list:
#     file_path: str = os.path.join("documents", "pdfs", file_name)
#     loader = PyPDFLoader(file_path)
#     docs: list = loader.load()
#     chunk_size: int = 2000
#     chunk_overlap: int = 300
#
#     splitter = RecursiveCharacterTextSplitter(
#         chunk_size=chunk_size, chunk_overlap=chunk_overlap
#     )
#     docs_split: list = splitter.split_documents(docs)
#
#     return docs_split


def load_split_docs(file_name: str) -> list:
    # Get the project's base directory
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # Build the absolute path to the PDF
    file_path = os.path.join(base_dir, "documents", "pdfs", file_name)

    # Check that the file exists
    if not os.path.exists(file_path):
        print(f"File not found at: {file_path}")
        raise FileNotFoundError(f"File not found at: {file_path}")

    loader = PyPDFLoader(file_path)
    docs: list = loader.load()

    chunk_size: int = 2000
    chunk_overlap: int = 300
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    docs_split: list = splitter.split_documents(docs)
    return docs_split
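A usage sketch, not part of the commit; "my_doc.pdf" is a hypothetical file that, given the path logic above, would live under <app>/documents/pdfs/:

chunks = load_split_docs("my_doc.pdf")
print(len(chunks), chunks[0].metadata)  # PyPDFLoader attaches source and page metadata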
agents/app/rag/vectorstore.py  (new file; 48 lines)
@@ -0,0 +1,48 @@
from langchain_chroma import Chroma
import os

#
# def create_vectorstore(docs_split: list, embeddings, file_name: str):
#     db_name: str = file_name.replace(".pdf", "").replace(" ", "_").lower()
#     persist_directory: str = f"embeddings/{db_name}"
#
#     # Create the directory if it doesn't exist
#     os.makedirs(persist_directory, exist_ok=True)
#
#     # Always create/update the vectorstore
#     vectordb = Chroma.from_documents(
#         persist_directory=persist_directory,
#         documents=docs_split,
#         embedding=embeddings,
#     )
#
#     return vectordb


def create_vectorstore(docs_split: list, embeddings, file_name: str):
    # Get the project's base directory
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # Build the database name
    db_name: str = file_name.replace(".pdf", "").replace(" ", "_").lower()

    # Build the absolute path for the embeddings
    persist_directory: str = os.path.join(base_dir, "embeddings", db_name)

    # Create the directory if it doesn't exist
    os.makedirs(persist_directory, exist_ok=True)

    # Debug log
    print(f"Creating vectorstore at: {persist_directory}")

    try:
        # Create/update the vectorstore
        vectordb = Chroma.from_documents(
            persist_directory=persist_directory,
            documents=docs_split,
            embedding=embeddings,
        )
        return vectordb
    except Exception as e:
        print(f"Error creating vectorstore: {e}")
        raise
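Taken together, the helpers compose into a full pipeline. A sketch, not part of the commit: the import paths assume the agents/ directory is on sys.path, "my_doc.pdf" is a hypothetical document, and the caller rebuilds persist_directory the same way create_vectorstore does, since create_retriever re-opens the store from disk:

import os

from app.rag.embeddings import load_embeddings
from app.rag.llm import load_llm_openai
from app.rag.rag_chain import create_rag_chain
from app.rag.retriever import create_retriever
from app.rag.split_docs import load_split_docs
from app.rag.vectorstore import create_vectorstore

embeddings = load_embeddings()
docs_split = load_split_docs("my_doc.pdf")
create_vectorstore(docs_split, embeddings, "my_doc.pdf")

# Mirror create_vectorstore's path layout (hypothetical: run from the repo root)
persist_directory = os.path.join("agents", "app", "embeddings", "my_doc")
retriever = create_retriever(embeddings, persist_directory)

llm = load_llm_openai()
rag_chain = create_rag_chain(llm, retriever)
result = rag_chain.invoke({"input": "Summarize the document.", "chat_history": []})
print(result["answer"])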