Feat: Move files to APP

2024-11-01 17:57:44 -05:00
parent a5e9f2db81
commit 4be1115e3a
31 changed files with 1171 additions and 2 deletions

app/rag/__init__.py (new file, 0 lines)

app/rag/embeddings.py (new file, 10 lines)

@@ -0,0 +1,10 @@
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings


def load_embeddins():
    """Load environment variables and return the OpenAI embeddings model."""
    load_dotenv()
    # model = "text-embedding-ada-002"
    model = "text-embedding-3-small"
    return OpenAIEmbeddings(model=model)
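As a quick sanity check, the loader above can be exercised on its own. A minimal sketch, assuming an OPENAI_API_KEY is present in the project's .env file and that app/ is importable as a package (neither is shown in this commit):

from app.rag.embeddings import load_embeddins

embeddings = load_embeddins()
vector = embeddings.embed_query("hola mundo")  # returns a list of floats
print(len(vector))  # text-embedding-3-small produces 1536-dimensional vectors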

app/rag/llm.py (new file, 17 lines)

@@ -0,0 +1,17 @@
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI


def load_llm_openai():
    """Load environment variables and return the chat model used by the RAG chain."""
    load_dotenv()
    # model = "gpt-3.5-turbo-0125"
    # model = "gpt-4o"
    model = "gpt-4o-mini"
    llm = ChatOpenAI(
        model=model,
        temperature=0.1,
        max_tokens=2000,
    )
    return llm
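A minimal smoke test of the factory above, assuming the same .env setup; the prompt string is only illustrative:

from app.rag.llm import load_llm_openai

llm = load_llm_openai()
response = llm.invoke("Reply with a single word: OK")
print(response.content)  # ChatOpenAI.invoke returns an AIMessage; .content holds the text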

app/rag/rag_chain.py (new file, 40 lines)

@@ -0,0 +1,40 @@
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain


def create_rag_chain(llm, retriever):
    # Rewrite the latest user question into a standalone question using the chat history.
    contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
    contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", contextualize_q_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    history_aware_retriever = create_history_aware_retriever(
        llm, retriever, contextualize_q_prompt
    )

    # ----- QA chain that answers from the retrieved context and the chat history -----
    qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
The length of the answer should be sufficient to address what is being asked, \
meaning don't limit yourself in length.\
{context}"""
    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", qa_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
    return create_retrieval_chain(history_aware_retriever, question_answer_chain)
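The chain returned above is invoked with a dict carrying the question and the running chat history. A minimal sketch, assuming llm and retriever have already been built with the helpers from this commit; the question is a placeholder:

from langchain_core.messages import AIMessage, HumanMessage

rag_chain = create_rag_chain(llm, retriever)

chat_history = []
question = "What is the document about?"  # placeholder question
result = rag_chain.invoke({"input": question, "chat_history": chat_history})
print(result["answer"])  # create_retrieval_chain returns a dict with "answer" and "context" keys

# Keep the turn in the history so follow-up questions can reference it.
chat_history.extend([HumanMessage(content=question), AIMessage(content=result["answer"])])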

app/rag/retriever.py (new file, 16 lines)

@@ -0,0 +1,16 @@
from langchain_chroma import Chroma


def create_retriever(embeddings, persist_directory: str):
    # Load the persisted vectorstore.
    # vectordb = Chroma.from_documents(
    #     persist_directory=st.session_state.persist_directory,  # directory of the vectorstore for the document the user has loaded in session_state
    #     embedding_function=embeddings,
    # )
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings,
    )
    # Create the retriever so it returns the most relevant chunks.
    return vectordb.as_retriever(search_kwargs={"k": 6})
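Putting the loaders together, a sketch of how a chain could be assembled for a document whose vectorstore has already been persisted; the directory name is a placeholder, and app/ is assumed to be importable as a package:

from app.rag.embeddings import load_embeddins
from app.rag.llm import load_llm_openai
from app.rag.rag_chain import create_rag_chain
from app.rag.retriever import create_retriever

embeddings = load_embeddins()
retriever = create_retriever(embeddings, persist_directory="embeddings/my_doc")  # placeholder directory
llm = load_llm_openai()
rag_chain = create_rag_chain(llm, retriever)  # then invoked as sketched under rag_chain.py above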

app/rag/split_docs.py (new file, 19 lines)

@@ -0,0 +1,19 @@
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os


def load_split_docs(file_name: str) -> list:
    """Load a PDF from documents/pdfs/ and split it into overlapping chunks."""
    file_path: str = os.path.join("documents", "pdfs", file_name)
    loader = PyPDFLoader(file_path)
    docs: list = loader.load()
    chunk_size: int = 2000
    chunk_overlap: int = 300
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    docs_split: list = splitter.split_documents(docs)
    return docs_split
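A quick check of the splitter, assuming a PDF has been placed under documents/pdfs/; the file name is a placeholder:

from app.rag.split_docs import load_split_docs

chunks = load_split_docs("example.pdf")  # placeholder file name
print(len(chunks), chunks[0].page_content[:200])  # ~2000-character chunks with 300 characters of overlap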

app/rag/vectorstore.py (new file, 15 lines)

@@ -0,0 +1,15 @@
from langchain_chroma import Chroma
import os


def create_verctorstore(docs_split: list, embeddings, file_name: str):
    # Derive a persist directory under embeddings/ from the PDF file name.
    db_name: str = file_name.replace(".pdf", "").replace(" ", "_").lower()
    persist_directory: str = f"embeddings/{db_name}"
    if not os.path.exists(persist_directory):
        vectordb = Chroma.from_documents(
            persist_directory=persist_directory,
            documents=docs_split,
            embedding=embeddings,