Add the project to the repository

Mongar28
2024-10-29 21:57:59 -05:00
parent 5cd142fabc
commit 70424a5c50
40 changed files with 1068 additions and 0 deletions

0
rag/__init__.py Normal file

7 binary files not shown.

10
rag/embeddings.py Normal file

@@ -0,0 +1,10 @@
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings


def load_embeddins():
    load_dotenv()
    # model = "text-embedding-ada-002"
    model = "text-embedding-3-small"
    return OpenAIEmbeddings(model=model)
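A minimal usage sketch for this loader (assuming OPENAI_API_KEY is set in the project's .env; the query string is only illustrative):

# Sketch: embed a single query with the embeddings returned by load_embeddins
from rag.embeddings import load_embeddins

embeddings = load_embeddins()
vector = embeddings.embed_query("What is this document about?")  # illustrative query
print(len(vector))  # vector dimensionality reported by the model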

17
rag/llm.py Normal file

@@ -0,0 +1,17 @@
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI


def load_llm_openai():
    load_dotenv()
    # model = "gpt-3.5-turbo-0125"
    # model = "gpt-4o"
    model = "gpt-4o-mini"
    llm = ChatOpenAI(
        model=model,
        temperature=0.1,
        max_tokens=2000,
    )
    return llm
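A quick sanity check of the LLM loader could look like this (assumes OPENAI_API_KEY in .env; the prompt is only an example):

# Sketch: invoke the ChatOpenAI instance returned by load_llm_openai
from rag.llm import load_llm_openai

llm = load_llm_openai()
response = llm.invoke("Reply with a single word: ready?")  # illustrative prompt
print(response.content)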

40
rag/rag_chain.py Normal file

@@ -0,0 +1,40 @@
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain


def create_rag_chain(llm, retriever):
    contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

    contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", contextualize_q_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )

    history_aware_retriever = create_history_aware_retriever(
        llm, retriever, contextualize_q_prompt
    )

    # ___________________ Chain with the chat history _______________________
    qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
The length of the answer should be sufficient to address what is being asked, \
meaning don't limit yourself in length.\
{context}"""

    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", qa_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )

    question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

    return create_retrieval_chain(history_aware_retriever, question_answer_chain)
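A hedged sketch of how this chain might be driven together with the other modules in this commit (the persist directory, question, and history are illustrative; the chain is invoked with "input" and "chat_history" and returns its reply under "answer"):

# Sketch: build the chain from the modules above and ask a follow-up question
from langchain_core.messages import AIMessage, HumanMessage
from rag.embeddings import load_embeddins
from rag.llm import load_llm_openai
from rag.retriever import create_retriever
from rag.rag_chain import create_rag_chain

llm = load_llm_openai()
retriever = create_retriever(load_embeddins(), "embeddings/my_doc")  # hypothetical directory
chain = create_rag_chain(llm, retriever)

chat_history = [
    HumanMessage(content="What is the paper about?"),
    AIMessage(content="It describes a retrieval-augmented chatbot."),
]
result = chain.invoke({"input": "Who are the authors?", "chat_history": chat_history})
print(result["answer"])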

16
rag/retriever.py Normal file

@@ -0,0 +1,16 @@
from langchain_chroma import Chroma


def create_retriever(embeddings, persist_directory: str):
    # Load the vectorstore
    # vectordb = Chroma.from_documents(
    #     persist_directory=st.session_state.persist_directory,  # Directory of the vectorstore for the user's document stored in session_state.
    #     embedding_function=embeddings,
    # )
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings,
    )

    # Create the retriever so it returns the most relevant chunks.
    return vectordb.as_retriever(search_kwargs={"k": 6})
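A small sketch of querying an already-persisted Chroma directory through this retriever (the directory name is hypothetical):

# Sketch: fetch the k=6 most relevant chunks for a query
from rag.embeddings import load_embeddins
from rag.retriever import create_retriever

retriever = create_retriever(load_embeddins(), "embeddings/my_doc")  # hypothetical directory
docs = retriever.invoke("Summarize the introduction")  # illustrative query
for doc in docs:
    print(doc.metadata.get("page"), doc.page_content[:80])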

19
rag/split_docs.py Normal file

@@ -0,0 +1,19 @@
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os


def load_split_docs(file_name: str) -> list:
    file_path: str = os.path.join("documents", "pdfs", file_name)
    loader = PyPDFLoader(file_path)
    docs: list = loader.load()

    chunk_size: int = 2000
    chunk_overlap: int = 300

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    docs_split: list = splitter.split_documents(docs)

    return docs_split
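A brief sketch of the splitter in use (the PDF name is hypothetical and is expected under documents/pdfs/):

# Sketch: load and split a PDF, then inspect the resulting chunks
from rag.split_docs import load_split_docs

chunks = load_split_docs("example.pdf")  # hypothetical file under documents/pdfs/
print(len(chunks), "chunks")
print(chunks[0].page_content[:200])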

15
rag/vectorstore.py Normal file

@@ -0,0 +1,15 @@
from langchain_chroma import Chroma
import os


def create_verctorstore(docs_split: list, embeddings, file_name: str):
    db_name: str = file_name.replace(".pdf", "").replace(" ", "_").lower()
    persist_directory: str = f"embeddings/{db_name}"

    if not os.path.exists(persist_directory):
        vectordb = Chroma.from_documents(
            persist_directory=persist_directory,
            documents=docs_split,
            embedding=embeddings,
        )