first commit
parent 324bae0939
commit 337f054a35
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
.venv
.env
5
.streamlit/config.toml
Normal file
@@ -0,0 +1,5 @@
[theme]
base = "dark"
backgroundColor = "#0d2669"
secondaryBackgroundColor = "#050550"
font = "monospace"
132
app.py
Normal file
@@ -0,0 +1,132 @@
import streamlit as st
import os
from dotenv import load_dotenv
from langchain_community.chat_models import ChatOpenAI
from chats.streamlit_tools import import_file, clear_cache
from streamlit_extras.add_vertical_space import add_vertical_space
from langchain_tools.pdf_tools import PdfLangChain
from langchain_tools.lc_tools import LangChainTools


# App title
st.set_page_config(page_title="Snowflake Arctic")

# Sidebar
with st.sidebar:

    # Load the logo (the image file lives in the documents/ folder)
    logo_path = "documents/Logo azulblanco.png"
    # Adjust the width as needed
    logo = st.sidebar.image(logo_path, width=200)

    add_vertical_space(18)
    pdf_name = import_file()

    # Streamlit button that calls clear_cache() when pressed
    if st.button('Eliminar caché'):
        clear_cache()
    if st.button('Reiniciar'):
        st.experimental_rerun()
    st.markdown(
        "Built by [OneCluster](https://www.onecluster.org/)."
    )

st.title('💬📄 LLM Chat App')


if pdf_name:

    with st.spinner("Processing the document..."):

        # Initialize the PdfLangChain class
        pdfLangChain = PdfLangChain(pdf_name)
        pdf_name = pdfLangChain.file_name

        # Load the PDF document
        docs: list = pdfLangChain.load_pdf()

        # Split the documents into smaller chunks
        docs_split: list = pdfLangChain.split_docs(docs)

        # Instantiate the LangChainTools class, which wraps the LangChain helpers
        langChainTools = LangChainTools()

        # Load the embeddings model
        embedding_model = langChainTools.load_embedding_opnai()

        # Create the vector store
        docstorage = langChainTools.create_vector_strore(
            docs_split,
            pdf_name,
            embedding_model)

        # Load the LLM through LangChain
        llm = langChainTools.load_llm_openai()

        # Create the chain that combines the vector store and the LLM for queries.
        # In this case the chain includes memory.
        qa = langChainTools.define_retrieval_qa_memory(
            llm, docstorage,
            pdf_name,
            embedding_model)

    # Store conversation history
    if "messages" not in st.session_state.keys():
        st.session_state.messages = [
            {
                "role": "assistant",
                "content": "Hola, soy una IA con la que puedes chatear con tu PDF. Haz una pregunta al documento.",
            }
        ]

    # Display or clear chat messages
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.write(message["content"])

    def clear_chat_history():
        st.session_state.messages = [
            {
                "role": "assistant",
                "content": "Hola, soy una IA con la que puedes chatear con tu PDF. Haz una pregunta al documento.",
            }
        ]

    st.sidebar.button("Clear chat history", on_click=clear_chat_history)

    @st.cache_resource
    def get_num_tokens(prompt):
        """Get the number of tokens in a given prompt"""
        return len(prompt.split())

    # Function for generating Snowflake Arctic response

    # User-provided prompt
    if prompt := st.chat_input():
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.write(prompt)

    # Generate a new response if last message is not from assistant
    if st.session_state.messages[-1]["role"] != "assistant":
        with st.spinner("Thinking..."):

            # Create the chain that combines the vector store and the LLM for queries.
            # In this case the chain includes memory.
            qa = langChainTools.define_retrieval_qa_memory(
                llm, docstorage,
                pdf_name,
                embedding_model)

            input = "\n".join([msg["content"]
                               for msg in st.session_state.messages])

            query = qa.invoke({"question": f"{input}"},
                              return_only_outputs=True)

            response = query["answer"]

        with st.chat_message("assistant"):
            st.write(response)
        st.session_state.messages.append(
            {"role": "assistant", "content": response})
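Run note (not part of the commit itself): with the packages imported above installed and the .env key described alongside requirements.txt below, the UI starts from the repository root with "streamlit run app.py". Uploaded PDFs are written to documents/pdfs/ by import_file, and each document's Chroma store is persisted under embeddings/ by LangChainTools.create_vector_strore.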
0
chats/__init__.py
Normal file
BIN
chats/__pycache__/__init__.cpython-311.pyc
Normal file
Binary file not shown.
BIN
chats/__pycache__/chat_tools.cpython-311.pyc
Normal file
Binary file not shown.
BIN
chats/__pycache__/streamlit_tools.cpython-311.pyc
Normal file
Binary file not shown.
28
chats/chat_tools.py
Normal file
@@ -0,0 +1,28 @@
from colorama import Fore, Back, Style


class MessageManager:

    def create_chat(self, qa):

        # Robot and smiley emojis
        ia_emoticon = "\U0001F916"  # Unicode robot emoji
        humano_emoticon = "\U0001F604"  # Unicode smiling-face emoji

        # Print the AI prompt in bold yellow with the robot emoji
        print(f"{ia_emoticon} " + Style.BRIGHT + Fore.YELLOW +
              "IA: " + Style.RESET_ALL + "Pregunta algo al documento")
        while True:
            input_usuario = input(
                Style.BRIGHT + Fore.BLUE + f"{humano_emoticon} You: " + Style.RESET_ALL)
            if input_usuario.lower() == 'salir':
                break
            bot_response = qa.invoke({"question": f"{input_usuario}"},
                                     return_only_outputs=True)
            print(f'{ia_emoticon} ' + Style.BRIGHT + Fore.YELLOW +
                  'IA:' + Style.RESET_ALL + f'{bot_response["answer"]}')

    def generate_citations(self):
        pass
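MessageManager is a terminal-only helper and is not wired into app.py; the only contract create_chat relies on is an object whose invoke({"question": ...}) call returns a mapping with an "answer" key. A minimal sketch with a stand-in chain (EchoChain is hypothetical, not part of this commit):

from chats.chat_tools import MessageManager


class EchoChain:
    """Hypothetical stand-in for the retrieval chains built in langchain_tools."""

    def invoke(self, inputs, return_only_outputs=True):
        # Mirror the shape create_chat expects: a dict with an "answer" key.
        return {"answer": f"You asked: {inputs['question']}"}


# Starts the terminal loop; type 'salir' to exit, as handled in create_chat.
MessageManager().create_chat(EchoChain())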
33
chats/streamlit_tools.py
Normal file
@@ -0,0 +1,33 @@
import streamlit as st
import os

# @st.cache_data


def import_file() -> str:
    # Upload the PDF file
    archivo = st.file_uploader(
        'Arrastra o ingresa tu archivo .pdf', type=['.pdf'])
    nombre_archivo: str = ''
    # Check whether a file was uploaded
    if archivo is not None:
        nombre_archivo = archivo.name
        # Open a file in binary write mode ('wb') to save the uploaded PDF
        with open(f'documents/pdfs/{nombre_archivo}', 'wb') as new_file:
            # Read the uploaded file's data and write it to the new file
            new_file.write(archivo.read())

    return nombre_archivo


# Function that clears the cache


def clear_cache():
    cache_path = os.path.join(st.__path__[0], 'static', 'cache')
    for root, dirs, files in os.walk(cache_path):
        for file in files:
            os.remove(os.path.join(root, file))
    st.success('Cache limpio exitosamente.')
BIN
documents/Logo azulblanco.png
Normal file
Binary file not shown.
BIN
documents/pdfs/CARTA_BICIPIZZA_2024.pdf
Normal file
Binary file not shown.
17300
documents/pdfs/Constitucion politica de Colombia .pdf
Normal file
File diff suppressed because it is too large
Load Diff
BIN
documents/pdfs/El espejo como no-lugar.pdf
Normal file
Binary file not shown.
BIN
documents/pdfs/Etapas del conflicto armado.pdf
Normal file
Binary file not shown.
BIN
embeddings/carta_bicipizza_2024/chroma.sqlite3
Normal file
Binary file not shown.
BIN
embeddings/constitucion_politica_de_colombia_/chroma.sqlite3
Normal file
Binary file not shown.
BIN
embeddings/el_espejo_como_no-lugar/chroma.sqlite3
Normal file
Binary file not shown.
BIN
embeddings/etapas_del_conflicto_armado/chroma.sqlite3
Normal file
Binary file not shown.
0
langchain_tools/__init__.py
Normal file
BIN
langchain_tools/__pycache__/__init__.cpython-311.pyc
Normal file
Binary file not shown.
BIN
langchain_tools/__pycache__/lc_tools.cpython-311.pyc
Normal file
Binary file not shown.
BIN
langchain_tools/__pycache__/pdf_tools.cpython-311.pyc
Normal file
Binary file not shown.
BIN
langchain_tools/__pycache__/tools.cpython-311.pyc
Normal file
Binary file not shown.
185
langchain_tools/lc_tools.py
Normal file
@@ -0,0 +1,185 @@
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import OpenAI
from langchain_community.chat_models import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain.memory.buffer import ConversationBufferMemory
import os
import streamlit as st
from dotenv import load_dotenv
from langchain.chains import (
    RetrievalQAWithSourcesChain,
    ConversationalRetrievalChain)


class LangChainTools():
    """
    This class wraps some of the tools and integrations provided by
    LangChain.
    """

    def load_embedding_opnai(self):
        """Loads an OpenAI embeddings model.

        Returns:
            _type_: An OpenAI embeddings object.
        """

        # Load the variable that holds the OpenAI api_key
        load_dotenv()
        openai_api_key = os.getenv('api_key')
        # Define an OpenAI embeddings model
        self.embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)
        # st.success('The OpenAI embeddings model has been loaded')

        return self.embedding_model

    @st.cache_resource
    def create_vector_strore(_self, _docs_split: list, _file_name: str, _embedding_model):
        """Builds a vector store from a document.

        Args:
            _docs_split (list): List of split documents.
            _file_name (str): Name of the document.
        """

        db_name = _file_name.replace('.pdf', '').replace(' ', '_').lower()

        # Load the embeddings model
        # _embedding_model = self._embedding_model

        # Check whether the vector store already exists
        persist_directory = f"embeddings/{db_name}"

        if os.path.exists(persist_directory):
            vectordb = Chroma(
                persist_directory=persist_directory,
                embedding_function=_embedding_model)
        else:
            vectordb = Chroma.from_documents(
                persist_directory=persist_directory,
                documents=_docs_split,
                embedding=_embedding_model)

            vectordb.persist()

        return vectordb

    def load_llm_openai(self):
        """Loads an OpenAI LLM.

        Returns:
            _type_: An OpenAI chat model object.
        """

        # Load the variable that holds the OpenAI api_key
        load_dotenv()
        openai_api_key = os.getenv('api_key')

        temperature = 0.5
        llm_openai = ChatOpenAI(model_name="gpt-3.5-turbo",
                                temperature=temperature,
                                openai_api_key=openai_api_key,
                                max_tokens=1000)

        return llm_openai

    def load_prompt_template(self):
        """Builds a LangChain prompt template.

        Returns:
            _type_: A LangChain prompt template.
        """
        template = """Responde a la siguiente pregunta utilizando los documentos proporcionados y citando las fuentes relevantes entre corchetes []:

        Pregunta: {question}

        Respuesta:"""

        prompt_template = PromptTemplate(
            template=template, input_variables=["question"])

        return prompt_template

    def define_retrieval_qa(
        self, _llm, _vectordb, _file_name, _embedding_model
    ):
        """Combines an LLM and a vector database in a LangChain chain that
        can be queried. This version does not include memory.

        Args:
            _llm (_type_): Large language model.
            _vectordb (_type_): Vector database.
            _file_name (_type_): Name of the file the vector database was
                created from, used to load it if it already exists.
            _embedding_model (_type_): Embeddings model.

        Returns:
            _type_: A RetrievalQAWithSourcesChain object that can be used
                to query the chain combining the model and the vector
                database.
        """

        db_name = _file_name.replace('.pdf', '').replace(' ', '_').lower()

        # Check whether the vector store already exists
        persist_directory = f"embeddings/{db_name}"

        _vectordb = Chroma(
            persist_directory=persist_directory,
            embedding_function=_embedding_model)

        # Define the Retrieval QA Chain to integrate the database and LLM
        qa = RetrievalQAWithSourcesChain.from_chain_type(
            _llm,
            retriever=_vectordb.as_retriever(),
            return_source_documents=True,  # Return the source documents
            max_tokens_limit=1000,  # Maximum token limit for the LLM
            reduce_k_below_max_tokens=True,  # Reduce k if the tokens exceed the limit
            verbose=True,  # Verbose mode
        )

        return qa

    @st.cache_resource
    def define_retrieval_qa_memory(
        _self, _llm, _vectordb, _file_name, _embedding_model
    ):
        """Combines an LLM and a vector database in a LangChain chain that
        can be queried. This version includes memory.

        Args:
            _llm (_type_): Large language model.
            _vectordb (_type_): Vector database.
            _file_name (_type_): Name of the file the vector database was
                created from, used to load it if it already exists.
            _embedding_model (_type_): Embeddings model.

        Returns:
            _type_: A ConversationalRetrievalChain object that can be used
                to query the chain combining the model and the vector
                database.
        """

        db_name = _file_name.replace('.pdf', '').replace(' ', '_').lower()

        # Check whether the vector store already exists
        persist_directory = f"embeddings/{db_name}"

        _vectordb = Chroma(
            persist_directory=persist_directory,
            embedding_function=_embedding_model)

        # Configure the memory
        memory = ConversationBufferMemory(
            memory_key="chat_history", return_messages=True)

        # Define the Retrieval QA Chain to integrate the database and LLM
        conversation = ConversationalRetrievalChain.from_llm(
            _llm,
            retriever=_vectordb.as_retriever(),
            memory=memory,
            verbose=False  # Verbose mode
        )

        return conversation
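A standalone usage sketch of LangChainTools outside Streamlit (assumes the api_key entry in .env, and that the persisted store for the chosen document already exists under embeddings/, as the committed chroma.sqlite3 files suggest; outside a Streamlit run the st.cache_resource decorators are expected to fall back to plain execution with a warning):

from langchain_tools.lc_tools import LangChainTools

tools = LangChainTools()
embedding_model = tools.load_embedding_opnai()
llm = tools.load_llm_openai()

# define_retrieval_qa re-opens the persisted Chroma store under
# embeddings/<name>/ from the file name alone, so the vector-store argument
# can be None when the store already exists.
qa = tools.define_retrieval_qa(
    llm, None, 'El espejo como no-lugar.pdf', embedding_model)
result = qa.invoke({"question": "¿De qué trata el documento?"},
                   return_only_outputs=True)
# RetrievalQAWithSourcesChain returns "answer" and "sources" keys.
print(result["answer"], result["sources"])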
61
langchain_tools/pdf_tools.py
Normal file
@@ -0,0 +1,61 @@
import os
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import streamlit as st


class PdfLangChain:
    """Class for handling PDF documents with LangChain.

    Attributes:
        file_name (str): Name of the PDF file.
        file_path (str): Path of the PDF file.
    """

    def __init__(self, file_name: str) -> None:
        """
        Initializes the class.

        Args:
            file_name (str): Name of the PDF file.
        """

        self.file_name = file_name
        self.file_path = os.path.join('documents', 'pdfs', self.file_name)

        # Check whether the target directory exists and create it if needed
        pdf_dir = os.path.dirname(self.file_path)
        if not os.path.exists(pdf_dir):
            os.makedirs(pdf_dir)

    @st.cache_resource
    def load_pdf(_self):
        """Loads the PDF document.

        Returns:
            _type_: List of loaded documents.
        """

        loader = PyPDFLoader(_self.file_path)
        _self.docs = loader.load()
        return _self.docs

    def split_docs(self, data: list) -> list:
        """Splits the documents into smaller chunks.

        Args:
            data (list): List of loaded documents.

        Returns:
            _type_: A list with smaller chunks of the document.
        """
        chunk_size = 2000
        chunk_overlap = 300

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap)
        self.docs_split = splitter.split_documents(data)

        return self.docs_split
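A minimal sketch of PdfLangChain on its own ('example.pdf' is a placeholder; the file must already exist under documents/pdfs/):

from langchain_tools.pdf_tools import PdfLangChain

pdf = PdfLangChain('example.pdf')
docs = pdf.load_pdf()            # one Document per page via PyPDFLoader
chunks = pdf.split_docs(docs)    # 2000-character chunks with 300-character overlap
print(len(docs), len(chunks))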
202
langchain_tools/tools.py
Normal file
@@ -0,0 +1,202 @@
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import OpenAI
from langchain_community.chat_models import ChatOpenAI
from langchain import PromptTemplate
from langchain.chains import RetrievalQAWithSourcesChain, ConversationalRetrievalChain
import os
from dotenv import load_dotenv
import streamlit as st
from langchain.prompts import BasePromptTemplate
from langchain.memory.buffer import ConversationBufferMemory


def pdf_load(file_name: str):
    # Load the PDF file using the PyPDF loader
    pdf_path = os.path.join('documents/pdfs', file_name)

    loader = PyPDFLoader(
        pdf_path
    )

    # Load the PDF content into a list of documents
    docs = loader.load()

    return docs


def split_docs(_data: list):
    chunk_size = 1000
    chunk_overlap = 200

    # Split the documents using RecursiveCharacterTextSplitter
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap)
    docs_split = splitter.split_documents(_data)

    # st.success('The PDF has been split')

    return docs_split


def load_embedding_opnai():

    # Load the variable that holds the OpenAI api_key
    load_dotenv()
    openai_api_key = os.getenv('api_key')
    # Define an OpenAI embeddings model
    embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)
    # st.success('The OpenAI embeddings model has been loaded')

    return embedding_model


# @st.cache_resource
def create_vector_strore(_docs_split, file_name):

    db_name = file_name.replace('.pdf', '').replace(' ', '_').lower()

    # Load the embeddings model
    embedding_model = load_embedding_opnai()

    # Check whether the vector store already exists
    persist_directory = f"./{db_name}"

    if os.path.exists(persist_directory):
        vectordb = Chroma(
            persist_directory=persist_directory,
            embedding_function=embedding_model)
    else:
        vectordb = Chroma.from_documents(
            persist_directory=persist_directory,
            documents=_docs_split,
            embedding=embedding_model)

        vectordb.persist()

    # vectordb.persist()

    return vectordb


# @st.cache_resource
def load_llm_openai():

    # Load the variable that holds the OpenAI api_key
    load_dotenv()
    openai_api_key = os.getenv('api_key')

    # llm_openai = OpenAI(model_name="gpt-3.5-turbo-instruct",
    #                     temperature=0.0,
    #                     openai_api_key=openai_api_key,
    #                     max_tokens=1000)

    llm_openai = ChatOpenAI(model_name="gpt-3.5-turbo",
                            temperature=0.0,
                            openai_api_key=openai_api_key,
                            max_tokens=1000)

    # st.success('The OpenAI LLM has been loaded')

    return llm_openai

# @st.cache_resource


def load_prompt_template():
    template = """Responde a la siguiente pregunta utilizando los documentos proporcionados y citando las fuentes relevantes entre corchetes []:

    Pregunta: {question}

    Respuesta:"""

    prompt_template = PromptTemplate(
        template=template, input_variables=["question"])

    return prompt_template


# def define_retrieval_qa(_llm, vectordb, file_name, embedding_model):
#
#     db_name = file_name.replace('.pdf', '').replace(' ', '_').lower()
#
#     # Check whether the vector store already exists
#     persist_directory = f"./{db_name}"
#
#     vectordb = Chroma(
#         persist_directory=persist_directory,
#         embedding_function=embedding_model)
#
#     memory = ConversationBufferMemory(
#         memory_key="chat_history", return_messages=True)
#
#     # Define the Retrieval QA Chain to integrate the database and LLM
#     qa = RetrievalQAWithSourcesChain.from_chain_type(
#         _llm,
#         memory=memory,
#         retriever=vectordb.as_retriever(),
#         return_source_documents=True,  # Return the source documents
#         max_tokens_limit=1000,  # Maximum token limit for the LLM
#         reduce_k_below_max_tokens=True,  # Reduce k if the tokens exceed the limit
#         verbose=True,  # Verbose mode
#     )
#
#     return qa


def define_retrieval_qa(_llm, vectordb, file_name, embedding_model):

    db_name = file_name.replace('.pdf', '').replace(' ', '_').lower()

    # Check whether the vector store already exists
    persist_directory = f"./{db_name}"

    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding_model)

    memory = ConversationBufferMemory(
        memory_key="chat_history", return_messages=True)

    # Define the Retrieval QA Chain to integrate the database and LLM
    qa = RetrievalQAWithSourcesChain.from_chain_type(
        _llm,
        retriever=vectordb.as_retriever(),
        return_source_documents=True,  # Return the source documents
        max_tokens_limit=1000,  # Maximum token limit for the LLM
        reduce_k_below_max_tokens=True,  # Reduce k if the tokens exceed the limit
        verbose=True,  # Verbose mode
    )

    return qa


def define_retrieval_qa_memory(_llm, vectordb, file_name, embedding_model):

    db_name = file_name.replace('.pdf', '').replace(' ', '_').lower()

    # Check whether the vector store already exists
    persist_directory = f"./{db_name}"

    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding_model)

    # Configure the memory
    memory = ConversationBufferMemory(
        memory_key="chat_history", return_messages=True)

    # Define the Retrieval QA Chain to integrate the database and LLM
    conversation = ConversationalRetrievalChain.from_llm(
        _llm,
        retriever=vectordb.as_retriever(),
        memory=memory,
        verbose=True  # Verbose mode
    )

    # result = conversation({"query": query, "chat_history": chat_history})

    return conversation
13
requirements.txt
Normal file
@@ -0,0 +1,13 @@
chroma-hnswlib==0.7.3
chromadb==0.5.0
colorama==0.4.6
coloredlogs==15.0.1
huggingface-hub==0.22.2
langchain==0.1.16
langchain-community==0.0.34
langchain-core==0.1.45
langchain-openai==0.1.3
langchain-text-splitters==0.0.1
langsmith==0.1.50
openai==1.23.5
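Setup note (inferred from the code above rather than documented in this commit): the modules read the OpenAI key with load_dotenv() and os.getenv('api_key'), so a .env file at the repository root containing a line such as api_key=<your OpenAI API key> is expected; .gitignore keeps it out of version control. The pinned packages install with "pip install -r requirements.txt"; streamlit, python-dotenv, streamlit-extras and pypdf are also imported by the code but are not pinned here.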