from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import OpenAI
from langchain_community.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQAWithSourcesChain, ConversationalRetrievalChain
import os
from dotenv import load_dotenv
import streamlit as st
from langchain.memory.buffer import ConversationBufferMemory


def pdf_load(file_name: str):
    """Load a PDF from documents/pdfs and return its pages as Documents."""
    # Build the path to the PDF file
    pdf_path = os.path.join('documents/pdfs', file_name)

    loader = PyPDFLoader(pdf_path)

    # Load the PDF content into a list of documents (one per page)
    docs = loader.load()

    return docs
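
# Usage sketch (hypothetical file name): pdf_load returns one Document per
# page, so len(docs) equals the page count.
#
#   docs = pdf_load("example.pdf")   # expects documents/pdfs/example.pdf
#   print(len(docs), docs[0].metadata)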


def split_docs(_data: list):
    chunk_size = 1000
    chunk_overlap = 200

    # Split the documents using RecursiveCharacterTextSplitter
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap)
    docs_split = splitter.split_documents(_data)

    # st.success('The PDF has been split')

    return docs_split
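
# Usage sketch: with chunk_size=1000 and chunk_overlap=200, consecutive
# chunks share up to 200 characters of context, which helps retrieval
# across chunk boundaries.
#
#   docs_split = split_docs(pdf_load("example.pdf"))   # hypothetical name
#   print(len(docs_split))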


def load_embedding_opnai():
    # Load the environment variable that holds the OpenAI api_key
    load_dotenv()
    openai_api_key = os.getenv('api_key')

    # Define an OpenAI embeddings model
    embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)

    # st.success('The OpenAI embeddings model has been loaded')

    return embedding_model
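
# Usage sketch: assumes a .env file next to the script containing a line
# like api_key=sk-... ('api_key' matches the os.getenv call above).
#
#   embedding_model = load_embedding_opnai()
#   vector = embedding_model.embed_query("hello")   # list of floats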


# @st.cache_resource
def create_vector_strore(_docs_split, file_name):
    db_name = file_name.replace('.pdf', '').replace(' ', '_').lower()

    # Load the embeddings model
    embedding_model = load_embedding_opnai()

    # Check whether the vector store already exists on disk
    persist_directory = f"./{db_name}"

    if os.path.exists(persist_directory):
        # Reload the persisted store instead of re-embedding the documents
        vectordb = Chroma(
            persist_directory=persist_directory,
            embedding_function=embedding_model)
    else:
        # Create the store from the split documents and persist it
        vectordb = Chroma.from_documents(
            persist_directory=persist_directory,
            documents=_docs_split,
            embedding=embedding_model)
        vectordb.persist()

    return vectordb
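
# Usage sketch: the store is persisted under ./<db_name>, so a second call
# with the same file name reloads it instead of re-embedding.
#
#   docs_split = split_docs(pdf_load("example.pdf"))   # hypothetical name
#   vectordb = create_vector_strore(docs_split, "example.pdf")
#   hits = vectordb.similarity_search("What is the document about?", k=3)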


# @st.cache_resource
def load_llm_openai():
    # Load the environment variable that holds the OpenAI api_key
    load_dotenv()
    openai_api_key = os.getenv('api_key')

    # Alternative: a completion-style model instead of the chat model
    # llm_openai = OpenAI(model_name="gpt-3.5-turbo-instruct",
    #                     temperature=0.0,
    #                     openai_api_key=openai_api_key,
    #                     max_tokens=1000)

    llm_openai = ChatOpenAI(model_name="gpt-3.5-turbo",
                            temperature=0.0,
                            openai_api_key=openai_api_key,
                            max_tokens=1000)

    # st.success('The OpenAI LLM has been loaded')

    return llm_openai
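
# Usage sketch: ChatOpenAI returns a chat message, so .content holds the
# text of the reply.
#
#   llm = load_llm_openai()
#   print(llm.invoke("Say hello").content)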


# @st.cache_resource
def load_prompt_template():
    template = """Answer the following question using the provided documents, citing the relevant sources in square brackets []:

Question: {question}

Answer:"""

    prompt_template = PromptTemplate(
        template=template, input_variables=["question"])

    return prompt_template
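
# Usage sketch: the template has a single input variable, "question".
#
#   prompt = load_prompt_template()
#   print(prompt.format(question="What is the document about?"))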


def define_retrieval_qa(_llm, vectordb, file_name, embedding_model):
    """Build a RetrievalQAWithSourcesChain over the persisted vector store."""
    db_name = file_name.replace('.pdf', '').replace(' ', '_').lower()

    # Reload the persisted vector store for this document
    persist_directory = f"./{db_name}"

    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding_model)

    # Define the retrieval QA chain to integrate the vector store and the LLM
    qa = RetrievalQAWithSourcesChain.from_chain_type(
        _llm,
        retriever=vectordb.as_retriever(),
        return_source_documents=True,     # Return the source documents
        max_tokens_limit=1000,            # Maximum token budget for the retrieved context
        reduce_k_below_max_tokens=True,   # Reduce k if the context exceeds the limit
        verbose=True,
    )

    return qa
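
# Usage sketch: RetrievalQAWithSourcesChain expects the input key
# "question" and returns "answer" and "sources" (plus "source_documents",
# since return_source_documents=True).
#
#   qa = define_retrieval_qa(llm, vectordb, "example.pdf", embedding_model)
#   result = qa({"question": "What is the document about?"})
#   print(result["answer"], result["sources"])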


def define_retrieval_qa_memory(_llm, vectordb, file_name, embedding_model):
    """Build a ConversationalRetrievalChain with conversation memory."""
    db_name = file_name.replace('.pdf', '').replace(' ', '_').lower()

    # Reload the persisted vector store for this document
    persist_directory = f"./{db_name}"

    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding_model)

    # Configure the conversation memory
    memory = ConversationBufferMemory(
        memory_key="chat_history", return_messages=True)

    # Define the conversational retrieval chain to integrate the store and the LLM
    conversation = ConversationalRetrievalChain.from_llm(
        _llm,
        retriever=vectordb.as_retriever(),
        memory=memory,
        verbose=True  # Verbose mode
    )

    return conversation
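
# Usage sketch: the chain expects the input key "question"; the chat
# history is supplied automatically by the attached ConversationBufferMemory,
# so repeated calls carry the conversation forward.
#
#   conversation = define_retrieval_qa_memory(
#       llm, vectordb, "example.pdf", embedding_model)
#   result = conversation({"question": "What is the document about?"})
#   print(result["answer"])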